From 5205070299a757170df81bcb2a75edb8a2a1ca14 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Fri, 13 Mar 2026 21:08:12 -0700 Subject: [PATCH] feat: SKILL.md template system, 3-tier testing, DX tools (v0.3.3) (#41) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refactor: extract command registry to commands.ts, add SNAPSHOT_FLAGS metadata - NEW: browse/src/commands.ts — command sets + COMMAND_DESCRIPTIONS + load-time validation (zero side effects) - server.ts imports from commands.ts instead of declaring sets inline - snapshot.ts: SNAPSHOT_FLAGS array drives parseSnapshotArgs (metadata-driven, no duplication) - All 186 existing tests pass * feat: SKILL.md template system with auto-generated command references - SKILL.md.tmpl + browse/SKILL.md.tmpl with {{COMMAND_REFERENCE}} and {{SNAPSHOT_FLAGS}} placeholders - scripts/gen-skill-docs.ts generates SKILL.md from templates (supports --dry-run) - Build pipeline runs gen:skill-docs before binary compilation - Generated files have AUTO-GENERATED header, committed to git * test: Tier 1 static validation — 34 tests for SKILL.md command correctness - test/helpers/skill-parser.ts: extracts $B commands from code blocks, validates against registry - test/skill-parser.test.ts: 13 parser/validator unit tests - test/skill-validation.test.ts: 13 tests validating all SKILL.md files + registry consistency - test/gen-skill-docs.test.ts: 8 generator tests (categories, sorting, freshness) * feat: DX tools (skill:check, dev:skill) + Tier 2 E2E test scaffolding - scripts/skill-check.ts: health summary for all SKILL.md files (commands, templates, freshness) - scripts/dev-skill.ts: watch mode for template development - test/helpers/session-runner.ts: Agent SDK wrapper for E2E skill tests - test/skill-e2e.test.ts: 2 E2E tests + 3 stubs (auto-skip inside Claude Code sessions) - E2E tests must run from plain terminal: SKILL_E2E=1 bun test test/skill-e2e.test.ts * ci: SKILL.md freshness check on 
push/PR + TODO updates - .github/workflows/skill-docs.yml: fails if generated SKILL.md files are stale - TODO.md: add E2E cost tracking and model pinning to future ideas * fix: restore rich descriptions lost in auto-generation - Snapshot flags: add back value hints (-d , -s , -o ) - Snapshot flags: restore parenthetical context (@e refs, @c refs, etc.) - Commands: is → includes valid states enum - Commands: console → notes --errors filter behavior - Commands: press → lists common keys (Enter, Tab, Escape) - Commands: cookie-import-browser → describes picker UI - Commands: dialog-accept → specifies alert/confirm/prompt - Tips: restore → arrow (was downgraded to ->) * test: quality evals for generated SKILL.md descriptions Catches the exact regressions we shipped and caught in review: - Snapshot flags must include value hints (-d , -s , -o ) - is command must list all valid states (visible/hidden/enabled/...) - press command must list example keys (Enter, Tab, Escape) - console command must describe --errors behavior - Snapshot -i must mention @e refs, -C must mention @c refs - All descriptions must be >= 8 chars (no empty stubs) - Tips section must use → not -> * feat: LLM-as-judge evals for SKILL.md documentation quality 4 eval tests using Anthropic API (claude-haiku, ~$0.01-0.03/run): - Command reference table: clarity/completeness/actionability >= 4/5 - Snapshot flags section: same thresholds - browse/SKILL.md overall quality - Regression: generated version must score >= hand-maintained baseline Requires ANTHROPIC_API_KEY. Auto-skips without it. Run: bun run test:eval (or ANTHROPIC_API_KEY=sk-... 
bun test test/skill-llm-eval.test.ts) * chore: bump version to 0.3.3, update changelog Co-Authored-By: Claude Opus 4.6 * docs: add ARCHITECTURE.md, update CLAUDE.md and CONTRIBUTING.md Co-Authored-By: Claude Opus 4.6 * feat: conductor.json lifecycle hooks + .env propagation across worktrees bin/dev-setup now copies .env from main worktree so API keys carry over to Conductor workspaces automatically. conductor.json wires up setup and archive hooks. Co-Authored-By: Claude Opus 4.6 * docs: complete CHANGELOG for v0.3.3 (architecture, conductor, .env) Co-Authored-By: Claude Opus 4.6 --------- Co-authored-by: Claude Opus 4.6 --- .env.example | 5 + .github/workflows/skill-docs.yml | 11 ++ .gitignore | 4 + ARCHITECTURE.md | 240 ++++++++++++++++++++++++++++++ CHANGELOG.md | 27 ++++ CLAUDE.md | 35 ++++- CONTRIBUTING.md | 113 +++++++++++++- SKILL.md | 90 +++++++----- SKILL.md.tmpl | 245 +++++++++++++++++++++++++++++++ TODO.md | 2 + VERSION | 2 +- bin/dev-setup | 24 ++- browse/SKILL.md | 117 +++++++++++++-- browse/SKILL.md.tmpl | 106 +++++++++++++ browse/src/commands.ts | 107 ++++++++++++++ browse/src/server.ts | 25 +--- browse/src/snapshot.ts | 76 +++++----- conductor.json | 6 + package.json | 18 ++- scripts/dev-skill.ts | 82 +++++++++++ scripts/gen-skill-docs.ts | 163 ++++++++++++++++++++ scripts/skill-check.ts | 111 ++++++++++++++ test/gen-skill-docs.test.ts | 150 +++++++++++++++++++ test/helpers/session-runner.ts | 160 ++++++++++++++++++++ test/helpers/skill-parser.ts | 133 +++++++++++++++++ test/skill-e2e.test.ts | 79 ++++++++++ test/skill-llm-eval.test.ts | 194 ++++++++++++++++++++++++ test/skill-parser.test.ts | 179 ++++++++++++++++++++++ test/skill-validation.test.ts | 100 +++++++++++++ 29 files changed, 2474 insertions(+), 130 deletions(-) create mode 100644 .env.example create mode 100644 .github/workflows/skill-docs.yml create mode 100644 ARCHITECTURE.md create mode 100644 SKILL.md.tmpl create mode 100644 browse/SKILL.md.tmpl create mode 100644 
browse/src/commands.ts create mode 100644 conductor.json create mode 100644 scripts/dev-skill.ts create mode 100644 scripts/gen-skill-docs.ts create mode 100644 scripts/skill-check.ts create mode 100644 test/gen-skill-docs.test.ts create mode 100644 test/helpers/session-runner.ts create mode 100644 test/helpers/skill-parser.ts create mode 100644 test/skill-e2e.test.ts create mode 100644 test/skill-llm-eval.test.ts create mode 100644 test/skill-parser.test.ts create mode 100644 test/skill-validation.test.ts diff --git a/.env.example b/.env.example new file mode 100644 index 0000000000000000000000000000000000000000..04c8f010b391b1a9a15c4789c30b957a600197e4 --- /dev/null +++ b/.env.example @@ -0,0 +1,5 @@ +# Copy to .env and fill in values +# bun auto-loads .env — no dotenv needed + +# Required for LLM-as-judge evals (bun run test:eval) +ANTHROPIC_API_KEY=sk-ant-your-key-here diff --git a/.github/workflows/skill-docs.yml b/.github/workflows/skill-docs.yml new file mode 100644 index 0000000000000000000000000000000000000000..6f8f1744efe3ab2a6f41da8b2449084eb459cf27 --- /dev/null +++ b/.github/workflows/skill-docs.yml @@ -0,0 +1,11 @@ +name: Skill Docs Freshness +on: [push, pull_request] +jobs: + check-freshness: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: oven-sh/setup-bun@v2 + - run: bun install + - run: bun run gen:skill-docs + - run: git diff --exit-code || (echo "Generated SKILL.md files are stale. 
Run: bun run gen:skill-docs" && exit 1) diff --git a/.gitignore b/.gitignore index 147742acd196eaa4c24ab5dd65057cc286cf25a0..cc41a3e7bf6bc9302e4cb34dc734575941ce1f93 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,7 @@ browse/dist/ *.log bun.lock *.bun-build +.env +.env.local +.env.* +!.env.example diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 0000000000000000000000000000000000000000..daa64a8c108efd7c14d5bc60669609a93ce1fe77 --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,240 @@ +# Architecture + +This document explains **why** gstack is built the way it is. For setup and commands, see CLAUDE.md. For contributing, see CONTRIBUTING.md. + +## The core idea + +gstack gives Claude Code a persistent browser and a set of opinionated workflow skills. The browser is the hard part — everything else is Markdown. + +The key insight: an AI agent interacting with a browser needs **sub-second latency** and **persistent state**. If every command cold-starts a browser, you're waiting 3-5 seconds per tool call. If the browser dies between commands, you lose cookies, tabs, and login sessions. So gstack runs a long-lived Chromium daemon that the CLI talks to over localhost HTTP. + +``` +Claude Code gstack +───────── ────── + ┌──────────────────────┐ + Tool call: $B snapshot -i │ CLI (compiled binary)│ + ─────────────────────────→ │ • reads state file │ + │ • POST /command │ + │ to localhost:PORT │ + └──────────┬───────────┘ + │ HTTP + ┌──────────▼───────────┐ + │ Server (Bun.serve) │ + │ • dispatches command │ + │ • talks to Chromium │ + │ • returns plain text │ + └──────────┬───────────┘ + │ CDP + ┌──────────▼───────────┐ + │ Chromium (headless) │ + │ • persistent tabs │ + │ • cookies carry over │ + │ • 30min idle timeout │ + └───────────────────────┘ +``` + +First call starts everything (~3s). Every call after: ~100-200ms. + +## Why Bun + +Node.js would work. Bun is better here for four reasons: + +1. 
**Compiled binaries.** `bun build --compile` produces a single ~58MB executable. No `node_modules` at runtime, no `npx`, no PATH configuration. The binary just runs. This matters because gstack installs into `~/.claude/skills/` where users don't expect to manage a Node.js project. + +2. **Native SQLite.** Cookie decryption reads Chromium's SQLite cookie database directly. Bun has `new Database()` built in — no `better-sqlite3`, no native addon compilation, no gyp. One less thing that breaks on different machines. + +3. **Native TypeScript.** The server runs as `bun run server.ts` during development. No compilation step, no `ts-node`, no source maps to debug. The compiled binary is for deployment; source files are for development. + +4. **Built-in HTTP server.** `Bun.serve()` is fast, simple, and doesn't need Express or Fastify. The server handles ~10 routes total. A framework would be overhead. + +The bottleneck is always Chromium, not the CLI or server. Bun's startup speed (~1ms for the compiled binary vs ~100ms for Node) is nice but not the reason we chose it. The compiled binary and native SQLite are. + +## The daemon model + +### Why not start a browser per command? + +Playwright can launch Chromium in ~2-3 seconds. For a single screenshot, that's fine. For a QA session with 20+ commands, it's 40+ seconds of browser startup overhead. Worse: you lose all state between commands. Cookies, localStorage, login sessions, open tabs — all gone. + +The daemon model means: + +- **Persistent state.** Log in once, stay logged in. Open a tab, it stays open. localStorage persists across commands. +- **Sub-second commands.** After the first call, every command is just an HTTP POST. ~100-200ms round-trip including Chromium's work. +- **Automatic lifecycle.** The server auto-starts on first use, auto-shuts down after 30 minutes idle. No process management needed. 
+ +### State file + +The server writes `.gstack/browse.json` (atomic write via tmp + rename, mode 0o600): + +```json +{ "pid": 12345, "port": 34567, "token": "uuid-v4", "startedAt": "...", "binaryVersion": "abc123" } +``` + +The CLI reads this file to find the server. If the file is missing, stale, or the PID is dead, the CLI spawns a new server. + +### Port selection + +Random port between 10000-60000 (retry up to 5 on collision). This means 10 Conductor workspaces can each run their own browse daemon with zero configuration and zero port conflicts. The old approach (scanning 9400-9409) broke constantly in multi-workspace setups. + +### Version auto-restart + +The build writes `git rev-parse HEAD` to `browse/dist/.version`. On each CLI invocation, if the binary's version doesn't match the running server's `binaryVersion`, the CLI kills the old server and starts a new one. This prevents the "stale binary" class of bugs entirely — rebuild the binary, next command picks it up automatically. + +## Security model + +### Localhost only + +The HTTP server binds to `localhost`, not `0.0.0.0`. It's not reachable from the network. + +### Bearer token auth + +Every server session generates a random UUID token, written to the state file with mode 0o600 (owner-only read). Every HTTP request must include `Authorization: Bearer <token>`. If the token doesn't match, the server returns 401. + +This prevents other processes on the same machine from talking to your browse server. The cookie picker UI (`/cookie-picker`) and health check (`/health`) are exempt — they're localhost-only and don't execute commands. + +### Cookie security + +Cookies are the most sensitive data gstack handles. The design: + +1. **Keychain access requires user approval.** First cookie import per browser triggers a macOS Keychain dialog. The user must click "Allow" or "Always Allow." gstack never silently accesses credentials. + +2. 
**Decryption happens in-process.** Cookie values are decrypted in memory (PBKDF2 + AES-128-CBC), loaded into the Playwright context, and never written to disk in plaintext. The cookie picker UI never displays cookie values — only domain names and counts. + +3. **Database is read-only.** gstack copies the Chromium cookie DB to a temp file (to avoid SQLite lock conflicts with the running browser) and opens it read-only. It never modifies your real browser's cookie database. + +4. **Key caching is per-session.** The Keychain password + derived AES key are cached in memory for the server's lifetime. When the server shuts down (idle timeout or explicit stop), the cache is gone. + +5. **No cookie values in logs.** Console, network, and dialog logs never contain cookie values. The `cookies` command outputs cookie metadata (domain, name, expiry) but values are truncated. + +### Shell injection prevention + +The browser registry (Comet, Chrome, Arc, Brave, Edge) is hardcoded. Database paths are constructed from known constants, never from user input. Keychain access uses `Bun.spawn()` with explicit argument arrays, not shell string interpolation. + +## The ref system + +Refs (`@e1`, `@e2`, `@c1`) are how the agent addresses page elements without writing CSS selectors or XPath. + +### How it works + +``` +1. Agent runs: $B snapshot -i +2. Server calls Playwright's page.accessibility.snapshot() +3. Parser walks the ARIA tree, assigns sequential refs: @e1, @e2, @e3... +4. For each ref, builds a Playwright Locator: getByRole(role, { name }).nth(index) +5. Stores Map on the BrowserManager instance +6. Returns the annotated tree as plain text + +Later: +7. Agent runs: $B click @e3 +8. Server resolves @e3 → Locator → locator.click() +``` + +### Why Locators, not DOM mutation + +The obvious approach is to inject `data-ref="@e1"` attributes into the DOM. This breaks on: + +- **CSP (Content Security Policy).** Many production sites block DOM modification from scripts. 
+- **React/Vue/Svelte hydration.** Framework reconciliation can strip injected attributes. +- **Shadow DOM.** Can't reach inside shadow roots from the outside. + +Playwright Locators are external to the DOM. They use the accessibility tree (which Chromium maintains internally) and `getByRole()` queries. No DOM mutation, no CSP issues, no framework conflicts. + +### Ref lifecycle + +Refs are cleared on navigation (the `framenavigated` event on the main frame). This is correct — after navigation, all locators are stale. The agent must run `snapshot` again to get fresh refs. This is by design: stale refs should fail loudly, not click the wrong element. + +### Cursor-interactive refs (@c) + +The `-C` flag finds elements that are clickable but not in the ARIA tree — things styled with `cursor: pointer`, elements with `onclick` attributes, or custom `tabindex`. These get `@c1`, `@c2` refs in a separate namespace. This catches custom components that frameworks render as `
<div>` but are actually buttons. + +## Logging architecture + +Three ring buffers (50,000 entries each, O(1) push): + +``` +Browser events → CircularBuffer (in-memory) → Async flush to .gstack/*.log +``` + +Console messages, network requests, and dialog events each have their own buffer. Flushing happens every 1 second — the server appends only new entries since the last flush. This means: + +- HTTP request handling is never blocked by disk I/O +- Logs survive server crashes (up to 1 second of data loss) +- Memory is bounded (50K entries × 3 buffers) +- Disk files are append-only, readable by external tools + +The `console`, `network`, and `dialog` commands read from the in-memory buffers, not disk. Disk files are for post-mortem debugging. + +## SKILL.md template system + +### The problem + +SKILL.md files tell Claude how to use the browse commands. If the docs list a flag that doesn't exist, or miss a command that was added, the agent hits errors. Hand-maintained docs always drift from code. + +### The solution + +``` +SKILL.md.tmpl (human-written prose + placeholders) + ↓ +gen-skill-docs.ts (reads source code metadata) + ↓ +SKILL.md (committed, auto-generated sections) +``` + +Templates contain the workflows, tips, and examples that require human judgment. The `{{COMMAND_REFERENCE}}` and `{{SNAPSHOT_FLAGS}}` placeholders are filled from `commands.ts` and `snapshot.ts` at build time. This is structurally sound — if a command exists in code, it appears in docs. If it doesn't exist, it can't appear. + +### Why committed, not generated at runtime? + +Three reasons: + +1. **Claude reads SKILL.md at skill load time.** There's no build step when a user invokes `/browse`. The file must already exist and be correct. +2. **CI can validate freshness.** `gen:skill-docs` + `git diff --exit-code` catches stale docs before merge. +3. **Git blame works.** You can see when a command was added and in which commit. 
+ +### Test tiers + +| Tier | What | Cost | Speed | +|------|------|------|-------| +| 1 — Static validation | Parse every `$B` command in SKILL.md, validate against registry | Free | <2s | +| 2 — E2E via Agent SDK | Spawn real Claude session, run `/qa`, check for errors | ~$0.50 | ~60s | +| 3 — LLM-as-judge | Haiku scores docs on clarity/completeness/actionability | ~$0.03 | ~10s | + +Tier 1 runs on every `bun test`. Tier 2 and 3 are gated behind env vars. The idea is: catch 95% of issues for free, use LLMs only for the judgment calls. + +## Command dispatch + +Commands are categorized by side effects: + +- **READ** (text, html, links, console, cookies, ...): No mutations. Safe to retry. Returns page state. +- **WRITE** (goto, click, fill, press, ...): Mutates page state. Not idempotent. +- **META** (snapshot, screenshot, tabs, chain, ...): Server-level operations that don't fit neatly into read/write. + +This isn't just organizational. The server uses it for dispatch: + +```typescript +if (READ_COMMANDS.has(cmd)) → handleReadCommand(cmd, args, bm) +if (WRITE_COMMANDS.has(cmd)) → handleWriteCommand(cmd, args, bm) +if (META_COMMANDS.has(cmd)) → handleMetaCommand(cmd, args, bm, shutdown) +``` + +The `help` command returns all three sets so agents can self-discover available commands. + +## Error philosophy + +Errors are for AI agents, not humans. Every error message must be actionable: + +- "Element not found" → "Element not found or not interactable. Run `snapshot -i` to see available elements." +- "Selector matched multiple elements" → "Selector matched multiple elements. Use @refs from `snapshot` instead." +- Timeout → "Navigation timed out after 30s. The page may be slow or the URL may be wrong." + +Playwright's native errors are rewritten through `wrapError()` to strip internal stack traces and add guidance. The agent should be able to read the error and know what to do next without human intervention. 
+ +### Crash recovery + +The server doesn't try to self-heal. If Chromium crashes (`browser.on('disconnected')`), the server exits immediately. The CLI detects the dead server on the next command and auto-restarts. This is simpler and more reliable than trying to reconnect to a half-dead browser process. + +## What's intentionally not here + +- **No WebSocket streaming.** HTTP request/response is simpler, debuggable with curl, and fast enough. Streaming would add complexity for marginal benefit. +- **No MCP protocol.** MCP adds JSON schema overhead per request and requires a persistent connection. Plain HTTP + plain text output is lighter on tokens and easier to debug. +- **No multi-user support.** One server per workspace, one user. The token auth is defense-in-depth, not multi-tenancy. +- **No Windows/Linux cookie decryption.** macOS Keychain is the only supported credential store. Linux (GNOME Keyring/kwallet) and Windows (DPAPI) are architecturally possible but not implemented. +- **No iframe support.** Playwright can handle iframes but the ref system doesn't cross frame boundaries yet. This is the most-requested missing feature. diff --git a/CHANGELOG.md b/CHANGELOG.md index d1c5c97a514f4399558a4ede13c85bc08b47930c..6bdd600c91fd4d19a54b6d01503ff03ea37a3666 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,32 @@ # Changelog +## 0.3.3 — 2026-03-13 + +### Added +- **SKILL.md template system** — `.tmpl` files with `{{COMMAND_REFERENCE}}` and `{{SNAPSHOT_FLAGS}}` placeholders, auto-generated from source code at build time. Structurally prevents command drift between docs and code. +- **Command registry** (`browse/src/commands.ts`) — single source of truth for all browse commands with categories and enriched descriptions. Zero side effects, safe to import from build scripts and tests. +- **Snapshot flags metadata** (`SNAPSHOT_FLAGS` array in `browse/src/snapshot.ts`) — metadata-driven parser replaces hand-coded switch/case. 
Adding a flag in one place updates the parser, docs, and tests. +- **Tier 1 static validation** — 43 tests: parses `$B` commands from SKILL.md code blocks, validates against command registry and snapshot flag metadata +- **Tier 2 E2E tests** via Agent SDK — spawns real Claude sessions, runs skills, scans for browse errors. Gated by `SKILL_E2E=1` env var (~$0.50/run) +- **Tier 3 LLM-as-judge evals** — Haiku scores generated docs on clarity/completeness/actionability (threshold ≥4/5), plus regression test vs hand-maintained baseline. Gated by `ANTHROPIC_API_KEY` +- **`bun run skill:check`** — health dashboard showing all skills, command counts, validation status, template freshness +- **`bun run dev:skill`** — watch mode that regenerates and validates SKILL.md on every template or source file change +- **CI workflow** (`.github/workflows/skill-docs.yml`) — runs `gen:skill-docs` on push/PR, fails if generated output differs from committed files +- `bun run gen:skill-docs` script for manual regeneration +- `bun run test:eval` for LLM-as-judge evals +- `test/helpers/skill-parser.ts` — extracts and validates `$B` commands from Markdown +- `test/helpers/session-runner.ts` — Agent SDK wrapper with error pattern scanning and transcript saving +- **ARCHITECTURE.md** — design decisions document covering daemon model, security, ref system, logging, crash recovery +- **Conductor integration** (`conductor.json`) — lifecycle hooks for workspace setup/teardown +- **`.env` propagation** — `bin/dev-setup` copies `.env` from main worktree into Conductor workspaces automatically +- `.env.example` template for API key configuration + +### Changed +- Build now runs `gen:skill-docs` before compiling binaries +- `parseSnapshotArgs` is metadata-driven (iterates `SNAPSHOT_FLAGS` instead of switch/case) +- `server.ts` imports command sets from `commands.ts` instead of declaring inline +- SKILL.md and browse/SKILL.md are now generated files (edit the `.tmpl` instead) + ## 0.3.2 — 2026-03-13 
### Fixed diff --git a/CLAUDE.md b/CLAUDE.md index 917afed1577813e2ee5a1e1955ed8dfcfc1f24ef..b08c919cfec8823b64a6efc30fe9d8eaf88a3d4b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,9 +4,14 @@ ```bash bun install # install dependencies -bun test # run integration tests (browse + snapshot) +bun test # run tests (browse + snapshot + skill validation) +bun run test:eval # run LLM-as-judge evals (needs ANTHROPIC_API_KEY) +bun run test:e2e # run E2E skill tests (needs SKILL_E2E=1, ~$0.50/run) bun run dev # run CLI in dev mode, e.g. bun run dev goto https://example.com -bun run build # compile binary to browse/dist/browse +bun run build # gen docs + compile binaries +bun run gen:skill-docs # regenerate SKILL.md files from templates +bun run skill:check # health dashboard for all skills +bun run dev:skill # watch mode: auto-regen + validate on change ``` ## Project structure @@ -15,18 +20,42 @@ bun run build # compile binary to browse/dist/browse gstack/ ├── browse/ # Headless browser CLI (Playwright) │ ├── src/ # CLI + server + commands +│ │ ├── commands.ts # Command registry (single source of truth) +│ │ └── snapshot.ts # SNAPSHOT_FLAGS metadata array │ ├── test/ # Integration tests + fixtures │ └── dist/ # Compiled binary +├── scripts/ # Build + DX tooling +│ ├── gen-skill-docs.ts # Template → SKILL.md generator +│ ├── skill-check.ts # Health dashboard +│ └── dev-skill.ts # Watch mode +├── test/ # Skill validation + eval tests +│ ├── helpers/ # skill-parser.ts, session-runner.ts +│ ├── skill-validation.test.ts # Tier 1: static command validation +│ ├── gen-skill-docs.test.ts # Tier 1: generator + quality evals +│ ├── skill-e2e.test.ts # Tier 2: Agent SDK E2E +│ └── skill-llm-eval.test.ts # Tier 3: LLM-as-judge ├── ship/ # Ship workflow skill ├── review/ # PR review skill ├── plan-ceo-review/ # /plan-ceo-review skill ├── plan-eng-review/ # /plan-eng-review skill ├── retro/ # Retrospective skill ├── setup # One-time setup: build binary + symlink skills -├── SKILL.md # 
Browse skill (Claude discovers this) +├── SKILL.md # Generated from SKILL.md.tmpl (don't edit directly) +├── SKILL.md.tmpl # Template: edit this, run gen:skill-docs └── package.json # Build scripts for browse ``` +## SKILL.md workflow + +SKILL.md files are **generated** from `.tmpl` templates. To update docs: + +1. Edit the `.tmpl` file (e.g. `SKILL.md.tmpl` or `browse/SKILL.md.tmpl`) +2. Run `bun run gen:skill-docs` (or `bun run build` which does it automatically) +3. Commit both the `.tmpl` and generated `.md` files + +To add a new browse command: add it to `browse/src/commands.ts` and rebuild. +To add a snapshot flag: add it to `SNAPSHOT_FLAGS` in `browse/src/snapshot.ts` and rebuild. + ## Browser interaction When you need to interact with a browser (QA, dogfooding, cookie setup), use the diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d696904858da939f5ba8a43caa5902a0acee275e..d98489ee0f9e7590b358c23effedd74bc83ae22d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -60,22 +60,123 @@ bun run build bin/dev-teardown ``` -## Running tests +## Testing & evals + +### Setup ```bash -bun test # all tests (browse integration + snapshot) -bun run dev # run CLI in dev mode, e.g. bun run dev goto https://example.com -bun run build # compile binary to browse/dist/browse +# 1. Copy .env.example and add your API key +cp .env.example .env +# Edit .env → set ANTHROPIC_API_KEY=sk-ant-... + +# 2. Install deps (if you haven't already) +bun install ``` +Bun auto-loads `.env` — no extra config. Conductor workspaces inherit `.env` from the main worktree automatically (see "Conductor workspaces" below). 
+ +### Test tiers + +| Tier | Command | Cost | What it tests | +|------|---------|------|---------------| +| 1 — Static | `bun test` | Free | Command validation, snapshot flags, SKILL.md correctness | +| 2 — E2E | `bun run test:e2e` | ~$0.50 | Full skill execution via Agent SDK | +| 3 — LLM eval | `bun run test:eval` | ~$0.03 | Doc quality scoring via LLM-as-judge | + +```bash +bun test # Tier 1 only (runs on every commit, <5s) +bun run test:eval # Tier 3: LLM-as-judge (needs ANTHROPIC_API_KEY in .env) +bun run test:e2e # Tier 2: E2E (needs SKILL_E2E=1, can't run inside Claude Code) +bun run test:all # Tier 1 + Tier 2 +``` + +### Tier 1: Static validation (free) + +Runs automatically with `bun test`. No API keys needed. + +- **Skill parser tests** (`test/skill-parser.test.ts`) — Extracts every `$B` command from SKILL.md bash code blocks and validates against the command registry in `browse/src/commands.ts`. Catches typos, removed commands, and invalid snapshot flags. +- **Skill validation tests** (`test/skill-validation.test.ts`) — Validates that SKILL.md files reference only real commands and flags, and that command descriptions meet quality thresholds. +- **Generator tests** (`test/gen-skill-docs.test.ts`) — Tests the template system: verifies placeholders resolve correctly, output includes value hints for flags (e.g. `-d <N>` not just `-d`), enriched descriptions for key commands (e.g. `is` lists valid states, `press` lists key examples). + +### Tier 2: E2E via Agent SDK (~$0.50/run) + +Spawns a real Claude Code session, invokes `/qa` or `/browse`, and scans tool results for errors. This is the closest thing to "does this skill actually work end-to-end?" 
+ +```bash +# Must run from a plain terminal — can't nest inside Claude Code or Conductor +SKILL_E2E=1 bun test test/skill-e2e.test.ts +``` + +- Gated by `SKILL_E2E=1` env var (prevents accidental expensive runs) +- Auto-skips if it detects it's running inside Claude Code (Agent SDK can't nest) +- Saves full conversation transcripts on failure for debugging +- Tests live in `test/skill-e2e.test.ts`, runner logic in `test/helpers/session-runner.ts` + +### Tier 3: LLM-as-judge (~$0.03/run) + +Uses Claude Haiku to score generated SKILL.md docs on three dimensions: + +- **Clarity** — Can an AI agent understand the instructions without ambiguity? +- **Completeness** — Are all commands, flags, and usage patterns documented? +- **Actionability** — Can the agent execute tasks using only the information in the doc? + +Each dimension is scored 1-5. Threshold: every dimension must score **≥ 4**. There's also a regression test that compares generated docs against the hand-maintained baseline from `origin/main` — generated must score equal or higher. + +```bash +# Needs ANTHROPIC_API_KEY in .env +bun run test:eval +``` + +- Uses `claude-haiku-4-5` for cost efficiency +- Tests live in `test/skill-llm-eval.test.ts` +- Calls the Anthropic API directly (not Agent SDK), so it works from anywhere including inside Claude Code + +### CI + +A GitHub Action (`.github/workflows/skill-docs.yml`) runs `bun run gen:skill-docs` followed by `git diff --exit-code` on every push and PR. If the generated SKILL.md files differ from what's committed, CI fails. This catches stale docs before they merge. + Tests run against the browse binary directly — they don't require dev mode. +## Editing SKILL.md files + +SKILL.md files are **generated** from `.tmpl` templates. Don't edit the `.md` directly — your changes will be overwritten on the next build. + +```bash +# 1. Edit the template +vim SKILL.md.tmpl # or browse/SKILL.md.tmpl + +# 2. Regenerate +bun run gen:skill-docs + +# 3. 
Check health +bun run skill:check + +# Or use watch mode — auto-regenerates on save +bun run dev:skill +``` + +To add a browse command, add it to `browse/src/commands.ts`. To add a snapshot flag, add it to `SNAPSHOT_FLAGS` in `browse/src/snapshot.ts`. Then rebuild. + +## Conductor workspaces + +If you're using [Conductor](https://conductor.build) to run multiple Claude Code sessions in parallel, `conductor.json` wires up workspace lifecycle automatically: + +| Hook | Script | What it does | +|------|--------|-------------| +| `setup` | `bin/dev-setup` | Copies `.env` from main worktree, installs deps, symlinks skills | +| `archive` | `bin/dev-teardown` | Removes skill symlinks, cleans up `.claude/` directory | + +When Conductor creates a new workspace, `bin/dev-setup` runs automatically. It detects the main worktree (via `git worktree list`), copies your `.env` so API keys carry over, and sets up dev mode — no manual steps needed. + +**First-time setup:** Put your `ANTHROPIC_API_KEY` in `.env` in the main repo (see `.env.example`). Every Conductor workspace inherits it automatically. + ## Things to know -- **SKILL.md changes are instant.** They're just Markdown. Edit, save, invoke. +- **SKILL.md files are generated.** Edit the `.tmpl` template, not the `.md`. Run `bun run gen:skill-docs` to regenerate. - **Browse source changes need a rebuild.** If you touch `browse/src/*.ts`, run `bun run build`. - **Dev mode shadows your global install.** Project-local skills take priority over `~/.claude/skills/gstack`. `bin/dev-teardown` restores the global one. -- **Conductor workspaces are independent.** Each workspace is its own clone. Run `bin/dev-setup` in the one you're working in. +- **Conductor workspaces are independent.** Each workspace is its own git worktree. `bin/dev-setup` runs automatically via `conductor.json`. +- **`.env` propagates across worktrees.** Set it once in the main repo, all Conductor workspaces get it. 
- **`.claude/skills/` is gitignored.** The symlinks never get committed. ## Testing a branch in another repo diff --git a/SKILL.md b/SKILL.md index e561e2ccc5b41fe34043e8c54672b5ab2350a246..2f78a6307e1578ab8665c6d78b1585fa904ff03d 100644 --- a/SKILL.md +++ b/SKILL.md @@ -12,6 +12,8 @@ allowed-tools: - Read --- + + # gstack browse: QA Testing & Dogfooding @@ -239,15 +241,15 @@ $B css ".button" "background-color" The snapshot is your primary tool for understanding and interacting with pages. -```bash -$B snapshot -i # Interactive elements only (buttons, links, inputs) with @e refs -$B snapshot -c # Compact (no empty structural elements) -$B snapshot -d 3 # Limit depth to 3 levels -$B snapshot -s "main" # Scope to CSS selector -$B snapshot -D # Diff against previous snapshot (what changed?) -$B snapshot -a # Annotated screenshot with ref labels -$B snapshot -o /tmp/x.png # Output path for annotated screenshot -$B snapshot -C # Cursor-interactive elements (@c refs — divs with pointer, onclick) +``` +-i Interactive elements only (buttons, links, inputs) with @e refs +-c Compact (no empty structural nodes) +-d Limit depth +-s Scope to CSS selector +-D Diff against previous snapshot (what changed?) +-a Annotated screenshot with ref labels +-o Output path for screenshot +-C Cursor-interactive elements (@c refs — divs with pointer, onclick) ``` Combine flags: `$B snapshot -i -a -C -o /tmp/annotated.png` @@ -266,77 +268,89 @@ Refs are invalidated on navigation — run `snapshot` again after `goto`. 
### Navigation | Command | Description | |---------|-------------| +| `back` | History back | +| `forward` | History forward | | `goto ` | Navigate to URL | -| `back` / `forward` | History navigation | | `reload` | Reload page | | `url` | Print current URL | ### Reading | Command | Description | |---------|-------------| -| `text` | Cleaned page text | -| `html [selector]` | innerHTML | -| `links` | All links as "text -> href" | -| `forms` | Forms + fields as JSON | | `accessibility` | Full ARIA tree | +| `forms` | Form fields as JSON | +| `html [selector]` | innerHTML | +| `links` | All links as "text → href" | +| `text` | Cleaned page text | ### Interaction | Command | Description | |---------|-------------| | `click ` | Click element | +| `cookie` | Set cookie | +| `cookie-import ` | Import cookies from JSON file | +| `cookie-import-browser [browser] [--domain d]` | Import cookies from real browser (opens picker UI, or direct with --domain) | +| `dialog-accept [text]` | Auto-accept next alert/confirm/prompt | +| `dialog-dismiss` | Auto-dismiss next dialog | | `fill ` | Fill input | -| `select ` | Select dropdown | +| `header ` | Set custom request header | | `hover ` | Hover element | -| `type ` | Type into focused element | -| `press ` | Press key (Enter, Tab, Escape) | +| `press ` | Press key (Enter, Tab, Escape, etc.) 
| | `scroll [sel]` | Scroll element into view | -| `wait ` | Wait for element (max 10s) | -| `wait --networkidle` | Wait for network to be idle | -| `wait --load` | Wait for page load event | +| `select ` | Select dropdown option | +| `type ` | Type into focused element | | `upload ` | Upload file(s) | -| `cookie-import ` | Import cookies from JSON file | -| `cookie-import-browser [browser] [--domain ]` | Import cookies from real browser (opens picker UI, or direct import with --domain) | -| `dialog-accept [text]` | Auto-accept dialogs | -| `dialog-dismiss` | Auto-dismiss dialogs | +| `useragent ` | Set user agent | | `viewport ` | Set viewport size | +| `wait ` | Wait for element/condition | ### Inspection | Command | Description | |---------|-------------| -| `js ` | Run JavaScript | +| `attrs ` | Element attributes as JSON | +| `console [--clear|--errors]` | Console messages (--errors filters to error/warning) | +| `cookies` | All cookies as JSON | +| `css ` | Computed CSS value | +| `dialog [--clear]` | Dialog messages | | `eval ` | Run JS file | -| `css ` | Computed CSS | -| `attrs ` | Element attributes | | `is ` | State check (visible/hidden/enabled/disabled/checked/editable/focused) | -| `console [--clear\|--errors]` | Console messages (--errors filters to error/warning) | +| `js ` | Run JavaScript | | `network [--clear]` | Network requests | -| `dialog [--clear]` | Dialog messages | -| `cookies` | All cookies | -| `storage` | localStorage + sessionStorage | | `perf` | Page load timings | +| `storage [set k v]` | localStorage + sessionStorage | ### Visual | Command | Description | |---------|-------------| -| `screenshot [path]` | Screenshot | +| `diff ` | Text diff between pages | | `pdf [path]` | Save as PDF | | `responsive [prefix]` | Mobile/tablet/desktop screenshots | -| `diff ` | Text diff between pages | +| `screenshot [path]` | Save screenshot | + +### Snapshot +| Command | Description | +|---------|-------------| +| `snapshot [flags]` | 
Accessibility tree with @refs | + +### Meta +| Command | Description | +|---------|-------------| +| `chain` | Multi-command from JSON stdin | ### Tabs | Command | Description | |---------|-------------| -| `tabs` | List tabs | -| `tab ` | Switch tab | -| `newtab [url]` | Open tab | | `closetab [id]` | Close tab | +| `newtab [url]` | Open new tab | +| `tab ` | Switch to tab | +| `tabs` | List open tabs | ### Server | Command | Description | |---------|-------------| +| `restart` | Restart server | | `status` | Health check | -| `stop` | Shutdown | -| `restart` | Restart | +| `stop` | Shutdown server | ## Tips diff --git a/SKILL.md.tmpl b/SKILL.md.tmpl new file mode 100644 index 0000000000000000000000000000000000000000..0ee150a7582090ecc2b38cf888064f98765f5053 --- /dev/null +++ b/SKILL.md.tmpl @@ -0,0 +1,245 @@ +--- +name: gstack +version: 1.1.0 +description: | + Fast headless browser for QA testing and site dogfooding. Navigate any URL, interact with + elements, verify page state, diff before/after actions, take annotated screenshots, check + responsive layouts, test forms and uploads, handle dialogs, and assert element states. + ~100ms per command. Use when you need to test a feature, verify a deployment, dogfood a + user flow, or file a bug with evidence. +allowed-tools: + - Bash + - Read + +--- + +# gstack browse: QA Testing & Dogfooding + +Persistent headless Chromium. First call auto-starts (~3s), then ~100-200ms per command. +Auto-shuts down after 30 min idle. State persists between calls (cookies, tabs, sessions). + +## SETUP (run this check BEFORE any browse command) + +```bash +B=$(browse/bin/find-browse 2>/dev/null || ~/.claude/skills/gstack/browse/bin/find-browse 2>/dev/null) +if [ -n "$B" ]; then + echo "READY: $B" +else + echo "NEEDS_SETUP" +fi +``` + +If `NEEDS_SETUP`: +1. Tell the user: "gstack browse needs a one-time build (~10 seconds). OK to proceed?" Then STOP and wait. +2. Run: `cd && ./setup` +3. 
If `bun` is not installed: `curl -fsSL https://bun.sh/install | bash` + +## IMPORTANT + +- Use the compiled binary via Bash: `$B ` +- NEVER use `mcp__claude-in-chrome__*` tools. They are slow and unreliable. +- Browser persists between calls — cookies, login sessions, and tabs carry over. +- Dialogs (alert/confirm/prompt) are auto-accepted by default — no browser lockup. + +## QA Workflows + +### Test a user flow (login, signup, checkout, etc.) + +```bash +B=~/.claude/skills/gstack/browse/dist/browse + +# 1. Go to the page +$B goto https://app.example.com/login + +# 2. See what's interactive +$B snapshot -i + +# 3. Fill the form using refs +$B fill @e3 "test@example.com" +$B fill @e4 "password123" +$B click @e5 + +# 4. Verify it worked +$B snapshot -D # diff shows what changed after clicking +$B is visible ".dashboard" # assert the dashboard appeared +$B screenshot /tmp/after-login.png +``` + +### Verify a deployment / check prod + +```bash +$B goto https://yourapp.com +$B text # read the page — does it load? +$B console # any JS errors? +$B network # any failed requests? +$B js "document.title" # correct title? +$B is visible ".hero-section" # key elements present? +$B screenshot /tmp/prod-check.png +``` + +### Dogfood a feature end-to-end + +```bash +# Navigate to the feature +$B goto https://app.example.com/new-feature + +# Take annotated screenshot — shows every interactive element with labels +$B snapshot -i -a -o /tmp/feature-annotated.png + +# Find ALL clickable things (including divs with cursor:pointer) +$B snapshot -C + +# Walk through the flow +$B snapshot -i # baseline +$B click @e3 # interact +$B snapshot -D # what changed? 
(unified diff) + +# Check element states +$B is visible ".success-toast" +$B is enabled "#next-step-btn" +$B is checked "#agree-checkbox" + +# Check console for errors after interactions +$B console +``` + +### Test responsive layouts + +```bash +# Quick: 3 screenshots at mobile/tablet/desktop +$B goto https://yourapp.com +$B responsive /tmp/layout + +# Manual: specific viewport +$B viewport 375x812 # iPhone +$B screenshot /tmp/mobile.png +$B viewport 1440x900 # Desktop +$B screenshot /tmp/desktop.png +``` + +### Test file upload + +```bash +$B goto https://app.example.com/upload +$B snapshot -i +$B upload @e3 /path/to/test-file.pdf +$B is visible ".upload-success" +$B screenshot /tmp/upload-result.png +``` + +### Test forms with validation + +```bash +$B goto https://app.example.com/form +$B snapshot -i + +# Submit empty — check validation errors appear +$B click @e10 # submit button +$B snapshot -D # diff shows error messages appeared +$B is visible ".error-message" + +# Fill and resubmit +$B fill @e3 "valid input" +$B click @e10 +$B snapshot -D # diff shows errors gone, success state +``` + +### Test dialogs (delete confirmations, prompts) + +```bash +# Set up dialog handling BEFORE triggering +$B dialog-accept # will auto-accept next alert/confirm +$B click "#delete-button" # triggers confirmation dialog +$B dialog # see what dialog appeared +$B snapshot -D # verify the item was deleted + +# For prompts that need input +$B dialog-accept "my answer" # accept with text +$B click "#rename-button" # triggers prompt +``` + +### Test authenticated pages (import real browser cookies) + +```bash +# Import cookies from your real browser (opens interactive picker) +$B cookie-import-browser + +# Or import a specific domain directly +$B cookie-import-browser comet --domain .github.com + +# Now test authenticated pages +$B goto https://github.com/settings/profile +$B snapshot -i +$B screenshot /tmp/github-profile.png +``` + +### Compare two pages / environments + +```bash 
+$B diff https://staging.app.com https://prod.app.com +``` + +### Multi-step chain (efficient for long flows) + +```bash +echo '[ + ["goto","https://app.example.com"], + ["snapshot","-i"], + ["fill","@e3","test@test.com"], + ["fill","@e4","password"], + ["click","@e5"], + ["snapshot","-D"], + ["screenshot","/tmp/result.png"] +]' | $B chain +``` + +## Quick Assertion Patterns + +```bash +# Element exists and is visible +$B is visible ".modal" + +# Button is enabled/disabled +$B is enabled "#submit-btn" +$B is disabled "#submit-btn" + +# Checkbox state +$B is checked "#agree" + +# Input is editable +$B is editable "#name-field" + +# Element has focus +$B is focused "#search-input" + +# Page contains text +$B js "document.body.textContent.includes('Success')" + +# Element count +$B js "document.querySelectorAll('.list-item').length" + +# Specific attribute value +$B attrs "#logo" # returns all attributes as JSON + +# CSS property +$B css ".button" "background-color" +``` + +## Snapshot System + +{{SNAPSHOT_FLAGS}} + +## Command Reference + +{{COMMAND_REFERENCE}} + +## Tips + +1. **Navigate once, query many times.** `goto` loads the page; then `text`, `js`, `screenshot` all hit the loaded page instantly. +2. **Use `snapshot -i` first.** See all interactive elements, then click/fill by ref. No CSS selector guessing. +3. **Use `snapshot -D` to verify.** Baseline → action → diff. See exactly what changed. +4. **Use `is` for assertions.** `is visible .modal` is faster and more reliable than parsing page text. +5. **Use `snapshot -a` for evidence.** Annotated screenshots are great for bug reports. +6. **Use `snapshot -C` for tricky UIs.** Finds clickable divs that the accessibility tree misses. +7. **Check `console` after actions.** Catch JS errors that don't surface visually. +8. **Use `chain` for long flows.** Single command, no per-step CLI overhead. 
diff --git a/TODO.md b/TODO.md index 0148e707c7157f12d8b11e3c605ba717b3ca3925..ebdeb0a6db50b76c7154450fc2b49909e2aa0274 100644 --- a/TODO.md +++ b/TODO.md @@ -105,6 +105,8 @@ - [ ] CI/CD integration — `/qa` as GitHub Action step, fail PR if health score drops (P2, M) - [ ] Accessibility audit mode — `--a11y` flag for focused accessibility testing (P3, S) - [ ] Greptile training feedback loop — export suppression patterns to Greptile team for model improvement (P3, S) + - [ ] E2E test cost tracking — track cumulative API spend, warn if over threshold (P3, S) + - [ ] E2E model pinning — pin E2E tests to claude-sonnet-4-6 for cost efficiency, add retry:2 for flaky LLM (P2, XS) ## Ideas & Notes - Browser is the nervous system — every skill should be able to see, interact with, and verify the web diff --git a/VERSION b/VERSION index d15723fbe8de36b1c3ae302c77d8095459ea88e6..1c09c74e221cd58f30240fbcfd9545ed19df54d7 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.3.2 +0.3.3 diff --git a/bin/dev-setup b/bin/dev-setup index 709cca4de4a8fe43c7f71271a4c20132ff605432..6c5619d215ec5e551e7c178d588ed85c754242f7 100755 --- a/bin/dev-setup +++ b/bin/dev-setup @@ -4,16 +4,34 @@ # Creates .claude/skills/gstack → (symlink to repo root) so Claude Code # discovers skills from your working tree. Changes take effect immediately. # +# Also copies .env from the main worktree if this is a Conductor workspace +# or git worktree (so API keys carry over automatically). +# # Usage: bin/dev-setup # set up # bin/dev-teardown # clean up set -e REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" -# 1. Create .claude/skills/ inside the repo +# 1. Copy .env from main worktree (if we're a worktree and don't have one) +if [ ! 
-f "$REPO_ROOT/.env" ]; then + MAIN_WORKTREE="$(git -C "$REPO_ROOT" worktree list --porcelain 2>/dev/null | head -1 | sed 's/^worktree //')" + if [ -n "$MAIN_WORKTREE" ] && [ "$MAIN_WORKTREE" != "$REPO_ROOT" ] && [ -f "$MAIN_WORKTREE/.env" ]; then + cp "$MAIN_WORKTREE/.env" "$REPO_ROOT/.env" + echo "Copied .env from main worktree ($MAIN_WORKTREE)" + fi +fi + +# 2. Install dependencies +if [ ! -d "$REPO_ROOT/node_modules" ]; then + echo "Installing dependencies..." + (cd "$REPO_ROOT" && bun install) +fi + +# 3. Create .claude/skills/ inside the repo mkdir -p "$REPO_ROOT/.claude/skills" -# 2. Symlink .claude/skills/gstack → repo root +# 4. Symlink .claude/skills/gstack → repo root # This makes setup think it's inside a real .claude/skills/ directory GSTACK_LINK="$REPO_ROOT/.claude/skills/gstack" if [ -L "$GSTACK_LINK" ]; then @@ -26,7 +44,7 @@ elif [ -d "$GSTACK_LINK" ]; then fi ln -s "$REPO_ROOT" "$GSTACK_LINK" -# 3. Run setup via the symlink so it detects .claude/skills/ as its parent +# 5. Run setup via the symlink so it detects .claude/skills/ as its parent "$GSTACK_LINK/setup" echo "" diff --git a/browse/SKILL.md b/browse/SKILL.md index 99c979c5318df1a623a4c5dc43b18ae190d56052..7b9a6cff8b18d181935ea99c40ae16f8df55b91d 100644 --- a/browse/SKILL.md +++ b/browse/SKILL.md @@ -12,6 +12,8 @@ allowed-tools: - Read --- + + # browse: QA Testing & Dogfooding @@ -99,30 +101,115 @@ $B diff https://staging.app.com https://prod.app.com ## Snapshot Flags +The snapshot is your primary tool for understanding and interacting with pages. + ``` --i Interactive elements only (buttons, links, inputs) +-i Interactive elements only (buttons, links, inputs) with @e refs -c Compact (no empty structural nodes) -d Limit depth -s Scope to CSS selector --D Diff against previous snapshot +-D Diff against previous snapshot (what changed?) 
-a Annotated screenshot with ref labels -o Output path for screenshot --C Cursor-interactive elements (@c refs) +-C Cursor-interactive elements (@c refs — divs with pointer, onclick) ``` -Combine: `$B snapshot -i -a -C -o /tmp/annotated.png` +Combine flags: `$B snapshot -i -a -C -o /tmp/annotated.png` + +After snapshot, use @refs everywhere: +```bash +$B click @e3 $B fill @e4 "value" $B hover @e1 +$B html @e2 $B css @e5 "color" $B attrs @e6 +$B click @c1 # cursor-interactive ref (from -C) +``` -Use @refs after snapshot: `$B click @e3`, `$B fill @e4 "value"`, `$B click @c1` +Refs are invalidated on navigation — run `snapshot` again after `goto`. ## Full Command List -**Navigate:** goto, back, forward, reload, url -**Read:** text, html, links, forms, accessibility -**Snapshot:** snapshot (with flags above) -**Interact:** click, fill, select, hover, type, press, scroll, wait, wait --networkidle, wait --load, viewport, upload, cookie-import, dialog-accept, dialog-dismiss -**Inspect:** js, eval, css, attrs, is, console, console --errors, network, dialog, cookies, storage, perf -**Visual:** screenshot, pdf, responsive -**Compare:** diff -**Multi-step:** chain (pipe JSON array) -**Tabs:** tabs, tab, newtab, closetab -**Server:** status, stop, restart +### Navigation +| Command | Description | +|---------|-------------| +| `back` | History back | +| `forward` | History forward | +| `goto ` | Navigate to URL | +| `reload` | Reload page | +| `url` | Print current URL | + +### Reading +| Command | Description | +|---------|-------------| +| `accessibility` | Full ARIA tree | +| `forms` | Form fields as JSON | +| `html [selector]` | innerHTML | +| `links` | All links as "text → href" | +| `text` | Cleaned page text | + +### Interaction +| Command | Description | +|---------|-------------| +| `click ` | Click element | +| `cookie` | Set cookie | +| `cookie-import ` | Import cookies from JSON file | +| `cookie-import-browser [browser] [--domain d]` | Import cookies from real 
browser (opens picker UI, or direct with --domain) | +| `dialog-accept [text]` | Auto-accept next alert/confirm/prompt | +| `dialog-dismiss` | Auto-dismiss next dialog | +| `fill ` | Fill input | +| `header ` | Set custom request header | +| `hover ` | Hover element | +| `press ` | Press key (Enter, Tab, Escape, etc.) | +| `scroll [sel]` | Scroll element into view | +| `select ` | Select dropdown option | +| `type ` | Type into focused element | +| `upload ` | Upload file(s) | +| `useragent ` | Set user agent | +| `viewport ` | Set viewport size | +| `wait ` | Wait for element/condition | + +### Inspection +| Command | Description | +|---------|-------------| +| `attrs ` | Element attributes as JSON | +| `console [--clear|--errors]` | Console messages (--errors filters to error/warning) | +| `cookies` | All cookies as JSON | +| `css ` | Computed CSS value | +| `dialog [--clear]` | Dialog messages | +| `eval ` | Run JS file | +| `is ` | State check (visible/hidden/enabled/disabled/checked/editable/focused) | +| `js ` | Run JavaScript | +| `network [--clear]` | Network requests | +| `perf` | Page load timings | +| `storage [set k v]` | localStorage + sessionStorage | + +### Visual +| Command | Description | +|---------|-------------| +| `diff ` | Text diff between pages | +| `pdf [path]` | Save as PDF | +| `responsive [prefix]` | Mobile/tablet/desktop screenshots | +| `screenshot [path]` | Save screenshot | + +### Snapshot +| Command | Description | +|---------|-------------| +| `snapshot [flags]` | Accessibility tree with @refs | + +### Meta +| Command | Description | +|---------|-------------| +| `chain` | Multi-command from JSON stdin | + +### Tabs +| Command | Description | +|---------|-------------| +| `closetab [id]` | Close tab | +| `newtab [url]` | Open new tab | +| `tab ` | Switch to tab | +| `tabs` | List open tabs | + +### Server +| Command | Description | +|---------|-------------| +| `restart` | Restart server | +| `status` | Health check | +| `stop` | 
Shutdown server | diff --git a/browse/SKILL.md.tmpl b/browse/SKILL.md.tmpl new file mode 100644 index 0000000000000000000000000000000000000000..f0fd0284ad4f6277bb37514334b596da5425f511 --- /dev/null +++ b/browse/SKILL.md.tmpl @@ -0,0 +1,106 @@ +--- +name: browse +version: 1.1.0 +description: | + Fast headless browser for QA testing and site dogfooding. Navigate any URL, interact with + elements, verify page state, diff before/after actions, take annotated screenshots, check + responsive layouts, test forms and uploads, handle dialogs, and assert element states. + ~100ms per command. Use when you need to test a feature, verify a deployment, dogfood a + user flow, or file a bug with evidence. +allowed-tools: + - Bash + - Read + +--- + +# browse: QA Testing & Dogfooding + +Persistent headless Chromium. First call auto-starts (~3s), then ~100ms per command. +State persists between calls (cookies, tabs, login sessions). + +## Core QA Patterns + +### 1. Verify a page loads correctly +```bash +$B goto https://yourapp.com +$B text # content loads? +$B console # JS errors? +$B network # failed requests? +$B is visible ".main-content" # key elements present? +``` + +### 2. Test a user flow +```bash +$B goto https://app.com/login +$B snapshot -i # see all interactive elements +$B fill @e3 "user@test.com" +$B fill @e4 "password" +$B click @e5 # submit +$B snapshot -D # diff: what changed after submit? +$B is visible ".dashboard" # success state present? +``` + +### 3. Verify an action worked +```bash +$B snapshot # baseline +$B click @e3 # do something +$B snapshot -D # unified diff shows exactly what changed +``` + +### 4. Visual evidence for bug reports +```bash +$B snapshot -i -a -o /tmp/annotated.png # labeled screenshot +$B screenshot /tmp/bug.png # plain screenshot +$B console # error log +``` + +### 5. 
Find all clickable elements (including non-ARIA) +```bash +$B snapshot -C # finds divs with cursor:pointer, onclick, tabindex +$B click @c1 # interact with them +``` + +### 6. Assert element states +```bash +$B is visible ".modal" +$B is enabled "#submit-btn" +$B is disabled "#submit-btn" +$B is checked "#agree-checkbox" +$B is editable "#name-field" +$B is focused "#search-input" +$B js "document.body.textContent.includes('Success')" +``` + +### 7. Test responsive layouts +```bash +$B responsive /tmp/layout # mobile + tablet + desktop screenshots +$B viewport 375x812 # or set specific viewport +$B screenshot /tmp/mobile.png +``` + +### 8. Test file uploads +```bash +$B upload "#file-input" /path/to/file.pdf +$B is visible ".upload-success" +``` + +### 9. Test dialogs +```bash +$B dialog-accept "yes" # set up handler +$B click "#delete-button" # trigger dialog +$B dialog # see what appeared +$B snapshot -D # verify deletion happened +``` + +### 10. Compare environments +```bash +$B diff https://staging.app.com https://prod.app.com +``` + +## Snapshot Flags + +{{SNAPSHOT_FLAGS}} + +## Full Command List + +{{COMMAND_REFERENCE}} diff --git a/browse/src/commands.ts b/browse/src/commands.ts new file mode 100644 index 0000000000000000000000000000000000000000..c3189ace3d983517d69ecbc681a105cba85a15e0 --- /dev/null +++ b/browse/src/commands.ts @@ -0,0 +1,107 @@ +/** + * Command registry — single source of truth for all browse commands. + * + * Dependency graph: + * commands.ts ──▶ server.ts (runtime dispatch) + * ──▶ gen-skill-docs.ts (doc generation) + * ──▶ skill-parser.ts (validation) + * ──▶ skill-check.ts (health reporting) + * + * Zero side effects. Safe to import from build scripts and tests. 
+ */ + +export const READ_COMMANDS = new Set([ + 'text', 'html', 'links', 'forms', 'accessibility', + 'js', 'eval', 'css', 'attrs', + 'console', 'network', 'cookies', 'storage', 'perf', + 'dialog', 'is', +]); + +export const WRITE_COMMANDS = new Set([ + 'goto', 'back', 'forward', 'reload', + 'click', 'fill', 'select', 'hover', 'type', 'press', 'scroll', 'wait', + 'viewport', 'cookie', 'cookie-import', 'cookie-import-browser', 'header', 'useragent', + 'upload', 'dialog-accept', 'dialog-dismiss', +]); + +export const META_COMMANDS = new Set([ + 'tabs', 'tab', 'newtab', 'closetab', + 'status', 'stop', 'restart', + 'screenshot', 'pdf', 'responsive', + 'chain', 'diff', + 'url', 'snapshot', +]); + +export const ALL_COMMANDS = new Set([...READ_COMMANDS, ...WRITE_COMMANDS, ...META_COMMANDS]); + +export const COMMAND_DESCRIPTIONS: Record = { + // Navigation + 'goto': { category: 'Navigation', description: 'Navigate to URL', usage: 'goto ' }, + 'back': { category: 'Navigation', description: 'History back' }, + 'forward': { category: 'Navigation', description: 'History forward' }, + 'reload': { category: 'Navigation', description: 'Reload page' }, + 'url': { category: 'Navigation', description: 'Print current URL' }, + // Reading + 'text': { category: 'Reading', description: 'Cleaned page text' }, + 'html': { category: 'Reading', description: 'innerHTML', usage: 'html [selector]' }, + 'links': { category: 'Reading', description: 'All links as "text → href"' }, + 'forms': { category: 'Reading', description: 'Form fields as JSON' }, + 'accessibility': { category: 'Reading', description: 'Full ARIA tree' }, + // Inspection + 'js': { category: 'Inspection', description: 'Run JavaScript', usage: 'js ' }, + 'eval': { category: 'Inspection', description: 'Run JS file', usage: 'eval ' }, + 'css': { category: 'Inspection', description: 'Computed CSS value', usage: 'css ' }, + 'attrs': { category: 'Inspection', description: 'Element attributes as JSON', usage: 'attrs ' }, + 'is': { 
category: 'Inspection', description: 'State check (visible/hidden/enabled/disabled/checked/editable/focused)', usage: 'is ' }, + 'console': { category: 'Inspection', description: 'Console messages (--errors filters to error/warning)', usage: 'console [--clear|--errors]' }, + 'network': { category: 'Inspection', description: 'Network requests', usage: 'network [--clear]' }, + 'dialog': { category: 'Inspection', description: 'Dialog messages', usage: 'dialog [--clear]' }, + 'cookies': { category: 'Inspection', description: 'All cookies as JSON' }, + 'storage': { category: 'Inspection', description: 'localStorage + sessionStorage', usage: 'storage [set k v]' }, + 'perf': { category: 'Inspection', description: 'Page load timings' }, + // Interaction + 'click': { category: 'Interaction', description: 'Click element', usage: 'click ' }, + 'fill': { category: 'Interaction', description: 'Fill input', usage: 'fill ' }, + 'select': { category: 'Interaction', description: 'Select dropdown option', usage: 'select ' }, + 'hover': { category: 'Interaction', description: 'Hover element', usage: 'hover ' }, + 'type': { category: 'Interaction', description: 'Type into focused element', usage: 'type ' }, + 'press': { category: 'Interaction', description: 'Press key (Enter, Tab, Escape, etc.)', usage: 'press ' }, + 'scroll': { category: 'Interaction', description: 'Scroll element into view', usage: 'scroll [sel]' }, + 'wait': { category: 'Interaction', description: 'Wait for element/condition', usage: 'wait ' }, + 'upload': { category: 'Interaction', description: 'Upload file(s)', usage: 'upload ' }, + 'viewport':{ category: 'Interaction', description: 'Set viewport size', usage: 'viewport ' }, + 'cookie': { category: 'Interaction', description: 'Set cookie' }, + 'cookie-import': { category: 'Interaction', description: 'Import cookies from JSON file', usage: 'cookie-import ' }, + 'cookie-import-browser': { category: 'Interaction', description: 'Import cookies from real browser 
(opens picker UI, or direct with --domain)', usage: 'cookie-import-browser [browser] [--domain d]' }, + 'header': { category: 'Interaction', description: 'Set custom request header', usage: 'header ' }, + 'useragent': { category: 'Interaction', description: 'Set user agent', usage: 'useragent ' }, + 'dialog-accept': { category: 'Interaction', description: 'Auto-accept next alert/confirm/prompt', usage: 'dialog-accept [text]' }, + 'dialog-dismiss': { category: 'Interaction', description: 'Auto-dismiss next dialog' }, + // Visual + 'screenshot': { category: 'Visual', description: 'Save screenshot', usage: 'screenshot [path]' }, + 'pdf': { category: 'Visual', description: 'Save as PDF', usage: 'pdf [path]' }, + 'responsive': { category: 'Visual', description: 'Mobile/tablet/desktop screenshots', usage: 'responsive [prefix]' }, + 'diff': { category: 'Visual', description: 'Text diff between pages', usage: 'diff ' }, + // Tabs + 'tabs': { category: 'Tabs', description: 'List open tabs' }, + 'tab': { category: 'Tabs', description: 'Switch to tab', usage: 'tab ' }, + 'newtab': { category: 'Tabs', description: 'Open new tab', usage: 'newtab [url]' }, + 'closetab':{ category: 'Tabs', description: 'Close tab', usage: 'closetab [id]' }, + // Server + 'status': { category: 'Server', description: 'Health check' }, + 'stop': { category: 'Server', description: 'Shutdown server' }, + 'restart': { category: 'Server', description: 'Restart server' }, + // Meta + 'snapshot':{ category: 'Snapshot', description: 'Accessibility tree with @refs', usage: 'snapshot [flags]' }, + 'chain': { category: 'Meta', description: 'Multi-command from JSON stdin' }, +}; + +// Load-time validation: descriptions must cover exactly the command sets +const allCmds = new Set([...READ_COMMANDS, ...WRITE_COMMANDS, ...META_COMMANDS]); +const descKeys = new Set(Object.keys(COMMAND_DESCRIPTIONS)); +for (const cmd of allCmds) { + if (!descKeys.has(cmd)) throw new Error(`COMMAND_DESCRIPTIONS missing entry for: 
${cmd}`); +} +for (const key of descKeys) { + if (!allCmds.has(key)) throw new Error(`COMMAND_DESCRIPTIONS has unknown command: ${key}`); +} diff --git a/browse/src/server.ts b/browse/src/server.ts index 588681390c5acd5fc4cecc197b5403be0d1df493..580bd67e75178a41f1db40878be57675218ae308 100644 --- a/browse/src/server.ts +++ b/browse/src/server.ts @@ -110,28 +110,9 @@ const idleCheckInterval = setInterval(() => { } }, 60_000); -// ─── Command Sets (exported for chain command) ────────────────── -export const READ_COMMANDS = new Set([ - 'text', 'html', 'links', 'forms', 'accessibility', - 'js', 'eval', 'css', 'attrs', - 'console', 'network', 'cookies', 'storage', 'perf', - 'dialog', 'is', -]); - -export const WRITE_COMMANDS = new Set([ - 'goto', 'back', 'forward', 'reload', - 'click', 'fill', 'select', 'hover', 'type', 'press', 'scroll', 'wait', - 'viewport', 'cookie', 'cookie-import', 'cookie-import-browser', 'header', 'useragent', - 'upload', 'dialog-accept', 'dialog-dismiss', -]); - -export const META_COMMANDS = new Set([ - 'tabs', 'tab', 'newtab', 'closetab', - 'status', 'stop', 'restart', - 'screenshot', 'pdf', 'responsive', - 'chain', 'diff', - 'url', 'snapshot', -]); +// ─── Command Sets (from commands.ts — single source of truth) ─── +import { READ_COMMANDS, WRITE_COMMANDS, META_COMMANDS } from './commands'; +export { READ_COMMANDS, WRITE_COMMANDS, META_COMMANDS }; // ─── Server ──────────────────────────────────────────────────── const browserManager = new BrowserManager(); diff --git a/browse/src/snapshot.ts b/browse/src/snapshot.ts index b0c7b80f837cac950bca6d0182dc75f2ce016d25..d3a84b5e3ec1b709cffaa5dcfefdf7bab92c9635 100644 --- a/browse/src/snapshot.ts +++ b/browse/src/snapshot.ts @@ -40,6 +40,31 @@ interface SnapshotOptions { cursorInteractive?: boolean; // -C / --cursor-interactive: scan cursor:pointer etc. } +/** + * Snapshot flag metadata — single source of truth for CLI parsing and doc generation. 
+ * + * Imported by: + * - gen-skill-docs.ts (generates {{SNAPSHOT_FLAGS}} tables) + * - skill-parser.ts (validates flags in SKILL.md examples) + */ +export const SNAPSHOT_FLAGS: Array<{ + short: string; + long: string; + description: string; + takesValue?: boolean; + valueHint?: string; + optionKey: keyof SnapshotOptions; +}> = [ + { short: '-i', long: '--interactive', description: 'Interactive elements only (buttons, links, inputs) with @e refs', optionKey: 'interactive' }, + { short: '-c', long: '--compact', description: 'Compact (no empty structural nodes)', optionKey: 'compact' }, + { short: '-d', long: '--depth', description: 'Limit depth', takesValue: true, valueHint: '', optionKey: 'depth' }, + { short: '-s', long: '--selector', description: 'Scope to CSS selector', takesValue: true, valueHint: '', optionKey: 'selector' }, + { short: '-D', long: '--diff', description: 'Diff against previous snapshot (what changed?)', optionKey: 'diff' }, + { short: '-a', long: '--annotate', description: 'Annotated screenshot with ref labels', optionKey: 'annotate' }, + { short: '-o', long: '--output', description: 'Output path for screenshot', takesValue: true, valueHint: '', optionKey: 'outputPath' }, + { short: '-C', long: '--cursor-interactive', description: 'Cursor-interactive elements (@c refs — divs with pointer, onclick)', optionKey: 'cursorInteractive' }, +]; + interface ParsedNode { indent: number; role: string; @@ -50,49 +75,24 @@ interface ParsedNode { } /** - * Parse CLI args into SnapshotOptions + * Parse CLI args into SnapshotOptions — driven by SNAPSHOT_FLAGS metadata. 
*/ export function parseSnapshotArgs(args: string[]): SnapshotOptions { const opts: SnapshotOptions = {}; for (let i = 0; i < args.length; i++) { - switch (args[i]) { - case '-i': - case '--interactive': - opts.interactive = true; - break; - case '-c': - case '--compact': - opts.compact = true; - break; - case '-d': - case '--depth': - opts.depth = parseInt(args[++i], 10); + const flag = SNAPSHOT_FLAGS.find(f => f.short === args[i] || f.long === args[i]); + if (!flag) throw new Error(`Unknown snapshot flag: ${args[i]}`); + if (flag.takesValue) { + const value = args[++i]; + if (!value) throw new Error(`Usage: snapshot ${flag.short} `); + if (flag.optionKey === 'depth') { + (opts as any)[flag.optionKey] = parseInt(value, 10); if (isNaN(opts.depth!)) throw new Error('Usage: snapshot -d '); - break; - case '-s': - case '--selector': - opts.selector = args[++i]; - if (!opts.selector) throw new Error('Usage: snapshot -s '); - break; - case '-D': - case '--diff': - opts.diff = true; - break; - case '-a': - case '--annotate': - opts.annotate = true; - break; - case '-o': - case '--output': - opts.outputPath = args[++i]; - if (!opts.outputPath) throw new Error('Usage: snapshot -o '); - break; - case '-C': - case '--cursor-interactive': - opts.cursorInteractive = true; - break; - default: - throw new Error(`Unknown snapshot flag: ${args[i]}`); + } else { + (opts as any)[flag.optionKey] = value; + } + } else { + (opts as any)[flag.optionKey] = true; } } return opts; diff --git a/conductor.json b/conductor.json new file mode 100644 index 0000000000000000000000000000000000000000..68f7fee057900e58fcfd5b68d90ebd1c3ffbb4ce --- /dev/null +++ b/conductor.json @@ -0,0 +1,6 @@ +{ + "scripts": { + "setup": "bin/dev-setup", + "archive": "bin/dev-teardown" + } +} diff --git a/package.json b/package.json index bece501c10f56636052c5c341a4b64e99d0c54af..97614d231ecb54d956fc8fbbfc2ebb39652b1ce3 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "gstack", - "version": 
"0.3.2", + "version": "0.3.3", "description": "Garry's Stack — Claude Code skills + fast headless browser. One repo, one install, entire AI engineering workflow.", "license": "MIT", "type": "module", @@ -8,10 +8,16 @@ "browse": "./browse/dist/browse" }, "scripts": { - "build": "bun build --compile browse/src/cli.ts --outfile browse/dist/browse && bun build --compile browse/src/find-browse.ts --outfile browse/dist/find-browse && git rev-parse HEAD > browse/dist/.version && rm -f .*.bun-build", + "build": "bun run gen:skill-docs && bun build --compile browse/src/cli.ts --outfile browse/dist/browse && bun build --compile browse/src/find-browse.ts --outfile browse/dist/find-browse && git rev-parse HEAD > browse/dist/.version && rm -f .*.bun-build", + "gen:skill-docs": "bun run scripts/gen-skill-docs.ts", "dev": "bun run browse/src/cli.ts", "server": "bun run browse/src/server.ts", - "test": "bun test", + "test": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts", + "test:e2e": "SKILL_E2E=1 bun test test/skill-e2e.test.ts", + "test:eval": "bun test test/skill-llm-eval.test.ts", + "test:all": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts && SKILL_E2E=1 bun test test/skill-e2e.test.ts", + "skill:check": "bun run scripts/skill-check.ts", + "dev:skill": "bun run scripts/dev-skill.ts", "start": "bun run browse/src/server.ts" }, "dependencies": { @@ -30,5 +36,9 @@ "claude", "ai-agent", "devtools" - ] + ], + "devDependencies": { + "@anthropic-ai/claude-agent-sdk": "^0.2.75", + "@anthropic-ai/sdk": "^0.78.0" + } } diff --git a/scripts/dev-skill.ts b/scripts/dev-skill.ts new file mode 100644 index 0000000000000000000000000000000000000000..1842c837cb404565c527e478b7305f706b8e2729 --- /dev/null +++ b/scripts/dev-skill.ts @@ -0,0 +1,82 @@ +#!/usr/bin/env bun +/** + * dev:skill — Watch mode for SKILL.md template development. 
+ * + * Watches .tmpl files, regenerates SKILL.md files on change, + * validates all $B commands immediately. + */ + +import { validateSkill } from '../test/helpers/skill-parser'; +import { execSync } from 'child_process'; +import * as fs from 'fs'; +import * as path from 'path'; + +const ROOT = path.resolve(import.meta.dir, '..'); + +const TEMPLATES = [ + { tmpl: path.join(ROOT, 'SKILL.md.tmpl'), output: 'SKILL.md' }, + { tmpl: path.join(ROOT, 'browse', 'SKILL.md.tmpl'), output: 'browse/SKILL.md' }, +]; + +function regenerateAndValidate() { + // Regenerate + try { + execSync('bun run scripts/gen-skill-docs.ts', { cwd: ROOT, stdio: 'pipe' }); + } catch (err: any) { + console.log(` [gen] ERROR: ${err.stderr?.toString().trim() || err.message}`); + return; + } + + // Validate each generated file + for (const { output } of TEMPLATES) { + const fullPath = path.join(ROOT, output); + if (!fs.existsSync(fullPath)) continue; + + const result = validateSkill(fullPath); + const totalValid = result.valid.length; + const totalInvalid = result.invalid.length; + const totalSnapErrors = result.snapshotFlagErrors.length; + + if (totalInvalid > 0 || totalSnapErrors > 0) { + console.log(` [check] \u274c ${output} (${totalValid} valid)`); + for (const inv of result.invalid) { + console.log(` Unknown command: '${inv.command}' at line ${inv.line}`); + } + for (const se of result.snapshotFlagErrors) { + console.log(` ${se.error} at line ${se.command.line}`); + } + } else { + console.log(` [check] \u2705 ${output} — ${totalValid} commands, all valid`); + } + } +} + +// Initial run +console.log(' [watch] Watching *.md.tmpl files...'); +regenerateAndValidate(); + +// Watch for changes +for (const { tmpl } of TEMPLATES) { + if (!fs.existsSync(tmpl)) continue; + fs.watch(tmpl, () => { + console.log(`\n [watch] ${path.relative(ROOT, tmpl)} changed`); + regenerateAndValidate(); + }); +} + +// Also watch commands.ts and snapshot.ts (source of truth changes) +const SOURCE_FILES = [ + 
path.join(ROOT, 'browse', 'src', 'commands.ts'), + path.join(ROOT, 'browse', 'src', 'snapshot.ts'), +]; + +for (const src of SOURCE_FILES) { + if (!fs.existsSync(src)) continue; + fs.watch(src, () => { + console.log(`\n [watch] ${path.relative(ROOT, src)} changed`); + regenerateAndValidate(); + }); +} + +// Keep alive +console.log(' [watch] Press Ctrl+C to stop\n'); diff --git a/scripts/gen-skill-docs.ts b/scripts/gen-skill-docs.ts new file mode 100644 index 0000000000000000000000000000000000000000..19b680004c1764a265435dd39298cbc26fb252e1 --- /dev/null +++ b/scripts/gen-skill-docs.ts @@ -0,0 +1,163 @@ +#!/usr/bin/env bun +/** + * Generate SKILL.md files from .tmpl templates. + * + * Pipeline: + * read .tmpl → find {{PLACEHOLDERS}} → resolve from source → format → write .md + * + * Supports --dry-run: generate to memory, exit 1 if different from committed file. + * Used by skill:check and CI freshness checks. + */ + +import { COMMAND_DESCRIPTIONS } from '../browse/src/commands'; +import { SNAPSHOT_FLAGS } from '../browse/src/snapshot'; +import * as fs from 'fs'; +import * as path from 'path'; + +const ROOT = path.resolve(import.meta.dir, '..'); +const DRY_RUN = process.argv.includes('--dry-run'); + +// ─── Placeholder Resolvers ────────────────────────────────── + +function generateCommandReference(): string { + // Group commands by category + const groups = new Map>(); + for (const [cmd, meta] of Object.entries(COMMAND_DESCRIPTIONS)) { + const list = groups.get(meta.category) || []; + list.push({ command: cmd, description: meta.description, usage: meta.usage }); + groups.set(meta.category, list); + } + + // Category display order + const categoryOrder = [ + 'Navigation', 'Reading', 'Interaction', 'Inspection', + 'Visual', 'Snapshot', 'Meta', 'Tabs', 'Server', + ]; + + const sections: string[] = []; + for (const category of categoryOrder) { + const commands = groups.get(category); + if (!commands || commands.length === 0) continue; + + // Sort alphabetically within 
category + commands.sort((a, b) => a.command.localeCompare(b.command)); + + sections.push(`### ${category}`); + sections.push('| Command | Description |'); + sections.push('|---------|-------------|'); + for (const cmd of commands) { + const display = cmd.usage ? `\`${cmd.usage}\`` : `\`${cmd.command}\``; + sections.push(`| ${display} | ${cmd.description} |`); + } + sections.push(''); + } + + return sections.join('\n').trimEnd(); +} + +function generateSnapshotFlags(): string { + const lines: string[] = [ + 'The snapshot is your primary tool for understanding and interacting with pages.', + '', + '```', + ]; + + for (const flag of SNAPSHOT_FLAGS) { + const label = flag.valueHint ? `${flag.short} ${flag.valueHint}` : flag.short; + lines.push(`${label.padEnd(10)}${flag.description}`); + } + + lines.push('```'); + lines.push(''); + lines.push('Combine flags: `$B snapshot -i -a -C -o /tmp/annotated.png`'); + lines.push(''); + lines.push('After snapshot, use @refs everywhere:'); + lines.push('```bash'); + lines.push('$B click @e3 $B fill @e4 "value" $B hover @e1'); + lines.push('$B html @e2 $B css @e5 "color" $B attrs @e6'); + lines.push('$B click @c1 # cursor-interactive ref (from -C)'); + lines.push('```'); + lines.push(''); + lines.push('Refs are invalidated on navigation — run `snapshot` again after `goto`.'); + + return lines.join('\n'); +} + +const RESOLVERS: Record string> = { + COMMAND_REFERENCE: generateCommandReference, + SNAPSHOT_FLAGS: generateSnapshotFlags, +}; + +// ─── Template Processing ──────────────────────────────────── + +const GENERATED_HEADER = `\n\n`; + +function processTemplate(tmplPath: string): { outputPath: string; content: string } { + const tmplContent = fs.readFileSync(tmplPath, 'utf-8'); + const relTmplPath = path.relative(ROOT, tmplPath); + const outputPath = tmplPath.replace(/\.tmpl$/, ''); + + // Replace placeholders + let content = tmplContent.replace(/\{\{(\w+)\}\}/g, (match, name) => { + const resolver = RESOLVERS[name]; + if 
(!resolver) throw new Error(`Unknown placeholder {{${name}}} in ${relTmplPath}`); + return resolver(); + }); + + // Check for any remaining unresolved placeholders + const remaining = content.match(/\{\{(\w+)\}\}/g); + if (remaining) { + throw new Error(`Unresolved placeholders in ${relTmplPath}: ${remaining.join(', ')}`); + } + + // Prepend generated header (after frontmatter) + const header = GENERATED_HEADER.replace('{{SOURCE}}', path.basename(tmplPath)); + const fmEnd = content.indexOf('---', content.indexOf('---') + 3); + if (fmEnd !== -1) { + const insertAt = content.indexOf('\n', fmEnd) + 1; + content = content.slice(0, insertAt) + header + content.slice(insertAt); + } else { + content = header + content; + } + + return { outputPath, content }; +} + +// ─── Main ─────────────────────────────────────────────────── + +function findTemplates(): string[] { + const templates: string[] = []; + const candidates = [ + path.join(ROOT, 'SKILL.md.tmpl'), + path.join(ROOT, 'browse', 'SKILL.md.tmpl'), + ]; + for (const p of candidates) { + if (fs.existsSync(p)) templates.push(p); + } + return templates; +} + +let hasChanges = false; + +for (const tmplPath of findTemplates()) { + const { outputPath, content } = processTemplate(tmplPath); + const relOutput = path.relative(ROOT, outputPath); + + if (DRY_RUN) { + const existing = fs.existsSync(outputPath) ? fs.readFileSync(outputPath, 'utf-8') : ''; + if (existing !== content) { + console.log(`STALE: ${relOutput}`); + hasChanges = true; + } else { + console.log(`FRESH: ${relOutput}`); + } + } else { + fs.writeFileSync(outputPath, content); + console.log(`GENERATED: ${relOutput}`); + } +} + +if (DRY_RUN && hasChanges) { + console.error('\nGenerated SKILL.md files are stale. 
Run: bun run gen:skill-docs'); + process.exit(1); +} diff --git a/scripts/skill-check.ts b/scripts/skill-check.ts new file mode 100644 index 0000000000000000000000000000000000000000..fd10b529f7a2e40622f3fa218f3fb444cdcafdf1 --- /dev/null +++ b/scripts/skill-check.ts @@ -0,0 +1,111 @@ +#!/usr/bin/env bun +/** + * skill:check — Health summary for all SKILL.md files. + * + * Reports: + * - Command validation (valid/invalid/snapshot errors) + * - Template coverage (which SKILL.md files have .tmpl sources) + * - Freshness check (generated files match committed files) + */ + +import { validateSkill } from '../test/helpers/skill-parser'; +import * as fs from 'fs'; +import * as path from 'path'; +import { execSync } from 'child_process'; + +const ROOT = path.resolve(import.meta.dir, '..'); + +// Find all SKILL.md files +const SKILL_FILES = [ + 'SKILL.md', + 'browse/SKILL.md', + 'qa/SKILL.md', + 'ship/SKILL.md', + 'review/SKILL.md', + 'retro/SKILL.md', + 'plan-ceo-review/SKILL.md', + 'plan-eng-review/SKILL.md', + 'setup-browser-cookies/SKILL.md', +].filter(f => fs.existsSync(path.join(ROOT, f))); + +let hasErrors = false; + +// ─── Skills ───────────────────────────────────────────────── + +console.log(' Skills:'); +for (const file of SKILL_FILES) { + const fullPath = path.join(ROOT, file); + const result = validateSkill(fullPath); + + if (result.warnings.length > 0) { + console.log(` \u26a0\ufe0f ${file.padEnd(30)} — ${result.warnings.join(', ')}`); + continue; + } + + const totalValid = result.valid.length; + const totalInvalid = result.invalid.length; + const totalSnapErrors = result.snapshotFlagErrors.length; + + if (totalInvalid > 0 || totalSnapErrors > 0) { + hasErrors = true; + console.log(` \u274c ${file.padEnd(30)} — ${totalValid} valid, ${totalInvalid} invalid, ${totalSnapErrors} snapshot errors`); + for (const inv of result.invalid) { + console.log(` line ${inv.line}: unknown command '${inv.command}'`); + } + for (const se of result.snapshotFlagErrors) { + 
console.log(` line ${se.command.line}: ${se.error}`); + } + } else { + console.log(` \u2705 ${file.padEnd(30)} — ${totalValid} commands, all valid`); + } +} + +// ─── Templates ────────────────────────────────────────────── + +console.log('\n Templates:'); +const TEMPLATES = [ + { tmpl: 'SKILL.md.tmpl', output: 'SKILL.md' }, + { tmpl: 'browse/SKILL.md.tmpl', output: 'browse/SKILL.md' }, +]; + +for (const { tmpl, output } of TEMPLATES) { + const tmplPath = path.join(ROOT, tmpl); + const outPath = path.join(ROOT, output); + if (!fs.existsSync(tmplPath)) { + console.log(` \u26a0\ufe0f ${output.padEnd(30)} — no template`); + continue; + } + if (!fs.existsSync(outPath)) { + hasErrors = true; + console.log(` \u274c ${output.padEnd(30)} — generated file missing! Run: bun run gen:skill-docs`); + continue; + } + console.log(` \u2705 ${tmpl.padEnd(30)} \u2192 ${output}`); +} + +// Skills without templates +for (const file of SKILL_FILES) { + const tmplPath = path.join(ROOT, file + '.tmpl'); + if (!fs.existsSync(tmplPath) && !TEMPLATES.some(t => t.output === file)) { + console.log(` \u26a0\ufe0f ${file.padEnd(30)} — no template (OK if no $B commands)`); + } +} + +// ─── Freshness ────────────────────────────────────────────── + +console.log('\n Freshness:'); +try { + execSync('bun run scripts/gen-skill-docs.ts --dry-run', { cwd: ROOT, stdio: 'pipe' }); + console.log(' \u2705 All generated files are fresh'); +} catch (err: any) { + hasErrors = true; + const output = err.stdout?.toString() || ''; + console.log(' \u274c Generated files are stale:'); + for (const line of output.split('\n').filter((l: string) => l.startsWith('STALE'))) { + console.log(` ${line}`); + } + console.log(' Run: bun run gen:skill-docs'); +} + +console.log(''); +process.exit(hasErrors ? 
1 : 0); diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts new file mode 100644 index 0000000000000000000000000000000000000000..ce7c98ea712bf4247fd02b497fa7e7aacc1022da --- /dev/null +++ b/test/gen-skill-docs.test.ts @@ -0,0 +1,150 @@ +import { describe, test, expect } from 'bun:test'; +import { COMMAND_DESCRIPTIONS } from '../browse/src/commands'; +import { SNAPSHOT_FLAGS } from '../browse/src/snapshot'; +import * as fs from 'fs'; +import * as path from 'path'; + +const ROOT = path.resolve(import.meta.dir, '..'); + +describe('gen-skill-docs', () => { + test('generated SKILL.md contains all command categories', () => { + const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8'); + const categories = new Set(Object.values(COMMAND_DESCRIPTIONS).map(d => d.category)); + for (const cat of categories) { + expect(content).toContain(`### ${cat}`); + } + }); + + test('generated SKILL.md contains all commands', () => { + const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8'); + for (const [cmd, meta] of Object.entries(COMMAND_DESCRIPTIONS)) { + const display = meta.usage || cmd; + expect(content).toContain(display); + } + }); + + test('command table is sorted alphabetically within categories', () => { + const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8'); + // Extract command names from the Navigation section as a test + const navSection = content.match(/### Navigation\n\|.*\n\|.*\n([\s\S]*?)(?=\n###|\n## )/); + expect(navSection).not.toBeNull(); + const rows = navSection![1].trim().split('\n'); + const commands = rows.map(r => { + const match = r.match(/\| `(\w+)/); + return match ? 
match[1] : ''; + }).filter(Boolean); + const sorted = [...commands].sort(); + expect(commands).toEqual(sorted); + }); + + test('generated header is present in SKILL.md', () => { + const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8'); + expect(content).toContain('AUTO-GENERATED from SKILL.md.tmpl'); + expect(content).toContain('Regenerate: bun run gen:skill-docs'); + }); + + test('generated header is present in browse/SKILL.md', () => { + const content = fs.readFileSync(path.join(ROOT, 'browse', 'SKILL.md'), 'utf-8'); + expect(content).toContain('AUTO-GENERATED from SKILL.md.tmpl'); + }); + + test('snapshot flags section contains all flags', () => { + const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8'); + for (const flag of SNAPSHOT_FLAGS) { + expect(content).toContain(flag.short); + expect(content).toContain(flag.description); + } + }); + + test('template files exist for generated SKILL.md files', () => { + expect(fs.existsSync(path.join(ROOT, 'SKILL.md.tmpl'))).toBe(true); + expect(fs.existsSync(path.join(ROOT, 'browse', 'SKILL.md.tmpl'))).toBe(true); + }); + + test('templates contain placeholders', () => { + const rootTmpl = fs.readFileSync(path.join(ROOT, 'SKILL.md.tmpl'), 'utf-8'); + expect(rootTmpl).toContain('{{COMMAND_REFERENCE}}'); + expect(rootTmpl).toContain('{{SNAPSHOT_FLAGS}}'); + + const browseTmpl = fs.readFileSync(path.join(ROOT, 'browse', 'SKILL.md.tmpl'), 'utf-8'); + expect(browseTmpl).toContain('{{COMMAND_REFERENCE}}'); + expect(browseTmpl).toContain('{{SNAPSHOT_FLAGS}}'); + }); +}); + +/** + * Quality evals — catch description regressions. + * + * These test that generated output is *useful for an AI agent*, + * not just structurally valid. Each test targets a specific + * regression we actually shipped and caught in review. 
+ */ +describe('description quality evals', () => { + // Regression: snapshot flags lost value hints (-d , -s , -o ) + test('snapshot flags with values include value hints in output', () => { + const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8'); + for (const flag of SNAPSHOT_FLAGS) { + if (flag.takesValue) { + expect(flag.valueHint).toBeDefined(); + expect(content).toContain(`${flag.short} ${flag.valueHint}`); + } + } + }); + + // Regression: "is" lost the valid states enum + test('is command lists valid state values', () => { + const desc = COMMAND_DESCRIPTIONS['is'].description; + for (const state of ['visible', 'hidden', 'enabled', 'disabled', 'checked', 'editable', 'focused']) { + expect(desc).toContain(state); + } + }); + + // Regression: "press" lost common key examples + test('press command lists example keys', () => { + const desc = COMMAND_DESCRIPTIONS['press'].description; + expect(desc).toContain('Enter'); + expect(desc).toContain('Tab'); + expect(desc).toContain('Escape'); + }); + + // Regression: "console" lost --errors filter note + test('console command describes --errors behavior', () => { + const desc = COMMAND_DESCRIPTIONS['console'].description; + expect(desc).toContain('--errors'); + }); + + // Regression: snapshot -i lost "@e refs" context + test('snapshot -i mentions @e refs', () => { + const flag = SNAPSHOT_FLAGS.find(f => f.short === '-i')!; + expect(flag.description).toContain('@e'); + }); + + // Regression: snapshot -C lost "@c refs" context + test('snapshot -C mentions @c refs', () => { + const flag = SNAPSHOT_FLAGS.find(f => f.short === '-C')!; + expect(flag.description).toContain('@c'); + }); + + // Guard: every description must be at least 8 chars (catches empty or stub descriptions) + test('all command descriptions have meaningful length', () => { + for (const [cmd, meta] of Object.entries(COMMAND_DESCRIPTIONS)) { + expect(meta.description.length).toBeGreaterThanOrEqual(8); + } + }); + + // Guard: snapshot flag 
descriptions must be at least 10 chars + test('all snapshot flag descriptions have meaningful length', () => { + for (const flag of SNAPSHOT_FLAGS) { + expect(flag.description.length).toBeGreaterThanOrEqual(10); + } + }); + + // Guard: generated output uses → not -> + test('generated SKILL.md uses unicode arrows', () => { + const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8'); + // Check the Tips section specifically (where we regressed -> from →) + const tipsSection = content.slice(content.indexOf('## Tips')); + expect(tipsSection).toContain('→'); + expect(tipsSection).not.toContain('->'); + }); +}); diff --git a/test/helpers/session-runner.ts b/test/helpers/session-runner.ts new file mode 100644 index 0000000000000000000000000000000000000000..13e0b7eb2c035253d294d6ab4e05f4fb1894d882 --- /dev/null +++ b/test/helpers/session-runner.ts @@ -0,0 +1,160 @@ +/** + * Agent SDK wrapper for skill E2E testing. + * + * Spawns a Claude Code session, runs a prompt, collects messages, + * scans tool_result messages for browse errors. + */ + +import { query } from '@anthropic-ai/claude-agent-sdk'; +import * as fs from 'fs'; +import * as path from 'path'; + +export interface SkillTestResult { + messages: any[]; + toolCalls: Array<{ tool: string; input: any; output: string }>; + browseErrors: string[]; + exitReason: string; + duration: number; +} + +const BROWSE_ERROR_PATTERNS = [ + /Unknown command: \w+/, + /Unknown snapshot flag: .+/, + /Exit code 1/, + /ERROR: browse binary not found/, + /Server failed to start/, +]; + +export async function runSkillTest(options: { + prompt: string; + workingDirectory: string; + maxTurns?: number; + allowedTools?: string[]; + timeout?: number; +}): Promise { + // Fail fast if running inside an Agent SDK session — nested sessions hang + if (process.env.CLAUDECODE || process.env.CLAUDE_CODE_ENTRYPOINT) { + throw new Error( + 'Cannot run E2E skill tests inside a Claude Code session. 
' + + 'Run from a plain terminal: SKILL_E2E=1 bun test test/skill-e2e.test.ts' + ); + } + + const { + prompt, + workingDirectory, + maxTurns = 15, + allowedTools = ['Bash', 'Read', 'Write'], + timeout = 120_000, + } = options; + + const messages: any[] = []; + const toolCalls: SkillTestResult['toolCalls'] = []; + const browseErrors: string[] = []; + let exitReason = 'unknown'; + + const startTime = Date.now(); + + // Strip all Claude-related env vars to allow nested sessions. + // Without this, the child claude process thinks it's an SDK child + // and hangs waiting for parent IPC instead of running independently. + const env: Record = {}; + for (const [key] of Object.entries(process.env)) { + if (key.startsWith('CLAUDE') || key.startsWith('CLAUDECODE')) { + env[key] = undefined; + } + } + + const q = query({ + prompt, + options: { + cwd: workingDirectory, + allowedTools, + permissionMode: 'bypassPermissions', + allowDangerouslySkipPermissions: true, + maxTurns, + env, + }, + }); + + const timeoutPromise = new Promise((_, reject) => { + setTimeout(() => reject(new Error(`Skill test timed out after ${timeout}ms`)), timeout); + }); + + try { + const runner = (async () => { + for await (const msg of q) { + messages.push(msg); + + // Extract tool calls from assistant messages + if (msg.type === 'assistant' && msg.message?.content) { + for (const block of msg.message.content) { + if (block.type === 'tool_use') { + toolCalls.push({ + tool: block.name, + input: block.input, + output: '', // will be filled from tool_result + }); + } + // Scan tool_result blocks for browse errors + if (block.type === 'tool_result' || (typeof block === 'object' && 'text' in block)) { + const text = typeof block === 'string' ? 
block : (block as any).text || ''; + for (const pattern of BROWSE_ERROR_PATTERNS) { + if (pattern.test(text)) { + browseErrors.push(text.slice(0, 200)); + } + } + } + } + } + + // Also scan user messages (which contain tool results) + if (msg.type === 'user' && msg.message?.content) { + const content = Array.isArray(msg.message.content) ? msg.message.content : [msg.message.content]; + for (const block of content) { + const text = typeof block === 'string' ? block : (block as any)?.text || (block as any)?.content || ''; + if (typeof text === 'string') { + for (const pattern of BROWSE_ERROR_PATTERNS) { + if (pattern.test(text)) { + browseErrors.push(text.slice(0, 200)); + } + } + } + } + } + + // Capture result + if (msg.type === 'result') { + exitReason = msg.subtype || 'success'; + } + } + })(); + + await Promise.race([runner, timeoutPromise]); + } catch (err: any) { + exitReason = err.message?.includes('timed out') ? 'timeout' : `error: ${err.message}`; + } + + const duration = Date.now() - startTime; + + // Save transcript on failure + if (browseErrors.length > 0 || exitReason !== 'success') { + try { + const transcriptDir = path.join(workingDirectory, '.gstack', 'test-transcripts'); + fs.mkdirSync(transcriptDir, { recursive: true }); + const timestamp = new Date().toISOString().replace(/[:.]/g, '-'); + const transcriptPath = path.join(transcriptDir, `e2e-${timestamp}.json`); + fs.writeFileSync(transcriptPath, JSON.stringify({ + prompt, + exitReason, + browseErrors, + duration, + messages: messages.map(m => ({ type: m.type, subtype: m.subtype })), + }, null, 2)); + } catch { + // Transcript save failures are non-fatal + } + } + + return { messages, toolCalls, browseErrors, exitReason, duration }; +} diff --git a/test/helpers/skill-parser.ts b/test/helpers/skill-parser.ts new file mode 100644 index 0000000000000000000000000000000000000000..f7fdcb30973b441081dfb6116d04510b68de8e37 --- /dev/null +++ b/test/helpers/skill-parser.ts @@ -0,0 +1,133 @@ +/** + * SKILL.md 
parser and validator. + * + * Extracts $B commands from code blocks, validates them against + * the command registry and snapshot flags. + * + * Used by: + * - test/skill-validation.test.ts (Tier 1 static tests) + * - scripts/skill-check.ts (health summary) + * - scripts/dev-skill.ts (watch mode) + */ + +import { ALL_COMMANDS } from '../../browse/src/commands'; +import { parseSnapshotArgs } from '../../browse/src/snapshot'; +import * as fs from 'fs'; + +export interface BrowseCommand { + command: string; + args: string[]; + line: number; + raw: string; +} + +export interface ValidationResult { + valid: BrowseCommand[]; + invalid: BrowseCommand[]; + snapshotFlagErrors: Array<{ command: BrowseCommand; error: string }>; + warnings: string[]; +} + +/** + * Extract all $B invocations from bash code blocks in a SKILL.md file. + */ +export function extractBrowseCommands(skillPath: string): BrowseCommand[] { + const content = fs.readFileSync(skillPath, 'utf-8'); + const lines = content.split('\n'); + const commands: BrowseCommand[] = []; + + let inBashBlock = false; + + for (let i = 0; i < lines.length; i++) { + const line = lines[i]; + + // Detect code block boundaries + if (line.trimStart().startsWith('```')) { + if (inBashBlock) { + inBashBlock = false; + } else if (line.trimStart().startsWith('```bash')) { + inBashBlock = true; + } + // Non-bash code blocks (```json, ```, ```js, etc.) are skipped + continue; + } + + if (!inBashBlock) continue; + + // Match lines with $B command invocations + // Handle multiple $B commands on one line (e.g., "$B click @e3 $B fill @e4 "value"") + const matches = line.matchAll(/\$B\s+(\S+)(?:\s+([^\$]*))?/g); + for (const match of matches) { + const command = match[1]; + let argsStr = (match[2] || '').trim(); + + // Strip inline comments (# ...) 
— but not inside quotes + // Simple approach: remove everything from first unquoted # onward + let inQuote = false; + for (let j = 0; j < argsStr.length; j++) { + if (argsStr[j] === '"') inQuote = !inQuote; + if (argsStr[j] === '#' && !inQuote) { + argsStr = argsStr.slice(0, j).trim(); + break; + } + } + + // Parse args — handle quoted strings + const args: string[] = []; + if (argsStr) { + const argMatches = argsStr.matchAll(/"([^"]*)"|(\S+)/g); + for (const am of argMatches) { + args.push(am[1] ?? am[2]); + } + } + + commands.push({ + command, + args, + line: i + 1, // 1-based + raw: match[0].trim(), + }); + } + } + + return commands; +} + +/** + * Extract and validate all $B commands in a SKILL.md file. + */ +export function validateSkill(skillPath: string): ValidationResult { + const commands = extractBrowseCommands(skillPath); + const result: ValidationResult = { + valid: [], + invalid: [], + snapshotFlagErrors: [], + warnings: [], + }; + + if (commands.length === 0) { + result.warnings.push('no $B commands found'); + return result; + } + + for (const cmd of commands) { + if (!ALL_COMMANDS.has(cmd.command)) { + result.invalid.push(cmd); + continue; + } + + // Validate snapshot flags + if (cmd.command === 'snapshot' && cmd.args.length > 0) { + try { + parseSnapshotArgs(cmd.args); + } catch (err: any) { + result.snapshotFlagErrors.push({ command: cmd, error: err.message }); + continue; + } + } + + result.valid.push(cmd); + } + + return result; +} diff --git a/test/skill-e2e.test.ts b/test/skill-e2e.test.ts new file mode 100644 index 0000000000000000000000000000000000000000..d395fe155bf2086433b118b73de676fd2eb09ba1 --- /dev/null +++ b/test/skill-e2e.test.ts @@ -0,0 +1,79 @@ +import { describe, test, expect, beforeAll, afterAll } from 'bun:test'; +import { runSkillTest } from './helpers/session-runner'; +import { startTestServer } from '../browse/test/test-server'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; + +// Skip if 
SKILL_E2E not set, or if running inside a Claude Code / Agent SDK session +// (nested Agent SDK sessions hang because the parent intercepts child claude subprocesses) +const isInsideAgentSDK = !!process.env.CLAUDECODE || !!process.env.CLAUDE_CODE_ENTRYPOINT; +const describeE2E = (process.env.SKILL_E2E && !isInsideAgentSDK) ? describe : describe.skip; + +let testServer: ReturnType; +let tmpDir: string; + +describeE2E('Skill E2E tests', () => { + beforeAll(() => { + testServer = startTestServer(); + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-')); + + // Symlink browse binary into tmpdir for the skill to find + const browseBin = path.resolve(import.meta.dir, '..', 'browse', 'dist', 'browse'); + const binDir = path.join(tmpDir, 'browse', 'dist'); + fs.mkdirSync(binDir, { recursive: true }); + if (fs.existsSync(browseBin)) { + fs.symlinkSync(browseBin, path.join(binDir, 'browse')); + } + + // Also create browse/bin/find-browse so the SKILL.md setup works + const findBrowseDir = path.join(tmpDir, 'browse', 'bin'); + fs.mkdirSync(findBrowseDir, { recursive: true }); + fs.writeFileSync(path.join(findBrowseDir, 'find-browse'), `#!/bin/bash\necho "${browseBin}"\n`, { mode: 0o755 }); + }); + + afterAll(() => { + testServer?.server?.stop(); + // Clean up tmpdir + try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {} + }); + + test('browse basic commands work without errors', async () => { + const result = await runSkillTest({ + prompt: `You have a browse binary at ${path.resolve(import.meta.dir, '..', 'browse', 'dist', 'browse')}. Assign it to B variable and run these commands in sequence: +1. $B goto ${testServer.url} +2. $B snapshot -i +3. $B text +4. 
$B screenshot /tmp/skill-e2e-test.png +Report the results of each command.`, + workingDirectory: tmpDir, + maxTurns: 10, + timeout: 60_000, + }); + + expect(result.browseErrors).toHaveLength(0); + expect(result.exitReason).toBe('success'); + }, 90_000); + + test('browse snapshot flags all work', async () => { + const result = await runSkillTest({ + prompt: `You have a browse binary at ${path.resolve(import.meta.dir, '..', 'browse', 'dist', 'browse')}. Assign it to B variable and run: +1. $B goto ${testServer.url} +2. $B snapshot -i +3. $B snapshot -c +4. $B snapshot -D +5. $B snapshot -i -a -o /tmp/skill-e2e-annotated.png +Report what each command returned.`, + workingDirectory: tmpDir, + maxTurns: 10, + timeout: 60_000, + }); + + expect(result.browseErrors).toHaveLength(0); + expect(result.exitReason).toBe('success'); + }, 90_000); + + test.todo('/qa quick completes without browse errors'); + test.todo('/ship completes without browse errors'); + test.todo('/review completes without browse errors'); +}); diff --git a/test/skill-llm-eval.test.ts b/test/skill-llm-eval.test.ts new file mode 100644 index 0000000000000000000000000000000000000000..308de814e9a76d8ca4c33c44da53d8ea4874c9a5 --- /dev/null +++ b/test/skill-llm-eval.test.ts @@ -0,0 +1,194 @@ +/** + * LLM-as-a-Judge evals for generated SKILL.md quality. + * + * Uses the Anthropic API directly (not Agent SDK) to evaluate whether + * generated command docs are clear, complete, and actionable for an AI agent. + * + * Requires: ANTHROPIC_API_KEY env var + * Run: ANTHROPIC_API_KEY=sk-... bun test test/skill-llm-eval.test.ts + * + * Cost: ~$0.01-0.03 per run (haiku) + */ + +import { describe, test, expect } from 'bun:test'; +import Anthropic from '@anthropic-ai/sdk'; +import * as fs from 'fs'; +import * as path from 'path'; + +const ROOT = path.resolve(import.meta.dir, '..'); +const hasApiKey = !!process.env.ANTHROPIC_API_KEY; +const describeEval = hasApiKey ? 
describe : describe.skip; + +interface JudgeScore { + clarity: number; // 1-5: can an agent understand what each command does? + completeness: number; // 1-5: are all args, flags, valid values documented? + actionability: number; // 1-5: can an agent use this to construct correct commands? + reasoning: string; // why the scores were given +} + +async function judge(section: string, prompt: string): Promise { + const client = new Anthropic(); + + const response = await client.messages.create({ + model: 'claude-haiku-4-5-20251001', + max_tokens: 1024, + messages: [{ + role: 'user', + content: `You are evaluating documentation quality for an AI coding agent's CLI tool reference. + +The agent reads this documentation to learn how to use a headless browser CLI. It needs to: +1. Understand what each command does +2. Know what arguments to pass +3. Know valid values for enum-like parameters +4. Construct correct command invocations without guessing + +Rate the following ${section} on three dimensions (1-5 scale): + +- **clarity** (1-5): Can an agent understand what each command/flag does from the description alone? +- **completeness** (1-5): Are arguments, valid values, and important behaviors documented? Would an agent need to guess anything? +- **actionability** (1-5): Can an agent construct correct command invocations from this reference alone? + +Scoring guide: +- 5: Excellent — no ambiguity, all info present +- 4: Good — minor gaps an experienced agent could infer +- 3: Adequate — some guessing required +- 2: Poor — significant info missing +- 1: Unusable — agent would fail without external help + +Respond with ONLY valid JSON in this exact format: +{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"} + +Here is the ${section} to evaluate: + +${prompt}`, + }], + }); + + const text = response.content[0].type === 'text' ? 
response.content[0].text : ''; + // Extract JSON from response (handle markdown code blocks) + const jsonMatch = text.match(/\{[\s\S]*\}/); + if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`); + return JSON.parse(jsonMatch[0]) as JudgeScore; +} + +describeEval('LLM-as-judge quality evals', () => { + test('command reference table scores >= 4 on all dimensions', async () => { + const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8'); + // Extract just the command reference section + const start = content.indexOf('## Command Reference'); + const end = content.indexOf('## Tips'); + const section = content.slice(start, end); + + const scores = await judge('command reference table', section); + console.log('Command reference scores:', JSON.stringify(scores, null, 2)); + + expect(scores.clarity).toBeGreaterThanOrEqual(4); + expect(scores.completeness).toBeGreaterThanOrEqual(4); + expect(scores.actionability).toBeGreaterThanOrEqual(4); + }, 30_000); + + test('snapshot flags section scores >= 4 on all dimensions', async () => { + const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8'); + const start = content.indexOf('## Snapshot System'); + const end = content.indexOf('## Command Reference'); + const section = content.slice(start, end); + + const scores = await judge('snapshot flags reference', section); + console.log('Snapshot flags scores:', JSON.stringify(scores, null, 2)); + + expect(scores.clarity).toBeGreaterThanOrEqual(4); + expect(scores.completeness).toBeGreaterThanOrEqual(4); + expect(scores.actionability).toBeGreaterThanOrEqual(4); + }, 30_000); + + test('browse/SKILL.md overall scores >= 4', async () => { + const content = fs.readFileSync(path.join(ROOT, 'browse', 'SKILL.md'), 'utf-8'); + // Just the reference sections (skip examples/patterns) + const start = content.indexOf('## Snapshot Flags'); + const section = content.slice(start); + + const scores = await judge('browse skill reference (flags + 
commands)', section); + console.log('Browse SKILL.md scores:', JSON.stringify(scores, null, 2)); + + expect(scores.clarity).toBeGreaterThanOrEqual(4); + expect(scores.completeness).toBeGreaterThanOrEqual(4); + expect(scores.actionability).toBeGreaterThanOrEqual(4); + }, 30_000); + + test('regression check: compare branch vs baseline quality', async () => { + // This test compares the generated output against the hand-maintained + // baseline from main. The generated version should score equal or higher. + const generated = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8'); + const genStart = generated.indexOf('## Command Reference'); + const genEnd = generated.indexOf('## Tips'); + const genSection = generated.slice(genStart, genEnd); + + const baseline = `## Command Reference + +### Navigation +| Command | Description | +|---------|-------------| +| \`goto \` | Navigate to URL | +| \`back\` / \`forward\` | History navigation | +| \`reload\` | Reload page | +| \`url\` | Print current URL | + +### Interaction +| Command | Description | +|---------|-------------| +| \`click \` | Click element | +| \`fill \` | Fill input | +| \`select \` | Select dropdown | +| \`hover \` | Hover element | +| \`type \` | Type into focused element | +| \`press \` | Press key (Enter, Tab, Escape) | +| \`scroll [sel]\` | Scroll element into view | +| \`wait \` | Wait for element (max 10s) | +| \`wait --networkidle\` | Wait for network to be idle | +| \`wait --load\` | Wait for page load event | + +### Inspection +| Command | Description | +|---------|-------------| +| \`js \` | Run JavaScript | +| \`css \` | Computed CSS | +| \`attrs \` | Element attributes | +| \`is \` | State check (visible/hidden/enabled/disabled/checked/editable/focused) | +| \`console [--clear\\|--errors]\` | Console messages (--errors filters to error/warning) |`; + + const client = new Anthropic(); + const response = await client.messages.create({ + model: 'claude-haiku-4-5-20251001', + max_tokens: 1024, + 
messages: [{ + role: 'user', + content: `You are comparing two versions of CLI documentation for an AI coding agent. + +VERSION A (baseline — hand-maintained): +${baseline} + +VERSION B (auto-generated from source): +${genSection} + +Which version is better for an AI agent trying to use these commands? Consider: +- Completeness (more commands documented? all args shown?) +- Clarity (descriptions helpful?) +- Coverage (missing commands in either version?) + +Respond with ONLY valid JSON: +{"winner": "A" or "B" or "tie", "reasoning": "brief explanation", "a_score": N, "b_score": N} + +Scores are 1-5 overall quality.`, + }], + }); + + const text = response.content[0].type === 'text' ? response.content[0].text : ''; + const jsonMatch = text.match(/\{[\s\S]*\}/); + if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`); + const result = JSON.parse(jsonMatch[0]); + console.log('Regression comparison:', JSON.stringify(result, null, 2)); + + // Generated version should be at least as good as hand-maintained + expect(result.b_score).toBeGreaterThanOrEqual(result.a_score); + }, 30_000); +}); diff --git a/test/skill-parser.test.ts b/test/skill-parser.test.ts new file mode 100644 index 0000000000000000000000000000000000000000..3c62c682cb43d0f2ab2f9031d56a75d1638a5f39 --- /dev/null +++ b/test/skill-parser.test.ts @@ -0,0 +1,179 @@ +import { describe, test, expect } from 'bun:test'; +import { extractBrowseCommands, validateSkill } from './helpers/skill-parser'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; + +const FIXTURES_DIR = path.join(os.tmpdir(), 'skill-parser-test'); + +function writeFixture(name: string, content: string): string { + fs.mkdirSync(FIXTURES_DIR, { recursive: true }); + const p = path.join(FIXTURES_DIR, name); + fs.writeFileSync(p, content); + return p; +} + +describe('extractBrowseCommands', () => { + test('extracts $B commands from bash code blocks', () => { + const p = 
writeFixture('basic.md', [ + '# Test', + '```bash', + '$B goto https://example.com', + '$B snapshot -i', + '```', + ].join('\n')); + const cmds = extractBrowseCommands(p); + expect(cmds).toHaveLength(2); + expect(cmds[0].command).toBe('goto'); + expect(cmds[0].args).toEqual(['https://example.com']); + expect(cmds[1].command).toBe('snapshot'); + expect(cmds[1].args).toEqual(['-i']); + }); + + test('skips non-bash code blocks', () => { + const p = writeFixture('skip.md', [ + '```json', + '{"key": "$B goto bad"}', + '```', + '```bash', + '$B text', + '```', + ].join('\n')); + const cmds = extractBrowseCommands(p); + expect(cmds).toHaveLength(1); + expect(cmds[0].command).toBe('text'); + }); + + test('returns empty array for file with no code blocks', () => { + const p = writeFixture('no-blocks.md', '# Just text\nSome content\n'); + const cmds = extractBrowseCommands(p); + expect(cmds).toHaveLength(0); + }); + + test('returns empty array for code blocks with no $B invocations', () => { + const p = writeFixture('no-b.md', [ + '```bash', + 'echo "hello"', + 'ls -la', + '```', + ].join('\n')); + const cmds = extractBrowseCommands(p); + expect(cmds).toHaveLength(0); + }); + + test('handles multiple $B commands on one line', () => { + const p = writeFixture('multi.md', [ + '```bash', + '$B click @e3 $B fill @e4 "value" $B hover @e1', + '```', + ].join('\n')); + const cmds = extractBrowseCommands(p); + expect(cmds).toHaveLength(3); + expect(cmds[0].command).toBe('click'); + expect(cmds[1].command).toBe('fill'); + expect(cmds[1].args).toEqual(['@e4', 'value']); + expect(cmds[2].command).toBe('hover'); + }); + + test('handles quoted arguments correctly', () => { + const p = writeFixture('quoted.md', [ + '```bash', + '$B fill @e3 "test@example.com"', + '$B js "document.title"', + '```', + ].join('\n')); + const cmds = extractBrowseCommands(p); + expect(cmds[0].args).toEqual(['@e3', 'test@example.com']); + expect(cmds[1].args).toEqual(['document.title']); + }); + + test('tracks 
correct line numbers', () => { + const p = writeFixture('lines.md', [ + '# Header', // line 1 + '', // line 2 + '```bash', // line 3 + '$B goto x', // line 4 + '```', // line 5 + '', // line 6 + '```bash', // line 7 + '$B text', // line 8 + '```', // line 9 + ].join('\n')); + const cmds = extractBrowseCommands(p); + expect(cmds[0].line).toBe(4); + expect(cmds[1].line).toBe(8); + }); + + test('skips unlabeled code blocks', () => { + const p = writeFixture('unlabeled.md', [ + '```', + '$B snapshot -i', + '```', + ].join('\n')); + const cmds = extractBrowseCommands(p); + expect(cmds).toHaveLength(0); + }); +}); + +describe('validateSkill', () => { + test('valid commands pass validation', () => { + const p = writeFixture('valid.md', [ + '```bash', + '$B goto https://example.com', + '$B text', + '$B click @e3', + '$B snapshot -i -a', + '```', + ].join('\n')); + const result = validateSkill(p); + expect(result.valid).toHaveLength(4); + expect(result.invalid).toHaveLength(0); + expect(result.snapshotFlagErrors).toHaveLength(0); + }); + + test('invalid commands flagged in result', () => { + const p = writeFixture('invalid.md', [ + '```bash', + '$B goto https://example.com', + '$B explode', + '$B halp', + '```', + ].join('\n')); + const result = validateSkill(p); + expect(result.valid).toHaveLength(1); + expect(result.invalid).toHaveLength(2); + expect(result.invalid[0].command).toBe('explode'); + expect(result.invalid[1].command).toBe('halp'); + }); + + test('snapshot flags validated via parseSnapshotArgs', () => { + const p = writeFixture('bad-snapshot.md', [ + '```bash', + '$B snapshot --bogus', + '```', + ].join('\n')); + const result = validateSkill(p); + expect(result.snapshotFlagErrors).toHaveLength(1); + expect(result.snapshotFlagErrors[0].error).toContain('Unknown snapshot flag'); + }); + + test('returns warning when no $B commands found', () => { + const p = writeFixture('empty.md', '# Nothing here\n'); + const result = validateSkill(p); + 
expect(result.warnings).toContain('no $B commands found'); + }); + + test('valid snapshot flags pass', () => { + const p = writeFixture('snap-valid.md', [ + '```bash', + '$B snapshot -i -a -C -o /tmp/out.png', + '$B snapshot -D', + '$B snapshot -d 3', + '$B snapshot -s "main"', + '```', + ].join('\n')); + const result = validateSkill(p); + expect(result.valid).toHaveLength(4); + expect(result.snapshotFlagErrors).toHaveLength(0); + }); +}); diff --git a/test/skill-validation.test.ts b/test/skill-validation.test.ts new file mode 100644 index 0000000000000000000000000000000000000000..1c4025a297189c2c4b7f59f38b781172f7059a14 --- /dev/null +++ b/test/skill-validation.test.ts @@ -0,0 +1,100 @@ +import { describe, test, expect } from 'bun:test'; +import { validateSkill } from './helpers/skill-parser'; +import { ALL_COMMANDS, COMMAND_DESCRIPTIONS, READ_COMMANDS, WRITE_COMMANDS, META_COMMANDS } from '../browse/src/commands'; +import { SNAPSHOT_FLAGS } from '../browse/src/snapshot'; +import * as fs from 'fs'; +import * as path from 'path'; + +const ROOT = path.resolve(import.meta.dir, '..'); + +describe('SKILL.md command validation', () => { + test('all $B commands in SKILL.md are valid browse commands', () => { + const result = validateSkill(path.join(ROOT, 'SKILL.md')); + expect(result.invalid).toHaveLength(0); + expect(result.valid.length).toBeGreaterThan(0); + }); + + test('all snapshot flags in SKILL.md are valid', () => { + const result = validateSkill(path.join(ROOT, 'SKILL.md')); + expect(result.snapshotFlagErrors).toHaveLength(0); + }); + + test('all $B commands in browse/SKILL.md are valid browse commands', () => { + const result = validateSkill(path.join(ROOT, 'browse', 'SKILL.md')); + expect(result.invalid).toHaveLength(0); + expect(result.valid.length).toBeGreaterThan(0); + }); + + test('all snapshot flags in browse/SKILL.md are valid', () => { + const result = validateSkill(path.join(ROOT, 'browse', 'SKILL.md')); + 
expect(result.snapshotFlagErrors).toHaveLength(0); + }); + + test('all $B commands in qa/SKILL.md are valid browse commands', () => { + const qaSkill = path.join(ROOT, 'qa', 'SKILL.md'); + if (!fs.existsSync(qaSkill)) return; // skip if missing + const result = validateSkill(qaSkill); + expect(result.invalid).toHaveLength(0); + }); + + test('all snapshot flags in qa/SKILL.md are valid', () => { + const qaSkill = path.join(ROOT, 'qa', 'SKILL.md'); + if (!fs.existsSync(qaSkill)) return; + const result = validateSkill(qaSkill); + expect(result.snapshotFlagErrors).toHaveLength(0); + }); +}); + +describe('Command registry consistency', () => { + test('COMMAND_DESCRIPTIONS covers all commands in sets', () => { + const allCmds = new Set([...READ_COMMANDS, ...WRITE_COMMANDS, ...META_COMMANDS]); + const descKeys = new Set(Object.keys(COMMAND_DESCRIPTIONS)); + for (const cmd of allCmds) { + expect(descKeys.has(cmd)).toBe(true); + } + }); + + test('COMMAND_DESCRIPTIONS has no extra commands not in sets', () => { + const allCmds = new Set([...READ_COMMANDS, ...WRITE_COMMANDS, ...META_COMMANDS]); + for (const key of Object.keys(COMMAND_DESCRIPTIONS)) { + expect(allCmds.has(key)).toBe(true); + } + }); + + test('ALL_COMMANDS matches union of all sets', () => { + const union = new Set([...READ_COMMANDS, ...WRITE_COMMANDS, ...META_COMMANDS]); + expect(ALL_COMMANDS.size).toBe(union.size); + for (const cmd of union) { + expect(ALL_COMMANDS.has(cmd)).toBe(true); + } + }); + + test('SNAPSHOT_FLAGS option keys are valid SnapshotOptions fields', () => { + const validKeys = new Set([ + 'interactive', 'compact', 'depth', 'selector', + 'diff', 'annotate', 'outputPath', 'cursorInteractive', + ]); + for (const flag of SNAPSHOT_FLAGS) { + expect(validKeys.has(flag.optionKey)).toBe(true); + } + }); +}); + +describe('Generated SKILL.md freshness', () => { + test('no unresolved {{placeholders}} in generated SKILL.md', () => { + const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 
'utf-8'); + const unresolved = content.match(/\{\{\w+\}\}/g); + expect(unresolved).toBeNull(); + }); + + test('no unresolved {{placeholders}} in generated browse/SKILL.md', () => { + const content = fs.readFileSync(path.join(ROOT, 'browse', 'SKILL.md'), 'utf-8'); + const unresolved = content.match(/\{\{\w+\}\}/g); + expect(unresolved).toBeNull(); + }); + + test('generated SKILL.md has AUTO-GENERATED header', () => { + const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8'); + expect(content).toContain('AUTO-GENERATED'); + }); +});