From a4683742721b86f839f98d365f45213dbd1a5157 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Fri, 13 Mar 2026 22:14:14 -0700 Subject: [PATCH] fix: enrich SKILL.md docs to pass LLM evals, upgrade judge to Sonnet 4.6 (#43) * fix: enrich command descriptions and snapshot flags for LLM eval quality 14 command descriptions enriched with specific arg formats, valid values, error behavior, and return types. Fixed header usage from <name> <value> to <name>:<value>. Added cookie usage syntax. Snapshot flags now show long names, ref numbering, and output format examples. * refactor: auto-generate server.ts help text from COMMAND_DESCRIPTIONS Replace hand-maintained help block with generateHelpText() that reads from COMMAND_DESCRIPTIONS and SNAPSHOT_FLAGS. Eliminates help text drift from source of truth. * test: add usage consistency and pipe guard tests Usage consistency test cross-checks Usage: patterns in implementation against COMMAND_DESCRIPTIONS using structural skeleton comparison. Pipe guard test ensures descriptions don't contain | which would break markdown table rendering. * chore: upgrade eval judge to Sonnet 4.6, update changelog Switch LLM-as-judge evals from Haiku to Sonnet 4.6 for more stable, nuanced scoring. Add changelog entry for all eval improvements. 
Co-Authored-By: Claude Opus 4.6 --------- Co-authored-by: Claude Opus 4.6 --- CHANGELOG.md | 16 ++++++++ SKILL.md | 75 ++++++++++++++++++----------------- browse/SKILL.md | 63 +++++++++++++++++------------ browse/src/commands.ts | 32 +++++++-------- browse/src/server.ts | 67 ++++++++++++++++++++----------- browse/src/snapshot.ts | 8 ++-- scripts/gen-skill-docs.ts | 17 ++++++-- test/gen-skill-docs.test.ts | 8 ++++ test/skill-llm-eval.test.ts | 6 +-- test/skill-validation.test.ts | 53 +++++++++++++++++++++++++ 10 files changed, 233 insertions(+), 112 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6bdd600c91fd4d19a54b6d01503ff03ea37a3666..dd179e2f8087f881107e75641ce8919c28f4a6f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,21 @@ # Changelog +## Unreleased — 2026-03-14 + +### Changed +- Enriched 14 command descriptions with specific arg formats, valid values, error behavior, and return types +- Fixed `header` usage from ` ` to `:` (matching actual implementation) +- Added `cookie` usage syntax: `cookie =` +- Enriched 4 snapshot flag descriptions with defaults, output paths, and behavior details +- Snapshot flags section now shows long flag names (`-i / --interactive`) alongside short +- Added ref numbering explanation and output format example to snapshot docs +- Replaced hand-maintained server.ts help text with auto-generated `generateHelpText()` from COMMAND_DESCRIPTIONS +- Upgraded LLM eval judge from Haiku to Sonnet 4.6 for more stable scoring + +### Added +- Usage string consistency test: cross-checks `Usage:` patterns in implementation against COMMAND_DESCRIPTIONS +- Pipe guard test: ensures no command description contains `|` (would break markdown tables) + ## 0.3.3 — 2026-03-13 ### Added diff --git a/SKILL.md b/SKILL.md index 2f78a6307e1578ab8665c6d78b1585fa904ff03d..c79b710b83d367c476020bd8e7efd0048566e9c4 100644 --- a/SKILL.md +++ b/SKILL.md @@ -23,12 +23,9 @@ Auto-shuts down after 30 min idle. 
State persists between calls (cookies, tabs, ## SETUP (run this check BEFORE any browse command) ```bash -BROWSE_OUTPUT=$(browse/bin/find-browse 2>/dev/null || ~/.claude/skills/gstack/browse/bin/find-browse 2>/dev/null) -B=$(echo "$BROWSE_OUTPUT" | head -1) -META=$(echo "$BROWSE_OUTPUT" | grep "^META:" || true) +B=$(browse/bin/find-browse 2>/dev/null || ~/.claude/skills/gstack/browse/bin/find-browse 2>/dev/null) if [ -n "$B" ]; then echo "READY: $B" - [ -n "$META" ] && echo "$META" else echo "NEEDS_SETUP" fi @@ -39,13 +36,6 @@ If `NEEDS_SETUP`: 2. Run: `cd && ./setup` 3. If `bun` is not installed: `curl -fsSL https://bun.sh/install | bash` -If you see `META:UPDATE_AVAILABLE`: -1. Parse the JSON payload to get `current`, `latest`, and `command`. -2. Tell the user: "A gstack update is available (current: X, latest: Y). OK to update?" -3. **STOP and wait for approval.** -4. Run the command from the META payload. -5. Re-run the setup check above to get the updated binary path. - ## IMPORTANT - Use the compiled binary via Bash: `$B ` @@ -242,25 +232,36 @@ $B css ".button" "background-color" The snapshot is your primary tool for understanding and interacting with pages. ``` --i Interactive elements only (buttons, links, inputs) with @e refs --c Compact (no empty structural nodes) --d Limit depth --s Scope to CSS selector --D Diff against previous snapshot (what changed?) 
--a Annotated screenshot with ref labels --o Output path for screenshot --C Cursor-interactive elements (@c refs — divs with pointer, onclick) +-i --interactive Interactive elements only (buttons, links, inputs) with @e refs +-c --compact Compact (no empty structural nodes) +-d --depth Limit tree depth (0 = root only, default: unlimited) +-s --selector Scope to CSS selector +-D --diff Unified diff against previous snapshot (first call stores baseline) +-a --annotate Annotated screenshot with red overlay boxes and ref labels +-o --output Output path for annotated screenshot (default: /tmp/browse-annotated.png) +-C --cursor-interactive Cursor-interactive elements (@c refs — divs with pointer, onclick) ``` -Combine flags: `$B snapshot -i -a -C -o /tmp/annotated.png` +All flags can be combined freely. `-o` only applies when `-a` is also used. +Example: `$B snapshot -i -a -C -o /tmp/annotated.png` + +**Ref numbering:** @e refs are assigned sequentially (@e1, @e2, ...) in tree order. +@c refs from `-C` are numbered separately (@c1, @c2, ...). -After snapshot, use @refs everywhere: +After snapshot, use @refs as selectors in any command: ```bash $B click @e3 $B fill @e4 "value" $B hover @e1 $B html @e2 $B css @e5 "color" $B attrs @e6 $B click @c1 # cursor-interactive ref (from -C) ``` +**Output format:** indented accessibility tree with @ref IDs, one element per line. +``` + @e1 [heading] "Welcome" [level=1] + @e2 [textbox] "Email" + @e3 [button] "Submit" +``` + Refs are invalidated on navigation — run `snapshot` again after `goto`. ## Command Reference @@ -279,7 +280,7 @@ Refs are invalidated on navigation — run `snapshot` again after `goto`. 
|---------|-------------| | `accessibility` | Full ARIA tree | | `forms` | Form fields as JSON | -| `html [selector]` | innerHTML | +| `html [selector]` | innerHTML of selector (throws if not found), or full page HTML if no selector given | | `links` | All links as "text → href" | | `text` | Cleaned page text | @@ -287,22 +288,22 @@ Refs are invalidated on navigation — run `snapshot` again after `goto`. | Command | Description | |---------|-------------| | `click ` | Click element | -| `cookie` | Set cookie | +| `cookie =` | Set cookie on current page domain | | `cookie-import ` | Import cookies from JSON file | -| `cookie-import-browser [browser] [--domain d]` | Import cookies from real browser (opens picker UI, or direct with --domain) | -| `dialog-accept [text]` | Auto-accept next alert/confirm/prompt | +| `cookie-import-browser [browser] [--domain d]` | Import cookies from Comet, Chrome, Arc, Brave, or Edge (opens picker, or use --domain for direct import) | +| `dialog-accept [text]` | Auto-accept next alert/confirm/prompt. Optional text is sent as the prompt response | | `dialog-dismiss` | Auto-dismiss next dialog | | `fill ` | Fill input | -| `header ` | Set custom request header | +| `header :` | Set custom request header (colon-separated, sensitive values auto-redacted) | | `hover ` | Hover element | -| `press ` | Press key (Enter, Tab, Escape, etc.) 
| -| `scroll [sel]` | Scroll element into view | -| `select ` | Select dropdown option | +| `press ` | Press key — Enter, Tab, Escape, ArrowUp/Down/Left/Right, Backspace, Delete, Home, End, PageUp, PageDown, or modifiers like Shift+Enter | +| `scroll [sel]` | Scroll element into view, or scroll to page bottom if no selector | +| `select ` | Select dropdown option by value, label, or visible text | | `type ` | Type into focused element | -| `upload ` | Upload file(s) | +| `upload [file2...]` | Upload file(s) | | `useragent ` | Set user agent | | `viewport ` | Set viewport size | -| `wait ` | Wait for element/condition | +| `wait ` | Wait for element, network idle, or page load (timeout: 15s) | ### Inspection | Command | Description | @@ -312,30 +313,30 @@ Refs are invalidated on navigation — run `snapshot` again after `goto`. | `cookies` | All cookies as JSON | | `css ` | Computed CSS value | | `dialog [--clear]` | Dialog messages | -| `eval ` | Run JS file | +| `eval ` | Run JavaScript from file and return result as string (path must be under /tmp or cwd) | | `is ` | State check (visible/hidden/enabled/disabled/checked/editable/focused) | -| `js ` | Run JavaScript | +| `js ` | Run JavaScript expression and return result as string | | `network [--clear]` | Network requests | | `perf` | Page load timings | -| `storage [set k v]` | localStorage + sessionStorage | +| `storage [set k v]` | Read all localStorage + sessionStorage as JSON, or set to write localStorage | ### Visual | Command | Description | |---------|-------------| | `diff ` | Text diff between pages | | `pdf [path]` | Save as PDF | -| `responsive [prefix]` | Mobile/tablet/desktop screenshots | +| `responsive [prefix]` | Screenshots at mobile (375x812), tablet (768x1024), desktop (1280x720). Saves as {prefix}-mobile.png etc. 
| | `screenshot [path]` | Save screenshot | ### Snapshot | Command | Description | |---------|-------------| -| `snapshot [flags]` | Accessibility tree with @refs | +| `snapshot [flags]` | Accessibility tree with @e refs for element selection. Flags: -i interactive only, -c compact, -d N depth limit, -s sel scope, -D diff vs previous, -a annotated screenshot, -o path output, -C cursor-interactive @c refs | ### Meta | Command | Description | |---------|-------------| -| `chain` | Multi-command from JSON stdin | +| `chain` | Run commands from JSON stdin. Format: [["cmd","arg1",...],...] | ### Tabs | Command | Description | diff --git a/browse/SKILL.md b/browse/SKILL.md index 7b9a6cff8b18d181935ea99c40ae16f8df55b91d..5e838649244bc27b0328d6a7b8abc89d7ece48ee 100644 --- a/browse/SKILL.md +++ b/browse/SKILL.md @@ -104,25 +104,36 @@ $B diff https://staging.app.com https://prod.app.com The snapshot is your primary tool for understanding and interacting with pages. ``` --i Interactive elements only (buttons, links, inputs) with @e refs --c Compact (no empty structural nodes) --d Limit depth --s Scope to CSS selector --D Diff against previous snapshot (what changed?) --a Annotated screenshot with ref labels --o Output path for screenshot --C Cursor-interactive elements (@c refs — divs with pointer, onclick) +-i --interactive Interactive elements only (buttons, links, inputs) with @e refs +-c --compact Compact (no empty structural nodes) +-d --depth Limit tree depth (0 = root only, default: unlimited) +-s --selector Scope to CSS selector +-D --diff Unified diff against previous snapshot (first call stores baseline) +-a --annotate Annotated screenshot with red overlay boxes and ref labels +-o --output Output path for annotated screenshot (default: /tmp/browse-annotated.png) +-C --cursor-interactive Cursor-interactive elements (@c refs — divs with pointer, onclick) ``` -Combine flags: `$B snapshot -i -a -C -o /tmp/annotated.png` +All flags can be combined freely. 
`-o` only applies when `-a` is also used. +Example: `$B snapshot -i -a -C -o /tmp/annotated.png` -After snapshot, use @refs everywhere: +**Ref numbering:** @e refs are assigned sequentially (@e1, @e2, ...) in tree order. +@c refs from `-C` are numbered separately (@c1, @c2, ...). + +After snapshot, use @refs as selectors in any command: ```bash $B click @e3 $B fill @e4 "value" $B hover @e1 $B html @e2 $B css @e5 "color" $B attrs @e6 $B click @c1 # cursor-interactive ref (from -C) ``` +**Output format:** indented accessibility tree with @ref IDs, one element per line. +``` + @e1 [heading] "Welcome" [level=1] + @e2 [textbox] "Email" + @e3 [button] "Submit" +``` + Refs are invalidated on navigation — run `snapshot` again after `goto`. ## Full Command List @@ -141,7 +152,7 @@ Refs are invalidated on navigation — run `snapshot` again after `goto`. |---------|-------------| | `accessibility` | Full ARIA tree | | `forms` | Form fields as JSON | -| `html [selector]` | innerHTML | +| `html [selector]` | innerHTML of selector (throws if not found), or full page HTML if no selector given | | `links` | All links as "text → href" | | `text` | Cleaned page text | @@ -149,22 +160,22 @@ Refs are invalidated on navigation — run `snapshot` again after `goto`. | Command | Description | |---------|-------------| | `click ` | Click element | -| `cookie` | Set cookie | +| `cookie =` | Set cookie on current page domain | | `cookie-import ` | Import cookies from JSON file | -| `cookie-import-browser [browser] [--domain d]` | Import cookies from real browser (opens picker UI, or direct with --domain) | -| `dialog-accept [text]` | Auto-accept next alert/confirm/prompt | +| `cookie-import-browser [browser] [--domain d]` | Import cookies from Comet, Chrome, Arc, Brave, or Edge (opens picker, or use --domain for direct import) | +| `dialog-accept [text]` | Auto-accept next alert/confirm/prompt. 
Optional text is sent as the prompt response | | `dialog-dismiss` | Auto-dismiss next dialog | | `fill ` | Fill input | -| `header ` | Set custom request header | +| `header :` | Set custom request header (colon-separated, sensitive values auto-redacted) | | `hover ` | Hover element | -| `press ` | Press key (Enter, Tab, Escape, etc.) | -| `scroll [sel]` | Scroll element into view | -| `select ` | Select dropdown option | +| `press ` | Press key — Enter, Tab, Escape, ArrowUp/Down/Left/Right, Backspace, Delete, Home, End, PageUp, PageDown, or modifiers like Shift+Enter | +| `scroll [sel]` | Scroll element into view, or scroll to page bottom if no selector | +| `select ` | Select dropdown option by value, label, or visible text | | `type ` | Type into focused element | -| `upload ` | Upload file(s) | +| `upload [file2...]` | Upload file(s) | | `useragent ` | Set user agent | | `viewport ` | Set viewport size | -| `wait ` | Wait for element/condition | +| `wait ` | Wait for element, network idle, or page load (timeout: 15s) | ### Inspection | Command | Description | @@ -174,30 +185,30 @@ Refs are invalidated on navigation — run `snapshot` again after `goto`. 
| `cookies` | All cookies as JSON | | `css ` | Computed CSS value | | `dialog [--clear]` | Dialog messages | -| `eval ` | Run JS file | +| `eval ` | Run JavaScript from file and return result as string (path must be under /tmp or cwd) | | `is ` | State check (visible/hidden/enabled/disabled/checked/editable/focused) | -| `js ` | Run JavaScript | +| `js ` | Run JavaScript expression and return result as string | | `network [--clear]` | Network requests | | `perf` | Page load timings | -| `storage [set k v]` | localStorage + sessionStorage | +| `storage [set k v]` | Read all localStorage + sessionStorage as JSON, or set to write localStorage | ### Visual | Command | Description | |---------|-------------| | `diff ` | Text diff between pages | | `pdf [path]` | Save as PDF | -| `responsive [prefix]` | Mobile/tablet/desktop screenshots | +| `responsive [prefix]` | Screenshots at mobile (375x812), tablet (768x1024), desktop (1280x720). Saves as {prefix}-mobile.png etc. | | `screenshot [path]` | Save screenshot | ### Snapshot | Command | Description | |---------|-------------| -| `snapshot [flags]` | Accessibility tree with @refs | +| `snapshot [flags]` | Accessibility tree with @e refs for element selection. Flags: -i interactive only, -c compact, -d N depth limit, -s sel scope, -D diff vs previous, -a annotated screenshot, -o path output, -C cursor-interactive @c refs | ### Meta | Command | Description | |---------|-------------| -| `chain` | Multi-command from JSON stdin | +| `chain` | Run commands from JSON stdin. Format: [["cmd","arg1",...],...] 
| ### Tabs | Command | Description | diff --git a/browse/src/commands.ts b/browse/src/commands.ts index c3189ace3d983517d69ecbc681a105cba85a15e0..6024c4b6fa70b1917054306d1069b23a5e8bddfe 100644 --- a/browse/src/commands.ts +++ b/browse/src/commands.ts @@ -43,13 +43,13 @@ export const COMMAND_DESCRIPTIONS: Record' }, - 'eval': { category: 'Inspection', description: 'Run JS file', usage: 'eval ' }, + 'js': { category: 'Inspection', description: 'Run JavaScript expression and return result as string', usage: 'js ' }, + 'eval': { category: 'Inspection', description: 'Run JavaScript from file and return result as string (path must be under /tmp or cwd)', usage: 'eval ' }, 'css': { category: 'Inspection', description: 'Computed CSS value', usage: 'css ' }, 'attrs': { category: 'Inspection', description: 'Element attributes as JSON', usage: 'attrs ' }, 'is': { category: 'Inspection', description: 'State check (visible/hidden/enabled/disabled/checked/editable/focused)', usage: 'is ' }, @@ -57,30 +57,30 @@ export const COMMAND_DESCRIPTIONS: Record to write localStorage', usage: 'storage [set k v]' }, 'perf': { category: 'Inspection', description: 'Page load timings' }, // Interaction 'click': { category: 'Interaction', description: 'Click element', usage: 'click ' }, 'fill': { category: 'Interaction', description: 'Fill input', usage: 'fill ' }, - 'select': { category: 'Interaction', description: 'Select dropdown option', usage: 'select ' }, + 'select': { category: 'Interaction', description: 'Select dropdown option by value, label, or visible text', usage: 'select ' }, 'hover': { category: 'Interaction', description: 'Hover element', usage: 'hover ' }, 'type': { category: 'Interaction', description: 'Type into focused element', usage: 'type ' }, - 'press': { category: 'Interaction', description: 'Press key (Enter, Tab, Escape, etc.)', usage: 'press ' }, - 'scroll': { category: 'Interaction', description: 'Scroll element into view', usage: 'scroll [sel]' }, - 'wait': { 
category: 'Interaction', description: 'Wait for element/condition', usage: 'wait ' }, - 'upload': { category: 'Interaction', description: 'Upload file(s)', usage: 'upload ' }, + 'press': { category: 'Interaction', description: 'Press key — Enter, Tab, Escape, ArrowUp/Down/Left/Right, Backspace, Delete, Home, End, PageUp, PageDown, or modifiers like Shift+Enter', usage: 'press ' }, + 'scroll': { category: 'Interaction', description: 'Scroll element into view, or scroll to page bottom if no selector', usage: 'scroll [sel]' }, + 'wait': { category: 'Interaction', description: 'Wait for element, network idle, or page load (timeout: 15s)', usage: 'wait ' }, + 'upload': { category: 'Interaction', description: 'Upload file(s)', usage: 'upload [file2...]' }, 'viewport':{ category: 'Interaction', description: 'Set viewport size', usage: 'viewport ' }, - 'cookie': { category: 'Interaction', description: 'Set cookie' }, + 'cookie': { category: 'Interaction', description: 'Set cookie on current page domain', usage: 'cookie =' }, 'cookie-import': { category: 'Interaction', description: 'Import cookies from JSON file', usage: 'cookie-import ' }, - 'cookie-import-browser': { category: 'Interaction', description: 'Import cookies from real browser (opens picker UI, or direct with --domain)', usage: 'cookie-import-browser [browser] [--domain d]' }, - 'header': { category: 'Interaction', description: 'Set custom request header', usage: 'header ' }, + 'cookie-import-browser': { category: 'Interaction', description: 'Import cookies from Comet, Chrome, Arc, Brave, or Edge (opens picker, or use --domain for direct import)', usage: 'cookie-import-browser [browser] [--domain d]' }, + 'header': { category: 'Interaction', description: 'Set custom request header (colon-separated, sensitive values auto-redacted)', usage: 'header :' }, 'useragent': { category: 'Interaction', description: 'Set user agent', usage: 'useragent ' }, - 'dialog-accept': { category: 'Interaction', description: 
'Auto-accept next alert/confirm/prompt', usage: 'dialog-accept [text]' }, + 'dialog-accept': { category: 'Interaction', description: 'Auto-accept next alert/confirm/prompt. Optional text is sent as the prompt response', usage: 'dialog-accept [text]' }, 'dialog-dismiss': { category: 'Interaction', description: 'Auto-dismiss next dialog' }, // Visual 'screenshot': { category: 'Visual', description: 'Save screenshot', usage: 'screenshot [path]' }, 'pdf': { category: 'Visual', description: 'Save as PDF', usage: 'pdf [path]' }, - 'responsive': { category: 'Visual', description: 'Mobile/tablet/desktop screenshots', usage: 'responsive [prefix]' }, + 'responsive': { category: 'Visual', description: 'Screenshots at mobile (375x812), tablet (768x1024), desktop (1280x720). Saves as {prefix}-mobile.png etc.', usage: 'responsive [prefix]' }, 'diff': { category: 'Visual', description: 'Text diff between pages', usage: 'diff ' }, // Tabs 'tabs': { category: 'Tabs', description: 'List open tabs' }, @@ -92,8 +92,8 @@ export const COMMAND_DESCRIPTIONS: Record(); + for (const [cmd, meta] of Object.entries(COMMAND_DESCRIPTIONS)) { + const display = meta.usage || cmd; + const list = groups.get(meta.category) || []; + list.push(display); + groups.set(meta.category, list); + } + + const categoryOrder = [ + 'Navigation', 'Reading', 'Interaction', 'Inspection', + 'Visual', 'Snapshot', 'Meta', 'Tabs', 'Server', + ]; + + const lines = ['gstack browse — headless browser for AI agents', '', 'Commands:']; + for (const cat of categoryOrder) { + const cmds = groups.get(cat); + if (!cmds) continue; + lines.push(` ${(cat + ':').padEnd(15)}${cmds.join(', ')}`); + } + + // Snapshot flags from source of truth + lines.push(''); + lines.push('Snapshot flags:'); + const flagPairs: string[] = []; + for (const flag of SNAPSHOT_FLAGS) { + const label = flag.valueHint ? 
`${flag.short} ${flag.valueHint}` : flag.short; + flagPairs.push(`${label} ${flag.long}`); + } + // Print two flags per line for compact display + for (let i = 0; i < flagPairs.length; i += 2) { + const left = flagPairs[i].padEnd(28); + const right = flagPairs[i + 1] || ''; + lines.push(` ${left}${right}`); + } + + return lines.join('\n'); +} + // ─── Buffer (from buffers.ts) ──────────────────────────────────── import { consoleBuffer, networkBuffer, dialogBuffer, addConsoleEntry, addNetworkEntry, addDialogEntry, type LogEntry, type NetworkEntry, type DialogEntry } from './buffers'; export { consoleBuffer, networkBuffer, dialogBuffer, addConsoleEntry, addNetworkEntry, addDialogEntry, type LogEntry, type NetworkEntry, type DialogEntry }; @@ -191,29 +234,7 @@ async function handleCommand(body: any): Promise { } else if (META_COMMANDS.has(command)) { result = await handleMetaCommand(command, args, browserManager, shutdown); } else if (command === 'help') { - const helpText = [ - 'gstack browse — headless browser for AI agents', - '', - 'Commands:', - ' Navigation: goto , back, forward, reload', - ' Interaction: click , fill , select , hover, type, press, scroll, wait', - ' Read: text [sel], html [sel], links, forms, accessibility, cookies, storage, console, network, perf', - ' Evaluate: js , eval , css , attrs , is ', - ' Snapshot: snapshot [-i] [-c] [-d N] [-s sel] [-D] [-a] [-o path] [-C]', - ' Screenshot: screenshot [path], pdf [path], responsive ', - ' Tabs: tabs, tab , newtab [url], closetab [id]', - ' State: cookie , cookie-import , cookie-import-browser [browser]', - ' Headers: header [name] [value], useragent [string]', - ' Upload: upload [file2...]', - ' Dialogs: dialog, dialog-accept [text], dialog-dismiss', - ' Meta: status, stop, restart, diff, chain, help', - '', - 'Snapshot flags:', - ' -i interactive only -c compact (remove empty nodes)', - ' -d N limit depth -s sel scope to CSS selector', - ' -D diff vs previous -a annotated screenshot with ref 
labels', - ' -o path output file -C cursor-interactive elements', - ].join('\n'); + const helpText = generateHelpText(); return new Response(helpText, { status: 200, headers: { 'Content-Type': 'text/plain' }, diff --git a/browse/src/snapshot.ts b/browse/src/snapshot.ts index d3a84b5e3ec1b709cffaa5dcfefdf7bab92c9635..a2a3aeea1cfd8a05a0363f71a3627d5b98b5f371 100644 --- a/browse/src/snapshot.ts +++ b/browse/src/snapshot.ts @@ -57,11 +57,11 @@ export const SNAPSHOT_FLAGS: Array<{ }> = [ { short: '-i', long: '--interactive', description: 'Interactive elements only (buttons, links, inputs) with @e refs', optionKey: 'interactive' }, { short: '-c', long: '--compact', description: 'Compact (no empty structural nodes)', optionKey: 'compact' }, - { short: '-d', long: '--depth', description: 'Limit depth', takesValue: true, valueHint: '', optionKey: 'depth' }, + { short: '-d', long: '--depth', description: 'Limit tree depth (0 = root only, default: unlimited)', takesValue: true, valueHint: '', optionKey: 'depth' }, { short: '-s', long: '--selector', description: 'Scope to CSS selector', takesValue: true, valueHint: '', optionKey: 'selector' }, - { short: '-D', long: '--diff', description: 'Diff against previous snapshot (what changed?)', optionKey: 'diff' }, - { short: '-a', long: '--annotate', description: 'Annotated screenshot with ref labels', optionKey: 'annotate' }, - { short: '-o', long: '--output', description: 'Output path for screenshot', takesValue: true, valueHint: '', optionKey: 'outputPath' }, + { short: '-D', long: '--diff', description: 'Unified diff against previous snapshot (first call stores baseline)', optionKey: 'diff' }, + { short: '-a', long: '--annotate', description: 'Annotated screenshot with red overlay boxes and ref labels', optionKey: 'annotate' }, + { short: '-o', long: '--output', description: 'Output path for annotated screenshot (default: /tmp/browse-annotated.png)', takesValue: true, valueHint: '', optionKey: 'outputPath' }, { short: '-C', 
long: '--cursor-interactive', description: 'Cursor-interactive elements (@c refs — divs with pointer, onclick)', optionKey: 'cursorInteractive' }, ]; diff --git a/scripts/gen-skill-docs.ts b/scripts/gen-skill-docs.ts index 19b680004c1764a265435dd39298cbc26fb252e1..381278ca703dfb1e19b4800732dfae17f70eed46 100644 --- a/scripts/gen-skill-docs.ts +++ b/scripts/gen-skill-docs.ts @@ -64,20 +64,31 @@ function generateSnapshotFlags(): string { for (const flag of SNAPSHOT_FLAGS) { const label = flag.valueHint ? `${flag.short} ${flag.valueHint}` : flag.short; - lines.push(`${label.padEnd(10)}${flag.description}`); + lines.push(`${label.padEnd(10)}${flag.long.padEnd(24)}${flag.description}`); } lines.push('```'); lines.push(''); - lines.push('Combine flags: `$B snapshot -i -a -C -o /tmp/annotated.png`'); + lines.push('All flags can be combined freely. `-o` only applies when `-a` is also used.'); + lines.push('Example: `$B snapshot -i -a -C -o /tmp/annotated.png`'); lines.push(''); - lines.push('After snapshot, use @refs everywhere:'); + lines.push('**Ref numbering:** @e refs are assigned sequentially (@e1, @e2, ...) 
in tree order.'); + lines.push('@c refs from `-C` are numbered separately (@c1, @c2, ...).'); + lines.push(''); + lines.push('After snapshot, use @refs as selectors in any command:'); lines.push('```bash'); lines.push('$B click @e3 $B fill @e4 "value" $B hover @e1'); lines.push('$B html @e2 $B css @e5 "color" $B attrs @e6'); lines.push('$B click @c1 # cursor-interactive ref (from -C)'); lines.push('```'); lines.push(''); + lines.push('**Output format:** indented accessibility tree with @ref IDs, one element per line.'); + lines.push('```'); + lines.push(' @e1 [heading] "Welcome" [level=1]'); + lines.push(' @e2 [textbox] "Email"'); + lines.push(' @e3 [button] "Submit"'); + lines.push('```'); + lines.push(''); lines.push('Refs are invalidated on navigation — run `snapshot` again after `goto`.'); return lines.join('\n'); diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts index ce7c98ea712bf4247fd02b497fa7e7aacc1022da..9d3f3b9b4be518579f1f9be8f8770ca75e4101c1 100644 --- a/test/gen-skill-docs.test.ts +++ b/test/gen-skill-docs.test.ts @@ -139,6 +139,14 @@ describe('description quality evals', () => { } }); + // Guard: descriptions must not contain pipe (breaks markdown table cells) + // Usage strings are backtick-wrapped in the table so pipes there are safe. + test('no command description contains pipe character', () => { + for (const [cmd, meta] of Object.entries(COMMAND_DESCRIPTIONS)) { + expect(meta.description).not.toContain('|'); + } + }); + // Guard: generated output uses → not -> test('generated SKILL.md uses unicode arrows', () => { const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8'); diff --git a/test/skill-llm-eval.test.ts b/test/skill-llm-eval.test.ts index 308de814e9a76d8ca4c33c44da53d8ea4874c9a5..f978f0352c003246045506149fb80e61ce513611 100644 --- a/test/skill-llm-eval.test.ts +++ b/test/skill-llm-eval.test.ts @@ -7,7 +7,7 @@ * Requires: ANTHROPIC_API_KEY env var * Run: ANTHROPIC_API_KEY=sk-... 
bun test test/skill-llm-eval.test.ts * - * Cost: ~$0.01-0.03 per run (haiku) + * Cost: ~$0.05-0.15 per run (sonnet) */ import { describe, test, expect } from 'bun:test'; @@ -30,7 +30,7 @@ async function judge(section: string, prompt: string): Promise { const client = new Anthropic(); const response = await client.messages.create({ - model: 'claude-haiku-4-5-20251001', + model: 'claude-sonnet-4-6', max_tokens: 1024, messages: [{ role: 'user', @@ -158,7 +158,7 @@ describeEval('LLM-as-judge quality evals', () => { const client = new Anthropic(); const response = await client.messages.create({ - model: 'claude-haiku-4-5-20251001', + model: 'claude-sonnet-4-6', max_tokens: 1024, messages: [{ role: 'user', diff --git a/test/skill-validation.test.ts b/test/skill-validation.test.ts index 1c4025a297189c2c4b7f59f38b781172f7059a14..4bf6b6dddd32a0ca8fbe503cf423f109c6948c89 100644 --- a/test/skill-validation.test.ts +++ b/test/skill-validation.test.ts @@ -80,6 +80,59 @@ describe('Command registry consistency', () => { }); }); +describe('Usage string consistency', () => { + // Normalize a usage string to its structural skeleton for comparison. + // Replaces <anything> with <>, [optional] with [], strips parenthetical hints. + // This catches format mismatches (e.g., <name>:<value> vs <name> <value>) + // without tripping on abbreviation differences (e.g., <sel> vs <selector>). 
+ function skeleton(usage: string): string { + return usage + .replace(/\(.*?\)/g, '') // strip parenthetical hints like (e.g., Enter, Tab) + .replace(/<[^>]*>/g, '<>') // normalize <param> → <> + .replace(/\[[^\]]*\]/g, '[]') // normalize [optional] → [] + .replace(/\s+/g, ' ') // collapse whitespace + .trim(); + } + + // Cross-check Usage: patterns in implementation against COMMAND_DESCRIPTIONS + test('implementation Usage: structural format matches COMMAND_DESCRIPTIONS', () => { + const implFiles = [ + path.join(ROOT, 'browse', 'src', 'write-commands.ts'), + path.join(ROOT, 'browse', 'src', 'read-commands.ts'), + path.join(ROOT, 'browse', 'src', 'meta-commands.ts'), + ]; + + // Extract "Usage: browse <pattern>" from throw new Error(...) calls + const usagePattern = /throw new Error\(['"`]Usage:\s*browse\s+(.+?)['"`]\)/g; + const implUsages = new Map(); + + for (const file of implFiles) { + const content = fs.readFileSync(file, 'utf-8'); + let match; + while ((match = usagePattern.exec(content)) !== null) { + const usage = match[1].split('\\n')[0].trim(); + const cmd = usage.split(/\s/)[0]; + implUsages.set(cmd, usage); + } + } + + // Compare structural skeletons + const mismatches: string[] = []; + for (const [cmd, implUsage] of implUsages) { + const desc = COMMAND_DESCRIPTIONS[cmd]; + if (!desc) continue; + if (!desc.usage) continue; + const descSkel = skeleton(desc.usage); + const implSkel = skeleton(implUsage); + if (descSkel !== implSkel) { + mismatches.push(`${cmd}: docs "${desc.usage}" (${descSkel}) vs impl "${implUsage}" (${implSkel})`); + } + } + + expect(mismatches).toEqual([]); + }); +}); + describe('Generated SKILL.md freshness', () => { test('no unresolved {{placeholders}} in generated SKILL.md', () => { const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');