From a4683742721b86f839f98d365f45213dbd1a5157 Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Fri, 13 Mar 2026 22:14:14 -0700
Subject: [PATCH] fix: enrich SKILL.md docs to pass LLM evals, upgrade judge to
 Sonnet 4.6 (#43)

* fix: enrich command descriptions and snapshot flags for LLM eval quality

14 command descriptions enriched with specific arg formats, valid values,
error behavior, and return types. Fixed header usage from <name> <value>
to <name>:<value>. Added cookie usage syntax. Snapshot flags now show
long names, ref numbering, and output format examples.

* refactor: auto-generate server.ts help text from COMMAND_DESCRIPTIONS

Replace hand-maintained help block with generateHelpText() that reads
from COMMAND_DESCRIPTIONS and SNAPSHOT_FLAGS. Eliminates help text
drift from source of truth.

* test: add usage consistency and pipe guard tests

Usage consistency test cross-checks Usage: patterns in implementation
against COMMAND_DESCRIPTIONS using structural skeleton comparison.
Pipe guard test ensures descriptions don't contain | which would break
markdown table rendering.

* chore: upgrade eval judge to Sonnet 4.6, update changelog

Switch LLM-as-judge evals from Haiku to Sonnet 4.6 for more stable,
nuanced scoring. Add changelog entry for all eval improvements.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
---
 CHANGELOG.md                  | 16 ++++++++
 SKILL.md                      | 75 ++++++++++++++++++-----------------
 browse/SKILL.md               | 63 +++++++++++++++++------------
 browse/src/commands.ts        | 32 +++++++--------
 browse/src/server.ts          | 67 ++++++++++++++++++++-----------
 browse/src/snapshot.ts        |  8 ++--
 scripts/gen-skill-docs.ts     | 17 ++++++--
 test/gen-skill-docs.test.ts   |  8 ++++
 test/skill-llm-eval.test.ts   |  6 +--
 test/skill-validation.test.ts | 53 +++++++++++++++++++++++++
 10 files changed, 233 insertions(+), 112 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6bdd600c91fd4d19a54b6d01503ff03ea37a3666..dd179e2f8087f881107e75641ce8919c28f4a6f9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,21 @@
 # Changelog
 
+## Unreleased — 2026-03-14
+
+### Changed
+- Enriched 14 command descriptions with specific arg formats, valid values, error behavior, and return types
+- Fixed `header` usage from `<name> <value>` to `<name>:<value>` (matching actual implementation)
+- Added `cookie` usage syntax: `cookie <name>=<value>`
+- Enriched 4 snapshot flag descriptions with defaults, output paths, and behavior details
+- Snapshot flags section now shows long flag names (`-i / --interactive`) alongside short
+- Added ref numbering explanation and output format example to snapshot docs
+- Replaced hand-maintained server.ts help text with auto-generated `generateHelpText()` from COMMAND_DESCRIPTIONS
+- Upgraded LLM eval judge from Haiku to Sonnet 4.6 for more stable scoring
+
+### Added
+- Usage string consistency test: cross-checks `Usage:` patterns in implementation against COMMAND_DESCRIPTIONS
+- Pipe guard test: ensures no command description contains `|` (would break markdown tables)
+
 ## 0.3.3 — 2026-03-13
 
 ### Added
diff --git a/SKILL.md b/SKILL.md
index 2f78a6307e1578ab8665c6d78b1585fa904ff03d..c79b710b83d367c476020bd8e7efd0048566e9c4 100644
--- a/SKILL.md
+++ b/SKILL.md
@@ -23,12 +23,9 @@ Auto-shuts down after 30 min idle. State persists between calls (cookies, tabs,
 ## SETUP (run this check BEFORE any browse command)
 
 ```bash
-BROWSE_OUTPUT=$(browse/bin/find-browse 2>/dev/null || ~/.claude/skills/gstack/browse/bin/find-browse 2>/dev/null)
-B=$(echo "$BROWSE_OUTPUT" | head -1)
-META=$(echo "$BROWSE_OUTPUT" | grep "^META:" || true)
+B=$(browse/bin/find-browse 2>/dev/null || ~/.claude/skills/gstack/browse/bin/find-browse 2>/dev/null)
 if [ -n "$B" ]; then
   echo "READY: $B"
-  [ -n "$META" ] && echo "$META"
 else
   echo "NEEDS_SETUP"
 fi
@@ -39,13 +36,6 @@ If `NEEDS_SETUP`:
 2. Run: `cd <SKILL_DIR> && ./setup`
 3. If `bun` is not installed: `curl -fsSL https://bun.sh/install | bash`
 
-If you see `META:UPDATE_AVAILABLE`:
-1. Parse the JSON payload to get `current`, `latest`, and `command`.
-2. Tell the user: "A gstack update is available (current: X, latest: Y). OK to update?"
-3. **STOP and wait for approval.**
-4. Run the command from the META payload.
-5. Re-run the setup check above to get the updated binary path.
-
 ## IMPORTANT
 
 - Use the compiled binary via Bash: `$B <command>`
@@ -242,25 +232,36 @@ $B css ".button" "background-color"
 The snapshot is your primary tool for understanding and interacting with pages.
 
 ```
--i        Interactive elements only (buttons, links, inputs) with @e refs
--c        Compact (no empty structural nodes)
--d <N>    Limit depth
--s <sel>  Scope to CSS selector
--D        Diff against previous snapshot (what changed?)
--a        Annotated screenshot with ref labels
--o <path> Output path for screenshot
--C        Cursor-interactive elements (@c refs — divs with pointer, onclick)
+-i        --interactive           Interactive elements only (buttons, links, inputs) with @e refs
+-c        --compact               Compact (no empty structural nodes)
+-d <N>    --depth                 Limit tree depth (0 = root only, default: unlimited)
+-s <sel>  --selector              Scope to CSS selector
+-D        --diff                  Unified diff against previous snapshot (first call stores baseline)
+-a        --annotate              Annotated screenshot with red overlay boxes and ref labels
+-o <path> --output                Output path for annotated screenshot (default: /tmp/browse-annotated.png)
+-C        --cursor-interactive    Cursor-interactive elements (@c refs — divs with pointer, onclick)
 ```
 
-Combine flags: `$B snapshot -i -a -C -o /tmp/annotated.png`
+All flags can be combined freely. `-o` only applies when `-a` is also used.
+Example: `$B snapshot -i -a -C -o /tmp/annotated.png`
+
+**Ref numbering:** @e refs are assigned sequentially (@e1, @e2, ...) in tree order.
+@c refs from `-C` are numbered separately (@c1, @c2, ...).
 
-After snapshot, use @refs everywhere:
+After snapshot, use @refs as selectors in any command:
 ```bash
 $B click @e3       $B fill @e4 "value"     $B hover @e1
 $B html @e2        $B css @e5 "color"      $B attrs @e6
 $B click @c1       # cursor-interactive ref (from -C)
 ```
 
+**Output format:** indented accessibility tree with @ref IDs, one element per line.
+```
+  @e1 [heading] "Welcome" [level=1]
+  @e2 [textbox] "Email"
+  @e3 [button] "Submit"
+```
+
 Refs are invalidated on navigation — run `snapshot` again after `goto`.
 
 ## Command Reference
@@ -279,7 +280,7 @@ Refs are invalidated on navigation — run `snapshot` again after `goto`.
 |---------|-------------|
 | `accessibility` | Full ARIA tree |
 | `forms` | Form fields as JSON |
-| `html [selector]` | innerHTML |
+| `html [selector]` | innerHTML of selector (throws if not found), or full page HTML if no selector given |
 | `links` | All links as "text → href" |
 | `text` | Cleaned page text |
 
@@ -287,22 +288,22 @@ Refs are invalidated on navigation — run `snapshot` again after `goto`.
 | Command | Description |
 |---------|-------------|
 | `click <sel>` | Click element |
-| `cookie` | Set cookie |
+| `cookie <name>=<value>` | Set cookie on current page domain |
 | `cookie-import <json>` | Import cookies from JSON file |
-| `cookie-import-browser [browser] [--domain d]` | Import cookies from real browser (opens picker UI, or direct with --domain) |
-| `dialog-accept [text]` | Auto-accept next alert/confirm/prompt |
+| `cookie-import-browser [browser] [--domain d]` | Import cookies from Comet, Chrome, Arc, Brave, or Edge (opens picker, or use --domain for direct import) |
+| `dialog-accept [text]` | Auto-accept next alert/confirm/prompt. Optional text is sent as the prompt response |
 | `dialog-dismiss` | Auto-dismiss next dialog |
 | `fill <sel> <val>` | Fill input |
-| `header <name> <value>` | Set custom request header |
+| `header <name>:<value>` | Set custom request header (colon-separated, sensitive values auto-redacted) |
 | `hover <sel>` | Hover element |
-| `press <key>` | Press key (Enter, Tab, Escape, etc.) |
-| `scroll [sel]` | Scroll element into view |
-| `select <sel> <val>` | Select dropdown option |
+| `press <key>` | Press key — Enter, Tab, Escape, ArrowUp/Down/Left/Right, Backspace, Delete, Home, End, PageUp, PageDown, or modifiers like Shift+Enter |
+| `scroll [sel]` | Scroll element into view, or scroll to page bottom if no selector |
+| `select <sel> <val>` | Select dropdown option by value, label, or visible text |
 | `type <text>` | Type into focused element |
-| `upload <sel> <file...>` | Upload file(s) |
+| `upload <sel> <file> [file2...]` | Upload file(s) |
 | `useragent <string>` | Set user agent |
 | `viewport <WxH>` | Set viewport size |
-| `wait <sel|--networkidle|--load>` | Wait for element/condition |
+| `wait <sel|--networkidle|--load>` | Wait for element, network idle, or page load (timeout: 15s) |
 
 ### Inspection
 | Command | Description |
@@ -312,30 +313,30 @@ Refs are invalidated on navigation — run `snapshot` again after `goto`.
 | `cookies` | All cookies as JSON |
 | `css <sel> <prop>` | Computed CSS value |
 | `dialog [--clear]` | Dialog messages |
-| `eval <file>` | Run JS file |
+| `eval <file>` | Run JavaScript from file and return result as string (path must be under /tmp or cwd) |
 | `is <prop> <sel>` | State check (visible/hidden/enabled/disabled/checked/editable/focused) |
-| `js <expr>` | Run JavaScript |
+| `js <expr>` | Run JavaScript expression and return result as string |
 | `network [--clear]` | Network requests |
 | `perf` | Page load timings |
-| `storage [set k v]` | localStorage + sessionStorage |
+| `storage [set k v]` | Read all localStorage + sessionStorage as JSON, or set <key> <value> to write localStorage |
 
 ### Visual
 | Command | Description |
 |---------|-------------|
 | `diff <url1> <url2>` | Text diff between pages |
 | `pdf [path]` | Save as PDF |
-| `responsive [prefix]` | Mobile/tablet/desktop screenshots |
+| `responsive [prefix]` | Screenshots at mobile (375x812), tablet (768x1024), desktop (1280x720). Saves as {prefix}-mobile.png etc. |
 | `screenshot [path]` | Save screenshot |
 
 ### Snapshot
 | Command | Description |
 |---------|-------------|
-| `snapshot [flags]` | Accessibility tree with @refs |
+| `snapshot [flags]` | Accessibility tree with @e refs for element selection. Flags: -i interactive only, -c compact, -d N depth limit, -s sel scope, -D diff vs previous, -a annotated screenshot, -o path output, -C cursor-interactive @c refs |
 
 ### Meta
 | Command | Description |
 |---------|-------------|
-| `chain` | Multi-command from JSON stdin |
+| `chain` | Run commands from JSON stdin. Format: [["cmd","arg1",...],...] |
 
 ### Tabs
 | Command | Description |
diff --git a/browse/SKILL.md b/browse/SKILL.md
index 7b9a6cff8b18d181935ea99c40ae16f8df55b91d..5e838649244bc27b0328d6a7b8abc89d7ece48ee 100644
--- a/browse/SKILL.md
+++ b/browse/SKILL.md
@@ -104,25 +104,36 @@ $B diff https://staging.app.com https://prod.app.com
 The snapshot is your primary tool for understanding and interacting with pages.
 
 ```
--i        Interactive elements only (buttons, links, inputs) with @e refs
--c        Compact (no empty structural nodes)
--d <N>    Limit depth
--s <sel>  Scope to CSS selector
--D        Diff against previous snapshot (what changed?)
--a        Annotated screenshot with ref labels
--o <path> Output path for screenshot
--C        Cursor-interactive elements (@c refs — divs with pointer, onclick)
+-i        --interactive           Interactive elements only (buttons, links, inputs) with @e refs
+-c        --compact               Compact (no empty structural nodes)
+-d <N>    --depth                 Limit tree depth (0 = root only, default: unlimited)
+-s <sel>  --selector              Scope to CSS selector
+-D        --diff                  Unified diff against previous snapshot (first call stores baseline)
+-a        --annotate              Annotated screenshot with red overlay boxes and ref labels
+-o <path> --output                Output path for annotated screenshot (default: /tmp/browse-annotated.png)
+-C        --cursor-interactive    Cursor-interactive elements (@c refs — divs with pointer, onclick)
 ```
 
-Combine flags: `$B snapshot -i -a -C -o /tmp/annotated.png`
+All flags can be combined freely. `-o` only applies when `-a` is also used.
+Example: `$B snapshot -i -a -C -o /tmp/annotated.png`
 
-After snapshot, use @refs everywhere:
+**Ref numbering:** @e refs are assigned sequentially (@e1, @e2, ...) in tree order.
+@c refs from `-C` are numbered separately (@c1, @c2, ...).
+
+After snapshot, use @refs as selectors in any command:
 ```bash
 $B click @e3       $B fill @e4 "value"     $B hover @e1
 $B html @e2        $B css @e5 "color"      $B attrs @e6
 $B click @c1       # cursor-interactive ref (from -C)
 ```
 
+**Output format:** indented accessibility tree with @ref IDs, one element per line.
+```
+  @e1 [heading] "Welcome" [level=1]
+  @e2 [textbox] "Email"
+  @e3 [button] "Submit"
+```
+
 Refs are invalidated on navigation — run `snapshot` again after `goto`.
 
 ## Full Command List
@@ -141,7 +152,7 @@ Refs are invalidated on navigation — run `snapshot` again after `goto`.
 |---------|-------------|
 | `accessibility` | Full ARIA tree |
 | `forms` | Form fields as JSON |
-| `html [selector]` | innerHTML |
+| `html [selector]` | innerHTML of selector (throws if not found), or full page HTML if no selector given |
 | `links` | All links as "text → href" |
 | `text` | Cleaned page text |
 
@@ -149,22 +160,22 @@ Refs are invalidated on navigation — run `snapshot` again after `goto`.
 | Command | Description |
 |---------|-------------|
 | `click <sel>` | Click element |
-| `cookie` | Set cookie |
+| `cookie <name>=<value>` | Set cookie on current page domain |
 | `cookie-import <json>` | Import cookies from JSON file |
-| `cookie-import-browser [browser] [--domain d]` | Import cookies from real browser (opens picker UI, or direct with --domain) |
-| `dialog-accept [text]` | Auto-accept next alert/confirm/prompt |
+| `cookie-import-browser [browser] [--domain d]` | Import cookies from Comet, Chrome, Arc, Brave, or Edge (opens picker, or use --domain for direct import) |
+| `dialog-accept [text]` | Auto-accept next alert/confirm/prompt. Optional text is sent as the prompt response |
 | `dialog-dismiss` | Auto-dismiss next dialog |
 | `fill <sel> <val>` | Fill input |
-| `header <name> <value>` | Set custom request header |
+| `header <name>:<value>` | Set custom request header (colon-separated, sensitive values auto-redacted) |
 | `hover <sel>` | Hover element |
-| `press <key>` | Press key (Enter, Tab, Escape, etc.) |
-| `scroll [sel]` | Scroll element into view |
-| `select <sel> <val>` | Select dropdown option |
+| `press <key>` | Press key — Enter, Tab, Escape, ArrowUp/Down/Left/Right, Backspace, Delete, Home, End, PageUp, PageDown, or modifiers like Shift+Enter |
+| `scroll [sel]` | Scroll element into view, or scroll to page bottom if no selector |
+| `select <sel> <val>` | Select dropdown option by value, label, or visible text |
 | `type <text>` | Type into focused element |
-| `upload <sel> <file...>` | Upload file(s) |
+| `upload <sel> <file> [file2...]` | Upload file(s) |
 | `useragent <string>` | Set user agent |
 | `viewport <WxH>` | Set viewport size |
-| `wait <sel|--networkidle|--load>` | Wait for element/condition |
+| `wait <sel|--networkidle|--load>` | Wait for element, network idle, or page load (timeout: 15s) |
 
 ### Inspection
 | Command | Description |
@@ -174,30 +185,30 @@ Refs are invalidated on navigation — run `snapshot` again after `goto`.
 | `cookies` | All cookies as JSON |
 | `css <sel> <prop>` | Computed CSS value |
 | `dialog [--clear]` | Dialog messages |
-| `eval <file>` | Run JS file |
+| `eval <file>` | Run JavaScript from file and return result as string (path must be under /tmp or cwd) |
 | `is <prop> <sel>` | State check (visible/hidden/enabled/disabled/checked/editable/focused) |
-| `js <expr>` | Run JavaScript |
+| `js <expr>` | Run JavaScript expression and return result as string |
 | `network [--clear]` | Network requests |
 | `perf` | Page load timings |
-| `storage [set k v]` | localStorage + sessionStorage |
+| `storage [set k v]` | Read all localStorage + sessionStorage as JSON, or set <key> <value> to write localStorage |
 
 ### Visual
 | Command | Description |
 |---------|-------------|
 | `diff <url1> <url2>` | Text diff between pages |
 | `pdf [path]` | Save as PDF |
-| `responsive [prefix]` | Mobile/tablet/desktop screenshots |
+| `responsive [prefix]` | Screenshots at mobile (375x812), tablet (768x1024), desktop (1280x720). Saves as {prefix}-mobile.png etc. |
 | `screenshot [path]` | Save screenshot |
 
 ### Snapshot
 | Command | Description |
 |---------|-------------|
-| `snapshot [flags]` | Accessibility tree with @refs |
+| `snapshot [flags]` | Accessibility tree with @e refs for element selection. Flags: -i interactive only, -c compact, -d N depth limit, -s sel scope, -D diff vs previous, -a annotated screenshot, -o path output, -C cursor-interactive @c refs |
 
 ### Meta
 | Command | Description |
 |---------|-------------|
-| `chain` | Multi-command from JSON stdin |
+| `chain` | Run commands from JSON stdin. Format: [["cmd","arg1",...],...] |
 
 ### Tabs
 | Command | Description |
diff --git a/browse/src/commands.ts b/browse/src/commands.ts
index c3189ace3d983517d69ecbc681a105cba85a15e0..6024c4b6fa70b1917054306d1069b23a5e8bddfe 100644
--- a/browse/src/commands.ts
+++ b/browse/src/commands.ts
@@ -43,13 +43,13 @@ export const COMMAND_DESCRIPTIONS: Record<string, { category: string; descriptio
   'url':     { category: 'Navigation', description: 'Print current URL' },
   // Reading
   'text':    { category: 'Reading', description: 'Cleaned page text' },
-  'html':    { category: 'Reading', description: 'innerHTML', usage: 'html [selector]' },
+  'html':    { category: 'Reading', description: 'innerHTML of selector (throws if not found), or full page HTML if no selector given', usage: 'html [selector]' },
   'links':   { category: 'Reading', description: 'All links as "text → href"' },
   'forms':   { category: 'Reading', description: 'Form fields as JSON' },
   'accessibility': { category: 'Reading', description: 'Full ARIA tree' },
   // Inspection
-  'js':      { category: 'Inspection', description: 'Run JavaScript', usage: 'js <expr>' },
-  'eval':    { category: 'Inspection', description: 'Run JS file', usage: 'eval <file>' },
+  'js':      { category: 'Inspection', description: 'Run JavaScript expression and return result as string', usage: 'js <expr>' },
+  'eval':    { category: 'Inspection', description: 'Run JavaScript from file and return result as string (path must be under /tmp or cwd)', usage: 'eval <file>' },
   'css':     { category: 'Inspection', description: 'Computed CSS value', usage: 'css <sel> <prop>' },
   'attrs':   { category: 'Inspection', description: 'Element attributes as JSON', usage: 'attrs <sel|@ref>' },
   'is':      { category: 'Inspection', description: 'State check (visible/hidden/enabled/disabled/checked/editable/focused)', usage: 'is <prop> <sel>' },
@@ -57,30 +57,30 @@ export const COMMAND_DESCRIPTIONS: Record<string, { category: string; descriptio
   'network': { category: 'Inspection', description: 'Network requests', usage: 'network [--clear]' },
   'dialog':  { category: 'Inspection', description: 'Dialog messages', usage: 'dialog [--clear]' },
   'cookies': { category: 'Inspection', description: 'All cookies as JSON' },
-  'storage': { category: 'Inspection', description: 'localStorage + sessionStorage', usage: 'storage [set k v]' },
+  'storage': { category: 'Inspection', description: 'Read all localStorage + sessionStorage as JSON, or set <key> <value> to write localStorage', usage: 'storage [set k v]' },
   'perf':    { category: 'Inspection', description: 'Page load timings' },
   // Interaction
   'click':   { category: 'Interaction', description: 'Click element', usage: 'click <sel>' },
   'fill':    { category: 'Interaction', description: 'Fill input', usage: 'fill <sel> <val>' },
-  'select':  { category: 'Interaction', description: 'Select dropdown option', usage: 'select <sel> <val>' },
+  'select':  { category: 'Interaction', description: 'Select dropdown option by value, label, or visible text', usage: 'select <sel> <val>' },
   'hover':   { category: 'Interaction', description: 'Hover element', usage: 'hover <sel>' },
   'type':    { category: 'Interaction', description: 'Type into focused element', usage: 'type <text>' },
-  'press':   { category: 'Interaction', description: 'Press key (Enter, Tab, Escape, etc.)', usage: 'press <key>' },
-  'scroll':  { category: 'Interaction', description: 'Scroll element into view', usage: 'scroll [sel]' },
-  'wait':    { category: 'Interaction', description: 'Wait for element/condition', usage: 'wait <sel|--networkidle|--load>' },
-  'upload':  { category: 'Interaction', description: 'Upload file(s)', usage: 'upload <sel> <file...>' },
+  'press':   { category: 'Interaction', description: 'Press key — Enter, Tab, Escape, ArrowUp/Down/Left/Right, Backspace, Delete, Home, End, PageUp, PageDown, or modifiers like Shift+Enter', usage: 'press <key>' },
+  'scroll':  { category: 'Interaction', description: 'Scroll element into view, or scroll to page bottom if no selector', usage: 'scroll [sel]' },
+  'wait':    { category: 'Interaction', description: 'Wait for element, network idle, or page load (timeout: 15s)', usage: 'wait <sel|--networkidle|--load>' },
+  'upload':  { category: 'Interaction', description: 'Upload file(s)', usage: 'upload <sel> <file> [file2...]' },
   'viewport':{ category: 'Interaction', description: 'Set viewport size', usage: 'viewport <WxH>' },
-  'cookie':  { category: 'Interaction', description: 'Set cookie' },
+  'cookie':  { category: 'Interaction', description: 'Set cookie on current page domain', usage: 'cookie <name>=<value>' },
   'cookie-import': { category: 'Interaction', description: 'Import cookies from JSON file', usage: 'cookie-import <json>' },
-  'cookie-import-browser': { category: 'Interaction', description: 'Import cookies from real browser (opens picker UI, or direct with --domain)', usage: 'cookie-import-browser [browser] [--domain d]' },
-  'header':  { category: 'Interaction', description: 'Set custom request header', usage: 'header <name> <value>' },
+  'cookie-import-browser': { category: 'Interaction', description: 'Import cookies from Comet, Chrome, Arc, Brave, or Edge (opens picker, or use --domain for direct import)', usage: 'cookie-import-browser [browser] [--domain d]' },
+  'header':  { category: 'Interaction', description: 'Set custom request header (colon-separated, sensitive values auto-redacted)', usage: 'header <name>:<value>' },
   'useragent': { category: 'Interaction', description: 'Set user agent', usage: 'useragent <string>' },
-  'dialog-accept': { category: 'Interaction', description: 'Auto-accept next alert/confirm/prompt', usage: 'dialog-accept [text]' },
+  'dialog-accept': { category: 'Interaction', description: 'Auto-accept next alert/confirm/prompt. Optional text is sent as the prompt response', usage: 'dialog-accept [text]' },
   'dialog-dismiss': { category: 'Interaction', description: 'Auto-dismiss next dialog' },
   // Visual
   'screenshot': { category: 'Visual', description: 'Save screenshot', usage: 'screenshot [path]' },
   'pdf':     { category: 'Visual', description: 'Save as PDF', usage: 'pdf [path]' },
-  'responsive': { category: 'Visual', description: 'Mobile/tablet/desktop screenshots', usage: 'responsive [prefix]' },
+  'responsive': { category: 'Visual', description: 'Screenshots at mobile (375x812), tablet (768x1024), desktop (1280x720). Saves as {prefix}-mobile.png etc.', usage: 'responsive [prefix]' },
   'diff':    { category: 'Visual', description: 'Text diff between pages', usage: 'diff <url1> <url2>' },
   // Tabs
   'tabs':    { category: 'Tabs', description: 'List open tabs' },
@@ -92,8 +92,8 @@ export const COMMAND_DESCRIPTIONS: Record<string, { category: string; descriptio
   'stop':    { category: 'Server', description: 'Shutdown server' },
   'restart': { category: 'Server', description: 'Restart server' },
   // Meta
-  'snapshot':{ category: 'Snapshot', description: 'Accessibility tree with @refs', usage: 'snapshot [flags]' },
-  'chain':   { category: 'Meta', description: 'Multi-command from JSON stdin' },
+  'snapshot':{ category: 'Snapshot', description: 'Accessibility tree with @e refs for element selection. Flags: -i interactive only, -c compact, -d N depth limit, -s sel scope, -D diff vs previous, -a annotated screenshot, -o path output, -C cursor-interactive @c refs', usage: 'snapshot [flags]' },
+  'chain':   { category: 'Meta', description: 'Run commands from JSON stdin. Format: [["cmd","arg1",...],...]' },
 };
 
 // Load-time validation: descriptions must cover exactly the command sets
diff --git a/browse/src/server.ts b/browse/src/server.ts
index 580bd67e75178a41f1db40878be57675218ae308..5e76f4214df06d93fa822d0d7f98258818eb7c90 100644
--- a/browse/src/server.ts
+++ b/browse/src/server.ts
@@ -18,6 +18,8 @@ import { handleReadCommand } from './read-commands';
 import { handleWriteCommand } from './write-commands';
 import { handleMetaCommand } from './meta-commands';
 import { handleCookiePickerRoute } from './cookie-picker-routes';
+import { COMMAND_DESCRIPTIONS } from './commands';
+import { SNAPSHOT_FLAGS } from './snapshot';
 import { resolveConfig, ensureStateDir, readVersionHash } from './config';
 import * as fs from 'fs';
 import * as path from 'path';
@@ -37,6 +39,47 @@ function validateAuth(req: Request): boolean {
   return header === `Bearer ${AUTH_TOKEN}`;
 }
 
+// ─── Help text (auto-generated from COMMAND_DESCRIPTIONS) ────────
+/** Build the `help` command output from COMMAND_DESCRIPTIONS and SNAPSHOT_FLAGS. */
+function generateHelpText(): string {
+  // Bucket each command's display string (its usage, else the bare name) by category.
+  const groups = new Map<string, string[]>();
+  for (const [cmd, meta] of Object.entries(COMMAND_DESCRIPTIONS)) {
+    const list = groups.get(meta.category) ?? [];
+    list.push(meta.usage || cmd);
+    groups.set(meta.category, list);
+  }
+
+  // Preferred display order. Categories missing from this list are appended after it,
+  // so a newly introduced category can never silently vanish from the help output.
+  const preferred = [
+    'Navigation', 'Reading', 'Interaction', 'Inspection',
+    'Visual', 'Snapshot', 'Meta', 'Tabs', 'Server',
+  ];
+  const categoryOrder = [...preferred, ...[...groups.keys()].filter((c) => !preferred.includes(c))];
+
+  const lines = ['gstack browse — headless browser for AI agents', '', 'Commands:'];
+  for (const cat of categoryOrder) {
+    const cmds = groups.get(cat);
+    if (!cmds) continue; // preferred category with no commands registered
+    lines.push(`  ${(cat + ':').padEnd(15)}${cmds.join(', ')}`);
+  }
+
+  // Snapshot flags from SNAPSHOT_FLAGS, rendered as "<short> [hint]  <long>".
+  lines.push('', 'Snapshot flags:');
+  const flagPairs = SNAPSHOT_FLAGS.map((flag) => {
+    const label = flag.valueHint ? `${flag.short} ${flag.valueHint}` : flag.short;
+    return `${label}  ${flag.long}`;
+  });
+  // Two flags per line: left column padded to 28 chars, right column optional.
+  for (let i = 0; i < flagPairs.length; i += 2) {
+    const left = flagPairs[i].padEnd(28);
+    lines.push(`  ${left}${flagPairs[i + 1] ?? ''}`);
+  }
+
+  return lines.join('\n');
+}
+
 // ─── Buffer (from buffers.ts) ────────────────────────────────────
 import { consoleBuffer, networkBuffer, dialogBuffer, addConsoleEntry, addNetworkEntry, addDialogEntry, type LogEntry, type NetworkEntry, type DialogEntry } from './buffers';
 export { consoleBuffer, networkBuffer, dialogBuffer, addConsoleEntry, addNetworkEntry, addDialogEntry, type LogEntry, type NetworkEntry, type DialogEntry };
@@ -191,29 +234,7 @@ async function handleCommand(body: any): Promise<Response> {
     } else if (META_COMMANDS.has(command)) {
       result = await handleMetaCommand(command, args, browserManager, shutdown);
     } else if (command === 'help') {
-      const helpText = [
-        'gstack browse — headless browser for AI agents',
-        '',
-        'Commands:',
-        '  Navigation:    goto <url>, back, forward, reload',
-        '  Interaction:   click <sel>, fill <sel> <text>, select <sel> <val>, hover, type, press, scroll, wait',
-        '  Read:          text [sel], html [sel], links, forms, accessibility, cookies, storage, console, network, perf',
-        '  Evaluate:      js <expr>, eval <expr>, css <sel> <prop>, attrs <sel>, is <sel> <state>',
-        '  Snapshot:      snapshot [-i] [-c] [-d N] [-s sel] [-D] [-a] [-o path] [-C]',
-        '  Screenshot:    screenshot [path], pdf [path], responsive <widths>',
-        '  Tabs:          tabs, tab <id>, newtab [url], closetab [id]',
-        '  State:         cookie <set|get|clear>, cookie-import <json>, cookie-import-browser [browser]',
-        '  Headers:       header <set|clear> [name] [value], useragent [string]',
-        '  Upload:        upload <sel> <file1> [file2...]',
-        '  Dialogs:       dialog, dialog-accept [text], dialog-dismiss',
-        '  Meta:          status, stop, restart, diff, chain, help',
-        '',
-        'Snapshot flags:',
-        '  -i  interactive only    -c  compact (remove empty nodes)',
-        '  -d N  limit depth       -s sel  scope to CSS selector',
-        '  -D  diff vs previous    -a  annotated screenshot with ref labels',
-        '  -o path  output file    -C  cursor-interactive elements',
-      ].join('\n');
+      const helpText = generateHelpText();
       return new Response(helpText, {
         status: 200,
         headers: { 'Content-Type': 'text/plain' },
diff --git a/browse/src/snapshot.ts b/browse/src/snapshot.ts
index d3a84b5e3ec1b709cffaa5dcfefdf7bab92c9635..a2a3aeea1cfd8a05a0363f71a3627d5b98b5f371 100644
--- a/browse/src/snapshot.ts
+++ b/browse/src/snapshot.ts
@@ -57,11 +57,11 @@ export const SNAPSHOT_FLAGS: Array<{
 }> = [
   { short: '-i', long: '--interactive', description: 'Interactive elements only (buttons, links, inputs) with @e refs', optionKey: 'interactive' },
   { short: '-c', long: '--compact', description: 'Compact (no empty structural nodes)', optionKey: 'compact' },
-  { short: '-d', long: '--depth', description: 'Limit depth', takesValue: true, valueHint: '<N>', optionKey: 'depth' },
+  { short: '-d', long: '--depth', description: 'Limit tree depth (0 = root only, default: unlimited)', takesValue: true, valueHint: '<N>', optionKey: 'depth' },
   { short: '-s', long: '--selector', description: 'Scope to CSS selector', takesValue: true, valueHint: '<sel>', optionKey: 'selector' },
-  { short: '-D', long: '--diff', description: 'Diff against previous snapshot (what changed?)', optionKey: 'diff' },
-  { short: '-a', long: '--annotate', description: 'Annotated screenshot with ref labels', optionKey: 'annotate' },
-  { short: '-o', long: '--output', description: 'Output path for screenshot', takesValue: true, valueHint: '<path>', optionKey: 'outputPath' },
+  { short: '-D', long: '--diff', description: 'Unified diff against previous snapshot (first call stores baseline)', optionKey: 'diff' },
+  { short: '-a', long: '--annotate', description: 'Annotated screenshot with red overlay boxes and ref labels', optionKey: 'annotate' },
+  { short: '-o', long: '--output', description: 'Output path for annotated screenshot (default: /tmp/browse-annotated.png)', takesValue: true, valueHint: '<path>', optionKey: 'outputPath' },
   { short: '-C', long: '--cursor-interactive', description: 'Cursor-interactive elements (@c refs — divs with pointer, onclick)', optionKey: 'cursorInteractive' },
 ];
 
diff --git a/scripts/gen-skill-docs.ts b/scripts/gen-skill-docs.ts
index 19b680004c1764a265435dd39298cbc26fb252e1..381278ca703dfb1e19b4800732dfae17f70eed46 100644
--- a/scripts/gen-skill-docs.ts
+++ b/scripts/gen-skill-docs.ts
@@ -64,20 +64,31 @@ function generateSnapshotFlags(): string {
 
   for (const flag of SNAPSHOT_FLAGS) {
     const label = flag.valueHint ? `${flag.short} ${flag.valueHint}` : flag.short;
-    lines.push(`${label.padEnd(10)}${flag.description}`);
+    lines.push(`${label.padEnd(10)}${flag.long.padEnd(24)}${flag.description}`);
   }
 
   lines.push('```');
   lines.push('');
-  lines.push('Combine flags: `$B snapshot -i -a -C -o /tmp/annotated.png`');
+  lines.push('All flags can be combined freely. `-o` only applies when `-a` is also used.');
+  lines.push('Example: `$B snapshot -i -a -C -o /tmp/annotated.png`');
   lines.push('');
-  lines.push('After snapshot, use @refs everywhere:');
+  lines.push('**Ref numbering:** @e refs are assigned sequentially (@e1, @e2, ...) in tree order.');
+  lines.push('@c refs from `-C` are numbered separately (@c1, @c2, ...).');
+  lines.push('');
+  lines.push('After snapshot, use @refs as selectors in any command:');
   lines.push('```bash');
   lines.push('$B click @e3       $B fill @e4 "value"     $B hover @e1');
   lines.push('$B html @e2        $B css @e5 "color"      $B attrs @e6');
   lines.push('$B click @c1       # cursor-interactive ref (from -C)');
   lines.push('```');
   lines.push('');
+  lines.push('**Output format:** indented accessibility tree with @ref IDs, one element per line.');
+  lines.push('```');
+  lines.push('  @e1 [heading] "Welcome" [level=1]');
+  lines.push('  @e2 [textbox] "Email"');
+  lines.push('  @e3 [button] "Submit"');
+  lines.push('```');
+  lines.push('');
   lines.push('Refs are invalidated on navigation — run `snapshot` again after `goto`.');
 
   return lines.join('\n');
diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts
index ce7c98ea712bf4247fd02b497fa7e7aacc1022da..9d3f3b9b4be518579f1f9be8f8770ca75e4101c1 100644
--- a/test/gen-skill-docs.test.ts
+++ b/test/gen-skill-docs.test.ts
@@ -139,6 +139,14 @@ describe('description quality evals', () => {
     }
   });
 
+  // Guard: descriptions must not contain a pipe character ('|' splits markdown table cells).
+  // Usage strings are exempt because the table backtick-wraps them (NOTE: strict GFM still needs \| there).
+  test('no command description contains pipe character', () => {
+    for (const [cmd, meta] of Object.entries(COMMAND_DESCRIPTIONS)) {
+      expect(meta.description).not.toContain('|');
+    }
+  });
+
   // Guard: generated output uses → not ->
   test('generated SKILL.md uses unicode arrows', () => {
     const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
diff --git a/test/skill-llm-eval.test.ts b/test/skill-llm-eval.test.ts
index 308de814e9a76d8ca4c33c44da53d8ea4874c9a5..f978f0352c003246045506149fb80e61ce513611 100644
--- a/test/skill-llm-eval.test.ts
+++ b/test/skill-llm-eval.test.ts
@@ -7,7 +7,7 @@
  * Requires: ANTHROPIC_API_KEY env var
  * Run: ANTHROPIC_API_KEY=sk-... bun test test/skill-llm-eval.test.ts
  *
- * Cost: ~$0.01-0.03 per run (haiku)
+ * Cost: ~$0.05-0.15 per run (sonnet)
  */
 
 import { describe, test, expect } from 'bun:test';
@@ -30,7 +30,7 @@ async function judge(section: string, prompt: string): Promise<JudgeScore> {
   const client = new Anthropic();
 
   const response = await client.messages.create({
-    model: 'claude-haiku-4-5-20251001',
+    model: 'claude-sonnet-4-6',
     max_tokens: 1024,
     messages: [{
       role: 'user',
@@ -158,7 +158,7 @@ describeEval('LLM-as-judge quality evals', () => {
 
     const client = new Anthropic();
     const response = await client.messages.create({
-      model: 'claude-haiku-4-5-20251001',
+      model: 'claude-sonnet-4-6',
       max_tokens: 1024,
       messages: [{
         role: 'user',
diff --git a/test/skill-validation.test.ts b/test/skill-validation.test.ts
index 1c4025a297189c2c4b7f59f38b781172f7059a14..4bf6b6dddd32a0ca8fbe503cf423f109c6948c89 100644
--- a/test/skill-validation.test.ts
+++ b/test/skill-validation.test.ts
@@ -80,6 +80,59 @@ describe('Command registry consistency', () => {
   });
 });
 
+describe('Usage string consistency', () => {
+  // Normalize a usage string to its structural skeleton for comparison.
+  // Replaces <param-names> with <>, [optional] with [], strips parenthetical hints.
+  // This catches format mismatches (e.g., <name>:<value> vs <name> <value>)
+  // without tripping on abbreviation differences (e.g., <sel> vs <selector>).
+  function skeleton(usage: string): string {
+    return usage
+      .replace(/\(.*?\)/g, '')        // strip parenthetical hints like (e.g., Enter, Tab)
+      .replace(/<[^>]*>/g, '<>')      // normalize <param-name> → <>
+      .replace(/\[[^\]]*\]/g, '[]')   // normalize [optional] → []
+      .replace(/\s+/g, ' ')           // collapse whitespace
+      .trim();
+  }
+
+  // Cross-check Usage: patterns in implementation against COMMAND_DESCRIPTIONS
+  test('implementation Usage: structural format matches COMMAND_DESCRIPTIONS', () => {
+    const implFiles = [
+      path.join(ROOT, 'browse', 'src', 'write-commands.ts'),
+      path.join(ROOT, 'browse', 'src', 'read-commands.ts'),
+      path.join(ROOT, 'browse', 'src', 'meta-commands.ts'),
+    ];
+
+    // Capture what follows "Usage: browse" in throw new Error(...) string literals (one source line each)
+    const usagePattern = /throw new Error\(['"`]Usage:\s*browse\s+(.+?)['"`]\)/g;
+    const implUsages = new Map<string, string>();
+
+    for (const file of implFiles) {
+      const content = fs.readFileSync(file, 'utf-8');
+      let match;
+      while ((match = usagePattern.exec(content)) !== null) {
+        const usage = match[1].split('\\n')[0].trim();
+        const cmd = usage.split(/\s/)[0];
+        implUsages.set(cmd, usage);
+      }
+    }
+
+    // Compare structural skeletons
+    const mismatches: string[] = [];
+    for (const [cmd, implUsage] of implUsages) {
+      const desc = COMMAND_DESCRIPTIONS[cmd];
+      if (!desc) continue;
+      if (!desc.usage) continue;
+      const descSkel = skeleton(desc.usage);
+      const implSkel = skeleton(implUsage);
+      if (descSkel !== implSkel) {
+        mismatches.push(`${cmd}: docs "${desc.usage}" (${descSkel}) vs impl "${implUsage}" (${implSkel})`);
+      }
+    }
+
+    expect(mismatches).toEqual([]);
+  });
+});
+
 describe('Generated SKILL.md freshness', () => {
   test('no unresolved {{placeholders}} in generated SKILL.md', () => {
     const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');