~cytrogen/gstack: fix: browse binary discovery broken for agents (v0.3.5) (#44)

15 files changed, 627 insertions(+), 47 deletions(-)

M CHANGELOG.md
M SKILL.md
M SKILL.md.tmpl
M TODOS.md
M VERSION
M browse/SKILL.md
M browse/SKILL.md.tmpl
M qa/SKILL.md
A qa/SKILL.md.tmpl
M scripts/gen-skill-docs.ts
M setup-browser-cookies/SKILL.md
A setup-browser-cookies/SKILL.md.tmpl
M test/helpers/session-runner.ts
M test/skill-e2e.test.ts
M test/skill-llm-eval.test.ts

M CHANGELOG.md => CHANGELOG.md +14 -1

@@ 22,12 22,25 @@
 - Old META-based upgrade instructions from qa and setup-browser-cookies SKILL.md files
 - Legacy `/tmp/gstack-latest-version` cache file (cleaned up by `setup` script)
 
-## Unreleased — 2026-03-14
+## 0.3.5 — 2026-03-14
+
+### Fixed
+- **Browse binary discovery broken for agents** — replaced `find-browse` indirection with explicit `browse/dist/browse` path in SKILL.md setup blocks. Agents were guessing `bin/browse` (wrong) instead of running `find-browse` to discover `browse/dist/browse` (correct).
+- **Update check exit code 1 misleading agents** — `[ -n "$_UPD" ] && echo "$_UPD"` returned exit code 1 when no update available, causing agents to think gstack was broken. Added `|| true`.
+- **browse/SKILL.md missing setup block** — `/browse` used `$B` in every example but never defined it. Added `{{BROWSE_SETUP}}` placeholder.
 
 ### Changed
 - Enriched 14 command descriptions with specific arg formats, valid values, error behavior, and return types
 - Fixed `header` usage from `<name> <value>` to `<name>:<value>` (matching actual implementation)
 - Added `cookie` usage syntax: `cookie <name>=<value>`
+- **Template system expanded** — added `{{UPDATE_CHECK}}` and `{{BROWSE_SETUP}}` placeholders to `gen-skill-docs.ts`. Converted `qa/SKILL.md` and `setup-browser-cookies/SKILL.md` to `.tmpl` templates. All 4 browse-using skills now generate from a single source of truth.
+- Setup block now checks workspace-local path first (for development), then falls back to global `~/.claude/skills/gstack/browse/dist/browse`
+
+### Added
+- 3 new e2e test cases for SKILL.md setup flow: happy path, NEEDS_SETUP, non-git-repo
+- LLM eval for setup block clarity (actionability + clarity >= 4)
+- `no such file or directory.*browse` error pattern in session-runner
+- TODO: convert remaining 5 non-browse skills to .tmpl files
 - Enriched 4 snapshot flag descriptions with defaults, output paths, and behavior details
 - Snapshot flags section now shows long flag names (`-i / --interactive`) alongside short
 - Added ref numbering explanation and output format example to snapshot docs

M SKILL.md => SKILL.md +6 -5

@@ 20,7 20,7 @@ allowed-tools:
 
 ```bash
 _UPD=$(~/.claude/skills/gstack/bin/gstack-update-check 2>/dev/null || .claude/skills/gstack/bin/gstack-update-check 2>/dev/null || true)
-[ -n "$_UPD" ] && echo "$_UPD"
+[ -n "$_UPD" ] && echo "$_UPD" || true
 ```
 
 If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (AskUserQuestion → upgrade if yes, `touch ~/.gstack/last-update-check` if no). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue.


@@ 33,8 33,11 @@ Auto-shuts down after 30 min idle. State persists between calls (cookies, tabs, 
 ## SETUP (run this check BEFORE any browse command)
 
 ```bash
-B=$(browse/bin/find-browse 2>/dev/null || ~/.claude/skills/gstack/browse/bin/find-browse 2>/dev/null)
-if [ -n "$B" ]; then
+_ROOT=$(git rev-parse --show-toplevel 2>/dev/null)
+B=""
+[ -n "$_ROOT" ] && [ -x "$_ROOT/.claude/skills/gstack/browse/dist/browse" ] && B="$_ROOT/.claude/skills/gstack/browse/dist/browse"
+[ -z "$B" ] && B=~/.claude/skills/gstack/browse/dist/browse
+if [ -x "$B" ]; then
   echo "READY: $B"
 else
   echo "NEEDS_SETUP"


@@ 58,8 61,6 @@ If `NEEDS_SETUP`:
 ### Test a user flow (login, signup, checkout, etc.)
 
 ```bash
-B=~/.claude/skills/gstack/browse/dist/browse
-
 # 1. Go to the page
 $B goto https://app.example.com/login

M SKILL.md.tmpl => SKILL.md.tmpl +2 -25

@@ 14,35 14,14 @@ allowed-tools:
 
 ---
 
-## Update Check (run first)
-
-```bash
-_UPD=$(~/.claude/skills/gstack/bin/gstack-update-check 2>/dev/null || .claude/skills/gstack/bin/gstack-update-check 2>/dev/null || true)
-[ -n "$_UPD" ] && echo "$_UPD"
-```
-
-If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (AskUserQuestion → upgrade if yes, `touch ~/.gstack/last-update-check` if no). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue.
+{{UPDATE_CHECK}}
 
 # gstack browse: QA Testing & Dogfooding
 
 Persistent headless Chromium. First call auto-starts (~3s), then ~100-200ms per command.
 Auto-shuts down after 30 min idle. State persists between calls (cookies, tabs, sessions).
 
-## SETUP (run this check BEFORE any browse command)
-
-```bash
-B=$(browse/bin/find-browse 2>/dev/null || ~/.claude/skills/gstack/browse/bin/find-browse 2>/dev/null)
-if [ -n "$B" ]; then
-  echo "READY: $B"
-else
-  echo "NEEDS_SETUP"
-fi
-```
-
-If `NEEDS_SETUP`:
-1. Tell the user: "gstack browse needs a one-time build (~10 seconds). OK to proceed?" Then STOP and wait.
-2. Run: `cd <SKILL_DIR> && ./setup`
-3. If `bun` is not installed: `curl -fsSL https://bun.sh/install | bash`
+{{BROWSE_SETUP}}
 
 ## IMPORTANT
 


@@ 56,8 35,6 @@ If `NEEDS_SETUP`:
 ### Test a user flow (login, signup, checkout, etc.)
 
 ```bash
-B=~/.claude/skills/gstack/browse/dist/browse
-
 # 1. Go to the page
 $B goto https://app.example.com/login

M TODOS.md => TODOS.md +12 -0

@@ 10,3 10,15 @@
 
 **Effort:** S (small)
 **Priority:** P3 (nice-to-have, revisit after adoption data)
+
+## Convert remaining skills to .tmpl files
+
+**What:** Convert ship/, review/, plan-ceo-review/, plan-eng-review/, retro/ SKILL.md files to .tmpl templates using the `{{UPDATE_CHECK}}` placeholder.
+
+**Why:** These 5 skills still have the update check preamble copy-pasted. When the preamble changes (like the `|| true` fix in v0.3.5), all 5 need manual updates. The `{{UPDATE_CHECK}}` resolver already exists in `scripts/gen-skill-docs.ts` — these skills just need to be converted.
+
+**Context:** The browse-using skills (SKILL.md, browse/, qa/, setup-browser-cookies/) were converted to .tmpl in v0.3.5. The remaining 5 skills only use `{{UPDATE_CHECK}}` (no `{{BROWSE_SETUP}}`), so the conversion is mechanical: replace the preamble with `{{UPDATE_CHECK}}`, add the path to `findTemplates()` in `scripts/gen-skill-docs.ts`, and commit both .tmpl + generated .md.
+
+**Depends on:** v0.3.5 shipping first (the `{{UPDATE_CHECK}}` resolver).
+**Effort:** S (small, ~20 min)
+**Priority:** P2 (prevents drift on next preamble change)

M VERSION => VERSION +1 -1

@@ 1,1 1,1 @@
-0.3.4
+0.3.5

M browse/SKILL.md => browse/SKILL.md +20 -1

@@ 20,7 20,7 @@ allowed-tools:
 
 ```bash
 _UPD=$(~/.claude/skills/gstack/bin/gstack-update-check 2>/dev/null || .claude/skills/gstack/bin/gstack-update-check 2>/dev/null || true)
-[ -n "$_UPD" ] && echo "$_UPD"
+[ -n "$_UPD" ] && echo "$_UPD" || true
 ```
 
 If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (AskUserQuestion → upgrade if yes, `touch ~/.gstack/last-update-check` if no). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue.


@@ 30,6 30,25 @@ If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/g
 Persistent headless Chromium. First call auto-starts (~3s), then ~100ms per command.
 State persists between calls (cookies, tabs, login sessions).
 
+## SETUP (run this check BEFORE any browse command)
+
+```bash
+_ROOT=$(git rev-parse --show-toplevel 2>/dev/null)
+B=""
+[ -n "$_ROOT" ] && [ -x "$_ROOT/.claude/skills/gstack/browse/dist/browse" ] && B="$_ROOT/.claude/skills/gstack/browse/dist/browse"
+[ -z "$B" ] && B=~/.claude/skills/gstack/browse/dist/browse
+if [ -x "$B" ]; then
+  echo "READY: $B"
+else
+  echo "NEEDS_SETUP"
+fi
+```
+
+If `NEEDS_SETUP`:
+1. Tell the user: "gstack browse needs a one-time build (~10 seconds). OK to proceed?" Then STOP and wait.
+2. Run: `cd <SKILL_DIR> && ./setup`
+3. If `bun` is not installed: `curl -fsSL https://bun.sh/install | bash`
+
 ## Core QA Patterns
 
 ### 1. Verify a page loads correctly

M browse/SKILL.md.tmpl => browse/SKILL.md.tmpl +3 -8

@@ 14,20 14,15 @@ allowed-tools:
 
 ---
 
-## Update Check (run first)
-
-```bash
-_UPD=$(~/.claude/skills/gstack/bin/gstack-update-check 2>/dev/null || .claude/skills/gstack/bin/gstack-update-check 2>/dev/null || true)
-[ -n "$_UPD" ] && echo "$_UPD"
-```
-
-If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (AskUserQuestion → upgrade if yes, `touch ~/.gstack/last-update-check` if no). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue.
+{{UPDATE_CHECK}}
 
 # browse: QA Testing & Dogfooding
 
 Persistent headless Chromium. First call auto-starts (~3s), then ~100ms per command.
 State persists between calls (cookies, tabs, login sessions).
 
+{{BROWSE_SETUP}}
+
 ## Core QA Patterns
 
 ### 1. Verify a page loads correctly

M qa/SKILL.md => qa/SKILL.md +10 -3

@@ 13,12 13,14 @@ allowed-tools:
   - Write
   - AskUserQuestion
 ---
+<!-- AUTO-GENERATED from SKILL.md.tmpl — do not edit directly -->
+<!-- Regenerate: bun run gen:skill-docs -->
 
 ## Update Check (run first)
 
 ```bash
 _UPD=$(~/.claude/skills/gstack/bin/gstack-update-check 2>/dev/null || .claude/skills/gstack/bin/gstack-update-check 2>/dev/null || true)
-[ -n "$_UPD" ] && echo "$_UPD"
+[ -n "$_UPD" ] && echo "$_UPD" || true
 ```
 
 If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (AskUserQuestion → upgrade if yes, `touch ~/.gstack/last-update-check` if no). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue.


@@ 43,9 45,14 @@ You are a QA engineer. Test web applications like a real user — click everythi
 
 **Find the browse binary:**
 
+## SETUP (run this check BEFORE any browse command)
+
 ```bash
-B=$(browse/bin/find-browse 2>/dev/null || ~/.claude/skills/gstack/browse/bin/find-browse 2>/dev/null)
-if [ -n "$B" ]; then
+_ROOT=$(git rev-parse --show-toplevel 2>/dev/null)
+B=""
+[ -n "$_ROOT" ] && [ -x "$_ROOT/.claude/skills/gstack/browse/dist/browse" ] && B="$_ROOT/.claude/skills/gstack/browse/dist/browse"
+[ -z "$B" ] && B=~/.claude/skills/gstack/browse/dist/browse
+if [ -x "$B" ]; then
   echo "READY: $B"
 else
   echo "NEEDS_SETUP"

A qa/SKILL.md.tmpl => qa/SKILL.md.tmpl +337 -0

@@ 0,0 1,337 @@
+---
+name: qa
+version: 1.0.0
+description: |
+  Systematically QA test a web application. Use when asked to "qa", "QA", "test this site",
+  "find bugs", "dogfood", or review quality. Four modes: diff-aware (automatic on feature
+  branches — analyzes git diff, identifies affected pages, tests them), full (systematic
+  exploration), quick (30-second smoke test), regression (compare against baseline). Produces
+  structured report with health score, screenshots, and repro steps.
+allowed-tools:
+  - Bash
+  - Read
+  - Write
+  - AskUserQuestion
+---
+
+{{UPDATE_CHECK}}
+
+# /qa: Systematic QA Testing
+
+You are a QA engineer. Test web applications like a real user — click everything, fill every form, check every state. Produce a structured report with evidence.
+
+## Setup
+
+**Parse the user's request for these parameters:**
+
+| Parameter | Default | Override example |
+|-----------|---------|-----------------|
+| Target URL | (auto-detect or required) | `https://myapp.com`, `http://localhost:3000` |
+| Mode | full | `--quick`, `--regression .gstack/qa-reports/baseline.json` |
+| Output dir | `.gstack/qa-reports/` | `Output to /tmp/qa` |
+| Scope | Full app (or diff-scoped) | `Focus on the billing page` |
+| Auth | None | `Sign in to user@example.com`, `Import cookies from cookies.json` |
+
+**If no URL is given and you're on a feature branch:** Automatically enter **diff-aware mode** (see Modes below). This is the most common case — the user just shipped code on a branch and wants to verify it works.
+
+**Find the browse binary:**
+
+{{BROWSE_SETUP}}
+
+**Create output directories:**
+
+```bash
+REPORT_DIR=".gstack/qa-reports"
+mkdir -p "$REPORT_DIR/screenshots"
+```
+
+---
+
+## Modes
+
+### Diff-aware (automatic when on a feature branch with no URL)
+
+This is the **primary mode** for developers verifying their work. When the user says `/qa` without a URL and the repo is on a feature branch, automatically:
+
+1. **Analyze the branch diff** to understand what changed:
+   ```bash
+   git diff main...HEAD --name-only
+   git log main..HEAD --oneline
+   ```
+
+2. **Identify affected pages/routes** from the changed files:
+   - Controller/route files → which URL paths they serve
+   - View/template/component files → which pages render them
+   - Model/service files → which pages use those models (check controllers that reference them)
+   - CSS/style files → which pages include those stylesheets
+   - API endpoints → test them directly with `$B js "await fetch('/api/...')"`
+   - Static pages (markdown, HTML) → navigate to them directly
+
+3. **Detect the running app** — check common local dev ports:
+   ```bash
+   # Probe the ports in order and stop at the first one that responds. A flat
+   # `a && x || b && y || c && z` chain is left-associative in sh, so a hit on :3000
+   # would go on to echo the :4000 and :8080 messages as well.
+   for _PORT in 3000 4000 8080; do
+     $B goto "http://localhost:$_PORT" 2>/dev/null && echo "Found app on :$_PORT" && break
+   done
+   ```
+   If no local app is found, check for a staging/preview URL in the PR or environment. If nothing works, ask the user for the URL.
+
+4. **Test each affected page/route:**
+   - Navigate to the page
+   - Take a screenshot
+   - Check console for errors
+   - If the change was interactive (forms, buttons, flows), test the interaction end-to-end
+   - Use `snapshot -D` before and after actions to verify the change had the expected effect
+
+5. **Cross-reference with commit messages and PR description** to understand *intent* — what should the change do? Verify it actually does that.
+
+6. **Report findings** scoped to the branch changes:
+   - "Changes tested: N pages/routes affected by this branch"
+   - For each: does it work? Screenshot evidence.
+   - Any regressions on adjacent pages?
+
+**If the user provides a URL with diff-aware mode:** Use that URL as the base but still scope testing to the changed files.
+
+### Full (default when URL is provided)
+Systematic exploration. Visit every reachable page. Document 5-10 well-evidenced issues. Produce health score. Takes 5-15 minutes depending on app size.
+
+### Quick (`--quick`)
+30-second smoke test. Visit homepage + top 5 navigation targets. Check: page loads? Console errors? Broken links? Produce health score. No detailed issue documentation.
+
+### Regression (`--regression <baseline>`)
+Run full mode, then load `baseline.json` from a previous run. Diff: which issues are fixed? Which are new? What's the score delta? Append regression section to report.
+
+---
+
+## Workflow
+
+### Phase 1: Initialize
+
+1. Find browse binary (see Setup above)
+2. Create output directories
+3. Copy report template from `qa/templates/qa-report-template.md` to output dir
+4. Start timer for duration tracking
+
+### Phase 2: Authenticate (if needed)
+
+**If the user specified auth credentials:**
+
+```bash
+$B goto <login-url>
+$B snapshot -i                    # find the login form
+$B fill @e3 "user@example.com"
+$B fill @e4 "[REDACTED]"         # NEVER include real passwords in report
+$B click @e5                      # submit
+$B snapshot -D                    # verify login succeeded
+```
+
+**If the user provided a cookie file:**
+
+```bash
+$B cookie-import cookies.json
+$B goto <target-url>
+```
+
+**If 2FA/OTP is required:** Ask the user for the code and wait.
+
+**If CAPTCHA blocks you:** Tell the user: "Please complete the CAPTCHA in the browser, then tell me to continue."
+
+### Phase 3: Orient
+
+Get a map of the application:
+
+```bash
+$B goto <target-url>
+$B snapshot -i -a -o "$REPORT_DIR/screenshots/initial.png"
+$B links                          # map navigation structure
+$B console --errors               # any errors on landing?
+```
+
+**Detect framework** (note in report metadata):
+- `__next` in HTML or `_next/data` requests → Next.js
+- `csrf-token` meta tag → Rails
+- `wp-content` in URLs → WordPress
+- Client-side routing with no page reloads → SPA
+
+**For SPAs:** The `links` command may return few results because navigation is client-side. Use `snapshot -i` to find nav elements (buttons, menu items) instead.
+
+### Phase 4: Explore
+
+Visit pages systematically. At each page:
+
+```bash
+$B goto <page-url>
+$B snapshot -i -a -o "$REPORT_DIR/screenshots/page-name.png"
+$B console --errors
+```
+
+Then follow the **per-page exploration checklist** (see `qa/references/issue-taxonomy.md`):
+
+1. **Visual scan** — Look at the annotated screenshot for layout issues
+2. **Interactive elements** — Click buttons, links, controls. Do they work?
+3. **Forms** — Fill and submit. Test empty, invalid, edge cases
+4. **Navigation** — Check all paths in and out
+5. **States** — Empty state, loading, error, overflow
+6. **Console** — Any new JS errors after interactions?
+7. **Responsiveness** — Check mobile viewport if relevant:
+   ```bash
+   $B viewport 375x812
+   $B screenshot "$REPORT_DIR/screenshots/page-mobile.png"
+   $B viewport 1280x720
+   ```
+
+**Depth judgment:** Spend more time on core features (homepage, dashboard, checkout, search) and less on secondary pages (about, terms, privacy).
+
+**Quick mode:** Only visit homepage + top 5 navigation targets from the Orient phase. Skip the per-page checklist — just check: loads? Console errors? Broken links visible?
+
+### Phase 5: Document
+
+Document each issue **immediately when found** — don't batch them.
+
+**Two evidence tiers:**
+
+**Interactive bugs** (broken flows, dead buttons, form failures):
+1. Take a screenshot before the action
+2. Perform the action
+3. Take a screenshot showing the result
+4. Use `snapshot -D` to show what changed
+5. Write repro steps referencing screenshots
+
+```bash
+$B screenshot "$REPORT_DIR/screenshots/issue-001-step-1.png"
+$B click @e5
+$B screenshot "$REPORT_DIR/screenshots/issue-001-result.png"
+$B snapshot -D
+```
+
+**Static bugs** (typos, layout issues, missing images):
+1. Take a single annotated screenshot showing the problem
+2. Describe what's wrong
+
+```bash
+$B snapshot -i -a -o "$REPORT_DIR/screenshots/issue-002.png"
+```
+
+**Write each issue to the report immediately** using the template format from `qa/templates/qa-report-template.md`.
+
+### Phase 6: Wrap Up
+
+1. **Compute health score** using the rubric below
+2. **Write "Top 3 Things to Fix"** — the 3 highest-severity issues
+3. **Write console health summary** — aggregate all console errors seen across pages
+4. **Update severity counts** in the summary table
+5. **Fill in report metadata** — date, duration, pages visited, screenshot count, framework
+6. **Save baseline** — write `baseline.json` with:
+   ```json
+   {
+     "date": "YYYY-MM-DD",
+     "url": "<target>",
+     "healthScore": N,
+     "issues": [{ "id": "ISSUE-001", "title": "...", "severity": "...", "category": "..." }],
+     "categoryScores": { "console": N, "links": N, ... }
+   }
+   ```
+
+**Regression mode:** After writing the report, load the baseline file. Compare:
+- Health score delta
+- Issues fixed (in baseline but not current)
+- New issues (in current but not baseline)
+- Append the regression section to the report
+
+---
+
+## Health Score Rubric
+
+Compute each category score (0-100), then take the weighted average.
+
+### Console (weight: 15%)
+- 0 errors → 100
+- 1-3 errors → 70
+- 4-10 errors → 40
+- 11+ errors → 10
+
+### Links (weight: 10%)
+- 0 broken → 100
+- Each broken link → -15 (minimum 0)
+
+### Per-Category Scoring (Visual, Functional, UX, Content, Performance, Accessibility)
+Each category starts at 100. Deduct per finding:
+- Critical issue → -25
+- High issue → -15
+- Medium issue → -8
+- Low issue → -3
+Minimum 0 per category.
+
+### Weights
+| Category | Weight |
+|----------|--------|
+| Console | 15% |
+| Links | 10% |
+| Visual | 10% |
+| Functional | 20% |
+| UX | 15% |
+| Performance | 10% |
+| Content | 5% |
+| Accessibility | 15% |
+
+### Final Score
+`score = Σ (category_score × weight)`
+
+---
+
+## Framework-Specific Guidance
+
+### Next.js
+- Check console for hydration errors (`Hydration failed`, `Text content did not match`)
+- Monitor `_next/data` requests in network — 404s indicate broken data fetching
+- Test client-side navigation (click links, don't just `goto`) — catches routing issues
+- Check for CLS (Cumulative Layout Shift) on pages with dynamic content
+
+### Rails
+- Check for N+1 query warnings in console (if development mode)
+- Verify CSRF token presence in forms
+- Test Turbo/Stimulus integration — do page transitions work smoothly?
+- Check for flash messages appearing and dismissing correctly
+
+### WordPress
+- Check for plugin conflicts (JS errors from different plugins)
+- Verify admin bar visibility for logged-in users
+- Test REST API endpoints (`/wp-json/`)
+- Check for mixed content warnings (common with WP)
+
+### General SPA (React, Vue, Angular)
+- Use `snapshot -i` for navigation — `links` command misses client-side routes
+- Check for stale state (navigate away and back — does data refresh?)
+- Test browser back/forward — does the app handle history correctly?
+- Check for memory leaks (monitor console after extended use)
+
+---
+
+## Important Rules
+
+1. **Repro is everything.** Every issue needs at least one screenshot. No exceptions.
+2. **Verify before documenting.** Retry the issue once to confirm it's reproducible, not a fluke.
+3. **Never include credentials.** Write `[REDACTED]` for passwords in repro steps.
+4. **Write incrementally.** Append each issue to the report as you find it. Don't batch.
+5. **Never read source code to hunt for bugs.** Test as a user, not a developer. (Diff-aware mode still uses the branch diff to decide which pages to test.)
+6. **Check console after every interaction.** JS errors that don't surface visually are still bugs.
+7. **Test like a user.** Use realistic data. Walk through complete workflows end-to-end.
+8. **Depth over breadth.** 5-10 well-documented issues with evidence > 20 vague descriptions.
+9. **Never delete output files.** Screenshots and reports accumulate — that's intentional.
+10. **Use `snapshot -C` for tricky UIs.** Finds clickable divs that the accessibility tree misses.
+
+---
+
+## Output Structure
+
+```
+.gstack/qa-reports/
+├── qa-report-{domain}-{YYYY-MM-DD}.md    # Structured report
+├── screenshots/
+│   ├── initial.png                        # Landing page annotated screenshot
+│   ├── issue-001-step-1.png               # Per-issue evidence
+│   ├── issue-001-result.png
+│   └── ...
+└── baseline.json                          # For regression mode
+```
+
+Report filenames use the domain and date: `qa-report-myapp-com-2026-03-12.md`

M scripts/gen-skill-docs.ts => scripts/gen-skill-docs.ts +36 -0

@@ 94,9 94,43 @@ function generateSnapshotFlags(): string {
   return lines.join('\n');
 }
 
+/**
+ * Builds the Markdown for the "Update Check (run first)" preamble; registered as the
+ * resolver for the `UPDATE_CHECK` placeholder.
+ *
+ * The emitted script runs `gstack-update-check` from the global install, then from the
+ * workspace-relative path, and echoes whatever it printed. The trailing `|| true` keeps
+ * the snippet's exit status at 0 when `$_UPD` is empty (no update available).
+ *
+ * @returns The section text, starting at its `##` heading, with no trailing newline.
+ */
+function generateUpdateCheck(): string {
+  return `## Update Check (run first)
+
+\`\`\`bash
+_UPD=$(~/.claude/skills/gstack/bin/gstack-update-check 2>/dev/null || .claude/skills/gstack/bin/gstack-update-check 2>/dev/null || true)
+[ -n "$_UPD" ] && echo "$_UPD" || true
+\`\`\`
+
+If output shows \`UPGRADE_AVAILABLE <old> <new>\`: read \`~/.claude/skills/gstack/gstack-upgrade/SKILL.md\` and follow the "Inline upgrade flow" (AskUserQuestion → upgrade if yes, \`touch ~/.gstack/last-update-check\` if no). If \`JUST_UPGRADED <from> <to>\`: tell user "Running gstack v{to} (just updated!)" and continue.`;
+}
+
+/**
+ * Builds the Markdown for the "SETUP" section that tells an agent how to locate the
+ * browse binary; registered as the resolver for the `BROWSE_SETUP` placeholder.
+ *
+ * The emitted script prefers an executable at
+ * `<git toplevel>/.claude/skills/gstack/browse/dist/browse`, falls back to the same path
+ * under `~`, and prints either `READY: <path>` or `NEEDS_SETUP`. `git rev-parse` errors
+ * are discarded, so outside a repository `_ROOT` is empty and only the fallback is tried.
+ *
+ * The e2e and LLM-eval tests find this section by its `## SETUP` heading, so keep the
+ * heading text stable.
+ *
+ * @returns The section text, starting at its `##` heading, with no trailing newline.
+ */
+function generateBrowseSetup(): string {
+  return `## SETUP (run this check BEFORE any browse command)
+
+\`\`\`bash
+_ROOT=$(git rev-parse --show-toplevel 2>/dev/null)
+B=""
+[ -n "$_ROOT" ] && [ -x "$_ROOT/.claude/skills/gstack/browse/dist/browse" ] && B="$_ROOT/.claude/skills/gstack/browse/dist/browse"
+[ -z "$B" ] && B=~/.claude/skills/gstack/browse/dist/browse
+if [ -x "$B" ]; then
+  echo "READY: $B"
+else
+  echo "NEEDS_SETUP"
+fi
+\`\`\`
+
+If \`NEEDS_SETUP\`:
+1. Tell the user: "gstack browse needs a one-time build (~10 seconds). OK to proceed?" Then STOP and wait.
+2. Run: \`cd <SKILL_DIR> && ./setup\`
+3. If \`bun\` is not installed: \`curl -fsSL https://bun.sh/install | bash\``;
+}
+
+// Placeholder name -> generator for its replacement text (e.g. `{{UPDATE_CHECK}}` in a
+// .tmpl file). Presumably consumed by the template processing below — confirm there.
 const RESOLVERS: Record<string, () => string> = {
   COMMAND_REFERENCE: generateCommandReference,
   SNAPSHOT_FLAGS: generateSnapshotFlags,
+  UPDATE_CHECK: generateUpdateCheck,
+  BROWSE_SETUP: generateBrowseSetup,
 };
 
 // ─── Template Processing ────────────────────────────────────


@@ 141,6 175,8 @@ function findTemplates(): string[] {
   const candidates = [
     path.join(ROOT, 'SKILL.md.tmpl'),
     path.join(ROOT, 'browse', 'SKILL.md.tmpl'),
+    path.join(ROOT, 'qa', 'SKILL.md.tmpl'),
+    path.join(ROOT, 'setup-browser-cookies', 'SKILL.md.tmpl'),
   ];
   for (const p of candidates) {
     if (fs.existsSync(p)) templates.push(p);

M setup-browser-cookies/SKILL.md => setup-browser-cookies/SKILL.md +10 -3

@@ 10,12 10,14 @@ allowed-tools:
   - Read
   - AskUserQuestion
 ---
+<!-- AUTO-GENERATED from SKILL.md.tmpl — do not edit directly -->
+<!-- Regenerate: bun run gen:skill-docs -->
 
 ## Update Check (run first)
 
 ```bash
 _UPD=$(~/.claude/skills/gstack/bin/gstack-update-check 2>/dev/null || .claude/skills/gstack/bin/gstack-update-check 2>/dev/null || true)
-[ -n "$_UPD" ] && echo "$_UPD"
+[ -n "$_UPD" ] && echo "$_UPD" || true
 ```
 
 If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (AskUserQuestion → upgrade if yes, `touch ~/.gstack/last-update-check` if no). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue.


@@ 35,9 37,14 @@ Import logged-in sessions from your real Chromium browser into the headless brow
 
 ### 1. Find the browse binary
 
+## SETUP (run this check BEFORE any browse command)
+
 ```bash
-B=$(browse/bin/find-browse 2>/dev/null || ~/.claude/skills/gstack/browse/bin/find-browse 2>/dev/null)
-if [ -n "$B" ]; then
+_ROOT=$(git rev-parse --show-toplevel 2>/dev/null)
+B=""
+[ -n "$_ROOT" ] && [ -x "$_ROOT/.claude/skills/gstack/browse/dist/browse" ] && B="$_ROOT/.claude/skills/gstack/browse/dist/browse"
+[ -z "$B" ] && B=~/.claude/skills/gstack/browse/dist/browse
+if [ -x "$B" ]; then
   echo "READY: $B"
 else
   echo "NEEDS_SETUP"

A setup-browser-cookies/SKILL.md.tmpl => setup-browser-cookies/SKILL.md.tmpl +73 -0

@@ 0,0 1,73 @@
+---
+name: setup-browser-cookies
+version: 1.0.0
+description: |
+  Import cookies from your real browser (Comet, Chrome, Arc, Brave, Edge) into the
+  headless browse session. Opens an interactive picker UI where you select which
+  cookie domains to import. Use before QA testing authenticated pages.
+allowed-tools:
+  - Bash
+  - Read
+  - AskUserQuestion
+---
+
+{{UPDATE_CHECK}}
+
+# Setup Browser Cookies
+
+Import logged-in sessions from your real Chromium browser into the headless browse session.
+
+## How it works
+
+1. Find the browse binary
+2. Run `cookie-import-browser` to detect installed browsers and open the picker UI
+3. User selects which cookie domains to import in their browser
+4. Cookies are decrypted and loaded into the Playwright session
+
+## Steps
+
+### 1. Find the browse binary
+
+{{BROWSE_SETUP}}
+
+### 2. Open the cookie picker
+
+```bash
+$B cookie-import-browser
+```
+
+This auto-detects installed Chromium browsers (Comet, Chrome, Arc, Brave, Edge) and opens
+an interactive picker UI in your default browser where you can:
+- Switch between installed browsers
+- Search domains
+- Click "+" to import a domain's cookies
+- Click trash to remove imported cookies
+
+Tell the user: **"Cookie picker opened — select the domains you want to import in your browser, then tell me when you're done."**
+
+### 3. Direct import (alternative)
+
+If the user specifies a domain directly (e.g., `/setup-browser-cookies github.com`), skip the UI:
+
+```bash
+$B cookie-import-browser comet --domain github.com
+```
+
+Replace `comet` with the appropriate browser if specified.
+
+### 4. Verify
+
+After the user confirms they're done:
+
+```bash
+$B cookies
+```
+
+Show the user a summary of imported cookies (domain counts).
+
+## Notes
+
+- First import per browser may trigger a macOS Keychain dialog — click "Allow" / "Always Allow"
+- Cookie picker is served on the same port as the browse server (no extra process)
+- Only domain names and cookie counts are shown in the UI — no cookie values are exposed
+- The browse session persists cookies between commands, so imported cookies work immediately

M test/helpers/session-runner.ts => test/helpers/session-runner.ts +1 -0

@@ 23,6 23,7 @@ const BROWSE_ERROR_PATTERNS = [
   /Exit code 1/,
   /ERROR: browse binary not found/,
   /Server failed to start/,
+  /no such file or directory.*browse/i,
 ];
 
 export async function runSkillTest(options: {

M test/skill-e2e.test.ts => test/skill-e2e.test.ts +89 -0

@@ 73,6 73,95 @@ Report what each command returned.`,
     expect(result.exitReason).toBe('success');
   }, 90_000);
 
+  // Happy path: hand the agent the generated SETUP section verbatim and check that it can
+  // locate the browse binary and drive it against the local test server.
+  test('agent discovers browse binary via SKILL.md setup block', async () => {
+    const ROOT = path.resolve(import.meta.dir, '..');
+    const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
+    const setupStart = skillMd.indexOf('## SETUP');
+    const setupEnd = skillMd.indexOf('## IMPORTANT');
+
+    // Guard: both headings must exist, in order. indexOf() returns -1 on a miss, and a
+    // negative slice() bound would silently hand the agent the wrong text.
+    expect(setupStart).toBeGreaterThanOrEqual(0);
+    expect(setupEnd).toBeGreaterThan(setupStart);
+    const setupBlock = skillMd.slice(setupStart, setupEnd);
+
+    // Guard: verify we extracted a valid setup block
+    expect(setupBlock).toContain('browse/dist/browse');
+
+    const result = await runSkillTest({
+      prompt: `Follow these instructions to find the browse binary and run a basic command.
+
+${setupBlock}
+
+After finding the binary, run: $B goto ${testServer.url}
+Then run: $B text
+Report whether it worked.`,
+      workingDirectory: tmpDir,
+      maxTurns: 10,
+      timeout: 60_000,
+    });
+
+    expect(result.browseErrors).toHaveLength(0);
+    expect(result.exitReason).toBe('success');
+  }, 90_000);
+
+  // Missing-binary path: run the SETUP script from a fresh directory that has no
+  // workspace-local browse build and expect the NEEDS_SETUP marker.
+  test('SKILL.md setup block shows NEEDS_SETUP when binary missing', async () => {
+    // Create a tmpdir with no browse binary
+    const emptyDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-empty-'));
+
+    try {
+      const ROOT = path.resolve(import.meta.dir, '..');
+      const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
+      const setupStart = skillMd.indexOf('## SETUP');
+      const setupEnd = skillMd.indexOf('## IMPORTANT');
+
+      // Guard: both headings must exist, in order (indexOf() returns -1 on a miss)
+      expect(setupStart).toBeGreaterThanOrEqual(0);
+      expect(setupEnd).toBeGreaterThan(setupStart);
+      const setupBlock = skillMd.slice(setupStart, setupEnd);
+
+      const result = await runSkillTest({
+        prompt: `Follow these instructions exactly. Run the bash code block below and report what it outputs.
+
+${setupBlock}
+
+Report the exact output. Do NOT try to fix or install anything — just report what you see.`,
+        workingDirectory: emptyDir,
+        maxTurns: 5,
+        timeout: 30_000,
+      });
+
+      // Agent should see NEEDS_SETUP (not crash or guess wrong paths)
+      // NOTE(review): the script falls back to the global ~/.claude install, so this
+      // presumably reports READY on a machine where gstack is already set up, and the
+      // serialized messages may also echo the script text itself (which contains the
+      // literal NEEDS_SETUP) — confirm this assertion can actually fail.
+      const allText = result.messages
+        .map((m: unknown) => JSON.stringify(m))
+        .join('\n');
+      expect(allText).toContain('NEEDS_SETUP');
+    } finally {
+      // Clean up even when an assertion above throws, so failed runs don't leak temp dirs
+      try { fs.rmSync(emptyDir, { recursive: true, force: true }); } catch {}
+    }
+  }, 60_000);
+
+  // Non-repo path: `git rev-parse` fails here, so the script must fall through to the
+  // global install check and still print one of its two markers.
+  test('SKILL.md setup block works outside git repo', async () => {
+    // Create a tmpdir outside any git repo
+    const nonGitDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-nogit-'));
+
+    try {
+      const ROOT = path.resolve(import.meta.dir, '..');
+      const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
+      const setupStart = skillMd.indexOf('## SETUP');
+      const setupEnd = skillMd.indexOf('## IMPORTANT');
+
+      // Guard: both headings must exist, in order (indexOf() returns -1 on a miss)
+      expect(setupStart).toBeGreaterThanOrEqual(0);
+      expect(setupEnd).toBeGreaterThan(setupStart);
+      const setupBlock = skillMd.slice(setupStart, setupEnd);
+
+      const result = await runSkillTest({
+        prompt: `Follow these instructions exactly. Run the bash code block below and report what it outputs.
+
+${setupBlock}
+
+Report the exact output — either "READY: <path>" or "NEEDS_SETUP".`,
+        workingDirectory: nonGitDir,
+        maxTurns: 5,
+        timeout: 30_000,
+      });
+
+      // Should either find global binary (READY) or show NEEDS_SETUP — not crash
+      const allText = result.messages
+        .map((m: unknown) => JSON.stringify(m))
+        .join('\n');
+      expect(allText).toMatch(/READY|NEEDS_SETUP/);
+    } finally {
+      // Clean up even when an assertion above throws, so failed runs don't leak temp dirs
+      try { fs.rmSync(nonGitDir, { recursive: true, force: true }); } catch {}
+    }
+  }, 60_000);
+
   test.todo('/qa quick completes without browse errors');
   test.todo('/ship completes without browse errors');
   test.todo('/review completes without browse errors');

M test/skill-llm-eval.test.ts => test/skill-llm-eval.test.ts +13 -0

@@ 115,6 115,19 @@ describeEval('LLM-as-judge quality evals', () => {
     expect(scores.actionability).toBeGreaterThanOrEqual(4);
   }, 30_000);
 
+  // LLM-as-judge check on the generated SETUP section of the root SKILL.md: both the
+  // actionability and clarity scores must reach 4.
+  test('setup block scores >= 4 on actionability and clarity', async () => {
+    const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
+    const setupStart = content.indexOf('## SETUP');
+    const setupEnd = content.indexOf('## IMPORTANT');
+
+    // Guard: both headings must exist, in order. Without this a missing heading would
+    // send an empty or unrelated slice to the judge instead of failing here.
+    expect(setupStart).toBeGreaterThanOrEqual(0);
+    expect(setupEnd).toBeGreaterThan(setupStart);
+    const section = content.slice(setupStart, setupEnd);
+
+    const scores = await judge('setup/binary discovery instructions', section);
+    console.log('Setup block scores:', JSON.stringify(scores, null, 2));
+
+    expect(scores.actionability).toBeGreaterThanOrEqual(4);
+    expect(scores.clarity).toBeGreaterThanOrEqual(4);
+  }, 30_000);
+
   test('regression check: compare branch vs baseline quality', async () => {
     // This test compares the generated output against the hand-maintained
     // baseline from main. The generated version should score equal or higher.