From d9611882769922d51f83c3dea975594caa1b8e1c Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Thu, 19 Mar 2026 00:31:26 -0500 Subject: [PATCH] fix: /qa never refuses browser testing on backend-only changes (#202) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: QA skill never refuses browser testing Add anti-refusal guardrails to /qa and /qa-only skills. When the user invokes /qa, the skill must always use the browser — even if the diff shows only backend/config changes with no obvious UI surface. Falls back to Quick mode (homepage + top 5 nav targets) when no specific pages are identified from the diff. Adds LLM-as-judge eval to verify the anti-refusal behavior. * chore: bump version and changelog (v0.8.1) Co-Authored-By: Claude Opus 4.6 --------- Co-authored-by: Claude Opus 4.6 --- CHANGELOG.md | 6 ++++ VERSION | 2 +- qa-only/SKILL.md | 3 ++ qa/SKILL.md | 3 ++ scripts/gen-skill-docs.ts | 5 +++- test/helpers/touchfiles.ts | 1 + test/skill-llm-eval.test.ts | 55 ++++++++++++++++++++++++++++++++++++- test/touchfiles.test.ts | 3 +- 8 files changed, 74 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0b90a4d0f1d8a54ba5ace8cf04b07b578d28c161..e05d64df5df30f1a72d95f8394b33bf691c9091d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## [0.8.1] - 2026-03-19 + +### Fixed + +- **`/qa` no longer refuses to use the browser on backend-only changes.** Previously, if your branch only changed prompt templates, config files, or service logic, `/qa` would analyze the diff, conclude "no UI to test," and suggest running evals instead. Now it always opens the browser — falling back to a Quick mode smoke test (homepage + top 5 navigation targets) when no specific pages are identified from the diff. 
+ ## [0.8.0] - 2026-03-19 — Multi-AI Second Opinion **`/codex` — get an independent second opinion from a completely different AI.** diff --git a/VERSION b/VERSION index a3df0a6959e154733da89a5d6063742ce6d5b851..6f4eebdf6f68fc72411793cdb19e3f1715b117f3 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.8.0 +0.8.1 diff --git a/qa-only/SKILL.md b/qa-only/SKILL.md index 75d70dedfe8eda6b10d271c62667ee68273368f9..45b5a46becaa19fdbed7f79588b7ff58d9f3b16f 100644 --- a/qa-only/SKILL.md +++ b/qa-only/SKILL.md @@ -234,6 +234,8 @@ This is the **primary mode** for developers verifying their work. When the user - API endpoints → test them directly with `$B js "await fetch('/api/...')"` - Static pages (markdown, HTML) → navigate to them directly + **If no obvious pages/routes are identified from the diff:** Do not skip browser testing. The user invoked /qa because they want browser-based verification. Fall back to Quick mode — navigate to the homepage, follow the top 5 navigation targets, check console for errors, and test any interactive elements found. Backend, config, and infrastructure changes affect app behavior — always verify the app still works. + 3. **Detect the running app** — check common local dev ports: ```bash $B goto http://localhost:3000 2>/dev/null && echo "Found app on :3000" || \ @@ -488,6 +490,7 @@ Minimum 0 per category. 9. **Never delete output files.** Screenshots and reports accumulate — that's intentional. 10. **Use `snapshot -C` for tricky UIs.** Finds clickable divs that the accessibility tree misses. 11. **Show screenshots to the user.** After every `$B screenshot`, `$B snapshot -a -o`, or `$B responsive` command, use the Read tool on the output file(s) so the user can see them inline. For `responsive` (3 files), Read all three. This is critical — without it, screenshots are invisible to the user. +12. **Never refuse to use the browser.** When the user invokes /qa or /qa-only, they are requesting browser-based testing. 
Never suggest evals, unit tests, or other alternatives as a substitute. Even if the diff appears to have no UI changes, backend changes affect app behavior — always open the browser and test. --- diff --git a/qa/SKILL.md b/qa/SKILL.md index 796d6a10f891a2fab01fb6889e7943b4f4dbb0a3..590c18d2c79948d64e39bf0bdb2e6dbf1547fd55 100644 --- a/qa/SKILL.md +++ b/qa/SKILL.md @@ -440,6 +440,8 @@ This is the **primary mode** for developers verifying their work. When the user - API endpoints → test them directly with `$B js "await fetch('/api/...')"` - Static pages (markdown, HTML) → navigate to them directly + **If no obvious pages/routes are identified from the diff:** Do not skip browser testing. The user invoked /qa because they want browser-based verification. Fall back to Quick mode — navigate to the homepage, follow the top 5 navigation targets, check console for errors, and test any interactive elements found. Backend, config, and infrastructure changes affect app behavior — always verify the app still works. + 3. **Detect the running app** — check common local dev ports: ```bash $B goto http://localhost:3000 2>/dev/null && echo "Found app on :3000" || \ @@ -694,6 +696,7 @@ Minimum 0 per category. 9. **Never delete output files.** Screenshots and reports accumulate — that's intentional. 10. **Use `snapshot -C` for tricky UIs.** Finds clickable divs that the accessibility tree misses. 11. **Show screenshots to the user.** After every `$B screenshot`, `$B snapshot -a -o`, or `$B responsive` command, use the Read tool on the output file(s) so the user can see them inline. For `responsive` (3 files), Read all three. This is critical — without it, screenshots are invisible to the user. +12. **Never refuse to use the browser.** When the user invokes /qa or /qa-only, they are requesting browser-based testing. Never suggest evals, unit tests, or other alternatives as a substitute. 
Even if the diff appears to have no UI changes, backend changes affect app behavior — always open the browser and test. Record baseline health score at end of Phase 6. diff --git a/scripts/gen-skill-docs.ts b/scripts/gen-skill-docs.ts index 7ace6c8d8beca79a47c863d97fbdc79581cd85ae..9f5460a356186645ac225293fa5e8d120f180e99 100644 --- a/scripts/gen-skill-docs.ts +++ b/scripts/gen-skill-docs.ts @@ -302,6 +302,8 @@ This is the **primary mode** for developers verifying their work. When the user - API endpoints → test them directly with \`$B js "await fetch('/api/...')"\` - Static pages (markdown, HTML) → navigate to them directly + **If no obvious pages/routes are identified from the diff:** Do not skip browser testing. The user invoked /qa because they want browser-based verification. Fall back to Quick mode — navigate to the homepage, follow the top 5 navigation targets, check console for errors, and test any interactive elements found. Backend, config, and infrastructure changes affect app behavior — always verify the app still works. + 3. **Detect the running app** — check common local dev ports: \`\`\`bash $B goto http://localhost:3000 2>/dev/null && echo "Found app on :3000" || \\ @@ -555,7 +557,8 @@ Minimum 0 per category. 8. **Depth over breadth.** 5-10 well-documented issues with evidence > 20 vague descriptions. 9. **Never delete output files.** Screenshots and reports accumulate — that's intentional. 10. **Use \`snapshot -C\` for tricky UIs.** Finds clickable divs that the accessibility tree misses. -11. **Show screenshots to the user.** After every \`$B screenshot\`, \`$B snapshot -a -o\`, or \`$B responsive\` command, use the Read tool on the output file(s) so the user can see them inline. For \`responsive\` (3 files), Read all three. This is critical — without it, screenshots are invisible to the user.`; +11. 
**Show screenshots to the user.** After every \`$B screenshot\`, \`$B snapshot -a -o\`, or \`$B responsive\` command, use the Read tool on the output file(s) so the user can see them inline. For \`responsive\` (3 files), Read all three. This is critical — without it, screenshots are invisible to the user. +12. **Never refuse to use the browser.** When the user invokes /qa or /qa-only, they are requesting browser-based testing. Never suggest evals, unit tests, or other alternatives as a substitute. Even if the diff appears to have no UI changes, backend changes affect app behavior — always open the browser and test.`; } function generateDesignReviewLite(_ctx: TemplateContext): string { diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index d92d9751baf12f45ff1dfc49f8242e0e9ab2328a..19eba66e1ed2fc46c2ac70b796836624f30d7e67 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -119,6 +119,7 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = { 'regression vs baseline': ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/commands.ts', 'test/fixtures/eval-baselines.json'], 'qa/SKILL.md workflow': ['qa/SKILL.md', 'qa/SKILL.md.tmpl'], 'qa/SKILL.md health rubric': ['qa/SKILL.md', 'qa/SKILL.md.tmpl'], + 'qa/SKILL.md anti-refusal': ['qa/SKILL.md', 'qa/SKILL.md.tmpl', 'qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'], 'cross-skill greptile consistency': ['review/SKILL.md', 'review/SKILL.md.tmpl', 'ship/SKILL.md', 'ship/SKILL.md.tmpl', 'review/greptile-triage.md', 'retro/SKILL.md', 'retro/SKILL.md.tmpl'], 'baseline score pinning': ['SKILL.md', 'SKILL.md.tmpl', 'test/fixtures/eval-baselines.json'], diff --git a/test/skill-llm-eval.test.ts b/test/skill-llm-eval.test.ts index 528d51157cc37c20f67eb8a4a4aef5ab546f2853..45ac44520df6c2d7778af61c58c3a43c0b45b924 100644 --- a/test/skill-llm-eval.test.ts +++ b/test/skill-llm-eval.test.ts @@ -256,7 +256,7 @@ Scores are 1-5 overall quality.`, // --- Part 7: QA skill quality evals (C6) --- -describeIfSelected('QA skill
quality evals', ['qa/SKILL.md workflow', 'qa/SKILL.md health rubric'], () => { +describeIfSelected('QA skill quality evals', ['qa/SKILL.md workflow', 'qa/SKILL.md health rubric', 'qa/SKILL.md anti-refusal'], () => { const qaContent = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8'); testIfSelected('qa/SKILL.md workflow', async () => { @@ -342,6 +342,59 @@ ${section}`); expect(scores.completeness).toBeGreaterThanOrEqual(3); expect(scores.actionability).toBeGreaterThanOrEqual(4); }, 30_000); + + testIfSelected('qa/SKILL.md anti-refusal', async () => { + const t0 = Date.now(); + // Extract both the diff-aware mode section and Important Rules section + const diffAwareStart = qaContent.indexOf('### Diff-aware'); + const diffAwareEnd = qaContent.indexOf('### Full'); + const rulesStart = qaContent.indexOf('## Important Rules'); + const rulesEnd = qaContent.indexOf('## Framework-Specific'); + const diffAwareSection = qaContent.slice(diffAwareStart, diffAwareEnd); + const rulesSection = qaContent.slice(rulesStart, rulesEnd); + + const result = await callJudge<{ would_browse: boolean; fallback_behavior: string; confidence: number; reasoning: string }>(`You are evaluating whether a QA testing skill document would cause an AI agent to USE THE BROWSER or REFUSE to use the browser in a specific scenario. + +SCENARIO: +A user runs /qa (a browser-based QA testing skill). The branch diff shows ONLY prompt template files and config file changes — no routes, views, controllers, components, or CSS were changed. The changes are "purely backend" with no obvious UI surface. + +QUESTION: Based on the document excerpts below, would the agent open the browser and test the application, or would it decline/refuse to browse and suggest running evals or unit tests instead? 
+ +DOCUMENT EXCERPT 1 (Diff-aware mode instructions): +${diffAwareSection} + +DOCUMENT EXCERPT 2 (Important Rules): +${rulesSection} + +Respond with ONLY valid JSON: +{ + "would_browse": true or false, + "fallback_behavior": "description of what the agent would do when no UI pages are identified from the diff", + "confidence": N (1-5, how confident you are in your answer), + "reasoning": "brief explanation" +} + +Rules: +- would_browse should be true if the document instructs the agent to always use the browser regardless of diff content +- would_browse should be false if the document allows the agent to skip browser testing for non-UI changes +- confidence: 5 = document is unambiguous, 1 = document is unclear or contradictory`); + + console.log('QA anti-refusal result:', JSON.stringify(result, null, 2)); + + evalCollector?.addTest({ + name: 'qa/SKILL.md anti-refusal', + suite: 'QA skill quality evals', + tier: 'llm-judge', + passed: result.would_browse === true && result.confidence >= 4, + duration_ms: Date.now() - t0, + cost_usd: 0.02, + judge_scores: { would_browse: result.would_browse ? 
1 : 0, confidence: result.confidence }, + judge_reasoning: result.reasoning, + }); + + expect(result.would_browse).toBe(true); + expect(result.confidence).toBeGreaterThanOrEqual(4); + }, 30_000); }); // --- Part 7: Cross-skill consistency judge (C7) --- diff --git a/test/touchfiles.test.ts b/test/touchfiles.test.ts index b3f844d85b2d5a5742801e60445c37a4d6587510..d89d533d45631cc758e979448ea450d209b487a8 100644 --- a/test/touchfiles.test.ts +++ b/test/touchfiles.test.ts @@ -123,7 +123,8 @@ describe('selectTests', () => { const result = selectTests(['qa/SKILL.md'], LLM_JUDGE_TOUCHFILES); expect(result.selected).toContain('qa/SKILL.md workflow'); expect(result.selected).toContain('qa/SKILL.md health rubric'); - expect(result.selected.length).toBe(2); + expect(result.selected).toContain('qa/SKILL.md anti-refusal'); + expect(result.selected.length).toBe(3); }); test('SKILL.md.tmpl root template selects root-dependent tests and routing tests', () => {