import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { runSkillTest } from './helpers/session-runner';
import type { SkillTestResult } from './helpers/session-runner';
import { outcomeJudge } from './helpers/llm-judge';
import { EvalCollector } from './helpers/eval-store';
import type { EvalTestEntry } from './helpers/eval-store';
import { startTestServer } from '../browse/test/test-server';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';

const ROOT = path.resolve(import.meta.dir, '..');

// Skip unless EVALS=1. Session runner strips CLAUDE* env vars to avoid nested session issues.
const evalsEnabled = !!process.env.EVALS;
const describeE2E = evalsEnabled ? describe : describe.skip;

// Eval result collector — accumulates test results, writes to ~/.gstack-dev/evals/ on finalize
const evalCollector = evalsEnabled ? new EvalCollector('e2e') : null;

/**
 * DRY helper to record an E2E test result into the eval collector.
 *
 * A result counts as passed only when the session exited with 'success'
 * and produced no browse errors. No-op when the collector is disabled.
 *
 * @param name   Test name stored in the eval entry.
 * @param suite  Suite (describe block) name stored in the eval entry.
 * @param result Session result returned by runSkillTest.
 * @param extra  Optional extra fields merged over the defaults (e.g. judge scores).
 */
function recordE2E(name: string, suite: string, result: SkillTestResult, extra?: Partial<EvalTestEntry>) {
  evalCollector?.addTest({
    name,
    suite,
    tier: 'e2e',
    passed: result.exitReason === 'success' && result.browseErrors.length === 0,
    duration_ms: result.duration,
    cost_usd: result.costEstimate.estimatedCost,
    transcript: result.transcript,
    // Keep stored output bounded.
    output: result.output?.slice(0, 2000),
    turns_used: result.costEstimate.turnsUsed,
    browse_errors: result.browseErrors,
    ...extra,
  });
}

type TestServer = ReturnType<typeof startTestServer>;

// Shared across suites. Cleared by stopTestServer() so a later suite never
// reuses an instance that an earlier suite's afterAll already stopped.
let testServer: TestServer | undefined;
let tmpDir: string;
const browseBin = path.resolve(ROOT, 'browse', 'dist', 'browse');

/**
 * Return the shared test server, starting it first if none is currently running.
 */
function ensureTestServer(): TestServer {
  if (!testServer) {
    testServer = startTestServer();
  }
  return testServer;
}

/**
 * Stop the shared test server (if any) and forget it, so the next
 * ensureTestServer() call starts a fresh instance.
 */
function stopTestServer(): void {
  testServer?.server?.stop();
  testServer = undefined;
}

/**
 * Best-effort recursive removal of a temp directory; never throws.
 */
function removeDirQuietly(dir: string | undefined): void {
  if (!dir) return;
  try {
    fs.rmSync(dir, { recursive: true, force: true });
  } catch {
    /* non-fatal: leftover temp dirs must not fail the run */
  }
}

/**
 * Read ROOT/SKILL.md and return the text from the '## SETUP' heading up to
 * (not including) the '## IMPORTANT' heading.
 *
 * @throws Error when either marker is missing or they appear out of order,
 *         instead of silently slicing with a -1 index.
 */
function readSetupBlock(): string {
  const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
  const setupStart = skillMd.indexOf('## SETUP');
  const setupEnd = skillMd.indexOf('## IMPORTANT');
  if (setupStart === -1 || setupEnd <= setupStart) {
    throw new Error('SKILL.md: could not locate the "## SETUP" section followed by "## IMPORTANT"');
  }
  return skillMd.slice(setupStart, setupEnd);
}

/**
 * Copy a directory tree recursively (files only, follows structure).
 */
function copyDirSync(src: string, dest: string) {
  fs.mkdirSync(dest, { recursive: true });
  for (const entry of fs.readdirSync(src, { withFileTypes: true })) {
    const srcPath = path.join(src, entry.name);
    const destPath = path.join(dest, entry.name);
    if (entry.isDirectory()) {
      copyDirSync(srcPath, destPath);
    } else {
      fs.copyFileSync(srcPath, destPath);
    }
  }
}

/**
 * Set up browse shims (binary symlink, find-browse, remote-slug) in a tmpDir.
 */
function setupBrowseShims(dir: string) {
  // Symlink browse binary (skipped when the binary has not been built)
  const binDir = path.join(dir, 'browse', 'dist');
  fs.mkdirSync(binDir, { recursive: true });
  if (fs.existsSync(browseBin)) {
    fs.symlinkSync(browseBin, path.join(binDir, 'browse'));
  }

  // find-browse shim
  const findBrowseDir = path.join(dir, 'browse', 'bin');
  fs.mkdirSync(findBrowseDir, { recursive: true });
  fs.writeFileSync(
    path.join(findBrowseDir, 'find-browse'),
    `#!/bin/bash\necho "${browseBin}"\n`,
    { mode: 0o755 },
  );

  // remote-slug shim (returns test-project)
  fs.writeFileSync(
    path.join(findBrowseDir, 'remote-slug'),
    `#!/bin/bash\necho "test-project"\n`,
    { mode: 0o755 },
  );
}

/**
 * Print cost summary after an E2E test.
 */
function logCost(label: string, result: { costEstimate: { turnsUsed: number; estimatedTokens: number; estimatedCost: number }; duration: number }) {
  const { turnsUsed, estimatedTokens, estimatedCost } = result.costEstimate;
  const durationSec = Math.round(result.duration / 1000);
  console.log(`${label}: $${estimatedCost.toFixed(2)} (${turnsUsed} turns, ${(estimatedTokens / 1000).toFixed(1)}k tokens, ${durationSec}s)`);
}

/**
 * Dump diagnostic info on planted-bug outcome failure (decision 1C).
 *
 * Writes `<label>-outcome-<timestamp>.json` under `<dir>/.gstack/test-transcripts`.
 * Any filesystem or serialization error is swallowed on purpose.
 */
function dumpOutcomeDiagnostic(dir: string, label: string, report: string, judgeResult: unknown) {
  try {
    const transcriptDir = path.join(dir, '.gstack', 'test-transcripts');
    fs.mkdirSync(transcriptDir, { recursive: true });
    // ':' and '.' are replaced so the ISO timestamp is filename-friendly.
    const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
    fs.writeFileSync(
      path.join(transcriptDir, `${label}-outcome-${timestamp}.json`),
      JSON.stringify({ label, report, judgeResult }, null, 2),
    );
  } catch { /* non-fatal */ }
}

describeE2E('Skill E2E tests', () => {
  beforeAll(() => {
    ensureTestServer();
    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-'));
    setupBrowseShims(tmpDir);
  });

  afterAll(() => {
    stopTestServer();
    removeDirQuietly(tmpDir);
  });

  test('browse basic commands work without errors', async () => {
    const baseUrl = ensureTestServer().url;
    const result = await runSkillTest({
      prompt: `You have a browse binary at ${browseBin}. Assign it to B variable and run these commands in sequence:
1. $B goto ${baseUrl}
2. $B snapshot -i
3. $B text
4. $B screenshot /tmp/skill-e2e-test.png
Report the results of each command.`,
      workingDirectory: tmpDir,
      maxTurns: 10,
      timeout: 60_000,
    });

    logCost('browse basic', result);
    recordE2E('browse basic commands', 'Skill E2E tests', result);
    expect(result.browseErrors).toHaveLength(0);
    expect(result.exitReason).toBe('success');
  }, 90_000);

  test('browse snapshot flags all work', async () => {
    const baseUrl = ensureTestServer().url;
    const result = await runSkillTest({
      prompt: `You have a browse binary at ${browseBin}. Assign it to B variable and run:
1. $B goto ${baseUrl}
2. $B snapshot -i
3. $B snapshot -c
4. $B snapshot -D
5. $B snapshot -i -a -o /tmp/skill-e2e-annotated.png
Report what each command returned.`,
      workingDirectory: tmpDir,
      maxTurns: 10,
      timeout: 60_000,
    });

    logCost('browse snapshot', result);
    recordE2E('browse snapshot flags', 'Skill E2E tests', result);
    expect(result.browseErrors).toHaveLength(0);
    expect(result.exitReason).toBe('success');
  }, 90_000);

  test('agent discovers browse binary via SKILL.md setup block', async () => {
    const setupBlock = readSetupBlock();

    // Guard: verify we extracted a valid setup block
    expect(setupBlock).toContain('browse/dist/browse');

    const baseUrl = ensureTestServer().url;
    const result = await runSkillTest({
      prompt: `Follow these instructions to find the browse binary and run a basic command.

${setupBlock}

After finding the binary, run: $B goto ${baseUrl}
Then run: $B text
Report whether it worked.`,
      workingDirectory: tmpDir,
      maxTurns: 10,
      timeout: 60_000,
    });

    recordE2E('SKILL.md setup block discovery', 'Skill E2E tests', result);
    expect(result.browseErrors).toHaveLength(0);
    expect(result.exitReason).toBe('success');
  }, 90_000);

  test('SKILL.md setup block handles missing local binary gracefully', async () => {
    // Create a tmpdir with no browse binary — no local .claude/skills/gstack/browse/dist/browse
    const emptyDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-empty-'));
    try {
      const setupBlock = readSetupBlock();

      const result = await runSkillTest({
        prompt: `Follow these instructions exactly. Run the bash code block below and report what it outputs.

${setupBlock}

Report the exact output. Do NOT try to fix or install anything — just report what you see.`,
        workingDirectory: emptyDir,
        maxTurns: 5,
        timeout: 30_000,
      });

      // Setup block should either find the global binary (READY) or show NEEDS_SETUP.
      // On dev machines with gstack installed globally, the fallback path
      // ~/.claude/skills/gstack/browse/dist/browse exists, so we get READY.
      // The important thing is it doesn't crash or give a confusing error.
      const allText = result.output ?? '';
      recordE2E('SKILL.md setup block (no local binary)', 'Skill E2E tests', result);
      expect(allText).toMatch(/READY|NEEDS_SETUP/);
      expect(result.exitReason).toBe('success');
    } finally {
      // Clean up even when an assertion above fails
      removeDirQuietly(emptyDir);
    }
  }, 60_000);

  test('SKILL.md setup block works outside git repo', async () => {
    // Create a tmpdir outside any git repo
    const nonGitDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-nogit-'));
    try {
      const setupBlock = readSetupBlock();

      const result = await runSkillTest({
        prompt: `Follow these instructions exactly. Run the bash code block below and report what it outputs.

${setupBlock}

Report the exact output — either "READY: " or "NEEDS_SETUP".`,
        workingDirectory: nonGitDir,
        maxTurns: 5,
        timeout: 30_000,
      });

      // Should either find global binary (READY) or show NEEDS_SETUP — not crash
      const allText = result.output ?? '';
      recordE2E('SKILL.md outside git repo', 'Skill E2E tests', result);
      expect(allText).toMatch(/READY|NEEDS_SETUP/);
    } finally {
      // Clean up even when an assertion above fails
      removeDirQuietly(nonGitDir);
    }
  }, 60_000);
});

// --- B4: QA skill E2E ---

describeE2E('QA skill E2E', () => {
  let qaDir: string;

  beforeAll(() => {
    ensureTestServer();
    qaDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-qa-'));
    setupBrowseShims(qaDir);

    // Copy qa skill files into tmpDir
    copyDirSync(path.join(ROOT, 'qa'), path.join(qaDir, 'qa'));

    // Create report directory
    fs.mkdirSync(path.join(qaDir, 'qa-reports'), { recursive: true });
  });

  afterAll(() => {
    stopTestServer();
    removeDirQuietly(qaDir);
  });

  test('/qa quick completes without browse errors', async () => {
    const baseUrl = ensureTestServer().url;
    const result = await runSkillTest({
      prompt: `You have a browse binary at ${browseBin}. Assign it to B variable like: B="${browseBin}"

Read the file qa/SKILL.md for the QA workflow instructions.

Run a Quick-depth QA test on ${baseUrl}/basic.html
Do NOT use AskUserQuestion — run Quick tier directly.
Write your report to ${qaDir}/qa-reports/qa-report.md`,
      workingDirectory: qaDir,
      maxTurns: 30,
      timeout: 180_000,
    });

    logCost('/qa quick', result);
    recordE2E('/qa quick', 'QA skill E2E', result);
    expect(result.browseErrors).toHaveLength(0);
    expect(result.exitReason).toBe('success');
  }, 240_000);
});

// --- B5: Review skill E2E ---

describeE2E('Review skill E2E', () => {
  let reviewDir: string;

  beforeAll(() => {
    reviewDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-review-'));

    // Pre-build a git repo with a vulnerable file on a feature branch (decision 5A)
    const run = (cmd: string, args: string[]) => {
      const proc = spawnSync(cmd, args, { cwd: reviewDir, stdio: 'pipe', timeout: 5000 });
      // Fail loudly: a half-built fixture repo would otherwise surface later
      // as a confusing agent failure.
      if (proc.error || proc.status !== 0) {
        const detail = proc.error?.message ?? proc.stderr?.toString() ?? '';
        throw new Error(`fixture setup failed: ${cmd} ${args.join(' ')}: ${detail}`);
      }
      return proc;
    };

    run('git', ['init']);
    // The prompt diffs against `main`; do not depend on the user's init.defaultBranch.
    run('git', ['symbolic-ref', 'HEAD', 'refs/heads/main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);

    // Commit a clean base on main
    fs.writeFileSync(path.join(reviewDir, 'app.rb'), '# clean base\nclass App\nend\n');
    run('git', ['add', 'app.rb']);
    run('git', ['commit', '-m', 'initial commit']);

    // Create feature branch with vulnerable code
    run('git', ['checkout', '-b', 'feature/add-user-controller']);
    const vulnContent = fs.readFileSync(path.join(ROOT, 'test', 'fixtures', 'review-eval-vuln.rb'), 'utf-8');
    fs.writeFileSync(path.join(reviewDir, 'user_controller.rb'), vulnContent);
    run('git', ['add', 'user_controller.rb']);
    run('git', ['commit', '-m', 'add user controller']);

    // Copy review skill files
    fs.copyFileSync(path.join(ROOT, 'review', 'SKILL.md'), path.join(reviewDir, 'review-SKILL.md'));
    fs.copyFileSync(path.join(ROOT, 'review', 'checklist.md'), path.join(reviewDir, 'review-checklist.md'));
    fs.copyFileSync(path.join(ROOT, 'review', 'greptile-triage.md'), path.join(reviewDir, 'review-greptile-triage.md'));
  });

  afterAll(() => {
    removeDirQuietly(reviewDir);
  });

  test('/review produces findings on SQL injection branch', async () => {
    const result = await runSkillTest({
      prompt: `You are in a git repo on a feature branch with changes against main.
Read review-SKILL.md for the review workflow instructions.
Also read review-checklist.md and apply it.
Run /review on the current diff (git diff main...HEAD).
Write your review findings to ${reviewDir}/review-output.md`,
      workingDirectory: reviewDir,
      maxTurns: 15,
      timeout: 90_000,
    });

    logCost('/review', result);
    recordE2E('/review SQL injection', 'Review skill E2E', result);
    expect(result.exitReason).toBe('success');
  }, 120_000);
});

// --- B6/B7/B8: Planted-bug outcome evals ---

// Outcome evals also need ANTHROPIC_API_KEY for the LLM judge
const hasApiKey = !!process.env.ANTHROPIC_API_KEY;
const describeOutcome = (evalsEnabled && hasApiKey) ? describe : describe.skip;

describeOutcome('Planted-bug outcome evals', () => {
  let outcomeDir: string;

  beforeAll(() => {
    ensureTestServer();
    outcomeDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-outcome-'));
    setupBrowseShims(outcomeDir);

    // Copy qa skill files
    copyDirSync(path.join(ROOT, 'qa'), path.join(outcomeDir, 'qa'));
  });

  afterAll(() => {
    stopTestServer();
    removeDirQuietly(outcomeDir);
  });

  /**
   * Shared planted-bug eval runner.
   * Runs /qa Standard on a fixture page, then scores with outcomeJudge.
   *
   * @param fixture         Fixture page path, appended to the test server URL.
   * @param groundTruthFile JSON file name under test/fixtures; read here for
   *                        `minimum_detection` and `max_false_positives`.
   * @param label           Short id used for the report dir, logs and eval entry name.
   * @throws Error when the agent produced no .md report in the report directory.
   */
  async function runPlantedBugEval(fixture: string, groundTruthFile: string, label: string) {
    const reportDir = path.join(outcomeDir, `reports-${label}`);
    fs.mkdirSync(path.join(reportDir, 'screenshots'), { recursive: true });
    const reportPath = path.join(reportDir, 'qa-report.md');
    const baseUrl = ensureTestServer().url;

    // Phase 1: runs /qa Standard
    const result = await runSkillTest({
      prompt: `You have a browse binary at ${browseBin}. Assign it to B variable like: B="${browseBin}"

Read the file qa/SKILL.md for the QA workflow instructions.

Navigate to ${baseUrl}/${fixture} and run a Standard-depth QA test.
Do NOT use AskUserQuestion — run Standard tier directly.
Write your report to ${reportPath}
Save screenshots to ${reportDir}/screenshots/

IMPORTANT — be methodical and check ALL of these:
1. Run $B console --errors to check for JavaScript errors/warnings
2. Click every link and check for 404s or broken routes
3. Fill out and submit every form — test edge cases (empty fields, invalid input)
4. Run $B snapshot -i to check interactive elements and their states
5. Check for visual issues: overflow, clipping, layout problems
6. Check accessibility: missing alt text, missing aria attributes
7. Test with different viewport sizes if relevant`,
      workingDirectory: outcomeDir,
      maxTurns: 50,
      timeout: 300_000,
    });

    logCost(`/qa ${label}`, result);

    // Phase 1 assertions: browse mechanics
    expect(result.browseErrors).toHaveLength(0);
    expect(result.exitReason).toBe('success');

    // Phase 2: Outcome evaluation via LLM judge
    const groundTruth = JSON.parse(
      fs.readFileSync(path.join(ROOT, 'test', 'fixtures', groundTruthFile), 'utf-8'),
    );

    // Read the generated report (try the expected path, then fall back to any .md in reportDir)
    let report: string;
    if (fs.existsSync(reportPath)) {
      report = fs.readFileSync(reportPath, 'utf-8');
    } else {
      // Agent may have named it differently — find any .md in reportDir
      const [firstMd] = fs.readdirSync(reportDir).filter(f => f.endsWith('.md'));
      if (firstMd === undefined) {
        dumpOutcomeDiagnostic(outcomeDir, label, '(no report file found)', { error: 'missing report' });
        throw new Error(`No report file found in ${reportDir}`);
      }
      report = fs.readFileSync(path.join(reportDir, firstMd), 'utf-8');
    }

    const judgeResult = await outcomeJudge(groundTruth, report);
    console.log(`${label} outcome:`, JSON.stringify(judgeResult, null, 2));

    // Record to eval collector with outcome judge results
    recordE2E(`/qa ${label}`, 'Planted-bug outcome evals', result, {
      detection_rate: judgeResult.detection_rate,
      false_positives: judgeResult.false_positives,
      evidence_quality: judgeResult.evidence_quality,
      detected_bugs: judgeResult.detected,
      missed_bugs: judgeResult.missed,
    });

    // Diagnostic dump on failure (decision 1C)
    if (judgeResult.detection_rate < groundTruth.minimum_detection || judgeResult.false_positives > groundTruth.max_false_positives) {
      dumpOutcomeDiagnostic(outcomeDir, label, report, judgeResult);
    }

    // Phase 2 assertions
    expect(judgeResult.detection_rate).toBeGreaterThanOrEqual(groundTruth.minimum_detection);
    expect(judgeResult.false_positives).toBeLessThanOrEqual(groundTruth.max_false_positives);
    expect(judgeResult.evidence_quality).toBeGreaterThanOrEqual(2);
  }

  // B6: Static dashboard — broken link, disabled submit, overflow, missing alt, console error
  test('/qa standard finds >= 3 of 5 planted bugs (static)', async () => {
    await runPlantedBugEval('qa-eval.html', 'qa-eval-ground-truth.json', 'b6-static');
  }, 360_000);

  // B7: SPA — broken route, stale state, async race, missing aria, console warning
  test('/qa standard finds >= 3 of 5 planted SPA bugs', async () => {
    await runPlantedBugEval('qa-eval-spa.html', 'qa-eval-spa-ground-truth.json', 'b7-spa');
  }, 360_000);

  // B8: Checkout — email regex, NaN total, CC overflow, missing required, stripe error
  test('/qa standard finds >= 3 of 5 planted checkout bugs', async () => {
    await runPlantedBugEval('qa-eval-checkout.html', 'qa-eval-checkout-ground-truth.json', 'b8-checkout');
  }, 360_000);

  // Ship E2E deferred — too complex (requires full git + test suite + VERSION + CHANGELOG)
  test.todo('/ship completes without browse errors');
});

// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
  if (evalCollector) {
    try {
      await evalCollector.finalize();
    } catch (err) {
      console.error('Failed to save eval results:', err);
    }
  }
});