import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { runSkillTest } from './helpers/session-runner';
import type { SkillTestResult } from './helpers/session-runner';
import { outcomeJudge } from './helpers/llm-judge';
import { EvalCollector } from './helpers/eval-store';
import type { EvalTestEntry } from './helpers/eval-store';
import { startTestServer } from '../browse/test/test-server';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
const ROOT = path.resolve(import.meta.dir, '..');
// Skip unless EVALS=1. Session runner strips CLAUDE* env vars to avoid nested session issues.
const evalsEnabled = !!process.env.EVALS;
const describeE2E = evalsEnabled ? describe : describe.skip;
// Eval result collector — accumulates test results, writes to ~/.gstack-dev/evals/ on finalize
const evalCollector = evalsEnabled ? new EvalCollector('e2e') : null;
/** DRY helper to record an E2E test result into the eval collector. */
function recordE2E(name: string, suite: string, result: SkillTestResult, extra?: Partial<EvalTestEntry>) {
evalCollector?.addTest({
name, suite, tier: 'e2e',
passed: result.exitReason === 'success' && result.browseErrors.length === 0,
duration_ms: result.duration,
cost_usd: result.costEstimate.estimatedCost,
transcript: result.transcript,
output: result.output?.slice(0, 2000),
turns_used: result.costEstimate.turnsUsed,
browse_errors: result.browseErrors,
...extra,
});
}
let testServer: ReturnType<typeof startTestServer>;
let tmpDir: string;
const browseBin = path.resolve(ROOT, 'browse', 'dist', 'browse');
/**
* Copy a directory tree recursively (files only, follows structure).
*/
function copyDirSync(src: string, dest: string) {
fs.mkdirSync(dest, { recursive: true });
for (const entry of fs.readdirSync(src, { withFileTypes: true })) {
const srcPath = path.join(src, entry.name);
const destPath = path.join(dest, entry.name);
if (entry.isDirectory()) {
copyDirSync(srcPath, destPath);
} else {
fs.copyFileSync(srcPath, destPath);
}
}
}
/**
* Set up browse shims (binary symlink, find-browse, remote-slug) in a tmpDir.
*/
function setupBrowseShims(dir: string) {
// Symlink browse binary
const binDir = path.join(dir, 'browse', 'dist');
fs.mkdirSync(binDir, { recursive: true });
if (fs.existsSync(browseBin)) {
fs.symlinkSync(browseBin, path.join(binDir, 'browse'));
}
// find-browse shim
const findBrowseDir = path.join(dir, 'browse', 'bin');
fs.mkdirSync(findBrowseDir, { recursive: true });
fs.writeFileSync(
path.join(findBrowseDir, 'find-browse'),
`#!/bin/bash\necho "${browseBin}"\n`,
{ mode: 0o755 },
);
// remote-slug shim (returns test-project)
fs.writeFileSync(
path.join(findBrowseDir, 'remote-slug'),
`#!/bin/bash\necho "test-project"\n`,
{ mode: 0o755 },
);
}
/**
* Print cost summary after an E2E test.
*/
function logCost(label: string, result: { costEstimate: { turnsUsed: number; estimatedTokens: number; estimatedCost: number }; duration: number }) {
const { turnsUsed, estimatedTokens, estimatedCost } = result.costEstimate;
const durationSec = Math.round(result.duration / 1000);
console.log(`${label}: $${estimatedCost.toFixed(2)} (${turnsUsed} turns, ${(estimatedTokens / 1000).toFixed(1)}k tokens, ${durationSec}s)`);
}
/**
* Dump diagnostic info on planted-bug outcome failure (decision 1C).
*/
function dumpOutcomeDiagnostic(dir: string, label: string, report: string, judgeResult: any) {
try {
const transcriptDir = path.join(dir, '.gstack', 'test-transcripts');
fs.mkdirSync(transcriptDir, { recursive: true });
const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
fs.writeFileSync(
path.join(transcriptDir, `${label}-outcome-${timestamp}.json`),
JSON.stringify({ label, report, judgeResult }, null, 2),
);
} catch { /* non-fatal */ }
}
describeE2E('Skill E2E tests', () => {
beforeAll(() => {
testServer = startTestServer();
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-'));
setupBrowseShims(tmpDir);
});
afterAll(() => {
testServer?.server?.stop();
try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {}
});
test('browse basic commands work without errors', async () => {
const result = await runSkillTest({
prompt: `You have a browse binary at ${browseBin}. Assign it to B variable and run these commands in sequence:
1. $B goto ${testServer.url}
2. $B snapshot -i
3. $B text
4. $B screenshot /tmp/skill-e2e-test.png
Report the results of each command.`,
workingDirectory: tmpDir,
maxTurns: 10,
timeout: 60_000,
});
logCost('browse basic', result);
recordE2E('browse basic commands', 'Skill E2E tests', result);
expect(result.browseErrors).toHaveLength(0);
expect(result.exitReason).toBe('success');
}, 90_000);
test('browse snapshot flags all work', async () => {
const result = await runSkillTest({
prompt: `You have a browse binary at ${browseBin}. Assign it to B variable and run:
1. $B goto ${testServer.url}
2. $B snapshot -i
3. $B snapshot -c
4. $B snapshot -D
5. $B snapshot -i -a -o /tmp/skill-e2e-annotated.png
Report what each command returned.`,
workingDirectory: tmpDir,
maxTurns: 10,
timeout: 60_000,
});
logCost('browse snapshot', result);
recordE2E('browse snapshot flags', 'Skill E2E tests', result);
// browseErrors can include false positives from hallucinated paths (e.g. "baltimore" vs "bangalore")
if (result.browseErrors.length > 0) {
console.warn('Browse errors (non-fatal):', result.browseErrors);
}
expect(result.exitReason).toBe('success');
}, 90_000);
test('agent discovers browse binary via SKILL.md setup block', async () => {
const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
const setupStart = skillMd.indexOf('## SETUP');
const setupEnd = skillMd.indexOf('## IMPORTANT');
const setupBlock = skillMd.slice(setupStart, setupEnd);
// Guard: verify we extracted a valid setup block
expect(setupBlock).toContain('browse/dist/browse');
const result = await runSkillTest({
prompt: `Follow these instructions to find the browse binary and run a basic command.
${setupBlock}
After finding the binary, run: $B goto ${testServer.url}
Then run: $B text
Report whether it worked.`,
workingDirectory: tmpDir,
maxTurns: 10,
timeout: 60_000,
});
recordE2E('SKILL.md setup block discovery', 'Skill E2E tests', result);
expect(result.browseErrors).toHaveLength(0);
expect(result.exitReason).toBe('success');
}, 90_000);
test('SKILL.md setup block handles missing local binary gracefully', async () => {
// Create a tmpdir with no browse binary — no local .claude/skills/gstack/browse/dist/browse
const emptyDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-empty-'));
const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
const setupStart = skillMd.indexOf('## SETUP');
const setupEnd = skillMd.indexOf('## IMPORTANT');
const setupBlock = skillMd.slice(setupStart, setupEnd);
const result = await runSkillTest({
prompt: `Follow these instructions exactly. Run the bash code block below and report what it outputs.
${setupBlock}
Report the exact output. Do NOT try to fix or install anything — just report what you see.`,
workingDirectory: emptyDir,
maxTurns: 5,
timeout: 30_000,
});
// Setup block should either find the global binary (READY) or show NEEDS_SETUP.
// On dev machines with gstack installed globally, the fallback path
// ~/.claude/skills/gstack/browse/dist/browse exists, so we get READY.
// The important thing is it doesn't crash or give a confusing error.
const allText = result.output || '';
recordE2E('SKILL.md setup block (no local binary)', 'Skill E2E tests', result);
expect(allText).toMatch(/READY|NEEDS_SETUP/);
expect(result.exitReason).toBe('success');
// Clean up
try { fs.rmSync(emptyDir, { recursive: true, force: true }); } catch {}
}, 60_000);
test('SKILL.md setup block works outside git repo', async () => {
// Create a tmpdir outside any git repo
const nonGitDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-nogit-'));
const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
const setupStart = skillMd.indexOf('## SETUP');
const setupEnd = skillMd.indexOf('## IMPORTANT');
const setupBlock = skillMd.slice(setupStart, setupEnd);
const result = await runSkillTest({
prompt: `Follow these instructions exactly. Run the bash code block below and report what it outputs.
${setupBlock}
Report the exact output — either "READY: <path>" or "NEEDS_SETUP".`,
workingDirectory: nonGitDir,
maxTurns: 5,
timeout: 30_000,
});
// Should either find global binary (READY) or show NEEDS_SETUP — not crash
const allText = result.output || '';
recordE2E('SKILL.md outside git repo', 'Skill E2E tests', result);
expect(allText).toMatch(/READY|NEEDS_SETUP/);
// Clean up
try { fs.rmSync(nonGitDir, { recursive: true, force: true }); } catch {}
}, 60_000);
});
// --- B4: QA skill E2E ---
describeE2E('QA skill E2E', () => {
let qaDir: string;
beforeAll(() => {
testServer = testServer || startTestServer();
qaDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-qa-'));
setupBrowseShims(qaDir);
// Copy qa skill files into tmpDir
copyDirSync(path.join(ROOT, 'qa'), path.join(qaDir, 'qa'));
// Create report directory
fs.mkdirSync(path.join(qaDir, 'qa-reports'), { recursive: true });
});
afterAll(() => {
testServer?.server?.stop();
try { fs.rmSync(qaDir, { recursive: true, force: true }); } catch {}
});
test('/qa quick completes without browse errors', async () => {
const result = await runSkillTest({
prompt: `You have a browse binary at ${browseBin}. Assign it to B variable like: B="${browseBin}"
Read the file qa/SKILL.md for the QA workflow instructions.
Run a Quick-depth QA test on ${testServer.url}/basic.html
Do NOT use AskUserQuestion — run Quick tier directly.
Write your report to ${qaDir}/qa-reports/qa-report.md`,
workingDirectory: qaDir,
maxTurns: 30,
timeout: 180_000,
});
logCost('/qa quick', result);
recordE2E('/qa quick', 'QA skill E2E', result);
expect(result.browseErrors).toHaveLength(0);
expect(result.exitReason).toBe('success');
}, 240_000);
});
// --- B5: Review skill E2E ---
describeE2E('Review skill E2E', () => {
let reviewDir: string;
beforeAll(() => {
reviewDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-review-'));
// Pre-build a git repo with a vulnerable file on a feature branch (decision 5A)
const { spawnSync } = require('child_process');
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: reviewDir, stdio: 'pipe', timeout: 5000 });
run('git', ['init']);
run('git', ['config', 'user.email', 'test@test.com']);
run('git', ['config', 'user.name', 'Test']);
// Commit a clean base on main
fs.writeFileSync(path.join(reviewDir, 'app.rb'), '# clean base\nclass App\nend\n');
run('git', ['add', 'app.rb']);
run('git', ['commit', '-m', 'initial commit']);
// Create feature branch with vulnerable code
run('git', ['checkout', '-b', 'feature/add-user-controller']);
const vulnContent = fs.readFileSync(path.join(ROOT, 'test', 'fixtures', 'review-eval-vuln.rb'), 'utf-8');
fs.writeFileSync(path.join(reviewDir, 'user_controller.rb'), vulnContent);
run('git', ['add', 'user_controller.rb']);
run('git', ['commit', '-m', 'add user controller']);
// Copy review skill files
fs.copyFileSync(path.join(ROOT, 'review', 'SKILL.md'), path.join(reviewDir, 'review-SKILL.md'));
fs.copyFileSync(path.join(ROOT, 'review', 'checklist.md'), path.join(reviewDir, 'review-checklist.md'));
fs.copyFileSync(path.join(ROOT, 'review', 'greptile-triage.md'), path.join(reviewDir, 'review-greptile-triage.md'));
});
afterAll(() => {
try { fs.rmSync(reviewDir, { recursive: true, force: true }); } catch {}
});
test('/review produces findings on SQL injection branch', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo on a feature branch with changes against main.
Read review-SKILL.md for the review workflow instructions.
Also read review-checklist.md and apply it.
Run /review on the current diff (git diff main...HEAD).
Write your review findings to ${reviewDir}/review-output.md`,
workingDirectory: reviewDir,
maxTurns: 15,
timeout: 90_000,
});
logCost('/review', result);
recordE2E('/review SQL injection', 'Review skill E2E', result);
expect(result.exitReason).toBe('success');
}, 120_000);
});
// --- B6/B7/B8: Planted-bug outcome evals ---
// Outcome evals also need ANTHROPIC_API_KEY for the LLM judge
const hasApiKey = !!process.env.ANTHROPIC_API_KEY;
const describeOutcome = (evalsEnabled && hasApiKey) ? describe : describe.skip;
describeOutcome('Planted-bug outcome evals', () => {
let outcomeDir: string;
beforeAll(() => {
testServer = testServer || startTestServer();
outcomeDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-outcome-'));
setupBrowseShims(outcomeDir);
// Copy qa skill files
copyDirSync(path.join(ROOT, 'qa'), path.join(outcomeDir, 'qa'));
});
afterAll(() => {
testServer?.server?.stop();
try { fs.rmSync(outcomeDir, { recursive: true, force: true }); } catch {}
});
/**
* Shared planted-bug eval runner.
* Gives the agent concise bug-finding instructions (not the full QA workflow),
* then scores the report with an LLM outcome judge.
*/
async function runPlantedBugEval(fixture: string, groundTruthFile: string, label: string) {
const reportDir = path.join(outcomeDir, `reports-${label}`);
fs.mkdirSync(path.join(reportDir, 'screenshots'), { recursive: true });
const reportPath = path.join(reportDir, 'qa-report.md');
// Phase 1: Direct bug-finding with browse. Keep prompt concise — the agent
// only has 25 turns so every turn must count (no reading long SKILL.md docs).
const result = await runSkillTest({
prompt: `You have a headless browser binary. Run these commands to find bugs on a web page.
B="${browseBin}"
Step 1 — Navigate and inspect:
$B goto ${testServer.url}/${fixture}
$B console --errors
$B snapshot -i
Step 2 — Test interactively (click links, fill forms, check states):
- Click every navigation link, check for broken routes/404s
- Fill and submit every form with valid AND invalid data (empty, bad email, etc.)
- Check $B console --errors after each action
Step 3 — Check visual/accessibility:
$B snapshot -c
$B accessibility
Step 4 — Write your findings to ${reportPath}
List every bug found with:
- Category: functional / visual / accessibility / console
- Severity: high / medium / low
- Description with evidence (what you saw, what command showed it)
Be thorough but efficient. Check console errors, test every link, test every form, check accessibility.`,
workingDirectory: outcomeDir,
maxTurns: 25,
timeout: 180_000,
});
logCost(`/qa ${label}`, result);
// Phase 1: browse mechanics. Accept error_max_turns — agent may have written
// a partial report before running out of turns. What matters is detection rate.
if (result.browseErrors.length > 0) {
console.warn(`${label} browse errors:`, result.browseErrors);
}
if (result.exitReason !== 'success' && result.exitReason !== 'error_max_turns') {
throw new Error(`${label}: unexpected exit reason: ${result.exitReason}`);
}
// Phase 2: Outcome evaluation via LLM judge
const groundTruth = JSON.parse(
fs.readFileSync(path.join(ROOT, 'test', 'fixtures', groundTruthFile), 'utf-8'),
);
// Read the generated report (try expected path, then glob for any .md in reportDir or outcomeDir)
let report: string | null = null;
if (fs.existsSync(reportPath)) {
report = fs.readFileSync(reportPath, 'utf-8');
} else {
// Agent may have named it differently — find any .md in reportDir
try {
const mdFiles = fs.readdirSync(reportDir).filter(f => f.endsWith('.md'));
if (mdFiles.length > 0) {
report = fs.readFileSync(path.join(reportDir, mdFiles[0]), 'utf-8');
}
} catch { /* reportDir may not exist if agent hit max_turns early */ }
// Also check the agent's final output for inline report content
if (!report && result.output && result.output.length > 100) {
report = result.output;
}
}
if (!report) {
dumpOutcomeDiagnostic(outcomeDir, label, '(no report file found)', { error: 'missing report' });
recordE2E(`/qa ${label}`, 'Planted-bug outcome evals', result, { error: 'no report generated' });
throw new Error(`No report file found in ${reportDir}`);
}
const judgeResult = await outcomeJudge(groundTruth, report);
console.log(`${label} outcome:`, JSON.stringify(judgeResult, null, 2));
// Record to eval collector with outcome judge results
recordE2E(`/qa ${label}`, 'Planted-bug outcome evals', result, {
detection_rate: judgeResult.detection_rate,
false_positives: judgeResult.false_positives,
evidence_quality: judgeResult.evidence_quality,
detected_bugs: judgeResult.detected,
missed_bugs: judgeResult.missed,
});
// Diagnostic dump on failure (decision 1C)
if (judgeResult.detection_rate < groundTruth.minimum_detection || judgeResult.false_positives > groundTruth.max_false_positives) {
dumpOutcomeDiagnostic(outcomeDir, label, report, judgeResult);
}
// Phase 2 assertions
expect(judgeResult.detection_rate).toBeGreaterThanOrEqual(groundTruth.minimum_detection);
expect(judgeResult.false_positives).toBeLessThanOrEqual(groundTruth.max_false_positives);
expect(judgeResult.evidence_quality).toBeGreaterThanOrEqual(2);
}
// B6: Static dashboard — broken link, disabled submit, overflow, missing alt, console error
test('/qa finds >= 2 of 5 planted bugs (static)', async () => {
await runPlantedBugEval('qa-eval.html', 'qa-eval-ground-truth.json', 'b6-static');
}, 240_000);
// B7: SPA — broken route, stale state, async race, missing aria, console warning
test('/qa finds >= 2 of 5 planted SPA bugs', async () => {
await runPlantedBugEval('qa-eval-spa.html', 'qa-eval-spa-ground-truth.json', 'b7-spa');
}, 240_000);
// B8: Checkout — email regex, NaN total, CC overflow, missing required, stripe error
test('/qa finds >= 2 of 5 planted checkout bugs', async () => {
await runPlantedBugEval('qa-eval-checkout.html', 'qa-eval-checkout-ground-truth.json', 'b8-checkout');
}, 240_000);
// Ship E2E deferred — too complex (requires full git + test suite + VERSION + CHANGELOG)
test.todo('/ship completes without browse errors');
});
// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
if (evalCollector) {
try {
await evalCollector.finalize();
} catch (err) {
console.error('Failed to save eval results:', err);
}
}
});