~cytrogen/gstack

ref: 2c5ae38542fa7f1137fbc3b07b8b59e002968ff0 gstack/test/helpers/e2e-helpers.ts -rw-r--r-- 9.5 KiB
2c5ae385 — Garry Tan feat: triple-voice multi-model review in /autoplan (v0.11.12.0) (#424) a month ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
/**
 * Shared helpers for E2E test files.
 *
 * Extracted from the monolithic skill-e2e.test.ts to support splitting
 * tests across multiple files by category.
 */

import { describe, test, afterAll } from 'bun:test';
import type { SkillTestResult } from './session-runner';
import { EvalCollector, judgePassed } from './eval-store';
import type { EvalTestEntry } from './eval-store';
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES } from './touchfiles';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';

export const ROOT = path.resolve(import.meta.dir, '..', '..');

// Skip unless EVALS=1. Session runner strips CLAUDE* env vars to avoid nested session issues.
//
// BLAME PROTOCOL: When an eval fails, do NOT claim "pre-existing" or "not related
// to our changes" without proof. Run the same eval on main to verify. These tests
// have invisible couplings — preamble text, SKILL.md content, and timing all affect
// agent behavior. See CLAUDE.md "E2E eval failure blame protocol" for details.
export const evalsEnabled = !!process.env.EVALS;

// --- Diff-based test selection ---
// When EVALS_ALL is not set, only run tests whose touchfiles were modified.
// Set EVALS_ALL=1 to force all tests. Set EVALS_BASE to override base branch.
export let selectedTests: string[] | null = null; // null = run all

// EVALS_FAST: skip the 8 slowest tests (all Opus quality tests) for quick feedback
const FAST_EXCLUDED_TESTS = [
  'plan-ceo-review-selective', 'plan-ceo-review', 'retro', 'retro-base-branch',
  'design-consultation-core', 'design-consultation-existing',
  'qa-fix-loop', 'design-review-fix',
];

if (evalsEnabled && !process.env.EVALS_ALL) {
  const baseBranch = process.env.EVALS_BASE
    || detectBaseBranch(ROOT)
    || 'main';
  const changedFiles = getChangedFiles(baseBranch, ROOT);

  if (changedFiles.length > 0) {
    const selection = selectTests(changedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES);
    selectedTests = selection.selected;
    process.stderr.write(`\nE2E selection (${selection.reason}): ${selection.selected.length}/${Object.keys(E2E_TOUCHFILES).length} tests\n`);
    if (selection.skipped.length > 0) {
      process.stderr.write(`  Skipped: ${selection.skipped.join(', ')}\n`);
    }
    process.stderr.write('\n');
  }
  // If changedFiles is empty (e.g., on main branch), selectedTests stays null → run all
}

// Apply EVALS_FAST filter after diff-based selection
if (evalsEnabled && process.env.EVALS_FAST) {
  if (selectedTests === null) {
    // Run all minus excluded
    selectedTests = Object.keys(E2E_TOUCHFILES).filter(t => !FAST_EXCLUDED_TESTS.includes(t));
  } else {
    selectedTests = selectedTests.filter(t => !FAST_EXCLUDED_TESTS.includes(t));
  }
  process.stderr.write(`EVALS_FAST: excluded ${FAST_EXCLUDED_TESTS.length} slow tests, running ${selectedTests.length}\n\n`);
}

export const describeE2E = evalsEnabled ? describe : describe.skip;

/** Wrap a describe block to skip entirely if none of its tests are selected. */
export function describeIfSelected(name: string, testNames: string[], fn: () => void) {
  const anySelected = selectedTests === null || testNames.some(t => selectedTests!.includes(t));
  (anySelected ? describeE2E : describe.skip)(name, fn);
}

// Unique run ID for this E2E session — used for heartbeat + per-run log directory
export const runId = new Date().toISOString().replace(/[:.]/g, '').replace('T', '-').slice(0, 15);

export const browseBin = path.resolve(ROOT, 'browse', 'dist', 'browse');

// Check if Anthropic API key is available (needed for outcome evals)
export const hasApiKey = !!process.env.ANTHROPIC_API_KEY;

/**
 * Copy a directory tree recursively (files only, follows structure).
 */
export function copyDirSync(src: string, dest: string) {
  fs.mkdirSync(dest, { recursive: true });
  for (const entry of fs.readdirSync(src, { withFileTypes: true })) {
    const srcPath = path.join(src, entry.name);
    const destPath = path.join(dest, entry.name);
    if (entry.isDirectory()) {
      copyDirSync(srcPath, destPath);
    } else {
      fs.copyFileSync(srcPath, destPath);
    }
  }
}

/**
 * Set up browse shims (binary symlink, find-browse, remote-slug) in a tmpDir.
 */
export function setupBrowseShims(dir: string) {
  // Symlink browse binary
  const binDir = path.join(dir, 'browse', 'dist');
  fs.mkdirSync(binDir, { recursive: true });
  if (fs.existsSync(browseBin)) {
    fs.symlinkSync(browseBin, path.join(binDir, 'browse'));
  }

  // find-browse shim
  const findBrowseDir = path.join(dir, 'browse', 'bin');
  fs.mkdirSync(findBrowseDir, { recursive: true });
  fs.writeFileSync(
    path.join(findBrowseDir, 'find-browse'),
    `#!/bin/bash\necho "${browseBin}"\n`,
    { mode: 0o755 },
  );

  // remote-slug shim (returns test-project)
  fs.writeFileSync(
    path.join(findBrowseDir, 'remote-slug'),
    `#!/bin/bash\necho "test-project"\n`,
    { mode: 0o755 },
  );
}

/**
 * Print cost summary after an E2E test.
 */
export function logCost(label: string, result: { costEstimate: { turnsUsed: number; estimatedTokens: number; estimatedCost: number }; duration: number }) {
  const { turnsUsed, estimatedTokens, estimatedCost } = result.costEstimate;
  const durationSec = Math.round(result.duration / 1000);
  console.log(`${label}: $${estimatedCost.toFixed(2)} (${turnsUsed} turns, ${(estimatedTokens / 1000).toFixed(1)}k tokens, ${durationSec}s)`);
}

/**
 * Dump diagnostic info on planted-bug outcome failure (decision 1C).
 */
export function dumpOutcomeDiagnostic(dir: string, label: string, report: string, judgeResult: any) {
  try {
    const transcriptDir = path.join(dir, '.gstack', 'test-transcripts');
    fs.mkdirSync(transcriptDir, { recursive: true });
    const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
    fs.writeFileSync(
      path.join(transcriptDir, `${label}-outcome-${timestamp}.json`),
      JSON.stringify({ label, report, judgeResult }, null, 2),
    );
  } catch { /* non-fatal */ }
}

/**
 * Create an EvalCollector for a specific suite. Returns null if evals are not enabled.
 */
export function createEvalCollector(suite: string): EvalCollector | null {
  return evalsEnabled ? new EvalCollector(suite) : null;
}

/** DRY helper to record an E2E test result into the eval collector. */
export function recordE2E(
  evalCollector: EvalCollector | null,
  name: string,
  suite: string,
  result: SkillTestResult,
  extra?: Partial<EvalTestEntry>,
) {
  // Derive last tool call from transcript for machine-readable diagnostics
  const lastTool = result.toolCalls.length > 0
    ? `${result.toolCalls[result.toolCalls.length - 1].tool}(${JSON.stringify(result.toolCalls[result.toolCalls.length - 1].input).slice(0, 60)})`
    : undefined;

  evalCollector?.addTest({
    name, suite, tier: 'e2e',
    passed: result.exitReason === 'success' && result.browseErrors.length === 0,
    duration_ms: result.duration,
    cost_usd: result.costEstimate.estimatedCost,
    transcript: result.transcript,
    output: result.output?.slice(0, 2000),
    turns_used: result.costEstimate.turnsUsed,
    browse_errors: result.browseErrors,
    exit_reason: result.exitReason,
    timeout_at_turn: result.exitReason === 'timeout' ? result.costEstimate.turnsUsed : undefined,
    last_tool_call: lastTool,
    model: result.model,
    first_response_ms: result.firstResponseMs,
    max_inter_turn_ms: result.maxInterTurnMs,
    ...extra,
  });
}

/** Finalize an eval collector (write results). */
export async function finalizeEvalCollector(evalCollector: EvalCollector | null) {
  if (evalCollector) {
    try {
      await evalCollector.finalize();
    } catch (err) {
      console.error('Failed to save eval results:', err);
    }
  }
}

// Pre-seed preamble state files so E2E tests don't waste turns on lake intro + telemetry prompts.
// These are one-time interactive prompts that burn 3-7 turns per test if not pre-seeded.
if (evalsEnabled) {
  const gstackDir = path.join(os.homedir(), '.gstack');
  fs.mkdirSync(gstackDir, { recursive: true });
  for (const f of ['.completeness-intro-seen', '.telemetry-prompted']) {
    const p = path.join(gstackDir, f);
    if (!fs.existsSync(p)) fs.writeFileSync(p, '');
  }
}

// Fail fast if Anthropic API is unreachable — don't burn through tests getting ConnectionRefused
if (evalsEnabled) {
  const check = spawnSync('sh', ['-c', 'echo "ping" | claude -p --max-turns 1 --output-format stream-json --verbose --dangerously-skip-permissions'], {
    stdio: 'pipe', timeout: 30_000,
  });
  const output = check.stdout?.toString() || '';
  if (output.includes('ConnectionRefused') || output.includes('Unable to connect')) {
    throw new Error('Anthropic API unreachable — aborting E2E suite. Fix connectivity and retry.');
  }
}

/** Skip an individual test if not selected (for multi-test describe blocks). */
export function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
  const shouldRun = selectedTests === null || selectedTests.includes(testName);
  (shouldRun ? test : test.skip)(testName, fn, timeout);
}

/** Concurrent version — runs in parallel with other concurrent tests within the same describe block. */
export function testConcurrentIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
  const shouldRun = selectedTests === null || selectedTests.includes(testName);
  (shouldRun ? test.concurrent : test.skip)(testName, fn, timeout);
}

export { judgePassed } from './eval-store';
export { EvalCollector } from './eval-store';
export type { EvalTestEntry } from './eval-store';