From f9cfabeda8d6521e31134b88db7c50f47e7ae4dc Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sat, 14 Mar 2026 11:04:16 -0500 Subject: [PATCH] =?UTF-8?q?feat:=20add=20E2E=20observability=20=E2=80=94?= =?UTF-8?q?=20heartbeat,=20progress.log,=20NDJSON=20persistence,=20savePar?= =?UTF-8?q?tial()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit session-runner: atomic heartbeat file (e2e-live.json), per-run log directory (~/.gstack-dev/e2e-runs/{runId}/), progress.log + per-test NDJSON persistence, failure transcripts to persistent run dir instead of tmpdir. eval-store: 3 new diagnostic fields (exit_reason, timeout_at_turn, last_tool_call), savePartial() writes _partial-e2e.json after each addTest() for crash resilience, finalize() cleans up partial file. Co-Authored-By: Claude Opus 4.6 --- test/helpers/eval-store.ts | 44 ++++++++++++++++++ test/helpers/session-runner.ts | 81 ++++++++++++++++++++++++++++++---- 2 files changed, 116 insertions(+), 9 deletions(-) diff --git a/test/helpers/eval-store.ts b/test/helpers/eval-store.ts index 40e537eb32f76c8cd8254e561aed893a3c595a8a..e42b5ba2a8a6113e551fa80b733387a33cc50b43 100644 --- a/test/helpers/eval-store.ts +++ b/test/helpers/eval-store.ts @@ -37,6 +37,11 @@ export interface EvalTestEntry { judge_scores?: Record; judge_reasoning?: string; + // Machine-readable diagnostics + exit_reason?: string; // 'success' | 'timeout' | 'error_max_turns' | 'exit_code_N' + timeout_at_turn?: number; // which turn was active when timeout hit + last_tool_call?: string; // e.g. 
"Write(review-output.md)" + // Outcome eval detection_rate?: number; false_positives?: number; @@ -61,6 +66,7 @@ export interface EvalResult { total_cost_usd: number; total_duration_ms: number; tests: EvalTestEntry[]; + _partial?: boolean; // true for incremental saves, absent in final } export interface TestDelta { @@ -374,6 +380,41 @@ export class EvalCollector { addTest(entry: EvalTestEntry): void { this.tests.push(entry); + this.savePartial(); + } + + /** Write incremental results after each test: snapshot all tests collected so far, flagged _partial: true, to _partial-e2e.json in evalDir via a .tmp file plus rename. Non-fatal: every error is swallowed, so a failed partial save never breaks the run. */ + savePartial(): void { + try { + const git = getGitInfo(); + const version = getVersion(); + const totalCost = this.tests.reduce((s, t) => s + t.cost_usd, 0); + const totalDuration = this.tests.reduce((s, t) => s + t.duration_ms, 0); + const passed = this.tests.filter(t => t.passed).length; + + const partial: EvalResult = { + schema_version: SCHEMA_VERSION, + version, + branch: git.branch, + git_sha: git.sha, + timestamp: new Date().toISOString(), + hostname: os.hostname(), + tier: this.tier, + total_tests: this.tests.length, + passed, + failed: this.tests.length - passed, + total_cost_usd: Math.round(totalCost * 100) / 100, + total_duration_ms: totalDuration, + tests: this.tests, + _partial: true, + }; + + fs.mkdirSync(this.evalDir, { recursive: true }); + const partialPath = path.join(this.evalDir, '_partial-e2e.json'); + const tmp = partialPath + '.tmp'; + fs.writeFileSync(tmp, JSON.stringify(partial, null, 2) + '\n'); + fs.renameSync(tmp, partialPath); + } catch { /* non-fatal — partial saves are best-effort */ } } async finalize(): Promise { @@ -403,6 +444,9 @@ export class EvalCollector { tests: this.tests, }; + // Delete partial file now that we're writing the final + try { fs.unlinkSync(path.join(this.evalDir, '_partial-e2e.json')); } catch { /* may not exist */ } + // Write eval file fs.mkdirSync(this.evalDir, { recursive: true }); const dateStr = timestamp.replace(/[:.]/g, '').replace('T', '-').slice(0, 15); diff --git 
a/test/helpers/session-runner.ts b/test/helpers/session-runner.ts index b4db8e603f94787c6cbe40d8bd11814c031ecbb8..eb5628f7f21f0a35db10118b34d8999aa7f3541e 100644 --- a/test/helpers/session-runner.ts +++ b/test/helpers/session-runner.ts @@ -8,6 +8,22 @@ import * as fs from 'fs'; import * as path from 'path'; +import * as os from 'os'; + +const GSTACK_DEV_DIR = path.join(os.homedir(), '.gstack-dev'); +const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json'); + +/** Sanitize a test name for use as a filename: strip leading slashes, replace each remaining / with -. Only forward slashes are handled; every other character passes through unchanged. */ +export function sanitizeTestName(name: string): string { + return name.replace(/^\/+/, '').replace(/\//g, '-'); +} + +/** Atomic write: write data to filePath + .tmp, then rename it over filePath. Does not catch errors: a failed write or rename throws, so callers that need best-effort behavior must wrap the call in try/catch. */ +function atomicWriteSync(filePath: string, data: string): void { + const tmp = filePath + '.tmp'; + fs.writeFileSync(tmp, data); + fs.renameSync(tmp, filePath); +} export interface CostEstimate { inputChars: number; @@ -98,6 +114,8 @@ export async function runSkillTest(options: { maxTurns?: number; allowedTools?: string[]; timeout?: number; + testName?: string; + runId?: string; }): Promise { const { prompt, @@ -105,9 +123,22 @@ export async function runSkillTest(options: { maxTurns = 15, allowedTools = ['Bash', 'Read', 'Write'], timeout = 120_000, + testName, + runId, } = options; const startTime = Date.now(); + const startedAt = new Date().toISOString(); + + // Set up per-run log directory if runId is provided + let runDir: string | null = null; + const safeName = testName ? sanitizeTestName(testName) : null; + if (runId) { + try { + runDir = path.join(GSTACK_DEV_DIR, 'e2e-runs', runId); + fs.mkdirSync(runDir, { recursive: true }); + } catch { /* non-fatal */ } + } // Spawn claude -p with streaming NDJSON output. Prompt piped via stdin to // avoid shell escaping issues. --verbose is required for stream-json mode. 
@@ -161,7 +192,7 @@ export async function runSkillTest(options: { if (!line.trim()) continue; collectedLines.push(line); - // Real-time progress to stderr + // Real-time progress to stderr + persistent logs try { const event = JSON.parse(line); if (event.type === 'assistant') { @@ -171,13 +202,40 @@ export async function runSkillTest(options: { if (item.type === 'tool_use') { liveToolCount++; const elapsed = Math.round((Date.now() - startTime) / 1000); - process.stderr.write( - ` [${elapsed}s] turn ${liveTurnCount} tool #${liveToolCount}: ${item.name}(${truncate(JSON.stringify(item.input || {}), 80)})\n` - ); + const progressLine = ` [${elapsed}s] turn ${liveTurnCount} tool #${liveToolCount}: ${item.name}(${truncate(JSON.stringify(item.input || {}), 80)})\n`; + process.stderr.write(progressLine); + + // Persist progress.log + if (runDir) { + try { fs.appendFileSync(path.join(runDir, 'progress.log'), progressLine); } catch { /* non-fatal */ } + } + + // Write heartbeat (atomic) + if (runId && testName) { + try { + const toolDesc = `${item.name}(${truncate(JSON.stringify(item.input || {}), 60)})`; + atomicWriteSync(HEARTBEAT_PATH, JSON.stringify({ + runId, + startedAt, + currentTest: testName, + status: 'running', + turn: liveTurnCount, + toolCount: liveToolCount, + lastTool: toolDesc, + lastToolAt: new Date().toISOString(), + elapsedSec: elapsed, + }, null, 2) + '\n'); + } catch { /* non-fatal */ } + } } } } } catch { /* skip — parseNDJSON will handle it later */ } + + // Append raw NDJSON line to per-test transcript file + if (runDir && safeName) { + try { fs.appendFileSync(path.join(runDir, `${safeName}.ndjson`), line + '\n'); } catch { /* non-fatal */ } + } } } } catch { /* stream read error — fall through to exit code handling */ } @@ -226,19 +284,24 @@ export async function runSkillTest(options: { } } - // Save transcript on failure + // Save failure transcript to persistent run directory (or fallback to workingDirectory) if (browseErrors.length > 0 || 
exitReason !== 'success') { try { - const transcriptDir = path.join(workingDirectory, '.gstack', 'test-transcripts'); - fs.mkdirSync(transcriptDir, { recursive: true }); - const timestamp = new Date().toISOString().replace(/[:.]/g, '-'); + const failureDir = runDir || path.join(workingDirectory, '.gstack', 'test-transcripts'); + fs.mkdirSync(failureDir, { recursive: true }); + const failureName = safeName + ? `${safeName}-failure.json` + : `e2e-${new Date().toISOString().replace(/[:.]/g, '-')}.json`; fs.writeFileSync( - path.join(transcriptDir, `e2e-${timestamp}.json`), + path.join(failureDir, failureName), JSON.stringify({ prompt: prompt.slice(0, 500), + testName: testName || 'unknown', exitReason, browseErrors, duration, + turnAtTimeout: timedOut ? liveTurnCount : undefined, + lastToolCall: liveToolCount > 0 ? `tool #${liveToolCount}` : undefined, stderr: stderr.slice(0, 2000), result: resultLine ? { type: resultLine.type, subtype: resultLine.subtype, result: resultLine.result?.slice?.(0, 500) } : null, }, null, 2),