From 029a7c2a37a67d007a53df4077a4c8f13b6281d1 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sat, 14 Mar 2026 11:04:40 -0500 Subject: [PATCH] feat: eval-watch dashboard + observability unit tests (15 tests, 11 codepaths) eval-watch: live terminal dashboard reads heartbeat + partial file every 1s, shows completed/running tests, stale detection (>10min), --tail flag for progress.log tail. Pure renderDashboard() function for testability. observability.test.ts: unit tests for sanitizeTestName, heartbeat schema, progress.log format, NDJSON file naming, savePartial() with _partial flag, finalize() cleanup, diagnostic fields, watcher rendering, stale detection, and non-fatal I/O guarantees. Co-Authored-By: Claude Opus 4.6 --- scripts/eval-watch.ts | 153 ++++++++++++++++ test/helpers/observability.test.ts | 282 +++++++++++++++++++++++++++++ 2 files changed, 435 insertions(+) create mode 100644 scripts/eval-watch.ts create mode 100644 test/helpers/observability.test.ts diff --git a/scripts/eval-watch.ts b/scripts/eval-watch.ts new file mode 100644 index 0000000000000000000000000000000000000000..117d2bdb22cc69502316596489eeee278ce9033d --- /dev/null +++ b/scripts/eval-watch.ts @@ -0,0 +1,153 @@ +/** + * Live E2E test watcher dashboard. + * + * Reads heartbeat (e2e-live.json) for current test status and + * partial eval results (_partial-e2e.json) for completed tests. + * Renders a terminal dashboard every 1s. + * + * Usage: bun run eval:watch [--tail] + */ + +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; + +const GSTACK_DEV_DIR = path.join(os.homedir(), '.gstack-dev'); +const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json'); +const PARTIAL_PATH = path.join(GSTACK_DEV_DIR, 'evals', '_partial-e2e.json'); +const STALE_THRESHOLD_SEC = 600; // 10 minutes + +export interface HeartbeatData { + runId: string; + startedAt: string; + currentTest: string; + status: string; + turn: number; + toolCount: number; + lastTool: string; + lastToolAt: string; + elapsedSec: number; +} + +export interface PartialData { + tests: Array<{ + name: string; + passed: boolean; + cost_usd: number; + duration_ms: number; + turns_used?: number; + exit_reason?: string; + }>; + total_cost_usd: number; + _partial?: boolean; +} + +/** Read and parse a JSON file, returning null on any error. */ +function readJSON(filePath: string): T | null { + try { + return JSON.parse(fs.readFileSync(filePath, 'utf-8')); + } catch { + return null; + } +} + +/** Format seconds as Xm Ys */ +function formatDuration(sec: number): string { + if (sec < 60) return `${sec}s`; + const m = Math.floor(sec / 60); + const s = sec % 60; + return `${m}m ${s}s`; +} + +/** Render dashboard from heartbeat + partial data. Pure function for testability. */ +export function renderDashboard(heartbeat: HeartbeatData | null, partial: PartialData | null): string { + const lines: string[] = []; + + if (!heartbeat && !partial) { + lines.push('E2E Watch — No active run detected'); + lines.push(''); + lines.push(`Heartbeat: ${HEARTBEAT_PATH} (not found)`); + lines.push(`Partial: ${PARTIAL_PATH} (not found)`); + lines.push(''); + lines.push('Start a run with: EVALS=1 bun test test/skill-e2e.test.ts'); + return lines.join('\n'); + } + + const runId = heartbeat?.runId || 'unknown'; + const elapsed = heartbeat?.elapsedSec || 0; + lines.push(`E2E Watch \u2014 Run ${runId} \u2014 ${formatDuration(elapsed)}`); + lines.push('\u2550'.repeat(55)); + + // Completed tests from partial + if (partial?.tests) { + for (const t of partial.tests) { + const icon = t.passed ? '\u2713' : '\u2717'; + const cost = `$${t.cost_usd.toFixed(2)}`; + const dur = `${Math.round(t.duration_ms / 1000)}s`; + const turns = t.turns_used !== undefined ? `${t.turns_used} turns` : ''; + const name = t.name.length > 30 ? t.name.slice(0, 27) + '...' : t.name.padEnd(30); + lines.push(` ${icon} ${name} ${cost.padStart(6)} ${dur.padStart(5)} ${turns}`); + } + } + + // Current test from heartbeat + if (heartbeat && heartbeat.status === 'running') { + const name = heartbeat.currentTest.length > 30 + ? heartbeat.currentTest.slice(0, 27) + '...' + : heartbeat.currentTest.padEnd(30); + lines.push(` \u29D6 ${name} ${formatDuration(heartbeat.elapsedSec).padStart(6)} turn ${heartbeat.turn} last: ${heartbeat.lastTool}`); + + // Stale detection + const lastToolTime = new Date(heartbeat.lastToolAt).getTime(); + const staleSec = Math.round((Date.now() - lastToolTime) / 1000); + if (staleSec > STALE_THRESHOLD_SEC) { + lines.push(` \u26A0 STALE: last tool call was ${formatDuration(staleSec)} ago \u2014 run may have crashed`); + } + } + + lines.push('\u2500'.repeat(55)); + + // Summary + const completedCount = partial?.tests?.length || 0; + const totalCost = partial?.total_cost_usd || 0; + const running = heartbeat?.status === 'running' ? 1 : 0; + lines.push(` Completed: ${completedCount} Running: ${running} Cost: $${totalCost.toFixed(2)} Elapsed: ${formatDuration(elapsed)}`); + + if (heartbeat?.runId) { + const logPath = path.join(GSTACK_DEV_DIR, 'e2e-runs', heartbeat.runId, 'progress.log'); + lines.push(` Logs: ${logPath}`); + } + + return lines.join('\n'); +} + +// --- Main --- + +if (import.meta.main) { + const showTail = process.argv.includes('--tail'); + + const render = () => { + const heartbeat = readJSON(HEARTBEAT_PATH); + const partial = readJSON(PARTIAL_PATH); + + // Clear screen + process.stdout.write('\x1B[2J\x1B[H'); + process.stdout.write(renderDashboard(heartbeat, partial) + '\n'); + + // --tail: show last 10 lines of progress.log + if (showTail && heartbeat?.runId) { + const logPath = path.join(GSTACK_DEV_DIR, 'e2e-runs', heartbeat.runId, 'progress.log'); + try { + const content = fs.readFileSync(logPath, 'utf-8'); + const tail = content.split('\n').filter(l => l.trim()).slice(-10); + process.stdout.write('\nRecent progress:\n'); + for (const line of tail) { + process.stdout.write(line + '\n'); + } + } catch { /* log file may not exist yet */ } + } + }; + + render(); + setInterval(render, 1000); +} diff --git a/test/helpers/observability.test.ts b/test/helpers/observability.test.ts new file mode 100644 index 0000000000000000000000000000000000000000..cb793b5f1ebfe0b6cd71525c4f24c043765e29bc --- /dev/null +++ b/test/helpers/observability.test.ts @@ -0,0 +1,282 @@ +/** + * Unit tests for E2E observability infrastructure. + * + * Tests heartbeat, progress.log, NDJSON persistence, savePartial(), + * finalize() cleanup, failure transcript paths, watcher rendering, + * and non-fatal I/O guarantees. + */ + +import { describe, test, expect, beforeEach, afterEach } from 'bun:test'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { sanitizeTestName } from './session-runner'; +import { EvalCollector } from './eval-store'; +import { renderDashboard } from '../../scripts/eval-watch'; +import type { HeartbeatData, PartialData } from '../../scripts/eval-watch'; + +let tmpDir: string; + +beforeEach(() => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'obs-test-')); +}); + +afterEach(() => { + try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {} +}); + +// --- Test 1: runDir created when runId set --- + +describe('session-runner observability', () => { + test('1: sanitizeTestName strips slashes and leading dashes', () => { + expect(sanitizeTestName('/plan-ceo-review')).toBe('plan-ceo-review'); + expect(sanitizeTestName('browse-basic')).toBe('browse-basic'); + expect(sanitizeTestName('/qa/deep/test')).toBe('qa-deep-test'); + expect(sanitizeTestName('///leading')).toBe('leading'); + }); + + test('2: heartbeat file path uses ~/.gstack-dev/e2e-live.json', () => { + // Just verify the constant is correct — actual write is tested by E2E + const expected = path.join(os.homedir(), '.gstack-dev', 'e2e-live.json'); + // Import the module and check HEARTBEAT_PATH exists in the file + const sessionRunnerSrc = fs.readFileSync( + path.resolve(__dirname, 'session-runner.ts'), 'utf-8' + ); + expect(sessionRunnerSrc).toContain("'e2e-live.json'"); + expect(sessionRunnerSrc).toContain('atomicWriteSync'); + }); + + test('3: heartbeat JSON schema has expected fields', () => { + // Verify the heartbeat write code includes all required fields + const src = fs.readFileSync( + path.resolve(__dirname, 'session-runner.ts'), 'utf-8' + ); + for (const field of ['runId', 'startedAt', 'currentTest', 'status', 'turn', 'toolCount', 'lastTool', 'lastToolAt', 'elapsedSec']) { + expect(src).toContain(field); + } + // Should NOT contain completedTests (removed per plan) + expect(src).not.toContain('completedTests'); + }); + + test('4: progress.log format matches expected pattern', () => { + // The progress line format is: " [Ns] turn T tool #C: Name(...)" + const src = fs.readFileSync( + path.resolve(__dirname, 'session-runner.ts'), 'utf-8' + ); + // Both stderr and progress.log use the same progressLine variable + expect(src).toContain('progressLine'); + expect(src).toContain("'progress.log'"); + expect(src).toContain('appendFileSync'); + }); + + test('5: NDJSON file uses sanitized test name', () => { + const src = fs.readFileSync( + path.resolve(__dirname, 'session-runner.ts'), 'utf-8' + ); + expect(src).toContain('safeName'); + expect(src).toContain('.ndjson'); + }); + + test('8: failure transcript goes to runDir when available', () => { + const src = fs.readFileSync( + path.resolve(__dirname, 'session-runner.ts'), 'utf-8' + ); + // Should use runDir as primary, workingDirectory as fallback + expect(src).toContain('runDir || path.join(workingDirectory'); + expect(src).toContain('-failure.json'); + }); + + test('11: all new I/O is wrapped in try/catch (non-fatal)', () => { + const src = fs.readFileSync( + path.resolve(__dirname, 'session-runner.ts'), 'utf-8' + ); + // Count non-fatal comments — should be present for each new I/O path + const nonFatalCount = (src.match(/\/\* non-fatal \*\//g) || []).length; + // Original had 2 (promptFile unlink + failure transcript), we added 4 more + // (runDir creation, progress.log, heartbeat, NDJSON append) + expect(nonFatalCount).toBeGreaterThanOrEqual(6); + }); +}); + +// --- Tests 6, 7: eval-store savePartial() and finalize() --- + +describe('eval-store observability', () => { + test('6: savePartial() writes valid JSON with _partial: true', () => { + const evalDir = path.join(tmpDir, 'evals'); + const collector = new EvalCollector('e2e', evalDir); + + collector.addTest({ + name: 'test-one', + suite: 'test', + tier: 'e2e', + passed: true, + duration_ms: 1000, + cost_usd: 0.05, + exit_reason: 'success', + }); + + const partialPath = path.join(evalDir, '_partial-e2e.json'); + expect(fs.existsSync(partialPath)).toBe(true); + + const partial = JSON.parse(fs.readFileSync(partialPath, 'utf-8')); + expect(partial._partial).toBe(true); + expect(partial.tests).toHaveLength(1); + expect(partial.tests[0].name).toBe('test-one'); + expect(partial.tests[0].exit_reason).toBe('success'); + expect(partial.schema_version).toBe(1); + expect(partial.total_tests).toBe(1); + expect(partial.passed).toBe(1); + }); + + test('6b: savePartial() accumulates multiple tests', () => { + const evalDir = path.join(tmpDir, 'evals'); + const collector = new EvalCollector('e2e', evalDir); + + collector.addTest({ + name: 'test-one', suite: 'test', tier: 'e2e', + passed: true, duration_ms: 1000, cost_usd: 0.05, + }); + collector.addTest({ + name: 'test-two', suite: 'test', tier: 'e2e', + passed: false, duration_ms: 2000, cost_usd: 0.10, + exit_reason: 'timeout', timeout_at_turn: 5, last_tool_call: 'Bash(ls)', + }); + + const partialPath = path.join(evalDir, '_partial-e2e.json'); + const partial = JSON.parse(fs.readFileSync(partialPath, 'utf-8')); + expect(partial.tests).toHaveLength(2); + expect(partial.total_tests).toBe(2); + expect(partial.passed).toBe(1); + expect(partial.failed).toBe(1); + expect(partial.tests[1].exit_reason).toBe('timeout'); + expect(partial.tests[1].timeout_at_turn).toBe(5); + expect(partial.tests[1].last_tool_call).toBe('Bash(ls)'); + }); + + test('7: finalize() deletes partial file', async () => { + const evalDir = path.join(tmpDir, 'evals'); + const collector = new EvalCollector('e2e', evalDir); + + collector.addTest({ + name: 'test-one', suite: 'test', tier: 'e2e', + passed: true, duration_ms: 1000, cost_usd: 0.05, + }); + + const partialPath = path.join(evalDir, '_partial-e2e.json'); + expect(fs.existsSync(partialPath)).toBe(true); + + await collector.finalize(); + + expect(fs.existsSync(partialPath)).toBe(false); + + // Final eval file should exist + const files = fs.readdirSync(evalDir).filter(f => f.endsWith('.json') && !f.startsWith('_')); + expect(files.length).toBeGreaterThanOrEqual(1); + }); + + test('EvalTestEntry includes diagnostic fields', () => { + const evalDir = path.join(tmpDir, 'evals'); + const collector = new EvalCollector('e2e', evalDir); + + collector.addTest({ + name: 'diagnostic-test', suite: 'test', tier: 'e2e', + passed: false, duration_ms: 5000, cost_usd: 0.20, + exit_reason: 'error_max_turns', + timeout_at_turn: undefined, + last_tool_call: 'Write(review-output.md)', + }); + + const partialPath = path.join(evalDir, '_partial-e2e.json'); + const partial = JSON.parse(fs.readFileSync(partialPath, 'utf-8')); + const t = partial.tests[0]; + expect(t.exit_reason).toBe('error_max_turns'); + expect(t.last_tool_call).toBe('Write(review-output.md)'); + }); +}); + +// --- Tests 9, 10: watcher dashboard rendering --- + +describe('eval-watch dashboard', () => { + test('9: renderDashboard shows completed tests and current test', () => { + const heartbeat: HeartbeatData = { + runId: '20260314-143022', + startedAt: '2026-03-14T14:30:22Z', + currentTest: 'plan-ceo-review', + status: 'running', + turn: 4, + toolCount: 3, + lastTool: 'Write(review-output.md)', + lastToolAt: new Date().toISOString(), // recent — not stale + elapsedSec: 285, + }; + + const partial: PartialData = { + tests: [ + { name: 'browse basic', passed: true, cost_usd: 0.07, duration_ms: 24000, turns_used: 6 }, + { name: '/review', passed: true, cost_usd: 0.17, duration_ms: 63000, turns_used: 13 }, + ], + total_cost_usd: 0.24, + _partial: true, + }; + + const output = renderDashboard(heartbeat, partial); + + // Should contain run ID + expect(output).toContain('20260314-143022'); + + // Should show completed tests + expect(output).toContain('browse basic'); + expect(output).toContain('/review'); + expect(output).toContain('$0.07'); + expect(output).toContain('$0.17'); + + // Should show current test + expect(output).toContain('plan-ceo-review'); + expect(output).toContain('turn 4'); + expect(output).toContain('Write(review-output.md)'); + + // Should NOT show stale warning (lastToolAt is recent) + expect(output).not.toContain('STALE'); + }); + + test('10: renderDashboard warns on stale heartbeat', () => { + const staleTime = new Date(Date.now() - 15 * 60 * 1000).toISOString(); // 15 min ago + + const heartbeat: HeartbeatData = { + runId: '20260314-143022', + startedAt: '2026-03-14T14:30:22Z', + currentTest: 'plan-ceo-review', + status: 'running', + turn: 4, + toolCount: 3, + lastTool: 'Write(review-output.md)', + lastToolAt: staleTime, + elapsedSec: 900, + }; + + const output = renderDashboard(heartbeat, null); + + expect(output).toContain('STALE'); + expect(output).toContain('may have crashed'); + }); + + test('renderDashboard handles no active run', () => { + const output = renderDashboard(null, null); + expect(output).toContain('No active run'); + expect(output).toContain('bun test'); + }); + + test('renderDashboard handles partial-only (heartbeat gone)', () => { + const partial: PartialData = { + tests: [ + { name: 'browse basic', passed: true, cost_usd: 0.07, duration_ms: 24000 }, + ], + total_cost_usd: 0.07, + _partial: true, + }; + + const output = renderDashboard(null, partial); + expect(output).toContain('browse basic'); + expect(output).toContain('$0.07'); + }); +});