From f9cfabeda8d6521e31134b88db7c50f47e7ae4dc Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Sat, 14 Mar 2026 11:04:16 -0500
Subject: [PATCH] =?UTF-8?q?feat:=20add=20E2E=20observability=20=E2=80=94?=
 =?UTF-8?q?=20heartbeat,=20progress.log,=20NDJSON=20persistence,=20savePar?=
 =?UTF-8?q?tial()?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

session-runner: atomic heartbeat file (e2e-live.json), per-run log directory
(~/.gstack-dev/e2e-runs/{runId}/), progress.log + per-test NDJSON persistence,
failure transcripts to persistent run dir instead of tmpdir.

eval-store: 3 new diagnostic fields (exit_reason, timeout_at_turn, last_tool_call),
savePartial() writes _partial-e2e.json after each addTest() for crash resilience,
finalize() cleans up partial file.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 test/helpers/eval-store.ts     | 44 ++++++++++++++++++
 test/helpers/session-runner.ts | 81 ++++++++++++++++++++++++++++++----
 2 files changed, 116 insertions(+), 9 deletions(-)
diff --git a/test/helpers/eval-store.ts b/test/helpers/eval-store.ts
index 40e537eb32f76c8cd8254e561aed893a3c595a8a..e42b5ba2a8a6113e551fa80b733387a33cc50b43 100644
--- a/test/helpers/eval-store.ts
+++ b/test/helpers/eval-store.ts
@@ -37,6 +37,11 @@ export interface EvalTestEntry {
   judge_scores?: Record<string, number>;
   judge_reasoning?: string;
 
+  // Machine-readable diagnostics
+  exit_reason?: string;       // 'success' | 'timeout' | 'error_max_turns' | 'exit_code_N'
+  timeout_at_turn?: number;   // which turn was active when timeout hit
+  last_tool_call?: string;    // e.g. "Write(review-output.md)"
+
   // Outcome eval
   detection_rate?: number;
   false_positives?: number;
@@ -61,6 +66,7 @@ export interface EvalResult {
   total_cost_usd: number;
   total_duration_ms: number;
   tests: EvalTestEntry[];
+  _partial?: boolean;  // true for incremental saves, absent in final
 }
 
 export interface TestDelta {
@@ -374,6 +380,41 @@ export class EvalCollector {
 
   addTest(entry: EvalTestEntry): void {
     this.tests.push(entry);
+    this.savePartial();
+  }
+
+  /** Write incremental results after each test. Atomic write, non-fatal. */
+  savePartial(): void {
+    try {
+      const git = getGitInfo();
+      const version = getVersion();
+      const totalCost = this.tests.reduce((s, t) => s + t.cost_usd, 0);
+      const totalDuration = this.tests.reduce((s, t) => s + t.duration_ms, 0);
+      const passed = this.tests.filter(t => t.passed).length;
+
+      const partial: EvalResult = {
+        schema_version: SCHEMA_VERSION,
+        version,
+        branch: git.branch,
+        git_sha: git.sha,
+        timestamp: new Date().toISOString(),
+        hostname: os.hostname(),
+        tier: this.tier,
+        total_tests: this.tests.length,
+        passed,
+        failed: this.tests.length - passed,
+        total_cost_usd: Math.round(totalCost * 100) / 100,
+        total_duration_ms: totalDuration,
+        tests: this.tests,
+        _partial: true,
+      };
+
+      fs.mkdirSync(this.evalDir, { recursive: true });
+      const partialPath = path.join(this.evalDir, '_partial-e2e.json');
+      const tmp = partialPath + '.tmp';
+      fs.writeFileSync(tmp, JSON.stringify(partial, null, 2) + '\n');
+      fs.renameSync(tmp, partialPath);
+    } catch { /* non-fatal — partial saves are best-effort */ }
   }
 
   async finalize(): Promise<string> {
@@ -403,6 +444,9 @@ export class EvalCollector {
       tests: this.tests,
     };
 
+    // Delete partial file now that we're writing the final
+    try { fs.unlinkSync(path.join(this.evalDir, '_partial-e2e.json')); } catch { /* may not exist */ }
+
     // Write eval file
     fs.mkdirSync(this.evalDir, { recursive: true });
     const dateStr = timestamp.replace(/[:.]/g, '').replace('T', '-').slice(0, 15);
diff --git a/test/helpers/session-runner.ts b/test/helpers/session-runner.ts
index b4db8e603f94787c6cbe40d8bd11814c031ecbb8..eb5628f7f21f0a35db10118b34d8999aa7f3541e 100644
--- a/test/helpers/session-runner.ts
+++ b/test/helpers/session-runner.ts
@@ -8,6 +8,22 @@
 
 import * as fs from 'fs';
 import * as path from 'path';
+import * as os from 'os';
+
+const GSTACK_DEV_DIR = path.join(os.homedir(), '.gstack-dev');
+const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json');
+
+/** Sanitize test name for use as filename: strip leading slashes, replace / with - */
+export function sanitizeTestName(name: string): string {
+  return name.replace(/^\/+/, '').replace(/\//g, '-');
+}
+
+/** Atomic write: write to .tmp then rename. Throws on I/O failure — callers wanting best-effort behavior must catch. */
+function atomicWriteSync(filePath: string, data: string): void {
+  const tmp = filePath + '.tmp';
+  fs.writeFileSync(tmp, data);
+  fs.renameSync(tmp, filePath);
+}
 
 export interface CostEstimate {
   inputChars: number;
@@ -98,6 +114,8 @@ export async function runSkillTest(options: {
   maxTurns?: number;
   allowedTools?: string[];
   timeout?: number;
+  testName?: string;
+  runId?: string;
 }): Promise<SkillTestResult> {
   const {
     prompt,
@@ -105,9 +123,22 @@ export async function runSkillTest(options: {
     maxTurns = 15,
     allowedTools = ['Bash', 'Read', 'Write'],
     timeout = 120_000,
+    testName,
+    runId,
   } = options;
 
   const startTime = Date.now();
+  const startedAt = new Date().toISOString();
+
+  // Set up per-run log directory if runId is provided
+  let runDir: string | null = null;
+  const safeName = testName ? sanitizeTestName(testName) : null;
+  if (runId) {
+    try {
+      runDir = path.join(GSTACK_DEV_DIR, 'e2e-runs', runId);
+      fs.mkdirSync(runDir, { recursive: true });
+    } catch { /* non-fatal */ }
+  }
 
   // Spawn claude -p with streaming NDJSON output. Prompt piped via stdin to
   // avoid shell escaping issues. --verbose is required for stream-json mode.
@@ -161,7 +192,7 @@ export async function runSkillTest(options: {
         if (!line.trim()) continue;
         collectedLines.push(line);
 
-        // Real-time progress to stderr
+        // Real-time progress to stderr + persistent logs
         try {
           const event = JSON.parse(line);
           if (event.type === 'assistant') {
@@ -171,13 +202,40 @@ export async function runSkillTest(options: {
               if (item.type === 'tool_use') {
                 liveToolCount++;
                 const elapsed = Math.round((Date.now() - startTime) / 1000);
-                process.stderr.write(
-                  `  [${elapsed}s] turn ${liveTurnCount} tool #${liveToolCount}: ${item.name}(${truncate(JSON.stringify(item.input || {}), 80)})\n`
-                );
+                const progressLine = `  [${elapsed}s] turn ${liveTurnCount} tool #${liveToolCount}: ${item.name}(${truncate(JSON.stringify(item.input || {}), 80)})\n`;
+                process.stderr.write(progressLine);
+
+                // Persist progress.log
+                if (runDir) {
+                  try { fs.appendFileSync(path.join(runDir, 'progress.log'), progressLine); } catch { /* non-fatal */ }
+                }
+
+                // Write heartbeat (atomic)
+                if (runId && testName) {
+                  try {
+                    const toolDesc = `${item.name}(${truncate(JSON.stringify(item.input || {}), 60)})`;
+                    atomicWriteSync(HEARTBEAT_PATH, JSON.stringify({
+                      runId,
+                      startedAt,
+                      currentTest: testName,
+                      status: 'running',
+                      turn: liveTurnCount,
+                      toolCount: liveToolCount,
+                      lastTool: toolDesc,
+                      lastToolAt: new Date().toISOString(),
+                      elapsedSec: elapsed,
+                    }, null, 2) + '\n');
+                  } catch { /* non-fatal */ }
+                }
               }
             }
           }
         } catch { /* skip — parseNDJSON will handle it later */ }
+
+        // Append raw NDJSON line to per-test transcript file
+        if (runDir && safeName) {
+          try { fs.appendFileSync(path.join(runDir, `${safeName}.ndjson`), line + '\n'); } catch { /* non-fatal */ }
+        }
       }
     }
   } catch { /* stream read error — fall through to exit code handling */ }
@@ -226,19 +284,24 @@ export async function runSkillTest(options: {
     }
   }
 
-  // Save transcript on failure
+  // Save failure transcript to the persistent run directory (or fall back to <workingDirectory>/.gstack/test-transcripts)
   if (browseErrors.length > 0 || exitReason !== 'success') {
     try {
-      const transcriptDir = path.join(workingDirectory, '.gstack', 'test-transcripts');
-      fs.mkdirSync(transcriptDir, { recursive: true });
-      const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
+      const failureDir = runDir || path.join(workingDirectory, '.gstack', 'test-transcripts');
+      fs.mkdirSync(failureDir, { recursive: true });
+      const failureName = safeName
+        ? `${safeName}-failure.json`
+        : `e2e-${new Date().toISOString().replace(/[:.]/g, '-')}.json`;
       fs.writeFileSync(
-        path.join(transcriptDir, `e2e-${timestamp}.json`),
+        path.join(failureDir, failureName),
         JSON.stringify({
           prompt: prompt.slice(0, 500),
+          testName: testName || 'unknown',
           exitReason,
           browseErrors,
           duration,
+          turnAtTimeout: timedOut ? liveTurnCount : undefined,
+          lastToolCall: liveToolCount > 0 ? `tool #${liveToolCount}` : undefined,
           stderr: stderr.slice(0, 2000),
           result: resultLine ? { type: resultLine.type, subtype: resultLine.subtype, result: resultLine.result?.slice?.(0, 500) } : null,
         }, null, 2),