From e7347c2f8fa8bce1f067d5bca66f882f2a800cc3 Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Sat, 14 Mar 2026 03:49:36 -0500
Subject: [PATCH] feat: stream-json NDJSON parser for real-time E2E progress

Switch session-runner from buffered `--output-format json` to streaming
`--output-format stream-json --verbose`. Parses NDJSON line-by-line for
real-time tool-by-tool progress on stderr during 3-5 min E2E runs.

- Extract testable `parseNDJSON()` function (pure, no I/O)
- Count turns per assistant event (not per text block)
- Add `transcript: any[]` to SkillTestResult, remove dead `messages` field
- Reconstruct allText from transcript for browse error scanning
- 8 unit tests for parser (malformed lines, empty input, turn counting)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 test/helpers/session-runner.test.ts |  96 ++++++++++++++
 test/helpers/session-runner.ts      | 199 +++++++++++++++++++---------
 2 files changed, 236 insertions(+), 59 deletions(-)
 create mode 100644 test/helpers/session-runner.test.ts

diff --git a/test/helpers/session-runner.test.ts b/test/helpers/session-runner.test.ts
new file mode 100644
index 0000000000000000000000000000000000000000..812d4f8adf2180c425161b72a18251346420605d
--- /dev/null
+++ b/test/helpers/session-runner.test.ts
@@ -0,0 +1,96 @@
+import { describe, test, expect } from 'bun:test';
+import { parseNDJSON } from './session-runner';
+
+// Fixture: minimal NDJSON session (system init, assistant with tool_use, tool result, assistant text, result)
+const FIXTURE_LINES = [
+  '{"type":"system","subtype":"init","session_id":"test-123"}',
+  '{"type":"assistant","message":{"content":[{"type":"tool_use","id":"tu1","name":"Bash","input":{"command":"echo hello"}}]}}',
+  '{"type":"user","tool_use_result":{"tool_use_id":"tu1","stdout":"hello\\n","stderr":""}}',
+  '{"type":"assistant","message":{"content":[{"type":"text","text":"The command printed hello."}]}}',
+  '{"type":"assistant","message":{"content":[{"type":"text","text":"Let me also read a file."},{"type":"tool_use","id":"tu2","name":"Read","input":{"file_path":"/tmp/test"}}]}}',
+  '{"type":"result","subtype":"success","total_cost_usd":0.05,"num_turns":3,"usage":{"input_tokens":100,"output_tokens":50},"result":"Done."}',
+];
+
+describe('parseNDJSON', () => {
+  test('parses valid NDJSON with system + assistant + result events', () => {
+    const parsed = parseNDJSON(FIXTURE_LINES);
+    expect(parsed.transcript).toHaveLength(6);
+    expect(parsed.transcript[0].type).toBe('system');
+    expect(parsed.transcript[5].type).toBe('result');
+  });
+
+  test('extracts tool calls from assistant.message.content[].type === tool_use', () => {
+    const parsed = parseNDJSON(FIXTURE_LINES);
+    expect(parsed.toolCalls).toHaveLength(2);
+    expect(parsed.toolCalls[0]).toEqual({
+      tool: 'Bash',
+      input: { command: 'echo hello' },
+      output: '',
+    });
+    expect(parsed.toolCalls[1]).toEqual({
+      tool: 'Read',
+      input: { file_path: '/tmp/test' },
+      output: '',
+    });
+    expect(parsed.toolCallCount).toBe(2);
+  });
+
+  test('skips malformed lines without throwing', () => {
+    const lines = [
+      '{"type":"system"}',
+      'this is not json',
+      '{"type":"assistant","message":{"content":[{"type":"text","text":"ok"}]}}',
+      '{incomplete json',
+      '{"type":"result","subtype":"success","result":"done"}',
+    ];
+    const parsed = parseNDJSON(lines);
+    expect(parsed.transcript).toHaveLength(3); // system, assistant, result
+    expect(parsed.resultLine?.subtype).toBe('success');
+  });
+
+  test('skips empty and whitespace-only lines', () => {
+    const lines = [
+      '',
+      '  ',
+      '{"type":"system"}',
+      '\t',
+      '{"type":"result","subtype":"success","result":"ok"}',
+    ];
+    const parsed = parseNDJSON(lines);
+    expect(parsed.transcript).toHaveLength(2);
+  });
+
+  test('extracts resultLine from type: "result" event', () => {
+    const parsed = parseNDJSON(FIXTURE_LINES);
+    expect(parsed.resultLine).not.toBeNull();
+    expect(parsed.resultLine.subtype).toBe('success');
+    expect(parsed.resultLine.total_cost_usd).toBe(0.05);
+    expect(parsed.resultLine.num_turns).toBe(3);
+    expect(parsed.resultLine.result).toBe('Done.');
+  });
+
+  test('counts turns correctly — one per assistant event, not per text block', () => {
+    const parsed = parseNDJSON(FIXTURE_LINES);
+    // 3 assistant events in fixture (tool_use, text, text+tool_use)
+    expect(parsed.turnCount).toBe(3);
+  });
+
+  test('handles empty input', () => {
+    const parsed = parseNDJSON([]);
+    expect(parsed.transcript).toHaveLength(0);
+    expect(parsed.resultLine).toBeNull();
+    expect(parsed.turnCount).toBe(0);
+    expect(parsed.toolCallCount).toBe(0);
+    expect(parsed.toolCalls).toHaveLength(0);
+  });
+
+  test('handles assistant event with no content array', () => {
+    const lines = [
+      '{"type":"assistant","message":{}}',
+      '{"type":"assistant"}',
+    ];
+    const parsed = parseNDJSON(lines);
+    expect(parsed.turnCount).toBe(2);
+    expect(parsed.toolCalls).toHaveLength(0);
+  });
+});
diff --git a/test/helpers/session-runner.ts b/test/helpers/session-runner.ts
index e33f2c7e60754da8281b1d6e4d1ccb4414a57267..9e7f5cc86f4799fd4e1eb04b5ac04e1daae701d2 100644
--- a/test/helpers/session-runner.ts
+++ b/test/helpers/session-runner.ts
@@ -2,8 +2,8 @@
  * Claude CLI subprocess runner for skill E2E testing.
  *
  * Spawns `claude -p` as a completely independent process (not via Agent SDK),
- * so it works inside Claude Code sessions. Pipes prompt via stdin, collects
- * JSON output, scans for browse errors.
+ * so it works inside Claude Code sessions. Pipes prompt via stdin, streams
+ * NDJSON output for real-time progress, scans for browse errors.
  */
 
 import * as fs from 'fs';
@@ -18,13 +18,13 @@ export interface CostEstimate {
 }
 
 export interface SkillTestResult {
-  messages: any[];
   toolCalls: Array<{ tool: string; input: any; output: string }>;
   browseErrors: string[];
   exitReason: string;
   duration: number;
   output: string;
   costEstimate: CostEstimate;
+  transcript: any[];
 }
 
 const BROWSE_ERROR_PATTERNS = [
@@ -36,6 +36,63 @@ const BROWSE_ERROR_PATTERNS = [
   /no such file or directory.*browse/i,
 ];
 
+// --- Testable NDJSON parser ---
+
+export interface ParsedNDJSON {
+  transcript: any[];
+  resultLine: any | null;
+  turnCount: number;
+  toolCallCount: number;
+  toolCalls: Array<{ tool: string; input: any; output: string }>;
+}
+
+/**
+ * Parse an array of NDJSON lines into structured transcript data.
+ * Pure function — no I/O, no side effects. Used by both the streaming
+ * reader and unit tests.
+ */
+export function parseNDJSON(lines: string[]): ParsedNDJSON {
+  const transcript: any[] = [];
+  let resultLine: any = null;
+  let turnCount = 0;
+  let toolCallCount = 0;
+  const toolCalls: ParsedNDJSON['toolCalls'] = [];
+
+  for (const line of lines) {
+    if (!line.trim()) continue;
+    try {
+      const event = JSON.parse(line);
+      transcript.push(event);
+
+      // Track turns and tool calls from assistant events
+      if (event.type === 'assistant') {
+        turnCount++;
+        const content = event.message?.content || [];
+        for (const item of content) {
+          if (item.type === 'tool_use') {
+            toolCallCount++;
+            toolCalls.push({
+              tool: item.name || 'unknown',
+              input: item.input || {},
+              output: '',
+            });
+          }
+        }
+      }
+
+      if (event.type === 'result') resultLine = event;
+    } catch { /* skip malformed lines */ }
+  }
+
+  return { transcript, resultLine, turnCount, toolCallCount, toolCalls };
+}
+
+function truncate(s: string, max: number): string {
+  return s.length > max ? s.slice(0, max) + '…' : s;
+}
+
+// --- Main runner ---
+
 export async function runSkillTest(options: {
   prompt: string;
   workingDirectory: string;
@@ -53,12 +110,12 @@ export async function runSkillTest(options: {
 
   const startTime = Date.now();
 
-  // Spawn claude -p with JSON output. Prompt piped via stdin to avoid
-  // shell escaping issues. Env is passed through (child claude strips
-  // its own parent-detection vars internally).
+  // Spawn claude -p with streaming NDJSON output. Prompt piped via stdin to
+  // avoid shell escaping issues. --verbose is required for stream-json mode.
   const args = [
     '-p',
-    '--output-format', 'json',
+    '--output-format', 'stream-json',
+    '--verbose',
     '--dangerously-skip-permissions',
     '--max-turns', String(maxTurns),
     '--allowed-tools', ...allowedTools,
@@ -75,7 +132,6 @@ export async function runSkillTest(options: {
   });
 
   // Race against timeout
-  let stdout = '';
   let stderr = '';
   let exitReason = 'unknown';
   let timedOut = false;
@@ -85,50 +141,76 @@ export async function runSkillTest(options: {
     proc.kill();
   }, timeout);
 
+  // Stream NDJSON from stdout for real-time progress
+  const collectedLines: string[] = [];
+  let liveTurnCount = 0;
+  let liveToolCount = 0;
+  const stderrPromise = new Response(proc.stderr).text();
+
+  const reader = proc.stdout.getReader();
+  const decoder = new TextDecoder();
+  let buf = '';
+
   try {
-    const [outBuf, errBuf] = await Promise.all([
-      new Response(proc.stdout).text(),
-      new Response(proc.stderr).text(),
-    ]);
-    stdout = outBuf;
-    stderr = errBuf;
-
-    const exitCode = await proc.exited;
-    clearTimeout(timeoutId);
-
-    if (timedOut) {
-      exitReason = 'timeout';
-    } else if (exitCode === 0) {
-      exitReason = 'success';
-    } else {
-      exitReason = `exit_code_${exitCode}`;
+    while (true) {
+      const { done, value } = await reader.read();
+      if (done) break;
+      buf += decoder.decode(value, { stream: true });
+      const lines = buf.split('\n');
+      buf = lines.pop() || '';
+      for (const line of lines) {
+        if (!line.trim()) continue;
+        collectedLines.push(line);
+
+        // Real-time progress to stderr
+        try {
+          const event = JSON.parse(line);
+          if (event.type === 'assistant') {
+            liveTurnCount++;
+            const content = event.message?.content || [];
+            for (const item of content) {
+              if (item.type === 'tool_use') {
+                liveToolCount++;
+                const elapsed = Math.round((Date.now() - startTime) / 1000);
+                process.stderr.write(
+                  `  [${elapsed}s] turn ${liveTurnCount} tool #${liveToolCount}: ${item.name}(${truncate(JSON.stringify(item.input || {}), 80)})\n`
+                );
+              }
+            }
+          }
+        } catch { /* skip — parseNDJSON will handle it later */ }
+      }
     }
-  } catch (err: any) {
-    clearTimeout(timeoutId);
-    exitReason = timedOut ? 'timeout' : `error: ${err.message}`;
-  } finally {
-    try { fs.unlinkSync(promptFile); } catch { /* non-fatal */ }
+  } catch { /* stream read error — fall through to exit code handling */ }
+
+  // Flush remaining buffer
+  if (buf.trim()) {
+    collectedLines.push(buf);
+  }
+
+  stderr = await stderrPromise;
+  const exitCode = await proc.exited;
+  clearTimeout(timeoutId);
+
+  try { fs.unlinkSync(promptFile); } catch { /* non-fatal */ }
+
+  if (timedOut) {
+    exitReason = 'timeout';
+  } else if (exitCode === 0) {
+    exitReason = 'success';
+  } else {
+    exitReason = `exit_code_${exitCode}`;
   }
 
   const duration = Date.now() - startTime;
 
-  // Parse JSON output
-  let messages: any[] = [];
-  let toolCalls: SkillTestResult['toolCalls'] = [];
+  // Parse all collected NDJSON lines
+  const parsed = parseNDJSON(collectedLines);
+  const { transcript, resultLine, toolCalls } = parsed;
   const browseErrors: string[] = [];
-  let result: any = null;
-
-  try {
-    // stdout may have stderr warnings prefixed (e.g., "[WARN] Fast mode...")
-    // Find the JSON object in the output
-    const jsonStart = stdout.indexOf('{');
-    if (jsonStart >= 0) {
-      result = JSON.parse(stdout.slice(jsonStart));
-    }
-  } catch { /* non-JSON output */ }
 
-  // Scan all output for browse errors
-  const allText = stdout + '\n' + stderr;
+  // Scan transcript + stderr for browse errors
+  const allText = transcript.map(e => JSON.stringify(e)).join('\n') + '\n' + stderr;
   for (const pattern of BROWSE_ERROR_PATTERNS) {
     const match = allText.match(pattern);
     if (match) {
@@ -136,13 +218,12 @@ export async function runSkillTest(options: {
     }
   }
 
-  // If JSON parsed, use the structured result
-  if (result) {
-    // Check result type for success
-    if (result.type === 'result' && result.subtype === 'success') {
+  // Use resultLine for structured result data
+  if (resultLine) {
+    if (resultLine.subtype === 'success') {
       exitReason = 'success';
-    } else if (result.type === 'result' && result.subtype) {
-      exitReason = result.subtype;
+    } else if (resultLine.subtype) {
+      exitReason = resultLine.subtype;
     }
   }
 
@@ -160,20 +241,20 @@ export async function runSkillTest(options: {
           browseErrors,
           duration,
           stderr: stderr.slice(0, 2000),
-          result: result ? { type: result.type, subtype: result.subtype, result: result.result?.slice?.(0, 500) } : null,
+          result: resultLine ? { type: resultLine.type, subtype: resultLine.subtype, result: resultLine.result?.slice?.(0, 500) } : null,
         }, null, 2),
       );
     } catch { /* non-fatal */ }
   }
 
-  // Cost from JSON result (exact) or estimate from chars
-  const turnsUsed = result?.num_turns || 0;
-  const estimatedCost = result?.total_cost_usd || 0;
+  // Cost from result line (exact) or estimate from chars
+  const turnsUsed = resultLine?.num_turns || 0;
+  const estimatedCost = resultLine?.total_cost_usd || 0;
   const inputChars = prompt.length;
-  const outputChars = (result?.result || stdout).length;
-  const estimatedTokens = (result?.usage?.input_tokens || 0)
-    + (result?.usage?.output_tokens || 0)
-    + (result?.usage?.cache_read_input_tokens || 0);
+  const outputChars = (resultLine?.result || '').length;
+  const estimatedTokens = (resultLine?.usage?.input_tokens || 0)
+    + (resultLine?.usage?.output_tokens || 0)
+    + (resultLine?.usage?.cache_read_input_tokens || 0);
 
   const costEstimate: CostEstimate = {
     inputChars,
@@ -183,5 +264,5 @@ export async function runSkillTest(options: {
     turnsUsed,
   };
 
-  return { messages, toolCalls, browseErrors, exitReason, duration, output: result?.result || stdout, costEstimate };
+  return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript };
 }