From e7347c2f8fa8bce1f067d5bca66f882f2a800cc3 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sat, 14 Mar 2026 03:49:36 -0500 Subject: [PATCH] feat: stream-json NDJSON parser for real-time E2E progress Switch session-runner from buffered `--output-format json` to streaming `--output-format stream-json --verbose`. Parses NDJSON line-by-line for real-time tool-by-tool progress on stderr during 3-5 min E2E runs. - Extract testable `parseNDJSON()` function (pure, no I/O) - Count turns per assistant event (not per text block) - Add `transcript: any[]` to SkillTestResult, remove dead `messages` field - Reconstruct allText from transcript for browse error scanning - 8 unit tests for parser (malformed lines, empty input, turn counting) Co-Authored-By: Claude Opus 4.6 --- test/helpers/session-runner.test.ts | 96 ++++++++++++++ test/helpers/session-runner.ts | 199 +++++++++++++++++++--------- 2 files changed, 236 insertions(+), 59 deletions(-) create mode 100644 test/helpers/session-runner.test.ts diff --git a/test/helpers/session-runner.test.ts b/test/helpers/session-runner.test.ts new file mode 100644 index 0000000000000000000000000000000000000000..812d4f8adf2180c425161b72a18251346420605d --- /dev/null +++ b/test/helpers/session-runner.test.ts @@ -0,0 +1,96 @@ +import { describe, test, expect } from 'bun:test'; +import { parseNDJSON } from './session-runner'; + +// Fixture: minimal NDJSON session (system init, assistant with tool_use, tool result, assistant text, result) +const FIXTURE_LINES = [ + '{"type":"system","subtype":"init","session_id":"test-123"}', + '{"type":"assistant","message":{"content":[{"type":"tool_use","id":"tu1","name":"Bash","input":{"command":"echo hello"}}]}}', + '{"type":"user","tool_use_result":{"tool_use_id":"tu1","stdout":"hello\\n","stderr":""}}', + '{"type":"assistant","message":{"content":[{"type":"text","text":"The command printed hello."}]}}', + '{"type":"assistant","message":{"content":[{"type":"text","text":"Let me also read a file."},{"type":"tool_use","id":"tu2","name":"Read","input":{"file_path":"/tmp/test"}}]}}', + '{"type":"result","subtype":"success","total_cost_usd":0.05,"num_turns":3,"usage":{"input_tokens":100,"output_tokens":50},"result":"Done."}', +]; + +describe('parseNDJSON', () => { + test('parses valid NDJSON with system + assistant + result events', () => { + const parsed = parseNDJSON(FIXTURE_LINES); + expect(parsed.transcript).toHaveLength(6); + expect(parsed.transcript[0].type).toBe('system'); + expect(parsed.transcript[5].type).toBe('result'); + }); + + test('extracts tool calls from assistant.message.content[].type === tool_use', () => { + const parsed = parseNDJSON(FIXTURE_LINES); + expect(parsed.toolCalls).toHaveLength(2); + expect(parsed.toolCalls[0]).toEqual({ + tool: 'Bash', + input: { command: 'echo hello' }, + output: '', + }); + expect(parsed.toolCalls[1]).toEqual({ + tool: 'Read', + input: { file_path: '/tmp/test' }, + output: '', + }); + expect(parsed.toolCallCount).toBe(2); + }); + + test('skips malformed lines without throwing', () => { + const lines = [ + '{"type":"system"}', + 'this is not json', + '{"type":"assistant","message":{"content":[{"type":"text","text":"ok"}]}}', + '{incomplete json', + '{"type":"result","subtype":"success","result":"done"}', + ]; + const parsed = parseNDJSON(lines); + expect(parsed.transcript).toHaveLength(3); // system, assistant, result + expect(parsed.resultLine?.subtype).toBe('success'); + }); + + test('skips empty and whitespace-only lines', () => { + const lines = [ + '', + ' ', + '{"type":"system"}', + '\t', + '{"type":"result","subtype":"success","result":"ok"}', + ]; + const parsed = parseNDJSON(lines); + expect(parsed.transcript).toHaveLength(2); + }); + + test('extracts resultLine from type: "result" event', () => { + const parsed = parseNDJSON(FIXTURE_LINES); + expect(parsed.resultLine).not.toBeNull(); + expect(parsed.resultLine.subtype).toBe('success'); + expect(parsed.resultLine.total_cost_usd).toBe(0.05); + expect(parsed.resultLine.num_turns).toBe(3); + expect(parsed.resultLine.result).toBe('Done.'); + }); + + test('counts turns correctly — one per assistant event, not per text block', () => { + const parsed = parseNDJSON(FIXTURE_LINES); + // 3 assistant events in fixture (tool_use, text, text+tool_use) + expect(parsed.turnCount).toBe(3); + }); + + test('handles empty input', () => { + const parsed = parseNDJSON([]); + expect(parsed.transcript).toHaveLength(0); + expect(parsed.resultLine).toBeNull(); + expect(parsed.turnCount).toBe(0); + expect(parsed.toolCallCount).toBe(0); + expect(parsed.toolCalls).toHaveLength(0); + }); + + test('handles assistant event with no content array', () => { + const lines = [ + '{"type":"assistant","message":{}}', + '{"type":"assistant"}', + ]; + const parsed = parseNDJSON(lines); + expect(parsed.turnCount).toBe(2); + expect(parsed.toolCalls).toHaveLength(0); + }); +}); diff --git a/test/helpers/session-runner.ts b/test/helpers/session-runner.ts index e33f2c7e60754da8281b1d6e4d1ccb4414a57267..9e7f5cc86f4799fd4e1eb04b5ac04e1daae701d2 100644 --- a/test/helpers/session-runner.ts +++ b/test/helpers/session-runner.ts @@ -2,8 +2,8 @@ * Claude CLI subprocess runner for skill E2E testing. * * Spawns `claude -p` as a completely independent process (not via Agent SDK), - * so it works inside Claude Code sessions. Pipes prompt via stdin, collects - * JSON output, scans for browse errors. + * so it works inside Claude Code sessions. Pipes prompt via stdin, streams + * NDJSON output for real-time progress, scans for browse errors. */ import * as fs from 'fs'; @@ -18,13 +18,13 @@ export interface CostEstimate { } export interface SkillTestResult { - messages: any[]; toolCalls: Array<{ tool: string; input: any; output: string }>; browseErrors: string[]; exitReason: string; duration: number; output: string; costEstimate: CostEstimate; + transcript: any[]; } const BROWSE_ERROR_PATTERNS = [ @@ -36,6 +36,63 @@ const BROWSE_ERROR_PATTERNS = [ /no such file or directory.*browse/i, ]; +// --- Testable NDJSON parser --- + +export interface ParsedNDJSON { + transcript: any[]; + resultLine: any | null; + turnCount: number; + toolCallCount: number; + toolCalls: Array<{ tool: string; input: any; output: string }>; +} + +/** + * Parse an array of NDJSON lines into structured transcript data. + * Pure function — no I/O, no side effects. Used by both the streaming + * reader and unit tests. + */ +export function parseNDJSON(lines: string[]): ParsedNDJSON { + const transcript: any[] = []; + let resultLine: any = null; + let turnCount = 0; + let toolCallCount = 0; + const toolCalls: ParsedNDJSON['toolCalls'] = []; + + for (const line of lines) { + if (!line.trim()) continue; + try { + const event = JSON.parse(line); + transcript.push(event); + + // Track turns and tool calls from assistant events + if (event.type === 'assistant') { + turnCount++; + const content = event.message?.content || []; + for (const item of content) { + if (item.type === 'tool_use') { + toolCallCount++; + toolCalls.push({ + tool: item.name || 'unknown', + input: item.input || {}, + output: '', + }); + } + } + } + + if (event.type === 'result') resultLine = event; + } catch { /* skip malformed lines */ } + } + + return { transcript, resultLine, turnCount, toolCallCount, toolCalls }; +} + +function truncate(s: string, max: number): string { + return s.length > max ? s.slice(0, max) + '…' : s; +} + +// --- Main runner --- + export async function runSkillTest(options: { prompt: string; workingDirectory: string; @@ -53,12 +110,12 @@ export async function runSkillTest(options: { const startTime = Date.now(); - // Spawn claude -p with JSON output. Prompt piped via stdin to avoid - // shell escaping issues. Env is passed through (child claude strips - // its own parent-detection vars internally). + // Spawn claude -p with streaming NDJSON output. Prompt piped via stdin to + // avoid shell escaping issues. --verbose is required for stream-json mode. const args = [ '-p', - '--output-format', 'json', + '--output-format', 'stream-json', + '--verbose', '--dangerously-skip-permissions', '--max-turns', String(maxTurns), '--allowed-tools', ...allowedTools, @@ -75,7 +132,6 @@ export async function runSkillTest(options: { }); // Race against timeout - let stdout = ''; let stderr = ''; let exitReason = 'unknown'; let timedOut = false; @@ -85,50 +141,76 @@ export async function runSkillTest(options: { proc.kill(); }, timeout); + // Stream NDJSON from stdout for real-time progress + const collectedLines: string[] = []; + let liveTurnCount = 0; + let liveToolCount = 0; + const stderrPromise = new Response(proc.stderr).text(); + + const reader = proc.stdout.getReader(); + const decoder = new TextDecoder(); + let buf = ''; + try { - const [outBuf, errBuf] = await Promise.all([ - new Response(proc.stdout).text(), - new Response(proc.stderr).text(), - ]); - stdout = outBuf; - stderr = errBuf; - - const exitCode = await proc.exited; - clearTimeout(timeoutId); - - if (timedOut) { - exitReason = 'timeout'; - } else if (exitCode === 0) { - exitReason = 'success'; - } else { - exitReason = `exit_code_${exitCode}`; + while (true) { + const { done, value } = await reader.read(); + if (done) break; + buf += decoder.decode(value, { stream: true }); + const lines = buf.split('\n'); + buf = lines.pop() || ''; + for (const line of lines) { + if (!line.trim()) continue; + collectedLines.push(line); + + // Real-time progress to stderr + try { + const event = JSON.parse(line); + if (event.type === 'assistant') { + liveTurnCount++; + const content = event.message?.content || []; + for (const item of content) { + if (item.type === 'tool_use') { + liveToolCount++; + const elapsed = Math.round((Date.now() - startTime) / 1000); + process.stderr.write( + ` [${elapsed}s] turn ${liveTurnCount} tool #${liveToolCount}: ${item.name}(${truncate(JSON.stringify(item.input || {}), 80)})\n` + ); + } + } + } + } catch { /* skip — parseNDJSON will handle it later */ } + } } - } catch (err: any) { - clearTimeout(timeoutId); - exitReason = timedOut ? 'timeout' : `error: ${err.message}`; - } finally { - try { fs.unlinkSync(promptFile); } catch { /* non-fatal */ } + } catch { /* stream read error — fall through to exit code handling */ } + + // Flush remaining buffer + if (buf.trim()) { + collectedLines.push(buf); + } + + stderr = await stderrPromise; + const exitCode = await proc.exited; + clearTimeout(timeoutId); + + try { fs.unlinkSync(promptFile); } catch { /* non-fatal */ } + + if (timedOut) { + exitReason = 'timeout'; + } else if (exitCode === 0) { + exitReason = 'success'; + } else { + exitReason = `exit_code_${exitCode}`; } const duration = Date.now() - startTime; - // Parse JSON output - let messages: any[] = []; - let toolCalls: SkillTestResult['toolCalls'] = []; + // Parse all collected NDJSON lines + const parsed = parseNDJSON(collectedLines); + const { transcript, resultLine, toolCalls } = parsed; const browseErrors: string[] = []; - let result: any = null; - - try { - // stdout may have stderr warnings prefixed (e.g., "[WARN] Fast mode...") - // Find the JSON object in the output - const jsonStart = stdout.indexOf('{'); - if (jsonStart >= 0) { - result = JSON.parse(stdout.slice(jsonStart)); - } - } catch { /* non-JSON output */ } - // Scan all output for browse errors - const allText = stdout + '\n' + stderr; + // Scan transcript + stderr for browse errors + const allText = transcript.map(e => JSON.stringify(e)).join('\n') + '\n' + stderr; for (const pattern of BROWSE_ERROR_PATTERNS) { const match = allText.match(pattern); if (match) { @@ -136,13 +218,12 @@ export async function runSkillTest(options: { } } - // If JSON parsed, use the structured result - if (result) { - // Check result type for success - if (result.type === 'result' && result.subtype === 'success') { + // Use resultLine for structured result data + if (resultLine) { + if (resultLine.subtype === 'success') { exitReason = 'success'; - } else if (result.type === 'result' && result.subtype) { - exitReason = result.subtype; + } else if (resultLine.subtype) { + exitReason = resultLine.subtype; } } @@ -160,20 +241,20 @@ export async function runSkillTest(options: { browseErrors, duration, stderr: stderr.slice(0, 2000), - result: result ? { type: result.type, subtype: result.subtype, result: result.result?.slice?.(0, 500) } : null, + result: resultLine ? { type: resultLine.type, subtype: resultLine.subtype, result: resultLine.result?.slice?.(0, 500) } : null, }, null, 2), ); } catch { /* non-fatal */ } } - // Cost from JSON result (exact) or estimate from chars - const turnsUsed = result?.num_turns || 0; - const estimatedCost = result?.total_cost_usd || 0; + // Cost from result line (exact) or estimate from chars + const turnsUsed = resultLine?.num_turns || 0; + const estimatedCost = resultLine?.total_cost_usd || 0; const inputChars = prompt.length; - const outputChars = (result?.result || stdout).length; - const estimatedTokens = (result?.usage?.input_tokens || 0) - + (result?.usage?.output_tokens || 0) - + (result?.usage?.cache_read_input_tokens || 0); + const outputChars = (resultLine?.result || '').length; + const estimatedTokens = (resultLine?.usage?.input_tokens || 0) + + (resultLine?.usage?.output_tokens || 0) + + (resultLine?.usage?.cache_read_input_tokens || 0); const costEstimate: CostEstimate = { inputChars, @@ -183,5 +264,5 @@ export async function runSkillTest(options: { turnsUsed, }; - return { messages, toolCalls, browseErrors, exitReason, duration, output: result?.result || stdout, costEstimate }; + return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript }; }