~cytrogen/gstack

f9cfabeda8d6521e31134b88db7c50f47e7ae4dc — Garry Tan a month ago eb9a919
feat: add E2E observability — heartbeat, progress.log, NDJSON persistence, savePartial()

session-runner: atomic heartbeat file (e2e-live.json), per-run log directory
(~/.gstack-dev/e2e-runs/{runId}/), progress.log + per-test NDJSON persistence,
failure transcripts to persistent run dir instead of tmpdir.

eval-store: 3 new diagnostic fields (exit_reason, timeout_at_turn, last_tool_call),
savePartial() writes _partial-e2e.json after each addTest() for crash resilience,
finalize() cleans up partial file.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2 files changed, 116 insertions(+), 9 deletions(-)

M test/helpers/eval-store.ts
M test/helpers/session-runner.ts
M test/helpers/eval-store.ts => test/helpers/eval-store.ts +44 -0
@@ 37,6 37,11 @@ export interface EvalTestEntry {
  judge_scores?: Record<string, number>;
  judge_reasoning?: string;

  // Machine-readable diagnostics
  exit_reason?: string;       // 'success' | 'timeout' | 'error_max_turns' | 'exit_code_N'
  timeout_at_turn?: number;   // which turn was active when timeout hit
  last_tool_call?: string;    // e.g. "Write(review-output.md)"

  // Outcome eval
  detection_rate?: number;
  false_positives?: number;


@@ 61,6 66,7 @@ export interface EvalResult {
  total_cost_usd: number;
  total_duration_ms: number;
  tests: EvalTestEntry[];
  _partial?: boolean;  // true for incremental saves, absent in final
}

export interface TestDelta {


@@ 374,6 380,41 @@ export class EvalCollector {

  /**
   * Record one test result and refresh the on-disk partial snapshot.
   *
   * @param entry - The finished test's result entry.
   */
  addTest(entry: EvalTestEntry): void {
    // Push first so the snapshot written by savePartial() includes this entry.
    this.tests.push(entry);
    this.savePartial();
  }

  /**
   * Persist a snapshot of everything collected so far to `_partial-e2e.json`
   * inside `this.evalDir`, flagged with `_partial: true`.
   *
   * The snapshot is written to a `.tmp` sibling and then renamed into place.
   * Every failure is swallowed: partial saves are best-effort and must never
   * break the run.
   */
  savePartial(): void {
    try {
      const gitInfo = getGitInfo();
      const pkgVersion = getVersion();

      // Aggregate cost, duration and pass count over the tests recorded so far.
      let costSum = 0;
      let durationSum = 0;
      let passCount = 0;
      for (const t of this.tests) {
        costSum = costSum + t.cost_usd;
        durationSum = durationSum + t.duration_ms;
        if (t.passed) passCount += 1;
      }
      const testCount = this.tests.length;

      const snapshot: EvalResult = {
        schema_version: SCHEMA_VERSION,
        version: pkgVersion,
        branch: gitInfo.branch,
        git_sha: gitInfo.sha,
        timestamp: new Date().toISOString(),
        hostname: os.hostname(),
        tier: this.tier,
        total_tests: testCount,
        passed: passCount,
        failed: testCount - passCount,
        // Round the running cost to whole cents.
        total_cost_usd: Math.round(costSum * 100) / 100,
        total_duration_ms: durationSum,
        tests: this.tests,
        _partial: true,
      };

      fs.mkdirSync(this.evalDir, { recursive: true });
      const target = path.join(this.evalDir, '_partial-e2e.json');
      const staging = target + '.tmp';
      // Write to the staging file, then rename over the target in one step.
      fs.writeFileSync(staging, JSON.stringify(snapshot, null, 2) + '\n');
      fs.renameSync(staging, target);
    } catch { /* non-fatal — partial saves are best-effort */ }
  }

  async finalize(): Promise<string> {


@@ 403,6 444,9 @@ export class EvalCollector {
      tests: this.tests,
    };

    // Delete partial file now that we're writing the final
    try { fs.unlinkSync(path.join(this.evalDir, '_partial-e2e.json')); } catch { /* may not exist */ }

    // Write eval file
    fs.mkdirSync(this.evalDir, { recursive: true });
    const dateStr = timestamp.replace(/[:.]/g, '').replace('T', '-').slice(0, 15);

M test/helpers/session-runner.ts => test/helpers/session-runner.ts +72 -9
@@ 8,6 8,22 @@

import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';

// Root directory for gstack dev artifacts: `.gstack-dev` under the current user's home directory.
const GSTACK_DEV_DIR = path.join(os.homedir(), '.gstack-dev');
// Path of the heartbeat file (`e2e-live.json`) inside GSTACK_DEV_DIR.
const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json');

/**
 * Turn a test name into a flat filename stem.
 *
 * Leading slashes are dropped and every remaining `/` becomes `-`,
 * so `/qa/basic` yields `qa-basic`.
 *
 * @param name - Test name, possibly slash-prefixed or slash-separated.
 * @returns The name with no `/` characters left in it.
 */
export function sanitizeTestName(name: string): string {
  const withoutLeadingSlashes = name.replace(/^\/+/, '');
  return withoutLeadingSlashes.split('/').join('-');
}

/**
 * Replace `filePath` with `data` by writing a sibling `.tmp` file and then
 * renaming it over the target, so readers do not observe a half-written file.
 *
 * This function THROWS when the write or rename fails; callers that want
 * best-effort semantics must wrap the call in try/catch. On failure the temp
 * file is removed (best-effort) so a failed write does not leave a stale
 * `.tmp` file behind.
 *
 * @param filePath - Destination path; the temp file is `filePath + '.tmp'`.
 * @param data - Full contents to write.
 * @throws The original error from `fs.writeFileSync` / `fs.renameSync`.
 */
function atomicWriteSync(filePath: string, data: string): void {
  const tmp = filePath + '.tmp';
  try {
    fs.writeFileSync(tmp, data);
    fs.renameSync(tmp, filePath);
  } catch (err: unknown) {
    // Clean up the staging file; it may not exist if the write itself failed.
    try { fs.unlinkSync(tmp); } catch { /* nothing to clean up */ }
    throw err;
  }
}

export interface CostEstimate {
  inputChars: number;


@@ 98,6 114,8 @@ export async function runSkillTest(options: {
  maxTurns?: number;
  allowedTools?: string[];
  timeout?: number;
  testName?: string;
  runId?: string;
}): Promise<SkillTestResult> {
  const {
    prompt,


@@ 105,9 123,22 @@ export async function runSkillTest(options: {
    maxTurns = 15,
    allowedTools = ['Bash', 'Read', 'Write'],
    timeout = 120_000,
    testName,
    runId,
  } = options;

  const startTime = Date.now();
  const startedAt = new Date().toISOString();

  // Set up per-run log directory if runId is provided
  let runDir: string | null = null;
  const safeName = testName ? sanitizeTestName(testName) : null;
  if (runId) {
    try {
      runDir = path.join(GSTACK_DEV_DIR, 'e2e-runs', runId);
      fs.mkdirSync(runDir, { recursive: true });
    } catch { /* non-fatal */ }
  }

  // Spawn claude -p with streaming NDJSON output. Prompt piped via stdin to
  // avoid shell escaping issues. --verbose is required for stream-json mode.


@@ 161,7 192,7 @@ export async function runSkillTest(options: {
        if (!line.trim()) continue;
        collectedLines.push(line);

        // Real-time progress to stderr
        // Real-time progress to stderr + persistent logs
        try {
          const event = JSON.parse(line);
          if (event.type === 'assistant') {


@@ 171,13 202,40 @@ export async function runSkillTest(options: {
              if (item.type === 'tool_use') {
                liveToolCount++;
                const elapsed = Math.round((Date.now() - startTime) / 1000);
                process.stderr.write(
                  `  [${elapsed}s] turn ${liveTurnCount} tool #${liveToolCount}: ${item.name}(${truncate(JSON.stringify(item.input || {}), 80)})\n`
                );
                const progressLine = `  [${elapsed}s] turn ${liveTurnCount} tool #${liveToolCount}: ${item.name}(${truncate(JSON.stringify(item.input || {}), 80)})\n`;
                process.stderr.write(progressLine);

                // Persist progress.log
                if (runDir) {
                  try { fs.appendFileSync(path.join(runDir, 'progress.log'), progressLine); } catch { /* non-fatal */ }
                }

                // Write heartbeat (atomic)
                if (runId && testName) {
                  try {
                    const toolDesc = `${item.name}(${truncate(JSON.stringify(item.input || {}), 60)})`;
                    atomicWriteSync(HEARTBEAT_PATH, JSON.stringify({
                      runId,
                      startedAt,
                      currentTest: testName,
                      status: 'running',
                      turn: liveTurnCount,
                      toolCount: liveToolCount,
                      lastTool: toolDesc,
                      lastToolAt: new Date().toISOString(),
                      elapsedSec: elapsed,
                    }, null, 2) + '\n');
                  } catch { /* non-fatal */ }
                }
              }
            }
          }
        } catch { /* skip — parseNDJSON will handle it later */ }

        // Append raw NDJSON line to per-test transcript file
        if (runDir && safeName) {
          try { fs.appendFileSync(path.join(runDir, `${safeName}.ndjson`), line + '\n'); } catch { /* non-fatal */ }
        }
      }
    }
  } catch { /* stream read error — fall through to exit code handling */ }


@@ 226,19 284,24 @@ export async function runSkillTest(options: {
    }
  }

  // Save transcript on failure
  // Save failure transcript to persistent run directory (or fallback to workingDirectory)
  if (browseErrors.length > 0 || exitReason !== 'success') {
    try {
      const transcriptDir = path.join(workingDirectory, '.gstack', 'test-transcripts');
      fs.mkdirSync(transcriptDir, { recursive: true });
      const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
      const failureDir = runDir || path.join(workingDirectory, '.gstack', 'test-transcripts');
      fs.mkdirSync(failureDir, { recursive: true });
      const failureName = safeName
        ? `${safeName}-failure.json`
        : `e2e-${new Date().toISOString().replace(/[:.]/g, '-')}.json`;
      fs.writeFileSync(
        path.join(transcriptDir, `e2e-${timestamp}.json`),
        path.join(failureDir, failureName),
        JSON.stringify({
          prompt: prompt.slice(0, 500),
          testName: testName || 'unknown',
          exitReason,
          browseErrors,
          duration,
          turnAtTimeout: timedOut ? liveTurnCount : undefined,
          lastToolCall: liveToolCount > 0 ? `tool #${liveToolCount}` : undefined,
          stderr: stderr.slice(0, 2000),
          result: resultLine ? { type: resultLine.type, subtype: resultLine.subtype, result: resultLine.result?.slice?.(0, 500) } : null,
        }, null, 2),