~cytrogen/gstack

ref: c35e933c7db7be8bf55c089c1246432da624a4ef gstack/test/helpers/session-runner.ts -rw-r--r-- 5.3 KiB
c35e933c — Garry Tan fix: rewrite session-runner to claude -p subprocess, lower flaky baselines a month ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
/**
 * Claude CLI subprocess runner for skill E2E testing.
 *
 * Spawns `claude -p` as a completely independent process (not via Agent SDK),
 * so it works inside Claude Code sessions. Pipes prompt via stdin, collects
 * JSON output, scans for browse errors.
 */

import * as fs from 'fs';
import * as path from 'path';

export interface CostEstimate {
  inputChars: number;
  outputChars: number;
  estimatedTokens: number;
  estimatedCost: number;  // USD
  turnsUsed: number;
}

export interface SkillTestResult {
  messages: any[];
  toolCalls: Array<{ tool: string; input: any; output: string }>;
  browseErrors: string[];
  exitReason: string;
  duration: number;
  output: string;
  costEstimate: CostEstimate;
}

const BROWSE_ERROR_PATTERNS = [
  /Unknown command: \w+/,
  /Unknown snapshot flag: .+/,
  /Exit code 1/,
  /ERROR: browse binary not found/,
  /Server failed to start/,
];

export async function runSkillTest(options: {
  prompt: string;
  workingDirectory: string;
  maxTurns?: number;
  allowedTools?: string[];
  timeout?: number;
}): Promise<SkillTestResult> {
  const {
    prompt,
    workingDirectory,
    maxTurns = 15,
    allowedTools = ['Bash', 'Read', 'Write'],
    timeout = 120_000,
  } = options;

  const startTime = Date.now();

  // Spawn claude -p with JSON output. Prompt piped via stdin to avoid
  // shell escaping issues. Env is passed through (child claude strips
  // its own parent-detection vars internally).
  const args = [
    '-p',
    '--output-format', 'json',
    '--dangerously-skip-permissions',
    '--max-turns', String(maxTurns),
    '--allowed-tools', ...allowedTools,
  ];

  // Write prompt to a temp file and pipe it via shell to avoid stdin buffering issues
  const promptFile = path.join(workingDirectory, '.prompt-tmp');
  fs.writeFileSync(promptFile, prompt);

  const proc = Bun.spawn(['sh', '-c', `cat "${promptFile}" | claude ${args.map(a => `"${a}"`).join(' ')}`], {
    cwd: workingDirectory,
    stdout: 'pipe',
    stderr: 'pipe',
  });

  // Race against timeout
  let stdout = '';
  let stderr = '';
  let exitReason = 'unknown';
  let timedOut = false;

  const timeoutId = setTimeout(() => {
    timedOut = true;
    proc.kill();
  }, timeout);

  try {
    const [outBuf, errBuf] = await Promise.all([
      new Response(proc.stdout).text(),
      new Response(proc.stderr).text(),
    ]);
    stdout = outBuf;
    stderr = errBuf;

    const exitCode = await proc.exited;
    clearTimeout(timeoutId);

    if (timedOut) {
      exitReason = 'timeout';
    } else if (exitCode === 0) {
      exitReason = 'success';
    } else {
      exitReason = `exit_code_${exitCode}`;
    }
  } catch (err: any) {
    clearTimeout(timeoutId);
    exitReason = timedOut ? 'timeout' : `error: ${err.message}`;
  } finally {
    try { fs.unlinkSync(promptFile); } catch { /* non-fatal */ }
  }

  const duration = Date.now() - startTime;

  // Parse JSON output
  let messages: any[] = [];
  let toolCalls: SkillTestResult['toolCalls'] = [];
  const browseErrors: string[] = [];
  let result: any = null;

  try {
    // stdout may have stderr warnings prefixed (e.g., "[WARN] Fast mode...")
    // Find the JSON object in the output
    const jsonStart = stdout.indexOf('{');
    if (jsonStart >= 0) {
      result = JSON.parse(stdout.slice(jsonStart));
    }
  } catch { /* non-JSON output */ }

  // Scan all output for browse errors
  const allText = stdout + '\n' + stderr;
  for (const pattern of BROWSE_ERROR_PATTERNS) {
    const match = allText.match(pattern);
    if (match) {
      browseErrors.push(match[0].slice(0, 200));
    }
  }

  // If JSON parsed, use the structured result
  if (result) {
    // Check result type for success
    if (result.type === 'result' && result.subtype === 'success') {
      exitReason = 'success';
    } else if (result.type === 'result' && result.subtype) {
      exitReason = result.subtype;
    }
  }

  // Save transcript on failure
  if (browseErrors.length > 0 || exitReason !== 'success') {
    try {
      const transcriptDir = path.join(workingDirectory, '.gstack', 'test-transcripts');
      fs.mkdirSync(transcriptDir, { recursive: true });
      const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
      fs.writeFileSync(
        path.join(transcriptDir, `e2e-${timestamp}.json`),
        JSON.stringify({
          prompt: prompt.slice(0, 500),
          exitReason,
          browseErrors,
          duration,
          stderr: stderr.slice(0, 2000),
          result: result ? { type: result.type, subtype: result.subtype, result: result.result?.slice?.(0, 500) } : null,
        }, null, 2),
      );
    } catch { /* non-fatal */ }
  }

  // Cost from JSON result (exact) or estimate from chars
  const turnsUsed = result?.num_turns || 0;
  const estimatedCost = result?.total_cost_usd || 0;
  const inputChars = prompt.length;
  const outputChars = (result?.result || stdout).length;
  const estimatedTokens = (result?.usage?.input_tokens || 0)
    + (result?.usage?.output_tokens || 0)
    + (result?.usage?.cache_read_input_tokens || 0);

  const costEstimate: CostEstimate = {
    inputChars,
    outputChars,
    estimatedTokens,
    estimatedCost: Math.round((estimatedCost) * 100) / 100,
    turnsUsed,
  };

  return { messages, toolCalls, browseErrors, exitReason, duration, output: result?.result || stdout, costEstimate };
}