~cytrogen/gstack

ref: 1717ed28910f9bb9ee98e8415c20051a5f30138e gstack/test/helpers/session-runner.ts -rw-r--r-- 5.0 KiB
1717ed28 — Garry Tan fix: browse binary discovery broken for agents (v0.3.5) (#44) a month ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
/**
 * Agent SDK wrapper for skill E2E testing.
 *
 * Spawns a Claude Code session, runs a prompt, collects messages,
 * scans tool_result messages for browse errors.
 */

import { query } from '@anthropic-ai/claude-agent-sdk';
import * as fs from 'fs';
import * as path from 'path';

/**
 * Outcome of a single `runSkillTest` invocation.
 */
export interface SkillTestResult {
  /** Every message yielded by the session, in the order it arrived. */
  messages: any[];
  /**
   * One entry per `tool_use` block seen in assistant messages.
   * `output` is the text associated with the call; it may be an empty string
   * when no result text was captured.
   */
  toolCalls: { tool: string; input: any; output: string }[];
  /** Output snippets (truncated to 200 characters) that matched a browse error pattern. */
  browseErrors: string[];
  /**
   * Why the run ended: the result message's subtype (or `'success'` when it has
   * none), `'timeout'`, `'error: <message>'`, or `'unknown'` if no result
   * message was received.
   */
  exitReason: string;
  /** Elapsed wall-clock time of the run, in milliseconds. */
  duration: number;
}

/**
 * Text fragments that mark a piece of session output as a browse failure.
 * Any output matching one of these is reported in `browseErrors`.
 */
const BROWSE_ERROR_PATTERNS: RegExp[] = [
  // An unrecognised command name was reported.
  /Unknown command: \w+/,
  // An unrecognised snapshot flag was reported.
  /Unknown snapshot flag: .+/,
  // A command finished with a failing exit status.
  /Exit code 1/,
  // The browse binary could not be located.
  /ERROR: browse binary not found/,
  // The server did not come up.
  /Server failed to start/,
  // A missing-file error that mentions browse (case-insensitive).
  /no such file or directory.*browse/i,
];

/**
 * Flattens a message content value into plain text.
 *
 * Accepts a bare string, a content block carrying `text` or `content`, or an
 * array of such values (joined with newlines). Anything else yields `''`.
 */
function extractBlockText(value: unknown): string {
  if (typeof value === 'string') return value;
  if (Array.isArray(value)) {
    return value
      .map(extractBlockText)
      .filter((part) => part.length > 0)
      .join('\n');
  }
  if (value !== null && typeof value === 'object') {
    const { text, content } = value as { text?: unknown; content?: unknown };
    // Prefer the block's own text; otherwise descend into nested content
    // (tool_result blocks may carry an array of text blocks there).
    return typeof text === 'string' && text.length > 0 ? text : extractBlockText(content);
  }
  return '';
}

/**
 * Appends a truncated copy of `text` to `browseErrors` when it matches any
 * browse error pattern. Each text is recorded at most once per call, even if
 * several patterns match.
 */
function recordBrowseError(text: string, browseErrors: string[]): void {
  if (BROWSE_ERROR_PATTERNS.some((pattern) => pattern.test(text))) {
    browseErrors.push(text.slice(0, 200));
  }
}

/**
 * Runs a prompt in an Agent SDK session and collects what happened.
 *
 * Messages are gathered as they stream in, `tool_use` blocks are recorded as
 * tool calls (with their output filled in from the matching `tool_result`),
 * and all assistant/user text is scanned for browse error patterns. When the
 * run does not end in `'success'` or any browse error was found, a summary
 * transcript is written under `<workingDirectory>/.gstack/test-transcripts`.
 *
 * @param options.prompt - Prompt sent to the session.
 * @param options.workingDirectory - Working directory for the session; also the transcript root.
 * @param options.maxTurns - Turn limit passed to the SDK (default 15).
 * @param options.allowedTools - Tools the session may use (default Bash, Read, Write).
 * @param options.timeout - Wall-clock limit in milliseconds (default 120000).
 * @returns Collected messages, tool calls, browse errors, exit reason and duration.
 *   Session failures and timeouts are reported through `exitReason`, not thrown.
 * @throws Error when invoked from inside a Claude Code session.
 */
export async function runSkillTest(options: {
  prompt: string;
  workingDirectory: string;
  maxTurns?: number;
  allowedTools?: string[];
  timeout?: number;
}): Promise<SkillTestResult> {
  // Fail fast if running inside an Agent SDK session — nested sessions hang
  if (process.env.CLAUDECODE || process.env.CLAUDE_CODE_ENTRYPOINT) {
    throw new Error(
      'Cannot run E2E skill tests inside a Claude Code session. ' +
      'Run from a plain terminal: SKILL_E2E=1 bun test test/skill-e2e.test.ts'
    );
  }

  const {
    prompt,
    workingDirectory,
    maxTurns = 15,
    allowedTools = ['Bash', 'Read', 'Write'],
    timeout = 120_000,
  } = options;

  const messages: any[] = [];
  const toolCalls: SkillTestResult['toolCalls'] = [];
  const browseErrors: string[] = [];
  // tool_use id -> recorded call, so the matching tool_result can fill `output`.
  const pendingCalls = new Map<string, SkillTestResult['toolCalls'][number]>();
  let exitReason = 'unknown';

  const startTime = Date.now();

  // Hand the child the parent's environment minus every Claude-related
  // variable. Without the stripping, the child claude process thinks it's an
  // SDK child and hangs waiting for parent IPC instead of running
  // independently. Stripped keys are set to `undefined` explicitly and all
  // other keys are copied, so the result is correct whether the SDK replaces
  // the environment with this object or merges it over process.env.
  // NOTE(review): confirm which of the two the SDK does.
  const env: Record<string, string | undefined> = {};
  for (const [key, value] of Object.entries(process.env)) {
    env[key] = key.startsWith('CLAUDE') ? undefined : value;
  }

  // Lets the timeout path cancel the session instead of leaving it running.
  const abortController = new AbortController();

  const q = query({
    prompt,
    options: {
      cwd: workingDirectory,
      allowedTools,
      permissionMode: 'bypassPermissions',
      allowDangerouslySkipPermissions: true,
      maxTurns,
      env,
      abortController,
    },
  });

  let timedOut = false;
  let timer: ReturnType<typeof setTimeout> | undefined;
  const timeoutPromise = new Promise<never>((_, reject) => {
    timer = setTimeout(() => {
      timedOut = true;
      reject(new Error(`Skill test timed out after ${timeout}ms`));
    }, timeout);
  });

  try {
    const runner = (async () => {
      // Messages are probed structurally, so treat them as untyped here.
      for await (const msg of q as AsyncIterable<any>) {
        messages.push(msg);

        const content = msg.message?.content;
        const blocks: any[] = content ? (Array.isArray(content) ? content : [content]) : [];

        if (msg.type === 'assistant') {
          for (const block of blocks) {
            if (block?.type === 'tool_use') {
              const call = { tool: block.name, input: block.input, output: '' };
              toolCalls.push(call);
              if (typeof block.id === 'string') {
                pendingCalls.set(block.id, call);
              }
              continue;
            }
            recordBrowseError(extractBlockText(block), browseErrors);
          }
        }

        // User messages carry the tool results.
        if (msg.type === 'user') {
          for (const block of blocks) {
            const text = extractBlockText(block);
            if (block?.type === 'tool_result' && typeof block.tool_use_id === 'string') {
              const call = pendingCalls.get(block.tool_use_id);
              if (call) {
                call.output = text;
              }
            }
            recordBrowseError(text, browseErrors);
          }
        }

        // Capture result
        if (msg.type === 'result') {
          exitReason = msg.subtype || 'success';
        }
      }
    })();

    await Promise.race([runner, timeoutPromise]);
  } catch (err: unknown) {
    if (timedOut) {
      exitReason = 'timeout';
      // Stop the still-running session; the runner's eventual rejection is
      // already handled by the race above.
      abortController.abort();
    } else {
      const message = err instanceof Error ? err.message : String(err);
      exitReason = `error: ${message}`;
    }
  } finally {
    // Without this the pending timer keeps the event loop alive until it fires.
    clearTimeout(timer);
  }

  const duration = Date.now() - startTime;

  // Save transcript on failure
  if (browseErrors.length > 0 || exitReason !== 'success') {
    try {
      const transcriptDir = path.join(workingDirectory, '.gstack', 'test-transcripts');
      fs.mkdirSync(transcriptDir, { recursive: true });
      const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
      const transcriptPath = path.join(transcriptDir, `e2e-${timestamp}.json`);
      fs.writeFileSync(transcriptPath, JSON.stringify({
        prompt,
        exitReason,
        browseErrors,
        duration,
        messages: messages.map(m => ({ type: m.type, subtype: m.subtype })),
      }, null, 2));
    } catch {
      // Transcript save failures are non-fatal
    }
  }

  return { messages, toolCalls, browseErrors, exitReason, duration };
}