~cytrogen/gstack

6a6b2b076641dfdbec23d1e763ba1d62532ef035 — Garry Tan a month ago ae2d841
feat: Gemini CLI E2E tests (v0.9.2.0) (#252)

* feat: add Gemini CLI session runner + JSONL parser

Subprocess wrapper for `gemini -p --output-format stream-json --yolo`
that spawns the Gemini CLI and parses NDJSON events (init, message,
tool_use, tool_result, result) into a structured GeminiResult.

Includes 10 unit tests for parseGeminiJSONL covering happy path,
malformed input, empty input, missing fields, and multi-tool scenarios.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* feat: add Gemini CLI E2E tests

Two E2E tests (gemini-discover-skill, gemini-review-findings) that
verify gstack skills work when invoked by the Gemini CLI. Follows
the same pattern as codex-e2e.test.ts — gated by EVALS=1 + binary
availability, diff-based selection via touchfiles, eval persistence.

- Add test/gemini-e2e.test.ts
- Add Gemini entries to E2E_TOUCHFILES and GLOBAL_TOUCHFILES
- Add test:gemini and test:gemini:all scripts to package.json
- Add gemini-e2e.test.ts to test:evals, test:e2e, and ignore list

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* chore: bump version and changelog (v0.9.2.0)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
M CHANGELOG.md => CHANGELOG.md +8 -0
@@ 1,5 1,13 @@
# Changelog

## [0.9.2.0] - 2026-03-20 — Gemini CLI E2E Tests

### Added

- **Gemini CLI is now tested end-to-end.** Two E2E tests verify that gstack skills work when invoked by Google's Gemini CLI (`gemini -p`). The `gemini-discover-skill` test confirms skill discovery from `.agents/skills/`, and `gemini-review-findings` runs a full code review via gstack-review. Both parse Gemini's stream-json NDJSON output and track token usage.
- **Gemini JSONL parser with 10 unit tests.** `parseGeminiJSONL` handles all Gemini event types (init, message, tool_use, tool_result, result) with defensive parsing for malformed input. The parser is a pure function, independently testable without spawning the CLI.
- **`bun run test:gemini`** and **`bun run test:gemini:all`** scripts for running Gemini E2E tests independently. Gemini tests are also included in `test:evals` and `test:e2e` aggregate scripts.

## [0.9.1.0] - 2026-03-20 — Adversarial Spec Review + Skill Chaining

### Added

M VERSION => VERSION +1 -1
@@ 1,1 1,1 @@
0.9.1.0
0.9.2.0

M package.json => package.json +7 -5
@@ 12,13 12,15 @@
    "gen:skill-docs": "bun run scripts/gen-skill-docs.ts",
    "dev": "bun run browse/src/cli.ts",
    "server": "bun run browse/src/server.ts",
    "test": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts --ignore test/skill-routing-e2e.test.ts --ignore test/codex-e2e.test.ts",
    "test:evals": "EVALS=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts",
    "test:evals:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts",
    "test:e2e": "EVALS=1 bun test test/skill-e2e.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts",
    "test:e2e:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-e2e.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts",
    "test": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts --ignore test/skill-routing-e2e.test.ts --ignore test/codex-e2e.test.ts --ignore test/gemini-e2e.test.ts",
    "test:evals": "EVALS=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts",
    "test:evals:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts",
    "test:e2e": "EVALS=1 bun test test/skill-e2e.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts",
    "test:e2e:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-e2e.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts",
    "test:codex": "EVALS=1 bun test test/codex-e2e.test.ts",
    "test:codex:all": "EVALS=1 EVALS_ALL=1 bun test test/codex-e2e.test.ts",
    "test:gemini": "EVALS=1 bun test test/gemini-e2e.test.ts",
    "test:gemini:all": "EVALS=1 EVALS_ALL=1 bun test test/gemini-e2e.test.ts",
    "skill:check": "bun run scripts/skill-check.ts",
    "dev:skill": "bun run scripts/dev-skill.ts",
    "start": "bun run browse/src/server.ts",

A test/gemini-e2e.test.ts => test/gemini-e2e.test.ts +173 -0
@@ 0,0 1,173 @@
/**
 * Gemini CLI E2E tests — verify skills work when invoked by Gemini CLI.
 *
 * Spawns `gemini -p` with stream-json output in the repo root (where
 * .agents/skills/ already exists), parses JSONL events, and validates
 * structured results. Follows the same pattern as codex-e2e.test.ts.
 *
 * Prerequisites:
 * - `gemini` binary installed (npm install -g @google/gemini-cli)
 * - Gemini authenticated via ~/.gemini/ config or GEMINI_API_KEY env var
 * - EVALS=1 env var set (same gate as Claude E2E tests)
 *
 * Skips gracefully when prerequisites are not met.
 */

import { describe, test, expect, afterAll } from 'bun:test';
import { runGeminiSkill } from './helpers/gemini-session-runner';
import type { GeminiResult } from './helpers/gemini-session-runner';
import { EvalCollector } from './helpers/eval-store';
import { selectTests, detectBaseBranch, getChangedFiles, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
import * as path from 'path';

const ROOT = path.resolve(import.meta.dir, '..');

// --- Prerequisites check ---

/** Reports whether `which gemini` succeeds; any failure of the probe counts as "not installed". */
function probeGeminiBinary(): boolean {
  try {
    return Bun.spawnSync(['which', 'gemini']).exitCode === 0;
  } catch {
    return false;
  }
}

const GEMINI_AVAILABLE = probeGeminiBinary();

const evalsEnabled = Boolean(process.env.EVALS);

// The suite only runs when the binary is present AND EVALS is set.
const SKIP = !(GEMINI_AVAILABLE && evalsEnabled);

const describeGemini = SKIP ? describe.skip : describe;

// Explain a skip only when EVALS is on; without EVALS the suite stays silent,
// same as the Claude E2E tests.
if (evalsEnabled && !GEMINI_AVAILABLE) {
  process.stderr.write('\nGemini E2E: SKIPPED — gemini binary not found (install: npm i -g @google/gemini-cli)\n');
}

// --- Diff-based test selection ---

// Touchfile globs for each Gemini E2E test, keyed by test name (same layout as Codex E2E).
const GEMINI_E2E_TOUCHFILES: Record<string, string[]> = {
  'gemini-discover-skill': [
    '.agents/skills/**',
    'test/helpers/gemini-session-runner.ts',
  ],
  'gemini-review-findings': [
    'review/**',
    '.agents/skills/gstack-review/**',
    'test/helpers/gemini-session-runner.ts',
  ],
};

// Test names chosen by diff-based selection; null means every test runs.
let selectedTests: string[] | null = null;

// Selection only applies when evals are on and EVALS_ALL is not set.
if (evalsEnabled && !process.env.EVALS_ALL) {
  const baseBranch = process.env.EVALS_BASE || detectBaseBranch(ROOT) || 'main';
  const changedFiles = getChangedFiles(baseBranch, ROOT);

  // With no changed files, selectedTests stays null and nothing is filtered out.
  if (changedFiles.length > 0) {
    const { selected, skipped, reason } = selectTests(changedFiles, GEMINI_E2E_TOUCHFILES, GLOBAL_TOUCHFILES);
    const totalTests = Object.keys(GEMINI_E2E_TOUCHFILES).length;

    selectedTests = selected;
    process.stderr.write(`\nGemini E2E selection (${reason}): ${selected.length}/${totalTests} tests\n`);
    if (skipped.length > 0) {
      process.stderr.write(`  Skipped: ${skipped.join(', ')}\n`);
    }
    process.stderr.write('\n');
  }
}

/**
 * Register a test under `testName`, marking it skipped when diff-based
 * selection produced a list that does not contain it.
 */
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
  const excluded = selectedTests !== null && !selectedTests.includes(testName);
  const register = excluded ? test.skip : test;
  register(testName, fn, timeout);
}

// --- Eval result collector ---

// No collector is created when evals are off or the suite is skipped.
const evalCollector = !evalsEnabled || SKIP ? null : new EvalCollector('e2e-gemini');

/**
 * Add one Gemini E2E outcome to the eval collector.
 * Does nothing when no collector was created.
 */
function recordGeminiE2E(name: string, result: GeminiResult, passed: boolean) {
  if (!evalCollector) return;

  const { durationMs, output, toolCalls, exitCode } = result;
  const exitReason = exitCode === 0 ? 'success' : `exit_code_${exitCode}`;

  evalCollector.addTest({
    name,
    suite: 'gemini-e2e',
    tier: 'e2e',
    passed,
    duration_ms: durationMs,
    // Gemini reports tokens rather than a USD cost, so cost is recorded as 0.
    cost_usd: 0,
    output: output?.slice(0, 2000),
    // Approximation: the number of tool calls stands in for the turn count.
    turns_used: toolCalls.length,
    exit_reason: exitReason,
  });
}

/** Log a one-line usage summary (tokens, tool-call count, whole seconds) for a Gemini E2E run. */
function logGeminiCost(label: string, result: GeminiResult) {
  const { tokens, toolCalls, durationMs } = result;
  const seconds = Math.round(durationMs / 1000);
  const summary = [`${tokens} tokens`, `${toolCalls.length} tool calls`, `${seconds}s`].join(', ');
  console.log(`${label}: ${summary}`);
}

// Once all tests have run, flush collected eval results (if a collector exists).
afterAll(async () => {
  if (!evalCollector) return;
  await evalCollector.finalize();
});

// --- Tests ---

describeGemini('Gemini E2E', () => {

  // Asks Gemini to list its skills; the answer must mention them in some form.
  testIfSelected('gemini-discover-skill', async () => {
    // Run Gemini in the repo root where .agents/skills/ exists
    const result = await runGeminiSkill({
      prompt: 'List any skills or instructions you have available. Just list the names.',
      timeoutMs: 60_000,
      cwd: ROOT,
    });

    logGeminiCost('gemini-discover-skill', result);

    // Evaluate every criterion BEFORE recording, so the stored `passed` flag
    // agrees with the assertions below instead of only reflecting exit code
    // and output length.
    const outputLower = result.output.toLowerCase();
    const mentionsSkills = ['review', 'gstack', 'skill'].some((kw) => outputLower.includes(kw));

    const passed = result.exitCode === 0 && result.output.length > 0 && mentionsSkills;
    recordGeminiE2E('gemini-discover-skill', result, passed);

    expect(result.exitCode).toBe(0);
    expect(result.output.length).toBeGreaterThan(0);
    // The output should reference skills in some form
    expect(mentionsSkills).toBe(true);
  }, 120_000);

  // Asks Gemini to run gstack-review; the answer must look like a review.
  testIfSelected('gemini-review-findings', async () => {
    // Run gstack-review skill via Gemini on this repo
    const result = await runGeminiSkill({
      prompt: 'Run the gstack-review skill on this repository. Review the current branch diff and report your findings.',
      timeoutMs: 540_000,
      cwd: ROOT,
    });

    logGeminiCost('gemini-review-findings', result);

    const output = result.output;

    // Review output should contain some review-like content
    const outputLower = output.toLowerCase();
    const reviewKeywords = [
      'finding',
      'issue',
      'review',
      'change',
      'diff',
      'clean',
      'no issues',
      'p1',
      'p2',
    ];
    const hasReviewContent = reviewKeywords.some((kw) => outputLower.includes(kw));

    // Record the same verdict the assertions below enforce.
    const passed = result.exitCode === 0 && output.length > 50 && hasReviewContent;
    recordGeminiE2E('gemini-review-findings', result, passed);

    expect(result.exitCode).toBe(0);
    expect(output.length).toBeGreaterThan(50);
    expect(hasReviewContent).toBe(true);
  }, 600_000);
});

A test/helpers/gemini-session-runner.test.ts => test/helpers/gemini-session-runner.test.ts +104 -0
@@ 0,0 1,104 @@
import { describe, test, expect } from 'bun:test';
import { parseGeminiJSONL } from './gemini-session-runner';

// Fixture: a Gemini CLI stream-json transcript containing one tool call.
const FIXTURE_LINES = [
  '{"type":"init","timestamp":"2026-03-20T15:14:46.455Z","session_id":"test-session-123","model":"auto-gemini-3"}',
  '{"type":"message","timestamp":"2026-03-20T15:14:46.456Z","role":"user","content":"list the files"}',
  '{"type":"message","timestamp":"2026-03-20T15:14:49.650Z","role":"assistant","content":"I will list the files.","delta":true}',
  '{"type":"tool_use","timestamp":"2026-03-20T15:14:49.690Z","tool_name":"run_shell_command","tool_id":"cmd_1","parameters":{"command":"ls"}}',
  '{"type":"tool_result","timestamp":"2026-03-20T15:14:49.931Z","tool_id":"cmd_1","status":"success","output":"file1.ts\\nfile2.ts"}',
  '{"type":"message","timestamp":"2026-03-20T15:14:51.945Z","role":"assistant","content":"Here are the files.","delta":true}',
  '{"type":"result","timestamp":"2026-03-20T15:14:52.030Z","status":"success","stats":{"total_tokens":27147,"input_tokens":26928,"output_tokens":87,"cached":0,"duration_ms":5575,"tool_calls":1}}',
];

describe('parseGeminiJSONL', () => {
  test('extracts session ID from init event', () => {
    const { sessionId } = parseGeminiJSONL(FIXTURE_LINES);
    expect(sessionId).toBe('test-session-123');
  });

  test('concatenates assistant message deltas into output', () => {
    const { output } = parseGeminiJSONL(FIXTURE_LINES);
    expect(output).toBe('I will list the files.Here are the files.');
  });

  test('ignores user messages', () => {
    const { output } = parseGeminiJSONL([
      '{"type":"message","role":"user","content":"this should be ignored"}',
      '{"type":"message","role":"assistant","content":"this should be kept","delta":true}',
    ]);
    expect(output).toBe('this should be kept');
  });

  test('extracts tool names from tool_use events', () => {
    const { toolCalls } = parseGeminiJSONL(FIXTURE_LINES);
    expect(toolCalls).toHaveLength(1);
    expect(toolCalls[0]).toBe('run_shell_command');
  });

  test('extracts total tokens from result stats', () => {
    const { tokens } = parseGeminiJSONL(FIXTURE_LINES);
    expect(tokens).toBe(27147);
  });

  test('skips malformed lines without throwing', () => {
    const input = [
      '{"type":"init","session_id":"ok"}',
      'this is not json',
      '{"type":"message","role":"assistant","content":"hello","delta":true}',
      '{incomplete json',
      '{"type":"result","status":"success","stats":{"total_tokens":100}}',
    ];
    const { sessionId, output, tokens } = parseGeminiJSONL(input);
    expect(sessionId).toBe('ok');
    expect(output).toBe('hello');
    expect(tokens).toBe(100);
  });

  test('skips empty and whitespace-only lines', () => {
    const input = [
      '',
      '  ',
      '{"type":"init","session_id":"s1"}',
      '\t',
      '{"type":"result","status":"success","stats":{"total_tokens":50}}',
    ];
    const { sessionId, tokens } = parseGeminiJSONL(input);
    expect(sessionId).toBe('s1');
    expect(tokens).toBe(50);
  });

  test('handles empty input', () => {
    const { output, toolCalls, tokens, sessionId } = parseGeminiJSONL([]);
    expect(output).toBe('');
    expect(toolCalls).toHaveLength(0);
    expect(tokens).toBe(0);
    expect(sessionId).toBeNull();
  });

  test('handles missing fields gracefully', () => {
    // Each event has a valid type but lacks the field the parser reads from it.
    const input = [
      '{"type":"init"}',                              // no session_id
      '{"type":"message","role":"assistant"}',         // no content
      '{"type":"tool_use"}',                           // no tool_name
      '{"type":"result","status":"success"}',          // no stats
    ];
    const { sessionId, output, toolCalls, tokens } = parseGeminiJSONL(input);
    expect(sessionId).toBeNull();
    expect(output).toBe('');
    expect(toolCalls).toHaveLength(0);
    expect(tokens).toBe(0);
  });

  test('handles multiple tool_use events', () => {
    const input = [
      '{"type":"tool_use","tool_name":"run_shell_command","tool_id":"cmd_1","parameters":{"command":"ls"}}',
      '{"type":"tool_use","tool_name":"read_file","tool_id":"cmd_2","parameters":{"path":"foo.ts"}}',
      '{"type":"tool_use","tool_name":"run_shell_command","tool_id":"cmd_3","parameters":{"command":"cat bar.ts"}}',
    ];
    const { toolCalls } = parseGeminiJSONL(input);
    // Order and duplicates are preserved.
    expect(toolCalls).toEqual(['run_shell_command', 'read_file', 'run_shell_command']);
  });
});

A test/helpers/gemini-session-runner.ts => test/helpers/gemini-session-runner.ts +201 -0
@@ 0,0 1,201 @@
/**
 * Gemini CLI subprocess runner for skill E2E testing.
 *
 * Spawns `gemini -p` as an independent process, parses its stream-json
 * output, and returns structured results. Follows the same pattern as
 * codex-session-runner.ts but adapted for the Gemini CLI.
 *
 * Key differences from Codex session-runner:
 * - Uses `gemini -p` instead of `codex exec`
 * - Output is NDJSON with event types: init, message, tool_use, tool_result, result
 * - Uses `--output-format stream-json --yolo` instead of `--json -s read-only`
 * - No temp HOME needed — Gemini discovers skills from `.agents/skills/` in cwd
 * - Message events are streamed with `delta: true` — must concatenate
 */

import * as path from 'path';

// --- Interfaces ---

/** Structured outcome of one `gemini -p` run, as returned by runGeminiSkill. */
export interface GeminiResult {
  /** Assistant message text: the content of all assistant `message` events, concatenated in order. */
  output: string;
  /** Tool names taken from `tool_use` events, in the order they appeared. */
  toolCalls: string[];
  /** Total tokens, read from `stats.total_tokens` of the `result` event (0 if absent). */
  tokens: number;
  /** Process exit code; runGeminiSkill reports 124 on timeout and -1 when the binary is not found. */
  exitCode: number;
  /** Wall-clock time of the run in milliseconds. */
  durationMs: number;
  /** Session ID from the `init` event, or null if none was seen. */
  sessionId: string | null;
  /** The non-blank stdout lines exactly as collected, kept for debugging. */
  rawLines: string[];
}

// --- JSONL parser ---

/** Fields extracted from a Gemini stream-json transcript by parseGeminiJSONL. */
export interface ParsedGeminiJSONL {
  output: string;
  toolCalls: string[];
  tokens: number;
  sessionId: string | null;
}

/**
 * Parse an array of JSONL lines from `gemini -p --output-format stream-json`.
 * Pure function — no I/O, no side effects.
 *
 * Handles these Gemini event types:
 * - init → `session_id` becomes `sessionId`
 * - message with role=assistant → `content` is appended to `output`
 *   (the `delta` flag is not inspected)
 * - tool_use → `tool_name` is appended to `toolCalls`
 * - result → `stats.total_tokens` becomes `tokens` (0 when absent)
 * - anything else, including tool_result → ignored
 *
 * Blank lines, lines that are not valid JSON, and JSON values that are not
 * objects are skipped. Fields are only accepted when they have the expected
 * primitive type, so a malformed event cannot smuggle a non-string into
 * `output`/`toolCalls`/`sessionId` or a non-number into `tokens`.
 *
 * @param lines Raw stdout lines, one JSON document per line.
 * @returns The extracted fields; never throws on malformed input.
 */
export function parseGeminiJSONL(lines: string[]): ParsedGeminiJSONL {
  const outputParts: string[] = [];
  const toolCalls: string[] = [];
  let tokens = 0;
  let sessionId: string | null = null;

  for (const line of lines) {
    if (!line.trim()) continue;

    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      continue; // malformed line
    }

    // Valid JSON that is not an object (null, number, string, array) carries no event.
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) continue;
    const event = parsed as Record<string, unknown>;

    switch (event.type) {
      case 'init':
        if (typeof event.session_id === 'string' && event.session_id) {
          sessionId = event.session_id;
        }
        break;

      case 'message':
        if (event.role === 'assistant' && typeof event.content === 'string' && event.content) {
          outputParts.push(event.content);
        }
        break;

      case 'tool_use':
        if (typeof event.tool_name === 'string' && event.tool_name) {
          toolCalls.push(event.tool_name);
        }
        break;

      case 'result': {
        const stats = event.stats;
        const total =
          typeof stats === 'object' && stats !== null
            ? (stats as Record<string, unknown>).total_tokens
            : undefined;
        // A result event always overwrites the count; unusable values count as 0.
        tokens = typeof total === 'number' && Number.isFinite(total) ? total : 0;
        break;
      }

      default:
        // tool_result and unknown event types carry nothing we extract.
        break;
    }
  }

  return {
    output: outputParts.join(''),
    toolCalls,
    tokens,
    sessionId,
  };
}

// --- Main runner ---

/**
 * Run a prompt via `gemini -p` and return structured results.
 *
 * Spawns `gemini -p <prompt> --output-format stream-json --yolo`, collects the
 * non-blank stdout lines, and hands them to parseGeminiJSONL. While the
 * process runs, tool calls and assistant messages are echoed to stderr as
 * progress; the first 200 characters of the child's stderr are echoed at the end.
 *
 * A missing binary does not throw: the returned result then has
 * `exitCode: -1` and `output: 'SKIP: gemini binary not found'`.
 * If the timeout fires, the process is killed and `exitCode` is reported as 124.
 *
 * @param opts.prompt    Prompt passed to `gemini -p`.
 * @param opts.timeoutMs Kill the process after this many milliseconds (default 300000).
 * @param opts.cwd       Working directory for the process (default `process.cwd()`).
 * @returns Parsed output, tool calls, tokens, exit code, duration and raw lines.
 */
export async function runGeminiSkill(opts: {
  prompt: string;           // What to ask Gemini
  timeoutMs?: number;       // Default 300000 (5 min)
  cwd?: string;             // Working directory (where .agents/skills/ lives)
}): Promise<GeminiResult> {
  const {
    prompt,
    timeoutMs = 300_000,
    cwd,
  } = opts;

  const startTime = Date.now();

  // Check if gemini binary exists. The probe is guarded so that a failure to
  // run `which` itself is treated like "not found" rather than escaping as an
  // exception (matches the guarded probe in gemini-e2e.test.ts).
  let geminiFound = false;
  try {
    geminiFound = Bun.spawnSync(['which', 'gemini']).exitCode === 0;
  } catch {
    geminiFound = false;
  }
  if (!geminiFound) {
    return {
      output: 'SKIP: gemini binary not found',
      toolCalls: [],
      tokens: 0,
      exitCode: -1,
      durationMs: Date.now() - startTime,
      sessionId: null,
      rawLines: [],
    };
  }

  // Build gemini command
  const args = ['-p', prompt, '--output-format', 'stream-json', '--yolo'];

  // Spawn gemini — uses real HOME for auth, cwd for skill discovery
  const proc = Bun.spawn(['gemini', ...args], {
    cwd: cwd || process.cwd(),
    stdout: 'pipe',
    stderr: 'pipe',
  });

  // Race against timeout
  let timedOut = false;
  const timeoutId = setTimeout(() => {
    timedOut = true;
    proc.kill();
  }, timeoutMs);

  // Everything below runs under try/finally so the timer is cleared on every
  // exit path, including an unexpected rejection while awaiting the process.
  try {
    // Stream and collect JSONL from stdout; drain stderr concurrently.
    const collectedLines: string[] = [];
    const stderrPromise = new Response(proc.stderr).text();

    const reader = proc.stdout.getReader();
    const decoder = new TextDecoder();
    let buf = '';

    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        buf += decoder.decode(value, { stream: true });
        const lines = buf.split('\n');
        // The last piece is an incomplete line (or ''); keep it for the next chunk.
        buf = lines.pop() ?? '';
        for (const line of lines) {
          if (!line.trim()) continue;
          collectedLines.push(line);

          // Real-time progress to stderr
          try {
            const event = JSON.parse(line);
            const elapsed = Math.round((Date.now() - startTime) / 1000);
            if (event.type === 'tool_use' && event.tool_name) {
              process.stderr.write(`  [gemini ${elapsed}s] tool: ${event.tool_name}\n`);
            } else if (event.type === 'message' && event.role === 'assistant' && typeof event.content === 'string' && event.content) {
              process.stderr.write(`  [gemini ${elapsed}s] message: ${event.content.slice(0, 100)}\n`);
            }
          } catch { /* skip — parseGeminiJSONL will handle it later */ }
        }
      }
      // End of stream: flush any bytes the streaming decoder is still holding.
      buf += decoder.decode();
    } catch { /* stream read error — fall through to exit code handling */ }

    // Flush remaining buffer (a final line without a trailing newline)
    if (buf.trim()) {
      collectedLines.push(buf);
    }

    const stderr = await stderrPromise;
    const exitCode = await proc.exited;

    const durationMs = Date.now() - startTime;

    // Parse all collected JSONL lines
    const parsed = parseGeminiJSONL(collectedLines);

    // Log stderr if non-empty (may contain auth errors, etc.)
    if (stderr.trim()) {
      process.stderr.write(`  [gemini stderr] ${stderr.trim().slice(0, 200)}\n`);
    }

    return {
      output: parsed.output,
      toolCalls: parsed.toolCalls,
      tokens: parsed.tokens,
      exitCode: timedOut ? 124 : exitCode,
      durationMs,
      sessionId: parsed.sessionId,
      rawLines: collectedLines,
    };
  } finally {
    clearTimeout(timeoutId);
  }
}

M test/helpers/touchfiles.ts => test/helpers/touchfiles.ts +5 -0
@@ 84,6 84,10 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
  'codex-discover-skill':  ['codex/**', '.agents/skills/**', 'test/helpers/codex-session-runner.ts'],
  'codex-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'codex/**', 'test/helpers/codex-session-runner.ts'],

  // Gemini E2E (tests skills via Gemini CLI)
  'gemini-discover-skill':  ['.agents/skills/**', 'test/helpers/gemini-session-runner.ts'],
  'gemini-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'test/helpers/gemini-session-runner.ts'],

  // QA bootstrap
  'qa-bootstrap': ['qa/**', 'browse/src/**', 'ship/**'],



@@ 160,6 164,7 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
export const GLOBAL_TOUCHFILES = [
  'test/helpers/session-runner.ts',
  'test/helpers/codex-session-runner.ts',
  'test/helpers/gemini-session-runner.ts',
  'test/helpers/eval-store.ts',
  'test/helpers/llm-judge.ts',
  'scripts/gen-skill-docs.ts',