~cytrogen/gstack

ref: 403637f0c894f1fd0ebbbb2f2728b439e607ff47 gstack/test/gemini-e2e.test.ts -rw-r--r-- 6.7 KiB
403637f0 — Garry Tan feat: rotating founder resources in /office-hours closing (v0.13.10.0) (#652) 10 days ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
/**
 * Gemini CLI E2E tests — verify skills work when invoked by Gemini CLI.
 *
 * Spawns `gemini -p` with stream-json output in an isolated test worktree
 * (which carries a copy of .agents/skills/), parses JSONL events, and validates
 * structured results. Follows the same pattern as codex-e2e.test.ts.
 *
 * Prerequisites:
 * - `gemini` binary installed (npm install -g @google/gemini-cli)
 * - Gemini authenticated via ~/.gemini/ config or GEMINI_API_KEY env var
 * - EVALS=1 env var set (same gate as Claude E2E tests)
 *
 * Skips gracefully when prerequisites are not met.
 */

import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { runGeminiSkill } from './helpers/gemini-session-runner';
import type { GeminiResult } from './helpers/gemini-session-runner';
import { EvalCollector } from './helpers/eval-store';
import { selectTests, detectBaseBranch, getChangedFiles, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
import { createTestWorktree, harvestAndCleanup } from './helpers/e2e-helpers';
import * as path from 'path';

// Repository root: the parent of the directory that contains this test file.
const ROOT = path.resolve(import.meta.dir, '..');

// --- Prerequisites check ---

/**
 * Whether a `gemini` executable can be found on PATH.
 *
 * Uses Bun's built-in `Bun.which` lookup rather than spawning the external
 * `which` command, so detection does not depend on a `which` binary being
 * present on the host. `Bun.which` returns the resolved path, or null when
 * the executable is not found.
 */
const GEMINI_AVAILABLE = Bun.which('gemini') !== null;

// EVALS gate: any non-empty value of the EVALS env var turns the suite on.
const evalsEnabled = Boolean(process.env.EVALS);

// The suite runs only when the gemini binary is present AND EVALS is set;
// in every other case all tests are skipped.
const SKIP = !(GEMINI_AVAILABLE && evalsEnabled);

const describeGemini = !SKIP ? describe : describe.skip;

// Explain the skip on stderr (helpful for debugging CI), but only when evals
// were requested and the binary is missing. Without EVALS the skip stays
// silent — same as the Claude E2E tests, which also require EVALS=1.
if (evalsEnabled && !GEMINI_AVAILABLE) {
  process.stderr.write('\nGemini E2E: SKIPPED — gemini binary not found (install: npm i -g @google/gemini-cli)\n');
}

// --- Diff-based test selection ---

// Session-runner helper that every Gemini E2E test depends on.
const GEMINI_RUNNER_HELPER = 'test/helpers/gemini-session-runner.ts';

// Gemini E2E touchfiles, keyed by test name (same pattern as Codex E2E).
// Each entry lists the path patterns handed to selectTests alongside the
// changed files to decide whether that test should run.
const GEMINI_E2E_TOUCHFILES: Record<string, string[]> = {
  'gemini-discover-skill': ['.agents/skills/**', GEMINI_RUNNER_HELPER],
  'gemini-review-findings': ['review/**', '.agents/skills/gstack-review/**', GEMINI_RUNNER_HELPER],
};

// Names of the tests picked by diff-based selection; null means "no filter, run all".
let selectedTests: string[] | null = null;

// Diff-based selection applies only when evals are enabled and EVALS_ALL is unset.
if (evalsEnabled && !process.env.EVALS_ALL) {
  // Base branch precedence: EVALS_BASE override, then auto-detection, then 'main'.
  const base = process.env.EVALS_BASE || detectBaseBranch(ROOT) || 'main';
  const touched = getChangedFiles(base, ROOT);

  // An empty diff (e.g., on main branch) leaves selectedTests null -> run all.
  if (touched.length !== 0) {
    const { selected, skipped, reason } = selectTests(touched, GEMINI_E2E_TOUCHFILES, GLOBAL_TOUCHFILES);
    const total = Object.keys(GEMINI_E2E_TOUCHFILES).length;
    selectedTests = selected;

    // Report the selection (and anything skipped) on stderr.
    process.stderr.write(`\nGemini E2E selection (${reason}): ${selected.length}/${total} tests\n`);
    if (skipped.length > 0) {
      process.stderr.write(`  Skipped: ${skipped.join(', ')}\n`);
    }
    process.stderr.write('\n');
  }
}

/**
 * Register a test, honoring diff-based selection.
 *
 * The test is registered with `test.skip` when a selection exists and does
 * not include `testName`; otherwise it is registered with `test.concurrent`.
 *
 * @param testName - Name of the test; also the key looked up in the selection.
 * @param fn - Async test body.
 * @param timeout - Timeout value forwarded to the test registrar.
 */
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
  const deselected = selectedTests !== null && !selectedTests.includes(testName);
  const register = deselected ? test.skip : test.concurrent;
  register(testName, fn, timeout);
}

// --- Eval result collector ---

// Collector for this run's results; null whenever evals are off or the suite is skipped.
const evalCollector = (!evalsEnabled || SKIP) ? null : new EvalCollector('e2e-gemini');

/**
 * Record one Gemini E2E test outcome in the eval collector.
 *
 * Does nothing when no collector exists.
 *
 * @param name - Test name stored with the record.
 * @param result - Result returned by the Gemini session runner.
 * @param passed - Whether the test is considered to have passed.
 */
function recordGeminiE2E(name: string, result: GeminiResult, passed: boolean) {
  if (!evalCollector) return;

  // Keep at most the first 2000 characters of output in the record.
  const truncatedOutput = result.output?.slice(0, 2000);
  // Approximation: the number of tool calls stands in for "turns".
  const toolCallCount = result.toolCalls.length;
  const exitReason = result.exitCode === 0 ? 'success' : `exit_code_${result.exitCode}`;

  evalCollector.addTest({
    name,
    suite: 'gemini-e2e',
    tier: 'e2e',
    passed,
    duration_ms: result.durationMs,
    cost_usd: 0, // Gemini doesn't report cost in USD; tokens are tracked
    output: truncatedOutput,
    turns_used: toolCallCount,
    exit_reason: exitReason,
  });
}

/**
 * Log a one-line usage summary for a Gemini E2E run.
 *
 * The line has the form `<label>: <tokens> tokens, <n> tool calls, <s>s`,
 * where the duration is rounded to the nearest whole second.
 *
 * @param label - Prefix identifying the test.
 * @param result - Result returned by the Gemini session runner.
 */
function logGeminiCost(label: string, result: GeminiResult) {
  const { tokens, toolCalls, durationMs } = result;
  const seconds = Math.round(durationMs / 1000);
  const summary = [`${tokens} tokens`, `${toolCalls.length} tool calls`, `${seconds}s`].join(', ');
  console.log(`${label}: ${summary}`);
}

// Once all tests have run, finalize the eval results — if a collector was created.
afterAll(async () => {
  if (!evalCollector) return;
  await evalCollector.finalize();
});

// --- Tests ---

describeGemini('Gemini E2E', () => {
  // Worktree shared by the tests in this suite; assigned in beforeAll.
  let testWorktree: string;

  // Words that count as evidence the model listed its skills.
  const SKILL_KEYWORDS = ['review', 'gstack', 'skill'];

  // Words that count as evidence of review-like output.
  const REVIEW_KEYWORDS = [
    'finding',
    'issue',
    'review',
    'change',
    'diff',
    'clean',
    'no issues',
    'p1',
    'p2',
  ];

  beforeAll(() => {
    testWorktree = createTestWorktree('gemini');
  });

  afterAll(() => {
    harvestAndCleanup('gemini');
  });

  testIfSelected('gemini-discover-skill', async () => {
    // Run Gemini in an isolated worktree (has .agents/skills/ copied from ROOT)
    const result = await runGeminiSkill({
      prompt: 'List any skills or instructions you have available. Just list the names.',
      timeoutMs: 60_000,
      cwd: testWorktree,
    });

    logGeminiCost('gemini-discover-skill', result);

    // The output should reference skills in some form
    const outputLower = result.output.toLowerCase();
    const mentionsSkills = SKILL_KEYWORDS.some((keyword) => outputLower.includes(keyword));

    // Record the same verdict the assertions below enforce, so the eval store
    // never reports a pass for a run that fails the keyword check.
    const passed = result.exitCode === 0 && result.output.length > 0 && mentionsSkills;
    recordGeminiE2E('gemini-discover-skill', result, passed);

    expect(result.exitCode).toBe(0);
    // Gemini should have produced some output
    expect(result.output.length).toBeGreaterThan(0);
    expect(mentionsSkills).toBe(true);
  }, 120_000);

  testIfSelected('gemini-review-findings', async () => {
    // Run gstack-review skill via Gemini on worktree (isolated from main working tree)
    const result = await runGeminiSkill({
      prompt: 'Run the gstack-review skill on this repository. Review the current branch diff and report your findings.',
      timeoutMs: 540_000,
      cwd: testWorktree,
    });

    logGeminiCost('gemini-review-findings', result);

    // Review output should contain some review-like content
    const output = result.output;
    const outputLower = output.toLowerCase();
    const hasReviewContent = REVIEW_KEYWORDS.some((keyword) => outputLower.includes(keyword));

    // Record the same verdict the assertions below enforce (exit code,
    // minimum length, and review-like content).
    const passed = result.exitCode === 0 && output.length > 50 && hasReviewContent;
    recordGeminiE2E('gemini-review-findings', result, passed);

    expect(result.exitCode).toBe(0);
    // Should produce structured review-like output
    expect(output.length).toBeGreaterThan(50);
    expect(hasReviewContent).toBe(true);
  }, 600_000);
});