~cytrogen/gstack

gstack/scripts/eval-summary.ts -rw-r--r-- 6.5 KiB
9c5f4797 — Cytrogen fork: 频率分级路由 + 触发式描述符重写 2 days ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#!/usr/bin/env bun
/**
 * Aggregate summary of all eval runs from ~/.gstack-dev/evals/
 *
 * Usage: bun run eval:summary
 */

import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import type { EvalResult } from '../test/helpers/eval-store';
import { getProjectEvalDir } from '../test/helpers/eval-store';

const EVAL_DIR = getProjectEvalDir();

let files: string[];
try {
  files = fs.readdirSync(EVAL_DIR).filter(f => f.endsWith('.json'));
} catch {
  console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
  process.exit(0);
}

if (files.length === 0) {
  console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
  process.exit(0);
}

// Load all results
const results: EvalResult[] = [];
for (const file of files) {
  try {
    results.push(JSON.parse(fs.readFileSync(path.join(EVAL_DIR, file), 'utf-8')));
  } catch { continue; }
}

// Aggregate stats
const e2eRuns = results.filter(r => r.tier === 'e2e');
const judgeRuns = results.filter(r => r.tier === 'llm-judge');
const totalCost = results.reduce((s, r) => s + (r.total_cost_usd || 0), 0);
const avgE2ECost = e2eRuns.length > 0 ? e2eRuns.reduce((s, r) => s + r.total_cost_usd, 0) / e2eRuns.length : 0;
const avgJudgeCost = judgeRuns.length > 0 ? judgeRuns.reduce((s, r) => s + r.total_cost_usd, 0) / judgeRuns.length : 0;

// Duration + turns from E2E runs
const avgE2EDuration = e2eRuns.length > 0
  ? e2eRuns.reduce((s, r) => s + (r.total_duration_ms || 0), 0) / e2eRuns.length
  : 0;
const e2eTurns: number[] = [];
for (const r of e2eRuns) {
  const runTurns = r.tests.reduce((s, t) => s + (t.turns_used || 0), 0);
  if (runTurns > 0) e2eTurns.push(runTurns);
}
const avgE2ETurns = e2eTurns.length > 0
  ? e2eTurns.reduce((a, b) => a + b, 0) / e2eTurns.length
  : 0;

// Per-test efficiency stats (avg turns + duration across runs)
const testEfficiency = new Map<string, { turns: number[]; durations: number[]; costs: number[] }>();
for (const r of e2eRuns) {
  for (const t of r.tests) {
    if (!testEfficiency.has(t.name)) {
      testEfficiency.set(t.name, { turns: [], durations: [], costs: [] });
    }
    const stats = testEfficiency.get(t.name)!;
    if (t.turns_used !== undefined) stats.turns.push(t.turns_used);
    if (t.duration_ms > 0) stats.durations.push(t.duration_ms);
    if (t.cost_usd > 0) stats.costs.push(t.cost_usd);
  }
}

// Detection rates from outcome evals
const detectionRates: number[] = [];
for (const r of e2eRuns) {
  for (const t of r.tests) {
    if (t.detection_rate !== undefined) {
      detectionRates.push(t.detection_rate);
    }
  }
}
const avgDetection = detectionRates.length > 0
  ? detectionRates.reduce((a, b) => a + b, 0) / detectionRates.length
  : null;

// Flaky tests (passed in some runs, failed in others)
const testResults = new Map<string, boolean[]>();
for (const r of results) {
  for (const t of r.tests) {
    const key = `${r.tier}:${t.name}`;
    if (!testResults.has(key)) testResults.set(key, []);
    testResults.get(key)!.push(t.passed);
  }
}
const flakyTests: string[] = [];
for (const [name, outcomes] of testResults) {
  if (outcomes.length >= 2) {
    const hasPass = outcomes.some(o => o);
    const hasFail = outcomes.some(o => !o);
    if (hasPass && hasFail) flakyTests.push(name);
  }
}

// Branch stats
const branchStats = new Map<string, { runs: number; avgDetection: number; detections: number[] }>();
for (const r of e2eRuns) {
  if (!branchStats.has(r.branch)) {
    branchStats.set(r.branch, { runs: 0, avgDetection: 0, detections: [] });
  }
  const stats = branchStats.get(r.branch)!;
  stats.runs++;
  for (const t of r.tests) {
    if (t.detection_rate !== undefined) {
      stats.detections.push(t.detection_rate);
    }
  }
}
for (const stats of branchStats.values()) {
  stats.avgDetection = stats.detections.length > 0
    ? stats.detections.reduce((a, b) => a + b, 0) / stats.detections.length
    : 0;
}

// Print summary
console.log('');
console.log('Eval Summary');
console.log('═'.repeat(70));
console.log(`  Total runs:        ${results.length} (${e2eRuns.length} e2e, ${judgeRuns.length} llm-judge)`);
console.log(`  Total spend:       $${totalCost.toFixed(2)}`);
console.log(`  Avg cost/e2e:      $${avgE2ECost.toFixed(2)}`);
console.log(`  Avg cost/judge:    $${avgJudgeCost.toFixed(2)}`);
if (avgE2EDuration > 0) {
  console.log(`  Avg duration/e2e:  ${Math.round(avgE2EDuration / 1000)}s`);
}
if (avgE2ETurns > 0) {
  console.log(`  Avg turns/e2e:     ${Math.round(avgE2ETurns)}`);
}
if (avgDetection !== null) {
  console.log(`  Avg detection:     ${avgDetection.toFixed(1)} bugs`);
}
console.log('─'.repeat(70));

// Per-test efficiency averages (only if we have enough data)
if (testEfficiency.size > 0 && e2eRuns.length >= 2) {
  console.log('  Per-test efficiency (averages across runs):');
  const sorted = [...testEfficiency.entries()]
    .filter(([, s]) => s.turns.length >= 2)
    .sort((a, b) => {
      const avgA = a[1].costs.reduce((s, c) => s + c, 0) / a[1].costs.length;
      const avgB = b[1].costs.reduce((s, c) => s + c, 0) / b[1].costs.length;
      return avgB - avgA;
    });
  for (const [name, stats] of sorted) {
    const avgT = Math.round(stats.turns.reduce((a, b) => a + b, 0) / stats.turns.length);
    const avgD = Math.round(stats.durations.reduce((a, b) => a + b, 0) / stats.durations.length / 1000);
    const avgC = (stats.costs.reduce((a, b) => a + b, 0) / stats.costs.length).toFixed(2);
    const label = name.length > 30 ? name.slice(0, 27) + '...' : name.padEnd(30);
    console.log(`    ${label}  $${avgC}  ${avgT}t  ${avgD}s  (${stats.turns.length} runs)`);
  }
  console.log('─'.repeat(70));
}

if (flakyTests.length > 0) {
  console.log(`  Flaky tests (${flakyTests.length}):`);
  for (const name of flakyTests) {
    console.log(`    - ${name}`);
  }
  console.log('─'.repeat(70));
}

if (branchStats.size > 0) {
  console.log('  Branches:');
  const sorted = [...branchStats.entries()].sort((a, b) => b[1].avgDetection - a[1].avgDetection);
  for (const [branch, stats] of sorted) {
    const det = stats.detections.length > 0 ? ` avg det: ${stats.avgDetection.toFixed(1)}` : '';
    console.log(`    ${branch.padEnd(30)} ${stats.runs} runs${det}`);
  }
  console.log('─'.repeat(70));
}

// Date range
const timestamps = results.map(r => r.timestamp).filter(Boolean).sort();
if (timestamps.length > 0) {
  const first = timestamps[0].replace('T', ' ').slice(0, 16);
  const last = timestamps[timestamps.length - 1].replace('T', ' ').slice(0, 16);
  console.log(`  Date range: ${first}${last}`);
}

console.log(`  Dir: ${EVAL_DIR}`);
console.log('');