#!/usr/bin/env bun /** * Aggregate summary of all eval runs from ~/.gstack-dev/evals/ * * Usage: bun run eval:summary */ import * as fs from 'fs'; import * as path from 'path'; import * as os from 'os'; import type { EvalResult } from '../test/helpers/eval-store'; import { getProjectEvalDir } from '../test/helpers/eval-store'; const EVAL_DIR = getProjectEvalDir(); let files: string[]; try { files = fs.readdirSync(EVAL_DIR).filter(f => f.endsWith('.json')); } catch { console.log('No eval runs yet. Run: EVALS=1 bun run test:evals'); process.exit(0); } if (files.length === 0) { console.log('No eval runs yet. Run: EVALS=1 bun run test:evals'); process.exit(0); } // Load all results const results: EvalResult[] = []; for (const file of files) { try { results.push(JSON.parse(fs.readFileSync(path.join(EVAL_DIR, file), 'utf-8'))); } catch { continue; } } // Aggregate stats const e2eRuns = results.filter(r => r.tier === 'e2e'); const judgeRuns = results.filter(r => r.tier === 'llm-judge'); const totalCost = results.reduce((s, r) => s + (r.total_cost_usd || 0), 0); const avgE2ECost = e2eRuns.length > 0 ? e2eRuns.reduce((s, r) => s + r.total_cost_usd, 0) / e2eRuns.length : 0; const avgJudgeCost = judgeRuns.length > 0 ? judgeRuns.reduce((s, r) => s + r.total_cost_usd, 0) / judgeRuns.length : 0; // Duration + turns from E2E runs const avgE2EDuration = e2eRuns.length > 0 ? e2eRuns.reduce((s, r) => s + (r.total_duration_ms || 0), 0) / e2eRuns.length : 0; const e2eTurns: number[] = []; for (const r of e2eRuns) { const runTurns = r.tests.reduce((s, t) => s + (t.turns_used || 0), 0); if (runTurns > 0) e2eTurns.push(runTurns); } const avgE2ETurns = e2eTurns.length > 0 ? e2eTurns.reduce((a, b) => a + b, 0) / e2eTurns.length : 0; // Per-test efficiency stats (avg turns + duration across runs) const testEfficiency = new Map(); for (const r of e2eRuns) { for (const t of r.tests) { if (!testEfficiency.has(t.name)) { testEfficiency.set(t.name, { turns: [], durations: [], costs: [] }); } const stats = testEfficiency.get(t.name)!; if (t.turns_used !== undefined) stats.turns.push(t.turns_used); if (t.duration_ms > 0) stats.durations.push(t.duration_ms); if (t.cost_usd > 0) stats.costs.push(t.cost_usd); } } // Detection rates from outcome evals const detectionRates: number[] = []; for (const r of e2eRuns) { for (const t of r.tests) { if (t.detection_rate !== undefined) { detectionRates.push(t.detection_rate); } } } const avgDetection = detectionRates.length > 0 ? detectionRates.reduce((a, b) => a + b, 0) / detectionRates.length : null; // Flaky tests (passed in some runs, failed in others) const testResults = new Map(); for (const r of results) { for (const t of r.tests) { const key = `${r.tier}:${t.name}`; if (!testResults.has(key)) testResults.set(key, []); testResults.get(key)!.push(t.passed); } } const flakyTests: string[] = []; for (const [name, outcomes] of testResults) { if (outcomes.length >= 2) { const hasPass = outcomes.some(o => o); const hasFail = outcomes.some(o => !o); if (hasPass && hasFail) flakyTests.push(name); } } // Branch stats const branchStats = new Map(); for (const r of e2eRuns) { if (!branchStats.has(r.branch)) { branchStats.set(r.branch, { runs: 0, avgDetection: 0, detections: [] }); } const stats = branchStats.get(r.branch)!; stats.runs++; for (const t of r.tests) { if (t.detection_rate !== undefined) { stats.detections.push(t.detection_rate); } } } for (const stats of branchStats.values()) { stats.avgDetection = stats.detections.length > 0 ? stats.detections.reduce((a, b) => a + b, 0) / stats.detections.length : 0; } // Print summary console.log(''); console.log('Eval Summary'); console.log('═'.repeat(70)); console.log(` Total runs: ${results.length} (${e2eRuns.length} e2e, ${judgeRuns.length} llm-judge)`); console.log(` Total spend: $${totalCost.toFixed(2)}`); console.log(` Avg cost/e2e: $${avgE2ECost.toFixed(2)}`); console.log(` Avg cost/judge: $${avgJudgeCost.toFixed(2)}`); if (avgE2EDuration > 0) { console.log(` Avg duration/e2e: ${Math.round(avgE2EDuration / 1000)}s`); } if (avgE2ETurns > 0) { console.log(` Avg turns/e2e: ${Math.round(avgE2ETurns)}`); } if (avgDetection !== null) { console.log(` Avg detection: ${avgDetection.toFixed(1)} bugs`); } console.log('─'.repeat(70)); // Per-test efficiency averages (only if we have enough data) if (testEfficiency.size > 0 && e2eRuns.length >= 2) { console.log(' Per-test efficiency (averages across runs):'); const sorted = [...testEfficiency.entries()] .filter(([, s]) => s.turns.length >= 2) .sort((a, b) => { const avgA = a[1].costs.reduce((s, c) => s + c, 0) / a[1].costs.length; const avgB = b[1].costs.reduce((s, c) => s + c, 0) / b[1].costs.length; return avgB - avgA; }); for (const [name, stats] of sorted) { const avgT = Math.round(stats.turns.reduce((a, b) => a + b, 0) / stats.turns.length); const avgD = Math.round(stats.durations.reduce((a, b) => a + b, 0) / stats.durations.length / 1000); const avgC = (stats.costs.reduce((a, b) => a + b, 0) / stats.costs.length).toFixed(2); const label = name.length > 30 ? name.slice(0, 27) + '...' : name.padEnd(30); console.log(` ${label} $${avgC} ${avgT}t ${avgD}s (${stats.turns.length} runs)`); } console.log('─'.repeat(70)); } if (flakyTests.length > 0) { console.log(` Flaky tests (${flakyTests.length}):`); for (const name of flakyTests) { console.log(` - ${name}`); } console.log('─'.repeat(70)); } if (branchStats.size > 0) { console.log(' Branches:'); const sorted = [...branchStats.entries()].sort((a, b) => b[1].avgDetection - a[1].avgDetection); for (const [branch, stats] of sorted) { const det = stats.detections.length > 0 ? ` avg det: ${stats.avgDetection.toFixed(1)}` : ''; console.log(` ${branch.padEnd(30)} ${stats.runs} runs${det}`); } console.log('─'.repeat(70)); } // Date range const timestamps = results.map(r => r.timestamp).filter(Boolean).sort(); if (timestamps.length > 0) { const first = timestamps[0].replace('T', ' ').slice(0, 16); const last = timestamps[timestamps.length - 1].replace('T', ' ').slice(0, 16); console.log(` Date range: ${first} → ${last}`); } console.log(` Dir: ${EVAL_DIR}`); console.log('');