import { describe, test, expect, beforeEach, afterEach } from 'bun:test'; import * as fs from 'fs'; import * as path from 'path'; import * as os from 'os'; import { EvalCollector, extractToolSummary, findPreviousRun, compareEvalResults, formatComparison, generateCommentary, judgePassed, } from './eval-store'; import type { EvalResult, EvalTestEntry, ComparisonResult } from './eval-store'; let tmpDir: string; beforeEach(() => { tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'eval-store-test-')); }); afterEach(() => { try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {} }); // --- Helper to make a minimal test entry --- function makeEntry(overrides?: Partial): EvalTestEntry { return { name: 'test-1', suite: 'suite-1', tier: 'e2e', passed: true, duration_ms: 1000, cost_usd: 0.05, ...overrides, }; } // --- Helper to make a minimal EvalResult --- function makeResult(overrides?: Partial): EvalResult { return { schema_version: 1, version: '0.3.6', branch: 'main', git_sha: 'abc1234', timestamp: '2026-03-14T12:00:00.000Z', hostname: 'test-host', tier: 'e2e', total_tests: 1, passed: 1, failed: 0, total_cost_usd: 0.05, total_duration_ms: 1000, tests: [makeEntry()], ...overrides, }; } // --- EvalCollector tests --- describe('EvalCollector', () => { test('addTest accumulates entries', () => { const collector = new EvalCollector('e2e', tmpDir); collector.addTest(makeEntry({ name: 'a' })); collector.addTest(makeEntry({ name: 'b' })); collector.addTest(makeEntry({ name: 'c' })); // We can't inspect tests directly, but finalize will write them }); test('finalize writes JSON file to eval dir', async () => { const collector = new EvalCollector('e2e', tmpDir); collector.addTest(makeEntry()); const filepath = await collector.finalize(); expect(filepath).toBeTruthy(); expect(fs.existsSync(filepath)).toBe(true); const data = JSON.parse(fs.readFileSync(filepath, 'utf-8')); expect(data.tests).toHaveLength(1); expect(data.tests[0].name).toBe('test-1'); }); test('written JSON has correct schema fields', async () => { const collector = new EvalCollector('e2e', tmpDir); collector.addTest(makeEntry({ passed: true, cost_usd: 0.10, duration_ms: 2000 })); collector.addTest(makeEntry({ name: 'test-2', passed: false, cost_usd: 0.05, duration_ms: 1000 })); const filepath = await collector.finalize(); const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8')); expect(data.schema_version).toBe(1); expect(data.tier).toBe('e2e'); expect(data.total_tests).toBe(2); expect(data.passed).toBe(1); expect(data.failed).toBe(1); expect(data.total_cost_usd).toBe(0.15); expect(data.total_duration_ms).toBe(3000); expect(data.timestamp).toBeTruthy(); expect(data.hostname).toBeTruthy(); }); test('finalize creates directory if missing', async () => { const nestedDir = path.join(tmpDir, 'nested', 'deep', 'evals'); const collector = new EvalCollector('e2e', nestedDir); collector.addTest(makeEntry()); const filepath = await collector.finalize(); expect(fs.existsSync(filepath)).toBe(true); }); test('double finalize does not write twice', async () => { const collector = new EvalCollector('e2e', tmpDir); collector.addTest(makeEntry()); const filepath1 = await collector.finalize(); const filepath2 = await collector.finalize(); expect(filepath1).toBeTruthy(); expect(filepath2).toBe(''); // second call returns empty expect(fs.readdirSync(tmpDir).filter(f => f.endsWith('.json') && !f.startsWith('_partial'))).toHaveLength(1); }); test('empty collector writes valid file', async () => { const collector = new EvalCollector('llm-judge', tmpDir); const filepath = await collector.finalize(); const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8')); expect(data.total_tests).toBe(0); expect(data.passed).toBe(0); expect(data.tests).toHaveLength(0); expect(data.tier).toBe('llm-judge'); }); }); // --- judgePassed tests --- describe('judgePassed', () => { test('passes when all thresholds met', () => { expect(judgePassed( { detection_rate: 3, false_positives: 1, evidence_quality: 3 }, { minimum_detection: 2, max_false_positives: 2 }, )).toBe(true); }); test('fails when detection rate below minimum', () => { expect(judgePassed( { detection_rate: 1, false_positives: 0, evidence_quality: 3 }, { minimum_detection: 2, max_false_positives: 2 }, )).toBe(false); }); test('fails when too many false positives', () => { expect(judgePassed( { detection_rate: 3, false_positives: 3, evidence_quality: 3 }, { minimum_detection: 2, max_false_positives: 2 }, )).toBe(false); }); test('fails when evidence quality below 2', () => { expect(judgePassed( { detection_rate: 3, false_positives: 0, evidence_quality: 1 }, { minimum_detection: 2, max_false_positives: 2 }, )).toBe(false); }); test('passes at exact thresholds', () => { expect(judgePassed( { detection_rate: 2, false_positives: 2, evidence_quality: 2 }, { minimum_detection: 2, max_false_positives: 2 }, )).toBe(true); }); }); // --- extractToolSummary tests --- describe('extractToolSummary', () => { test('counts tool types from transcript events', () => { const transcript = [ { type: 'system', subtype: 'init' }, { type: 'assistant', message: { content: [ { type: 'tool_use', name: 'Bash', input: {} }, ] } }, { type: 'user', tool_use_result: { stdout: '' } }, { type: 'assistant', message: { content: [ { type: 'text', text: 'ok' }, { type: 'tool_use', name: 'Read', input: {} }, ] } }, { type: 'assistant', message: { content: [ { type: 'tool_use', name: 'Bash', input: {} }, { type: 'tool_use', name: 'Write', input: {} }, ] } }, ]; const summary = extractToolSummary(transcript); expect(summary).toEqual({ Bash: 2, Read: 1, Write: 1 }); }); test('returns empty object for empty transcript', () => { expect(extractToolSummary([])).toEqual({}); }); test('handles events with no content array', () => { const transcript = [ { type: 'assistant', message: {} }, { type: 'assistant' }, ]; expect(extractToolSummary(transcript)).toEqual({}); }); }); // --- findPreviousRun tests --- describe('findPreviousRun', () => { test('finds correct file — same branch preferred, most recent', () => { // Write three eval files const files = [ { name: '0.3.5-main-e2e-20260312-100000.json', data: makeResult({ branch: 'main', timestamp: '2026-03-12T10:00:00Z' }) }, { name: '0.3.5-feature-e2e-20260313-100000.json', data: makeResult({ branch: 'feature', timestamp: '2026-03-13T10:00:00Z' }) }, { name: '0.3.6-feature-e2e-20260314-100000.json', data: makeResult({ branch: 'feature', timestamp: '2026-03-14T10:00:00Z' }) }, ]; for (const f of files) { fs.writeFileSync(path.join(tmpDir, f.name), JSON.stringify(f.data)); } // Should prefer feature branch (most recent on same branch) const result = findPreviousRun(tmpDir, 'e2e', 'feature', path.join(tmpDir, 'current.json')); expect(result).toContain('0.3.6-feature-e2e-20260314'); }); test('falls back to different branch when no same-branch match', () => { const files = [ { name: '0.3.5-main-e2e-20260312-100000.json', data: makeResult({ branch: 'main', timestamp: '2026-03-12T10:00:00Z' }) }, ]; for (const f of files) { fs.writeFileSync(path.join(tmpDir, f.name), JSON.stringify(f.data)); } const result = findPreviousRun(tmpDir, 'e2e', 'new-branch', path.join(tmpDir, 'current.json')); expect(result).toContain('0.3.5-main-e2e'); }); test('returns null when no prior runs exist', () => { const result = findPreviousRun(tmpDir, 'e2e', 'main', path.join(tmpDir, 'current.json')); expect(result).toBeNull(); }); test('returns null when directory does not exist', () => { const result = findPreviousRun('/nonexistent/path', 'e2e', 'main', 'current.json'); expect(result).toBeNull(); }); test('excludes the current file from results', () => { const filename = '0.3.6-main-e2e-20260314-100000.json'; fs.writeFileSync( path.join(tmpDir, filename), JSON.stringify(makeResult({ branch: 'main', timestamp: '2026-03-14T10:00:00Z' })), ); const result = findPreviousRun(tmpDir, 'e2e', 'main', path.join(tmpDir, filename)); expect(result).toBeNull(); // only file is excluded }); test('filters by tier', () => { fs.writeFileSync( path.join(tmpDir, '0.3.6-main-llm-judge-20260314-100000.json'), JSON.stringify(makeResult({ tier: 'llm-judge', branch: 'main', timestamp: '2026-03-14T10:00:00Z' })), ); const result = findPreviousRun(tmpDir, 'e2e', 'main', 'current.json'); expect(result).toBeNull(); // only llm-judge file, looking for e2e }); }); // --- compareEvalResults tests --- describe('compareEvalResults', () => { test('detects improved/regressed/unchanged per test', () => { const before = makeResult({ tests: [ makeEntry({ name: 'test-a', passed: false }), makeEntry({ name: 'test-b', passed: true }), makeEntry({ name: 'test-c', passed: true }), ], total_tests: 3, passed: 2, failed: 1, }); const after = makeResult({ tests: [ makeEntry({ name: 'test-a', passed: true }), // improved makeEntry({ name: 'test-b', passed: false }), // regressed makeEntry({ name: 'test-c', passed: true }), // unchanged ], total_tests: 3, passed: 2, failed: 1, }); const result = compareEvalResults(before, after, 'before.json', 'after.json'); expect(result.improved).toBe(1); expect(result.regressed).toBe(1); expect(result.unchanged).toBe(1); expect(result.deltas.find(d => d.name === 'test-a')?.status_change).toBe('improved'); expect(result.deltas.find(d => d.name === 'test-b')?.status_change).toBe('regressed'); expect(result.deltas.find(d => d.name === 'test-c')?.status_change).toBe('unchanged'); }); test('handles tests present in one run but not the other', () => { const before = makeResult({ tests: [ makeEntry({ name: 'old-test', passed: true }), makeEntry({ name: 'shared', passed: true }), ], }); const after = makeResult({ tests: [ makeEntry({ name: 'shared', passed: true }), makeEntry({ name: 'new-test', passed: true }), ], }); const result = compareEvalResults(before, after, 'before.json', 'after.json'); expect(result.deltas).toHaveLength(3); // shared + new-test + old-test (removed) expect(result.deltas.find(d => d.name.includes('old-test'))?.name).toContain('removed'); }); test('computes cost and duration deltas', () => { const before = makeResult({ total_cost_usd: 2.00, total_duration_ms: 60000 }); const after = makeResult({ total_cost_usd: 1.50, total_duration_ms: 45000 }); const result = compareEvalResults(before, after, 'a.json', 'b.json'); expect(result.total_cost_delta).toBe(-0.50); expect(result.total_duration_delta).toBe(-15000); }); }); // --- formatComparison tests --- describe('formatComparison', () => { test('produces readable output with status arrows', () => { const comparison: ComparisonResult = { before_file: 'before.json', after_file: 'after.json', before_branch: 'main', after_branch: 'feature', before_timestamp: '2026-03-13T14:30:00Z', after_timestamp: '2026-03-14T14:30:00Z', deltas: [ { name: 'browse basic', before: { passed: true, cost_usd: 0.07, turns_used: 6, duration_ms: 24000, tool_summary: { Bash: 3 } }, after: { passed: true, cost_usd: 0.06, turns_used: 5, duration_ms: 19000, tool_summary: { Bash: 4 } }, status_change: 'unchanged', }, { name: 'planted bugs static', before: { passed: false, cost_usd: 1.00, detection_rate: 3, tool_summary: {} }, after: { passed: true, cost_usd: 0.95, detection_rate: 4, tool_summary: {} }, status_change: 'improved', }, ], total_cost_delta: -0.06, total_duration_delta: -5000, improved: 1, regressed: 0, unchanged: 1, tool_count_before: 3, tool_count_after: 4, }; const output = formatComparison(comparison); expect(output).toContain('vs previous'); expect(output).toContain('main'); expect(output).toContain('1 improved'); expect(output).toContain('1 unchanged'); expect(output).toContain('↑'); // improved arrow expect(output).toContain('='); // unchanged arrow // Turns and duration deltas expect(output).toContain('6→5t'); expect(output).toContain('24→19s'); }); test('includes commentary section', () => { const comparison: ComparisonResult = { before_file: 'a.json', after_file: 'b.json', before_branch: 'main', after_branch: 'main', before_timestamp: '2026-03-13T14:30:00Z', after_timestamp: '2026-03-14T14:30:00Z', deltas: [ { name: 'test-a', before: { passed: true, cost_usd: 0.50, turns_used: 20, duration_ms: 120000 }, after: { passed: true, cost_usd: 0.30, turns_used: 10, duration_ms: 60000 }, status_change: 'unchanged', }, { name: 'test-b', before: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 }, after: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 }, status_change: 'unchanged', }, { name: 'test-c', before: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 }, after: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 }, status_change: 'unchanged', }, ], total_cost_delta: -0.20, total_duration_delta: -60000, improved: 0, regressed: 0, unchanged: 3, tool_count_before: 30, tool_count_after: 20, }; const output = formatComparison(comparison); expect(output).toContain('Takeaway'); expect(output).toContain('fewer turns'); expect(output).toContain('faster'); }); }); // --- generateCommentary tests --- describe('generateCommentary', () => { test('flags regressions prominently', () => { const c: ComparisonResult = { before_file: 'a.json', after_file: 'b.json', before_branch: 'main', after_branch: 'main', before_timestamp: '', after_timestamp: '', deltas: [{ name: 'critical-test', before: { passed: true, cost_usd: 0.10 }, after: { passed: false, cost_usd: 0.10 }, status_change: 'regressed', }], total_cost_delta: 0, total_duration_delta: 0, improved: 0, regressed: 1, unchanged: 0, tool_count_before: 0, tool_count_after: 0, }; const notes = generateCommentary(c); expect(notes.some(n => n.includes('REGRESSION'))).toBe(true); expect(notes.some(n => n.includes('critical-test'))).toBe(true); }); test('notes improvements', () => { const c: ComparisonResult = { before_file: 'a.json', after_file: 'b.json', before_branch: 'main', after_branch: 'main', before_timestamp: '', after_timestamp: '', deltas: [{ name: 'fixed-test', before: { passed: false, cost_usd: 0.10 }, after: { passed: true, cost_usd: 0.10 }, status_change: 'improved', }], total_cost_delta: 0, total_duration_delta: 0, improved: 1, regressed: 0, unchanged: 0, tool_count_before: 0, tool_count_after: 0, }; const notes = generateCommentary(c); expect(notes.some(n => n.includes('Fixed'))).toBe(true); expect(notes.some(n => n.includes('fixed-test'))).toBe(true); }); test('reports efficiency gains for stable tests', () => { const c: ComparisonResult = { before_file: 'a.json', after_file: 'b.json', before_branch: 'main', after_branch: 'main', before_timestamp: '', after_timestamp: '', deltas: [{ name: 'fast-test', before: { passed: true, cost_usd: 0.50, turns_used: 20, duration_ms: 120000 }, after: { passed: true, cost_usd: 0.25, turns_used: 10, duration_ms: 60000 }, status_change: 'unchanged', }], total_cost_delta: -0.25, total_duration_delta: -60000, improved: 0, regressed: 0, unchanged: 1, tool_count_before: 0, tool_count_after: 0, }; const notes = generateCommentary(c); expect(notes.some(n => n.includes('fewer turns'))).toBe(true); expect(notes.some(n => n.includes('faster'))).toBe(true); expect(notes.some(n => n.includes('cheaper'))).toBe(true); }); test('reports detection rate changes', () => { const c: ComparisonResult = { before_file: 'a.json', after_file: 'b.json', before_branch: 'main', after_branch: 'main', before_timestamp: '', after_timestamp: '', deltas: [{ name: 'detection-test', before: { passed: true, cost_usd: 0.50, detection_rate: 3 }, after: { passed: true, cost_usd: 0.50, detection_rate: 5 }, status_change: 'unchanged', }], total_cost_delta: 0, total_duration_delta: 0, improved: 0, regressed: 0, unchanged: 1, tool_count_before: 0, tool_count_after: 0, }; const notes = generateCommentary(c); expect(notes.some(n => n.includes('detecting 2 more bugs'))).toBe(true); }); test('produces overall summary for 3+ tests with no regressions', () => { const c: ComparisonResult = { before_file: 'a.json', after_file: 'b.json', before_branch: 'main', after_branch: 'main', before_timestamp: '', after_timestamp: '', deltas: [ { name: 'a', before: { passed: true, cost_usd: 0.50, turns_used: 10, duration_ms: 60000 }, after: { passed: true, cost_usd: 0.30, turns_used: 6, duration_ms: 40000 }, status_change: 'unchanged' }, { name: 'b', before: { passed: true, cost_usd: 0.20, turns_used: 5, duration_ms: 30000 }, after: { passed: true, cost_usd: 0.15, turns_used: 4, duration_ms: 25000 }, status_change: 'unchanged' }, { name: 'c', before: { passed: true, cost_usd: 0.10, turns_used: 3, duration_ms: 20000 }, after: { passed: true, cost_usd: 0.08, turns_used: 3, duration_ms: 18000 }, status_change: 'unchanged' }, ], total_cost_delta: -0.27, total_duration_delta: -27000, improved: 0, regressed: 0, unchanged: 3, tool_count_before: 0, tool_count_after: 0, }; const notes = generateCommentary(c); expect(notes.some(n => n.includes('Overall'))).toBe(true); expect(notes.some(n => n.includes('No regressions'))).toBe(true); }); test('returns empty for stable run with no significant changes', () => { const c: ComparisonResult = { before_file: 'a.json', after_file: 'b.json', before_branch: 'main', after_branch: 'main', before_timestamp: '', after_timestamp: '', deltas: [ { name: 'a', before: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 }, after: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 21000 }, status_change: 'unchanged' }, { name: 'b', before: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 }, after: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 }, status_change: 'unchanged' }, { name: 'c', before: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 }, after: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 }, status_change: 'unchanged' }, ], total_cost_delta: 0, total_duration_delta: 1000, improved: 0, regressed: 0, unchanged: 3, tool_count_before: 15, tool_count_after: 15, }; const notes = generateCommentary(c); expect(notes.some(n => n.includes('Stable run'))).toBe(true); }); });