~cytrogen/gstack

ref: 103a1b35dc627ba1efa5ddc74f572b9a2f7f9a9d gstack/scripts/eval-compare.ts -rw-r--r-- 3.0 KiB
103a1b35 — Garry Tan docs: Slate agent integration research + design doc (#782) 6 days ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env bun
/**
 * Compare two eval runs from ~/.gstack-dev/evals/
 *
 * Usage:
 *   bun run eval:compare                    # compare two most recent of same tier
 *   bun run eval:compare <file>             # compare file against its predecessor
 *   bun run eval:compare <file-a> <file-b>  # compare two specific files
 */

import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import {
  findPreviousRun,
  compareEvalResults,
  formatComparison,
  getProjectEvalDir,
} from '../test/helpers/eval-store';
import type { EvalResult } from '../test/helpers/eval-store';

// Directory returned by getProjectEvalDir(). Relative file arguments are
// resolved against it, and the no-argument mode lists its *.json files.
const EVAL_DIR = getProjectEvalDir();

/**
 * Read and parse one eval-run JSON file.
 *
 * Relative paths are resolved against EVAL_DIR; absolute paths are used as-is.
 * This is a CLI helper: on any failure (missing file, unreadable file,
 * malformed JSON, non-object payload) it prints a message to stderr and ends
 * the process with exit code 1 rather than throwing.
 *
 * @param filepath - Absolute path, or a path relative to EVAL_DIR.
 * @returns The parsed file contents. Only "is a JSON object" is checked; the
 *   individual EvalResult fields are not validated here.
 */
function loadResult(filepath: string): EvalResult {
  const resolved = path.isAbsolute(filepath) ? filepath : path.join(EVAL_DIR, filepath);
  if (!fs.existsSync(resolved)) {
    console.error(`File not found: ${resolved}`);
    process.exit(1);
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(fs.readFileSync(resolved, 'utf-8'));
  } catch (err: unknown) {
    // Covers both I/O errors (e.g. permissions, file removed after the check
    // above) and JSON syntax errors from a truncated or hand-edited file.
    const reason = err instanceof Error ? err.message : String(err);
    console.error(`Could not read eval result ${resolved}: ${reason}`);
    process.exit(1);
  }

  // Valid JSON can still be `null`, a number, or an array; callers read
  // properties off the result, so reject those here with a clear message.
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    console.error(`Not a valid eval result (expected a JSON object): ${resolved}`);
    process.exit(1);
  }

  return parsed as EvalResult;
}

// Positional CLI arguments (everything after the runtime and script path).
const args = process.argv.slice(2);

// Paths of the two runs to compare. Every branch below either assigns both
// or terminates the process.
let beforeFile: string;
let afterFile: string;

if (args.length > 2) {
  // Reject extra arguments explicitly; otherwise they would be ignored and the
  // script would compare files the user did not ask for.
  console.error(
    'Too many arguments.\n' +
      'Usage:\n' +
      '  bun run eval:compare\n' +
      '  bun run eval:compare <file>\n' +
      '  bun run eval:compare <file-a> <file-b>',
  );
  process.exit(1);
} else if (args.length === 2) {
  // Two explicit files: first is the baseline, second is the newer run.
  beforeFile = args[0];
  afterFile = args[1];
} else if (args.length === 1) {
  // One file — find its predecessor
  afterFile = args[0];
  const resolved = path.isAbsolute(afterFile) ? afterFile : path.join(EVAL_DIR, afterFile);
  const afterResult = loadResult(resolved);
  // The resolved path is passed along with the run's tier and branch so the
  // helper can pick the matching earlier run.
  const prev = findPreviousRun(EVAL_DIR, afterResult.tier, afterResult.branch, resolved);
  if (!prev) {
    console.log('No previous run found to compare against.');
    process.exit(0);
  }
  beforeFile = prev;
} else {
  // No args — take the newest file and look up its predecessor
  let files: string[];
  try {
    // Reverse lexicographic order; files[0] is treated as the most recent run
    // (assumes file names sort chronologically — TODO confirm naming scheme).
    files = fs.readdirSync(EVAL_DIR)
      .filter(f => f.endsWith('.json'))
      .sort()
      .reverse();
  } catch {
    // Any failure to list the directory is reported as "nothing recorded yet".
    console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
    process.exit(0);
  }

  if (files.length < 2) {
    console.log('Need at least 2 eval runs to compare. Run evals again.');
    process.exit(0);
  }

  // Most recent file
  afterFile = path.join(EVAL_DIR, files[0]);
  const afterResult = loadResult(afterFile);
  const prev = findPreviousRun(EVAL_DIR, afterResult.tier, afterResult.branch, afterFile);
  if (!prev) {
    console.log('No previous run of the same tier found to compare against.');
    process.exit(0);
  }
  beforeFile = prev;
}

// Load both runs: the baseline first, then the run being evaluated.
const baselineRun = loadResult(beforeFile);
const candidateRun = loadResult(afterFile);

const { tier: baselineTier, schema_version: baselineSchema } = baselineRun;
const { tier: candidateTier, schema_version: candidateSchema } = candidateRun;

// These mismatches are only warnings; the comparison is still produced.
if (baselineTier !== candidateTier) {
  console.warn(`Warning: comparing different tiers (${baselineTier} vs ${candidateTier})`);
}
if (baselineSchema !== candidateSchema) {
  console.warn(`Warning: schema version mismatch (${baselineSchema} vs ${candidateSchema})`);
}

// Build the comparison and print its formatted report to stdout.
console.log(
  formatComparison(compareEvalResults(baselineRun, candidateRun, beforeFile, afterFile)),
);