From 942df42161f1d73709bc27af636ddc7112f9016f Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Sat, 14 Mar 2026 01:27:42 -0500
Subject: [PATCH] =?UTF-8?q?simplify:=20one=20command=20for=20evals=20?=
 =?UTF-8?q?=E2=80=94=20bun=20run=20test:evals?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove the test:eval, test:e2e, and test:all scripts from package.json and
update CLAUDE.md to match. That leaves just two test commands:
- bun test (free)
- bun run test:evals (everything that costs money)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 CLAUDE.md    | 9 +++------
 package.json | 3 ---
 2 files changed, 3 insertions(+), 9 deletions(-)
diff --git a/CLAUDE.md b/CLAUDE.md
index 9189fea944abd4a36e1014980ecbe540b97ca744..34e5966b5e5357bd59bcc6b4720980f5bd32db7f 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -5,10 +5,7 @@
 ```bash
 bun install          # install dependencies
 bun test             # run free tests (browse + snapshot + skill validation)
-bun run test:evals   # run ALL paid evals: LLM judge + Agent SDK E2E (~$4/run)
-bun run test:eval    # run LLM-as-judge evals only (~$0.15/run)
-bun run test:e2e     # run Agent SDK E2E tests only (~$3.85/run)
-bun run test:all     # free tests + all evals
+bun run test:evals   # run paid evals: LLM judge + Agent SDK E2E (~$4/run)
 bun run dev <cmd>    # run CLI in dev mode, e.g. bun run dev goto https://example.com
 bun run build        # gen docs + compile binaries
 bun run gen:skill-docs  # regenerate SKILL.md files from templates
@@ -16,8 +13,8 @@ bun run skill:check  # health dashboard for all skills
 bun run dev:skill    # watch mode: auto-regen + validate on change
 ```
 
-All eval commands require `ANTHROPIC_API_KEY` in your environment. E2E tests must
-be run from a plain terminal (not inside Claude Code — nested sessions hang).
+`test:evals` requires `ANTHROPIC_API_KEY` and must be run from a plain terminal
+(not inside Claude Code — nested Agent SDK sessions hang).
 
 ## Project structure
 
diff --git a/package.json b/package.json
index d518633b38262efba0565bd2a6e916d8b530f981..8334d47a1c272bd0aff3df5b5d66e43c5af6b484 100644
--- a/package.json
+++ b/package.json
@@ -14,9 +14,6 @@
     "server": "bun run browse/src/server.ts",
     "test": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts",
     "test:evals": "EVALS=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts",
-    "test:eval": "EVALS=1 bun test test/skill-llm-eval.test.ts",
-    "test:e2e": "EVALS=1 bun test test/skill-e2e.test.ts",
-    "test:all": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts && EVALS=1 bun test test/skill-e2e.test.ts test/skill-llm-eval.test.ts",
     "skill:check": "bun run scripts/skill-check.ts",
     "dev:skill": "bun run scripts/dev-skill.ts",
     "start": "bun run browse/src/server.ts"