A .github/docker/Dockerfile.ci => .github/docker/Dockerfile.ci +50 -0
@@ 0,0 1,50 @@
+# gstack CI eval runner — pre-baked toolchain + deps
+# Rebuild weekly via ci-image.yml, on Dockerfile changes, or on package.json changes
+FROM ubuntu:24.04
+
+# ARG (not ENV) keeps DEBIAN_FRONTEND out of the runtime env; pipefail makes a failed `curl | bash` fail the build.
+ARG DEBIAN_FRONTEND=noninteractive
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# System deps
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    bc ca-certificates curl git gpg jq unzip \
+    && rm -rf /var/lib/apt/lists/*
+
+# GitHub CLI
+RUN curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg \
+    | gpg --dearmor -o /usr/share/keyrings/githubcli-archive-keyring.gpg \
+    && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" \
+    | tee /etc/apt/sources.list.d/github-cli.list > /dev/null \
+    && apt-get update && apt-get install -y --no-install-recommends gh \
+    && rm -rf /var/lib/apt/lists/*
+
+# Node.js 22 LTS (needed for claude CLI)
+RUN curl -fsSL https://deb.nodesource.com/setup_22.x | bash - \
+    && apt-get install -y --no-install-recommends nodejs \
+    && rm -rf /var/lib/apt/lists/*
+
+# Bun (install to /usr/local so non-root users can access it)
+ENV BUN_INSTALL="/usr/local"
+RUN curl -fsSL https://bun.sh/install | bash
+
+# Claude CLI
+RUN npm i -g @anthropic-ai/claude-code
+
+# Verify the toolchain (placed before the package.json-dependent layers so it stays cached)
+RUN bun --version && node --version && claude --version && jq --version && gh --version
+
+# Pre-install dependencies (cached layer — only rebuilds when package.json changes).
+# The runtime checkout overwrites /workspace, so node_modules is parked in /opt in this SAME layer
+# (moving or chmod-ing it in a later RUN would store the tree twice); .package.json is the snapshot used for cache validation.
+COPY package.json /workspace/
+WORKDIR /workspace
+RUN bun install && rm -rf /tmp/* \
+    && mv /workspace/node_modules /opt/node_modules_cache \
+    && cp /workspace/package.json /opt/node_modules_cache/.package.json \
+    && chmod -R a+rX /opt/node_modules_cache
+
+# Claude CLI refuses --dangerously-skip-permissions as root.
+# Create a non-root user for eval runs (GH Actions overrides USER, so
+# the workflow must set options.user or use gosu/su-exec at runtime).
+RUN useradd -m -s /bin/bash runner
A .github/workflows/ci-image.yml => .github/workflows/ci-image.yml +40 -0
@@ 0,0 1,40 @@
+name: Build CI Image
+on:
+  # Rebuild weekly (Monday 6am UTC) to pick up CLI updates
+  schedule:
+    - cron: '0 6 * * 1'
+  # Rebuild when the Dockerfile or package.json changes on main
+  push:
+    branches: [main]
+    paths: ['.github/docker/Dockerfile.ci', 'package.json']
+  # Manual trigger
+  workflow_dispatch:
+
+jobs:
+  build:
+    runs-on: ubicloud-standard-2
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - uses: actions/checkout@v4
+
+      # Copy package.json into the Docker build context (the Dockerfile COPYs it from there)
+      - run: cp package.json .github/docker/
+
+      - uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - uses: docker/build-push-action@v6
+        with:
+          context: .github/docker
+          file: .github/docker/Dockerfile.ci
+          push: true
+          # The content-hash tag is the one evals.yml pulls; refreshing it here is what makes the weekly rebuild reach eval runs.
+          tags: |
+            ghcr.io/${{ github.repository }}/ci:latest
+            ghcr.io/${{ github.repository }}/ci:${{ github.sha }}
+            ghcr.io/${{ github.repository }}/ci:${{ hashFiles('.github/docker/Dockerfile.ci', 'package.json') }}
A .github/workflows/evals.yml => .github/workflows/evals.yml +213 -0
@@ 0,0 1,213 @@
+name: E2E Evals
+on:
+  pull_request:
+    branches: [main]
+  workflow_dispatch:
+
+concurrency:
+  group: evals-${{ github.head_ref || github.ref }}
+  cancel-in-progress: true
+
+env:
+  IMAGE: ghcr.io/${{ github.repository }}/ci
+
+jobs:
+  # Build Docker image with pre-baked toolchain (cached — only rebuilds on Dockerfile/package.json change)
+  build-image:
+    runs-on: ubicloud-standard-2
+    permissions:
+      contents: read
+      packages: write
+    outputs:
+      image-tag: ${{ steps.meta.outputs.tag }}
+    steps:
+      - uses: actions/checkout@v4
+
+      - id: meta
+        run: echo "tag=${{ env.IMAGE }}:${{ hashFiles('.github/docker/Dockerfile.ci', 'package.json') }}" >> "$GITHUB_OUTPUT"
+
+      - uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Check if image exists
+        id: check
+        run: |
+          if docker manifest inspect "${{ steps.meta.outputs.tag }}" > /dev/null 2>&1; then
+            echo "exists=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "exists=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - if: steps.check.outputs.exists == 'false'
+        run: cp package.json .github/docker/
+
+      - if: steps.check.outputs.exists == 'false'
+        uses: docker/build-push-action@v6
+        with:
+          context: .github/docker
+          file: .github/docker/Dockerfile.ci
+          push: true
+          # Push only the content-hash tag: a PR build must not overwrite the
+          # shared :latest tag, which ci-image.yml publishes from main.
+          tags: ${{ steps.meta.outputs.tag }}
+
+  evals:
+    runs-on: ubicloud-standard-2
+    needs: build-image
+    container:
+      image: ${{ needs.build-image.outputs.image-tag }}
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+      options: --user runner
+    timeout-minutes: 20
+    strategy:
+      fail-fast: false
+      matrix:
+        suite:
+          - name: llm-judge
+            file: test/skill-llm-eval.test.ts
+          - name: e2e-browse
+            file: test/skill-e2e-browse.test.ts
+          - name: e2e-plan
+            file: test/skill-e2e-plan.test.ts
+          - name: e2e-deploy
+            file: test/skill-e2e-deploy.test.ts
+          - name: e2e-design
+            file: test/skill-e2e-design.test.ts
+          - name: e2e-qa-bugs
+            file: test/skill-e2e-qa-bugs.test.ts
+          - name: e2e-qa-workflow
+            file: test/skill-e2e-qa-workflow.test.ts
+          - name: e2e-review
+            file: test/skill-e2e-review.test.ts
+          - name: e2e-workflow
+            file: test/skill-e2e-workflow.test.ts
+          - name: e2e-routing
+            file: test/skill-routing-e2e.test.ts
+          - name: e2e-codex
+            file: test/codex-e2e.test.ts
+          - name: e2e-gemini
+            file: test/gemini-e2e.test.ts
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      # Restore pre-installed node_modules from Docker image via symlink (~0s vs ~15s install)
+      # If package.json changed since image was built, fall back to fresh install
+      - name: Restore deps
+        run: |
+          if [ -d /opt/node_modules_cache ] && diff -q /opt/node_modules_cache/.package.json package.json >/dev/null 2>&1; then
+            ln -s /opt/node_modules_cache node_modules
+          else
+            bun install
+          fi
+
+      - run: bun run build
+
+      - name: Run ${{ matrix.suite.name }}
+        env:
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
+          EVALS_CONCURRENCY: "40"
+        run: EVALS=1 bun test --retry 2 --concurrent --max-concurrency "$EVALS_CONCURRENCY" ${{ matrix.suite.file }}
+
+      - name: Upload eval results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: eval-${{ matrix.suite.name }}
+          path: ~/.gstack-dev/evals/*.json
+          retention-days: 90
+
+  report:
+    runs-on: ubicloud-standard-2
+    needs: evals
+    if: always() && github.event_name == 'pull_request'
+    timeout-minutes: 5
+    permissions:
+      contents: read
+      pull-requests: write
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+
+      - name: Download all eval artifacts
+        uses: actions/download-artifact@v4
+        with:
+          pattern: eval-*
+          path: /tmp/eval-results
+          merge-multiple: true
+
+      - name: Post PR comment
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          mapfile -t RESULTS < <(find /tmp/eval-results -name '*.json' 2>/dev/null | sort)
+          if [ "${#RESULTS[@]}" -eq 0 ]; then
+            echo "No eval results found"
+            exit 0
+          fi
+
+          TOTAL=0; PASSED=0; FAILED=0; COST="0"
+          SUITE_LINES=""
+          for f in "${RESULTS[@]}"; do
+            T=$(jq -r '.total_tests // 0' "$f")
+            P=$(jq -r '.passed // 0' "$f")
+            F=$(jq -r '.failed // 0' "$f")
+            C=$(jq -r '.total_cost_usd // 0' "$f")
+            TIER=$(jq -r '.tier // "unknown"' "$f")
+            [ "$T" -eq 0 ] && continue
+            TOTAL=$((TOTAL + T))
+            PASSED=$((PASSED + P))
+            FAILED=$((FAILED + F))
+            COST=$(echo "$COST + $C" | bc)
+            STATUS_ICON="✅"
+            [ "$F" -gt 0 ] && STATUS_ICON="❌"
+            SUITE_LINES="${SUITE_LINES}| ${TIER} | ${P}/${T} | ${STATUS_ICON} | \$${C} |"$'\n'
+          done
+
+          STATUS="✅ PASS"
+          [ "$FAILED" -gt 0 ] && STATUS="❌ FAIL"
+
+          BODY="## E2E Evals: ${STATUS}
+
+          **${PASSED}/${TOTAL}** tests passed | **\$${COST}** total cost | **12 parallel runners**
+
+          | Suite | Result | Status | Cost |
+          |-------|--------|--------|------|
+          $(printf '%s' "$SUITE_LINES")
+
+          ---
+          *12x ubicloud-standard-2 (Docker: pre-baked toolchain + deps) | wall clock ≈ slowest suite*"
+
+          if [ "$FAILED" -gt 0 ]; then
+            FAILURES=""
+            for f in "${RESULTS[@]}"; do
+              F=$(jq -r '.failed // 0' "$f")
+              [ "$F" -eq 0 ] && continue
+              FAILS=$(jq -r '.tests[] | select(.passed == false) | "- ❌ \(.name): \(.exit_reason // "unknown")"' "$f")
+              FAILURES="${FAILURES}${FAILS}"$'\n'
+            done
+            BODY="${BODY}
+
+          ### Failures
+          $(printf '%s' "$FAILURES")"
+          fi
+
+          # Update existing comment or create new one (--paginate: the comment may not be on the first page)
+          COMMENT_ID=$(gh api --paginate repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments \
+            --jq '.[] | select(.body | startswith("## E2E Evals")) | .id' | tail -1)
+
+          if [ -n "$COMMENT_ID" ]; then
+            gh api repos/${{ github.repository }}/issues/comments/$COMMENT_ID \
+              -X PATCH -f body="$BODY"
+          else
+            gh pr comment ${{ github.event.pull_request.number }} --body "$BODY"
+          fi
M CHANGELOG.md => CHANGELOG.md +18 -0
@@ 1,5 1,23 @@
# Changelog
+## [0.11.10.0] - 2026-03-23 — CI Evals on Ubicloud
+
+### Added
+
+- **E2E evals now run in CI on every PR.** 12 parallel GitHub Actions runners on Ubicloud spin up per PR, each running one test suite. Docker image pre-bakes bun, node, Claude CLI, and deps so setup is near-instant. Results posted as a PR comment with pass/fail + cost breakdown.
+- **3x faster eval runs.** All E2E tests run concurrently within files via `testConcurrentIfSelected`. Wall clock drops from ~18min to ~6min — limited by the slowest individual test, not sequential sum.
+- **Docker CI image** (`Dockerfile.ci`) with pre-installed toolchain. Rebuilds automatically when Dockerfile or package.json changes, cached by content hash in GHCR.
+
+### Fixed
+
+- **Routing tests now work in CI.** Skills are installed at top-level `.claude/skills/` instead of nested under `.claude/skills/gstack/` — project-level skill discovery doesn't recurse into subdirectories.
+
+### For contributors
+
+- `EVALS_CONCURRENCY=40` in CI for maximum parallelism (local default stays at 15)
+- Ubicloud runners at ~$0.006/run (10x cheaper than GitHub standard runners)
+- `workflow_dispatch` trigger for manual re-runs
+
## [0.11.9.0] - 2026-03-23 — Codex Skill Loading Fix
### Fixed
M TODOS.md => TODOS.md +8 -11
@@ 338,17 338,6 @@
**Depends on:** Video recording
-### GitHub Actions eval upload
-
-**What:** Run eval suite in CI, upload result JSON as artifact, post summary comment on PR.
-
-**Why:** CI integration catches quality regressions before merge and provides persistent eval records per PR.
-
-**Context:** Requires `ANTHROPIC_API_KEY` in CI secrets. Cost is ~$4/run. Eval persistence system (v0.3.6) writes JSON to `~/.gstack-dev/evals/` — CI would upload as GitHub Actions artifacts and use `eval:compare` to post delta comment.
-
-**Effort:** M
-**Priority:** P2
-**Depends on:** Eval persistence (shipped in v0.3.6)
### E2E model pinning — SHIPPED
@@ 553,6 542,14 @@ Shipped in v0.6.5. TemplateContext in gen-skill-docs.ts bakes skill name into pr
## Completed
+### CI eval pipeline (v0.11.10.0)
+- GitHub Actions eval upload on Ubicloud runners ($0.006/run)
+- Within-file test concurrency (test() → testConcurrentIfSelected())
+- Eval artifact upload + PR comment with pass/fail + cost
+- Baseline comparison via artifact download from main
+- EVALS_CONCURRENCY=40 for ~6min wall clock (was ~18min)
+**Completed:** v0.11.10.0
+
### Deploy pipeline (v0.9.8.0)
- /land-and-deploy — merge PR, wait for CI/deploy, canary verification
- /canary — post-deploy monitoring loop with anomaly detection
M VERSION => VERSION +1 -1
@@ 1,1 1,1 @@
-0.11.9.0
+0.11.10.0
M test/gemini-e2e.test.ts => test/gemini-e2e.test.ts +1 -1
@@ 76,7 76,7 @@ if (evalsEnabled && !process.env.EVALS_ALL) {
/** Skip an individual test if not selected by diff-based selection. */
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
const shouldRun = selectedTests === null || selectedTests.includes(testName);
- (shouldRun ? test : test.skip)(testName, fn, timeout);
+ (shouldRun ? test.concurrent : test.skip)(testName, fn, timeout);
}
// --- Eval result collector ---
M test/skill-e2e-deploy.test.ts => test/skill-e2e-deploy.test.ts +4 -4
@@ 44,7 44,7 @@ describeIfSelected('Land-and-Deploy skill E2E', ['land-and-deploy-workflow'], ()
try { fs.rmSync(landDir, { recursive: true, force: true }); } catch {}
});
- test('/land-and-deploy detects Fly.io platform and produces deploy report structure', async () => {
+ testConcurrentIfSelected('land-and-deploy-workflow', async () => {
const result = await runSkillTest({
prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.
@@ 110,7 110,7 @@ describeIfSelected('Canary skill E2E', ['canary-workflow'], () => {
try { fs.rmSync(canaryDir, { recursive: true, force: true }); } catch {}
});
- test('/canary skill produces monitoring report structure', async () => {
+ testConcurrentIfSelected('canary-workflow', async () => {
const result = await runSkillTest({
prompt: `Read canary/SKILL.md for the /canary skill instructions.
@@ 171,7 171,7 @@ describeIfSelected('Benchmark skill E2E', ['benchmark-workflow'], () => {
try { fs.rmSync(benchDir, { recursive: true, force: true }); } catch {}
});
- test('/benchmark skill produces performance report structure', async () => {
+ testConcurrentIfSelected('benchmark-workflow', async () => {
const result = await runSkillTest({
prompt: `Read benchmark/SKILL.md for the /benchmark skill instructions.
@@ 237,7 237,7 @@ describeIfSelected('Setup-Deploy skill E2E', ['setup-deploy-workflow'], () => {
try { fs.rmSync(setupDir, { recursive: true, force: true }); } catch {}
});
- test('/setup-deploy detects Fly.io and writes config to CLAUDE.md', async () => {
+ testConcurrentIfSelected('setup-deploy-workflow', async () => {
const result = await runSkillTest({
prompt: `Read setup-deploy/SKILL.md for the /setup-deploy skill instructions.
M test/skill-e2e-design.test.ts => test/skill-e2e-design.test.ts +1 -1
@@ 560,7 560,7 @@ describeIfSelected('Design Review E2E', ['design-review-fix'], () => {
try { fs.rmSync(qaDesignDir, { recursive: true, force: true }); } catch {}
});
- test('Test 7: /design-review audits and fixes design issues', async () => {
+ testConcurrentIfSelected('design-review-fix', async () => {
const serverUrl = `http://localhost:${(qaDesignServer as any)?.port}`;
const result = await runSkillTest({
M test/skill-e2e-plan.test.ts => test/skill-e2e-plan.test.ts +6 -6
@@ 66,7 66,7 @@ We're building a new user dashboard that shows recent activity, notifications, a
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
});
- test('/plan-ceo-review produces structured review output', async () => {
+ testConcurrentIfSelected('plan-ceo-review', async () => {
const result = await runSkillTest({
prompt: `Read plan-ceo-review/SKILL.md for the review workflow.
@@ 150,7 150,7 @@ We're building a new user dashboard that shows recent activity, notifications, a
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
});
- test('/plan-ceo-review SELECTIVE EXPANSION produces structured review output', async () => {
+ testConcurrentIfSelected('plan-ceo-review-selective', async () => {
const result = await runSkillTest({
prompt: `Read plan-ceo-review/SKILL.md for the review workflow.
@@ 244,7 244,7 @@ Replace session-cookie auth with JWT tokens. Currently using express-session + R
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
});
- test('/plan-eng-review produces structured review output', async () => {
+ testConcurrentIfSelected('plan-eng-review', async () => {
const result = await runSkillTest({
prompt: `Read plan-eng-review/SKILL.md for the review workflow.
@@ 364,7 364,7 @@ export function main() { return Dashboard(); }
} catch {}
});
- test('/plan-eng-review writes test-plan artifact to ~/.gstack/projects/', async () => {
+ testConcurrentIfSelected('plan-eng-review-artifact', async () => {
// Count existing test-plan files before
const beforeFiles = fs.readdirSync(projectDir).filter(f => f.includes('test-plan'));
@@ 442,7 442,7 @@ describeIfSelected('Office Hours Spec Review E2E', ['office-hours-spec-review'],
try { fs.rmSync(ohDir, { recursive: true, force: true }); } catch {}
});
- test('/office-hours SKILL.md contains spec review loop', async () => {
+ testConcurrentIfSelected('office-hours-spec-review', async () => {
const result = await runSkillTest({
prompt: `Read office-hours/SKILL.md. I want to understand the spec review loop.
@@ 502,7 502,7 @@ describeIfSelected('Plan CEO Review Benefits-From E2E', ['plan-ceo-review-benefi
try { fs.rmSync(benefitsDir, { recursive: true, force: true }); } catch {}
});
- test('/plan-ceo-review SKILL.md contains prerequisite skill offer', async () => {
+ testConcurrentIfSelected('plan-ceo-review-benefits', async () => {
const result = await runSkillTest({
prompt: `Read plan-ceo-review/SKILL.md. Search for sections about "Prerequisite" or "office-hours" or "design doc found".
M test/skill-e2e-qa-bugs.test.ts => test/skill-e2e-qa-bugs.test.ts +4 -4
@@ 4,7 4,7 @@ import { outcomeJudge } from './helpers/llm-judge';
import { judgePassed } from './helpers/eval-store';
import {
ROOT, browseBin, runId, evalsEnabled, selectedTests, hasApiKey,
- describeIfSelected, describeE2E,
+ describeIfSelected, describeE2E, testConcurrentIfSelected,
copyDirSync, setupBrowseShims, logCost, recordE2E, dumpOutcomeDiagnostic,
createEvalCollector, finalizeEvalCollector,
} from './helpers/e2e-helpers';
@@ 172,17 172,17 @@ CRITICAL RULES:
}
// B6: Static dashboard — broken link, disabled submit, overflow, missing alt, console error
- test('/qa finds >= 2 of 5 planted bugs (static)', async () => {
+ testConcurrentIfSelected('qa-b6-static', async () => {
await runPlantedBugEval('qa-eval.html', 'qa-eval-ground-truth.json', 'b6-static');
}, 360_000);
// B7: SPA — broken route, stale state, async race, missing aria, console warning
- test('/qa finds >= 2 of 5 planted SPA bugs', async () => {
+ testConcurrentIfSelected('qa-b7-spa', async () => {
await runPlantedBugEval('qa-eval-spa.html', 'qa-eval-spa-ground-truth.json', 'b7-spa');
}, 360_000);
// B8: Checkout — email regex, NaN total, CC overflow, missing required, stripe error
- test('/qa finds >= 2 of 5 planted checkout bugs', async () => {
+ testConcurrentIfSelected('qa-b8-checkout', async () => {
await runPlantedBugEval('qa-eval-checkout.html', 'qa-eval-checkout-ground-truth.json', 'b8-checkout');
}, 360_000);
M test/skill-e2e-qa-workflow.test.ts => test/skill-e2e-qa-workflow.test.ts +3 -3
@@ 37,7 37,7 @@ describeIfSelected('QA skill E2E', ['qa-quick'], () => {
try { fs.rmSync(qaDir, { recursive: true, force: true }); } catch {}
});
- test('/qa quick completes without browse errors', async () => {
+ testConcurrentIfSelected('qa-quick', async () => {
const result = await runSkillTest({
prompt: `B="${browseBin}"
@@ 108,7 108,7 @@ describeIfSelected('QA-Only skill E2E', ['qa-only-no-fix'], () => {
try { fs.rmSync(qaOnlyDir, { recursive: true, force: true }); } catch {}
});
- test('/qa-only produces report without using Edit tool', async () => {
+ testConcurrentIfSelected('qa-only-no-fix', async () => {
const result = await runSkillTest({
prompt: `IMPORTANT: The browse binary is already assigned below as B. Do NOT search for it or run the SKILL.md setup block — just use $B directly.
@@ 227,7 227,7 @@ describeIfSelected('QA Fix Loop E2E', ['qa-fix-loop'], () => {
try { fs.rmSync(qaFixDir, { recursive: true, force: true }); } catch {}
});
- test('/qa fix loop finds bugs and commits fixes', async () => {
+ testConcurrentIfSelected('qa-fix-loop', async () => {
const qaFixUrl = `http://127.0.0.1:${qaFixServer!.port}`;
const result = await runSkillTest({
M test/skill-e2e-review.test.ts => test/skill-e2e-review.test.ts +4 -4
@@ 51,7 51,7 @@ describeIfSelected('Review skill E2E', ['review-sql-injection'], () => {
try { fs.rmSync(reviewDir, { recursive: true, force: true }); } catch {}
});
- test('/review produces findings on SQL injection branch', async () => {
+ testConcurrentIfSelected('review-sql-injection', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo on a feature branch with changes against main.
Read review-SKILL.md for the review workflow instructions.
@@ 125,7 125,7 @@ describeIfSelected('Review enum completeness E2E', ['review-enum-completeness'],
try { fs.rmSync(enumDir, { recursive: true, force: true }); } catch {}
});
- test('/review catches missing enum handlers for new status value', async () => {
+ testConcurrentIfSelected('review-enum-completeness', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo on branch feature/add-returned-status with changes against main.
Read review-SKILL.md for the review workflow instructions.
@@ 200,7 200,7 @@ describeIfSelected('Review design lite E2E', ['review-design-lite'], () => {
try { fs.rmSync(designDir, { recursive: true, force: true }); } catch {}
});
- test('/review catches design anti-patterns in CSS/HTML diff', async () => {
+ testConcurrentIfSelected('review-design-lite', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo on branch feature/add-landing-page with changes against main.
Read review-SKILL.md for the review workflow instructions.
@@ 497,7 497,7 @@ describeIfSelected('Retro E2E', ['retro'], () => {
try { fs.rmSync(retroDir, { recursive: true, force: true }); } catch {}
});
- test('/retro produces analysis from git history', async () => {
+ testConcurrentIfSelected('retro', async () => {
const result = await runSkillTest({
prompt: `Read retro/SKILL.md for instructions on how to run a retrospective.
M test/skill-e2e-workflow.test.ts => test/skill-e2e-workflow.test.ts +3 -3
@@ 60,7 60,7 @@ describeIfSelected('Document-Release skill E2E', ['document-release'], () => {
try { fs.rmSync(docReleaseDir, { recursive: true, force: true }); } catch {}
});
- test('/document-release updates docs without clobbering CHANGELOG', async () => {
+ testConcurrentIfSelected('document-release', async () => {
const result = await runSkillTest({
prompt: `Read the file document-release/SKILL.md for the document-release workflow instructions.
@@ 461,7 461,7 @@ describe('processPayment', () => {
try { fs.rmSync(coverageDir, { recursive: true, force: true }); } catch {}
});
- test('/ship Step 3.4 produces coverage diagram', async () => {
+ testConcurrentIfSelected('ship-coverage-audit', async () => {
const result = await runSkillTest({
prompt: `Read the file ship/SKILL.md for the ship workflow instructions.
@@ 544,7 544,7 @@ describeIfSelected('Codex skill E2E', ['codex-review'], () => {
try { fs.rmSync(codexDir, { recursive: true, force: true }); } catch {}
});
- test('/codex review produces findings and GATE verdict', async () => {
+ testConcurrentIfSelected('codex-review', async () => {
// Check codex is available — skip if not installed
const codexCheck = spawnSync('which', ['codex'], { stdio: 'pipe', timeout: 3000 });
if (codexCheck.status !== 0) {
M test/skill-llm-eval.test.ts => test/skill-llm-eval.test.ts +1 -1
@@ 56,7 56,7 @@ function describeIfSelected(name: string, testNames: string[], fn: () => void) {
/** Skip an individual test if not selected (for multi-test describe blocks). */
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
const shouldRun = selectedTests === null || selectedTests.includes(testName);
- (shouldRun ? test : test.skip)(testName, fn, timeout);
+ (shouldRun ? test.concurrent : test.skip)(testName, fn, timeout);
}
describeIfSelected('LLM-as-judge quality evals', [
M test/skill-routing-e2e.test.ts => test/skill-routing-e2e.test.ts +63 -58
@@ 44,7 44,11 @@ if (evalsEnabled && !process.env.EVALS_ALL) {
// --- Helper functions ---
-/** Copy all SKILL.md files into tmpDir/.claude/skills/gstack/ for auto-discovery */
+/** Copy all SKILL.md files for auto-discovery.
+ * Install to BOTH project-level (.claude/skills/) AND user-level (~/.claude/skills/)
+ * because Claude Code discovers skills from both locations. In CI containers,
+ * $HOME may differ from the working directory, so we need both paths to ensure
+ * the Skill tool appears in Claude's available tools list. */
function installSkills(tmpDir: string) {
const skillDirs = [
'', // root gstack SKILL.md
@@ 54,15 58,30 @@ function installSkills(tmpDir: string) {
'gstack-upgrade', 'humanizer',
];
+ // Install to both project-level and user-level skill directories
+ const homeDir = process.env.HOME || os.homedir();
+ const installTargets = [
+ path.join(tmpDir, '.claude', 'skills'), // project-level
+ path.join(homeDir, '.claude', 'skills'), // user-level (~/.claude/skills/)
+ ];
+
for (const skill of skillDirs) {
const srcPath = path.join(ROOT, skill, 'SKILL.md');
if (!fs.existsSync(srcPath)) continue;
- const destDir = skill
- ? path.join(tmpDir, '.claude', 'skills', 'gstack', skill)
- : path.join(tmpDir, '.claude', 'skills', 'gstack');
- fs.mkdirSync(destDir, { recursive: true });
- fs.copyFileSync(srcPath, path.join(destDir, 'SKILL.md'));
+ const skillName = skill || 'gstack';
+
+ for (const targetBase of installTargets) {
+ const destDir = path.join(targetBase, skillName);
+ fs.mkdirSync(destDir, { recursive: true });
+ fs.copyFileSync(srcPath, path.join(destDir, 'SKILL.md'));
+ }
+ }
+
+ // Copy CLAUDE.md so Claude has project context for skill routing.
+ const claudeMdSrc = path.join(ROOT, 'CLAUDE.md');
+ if (fs.existsSync(claudeMdSrc)) {
+ fs.copyFileSync(claudeMdSrc, path.join(tmpDir, 'CLAUDE.md'));
}
}
@@ 75,6 94,31 @@ function initGitRepo(dir: string) {
run('git', ['config', 'user.name', 'Test']);
}
+/**
+ * Create an isolated working directory for one routing test.
+ * Makes a fresh temp dir (routing-<suffix>-*), copies CLAUDE.md, README.md,
+ * package.json and ETHOS.md from ROOT when present, runs installSkills() and
+ * initGitRepo(), then commits everything as 'initial' (git exit codes are
+ * not checked). Returns the path of the new directory.
+ */
+function createRoutingWorkDir(suffix: string): string {
+ // Fresh temp dir per call so concurrent tests don't interfere with each other
+ const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), `routing-${suffix}-`));
+ // Copy the project context files that exist in ROOT; missing ones are skipped
+ const filesToCopy = ['CLAUDE.md', 'README.md', 'package.json', 'ETHOS.md'];
+ for (const f of filesToCopy) {
+ const src = path.join(ROOT, f);
+ if (fs.existsSync(src)) fs.copyFileSync(src, path.join(tmpDir, f));
+ }
+ // Copy skill files
+ installSkills(tmpDir);
+ // Init git, then stage and commit everything copied so far
+ initGitRepo(tmpDir);
+ spawnSync('git', ['add', '.'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
+ spawnSync('git', ['commit', '-m', 'initial'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
+ return tmpDir;
+}
+
function logCost(label: string, result: { costEstimate: { turnsUsed: number; estimatedTokens: number; estimatedCost: number }; duration: number }) {
const { turnsUsed, estimatedTokens, estimatedCost } = result.costEstimate;
const durationSec = Math.round(result.duration / 1000);
@@ 104,13 148,8 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
});
test.concurrent('journey-ideation', async () => {
- const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-ideation-'));
+ const tmpDir = createRoutingWorkDir('ideation');
try {
- initGitRepo(tmpDir);
- installSkills(tmpDir);
- fs.writeFileSync(path.join(tmpDir, 'README.md'), '# New Project\n');
- spawnSync('git', ['add', '.'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
- spawnSync('git', ['commit', '-m', 'initial'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
const testName = 'journey-ideation';
const expectedSkill = 'office-hours';
@@ 138,10 177,8 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
}, 150_000);
test.concurrent('journey-plan-eng', async () => {
- const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-plan-eng-'));
+ const tmpDir = createRoutingWorkDir('plan-eng');
try {
- initGitRepo(tmpDir);
- installSkills(tmpDir);
fs.writeFileSync(path.join(tmpDir, 'plan.md'), `# Waitlist App Architecture
## Components
@@ 190,10 227,8 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
}, 150_000);
test.concurrent('journey-think-bigger', async () => {
- const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-think-bigger-'));
+ const tmpDir = createRoutingWorkDir('think-bigger');
try {
- initGitRepo(tmpDir);
- installSkills(tmpDir);
fs.writeFileSync(path.join(tmpDir, 'plan.md'), `# Waitlist App Architecture
## Components
@@ 242,11 277,8 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
}, 180_000);
test.concurrent('journey-debug', async () => {
- const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-debug-'));
+ const tmpDir = createRoutingWorkDir('debug');
try {
- initGitRepo(tmpDir);
- installSkills(tmpDir);
-
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
@@ 302,11 334,8 @@ export default app;
}, 150_000);
test.concurrent('journey-qa', async () => {
- const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-qa-'));
+ const tmpDir = createRoutingWorkDir('qa');
try {
- initGitRepo(tmpDir);
- installSkills(tmpDir);
-
fs.writeFileSync(path.join(tmpDir, 'package.json'), JSON.stringify({ name: 'waitlist-app', scripts: { dev: 'next dev' } }, null, 2));
fs.mkdirSync(path.join(tmpDir, 'src'), { recursive: true });
fs.writeFileSync(path.join(tmpDir, 'src/index.html'), '<html><body><h1>Waitlist App</h1></body></html>');
@@ 341,17 370,14 @@ export default app;
}, 150_000);
test.concurrent('journey-code-review', async () => {
- const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-code-review-'));
+ const tmpDir = createRoutingWorkDir('code-review');
try {
- initGitRepo(tmpDir);
- installSkills(tmpDir);
-
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
fs.writeFileSync(path.join(tmpDir, 'app.ts'), '// base\n');
run('git', ['add', '.']);
- run('git', ['commit', '-m', 'initial']);
+ run('git', ['commit', '-m', 'add base app']);
run('git', ['checkout', '-b', 'feature/add-waitlist']);
fs.writeFileSync(path.join(tmpDir, 'app.ts'), '// updated with waitlist feature\nimport { WaitlistService } from "./waitlist";\n');
fs.writeFileSync(path.join(tmpDir, 'waitlist.ts'), 'export class WaitlistService {\n async addParty(name: string, size: number) {\n // TODO: implement\n }\n}\n');
@@ 384,17 410,14 @@ export default app;
}, 150_000);
test.concurrent('journey-ship', async () => {
- const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-ship-'));
+ const tmpDir = createRoutingWorkDir('ship');
try {
- initGitRepo(tmpDir);
- installSkills(tmpDir);
-
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
fs.writeFileSync(path.join(tmpDir, 'app.ts'), '// base\n');
run('git', ['add', '.']);
- run('git', ['commit', '-m', 'initial']);
+ run('git', ['commit', '-m', 'add base app']);
run('git', ['checkout', '-b', 'feature/waitlist']);
fs.writeFileSync(path.join(tmpDir, 'app.ts'), '// waitlist feature\n');
run('git', ['add', '.']);
@@ 426,11 449,8 @@ export default app;
}, 150_000);
test.concurrent('journey-docs', async () => {
- const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-docs-'));
+ const tmpDir = createRoutingWorkDir('docs');
try {
- initGitRepo(tmpDir);
- installSkills(tmpDir);
-
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
@@ 466,11 486,8 @@ export default app;
}, 150_000);
test.concurrent('journey-retro', async () => {
- const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-retro-'));
+ const tmpDir = createRoutingWorkDir('retro');
try {
- initGitRepo(tmpDir);
- installSkills(tmpDir);
-
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
@@ 512,17 529,8 @@ export default app;
}, 150_000);
test.concurrent('journey-design-system', async () => {
- const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-design-system-'));
+ const tmpDir = createRoutingWorkDir('design-system');
try {
- initGitRepo(tmpDir);
- installSkills(tmpDir);
-
- const run = (cmd: string, args: string[]) =>
- spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
-
- fs.writeFileSync(path.join(tmpDir, 'package.json'), JSON.stringify({ name: 'waitlist-app' }, null, 2));
- run('git', ['add', '.']);
- run('git', ['commit', '-m', 'initial']);
const testName = 'journey-design-system';
const expectedSkill = 'design-consultation';
@@ 550,11 558,8 @@ export default app;
}, 150_000);
test.concurrent('journey-visual-qa', async () => {
- const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-visual-qa-'));
+ const tmpDir = createRoutingWorkDir('visual-qa');
try {
- initGitRepo(tmpDir);
- installSkills(tmpDir);
-
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });