~cytrogen/gstack

6156122571c5b170118698a056b71d5112822f1b — Garry Tan a month ago 3501f5d
test: E2E tests for plan review report and Codex offering (v0.11.15.0) (#449)

* chore: regen SKILL.md from template changes

Regenerated via `bun run gen:skill-docs` — was stale from prior
template updates (Codex paths, preamble resolver).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* test: add E2E tests for plan review report and codex offering

- plan-review-report: verifies plan-eng-review writes ## GSTACK REVIEW
  REPORT to the bottom of the plan file
- codex-offered-{office-hours,ceo-review,design-review,eng-review}:
  verifies each skill has Codex availability check, user prompt, and
  fallback behavior (4 concurrent lightweight tests)
- Updated touchfiles and selection count assertion

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* docs: add touchfiles to global touchfile list in CLAUDE.md

The touchfiles.ts file itself is a global touchfile that triggers all
tests when changed, but was missing from the documented list.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* chore: bump version and changelog (v0.11.15.0)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
6 files changed, 218 insertions(+), 4 deletions(-)

M CHANGELOG.md
M CLAUDE.md
M VERSION
M test/helpers/touchfiles.ts
M test/skill-e2e-plan.test.ts
M test/touchfiles.test.ts
M CHANGELOG.md => CHANGELOG.md +13 -0
@@ 1,5 1,18 @@
# Changelog

## [0.11.15.0] - 2026-03-24 — E2E Test Coverage for Plan Reviews & Codex

### Added

- **E2E tests verify plan review reports appear at the bottom of plans.** The `/plan-eng-review` review report is now tested end-to-end — if it stops writing `## GSTACK REVIEW REPORT` to the plan file, the test catches it.
- **E2E tests verify Codex is offered in every plan skill.** Four new lightweight tests confirm that `/office-hours`, `/plan-ceo-review`, `/plan-design-review`, and `/plan-eng-review` all check for Codex availability, prompt the user, and handle the fallback when Codex is unavailable.

### For contributors

- New E2E tests in `test/skill-e2e-plan.test.ts`: `plan-review-report`, `codex-offered-eng-review`, `codex-offered-ceo-review`, `codex-offered-office-hours`, `codex-offered-design-review`
- Updated touchfile mappings and selection count assertions
- Added `touchfiles` to the documented global touchfile list in CLAUDE.md

## [0.11.14.0] - 2026-03-24 — Windows Browse Fix

### Fixed

M CLAUDE.md => CLAUDE.md +1 -1
@@ 29,7 29,7 @@ against the previous run.
**Diff-based test selection:** `test:evals` and `test:e2e` auto-select tests based
on `git diff` against the base branch. Each test declares its file dependencies in
`test/helpers/touchfiles.ts`. Changes to global touchfiles (session-runner, eval-store,
llm-judge, gen-skill-docs) trigger all tests. Use `EVALS_ALL=1` or the `:all` script
llm-judge, gen-skill-docs, touchfiles) trigger all tests. Use `EVALS_ALL=1` or the `:all` script
variants to force all tests. Run `eval:select` to preview which tests would run.

## Testing

M VERSION => VERSION +1 -1
@@ 1,1 1,1 @@
0.11.14.0
0.11.15.0

M test/helpers/touchfiles.ts => test/helpers/touchfiles.ts +7 -0
@@ 68,6 68,13 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
  'plan-ceo-review-benefits':  ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'],
  'plan-eng-review':           ['plan-eng-review/**'],
  'plan-eng-review-artifact':  ['plan-eng-review/**'],
  'plan-review-report':        ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'],

  // Codex offering verification
  'codex-offered-office-hours':  ['office-hours/**', 'scripts/gen-skill-docs.ts'],
  'codex-offered-ceo-review':    ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'],
  'codex-offered-design-review': ['plan-design-review/**', 'scripts/gen-skill-docs.ts'],
  'codex-offered-eng-review':    ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'],

  // Ship
  'ship-base-branch': ['ship/**', 'bin/gstack-repo-mode'],

M test/skill-e2e-plan.test.ts => test/skill-e2e-plan.test.ts +193 -0
@@ 535,6 535,199 @@ Write your summary to ${benefitsDir}/benefits-summary.md`,
  }, 180_000);
});

// --- Plan Review Report E2E ---
// Verifies that plan-eng-review writes a "## GSTACK REVIEW REPORT" section
// to the bottom of the plan file (the living review status footer).
//
// Fixture: a throwaway git repo holding a small plan.md plus a copy of
// plan-eng-review/SKILL.md. The test runs the skill non-interactively and then
// inspects plan.md on disk.

describeIfSelected('Plan Review Report E2E', ['plan-review-report'], () => {
  // Temp directory used as the agent's working directory; created in
  // beforeAll and removed in afterAll.
  let planDir: string;

  beforeAll(() => {
    planDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-review-report-'));
    // Runs a command inside planDir. The spawn result is not inspected, so a
    // failing git command does not abort setup.
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: planDir, stdio: 'pipe', timeout: 5000 });

    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);

    fs.writeFileSync(path.join(planDir, 'plan.md'), `# Plan: Add Notifications System

## Context
We're building a real-time notification system for our SaaS app.

## Changes
1. WebSocket server for push notifications
2. Notification preferences API
3. Email digest fallback for offline users
4. PostgreSQL table for notification storage

## Architecture
- WebSocket: Socket.io on Express
- Queue: Bull + Redis for email digests
- Storage: PostgreSQL notifications table
- Frontend: React toast component

## Open questions
- Retry policy for failed WebSocket delivery?
- Max notifications stored per user?
`);

    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'add plan']);

    // Copy plan-eng-review skill (added after the commit, so it stays untracked)
    fs.mkdirSync(path.join(planDir, 'plan-eng-review'), { recursive: true });
    fs.copyFileSync(
      path.join(ROOT, 'plan-eng-review', 'SKILL.md'),
      path.join(planDir, 'plan-eng-review', 'SKILL.md'),
    );
  });

  afterAll(() => {
    // Best-effort cleanup: a failure to remove the temp dir must not fail the suite.
    try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
  });

  test('/plan-eng-review writes GSTACK REVIEW REPORT to plan file', async () => {
    const result = await runSkillTest({
      prompt: `Read plan-eng-review/SKILL.md for the review workflow.

Read plan.md — that's the plan to review. This is a standalone plan document, not a codebase — skip any codebase exploration steps.

Proceed directly to the full review. Skip any AskUserQuestion calls — this is non-interactive.
Skip the preamble bash block, lake intro, telemetry, and contributor mode sections.

CRITICAL REQUIREMENT: plan.md IS the plan file for this review session. After completing your review, you MUST write a "## GSTACK REVIEW REPORT" section to the END of plan.md, exactly as described in the "Plan File Review Report" section of SKILL.md. If gstack-review-read is not available or returns NO_REVIEWS, write the placeholder table with all four review rows (CEO, Codex, Eng, Design). Use the Edit tool to append to plan.md — do NOT overwrite the existing plan content.

This review report at the bottom of the plan is the MOST IMPORTANT deliverable of this test.`,
      workingDirectory: planDir,
      maxTurns: 20,
      timeout: 360_000,
      testName: 'plan-review-report',
      runId,
      model: 'claude-opus-4-6',
    });

    logCost('/plan-eng-review report', result);
    // Running out of turns is accepted as well as a clean finish; the file
    // checks below decide whether the report was actually written.
    recordE2E(evalCollector, '/plan-review-report', 'Plan Review Report E2E', result, {
      passed: ['success', 'error_max_turns'].includes(result.exitReason),
    });
    expect(['success', 'error_max_turns']).toContain(result.exitReason);

    // Verify the review report was written to the plan file
    const planContent = fs.readFileSync(path.join(planDir, 'plan.md'), 'utf-8');

    // Original plan content should still be present
    expect(planContent).toContain('# Plan: Add Notifications System');
    expect(planContent).toContain('WebSocket');
    expect(planContent).toContain('## Open questions');

    // Review report section must exist
    expect(planContent).toContain('## GSTACK REVIEW REPORT');

    // Report should be at the bottom of the file: its (last) heading must come
    // after the final section of the original plan, not be spliced in above it.
    const reportIndex = planContent.lastIndexOf('## GSTACK REVIEW REPORT');
    expect(reportIndex).toBeGreaterThan(planContent.indexOf('## Open questions'));
    const afterReport = planContent.slice(reportIndex);

    // Should contain the review table with standard rows
    expect(afterReport).toMatch(/\|\s*Review\s*\|/);
    expect(afterReport).toContain('CEO Review');
    expect(afterReport).toContain('Eng Review');
    expect(afterReport).toContain('Design Review');

    console.log('Plan review report found at bottom of plan.md');
  }, 420_000); // outer test timeout is above the 360_000 given to runSkillTest
});

// --- Codex Offering E2E ---
// For office-hours, plan-ceo-review, plan-design-review and plan-eng-review:
// the agent reads the skill's SKILL.md and writes a summary of its Codex
// integration; the test then checks that the summary mentions the availability
// check, a fallback path, and that the step is optional.

describeIfSelected('Codex Offering E2E', [
  'codex-offered-office-hours', 'codex-offered-ceo-review',
  'codex-offered-design-review', 'codex-offered-eng-review',
], () => {
  // Shared scratch repo for all four tests; each test writes its own summary file.
  let testDir: string;

  // Skills whose SKILL.md is copied into the scratch repo.
  const skillDirs = ['office-hours', 'plan-ceo-review', 'plan-design-review', 'plan-eng-review'];

  // Phrases the lower-cased summary must contain, checked in this order:
  // availability check, fallback behavior, optional/non-blocking.
  const requiredPatterns = [
    /which codex/,
    /fallback|subagent|unavailable|not available|skip/,
    /optional|non.?blocking|skip|not.*required/,
  ];

  beforeAll(() => {
    testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-codex-offer-'));

    // Git invocation scoped to the scratch repo; exit status is not inspected.
    const git = (...gitArgs: string[]) =>
      spawnSync('git', gitArgs, { cwd: testDir, stdio: 'pipe', timeout: 5000 });

    git('init', '-b', 'main');
    git('config', 'user.email', 'test@test.com');
    git('config', 'user.name', 'Test');
    fs.writeFileSync(path.join(testDir, 'README.md'), '# Test Project\n');
    git('add', '.');
    git('commit', '-m', 'init');

    // Mirror each skill's SKILL.md into <testDir>/<skill>/SKILL.md
    skillDirs.forEach((skill) => {
      const targetDir = path.join(testDir, skill);
      fs.mkdirSync(targetDir, { recursive: true });
      fs.copyFileSync(path.join(ROOT, skill, 'SKILL.md'), path.join(targetDir, 'SKILL.md'));
    });
  });

  afterAll(() => {
    // Cleanup is best-effort; errors while deleting the scratch repo are ignored.
    try {
      fs.rmSync(testDir, { recursive: true, force: true });
    } catch {}
  });

  /**
   * Runs one summarization session against a skill's SKILL.md and asserts on
   * the summary file it produces.
   *
   * @param skill - directory name of the skill inside the scratch repo
   * @param testName - test identifier; also the prefix of the summary file name
   * @param featureName - label the skill uses for its Codex integration, interpolated into the prompt
   */
  const checkCodexOffering = async (skill: string, testName: string, featureName: string): Promise<void> => {
    const outcome = await runSkillTest({
      prompt: `Read ${skill}/SKILL.md. Search for ALL sections related to "codex", "outside voice", or "second opinion".

Summarize the Codex/${featureName} integration — answer these specific questions:
1. How is Codex availability checked? (what exact bash command?)
2. How is the user prompted? (via AskUserQuestion? what are the options?)
3. What happens when Codex is NOT available? (fallback to subagent? skip entirely?)
4. Is this step blocking (gates the workflow) or optional (can be skipped)?
5. What prompt/context is sent to Codex?

Write your summary to ${testDir}/${testName}-summary.md`,
      workingDirectory: testDir,
      maxTurns: 8,
      timeout: 120_000,
      testName,
      runId,
    });

    logCost(`/${skill} codex offering`, outcome);
    recordE2E(evalCollector, `/${testName}`, 'Codex Offering E2E', outcome);
    expect(outcome.exitReason).toBe('success');

    const summaryFile = path.join(testDir, `${testName}-summary.md`);
    expect(fs.existsSync(summaryFile)).toBe(true);

    // Patterns are lower-case, so normalize the summary before matching.
    const summaryText = fs.readFileSync(summaryFile, 'utf-8').toLowerCase();
    for (const pattern of requiredPatterns) {
      expect(summaryText).toMatch(pattern);
    }

    console.log(`${skill}: Codex offering verified`);
  };

  // [skill directory, test name, feature label used in the prompt]
  const offeringCases: Array<[string, string, string]> = [
    ['office-hours', 'codex-offered-office-hours', 'second opinion'],
    ['plan-ceo-review', 'codex-offered-ceo-review', 'outside voice'],
    ['plan-design-review', 'codex-offered-design-review', 'design outside voices'],
    ['plan-eng-review', 'codex-offered-eng-review', 'outside voice'],
  ];

  for (const [skill, testName, featureName] of offeringCases) {
    testConcurrentIfSelected(testName, async () => {
      await checkCodexOffering(skill, testName, featureName);
    }, 180_000);
  }
});

// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
  await finalizeEvalCollector(evalCollector);

M test/touchfiles.test.ts => test/touchfiles.test.ts +3 -2
@@ 80,8 80,9 @@ describe('selectTests', () => {
    expect(result.selected).toContain('plan-ceo-review-selective');
    expect(result.selected).toContain('plan-ceo-review-benefits');
    expect(result.selected).toContain('autoplan-core');
    expect(result.selected.length).toBe(4);
    expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 4);
    expect(result.selected).toContain('codex-offered-ceo-review');
    expect(result.selected.length).toBe(5);
    expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 5);
  });

  test('global touchfile triggers ALL tests', () => {