From 4fe0ce9cba4b367a36004720cddb952172e7949d Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 18 Mar 2026 23:08:04 -0500 Subject: [PATCH] feat: natural language skill routing + proactive suggestions (v0.7.1) (#195) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: add trigger phrases to /debug and /office-hours These two skills had zero "Use when asked to..." phrases, making them completely invisible to natural language. Users saying "debug this" or "brainstorm an idea" would get no skill invocation. * feat: add proactive triggers to all workflow skills Every skill now has "Proactively suggest when..." language so Claude surfaces skills at natural moments — not just when the user says specific trigger phrases. * feat: lifecycle map + proactive preference system Root gstack description now includes a developer workflow guide mapping 12 stages to skills. Preamble reads proactive preference via gstack-config. Users can opt out with "stop suggesting things" and re-enable with "be proactive again" — natural language toggle, no CLI needed. * test: 11 journey-stage E2E routing tests + trigger phrase validation Each test simulates a real development stage (ideation, plan review, debug, QA, ship, retro...) with realistic project context and verifies the right skill fires from natural language alone. 11/11 pass. 
* chore: bump version and changelog (v0.7.1) Co-Authored-By: Claude Opus 4.6 --------- Co-authored-by: Claude Opus 4.6 --- CHANGELOG.md | 14 + SKILL.md | 31 ++ SKILL.md.tmpl | 26 ++ VERSION | 2 +- browse/SKILL.md | 5 + debug/SKILL.md | 9 + debug/SKILL.md.tmpl | 4 + design-consultation/SKILL.md | 7 + design-consultation/SKILL.md.tmpl | 2 + design-review/SKILL.md | 7 + design-review/SKILL.md.tmpl | 2 + document-release/SKILL.md | 6 + document-release/SKILL.md.tmpl | 1 + office-hours/SKILL.md | 9 + office-hours/SKILL.md.tmpl | 4 + package.json | 10 +- plan-ceo-review/SKILL.md | 7 + plan-ceo-review/SKILL.md.tmpl | 2 + plan-design-review/SKILL.md | 7 + plan-design-review/SKILL.md.tmpl | 2 + plan-eng-review/SKILL.md | 7 + plan-eng-review/SKILL.md.tmpl | 2 + qa-only/SKILL.md | 6 + qa-only/SKILL.md.tmpl | 1 + qa/SKILL.md | 9 +- qa/SKILL.md.tmpl | 4 +- retro/SKILL.md | 6 + retro/SKILL.md.tmpl | 1 + review/SKILL.md | 6 + review/SKILL.md.tmpl | 1 + scripts/gen-skill-docs.ts | 5 + setup-browser-cookies/SKILL.md | 5 + ship/SKILL.md | 6 + ship/SKILL.md.tmpl | 1 + test/helpers/touchfiles.ts | 13 + test/skill-routing-e2e.test.ts | 605 ++++++++++++++++++++++++++++++ test/skill-validation.test.ts | 37 ++ test/touchfiles.test.ts | 9 +- 38 files changed, 870 insertions(+), 11 deletions(-) create mode 100644 test/skill-routing-e2e.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 30dbcc5a3a55b1a692a362040c67fb08c2fc4426..f1790addeffe716af4d89a783f70051f28b2dd8d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,19 @@ # Changelog +## [0.7.1] - 2026-03-19 + +### Added + +- **gstack now suggests skills at natural moments.** You don't need to know slash commands — just talk about what you're doing. Brainstorming an idea? gstack suggests `/office-hours`. Something's broken? It suggests `/debug`. Ready to deploy? It suggests `/ship`. Every workflow skill now has proactive triggers that fire when the moment is right. 
+- **Lifecycle map.** gstack's root skill description now includes a developer workflow guide mapping 12 stages across the development lifecycle (brainstorm → plan → review → code → debug → test → ship → docs → retro) to the right skill. Claude sees this in every session. +- **Opt-out with natural language.** If proactive suggestions feel too aggressive, just say "stop suggesting things" — gstack remembers across sessions. Say "be proactive again" to re-enable. +- **11 journey-stage E2E tests.** Each test simulates a real moment in the developer lifecycle with realistic project context (plan.md, error logs, git history, code) and verifies the right skill fires from natural language alone. 11/11 pass. +- **Trigger phrase validation.** Static tests verify every workflow skill has "Use when" and "Proactively suggest" phrases — catches regressions for free. + +### Fixed + +- `/debug` and `/office-hours` were completely invisible to natural language — no trigger phrases at all. Now both have full reactive + proactive triggers. + ## [0.7.0] - 2026-03-18 — YC Office Hours **`/office-hours` — sit down with a YC partner before you write a line of code.** diff --git a/SKILL.md b/SKILL.md index e12be62105c533f0ca7c8e3c8bafac0128f38380..dd06f058fa4a8538a72dde417a57717c4022d71b 100644 --- a/SKILL.md +++ b/SKILL.md @@ -7,6 +7,32 @@ description: | responsive layouts, test forms and uploads, handle dialogs, and assert element states. ~100ms per command. Use when you need to test a feature, verify a deployment, dogfood a user flow, or file a bug with evidence. + + gstack also includes development workflow skills. 
When you notice the user is at + these stages, suggest the appropriate skill: + - Brainstorming a new idea → suggest /office-hours + - Reviewing a plan (strategy) → suggest /plan-ceo-review + - Reviewing a plan (architecture) → suggest /plan-eng-review + - Reviewing a plan (design) → suggest /plan-design-review + - Creating a design system → suggest /design-consultation + - Debugging errors → suggest /debug + - Testing the app → suggest /qa + - Code review before merge → suggest /review + - Visual design audit → suggest /design-review + - Ready to deploy / create PR → suggest /ship + - Post-ship doc updates → suggest /document-release + - Weekly retrospective → suggest /retro + + If the user pushes back on skill suggestions ("stop suggesting things", + "I don't need suggestions", "too aggressive"): + 1. Stop suggesting for the rest of this session + 2. Run: gstack-config set proactive false + 3. Say: "Got it — I'll stop suggesting skills. Just tell me to be proactive + again if you change your mind." + + If the user says "be proactive again" or "turn on suggestions": + 1. Run: gstack-config set proactive true + 2. Say: "Proactive suggestions are back on." allowed-tools: - Bash - Read @@ -30,8 +56,13 @@ _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" _LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no") echo "LAKE_INTRO: $_LAKE_SEEN" +_PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +echo "PROACTIVE: $_PROACTIVE" ``` +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke +them when the user explicitly asks. The user opted out of proactive suggestions. + If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). 
If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. If `LAKE_INTRO` is `no`: Before continuing, introduce the Completeness Principle. diff --git a/SKILL.md.tmpl b/SKILL.md.tmpl index dd2b23455201df3856215bd87f9e4cc479b6f1d9..7aacdb29191253cd71c31587ee50e4c81c28760c 100644 --- a/SKILL.md.tmpl +++ b/SKILL.md.tmpl @@ -7,6 +7,32 @@ description: | responsive layouts, test forms and uploads, handle dialogs, and assert element states. ~100ms per command. Use when you need to test a feature, verify a deployment, dogfood a user flow, or file a bug with evidence. + + gstack also includes development workflow skills. When you notice the user is at + these stages, suggest the appropriate skill: + - Brainstorming a new idea → suggest /office-hours + - Reviewing a plan (strategy) → suggest /plan-ceo-review + - Reviewing a plan (architecture) → suggest /plan-eng-review + - Reviewing a plan (design) → suggest /plan-design-review + - Creating a design system → suggest /design-consultation + - Debugging errors → suggest /debug + - Testing the app → suggest /qa + - Code review before merge → suggest /review + - Visual design audit → suggest /design-review + - Ready to deploy / create PR → suggest /ship + - Post-ship doc updates → suggest /document-release + - Weekly retrospective → suggest /retro + + If the user pushes back on skill suggestions ("stop suggesting things", + "I don't need suggestions", "too aggressive"): + 1. Stop suggesting for the rest of this session + 2. Run: gstack-config set proactive false + 3. Say: "Got it — I'll stop suggesting skills. Just tell me to be proactive + again if you change your mind." + + If the user says "be proactive again" or "turn on suggestions": + 1. Run: gstack-config set proactive true + 2. Say: "Proactive suggestions are back on." 
allowed-tools: - Bash - Read diff --git a/VERSION b/VERSION index faef31a4357c48d6e4c55e84c8be8e3bc9055e20..39e898a4f952d339c155a7939d571a5fdd6c8cfc 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.7.0 +0.7.1 diff --git a/browse/SKILL.md b/browse/SKILL.md index bf695d3bf861527b71606944bb6ddbf0535e866e..3c452c8461c6aff562eaaba3cad18133889f793b 100644 --- a/browse/SKILL.md +++ b/browse/SKILL.md @@ -31,8 +31,13 @@ _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" _LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no") echo "LAKE_INTRO: $_LAKE_SEEN" +_PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +echo "PROACTIVE: $_PROACTIVE" ``` +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke +them when the user explicitly asks. The user opted out of proactive suggestions. + If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. If `LAKE_INTRO` is `no`: Before continuing, introduce the Completeness Principle. diff --git a/debug/SKILL.md b/debug/SKILL.md index 4448453a63658654d59a63c4e42a9cda59d3d93d..c1314556d976c9daaa22a5c700f919edd1c2a1a5 100644 --- a/debug/SKILL.md +++ b/debug/SKILL.md @@ -4,6 +4,10 @@ version: 1.0.0 description: | Systematic debugging with root cause investigation. Four phases: investigate, analyze, hypothesize, implement. Iron Law: no fixes without root cause. + Use when asked to "debug this", "fix this bug", "why is this broken", + "investigate this error", or "root cause analysis". + Proactively suggest when the user reports errors, unexpected behavior, or + is troubleshooting why something stopped working. 
allowed-tools: - Bash - Read @@ -30,8 +34,13 @@ _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" _LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no") echo "LAKE_INTRO: $_LAKE_SEEN" +_PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +echo "PROACTIVE: $_PROACTIVE" ``` +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke +them when the user explicitly asks. The user opted out of proactive suggestions. + If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. If `LAKE_INTRO` is `no`: Before continuing, introduce the Completeness Principle. diff --git a/debug/SKILL.md.tmpl b/debug/SKILL.md.tmpl index 312d2420a31ac40c547722ac12b755373bfde786..90fc5bdcb9ef381ad58c50655d1fb45f23f0245f 100644 --- a/debug/SKILL.md.tmpl +++ b/debug/SKILL.md.tmpl @@ -4,6 +4,10 @@ version: 1.0.0 description: | Systematic debugging with root cause investigation. Four phases: investigate, analyze, hypothesize, implement. Iron Law: no fixes without root cause. + Use when asked to "debug this", "fix this bug", "why is this broken", + "investigate this error", or "root cause analysis". + Proactively suggest when the user reports errors, unexpected behavior, or + is troubleshooting why something stopped working. allowed-tools: - Bash - Read diff --git a/design-consultation/SKILL.md b/design-consultation/SKILL.md index c5c5bc29bfa845efc176748bae8b025f9d42f2ad..31cbf815a5960605afc81bb4c1d3ad818c6b3acd 100644 --- a/design-consultation/SKILL.md +++ b/design-consultation/SKILL.md @@ -7,6 +7,8 @@ description: | generates font+color preview pages. 
Creates DESIGN.md as your project's design source of truth. For existing sites, use /plan-design-review to infer the system instead. Use when asked to "design system", "brand guidelines", or "create DESIGN.md". + Proactively suggest when starting a new project's UI with no existing + design system or DESIGN.md. allowed-tools: - Bash - Read @@ -34,8 +36,13 @@ _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" _LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no") echo "LAKE_INTRO: $_LAKE_SEEN" +_PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +echo "PROACTIVE: $_PROACTIVE" ``` +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke +them when the user explicitly asks. The user opted out of proactive suggestions. + If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. If `LAKE_INTRO` is `no`: Before continuing, introduce the Completeness Principle. diff --git a/design-consultation/SKILL.md.tmpl b/design-consultation/SKILL.md.tmpl index 2bc6725533100668f466612b27271fdde5b84b38..2532126c4cb977bb95841aad7759bc06e4942df2 100644 --- a/design-consultation/SKILL.md.tmpl +++ b/design-consultation/SKILL.md.tmpl @@ -7,6 +7,8 @@ description: | generates font+color preview pages. Creates DESIGN.md as your project's design source of truth. For existing sites, use /plan-design-review to infer the system instead. Use when asked to "design system", "brand guidelines", or "create DESIGN.md". + Proactively suggest when starting a new project's UI with no existing + design system or DESIGN.md. 
allowed-tools: - Bash - Read diff --git a/design-review/SKILL.md b/design-review/SKILL.md index 473e419b65da72d928fde5a28ee644e4e950cb8e..dd7fced19c8c3036c7bf439643e084ca1c435727 100644 --- a/design-review/SKILL.md +++ b/design-review/SKILL.md @@ -7,6 +7,8 @@ description: | in source code, committing each fix atomically and re-verifying with before/after screenshots. For plan-mode design review (before implementation), use /plan-design-review. Use when asked to "audit the design", "visual QA", "check if it looks good", or "design polish". + Proactively suggest when the user mentions visual inconsistencies or + wants to polish the look of a live site. allowed-tools: - Bash - Read @@ -34,8 +36,13 @@ _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" _LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no") echo "LAKE_INTRO: $_LAKE_SEEN" +_PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +echo "PROACTIVE: $_PROACTIVE" ``` +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke +them when the user explicitly asks. The user opted out of proactive suggestions. + If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. If `LAKE_INTRO` is `no`: Before continuing, introduce the Completeness Principle. diff --git a/design-review/SKILL.md.tmpl b/design-review/SKILL.md.tmpl index f60a9c412ece647b2e836b302462167b338c33cf..24fe160c45acd9c53c64fe04f12aa6b88bdcb5cb 100644 --- a/design-review/SKILL.md.tmpl +++ b/design-review/SKILL.md.tmpl @@ -7,6 +7,8 @@ description: | in source code, committing each fix atomically and re-verifying with before/after screenshots. 
For plan-mode design review (before implementation), use /plan-design-review. Use when asked to "audit the design", "visual QA", "check if it looks good", or "design polish". + Proactively suggest when the user mentions visual inconsistencies or + wants to polish the look of a live site. allowed-tools: - Bash - Read diff --git a/document-release/SKILL.md b/document-release/SKILL.md index 88af49fb1279dc080c07467cf27bee47f8679441..4831573b4943ae9bd08e6e72c108b2d304e598af 100644 --- a/document-release/SKILL.md +++ b/document-release/SKILL.md @@ -6,6 +6,7 @@ description: | diff, updates README/ARCHITECTURE/CONTRIBUTING/CLAUDE.md to match what shipped, polishes CHANGELOG voice, cleans up TODOS, and optionally bumps VERSION. Use when asked to "update the docs", "sync documentation", or "post-ship docs". + Proactively suggest after a PR is merged or code is shipped. allowed-tools: - Bash - Read @@ -32,8 +33,13 @@ _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" _LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no") echo "LAKE_INTRO: $_LAKE_SEEN" +_PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +echo "PROACTIVE: $_PROACTIVE" ``` +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke +them when the user explicitly asks. The user opted out of proactive suggestions. + If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. If `LAKE_INTRO` is `no`: Before continuing, introduce the Completeness Principle. 
diff --git a/document-release/SKILL.md.tmpl b/document-release/SKILL.md.tmpl index 2cd8d11726201a28f489e88e78a02f2076adb957..0cd1bd57450efa9a3f4fef485074f71c29c11320 100644 --- a/document-release/SKILL.md.tmpl +++ b/document-release/SKILL.md.tmpl @@ -6,6 +6,7 @@ description: | diff, updates README/ARCHITECTURE/CONTRIBUTING/CLAUDE.md to match what shipped, polishes CHANGELOG voice, cleans up TODOS, and optionally bumps VERSION. Use when asked to "update the docs", "sync documentation", or "post-ship docs". + Proactively suggest after a PR is merged or code is shipped. allowed-tools: - Bash - Read diff --git a/office-hours/SKILL.md b/office-hours/SKILL.md index fec01e2699b001bb363b60541bc2b8d578b5923e..da59e1ff56825373dd0eca7f0c62037e7f4ba11f 100644 --- a/office-hours/SKILL.md +++ b/office-hours/SKILL.md @@ -6,6 +6,10 @@ description: | demand reality, status quo, desperate specificity, narrowest wedge, observation, and future-fit. Builder mode: design thinking brainstorming for side projects, hackathons, learning, and open source. Saves a design doc. + Use when asked to "brainstorm this", "I have an idea", "help me think through + this", "office hours", or "is this worth building". + Proactively suggest when the user describes a new product idea or is exploring + whether something is worth building — before any code is written. Use before /plan-ceo-review or /plan-eng-review. allowed-tools: - Bash @@ -33,8 +37,13 @@ _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" _LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no") echo "LAKE_INTRO: $_LAKE_SEEN" +_PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +echo "PROACTIVE: $_PROACTIVE" ``` +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke +them when the user explicitly asks. The user opted out of proactive suggestions. 
+ If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. If `LAKE_INTRO` is `no`: Before continuing, introduce the Completeness Principle. diff --git a/office-hours/SKILL.md.tmpl b/office-hours/SKILL.md.tmpl index 4eec04b64e66ca7b9fecbf149b5c70e03e9eb797..03a8302c720c7558f8eb2fc3462f87b47fc1b0c1 100644 --- a/office-hours/SKILL.md.tmpl +++ b/office-hours/SKILL.md.tmpl @@ -6,6 +6,10 @@ description: | demand reality, status quo, desperate specificity, narrowest wedge, observation, and future-fit. Builder mode: design thinking brainstorming for side projects, hackathons, learning, and open source. Saves a design doc. + Use when asked to "brainstorm this", "I have an idea", "help me think through + this", "office hours", or "is this worth building". + Proactively suggest when the user describes a new product idea or is exploring + whether something is worth building — before any code is written. Use before /plan-ceo-review or /plan-eng-review. 
allowed-tools: - Bash diff --git a/package.json b/package.json index ff8b587063d0f4861184b0bd4cb939a9fd75a6ee..1c580144990bea60003b2d33b23bc489cd392a23 100644 --- a/package.json +++ b/package.json @@ -12,11 +12,11 @@ "gen:skill-docs": "bun run scripts/gen-skill-docs.ts", "dev": "bun run browse/src/cli.ts", "server": "bun run browse/src/server.ts", - "test": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts", - "test:evals": "EVALS=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts", - "test:evals:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts", - "test:e2e": "EVALS=1 bun test test/skill-e2e.test.ts", - "test:e2e:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-e2e.test.ts", + "test": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts --ignore test/skill-routing-e2e.test.ts", + "test:evals": "EVALS=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts test/skill-routing-e2e.test.ts", + "test:evals:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts test/skill-routing-e2e.test.ts", + "test:e2e": "EVALS=1 bun test test/skill-e2e.test.ts test/skill-routing-e2e.test.ts", + "test:e2e:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-e2e.test.ts test/skill-routing-e2e.test.ts", "skill:check": "bun run scripts/skill-check.ts", "dev:skill": "bun run scripts/dev-skill.ts", "start": "bun run browse/src/server.ts", diff --git a/plan-ceo-review/SKILL.md b/plan-ceo-review/SKILL.md index 24a186744a51d3e0d4912e3b6521e750e79dc7cc..ce0395b07571c39880f991531981c433e9515270 100644 --- a/plan-ceo-review/SKILL.md +++ b/plan-ceo-review/SKILL.md @@ -8,6 +8,8 @@ description: | expansions), HOLD SCOPE (maximum rigor), SCOPE REDUCTION (strip to essentials). Use when asked to "think bigger", "expand scope", "strategy review", "rethink this", or "is this ambitious enough". 
+ Proactively suggest when the user is questioning scope or ambition of a plan, + or when the plan looks like it could be more ambitious. allowed-tools: - Read - Grep @@ -32,8 +34,13 @@ _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" _LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no") echo "LAKE_INTRO: $_LAKE_SEEN" +_PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +echo "PROACTIVE: $_PROACTIVE" ``` +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke +them when the user explicitly asks. The user opted out of proactive suggestions. + If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. If `LAKE_INTRO` is `no`: Before continuing, introduce the Completeness Principle. diff --git a/plan-ceo-review/SKILL.md.tmpl b/plan-ceo-review/SKILL.md.tmpl index 16c1b49dd19779962592213bdd14951bd32d4a43..09189af5f553ad09fe6af0a4f113681cbb148403 100644 --- a/plan-ceo-review/SKILL.md.tmpl +++ b/plan-ceo-review/SKILL.md.tmpl @@ -8,6 +8,8 @@ description: | expansions), HOLD SCOPE (maximum rigor), SCOPE REDUCTION (strip to essentials). Use when asked to "think bigger", "expand scope", "strategy review", "rethink this", or "is this ambitious enough". + Proactively suggest when the user is questioning scope or ambition of a plan, + or when the plan looks like it could be more ambitious. 
allowed-tools: - Read - Grep diff --git a/plan-design-review/SKILL.md b/plan-design-review/SKILL.md index 21e37c95b60bfcf8706a6a203ee2e24ba8de2251..faabd32898b54c4231e34607c8c1b64134ed48e9 100644 --- a/plan-design-review/SKILL.md +++ b/plan-design-review/SKILL.md @@ -7,6 +7,8 @@ description: | then fixes the plan to get there. Works in plan mode. For live site visual audits, use /design-review. Use when asked to "review the design plan" or "design critique". + Proactively suggest when the user has a plan with UI/UX components that + should be reviewed before implementation. allowed-tools: - Read - Edit @@ -32,8 +34,13 @@ _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" _LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no") echo "LAKE_INTRO: $_LAKE_SEEN" +_PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +echo "PROACTIVE: $_PROACTIVE" ``` +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke +them when the user explicitly asks. The user opted out of proactive suggestions. + If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. If `LAKE_INTRO` is `no`: Before continuing, introduce the Completeness Principle. diff --git a/plan-design-review/SKILL.md.tmpl b/plan-design-review/SKILL.md.tmpl index e8f9c418aef175d734670bf75fc0a3982599a81c..73e383b68f13fd6de9424a68b33180acd4d23d96 100644 --- a/plan-design-review/SKILL.md.tmpl +++ b/plan-design-review/SKILL.md.tmpl @@ -7,6 +7,8 @@ description: | then fixes the plan to get there. Works in plan mode. For live site visual audits, use /design-review. Use when asked to "review the design plan" or "design critique". 
+ Proactively suggest when the user has a plan with UI/UX components that + should be reviewed before implementation. allowed-tools: - Read - Edit diff --git a/plan-eng-review/SKILL.md b/plan-eng-review/SKILL.md index caafb7925d784187c63dfdcc9459c432d01b9829..d6c6ea2892da6cf6aed663db18667ea0b2abfb1e 100644 --- a/plan-eng-review/SKILL.md +++ b/plan-eng-review/SKILL.md @@ -6,6 +6,8 @@ description: | data flow, diagrams, edge cases, test coverage, performance. Walks through issues interactively with opinionated recommendations. Use when asked to "review the architecture", "engineering review", or "lock in the plan". + Proactively suggest when the user has a plan or design doc and is about to + start coding — to catch architecture issues before implementation. allowed-tools: - Read - Write @@ -31,8 +33,13 @@ _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" _LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no") echo "LAKE_INTRO: $_LAKE_SEEN" +_PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +echo "PROACTIVE: $_PROACTIVE" ``` +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke +them when the user explicitly asks. The user opted out of proactive suggestions. + If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. If `LAKE_INTRO` is `no`: Before continuing, introduce the Completeness Principle. 
diff --git a/plan-eng-review/SKILL.md.tmpl b/plan-eng-review/SKILL.md.tmpl index 1ca2b2988373becc65f91691de0efe6b609f7e1e..6a0b12176eda6dc395e3d2eb3053dd70db8f1f40 100644 --- a/plan-eng-review/SKILL.md.tmpl +++ b/plan-eng-review/SKILL.md.tmpl @@ -6,6 +6,8 @@ description: | data flow, diagrams, edge cases, test coverage, performance. Walks through issues interactively with opinionated recommendations. Use when asked to "review the architecture", "engineering review", or "lock in the plan". + Proactively suggest when the user has a plan or design doc and is about to + start coding — to catch architecture issues before implementation. allowed-tools: - Read - Write diff --git a/qa-only/SKILL.md b/qa-only/SKILL.md index a5684dd7617828085f73f4dcc0a4f7628bda46b3..0e20c5e3b8bfc762b14222d18f94ff5948867a64 100644 --- a/qa-only/SKILL.md +++ b/qa-only/SKILL.md @@ -6,6 +6,7 @@ description: | structured report with health score, screenshots, and repro steps — but never fixes anything. Use when asked to "just report bugs", "qa report only", or "test but don't fix". For the full test-fix-verify loop, use /qa instead. + Proactively suggest when the user wants a bug report without any code changes. allowed-tools: - Bash - Read @@ -29,8 +30,13 @@ _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" _LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no") echo "LAKE_INTRO: $_LAKE_SEEN" +_PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +echo "PROACTIVE: $_PROACTIVE" ``` +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke +them when the user explicitly asks. The user opted out of proactive suggestions. + If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). 
If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. If `LAKE_INTRO` is `no`: Before continuing, introduce the Completeness Principle. diff --git a/qa-only/SKILL.md.tmpl b/qa-only/SKILL.md.tmpl index 831e71ed52ff3fd82cf151131745955d9583e68d..2e2bc4f7847822f82f472825271267fd26dc4491 100644 --- a/qa-only/SKILL.md.tmpl +++ b/qa-only/SKILL.md.tmpl @@ -6,6 +6,7 @@ description: | structured report with health score, screenshots, and repro steps — but never fixes anything. Use when asked to "just report bugs", "qa report only", or "test but don't fix". For the full test-fix-verify loop, use /qa instead. + Proactively suggest when the user wants a bug report without any code changes. allowed-tools: - Bash - Read diff --git a/qa/SKILL.md b/qa/SKILL.md index 2d12fca81a0cefdf2361911945ceeb43dd474aee..8ee176be58d835c2367ac9ebd3779b4aec701d87 100644 --- a/qa/SKILL.md +++ b/qa/SKILL.md @@ -5,7 +5,9 @@ description: | Systematically QA test a web application and fix bugs found. Runs QA testing, then iteratively fixes bugs in source code, committing each fix atomically and re-verifying. Use when asked to "qa", "QA", "test this site", "find bugs", - "test and fix", or "fix what's broken". Three tiers: Quick (critical/high only), + "test and fix", or "fix what's broken". + Proactively suggest when the user says a feature is ready for testing + or asks "does this work?". Three tiers: Quick (critical/high only), Standard (+ medium), Exhaustive (+ cosmetic). Produces before/after health scores, fix evidence, and a ship-readiness summary. For report-only mode, use /qa-only. 
allowed-tools: @@ -35,8 +37,13 @@ _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" _LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no") echo "LAKE_INTRO: $_LAKE_SEEN" +_PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +echo "PROACTIVE: $_PROACTIVE" ``` +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke +them when the user explicitly asks. The user opted out of proactive suggestions. + If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue. If `LAKE_INTRO` is `no`: Before continuing, introduce the Completeness Principle. diff --git a/qa/SKILL.md.tmpl b/qa/SKILL.md.tmpl index bd94debe73a74495288fcac3851f027b2e624f88..292f714003a1bb5bfaeb9827cdf4c05be0d2f3e0 100644 --- a/qa/SKILL.md.tmpl +++ b/qa/SKILL.md.tmpl @@ -5,7 +5,9 @@ description: | Systematically QA test a web application and fix bugs found. Runs QA testing, then iteratively fixes bugs in source code, committing each fix atomically and re-verifying. Use when asked to "qa", "QA", "test this site", "find bugs", - "test and fix", or "fix what's broken". Three tiers: Quick (critical/high only), + "test and fix", or "fix what's broken". + Proactively suggest when the user says a feature is ready for testing + or asks "does this work?". Three tiers: Quick (critical/high only), Standard (+ medium), Exhaustive (+ cosmetic). Produces before/after health scores, fix evidence, and a ship-readiness summary. For report-only mode, use /qa-only. 
allowed-tools: diff --git a/retro/SKILL.md b/retro/SKILL.md index bb6bcbe9118c37d0e066e4f4fbfd7fb7bb434cda..90fb547e57c12b083d3cc7bcc303ce58aa7f8f6c 100644 --- a/retro/SKILL.md +++ b/retro/SKILL.md @@ -6,6 +6,7 @@ description: | and code quality metrics with persistent history and trend tracking. Team-aware: breaks down per-person contributions with praise and growth areas. Use when asked to "weekly retro", "what did we ship", or "engineering retrospective". + Proactively suggest at the end of a work week or sprint. allowed-tools: - Bash - Read @@ -30,8 +31,13 @@ _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" _LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no") echo "LAKE_INTRO: $_LAKE_SEEN" +_PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +echo "PROACTIVE: $_PROACTIVE" ``` +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke +them when the user explicitly asks. The user opted out of proactive suggestions. + If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue. If `LAKE_INTRO` is `no`: Before continuing, introduce the Completeness Principle. diff --git a/retro/SKILL.md.tmpl b/retro/SKILL.md.tmpl index 92d5c40bf4b2da303cbf87675cfa65270ae5593f..41a48e7fd4162041cbfa6ba38b8fa6b81a083305 100644 --- a/retro/SKILL.md.tmpl +++ b/retro/SKILL.md.tmpl @@ -6,6 +6,7 @@ description: | and code quality metrics with persistent history and trend tracking. Team-aware: breaks down per-person contributions with praise and growth areas. Use when asked to "weekly retro", "what did we ship", or "engineering retrospective". 
+ Proactively suggest at the end of a work week or sprint. allowed-tools: - Bash - Read diff --git a/review/SKILL.md b/review/SKILL.md index 354e715b76b88eaa3e4a287f66bbd85b9e0aebdb..b2da378d8659f2b0fd87b289a7a75db527f981b4 100644 --- a/review/SKILL.md +++ b/review/SKILL.md @@ -5,6 +5,7 @@ description: | Pre-landing PR review. Analyzes diff against the base branch for SQL safety, LLM trust boundary violations, conditional side effects, and other structural issues. Use when asked to "review this PR", "code review", "pre-landing review", or "check my diff". + Proactively suggest when the user is about to merge or land code changes. allowed-tools: - Bash - Read @@ -31,8 +32,13 @@ _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" _LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no") echo "LAKE_INTRO: $_LAKE_SEEN" +_PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +echo "PROACTIVE: $_PROACTIVE" ``` +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke +them when the user explicitly asks. The user opted out of proactive suggestions. + If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue. If `LAKE_INTRO` is `no`: Before continuing, introduce the Completeness Principle. diff --git a/review/SKILL.md.tmpl b/review/SKILL.md.tmpl index 7094a15643748a3a2905143f08cb9ed7e9bd3836..20e2cf12d0861f4ac02112da663084ce3cf5ef7d 100644 --- a/review/SKILL.md.tmpl +++ b/review/SKILL.md.tmpl @@ -5,6 +5,7 @@ description: | Pre-landing PR review. Analyzes diff against the base branch for SQL safety, LLM trust boundary violations, conditional side effects, and other structural issues. 
Use when asked to "review this PR", "code review", "pre-landing review", or "check my diff". + Proactively suggest when the user is about to merge or land code changes. allowed-tools: - Bash - Read diff --git a/scripts/gen-skill-docs.ts b/scripts/gen-skill-docs.ts index 3d569d356588b0eb6f5301259680804f42eb1c23..a53d186452508dc562479a7c7a050d4bd072e97f 100644 --- a/scripts/gen-skill-docs.ts +++ b/scripts/gen-skill-docs.ts @@ -109,8 +109,13 @@ _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" _LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no") echo "LAKE_INTRO: $_LAKE_SEEN" +_PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +echo "PROACTIVE: $_PROACTIVE" \`\`\` +If \`PROACTIVE\` is \`"false"\`, do not proactively suggest gstack skills — only invoke +them when the user explicitly asks. The user opted out of proactive suggestions. + If output shows \`UPGRADE_AVAILABLE \`: read \`~/.claude/skills/gstack/gstack-upgrade/SKILL.md\` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If \`JUST_UPGRADED \`: tell user "Running gstack v{to} (just updated!)" and continue. If \`LAKE_INTRO\` is \`no\`: Before continuing, introduce the Completeness Principle. 
diff --git a/setup-browser-cookies/SKILL.md b/setup-browser-cookies/SKILL.md index 3ae00a6bc86f7507d1e0ab8ee65dacbd3293cf13..ad9d5fbb71d0a9198db96fa9622d6204ef965a6b 100644 --- a/setup-browser-cookies/SKILL.md +++ b/setup-browser-cookies/SKILL.md @@ -28,8 +28,13 @@ _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" _LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no") echo "LAKE_INTRO: $_LAKE_SEEN" +_PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +echo "PROACTIVE: $_PROACTIVE" ``` +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke +them when the user explicitly asks. The user opted out of proactive suggestions. + If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue. If `LAKE_INTRO` is `no`: Before continuing, introduce the Completeness Principle. diff --git a/ship/SKILL.md b/ship/SKILL.md index 3f0f006745e5b7ae2697ff6da098f0c4d8e5d710..97f26fa2eadac204924ce38d07afecdb63ed1a2c 100644 --- a/ship/SKILL.md +++ b/ship/SKILL.md @@ -3,6 +3,7 @@ name: ship version: 1.0.0 description: | Ship workflow: detect + merge base branch, run tests, review diff, bump VERSION, update CHANGELOG, commit, push, create PR. Use when asked to "ship", "deploy", "push to main", "create a PR", or "merge and push". + Proactively suggest when the user says code is ready or asks about deploying. 
allowed-tools: - Bash - Read @@ -30,8 +31,13 @@ _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" _LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no") echo "LAKE_INTRO: $_LAKE_SEEN" +_PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +echo "PROACTIVE: $_PROACTIVE" ``` +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke +them when the user explicitly asks. The user opted out of proactive suggestions. + If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue. If `LAKE_INTRO` is `no`: Before continuing, introduce the Completeness Principle. diff --git a/ship/SKILL.md.tmpl b/ship/SKILL.md.tmpl index aef5c9d30aa0b74ec5165f7cded32e06cd9217ef..ed7a7f0770d3585ef861ad238fa1a02d9179a01a 100644 --- a/ship/SKILL.md.tmpl +++ b/ship/SKILL.md.tmpl @@ -3,6 +3,7 @@ name: ship version: 1.0.0 description: | Ship workflow: detect + merge base branch, run tests, review diff, bump VERSION, update CHANGELOG, commit, push, create PR. Use when asked to "ship", "deploy", "push to main", "create a PR", or "merge and push". + Proactively suggest when the user says code is ready or asks about deploying. 
allowed-tools: - Bash - Read diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index 995648a1bb2d32847d4a407d0d291fb88b044572..8afe84471664c62cd4a781afa049edffefdb199e 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -90,6 +90,19 @@ export const E2E_TOUCHFILES: Record = { // gstack-upgrade 'gstack-upgrade-happy-path': ['gstack-upgrade/**'], + + // Skill routing — journey-stage tests (depend on ALL skill descriptions) + 'journey-ideation': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'], + 'journey-plan-eng': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'], + 'journey-think-bigger': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'], + 'journey-debug': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'], + 'journey-qa': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'], + 'journey-code-review': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'], + 'journey-ship': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'], + 'journey-docs': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'], + 'journey-retro': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'], + 'journey-design-system': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'], + 'journey-visual-qa': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'], }; /** diff --git a/test/skill-routing-e2e.test.ts b/test/skill-routing-e2e.test.ts new file mode 100644 index 0000000000000000000000000000000000000000..ee2d84b41a1373c2adfca040492c087bef33db6d --- /dev/null +++ b/test/skill-routing-e2e.test.ts @@ -0,0 +1,605 @@ +import { describe, test, expect, afterAll } from 'bun:test'; +import { runSkillTest } from './helpers/session-runner'; +import type { SkillTestResult } from './helpers/session-runner'; +import { EvalCollector } from './helpers/eval-store'; +import type { EvalTestEntry } from 
'./helpers/eval-store'; +import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES } from './helpers/touchfiles'; +import { spawnSync } from 'child_process'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; + +const ROOT = path.resolve(import.meta.dir, '..'); + +// Skip unless EVALS=1. +const evalsEnabled = !!process.env.EVALS; +const describeE2E = evalsEnabled ? describe : describe.skip; + +// Eval result collector +const evalCollector = evalsEnabled ? new EvalCollector('e2e-routing') : null; + +// Unique run ID for this session +const runId = new Date().toISOString().replace(/[:.]/g, '').replace('T', '-').slice(0, 15); + +// --- Diff-based test selection --- +// Journey routing tests use E2E_TOUCHFILES (entries prefixed 'journey-' in touchfiles.ts). +let selectedTests: string[] | null = null; + +if (evalsEnabled && !process.env.EVALS_ALL) { + const baseBranch = process.env.EVALS_BASE + || detectBaseBranch(ROOT) + || 'main'; + const changedFiles = getChangedFiles(baseBranch, ROOT); + + if (changedFiles.length > 0) { + const selection = selectTests(changedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES); + selectedTests = selection.selected; + process.stderr.write(`\nRouting E2E selection (${selection.reason}): ${selection.selected.length}/${Object.keys(E2E_TOUCHFILES).length} tests\n`); + if (selection.skipped.length > 0) { + process.stderr.write(` Skipped: ${selection.skipped.join(', ')}\n`); + } + process.stderr.write('\n'); + } +} + +// --- Helper functions --- + +/** Copy all SKILL.md files into tmpDir/.claude/skills/gstack/ for auto-discovery */ +function installSkills(tmpDir: string) { + const skillDirs = [ + '', // root gstack SKILL.md + 'qa', 'qa-only', 'ship', 'review', 'plan-ceo-review', 'plan-eng-review', + 'plan-design-review', 'design-review', 'design-consultation', 'retro', + 'document-release', 'debug', 'office-hours', 'browse', 'setup-browser-cookies', + 'gstack-upgrade', 'humanizer', + 
]; + + for (const skill of skillDirs) { + const srcPath = path.join(ROOT, skill, 'SKILL.md'); + if (!fs.existsSync(srcPath)) continue; + + const destDir = skill + ? path.join(tmpDir, '.claude', 'skills', 'gstack', skill) + : path.join(tmpDir, '.claude', 'skills', 'gstack'); + fs.mkdirSync(destDir, { recursive: true }); + fs.copyFileSync(srcPath, path.join(destDir, 'SKILL.md')); + } +} + +/** Init a git repo with config */ +function initGitRepo(dir: string) { + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: dir, stdio: 'pipe', timeout: 5000 }); + run('git', ['init']); + run('git', ['config', 'user.email', 'test@test.com']); + run('git', ['config', 'user.name', 'Test']); +} + +function logCost(label: string, result: { costEstimate: { turnsUsed: number; estimatedTokens: number; estimatedCost: number }; duration: number }) { + const { turnsUsed, estimatedTokens, estimatedCost } = result.costEstimate; + const durationSec = Math.round(result.duration / 1000); + console.log(`${label}: $${estimatedCost.toFixed(2)} (${turnsUsed} turns, ${(estimatedTokens / 1000).toFixed(1)}k tokens, ${durationSec}s)`); +} + +function recordRouting(name: string, result: SkillTestResult, expectedSkill: string, actualSkill: string | undefined) { + evalCollector?.addTest({ + name, + suite: 'Skill Routing E2E', + tier: 'e2e', + passed: actualSkill === expectedSkill, + duration_ms: result.duration, + cost_usd: result.costEstimate.estimatedCost, + transcript: result.transcript, + output: result.output?.slice(0, 2000), + turns_used: result.costEstimate.turnsUsed, + exit_reason: result.exitReason, + }); +} + +// --- Tests --- + +describeE2E('Skill Routing E2E — Developer Journey', () => { + afterAll(() => { + evalCollector?.finalize(); + }); + + test('journey-ideation', async () => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-ideation-')); + try { + initGitRepo(tmpDir); + installSkills(tmpDir); + fs.writeFileSync(path.join(tmpDir, 'README.md'), '# New 
Project\n'); + spawnSync('git', ['add', '.'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 }); + spawnSync('git', ['commit', '-m', 'initial'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 }); + + const testName = 'journey-ideation'; + const expectedSkill = 'office-hours'; + const result = await runSkillTest({ + prompt: "I've been thinking about building a waitlist management tool for restaurants. The existing solutions are expensive and overcomplicated. I want something simple — a tablet app where hosts can add parties, see wait times, and text customers when their table is ready. Help me think through whether this is worth building and what the key design decisions are.", + workingDirectory: tmpDir, + maxTurns: 5, + allowedTools: ['Skill', 'Read', 'Bash', 'Glob', 'Grep'], + timeout: 60_000, + testName, + runId, + }); + + const skillCalls = result.toolCalls.filter(tc => tc.tool === 'Skill'); + const actualSkill = skillCalls.length > 0 ? skillCalls[0]?.input?.skill : undefined; + + logCost(`journey: ${testName}`, result); + recordRouting(testName, result, expectedSkill, actualSkill); + + expect(skillCalls.length, `Expected Skill tool to be called but got 0 calls. Claude may have answered directly without invoking a skill. 
Tool calls: ${result.toolCalls.map(tc => tc.tool).join(', ')}`).toBeGreaterThan(0); + expect([expectedSkill], `Expected skill ${expectedSkill} but got ${actualSkill}`).toContain(actualSkill); + } finally { + fs.rmSync(tmpDir, { recursive: true, force: true }); + } + }, 90_000); + + test('journey-plan-eng', async () => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-plan-eng-')); + try { + initGitRepo(tmpDir); + installSkills(tmpDir); + fs.writeFileSync(path.join(tmpDir, 'plan.md'), `# Waitlist App Architecture + +## Components +- REST API (Express.js) +- PostgreSQL database +- React frontend +- SMS integration (Twilio) + +## Data Model +- restaurants (id, name, settings) +- parties (id, restaurant_id, name, size, phone, status, created_at) +- wait_estimates (id, restaurant_id, avg_wait_minutes) + +## API Endpoints +- POST /api/parties - add party to waitlist +- GET /api/parties - list current waitlist +- PATCH /api/parties/:id/status - update party status +- GET /api/estimate - get current wait estimate +`); + spawnSync('git', ['add', '.'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 }); + spawnSync('git', ['commit', '-m', 'initial'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 }); + + const testName = 'journey-plan-eng'; + const expectedSkill = 'plan-eng-review'; + const result = await runSkillTest({ + prompt: "I wrote up a plan for the waitlist app in plan.md. Can you take a look at the architecture and make sure I'm not missing any edge cases or failure modes before I start coding?", + workingDirectory: tmpDir, + maxTurns: 5, + allowedTools: ['Skill', 'Read', 'Bash', 'Glob', 'Grep'], + timeout: 60_000, + testName, + runId, + }); + + const skillCalls = result.toolCalls.filter(tc => tc.tool === 'Skill'); + const actualSkill = skillCalls.length > 0 ? 
skillCalls[0]?.input?.skill : undefined; + + logCost(`journey: ${testName}`, result); + recordRouting(testName, result, expectedSkill, actualSkill); + + expect(skillCalls.length, `Expected Skill tool to be called but got 0 calls. Claude may have answered directly without invoking a skill. Tool calls: ${result.toolCalls.map(tc => tc.tool).join(', ')}`).toBeGreaterThan(0); + expect([expectedSkill], `Expected skill ${expectedSkill} but got ${actualSkill}`).toContain(actualSkill); + } finally { + fs.rmSync(tmpDir, { recursive: true, force: true }); + } + }, 90_000); + + test('journey-think-bigger', async () => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-think-bigger-')); + try { + initGitRepo(tmpDir); + installSkills(tmpDir); + fs.writeFileSync(path.join(tmpDir, 'plan.md'), `# Waitlist App Architecture + +## Components +- REST API (Express.js) +- PostgreSQL database +- React frontend +- SMS integration (Twilio) + +## Data Model +- restaurants (id, name, settings) +- parties (id, restaurant_id, name, size, phone, status, created_at) +- wait_estimates (id, restaurant_id, avg_wait_minutes) + +## API Endpoints +- POST /api/parties - add party to waitlist +- GET /api/parties - list current waitlist +- PATCH /api/parties/:id/status - update party status +- GET /api/estimate - get current wait estimate +`); + spawnSync('git', ['add', '.'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 }); + spawnSync('git', ['commit', '-m', 'initial'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 }); + + const testName = 'journey-think-bigger'; + const expectedSkill = 'plan-ceo-review'; + const result = await runSkillTest({ + prompt: "Actually, looking at this plan again, I feel like we're thinking too small. We're just doing waitlists but what about the whole restaurant guest experience? 
Is there a bigger opportunity here we should go after?", + workingDirectory: tmpDir, + maxTurns: 5, + allowedTools: ['Skill', 'Read', 'Bash', 'Glob', 'Grep'], + timeout: 120_000, + testName, + runId, + }); + + const skillCalls = result.toolCalls.filter(tc => tc.tool === 'Skill'); + const actualSkill = skillCalls.length > 0 ? skillCalls[0]?.input?.skill : undefined; + + logCost(`journey: ${testName}`, result); + recordRouting(testName, result, expectedSkill, actualSkill); + + expect(skillCalls.length, `Expected Skill tool to be called but got 0 calls. Claude may have answered directly without invoking a skill. Tool calls: ${result.toolCalls.map(tc => tc.tool).join(', ')}`).toBeGreaterThan(0); + expect([expectedSkill], `Expected skill ${expectedSkill} but got ${actualSkill}`).toContain(actualSkill); + } finally { + fs.rmSync(tmpDir, { recursive: true, force: true }); + } + }, 180_000); + + test('journey-debug', async () => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-debug-')); + try { + initGitRepo(tmpDir); + installSkills(tmpDir); + + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 }); + + fs.mkdirSync(path.join(tmpDir, 'src'), { recursive: true }); + fs.writeFileSync(path.join(tmpDir, 'src/api.ts'), ` +import express from 'express'; +const app = express(); + +app.get('/api/waitlist', async (req, res) => { + const db = req.app.locals.db; + const parties = await db.query('SELECT * FROM parties WHERE status = $1', ['waiting']); + res.json(parties.rows); +}); + +export default app; +`); + fs.writeFileSync(path.join(tmpDir, 'error.log'), ` +[2026-03-18T10:23:45Z] ERROR: GET /api/waitlist - 500 Internal Server Error + TypeError: Cannot read properties of undefined (reading 'query') + at /src/api.ts:5:32 + at Layer.handle [as handle_request] (/node_modules/express/lib/router/layer.js:95:5) +[2026-03-18T10:23:46Z] ERROR: GET /api/waitlist - 500 Internal Server Error + TypeError: Cannot 
read properties of undefined (reading 'query') +`); + + run('git', ['add', '.']); + run('git', ['commit', '-m', 'initial']); + run('git', ['checkout', '-b', 'feature/waitlist-api']); + + const testName = 'journey-debug'; + const expectedSkill = 'debug'; + const result = await runSkillTest({ + prompt: "The GET /api/waitlist endpoint was working fine yesterday but now it's returning 500 errors. The tests are passing locally but the endpoint fails when I hit it with curl. Can you figure out what's going on?", + workingDirectory: tmpDir, + maxTurns: 5, + allowedTools: ['Skill', 'Read', 'Bash', 'Glob', 'Grep'], + timeout: 60_000, + testName, + runId, + }); + + const skillCalls = result.toolCalls.filter(tc => tc.tool === 'Skill'); + const actualSkill = skillCalls.length > 0 ? skillCalls[0]?.input?.skill : undefined; + + logCost(`journey: ${testName}`, result); + recordRouting(testName, result, expectedSkill, actualSkill); + + expect(skillCalls.length, `Expected Skill tool to be called but got 0 calls. Claude may have answered directly without invoking a skill. Tool calls: ${result.toolCalls.map(tc => tc.tool).join(', ')}`).toBeGreaterThan(0); + expect([expectedSkill], `Expected skill ${expectedSkill} but got ${actualSkill}`).toContain(actualSkill); + } finally { + fs.rmSync(tmpDir, { recursive: true, force: true }); + } + }, 90_000); + + test('journey-qa', async () => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-qa-')); + try { + initGitRepo(tmpDir); + installSkills(tmpDir); + + fs.writeFileSync(path.join(tmpDir, 'package.json'), JSON.stringify({ name: 'waitlist-app', scripts: { dev: 'next dev' } }, null, 2)); + fs.mkdirSync(path.join(tmpDir, 'src'), { recursive: true }); + fs.writeFileSync(path.join(tmpDir, 'src/index.html'), '

Waitlist App

'); + spawnSync('git', ['add', '.'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 }); + spawnSync('git', ['commit', '-m', 'initial'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 }); + + const testName = 'journey-qa'; + const expectedSkill = 'qa'; + const alternateSkills = ['qa-only', 'browse']; + const result = await runSkillTest({ + prompt: "I think the app is mostly working now. Can you go through the site and test everything — find any bugs and fix them?", + workingDirectory: tmpDir, + maxTurns: 5, + allowedTools: ['Skill', 'Read', 'Bash', 'Glob', 'Grep'], + timeout: 60_000, + testName, + runId, + }); + + const skillCalls = result.toolCalls.filter(tc => tc.tool === 'Skill'); + const actualSkill = skillCalls.length > 0 ? skillCalls[0]?.input?.skill : undefined; + const acceptable = [expectedSkill, ...alternateSkills]; + + logCost(`journey: ${testName}`, result); + recordRouting(testName, result, expectedSkill, actualSkill); + + expect(skillCalls.length, `Expected Skill tool to be called but got 0 calls. Claude may have answered directly without invoking a skill. 
Tool calls: ${result.toolCalls.map(tc => tc.tool).join(', ')}`).toBeGreaterThan(0); + expect(acceptable, `Expected skill ${expectedSkill} but got ${actualSkill}`).toContain(actualSkill); + } finally { + fs.rmSync(tmpDir, { recursive: true, force: true }); + } + }, 90_000); + + test('journey-code-review', async () => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-code-review-')); + try { + initGitRepo(tmpDir); + installSkills(tmpDir); + + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 }); + + fs.writeFileSync(path.join(tmpDir, 'app.ts'), '// base\n'); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'initial']); + run('git', ['checkout', '-b', 'feature/add-waitlist']); + fs.writeFileSync(path.join(tmpDir, 'app.ts'), '// updated with waitlist feature\nimport { WaitlistService } from "./waitlist";\n'); + fs.writeFileSync(path.join(tmpDir, 'waitlist.ts'), 'export class WaitlistService {\n async addParty(name: string, size: number) {\n // TODO: implement\n }\n}\n'); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'feat: add waitlist service']); + + const testName = 'journey-code-review'; + const expectedSkill = 'review'; + const result = await runSkillTest({ + prompt: "I'm about to merge this into main. Can you look over my changes and flag anything risky before I land it?", + workingDirectory: tmpDir, + maxTurns: 5, + allowedTools: ['Skill', 'Read', 'Bash', 'Glob', 'Grep'], + timeout: 60_000, + testName, + runId, + }); + + const skillCalls = result.toolCalls.filter(tc => tc.tool === 'Skill'); + const actualSkill = skillCalls.length > 0 ? skillCalls[0]?.input?.skill : undefined; + + logCost(`journey: ${testName}`, result); + recordRouting(testName, result, expectedSkill, actualSkill); + + expect(skillCalls.length, `Expected Skill tool to be called but got 0 calls. Claude may have answered directly without invoking a skill. 
Tool calls: ${result.toolCalls.map(tc => tc.tool).join(', ')}`).toBeGreaterThan(0); + expect([expectedSkill], `Expected skill ${expectedSkill} but got ${actualSkill}`).toContain(actualSkill); + } finally { + fs.rmSync(tmpDir, { recursive: true, force: true }); + } + }, 90_000); + + test('journey-ship', async () => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-ship-')); + try { + initGitRepo(tmpDir); + installSkills(tmpDir); + + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 }); + + fs.writeFileSync(path.join(tmpDir, 'app.ts'), '// base\n'); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'initial']); + run('git', ['checkout', '-b', 'feature/waitlist']); + fs.writeFileSync(path.join(tmpDir, 'app.ts'), '// waitlist feature\n'); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'feat: waitlist']); + + const testName = 'journey-ship'; + const expectedSkill = 'ship'; + const result = await runSkillTest({ + prompt: "This looks good. Let's get it deployed — push the code up and create a PR.", + workingDirectory: tmpDir, + maxTurns: 5, + allowedTools: ['Skill', 'Read', 'Bash', 'Glob', 'Grep'], + timeout: 60_000, + testName, + runId, + }); + + const skillCalls = result.toolCalls.filter(tc => tc.tool === 'Skill'); + const actualSkill = skillCalls.length > 0 ? skillCalls[0]?.input?.skill : undefined; + + logCost(`journey: ${testName}`, result); + recordRouting(testName, result, expectedSkill, actualSkill); + + expect(skillCalls.length, `Expected Skill tool to be called but got 0 calls. Claude may have answered directly without invoking a skill. 
Tool calls: ${result.toolCalls.map(tc => tc.tool).join(', ')}`).toBeGreaterThan(0); + expect([expectedSkill], `Expected skill ${expectedSkill} but got ${actualSkill}`).toContain(actualSkill); + } finally { + fs.rmSync(tmpDir, { recursive: true, force: true }); + } + }, 90_000); + + test('journey-docs', async () => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-docs-')); + try { + initGitRepo(tmpDir); + installSkills(tmpDir); + + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 }); + + fs.writeFileSync(path.join(tmpDir, 'README.md'), '# Waitlist App\nA simple waitlist management tool.\n'); + fs.mkdirSync(path.join(tmpDir, 'src'), { recursive: true }); + fs.writeFileSync(path.join(tmpDir, 'src/api.ts'), '// API code\n'); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'feat: ship waitlist feature']); + + const testName = 'journey-docs'; + const expectedSkill = 'document-release'; + const result = await runSkillTest({ + prompt: "We just shipped the waitlist feature. Can you go through the README and any other docs and make sure they match what we actually built?", + workingDirectory: tmpDir, + maxTurns: 5, + allowedTools: ['Skill', 'Read', 'Bash', 'Glob', 'Grep'], + timeout: 60_000, + testName, + runId, + }); + + const skillCalls = result.toolCalls.filter(tc => tc.tool === 'Skill'); + const actualSkill = skillCalls.length > 0 ? skillCalls[0]?.input?.skill : undefined; + + logCost(`journey: ${testName}`, result); + recordRouting(testName, result, expectedSkill, actualSkill); + + expect(skillCalls.length, `Expected Skill tool to be called but got 0 calls. Claude may have answered directly without invoking a skill. 
Tool calls: ${result.toolCalls.map(tc => tc.tool).join(', ')}`).toBeGreaterThan(0); + expect([expectedSkill], `Expected skill ${expectedSkill} but got ${actualSkill}`).toContain(actualSkill); + } finally { + fs.rmSync(tmpDir, { recursive: true, force: true }); + } + }, 90_000); + + /* Journey retro: a Friday what-did-we-ship question is expected to route to the retro skill. */ test('journey-retro', async () => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-retro-')); + try { + initGitRepo(tmpDir); + installSkills(tmpDir); + + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 }); + + /* Fixture history: three commits, each passing an explicit --date (2026-03-12, 2026-03-13, 2026-03-14). NOTE(review): git commit --date overrides the author date; confirm whether the committer date also needs pinning for week-based log queries. */ fs.writeFileSync(path.join(tmpDir, 'api.ts'), 'export function getParties() { return []; }\n'); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'feat: add parties API', '--date', '2026-03-12T09:30:00']); + + /* NOTE(review): the ui.tsx source string below spans several lines in this copy and shows no JSX tags; confirm against the original test file. */ fs.writeFileSync(path.join(tmpDir, 'ui.tsx'), 'export function WaitlistView() { return
Waitlist
; }\n'); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'feat: add waitlist UI', '--date', '2026-03-13T14:00:00']); + + fs.writeFileSync(path.join(tmpDir, 'README.md'), '# Waitlist App\n'); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'docs: add README', '--date', '2026-03-14T16:00:00']); + + const testName = 'journey-retro'; + const expectedSkill = 'retro'; + const result = await runSkillTest({ + prompt: "It's Friday. What did we ship this week? I want to do a quick retrospective on what the team accomplished.", + workingDirectory: tmpDir, + maxTurns: 5, + allowedTools: ['Skill', 'Read', 'Bash', 'Glob', 'Grep'], + timeout: 60_000, + testName, + runId, + }); + + /* Only the first Skill tool call is compared with expectedSkill; actualSkill stays undefined when there is none. */ const skillCalls = result.toolCalls.filter(tc => tc.tool === 'Skill'); + const actualSkill = skillCalls.length > 0 ? skillCalls[0]?.input?.skill : undefined; + + logCost(`journey: ${testName}`, result); + recordRouting(testName, result, expectedSkill, actualSkill); + + expect(skillCalls.length, `Expected Skill tool to be called but got 0 calls. Claude may have answered directly without invoking a skill. 
Tool calls: ${result.toolCalls.map(tc => tc.tool).join(', ')}`).toBeGreaterThan(0); + expect([expectedSkill], `Expected skill ${expectedSkill} but got ${actualSkill}`).toContain(actualSkill); + } finally { + fs.rmSync(tmpDir, { recursive: true, force: true }); + } + }, 90_000); + + /* Journey design-system: a request for typography, colors, spacing and brand guidelines before any UI exists is expected to route to the design-consultation skill. */ test('journey-design-system', async () => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-design-system-')); + try { + initGitRepo(tmpDir); + installSkills(tmpDir); + + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 }); + + /* Fixture: a single commit containing only a package.json with the project name. */ fs.writeFileSync(path.join(tmpDir, 'package.json'), JSON.stringify({ name: 'waitlist-app' }, null, 2)); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'initial']); + + const testName = 'journey-design-system'; + const expectedSkill = 'design-consultation'; + const result = await runSkillTest({ + prompt: "Before we build the UI, I want to establish a design system — typography, colors, spacing, the whole thing. Can you put together brand guidelines for this project?", + workingDirectory: tmpDir, + maxTurns: 5, + allowedTools: ['Skill', 'Read', 'Bash', 'Glob', 'Grep'], + timeout: 60_000, + testName, + runId, + }); + + /* Only the first Skill tool call is compared with expectedSkill; actualSkill stays undefined when there is none. */ const skillCalls = result.toolCalls.filter(tc => tc.tool === 'Skill'); + const actualSkill = skillCalls.length > 0 ? skillCalls[0]?.input?.skill : undefined; + + logCost(`journey: ${testName}`, result); + recordRouting(testName, result, expectedSkill, actualSkill); + + expect(skillCalls.length, `Expected Skill tool to be called but got 0 calls. Claude may have answered directly without invoking a skill. 
Tool calls: ${result.toolCalls.map(tc => tc.tool).join(', ')}`).toBeGreaterThan(0); + expect([expectedSkill], `Expected skill ${expectedSkill} but got ${actualSkill}`).toContain(actualSkill); + } finally { + fs.rmSync(tmpDir, { recursive: true, force: true }); + } + }, 90_000); + + /* Journey visual-qa: a complaint about inconsistent spacing and font sizes on an existing page is expected to route to the design-review skill. */ test('journey-visual-qa', async () => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-visual-qa-')); + try { + initGitRepo(tmpDir); + installSkills(tmpDir); + + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 }); + + fs.mkdirSync(path.join(tmpDir, 'src'), { recursive: true }); + fs.writeFileSync(path.join(tmpDir, 'src/styles.css'), ` +body { font-family: sans-serif; } +.header { font-size: 24px; margin: 20px; } +.card { padding: 16px; margin: 8px; border: 1px solid #ccc; } +.button { background: #007bff; color: white; padding: 10px 20px; } +`); + /* NOTE(review): the index.html template below holds only text and no tags in this copy; markup may have been lost, confirm against the original test file. */ fs.writeFileSync(path.join(tmpDir, 'src/index.html'), ` + + +
Waitlist
+
Party of 4 - Smith
+
Party of 2 - Jones
+ + +`); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'initial UI']); + + const testName = 'journey-visual-qa'; + const expectedSkill = 'design-review'; + const result = await runSkillTest({ + prompt: "Something looks off on the site. The spacing between sections is inconsistent and the font sizes don't feel right. Can you audit the visual design and fix anything that doesn't look polished?", + workingDirectory: tmpDir, + maxTurns: 5, + allowedTools: ['Skill', 'Read', 'Bash', 'Glob', 'Grep'], + timeout: 60_000, + testName, + runId, + }); + + /* Only the first Skill tool call is compared with expectedSkill; actualSkill stays undefined when there is none. */ const skillCalls = result.toolCalls.filter(tc => tc.tool === 'Skill'); + const actualSkill = skillCalls.length > 0 ? skillCalls[0]?.input?.skill : undefined; + + logCost(`journey: ${testName}`, result); + recordRouting(testName, result, expectedSkill, actualSkill); + + expect(skillCalls.length, `Expected Skill tool to be called but got 0 calls. Claude may have answered directly without invoking a skill. Tool calls: ${result.toolCalls.map(tc => tc.tool).join(', ')}`).toBeGreaterThan(0); + expect([expectedSkill], `Expected skill ${expectedSkill} but got ${actualSkill}`).toContain(actualSkill); + } finally { + fs.rmSync(tmpDir, { recursive: true, force: true }); + } + }, 90_000); +}); diff --git a/test/skill-validation.test.ts b/test/skill-validation.test.ts index 3687ecee6d868ef4b2648b3a5bb2331995d11ee5..292c1a81c0372f395f6797bfaaec30ad3d41042a 100644 --- a/test/skill-validation.test.ts +++ b/test/skill-validation.test.ts @@ -1120,3 +1120,40 @@ describe('QA report template', () => { expect(content).toContain('**Precondition:**'); }); }); + +// --- Trigger phrase validation --- + +describe('Skill trigger phrases', () => { + // Skills that must have "Use when" trigger phrases in their description. 
+ // Excluded: root gstack (browser tool), gstack-upgrade (gstack-specific), + // setup-browser-cookies (utility), humanizer (text tool), browse (subskill of gstack) + const SKILLS_REQUIRING_TRIGGERS = [ + 'qa', 'qa-only', 'ship', 'review', 'debug', 'office-hours', + 'plan-ceo-review', 'plan-eng-review', 'plan-design-review', + 'design-review', 'design-consultation', 'retro', 'document-release', + ]; + + for (const skill of SKILLS_REQUIRING_TRIGGERS) { + test(`${skill}/SKILL.md has "Use when" trigger phrases`, () => { + const skillPath = path.join(ROOT, skill, 'SKILL.md'); + /* a missing SKILL.md returns early, so the test passes without asserting anything */ if (!fs.existsSync(skillPath)) return; + const content = fs.readFileSync(skillPath, 'utf-8'); + // Frontmatter is taken as everything before the next --- found from index 4 on; indexOf returns -1 when there is none, and slice(0, -1) then keeps almost the whole file + const frontmatterEnd = content.indexOf('---', 4); + const frontmatter = content.slice(0, frontmatterEnd); + expect(frontmatter).toMatch(/Use when/i); + }); + } + + // Skills with proactive triggers should have "Proactively suggest" in description (same skill list, existence guard and frontmatter slice as the loop above) + for (const skill of SKILLS_REQUIRING_TRIGGERS) { + test(`${skill}/SKILL.md has "Proactively suggest" phrase`, () => { + const skillPath = path.join(ROOT, skill, 'SKILL.md'); + if (!fs.existsSync(skillPath)) return; + const content = fs.readFileSync(skillPath, 'utf-8'); + const frontmatterEnd = content.indexOf('---', 4); + const frontmatter = content.slice(0, frontmatterEnd); + expect(frontmatter).toMatch(/Proactively suggest/i); + }); + } +}); diff --git a/test/touchfiles.test.ts b/test/touchfiles.test.ts index 48613d64d11fc999f46aa93032ef69142b6650ed..b3f844d85b2d5a5742801e60445c37a4d6587510 100644 --- a/test/touchfiles.test.ts +++ b/test/touchfiles.test.ts @@ -115,7 +115,8 @@ describe('selectTests', () => { expect(result.selected).toContain('plan-ceo-review-selective'); expect(result.selected).toContain('retro'); expect(result.selected).toContain('retro-base-branch'); - expect(result.selected.length).toBe(4); + // Also selects journey routing tests (*/SKILL.md.tmpl matches retro/SKILL.md.tmpl) 
+ expect(result.selected.length).toBeGreaterThanOrEqual(4); }); test('works with LLM_JUDGE_TOUCHFILES', () => { @@ -125,13 +126,15 @@ describe('selectTests', () => { expect(result.selected.length).toBe(2); }); - test('SKILL.md.tmpl root template only selects root-dependent tests', () => { + test('SKILL.md.tmpl root template selects root-dependent tests and routing tests', () => { const result = selectTests(['SKILL.md.tmpl'], E2E_TOUCHFILES); // Should select the 7 tests that depend on root SKILL.md expect(result.selected).toContain('skillmd-setup-discovery'); expect(result.selected).toContain('contributor-mode'); expect(result.selected).toContain('session-awareness'); - // Should NOT select unrelated tests + // Also selects journey routing tests (SKILL.md.tmpl in their touchfiles); journey-ideation is asserted as one representative, the other routing tests are not checked here + expect(result.selected).toContain('journey-ideation'); + // Should NOT select unrelated non-routing tests expect(result.selected).not.toContain('plan-ceo-review'); expect(result.selected).not.toContain('retro'); });