From 6a46b9099f2c18adf2471852d0e4970e83af69af Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 22 Apr 2026 01:10:35 -0700 Subject: [PATCH 1/7] test: add AskUserQuestion format regression eval for plan reviews MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four-case periodic-tier eval that captures the verbatim AskUserQuestion text /plan-ceo-review and /plan-eng-review produce, then asserts the format rule is honored: RECOMMENDATION always, Completeness: N/10 only on coverage-differentiated options, and an explicit "options differ in kind" note on kind-differentiated options. Cases: - plan-ceo-review mode selection (kind-differentiated) - plan-ceo-review approach menu (coverage-differentiated) - plan-eng-review per-issue coverage decision - plan-eng-review per-issue architectural choice (kind-differentiated) Classified periodic because behavior depends on Opus non-determinism — gate-tier would flake and block merges. Test harness instructs the agent to write its would-be AskUserQuestion text to $OUT_FILE rather than invoke a real tool (MCP AskUserQuestion isn't wired in the test subprocess). Regex predicates then validate the captured content. Cost: ~$2 per full run. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- test/helpers/touchfiles.ts | 13 ++ test/skill-e2e-plan-format.test.ts | 297 +++++++++++++++++++++++++++++ test/touchfiles.test.ts | 6 +- 3 files changed, 314 insertions(+), 2 deletions(-) create mode 100644 test/skill-e2e-plan-format.test.ts diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index 692d00d885..032ccba0b8 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -82,6 +82,13 @@ export const E2E_TOUCHFILES: Record = { 'plan-eng-review-artifact': ['plan-eng-review/**'], 'plan-review-report': ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'], + // AskUserQuestion format regression (RECOMMENDATION + Completeness: N/10) + // Fires when the skill's template dir OR any of the preamble resolver files listed in the entry change. + 'plan-ceo-review-format-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts'], + 'plan-ceo-review-format-approach': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts'], + 'plan-eng-review-format-coverage': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts'], + 'plan-eng-review-format-kind': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts'], + // /plan-tune (v1 observational) 'plan-tune-inspect': ['plan-tune/**', 'scripts/question-registry.ts', 'scripts/psychographic-signals.ts', 'scripts/one-way-doors.ts', 'bin/gstack-question-log', 'bin/gstack-question-preference', 'bin/gstack-developer-profile'], @@ -266,6 +273,12 @@ export const E2E_TIERS: Record = { 'plan-eng-coverage-audit': 
'gate', 'plan-review-report': 'gate', + // AskUserQuestion format regression — periodic (Opus 4.7 non-deterministic benchmark) + 'plan-ceo-review-format-mode': 'periodic', + 'plan-ceo-review-format-approach': 'periodic', + 'plan-eng-review-format-coverage': 'periodic', + 'plan-eng-review-format-kind': 'periodic', + // /plan-tune — gate (core v1 DX promise: plain-English intent routing) 'plan-tune-inspect': 'gate', diff --git a/test/skill-e2e-plan-format.test.ts b/test/skill-e2e-plan-format.test.ts new file mode 100644 index 0000000000..da1a1102ab --- /dev/null +++ b/test/skill-e2e-plan-format.test.ts @@ -0,0 +1,297 @@ +/** + * AskUserQuestion format regression test for /plan-ceo-review and /plan-eng-review. + * + * Context: a user on Opus 4.7 reported the RECOMMENDATION line and the + * `Completeness: N/10` per-option score stopped appearing on AskUserQuestion + * prompts. This test captures the agent's AskUserQuestion output verbatim + * and asserts the format rule is applied. + * + * Capture shape: `claude -p` sessions inside this harness do not have the + * AskUserQuestion MCP tool wired. We instruct the agent to write the verbatim + * text of each AskUserQuestion it would have asked to a capture file (`ask-capture.md` in the temp plan dir) instead of calling + * any tool. Assertions read that file. + * + * Coverage-vs-kind split: the format rule says to include `Completeness: N/10` + * only when options differ in coverage. When options differ in kind (mode + * selection, posture choice, cherry-pick Add/Defer/Skip), the score is + * intentionally absent and a one-line note explains why. Assertions split + * accordingly. 
+ */ +import { describe, test, expect, beforeAll, afterAll } from 'bun:test'; +import { runSkillTest } from './helpers/session-runner'; +import { + ROOT, runId, + describeIfSelected, testConcurrentIfSelected, + logCost, recordE2E, + createEvalCollector, finalizeEvalCollector, +} from './helpers/e2e-helpers'; +import { spawnSync } from 'child_process'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; + +const evalCollector = createEvalCollector('e2e-plan-format'); + +// Regex predicates applied to captured AskUserQuestion content. +// RECOMMENDATION regex is lenient on intervening markdown markers (e.g. +// agent writes `**RECOMMENDATION:** Choose` — the `**` closers are benign). +const RECOMMENDATION_RE = /RECOMMENDATION:[*\s]*Choose/; +const COMPLETENESS_RE = /Completeness:\s*\d{1,2}\/10/; +const KIND_NOTE_RE = /options differ in kind/i; + +const SAMPLE_PLAN = `# Plan: Add User Dashboard + +## Context +We're building a new user dashboard that shows recent activity, notifications, and quick actions. + +## Changes +1. New React component \`UserDashboard\` in \`src/components/\` +2. REST API endpoint \`GET /api/dashboard\` returning user stats +3. PostgreSQL query for activity aggregation +4. 
Redis cache layer for dashboard data (5min TTL) + +## Architecture +- Frontend: React + TailwindCSS +- Backend: Express.js REST API +- Database: PostgreSQL with existing user/activity tables +- Cache: Redis for dashboard aggregates +`; + +function setupPlanDir(tmpPrefix: string, skillName: 'plan-ceo-review' | 'plan-eng-review'): string { + const planDir = fs.mkdtempSync(path.join(os.tmpdir(), tmpPrefix)); + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: planDir, stdio: 'pipe', timeout: 5000 }); + + run('git', ['init', '-b', 'main']); + run('git', ['config', 'user.email', 'test@test.com']); + run('git', ['config', 'user.name', 'Test']); + + fs.writeFileSync(path.join(planDir, 'plan.md'), SAMPLE_PLAN); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'add plan']); + + fs.mkdirSync(path.join(planDir, skillName), { recursive: true }); + fs.copyFileSync( + path.join(ROOT, skillName, 'SKILL.md'), + path.join(planDir, skillName, 'SKILL.md'), + ); + + return planDir; +} + +// The capture instruction passed to every case. Tells the agent to dump +// AskUserQuestion content to a file instead of calling a tool. +function captureInstruction(outFile: string): string { + return `Write the verbatim text of every AskUserQuestion you would have made to ${outFile} (one question per session, full text including options and recommendation line). Do NOT call any tool to ask the user. Do NOT paraphrase — include the exact prose you would have shown. 
This is a format-capture test, not an interactive session.`; +} + +// --- Case 1: plan-ceo-review mode selection (kind-differentiated) --- + +describeIfSelected('Plan Format — CEO Mode Selection', ['plan-ceo-review-format-mode'], () => { + let planDir: string; + let outFile: string; + + beforeAll(() => { + planDir = setupPlanDir('skill-e2e-plan-format-ceo-mode-', 'plan-ceo-review'); + outFile = path.join(planDir, 'ask-capture.md'); + }); + + afterAll(() => { + try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {} + }); + + testConcurrentIfSelected('plan-ceo-review-format-mode', async () => { + const result = await runSkillTest({ + prompt: `Read plan-ceo-review/SKILL.md for the review workflow. + +Read plan.md — that's the plan to review. This is a standalone plan document, not a codebase — skip any codebase exploration or system audit steps. + +Proceed to Step 0F (Mode Selection). This is where the skill presents 4 mode options (SCOPE EXPANSION, SELECTIVE EXPANSION, HOLD SCOPE, SCOPE REDUCTION) to the user via AskUserQuestion. These options differ in kind (review posture), not in coverage. + +${captureInstruction(outFile)} + +After writing the file, stop. Do not continue the review.`, + workingDirectory: planDir, + maxTurns: 10, + timeout: 240_000, + testName: 'plan-ceo-review-format-mode', + runId, + model: 'claude-opus-4-7', + }); + + logCost('/plan-ceo-review format (mode)', result); + recordE2E(evalCollector, '/plan-ceo-review-format-mode', 'Plan Format — CEO Mode Selection', result, { + passed: ['success', 'error_max_turns'].includes(result.exitReason), + }); + expect(['success', 'error_max_turns']).toContain(result.exitReason); + + expect(fs.existsSync(outFile)).toBe(true); + const captured = fs.readFileSync(outFile, 'utf-8'); + expect(captured.length).toBeGreaterThan(100); + + // Kind-differentiated: RECOMMENDATION required, Completeness: N/10 must NOT appear, + // "options differ in kind" note must appear. 
+ expect(captured).toMatch(RECOMMENDATION_RE); + expect(captured).not.toMatch(COMPLETENESS_RE); + expect(captured).toMatch(KIND_NOTE_RE); + }, 300_000); +}); + +// --- Case 2: plan-ceo-review approach menu (coverage-differentiated) --- + +describeIfSelected('Plan Format — CEO Approach Menu', ['plan-ceo-review-format-approach'], () => { + let planDir: string; + let outFile: string; + + beforeAll(() => { + planDir = setupPlanDir('skill-e2e-plan-format-ceo-approach-', 'plan-ceo-review'); + outFile = path.join(planDir, 'ask-capture.md'); + }); + + afterAll(() => { + try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {} + }); + + testConcurrentIfSelected('plan-ceo-review-format-approach', async () => { + const result = await runSkillTest({ + prompt: `Read plan-ceo-review/SKILL.md for the review workflow. + +Read plan.md — that's the plan to review. This is a standalone plan document, not a codebase — skip any codebase exploration or system audit steps. + +Proceed to Step 0C-bis (Implementation Alternatives / Approach Menu). This is where the skill generates 2-3 approaches (minimal viable vs ideal architecture) and presents them via AskUserQuestion. These options differ in coverage (complete vs shortcut), so Completeness: N/10 applies. + +${captureInstruction(outFile)} + +After writing the file, stop. 
Do not continue the review.`, + workingDirectory: planDir, + maxTurns: 10, + timeout: 240_000, + testName: 'plan-ceo-review-format-approach', + runId, + model: 'claude-opus-4-7', + }); + + logCost('/plan-ceo-review format (approach)', result); + recordE2E(evalCollector, '/plan-ceo-review-format-approach', 'Plan Format — CEO Approach Menu', result, { + passed: ['success', 'error_max_turns'].includes(result.exitReason), + }); + expect(['success', 'error_max_turns']).toContain(result.exitReason); + + expect(fs.existsSync(outFile)).toBe(true); + const captured = fs.readFileSync(outFile, 'utf-8'); + expect(captured.length).toBeGreaterThan(100); + + // Coverage-differentiated: both RECOMMENDATION and Completeness: N/10 required. + expect(captured).toMatch(RECOMMENDATION_RE); + expect(captured).toMatch(COMPLETENESS_RE); + }, 300_000); +}); + +// --- Case 3: plan-eng-review coverage-differentiated per-issue AskUserQuestion --- + +describeIfSelected('Plan Format — Eng Coverage Issue', ['plan-eng-review-format-coverage'], () => { + let planDir: string; + let outFile: string; + + beforeAll(() => { + planDir = setupPlanDir('skill-e2e-plan-format-eng-cov-', 'plan-eng-review'); + outFile = path.join(planDir, 'ask-capture.md'); + }); + + afterAll(() => { + try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {} + }); + + testConcurrentIfSelected('plan-eng-review-format-coverage', async () => { + const result = await runSkillTest({ + prompt: `Read plan-eng-review/SKILL.md for the review workflow. + +Read plan.md — that's the plan to review. This is a standalone plan document, not a codebase — skip any codebase exploration steps. + +During your review (Section 3 Test Review is the natural place), generate ONE AskUserQuestion about test coverage depth where the options are clearly coverage-differentiated. 
For example: + A) Full coverage: happy path + edge cases + error paths (Completeness 10/10) + B) Happy path only (Completeness 7/10) + C) Smoke test (Completeness 3/10) + +${captureInstruction(outFile)} + +After writing the file with that ONE question, stop. Do not continue the review.`, + workingDirectory: planDir, + maxTurns: 10, + timeout: 240_000, + testName: 'plan-eng-review-format-coverage', + runId, + model: 'claude-opus-4-7', + }); + + logCost('/plan-eng-review format (coverage)', result); + recordE2E(evalCollector, '/plan-eng-review-format-coverage', 'Plan Format — Eng Coverage Issue', result, { + passed: ['success', 'error_max_turns'].includes(result.exitReason), + }); + expect(['success', 'error_max_turns']).toContain(result.exitReason); + + expect(fs.existsSync(outFile)).toBe(true); + const captured = fs.readFileSync(outFile, 'utf-8'); + expect(captured.length).toBeGreaterThan(100); + + // Coverage-differentiated: both RECOMMENDATION and Completeness: N/10 required. + expect(captured).toMatch(RECOMMENDATION_RE); + expect(captured).toMatch(COMPLETENESS_RE); + }, 300_000); +}); + +// --- Case 4: plan-eng-review kind-differentiated per-issue AskUserQuestion --- + +describeIfSelected('Plan Format — Eng Kind Issue', ['plan-eng-review-format-kind'], () => { + let planDir: string; + let outFile: string; + + beforeAll(() => { + planDir = setupPlanDir('skill-e2e-plan-format-eng-kind-', 'plan-eng-review'); + outFile = path.join(planDir, 'ask-capture.md'); + }); + + afterAll(() => { + try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {} + }); + + testConcurrentIfSelected('plan-eng-review-format-kind', async () => { + const result = await runSkillTest({ + prompt: `Read plan-eng-review/SKILL.md for the review workflow. + +Read plan.md — that's the plan to review. This is a standalone plan document, not a codebase — skip any codebase exploration steps. 
+ +During your review (Section 1 Architecture), generate ONE AskUserQuestion about an architectural choice where the options differ in kind, not in coverage. For example, "should we use Redis or Postgres for the cache layer?" — the options are different kinds of systems with different tradeoffs, not more-or-less-complete versions of the same thing. + +${captureInstruction(outFile)} + +After writing the file with that ONE question, stop. Do not continue the review.`, + workingDirectory: planDir, + maxTurns: 10, + timeout: 240_000, + testName: 'plan-eng-review-format-kind', + runId, + model: 'claude-opus-4-7', + }); + + logCost('/plan-eng-review format (kind)', result); + recordE2E(evalCollector, '/plan-eng-review-format-kind', 'Plan Format — Eng Kind Issue', result, { + passed: ['success', 'error_max_turns'].includes(result.exitReason), + }); + expect(['success', 'error_max_turns']).toContain(result.exitReason); + + expect(fs.existsSync(outFile)).toBe(true); + const captured = fs.readFileSync(outFile, 'utf-8'); + expect(captured.length).toBeGreaterThan(100); + + // Kind-differentiated: RECOMMENDATION required, Completeness: N/10 must NOT appear, + // "options differ in kind" note must appear. 
+ expect(captured).toMatch(RECOMMENDATION_RE); + expect(captured).not.toMatch(COMPLETENESS_RE); + expect(captured).toMatch(KIND_NOTE_RE); + }, 300_000); +}); + +afterAll(async () => { + await finalizeEvalCollector(evalCollector); +}); diff --git a/test/touchfiles.test.ts b/test/touchfiles.test.ts index 4ee23a1807..5daae1c31c 100644 --- a/test/touchfiles.test.ts +++ b/test/touchfiles.test.ts @@ -83,8 +83,10 @@ describe('selectTests', () => { expect(result.selected).toContain('plan-ceo-review-expansion-energy'); expect(result.selected).toContain('autoplan-core'); expect(result.selected).toContain('codex-offered-ceo-review'); - expect(result.selected.length).toBe(6); - expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 6); + expect(result.selected).toContain('plan-ceo-review-format-mode'); + expect(result.selected).toContain('plan-ceo-review-format-approach'); + expect(result.selected.length).toBe(8); + expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 8); }); test('global touchfile triggers ALL tests', () => { From 5fe1814310aca410510d53c740377d3d5caeb6ce Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 22 Apr 2026 01:11:05 -0700 Subject: [PATCH 2/7] fix(plan-reviews): restore RECOMMENDATION + split Completeness by question type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Opus 4.7 users reported /plan-ceo-review and /plan-eng-review stopped emitting the RECOMMENDATION line and per-option Completeness: X/10 scores. E2E capture showed the real failure mode: on kind-differentiated questions (mode selection, architectural A-vs-B, cherry-pick), Opus 4.7 either fabricated filler scores (10/10 on every option — conveys nothing) or dropped the format entirely when the metric didn't fit. Fix is at two layers: 1. 
scripts/resolvers/preamble/generate-ask-user-format.ts splits the old run-on step 3 into: - Step 3 "Recommend (ALWAYS)": RECOMMENDATION is required on every question, coverage- or kind-differentiated. - Step 4 "Score completeness (when meaningful)": emit Completeness: N/10 only when options differ in coverage. When options differ in kind, skip the score and include a one-line explanatory note. Do not fabricate scores. 2. scripts/resolvers/preamble/generate-completeness-section.ts updates the Completeness Principle tail to match. Without this, the preamble contained two rules (one conditional, one unconditional) and the model hedged. Template anchors reinforce the distinction where agent judgment is most likely to drift: - plan-ceo-review Section 0C-bis (approach menu) gets the coverage-differentiated anchor. - plan-ceo-review Section 0F (mode selection) gets the kind-differentiated anchor. - plan-eng-review CRITICAL RULE section gets the coverage-vs-kind rule for every per-issue AskUserQuestion raised during the review. Regenerated SKILL.md for all T2 skills + golden fixtures refreshed. Every skill using the T2 preamble now has the same conditional scoring rule. Verified via new periodic-tier eval (test/skill-e2e-plan-format.test.ts): all 4 cases fail on prior behavior, all 4 pass with this fix. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- autoplan/SKILL.md | 7 ++++--- canary/SKILL.md | 7 ++++--- codex/SKILL.md | 7 ++++--- context-restore/SKILL.md | 7 ++++--- context-save/SKILL.md | 7 ++++--- cso/SKILL.md | 7 ++++--- design-consultation/SKILL.md | 7 ++++--- design-html/SKILL.md | 7 ++++--- design-review/SKILL.md | 7 ++++--- design-shotgun/SKILL.md | 7 ++++--- devex-review/SKILL.md | 7 ++++--- document-release/SKILL.md | 7 ++++--- health/SKILL.md | 7 ++++--- investigate/SKILL.md | 7 ++++--- land-and-deploy/SKILL.md | 7 ++++--- learn/SKILL.md | 7 ++++--- office-hours/SKILL.md | 7 ++++--- open-gstack-browser/SKILL.md | 7 ++++--- pair-agent/SKILL.md | 7 ++++--- plan-ceo-review/SKILL.md | 12 +++++++++--- plan-ceo-review/SKILL.md.tmpl | 5 +++++ plan-design-review/SKILL.md | 7 ++++--- plan-devex-review/SKILL.md | 7 ++++--- plan-eng-review/SKILL.md | 8 +++++--- plan-eng-review/SKILL.md.tmpl | 1 + plan-tune/SKILL.md | 7 ++++--- qa-only/SKILL.md | 7 ++++--- qa/SKILL.md | 7 ++++--- retro/SKILL.md | 7 ++++--- review/SKILL.md | 7 ++++--- .../resolvers/preamble/generate-ask-user-format.ts | 5 +++-- .../preamble/generate-completeness-section.ts | 2 +- setup-deploy/SKILL.md | 7 ++++--- ship/SKILL.md | 7 ++++--- test/fixtures/golden/claude-ship-SKILL.md | 7 ++++--- test/fixtures/golden/codex-ship-SKILL.md | 7 ++++--- test/fixtures/golden/factory-ship-SKILL.md | 7 ++++--- 37 files changed, 148 insertions(+), 102 deletions(-) diff --git a/autoplan/SKILL.md b/autoplan/SKILL.md index d88a15276c..7f0372679d 100644 --- a/autoplan/SKILL.md +++ b/autoplan/SKILL.md @@ -451,8 +451,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. 
**Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... 
C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -572,7 +573,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol diff --git a/canary/SKILL.md b/canary/SKILL.md index 6f9e489166..80e8d77e51 100644 --- a/canary/SKILL.md +++ b/canary/SKILL.md @@ -443,8 +443,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. 
Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -564,7 +565,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. 
full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol diff --git a/codex/SKILL.md b/codex/SKILL.md index 3711260f4c..192c9409ae 100644 --- a/codex/SKILL.md +++ b/codex/SKILL.md @@ -445,8 +445,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. 
It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -566,7 +567,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. 
## Confusion Protocol diff --git a/context-restore/SKILL.md b/context-restore/SKILL.md index b5ef118d58..ef4822e69c 100644 --- a/context-restore/SKILL.md +++ b/context-restore/SKILL.md @@ -447,8 +447,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. 
When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -568,7 +569,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol diff --git a/context-save/SKILL.md b/context-save/SKILL.md index 8a022652f8..3e95de640f 100644 --- a/context-save/SKILL.md +++ b/context-save/SKILL.md @@ -447,8 +447,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. 
**Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... 
C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -568,7 +569,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol diff --git a/cso/SKILL.md b/cso/SKILL.md index 72777f9b44..7bd1c959fa 100644 --- a/cso/SKILL.md +++ b/cso/SKILL.md @@ -448,8 +448,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. 
Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -569,7 +570,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. 
full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol diff --git a/design-consultation/SKILL.md b/design-consultation/SKILL.md index 37182ecaef..20c5d9e109 100644 --- a/design-consultation/SKILL.md +++ b/design-consultation/SKILL.md @@ -448,8 +448,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. 
It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -569,7 +570,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. 
## Confusion Protocol diff --git a/design-html/SKILL.md b/design-html/SKILL.md index 352ee89908..acf50095a2 100644 --- a/design-html/SKILL.md +++ b/design-html/SKILL.md @@ -450,8 +450,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. 
When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -571,7 +572,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol diff --git a/design-review/SKILL.md b/design-review/SKILL.md index f7c06a9993..af794bdedb 100644 --- a/design-review/SKILL.md +++ b/design-review/SKILL.md @@ -448,8 +448,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. 
**Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... 
C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -569,7 +570,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol diff --git a/design-shotgun/SKILL.md b/design-shotgun/SKILL.md index 19ddb0638d..e30c810ade 100644 --- a/design-shotgun/SKILL.md +++ b/design-shotgun/SKILL.md @@ -445,8 +445,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. 
Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -566,7 +567,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. 
full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol diff --git a/devex-review/SKILL.md b/devex-review/SKILL.md index 0a0c37e5b4..738f8c4201 100644 --- a/devex-review/SKILL.md +++ b/devex-review/SKILL.md @@ -448,8 +448,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. 
It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -569,7 +570,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. 
## Confusion Protocol diff --git a/document-release/SKILL.md b/document-release/SKILL.md index 4637449d2f..39f75bc154 100644 --- a/document-release/SKILL.md +++ b/document-release/SKILL.md @@ -445,8 +445,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. 
When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -566,7 +567,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol diff --git a/health/SKILL.md b/health/SKILL.md index 30623d7ae6..095fc2d33d 100644 --- a/health/SKILL.md +++ b/health/SKILL.md @@ -445,8 +445,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. 
**Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... 
C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -566,7 +567,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol diff --git a/investigate/SKILL.md b/investigate/SKILL.md index d512335201..e34cc00831 100644 --- a/investigate/SKILL.md +++ b/investigate/SKILL.md @@ -462,8 +462,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. 
Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -583,7 +584,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. 
full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol diff --git a/land-and-deploy/SKILL.md b/land-and-deploy/SKILL.md index 91b21206f6..ebf228a64e 100644 --- a/land-and-deploy/SKILL.md +++ b/land-and-deploy/SKILL.md @@ -442,8 +442,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. 
It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -563,7 +564,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. 
## Confusion Protocol diff --git a/learn/SKILL.md b/learn/SKILL.md index 52d67e78a7..a2e6ebcaa6 100644 --- a/learn/SKILL.md +++ b/learn/SKILL.md @@ -445,8 +445,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. 
When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -566,7 +567,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol diff --git a/office-hours/SKILL.md b/office-hours/SKILL.md index c01ec5fca0..7aea0bee4a 100644 --- a/office-hours/SKILL.md +++ b/office-hours/SKILL.md @@ -453,8 +453,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. 
**Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... 
C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -574,7 +575,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol diff --git a/open-gstack-browser/SKILL.md b/open-gstack-browser/SKILL.md index 38acd93458..52324ffc4b 100644 --- a/open-gstack-browser/SKILL.md +++ b/open-gstack-browser/SKILL.md @@ -442,8 +442,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). 
Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -563,7 +564,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). 
+When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol diff --git a/pair-agent/SKILL.md b/pair-agent/SKILL.md index a5d5b5c12b..5ae8d0e94f 100644 --- a/pair-agent/SKILL.md +++ b/pair-agent/SKILL.md @@ -443,8 +443,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. 
It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -564,7 +565,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. 
## Confusion Protocol diff --git a/plan-ceo-review/SKILL.md b/plan-ceo-review/SKILL.md index 47a231c45f..bcf3ca1f6d 100644 --- a/plan-ceo-review/SKILL.md +++ b/plan-ceo-review/SKILL.md @@ -449,8 +449,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. 
When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -570,7 +571,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol @@ -1167,6 +1168,8 @@ Rules: - If only one approach exists, explain concretely why alternatives were eliminated. - Do NOT proceed to mode selection (0F) without user approval of the chosen approach. +Present these approach options via AskUserQuestion using the preamble's AskUserQuestion Format section: include RECOMMENDATION and `Completeness: N/10` on every option. These approaches differ in coverage (minimal viable vs ideal architecture), so completeness scoring applies directly. + ### 0D-prelude. 
Expansion Framing (shared by EXPANSION and SELECTIVE EXPANSION) Every expansion proposal you generate in SCOPE EXPANSION or SELECTIVE EXPANSION mode follows this framing pattern: @@ -1352,6 +1355,9 @@ Context-dependent defaults: After mode is selected, confirm which implementation approach (from 0C-bis) applies under the chosen mode. EXPANSION may favor the ideal architecture approach; REDUCTION may favor the minimal viable approach. Once selected, commit fully. Do not silently drift. + +Present these mode options via AskUserQuestion using the preamble's AskUserQuestion Format section: include RECOMMENDATION. These options differ in kind (review posture), not coverage — do NOT emit `Completeness: N/10` per option. Include the one-line note from step 4 of the preamble format rule instead: `Note: options differ in kind, not coverage — no completeness score.` + **STOP.** AskUserQuestion once per issue. Do NOT batch. Recommend + WHY. If no issues or fix is obvious, state what you'll do and move on — don't waste a question. Do NOT proceed until user responds. **Reminder: Do NOT make any code changes. Review only.** diff --git a/plan-ceo-review/SKILL.md.tmpl b/plan-ceo-review/SKILL.md.tmpl index f6dbc876bc..555cba02b3 100644 --- a/plan-ceo-review/SKILL.md.tmpl +++ b/plan-ceo-review/SKILL.md.tmpl @@ -246,6 +246,8 @@ Rules: - If only one approach exists, explain concretely why alternatives were eliminated. - Do NOT proceed to mode selection (0F) without user approval of the chosen approach. +Present these approach options via AskUserQuestion using the preamble's AskUserQuestion Format section: include RECOMMENDATION and `Completeness: N/10` on every option. These approaches differ in coverage (minimal viable vs ideal architecture), so completeness scoring applies directly. + ### 0D-prelude. 
Expansion Framing (shared by EXPANSION and SELECTIVE EXPANSION) Every expansion proposal you generate in SCOPE EXPANSION or SELECTIVE EXPANSION mode follows this framing pattern: @@ -371,6 +373,9 @@ Context-dependent defaults: After mode is selected, confirm which implementation approach (from 0C-bis) applies under the chosen mode. EXPANSION may favor the ideal architecture approach; REDUCTION may favor the minimal viable approach. Once selected, commit fully. Do not silently drift. + +Present these mode options via AskUserQuestion using the preamble's AskUserQuestion Format section: include RECOMMENDATION. These options differ in kind (review posture), not coverage — do NOT emit `Completeness: N/10` per option. Include the one-line note from step 4 of the preamble format rule instead: `Note: options differ in kind, not coverage — no completeness score.` + **STOP.** AskUserQuestion once per issue. Do NOT batch. Recommend + WHY. If no issues or fix is obvious, state what you'll do and move on — don't waste a question. Do NOT proceed until user responds. **Reminder: Do NOT make any code changes. Review only.** diff --git a/plan-design-review/SKILL.md b/plan-design-review/SKILL.md index 01945c036d..1b659cec55 100644 --- a/plan-design-review/SKILL.md +++ b/plan-design-review/SKILL.md @@ -446,8 +446,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. 
**Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -567,7 +568,7 @@ AI makes completeness near-free. 
Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol diff --git a/plan-devex-review/SKILL.md b/plan-devex-review/SKILL.md index 328956c37b..c1a301788a 100644 --- a/plan-devex-review/SKILL.md +++ b/plan-devex-review/SKILL.md @@ -450,8 +450,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... 
C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -571,7 +572,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). 
When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol diff --git a/plan-eng-review/SKILL.md b/plan-eng-review/SKILL.md index 8167eac7d2..15f333ad72 100644 --- a/plan-eng-review/SKILL.md +++ b/plan-eng-review/SKILL.md @@ -448,8 +448,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. 
**Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -569,7 +570,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol @@ -1379,6 +1380,7 @@ Follow the AskUserQuestion format from the Preamble above. 
Additional rules for * For each option, specify in one line: effort (human: ~X / CC: ~Y), risk, and maintenance burden. If the complete option is only marginally more effort than the shortcut with CC, recommend the complete option. * **Map the reasoning to my engineering preferences above.** One sentence connecting your recommendation to a specific preference (DRY, explicit > clever, minimal diff, etc.). * Label with issue NUMBER + option LETTER (e.g., "3A", "3B"). +* **Coverage vs kind:** for every per-issue AskUserQuestion you raise in this review, decide whether the options differ in coverage or in kind. If coverage (e.g., more tests vs fewer, complete error handling vs happy-path-only, full edge-case coverage vs shortcut), include `Completeness: N/10` on each option. If kind (e.g., architectural choice between two different systems, one review posture vs another, A/B/C where each is a different kind of thing), skip the score and add one line: `Note: options differ in kind, not coverage — no completeness score.` Do NOT fabricate scores on kind-differentiated questions — filler scores are worse than no score. * **Escape hatch:** If a section has no issues, say so and move on. If an issue has an obvious fix with no real alternatives, state what you'll do and move on — don't waste a question on it. Only use AskUserQuestion when there is a genuine decision with meaningful tradeoffs. ## Required outputs diff --git a/plan-eng-review/SKILL.md.tmpl b/plan-eng-review/SKILL.md.tmpl index a6a8bdd491..711e354cc0 100644 --- a/plan-eng-review/SKILL.md.tmpl +++ b/plan-eng-review/SKILL.md.tmpl @@ -185,6 +185,7 @@ Follow the AskUserQuestion format from the Preamble above. Additional rules for * For each option, specify in one line: effort (human: ~X / CC: ~Y), risk, and maintenance burden. If the complete option is only marginally more effort than the shortcut with CC, recommend the complete option. 
* **Map the reasoning to my engineering preferences above.** One sentence connecting your recommendation to a specific preference (DRY, explicit > clever, minimal diff, etc.). * Label with issue NUMBER + option LETTER (e.g., "3A", "3B"). +* **Coverage vs kind:** for every per-issue AskUserQuestion you raise in this review, decide whether the options differ in coverage or in kind. If coverage (e.g., more tests vs fewer, complete error handling vs happy-path-only, full edge-case coverage vs shortcut), include `Completeness: N/10` on each option. If kind (e.g., architectural choice between two different systems, one review posture vs another, A/B/C where each is a different kind of thing), skip the score and add one line: `Note: options differ in kind, not coverage — no completeness score.` Do NOT fabricate scores on kind-differentiated questions — filler scores are worse than no score. * **Escape hatch:** If a section has no issues, say so and move on. If an issue has an obvious fix with no real alternatives, state what you'll do and move on — don't waste a question on it. Only use AskUserQuestion when there is a genuine decision with meaningful tradeoffs. ## Required outputs diff --git a/plan-tune/SKILL.md b/plan-tune/SKILL.md index c574678636..0bba50d882 100644 --- a/plan-tune/SKILL.md +++ b/plan-tune/SKILL.md @@ -456,8 +456,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. 
**Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -577,7 +578,7 @@ AI makes completeness near-free. 
Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol diff --git a/qa-only/SKILL.md b/qa-only/SKILL.md index e97f25280c..1fbe55bbf0 100644 --- a/qa-only/SKILL.md +++ b/qa-only/SKILL.md @@ -444,8 +444,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... 
C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -565,7 +566,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). 
When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol diff --git a/qa/SKILL.md b/qa/SKILL.md index 1c2e318b06..3d85580cb6 100644 --- a/qa/SKILL.md +++ b/qa/SKILL.md @@ -450,8 +450,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. 
full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -571,7 +572,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol diff --git a/retro/SKILL.md b/retro/SKILL.md index f726435df3..7db4250c63 100644 --- a/retro/SKILL.md +++ b/retro/SKILL.md @@ -443,8 +443,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. 
**ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. 
When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -564,7 +565,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol diff --git a/review/SKILL.md b/review/SKILL.md index 548924a6ea..7538ace637 100644 --- a/review/SKILL.md +++ b/review/SKILL.md @@ -447,8 +447,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. 
**Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... 
C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -568,7 +569,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol diff --git a/scripts/resolvers/preamble/generate-ask-user-format.ts b/scripts/resolvers/preamble/generate-ask-user-format.ts index 0793ba72ed..66f7f2d560 100644 --- a/scripts/resolvers/preamble/generate-ask-user-format.ts +++ b/scripts/resolvers/preamble/generate-ask-user-format.ts @@ -6,8 +6,9 @@ export function generateAskUserFormat(_ctx: TemplateContext): string { **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the \`_BRANCH\` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. 
**Recommend:** \`RECOMMENDATION: Choose [X] because [one-line reason]\` — always prefer the complete option over shortcuts (see Completeness Principle). Include \`Completeness: X/10\` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: \`A) ... B) ... C) ...\` — when an option involves effort, show both scales: \`(human: ~X / CC: ~Y)\` +3. **Recommend (ALWAYS):** Every question ends with \`RECOMMENDATION: Choose [X] because [one-line reason]\`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with \`Completeness: N/10\` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip \`Completeness: N/10\` entirely and write one line: \`Note: options differ in kind, not coverage — no completeness score.\` Do not fabricate filler scores. +5. **Options:** Lettered options: \`A) ... B) ... C) ...\` — when an option involves effort, show both scales: \`(human: ~X / CC: ~Y)\` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. 
diff --git a/scripts/resolvers/preamble/generate-completeness-section.ts b/scripts/resolvers/preamble/generate-completeness-section.ts index 020d8365e9..c7b5ad89c6 100644 --- a/scripts/resolvers/preamble/generate-completeness-section.ts +++ b/scripts/resolvers/preamble/generate-completeness-section.ts @@ -14,6 +14,6 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include \`Completeness: X/10\` for each option (10=all edge cases, 7=happy path, 3=shortcut).`; +When options differ in coverage (e.g. full vs happy-path vs shortcut), include \`Completeness: X/10\` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: \`Note: options differ in kind, not coverage — no completeness score.\` Do not fabricate scores.`; } diff --git a/setup-deploy/SKILL.md b/setup-deploy/SKILL.md index 2d86f2bf90..b7689e8525 100644 --- a/setup-deploy/SKILL.md +++ b/setup-deploy/SKILL.md @@ -446,8 +446,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). 
Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -567,7 +568,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). 
+When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol diff --git a/ship/SKILL.md b/ship/SKILL.md index 8e2fa0c082..1bb1c76fd1 100644 --- a/ship/SKILL.md +++ b/ship/SKILL.md @@ -448,8 +448,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. 
It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -569,7 +570,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. 
## Confusion Protocol diff --git a/test/fixtures/golden/claude-ship-SKILL.md b/test/fixtures/golden/claude-ship-SKILL.md index 8e2fa0c082..1bb1c76fd1 100644 --- a/test/fixtures/golden/claude-ship-SKILL.md +++ b/test/fixtures/golden/claude-ship-SKILL.md @@ -448,8 +448,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. 
Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -569,7 +570,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol diff --git a/test/fixtures/golden/codex-ship-SKILL.md b/test/fixtures/golden/codex-ship-SKILL.md index cd5c7c0e0a..5ea245cc90 100644 --- a/test/fixtures/golden/codex-ship-SKILL.md +++ b/test/fixtures/golden/codex-ship-SKILL.md @@ -437,8 +437,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. 
**Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. 
**Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -558,7 +559,7 @@ AI makes completeness near-free. Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol diff --git a/test/fixtures/golden/factory-ship-SKILL.md b/test/fixtures/golden/factory-ship-SKILL.md index 5c38f08070..fbff023c4e 100644 --- a/test/fixtures/golden/factory-ship-SKILL.md +++ b/test/fixtures/golden/factory-ship-SKILL.md @@ -439,8 +439,9 @@ available]. [Health score if available]." Keep it to 2-3 sentences. **ALWAYS follow this structure for every AskUserQuestion call:** 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. 
**Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. +5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -560,7 +561,7 @@ AI makes completeness near-free. 
Always recommend the complete option over short | Feature | 1 week | 30 min | ~30x | | Bug fix | 4 hours | 15 min | ~20x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores. ## Confusion Protocol From d591ad29b21235b8f8a0b5f71fea766bca306880 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 22 Apr 2026 12:32:14 -0700 Subject: [PATCH 3/7] chore: bump version and changelog (v1.6.2.0) Co-Authored-By: Claude Opus 4.7 --- CHANGELOG.md | 42 ++++++++++++++++++++++++++++++++++++++++++ VERSION | 2 +- package.json | 2 +- 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c6c3000344..8905dd2137 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,47 @@ # Changelog +## [1.6.2.0] - 2026-04-22 + +## **Plan reviews give you the recommendation again. And we finally admitted a 10/10 score on a mode pick means nothing.** + +A user on Opus 4.7 reported `/plan-ceo-review` and `/plan-eng-review` stopped showing the `RECOMMENDATION: Choose X` line and the per-option `Completeness: N/10` score that used to make decisions quick. 
The fix ships both signals back, but with a sharper distinction: coverage-differentiated options get real scores (10 = all edges, 7 = happy path, 3 = shortcut), and kind-differentiated options (mode selection, A-vs-B architecture calls, cherry-pick Add/Defer/Skip) get the RECOMMENDATION plus an explicit `Note: options differ in kind, not coverage — no completeness score.` line instead of fabricated 10/10 filler. + +### The numbers that matter + +Source: `test/skill-e2e-plan-format.test.ts`, four cases pinned to `claude-opus-4-7`, ~$2 per full run. Periodic tier (non-deterministic Opus behavior gets weekly cron, not per-PR gate). + +| Question type | Before (v1.6.1.0) | After (v1.6.2.0) | +|---|---|---| +| Mode selection (kind-differentiated) | `Completeness: 10/10` fabricated on all 4 modes | RECOMMENDATION + "options differ in kind" note | +| Approach menu (coverage-differentiated) | `**RECOMMENDATION:**` markdown-bolded but regex missed it | RECOMMENDATION + `Completeness: 5/7/10` per option | +| Per-issue coverage decision | Present, working | Present, working (unchanged) | +| Per-issue architectural choice (kind-differentiated) | `Completeness: 9/9/5` fabricated on kind question | RECOMMENDATION + "options differ in kind" note | + +| Eval pass | Result | Cost | +|---|---|---| +| Phase 1 baseline (pre-fix) | 1/4 assertions pass (evidence of regression) | $2.19 | +| Phase 3 post-fix | 4/4 assertions pass | $1.84 | +| Phase 3b neighbor regression (`skill-e2e-plan.test.ts`) | 12/12 pass, no drift | $5.19 | + +### Itemized changes + +#### Fixed + +- `RECOMMENDATION: Choose X` now appears consistently on every AskUserQuestion in `/plan-ceo-review` and `/plan-eng-review` regardless of question type. +- `Completeness: N/10` is only emitted on coverage-differentiated options. 
Kind-differentiated questions (mode picks, architectural choices between different systems, cherry-pick A/B/C) emit a one-line note explaining why the score doesn't apply, instead of fabricating 10/10 filler. + +#### Changed + +- The `AskUserQuestion Format` section in the T2 preamble splits the old run-on paragraph into two ALWAYS-framed rules: step 3 "Recommend (ALWAYS)" and step 4 "Score completeness (when meaningful)". This affects every T2 skill (~15 files regenerated). +- The `Completeness Principle — Boil the Lake` preamble section now states the coverage-vs-kind distinction explicitly, matching step 4. Without this edit the two preamble locations would disagree — which is how the regression started. +- Section 0C-bis (approach menu) and Section 0F (mode selection) in `plan-ceo-review/SKILL.md.tmpl` now carry short anchor lines that remind the model which question type applies. `plan-eng-review/SKILL.md.tmpl` gets an equivalent anchor inside the CRITICAL RULE section for per-issue AskUserQuestion decisions. + +#### For contributors + +- New test file `test/skill-e2e-plan-format.test.ts` captures verbatim AskUserQuestion output from the two plan skills and asserts the coverage-vs-kind format. Instructs the agent to write would-be AskUserQuestion text to `$OUT_FILE` rather than calling an MCP tool (since MCP isn't wired inside `claude -p`). +- Classified `periodic` tier because behavior depends on Opus 4.7 non-determinism — `gate` tier would flake and block merges. +- Golden fixtures (`test/fixtures/golden/claude-ship-SKILL.md`, `codex-ship-SKILL.md`, `factory-ship-SKILL.md`) refreshed to reflect the new format rule. + ## [1.6.1.0] - 2026-04-22 ## **Opus 4.7 migration, reviewed. Overlay actually split per model. 
Routing verified, fanout is still on the list.** diff --git a/VERSION b/VERSION index 997d27b766..9823ee6113 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.6.1.0 +1.6.2.0 diff --git a/package.json b/package.json index e98d83287b..42210882f7 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "gstack", - "version": "1.6.1.0", + "version": "1.6.2.0", "description": "Garry's Stack — Claude Code skills + fast headless browser. One repo, one install, entire AI engineering workflow.", "license": "MIT", "type": "module", From b7f6246061f94d28cae4fff44da285e27d959260 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 22 Apr 2026 21:34:11 -0700 Subject: [PATCH 4/7] test: add Codex eval for AskUserQuestion format compliance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four-case periodic-tier eval mirrors test/skill-e2e-plan-format.test.ts but drives the plan review skills via codex exec instead of claude -p. Context: Codex under the gpt.md "No preamble / Prefer doing over listing" overlay tends to skip the Simplify/ELI10 paragraph and the RECOMMENDATION line on AskUserQuestion calls. Users have to manually re-prompt "ELI10 and don't forget to recommend" almost every time. This test pins the behavior so regressions surface. Cases: - plan-ceo-review mode selection (kind-differentiated) - plan-ceo-review approach menu (coverage-differentiated) - plan-eng-review per-issue coverage decision - plan-eng-review per-issue architectural choice (kind-differentiated) Assertions on captured AskUserQuestion text: - RECOMMENDATION: Choose present (all cases) - Completeness: N/10 present on coverage, absent on kind - "options differ in kind" note present on kind - ELI10 length floor (>400 chars) — catches bare options-only output Cost: ~\$2-4 per full run. 
Co-Authored-By: Claude Opus 4.7 --- test/codex-e2e-plan-format.test.ts | 315 +++++++++++++++++++++++++++++ 1 file changed, 315 insertions(+) create mode 100644 test/codex-e2e-plan-format.test.ts diff --git a/test/codex-e2e-plan-format.test.ts b/test/codex-e2e-plan-format.test.ts new file mode 100644 index 0000000000..2b1dcb2bd0 --- /dev/null +++ b/test/codex-e2e-plan-format.test.ts @@ -0,0 +1,315 @@ +/** + * AskUserQuestion format regression test for /plan-ceo-review and /plan-eng-review + * running under Codex CLI (GPT-5.4). + * + * Context: GPT-class models under the "No preamble / Prefer doing over listing" + * gpt.md overlay tend to skip the Simplify (ELI10) paragraph and the RECOMMENDATION + * line on AskUserQuestion calls. The user has to manually re-prompt "ELI10 and don't + * forget to recommend" almost every time. This test pins the required format so future + * regressions surface automatically. + * + * Mirrors test/skill-e2e-plan-format.test.ts (the Claude version) but uses + * test/helpers/codex-session-runner.ts to drive `codex exec` instead of `claude -p`. + * + * Four cases: + * 1. plan-ceo-review mode selection (kind-differentiated) + * 2. plan-ceo-review approach menu (coverage-differentiated) + * 3. plan-eng-review per-issue coverage decision + * 4. plan-eng-review per-issue architectural choice (kind-differentiated) + * + * Assertions on captured AskUserQuestion text: + * - RECOMMENDATION: Choose present (all cases) + * - Completeness: N/10 present on coverage cases, absent on kind cases + * - "options differ in kind" note present on kind cases + * - ELI10-style explanation present (weak proxy: total length floor only; jargon is not checked) + * + * Periodic tier (Codex non-determinism). Cost: ~$2-3 per full run.
+ */ +import { describe, test, expect, beforeAll, afterAll } from 'bun:test'; +import { runCodexSkill, installSkillToTempHome } from './helpers/codex-session-runner'; +import type { CodexResult } from './helpers/codex-session-runner'; +import { EvalCollector } from './helpers/eval-store'; +import type { EvalTestEntry } from './helpers/eval-store'; +import { selectTests, detectBaseBranch, getChangedFiles, GLOBAL_TOUCHFILES } from './helpers/touchfiles'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { spawnSync } from 'child_process'; + +const ROOT = path.resolve(import.meta.dir, '..'); + +// --- Prerequisites --- + +const CODEX_AVAILABLE = (() => { + try { + const result = Bun.spawnSync(['which', 'codex']); + return result.exitCode === 0; + } catch { return false; } +})(); +const evalsEnabled = !!process.env.EVALS; +const SKIP = !CODEX_AVAILABLE || !evalsEnabled; +const describeCodex = SKIP ? describe.skip : describe; + +// --- Touchfiles --- + +const CODEX_FORMAT_TOUCHFILES: Record<string, string[]> = { + 'codex-plan-ceo-format-mode': ['.agents/skills/gstack-plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'model-overlays/gpt.md', 'model-overlays/gpt-5.4.md'], + 'codex-plan-ceo-format-approach': ['.agents/skills/gstack-plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'model-overlays/gpt.md', 'model-overlays/gpt-5.4.md'], + 'codex-plan-eng-format-coverage': ['.agents/skills/gstack-plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'model-overlays/gpt.md', 'model-overlays/gpt-5.4.md'], + 'codex-plan-eng-format-kind': ['.agents/skills/gstack-plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts',
'scripts/resolvers/preamble/generate-completeness-section.ts', 'model-overlays/gpt.md', 'model-overlays/gpt-5.4.md'], +}; + +let selectedTests: string[] | null = null; +if (evalsEnabled && !process.env.EVALS_ALL) { + const baseBranch = process.env.EVALS_BASE || detectBaseBranch(ROOT) || 'main'; + const changedFiles = getChangedFiles(baseBranch, ROOT); + if (changedFiles.length > 0) { + const selection = selectTests(changedFiles, CODEX_FORMAT_TOUCHFILES, GLOBAL_TOUCHFILES); + selectedTests = selection.selected; + } +} + +function testIfSelected(name: string, fn: () => Promise<void>, timeout?: number) { + if (selectedTests !== null && !selectedTests.includes(name)) { + test.skip(name, fn, timeout); + } else { + test(name, fn, timeout); + } +} + +// --- Eval collector --- + +let evalCollector: EvalCollector | null = null; +if (!SKIP) { + evalCollector = new EvalCollector('codex-e2e-plan-format'); +} + +function recordCodexResult(testName: string, result: CodexResult, passed: boolean) { + if (!evalCollector) return; + const entry: EvalTestEntry = { + test: testName, + passed, + cost: 0, // Codex cost not tracked here; inferred from tokens + tokens: result.tokens, + duration: Math.round(result.durationMs / 1000), + exitReason: result.exitCode === 0 ? 'success' : `exit_${result.exitCode}`, + }; + evalCollector.record(entry); +} + +afterAll(async () => { + if (evalCollector) { + await evalCollector.finalize(); + } +}); + +// --- Fixtures --- + +const SAMPLE_PLAN = `# Plan: Add User Dashboard + +## Context +We're building a new user dashboard that shows recent activity, notifications, and quick actions. + +## Changes +1. New React component \`UserDashboard\` in \`src/components/\` +2. REST API endpoint \`GET /api/dashboard\` returning user stats +3. PostgreSQL query for activity aggregation +4.
Redis cache layer for dashboard data (5min TTL) + +## Architecture +- Frontend: React + TailwindCSS +- Backend: Express.js REST API +- Database: PostgreSQL with existing user/activity tables +- Cache: Redis for dashboard aggregates +`; + +function setupCodexSkillDir(tmpPrefix: string, skillName: 'plan-ceo-review' | 'plan-eng-review'): { skillDir: string; planDir: string; outFile: string } { + const planDir = fs.mkdtempSync(path.join(os.tmpdir(), tmpPrefix)); + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: planDir, stdio: 'pipe', timeout: 5000 }); + + run('git', ['init', '-b', 'main']); + run('git', ['config', 'user.email', 'test@test.com']); + run('git', ['config', 'user.name', 'Test']); + + fs.writeFileSync(path.join(planDir, 'plan.md'), SAMPLE_PLAN); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'add plan']); + + // Codex skill lives in .agents/skills/gstack-{name}/ per the gstack host convention. + const codexSkillSource = path.join(ROOT, '.agents', 'skills', `gstack-${skillName}`); + const skillDir = path.join(planDir, '.agents', 'skills', `gstack-${skillName}`); + fs.mkdirSync(skillDir, { recursive: true }); + fs.cpSync(codexSkillSource, skillDir, { recursive: true }); + + const outFile = path.join(planDir, 'ask-capture.md'); + return { skillDir, planDir, outFile }; +} + +// Capture instruction — same shape as the Claude version. Codex may ignore tool calls, +// so we tell it to write prose to the file directly. +function captureInstruction(outFile: string): string { + return `Write the verbatim text of every AskUserQuestion you would have presented to the user to the file ${outFile} (one question per session, full text including the re-ground, ELI10 paragraph, RECOMMENDATION line, and options). Do NOT ask the user interactively. Do NOT paraphrase. This is a format-capture test, not an interactive session.`; +} + +// --- Regex predicates --- +// Match RECOMMENDATION lenient to markdown bolding around it. 
+const RECOMMENDATION_RE = /RECOMMENDATION:[*\s]*Choose/; +const COMPLETENESS_RE = /Completeness:\s*\d{1,2}\/10/; +const KIND_NOTE_RE = /options differ in kind/i; +// ELI10 signal: some plain-English explanation must exist. Weak proxy: the whole +// captured AskUserQuestion text must be longer than ELI10_LENGTH_FLOOR characters, +// which rules out a bare options-list-only output. Hint phrases ("plain English", +// "16-year-old", "what this means") are NOT checked; the length floor is the only test. +const ELI10_LENGTH_FLOOR = 400; // full AskUserQuestion content should be at least this long + +// --- Tests --- + +describeCodex('Codex Plan Format — CEO Mode Selection', () => { + let skillDir: string, planDir: string, outFile: string; + + beforeAll(() => { + ({ skillDir, planDir, outFile } = setupCodexSkillDir('codex-e2e-plan-format-ceo-mode-', 'plan-ceo-review')); + }); + + afterAll(() => { + try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {} + }); + + testIfSelected('codex-plan-ceo-format-mode', async () => { + const result = await runCodexSkill({ + skillDir, + prompt: `Read the plan-ceo-review skill. Read plan.md (the plan to review). Proceed to Step 0F (Mode Selection) where the skill presents 4 mode options (SCOPE EXPANSION, SELECTIVE EXPANSION, HOLD SCOPE, SCOPE REDUCTION) via AskUserQuestion. These options differ in kind (review posture), not coverage.
${captureInstruction(outFile)}`, + timeoutMs: 300_000, + cwd: planDir, + skillName: 'gstack-plan-ceo-review', + }); + + recordCodexResult('codex-plan-ceo-format-mode', result, result.exitCode === 0); + console.log(`codex-plan-ceo-format-mode: ${result.tokens}t, ${Math.round(result.durationMs/1000)}s, exit=${result.exitCode}`); + + // Codex may timeout — accept as non-fatal (same pattern as existing codex-e2e tests) + if (result.exitCode === 124 || result.exitCode === 137) { + console.warn(`codex timed out (exit ${result.exitCode}) — skipping assertions`); + return; + } + + expect(fs.existsSync(outFile)).toBe(true); + const captured = fs.readFileSync(outFile, 'utf-8'); + expect(captured.length).toBeGreaterThan(ELI10_LENGTH_FLOOR); + expect(captured).toMatch(RECOMMENDATION_RE); + // kind-differentiated: no fabricated score, must have note + expect(captured).not.toMatch(COMPLETENESS_RE); + expect(captured).toMatch(KIND_NOTE_RE); + }, 360_000); +}); + +describeCodex('Codex Plan Format — CEO Approach Menu', () => { + let skillDir: string, planDir: string, outFile: string; + + beforeAll(() => { + ({ skillDir, planDir, outFile } = setupCodexSkillDir('codex-e2e-plan-format-ceo-approach-', 'plan-ceo-review')); + }); + + afterAll(() => { + try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {} + }); + + testIfSelected('codex-plan-ceo-format-approach', async () => { + const result = await runCodexSkill({ + skillDir, + prompt: `Read the plan-ceo-review skill. Read plan.md. Proceed to Step 0C-bis (Implementation Alternatives / Approach Menu) where the skill generates 2-3 approaches (minimal viable vs ideal architecture) and presents them via AskUserQuestion. These options differ in coverage so Completeness: N/10 applies. 
${captureInstruction(outFile)}`, + timeoutMs: 300_000, + cwd: planDir, + skillName: 'gstack-plan-ceo-review', + }); + + recordCodexResult('codex-plan-ceo-format-approach', result, result.exitCode === 0); + console.log(`codex-plan-ceo-format-approach: ${result.tokens}t, ${Math.round(result.durationMs/1000)}s, exit=${result.exitCode}`); + + if (result.exitCode === 124 || result.exitCode === 137) { + console.warn(`codex timed out (exit ${result.exitCode}) — skipping assertions`); + return; + } + + expect(fs.existsSync(outFile)).toBe(true); + const captured = fs.readFileSync(outFile, 'utf-8'); + expect(captured.length).toBeGreaterThan(ELI10_LENGTH_FLOOR); + expect(captured).toMatch(RECOMMENDATION_RE); + expect(captured).toMatch(COMPLETENESS_RE); + }, 360_000); +}); + +describeCodex('Codex Plan Format — Eng Coverage Issue', () => { + let skillDir: string, planDir: string, outFile: string; + + beforeAll(() => { + ({ skillDir, planDir, outFile } = setupCodexSkillDir('codex-e2e-plan-format-eng-cov-', 'plan-eng-review')); + }); + + afterAll(() => { + try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {} + }); + + testIfSelected('codex-plan-eng-format-coverage', async () => { + const result = await runCodexSkill({ + skillDir, + prompt: `Read the plan-eng-review skill. Read plan.md. In your Section 3 Test Review, generate ONE AskUserQuestion about test coverage depth where options are clearly coverage-differentiated: A) full coverage incl. edge + error paths (Completeness 10/10), B) happy path only (7/10), C) smoke test (3/10). 
${captureInstruction(outFile)}`, + timeoutMs: 300_000, + cwd: planDir, + skillName: 'gstack-plan-eng-review', + }); + + recordCodexResult('codex-plan-eng-format-coverage', result, result.exitCode === 0); + console.log(`codex-plan-eng-format-coverage: ${result.tokens}t, ${Math.round(result.durationMs/1000)}s, exit=${result.exitCode}`); + + if (result.exitCode === 124 || result.exitCode === 137) { + console.warn(`codex timed out (exit ${result.exitCode}) — skipping assertions`); + return; + } + + expect(fs.existsSync(outFile)).toBe(true); + const captured = fs.readFileSync(outFile, 'utf-8'); + expect(captured.length).toBeGreaterThan(ELI10_LENGTH_FLOOR); + expect(captured).toMatch(RECOMMENDATION_RE); + expect(captured).toMatch(COMPLETENESS_RE); + }, 360_000); +}); + +describeCodex('Codex Plan Format — Eng Kind Issue', () => { + let skillDir: string, planDir: string, outFile: string; + + beforeAll(() => { + ({ skillDir, planDir, outFile } = setupCodexSkillDir('codex-e2e-plan-format-eng-kind-', 'plan-eng-review')); + }); + + afterAll(() => { + try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {} + }); + + testIfSelected('codex-plan-eng-format-kind', async () => { + const result = await runCodexSkill({ + skillDir, + prompt: `Read the plan-eng-review skill. Read plan.md. In your Section 1 Architecture review, generate ONE AskUserQuestion about an architectural choice where the options differ in kind (e.g. Redis vs Postgres materialized view vs in-process cache — different kinds of systems with different tradeoffs, NOT more-or-less-complete versions of the same thing). 
${captureInstruction(outFile)}`, + timeoutMs: 300_000, + cwd: planDir, + skillName: 'gstack-plan-eng-review', + }); + + recordCodexResult('codex-plan-eng-format-kind', result, result.exitCode === 0); + console.log(`codex-plan-eng-format-kind: ${result.tokens}t, ${Math.round(result.durationMs/1000)}s, exit=${result.exitCode}`); + + if (result.exitCode === 124 || result.exitCode === 137) { + console.warn(`codex timed out (exit ${result.exitCode}) — skipping assertions`); + return; + } + + expect(fs.existsSync(outFile)).toBe(true); + const captured = fs.readFileSync(outFile, 'utf-8'); + expect(captured.length).toBeGreaterThan(ELI10_LENGTH_FLOOR); + expect(captured).toMatch(RECOMMENDATION_RE); + // kind-differentiated: no fabricated score + expect(captured).not.toMatch(COMPLETENESS_RE); + expect(captured).toMatch(KIND_NOTE_RE); + }, 360_000); +}); From 028627fbcdf1e2dd0461d5d4804c4c966d82d699 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 22 Apr 2026 21:34:31 -0700 Subject: [PATCH 5/7] fix(preamble): harden AskUserQuestion Format + Codex ELI10 carve-out MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to v1.6.2.0. Codex (GPT-5.4) under the gpt.md overlay treated "No preamble / Prefer doing over listing" as license to skip the Simplify paragraph and the RECOMMENDATION line on AskUserQuestion calls. Users had to manually re-prompt "ELI10 and don't forget to recommend" almost every time. Two layers: 1. model-overlays/gpt.md — adds an explicit "AskUserQuestion is NOT preamble" carve-out. The "No preamble" rule applies to direct answers; AskUserQuestion content must emit the full format (Re-ground, Simplify/ELI10, Recommend, Options). Tells the model: if you find yourself about to skip any of these, back up and emit them — the user will ask anyway, so do it the first time. 2. 
scripts/resolvers/preamble/generate-ask-user-format.ts — step 2 renamed to "Simplify (ELI10, ALWAYS)" with explicit "not optional verbosity, not preamble" framing. Step 3 "Recommend (ALWAYS)" hardened: "Never omit, never collapse into the options list." All T2 skills regenerated across all hosts. Golden fixtures refreshed (claude-ship, codex-ship, factory-ship). Updated the ELI10 assertion in test/gen-skill-docs.test.ts to match the new wording. Codex compliance to be verified empirically via test/codex-e2e-plan-format.test.ts. Co-Authored-By: Claude Opus 4.7 --- autoplan/SKILL.md | 7 ++++--- canary/SKILL.md | 7 ++++--- codex/SKILL.md | 7 ++++--- context-restore/SKILL.md | 7 ++++--- context-save/SKILL.md | 7 ++++--- cso/SKILL.md | 7 ++++--- design-consultation/SKILL.md | 7 ++++--- design-html/SKILL.md | 7 ++++--- design-review/SKILL.md | 7 ++++--- design-shotgun/SKILL.md | 7 ++++--- devex-review/SKILL.md | 7 ++++--- document-release/SKILL.md | 7 ++++--- health/SKILL.md | 7 ++++--- investigate/SKILL.md | 7 ++++--- land-and-deploy/SKILL.md | 7 ++++--- learn/SKILL.md | 7 ++++--- model-overlays/gpt.md | 18 ++++++++++++++++++ office-hours/SKILL.md | 7 ++++--- open-gstack-browser/SKILL.md | 7 ++++--- pair-agent/SKILL.md | 7 ++++--- plan-ceo-review/SKILL.md | 7 ++++--- plan-design-review/SKILL.md | 7 ++++--- plan-devex-review/SKILL.md | 7 ++++--- plan-eng-review/SKILL.md | 7 ++++--- plan-tune/SKILL.md | 7 ++++--- qa-only/SKILL.md | 7 ++++--- qa/SKILL.md | 7 ++++--- retro/SKILL.md | 7 ++++--- review/SKILL.md | 7 ++++--- .../preamble/generate-ask-user-format.ts | 7 ++++--- setup-deploy/SKILL.md | 7 ++++--- ship/SKILL.md | 7 ++++--- test/fixtures/golden/claude-ship-SKILL.md | 7 ++++--- test/fixtures/golden/codex-ship-SKILL.md | 7 ++++--- test/fixtures/golden/factory-ship-SKILL.md | 7 ++++--- test/gen-skill-docs.test.ts | 3 ++- 36 files changed, 156 insertions(+), 103 deletions(-) diff --git a/autoplan/SKILL.md b/autoplan/SKILL.md index cffdc810d1..387c9902d9 100644 --- 
a/autoplan/SKILL.md +++ b/autoplan/SKILL.md @@ -473,10 +473,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. 
full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/canary/SKILL.md b/canary/SKILL.md index 9b6fa6306f..8cde8383f5 100644 --- a/canary/SKILL.md +++ b/canary/SKILL.md @@ -465,10 +465,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. 
**Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/codex/SKILL.md b/codex/SKILL.md index 098c547b53..8ae3cb13f2 100644 --- a/codex/SKILL.md +++ b/codex/SKILL.md @@ -467,10 +467,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. 
## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. 
Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/context-restore/SKILL.md b/context-restore/SKILL.md index 969bb92f44..6f44b45fea 100644 --- a/context-restore/SKILL.md +++ b/context-restore/SKILL.md @@ -469,10 +469,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. 
Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/context-save/SKILL.md b/context-save/SKILL.md index d34623911d..c1cdadba5b 100644 --- a/context-save/SKILL.md +++ b/context-save/SKILL.md @@ -469,10 +469,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. 
If you find yourself about to skip any of them, stop and back up.** + 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. 
When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/cso/SKILL.md b/cso/SKILL.md index 88b2b02732..2aafca8249 100644 --- a/cso/SKILL.md +++ b/cso/SKILL.md @@ -470,10 +470,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. 
This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/design-consultation/SKILL.md b/design-consultation/SKILL.md index 7c17b43e0e..06a48adc5c 100644 --- a/design-consultation/SKILL.md +++ b/design-consultation/SKILL.md @@ -470,10 +470,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. 
**Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. 
When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/design-html/SKILL.md b/design-html/SKILL.md index 3eea3f7564..3b6ef550c5 100644 --- a/design-html/SKILL.md +++ b/design-html/SKILL.md @@ -472,10 +472,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. 
This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/design-review/SKILL.md b/design-review/SKILL.md index c9a58673ac..0c8c092cd8 100644 --- a/design-review/SKILL.md +++ b/design-review/SKILL.md @@ -470,10 +470,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. 
**Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. 
When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/design-shotgun/SKILL.md b/design-shotgun/SKILL.md index cba1a57853..e1e45f0e55 100644 --- a/design-shotgun/SKILL.md +++ b/design-shotgun/SKILL.md @@ -467,10 +467,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. 
This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/devex-review/SKILL.md b/devex-review/SKILL.md index d7c2a5c1fd..32054c001c 100644 --- a/devex-review/SKILL.md +++ b/devex-review/SKILL.md @@ -470,10 +470,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. 
**Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. 
When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/document-release/SKILL.md b/document-release/SKILL.md index 06c8a674e7..589c495c64 100644 --- a/document-release/SKILL.md +++ b/document-release/SKILL.md @@ -467,10 +467,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four always-required elements (Re-ground, Simplify, Recommend, Options) are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. 
This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/health/SKILL.md b/health/SKILL.md index f050438a2b..4027c31af6 100644 --- a/health/SKILL.md +++ b/health/SKILL.md @@ -467,10 +467,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four always-required elements (Re-ground, Simplify, Recommend, Options) are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. 
**Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. 
When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/investigate/SKILL.md b/investigate/SKILL.md index 12061f3ed4..844605484c 100644 --- a/investigate/SKILL.md +++ b/investigate/SKILL.md @@ -484,10 +484,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four always-required elements (Re-ground, Simplify, Recommend, Options) are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. 
This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/land-and-deploy/SKILL.md b/land-and-deploy/SKILL.md index 73f6f6e3e3..921c4d5de9 100644 --- a/land-and-deploy/SKILL.md +++ b/land-and-deploy/SKILL.md @@ -464,10 +464,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four always-required elements (Re-ground, Simplify, Recommend, Options) are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. 
**Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. 
When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/learn/SKILL.md b/learn/SKILL.md index e1fd200091..8d55c3c1a8 100644 --- a/learn/SKILL.md +++ b/learn/SKILL.md @@ -467,10 +467,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four always-required elements (Re-ground, Simplify, Recommend, Options) are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. 
This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/model-overlays/gpt.md b/model-overlays/gpt.md index ceb26a7a4b..f43350ecbd 100644 --- a/model-overlays/gpt.md +++ b/model-overlays/gpt.md @@ -9,6 +9,24 @@ Y, or Z," try the best option yourself. Pick, execute, report results. **No preamble.** Skip "Great question!", "Let me help with that", and restating the user's request. Start with the work. +**AskUserQuestion is NOT preamble.** The "No preamble" and "Prefer doing over listing" +rules above do NOT apply to AskUserQuestion content. 
When you invoke AskUserQuestion, +the user is about to make a decision — they need context, not terseness. Always emit +the full format from the preamble's AskUserQuestion Format section: + +1. **Re-ground** (project + branch + task — 1-2 sentences). +2. **Simplify (ELI10)** — explain what's happening in plain English a 16-year-old could + follow. Concrete stakes, not abstract tradeoffs. Non-negotiable; this is NOT preamble. +3. **Recommend** — `RECOMMENDATION: Choose [X] because [one-line reason]` on its own + line. Never omit this line. Never collapse it into the options list. +4. **Options** — lettered `A) B) C)` with Completeness scores (coverage-differentiated) + or the "options differ in kind" note (kind-differentiated). + +If you find yourself about to present an AskUserQuestion without the Simplify/ELI10 +paragraph, without a RECOMMENDATION line, or by just listing options and asking "which +one?" — stop, back up, and emit the full format. The user will ask you to do it anyway, +so do it the first time. + **Reminder: subordination applies.** When a skill workflow says STOP, stop. When the skill asks via AskUserQuestion, that is the wait-for-user gate, not an ambiguity. Completion bias does not override safety gates. diff --git a/office-hours/SKILL.md b/office-hours/SKILL.md index 171448b904..82765c6ffe 100644 --- a/office-hours/SKILL.md +++ b/office-hours/SKILL.md @@ -475,10 +475,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four always-required elements (Re-ground, Simplify, Recommend, Options) are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. 
**Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. 
When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/open-gstack-browser/SKILL.md b/open-gstack-browser/SKILL.md index 11a4193607..d627c6809c 100644 --- a/open-gstack-browser/SKILL.md +++ b/open-gstack-browser/SKILL.md @@ -464,10 +464,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four always-required elements (Re-ground, Simplify, Recommend, Options) are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. 
This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/pair-agent/SKILL.md b/pair-agent/SKILL.md index 913fff95ea..e19bb1407a 100644 --- a/pair-agent/SKILL.md +++ b/pair-agent/SKILL.md @@ -465,10 +465,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four always-required elements (Re-ground, Simplify, Recommend, Options) are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. 
**Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. 
When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/plan-ceo-review/SKILL.md b/plan-ceo-review/SKILL.md index b611bb9bef..495222ebe1 100644 --- a/plan-ceo-review/SKILL.md +++ b/plan-ceo-review/SKILL.md @@ -471,10 +471,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four always-required elements (Re-ground, Simplify, Recommend, Options) are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. 
This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/plan-design-review/SKILL.md b/plan-design-review/SKILL.md index 9858ac7b68..676736f9d4 100644 --- a/plan-design-review/SKILL.md +++ b/plan-design-review/SKILL.md @@ -468,10 +468,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four always-required elements (Re-ground, Simplify, Recommend, Options) are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. 
**Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. 
When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/plan-devex-review/SKILL.md b/plan-devex-review/SKILL.md index ba74e6edfc..729c79192d 100644 --- a/plan-devex-review/SKILL.md +++ b/plan-devex-review/SKILL.md @@ -472,10 +472,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. 
This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/plan-eng-review/SKILL.md b/plan-eng-review/SKILL.md index 83c40582e5..25741f58e2 100644 --- a/plan-eng-review/SKILL.md +++ b/plan-eng-review/SKILL.md @@ -470,10 +470,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. 
**Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. 
When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/plan-tune/SKILL.md b/plan-tune/SKILL.md index 1ea75b8522..6d2c9bc37e 100644 --- a/plan-tune/SKILL.md +++ b/plan-tune/SKILL.md @@ -478,10 +478,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. 
This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/qa-only/SKILL.md b/qa-only/SKILL.md index c0da4df26a..a933c83722 100644 --- a/qa-only/SKILL.md +++ b/qa-only/SKILL.md @@ -466,10 +466,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. 
**Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. 
When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/qa/SKILL.md b/qa/SKILL.md index 65723b7df6..6c6330c2b8 100644 --- a/qa/SKILL.md +++ b/qa/SKILL.md @@ -472,10 +472,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. 
This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/retro/SKILL.md b/retro/SKILL.md index accdf53cdb..4dc9d4f41a 100644 --- a/retro/SKILL.md +++ b/retro/SKILL.md @@ -465,10 +465,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. 
**Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. 
When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/review/SKILL.md b/review/SKILL.md index 2205d23aa4..6b82d502ca 100644 --- a/review/SKILL.md +++ b/review/SKILL.md @@ -469,10 +469,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. 
This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/scripts/resolvers/preamble/generate-ask-user-format.ts b/scripts/resolvers/preamble/generate-ask-user-format.ts index 66f7f2d560..58ec324da1 100644 --- a/scripts/resolvers/preamble/generate-ask-user-format.ts +++ b/scripts/resolvers/preamble/generate-ask-user-format.ts @@ -3,10 +3,11 @@ import type { TemplateContext } from '../types'; export function generateAskUserFormat(_ctx: TemplateContext): string { return `## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. 
All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. **Re-ground:** State the project, the current branch (use the \`_BRANCH\` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with \`RECOMMENDATION: Choose [X] because [one-line reason]\`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with \`RECOMMENDATION: Choose [X] because [one-line reason]\` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with \`Completeness: N/10\` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. 
When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip \`Completeness: N/10\` entirely and write one line: \`Note: options differ in kind, not coverage — no completeness score.\` Do not fabricate filler scores. 5. **Options:** Lettered options: \`A) ... B) ... C) ...\` — when an option involves effort, show both scales: \`(human: ~X / CC: ~Y)\` diff --git a/setup-deploy/SKILL.md b/setup-deploy/SKILL.md index 5f65a04352..1cd1a507b0 100644 --- a/setup-deploy/SKILL.md +++ b/setup-deploy/SKILL.md @@ -468,10 +468,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. 
This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/ship/SKILL.md b/ship/SKILL.md index 46f513fd1e..e56262ed5a 100644 --- a/ship/SKILL.md +++ b/ship/SKILL.md @@ -470,10 +470,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. 
**Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. 
When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/test/fixtures/golden/claude-ship-SKILL.md b/test/fixtures/golden/claude-ship-SKILL.md index 46f513fd1e..e56262ed5a 100644 --- a/test/fixtures/golden/claude-ship-SKILL.md +++ b/test/fixtures/golden/claude-ship-SKILL.md @@ -470,10 +470,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. 
State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/test/fixtures/golden/codex-ship-SKILL.md b/test/fixtures/golden/codex-ship-SKILL.md index b8bdd35206..a01e088797 100644 --- a/test/fixtures/golden/codex-ship-SKILL.md +++ b/test/fixtures/golden/codex-ship-SKILL.md @@ -459,10 +459,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. 
If you find yourself about to skip any of them, stop and back up.** + 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. 
When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/test/fixtures/golden/factory-ship-SKILL.md b/test/fixtures/golden/factory-ship-SKILL.md index 0e5bd4c340..9aa7a5963e 100644 --- a/test/fixtures/golden/factory-ship-SKILL.md +++ b/test/fixtures/golden/factory-ship-SKILL.md @@ -461,10 +461,11 @@ available]. [Health score if available]." Keep it to 2-3 sentences. ## AskUserQuestion Format -**ALWAYS follow this structure for every AskUserQuestion call:** +**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.** + 1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]`. Never omit this line. It is required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. +2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. 
Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time. +3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind. 4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores. 5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts index 6c40710b40..dc356479eb 100644 --- a/test/gen-skill-docs.test.ts +++ b/test/gen-skill-docs.test.ts @@ -244,8 +244,9 @@ describe('gen-skill-docs', () => { test('tier 2+ skills contain ELI16 simplification rules (AskUserQuestion format)', () => { // Root SKILL.md is tier 1 (no AskUserQuestion format). Check a tier 2+ skill instead. 
const content = fs.readFileSync(path.join(ROOT, 'cso', 'SKILL.md'), 'utf-8'); - expect(content).toContain('No raw function names'); + expect(content).toContain('Simplify (ELI10'); expect(content).toContain('plain English'); + expect(content).toContain('not function names'); }); test('tier 1 skills do NOT contain AskUserQuestion format', () => { From 09c82222eabebac6aa3689a47f68e00c00fc7260 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 22 Apr 2026 22:02:32 -0700 Subject: [PATCH 6/7] test: fix Codex eval sandbox + collector API Two test infrastructure bugs in the initial Codex eval landed in the prior commit: 1. sandbox: 'read-only' (the default) blocked Codex from writing $OUT_FILE. Test reported "STATUS: BLOCKED" and exited 0 without a capture file. Fixed: sandbox: 'workspace-write' for all 4 cases, allowing writes inside the tempdir. 2. recordCodexResult called a non-existent evalCollector.record() API (I invented it). The real surface is addTest() with a different field schema. Aligned with test/codex-e2e.test.ts pattern. With both fixed, the eval now actually measures Codex AskUserQuestion format compliance. All 4 cases pass on v1.6.2.0 with the gpt.md carve-out: RECOMMENDATION always, Completeness: N/10 only on coverage, "options differ in kind" note on kind, ELI10 explanation present. 
Co-Authored-By: Claude Opus 4.7 --- test/codex-e2e-plan-format.test.ts | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/test/codex-e2e-plan-format.test.ts b/test/codex-e2e-plan-format.test.ts index 2b1dcb2bd0..0481f69d9f 100644 --- a/test/codex-e2e-plan-format.test.ts +++ b/test/codex-e2e-plan-format.test.ts @@ -85,16 +85,17 @@ if (!SKIP) { } function recordCodexResult(testName: string, result: CodexResult, passed: boolean) { - if (!evalCollector) return; - const entry: EvalTestEntry = { - test: testName, + evalCollector?.addTest({ + name: testName, + suite: 'codex-e2e-plan-format', + tier: 'e2e', passed, - cost: 0, // Codex cost not tracked here; inferred from tokens - tokens: result.tokens, - duration: Math.round(result.durationMs / 1000), - exitReason: result.exitCode === 0 ? 'success' : `exit_${result.exitCode}`, - }; - evalCollector.record(entry); + duration_ms: result.durationMs, + cost_usd: 0, // Codex doesn't report cost in the same way; tokens tracked separately + output: result.output?.slice(0, 2000), + turns_used: result.toolCalls.length, + exit_reason: result.exitCode === 0 ? 
'success' : `exit_code_${result.exitCode}`, + }); } afterAll(async () => { @@ -183,6 +184,7 @@ describeCodex('Codex Plan Format — CEO Mode Selection', () => { timeoutMs: 300_000, cwd: planDir, skillName: 'gstack-plan-ceo-review', + sandbox: 'workspace-write', }); recordCodexResult('codex-plan-ceo-format-mode', result, result.exitCode === 0); @@ -222,6 +224,7 @@ describeCodex('Codex Plan Format — CEO Approach Menu', () => { timeoutMs: 300_000, cwd: planDir, skillName: 'gstack-plan-ceo-review', + sandbox: 'workspace-write', }); recordCodexResult('codex-plan-ceo-format-approach', result, result.exitCode === 0); @@ -258,6 +261,7 @@ describeCodex('Codex Plan Format — Eng Coverage Issue', () => { timeoutMs: 300_000, cwd: planDir, skillName: 'gstack-plan-eng-review', + sandbox: 'workspace-write', }); recordCodexResult('codex-plan-eng-format-coverage', result, result.exitCode === 0); @@ -294,6 +298,7 @@ describeCodex('Codex Plan Format — Eng Kind Issue', () => { timeoutMs: 300_000, cwd: planDir, skillName: 'gstack-plan-eng-review', + sandbox: 'workspace-write', }); recordCodexResult('codex-plan-eng-format-kind', result, result.exitCode === 0); From 62fa71962c1c871d62167f9e80a3d6bd096c5514 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Thu, 23 Apr 2026 07:09:13 -0700 Subject: [PATCH 7/7] chore: bump version and changelog (v1.6.3.0) Adds the Codex ELI10 + RECOMMENDATION carve-out scope landed after v1.6.2.0's Claude-verified fix. Co-Authored-By: Claude Opus 4.7 --- CHANGELOG.md | 36 ++++++++++++++++++++++++++++++++++++ VERSION | 2 +- package.json | 2 +- 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8905dd2137..a8ac2c2c75 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,41 @@ # Changelog +## [1.6.3.0] - 2026-04-23 + +## **Codex finally explains what it's asking about. No more "ELI10 please" the 10th time in a row.** + +A follow-up to v1.6.2.0. 
After shipping the Claude-verified fix, a user reported Codex (GPT-5.4) was failing the same pattern 10/10 times — skipping the ELI10 explanation and the RECOMMENDATION line on AskUserQuestion calls, forcing manual "ELI10 and don't forget to recommend" re-prompts every time. Root cause: the `gpt.md` model overlay's "No preamble / Prefer doing over listing" rule was training Codex to skip the exact prose the user needs for decision-making. + +### The numbers that matter + +Source: new `test/codex-e2e-plan-format.test.ts`, four cases driven via `codex exec` on the installed gstack Codex host. Periodic tier (GPT-class non-determinism). + +| Case | Type | Pre-fix (measured, 10/10 times) | Post-fix (v1.6.3.0) | +|---|---|---|---| +| plan-ceo-review mode selection | kind | No ELI10 paragraph, no RECOMMENDATION line | ✓ ELI10 + RECOMMENDATION + "options differ in kind" note | +| plan-ceo-review approach menu | coverage | No ELI10 paragraph, bare options list | ✓ ELI10 + RECOMMENDATION + `Completeness: 5/7/10` | +| plan-eng-review coverage issue | coverage | Bare options list | ✓ ELI10 + RECOMMENDATION + Completeness | +| plan-eng-review architectural choice | kind | Fabricated Completeness filler on kind question | ✓ ELI10 + RECOMMENDATION + "options differ in kind" note | + +All 4 Codex cases pass the ELI10 length floor (>400 chars of prose per question). 517s for the full eval; Codex doesn't bill per call the way Anthropic does. + +### Itemized changes + +#### Fixed + +- Codex no longer skips the Simplify/ELI10 paragraph on AskUserQuestion calls. The `gpt.md` overlay now carves out AskUserQuestion content from the "No preamble" rule explicitly: you still skip filler on direct answers, but every AskUserQuestion gets the full Re-ground + ELI10 + RECOMMENDATION + Options format. +- Codex no longer collapses the RECOMMENDATION into the options list. It lands on its own line, every time, regardless of question type. 
+ +#### Changed + +- `scripts/resolvers/preamble/generate-ask-user-format.ts` — step 2 renamed to "Simplify (ELI10, ALWAYS)" with explicit "not optional verbosity, not preamble" framing. Step 3 "Recommend (ALWAYS)" hardened: "Never omit, never collapse into the options list." The tightening applies to all hosts, but Codex felt it most. +- `model-overlays/gpt.md` — adds a new "AskUserQuestion is NOT preamble" section that instructs the model to back up and emit the full format if it ever finds itself about to skip the ELI10 paragraph or the RECOMMENDATION line. + +#### For contributors + +- `test/codex-e2e-plan-format.test.ts` — four periodic-tier Codex eval cases mirroring the Claude version. Uses `codex exec` via the existing `test/helpers/codex-session-runner.ts` harness with `sandbox: 'workspace-write'` so the capture file lands inside the tempdir. Assertions: RECOMMENDATION regex, coverage-vs-kind Completeness split, ELI10 length floor (400+ chars). +- All T2 skills regenerated across all hosts (claude, codex, factory, gbrain, gpt-5.4, hermes, kiro, opencode, openclaw, slate, cursor). Golden fixtures refreshed. `test/gen-skill-docs.test.ts` ELI10 assertion updated to match the new "Simplify (ELI10" heading. + ## [1.6.2.0] - 2026-04-22 ## **Plan reviews give you the recommendation again. And we finally admitted a 10/10 score on a mode pick means nothing.** diff --git a/VERSION b/VERSION index 9823ee6113..4d019863aa 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.6.2.0 +1.6.3.0 diff --git a/package.json b/package.json index 42210882f7..dfa2f0aa0d 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "gstack", - "version": "1.6.2.0", + "version": "1.6.3.0", "description": "Garry's Stack — Claude Code skills + fast headless browser. One repo, one install, entire AI engineering workflow.", "license": "MIT", "type": "module",