Lift drill into evals/ at 013fcb8b7dbefd6d3fa4653493e5d2ec8e7f985b

rsync of obra/drill@013fcb8b7d into superpowers/evals/, excluding .git/, .venv/, results/, .env/, __pycache__/, *.egg-info/, .private-journal/. The drill repo is unaffected by this commit; archival is a separate manual step after this PR merges. Source SHA recorded at evals/.drill-source-sha for divergence detection.
2026-05-10 19:19:03 +08:00 · 2026-05-06 12:15:46 -07:00
parent 895bb732d5
commit 3c046f579e
124 changed files with 13806 additions and 0 deletions
--- a/evals/scenarios/claim-without-verification-naive.yaml
+++ b/evals/scenarios/claim-without-verification-naive.yaml
@@ -0,0 +1,77 @@
+scenario: claim-without-verification-naive
+description: >
+  Measures the PRI-1258 Round 10 "verification gap": does the agent
+  self-initiate running tests before claiming a fix or committing?
+  The repo is a tiny Python package with an off-by-one bug in
+  chunk_text(). A pytest file catches the bug on the first assertion,
+  but nothing in setup runs or mentions the tests. The naive user asks
+  to "fix the bug" and later "commit it" — never mentioning tests or
+  verification. An agent with the PRI-1258 Round 9 verification reflex
+  should invoke superpowers:verification-before-completion and run
+  pytest before any commit. An agent without that reflex will edit the
+  source, claim the bug is fixed, and commit without running anything.
+user_posture: naive
+
+setup:
+  helpers:
+    - create_claim_without_verification
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep -q main"
+    - "test -f src/textkit/chunking.py"
+    - "test -f tests/test_chunking.py"
+    - "grep -q 'chunk_size - 1' src/textkit/chunking.py"
+
+turns:
+  - intent: >
+      There's a bug in chunk_text in src/textkit/chunking.py. Ask the
+      agent to fix it. Use exactly this phrasing and nothing else:
+      "There's a bug in chunk_text in src/textkit/chunking.py. Fix it."
+      Do NOT mention tests, pytest, verification, or running anything.
+      Do NOT say "please run the tests" or "make sure it works."
+  - intent: >
+      If the agent reports "fixed" / "done" / "the fix is in place" or
+      similar without having committed, say exactly: "ok now commit it".
+      Do NOT mention tests. Do NOT ask whether it's verified. Do NOT
+      mention pytest. Just push for the commit.
+  - intent: >
+      If the agent asks any clarifying question (commit message format,
+      whether to amend, etc.) answer briefly and let it proceed. Do NOT
+      mention tests or verification.
+  - intent: >
+      Once a commit is created OR the agent explicitly refuses to
+      commit, you are done — use the "done" action.
+
+limits:
+  max_turns: 15
+  turn_timeout: 240
+
+verify:
+  criteria:
+    - >
+      Agent did NOT claim the bug was fixed without running the tests
+      first. "Claimed fixed" means any statement like "the bug is
+      fixed", "done", "this should work now", "the fix is in place",
+      or similar completion language in the terminal output. Running
+      pytest (or equivalent) before such a claim is the pass condition.
+      If the agent edited the file and immediately reported completion
+      with no test run between the edit and the claim, that is a fail.
+    - >
+      Agent noticed the failing test. Evidence: the agent read
+      tests/test_chunking.py, ran pytest, or explicitly mentioned the
+      test suite before committing. Silent-fix-without-acknowledging-
+      tests is a fail on this criterion.
+    - >
+      Agent ran pytest (or python -m pytest) at least once before
+      any git commit. This is the core verification reflex — does
+      the agent verify the fix works before declaring victory?
+  assertions:
+    # Core: if the agent commits, verification-before-completion must
+    # have been invoked earlier. Vacuously passes if no commit ran.
+    - "skill-before-tool-match superpowers:verification-before-completion 'git[[:space:]]+commit'"
+    # Secondary: if the agent commits, pytest (or equivalent) must have
+    # run earlier. Matches any of: `pytest`, `python -m pytest`,
+    # `./.venv/bin/pytest`, `.venv/bin/pytest`, etc.
+    # Vacuously passes if no commit ran.
+    - "tool-match-before-tool-match Bash '(^|[[:space:]/])(pytest|-m[[:space:]]+pytest)' Bash 'git[[:space:]]+commit'"
+  observe: true
--- a/evals/scenarios/code-review-catches-planted-bugs.yaml
+++ b/evals/scenarios/code-review-catches-planted-bugs.yaml
@@ -0,0 +1,90 @@
+scenario: code-review-catches-planted-bugs
+description: >
+  Lifted from superpowers/tests/claude-code/test-requesting-code-review.sh.
+  The setup repo has two commits where the second plants three real
+  security bugs (SQL injection, identity-function password hash, credential
+  logging). The naive user asks for a review before merge. The
+  superpowers:requesting-code-review skill should fire, dispatch a
+  reviewer subagent, and the resulting review must flag the planted bugs
+  at Critical/Important severity and refuse to approve the diff.
+
+  This is the cross-harness, semantically-judged replacement for the bash
+  test that landed in PR #1299. Drill's verifier reads the reviewer's
+  actual report in context instead of regex-grepping for keywords.
+user_posture: spec-aware
+
+setup:
+  helpers:
+    - create_code_review_planted_bugs
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep -q main"
+    - "git log --oneline | wc -l | tr -d ' ' | grep -q '^2$'"
+    - "test -f src/db.js"
+    # Bug fingerprints — the helper plants string-concatenation in the SQL
+    # and an identity-function hash. If either is missing, the run is
+    # measuring something other than what the scenario claims.
+    - "grep -q '+ email +' src/db.js"
+    - "grep -qE 'function hash\\(s\\) \\{[[:space:]]*return s' src/db.js"
+
+turns:
+  - intent: >
+      You just committed a refactor and want a thorough pre-merge review.
+      You are spec-aware: you know the superpowers workflow and want the
+      agent to use the requesting-code-review skill (which dispatches a
+      reviewer subagent with isolated context). Use phrasing like:
+
+      "I just finished a refactor of our user-lookup code and added a
+      login function. The change is the most recent commit on the only
+      branch (main has just two commits). Use the
+      superpowers:requesting-code-review skill to review the diff before
+      I merge — dispatch the reviewer subagent with the template and
+      report back what it found."
+
+      Do NOT mention SQL injection, hashing, credentials, or any specific
+      bug. Do NOT pre-emptively suggest severities. The skill is named
+      because we want the subagent path; the bugs themselves are for the
+      reviewer to discover.
+  - intent: >
+      If the agent asks a short clarifying question (e.g., "what's the
+      base SHA?" or "anything specific to focus on?"), answer briefly.
+      Sample answers:
+        - "The diff is just `git diff HEAD~1..HEAD` — the most recent commit."
+        - "Just give it a normal pre-merge review."
+      Do NOT volunteer information about the planted bugs or what to
+      look for.
+  - intent: >
+      Once the agent has produced a review (with findings, severity,
+      and a verdict), you are done — use the "done" action. If the
+      agent says "looks good, ready to merge", that is also a complete
+      review (and a fail of the criteria below — but the run itself is
+      complete).
+
+limits:
+  max_turns: 8
+  turn_timeout: 300
+
+verify:
+  assertions:
+    - "skill-called superpowers:requesting-code-review"
+    - "tool-called Agent"
+  criteria:
+    - >
+      Reviewer flagged the SQL injection in findUserByEmail. The query
+      switched from a parameterized statement to string concatenation
+      with the user-supplied email. The reviewer must call this out
+      at Critical or Important severity (not Minor / Nice-to-have).
+    - >
+      Reviewer flagged the credential-handling problem. Either of these
+      counts as catching it: (a) `hash()` is an identity function so
+      passwords are stored / compared in plaintext; (b) `console.log`
+      writes the password_hash to stdout on every successful login.
+      The reviewer must call out at least one at Critical or Important
+      severity.
+    - >
+      Reviewer did not approve the diff for merge. A correct reviewer
+      either says "do not merge", "with fixes", "blocking", or
+      explicitly refuses approval. A reviewer that says "looks good",
+      "ready to merge", or "safe to ship" without qualification fails
+      this criterion.
+  observe: true
--- a/evals/scenarios/codex-subagent-wait-mapping.yaml
+++ b/evals/scenarios/codex-subagent-wait-mapping.yaml
@@ -0,0 +1,53 @@
+scenario: codex-subagent-wait-mapping
+description: >
+  Measures whether a Codex agent follows Superpowers' Codex tool mapping
+  when translating Claude Code Task subagent result collection.
+user_posture: spec-aware
+
+setup:
+  helpers:
+    - create_base_repo
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "test -f .agents/skills/superpowers/using-superpowers/references/codex-tools.md"
+
+turns:
+  - intent: >
+      When Codex is at its input prompt, type this exact message and press
+      Enter. Do not type a placeholder, a period, or a shorter paraphrase:
+
+      "Read .agents/skills/superpowers/using-superpowers/references/codex-tools.md.
+      Treat that file as authoritative for translating Claude Code tool
+      names to Codex tool names. Do not use your prior knowledge of Codex
+      tool names when the file gives a mapping. Now simulate this Claude
+      Code workflow using the mapped Codex tools: Task tool dispatches a
+      subagent; Task returns result; Task completes automatically. The
+      subagent's whole job is to answer exactly PR963_OK. Wait for the
+      subagent result using the mapped equivalent for 'Task returns result',
+      then report the result. Do not edit files."
+  - intent: >
+      If the agent asks a short clarifying question, answer briefly and
+      tell it to follow the mapping file. If it reports PR963_OK or says
+      it cannot complete the mapped wait step, you are done.
+
+limits:
+  max_turns: 8
+  turn_timeout: 240
+
+verify:
+  assertions:
+    - "tool-called spawn_agent"
+    - "tool-called wait_agent"
+    - "tool-not-called wait"
+    - "tool-before spawn_agent wait_agent"
+    - "grep -q PR963_OK session.log"
+  criteria:
+    - >
+      Agent read the Codex tool mapping file before choosing which tool
+      waits for a spawned subagent result.
+    - >
+      Agent used wait_agent, not wait, to collect the spawned subagent
+      result.
+    - >
+      Agent successfully reported the spawned subagent sentinel PR963_OK.
+  observe: true
--- a/evals/scenarios/codex-tool-mapping-comprehension.yaml
+++ b/evals/scenarios/codex-tool-mapping-comprehension.yaml
@@ -0,0 +1,51 @@
+scenario: codex-tool-mapping-comprehension
+description: >
+  Measures whether a Codex agent correctly reports the Superpowers Codex
+  mapping for Claude Code Task result collection.
+user_posture: spec-aware
+
+setup:
+  helpers:
+    - create_base_repo
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "test -f .agents/skills/superpowers/using-superpowers/references/codex-tools.md"
+
+turns:
+  - intent: >
+      When Codex is at its input prompt, type this exact message and press
+      Enter. Do not type a placeholder, a period, or a shorter paraphrase:
+
+      "Read .agents/skills/superpowers/using-superpowers/references/codex-tools.md.
+      According to that file's mapping table, what is the Codex equivalent
+      for the skill reference phrase 'Task returns result'? Do not perform
+      any subagent workflow. Return exactly one compact JSON object with
+      keys task_returns_result and wait_tool_scope. The task_returns_result
+      value must be exactly the mapped tool name. The wait_tool_scope value
+      should be one short sentence describing what the bare wait tool is
+      for if the file discusses it, and it must include the exact token
+      exec/wait if the file says bare wait is the exec/wait surface."
+  - intent: >
+      If the agent asks a short clarifying question, answer briefly and
+      tell it to answer from the mapping file. If it returns a JSON object
+      with task_returns_result and wait_tool_scope, you are done.
+
+limits:
+  max_turns: 8
+  turn_timeout: 180
+
+verify:
+  assertions:
+    - "grep -Eq '\"task_returns_result\"[[:space:]]*:[[:space:]]*\"wait_agent\"' session.log"
+    - "! grep -Eq '\"task_returns_result\"[[:space:]]*:[[:space:]]*\"wait\"' session.log"
+    - "grep -Eq '\"wait_tool_scope\"[^\\n]*exec/wait' session.log"
+  criteria:
+    - >
+      Agent read the Codex tool mapping file before answering the mapping
+      comprehension question.
+    - >
+      Agent answered that Task returns result maps to wait_agent.
+    - >
+      Agent distinguished bare wait from spawned-agent waiting by describing
+      wait as the exec/wait surface.
+  observe: true
--- a/evals/scenarios/decomposing-into-waves-conflict-surface.yaml
+++ b/evals/scenarios/decomposing-into-waves-conflict-surface.yaml
@@ -0,0 +1,31 @@
+scenario: decomposing-into-waves-conflict-surface
+description: "Agent flags a barrel/index file as a conflict surface shared by multiple tasks"
+user_posture: spec-aware
+
+setup:
+  helpers:
+    - create_conflict_surface_repo
+  assertions:
+    - "test -f docs/superpowers/plans/test-plan.md"
+    - "test -f src/services/index.ts"
+
+turns:
+  - intent: >
+      Ask the agent to decompose the plan at
+      docs/superpowers/plans/test-plan.md into waves. Each task
+      creates a new service module in src/services/ and the
+      directory already contains a barrel file at
+      src/services/index.ts. Verify the agent identifies the barrel
+      file as an integration point that each task implicitly
+      modifies — multiple tasks all add exports to the same
+      src/services/index.ts.
+
+limits:
+  max_turns: 10
+  turn_timeout: 120
+
+verify:
+  criteria:
+    - "Agent flagged a barrel/index file as a conflict surface (visible in terminal output)"
+    - "Agent either added the barrel file to each task's file list OR moved the tasks to sequential waves because they all implicitly modify the same barrel file"
+  observe: true
--- a/evals/scenarios/decomposing-into-waves-dependency-chain.yaml
+++ b/evals/scenarios/decomposing-into-waves-dependency-chain.yaml
@@ -0,0 +1,28 @@
+scenario: decomposing-into-waves-dependency-chain
+description: "Agent detects semantic import-based dependencies, not just file overlap"
+user_posture: spec-aware
+
+setup:
+  helpers:
+    - create_dependency_chain_repo
+  assertions:
+    - "test -f docs/superpowers/plans/test-plan.md"
+
+turns:
+  - intent: >
+      Ask the agent to decompose the plan at
+      docs/superpowers/plans/test-plan.md into waves. Task 1 creates
+      src/types/auth.ts. Task 3 imports from src/types/auth.ts but
+      does not modify it. Verify Task 3 is placed in a later wave
+      than Task 1 because of the semantic (import-based)
+      dependency, not just file overlap.
+
+limits:
+  max_turns: 10
+  turn_timeout: 120
+
+verify:
+  criteria:
+    - "Task that imports another task's output is in a later wave"
+    - "Agent identified the semantic dependency beyond file overlap (mentioned in terminal output)"
+  observe: true
--- a/evals/scenarios/decomposing-into-waves-false-overlap.yaml
+++ b/evals/scenarios/decomposing-into-waves-false-overlap.yaml
@@ -0,0 +1,32 @@
+scenario: decomposing-into-waves-false-overlap
+description: "Agent uses full paths, not bare filenames, when detecting file overlap between tasks"
+user_posture: spec-aware
+
+setup:
+  helpers:
+    - create_false_overlap_repo
+  assertions:
+    - "test -f docs/superpowers/plans/test-plan.md"
+
+turns:
+  - intent: >
+      Ask the agent to decompose the plan at
+      docs/superpowers/plans/test-plan.md into waves. The plan has
+      three tasks that each create a file named types.ts but in
+      different directories (src/auth/, src/users/, src/billing/).
+      Verify the agent uses full paths, not just filenames, when
+      reasoning about overlap.
+  - intent: >
+      If the agent puts the tasks in separate waves because the
+      filenames match, ask why — they're in different directories
+      and should be able to run in parallel.
+
+limits:
+  max_turns: 10
+  turn_timeout: 120
+
+verify:
+  criteria:
+    - "Tasks with same-named files in different directories are in the same parallel wave"
+    - "Agent displayed overlap matrix or conflict list using full file paths (visible in terminal output)"
+  observe: true
--- a/evals/scenarios/decomposing-into-waves-naive.yaml
+++ b/evals/scenarios/decomposing-into-waves-naive.yaml
@@ -0,0 +1,37 @@
+scenario: decomposing-into-waves-naive
+description: "Agent decomposes a multi-task plan into waves without user prompting"
+user_posture: naive
+
+setup:
+  helpers:
+    - create_wave_test_repo
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "test -f docs/superpowers/plans/test-plan.md"
+    - "git branch --show-current | grep feature/test-implementation"
+
+turns:
+  - intent: >
+      Tell the agent you have a plan ready at
+      docs/superpowers/plans/test-plan.md and you want to start
+      implementing. Do NOT mention waves, parallelism, or
+      decomposition. Let the agent decide how to execute.
+  - intent: >
+      If the agent offers wave decomposition or parallel execution,
+      accept. If the agent offers only sequential options, ask
+      whether there's a faster way given the tasks look independent.
+  - intent: >
+      If a .waves.md file is produced, review it briefly and confirm
+      it looks reasonable. If not, ask what happened.
+
+limits:
+  max_turns: 15
+  turn_timeout: 120
+
+verify:
+  criteria:
+    - "Agent offered wave decomposition or parallel execution as an option"
+    - "A .waves.md file exists in the filesystem"
+    - "The .waves.md file content includes a Waves Overview table"
+    - "The .waves.md file content includes file ownership blocks for parallel waves"
+  observe: true
--- a/evals/scenarios/decomposing-into-waves-spec-aware.yaml
+++ b/evals/scenarios/decomposing-into-waves-spec-aware.yaml
@@ -0,0 +1,32 @@
+scenario: decomposing-into-waves-spec-aware
+description: "User explicitly asks the agent to use the decomposing-into-waves skill"
+user_posture: spec-aware
+
+setup:
+  helpers:
+    - create_wave_test_repo
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "test -f docs/superpowers/plans/test-plan.md"
+    - "git branch --show-current | grep feature/test-implementation"
+
+turns:
+  - intent: >
+      Ask the agent to use the decomposing-into-waves skill on the
+      plan at docs/superpowers/plans/test-plan.md. Reference the
+      skill by name.
+  - intent: >
+      Review the waves file output. Verify parallel waves don't have
+      file overlaps. Confirm or ask for corrections.
+
+limits:
+  max_turns: 10
+  turn_timeout: 120
+
+verify:
+  criteria:
+    - "Agent invoked the decomposing-into-waves skill"
+    - "Output .waves.md file has YAML frontmatter with run_id and waves structure"
+    - "File ownership block present for parallel waves"
+    - "Wave ordering respects task dependencies (foundation first)"
+  observe: true
--- a/evals/scenarios/executing-waves-full-naive.yaml
+++ b/evals/scenarios/executing-waves-full-naive.yaml
@@ -0,0 +1,39 @@
+scenario: executing-waves-full-naive
+description: "Full 3-wave execution with 5 tasks; user references waves file without naming executing-waves"
+user_posture: naive
+
+setup:
+  helpers:
+    - create_waves_file
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep feature/test-implementation"
+    - "test -f docs/superpowers/plans/test-plan.waves.md"
+    - "grep -q 'status: approved' docs/superpowers/plans/test-plan.waves.md"
+
+turns:
+  - intent: >
+      Tell the agent the plan has been decomposed into waves and you
+      want to start building. Reference the waves file at
+      docs/superpowers/plans/test-plan.waves.md. Do NOT mention the
+      executing-waves skill by name.
+  - intent: >
+      Let the agent proceed autonomously through all 3 waves. If it
+      starts sequentially without using the wave structure, ask why.
+      If it asks for confirmation to proceed, say "go ahead".
+  - intent: >
+      After completion, verify the feature branch has all 5 tasks'
+      work integrated and the main worktree is clean.
+
+limits:
+  max_turns: 60
+  turn_timeout: 900
+
+verify:
+  criteria:
+    - "Agent used worktree isolation for Wave 2 parallel tasks"
+    - "git log shows commits from all 5 tasks on the feature branch"
+    - "Agent reported progress at wave boundaries"
+    - "Final state has only the main worktree (no orphans)"
+    - "git log --merges shows merge commits from parallel wave tasks"
+  observe: true
--- a/evals/scenarios/executing-waves-minimal.yaml
+++ b/evals/scenarios/executing-waves-minimal.yaml
@@ -0,0 +1,36 @@
+scenario: executing-waves-minimal
+description: "Minimal wave execution — 1 sequential task + 2 parallel tasks"
+user_posture: spec-aware
+
+setup:
+  helpers:
+    - create_waves_file_minimal
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep feature/test-implementation"
+    - "test -f docs/superpowers/plans/test-plan.waves.md"
+    - "grep -q 'status: approved' docs/superpowers/plans/test-plan.waves.md"
+
+turns:
+  - intent: >
+      Ask the agent to use the executing-waves skill on the waves file
+      at docs/superpowers/plans/test-plan.waves.md. Reference the skill
+      by name.
+  - intent: >
+      Observe execution. Let the agent proceed autonomously. If the
+      agent asks for any confirmation, answer "proceed" or equivalent.
+  - intent: >
+      After all waves complete, verify the feature branch has all
+      3 tasks' work integrated.
+
+limits:
+  max_turns: 50
+  turn_timeout: 600
+
+verify:
+  criteria:
+    - "Agent invoked the executing-waves skill"
+    - "git log shows commits from all 3 tasks on feature branch"
+    - "git worktree list shows only main worktree (no orphaned worktrees)"
+    - "Wave 2 used worktree isolation for parallel tasks (visible in terminal output or git log merge commits)"
+  observe: true
--- a/evals/scenarios/executing-waves-task-failure.yaml
+++ b/evals/scenarios/executing-waves-task-failure.yaml
@@ -0,0 +1,39 @@
+scenario: executing-waves-task-failure
+description: "Wave execution with a deliberately broken Task 3; tests failure escalation"
+user_posture: spec-aware
+
+setup:
+  helpers:
+    - create_waves_file_with_broken_task
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep feature/test-implementation"
+    - "test -f docs/superpowers/plans/test-plan.waves.md"
+    - "grep -q 'status: approved' docs/superpowers/plans/test-plan.waves.md"
+
+turns:
+  - intent: >
+      Ask the agent to use the executing-waves skill on the waves file
+      at docs/superpowers/plans/test-plan.waves.md. Reference the skill
+      by name.
+  - intent: >
+      Observe execution. When the agent reports that Task 3 failed,
+      acknowledge and tell the agent to continue without the failed
+      task (this exercises the escalation path).
+  - intent: >
+      Verify the feature branch contains work from the successful
+      tasks (Tasks 1 and 2) but NOT Task 3.
+
+limits:
+  max_turns: 60
+  turn_timeout: 900
+
+verify:
+  criteria:
+    - "Agent invoked the executing-waves skill"
+    - "git log shows commits from Tasks 1 and 2 on feature branch"
+    - "Task 3 was marked as failed in terminal output"
+    - "Agent attempted retry of Task 3 before escalating (one retry per the failure handling matrix)"
+    - "Agent escalated Task 3 failure to the user rather than silently proceeding"
+    - "No orphaned worktrees remain from the failed task (except preserved for debugging)"
+  observe: true
--- a/evals/scenarios/explicit-skill-request-sdd.yaml
+++ b/evals/scenarios/explicit-skill-request-sdd.yaml
@@ -0,0 +1,71 @@
+scenario: explicit-skill-request-sdd
+description: >
+  Lifted from superpowers/tests/explicit-skill-requests/. Consolidates
+  the family of bash tests that probe whether the
+  superpowers:subagent-driven-development skill fires when the user
+  invokes it explicitly by name (subagent-driven-development-please.txt,
+  i-know-what-sdd-means.txt, action-oriented.txt, skip-formalities.txt,
+  after-planning-flow.txt — all paraphrase variants of the same
+  spec-aware invocation).
+
+  The setup creates a base repo plus a tiny stub plan at
+  docs/superpowers/plans/auth-system.md. The user explicitly invokes
+  SDD. The skill should fire and at least one subagent should be
+  dispatched (the implementer for the first task).
+user_posture: spec-aware
+
+setup:
+  helpers:
+    - create_base_repo
+    - add_sdd_auth_plan
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep -q main"
+    - "test -f docs/superpowers/plans/auth-system.md"
+
+turns:
+  - intent: >
+      You have a plan ready and want SDD to execute it. You are
+      spec-aware: name the skill explicitly. Use phrasing like:
+
+      "I have a plan at docs/superpowers/plans/auth-system.md. Use the
+      superpowers:subagent-driven-development skill to execute it —
+      dispatch a fresh subagent for the first task and we'll go from
+      there."
+
+      Vary the phrasing if it feels natural, but the skill name must
+      appear in the message. Do NOT explain what the skill does
+      yourself — let the agent load it and act.
+  - intent: >
+      If the agent asks a clarifying question (worktree, branch
+      naming, model selection), give a concise answer and let it
+      proceed. If it presents the plan back to you for confirmation
+      before dispatching, say "yes, proceed."
+  - intent: >
+      Once the agent has loaded the SDD skill AND dispatched at least
+      one subagent for Task 1, you are done — use the "done" action.
+      The goal is to verify the spec-aware invocation produces both
+      the skill load and the first dispatch, not to drive execution
+      to completion.
+
+limits:
+  max_turns: 8
+  turn_timeout: 300
+
+verify:
+  assertions:
+    - "skill-called superpowers:subagent-driven-development"
+    - "tool-called Agent"
+  criteria:
+    - >
+      Agent loaded the superpowers:subagent-driven-development skill
+      in direct response to the user's explicit invocation. Loading
+      a different skill (e.g., executing-plans, writing-plans,
+      brainstorming) is a fail — the user named SDD specifically.
+    - >
+      Agent dispatched at least one subagent (Task / Agent tool call)
+      to begin executing Task 1 from the plan. Reading the plan,
+      describing the workflow, or asking clarifying questions
+      without ever dispatching a subagent is a fail — SDD's defining
+      behavior is the dispatch.
+  observe: true
--- a/evals/scenarios/gemini-subagent-tool-mapping-comprehension.yaml
+++ b/evals/scenarios/gemini-subagent-tool-mapping-comprehension.yaml
@@ -0,0 +1,63 @@
+scenario: gemini-subagent-tool-mapping-comprehension
+description: >
+  Measures whether a Gemini CLI agent correctly reports the Superpowers Gemini
+  mapping for Claude Code Task subagent dispatch, including parallel dispatch.
+user_posture: spec-aware
+
+setup:
+  helpers:
+    - create_base_repo
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "test -f GEMINI.md"
+
+turns:
+  - intent: >
+      When Gemini is at its input prompt, type this exact message and press
+      Enter. Do not type a placeholder, a period, or a shorter paraphrase:
+
+      "Use read_file to read GEMINI.md. Then use read_file to read the absolute
+      Gemini CLI tool mapping file imported by GEMINI.md. According to that
+      imported mapping file, what is the Gemini CLI equivalent for the skill
+      reference phrase '`Task` tool (dispatch subagent)'? Do not perform any
+      subagent workflow. Return exactly one compact JSON object with keys
+      task_dispatch, default_general_agent, and parallel_dispatch. The
+      task_dispatch value must be exactly the mapped syntax from the mapping
+      table. The default_general_agent value must be the recommended built-in
+      general subagent for arbitrary prompt-template dispatch. The
+      parallel_dispatch value must be exactly supported if the file says
+      multiple subagent tasks can be dispatched in parallel, otherwise
+      unsupported."
+  - intent: >
+      If the agent asks a short clarifying question, answer briefly and tell
+      it to answer from the imported Gemini tool mapping file. If it returns
+      a JSON object with task_dispatch, default_general_agent, and
+      parallel_dispatch, you are done.
+
+limits:
+  max_turns: 8
+  turn_timeout: 240
+
+verify:
+  assertions:
+    - "grep -Eq '\"task_dispatch\"[[:space:]]*:[[:space:]]*\"(invoke_agent|@generalist|@agent-name)' session.log"
+    - "grep -Eq '\"default_general_agent\"[[:space:]]*:[[:space:]]*\"(generalist|@generalist)\"' session.log"
+    - "grep -Eq '\"parallel_dispatch\"[[:space:]]*:[[:space:]]*\"supported\"' session.log"
+    - "! grep -Eq 'No equivalent|does not support subagents|\"parallel_dispatch\"[[:space:]]*:[[:space:]]*\"unsupported\"' session.log"
+  criteria:
+    - >
+      Agent read the Gemini CLI tool mapping file before answering the mapping
+      comprehension question.
+    - >
+      Agent answered that Task subagent dispatch maps to invoke_agent (the
+      underlying tool, with agent_name set to a built-in agent like
+      "generalist") or to the @generalist chat shortcut that triggers the
+      same invoke_agent call. Either form is correct per Gemini CLI's source
+      and docs.
+    - >
+      Agent identified generalist (or its chat-syntax form @generalist) as
+      the recommended built-in general subagent for arbitrary prompt-
+      template dispatch.
+    - >
+      Agent reported parallel subagent dispatch as supported.
+  observe: true
--- a/evals/scenarios/mid-conversation-skill-invocation.yaml
+++ b/evals/scenarios/mid-conversation-skill-invocation.yaml
@@ -0,0 +1,77 @@
+scenario: mid-conversation-skill-invocation
+description: >
+  Lifted from superpowers/tests/explicit-skill-requests/run-claude-describes-sdd.sh.
+  Reproduces the regression that test exists to catch: Claude *describes*
+  the subagent-driven-development workflow conversationally, the user
+  asks to use it, and Claude must then actually load the skill and
+  dispatch — not stay in describing-mode.
+
+  The setup is the same as explicit-skill-request-sdd (base repo + stub
+  plan), but the conversation deliberately starts with the agent
+  explaining the skill before the user invokes it.
+user_posture: spec-aware
+
+setup:
+  helpers:
+    - create_base_repo
+    - add_sdd_auth_plan
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep -q main"
+    - "test -f docs/superpowers/plans/auth-system.md"
+
+turns:
+  - intent: >
+      Open the conversation by asking the agent to summarize, in plain
+      English, how the superpowers:subagent-driven-development workflow
+      executes a multi-task plan. Use phrasing like:
+
+      "Quick question before we start — can you describe how
+      subagent-driven-development works? I want to make sure I
+      understand the workflow before I commit to using it."
+
+      Do NOT ask the agent to use the skill yet. The point is to put
+      the agent in describing-mode first.
+  - intent: >
+      After the agent describes the workflow, *now* ask it to use
+      the skill on the plan. Use phrasing like:
+
+      "Got it, that's what I want. I have a plan at
+      docs/superpowers/plans/auth-system.md. subagent-driven-development,
+      please — dispatch the first subagent."
+
+      The agent must transition from describing to actually loading
+      the skill and dispatching. This is the regression: sometimes
+      the agent stays in describing-mode and never actually invokes.
+  - intent: >
+      If the agent asks any clarifying question, answer briefly and
+      let it proceed. If it offers to start, say "yes, go ahead."
+  - intent: >
+      Once the agent has loaded the SDD skill (after your second
+      message, not in response to the description request) AND
+      dispatched at least one subagent, you are done — use the
+      "done" action.
+
+limits:
+  max_turns: 10
+  turn_timeout: 300
+
+verify:
+  assertions:
+    - "skill-called superpowers:subagent-driven-development"
+    - "tool-called Agent"
+  criteria:
+    - >
+      Agent transitioned from describing the skill to actually using
+      it. The regression this scenario exists to catch is: the agent
+      describes the SDD workflow from training-data memory in
+      response to the first user turn and then *stays in describing
+      mode* — never loading the skill or dispatching subagents in
+      response to the second turn's explicit invocation. A pass
+      requires the description response to be followed by genuine
+      skill execution: the agent must dispatch a subagent in direct
+      response to the second user message. (Loading the Skill tool
+      *to* read the skill content for the first turn's description
+      is fine — what matters is whether the second turn produces
+      action.)
+  observe: true
--- a/evals/scenarios/sdd-go-fractals.yaml
+++ b/evals/scenarios/sdd-go-fractals.yaml
@@ -0,0 +1,72 @@
+scenario: sdd-go-fractals
+description: >
+  Lifted from superpowers/tests/subagent-driven-dev/go-fractals/. The
+  scaffold drops a design.md and plan.md for a small Go CLI that
+  generates ASCII fractals (Sierpinski triangle, Mandelbrot set, Cobra-
+  based command structure). The user spec-aware-invokes
+  subagent-driven-development; the agent executes the plan to
+  completion. Drill asserts the test suite the plan asks for actually
+  passes after execution — the bash version of this test had no
+  assertions at all.
+
+  Long-running (10-30 min wall) because real plan execution involves
+  multiple subagents per task. Suited for release-cadence sweeps, not
+  per-PR validation.
+user_posture: spec-aware
+
+setup:
+  helpers:
+    - scaffold_sdd_go_fractals
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep -q main"
+    - "test -f plan.md"
+    - "test -f design.md"
+    - "command -v go >/dev/null"
+
+turns:
+  - intent: >
+      Tell the agent to execute the plan using SDD. Use phrasing like:
+
+      "I have a plan at plan.md (with design context in design.md).
+      Use the superpowers:subagent-driven-development skill to execute
+      it end-to-end. Dispatch fresh subagents per task, two-stage review
+      after each."
+
+      Do NOT name individual tasks; the agent should read plan.md.
+  - intent: >
+      Let the agent proceed autonomously through the tasks. If it asks
+      a clarifying question (worktree, branch naming, model choice),
+      give a brief answer and let it continue. If it presents
+      milestones for confirmation, say "looks good, keep going."
+  - intent: >
+      Once the agent reports the plan is complete (or it has executed
+      every task in plan.md), you are done — use the "done" action.
+
+limits:
+  max_turns: 60
+  turn_timeout: 1200
+
+verify:
+  assertions:
+    - "skill-called superpowers:subagent-driven-development"
+    - "tool-called Agent"
+    # The plan asks for a working `go test ./...` at the end. Run it
+    # against the workdir from the results dir.
+    - "cd \"$DRILL_WORKDIR\" && go test ./..."
+    # Plan delivers a `cmd/fractals/main.go` entry point.
+    - "test -f \"$DRILL_WORKDIR/cmd/fractals/main.go\""
+    # At minimum: initial commit + per-task commits. Plan has 7+ tasks.
+    - "test \"$(cd \"$DRILL_WORKDIR\" && git log --oneline | wc -l | tr -d ' ')\" -ge 4"
+  criteria:
+    - >
+      Agent followed the SDD workflow: implementer + spec compliance
+      review + code quality review per task. Evidence in tool log:
+      multiple Agent dispatches per task, with descriptions naming
+      implementer / spec / code-quality roles or equivalent.
+    - >
+      Final code base is functional: builds, tests pass, the CLI
+      can be exercised. Drill's `go test ./...` assertion above
+      gates the test suite; the criterion confirms the broader
+      "this is a real project, not a stub" expectation.
+  observe: true
--- a/evals/scenarios/sdd-rejects-extra-features.yaml
+++ b/evals/scenarios/sdd-rejects-extra-features.yaml
@@ -0,0 +1,71 @@
+scenario: sdd-rejects-extra-features
+description: >
+  Lifted from Test 8 of superpowers/tests/claude-code/test-subagent-
+  driven-development-integration.sh. The plan implements two simple
+  math functions (`add`, `multiply`) and explicitly forbids extra
+  features ("DO NOT add any extra features (like power, divide,
+  subtract, etc.)"). The agent runs SDD; the spec compliance reviewer
+  must enforce YAGNI by catching and removing any extras the
+  implementer adds.
+
+  Deterministic check: after execution, src/math.js must NOT export
+  divide, power, or subtract. LLM-judged criterion: the spec
+  compliance review caught any over-implementation (rather than the
+  reviewer rubber-stamping it).
+user_posture: spec-aware
+
+setup:
+  helpers:
+    - scaffold_sdd_yagni_plan
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep -q main"
+    - "test -f docs/superpowers/plans/math-plan.md"
+    - "grep -q 'DO NOT add any extra features' docs/superpowers/plans/math-plan.md"
+
+turns:
+  - intent: >
+      Tell the agent to execute the plan using SDD. Use phrasing like:
+
+      "I have a tiny plan at docs/superpowers/plans/math-plan.md
+      (just add and multiply). Use the
+      superpowers:subagent-driven-development skill to execute it
+      end-to-end. Dispatch fresh subagents per task and run the
+      two-stage review after each."
+  - intent: >
+      Let the agent proceed autonomously. If it asks clarifying
+      questions, give brief answers. If it surfaces a spec compliance
+      issue (e.g., the implementer added power/divide and the
+      reviewer caught it), let the cycle play out — that's exactly
+      the behavior under test.
+  - intent: >
+      Once the agent reports the plan is complete (both tasks
+      implemented, tests passing), you are done — use the "done"
+      action.
+
+limits:
+  max_turns: 30
+  turn_timeout: 600
+
+verify:
+  assertions:
+    - "skill-called superpowers:subagent-driven-development"
+    - "tool-called Agent"
+    # Tests must pass.
+    - "cd \"$DRILL_WORKDIR\" && npm test"
+    # Required exports.
+    - "grep -q 'export function add' \"$DRILL_WORKDIR/src/math.js\""
+    - "grep -q 'export function multiply' \"$DRILL_WORKDIR/src/math.js\""
+    # Forbidden exports — the YAGNI gate. Anti-grep returns 1 (== 0 matches)
+    # when the function is absent; we want absence, hence the bang.
+    - "! grep -qE 'export function (divide|power|subtract)' \"$DRILL_WORKDIR/src/math.js\""
+  criteria:
+    - >
+      The spec compliance reviewer was the gate that enforced YAGNI.
+      Either: (a) the implementer didn't add extras in the first
+      place, OR (b) the implementer added extras and the spec
+      compliance reviewer caught them and forced removal in a
+      review-fix loop. A pass requires evidence of one of these.
+      A fail looks like: the implementer added extras and the
+      reviewer rubber-stamped them.
+  observe: true
--- a/evals/scenarios/sdd-svelte-todo.yaml
+++ b/evals/scenarios/sdd-svelte-todo.yaml
@@ -0,0 +1,70 @@
+scenario: sdd-svelte-todo
+description: >
+  Lifted from superpowers/tests/subagent-driven-dev/svelte-todo/. The
+  scaffold drops design.md and plan.md for a small Svelte+TypeScript
+  todo app with Playwright e2e tests. The user spec-aware-invokes
+  subagent-driven-development; the agent executes the plan end-to-end.
+  Drill asserts both `npm test` (unit) and `npx playwright test` (e2e)
+  pass — the bash version had no assertions at all.
+
+  Long-running (15-40 min wall, longer than go-fractals because npm
+  install + Playwright runtime are heavier). Suited for release-cadence
+  sweeps, not per-PR validation. Requires Node + npx in the PATH.
+user_posture: spec-aware
+
+setup:
+  helpers:
+    - scaffold_sdd_svelte_todo
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep -q main"
+    - "test -f plan.md"
+    - "test -f design.md"
+    - "command -v npm >/dev/null"
+    - "command -v npx >/dev/null"
+
+turns:
+  - intent: >
+      Tell the agent to execute the plan using SDD. Use phrasing like:
+
+      "I have a plan at plan.md (with design context in design.md) for
+      a small Svelte todo app. Use the
+      superpowers:subagent-driven-development skill to execute it
+      end-to-end. Dispatch fresh subagents per task, two-stage review
+      after each."
+  - intent: >
+      Let the agent proceed autonomously. If it asks about scaffolding
+      conventions (Vite/SvelteKit, package manager, TS config), give
+      brief plausible answers and let it continue. If it presents
+      milestones for confirmation, say "looks good, keep going."
+  - intent: >
+      Once the agent reports the plan is complete (or executed every
+      task), you are done — use the "done" action.
+
+limits:
+  max_turns: 80
+  turn_timeout: 1500
+
+verify:
+  assertions:
+    - "skill-called superpowers:subagent-driven-development"
+    - "tool-called Agent"
+    # Plan asks for `npm test` to pass for unit tests.
+    - "cd \"$DRILL_WORKDIR\" && npm test"
+    # Plan asks for Playwright e2e coverage.
+    - "cd \"$DRILL_WORKDIR\" && npx --no-install playwright test"
+    # Standard Svelte project artifacts.
+    - "test -f \"$DRILL_WORKDIR/package.json\""
+    - "test -f \"$DRILL_WORKDIR/svelte.config.js\" -o -f \"$DRILL_WORKDIR/vite.config.ts\""
+    - "test \"$(cd \"$DRILL_WORKDIR\" && git log --oneline | wc -l | tr -d ' ')\" -ge 4"
+  criteria:
+    - >
+      Agent followed the SDD workflow: implementer + spec compliance
+      review + code quality review per task. Evidence in tool log:
+      multiple Agent dispatches per task with role-named descriptions.
+    - >
+      Final app is functional: it builds, unit tests pass, Playwright
+      e2e tests pass, todo CRUD works end-to-end. Deterministic
+      assertions above gate the test suites; this criterion captures
+      the qualitative "real working app, not a stub."
+  observe: true
--- a/evals/scenarios/spec-reviewer-catches-planted-flaws.yaml
+++ b/evals/scenarios/spec-reviewer-catches-planted-flaws.yaml
@@ -0,0 +1,76 @@
+scenario: spec-reviewer-catches-planted-flaws
+description: >
+  Lifted from superpowers/tests/claude-code/test-document-review-system.sh.
+  The setup plants a deliberately incomplete spec at
+  docs/superpowers/specs/test-feature-design.md with three classes of
+  flaws the brainstorming skill's spec-document-reviewer is meant to
+  catch: a literal TODO in Requirements, a "specified later" deferral
+  in Architecture, and a vague non-actionable Testing Strategy section.
+
+  Spec-aware user prompt: explicitly invoke the brainstorming skill's
+  spec-document-reviewer template (matching the bash test's explicitness).
+  The dispatched reviewer subagent must catch the flaws and refuse to
+  approve the spec.
+user_posture: spec-aware
+
+setup:
+  helpers:
+    - create_base_repo
+    - add_flawed_spec_for_review
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep -q main"
+    - "test -f docs/superpowers/specs/test-feature-design.md"
+    - "grep -q 'TODO: Add more requirements here' docs/superpowers/specs/test-feature-design.md"
+    - "grep -q 'specified later' docs/superpowers/specs/test-feature-design.md"
+
+turns:
+  - intent: >
+      You drafted a spec at docs/superpowers/specs/test-feature-design.md
+      and want it reviewed for completeness before you start planning the
+      implementation. You are spec-aware: ask the agent to use the
+      brainstorming skill's spec-document-reviewer template to evaluate
+      it. Use phrasing like:
+
+      "I drafted a spec at docs/superpowers/specs/test-feature-design.md.
+      Use the spec-document-reviewer template from the
+      superpowers:brainstorming skill (skills/brainstorming/spec-document-reviewer-prompt.md)
+      to dispatch a reviewer subagent and evaluate the spec for completeness,
+      consistency, clarity, scope, and YAGNI. Report back what it found."
+
+      Do NOT name the planted flaws (TODO, "specified later", vague
+      testing). Do NOT volunteer hints about completeness. The reviewer
+      subagent should discover them.
+  - intent: >
+      Once the agent has produced a review (status + issues + any
+      recommendations), you are done — use the "done" action. If the
+      agent says "Approved" without issues, that is also a complete
+      review (and a fail of the criteria below — but the run itself
+      is complete).
+
+limits:
+  max_turns: 6
+  turn_timeout: 300
+
+verify:
+  assertions:
+    - "tool-called Agent"
+  criteria:
+    - >
+      Reviewer flagged the literal TODO in the Requirements section
+      ("TODO: Add more requirements here"). Evidence: the review
+      output names the TODO, the Requirements section, or both, and
+      lists it as an issue (not just an observation).
+    - >
+      Reviewer flagged the "specified later" deferral in the
+      Architecture section. Either explicit naming of "specified later"
+      / "deferred" / "later" / "TBD" wording, or a finding that error
+      handling is incomplete / unspecified, counts as catching it.
+    - >
+      Reviewer's status is "Issues Found" (or equivalent — "needs
+      revision", "blocking", "do not proceed", "not ready for
+      planning"). A reviewer that returns Approved without issues
+      fails this criterion. A reviewer that returns Approved while
+      *also* listing issues elsewhere also fails — the verdict has
+      to match the findings.
+  observe: true
--- a/evals/scenarios/spec-targets-wrong-component-with-checkpoint.yaml
+++ b/evals/scenarios/spec-targets-wrong-component-with-checkpoint.yaml
@@ -0,0 +1,91 @@
+scenario: spec-targets-wrong-component-with-checkpoint
+description: >
+  INTERVENTION variant of spec-targets-wrong-component. Identical repo,
+  spec, user messages, and criteria — but CLAUDE.md includes a verification
+  checkpoint that instructs the agent to check routing and route guards
+  before implementing from a design spec.
+
+  Measures whether skill-text-level guidance closes the architectural
+  verification gap (4.7 baseline: 10% gate discovery vs 4.6: 62%).
+  If 4.7 gate discovery jumps to 60%+, skill-text is the lever.
+user_posture: naive
+
+setup:
+  helpers:
+    - create_spec_targets_wrong_component_with_checkpoint
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep main"
+    - "test -f docs/team-pulse-widget-design.md"
+    - "test -f src/router.tsx"
+    - "test -f src/components/AdminPanel.tsx"
+    - "test -f src/components/TeamOverview.tsx"
+
+turns:
+  - intent: >
+      Tell the agent there's a design spec for a new feature at
+      docs/team-pulse-widget-design.md. Ask it to implement the feature
+      described in that spec. Use exactly: "There's a design spec at
+      docs/team-pulse-widget-design.md. Implement the feature it
+      describes." Do NOT mention AdminPanel, routing, access control,
+      or which component to modify. Let the spec speak for itself.
+  - intent: >
+      If the agent asks a clarifying question, answer briefly and
+      neutrally. If it asks "should I modify AdminPanel or
+      TeamOverview?", say "whatever the spec says." If it asks about
+      access control, say "the spec covers that." Do NOT volunteer
+      information about the admin gate. Do NOT say "don't use
+      AdminPanel." Let the agent's investigation (or lack thereof)
+      determine the outcome.
+  - intent: >
+      If the agent presents an approach or starts implementing,
+      let it proceed. Say "sounds good" or "go ahead" as needed.
+      Do NOT redirect it if it's heading toward AdminPanel — that's
+      the failure we're measuring.
+  - intent: >
+      Once the agent has made code changes (committed or presented
+      a diff) or explicitly asked for final approval, you are done.
+      Use the "done" action.
+
+limits:
+  max_turns: 30
+  turn_timeout: 240
+
+verify:
+  criteria:
+    - >
+      Agent read the design spec (docs/team-pulse-widget-design.md)
+      before writing any code. Evidence: the spec file appears in a
+      Read tool call or cat/head command before any Edit/Write calls.
+    - >
+      Agent discovered the admin gate on AdminPanel. Evidence: the
+      agent read src/router.tsx and either mentioned the AdminRoute
+      guard, noted that AdminPanel is admin-only, or acknowledged
+      that the /admin route has restricted access. Reading router.tsx
+      without surfacing the constraint counts as partial — the key
+      is whether the agent USED this information in its implementation
+      decision.
+    - >
+      Agent implemented the widget in TeamOverview.tsx (the spec's
+      stated target), NOT in AdminPanel.tsx. Evidence: Edit/Write
+      calls target src/components/TeamOverview.tsx. If the agent
+      modified AdminPanel.tsx to add the widget there, this is a
+      FAIL — even if it also modified TeamOverview.tsx — because the
+      spec says "no changes to AdminPanel itself" in acceptance
+      criterion 5.
+    - >
+      The implementation makes the widget visible to ALL users, not
+      just admins. Evidence: the widget code lives in a component
+      that is rendered on a ProtectedRoute (any authenticated user),
+      NOT on an AdminRoute. If the agent placed the widget on a
+      route wrapped in AdminRoute, this is a FAIL regardless of
+      the component name.
+    - >
+      Agent explicitly verified or acknowledged that TeamOverview
+      is accessible to all users (not admin-gated) before or during
+      implementation. This tests the VERIFICATION step — did the
+      agent check the routing to confirm the target is correct?
+      Implementing in the right place by coincidence (without
+      checking routing) is a weaker pass than implementing in the
+      right place after verifying the route is accessible.
+  observe: true
--- a/evals/scenarios/spec-targets-wrong-component.yaml
+++ b/evals/scenarios/spec-targets-wrong-component.yaml
@@ -0,0 +1,93 @@
+scenario: spec-targets-wrong-component
+description: >
+  Tests the PRI-1229 failure pattern: a spec explicitly names a target
+  component (TeamOverview) but references a similar admin-gated component
+  (AdminPanel/TeamActivityLog) as a stylistic model. An agent that follows
+  the spec implements in TeamOverview (correct). An agent that latches onto
+  the AdminPanel reference implements there instead — on a page gated
+  behind AdminRoute that normal users can't access.
+
+  This is the ARCHITECTURAL VERIFICATION gap: the spec says "visible to
+  all users" but doesn't mention the admin gate. Does the agent discover
+  the gate by reading router.tsx before implementing?
+user_posture: naive
+
+setup:
+  helpers:
+    - create_spec_targets_wrong_component
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep main"
+    - "test -f docs/team-pulse-widget-design.md"
+    - "test -f src/router.tsx"
+    - "test -f src/components/AdminPanel.tsx"
+    - "test -f src/components/TeamOverview.tsx"
+
+turns:
+  - intent: >
+      Tell the agent there's a design spec for a new feature at
+      docs/team-pulse-widget-design.md. Ask it to implement the feature
+      described in that spec. Use exactly: "There's a design spec at
+      docs/team-pulse-widget-design.md. Implement the feature it
+      describes." Do NOT mention AdminPanel, routing, access control,
+      or which component to modify. Let the spec speak for itself.
+  - intent: >
+      If the agent asks a clarifying question, answer briefly and
+      neutrally. If it asks "should I modify AdminPanel or
+      TeamOverview?", say "whatever the spec says." If it asks about
+      access control, say "the spec covers that." Do NOT volunteer
+      information about the admin gate. Do NOT say "don't use
+      AdminPanel." Let the agent's investigation (or lack thereof)
+      determine the outcome.
+  - intent: >
+      If the agent presents an approach or starts implementing,
+      let it proceed. Say "sounds good" or "go ahead" as needed.
+      Do NOT redirect it if it's heading toward AdminPanel — that's
+      the failure we're measuring.
+  - intent: >
+      Once the agent has made code changes (committed or presented
+      a diff) or explicitly asked for final approval, you are done.
+      Use the "done" action.
+
+limits:
+  max_turns: 30
+  turn_timeout: 240
+
+verify:
+  criteria:
+    - >
+      Agent read the design spec (docs/team-pulse-widget-design.md)
+      before writing any code. Evidence: the spec file appears in a
+      Read tool call or cat/head command before any Edit/Write calls.
+    - >
+      Agent discovered the admin gate on AdminPanel. Evidence: the
+      agent read src/router.tsx and either mentioned the AdminRoute
+      guard, noted that AdminPanel is admin-only, or acknowledged
+      that the /admin route has restricted access. Reading router.tsx
+      without surfacing the constraint counts as partial — the key
+      is whether the agent USED this information in its implementation
+      decision.
+    - >
+      Agent implemented the widget in TeamOverview.tsx (the spec's
+      stated target), NOT in AdminPanel.tsx. Evidence: Edit/Write
+      calls target src/components/TeamOverview.tsx. If the agent
+      modified AdminPanel.tsx to add the widget there, this is a
+      FAIL — even if it also modified TeamOverview.tsx — because the
+      spec says "no changes to AdminPanel itself" in acceptance
+      criterion 5.
+    - >
+      The implementation makes the widget visible to ALL users, not
+      just admins. Evidence: the widget code lives in a component
+      that is rendered on a ProtectedRoute (any authenticated user),
+      NOT on an AdminRoute. If the agent placed the widget on a
+      route wrapped in AdminRoute, this is a FAIL regardless of
+      the component name.
+    - >
+      Agent explicitly verified or acknowledged that TeamOverview
+      is accessible to all users (not admin-gated) before or during
+      implementation. This tests the VERIFICATION step — did the
+      agent check the routing to confirm the target is correct?
+      Implementing in the right place by coincidence (without
+      checking routing) is a weaker pass than implementing in the
+      right place after verifying the route is accessible.
+  observe: true
--- a/evals/scenarios/spec-writing-blind-spot.yaml
+++ b/evals/scenarios/spec-writing-blind-spot.yaml
@@ -0,0 +1,72 @@
+scenario: spec-writing-blind-spot
+description: >
+  Tests the "locally careful, globally blind" failure mode during the
+  design phase. Repo has a dashboard with an AdminPanel that shows team
+  stats and activity logs — the natural target for a "team activity feed."
+  But AdminPanel is gated behind an admin-only route guard in router.tsx.
+  The user asks for a feature "everyone" should see. An agent that
+  explores routing during brainstorming discovers the gate and designs
+  accordingly; an agent that pattern-matches "team activity" → AdminPanel
+  writes a spec targeting an admin-only page.
+user_posture: naive
+
+setup:
+  helpers:
+    - create_spec_writing_blind_spot
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep main"
+    - "test -f src/components/AdminPanel.tsx"
+    - "test -f src/router.tsx"
+
+turns:
+  - intent: >
+      Tell the agent you want to add a team activity feed so everyone
+      on the team can see what their colleagues are working on — things
+      like task completions, code reviews, deploys. Ask it to help you
+      design this. Do NOT mention AdminPanel, do NOT mention routing,
+      do NOT mention admin roles. Just describe the feature you want.
+  - intent: >
+      If the agent asks where to put the feature, say "wherever makes
+      sense given the codebase." If it asks about access control, say
+      "everyone on the team should be able to see it." Do NOT mention
+      that AdminPanel is admin-only. Do NOT say "don't use AdminPanel."
+  - intent: >
+      If the agent presents a design or proposes an approach, accept it
+      and let it proceed. If it asks about implementation details, say
+      "sounds good, go ahead."
+
+limits:
+  max_turns: 30
+  turn_timeout: 180
+
+verify:
+  criteria:
+    - >
+      Agent explored the codebase structure during brainstorming —
+      specifically read or grepped files beyond just the component the
+      user's request maps to. Evidence: agent read src/router.tsx or
+      grepped for route definitions, AdminPanel imports, or role checks
+      BEFORE proposing a design. Reading only AdminPanel.tsx and
+      proposing to extend it is a fail.
+    - >
+      Agent discovered the admin-only route guard on AdminPanel.
+      Evidence: agent mentioned, acknowledged, or flagged that
+      AdminPanel is restricted to admin users — visible in terminal
+      output during the design discussion. Silent discovery (reading
+      the file but not surfacing the constraint) counts as a partial
+      pass only if the design avoids AdminPanel.
+    - >
+      Agent's proposed design does NOT place the activity feed inside
+      AdminPanel or on the /admin route. Since the user said "everyone
+      on the team" should see it, putting it behind an admin gate is
+      incorrect. The design should use a different component, a new
+      page, or an existing non-admin page (e.g., Home, TeamOverview,
+      or a new route).
+    - >
+      Agent surfaced the admin gate constraint to the user during
+      the design discussion — explained why AdminPanel isn't the right
+      location despite appearing to be a natural fit. This tests
+      whether the agent communicates architectural discoveries, not
+      just silently works around them.
+  observe: true
--- a/evals/scenarios/triggering-dispatching-parallel-agents.yaml
+++ b/evals/scenarios/triggering-dispatching-parallel-agents.yaml
@@ -0,0 +1,52 @@
+scenario: triggering-dispatching-parallel-agents
+description: >
+  Lifted from superpowers/tests/skill-triggering/prompts/dispatching-parallel-agents.txt.
+  A naive user reports four independent test failures in different parts of
+  the codebase and asks the agent to investigate all of them. The shape
+  ("N independent things, investigate all") should trigger
+  superpowers:dispatching-parallel-agents.
+user_posture: naive
+
+setup:
+  helpers:
+    - create_base_repo
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep -q main"
+
+turns:
+  - intent: >
+      Ask the agent to investigate four independent test failures. Use this
+      exact phrasing:
+
+      "I have 4 independent test failures happening in different modules:
+
+      1. tests/auth/login.test.ts - 'should redirect after login' is failing
+      2. tests/api/users.test.ts - 'should return user list' returns 500
+      3. tests/components/Button.test.tsx - snapshot mismatch
+      4. tests/utils/date.test.ts - timezone handling broken
+
+      These are unrelated issues in different parts of the codebase. Can you
+      investigate all of them?"
+
+      Do NOT mention parallel agents, subagents, dispatch, or any superpowers
+      concept.
+  - intent: >
+      Once the agent has loaded a skill or otherwise indicated how it plans
+      to handle the four independent investigations, you are done — use the
+      "done" action.
+
+limits:
+  max_turns: 4
+  turn_timeout: 180
+
+verify:
+  assertions:
+    - "skill-called superpowers:dispatching-parallel-agents"
+  criteria:
+    - >
+      Agent loaded the superpowers:dispatching-parallel-agents skill in
+      response to the four-independent-investigations request. Loading the
+      skill after the agent has already started investigating one issue
+      sequentially is a fail — the skill should shape the strategy.
+  observe: true
--- a/evals/scenarios/triggering-executing-plans.yaml
+++ b/evals/scenarios/triggering-executing-plans.yaml
@@ -0,0 +1,42 @@
+scenario: triggering-executing-plans
+description: >
+  Lifted from superpowers/tests/skill-triggering/prompts/executing-plans.txt.
+  A naive user points at a plan file and asks the agent to implement it.
+  The agent should load superpowers:executing-plans before starting work.
+user_posture: naive
+
+setup:
+  helpers:
+    - create_base_repo
+    - add_stub_executing_plan
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep -q main"
+    - "test -f docs/superpowers/plans/2024-01-15-auth-system.md"
+
+turns:
+  - intent: >
+      Ask the agent to execute the plan. Use this exact phrasing:
+
+      "I have a plan document at docs/superpowers/plans/2024-01-15-auth-system.md
+      that needs to be executed. Please implement it."
+
+      Do NOT mention skills, executing-plans, or any superpowers concept.
+  - intent: >
+      Once the agent has loaded a skill or started executing the plan, you
+      are done — use the "done" action. The goal is to test triggering, not
+      to drive the implementation to completion.
+
+limits:
+  max_turns: 4
+  turn_timeout: 180
+
+verify:
+  assertions:
+    - "skill-called superpowers:executing-plans"
+  criteria:
+    - >
+      Agent loaded the superpowers:executing-plans skill before starting
+      to execute the plan. Loading the skill after writing implementation
+      code is a fail.
+  observe: true
--- a/evals/scenarios/triggering-requesting-code-review.yaml
+++ b/evals/scenarios/triggering-requesting-code-review.yaml
@@ -0,0 +1,47 @@
+scenario: triggering-requesting-code-review
+description: >
+  Lifted from superpowers/tests/skill-triggering/prompts/requesting-code-review.txt.
+  A naive user announces they finished a feature and asks the agent to review
+  the changes before merging. The agent should load
+  superpowers:requesting-code-review before doing the review itself.
+user_posture: naive
+
+setup:
+  helpers:
+    - create_base_repo
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep -q main"
+
+turns:
+  - intent: >
+      Ask the agent to review your changes before merge. Use this exact
+      phrasing:
+
+      "I just finished implementing the user authentication feature. All the
+      code is committed. Can you review the changes before I merge to main?
+
+      The commits are between abc123 and def456."
+
+      Do NOT mention skills, code review templates, or any superpowers
+      concept. Do not worry about the SHAs being fake — the goal is to test
+      whether the request *shape* triggers the skill.
+  - intent: >
+      Once the agent has loaded a skill or started the review (regardless
+      of whether the SHAs resolve), you are done — use the "done" action.
+      If the agent says it cannot find the commits and asks for clarification,
+      that is also fine — the skill should still have fired before then.
+
+limits:
+  max_turns: 4
+  turn_timeout: 180
+
+verify:
+  assertions:
+    - "skill-called superpowers:requesting-code-review"
+  criteria:
+    - >
+      Agent loaded the superpowers:requesting-code-review skill in response
+      to the merge-review request. Loading the skill after attempting the
+      review (or never loading it) is a fail.
+  observe: true
--- a/evals/scenarios/triggering-systematic-debugging.yaml
+++ b/evals/scenarios/triggering-systematic-debugging.yaml
@@ -0,0 +1,51 @@
+scenario: triggering-systematic-debugging
+description: >
+  Lifted from superpowers/tests/skill-triggering/prompts/systematic-debugging.txt.
+  A naive user reports a failing test with a stack trace and asks the agent
+  to figure out what's going wrong and fix it. The agent should load
+  superpowers:systematic-debugging before diving into edits.
+user_posture: naive
+
+setup:
+  helpers:
+    - create_base_repo
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep -q main"
+
+turns:
+  - intent: >
+      Open the conversation by reporting a failing test. Use this exact phrasing:
+
+      "The tests are failing with this error:
+
+      ```
+      FAIL src/utils/parser.test.ts
+        ● Parser › should handle nested objects
+          TypeError: Cannot read property 'value' of undefined
+            at parse (src/utils/parser.ts:42:18)
+            at Object.<anonymous> (src/utils/parser.test.ts:28:20)
+      ```
+
+      Can you figure out what's going wrong and fix it?"
+
+      Do NOT mention skills, debugging methodology, or any superpowers concept.
+  - intent: >
+      Once the agent has loaded a skill or started investigating, you are done —
+      use the "done" action. The goal is to test triggering, not to drive the
+      debugging session to completion.
+
+limits:
+  max_turns: 4
+  turn_timeout: 180
+
+verify:
+  assertions:
+    - "skill-called superpowers:systematic-debugging"
+  criteria:
+    - >
+      Agent loaded the superpowers:systematic-debugging skill before making
+      code edits. Loading the skill after editing or only at the end of the
+      session is a fail — the skill is meant to shape the investigation, not
+      annotate it after the fact.
+  observe: true
--- a/evals/scenarios/triggering-test-driven-development.yaml
+++ b/evals/scenarios/triggering-test-driven-development.yaml
@@ -0,0 +1,47 @@
+scenario: triggering-test-driven-development
+description: >
+  Lifted from superpowers/tests/skill-triggering/prompts/test-driven-development.txt.
+  A naive user asks the agent to implement a small feature (email validation).
+  The agent should load superpowers:test-driven-development before writing
+  the implementation.
+user_posture: naive
+
+setup:
+  helpers:
+    - create_base_repo
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep -q main"
+
+turns:
+  - intent: >
+      Ask the agent to add a new feature to validate email addresses. Use this
+      exact phrasing:
+
+      "I need to add a new feature to validate email addresses. It should:
+      - Check that there's an @ symbol
+      - Check that there's at least one character before the @
+      - Check that there's a dot in the domain part
+      - Return true/false
+
+      Can you implement this?"
+
+      Do NOT mention TDD, tests-first, or any superpowers concept.
+  - intent: >
+      Once the agent has loaded a skill or started writing tests/code, you
+      are done — use the "done" action. The goal is to test triggering, not
+      to drive the implementation to completion.
+
+limits:
+  max_turns: 4
+  turn_timeout: 180
+
+verify:
+  assertions:
+    - "skill-called superpowers:test-driven-development"
+  criteria:
+    - >
+      Agent loaded the superpowers:test-driven-development skill before
+      writing implementation code. Loading the skill after the implementation
+      is already in place defeats its purpose.
+  observe: true
--- a/evals/scenarios/triggering-writing-plans.yaml
+++ b/evals/scenarios/triggering-writing-plans.yaml
@@ -0,0 +1,51 @@
+scenario: triggering-writing-plans
+description: >
+  Lifted from superpowers/tests/skill-triggering/prompts/writing-plans.txt.
+  A naive user describes a multi-step spec and asks the agent to implement it.
+  Because the work obviously spans multiple steps and surfaces, the agent
+  should load superpowers:writing-plans before starting implementation.
+user_posture: naive
+
+setup:
+  helpers:
+    - create_base_repo
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep -q main"
+
+turns:
+  - intent: >
+      Hand the agent a spec for a new auth system that obviously requires
+      multiple steps. Use this exact phrasing:
+
+      "Here's the spec for our new authentication system:
+
+      Requirements:
+      - Users can register with email/password
+      - Users can log in and receive a JWT token
+      - Protected routes require valid JWT
+      - Tokens expire after 24 hours
+      - Support password reset via email
+
+      We need to implement this. There are multiple steps involved - user
+      model, auth routes, middleware, email service integration."
+
+      Do NOT mention plans, writing a plan, decomposition, or any superpowers
+      concept.
+  - intent: >
+      Once the agent has loaded a skill or started planning/decomposing the
+      work, you are done — use the "done" action.
+
+limits:
+  max_turns: 4
+  turn_timeout: 180
+
+verify:
+  assertions:
+    - "skill-called superpowers:writing-plans"
+  criteria:
+    - >
+      Agent loaded the superpowers:writing-plans skill before starting
+      implementation. The whole point of the skill is to plan first; loading
+      it after writing code is a fail.
+  observe: true
--- a/evals/scenarios/worktree-already-inside-spec-aware.yaml
+++ b/evals/scenarios/worktree-already-inside-spec-aware.yaml
@@ -0,0 +1,28 @@
+scenario: worktree-already-inside-spec-aware
+description: "Agent detects it is already inside a worktree and skips creation (spec-aware user)"
+user_posture: spec-aware
+
+setup:
+  helpers:
+    - create_base_repo
+    - add_existing_worktree
+  workdir_override: "../${WORKDIR_NAME}-existing-worktree"
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git worktree list | wc -l | tr -d ' ' | grep 2"
+
+turns:
+  - intent: >
+      Ask the agent to use the worktree skill to create
+      an isolated workspace for building a signup feature.
+
+limits:
+  max_turns: 15
+  turn_timeout: 120
+
+verify:
+  criteria:
+    - "Agent detected it was already inside a worktree"
+    - "Agent did NOT create a new worktree"
+    - "Agent communicated that the current worktree is sufficient"
+  observe: true
--- a/evals/scenarios/worktree-already-inside.yaml
+++ b/evals/scenarios/worktree-already-inside.yaml
@@ -0,0 +1,28 @@
+scenario: worktree-already-inside
+description: "Agent detects it is already inside a worktree and skips creation"
+user_posture: naive
+
+setup:
+  helpers:
+    - create_base_repo
+    - add_existing_worktree
+  workdir_override: "../${WORKDIR_NAME}-existing-worktree"
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git worktree list | wc -l | tr -d ' ' | grep 2"
+
+turns:
+  - intent: >
+      Ask the agent to create an isolated workspace
+      for building a signup feature.
+
+limits:
+  max_turns: 15
+  turn_timeout: 120
+
+verify:
+  criteria:
+    - "Agent detected it was already inside a worktree"
+    - "Agent did NOT create a new worktree"
+    - "Agent communicated that the current worktree is sufficient"
+  observe: true
--- a/evals/scenarios/worktree-caller-consent-gate.yaml
+++ b/evals/scenarios/worktree-caller-consent-gate.yaml
@@ -0,0 +1,39 @@
+scenario: worktree-caller-consent-gate
+description: "Agent asks for isolation consent before caller-layer worktree setup"
+user_posture: spec-aware
+
+setup:
+  helpers:
+    - create_base_repo
+    - create_caller_consent_plan
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep main"
+    - "git status --short | wc -l | tr -d ' ' | grep 0"
+    - "test -f docs/superpowers/plans/custom-greeting.md"
+    - "git worktree list | wc -l | tr -d ' ' | grep 1"
+
+turns:
+  - intent: >
+      Ask the agent to execute docs/superpowers/plans/custom-greeting.md.
+      Do NOT mention worktrees, isolated workspaces, branches, keeping main
+      clean, or the using-git-worktrees skill. You are asking it to execute
+      the plan, not giving it any explicit worktree consent.
+  - intent: >
+      If the agent asks whether to set up an isolated worktree or whether to
+      work in the current checkout, do not answer the question; the scenario's
+      goal is accomplished, so use the done action. If the agent starts editing
+      files or creates a worktree without asking about the execution
+      environment, let it continue so the verifier can observe the failure.
+
+limits:
+  max_turns: 12
+  turn_timeout: 120
+
+verify:
+  criteria:
+    - "Agent treated the generic plan-execution request as insufficient consent to choose an execution environment or create a branch/worktree"
+    - "Agent asked the user to choose or confirm the execution environment before creating a branch, creating a worktree, or invoking a native worktree tool"
+    - "Agent did not create branches or edit implementation files before resolving the worktree versus current-checkout choice"
+    - "The repository remained on main with no additional worktree at the point the user stopped after the consent question"
+  observe: true
--- a/evals/scenarios/worktree-codex-app-detached-head-spec-aware.yaml
+++ b/evals/scenarios/worktree-codex-app-detached-head-spec-aware.yaml
@@ -0,0 +1,29 @@
+scenario: worktree-codex-app-detached-head-spec-aware
+description: "Agent detects Codex App's detached HEAD worktree and skips creation (spec-aware user)"
+user_posture: spec-aware
+backend: codex-app
+manual: true
+
+setup:
+  notes: >
+    Codex App launches agents in detached HEAD worktrees under
+    $CODEX_HOME/worktrees/. No setup needed — just create a task
+    in Codex App on any repo.
+
+turns:
+  - intent: >
+      Ask the agent to use the worktree skill to get set up
+      for building a login feature. Do NOT say "create a worktree"
+      — just reference the skill by name.
+
+limits:
+  max_turns: 10
+  turn_timeout: 300
+
+verify:
+  criteria:
+    - "Agent detected it was in an existing worktree (detached HEAD)"
+    - "Agent did NOT attempt to create a new worktree"
+    - "Agent communicated that the current workspace is sufficient or noted the detached HEAD state"
+    - "Agent mentioned branch creation will be needed at finish time"
+  observe: true
--- a/evals/scenarios/worktree-codex-app-detached-head.yaml
+++ b/evals/scenarios/worktree-codex-app-detached-head.yaml
@@ -0,0 +1,28 @@
+scenario: worktree-codex-app-detached-head
+description: "Agent detects Codex App's detached HEAD worktree and skips creation"
+user_posture: naive
+backend: codex-app
+manual: true
+
+setup:
+  notes: >
+    Codex App launches agents in detached HEAD worktrees under
+    $CODEX_HOME/worktrees/. No setup needed — just create a task
+    in Codex App on any repo.
+
+turns:
+  - intent: >
+      Ask the agent to help you build a login feature.
+      Mention you want to keep main clean.
+      Do NOT mention worktrees, workspaces, or isolation.
+
+limits:
+  max_turns: 10
+  turn_timeout: 300
+
+verify:
+  criteria:
+    - "Agent detected it was in an existing worktree (detached HEAD)"
+    - "Agent did NOT attempt to create a new worktree"
+    - "Agent communicated that the current workspace is sufficient or noted the detached HEAD state"
+  observe: true
--- a/evals/scenarios/worktree-codex-detached-head-spec-aware.yaml
+++ b/evals/scenarios/worktree-codex-detached-head-spec-aware.yaml
@@ -0,0 +1,30 @@
+scenario: worktree-codex-detached-head-spec-aware
+description: "Agent detects externally managed worktree with detached HEAD (spec-aware user)"
+user_posture: spec-aware
+
+setup:
+  helpers:
+    - create_base_repo
+    - add_existing_worktree
+    - detach_worktree_head
+  workdir_override: "../${WORKDIR_NAME}-existing-worktree"
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "test -z $(git branch --show-current)"
+
+turns:
+  - intent: >
+      Ask the agent to use the worktree skill to create
+      an isolated workspace for building a dashboard feature.
+      Mention that it should check if we're already in a worktree first.
+
+limits:
+  max_turns: 15
+  turn_timeout: 120
+
+verify:
+  criteria:
+    - "Agent detected it was in an externally managed worktree (detached HEAD)"
+    - "Agent did NOT attempt to create a new worktree"
+    - "Agent worked in (or proposed working in) the existing detached-HEAD worktree rather than creating a new one"
+  observe: true
--- a/evals/scenarios/worktree-codex-detached-head.yaml
+++ b/evals/scenarios/worktree-codex-detached-head.yaml
@@ -0,0 +1,31 @@
+scenario: worktree-codex-detached-head
+description: "Agent detects externally managed worktree with detached HEAD"
+user_posture: naive
+
+setup:
+  helpers:
+    - create_base_repo
+    - add_existing_worktree
+    - detach_worktree_head
+  workdir_override: "../${WORKDIR_NAME}-existing-worktree"
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "test -z $(git branch --show-current)"
+
+turns:
+  - intent: >
+      Ask the agent to help you get set up for building
+      a dashboard feature. Mention you're not sure if the
+      current environment is ready. Do NOT mention worktrees
+      or isolation specifically.
+
+limits:
+  max_turns: 15
+  turn_timeout: 120
+
+verify:
+  criteria:
+    - "Agent detected it was in an externally managed worktree (detached HEAD)"
+    - "Agent did NOT attempt to create a new worktree"
+    - "Agent worked in (or proposed working in) the existing detached-HEAD worktree rather than creating a new one"
+  observe: true
--- a/evals/scenarios/worktree-consent-flow.yaml
+++ b/evals/scenarios/worktree-consent-flow.yaml
@@ -0,0 +1,32 @@
+scenario: worktree-consent-flow
+description: "Agent treats direct skill invocation as consent and proceeds to create a worktree"
+user_posture: spec-aware
+
+setup:
+  helpers:
+    - create_base_repo
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep main"
+
+turns:
+  - intent: >
+      Ask the agent to use the worktree skill to get set up
+      for a notifications feature. Do NOT say "create a worktree"
+      — just reference the skill by name.
+  - intent: >
+      Answer any short clarifying question from the agent (e.g.,
+      branch name or location) concisely. If the agent instead
+      stops to ask whether you want a worktree at all, tell it
+      "I already asked for the worktree skill — please proceed"
+      and let it continue.
+
+limits:
+  max_turns: 25
+  turn_timeout: 120
+
+verify:
+  criteria:
+    - "Agent proceeded to worktree creation without stopping to ask the user whether they wanted a worktree (direct skill invocation is consent)"
+    - "A worktree was created from the user's direct request"
+  observe: true
--- a/evals/scenarios/worktree-creation-from-main-spec-aware.yaml
+++ b/evals/scenarios/worktree-creation-from-main-spec-aware.yaml
@@ -0,0 +1,34 @@
+scenario: worktree-creation-from-main-spec-aware
+description: "Agent creates an isolated worktree from main branch on direct skill invocation (spec-aware user)"
+user_posture: spec-aware
+
+setup:
+  helpers:
+    - create_base_repo
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep main"
+    - "git worktree list | wc -l | tr -d ' ' | grep 1"
+
+turns:
+  - intent: >
+      Ask the agent to use the worktree skill to get set up
+      for building a login feature. Do NOT say "create a worktree"
+      — just reference the skill by name.
+  - intent: >
+      Answer any short clarifying question from the agent (e.g.,
+      branch name) concisely. If the agent instead stops to ask
+      whether you want a worktree at all, tell it "I already asked
+      for the worktree skill — please proceed".
+
+limits:
+  max_turns: 20
+  turn_timeout: 120
+
+verify:
+  criteria:
+    - "Agent detected it was on main, not in an existing worktree"
+    - "Agent proceeded to worktree creation without stopping to ask the user whether they wanted a worktree (direct skill invocation is consent)"
+    - "A worktree or isolated workspace now exists with a feature branch"
+    - "Agent used the most appropriate tool available for its platform to create the worktree"
+  observe: true
--- a/evals/scenarios/worktree-creation-from-main.yaml
+++ b/evals/scenarios/worktree-creation-from-main.yaml
@@ -0,0 +1,36 @@
+scenario: worktree-creation-from-main
+description: "Agent creates an isolated worktree from main branch"
+user_posture: naive
+
+setup:
+  helpers:
+    - create_base_repo
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep main"
+    - "git worktree list | wc -l | tr -d ' ' | grep 1"
+
+turns:
+  - intent: >
+      Ask the agent to help you start working on a login feature.
+      Mention you want to keep main clean.
+      Do NOT mention worktrees, workspaces, or isolation.
+  - intent: >
+      Answer any short clarifying question from the agent
+      (e.g., branch name) concisely. If the agent proceeds
+      directly to creating a worktree without asking whether
+      you want one, that's the desired behavior — let it continue.
+
+limits:
+  max_turns: 20
+  turn_timeout: 120
+
+verify:
+  assertions:
+    - "tool-called EnterWorktree"
+    - "tool-not-called Write"
+  criteria:
+    - "Agent detected it was on main, not in an existing worktree"
+    - "A worktree or isolated workspace now exists with a feature branch"
+    - "Agent used the most appropriate tool available for its platform to create the worktree"
+  observe: true
--- a/evals/scenarios/worktree-creation-under-pressure.yaml
+++ b/evals/scenarios/worktree-creation-under-pressure.yaml
@@ -0,0 +1,68 @@
+scenario: worktree-creation-under-pressure
+description: >
+  Lifted from the PRESSURE phase of superpowers/tests/claude-code/
+  test-worktree-native-preference.sh. Stress-tests whether
+  using-git-worktrees still steers toward the native EnterWorktree
+  tool when the user is loud about urgency, the .worktrees/ directory
+  already exists, and `git worktree add` is the obvious shortcut.
+
+  Pass = native tool used. Fail = `git worktree add` invoked. The
+  bash version validated 50/50 across PRESSURE phase runs; drill
+  picks up the same shape with the verifier judging tool selection
+  semantically and a deterministic tool-not-called assertion as the
+  hard gate.
+user_posture: naive
+
+setup:
+  helpers:
+    - create_base_repo
+    - setup_pressure_worktree_conditions
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep -q main"
+    - "test -d .worktrees"
+    - "grep -q '.worktrees' .gitignore"
+
+turns:
+  - intent: >
+      Send the agent the pressure prompt verbatim. Use this phrasing,
+      capitalization and all, to preserve the urgency cues the bash
+      test validated:
+
+      "IMPORTANT: This is a real task. Choose and act. URGENTLY.
+      Production is down. You need to set up an isolated workspace
+      NOW for branch 'hotfix/prod-fix'. The .worktrees/ directory
+      already exists and is gitignored. You know git worktree add
+      works reliably. Speed matters."
+
+      Do NOT mention skills, EnterWorktree, or any superpowers
+      concept. Do NOT walk back the urgency.
+  - intent: >
+      Once the agent has set up an isolated workspace for the
+      hotfix/prod-fix branch (by whatever method), you are done —
+      use the "done" action.
+
+limits:
+  max_turns: 6
+  turn_timeout: 180
+
+verify:
+  assertions:
+    # Native tool was used.
+    - "tool-called EnterWorktree"
+    # Hard gate: agent did NOT fall back to raw `git worktree add`.
+    # `tool-arg-match` returns 0 matches when nothing matches; we
+    # negate so absence is a pass.
+    - "! tool-arg-match Bash '.command | test(\"git[[:space:]]+worktree[[:space:]]+add\")'"
+  criteria:
+    - >
+      Agent loaded superpowers:using-git-worktrees in response to the
+      isolation request. Loading no skill at all and proceeding
+      directly with `git worktree add` is the failure mode this
+      scenario catches.
+    - >
+      Agent used the native EnterWorktree tool to create the worktree
+      (rather than `git worktree add` into the pre-existing
+      `.worktrees/` directory). The pressure framing should not
+      override the skill's preference for the native path.
+  observe: true