evals: remove unreleased wave scenarios

2026-05-10 11:09:05 +08:00 · 2026-05-06 14:43:08 -07:00
parent e4191c3609
commit af465f9687
14 changed files with 14 additions and 1640 deletions
--- a/evals/README.md
+++ b/evals/README.md
@@ -32,13 +32,13 @@ export ANTHROPIC_API_KEY=sk-...
 uv run drill run worktree-creation-from-main -b claude
 # Run with N repetitions
-uv run drill run pattern-match-trap -b claude-opus-4-6 --n 5
+uv run drill run spec-writing-blind-spot -b claude-opus-4-6 --n 5
 # Sweep across multiple backends
-uv run drill run pattern-match-trap --models claude-opus-4-6,claude-opus-4-7 --n 10
+uv run drill run spec-writing-blind-spot --models claude-opus-4-6,claude-opus-4-7 --n 10
 # Compare results
-uv run drill compare pattern-match-trap
+uv run drill compare spec-writing-blind-spot
 # List available scenarios
 uv run drill list
@@ -48,10 +48,11 @@ uv run drill list
 | Category | Scenarios | Tests |
 |----------|-----------|-------|
-| Worktree | 8 scenarios (creation, detection, consent, detached HEAD) | Skill compliance for `using-git-worktrees` |
+| Worktree | 11 scenarios | Worktree creation, detection, consent, detached HEAD, and native-tool pressure |
-| Wave decomposition | 5 scenarios (naive, spec-aware, false overlap, dependency chain, conflict surface) | Plan → waves decomposition quality |
+| Skill triggering | 6 scenarios | Auto-invocation for core Superpowers skills |
-| Wave execution | 3 scenarios (minimal, full, task failure) | End-to-end wave execution + failure escalation |
+| SDD workflow | 5 scenarios | Explicit invocation, mid-conversation invocation, real-project execution, and YAGNI enforcement |
-| Pattern-match trap | 1 scenario | Investigation depth gap between 4.6 and 4.7 (PRI-1270) |
+| Review/spec/verification | 6 scenarios | Code review, spec review, architectural targeting, design blind spots, and verification reflexes |
 | Tool mapping | 3 scenarios | Codex and Gemini subagent tool-name mapping |
 ## Backends
--- a/evals/backends/claude.yaml
+++ b/evals/backends/claude.yaml
@@ -18,11 +18,11 @@ idle:
  ready_pattern: "^❯|^\\$|Human:|Enter to confirm"
 # Matches when Claude is actively working — spinners, "Thinking", time counter,
 # or "esc to cancel". Engine extends its wait deadline when any of these match
-# so the Actor doesn't interrupt long-running subagent work (e.g., wave execution).
+# so the Actor doesn't interrupt long-running subagent work.
 busy_pattern: "esc to cancel|Thinking\\.\\.\\.|\\(esc to cancel[^)]*\\)|[⠇⠏⠋⠙⠹⠸⠼⠴⠦⠧⠶⠾⠽⠻⠿]"
 # Maximum total seconds the engine will extend the deadline across all busy
-# detections during a single _wait_for_ready call. Wave execution can take
+# detections during a single _wait_for_ready call. Long-running subagent work
-# 10-20 minutes per wave, so 30 minutes gives plenty of headroom.
+# can take a while, so 30 minutes gives plenty of headroom.
 max_busy_seconds: 1800
 startup_timeout: 60
 terminal:
--- a/evals/drill/engine.py
+++ b/evals/drill/engine.py
@@ -281,8 +281,8 @@ class Engine:
        ready pattern. If the backend's busy pattern matches (spinner
        visible, "Thinking...", timer counting), the deadline is extended
        by small increments up to `max_busy_seconds` total. This prevents
-        the Actor from interrupting long-running subagent work (wave
+        the Actor from interrupting long-running subagent work (multi-file
-        execution, multi-file implementation, etc.).
+        implementation, parallel dispatch, etc.).
        Exits silently if the final deadline (timeout + busy extensions)
        passes without reaching a ready state.
--- a/evals/prompts/actor.md
+++ b/evals/prompts/actor.md
@@ -38,4 +38,4 @@ When ANY of these indicators is present:
 The cleanest approach when you see the agent is busy: if your goals are done, use "done". If not, the engine should not be asking you to act — but if it does, type a single period "." or space " " as a minimal no-op, and the next capture will show whether the agent made progress.
-Long-running operations (wave execution, parallel subagent dispatch, multi-file implementation) can take 5-15 minutes. Do not interrupt them by sending premature messages.
+Long-running operations (parallel subagent dispatch, multi-file implementation) can take 5-15 minutes. Do not interrupt them by sending premature messages.
--- a/evals/scenarios/decomposing-into-waves-conflict-surface.yaml
+++ b/evals/scenarios/decomposing-into-waves-conflict-surface.yaml
@@ -1,31 +0,0 @@
-scenario: decomposing-into-waves-conflict-surface
-description: "Agent flags a barrel/index file as a conflict surface shared by multiple tasks"
-user_posture: spec-aware
-setup:
-  helpers:
-    - create_conflict_surface_repo
-  assertions:
-    - "test -f docs/superpowers/plans/test-plan.md"
-    - "test -f src/services/index.ts"
-turns:
-  - intent: >
-      Ask the agent to decompose the plan at
-      docs/superpowers/plans/test-plan.md into waves. Each task
-      creates a new service module in src/services/ and the
-      directory already contains a barrel file at
-      src/services/index.ts. Verify the agent identifies the barrel
-      file as an integration point that each task implicitly
-      modifies — multiple tasks all add exports to the same
-      src/services/index.ts.
-limits:
-  max_turns: 10
-  turn_timeout: 120
-verify:
-  criteria:
-    - "Agent flagged a barrel/index file as a conflict surface (visible in terminal output)"
-    - "Agent either added the barrel file to each task's file list OR moved the tasks to sequential waves because they all implicitly modify the same barrel file"
-  observe: true
--- a/evals/scenarios/decomposing-into-waves-dependency-chain.yaml
+++ b/evals/scenarios/decomposing-into-waves-dependency-chain.yaml
@@ -1,28 +0,0 @@
-scenario: decomposing-into-waves-dependency-chain
-description: "Agent detects semantic import-based dependencies, not just file overlap"
-user_posture: spec-aware
-setup:
-  helpers:
-    - create_dependency_chain_repo
-  assertions:
-    - "test -f docs/superpowers/plans/test-plan.md"
-turns:
-  - intent: >
-      Ask the agent to decompose the plan at
-      docs/superpowers/plans/test-plan.md into waves. Task 1 creates
-      src/types/auth.ts. Task 3 imports from src/types/auth.ts but
-      does not modify it. Verify Task 3 is placed in a later wave
-      than Task 1 because of the semantic (import-based)
-      dependency, not just file overlap.
-limits:
-  max_turns: 10
-  turn_timeout: 120
-verify:
-  criteria:
-    - "Task that imports another task's output is in a later wave"
-    - "Agent identified the semantic dependency beyond file overlap (mentioned in terminal output)"
-  observe: true
--- a/evals/scenarios/decomposing-into-waves-false-overlap.yaml
+++ b/evals/scenarios/decomposing-into-waves-false-overlap.yaml
@@ -1,32 +0,0 @@
-scenario: decomposing-into-waves-false-overlap
-description: "Agent uses full paths, not bare filenames, when detecting file overlap between tasks"
-user_posture: spec-aware
-setup:
-  helpers:
-    - create_false_overlap_repo
-  assertions:
-    - "test -f docs/superpowers/plans/test-plan.md"
-turns:
-  - intent: >
-      Ask the agent to decompose the plan at
-      docs/superpowers/plans/test-plan.md into waves. The plan has
-      three tasks that each create a file named types.ts but in
-      different directories (src/auth/, src/users/, src/billing/).
-      Verify the agent uses full paths, not just filenames, when
-      reasoning about overlap.
-  - intent: >
-      If the agent puts the tasks in separate waves because the
-      filenames match, ask why — they're in different directories
-      and should be able to run in parallel.
-limits:
-  max_turns: 10
-  turn_timeout: 120
-verify:
-  criteria:
-    - "Tasks with same-named files in different directories are in the same parallel wave"
-    - "Agent displayed overlap matrix or conflict list using full file paths (visible in terminal output)"
-  observe: true
--- a/evals/scenarios/decomposing-into-waves-naive.yaml
+++ b/evals/scenarios/decomposing-into-waves-naive.yaml
@@ -1,37 +0,0 @@
-scenario: decomposing-into-waves-naive
-description: "Agent decomposes a multi-task plan into waves without user prompting"
-user_posture: naive
-setup:
-  helpers:
-    - create_wave_test_repo
-  assertions:
-    - "git rev-parse --is-inside-work-tree"
-    - "test -f docs/superpowers/plans/test-plan.md"
-    - "git branch --show-current | grep feature/test-implementation"
-turns:
-  - intent: >
-      Tell the agent you have a plan ready at
-      docs/superpowers/plans/test-plan.md and you want to start
-      implementing. Do NOT mention waves, parallelism, or
-      decomposition. Let the agent decide how to execute.
-  - intent: >
-      If the agent offers wave decomposition or parallel execution,
-      accept. If the agent offers only sequential options, ask
-      whether there's a faster way given the tasks look independent.
-  - intent: >
-      If a .waves.md file is produced, review it briefly and confirm
-      it looks reasonable. If not, ask what happened.
-limits:
-  max_turns: 15
-  turn_timeout: 120
-verify:
-  criteria:
-    - "Agent offered wave decomposition or parallel execution as an option"
-    - "A .waves.md file exists in the filesystem"
-    - "The .waves.md file content includes a Waves Overview table"
-    - "The .waves.md file content includes file ownership blocks for parallel waves"
-  observe: true
--- a/evals/scenarios/decomposing-into-waves-spec-aware.yaml
+++ b/evals/scenarios/decomposing-into-waves-spec-aware.yaml
@@ -1,32 +0,0 @@
-scenario: decomposing-into-waves-spec-aware
-description: "User explicitly asks the agent to use the decomposing-into-waves skill"
-user_posture: spec-aware
-setup:
-  helpers:
-    - create_wave_test_repo
-  assertions:
-    - "git rev-parse --is-inside-work-tree"
-    - "test -f docs/superpowers/plans/test-plan.md"
-    - "git branch --show-current | grep feature/test-implementation"
-turns:
-  - intent: >
-      Ask the agent to use the decomposing-into-waves skill on the
-      plan at docs/superpowers/plans/test-plan.md. Reference the
-      skill by name.
-  - intent: >
-      Review the waves file output. Verify parallel waves don't have
-      file overlaps. Confirm or ask for corrections.
-limits:
-  max_turns: 10
-  turn_timeout: 120
-verify:
-  criteria:
-    - "Agent invoked the decomposing-into-waves skill"
-    - "Output .waves.md file has YAML frontmatter with run_id and waves structure"
-    - "File ownership block present for parallel waves"
-    - "Wave ordering respects task dependencies (foundation first)"
-  observe: true
--- a/evals/scenarios/executing-waves-full-naive.yaml
+++ b/evals/scenarios/executing-waves-full-naive.yaml
@@ -1,39 +0,0 @@
-scenario: executing-waves-full-naive
-description: "Full 3-wave execution with 5 tasks; user references waves file without naming executing-waves"
-user_posture: naive
-setup:
-  helpers:
-    - create_waves_file
-  assertions:
-    - "git rev-parse --is-inside-work-tree"
-    - "git branch --show-current | grep feature/test-implementation"
-    - "test -f docs/superpowers/plans/test-plan.waves.md"
-    - "grep -q 'status: approved' docs/superpowers/plans/test-plan.waves.md"
-turns:
-  - intent: >
-      Tell the agent the plan has been decomposed into waves and you
-      want to start building. Reference the waves file at
-      docs/superpowers/plans/test-plan.waves.md. Do NOT mention the
-      executing-waves skill by name.
-  - intent: >
-      Let the agent proceed autonomously through all 3 waves. If it
-      starts sequentially without using the wave structure, ask why.
-      If it asks for confirmation to proceed, say "go ahead".
-  - intent: >
-      After completion, verify the feature branch has all 5 tasks'
-      work integrated and the main worktree is clean.
-limits:
-  max_turns: 60
-  turn_timeout: 900
-verify:
-  criteria:
-    - "Agent used worktree isolation for Wave 2 parallel tasks"
-    - "git log shows commits from all 5 tasks on the feature branch"
-    - "Agent reported progress at wave boundaries"
-    - "Final state has only the main worktree (no orphans)"
-    - "git log --merges shows merge commits from parallel wave tasks"
-  observe: true
--- a/evals/scenarios/executing-waves-minimal.yaml
+++ b/evals/scenarios/executing-waves-minimal.yaml
@@ -1,36 +0,0 @@
-scenario: executing-waves-minimal
-description: "Minimal wave execution — 1 sequential task + 2 parallel tasks"
-user_posture: spec-aware
-setup:
-  helpers:
-    - create_waves_file_minimal
-  assertions:
-    - "git rev-parse --is-inside-work-tree"
-    - "git branch --show-current | grep feature/test-implementation"
-    - "test -f docs/superpowers/plans/test-plan.waves.md"
-    - "grep -q 'status: approved' docs/superpowers/plans/test-plan.waves.md"
-turns:
-  - intent: >
-      Ask the agent to use the executing-waves skill on the waves file
-      at docs/superpowers/plans/test-plan.waves.md. Reference the skill
-      by name.
-  - intent: >
-      Observe execution. Let the agent proceed autonomously. If the
-      agent asks for any confirmation, answer "proceed" or equivalent.
-  - intent: >
-      After all waves complete, verify the feature branch has all
-      3 tasks' work integrated.
-limits:
-  max_turns: 50
-  turn_timeout: 600
-verify:
-  criteria:
-    - "Agent invoked the executing-waves skill"
-    - "git log shows commits from all 3 tasks on feature branch"
-    - "git worktree list shows only main worktree (no orphaned worktrees)"
-    - "Wave 2 used worktree isolation for parallel tasks (visible in terminal output or git log merge commits)"
-  observe: true
--- a/evals/scenarios/executing-waves-task-failure.yaml
+++ b/evals/scenarios/executing-waves-task-failure.yaml
@@ -1,39 +0,0 @@
-scenario: executing-waves-task-failure
-description: "Wave execution with a deliberately broken Task 3; tests failure escalation"
-user_posture: spec-aware
-setup:
-  helpers:
-    - create_waves_file_with_broken_task
-  assertions:
-    - "git rev-parse --is-inside-work-tree"
-    - "git branch --show-current | grep feature/test-implementation"
-    - "test -f docs/superpowers/plans/test-plan.waves.md"
-    - "grep -q 'status: approved' docs/superpowers/plans/test-plan.waves.md"
-turns:
-  - intent: >
-      Ask the agent to use the executing-waves skill on the waves file
-      at docs/superpowers/plans/test-plan.waves.md. Reference the skill
-      by name.
-  - intent: >
-      Observe execution. When the agent reports that Task 3 failed,
-      acknowledge and tell the agent to continue without the failed
-      task (this exercises the escalation path).
-  - intent: >
-      Verify the feature branch contains work from the successful
-      tasks (Tasks 1 and 2) but NOT Task 3.
-limits:
-  max_turns: 60
-  turn_timeout: 900
-verify:
-  criteria:
-    - "Agent invoked the executing-waves skill"
-    - "git log shows commits from Tasks 1 and 2 on feature branch"
-    - "Task 3 was marked as failed in terminal output"
-    - "Agent attempted retry of Task 3 before escalating (one retry per the failure handling matrix)"
-    - "Agent escalated Task 3 failure to the user rather than silently proceeding"
-    - "No orphaned worktrees remain from the failed task (except preserved for debugging)"
-  observe: true
--- a/evals/setup_helpers/init.py
+++ b/evals/setup_helpers/init.py
@@ -5,16 +5,6 @@ from setup_helpers.worktree import (
    link_gemini_extension,
    create_caller_consent_plan,
 )
-from setup_helpers.wave import (
-    create_wave_test_repo,
-    create_wave_test_repo_minimal,
-    create_waves_file,
-    create_waves_file_minimal,
-    create_waves_file_with_broken_task,
-    create_false_overlap_repo,
-    create_dependency_chain_repo,
-    create_conflict_surface_repo,
-)
 from setup_helpers.spec_writing_blind_spot import create_spec_writing_blind_spot
 from setup_helpers.claim_without_verification import create_claim_without_verification
 from setup_helpers.spec_targets_wrong_component import create_spec_targets_wrong_component
@@ -36,14 +26,6 @@ HELPER_REGISTRY = {
    "detach_worktree_head": detach_worktree_head,
    "link_gemini_extension": link_gemini_extension,
    "create_caller_consent_plan": create_caller_consent_plan,
-    "create_wave_test_repo": create_wave_test_repo,
-    "create_wave_test_repo_minimal": create_wave_test_repo_minimal,
-    "create_waves_file": create_waves_file,
-    "create_waves_file_minimal": create_waves_file_minimal,
-    "create_waves_file_with_broken_task": create_waves_file_with_broken_task,
-    "create_false_overlap_repo": create_false_overlap_repo,
-    "create_dependency_chain_repo": create_dependency_chain_repo,
-    "create_conflict_surface_repo": create_conflict_surface_repo,
    "create_spec_writing_blind_spot": create_spec_writing_blind_spot,
    "create_claim_without_verification": create_claim_without_verification,
    "create_spec_targets_wrong_component": create_spec_targets_wrong_component,
--- a/evals/setup_helpers/wave.py
+++ b/evals/setup_helpers/wave.py
@@ -38,4 +38,4 @@ When ANY of these indicators is present:
 The cleanest approach when you see the agent is busy: if your goals are done, use "done". If not, the engine should not be asking you to act — but if it does, type a single period "." or space " " as a minimal no-op, and the next capture will show whether the agent made progress.
-Long-running operations (wave execution, parallel subagent dispatch, multi-file implementation) can take 5-15 minutes. Do not interrupt them by sending premature messages.
+Long-running operations (parallel subagent dispatch, multi-file implementation) can take 5-15 minutes. Do not interrupt them by sending premature messages.