Lift drill into evals/ at 013fcb8b7dbefd6d3fa4653493e5d2ec8e7f985b

rsync of obra/drill@013fcb8b7d into superpowers/evals/, excluding .git/, .venv/, results/, .env/, __pycache__/, *.egg-info/, .private-journal/. The drill repo is unaffected by this commit; archival is a separate manual step after this PR merges. Source SHA recorded at evals/.drill-source-sha for divergence detection.
2026-05-11 03:29:04 +08:00 · 2026-05-06 12:15:46 -07:00
parent 895bb732d5
commit 3c046f579e
124 changed files with 13806 additions and 0 deletions
--- a/evals/scenarios/mid-conversation-skill-invocation.yaml
+++ b/evals/scenarios/mid-conversation-skill-invocation.yaml
@@ -0,0 +1,77 @@
+scenario: mid-conversation-skill-invocation
+description: >
+  Lifted from superpowers/tests/explicit-skill-requests/run-claude-describes-sdd.sh.
+  Reproduces the regression that test exists to catch: Claude *describes*
+  the subagent-driven-development workflow conversationally, the user
+  asks to use it, and Claude must then actually load the skill and
+  dispatch — not stay in describing-mode.
+
+  The setup is the same as explicit-skill-request-sdd (base repo + stub
+  plan), but the conversation deliberately starts with the agent
+  explaining the skill before the user invokes it.
+user_posture: spec-aware
+
+setup:
+  helpers:
+    - create_base_repo
+    - add_sdd_auth_plan
+  assertions:
+    - "git rev-parse --is-inside-work-tree"
+    - "git branch --show-current | grep -qx main"  # -x: whole-line match, so e.g. "maintenance" fails
+    - "test -f docs/superpowers/plans/auth-system.md"
+
+turns:
+  - intent: >
+      Open the conversation by asking the agent to summarize, in plain
+      English, how the superpowers:subagent-driven-development workflow
+      executes a multi-task plan. Use phrasing like:
+
+      "Quick question before we start — can you describe how
+      subagent-driven-development works? I want to make sure I
+      understand the workflow before I commit to using it."
+
+      Do NOT ask the agent to use the skill yet. The point is to put
+      the agent in describing-mode first.
+  - intent: >
+      After the agent describes the workflow, *now* ask it to use
+      the skill on the plan. Use phrasing like:
+
+      "Got it, that's what I want. I have a plan at
+      docs/superpowers/plans/auth-system.md. subagent-driven-development,
+      please — dispatch the first subagent."
+
+      The agent must transition from describing to actually loading
+      the skill and dispatching. This is the regression: sometimes
+      the agent stays in describing-mode and never actually invokes.
+  - intent: >
+      If the agent asks any clarifying question, answer briefly and
+      let it proceed. If it offers to start, say "yes, go ahead."
+  - intent: >
+      Once the agent has the SDD skill loaded AND has dispatched at
+      least one subagent after your second message (a skill load
+      made only to answer the description request is not enough on
+      its own), you are done — use the "done" action.
+
+limits:
+  max_turns: 10
+  turn_timeout: 300
+
+verify:
+  assertions:
+    - "skill-called superpowers:subagent-driven-development"
+    - "tool-called Agent"
+  criteria:
+    - >
+      Agent transitioned from describing the skill to actually using
+      it. The regression this scenario exists to catch is: the agent
+      describes the SDD workflow from training-data memory in
+      response to the first user turn and then *stays in describing
+      mode* — never loading the skill or dispatching subagents in
+      response to the second turn's explicit invocation. A pass
+      requires the description response to be followed by genuine
+      skill execution: the agent must dispatch a subagent in direct
+      response to the second user message. (Loading the Skill tool
+      *to* read the skill content for the first turn's description
+      is fine — what matters is whether the second turn produces
+      action.)
+  observe: true