mirror of
https://github.com/obra/superpowers.git
synced 2026-05-10 02:59:04 +08:00
Lift drill into evals/ at 013fcb8b7dbefd6d3fa4653493e5d2ec8e7f985b
rsync of obra/drill@013fcb8b7d into superpowers/evals/, excluding .git/, .venv/, results/, .env/, __pycache__/, *.egg-info/, .private-journal/. The drill repo is unaffected by this commit; archival is a separate manual step after this PR merges. Source SHA recorded at evals/.drill-source-sha for divergence detection.
This commit is contained in:
committed by
Drew Ritter
parent
2e46e9590d
commit
3b412a3836
0
evals/tests/__init__.py
Normal file
0
evals/tests/__init__.py
Normal file
0
evals/tests/fixtures/tools_empty.jsonl
vendored
Normal file
0
evals/tests/fixtures/tools_empty.jsonl
vendored
Normal file
5
evals/tests/fixtures/tools_multi.jsonl
vendored
Normal file
5
evals/tests/fixtures/tools_multi.jsonl
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
{"tool": "Read", "args": {"file_path": "/tmp/foo.py"}, "source": "native"}
|
||||
{"tool": "Skill", "args": {"skill": "superpowers:worktree"}, "source": "native"}
|
||||
{"tool": "Edit", "args": {"file_path": "/tmp/foo.py"}, "source": "native"}
|
||||
{"tool": "Read", "args": {"file_path": "/tmp/bar.py"}, "source": "native"}
|
||||
{"tool": "Bash", "args": {"command": "git status"}, "source": "shell"}
|
||||
4
evals/tests/fixtures/tools_ordered.jsonl
vendored
Normal file
4
evals/tests/fixtures/tools_ordered.jsonl
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
{"tool": "EnterWorktree", "args": {"branch": "feature/login"}, "source": "native"}
|
||||
{"tool": "Read", "args": {"file_path": "/tmp/foo.py"}, "source": "native"}
|
||||
{"tool": "Edit", "args": {"file_path": "/tmp/foo.py"}, "source": "native"}
|
||||
{"tool": "Bash", "args": {"command": "pytest"}, "source": "shell"}
|
||||
1
evals/tests/fixtures/tools_single.jsonl
vendored
Normal file
1
evals/tests/fixtures/tools_single.jsonl
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"tool": "Read", "args": {"file_path": "/tmp/foo.py"}, "source": "native"}
|
||||
51
evals/tests/test_actor.py
Normal file
51
evals/tests/test_actor.py
Normal file
@@ -0,0 +1,51 @@
|
||||
from drill.actor import Actor, ActorAction
|
||||
|
||||
|
||||
class TestActorAction:
|
||||
def test_parse_type_action(self):
|
||||
action = ActorAction.from_tool_result({"action": "type", "text": "create a worktree"})
|
||||
assert action.action == "type"
|
||||
assert action.text == "create a worktree"
|
||||
|
||||
def test_parse_done_action(self):
|
||||
action = ActorAction.from_tool_result({"action": "done"})
|
||||
assert action.action == "done"
|
||||
|
||||
def test_parse_stuck_action(self):
|
||||
action = ActorAction.from_tool_result({"action": "stuck"})
|
||||
assert action.action == "stuck"
|
||||
|
||||
def test_parse_key_action(self):
|
||||
action = ActorAction.from_tool_result({"action": "key", "key": "ctrl-c"})
|
||||
assert action.action == "key"
|
||||
assert action.key == "ctrl-c"
|
||||
|
||||
|
||||
class TestActorPrompt:
|
||||
def test_builds_system_prompt_naive(self):
|
||||
actor = Actor(model="claude-sonnet-4-6", temperature=0.7)
|
||||
prompt = actor.build_system_prompt(
|
||||
posture="naive",
|
||||
intents=["Ask the agent to create a worktree"],
|
||||
)
|
||||
assert "plain language" in prompt.lower() or "don't know" in prompt.lower()
|
||||
assert "create a worktree" in prompt
|
||||
|
||||
def test_builds_system_prompt_spec_aware(self):
|
||||
actor = Actor(model="claude-sonnet-4-6", temperature=0.7)
|
||||
prompt = actor.build_system_prompt(
|
||||
posture="spec-aware",
|
||||
intents=["Use the worktree skill"],
|
||||
)
|
||||
assert "skill" in prompt.lower() or "convention" in prompt.lower()
|
||||
|
||||
|
||||
class TestActorContext:
|
||||
def test_appends_terminal_captures(self):
|
||||
actor = Actor(model="claude-sonnet-4-6", temperature=0.7)
|
||||
actor.append_capture("Screen 1: Welcome to Claude")
|
||||
actor.append_capture("Screen 2: ❯ ")
|
||||
messages = actor.build_messages()
|
||||
assert len(messages) == 2
|
||||
assert "Screen 1" in messages[0]["content"]
|
||||
assert "Screen 2" in messages[1]["content"]
|
||||
106
evals/tests/test_assertions.py
Normal file
106
evals/tests/test_assertions.py
Normal file
@@ -0,0 +1,106 @@
|
||||
from drill.assertions import AssertionResult, run_verify_assertions
|
||||
|
||||
|
||||
class TestAssertionResult:
|
||||
def test_passing_to_criterion_result(self):
|
||||
ar = AssertionResult(
|
||||
command="tool-called Read",
|
||||
passed=True,
|
||||
exit_code=0,
|
||||
stdout="PASS: Read called 3 time(s)",
|
||||
stderr="",
|
||||
)
|
||||
cr = ar.to_criterion_result()
|
||||
assert cr.verdict == "pass"
|
||||
assert cr.source == "assertion"
|
||||
assert "[assertion]" in cr.criterion
|
||||
assert "tool-called Read" in cr.criterion
|
||||
|
||||
def test_failing_to_criterion_result(self):
|
||||
ar = AssertionResult(
|
||||
command="tool-not-called Write",
|
||||
passed=False,
|
||||
exit_code=1,
|
||||
stdout="",
|
||||
stderr="FAIL: Write called 2 time(s)",
|
||||
)
|
||||
cr = ar.to_criterion_result()
|
||||
assert cr.verdict == "fail"
|
||||
assert cr.source == "assertion"
|
||||
assert "stderr: FAIL" in cr.evidence
|
||||
|
||||
|
||||
class TestRunVerifyAssertions:
|
||||
def test_passing_assertion(self, tmp_path):
|
||||
tc = '{"tool": "Read", "args": {}, "source": "native"}\n'
|
||||
(tmp_path / "tool_calls.jsonl").write_text(tc)
|
||||
results = run_verify_assertions(
|
||||
assertions=["grep -q Read tool_calls.jsonl"],
|
||||
results_dir=tmp_path,
|
||||
workdir=tmp_path,
|
||||
)
|
||||
assert len(results) == 1
|
||||
assert results[0].passed is True
|
||||
assert results[0].exit_code == 0
|
||||
|
||||
def test_failing_assertion(self, tmp_path):
|
||||
tc = '{"tool": "Read", "args": {}, "source": "native"}\n'
|
||||
(tmp_path / "tool_calls.jsonl").write_text(tc)
|
||||
results = run_verify_assertions(
|
||||
assertions=["grep -q NonexistentTool tool_calls.jsonl"],
|
||||
results_dir=tmp_path,
|
||||
workdir=tmp_path,
|
||||
)
|
||||
assert len(results) == 1
|
||||
assert results[0].passed is False
|
||||
|
||||
def test_runs_all_assertions(self, tmp_path):
|
||||
(tmp_path / "tool_calls.jsonl").write_text('{"tool": "Read"}\n')
|
||||
results = run_verify_assertions(
|
||||
assertions=[
|
||||
"grep -q Read tool_calls.jsonl",
|
||||
"grep -q Write tool_calls.jsonl",
|
||||
"grep -q Read tool_calls.jsonl",
|
||||
],
|
||||
results_dir=tmp_path,
|
||||
workdir=tmp_path,
|
||||
)
|
||||
assert len(results) == 3
|
||||
assert results[0].passed is True
|
||||
assert results[1].passed is False
|
||||
assert results[2].passed is True
|
||||
|
||||
def test_timeout_handling(self, tmp_path):
|
||||
(tmp_path / "tool_calls.jsonl").write_text("{}\n")
|
||||
results = run_verify_assertions(
|
||||
assertions=["sleep 30"],
|
||||
results_dir=tmp_path,
|
||||
workdir=tmp_path,
|
||||
timeout_seconds=1,
|
||||
)
|
||||
assert len(results) == 1
|
||||
assert results[0].passed is False
|
||||
assert results[0].exit_code == 124
|
||||
assert "Timed out" in results[0].stderr
|
||||
|
||||
def test_drill_workdir_env_var(self, tmp_path):
|
||||
(tmp_path / "tool_calls.jsonl").write_text("{}\n")
|
||||
workdir = tmp_path / "scenario-workdir"
|
||||
workdir.mkdir()
|
||||
results = run_verify_assertions(
|
||||
assertions=['test "$DRILL_WORKDIR" = "' + str(workdir) + '"'],
|
||||
results_dir=tmp_path,
|
||||
workdir=workdir,
|
||||
)
|
||||
assert len(results) == 1
|
||||
assert results[0].passed is True
|
||||
|
||||
def test_bin_dir_on_path(self, tmp_path):
|
||||
(tmp_path / "tool_calls.jsonl").write_text("{}\n")
|
||||
results = run_verify_assertions(
|
||||
assertions=["echo $PATH | grep -q bin"],
|
||||
results_dir=tmp_path,
|
||||
workdir=tmp_path,
|
||||
)
|
||||
assert len(results) == 1
|
||||
assert results[0].passed is True
|
||||
145
evals/tests/test_backend.py
Normal file
145
evals/tests/test_backend.py
Normal file
@@ -0,0 +1,145 @@
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from drill.backend import Backend, load_backend
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def backends_dir():
|
||||
return Path(__file__).parent.parent / "backends"
|
||||
|
||||
|
||||
class TestLoadBackend:
|
||||
def test_loads_claude_backend(self, backends_dir):
|
||||
backend = load_backend("claude", backends_dir)
|
||||
assert backend.name == "claude"
|
||||
assert backend.cli == "claude"
|
||||
assert "--dangerously-skip-permissions" in backend.args
|
||||
|
||||
def test_loads_codex_backend(self, backends_dir):
|
||||
backend = load_backend("codex", backends_dir)
|
||||
assert backend.name == "codex"
|
||||
assert backend.cli == "codex"
|
||||
|
||||
def test_unknown_backend_raises(self, backends_dir):
|
||||
with pytest.raises(FileNotFoundError):
|
||||
load_backend("nonexistent", backends_dir)
|
||||
|
||||
def test_loads_claude_opus_4_6_variant(self, backends_dir, monkeypatch):
|
||||
monkeypatch.setenv("SUPERPOWERS_ROOT", "/tmp/sp")
|
||||
backend = load_backend("claude-opus-4-6", backends_dir)
|
||||
assert backend.name == "claude-opus-4-6"
|
||||
assert backend.family == "claude"
|
||||
assert backend.model == "claude-opus-4-6"
|
||||
|
||||
|
||||
class TestBackendBuildCommand:
|
||||
def test_claude_build_command(self, backends_dir, monkeypatch):
|
||||
monkeypatch.setenv("SUPERPOWERS_ROOT", "/tmp/superpowers")
|
||||
backend = load_backend("claude", backends_dir)
|
||||
cmd = backend.build_command("/tmp/workdir")
|
||||
assert cmd[0] == "claude"
|
||||
assert "--plugin-dir" in cmd
|
||||
assert "/tmp/superpowers" in cmd
|
||||
|
||||
def test_codex_build_command(self, backends_dir, monkeypatch):
|
||||
monkeypatch.setenv("SUPERPOWERS_ROOT", "/tmp/superpowers")
|
||||
backend = load_backend("codex", backends_dir)
|
||||
cmd = backend.build_command("/tmp/workdir")
|
||||
assert cmd[0] == "codex"
|
||||
|
||||
|
||||
class TestBackendEnvValidation:
|
||||
def test_missing_env_raises(self, backends_dir, monkeypatch):
|
||||
monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
|
||||
monkeypatch.delenv("SUPERPOWERS_ROOT", raising=False)
|
||||
backend = load_backend("claude", backends_dir)
|
||||
with pytest.raises(EnvironmentError, match="ANTHROPIC_API_KEY"):
|
||||
backend.validate_env()
|
||||
|
||||
|
||||
class TestBackendIdleDetection:
|
||||
def test_ready_pattern_matches(self, backends_dir):
|
||||
backend = load_backend("claude", backends_dir)
|
||||
assert backend.is_ready_line("❯ ")
|
||||
assert backend.is_ready_line("Human: ")
|
||||
assert not backend.is_ready_line("Running tool...")
|
||||
|
||||
|
||||
class TestBackendModelExtraction:
|
||||
def test_extract_model_from_args(self, backends_dir, monkeypatch):
|
||||
monkeypatch.setenv("SUPERPOWERS_ROOT", "/tmp/sp")
|
||||
backend = load_backend("claude", backends_dir)
|
||||
assert backend.model == "opus"
|
||||
|
||||
def test_no_model_flag_returns_none(self):
|
||||
backend = Backend(
|
||||
name="test",
|
||||
cli="test",
|
||||
args=["--foo", "bar"],
|
||||
required_env=[],
|
||||
hooks={"pre_run": [], "post_run": []},
|
||||
shutdown="/exit",
|
||||
idle={},
|
||||
startup_timeout=30,
|
||||
terminal={},
|
||||
session_logs={},
|
||||
)
|
||||
assert backend.model is None
|
||||
|
||||
def test_extracts_from_short_m_flag(self):
|
||||
backend = Backend(
|
||||
name="test",
|
||||
cli="test",
|
||||
args=["-m", "gemini-2.5-flash"],
|
||||
required_env=[],
|
||||
hooks={"pre_run": [], "post_run": []},
|
||||
shutdown="/exit",
|
||||
idle={},
|
||||
startup_timeout=30,
|
||||
terminal={},
|
||||
session_logs={},
|
||||
)
|
||||
assert backend.model == "gemini-2.5-flash"
|
||||
|
||||
|
||||
class TestBackendFamily:
|
||||
def test_claude_backend_family(self, backends_dir, monkeypatch):
|
||||
monkeypatch.setenv("SUPERPOWERS_ROOT", "/tmp/sp")
|
||||
backend = load_backend("claude", backends_dir)
|
||||
assert backend.family == "claude"
|
||||
|
||||
def test_codex_backend_family(self, backends_dir):
|
||||
backend = load_backend("codex", backends_dir)
|
||||
assert backend.family == "codex"
|
||||
|
||||
def test_variant_name_preserves_family(self):
|
||||
backend = Backend(
|
||||
name="claude-opus-4-6",
|
||||
cli="claude",
|
||||
args=[],
|
||||
required_env=[],
|
||||
hooks={"pre_run": [], "post_run": []},
|
||||
shutdown="/exit",
|
||||
idle={},
|
||||
startup_timeout=30,
|
||||
terminal={},
|
||||
session_logs={},
|
||||
)
|
||||
assert backend.family == "claude"
|
||||
|
||||
def test_unknown_family_is_other(self):
|
||||
backend = Backend(
|
||||
name="random-xyz",
|
||||
cli="xyz",
|
||||
args=[],
|
||||
required_env=[],
|
||||
hooks={"pre_run": [], "post_run": []},
|
||||
shutdown="/exit",
|
||||
idle={},
|
||||
startup_timeout=30,
|
||||
terminal={},
|
||||
session_logs={},
|
||||
)
|
||||
assert backend.family == "other"
|
||||
61
evals/tests/test_cli.py
Normal file
61
evals/tests/test_cli.py
Normal file
@@ -0,0 +1,61 @@
|
||||
"""Tests for CLI option parsing."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from click.testing import CliRunner
|
||||
|
||||
from drill.cli import main
|
||||
|
||||
|
||||
class TestRunCommand:
|
||||
def test_backend_required_without_models(self) -> None:
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(main, ["run", "nonexistent"])
|
||||
assert result.exit_code != 0
|
||||
|
||||
def test_n_default_is_1(self) -> None:
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(main, ["run", "nonexistent", "--backend", "claude", "--n", "1"])
|
||||
assert "Scenario not found" in result.output or result.exit_code != 0
|
||||
|
||||
def test_models_flag_accepted(self) -> None:
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(main, ["run", "nonexistent", "--models", "claude,codex"])
|
||||
assert "Scenario not found" in result.output or result.exit_code != 0
|
||||
|
||||
def test_n_must_be_positive(self) -> None:
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(main, ["run", "nonexistent", "--backend", "claude", "--n", "0"])
|
||||
assert result.exit_code != 0
|
||||
|
||||
|
||||
class TestListCommand:
|
||||
def test_lists_scenarios(self, tmp_path):
|
||||
scenarios_dir = tmp_path / "scenarios"
|
||||
scenarios_dir.mkdir()
|
||||
(scenarios_dir / "test-scenario.yaml").write_text("""
|
||||
scenario: test-scenario
|
||||
description: "A test scenario"
|
||||
user_posture: naive
|
||||
setup:
|
||||
helpers: []
|
||||
assertions: []
|
||||
turns: []
|
||||
limits:
|
||||
max_turns: 5
|
||||
turn_timeout: 30
|
||||
verify:
|
||||
criteria: []
|
||||
observe: false
|
||||
""")
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(main, ["list", "--scenarios-dir", str(scenarios_dir)])
|
||||
assert result.exit_code == 0
|
||||
assert "test-scenario" in result.output
|
||||
|
||||
|
||||
class TestCompareCommand:
|
||||
def test_sweep_flag_accepted(self) -> None:
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(main, ["compare", "nonexistent", "--sweep", "abc123"])
|
||||
assert result.exit_code != 0 # No results dir, but flag is parsed
|
||||
217
evals/tests/test_compare.py
Normal file
217
evals/tests/test_compare.py
Normal file
@@ -0,0 +1,217 @@
|
||||
"""Tests for compare module."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from drill.compare import BackendResult, format_compare_output, load_scenario_results
|
||||
|
||||
|
||||
def _write_verdict(path: Path, criteria: list[dict[str, str]]) -> None:
|
||||
verdict = {
|
||||
"criteria": criteria,
|
||||
"observations": ["test obs"],
|
||||
"summary": "ok",
|
||||
}
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
path.write_text(json.dumps(verdict))
|
||||
|
||||
|
||||
def _write_meta(path: Path, **kwargs: object) -> None:
|
||||
meta = {"scenario": "test", "backend": "claude", "actor_turns": 4, **kwargs}
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
path.write_text(json.dumps(meta))
|
||||
|
||||
|
||||
def _write_run_group(
|
||||
path: Path, n: int, runs: list[dict[str, object]], sweep_id: str = "abc12345"
|
||||
) -> None:
|
||||
data = {
|
||||
"scenario": "test",
|
||||
"backend": "claude",
|
||||
"n": n,
|
||||
"timestamp": "2026-04-20T14-30-00",
|
||||
"sweep_id": sweep_id,
|
||||
"partial": False,
|
||||
"runs": runs,
|
||||
}
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
path.write_text(json.dumps(data))
|
||||
|
||||
|
||||
class TestLoadScenarioResults:
|
||||
def test_loads_new_format_single_run(self, tmp_path: Path) -> None:
|
||||
scenario_dir = tmp_path / "test-scenario" / "claude" / "2026-04-20T14-30-00-abc12345"
|
||||
run_dir = scenario_dir / "run-00"
|
||||
criteria = [{"criterion": "c1", "verdict": "pass", "evidence": "e", "rationale": "r"}]
|
||||
_write_verdict(run_dir / "verdict.json", criteria)
|
||||
_write_meta(run_dir / "meta.json")
|
||||
_write_run_group(
|
||||
scenario_dir / "run-group.json",
|
||||
n=1,
|
||||
runs=[{"index": 0, "status": "pass", "duration": 10.0}],
|
||||
)
|
||||
results = load_scenario_results(tmp_path / "test-scenario")
|
||||
assert "claude" in results
|
||||
assert results["claude"].total_runs == 1
|
||||
assert results["claude"].passed_runs == 1
|
||||
|
||||
def test_loads_new_format_multi_run(self, tmp_path: Path) -> None:
|
||||
scenario_dir = tmp_path / "test-scenario" / "claude" / "2026-04-20T14-30-00-abc12345"
|
||||
for i in range(3):
|
||||
run_dir = scenario_dir / f"run-{i:02d}"
|
||||
verdict_val = "pass" if i < 2 else "fail"
|
||||
criteria = [
|
||||
{"criterion": "c1", "verdict": verdict_val, "evidence": "e", "rationale": "r"}
|
||||
]
|
||||
_write_verdict(run_dir / "verdict.json", criteria)
|
||||
_write_meta(run_dir / "meta.json")
|
||||
_write_run_group(
|
||||
scenario_dir / "run-group.json",
|
||||
n=3,
|
||||
runs=[
|
||||
{"index": 0, "status": "pass", "duration": 10.0},
|
||||
{"index": 1, "status": "pass", "duration": 11.0},
|
||||
{"index": 2, "status": "fail", "duration": 12.0},
|
||||
],
|
||||
)
|
||||
results = load_scenario_results(tmp_path / "test-scenario")
|
||||
assert results["claude"].total_runs == 3
|
||||
assert results["claude"].passed_runs == 2
|
||||
assert len(results["claude"].criterion_counts) == 1
|
||||
assert results["claude"].criterion_counts["c1"] == (2, 3)
|
||||
|
||||
def test_loads_old_format_backwards_compat(self, tmp_path: Path) -> None:
|
||||
scenario_dir = tmp_path / "test-scenario" / "claude" / "2026-04-20T14-30-00"
|
||||
criteria = [{"criterion": "c1", "verdict": "pass", "evidence": "e", "rationale": "r"}]
|
||||
_write_verdict(scenario_dir / "verdict.json", criteria)
|
||||
_write_meta(scenario_dir / "meta.json")
|
||||
results = load_scenario_results(tmp_path / "test-scenario")
|
||||
assert "claude" in results
|
||||
assert results["claude"].total_runs == 1
|
||||
assert results["claude"].passed_runs == 1
|
||||
|
||||
def test_sweep_filter(self, tmp_path: Path) -> None:
|
||||
base = tmp_path / "test-scenario" / "claude"
|
||||
# Sweep A
|
||||
dir_a = base / "2026-04-20T14-30-00-aaaa1111"
|
||||
_write_run_group(
|
||||
dir_a / "run-group.json",
|
||||
n=1,
|
||||
runs=[{"index": 0, "status": "pass", "duration": 10.0}],
|
||||
sweep_id="aaaa1111",
|
||||
)
|
||||
criteria = [{"criterion": "c1", "verdict": "pass", "evidence": "e", "rationale": "r"}]
|
||||
_write_verdict(dir_a / "run-00" / "verdict.json", criteria)
|
||||
_write_meta(dir_a / "run-00" / "meta.json")
|
||||
# Sweep B
|
||||
dir_b = base / "2026-04-20T15-00-00-bbbb2222"
|
||||
_write_run_group(
|
||||
dir_b / "run-group.json",
|
||||
n=1,
|
||||
runs=[{"index": 0, "status": "fail", "duration": 10.0}],
|
||||
sweep_id="bbbb2222",
|
||||
)
|
||||
criteria_b = [{"criterion": "c1", "verdict": "fail", "evidence": "e", "rationale": "r"}]
|
||||
_write_verdict(dir_b / "run-00" / "verdict.json", criteria_b)
|
||||
_write_meta(dir_b / "run-00" / "meta.json")
|
||||
|
||||
results_a = load_scenario_results(tmp_path / "test-scenario", sweep_id="aaaa1111")
|
||||
assert results_a["claude"].passed_runs == 1
|
||||
results_b = load_scenario_results(tmp_path / "test-scenario", sweep_id="bbbb2222")
|
||||
assert results_b["claude"].passed_runs == 0
|
||||
|
||||
|
||||
class TestBackendResult:
|
||||
def test_pass_rate(self) -> None:
|
||||
br = BackendResult(
|
||||
backend="claude",
|
||||
total_runs=10,
|
||||
passed_runs=8,
|
||||
errored_runs=0,
|
||||
avg_turns=4.2,
|
||||
criterion_counts={"c1": (10, 10), "c2": (8, 10)},
|
||||
sweep_id="abc12345",
|
||||
timestamp="2026-04-20T14-30-00",
|
||||
partial=False,
|
||||
)
|
||||
assert br.pass_rate == 0.8
|
||||
|
||||
def test_pass_rate_zero_runs(self) -> None:
|
||||
br = BackendResult(
|
||||
backend="claude",
|
||||
total_runs=0,
|
||||
passed_runs=0,
|
||||
errored_runs=0,
|
||||
avg_turns=0.0,
|
||||
criterion_counts={},
|
||||
sweep_id=None,
|
||||
timestamp=None,
|
||||
partial=False,
|
||||
)
|
||||
assert br.pass_rate == 0.0
|
||||
|
||||
|
||||
def _make_backend_result(
|
||||
backend: str = "claude",
|
||||
total_runs: int = 10,
|
||||
passed_runs: int = 8,
|
||||
errored_runs: int = 0,
|
||||
avg_turns: float = 4.2,
|
||||
criterion_counts: dict[str, tuple[int, int]] | None = None,
|
||||
sweep_id: str | None = "abc12345",
|
||||
timestamp: str | None = "2026-04-20T14-30-00",
|
||||
partial: bool = False,
|
||||
) -> BackendResult:
|
||||
return BackendResult(
|
||||
backend=backend,
|
||||
total_runs=total_runs,
|
||||
passed_runs=passed_runs,
|
||||
errored_runs=errored_runs,
|
||||
avg_turns=avg_turns,
|
||||
criterion_counts=criterion_counts or {"c1": (passed_runs, total_runs)},
|
||||
sweep_id=sweep_id,
|
||||
timestamp=timestamp,
|
||||
partial=partial,
|
||||
)
|
||||
|
||||
|
||||
class TestFormatCompareOutput:
|
||||
def test_no_results(self) -> None:
|
||||
output = format_compare_output("test", {})
|
||||
assert "No results found" in output
|
||||
|
||||
def test_multi_run_includes_pass_rate_and_ci(self) -> None:
|
||||
results = {"claude": _make_backend_result(total_runs=10, passed_runs=8)}
|
||||
output = format_compare_output("test", results)
|
||||
assert "Overall pass rate" in output
|
||||
assert "95% CI" in output
|
||||
assert "80.0%" in output
|
||||
|
||||
def test_multi_run_sweep_header_includes_date(self) -> None:
|
||||
results = {"claude": _make_backend_result()}
|
||||
output = format_compare_output("test", results)
|
||||
assert "Sweep: abc12345 | 2026-04-20" in output
|
||||
|
||||
def test_single_run_simple_table(self) -> None:
|
||||
results = {
|
||||
"claude": _make_backend_result(
|
||||
total_runs=1,
|
||||
passed_runs=1,
|
||||
criterion_counts={"c1": (1, 1)},
|
||||
)
|
||||
}
|
||||
output = format_compare_output("test", results)
|
||||
assert "PASS" in output
|
||||
assert "Overall pass rate" not in output
|
||||
|
||||
def test_partial_warning(self) -> None:
|
||||
results = {"claude": _make_backend_result(partial=True)}
|
||||
output = format_compare_output("test", results)
|
||||
assert "incomplete" in output.lower() or "interrupted" in output.lower()
|
||||
|
||||
def test_small_n_note(self) -> None:
|
||||
results = {"claude": _make_backend_result(total_runs=5, passed_runs=3)}
|
||||
output = format_compare_output("test", results)
|
||||
assert "--n 10+" in output
|
||||
94
evals/tests/test_e2e.py
Normal file
94
evals/tests/test_e2e.py
Normal file
@@ -0,0 +1,94 @@
|
||||
"""End-to-end smoke test using a mock 'bash' backend."""
|
||||
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from drill.engine import Engine, ScenarioConfig
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_scenario(tmp_path):
|
||||
scenario = tmp_path / "test-scenario.yaml"
|
||||
scenario.write_text("""
|
||||
scenario: e2e-smoke-test
|
||||
description: "Smoke test"
|
||||
user_posture: naive
|
||||
setup:
|
||||
helpers:
|
||||
- create_base_repo
|
||||
assertions:
|
||||
- "git rev-parse --is-inside-work-tree"
|
||||
turns:
|
||||
- intent: "List files in the current directory"
|
||||
limits:
|
||||
max_turns: 3
|
||||
turn_timeout: 10
|
||||
verify:
|
||||
criteria:
|
||||
- "Agent listed the files"
|
||||
observe: true
|
||||
""")
|
||||
return scenario
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_backend(tmp_path):
|
||||
backend_dir = tmp_path / "backends"
|
||||
backend_dir.mkdir()
|
||||
(backend_dir / "mock.yaml").write_text("""
|
||||
name: mock
|
||||
cli: bash
|
||||
args: []
|
||||
required_env: []
|
||||
hooks:
|
||||
pre_run: []
|
||||
post_run: []
|
||||
shutdown: "exit"
|
||||
idle:
|
||||
quiescence_seconds: 1
|
||||
ready_pattern: "\\\\$"
|
||||
startup_timeout: 5
|
||||
terminal:
|
||||
cols: 80
|
||||
rows: 24
|
||||
session_logs:
|
||||
pattern: ""
|
||||
""")
|
||||
return backend_dir
|
||||
|
||||
|
||||
class TestE2ESmoke:
|
||||
def test_scenario_config_loads(self, mock_scenario):
|
||||
config = ScenarioConfig.from_yaml(mock_scenario)
|
||||
assert config.scenario == "e2e-smoke-test"
|
||||
|
||||
def test_engine_setup_works(self, mock_scenario, mock_backend):
|
||||
fixtures_dir = Path(__file__).parent.parent / "fixtures"
|
||||
engine = Engine(
|
||||
scenario_path=mock_scenario,
|
||||
backend_name="mock",
|
||||
backends_dir=mock_backend,
|
||||
fixtures_dir=fixtures_dir,
|
||||
results_dir=Path("/tmp/drill-test-results"),
|
||||
)
|
||||
workdir = Path("/tmp/drill-e2e-smoke")
|
||||
if workdir.exists():
|
||||
shutil.rmtree(workdir)
|
||||
engine._setup(workdir)
|
||||
assert (workdir / "package.json").exists()
|
||||
assert (workdir / "src" / "index.js").exists()
|
||||
# Verify git state
|
||||
import subprocess
|
||||
|
||||
result = subprocess.run(
|
||||
["git", "branch", "--show-current"], cwd=workdir, capture_output=True, text=True
|
||||
)
|
||||
assert result.stdout.strip() == "main"
|
||||
result = subprocess.run(
|
||||
["git", "log", "--oneline"], cwd=workdir, capture_output=True, text=True
|
||||
)
|
||||
assert "initial commit" in result.stdout
|
||||
# Cleanup
|
||||
shutil.rmtree(workdir, ignore_errors=True)
|
||||
173
evals/tests/test_engine.py
Normal file
173
evals/tests/test_engine.py
Normal file
@@ -0,0 +1,173 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from drill.engine import RunResult, ScenarioConfig, VerifyConfig, snapshot_filesystem
|
||||
|
||||
|
||||
class TestVerifyConfig:
|
||||
def test_defaults(self):
|
||||
vc = VerifyConfig()
|
||||
assert vc.criteria == []
|
||||
assert vc.assertions == []
|
||||
assert vc.observe is False
|
||||
|
||||
def test_from_dict(self):
|
||||
vc = VerifyConfig(
|
||||
criteria=["test criterion"],
|
||||
assertions=["tool-called Read"],
|
||||
observe=True,
|
||||
)
|
||||
assert len(vc.criteria) == 1
|
||||
assert len(vc.assertions) == 1
|
||||
assert vc.observe is True
|
||||
|
||||
|
||||
class TestScenarioConfig:
|
||||
def test_loads_from_yaml(self, tmp_path):
|
||||
scenario_file = tmp_path / "test.yaml"
|
||||
scenario_file.write_text("""
|
||||
scenario: test-scenario
|
||||
description: "A test"
|
||||
user_posture: naive
|
||||
setup:
|
||||
helpers:
|
||||
- create_base_repo
|
||||
assertions:
|
||||
- "git rev-parse --is-inside-work-tree"
|
||||
turns:
|
||||
- intent: "Do the thing"
|
||||
limits:
|
||||
max_turns: 10
|
||||
turn_timeout: 60
|
||||
verify:
|
||||
criteria:
|
||||
- "Thing was done"
|
||||
assertions:
|
||||
- "tool-called Bash"
|
||||
observe: true
|
||||
""")
|
||||
config = ScenarioConfig.from_yaml(scenario_file)
|
||||
assert config.scenario == "test-scenario"
|
||||
assert config.user_posture == "naive"
|
||||
assert config.limits["max_turns"] == 10
|
||||
assert len(config.turns) == 1
|
||||
assert len(config.verify.criteria) == 1
|
||||
assert len(config.verify.assertions) == 1
|
||||
assert config.verify.observe is True
|
||||
|
||||
def test_loads_without_assertions(self, tmp_path):
|
||||
scenario_file = tmp_path / "test.yaml"
|
||||
scenario_file.write_text("""
|
||||
scenario: minimal
|
||||
verify:
|
||||
criteria:
|
||||
- "Something happened"
|
||||
""")
|
||||
config = ScenarioConfig.from_yaml(scenario_file)
|
||||
assert config.verify.assertions == []
|
||||
assert config.verify.observe is False
|
||||
|
||||
def test_loads_without_verify(self, tmp_path):
|
||||
scenario_file = tmp_path / "test.yaml"
|
||||
scenario_file.write_text("""
|
||||
scenario: bare-minimum
|
||||
""")
|
||||
config = ScenarioConfig.from_yaml(scenario_file)
|
||||
assert config.verify.criteria == []
|
||||
assert config.verify.assertions == []
|
||||
|
||||
|
||||
class TestSnapshotFilesystem:
|
||||
def test_captures_git_state(self, tmp_path):
|
||||
subprocess.run(["git", "init", "-b", "main"], cwd=tmp_path, capture_output=True)
|
||||
subprocess.run(
|
||||
["git", "commit", "--allow-empty", "-m", "init"], cwd=tmp_path, capture_output=True
|
||||
)
|
||||
snapshot = snapshot_filesystem(tmp_path)
|
||||
data = json.loads(snapshot)
|
||||
assert "git_status" in data
|
||||
assert "branch" in data
|
||||
assert "worktree_list" in data
|
||||
assert "files" in data
|
||||
|
||||
|
||||
class TestRunResult:
|
||||
def test_serializes_to_dir(self, tmp_path):
|
||||
result = RunResult(
|
||||
scenario="test",
|
||||
backend="claude",
|
||||
timestamp="2026-04-07T14-30-00",
|
||||
session_log="session output here",
|
||||
filesystem_json='{"files": []}',
|
||||
tool_calls_jsonl='{"tool": "Bash"}\n',
|
||||
verdict_json='{"criteria": [], "observations": [], "summary": "ok"}',
|
||||
meta={"backend": "claude", "duration_seconds": 42, "actor_turns": 5},
|
||||
)
|
||||
result.save(tmp_path)
|
||||
assert (tmp_path / "session.log").read_text() == "session output here"
|
||||
assert (tmp_path / "filesystem.json").exists()
|
||||
assert (tmp_path / "tool_calls.jsonl").exists()
|
||||
assert (tmp_path / "verdict.json").exists()
|
||||
assert (tmp_path / "meta.json").exists()
|
||||
|
||||
|
||||
class TestEngineAssertionIntegration:
|
||||
def test_run_result_save_splits_artifacts_and_verdict(self, tmp_path):
|
||||
result = RunResult(
|
||||
scenario="test",
|
||||
backend="claude",
|
||||
timestamp="2026-04-20T10-00-00",
|
||||
session_log="log here",
|
||||
filesystem_json='{"files": []}',
|
||||
tool_calls_jsonl='{"tool": "Bash"}\n',
|
||||
verdict_json='{"criteria": [], "observations": [], "summary": "ok"}',
|
||||
meta={"backend": "claude"},
|
||||
)
|
||||
result.save_artifacts(tmp_path)
|
||||
assert (tmp_path / "session.log").exists()
|
||||
assert (tmp_path / "filesystem.json").exists()
|
||||
assert (tmp_path / "tool_calls.jsonl").exists()
|
||||
assert not (tmp_path / "verdict.json").exists()
|
||||
assert not (tmp_path / "meta.json").exists()
|
||||
|
||||
result.save_verdict(tmp_path)
|
||||
assert (tmp_path / "verdict.json").exists()
|
||||
assert (tmp_path / "meta.json").exists()
|
||||
|
||||
|
||||
class TestEngineRunParams:
    """RunResult.save must accept arbitrary, possibly deeply nested, output dirs."""

    @staticmethod
    def _make_result() -> RunResult:
        # Minimal valid RunResult; the content is irrelevant to path handling.
        return RunResult(
            scenario="test",
            backend="claude",
            timestamp="2026-04-20T10-00-00",
            session_log="log",
            filesystem_json='{"files": []}',
            tool_calls_jsonl='{"tool": "Bash"}\n',
            verdict_json='{"criteria": [], "observations": [], "summary": "ok"}',
            meta={"backend": "claude"},
        )

    def test_run_result_uses_custom_output_dir(self, tmp_path: Path) -> None:
        """Saving to a caller-chosen directory writes all files there."""
        target = tmp_path / "custom" / "run-00"
        self._make_result().save(target)
        assert (target / "session.log").read_text() == "log"
        assert (target / "verdict.json").exists()
        assert (target / "meta.json").exists()

    def test_run_result_nested_dir_created(self, tmp_path: Path) -> None:
        """save creates intermediate directories that do not yet exist."""
        target = tmp_path / "a" / "b" / "c" / "run-05"
        self._make_result().save(target)
        assert target.exists()
        assert (target / "session.log").exists()
|
||||
126
evals/tests/test_helpers.py
Normal file
126
evals/tests/test_helpers.py
Normal file
@@ -0,0 +1,126 @@
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
BIN_DIR = Path(__file__).parent.parent / "bin"
|
||||
FIXTURES_DIR = Path(__file__).parent / "fixtures"
|
||||
|
||||
|
||||
def run_helper(name: str, args: list[str], cwd: Path) -> subprocess.CompletedProcess[str]:
    """Invoke an assertion helper from bin/ with *args*, capturing text output.

    The helper inspects files (e.g. tool_calls.jsonl) relative to *cwd*.
    """
    command = [str(BIN_DIR / name), *args]
    return subprocess.run(command, cwd=cwd, capture_output=True, text=True)
|
||||
|
||||
|
||||
class TestToolCalled:
    """bin/tool-called exits 0 iff the named tool appears in tool_calls.jsonl."""

    def _seed(self, tmp_path, text=None):
        # Write the tool-call log the helper will inspect in its cwd.
        if text is None:
            text = (FIXTURES_DIR / "tools_multi.jsonl").read_text()
        (tmp_path / "tool_calls.jsonl").write_text(text)

    def test_tool_present(self, tmp_path):
        self._seed(tmp_path)
        proc = run_helper("tool-called", ["Read"], tmp_path)
        assert proc.returncode == 0

    def test_tool_absent(self, tmp_path):
        self._seed(tmp_path)
        proc = run_helper("tool-called", ["Write"], tmp_path)
        assert proc.returncode == 1
        assert "FAIL" in proc.stdout

    def test_empty_jsonl(self, tmp_path):
        # No recorded calls at all → the tool was certainly not called.
        self._seed(tmp_path, text="")
        proc = run_helper("tool-called", ["Read"], tmp_path)
        assert proc.returncode == 1
|
||||
|
||||
|
||||
class TestToolNotCalled:
    """bin/tool-not-called is the inverse check: success when the tool is absent."""

    def _seed(self, tmp_path, text=None):
        if text is None:
            text = (FIXTURES_DIR / "tools_multi.jsonl").read_text()
        (tmp_path / "tool_calls.jsonl").write_text(text)

    def test_tool_absent(self, tmp_path):
        self._seed(tmp_path)
        proc = run_helper("tool-not-called", ["Write"], tmp_path)
        assert proc.returncode == 0

    def test_tool_present(self, tmp_path):
        self._seed(tmp_path)
        proc = run_helper("tool-not-called", ["Read"], tmp_path)
        assert proc.returncode == 1
        assert "FAIL" in proc.stdout

    def test_empty_jsonl(self, tmp_path):
        # An empty log trivially satisfies "tool not called".
        self._seed(tmp_path, text="")
        proc = run_helper("tool-not-called", ["Read"], tmp_path)
        assert proc.returncode == 0
|
||||
|
||||
|
||||
class TestToolCount:
    """bin/tool-count compares the call count of a tool against a threshold."""

    def _seed(self, tmp_path):
        text = (FIXTURES_DIR / "tools_multi.jsonl").read_text()
        (tmp_path / "tool_calls.jsonl").write_text(text)

    def test_gte_passes(self, tmp_path):
        # Fixture contains exactly two Read calls, so gte 2 holds.
        self._seed(tmp_path)
        proc = run_helper("tool-count", ["Read", "gte", "2"], tmp_path)
        assert proc.returncode == 0

    def test_gte_fails(self, tmp_path):
        self._seed(tmp_path)
        proc = run_helper("tool-count", ["Read", "gte", "5"], tmp_path)
        assert proc.returncode == 1
        assert "FAIL" in proc.stdout

    def test_eq(self, tmp_path):
        self._seed(tmp_path)
        proc = run_helper("tool-count", ["Read", "eq", "2"], tmp_path)
        assert proc.returncode == 0

    def test_lt(self, tmp_path):
        self._seed(tmp_path)
        proc = run_helper("tool-count", ["Read", "lt", "3"], tmp_path)
        assert proc.returncode == 0
|
||||
|
||||
|
||||
class TestToolBefore:
    """bin/tool-before checks that the first tool's call precedes the second's."""

    def _seed(self, tmp_path):
        text = (FIXTURES_DIR / "tools_ordered.jsonl").read_text()
        (tmp_path / "tool_calls.jsonl").write_text(text)

    def test_correct_order(self, tmp_path):
        self._seed(tmp_path)
        proc = run_helper("tool-before", ["Read", "Edit"], tmp_path)
        assert proc.returncode == 0

    def test_wrong_order(self, tmp_path):
        self._seed(tmp_path)
        proc = run_helper("tool-before", ["Edit", "EnterWorktree"], tmp_path)
        assert proc.returncode == 1
        assert "FAIL" in proc.stdout

    def test_first_tool_missing(self, tmp_path):
        # The helper must distinguish "never called" from "wrong order".
        self._seed(tmp_path)
        proc = run_helper("tool-before", ["Write", "Read"], tmp_path)
        assert proc.returncode == 1
        assert "never called" in proc.stdout

    def test_second_tool_missing(self, tmp_path):
        self._seed(tmp_path)
        proc = run_helper("tool-before", ["Read", "Write"], tmp_path)
        assert proc.returncode == 1
        assert "never called" in proc.stdout
|
||||
|
||||
|
||||
class TestToolArgMatch:
    """bin/tool-arg-match applies a jq-style predicate to a tool's recorded args."""

    def _seed(self, tmp_path):
        text = (FIXTURES_DIR / "tools_multi.jsonl").read_text()
        (tmp_path / "tool_calls.jsonl").write_text(text)

    def test_matching_arg(self, tmp_path):
        self._seed(tmp_path)
        proc = run_helper(
            "tool-arg-match", ["Skill", '.skill == "superpowers:worktree"'], tmp_path
        )
        assert proc.returncode == 0

    def test_no_matching_arg(self, tmp_path):
        self._seed(tmp_path)
        proc = run_helper("tool-arg-match", ["Skill", '.skill == "nonexistent"'], tmp_path)
        assert proc.returncode == 1
        assert "FAIL" in proc.stdout

    def test_tool_not_present(self, tmp_path):
        # Predicate never evaluated when the tool itself was not called.
        self._seed(tmp_path)
        proc = run_helper("tool-arg-match", ["Write", '.file_path == "/tmp/foo"'], tmp_path)
        assert proc.returncode == 1
|
||||
179
evals/tests/test_normalizer.py
Normal file
179
evals/tests/test_normalizer.py
Normal file
@@ -0,0 +1,179 @@
|
||||
import json
|
||||
|
||||
from drill.normalizer import (
|
||||
collect_new_logs,
|
||||
filter_codex_logs_by_cwd,
|
||||
normalize_claude_logs,
|
||||
normalize_codex_logs,
|
||||
normalize_gemini_logs,
|
||||
snapshot_log_dir,
|
||||
)
|
||||
|
||||
|
||||
class TestSnapshotAndCollect:
    """snapshot_log_dir + collect_new_logs find files created after the snapshot."""

    def test_snapshot_and_collect_new_files(self, tmp_path):
        log_dir = tmp_path / "logs"
        log_dir.mkdir()
        (log_dir / "old.jsonl").write_text('{"old": true}\n')
        before = snapshot_log_dir(log_dir)
        # A file created after the snapshot must be reported as new.
        (log_dir / "new.jsonl").write_text('{"new": true}\n')
        fresh = collect_new_logs(log_dir, before)
        assert [p.name for p in fresh] == ["new.jsonl"]

    def test_empty_dir_returns_empty(self, tmp_path):
        log_dir = tmp_path / "logs"
        log_dir.mkdir()
        before = snapshot_log_dir(log_dir)
        assert collect_new_logs(log_dir, before) == []
|
||||
|
||||
|
||||
class TestNormalizeClaudeLogs:
    """Claude session JSONL is normalized to {tool, args, source} records."""

    def test_normalizes_tool_use(self):
        events = [
            {"type": "tool_use", "name": "EnterWorktree", "input": {"branch": "add-login"}},
            {"type": "tool_use", "name": "Bash", "input": {"command": "git status"}},
            # Plain text events carry no tool call and must be dropped.
            {"type": "text", "text": "I'll create a worktree"},
        ]
        raw = "\n".join(json.dumps(event) for event in events)

        normalized = normalize_claude_logs(raw)

        assert len(normalized) == 2
        assert normalized[0]["tool"] == "EnterWorktree"
        assert normalized[0]["source"] == "native"
        # Bash invocations are classified as shell, not native tool use.
        assert normalized[1]["tool"] == "Bash"
        assert normalized[1]["source"] == "shell"
|
||||
|
||||
|
||||
class TestNormalizeCodexLogs:
    """Codex rollout JSONL is normalized into the common tool-call schema."""

    def test_normalizes_local_shell_call(self):
        shell_event = {
            "type": "response_item",
            "item": {
                "type": "local_shell_call",
                "action": {"command": ["git", "worktree", "add", "feature"]},
                "status": "completed",
            },
        }
        # Assistant messages carry no tool call and must be ignored.
        message_event = {
            "type": "response_item",
            "item": {"type": "message", "content": [{"text": "Creating worktree"}]},
        }
        raw = "\n".join(json.dumps(event) for event in (shell_event, message_event))

        normalized = normalize_codex_logs(raw)

        assert len(normalized) == 1
        assert normalized[0]["tool"] == "Bash"
        assert "git worktree add" in normalized[0]["args"]["command"]
        assert normalized[0]["source"] == "shell"

    def test_filter_by_cwd_keeps_matching_drops_others(self, tmp_path):
        """Only rollout files whose session_meta cwd matches the target survive."""
        target = "/private/tmp/drill-target"

        def write_session(path, meta):
            path.write_text(json.dumps(meta) + "\n")

        match = tmp_path / "match.jsonl"
        write_session(
            match,
            {"type": "session_meta", "payload": {"id": "abc", "cwd": target}},
        )
        other = tmp_path / "other.jsonl"
        write_session(
            other,
            {
                "type": "session_meta",
                "payload": {"id": "def", "cwd": "/private/tmp/drill-other"},
            },
        )
        # Edge cases: a file with no session_meta line and a fully empty file.
        no_meta = tmp_path / "no-meta.jsonl"
        no_meta.write_text(json.dumps({"type": "response_item", "payload": {}}) + "\n")
        empty = tmp_path / "empty.jsonl"
        empty.write_text("")

        kept = filter_codex_logs_by_cwd([match, other, no_meta, empty], target)
        assert kept == [match]

    def test_normalizes_function_call_with_payload(self):
        """Test the actual codex rollout format using payload instead of item."""
        exec_event = {
            "type": "response_item",
            "payload": {
                "type": "function_call",
                "name": "exec_command",
                "arguments": '{"cmd":"git worktree add .worktrees/feature","workdir":"/tmp/test"}',
                "call_id": "call_123",
            },
        }
        patch_event = {
            "type": "response_item",
            "payload": {
                "type": "function_call",
                "name": "apply_patch",
                "arguments": '{"patch":"--- a/file\\n+++ b/file"}',
                "call_id": "call_456",
            },
        }
        raw = "\n".join(json.dumps(event) for event in (exec_event, patch_event))

        normalized = normalize_codex_logs(raw)

        assert len(normalized) == 2
        # exec_command maps to a shell Bash call...
        assert normalized[0]["tool"] == "Bash"
        assert "git worktree add" in normalized[0]["args"]["command"]
        assert normalized[0]["source"] == "shell"
        # ...while apply_patch maps to a native Edit.
        assert normalized[1]["tool"] == "Edit"
        assert normalized[1]["source"] == "native"
|
||||
|
||||
|
||||
class TestNormalizeGeminiLogs:
    """Gemini session JSONL toolCalls entries map onto the common schema."""

    def test_normalizes_jsonl_tool_calls(self):
        events = [
            # Header line without toolCalls: must be skipped.
            {"kind": "main"},
            {
                "type": "gemini",
                "content": "Reading file",
                "toolCalls": [
                    {
                        "id": "read_file_1",
                        "name": "read_file",
                        "args": {"file_path": "GEMINI.md"},
                        "status": "success",
                    }
                ],
            },
            {
                "type": "gemini",
                "content": "Running command",
                "toolCalls": [
                    {
                        "id": "shell_1",
                        "name": "run_shell_command",
                        "args": {"command": "git status"},
                        "status": "success",
                    }
                ],
            },
        ]
        raw = "\n".join(json.dumps(event) for event in events)

        normalized = normalize_gemini_logs(raw)

        # read_file → native Read; run_shell_command → shell Bash.
        assert normalized == [
            {"tool": "Read", "args": {"file_path": "GEMINI.md"}, "source": "native"},
            {"tool": "Bash", "args": {"command": "git status"}, "source": "shell"},
        ]
|
||||
94
evals/tests/test_session.py
Normal file
94
evals/tests/test_session.py
Normal file
@@ -0,0 +1,94 @@
|
||||
import subprocess
|
||||
import time
|
||||
from unittest.mock import call, patch
|
||||
|
||||
from drill.session import TmuxSession
|
||||
|
||||
|
||||
class TestTmuxSession:
    """Live tmux integration tests for TmuxSession (require a local tmux)."""

    @staticmethod
    def _has_session(name: str) -> bool:
        # `tmux has-session` exits 0 when the session exists.
        probe = subprocess.run(["tmux", "has-session", "-t", name], capture_output=True)
        return probe.returncode == 0

    def test_create_and_kill(self):
        session = TmuxSession(name="drill-test-create", cols=80, rows=24)
        session.create()
        assert self._has_session("drill-test-create")
        session.kill()
        assert not self._has_session("drill-test-create")

    def test_send_keys_and_capture(self):
        session = TmuxSession(name="drill-test-keys", cols=80, rows=24)
        session.create()
        try:
            session.send_keys("echo hello-drill-test")
            time.sleep(0.5)  # give the shell a moment to echo
            assert "hello-drill-test" in session.capture()
        finally:
            session.kill()

    def test_send_keys_pastes_text_then_submits(self):
        """send_keys must paste via a named buffer (shell-metachar safe), then Enter."""
        session = TmuxSession(name="drill-test-command-shape")

        with (
            patch("drill.session.subprocess.run") as run,
            patch("drill.session.time.sleep") as sleep,
        ):
            session.send_keys("hello `weird` text")

        assert run.call_args_list == [
            call(
                [
                    "tmux",
                    "set-buffer",
                    "-b",
                    "drill-test-command-shape-input",
                    "hello `weird` text",
                ],
                check=True,
            ),
            call(
                [
                    "tmux",
                    "paste-buffer",
                    "-d",
                    "-b",
                    "drill-test-command-shape-input",
                    "-t",
                    "drill-test-command-shape",
                ],
                check=True,
            ),
            call(["tmux", "send-keys", "-t", "drill-test-command-shape", "Enter"], check=True),
        ]
        sleep.assert_called_once_with(0.1)

    def test_launch_command(self, tmp_path):
        session = TmuxSession(name="drill-test-launch", cols=80, rows=24)
        session.create()
        try:
            session.launch(["python3", "-c", "import time; time.sleep(30)"], cwd=str(tmp_path))
            time.sleep(0.5)
            assert session.is_process_alive()
        finally:
            session.kill()

    def test_send_special_key(self, tmp_path):
        """ctrl-c must interrupt a foreground process so the shell is usable again."""
        session = TmuxSession(name="drill-test-special", cols=80, rows=24)
        proof_file = tmp_path / "after-ctrl-c"
        session.create()
        try:
            session.send_keys("cat")  # blocks the shell until interrupted
            time.sleep(0.3)
            session.send_special_key("ctrl-c")
            time.sleep(0.3)
            # If the interrupt worked, this command reaches the shell.
            session.send_keys(f"touch {proof_file}")
            time.sleep(0.3)
            assert proof_file.exists()
        finally:
            session.kill()
|
||||
168
evals/tests/test_setup.py
Normal file
168
evals/tests/test_setup.py
Normal file
@@ -0,0 +1,168 @@
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from unittest.mock import call, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from drill.setup import clone_template, run_assertions
|
||||
from setup_helpers.base import create_base_repo
|
||||
from setup_helpers.worktree import (
|
||||
add_worktree,
|
||||
create_caller_consent_plan,
|
||||
detach_head,
|
||||
link_gemini_extension,
|
||||
symlink_superpowers,
|
||||
)
|
||||
from setup_helpers.spec_writing_blind_spot import create_spec_writing_blind_spot
|
||||
|
||||
|
||||
@pytest.fixture
def fixtures_dir():
    """Path to the shared evals fixtures directory (sibling of tests/)."""
    here = Path(__file__).parent
    return here.parent / "fixtures"
|
||||
|
||||
|
||||
@pytest.fixture
def work_dir(tmp_path):
    """Per-test scratch repo path (not created yet; helpers create it)."""
    return tmp_path / "test-repo"
|
||||
|
||||
|
||||
class TestCloneTemplate:
    """clone_template copies the template repo including its git history."""

    def test_clones_template_repo(self, fixtures_dir, work_dir):
        clone_template(fixtures_dir / "template-repo", work_dir)
        assert (work_dir / "package.json").exists()
        assert (work_dir / "src" / "index.js").exists()
        # The clone must carry over the template's commit history.
        log = subprocess.run(
            ["git", "log", "--oneline"],
            cwd=work_dir,
            capture_output=True,
            text=True,
        )
        assert "initial commit" in log.stdout
|
||||
|
||||
|
||||
class TestCreateBaseRepo:
    """create_base_repo produces a checkout on the main branch."""

    def test_creates_base_repo(self, fixtures_dir, work_dir):
        create_base_repo(work_dir, fixtures_dir / "template-repo")
        assert (work_dir / "package.json").exists()
        branch = subprocess.run(
            ["git", "branch", "--show-current"],
            cwd=work_dir,
            capture_output=True,
            text=True,
        )
        assert branch.stdout.strip() == "main"
|
||||
|
||||
|
||||
class TestWorktreeHelpers:
    """Scenario-setup helpers that manipulate git worktrees and tool links."""

    @staticmethod
    def _git(args, cwd):
        # Small wrapper for read-only git probes used by these tests.
        return subprocess.run(["git", *args], cwd=cwd, capture_output=True, text=True)

    def test_add_worktree(self, fixtures_dir, work_dir):
        create_base_repo(work_dir, fixtures_dir / "template-repo")
        wt_path = work_dir.parent / "feature-wt"
        add_worktree(work_dir, "feature-branch", str(wt_path))
        assert wt_path.exists()
        assert "feature-branch" in self._git(["worktree", "list"], work_dir).stdout

    def test_detach_head(self, fixtures_dir, work_dir):
        create_base_repo(work_dir, fixtures_dir / "template-repo")
        wt_path = work_dir.parent / "detached-wt"
        add_worktree(work_dir, "tmp-branch", str(wt_path))
        detach_head(str(wt_path))
        # A detached HEAD reports no current branch.
        assert self._git(["branch", "--show-current"], wt_path).stdout.strip() == ""

    def test_symlink_superpowers(self, fixtures_dir, work_dir, tmp_path):
        create_base_repo(work_dir, fixtures_dir / "template-repo")
        fake_sp = tmp_path / "superpowers" / "skills"
        fake_sp.mkdir(parents=True)
        symlink_superpowers(work_dir, str(tmp_path / "superpowers"))
        link = work_dir / ".agents" / "skills" / "superpowers"
        assert link.is_symlink()

    def test_link_gemini_extension_relinks_requested_root(self, work_dir, tmp_path):
        """Relink must uninstall the extension by its declared name, then link + consent."""
        work_dir.mkdir()
        fake_sp = tmp_path / "superpowers"
        (fake_sp / "skills" / "using-superpowers" / "references").mkdir(parents=True)
        (fake_sp / "gemini-extension.json").write_text('{"name": "custom-superpowers"}')

        with patch("setup_helpers.worktree.subprocess.run") as run:
            link_gemini_extension(work_dir, str(fake_sp))

        assert run.call_args_list == [
            call(["gemini", "extensions", "uninstall", "custom-superpowers"], capture_output=True),
            call(
                ["gemini", "extensions", "link", str(fake_sp)],
                capture_output=True,
                input="y\n",
                text=True,
                check=True,
            ),
        ]
        # GEMINI.md must point Gemini at the skill entry docs of the linked root.
        assert (work_dir / "GEMINI.md").read_text() == (
            f"@{fake_sp}/skills/using-superpowers/SKILL.md\n"
            f"@{fake_sp}/skills/using-superpowers/references/gemini-tools.md\n"
        )

    def test_create_caller_consent_plan(self, fixtures_dir, work_dir):
        create_base_repo(work_dir, fixtures_dir / "template-repo")
        create_caller_consent_plan(work_dir)

        plan = work_dir / "docs" / "superpowers" / "plans" / "custom-greeting.md"
        assert plan.exists()
        assert "REQUIRED SUB-SKILL" in plan.read_text()

        # The plan must be committed, leaving the working tree clean.
        assert self._git(["status", "--short"], work_dir).stdout.strip() == ""
|
||||
|
||||
|
||||
class TestSpecWritingBlindSpot:
    """The spec-writing blind-spot scenario builds a partial React-style repo."""

    def test_creates_repo_structure(self, tmp_path):
        workdir = tmp_path / "blind-spot-repo"
        create_spec_writing_blind_spot(workdir)

        components = workdir / "src" / "components"
        assert (components / "AdminPanel.tsx").exists()
        assert (components / "TeamOverview.tsx").exists()
        assert (workdir / "src" / "router.tsx").exists()
        assert (workdir / "CLAUDE.md").exists()
        # The blind spot: this component is deliberately missing.
        assert not (components / "ActivityFeed.tsx").exists()

        branch = subprocess.run(
            ["git", "branch", "--show-current"],
            cwd=workdir, capture_output=True, text=True,
        )
        assert branch.stdout.strip() == "main"

        # The scenario seeds a non-trivial history (at least a few commits).
        log = subprocess.run(
            ["git", "log", "--oneline"],
            cwd=workdir, capture_output=True, text=True,
        )
        assert log.stdout.count("\n") >= 3
|
||||
|
||||
|
||||
class TestRunAssertions:
    """run_assertions executes shell checks and raises on the first failure."""

    def test_passing_assertions(self, fixtures_dir, work_dir):
        create_base_repo(work_dir, fixtures_dir / "template-repo")
        checks = [
            "git rev-parse --is-inside-work-tree",
            "git branch --show-current | grep main",
        ]
        run_assertions(checks, work_dir)  # must not raise

    def test_failing_assertion_raises(self, fixtures_dir, work_dir):
        create_base_repo(work_dir, fixtures_dir / "template-repo")
        with pytest.raises(AssertionError, match="Setup assertion failed"):
            run_assertions(["git branch --show-current | grep nonexistent"], work_dir)
|
||||
54
evals/tests/test_stats.py
Normal file
54
evals/tests/test_stats.py
Normal file
@@ -0,0 +1,54 @@
|
||||
"""Tests for Wilson score confidence interval."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from drill.stats import wilson_ci
|
||||
|
||||
|
||||
class TestWilsonCI:
    """Bounds and edge-case behavior of the Wilson score interval."""

    def test_all_pass(self) -> None:
        low, high = wilson_ci(10, 10)
        # 10/10 still leaves real uncertainty on the lower bound.
        assert low > 0.69
        assert high == 1.0 or high > 0.99

    def test_all_fail(self) -> None:
        low, high = wilson_ci(0, 10)
        assert low < 0.01 or low == 0.0
        assert high < 0.31

    def test_half_pass(self) -> None:
        low, high = wilson_ci(5, 10)
        assert 0.18 < low < 0.25
        assert 0.75 < high < 0.82

    def test_zero_total(self) -> None:
        # Degenerate input: no runs at all collapses the interval to [0, 0].
        low, high = wilson_ci(0, 0)
        assert low == 0.0
        assert high == 0.0

    def test_single_pass(self) -> None:
        low, high = wilson_ci(1, 1)
        assert low > 0.0
        assert high <= 1.0

    def test_single_fail(self) -> None:
        low, high = wilson_ci(0, 1)
        assert low == 0.0 or low >= 0.0
        assert high < 1.0

    def test_large_sample(self) -> None:
        # 80/100 should give a reasonably tight interval around 0.8.
        low, high = wilson_ci(80, 100)
        assert 0.70 < low < 0.75
        assert 0.85 < high < 0.90

    def test_passed_greater_than_total_clamped(self) -> None:
        # Inconsistent input (passed > total) must be clamped, not crash.
        low, high = wilson_ci(12, 10)
        assert low > 0.0
        assert high <= 1.0

    def test_returns_tuple_of_floats(self) -> None:
        interval = wilson_ci(5, 10)
        assert isinstance(interval, tuple)
        assert len(interval) == 2
        assert isinstance(interval[0], float)
        assert isinstance(interval[1], float)
|
||||
202
evals/tests/test_sweep.py
Normal file
202
evals/tests/test_sweep.py
Normal file
@@ -0,0 +1,202 @@
|
||||
"""Tests for Sweep orchestrator."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from dataclasses import asdict
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
from drill.engine import Engine, RunResult
|
||||
from drill.sweep import RunGroup, RunStatus, Sweep, write_run_group
|
||||
|
||||
|
||||
class TestRunStatus:
    """RunStatus dataclass: defaults and dict serialization."""

    def test_pass_status(self) -> None:
        run = RunStatus(index=0, status="pass", duration=10.5)
        assert run.error is None  # error defaults to None for clean runs
        assert run.status == "pass"

    def test_error_status(self) -> None:
        run = RunStatus(index=2, status="error", duration=1.2, error="tmux crashed")
        assert run.error == "tmux crashed"

    def test_serializes_to_dict(self) -> None:
        serialized = asdict(RunStatus(index=0, status="pass", duration=10.5))
        assert serialized["index"] == 0
        assert serialized["status"] == "pass"
        assert serialized["duration"] == 10.5
        assert serialized["error"] is None
|
||||
|
||||
|
||||
class TestRunGroup:
    """RunGroup dataclass: the partial flag defaults off and can be set."""

    def test_creates_with_defaults(self) -> None:
        group = RunGroup(
            scenario="test",
            backend="claude",
            n=3,
            timestamp="2026-04-20T14-30-00",
            sweep_id="abc12345",
            runs=[],
        )
        assert group.partial is False

    def test_partial_flag(self) -> None:
        group = RunGroup(
            scenario="test",
            backend="claude",
            n=3,
            timestamp="2026-04-20T14-30-00",
            sweep_id="abc12345",
            runs=[RunStatus(index=0, status="pass", duration=10.0)],
            partial=True,
        )
        assert group.partial is True
        # n=3 requested but only one run recorded — hence partial.
        assert len(group.runs) == 1
|
||||
|
||||
|
||||
class TestWriteRunGroup:
    """write_run_group persists run-group.json with the expected shape."""

    @staticmethod
    def _load(tmp_path: Path) -> dict:
        return json.loads((tmp_path / "run-group.json").read_text())

    def test_writes_json(self, tmp_path: Path) -> None:
        group = RunGroup(
            scenario="test-scenario",
            backend="claude",
            n=2,
            timestamp="2026-04-20T14-30-00",
            sweep_id="abc12345",
            runs=[
                RunStatus(index=0, status="pass", duration=100.0),
                RunStatus(index=1, status="fail", duration=95.0),
            ],
        )
        write_run_group(group, tmp_path)
        assert (tmp_path / "run-group.json").exists()
        data = self._load(tmp_path)
        assert data["scenario"] == "test-scenario"
        assert data["sweep_id"] == "abc12345"
        assert data["partial"] is False
        assert len(data["runs"]) == 2
        assert data["runs"][0]["status"] == "pass"
        assert data["runs"][1]["status"] == "fail"

    def test_writes_partial(self, tmp_path: Path) -> None:
        group = RunGroup(
            scenario="test",
            backend="claude",
            n=5,
            timestamp="2026-04-20T14-30-00",
            sweep_id="abc12345",
            runs=[RunStatus(index=0, status="pass", duration=100.0)],
            partial=True,
        )
        write_run_group(group, tmp_path)
        data = self._load(tmp_path)
        assert data["partial"] is True
        assert len(data["runs"]) == 1

    def test_omits_null_errors(self, tmp_path: Path) -> None:
        """A None error must be dropped from the JSON, not written as null."""
        group = RunGroup(
            scenario="test",
            backend="claude",
            n=1,
            timestamp="2026-04-20T14-30-00",
            sweep_id="abc12345",
            runs=[RunStatus(index=0, status="pass", duration=50.0)],
        )
        write_run_group(group, tmp_path)
        assert "error" not in self._load(tmp_path)["runs"][0]
|
||||
|
||||
|
||||
class TestSweepIntegration:
    """End-to-end Sweep orchestration with a mocked Engine.run."""

    def test_full_sweep_writes_run_group(self, tmp_path: Path) -> None:
        """Test that Sweep creates run dirs and writes run-group.json."""
        # --- scenario definition -------------------------------------------
        scenario_file = tmp_path / "scenarios" / "test.yaml"
        scenario_file.parent.mkdir(parents=True)
        scenario_file.write_text(
            "scenario: test-scenario\n"
            "description: test\n"
            "user_posture: naive\n"
            "setup: {}\n"
            "turns:\n  - intent: do the thing\n"
            "limits:\n  max_turns: 5\n"
            "verify:\n  criteria:\n    - thing was done\n"
        )

        # --- backend definition (echo stand-in, never actually launched) ---
        backends_dir = tmp_path / "backends"
        backends_dir.mkdir()
        (backends_dir / "mock-backend.yaml").write_text(
            "name: mock-backend\n"
            "cli: echo\n"
            "args: []\n"
            "required_env: []\n"
            "hooks:\n  pre_run: []\n  post_run: []\n"
            "shutdown: /exit\n"
            "idle:\n  quiescence_seconds: 1\n  ready_pattern: '.'\n"
            "startup_timeout: 5\n"
            "terminal:\n  cols: 80\n  rows: 24\n"
            "session_logs: {}\n"
        )

        results_dir = tmp_path / "results"
        fixtures_dir = tmp_path / "fixtures"
        fixtures_dir.mkdir()

        # --- canned Engine output: a single passing criterion --------------
        fake_verdict = json.dumps(
            {
                "criteria": [
                    {
                        "criterion": "thing was done",
                        "verdict": "pass",
                        "evidence": "yes",
                        "rationale": "it was done",
                    }
                ],
                "observations": [],
                "summary": "ok",
            }
        )
        fake_result = RunResult(
            scenario="test-scenario",
            backend="mock-backend",
            timestamp="2026-04-20T14-30-00",
            session_log="log",
            filesystem_json='{"files": []}',
            tool_calls_jsonl='{"tool": "Bash"}',
            verdict_json=fake_verdict,
            meta={"actor_turns": 3},
        )

        sweep = Sweep(
            scenario_path=scenario_file,
            backend_names=["mock-backend"],
            backends_dir=backends_dir,
            fixtures_dir=fixtures_dir,
            results_dir=results_dir,
            n=3,
            sweep_id="test1234",
        )

        with patch.object(Engine, "run", return_value=fake_result):
            groups = sweep.run_all()

        # One backend → one group; n=3 → three passing runs, not partial.
        assert len(groups) == 1
        group = groups[0]
        assert group.scenario == "test-scenario"
        assert len(group.runs) == 3
        assert all(r.status == "pass" for r in group.runs)
        assert group.partial is False

        # Verify run-group.json was written under results/<scenario>/<backend>/.
        scenario_results = results_dir / "test-scenario" / "mock-backend"
        assert scenario_results.exists()
        group_dirs = list(scenario_results.iterdir())
        assert len(group_dirs) == 1
        rg_path = group_dirs[0] / "run-group.json"
        assert rg_path.exists()
        rg_data = json.loads(rg_path.read_text())
        assert rg_data["sweep_id"] == "test1234"
        assert len(rg_data["runs"]) == 3
|
||||
92
evals/tests/test_verifier.py
Normal file
92
evals/tests/test_verifier.py
Normal file
@@ -0,0 +1,92 @@
|
||||
from drill.verifier import CriterionResult, Verdict, Verifier
|
||||
|
||||
|
||||
class TestVerdict:
    """Verdict parsing, score string, and pass/fail aggregation."""

    @staticmethod
    def _verdict(criteria, observations, summary):
        return Verdict.model_validate(
            {"criteria": criteria, "observations": observations, "summary": summary}
        )

    def test_parse_valid_verdict(self):
        verdict = self._verdict(
            [
                {
                    "criterion": "Agent detected on main",
                    "verdict": "pass",
                    "evidence": "Terminal showed 'main branch detected'",
                    "rationale": "Agent correctly identified the branch",
                }
            ],
            ["Agent was very fast"],
            "Passed all checks",
        )
        assert len(verdict.criteria) == 1
        assert verdict.criteria[0].verdict == "pass"
        assert verdict.score == "1/1"

    def test_score_calculation(self):
        verdict = self._verdict(
            [
                {"criterion": "A", "verdict": "pass", "evidence": "e", "rationale": "r"},
                {"criterion": "B", "verdict": "fail", "evidence": "e", "rationale": "r"},
                {"criterion": "C", "verdict": "pass", "evidence": "e", "rationale": "r"},
            ],
            [],
            "Mixed results",
        )
        # Score counts passes over total; one failure sinks the verdict.
        assert verdict.score == "2/3"
        assert verdict.passed is False

    def test_all_pass(self):
        verdict = self._verdict(
            [
                {"criterion": "A", "verdict": "pass", "evidence": "e", "rationale": "r"},
            ],
            [],
            "Good",
        )
        assert verdict.passed is True
|
||||
|
||||
|
||||
class TestCriterionResultSource:
    """CriterionResult.source defaults to 'judge' and round-trips through JSON."""

    def test_default_source_is_judge(self):
        res = CriterionResult(
            criterion="test",
            verdict="pass",
            evidence="e",
            rationale="r",
        )
        assert res.source == "judge"

    def test_assertion_source(self):
        res = CriterionResult(
            criterion="test",
            verdict="fail",
            evidence="e",
            rationale="r",
            source="assertion",
        )
        assert res.source == "assertion"

    def test_backwards_compat_no_source_in_json(self):
        # Older verdict files lack "source"; parsing must default it to judge.
        payload = {"criterion": "A", "verdict": "pass", "evidence": "e", "rationale": "r"}
        res = CriterionResult.model_validate(payload)
        assert res.source == "judge"

    def test_source_serializes_to_json(self):
        res = CriterionResult(
            criterion="test",
            verdict="pass",
            evidence="e",
            rationale="r",
            source="assertion",
        )
        assert res.model_dump()["source"] == "assertion"
|
||||
|
||||
|
||||
class TestVerifierPrompt:
    """The judge's system prompt must mention its key output fields."""

    def test_builds_system_prompt(self):
        verifier = Verifier(model="claude-sonnet-4-6", temperature=0.0)
        prompt = verifier.build_system_prompt()
        lowered = prompt.lower()
        assert "criterion" in lowered
        assert "evidence" in lowered
        assert "JSON" in prompt  # case-sensitive: the literal token "JSON"
|
||||
Reference in New Issue
Block a user