Lift drill into evals/ at 013fcb8b7dbefd6d3fa4653493e5d2ec8e7f985b

rsync of obra/drill@013fcb8b7d into superpowers/evals/, excluding
.git/, .venv/, results/, .env/, __pycache__/, *.egg-info/,
.private-journal/.

The drill repo is unaffected by this commit; archival is a separate
manual step after this PR merges.

Source SHA recorded at evals/.drill-source-sha for divergence
detection.
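
A rough sketch of the divergence check this enables, assuming a local drill checkout next to the superpowers repo (the paths and tooling here are illustrative, not part of this commit):

from pathlib import Path
import subprocess

recorded = Path("evals/.drill-source-sha").read_text().strip()
current = subprocess.run(
    ["git", "-C", "../drill", "rev-parse", "HEAD"],
    capture_output=True, text=True, check=True,
).stdout.strip()
if recorded != current:
    print(f"evals/ was lifted from {recorded[:12]}; drill is now at {current[:12]}")
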
Author: Jesse Vincent
Date: 2026-05-06 12:15:46 -07:00
Committed by: Drew Ritter
Parent: 2e46e9590d
Commit: 3b412a3836
124 changed files with 13806 additions and 0 deletions

evals/tests/__init__.py (new, empty)

@@ -0,0 +1,5 @@
{"tool": "Read", "args": {"file_path": "/tmp/foo.py"}, "source": "native"}
{"tool": "Skill", "args": {"skill": "superpowers:worktree"}, "source": "native"}
{"tool": "Edit", "args": {"file_path": "/tmp/foo.py"}, "source": "native"}
{"tool": "Read", "args": {"file_path": "/tmp/bar.py"}, "source": "native"}
{"tool": "Bash", "args": {"command": "git status"}, "source": "shell"}

@@ -0,0 +1,4 @@
{"tool": "EnterWorktree", "args": {"branch": "feature/login"}, "source": "native"}
{"tool": "Read", "args": {"file_path": "/tmp/foo.py"}, "source": "native"}
{"tool": "Edit", "args": {"file_path": "/tmp/foo.py"}, "source": "native"}
{"tool": "Bash", "args": {"command": "pytest"}, "source": "shell"}

@@ -0,0 +1 @@
{"tool": "Read", "args": {"file_path": "/tmp/foo.py"}, "source": "native"}

evals/tests/test_actor.py

@@ -0,0 +1,51 @@
from drill.actor import Actor, ActorAction
class TestActorAction:
def test_parse_type_action(self):
action = ActorAction.from_tool_result({"action": "type", "text": "create a worktree"})
assert action.action == "type"
assert action.text == "create a worktree"
def test_parse_done_action(self):
action = ActorAction.from_tool_result({"action": "done"})
assert action.action == "done"
def test_parse_stuck_action(self):
action = ActorAction.from_tool_result({"action": "stuck"})
assert action.action == "stuck"
def test_parse_key_action(self):
action = ActorAction.from_tool_result({"action": "key", "key": "ctrl-c"})
assert action.action == "key"
assert action.key == "ctrl-c"
class TestActorPrompt:
def test_builds_system_prompt_naive(self):
actor = Actor(model="claude-sonnet-4-6", temperature=0.7)
prompt = actor.build_system_prompt(
posture="naive",
intents=["Ask the agent to create a worktree"],
)
assert "plain language" in prompt.lower() or "don't know" in prompt.lower()
assert "create a worktree" in prompt
def test_builds_system_prompt_spec_aware(self):
actor = Actor(model="claude-sonnet-4-6", temperature=0.7)
prompt = actor.build_system_prompt(
posture="spec-aware",
intents=["Use the worktree skill"],
)
assert "skill" in prompt.lower() or "convention" in prompt.lower()
class TestActorContext:
def test_appends_terminal_captures(self):
actor = Actor(model="claude-sonnet-4-6", temperature=0.7)
actor.append_capture("Screen 1: Welcome to Claude")
actor.append_capture("Screen 2: ")
messages = actor.build_messages()
assert len(messages) == 2
assert "Screen 1" in messages[0]["content"]
assert "Screen 2" in messages[1]["content"]

@@ -0,0 +1,106 @@
from drill.assertions import AssertionResult, run_verify_assertions
class TestAssertionResult:
def test_passing_to_criterion_result(self):
ar = AssertionResult(
command="tool-called Read",
passed=True,
exit_code=0,
stdout="PASS: Read called 3 time(s)",
stderr="",
)
cr = ar.to_criterion_result()
assert cr.verdict == "pass"
assert cr.source == "assertion"
assert "[assertion]" in cr.criterion
assert "tool-called Read" in cr.criterion
def test_failing_to_criterion_result(self):
ar = AssertionResult(
command="tool-not-called Write",
passed=False,
exit_code=1,
stdout="",
stderr="FAIL: Write called 2 time(s)",
)
cr = ar.to_criterion_result()
assert cr.verdict == "fail"
assert cr.source == "assertion"
assert "stderr: FAIL" in cr.evidence
class TestRunVerifyAssertions:
def test_passing_assertion(self, tmp_path):
tc = '{"tool": "Read", "args": {}, "source": "native"}\n'
(tmp_path / "tool_calls.jsonl").write_text(tc)
results = run_verify_assertions(
assertions=["grep -q Read tool_calls.jsonl"],
results_dir=tmp_path,
workdir=tmp_path,
)
assert len(results) == 1
assert results[0].passed is True
assert results[0].exit_code == 0
def test_failing_assertion(self, tmp_path):
tc = '{"tool": "Read", "args": {}, "source": "native"}\n'
(tmp_path / "tool_calls.jsonl").write_text(tc)
results = run_verify_assertions(
assertions=["grep -q NonexistentTool tool_calls.jsonl"],
results_dir=tmp_path,
workdir=tmp_path,
)
assert len(results) == 1
assert results[0].passed is False
def test_runs_all_assertions(self, tmp_path):
(tmp_path / "tool_calls.jsonl").write_text('{"tool": "Read"}\n')
results = run_verify_assertions(
assertions=[
"grep -q Read tool_calls.jsonl",
"grep -q Write tool_calls.jsonl",
"grep -q Read tool_calls.jsonl",
],
results_dir=tmp_path,
workdir=tmp_path,
)
assert len(results) == 3
assert results[0].passed is True
assert results[1].passed is False
assert results[2].passed is True
def test_timeout_handling(self, tmp_path):
(tmp_path / "tool_calls.jsonl").write_text("{}\n")
results = run_verify_assertions(
assertions=["sleep 30"],
results_dir=tmp_path,
workdir=tmp_path,
timeout_seconds=1,
)
assert len(results) == 1
assert results[0].passed is False
assert results[0].exit_code == 124
assert "Timed out" in results[0].stderr
def test_drill_workdir_env_var(self, tmp_path):
(tmp_path / "tool_calls.jsonl").write_text("{}\n")
workdir = tmp_path / "scenario-workdir"
workdir.mkdir()
results = run_verify_assertions(
assertions=['test "$DRILL_WORKDIR" = "' + str(workdir) + '"'],
results_dir=tmp_path,
workdir=workdir,
)
assert len(results) == 1
assert results[0].passed is True
def test_bin_dir_on_path(self, tmp_path):
(tmp_path / "tool_calls.jsonl").write_text("{}\n")
results = run_verify_assertions(
assertions=["echo $PATH | grep -q bin"],
results_dir=tmp_path,
workdir=tmp_path,
)
assert len(results) == 1
assert results[0].passed is True
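
The tests above pin down the execution contract for verify assertions: commands run from the results directory, DRILL_WORKDIR points at the scenario workdir, an evals bin directory is prepended to PATH, and a timeout reports exit code 124 with "Timed out" on stderr. A minimal sketch that would satisfy that contract (the real drill.assertions implementation may differ):

import os
import subprocess
from pathlib import Path

def run_one_assertion(command: str, results_dir: Path, workdir: Path,
                      bin_dir: Path, timeout_seconds: int = 10):
    # Expose the scenario workdir and the helper bin directory to the command.
    env = dict(os.environ, DRILL_WORKDIR=str(workdir))
    env["PATH"] = f"{bin_dir}{os.pathsep}{env.get('PATH', '')}"
    try:
        proc = subprocess.run(
            ["bash", "-c", command],
            cwd=results_dir, env=env,
            capture_output=True, text=True, timeout=timeout_seconds,
        )
        return proc.returncode == 0, proc.returncode, proc.stdout, proc.stderr
    except subprocess.TimeoutExpired:
        # Mirror the conventional timeout(1) exit code.
        return False, 124, "", f"Timed out after {timeout_seconds}s"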

evals/tests/test_backend.py

@@ -0,0 +1,145 @@
from pathlib import Path
import pytest
from drill.backend import Backend, load_backend
@pytest.fixture
def backends_dir():
return Path(__file__).parent.parent / "backends"
class TestLoadBackend:
def test_loads_claude_backend(self, backends_dir):
backend = load_backend("claude", backends_dir)
assert backend.name == "claude"
assert backend.cli == "claude"
assert "--dangerously-skip-permissions" in backend.args
def test_loads_codex_backend(self, backends_dir):
backend = load_backend("codex", backends_dir)
assert backend.name == "codex"
assert backend.cli == "codex"
def test_unknown_backend_raises(self, backends_dir):
with pytest.raises(FileNotFoundError):
load_backend("nonexistent", backends_dir)
def test_loads_claude_opus_4_6_variant(self, backends_dir, monkeypatch):
monkeypatch.setenv("SUPERPOWERS_ROOT", "/tmp/sp")
backend = load_backend("claude-opus-4-6", backends_dir)
assert backend.name == "claude-opus-4-6"
assert backend.family == "claude"
assert backend.model == "claude-opus-4-6"
class TestBackendBuildCommand:
def test_claude_build_command(self, backends_dir, monkeypatch):
monkeypatch.setenv("SUPERPOWERS_ROOT", "/tmp/superpowers")
backend = load_backend("claude", backends_dir)
cmd = backend.build_command("/tmp/workdir")
assert cmd[0] == "claude"
assert "--plugin-dir" in cmd
assert "/tmp/superpowers" in cmd
def test_codex_build_command(self, backends_dir, monkeypatch):
monkeypatch.setenv("SUPERPOWERS_ROOT", "/tmp/superpowers")
backend = load_backend("codex", backends_dir)
cmd = backend.build_command("/tmp/workdir")
assert cmd[0] == "codex"
class TestBackendEnvValidation:
def test_missing_env_raises(self, backends_dir, monkeypatch):
monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
monkeypatch.delenv("SUPERPOWERS_ROOT", raising=False)
backend = load_backend("claude", backends_dir)
with pytest.raises(EnvironmentError, match="ANTHROPIC_API_KEY"):
backend.validate_env()
class TestBackendIdleDetection:
def test_ready_pattern_matches(self, backends_dir):
backend = load_backend("claude", backends_dir)
assert backend.is_ready_line(" ")
assert backend.is_ready_line("Human: ")
assert not backend.is_ready_line("Running tool...")
class TestBackendModelExtraction:
def test_extract_model_from_args(self, backends_dir, monkeypatch):
monkeypatch.setenv("SUPERPOWERS_ROOT", "/tmp/sp")
backend = load_backend("claude", backends_dir)
assert backend.model == "opus"
def test_no_model_flag_returns_none(self):
backend = Backend(
name="test",
cli="test",
args=["--foo", "bar"],
required_env=[],
hooks={"pre_run": [], "post_run": []},
shutdown="/exit",
idle={},
startup_timeout=30,
terminal={},
session_logs={},
)
assert backend.model is None
def test_extracts_from_short_m_flag(self):
backend = Backend(
name="test",
cli="test",
args=["-m", "gemini-2.5-flash"],
required_env=[],
hooks={"pre_run": [], "post_run": []},
shutdown="/exit",
idle={},
startup_timeout=30,
terminal={},
session_logs={},
)
assert backend.model == "gemini-2.5-flash"
class TestBackendFamily:
def test_claude_backend_family(self, backends_dir, monkeypatch):
monkeypatch.setenv("SUPERPOWERS_ROOT", "/tmp/sp")
backend = load_backend("claude", backends_dir)
assert backend.family == "claude"
def test_codex_backend_family(self, backends_dir):
backend = load_backend("codex", backends_dir)
assert backend.family == "codex"
def test_variant_name_preserves_family(self):
backend = Backend(
name="claude-opus-4-6",
cli="claude",
args=[],
required_env=[],
hooks={"pre_run": [], "post_run": []},
shutdown="/exit",
idle={},
startup_timeout=30,
terminal={},
session_logs={},
)
assert backend.family == "claude"
def test_unknown_family_is_other(self):
backend = Backend(
name="random-xyz",
cli="xyz",
args=[],
required_env=[],
hooks={"pre_run": [], "post_run": []},
shutdown="/exit",
idle={},
startup_timeout=30,
terminal={},
session_logs={},
)
assert backend.family == "other"

evals/tests/test_cli.py

@@ -0,0 +1,61 @@
"""Tests for CLI option parsing."""
from __future__ import annotations
from click.testing import CliRunner
from drill.cli import main
class TestRunCommand:
def test_backend_required_without_models(self) -> None:
runner = CliRunner()
result = runner.invoke(main, ["run", "nonexistent"])
assert result.exit_code != 0
def test_n_default_is_1(self) -> None:
runner = CliRunner()
result = runner.invoke(main, ["run", "nonexistent", "--backend", "claude", "--n", "1"])
assert "Scenario not found" in result.output or result.exit_code != 0
def test_models_flag_accepted(self) -> None:
runner = CliRunner()
result = runner.invoke(main, ["run", "nonexistent", "--models", "claude,codex"])
assert "Scenario not found" in result.output or result.exit_code != 0
def test_n_must_be_positive(self) -> None:
runner = CliRunner()
result = runner.invoke(main, ["run", "nonexistent", "--backend", "claude", "--n", "0"])
assert result.exit_code != 0
class TestListCommand:
def test_lists_scenarios(self, tmp_path):
scenarios_dir = tmp_path / "scenarios"
scenarios_dir.mkdir()
(scenarios_dir / "test-scenario.yaml").write_text("""
scenario: test-scenario
description: "A test scenario"
user_posture: naive
setup:
helpers: []
assertions: []
turns: []
limits:
max_turns: 5
turn_timeout: 30
verify:
criteria: []
observe: false
""")
runner = CliRunner()
result = runner.invoke(main, ["list", "--scenarios-dir", str(scenarios_dir)])
assert result.exit_code == 0
assert "test-scenario" in result.output
class TestCompareCommand:
def test_sweep_flag_accepted(self) -> None:
runner = CliRunner()
result = runner.invoke(main, ["compare", "nonexistent", "--sweep", "abc123"])
assert result.exit_code != 0 # No results dir, but flag is parsed

evals/tests/test_compare.py

@@ -0,0 +1,217 @@
"""Tests for compare module."""
from __future__ import annotations
import json
from pathlib import Path
from drill.compare import BackendResult, format_compare_output, load_scenario_results
def _write_verdict(path: Path, criteria: list[dict[str, str]]) -> None:
verdict = {
"criteria": criteria,
"observations": ["test obs"],
"summary": "ok",
}
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(verdict))
def _write_meta(path: Path, **kwargs: object) -> None:
meta = {"scenario": "test", "backend": "claude", "actor_turns": 4, **kwargs}
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(meta))
def _write_run_group(
path: Path, n: int, runs: list[dict[str, object]], sweep_id: str = "abc12345"
) -> None:
data = {
"scenario": "test",
"backend": "claude",
"n": n,
"timestamp": "2026-04-20T14-30-00",
"sweep_id": sweep_id,
"partial": False,
"runs": runs,
}
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(data))
class TestLoadScenarioResults:
def test_loads_new_format_single_run(self, tmp_path: Path) -> None:
scenario_dir = tmp_path / "test-scenario" / "claude" / "2026-04-20T14-30-00-abc12345"
run_dir = scenario_dir / "run-00"
criteria = [{"criterion": "c1", "verdict": "pass", "evidence": "e", "rationale": "r"}]
_write_verdict(run_dir / "verdict.json", criteria)
_write_meta(run_dir / "meta.json")
_write_run_group(
scenario_dir / "run-group.json",
n=1,
runs=[{"index": 0, "status": "pass", "duration": 10.0}],
)
results = load_scenario_results(tmp_path / "test-scenario")
assert "claude" in results
assert results["claude"].total_runs == 1
assert results["claude"].passed_runs == 1
def test_loads_new_format_multi_run(self, tmp_path: Path) -> None:
scenario_dir = tmp_path / "test-scenario" / "claude" / "2026-04-20T14-30-00-abc12345"
for i in range(3):
run_dir = scenario_dir / f"run-{i:02d}"
verdict_val = "pass" if i < 2 else "fail"
criteria = [
{"criterion": "c1", "verdict": verdict_val, "evidence": "e", "rationale": "r"}
]
_write_verdict(run_dir / "verdict.json", criteria)
_write_meta(run_dir / "meta.json")
_write_run_group(
scenario_dir / "run-group.json",
n=3,
runs=[
{"index": 0, "status": "pass", "duration": 10.0},
{"index": 1, "status": "pass", "duration": 11.0},
{"index": 2, "status": "fail", "duration": 12.0},
],
)
results = load_scenario_results(tmp_path / "test-scenario")
assert results["claude"].total_runs == 3
assert results["claude"].passed_runs == 2
assert len(results["claude"].criterion_counts) == 1
assert results["claude"].criterion_counts["c1"] == (2, 3)
def test_loads_old_format_backwards_compat(self, tmp_path: Path) -> None:
scenario_dir = tmp_path / "test-scenario" / "claude" / "2026-04-20T14-30-00"
criteria = [{"criterion": "c1", "verdict": "pass", "evidence": "e", "rationale": "r"}]
_write_verdict(scenario_dir / "verdict.json", criteria)
_write_meta(scenario_dir / "meta.json")
results = load_scenario_results(tmp_path / "test-scenario")
assert "claude" in results
assert results["claude"].total_runs == 1
assert results["claude"].passed_runs == 1
def test_sweep_filter(self, tmp_path: Path) -> None:
base = tmp_path / "test-scenario" / "claude"
# Sweep A
dir_a = base / "2026-04-20T14-30-00-aaaa1111"
_write_run_group(
dir_a / "run-group.json",
n=1,
runs=[{"index": 0, "status": "pass", "duration": 10.0}],
sweep_id="aaaa1111",
)
criteria = [{"criterion": "c1", "verdict": "pass", "evidence": "e", "rationale": "r"}]
_write_verdict(dir_a / "run-00" / "verdict.json", criteria)
_write_meta(dir_a / "run-00" / "meta.json")
# Sweep B
dir_b = base / "2026-04-20T15-00-00-bbbb2222"
_write_run_group(
dir_b / "run-group.json",
n=1,
runs=[{"index": 0, "status": "fail", "duration": 10.0}],
sweep_id="bbbb2222",
)
criteria_b = [{"criterion": "c1", "verdict": "fail", "evidence": "e", "rationale": "r"}]
_write_verdict(dir_b / "run-00" / "verdict.json", criteria_b)
_write_meta(dir_b / "run-00" / "meta.json")
results_a = load_scenario_results(tmp_path / "test-scenario", sweep_id="aaaa1111")
assert results_a["claude"].passed_runs == 1
results_b = load_scenario_results(tmp_path / "test-scenario", sweep_id="bbbb2222")
assert results_b["claude"].passed_runs == 0
class TestBackendResult:
def test_pass_rate(self) -> None:
br = BackendResult(
backend="claude",
total_runs=10,
passed_runs=8,
errored_runs=0,
avg_turns=4.2,
criterion_counts={"c1": (10, 10), "c2": (8, 10)},
sweep_id="abc12345",
timestamp="2026-04-20T14-30-00",
partial=False,
)
assert br.pass_rate == 0.8
def test_pass_rate_zero_runs(self) -> None:
br = BackendResult(
backend="claude",
total_runs=0,
passed_runs=0,
errored_runs=0,
avg_turns=0.0,
criterion_counts={},
sweep_id=None,
timestamp=None,
partial=False,
)
assert br.pass_rate == 0.0
def _make_backend_result(
backend: str = "claude",
total_runs: int = 10,
passed_runs: int = 8,
errored_runs: int = 0,
avg_turns: float = 4.2,
criterion_counts: dict[str, tuple[int, int]] | None = None,
sweep_id: str | None = "abc12345",
timestamp: str | None = "2026-04-20T14-30-00",
partial: bool = False,
) -> BackendResult:
return BackendResult(
backend=backend,
total_runs=total_runs,
passed_runs=passed_runs,
errored_runs=errored_runs,
avg_turns=avg_turns,
criterion_counts=criterion_counts or {"c1": (passed_runs, total_runs)},
sweep_id=sweep_id,
timestamp=timestamp,
partial=partial,
)
class TestFormatCompareOutput:
def test_no_results(self) -> None:
output = format_compare_output("test", {})
assert "No results found" in output
def test_multi_run_includes_pass_rate_and_ci(self) -> None:
results = {"claude": _make_backend_result(total_runs=10, passed_runs=8)}
output = format_compare_output("test", results)
assert "Overall pass rate" in output
assert "95% CI" in output
assert "80.0%" in output
def test_multi_run_sweep_header_includes_date(self) -> None:
results = {"claude": _make_backend_result()}
output = format_compare_output("test", results)
assert "Sweep: abc12345 | 2026-04-20" in output
def test_single_run_simple_table(self) -> None:
results = {
"claude": _make_backend_result(
total_runs=1,
passed_runs=1,
criterion_counts={"c1": (1, 1)},
)
}
output = format_compare_output("test", results)
assert "PASS" in output
assert "Overall pass rate" not in output
def test_partial_warning(self) -> None:
results = {"claude": _make_backend_result(partial=True)}
output = format_compare_output("test", results)
assert "incomplete" in output.lower() or "interrupted" in output.lower()
def test_small_n_note(self) -> None:
results = {"claude": _make_backend_result(total_runs=5, passed_runs=3)}
output = format_compare_output("test", results)
assert "--n 10+" in output

evals/tests/test_e2e.py

@@ -0,0 +1,94 @@
"""End-to-end smoke test using a mock 'bash' backend."""
import shutil
from pathlib import Path
import pytest
from drill.engine import Engine, ScenarioConfig
@pytest.fixture
def mock_scenario(tmp_path):
scenario = tmp_path / "test-scenario.yaml"
scenario.write_text("""
scenario: e2e-smoke-test
description: "Smoke test"
user_posture: naive
setup:
helpers:
- create_base_repo
assertions:
- "git rev-parse --is-inside-work-tree"
turns:
- intent: "List files in the current directory"
limits:
max_turns: 3
turn_timeout: 10
verify:
criteria:
- "Agent listed the files"
observe: true
""")
return scenario
@pytest.fixture
def mock_backend(tmp_path):
backend_dir = tmp_path / "backends"
backend_dir.mkdir()
(backend_dir / "mock.yaml").write_text("""
name: mock
cli: bash
args: []
required_env: []
hooks:
pre_run: []
post_run: []
shutdown: "exit"
idle:
quiescence_seconds: 1
ready_pattern: "\\\\$"
startup_timeout: 5
terminal:
cols: 80
rows: 24
session_logs:
pattern: ""
""")
return backend_dir
class TestE2ESmoke:
def test_scenario_config_loads(self, mock_scenario):
config = ScenarioConfig.from_yaml(mock_scenario)
assert config.scenario == "e2e-smoke-test"
def test_engine_setup_works(self, mock_scenario, mock_backend):
fixtures_dir = Path(__file__).parent.parent / "fixtures"
engine = Engine(
scenario_path=mock_scenario,
backend_name="mock",
backends_dir=mock_backend,
fixtures_dir=fixtures_dir,
results_dir=Path("/tmp/drill-test-results"),
)
workdir = Path("/tmp/drill-e2e-smoke")
if workdir.exists():
shutil.rmtree(workdir)
engine._setup(workdir)
assert (workdir / "package.json").exists()
assert (workdir / "src" / "index.js").exists()
# Verify git state
import subprocess
result = subprocess.run(
["git", "branch", "--show-current"], cwd=workdir, capture_output=True, text=True
)
assert result.stdout.strip() == "main"
result = subprocess.run(
["git", "log", "--oneline"], cwd=workdir, capture_output=True, text=True
)
assert "initial commit" in result.stdout
# Cleanup
shutil.rmtree(workdir, ignore_errors=True)

evals/tests/test_engine.py

@@ -0,0 +1,173 @@
from __future__ import annotations
import json
import subprocess
from pathlib import Path
from drill.engine import RunResult, ScenarioConfig, VerifyConfig, snapshot_filesystem
class TestVerifyConfig:
def test_defaults(self):
vc = VerifyConfig()
assert vc.criteria == []
assert vc.assertions == []
assert vc.observe is False
def test_from_dict(self):
vc = VerifyConfig(
criteria=["test criterion"],
assertions=["tool-called Read"],
observe=True,
)
assert len(vc.criteria) == 1
assert len(vc.assertions) == 1
assert vc.observe is True
class TestScenarioConfig:
def test_loads_from_yaml(self, tmp_path):
scenario_file = tmp_path / "test.yaml"
scenario_file.write_text("""
scenario: test-scenario
description: "A test"
user_posture: naive
setup:
helpers:
- create_base_repo
assertions:
- "git rev-parse --is-inside-work-tree"
turns:
- intent: "Do the thing"
limits:
max_turns: 10
turn_timeout: 60
verify:
criteria:
- "Thing was done"
assertions:
- "tool-called Bash"
observe: true
""")
config = ScenarioConfig.from_yaml(scenario_file)
assert config.scenario == "test-scenario"
assert config.user_posture == "naive"
assert config.limits["max_turns"] == 10
assert len(config.turns) == 1
assert len(config.verify.criteria) == 1
assert len(config.verify.assertions) == 1
assert config.verify.observe is True
def test_loads_without_assertions(self, tmp_path):
scenario_file = tmp_path / "test.yaml"
scenario_file.write_text("""
scenario: minimal
verify:
criteria:
- "Something happened"
""")
config = ScenarioConfig.from_yaml(scenario_file)
assert config.verify.assertions == []
assert config.verify.observe is False
def test_loads_without_verify(self, tmp_path):
scenario_file = tmp_path / "test.yaml"
scenario_file.write_text("""
scenario: bare-minimum
""")
config = ScenarioConfig.from_yaml(scenario_file)
assert config.verify.criteria == []
assert config.verify.assertions == []
class TestSnapshotFilesystem:
def test_captures_git_state(self, tmp_path):
subprocess.run(["git", "init", "-b", "main"], cwd=tmp_path, capture_output=True)
subprocess.run(
["git", "commit", "--allow-empty", "-m", "init"], cwd=tmp_path, capture_output=True
)
snapshot = snapshot_filesystem(tmp_path)
data = json.loads(snapshot)
assert "git_status" in data
assert "branch" in data
assert "worktree_list" in data
assert "files" in data
class TestRunResult:
def test_serializes_to_dir(self, tmp_path):
result = RunResult(
scenario="test",
backend="claude",
timestamp="2026-04-07T14-30-00",
session_log="session output here",
filesystem_json='{"files": []}',
tool_calls_jsonl='{"tool": "Bash"}\n',
verdict_json='{"criteria": [], "observations": [], "summary": "ok"}',
meta={"backend": "claude", "duration_seconds": 42, "actor_turns": 5},
)
result.save(tmp_path)
assert (tmp_path / "session.log").read_text() == "session output here"
assert (tmp_path / "filesystem.json").exists()
assert (tmp_path / "tool_calls.jsonl").exists()
assert (tmp_path / "verdict.json").exists()
assert (tmp_path / "meta.json").exists()
class TestEngineAssertionIntegration:
def test_run_result_save_splits_artifacts_and_verdict(self, tmp_path):
result = RunResult(
scenario="test",
backend="claude",
timestamp="2026-04-20T10-00-00",
session_log="log here",
filesystem_json='{"files": []}',
tool_calls_jsonl='{"tool": "Bash"}\n',
verdict_json='{"criteria": [], "observations": [], "summary": "ok"}',
meta={"backend": "claude"},
)
result.save_artifacts(tmp_path)
assert (tmp_path / "session.log").exists()
assert (tmp_path / "filesystem.json").exists()
assert (tmp_path / "tool_calls.jsonl").exists()
assert not (tmp_path / "verdict.json").exists()
assert not (tmp_path / "meta.json").exists()
result.save_verdict(tmp_path)
assert (tmp_path / "verdict.json").exists()
assert (tmp_path / "meta.json").exists()
class TestEngineRunParams:
def test_run_result_uses_custom_output_dir(self, tmp_path: Path) -> None:
custom_dir = tmp_path / "custom" / "run-00"
result = RunResult(
scenario="test",
backend="claude",
timestamp="2026-04-20T10-00-00",
session_log="log",
filesystem_json='{"files": []}',
tool_calls_jsonl='{"tool": "Bash"}\n',
verdict_json='{"criteria": [], "observations": [], "summary": "ok"}',
meta={"backend": "claude"},
)
result.save(custom_dir)
assert (custom_dir / "session.log").read_text() == "log"
assert (custom_dir / "verdict.json").exists()
assert (custom_dir / "meta.json").exists()
def test_run_result_nested_dir_created(self, tmp_path: Path) -> None:
deep_dir = tmp_path / "a" / "b" / "c" / "run-05"
result = RunResult(
scenario="test",
backend="claude",
timestamp="2026-04-20T10-00-00",
session_log="log",
filesystem_json='{"files": []}',
tool_calls_jsonl='{"tool": "Bash"}\n',
verdict_json='{"criteria": [], "observations": [], "summary": "ok"}',
meta={"backend": "claude"},
)
result.save(deep_dir)
assert deep_dir.exists()
assert (deep_dir / "session.log").exists()

evals/tests/test_helpers.py

@@ -0,0 +1,126 @@
import subprocess
from pathlib import Path
BIN_DIR = Path(__file__).parent.parent / "bin"
FIXTURES_DIR = Path(__file__).parent / "fixtures"
def run_helper(name: str, args: list[str], cwd: Path) -> subprocess.CompletedProcess[str]:
return subprocess.run(
[str(BIN_DIR / name), *args],
cwd=cwd,
capture_output=True,
text=True,
)
class TestToolCalled:
def test_tool_present(self, tmp_path):
(tmp_path / "tool_calls.jsonl").write_text((FIXTURES_DIR / "tools_multi.jsonl").read_text())
result = run_helper("tool-called", ["Read"], tmp_path)
assert result.returncode == 0
def test_tool_absent(self, tmp_path):
(tmp_path / "tool_calls.jsonl").write_text((FIXTURES_DIR / "tools_multi.jsonl").read_text())
result = run_helper("tool-called", ["Write"], tmp_path)
assert result.returncode == 1
assert "FAIL" in result.stdout
def test_empty_jsonl(self, tmp_path):
(tmp_path / "tool_calls.jsonl").write_text("")
result = run_helper("tool-called", ["Read"], tmp_path)
assert result.returncode == 1
class TestToolNotCalled:
def test_tool_absent(self, tmp_path):
(tmp_path / "tool_calls.jsonl").write_text((FIXTURES_DIR / "tools_multi.jsonl").read_text())
result = run_helper("tool-not-called", ["Write"], tmp_path)
assert result.returncode == 0
def test_tool_present(self, tmp_path):
(tmp_path / "tool_calls.jsonl").write_text((FIXTURES_DIR / "tools_multi.jsonl").read_text())
result = run_helper("tool-not-called", ["Read"], tmp_path)
assert result.returncode == 1
assert "FAIL" in result.stdout
def test_empty_jsonl(self, tmp_path):
(tmp_path / "tool_calls.jsonl").write_text("")
result = run_helper("tool-not-called", ["Read"], tmp_path)
assert result.returncode == 0
class TestToolCount:
def test_gte_passes(self, tmp_path):
(tmp_path / "tool_calls.jsonl").write_text((FIXTURES_DIR / "tools_multi.jsonl").read_text())
result = run_helper("tool-count", ["Read", "gte", "2"], tmp_path)
assert result.returncode == 0
def test_gte_fails(self, tmp_path):
(tmp_path / "tool_calls.jsonl").write_text((FIXTURES_DIR / "tools_multi.jsonl").read_text())
result = run_helper("tool-count", ["Read", "gte", "5"], tmp_path)
assert result.returncode == 1
assert "FAIL" in result.stdout
def test_eq(self, tmp_path):
(tmp_path / "tool_calls.jsonl").write_text((FIXTURES_DIR / "tools_multi.jsonl").read_text())
result = run_helper("tool-count", ["Read", "eq", "2"], tmp_path)
assert result.returncode == 0
def test_lt(self, tmp_path):
(tmp_path / "tool_calls.jsonl").write_text((FIXTURES_DIR / "tools_multi.jsonl").read_text())
result = run_helper("tool-count", ["Read", "lt", "3"], tmp_path)
assert result.returncode == 0
class TestToolBefore:
def test_correct_order(self, tmp_path):
(tmp_path / "tool_calls.jsonl").write_text(
(FIXTURES_DIR / "tools_ordered.jsonl").read_text()
)
result = run_helper("tool-before", ["Read", "Edit"], tmp_path)
assert result.returncode == 0
def test_wrong_order(self, tmp_path):
(tmp_path / "tool_calls.jsonl").write_text(
(FIXTURES_DIR / "tools_ordered.jsonl").read_text()
)
result = run_helper("tool-before", ["Edit", "EnterWorktree"], tmp_path)
assert result.returncode == 1
assert "FAIL" in result.stdout
def test_first_tool_missing(self, tmp_path):
(tmp_path / "tool_calls.jsonl").write_text(
(FIXTURES_DIR / "tools_ordered.jsonl").read_text()
)
result = run_helper("tool-before", ["Write", "Read"], tmp_path)
assert result.returncode == 1
assert "never called" in result.stdout
def test_second_tool_missing(self, tmp_path):
(tmp_path / "tool_calls.jsonl").write_text(
(FIXTURES_DIR / "tools_ordered.jsonl").read_text()
)
result = run_helper("tool-before", ["Read", "Write"], tmp_path)
assert result.returncode == 1
assert "never called" in result.stdout
class TestToolArgMatch:
def test_matching_arg(self, tmp_path):
(tmp_path / "tool_calls.jsonl").write_text((FIXTURES_DIR / "tools_multi.jsonl").read_text())
result = run_helper(
"tool-arg-match", ["Skill", '.skill == "superpowers:worktree"'], tmp_path
)
assert result.returncode == 0
def test_no_matching_arg(self, tmp_path):
(tmp_path / "tool_calls.jsonl").write_text((FIXTURES_DIR / "tools_multi.jsonl").read_text())
result = run_helper("tool-arg-match", ["Skill", '.skill == "nonexistent"'], tmp_path)
assert result.returncode == 1
assert "FAIL" in result.stdout
def test_tool_not_present(self, tmp_path):
(tmp_path / "tool_calls.jsonl").write_text((FIXTURES_DIR / "tools_multi.jsonl").read_text())
result = run_helper("tool-arg-match", ["Write", '.file_path == "/tmp/foo"'], tmp_path)
assert result.returncode == 1
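
These helper tests treat bin/tool-called and friends as standalone executables that read tool_calls.jsonl from the current directory and report PASS/FAIL on stdout. A hypothetical sketch of tool-called consistent with that behavior (the shipped helpers may be implemented differently, e.g. as shell scripts given that tool-arg-match accepts a jq filter):

#!/usr/bin/env python3
import json
import sys
from pathlib import Path

tool = sys.argv[1]
lines = Path("tool_calls.jsonl").read_text().splitlines()
count = sum(1 for line in lines
            if line.strip() and json.loads(line).get("tool") == tool)
if count > 0:
    print(f"PASS: {tool} called {count} time(s)")
    sys.exit(0)
print(f"FAIL: {tool} called 0 time(s)")
sys.exit(1)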

@@ -0,0 +1,179 @@
import json
from drill.normalizer import (
collect_new_logs,
filter_codex_logs_by_cwd,
normalize_claude_logs,
normalize_codex_logs,
normalize_gemini_logs,
snapshot_log_dir,
)
class TestSnapshotAndCollect:
def test_snapshot_and_collect_new_files(self, tmp_path):
log_dir = tmp_path / "logs"
log_dir.mkdir()
(log_dir / "old.jsonl").write_text('{"old": true}\n')
snapshot = snapshot_log_dir(log_dir)
(log_dir / "new.jsonl").write_text('{"new": true}\n')
new_files = collect_new_logs(log_dir, snapshot)
assert len(new_files) == 1
assert new_files[0].name == "new.jsonl"
def test_empty_dir_returns_empty(self, tmp_path):
log_dir = tmp_path / "logs"
log_dir.mkdir()
snapshot = snapshot_log_dir(log_dir)
new_files = collect_new_logs(log_dir, snapshot)
assert new_files == []
class TestNormalizeClaudeLogs:
def test_normalizes_tool_use(self):
lines = [
json.dumps(
{"type": "tool_use", "name": "EnterWorktree", "input": {"branch": "add-login"}}
),
json.dumps({"type": "tool_use", "name": "Bash", "input": {"command": "git status"}}),
json.dumps({"type": "text", "text": "I'll create a worktree"}),
]
normalized = normalize_claude_logs("\n".join(lines))
assert len(normalized) == 2
assert normalized[0]["tool"] == "EnterWorktree"
assert normalized[0]["source"] == "native"
assert normalized[1]["tool"] == "Bash"
assert normalized[1]["source"] == "shell"
class TestNormalizeCodexLogs:
def test_normalizes_local_shell_call(self):
lines = [
json.dumps(
{
"type": "response_item",
"item": {
"type": "local_shell_call",
"action": {"command": ["git", "worktree", "add", "feature"]},
"status": "completed",
},
}
),
json.dumps(
{
"type": "response_item",
"item": {"type": "message", "content": [{"text": "Creating worktree"}]},
}
),
]
normalized = normalize_codex_logs("\n".join(lines))
assert len(normalized) == 1
assert normalized[0]["tool"] == "Bash"
assert "git worktree add" in normalized[0]["args"]["command"]
assert normalized[0]["source"] == "shell"
def test_filter_by_cwd_keeps_matching_drops_others(self, tmp_path):
target = "/private/tmp/drill-target"
match = tmp_path / "match.jsonl"
match.write_text(
json.dumps(
{
"type": "session_meta",
"payload": {"id": "abc", "cwd": target},
}
)
+ "\n"
)
other = tmp_path / "other.jsonl"
other.write_text(
json.dumps(
{
"type": "session_meta",
"payload": {"id": "def", "cwd": "/private/tmp/drill-other"},
}
)
+ "\n"
)
no_meta = tmp_path / "no-meta.jsonl"
no_meta.write_text(json.dumps({"type": "response_item", "payload": {}}) + "\n")
empty = tmp_path / "empty.jsonl"
empty.write_text("")
kept = filter_codex_logs_by_cwd([match, other, no_meta, empty], target)
assert kept == [match]
def test_normalizes_function_call_with_payload(self):
"""Test the actual codex rollout format using payload instead of item."""
lines = [
json.dumps(
{
"type": "response_item",
"payload": {
"type": "function_call",
"name": "exec_command",
"arguments": '{"cmd":"git worktree add .worktrees/feature",'
'"workdir":"/tmp/test"}',
"call_id": "call_123",
},
}
),
json.dumps(
{
"type": "response_item",
"payload": {
"type": "function_call",
"name": "apply_patch",
"arguments": '{"patch":"--- a/file\\n+++ b/file"}',
"call_id": "call_456",
},
}
),
]
normalized = normalize_codex_logs("\n".join(lines))
assert len(normalized) == 2
assert normalized[0]["tool"] == "Bash"
assert "git worktree add" in normalized[0]["args"]["command"]
assert normalized[0]["source"] == "shell"
assert normalized[1]["tool"] == "Edit"
assert normalized[1]["source"] == "native"
class TestNormalizeGeminiLogs:
def test_normalizes_jsonl_tool_calls(self):
lines = [
json.dumps({"kind": "main"}),
json.dumps(
{
"type": "gemini",
"content": "Reading file",
"toolCalls": [
{
"id": "read_file_1",
"name": "read_file",
"args": {"file_path": "GEMINI.md"},
"status": "success",
}
],
}
),
json.dumps(
{
"type": "gemini",
"content": "Running command",
"toolCalls": [
{
"id": "shell_1",
"name": "run_shell_command",
"args": {"command": "git status"},
"status": "success",
}
],
}
),
]
normalized = normalize_gemini_logs("\n".join(lines))
assert normalized == [
{"tool": "Read", "args": {"file_path": "GEMINI.md"}, "source": "native"},
{"tool": "Bash", "args": {"command": "git status"}, "source": "shell"},
]

@@ -0,0 +1,94 @@
import subprocess
import time
from unittest.mock import call, patch
from drill.session import TmuxSession
class TestTmuxSession:
def test_create_and_kill(self):
session = TmuxSession(name="drill-test-create", cols=80, rows=24)
session.create()
result = subprocess.run(
["tmux", "has-session", "-t", "drill-test-create"],
capture_output=True,
)
assert result.returncode == 0
session.kill()
result = subprocess.run(
["tmux", "has-session", "-t", "drill-test-create"],
capture_output=True,
)
assert result.returncode != 0
def test_send_keys_and_capture(self):
session = TmuxSession(name="drill-test-keys", cols=80, rows=24)
session.create()
try:
session.send_keys("echo hello-drill-test")
time.sleep(0.5)
output = session.capture()
assert "hello-drill-test" in output
finally:
session.kill()
def test_send_keys_pastes_text_then_submits(self):
session = TmuxSession(name="drill-test-command-shape")
with (
patch("drill.session.subprocess.run") as run,
patch("drill.session.time.sleep") as sleep,
):
session.send_keys("hello `weird` text")
assert run.call_args_list == [
call(
[
"tmux",
"set-buffer",
"-b",
"drill-test-command-shape-input",
"hello `weird` text",
],
check=True,
),
call(
[
"tmux",
"paste-buffer",
"-d",
"-b",
"drill-test-command-shape-input",
"-t",
"drill-test-command-shape",
],
check=True,
),
call(["tmux", "send-keys", "-t", "drill-test-command-shape", "Enter"], check=True),
]
sleep.assert_called_once_with(0.1)
def test_launch_command(self, tmp_path):
session = TmuxSession(name="drill-test-launch", cols=80, rows=24)
session.create()
try:
session.launch(["python3", "-c", "import time; time.sleep(30)"], cwd=str(tmp_path))
time.sleep(0.5)
assert session.is_process_alive()
finally:
session.kill()
def test_send_special_key(self, tmp_path):
session = TmuxSession(name="drill-test-special", cols=80, rows=24)
proof_file = tmp_path / "after-ctrl-c"
session.create()
try:
session.send_keys("cat")
time.sleep(0.3)
session.send_special_key("ctrl-c")
time.sleep(0.3)
session.send_keys(f"touch {proof_file}")
time.sleep(0.3)
assert proof_file.exists()
finally:
session.kill()

evals/tests/test_setup.py

@@ -0,0 +1,168 @@
import subprocess
from pathlib import Path
from unittest.mock import call, patch
import pytest
from drill.setup import clone_template, run_assertions
from setup_helpers.base import create_base_repo
from setup_helpers.worktree import (
add_worktree,
create_caller_consent_plan,
detach_head,
link_gemini_extension,
symlink_superpowers,
)
from setup_helpers.spec_writing_blind_spot import create_spec_writing_blind_spot
@pytest.fixture
def fixtures_dir():
return Path(__file__).parent.parent / "fixtures"
@pytest.fixture
def work_dir(tmp_path):
return tmp_path / "test-repo"
class TestCloneTemplate:
def test_clones_template_repo(self, fixtures_dir, work_dir):
clone_template(fixtures_dir / "template-repo", work_dir)
assert (work_dir / "package.json").exists()
assert (work_dir / "src" / "index.js").exists()
result = subprocess.run(
["git", "log", "--oneline"],
cwd=work_dir,
capture_output=True,
text=True,
)
assert "initial commit" in result.stdout
class TestCreateBaseRepo:
def test_creates_base_repo(self, fixtures_dir, work_dir):
create_base_repo(work_dir, fixtures_dir / "template-repo")
assert (work_dir / "package.json").exists()
result = subprocess.run(
["git", "branch", "--show-current"],
cwd=work_dir,
capture_output=True,
text=True,
)
assert result.stdout.strip() == "main"
class TestWorktreeHelpers:
def test_add_worktree(self, fixtures_dir, work_dir):
create_base_repo(work_dir, fixtures_dir / "template-repo")
wt_path = work_dir.parent / "feature-wt"
add_worktree(work_dir, "feature-branch", str(wt_path))
assert wt_path.exists()
result = subprocess.run(
["git", "worktree", "list"],
cwd=work_dir,
capture_output=True,
text=True,
)
assert "feature-branch" in result.stdout
def test_detach_head(self, fixtures_dir, work_dir):
create_base_repo(work_dir, fixtures_dir / "template-repo")
wt_path = work_dir.parent / "detached-wt"
add_worktree(work_dir, "tmp-branch", str(wt_path))
detach_head(str(wt_path))
result = subprocess.run(
["git", "branch", "--show-current"],
cwd=wt_path,
capture_output=True,
text=True,
)
assert result.stdout.strip() == ""
def test_symlink_superpowers(self, fixtures_dir, work_dir, tmp_path):
create_base_repo(work_dir, fixtures_dir / "template-repo")
fake_sp = tmp_path / "superpowers" / "skills"
fake_sp.mkdir(parents=True)
symlink_superpowers(work_dir, str(tmp_path / "superpowers"))
link = work_dir / ".agents" / "skills" / "superpowers"
assert link.is_symlink()
def test_link_gemini_extension_relinks_requested_root(self, work_dir, tmp_path):
work_dir.mkdir()
fake_sp = tmp_path / "superpowers"
(fake_sp / "skills" / "using-superpowers" / "references").mkdir(parents=True)
(fake_sp / "gemini-extension.json").write_text('{"name": "custom-superpowers"}')
with patch("setup_helpers.worktree.subprocess.run") as run:
link_gemini_extension(work_dir, str(fake_sp))
assert run.call_args_list == [
call(["gemini", "extensions", "uninstall", "custom-superpowers"], capture_output=True),
call(
["gemini", "extensions", "link", str(fake_sp)],
capture_output=True,
input="y\n",
text=True,
check=True,
),
]
assert (work_dir / "GEMINI.md").read_text() == (
f"@{fake_sp}/skills/using-superpowers/SKILL.md\n"
f"@{fake_sp}/skills/using-superpowers/references/gemini-tools.md\n"
)
def test_create_caller_consent_plan(self, fixtures_dir, work_dir):
create_base_repo(work_dir, fixtures_dir / "template-repo")
create_caller_consent_plan(work_dir)
plan = work_dir / "docs" / "superpowers" / "plans" / "custom-greeting.md"
assert plan.exists()
assert "REQUIRED SUB-SKILL" in plan.read_text()
result = subprocess.run(
["git", "status", "--short"],
cwd=work_dir,
capture_output=True,
text=True,
)
assert result.stdout.strip() == ""
class TestSpecWritingBlindSpot:
def test_creates_repo_structure(self, tmp_path):
workdir = tmp_path / "blind-spot-repo"
create_spec_writing_blind_spot(workdir)
assert (workdir / "src" / "components" / "AdminPanel.tsx").exists()
assert (workdir / "src" / "components" / "TeamOverview.tsx").exists()
assert (workdir / "src" / "router.tsx").exists()
assert (workdir / "CLAUDE.md").exists()
assert not (workdir / "src" / "components" / "ActivityFeed.tsx").exists()
result = subprocess.run(
["git", "branch", "--show-current"],
cwd=workdir, capture_output=True, text=True,
)
assert result.stdout.strip() == "main"
result = subprocess.run(
["git", "log", "--oneline"],
cwd=workdir, capture_output=True, text=True,
)
assert result.stdout.count("\n") >= 3
class TestRunAssertions:
def test_passing_assertions(self, fixtures_dir, work_dir):
create_base_repo(work_dir, fixtures_dir / "template-repo")
assertions = [
"git rev-parse --is-inside-work-tree",
"git branch --show-current | grep main",
]
run_assertions(assertions, work_dir)
def test_failing_assertion_raises(self, fixtures_dir, work_dir):
create_base_repo(work_dir, fixtures_dir / "template-repo")
with pytest.raises(AssertionError, match="Setup assertion failed"):
run_assertions(["git branch --show-current | grep nonexistent"], work_dir)

evals/tests/test_stats.py

@@ -0,0 +1,54 @@
"""Tests for Wilson score confidence interval."""
from __future__ import annotations
from drill.stats import wilson_ci
class TestWilsonCI:
def test_all_pass(self) -> None:
lo, hi = wilson_ci(10, 10)
assert lo > 0.69
assert hi == 1.0 or hi > 0.99
def test_all_fail(self) -> None:
lo, hi = wilson_ci(0, 10)
assert lo < 0.01 or lo == 0.0
assert hi < 0.31
def test_half_pass(self) -> None:
lo, hi = wilson_ci(5, 10)
assert 0.18 < lo < 0.25
assert 0.75 < hi < 0.82
def test_zero_total(self) -> None:
lo, hi = wilson_ci(0, 0)
assert lo == 0.0
assert hi == 0.0
def test_single_pass(self) -> None:
lo, hi = wilson_ci(1, 1)
assert lo > 0.0
assert hi <= 1.0
def test_single_fail(self) -> None:
lo, hi = wilson_ci(0, 1)
assert lo == 0.0 or lo >= 0.0
assert hi < 1.0
def test_large_sample(self) -> None:
lo, hi = wilson_ci(80, 100)
assert 0.70 < lo < 0.75
assert 0.85 < hi < 0.90
def test_passed_greater_than_total_clamped(self) -> None:
lo, hi = wilson_ci(12, 10)
assert lo > 0.0
assert hi <= 1.0
def test_returns_tuple_of_floats(self) -> None:
result = wilson_ci(5, 10)
assert isinstance(result, tuple)
assert len(result) == 2
assert isinstance(result[0], float)
assert isinstance(result[1], float)
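
The expected bounds above match the standard Wilson score interval at the 95% level. A reference sketch of the formula these tests constrain (assuming z = 1.96 and that passed is clamped into [0, total]; drill.stats may differ in details):

import math

def wilson_ci(passed: int, total: int, z: float = 1.96) -> tuple[float, float]:
    if total == 0:
        return (0.0, 0.0)
    passed = max(0, min(passed, total))
    p = passed / total
    denom = 1 + z * z / total
    center = (p + z * z / (2 * total)) / denom
    margin = z * math.sqrt(p * (1 - p) / total + z * z / (4 * total * total)) / denom
    return (max(0.0, center - margin), min(1.0, center + margin))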

evals/tests/test_sweep.py

@@ -0,0 +1,202 @@
"""Tests for Sweep orchestrator."""
from __future__ import annotations
import json
from dataclasses import asdict
from pathlib import Path
from unittest.mock import patch
from drill.engine import Engine, RunResult
from drill.sweep import RunGroup, RunStatus, Sweep, write_run_group
class TestRunStatus:
def test_pass_status(self) -> None:
rs = RunStatus(index=0, status="pass", duration=10.5)
assert rs.error is None
assert rs.status == "pass"
def test_error_status(self) -> None:
rs = RunStatus(index=2, status="error", duration=1.2, error="tmux crashed")
assert rs.error == "tmux crashed"
def test_serializes_to_dict(self) -> None:
rs = RunStatus(index=0, status="pass", duration=10.5)
d = asdict(rs)
assert d["index"] == 0
assert d["status"] == "pass"
assert d["duration"] == 10.5
assert d["error"] is None
class TestRunGroup:
def test_creates_with_defaults(self) -> None:
rg = RunGroup(
scenario="test",
backend="claude",
n=3,
timestamp="2026-04-20T14-30-00",
sweep_id="abc12345",
runs=[],
)
assert rg.partial is False
def test_partial_flag(self) -> None:
rg = RunGroup(
scenario="test",
backend="claude",
n=3,
timestamp="2026-04-20T14-30-00",
sweep_id="abc12345",
runs=[RunStatus(index=0, status="pass", duration=10.0)],
partial=True,
)
assert rg.partial is True
assert len(rg.runs) == 1
class TestWriteRunGroup:
def test_writes_json(self, tmp_path: Path) -> None:
rg = RunGroup(
scenario="test-scenario",
backend="claude",
n=2,
timestamp="2026-04-20T14-30-00",
sweep_id="abc12345",
runs=[
RunStatus(index=0, status="pass", duration=100.0),
RunStatus(index=1, status="fail", duration=95.0),
],
)
write_run_group(rg, tmp_path)
path = tmp_path / "run-group.json"
assert path.exists()
data = json.loads(path.read_text())
assert data["scenario"] == "test-scenario"
assert data["sweep_id"] == "abc12345"
assert data["partial"] is False
assert len(data["runs"]) == 2
assert data["runs"][0]["status"] == "pass"
assert data["runs"][1]["status"] == "fail"
def test_writes_partial(self, tmp_path: Path) -> None:
rg = RunGroup(
scenario="test",
backend="claude",
n=5,
timestamp="2026-04-20T14-30-00",
sweep_id="abc12345",
runs=[RunStatus(index=0, status="pass", duration=100.0)],
partial=True,
)
write_run_group(rg, tmp_path)
data = json.loads((tmp_path / "run-group.json").read_text())
assert data["partial"] is True
assert len(data["runs"]) == 1
def test_omits_null_errors(self, tmp_path: Path) -> None:
rg = RunGroup(
scenario="test",
backend="claude",
n=1,
timestamp="2026-04-20T14-30-00",
sweep_id="abc12345",
runs=[RunStatus(index=0, status="pass", duration=50.0)],
)
write_run_group(rg, tmp_path)
data = json.loads((tmp_path / "run-group.json").read_text())
run_data = data["runs"][0]
assert "error" not in run_data
class TestSweepIntegration:
def test_full_sweep_writes_run_group(self, tmp_path: Path) -> None:
"""Test that Sweep creates run dirs and writes run-group.json."""
scenario_file = tmp_path / "scenarios" / "test.yaml"
scenario_file.parent.mkdir(parents=True)
scenario_file.write_text(
"scenario: test-scenario\n"
"description: test\n"
"user_posture: naive\n"
"setup: {}\n"
"turns:\n - intent: do the thing\n"
"limits:\n max_turns: 5\n"
"verify:\n criteria:\n - thing was done\n"
)
backends_dir = tmp_path / "backends"
backends_dir.mkdir()
(backends_dir / "mock-backend.yaml").write_text(
"name: mock-backend\n"
"cli: echo\n"
"args: []\n"
"required_env: []\n"
"hooks:\n pre_run: []\n post_run: []\n"
"shutdown: /exit\n"
"idle:\n quiescence_seconds: 1\n ready_pattern: '.'\n"
"startup_timeout: 5\n"
"terminal:\n cols: 80\n rows: 24\n"
"session_logs: {}\n"
)
results_dir = tmp_path / "results"
fixtures_dir = tmp_path / "fixtures"
fixtures_dir.mkdir()
fake_verdict = json.dumps(
{
"criteria": [
{
"criterion": "thing was done",
"verdict": "pass",
"evidence": "yes",
"rationale": "it was done",
}
],
"observations": [],
"summary": "ok",
}
)
fake_result = RunResult(
scenario="test-scenario",
backend="mock-backend",
timestamp="2026-04-20T14-30-00",
session_log="log",
filesystem_json='{"files": []}',
tool_calls_jsonl='{"tool": "Bash"}',
verdict_json=fake_verdict,
meta={"actor_turns": 3},
)
sweep = Sweep(
scenario_path=scenario_file,
backend_names=["mock-backend"],
backends_dir=backends_dir,
fixtures_dir=fixtures_dir,
results_dir=results_dir,
n=3,
sweep_id="test1234",
)
with patch.object(Engine, "run", return_value=fake_result):
groups = sweep.run_all()
assert len(groups) == 1
group = groups[0]
assert group.scenario == "test-scenario"
assert len(group.runs) == 3
assert all(r.status == "pass" for r in group.runs)
assert group.partial is False
# Verify run-group.json was written
scenario_results = results_dir / "test-scenario" / "mock-backend"
assert scenario_results.exists()
group_dirs = list(scenario_results.iterdir())
assert len(group_dirs) == 1
rg_path = group_dirs[0] / "run-group.json"
assert rg_path.exists()
rg_data = json.loads(rg_path.read_text())
assert rg_data["sweep_id"] == "test1234"
assert len(rg_data["runs"]) == 3

@@ -0,0 +1,92 @@
from drill.verifier import CriterionResult, Verdict, Verifier
class TestVerdict:
def test_parse_valid_verdict(self):
data = {
"criteria": [
{
"criterion": "Agent detected on main",
"verdict": "pass",
"evidence": "Terminal showed 'main branch detected'",
"rationale": "Agent correctly identified the branch",
}
],
"observations": ["Agent was very fast"],
"summary": "Passed all checks",
}
verdict = Verdict.model_validate(data)
assert len(verdict.criteria) == 1
assert verdict.criteria[0].verdict == "pass"
assert verdict.score == "1/1"
def test_score_calculation(self):
data = {
"criteria": [
{"criterion": "A", "verdict": "pass", "evidence": "e", "rationale": "r"},
{"criterion": "B", "verdict": "fail", "evidence": "e", "rationale": "r"},
{"criterion": "C", "verdict": "pass", "evidence": "e", "rationale": "r"},
],
"observations": [],
"summary": "Mixed results",
}
verdict = Verdict.model_validate(data)
assert verdict.score == "2/3"
assert verdict.passed is False
def test_all_pass(self):
data = {
"criteria": [
{"criterion": "A", "verdict": "pass", "evidence": "e", "rationale": "r"},
],
"observations": [],
"summary": "Good",
}
verdict = Verdict.model_validate(data)
assert verdict.passed is True
class TestCriterionResultSource:
def test_default_source_is_judge(self):
cr = CriterionResult(
criterion="test",
verdict="pass",
evidence="e",
rationale="r",
)
assert cr.source == "judge"
def test_assertion_source(self):
cr = CriterionResult(
criterion="test",
verdict="fail",
evidence="e",
rationale="r",
source="assertion",
)
assert cr.source == "assertion"
def test_backwards_compat_no_source_in_json(self):
data = {"criterion": "A", "verdict": "pass", "evidence": "e", "rationale": "r"}
cr = CriterionResult.model_validate(data)
assert cr.source == "judge"
def test_source_serializes_to_json(self):
cr = CriterionResult(
criterion="test",
verdict="pass",
evidence="e",
rationale="r",
source="assertion",
)
data = cr.model_dump()
assert data["source"] == "assertion"
class TestVerifierPrompt:
def test_builds_system_prompt(self):
verifier = Verifier(model="claude-sonnet-4-6", temperature=0.0)
prompt = verifier.build_system_prompt()
assert "criterion" in prompt.lower()
assert "evidence" in prompt.lower()
assert "JSON" in prompt