Lift drill into evals/ at 013fcb8b7dbefd6d3fa4653493e5d2ec8e7f985b

rsync of obra/drill@013fcb8b7d into superpowers/evals/, excluding
.git/, .venv/, results/, .env/, __pycache__/, *.egg-info/,
.private-journal/.

The drill repo is unaffected by this commit; archival is a separate
manual step after this PR merges.

Source SHA recorded at evals/.drill-source-sha for divergence
detection.
This commit is contained in:
Jesse Vincent
2026-05-06 12:15:46 -07:00
parent 895bb732d5
commit 3c046f579e
124 changed files with 13806 additions and 0 deletions

217
evals/tests/test_compare.py Normal file
View File

@@ -0,0 +1,217 @@
"""Tests for compare module."""
from __future__ import annotations
import json
from pathlib import Path
from drill.compare import BackendResult, format_compare_output, load_scenario_results
def _write_verdict(path: Path, criteria: list[dict[str, str]]) -> None:
verdict = {
"criteria": criteria,
"observations": ["test obs"],
"summary": "ok",
}
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(verdict))
def _write_meta(path: Path, **kwargs: object) -> None:
meta = {"scenario": "test", "backend": "claude", "actor_turns": 4, **kwargs}
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(meta))
def _write_run_group(
path: Path, n: int, runs: list[dict[str, object]], sweep_id: str = "abc12345"
) -> None:
data = {
"scenario": "test",
"backend": "claude",
"n": n,
"timestamp": "2026-04-20T14-30-00",
"sweep_id": sweep_id,
"partial": False,
"runs": runs,
}
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(data))
class TestLoadScenarioResults:
    """Tests for load_scenario_results across result-directory layouts."""

    def test_loads_new_format_single_run(self, tmp_path: Path) -> None:
        # New layout: <scenario>/<backend>/<timestamp>-<sweep_id>/run-NN/
        group_dir = tmp_path / "test-scenario" / "claude" / "2026-04-20T14-30-00-abc12345"
        _write_verdict(
            group_dir / "run-00" / "verdict.json",
            [{"criterion": "c1", "verdict": "pass", "evidence": "e", "rationale": "r"}],
        )
        _write_meta(group_dir / "run-00" / "meta.json")
        _write_run_group(
            group_dir / "run-group.json",
            n=1,
            runs=[{"index": 0, "status": "pass", "duration": 10.0}],
        )
        loaded = load_scenario_results(tmp_path / "test-scenario")
        assert "claude" in loaded
        assert loaded["claude"].total_runs == 1
        assert loaded["claude"].passed_runs == 1

    def test_loads_new_format_multi_run(self, tmp_path: Path) -> None:
        group_dir = tmp_path / "test-scenario" / "claude" / "2026-04-20T14-30-00-abc12345"
        statuses = ["pass", "pass", "fail"]
        for idx, status in enumerate(statuses):
            run_dir = group_dir / f"run-{idx:02d}"
            _write_verdict(
                run_dir / "verdict.json",
                [{"criterion": "c1", "verdict": status, "evidence": "e", "rationale": "r"}],
            )
            _write_meta(run_dir / "meta.json")
        _write_run_group(
            group_dir / "run-group.json",
            n=3,
            runs=[
                {"index": idx, "status": status, "duration": 10.0 + idx}
                for idx, status in enumerate(statuses)
            ],
        )
        loaded = load_scenario_results(tmp_path / "test-scenario")
        assert loaded["claude"].total_runs == 3
        assert loaded["claude"].passed_runs == 2
        assert len(loaded["claude"].criterion_counts) == 1
        assert loaded["claude"].criterion_counts["c1"] == (2, 3)

    def test_loads_old_format_backwards_compat(self, tmp_path: Path) -> None:
        # Old layout: verdict/meta sit directly in the timestamp directory.
        legacy_dir = tmp_path / "test-scenario" / "claude" / "2026-04-20T14-30-00"
        _write_verdict(
            legacy_dir / "verdict.json",
            [{"criterion": "c1", "verdict": "pass", "evidence": "e", "rationale": "r"}],
        )
        _write_meta(legacy_dir / "meta.json")
        loaded = load_scenario_results(tmp_path / "test-scenario")
        assert "claude" in loaded
        assert loaded["claude"].total_runs == 1
        assert loaded["claude"].passed_runs == 1

    def test_sweep_filter(self, tmp_path: Path) -> None:
        backend_dir = tmp_path / "test-scenario" / "claude"

        def make_sweep(dirname: str, sweep: str, status: str) -> None:
            # One complete single-run sweep fixture under *dirname*.
            sweep_dir = backend_dir / dirname
            _write_run_group(
                sweep_dir / "run-group.json",
                n=1,
                runs=[{"index": 0, "status": status, "duration": 10.0}],
                sweep_id=sweep,
            )
            _write_verdict(
                sweep_dir / "run-00" / "verdict.json",
                [{"criterion": "c1", "verdict": status, "evidence": "e", "rationale": "r"}],
            )
            _write_meta(sweep_dir / "run-00" / "meta.json")

        make_sweep("2026-04-20T14-30-00-aaaa1111", "aaaa1111", "pass")
        make_sweep("2026-04-20T15-00-00-bbbb2222", "bbbb2222", "fail")
        filtered_a = load_scenario_results(tmp_path / "test-scenario", sweep_id="aaaa1111")
        assert filtered_a["claude"].passed_runs == 1
        filtered_b = load_scenario_results(tmp_path / "test-scenario", sweep_id="bbbb2222")
        assert filtered_b["claude"].passed_runs == 0
class TestBackendResult:
    """Tests for the BackendResult pass_rate computation."""

    def test_pass_rate(self) -> None:
        fields: dict[str, object] = dict(
            backend="claude",
            total_runs=10,
            passed_runs=8,
            errored_runs=0,
            avg_turns=4.2,
            criterion_counts={"c1": (10, 10), "c2": (8, 10)},
            sweep_id="abc12345",
            timestamp="2026-04-20T14-30-00",
            partial=False,
        )
        assert BackendResult(**fields).pass_rate == 0.8

    def test_pass_rate_zero_runs(self) -> None:
        # Zero total runs must not divide by zero; rate is defined as 0.0.
        empty = BackendResult(
            backend="claude",
            total_runs=0,
            passed_runs=0,
            errored_runs=0,
            avg_turns=0.0,
            criterion_counts={},
            sweep_id=None,
            timestamp=None,
            partial=False,
        )
        assert empty.pass_rate == 0.0
def _make_backend_result(
    backend: str = "claude",
    total_runs: int = 10,
    passed_runs: int = 8,
    errored_runs: int = 0,
    avg_turns: float = 4.2,
    criterion_counts: dict[str, tuple[int, int]] | None = None,
    sweep_id: str | None = "abc12345",
    timestamp: str | None = "2026-04-20T14-30-00",
    partial: bool = False,
) -> BackendResult:
    """Build a BackendResult populated with sensible test defaults.

    Any field can be overridden per-test via keyword arguments. When
    *criterion_counts* is omitted, a single criterion "c1" is synthesized
    whose counts mirror passed_runs/total_runs.
    """
    # Explicit None check (not `criterion_counts or {...}`) so that a test
    # deliberately passing criterion_counts={} gets an empty mapping instead
    # of having it silently replaced by the synthesized default.
    if criterion_counts is None:
        criterion_counts = {"c1": (passed_runs, total_runs)}
    return BackendResult(
        backend=backend,
        total_runs=total_runs,
        passed_runs=passed_runs,
        errored_runs=errored_runs,
        avg_turns=avg_turns,
        criterion_counts=criterion_counts,
        sweep_id=sweep_id,
        timestamp=timestamp,
        partial=partial,
    )
class TestFormatCompareOutput:
    """Tests for format_compare_output rendering in its various modes."""

    def test_no_results(self) -> None:
        assert "No results found" in format_compare_output("test", {})

    def test_multi_run_includes_pass_rate_and_ci(self) -> None:
        rendered = format_compare_output(
            "test", {"claude": _make_backend_result(total_runs=10, passed_runs=8)}
        )
        for needle in ("Overall pass rate", "95% CI", "80.0%"):
            assert needle in rendered

    def test_multi_run_sweep_header_includes_date(self) -> None:
        rendered = format_compare_output("test", {"claude": _make_backend_result()})
        assert "Sweep: abc12345 | 2026-04-20" in rendered

    def test_single_run_simple_table(self) -> None:
        # n=1 runs render the simple table without aggregate statistics.
        single = _make_backend_result(
            total_runs=1, passed_runs=1, criterion_counts={"c1": (1, 1)}
        )
        rendered = format_compare_output("test", {"claude": single})
        assert "PASS" in rendered
        assert "Overall pass rate" not in rendered

    def test_partial_warning(self) -> None:
        rendered = format_compare_output(
            "test", {"claude": _make_backend_result(partial=True)}
        )
        lowered = rendered.lower()
        assert "incomplete" in lowered or "interrupted" in lowered

    def test_small_n_note(self) -> None:
        rendered = format_compare_output(
            "test", {"claude": _make_backend_result(total_runs=5, passed_runs=3)}
        )
        assert "--n 10+" in rendered