mirror of
https://github.com/obra/superpowers.git
synced 2026-05-10 02:59:04 +08:00
Lift drill into evals/ at 013fcb8b7dbefd6d3fa4653493e5d2ec8e7f985b
rsync of obra/drill@013fcb8b7d into superpowers/evals/, excluding .git/, .venv/, results/, .env/, __pycache__/, *.egg-info/, .private-journal/. The drill repo is unaffected by this commit; archival is a separate manual step after this PR merges. Source SHA recorded at evals/.drill-source-sha for divergence detection.
This commit is contained in:
committed by
Drew Ritter
parent
2e46e9590d
commit
3b412a3836
0
evals/tests/__init__.py
Normal file
0
evals/tests/__init__.py
Normal file
0
evals/tests/fixtures/tools_empty.jsonl
vendored
Normal file
0
evals/tests/fixtures/tools_empty.jsonl
vendored
Normal file
5
evals/tests/fixtures/tools_multi.jsonl
vendored
Normal file
5
evals/tests/fixtures/tools_multi.jsonl
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
{"tool": "Read", "args": {"file_path": "/tmp/foo.py"}, "source": "native"}
|
||||
{"tool": "Skill", "args": {"skill": "superpowers:worktree"}, "source": "native"}
|
||||
{"tool": "Edit", "args": {"file_path": "/tmp/foo.py"}, "source": "native"}
|
||||
{"tool": "Read", "args": {"file_path": "/tmp/bar.py"}, "source": "native"}
|
||||
{"tool": "Bash", "args": {"command": "git status"}, "source": "shell"}
|
||||
4
evals/tests/fixtures/tools_ordered.jsonl
vendored
Normal file
4
evals/tests/fixtures/tools_ordered.jsonl
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
{"tool": "EnterWorktree", "args": {"branch": "feature/login"}, "source": "native"}
|
||||
{"tool": "Read", "args": {"file_path": "/tmp/foo.py"}, "source": "native"}
|
||||
{"tool": "Edit", "args": {"file_path": "/tmp/foo.py"}, "source": "native"}
|
||||
{"tool": "Bash", "args": {"command": "pytest"}, "source": "shell"}
|
||||
1
evals/tests/fixtures/tools_single.jsonl
vendored
Normal file
1
evals/tests/fixtures/tools_single.jsonl
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"tool": "Read", "args": {"file_path": "/tmp/foo.py"}, "source": "native"}
|
||||
51
evals/tests/test_actor.py
Normal file
51
evals/tests/test_actor.py
Normal file
@@ -0,0 +1,51 @@
|
||||
from drill.actor import Actor, ActorAction
|
||||
|
||||
|
||||
class TestActorAction:
|
||||
def test_parse_type_action(self):
|
||||
action = ActorAction.from_tool_result({"action": "type", "text": "create a worktree"})
|
||||
assert action.action == "type"
|
||||
assert action.text == "create a worktree"
|
||||
|
||||
def test_parse_done_action(self):
|
||||
action = ActorAction.from_tool_result({"action": "done"})
|
||||
assert action.action == "done"
|
||||
|
||||
def test_parse_stuck_action(self):
|
||||
action = ActorAction.from_tool_result({"action": "stuck"})
|
||||
assert action.action == "stuck"
|
||||
|
||||
def test_parse_key_action(self):
|
||||
action = ActorAction.from_tool_result({"action": "key", "key": "ctrl-c"})
|
||||
assert action.action == "key"
|
||||
assert action.key == "ctrl-c"
|
||||
|
||||
|
||||
class TestActorPrompt:
|
||||
def test_builds_system_prompt_naive(self):
|
||||
actor = Actor(model="claude-sonnet-4-6", temperature=0.7)
|
||||
prompt = actor.build_system_prompt(
|
||||
posture="naive",
|
||||
intents=["Ask the agent to create a worktree"],
|
||||
)
|
||||
assert "plain language" in prompt.lower() or "don't know" in prompt.lower()
|
||||
assert "create a worktree" in prompt
|
||||
|
||||
def test_builds_system_prompt_spec_aware(self):
|
||||
actor = Actor(model="claude-sonnet-4-6", temperature=0.7)
|
||||
prompt = actor.build_system_prompt(
|
||||
posture="spec-aware",
|
||||
intents=["Use the worktree skill"],
|
||||
)
|
||||
assert "skill" in prompt.lower() or "convention" in prompt.lower()
|
||||
|
||||
|
||||
class TestActorContext:
|
||||
def test_appends_terminal_captures(self):
|
||||
actor = Actor(model="claude-sonnet-4-6", temperature=0.7)
|
||||
actor.append_capture("Screen 1: Welcome to Claude")
|
||||
actor.append_capture("Screen 2: ❯ ")
|
||||
messages = actor.build_messages()
|
||||
assert len(messages) == 2
|
||||
assert "Screen 1" in messages[0]["content"]
|
||||
assert "Screen 2" in messages[1]["content"]
|
||||
106
evals/tests/test_assertions.py
Normal file
106
evals/tests/test_assertions.py
Normal file
@@ -0,0 +1,106 @@
|
||||
from drill.assertions import AssertionResult, run_verify_assertions
|
||||
|
||||
|
||||
class TestAssertionResult:
|
||||
def test_passing_to_criterion_result(self):
|
||||
ar = AssertionResult(
|
||||
command="tool-called Read",
|
||||
passed=True,
|
||||
exit_code=0,
|
||||
stdout="PASS: Read called 3 time(s)",
|
||||
stderr="",
|
||||
)
|
||||
cr = ar.to_criterion_result()
|
||||
assert cr.verdict == "pass"
|
||||
assert cr.source == "assertion"
|
||||
assert "[assertion]" in cr.criterion
|
||||
assert "tool-called Read" in cr.criterion
|
||||
|
||||
def test_failing_to_criterion_result(self):
|
||||
ar = AssertionResult(
|
||||
command="tool-not-called Write",
|
||||
passed=False,
|
||||
exit_code=1,
|
||||
stdout="",
|
||||
stderr="FAIL: Write called 2 time(s)",
|
||||
)
|
||||
cr = ar.to_criterion_result()
|
||||
assert cr.verdict == "fail"
|
||||
assert cr.source == "assertion"
|
||||
assert "stderr: FAIL" in cr.evidence
|
||||
|
||||
|
||||
class TestRunVerifyAssertions:
|
||||
def test_passing_assertion(self, tmp_path):
|
||||
tc = '{"tool": "Read", "args": {}, "source": "native"}\n'
|
||||
(tmp_path / "tool_calls.jsonl").write_text(tc)
|
||||
results = run_verify_assertions(
|
||||
assertions=["grep -q Read tool_calls.jsonl"],
|
||||
results_dir=tmp_path,
|
||||
workdir=tmp_path,
|
||||
)
|
||||
assert len(results) == 1
|
||||
assert results[0].passed is True
|
||||
assert results[0].exit_code == 0
|
||||
|
||||
def test_failing_assertion(self, tmp_path):
|
||||
tc = '{"tool": "Read", "args": {}, "source": "native"}\n'
|
||||
(tmp_path / "tool_calls.jsonl").write_text(tc)
|
||||
results = run_verify_assertions(
|
||||
assertions=["grep -q NonexistentTool tool_calls.jsonl"],
|
||||
results_dir=tmp_path,
|
||||
workdir=tmp_path,
|
||||
)
|
||||
assert len(results) == 1
|
||||
assert results[0].passed is False
|
||||
|
||||
def test_runs_all_assertions(self, tmp_path):
|
||||
(tmp_path / "tool_calls.jsonl").write_text('{"tool": "Read"}\n')
|
||||
results = run_verify_assertions(
|
||||
assertions=[
|
||||
"grep -q Read tool_calls.jsonl",
|
||||
"grep -q Write tool_calls.jsonl",
|
||||
"grep -q Read tool_calls.jsonl",
|
||||
],
|
||||
results_dir=tmp_path,
|
||||
workdir=tmp_path,
|
||||
)
|
||||
assert len(results) == 3
|
||||
assert results[0].passed is True
|
||||
assert results[1].passed is False
|
||||
assert results[2].passed is True
|
||||
|
||||
def test_timeout_handling(self, tmp_path):
|
||||
(tmp_path / "tool_calls.jsonl").write_text("{}\n")
|
||||
results = run_verify_assertions(
|
||||
assertions=["sleep 30"],
|
||||
results_dir=tmp_path,
|
||||
workdir=tmp_path,
|
||||
timeout_seconds=1,
|
||||
)
|
||||
assert len(results) == 1
|
||||
assert results[0].passed is False
|
||||
assert results[0].exit_code == 124
|
||||
assert "Timed out" in results[0].stderr
|
||||
|
||||
def test_drill_workdir_env_var(self, tmp_path):
|
||||
(tmp_path / "tool_calls.jsonl").write_text("{}\n")
|
||||
workdir = tmp_path / "scenario-workdir"
|
||||
workdir.mkdir()
|
||||
results = run_verify_assertions(
|
||||
assertions=['test "$DRILL_WORKDIR" = "' + str(workdir) + '"'],
|
||||
results_dir=tmp_path,
|
||||
workdir=workdir,
|
||||
)
|
||||
assert len(results) == 1
|
||||
assert results[0].passed is True
|
||||
|
||||
def test_bin_dir_on_path(self, tmp_path):
|
||||
(tmp_path / "tool_calls.jsonl").write_text("{}\n")
|
||||
results = run_verify_assertions(
|
||||
assertions=["echo $PATH | grep -q bin"],
|
||||
results_dir=tmp_path,
|
||||
workdir=tmp_path,
|
||||
)
|
||||
assert len(results) == 1
|
||||
assert results[0].passed is True
|
||||
145
evals/tests/test_backend.py
Normal file
145
evals/tests/test_backend.py
Normal file
@@ -0,0 +1,145 @@
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from drill.backend import Backend, load_backend
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def backends_dir():
|
||||
return Path(__file__).parent.parent / "backends"
|
||||
|
||||
|
||||
class TestLoadBackend:
|
||||
def test_loads_claude_backend(self, backends_dir):
|
||||
backend = load_backend("claude", backends_dir)
|
||||
assert backend.name == "claude"
|
||||
assert backend.cli == "claude"
|
||||
assert "--dangerously-skip-permissions" in backend.args
|
||||
|
||||
def test_loads_codex_backend(self, backends_dir):
|
||||
backend = load_backend("codex", backends_dir)
|
||||
assert backend.name == "codex"
|
||||
assert backend.cli == "codex"
|
||||
|
||||
def test_unknown_backend_raises(self, backends_dir):
|
||||
with pytest.raises(FileNotFoundError):
|
||||
load_backend("nonexistent", backends_dir)
|
||||
|
||||
def test_loads_claude_opus_4_6_variant(self, backends_dir, monkeypatch):
|
||||
monkeypatch.setenv("SUPERPOWERS_ROOT", "/tmp/sp")
|
||||
backend = load_backend("claude-opus-4-6", backends_dir)
|
||||
assert backend.name == "claude-opus-4-6"
|
||||
assert backend.family == "claude"
|
||||
assert backend.model == "claude-opus-4-6"
|
||||
|
||||
|
||||
class TestBackendBuildCommand:
|
||||
def test_claude_build_command(self, backends_dir, monkeypatch):
|
||||
monkeypatch.setenv("SUPERPOWERS_ROOT", "/tmp/superpowers")
|
||||
backend = load_backend("claude", backends_dir)
|
||||
cmd = backend.build_command("/tmp/workdir")
|
||||
assert cmd[0] == "claude"
|
||||
assert "--plugin-dir" in cmd
|
||||
assert "/tmp/superpowers" in cmd
|
||||
|
||||
def test_codex_build_command(self, backends_dir, monkeypatch):
|
||||
monkeypatch.setenv("SUPERPOWERS_ROOT", "/tmp/superpowers")
|
||||
backend = load_backend("codex", backends_dir)
|
||||
cmd = backend.build_command("/tmp/workdir")
|
||||
assert cmd[0] == "codex"
|
||||
|
||||
|
||||
class TestBackendEnvValidation:
|
||||
def test_missing_env_raises(self, backends_dir, monkeypatch):
|
||||
monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
|
||||
monkeypatch.delenv("SUPERPOWERS_ROOT", raising=False)
|
||||
backend = load_backend("claude", backends_dir)
|
||||
with pytest.raises(EnvironmentError, match="ANTHROPIC_API_KEY"):
|
||||
backend.validate_env()
|
||||
|
||||
|
||||
class TestBackendIdleDetection:
|
||||
def test_ready_pattern_matches(self, backends_dir):
|
||||
backend = load_backend("claude", backends_dir)
|
||||
assert backend.is_ready_line("❯ ")
|
||||
assert backend.is_ready_line("Human: ")
|
||||
assert not backend.is_ready_line("Running tool...")
|
||||
|
||||
|
||||
class TestBackendModelExtraction:
|
||||
def test_extract_model_from_args(self, backends_dir, monkeypatch):
|
||||
monkeypatch.setenv("SUPERPOWERS_ROOT", "/tmp/sp")
|
||||
backend = load_backend("claude", backends_dir)
|
||||
assert backend.model == "opus"
|
||||
|
||||
def test_no_model_flag_returns_none(self):
|
||||
backend = Backend(
|
||||
name="test",
|
||||
cli="test",
|
||||
args=["--foo", "bar"],
|
||||
required_env=[],
|
||||
hooks={"pre_run": [], "post_run": []},
|
||||
shutdown="/exit",
|
||||
idle={},
|
||||
startup_timeout=30,
|
||||
terminal={},
|
||||
session_logs={},
|
||||
)
|
||||
assert backend.model is None
|
||||
|
||||
def test_extracts_from_short_m_flag(self):
|
||||
backend = Backend(
|
||||
name="test",
|
||||
cli="test",
|
||||
args=["-m", "gemini-2.5-flash"],
|
||||
required_env=[],
|
||||
hooks={"pre_run": [], "post_run": []},
|
||||
shutdown="/exit",
|
||||
idle={},
|
||||
startup_timeout=30,
|
||||
terminal={},
|
||||
session_logs={},
|
||||
)
|
||||
assert backend.model == "gemini-2.5-flash"
|
||||
|
||||
|
||||
class TestBackendFamily:
|
||||
def test_claude_backend_family(self, backends_dir, monkeypatch):
|
||||
monkeypatch.setenv("SUPERPOWERS_ROOT", "/tmp/sp")
|
||||
backend = load_backend("claude", backends_dir)
|
||||
assert backend.family == "claude"
|
||||
|
||||
def test_codex_backend_family(self, backends_dir):
|
||||
backend = load_backend("codex", backends_dir)
|
||||
assert backend.family == "codex"
|
||||
|
||||
def test_variant_name_preserves_family(self):
|
||||
backend = Backend(
|
||||
name="claude-opus-4-6",
|
||||
cli="claude",
|
||||
args=[],
|
||||
required_env=[],
|
||||
hooks={"pre_run": [], "post_run": []},
|
||||
shutdown="/exit",
|
||||
idle={},
|
||||
startup_timeout=30,
|
||||
terminal={},
|
||||
session_logs={},
|
||||
)
|
||||
assert backend.family == "claude"
|
||||
|
||||
def test_unknown_family_is_other(self):
|
||||
backend = Backend(
|
||||
name="random-xyz",
|
||||
cli="xyz",
|
||||
args=[],
|
||||
required_env=[],
|
||||
hooks={"pre_run": [], "post_run": []},
|
||||
shutdown="/exit",
|
||||
idle={},
|
||||
startup_timeout=30,
|
||||
terminal={},
|
||||
session_logs={},
|
||||
)
|
||||
assert backend.family == "other"
|
||||
61
evals/tests/test_cli.py
Normal file
61
evals/tests/test_cli.py
Normal file
@@ -0,0 +1,61 @@
|
||||
"""Tests for CLI option parsing."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from click.testing import CliRunner
|
||||
|
||||
from drill.cli import main
|
||||
|
||||
|
||||
class TestRunCommand:
|
||||
def test_backend_required_without_models(self) -> None:
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(main, ["run", "nonexistent"])
|
||||
assert result.exit_code != 0
|
||||
|
||||
def test_n_default_is_1(self) -> None:
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(main, ["run", "nonexistent", "--backend", "claude", "--n", "1"])
|
||||
assert "Scenario not found" in result.output or result.exit_code != 0
|
||||
|
||||
def test_models_flag_accepted(self) -> None:
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(main, ["run", "nonexistent", "--models", "claude,codex"])
|
||||
assert "Scenario not found" in result.output or result.exit_code != 0
|
||||
|
||||
def test_n_must_be_positive(self) -> None:
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(main, ["run", "nonexistent", "--backend", "claude", "--n", "0"])
|
||||
assert result.exit_code != 0
|
||||
|
||||
|
||||
class TestListCommand:
|
||||
def test_lists_scenarios(self, tmp_path):
|
||||
scenarios_dir = tmp_path / "scenarios"
|
||||
scenarios_dir.mkdir()
|
||||
(scenarios_dir / "test-scenario.yaml").write_text("""
|
||||
scenario: test-scenario
|
||||
description: "A test scenario"
|
||||
user_posture: naive
|
||||
setup:
|
||||
helpers: []
|
||||
assertions: []
|
||||
turns: []
|
||||
limits:
|
||||
max_turns: 5
|
||||
turn_timeout: 30
|
||||
verify:
|
||||
criteria: []
|
||||
observe: false
|
||||
""")
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(main, ["list", "--scenarios-dir", str(scenarios_dir)])
|
||||
assert result.exit_code == 0
|
||||
assert "test-scenario" in result.output
|
||||
|
||||
|
||||
class TestCompareCommand:
|
||||
def test_sweep_flag_accepted(self) -> None:
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(main, ["compare", "nonexistent", "--sweep", "abc123"])
|
||||
assert result.exit_code != 0 # No results dir, but flag is parsed
|
||||
217
evals/tests/test_compare.py
Normal file
217
evals/tests/test_compare.py
Normal file
@@ -0,0 +1,217 @@
|
||||
"""Tests for compare module."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from drill.compare import BackendResult, format_compare_output, load_scenario_results
|
||||
|
||||
|
||||
def _write_verdict(path: Path, criteria: list[dict[str, str]]) -> None:
|
||||
verdict = {
|
||||
"criteria": criteria,
|
||||
"observations": ["test obs"],
|
||||
"summary": "ok",
|
||||
}
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
path.write_text(json.dumps(verdict))
|
||||
|
||||
|
||||
def _write_meta(path: Path, **kwargs: object) -> None:
|
||||
meta = {"scenario": "test", "backend": "claude", "actor_turns": 4, **kwargs}
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
path.write_text(json.dumps(meta))
|
||||
|
||||
|
||||
def _write_run_group(
|
||||
path: Path, n: int, runs: list[dict[str, object]], sweep_id: str = "abc12345"
|
||||
) -> None:
|
||||
data = {
|
||||
"scenario": "test",
|
||||
"backend": "claude",
|
||||
"n": n,
|
||||
"timestamp": "2026-04-20T14-30-00",
|
||||
"sweep_id": sweep_id,
|
||||
"partial": False,
|
||||
"runs": runs,
|
||||
}
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
path.write_text(json.dumps(data))
|
||||
|
||||
|
||||
class TestLoadScenarioResults:
|
||||
def test_loads_new_format_single_run(self, tmp_path: Path) -> None:
|
||||
scenario_dir = tmp_path / "test-scenario" / "claude" / "2026-04-20T14-30-00-abc12345"
|
||||
run_dir = scenario_dir / "run-00"
|
||||
criteria = [{"criterion": "c1", "verdict": "pass", "evidence": "e", "rationale": "r"}]
|
||||
_write_verdict(run_dir / "verdict.json", criteria)
|
||||
_write_meta(run_dir / "meta.json")
|
||||
_write_run_group(
|
||||
scenario_dir / "run-group.json",
|
||||
n=1,
|
||||
runs=[{"index": 0, "status": "pass", "duration": 10.0}],
|
||||
)
|
||||
results = load_scenario_results(tmp_path / "test-scenario")
|
||||
assert "claude" in results
|
||||
assert results["claude"].total_runs == 1
|
||||
assert results["claude"].passed_runs == 1
|
||||
|
||||
def test_loads_new_format_multi_run(self, tmp_path: Path) -> None:
|
||||
scenario_dir = tmp_path / "test-scenario" / "claude" / "2026-04-20T14-30-00-abc12345"
|
||||
for i in range(3):
|
||||
run_dir = scenario_dir / f"run-{i:02d}"
|
||||
verdict_val = "pass" if i < 2 else "fail"
|
||||
criteria = [
|
||||
{"criterion": "c1", "verdict": verdict_val, "evidence": "e", "rationale": "r"}
|
||||
]
|
||||
_write_verdict(run_dir / "verdict.json", criteria)
|
||||
_write_meta(run_dir / "meta.json")
|
||||
_write_run_group(
|
||||
scenario_dir / "run-group.json",
|
||||
n=3,
|
||||
runs=[
|
||||
{"index": 0, "status": "pass", "duration": 10.0},
|
||||
{"index": 1, "status": "pass", "duration": 11.0},
|
||||
{"index": 2, "status": "fail", "duration": 12.0},
|
||||
],
|
||||
)
|
||||
results = load_scenario_results(tmp_path / "test-scenario")
|
||||
assert results["claude"].total_runs == 3
|
||||
assert results["claude"].passed_runs == 2
|
||||
assert len(results["claude"].criterion_counts) == 1
|
||||
assert results["claude"].criterion_counts["c1"] == (2, 3)
|
||||
|
||||
def test_loads_old_format_backwards_compat(self, tmp_path: Path) -> None:
|
||||
scenario_dir = tmp_path / "test-scenario" / "claude" / "2026-04-20T14-30-00"
|
||||
criteria = [{"criterion": "c1", "verdict": "pass", "evidence": "e", "rationale": "r"}]
|
||||
_write_verdict(scenario_dir / "verdict.json", criteria)
|
||||
_write_meta(scenario_dir / "meta.json")
|
||||
results = load_scenario_results(tmp_path / "test-scenario")
|
||||
assert "claude" in results
|
||||
assert results["claude"].total_runs == 1
|
||||
assert results["claude"].passed_runs == 1
|
||||
|
||||
def test_sweep_filter(self, tmp_path: Path) -> None:
|
||||
base = tmp_path / "test-scenario" / "claude"
|
||||
# Sweep A
|
||||
dir_a = base / "2026-04-20T14-30-00-aaaa1111"
|
||||
_write_run_group(
|
||||
dir_a / "run-group.json",
|
||||
n=1,
|
||||
runs=[{"index": 0, "status": "pass", "duration": 10.0}],
|
||||
sweep_id="aaaa1111",
|
||||
)
|
||||
criteria = [{"criterion": "c1", "verdict": "pass", "evidence": "e", "rationale": "r"}]
|
||||
_write_verdict(dir_a / "run-00" / "verdict.json", criteria)
|
||||
_write_meta(dir_a / "run-00" / "meta.json")
|
||||
# Sweep B
|
||||
dir_b = base / "2026-04-20T15-00-00-bbbb2222"
|
||||
_write_run_group(
|
||||
dir_b / "run-group.json",
|
||||
n=1,
|
||||
runs=[{"index": 0, "status": "fail", "duration": 10.0}],
|
||||
sweep_id="bbbb2222",
|
||||
)
|
||||
criteria_b = [{"criterion": "c1", "verdict": "fail", "evidence": "e", "rationale": "r"}]
|
||||
_write_verdict(dir_b / "run-00" / "verdict.json", criteria_b)
|
||||
_write_meta(dir_b / "run-00" / "meta.json")
|
||||
|
||||
results_a = load_scenario_results(tmp_path / "test-scenario", sweep_id="aaaa1111")
|
||||
assert results_a["claude"].passed_runs == 1
|
||||
results_b = load_scenario_results(tmp_path / "test-scenario", sweep_id="bbbb2222")
|
||||
assert results_b["claude"].passed_runs == 0
|
||||
|
||||
|
||||
class TestBackendResult:
|
||||
def test_pass_rate(self) -> None:
|
||||
br = BackendResult(
|
||||
backend="claude",
|
||||
total_runs=10,
|
||||
passed_runs=8,
|
||||
errored_runs=0,
|
||||
avg_turns=4.2,
|
||||
criterion_counts={"c1": (10, 10), "c2": (8, 10)},
|
||||
sweep_id="abc12345",
|
||||
timestamp="2026-04-20T14-30-00",
|
||||
partial=False,
|
||||
)
|
||||
assert br.pass_rate == 0.8
|
||||
|
||||
def test_pass_rate_zero_runs(self) -> None:
|
||||
br = BackendResult(
|
||||
backend="claude",
|
||||
total_runs=0,
|
||||
passed_runs=0,
|
||||
errored_runs=0,
|
||||
avg_turns=0.0,
|
||||
criterion_counts={},
|
||||
sweep_id=None,
|
||||
timestamp=None,
|
||||
partial=False,
|
||||
)
|
||||
assert br.pass_rate == 0.0
|
||||
|
||||
|
||||
def _make_backend_result(
|
||||
backend: str = "claude",
|
||||
total_runs: int = 10,
|
||||
passed_runs: int = 8,
|
||||
errored_runs: int = 0,
|
||||
avg_turns: float = 4.2,
|
||||
criterion_counts: dict[str, tuple[int, int]] | None = None,
|
||||
sweep_id: str | None = "abc12345",
|
||||
timestamp: str | None = "2026-04-20T14-30-00",
|
||||
partial: bool = False,
|
||||
) -> BackendResult:
|
||||
return BackendResult(
|
||||
backend=backend,
|
||||
total_runs=total_runs,
|
||||
passed_runs=passed_runs,
|
||||
errored_runs=errored_runs,
|
||||
avg_turns=avg_turns,
|
||||
criterion_counts=criterion_counts or {"c1": (passed_runs, total_runs)},
|
||||
sweep_id=sweep_id,
|
||||
timestamp=timestamp,
|
||||
partial=partial,
|
||||
)
|
||||
|
||||
|
||||
class TestFormatCompareOutput:
|
||||
def test_no_results(self) -> None:
|
||||
output = format_compare_output("test", {})
|
||||
assert "No results found" in output
|
||||
|
||||
def test_multi_run_includes_pass_rate_and_ci(self) -> None:
|
||||
results = {"claude": _make_backend_result(total_runs=10, passed_runs=8)}
|
||||
output = format_compare_output("test", results)
|
||||
assert "Overall pass rate" in output
|
||||
assert "95% CI" in output
|
||||
assert "80.0%" in output
|
||||
|
||||
def test_multi_run_sweep_header_includes_date(self) -> None:
|
||||
results = {"claude": _make_backend_result()}
|
||||
output = format_compare_output("test", results)
|
||||
assert "Sweep: abc12345 | 2026-04-20" in output
|
||||
|
||||
def test_single_run_simple_table(self) -> None:
|
||||
results = {
|
||||
"claude": _make_backend_result(
|
||||
total_runs=1,
|
||||
passed_runs=1,
|
||||
criterion_counts={"c1": (1, 1)},
|
||||
)
|
||||
}
|
||||
output = format_compare_output("test", results)
|
||||
assert "PASS" in output
|
||||
assert "Overall pass rate" not in output
|
||||
|
||||
def test_partial_warning(self) -> None:
|
||||
results = {"claude": _make_backend_result(partial=True)}
|
||||
output = format_compare_output("test", results)
|
||||
assert "incomplete" in output.lower() or "interrupted" in output.lower()
|
||||
|
||||
def test_small_n_note(self) -> None:
|
||||
results = {"claude": _make_backend_result(total_runs=5, passed_runs=3)}
|
||||
output = format_compare_output("test", results)
|
||||
assert "--n 10+" in output
|
||||
94
evals/tests/test_e2e.py
Normal file
94
evals/tests/test_e2e.py
Normal file
@@ -0,0 +1,94 @@
|
||||
"""End-to-end smoke test using a mock 'bash' backend."""
|
||||
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from drill.engine import Engine, ScenarioConfig
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_scenario(tmp_path):
|
||||
scenario = tmp_path / "test-scenario.yaml"
|
||||
scenario.write_text("""
|
||||
scenario: e2e-smoke-test
|
||||
description: "Smoke test"
|
||||
user_posture: naive
|
||||
setup:
|
||||
helpers:
|
||||
- create_base_repo
|
||||
assertions:
|
||||
- "git rev-parse --is-inside-work-tree"
|
||||
turns:
|
||||
- intent: "List files in the current directory"
|
||||
limits:
|
||||
max_turns: 3
|
||||
turn_timeout: 10
|
||||
verify:
|
||||
criteria:
|
||||
- "Agent listed the files"
|
||||
observe: true
|
||||
""")
|
||||
return scenario
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_backend(tmp_path):
|
||||
backend_dir = tmp_path / "backends"
|
||||
backend_dir.mkdir()
|
||||
(backend_dir / "mock.yaml").write_text("""
|
||||
name: mock
|
||||
cli: bash
|
||||
args: []
|
||||
required_env: []
|
||||
hooks:
|
||||
pre_run: []
|
||||
post_run: []
|
||||
shutdown: "exit"
|
||||
idle:
|
||||
quiescence_seconds: 1
|
||||
ready_pattern: "\\\\$"
|
||||
startup_timeout: 5
|
||||
terminal:
|
||||
cols: 80
|
||||
rows: 24
|
||||
session_logs:
|
||||
pattern: ""
|
||||
""")
|
||||
return backend_dir
|
||||
|
||||
|
||||
class TestE2ESmoke:
|
||||
def test_scenario_config_loads(self, mock_scenario):
|
||||
config = ScenarioConfig.from_yaml(mock_scenario)
|
||||
assert config.scenario == "e2e-smoke-test"
|
||||
|
||||
def test_engine_setup_works(self, mock_scenario, mock_backend):
|
||||
fixtures_dir = Path(__file__).parent.parent / "fixtures"
|
||||
engine = Engine(
|
||||
scenario_path=mock_scenario,
|
||||
backend_name="mock",
|
||||
backends_dir=mock_backend,
|
||||
fixtures_dir=fixtures_dir,
|
||||
results_dir=Path("/tmp/drill-test-results"),
|
||||
)
|
||||
workdir = Path("/tmp/drill-e2e-smoke")
|
||||
if workdir.exists():
|
||||
shutil.rmtree(workdir)
|
||||
engine._setup(workdir)
|
||||
assert (workdir / "package.json").exists()
|
||||
assert (workdir / "src" / "index.js").exists()
|
||||
# Verify git state
|
||||
import subprocess
|
||||
|
||||
result = subprocess.run(
|
||||
["git", "branch", "--show-current"], cwd=workdir, capture_output=True, text=True
|
||||
)
|
||||
assert result.stdout.strip() == "main"
|
||||
result = subprocess.run(
|
||||
["git", "log", "--oneline"], cwd=workdir, capture_output=True, text=True
|
||||
)
|
||||
assert "initial commit" in result.stdout
|
||||
# Cleanup
|
||||
shutil.rmtree(workdir, ignore_errors=True)
|
||||
173
evals/tests/test_engine.py
Normal file
173
evals/tests/test_engine.py
Normal file
@@ -0,0 +1,173 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from drill.engine import RunResult, ScenarioConfig, VerifyConfig, snapshot_filesystem
|
||||
|
||||
|
||||
class TestVerifyConfig:
|
||||
def test_defaults(self):
|
||||
vc = VerifyConfig()
|
||||
assert vc.criteria == []
|
||||
assert vc.assertions == []
|
||||
assert vc.observe is False
|
||||
|
||||
def test_from_dict(self):
|
||||
vc = VerifyConfig(
|
||||
criteria=["test criterion"],
|
||||
assertions=["tool-called Read"],
|
||||
observe=True,
|
||||
)
|
||||
assert len(vc.criteria) == 1
|
||||
assert len(vc.assertions) == 1
|
||||
assert vc.observe is True
|
||||
|
||||
|
||||
class TestScenarioConfig:
|
||||
def test_loads_from_yaml(self, tmp_path):
|
||||
scenario_file = tmp_path / "test.yaml"
|
||||
scenario_file.write_text("""
|
||||
scenario: test-scenario
|
||||
description: "A test"
|
||||
user_posture: naive
|
||||
setup:
|
||||
helpers:
|
||||
- create_base_repo
|
||||
assertions:
|
||||
- "git rev-parse --is-inside-work-tree"
|
||||
turns:
|
||||
- intent: "Do the thing"
|
||||
limits:
|
||||
max_turns: 10
|
||||
turn_timeout: 60
|
||||
verify:
|
||||
criteria:
|
||||
- "Thing was done"
|
||||
assertions:
|
||||
- "tool-called Bash"
|
||||
observe: true
|
||||
""")
|
||||
config = ScenarioConfig.from_yaml(scenario_file)
|
||||
assert config.scenario == "test-scenario"
|
||||
assert config.user_posture == "naive"
|
||||
assert config.limits["max_turns"] == 10
|
||||
assert len(config.turns) == 1
|
||||
assert len(config.verify.criteria) == 1
|
||||
assert len(config.verify.assertions) == 1
|
||||
assert config.verify.observe is True
|
||||
|
||||
def test_loads_without_assertions(self, tmp_path):
|
||||
scenario_file = tmp_path / "test.yaml"
|
||||
scenario_file.write_text("""
|
||||
scenario: minimal
|
||||
verify:
|
||||
criteria:
|
||||
- "Something happened"
|
||||
""")
|
||||
config = ScenarioConfig.from_yaml(scenario_file)
|
||||
assert config.verify.assertions == []
|
||||
assert config.verify.observe is False
|
||||
|
||||
def test_loads_without_verify(self, tmp_path):
|
||||
scenario_file = tmp_path / "test.yaml"
|
||||
scenario_file.write_text("""
|
||||
scenario: bare-minimum
|
||||
""")
|
||||
config = ScenarioConfig.from_yaml(scenario_file)
|
||||
assert config.verify.criteria == []
|
||||
assert config.verify.assertions == []
|
||||
|
||||
|
||||
class TestSnapshotFilesystem:
|
||||
def test_captures_git_state(self, tmp_path):
|
||||
subprocess.run(["git", "init", "-b", "main"], cwd=tmp_path, capture_output=True)
|
||||
subprocess.run(
|
||||
["git", "commit", "--allow-empty", "-m", "init"], cwd=tmp_path, capture_output=True
|
||||
)
|
||||
snapshot = snapshot_filesystem(tmp_path)
|
||||
data = json.loads(snapshot)
|
||||
assert "git_status" in data
|
||||
assert "branch" in data
|
||||
assert "worktree_list" in data
|
||||
assert "files" in data
|
||||
|
||||
|
||||
class TestRunResult:
|
||||
def test_serializes_to_dir(self, tmp_path):
|
||||
result = RunResult(
|
||||
scenario="test",
|
||||
backend="claude",
|
||||
timestamp="2026-04-07T14-30-00",
|
||||
session_log="session output here",
|
||||
filesystem_json='{"files": []}',
|
||||
tool_calls_jsonl='{"tool": "Bash"}\n',
|
||||
verdict_json='{"criteria": [], "observations": [], "summary": "ok"}',
|
||||
meta={"backend": "claude", "duration_seconds": 42, "actor_turns": 5},
|
||||
)
|
||||
result.save(tmp_path)
|
||||
assert (tmp_path / "session.log").read_text() == "session output here"
|
||||
assert (tmp_path / "filesystem.json").exists()
|
||||
assert (tmp_path / "tool_calls.jsonl").exists()
|
||||
assert (tmp_path / "verdict.json").exists()
|
||||
assert (tmp_path / "meta.json").exists()
|
||||
|
||||
|
||||
class TestEngineAssertionIntegration:
|
||||
def test_run_result_save_splits_artifacts_and_verdict(self, tmp_path):
|
||||
result = RunResult(
|
||||
scenario="test",
|
||||
backend="claude",
|
||||
timestamp="2026-04-20T10-00-00",
|
||||
session_log="log here",
|
||||
filesystem_json='{"files": []}',
|
||||
tool_calls_jsonl='{"tool": "Bash"}\n',
|
||||
verdict_json='{"criteria": [], "observations": [], "summary": "ok"}',
|
||||
meta={"backend": "claude"},
|
||||
)
|
||||
result.save_artifacts(tmp_path)
|
||||
assert (tmp_path / "session.log").exists()
|
||||
assert (tmp_path / "filesystem.json").exists()
|
||||
assert (tmp_path / "tool_calls.jsonl").exists()
|
||||
assert not (tmp_path / "verdict.json").exists()
|
||||
assert not (tmp_path / "meta.json").exists()
|
||||
|
||||
result.save_verdict(tmp_path)
|
||||
assert (tmp_path / "verdict.json").exists()
|
||||
assert (tmp_path / "meta.json").exists()
|
||||
|
||||
|
||||
class TestEngineRunParams:
    """RunResult.save must accept arbitrary, possibly deeply nested, output dirs."""

    @staticmethod
    def _make_result() -> RunResult:
        # Minimal valid RunResult; the content is irrelevant to path handling.
        return RunResult(
            scenario="test",
            backend="claude",
            timestamp="2026-04-20T10-00-00",
            session_log="log",
            filesystem_json='{"files": []}',
            tool_calls_jsonl='{"tool": "Bash"}\n',
            verdict_json='{"criteria": [], "observations": [], "summary": "ok"}',
            meta={"backend": "claude"},
        )

    def test_run_result_uses_custom_output_dir(self, tmp_path: Path) -> None:
        """Saving to a caller-chosen directory writes all files there."""
        target = tmp_path / "custom" / "run-00"
        self._make_result().save(target)
        assert (target / "session.log").read_text() == "log"
        assert (target / "verdict.json").exists()
        assert (target / "meta.json").exists()

    def test_run_result_nested_dir_created(self, tmp_path: Path) -> None:
        """save creates intermediate directories that do not yet exist."""
        target = tmp_path / "a" / "b" / "c" / "run-05"
        self._make_result().save(target)
        assert target.exists()
        assert (target / "session.log").exists()
|
||||
126
evals/tests/test_helpers.py
Normal file
126
evals/tests/test_helpers.py
Normal file
@@ -0,0 +1,126 @@
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
BIN_DIR = Path(__file__).parent.parent / "bin"
|
||||
FIXTURES_DIR = Path(__file__).parent / "fixtures"
|
||||
|
||||
|
||||
def run_helper(name: str, args: list[str], cwd: Path) -> subprocess.CompletedProcess[str]:
    """Invoke an assertion helper from bin/ with *args*, capturing text output.

    The helper inspects files (e.g. tool_calls.jsonl) relative to *cwd*.
    """
    command = [str(BIN_DIR / name), *args]
    return subprocess.run(command, cwd=cwd, capture_output=True, text=True)
|
||||
|
||||
|
||||
class TestToolCalled:
    """bin/tool-called exits 0 iff the named tool appears in tool_calls.jsonl."""

    def _seed(self, tmp_path, text=None):
        # Write the tool-call log the helper will inspect in its cwd.
        if text is None:
            text = (FIXTURES_DIR / "tools_multi.jsonl").read_text()
        (tmp_path / "tool_calls.jsonl").write_text(text)

    def test_tool_present(self, tmp_path):
        self._seed(tmp_path)
        proc = run_helper("tool-called", ["Read"], tmp_path)
        assert proc.returncode == 0

    def test_tool_absent(self, tmp_path):
        self._seed(tmp_path)
        proc = run_helper("tool-called", ["Write"], tmp_path)
        assert proc.returncode == 1
        assert "FAIL" in proc.stdout

    def test_empty_jsonl(self, tmp_path):
        # No recorded calls at all → the tool was certainly not called.
        self._seed(tmp_path, text="")
        proc = run_helper("tool-called", ["Read"], tmp_path)
        assert proc.returncode == 1
|
||||
|
||||
|
||||
class TestToolNotCalled:
    """bin/tool-not-called is the inverse check: success when the tool is absent."""

    def _seed(self, tmp_path, text=None):
        if text is None:
            text = (FIXTURES_DIR / "tools_multi.jsonl").read_text()
        (tmp_path / "tool_calls.jsonl").write_text(text)

    def test_tool_absent(self, tmp_path):
        self._seed(tmp_path)
        proc = run_helper("tool-not-called", ["Write"], tmp_path)
        assert proc.returncode == 0

    def test_tool_present(self, tmp_path):
        self._seed(tmp_path)
        proc = run_helper("tool-not-called", ["Read"], tmp_path)
        assert proc.returncode == 1
        assert "FAIL" in proc.stdout

    def test_empty_jsonl(self, tmp_path):
        # An empty log trivially satisfies "tool not called".
        self._seed(tmp_path, text="")
        proc = run_helper("tool-not-called", ["Read"], tmp_path)
        assert proc.returncode == 0
|
||||
|
||||
|
||||
class TestToolCount:
    """bin/tool-count compares the call count of a tool against a threshold."""

    def _seed(self, tmp_path):
        text = (FIXTURES_DIR / "tools_multi.jsonl").read_text()
        (tmp_path / "tool_calls.jsonl").write_text(text)

    def test_gte_passes(self, tmp_path):
        # Fixture contains exactly two Read calls, so gte 2 holds.
        self._seed(tmp_path)
        proc = run_helper("tool-count", ["Read", "gte", "2"], tmp_path)
        assert proc.returncode == 0

    def test_gte_fails(self, tmp_path):
        self._seed(tmp_path)
        proc = run_helper("tool-count", ["Read", "gte", "5"], tmp_path)
        assert proc.returncode == 1
        assert "FAIL" in proc.stdout

    def test_eq(self, tmp_path):
        self._seed(tmp_path)
        proc = run_helper("tool-count", ["Read", "eq", "2"], tmp_path)
        assert proc.returncode == 0

    def test_lt(self, tmp_path):
        self._seed(tmp_path)
        proc = run_helper("tool-count", ["Read", "lt", "3"], tmp_path)
        assert proc.returncode == 0
|
||||
|
||||
|
||||
class TestToolBefore:
    """bin/tool-before checks that the first tool's call precedes the second's."""

    def _seed(self, tmp_path):
        text = (FIXTURES_DIR / "tools_ordered.jsonl").read_text()
        (tmp_path / "tool_calls.jsonl").write_text(text)

    def test_correct_order(self, tmp_path):
        self._seed(tmp_path)
        proc = run_helper("tool-before", ["Read", "Edit"], tmp_path)
        assert proc.returncode == 0

    def test_wrong_order(self, tmp_path):
        self._seed(tmp_path)
        proc = run_helper("tool-before", ["Edit", "EnterWorktree"], tmp_path)
        assert proc.returncode == 1
        assert "FAIL" in proc.stdout

    def test_first_tool_missing(self, tmp_path):
        # The helper must distinguish "never called" from "wrong order".
        self._seed(tmp_path)
        proc = run_helper("tool-before", ["Write", "Read"], tmp_path)
        assert proc.returncode == 1
        assert "never called" in proc.stdout

    def test_second_tool_missing(self, tmp_path):
        self._seed(tmp_path)
        proc = run_helper("tool-before", ["Read", "Write"], tmp_path)
        assert proc.returncode == 1
        assert "never called" in proc.stdout
|
||||
|
||||
|
||||
class TestToolArgMatch:
    """bin/tool-arg-match applies a jq-style predicate to a tool's recorded args."""

    def _seed(self, tmp_path):
        text = (FIXTURES_DIR / "tools_multi.jsonl").read_text()
        (tmp_path / "tool_calls.jsonl").write_text(text)

    def test_matching_arg(self, tmp_path):
        self._seed(tmp_path)
        proc = run_helper(
            "tool-arg-match", ["Skill", '.skill == "superpowers:worktree"'], tmp_path
        )
        assert proc.returncode == 0

    def test_no_matching_arg(self, tmp_path):
        self._seed(tmp_path)
        proc = run_helper("tool-arg-match", ["Skill", '.skill == "nonexistent"'], tmp_path)
        assert proc.returncode == 1
        assert "FAIL" in proc.stdout

    def test_tool_not_present(self, tmp_path):
        # Predicate never evaluated when the tool itself was not called.
        self._seed(tmp_path)
        proc = run_helper("tool-arg-match", ["Write", '.file_path == "/tmp/foo"'], tmp_path)
        assert proc.returncode == 1
|
||||
179
evals/tests/test_normalizer.py
Normal file
179
evals/tests/test_normalizer.py
Normal file
@@ -0,0 +1,179 @@
|
||||
import json
|
||||
|
||||
from drill.normalizer import (
|
||||
collect_new_logs,
|
||||
filter_codex_logs_by_cwd,
|
||||
normalize_claude_logs,
|
||||
normalize_codex_logs,
|
||||
normalize_gemini_logs,
|
||||
snapshot_log_dir,
|
||||
)
|
||||
|
||||
|
||||
class TestSnapshotAndCollect:
    """snapshot_log_dir + collect_new_logs find files created after the snapshot."""

    def test_snapshot_and_collect_new_files(self, tmp_path):
        log_dir = tmp_path / "logs"
        log_dir.mkdir()
        (log_dir / "old.jsonl").write_text('{"old": true}\n')
        before = snapshot_log_dir(log_dir)
        # A file created after the snapshot must be reported as new.
        (log_dir / "new.jsonl").write_text('{"new": true}\n')
        fresh = collect_new_logs(log_dir, before)
        assert [p.name for p in fresh] == ["new.jsonl"]

    def test_empty_dir_returns_empty(self, tmp_path):
        log_dir = tmp_path / "logs"
        log_dir.mkdir()
        before = snapshot_log_dir(log_dir)
        assert collect_new_logs(log_dir, before) == []
|
||||
|
||||
|
||||
class TestNormalizeClaudeLogs:
    """Claude session JSONL is normalized to {tool, args, source} records."""

    def test_normalizes_tool_use(self):
        events = [
            {"type": "tool_use", "name": "EnterWorktree", "input": {"branch": "add-login"}},
            {"type": "tool_use", "name": "Bash", "input": {"command": "git status"}},
            # Plain text events carry no tool call and must be dropped.
            {"type": "text", "text": "I'll create a worktree"},
        ]
        raw = "\n".join(json.dumps(event) for event in events)

        normalized = normalize_claude_logs(raw)

        assert len(normalized) == 2
        assert normalized[0]["tool"] == "EnterWorktree"
        assert normalized[0]["source"] == "native"
        # Bash invocations are classified as shell, not native tool use.
        assert normalized[1]["tool"] == "Bash"
        assert normalized[1]["source"] == "shell"
|
||||
|
||||
|
||||
class TestNormalizeCodexLogs:
    """Codex rollout JSONL is normalized into the common tool-call schema."""

    def test_normalizes_local_shell_call(self):
        shell_event = {
            "type": "response_item",
            "item": {
                "type": "local_shell_call",
                "action": {"command": ["git", "worktree", "add", "feature"]},
                "status": "completed",
            },
        }
        # Assistant messages carry no tool call and must be ignored.
        message_event = {
            "type": "response_item",
            "item": {"type": "message", "content": [{"text": "Creating worktree"}]},
        }
        raw = "\n".join(json.dumps(event) for event in (shell_event, message_event))

        normalized = normalize_codex_logs(raw)

        assert len(normalized) == 1
        assert normalized[0]["tool"] == "Bash"
        assert "git worktree add" in normalized[0]["args"]["command"]
        assert normalized[0]["source"] == "shell"

    def test_filter_by_cwd_keeps_matching_drops_others(self, tmp_path):
        """Only rollout files whose session_meta cwd matches the target survive."""
        target = "/private/tmp/drill-target"

        def write_session(path, meta):
            path.write_text(json.dumps(meta) + "\n")

        match = tmp_path / "match.jsonl"
        write_session(
            match,
            {"type": "session_meta", "payload": {"id": "abc", "cwd": target}},
        )
        other = tmp_path / "other.jsonl"
        write_session(
            other,
            {
                "type": "session_meta",
                "payload": {"id": "def", "cwd": "/private/tmp/drill-other"},
            },
        )
        # Edge cases: a file with no session_meta line and a fully empty file.
        no_meta = tmp_path / "no-meta.jsonl"
        no_meta.write_text(json.dumps({"type": "response_item", "payload": {}}) + "\n")
        empty = tmp_path / "empty.jsonl"
        empty.write_text("")

        kept = filter_codex_logs_by_cwd([match, other, no_meta, empty], target)
        assert kept == [match]

    def test_normalizes_function_call_with_payload(self):
        """Test the actual codex rollout format using payload instead of item."""
        exec_event = {
            "type": "response_item",
            "payload": {
                "type": "function_call",
                "name": "exec_command",
                "arguments": '{"cmd":"git worktree add .worktrees/feature","workdir":"/tmp/test"}',
                "call_id": "call_123",
            },
        }
        patch_event = {
            "type": "response_item",
            "payload": {
                "type": "function_call",
                "name": "apply_patch",
                "arguments": '{"patch":"--- a/file\\n+++ b/file"}',
                "call_id": "call_456",
            },
        }
        raw = "\n".join(json.dumps(event) for event in (exec_event, patch_event))

        normalized = normalize_codex_logs(raw)

        assert len(normalized) == 2
        # exec_command maps to a shell Bash call...
        assert normalized[0]["tool"] == "Bash"
        assert "git worktree add" in normalized[0]["args"]["command"]
        assert normalized[0]["source"] == "shell"
        # ...while apply_patch maps to a native Edit.
        assert normalized[1]["tool"] == "Edit"
        assert normalized[1]["source"] == "native"
|
||||
|
||||
|
||||
class TestNormalizeGeminiLogs:
    """Gemini session JSONL toolCalls entries map onto the common schema."""

    def test_normalizes_jsonl_tool_calls(self):
        events = [
            # Header line without toolCalls: must be skipped.
            {"kind": "main"},
            {
                "type": "gemini",
                "content": "Reading file",
                "toolCalls": [
                    {
                        "id": "read_file_1",
                        "name": "read_file",
                        "args": {"file_path": "GEMINI.md"},
                        "status": "success",
                    }
                ],
            },
            {
                "type": "gemini",
                "content": "Running command",
                "toolCalls": [
                    {
                        "id": "shell_1",
                        "name": "run_shell_command",
                        "args": {"command": "git status"},
                        "status": "success",
                    }
                ],
            },
        ]
        raw = "\n".join(json.dumps(event) for event in events)

        normalized = normalize_gemini_logs(raw)

        # read_file → native Read; run_shell_command → shell Bash.
        assert normalized == [
            {"tool": "Read", "args": {"file_path": "GEMINI.md"}, "source": "native"},
            {"tool": "Bash", "args": {"command": "git status"}, "source": "shell"},
        ]
|
||||
94
evals/tests/test_session.py
Normal file
94
evals/tests/test_session.py
Normal file
@@ -0,0 +1,94 @@
|
||||
import subprocess
|
||||
import time
|
||||
from unittest.mock import call, patch
|
||||
|
||||
from drill.session import TmuxSession
|
||||
|
||||
|
||||
class TestTmuxSession:
    """Live tmux integration tests for TmuxSession (require a local tmux)."""

    @staticmethod
    def _has_session(name: str) -> bool:
        # `tmux has-session` exits 0 when the session exists.
        probe = subprocess.run(["tmux", "has-session", "-t", name], capture_output=True)
        return probe.returncode == 0

    def test_create_and_kill(self):
        session = TmuxSession(name="drill-test-create", cols=80, rows=24)
        session.create()
        assert self._has_session("drill-test-create")
        session.kill()
        assert not self._has_session("drill-test-create")

    def test_send_keys_and_capture(self):
        session = TmuxSession(name="drill-test-keys", cols=80, rows=24)
        session.create()
        try:
            session.send_keys("echo hello-drill-test")
            time.sleep(0.5)  # give the shell a moment to echo
            assert "hello-drill-test" in session.capture()
        finally:
            session.kill()

    def test_send_keys_pastes_text_then_submits(self):
        """send_keys must paste via a named buffer (shell-metachar safe), then Enter."""
        session = TmuxSession(name="drill-test-command-shape")

        with (
            patch("drill.session.subprocess.run") as run,
            patch("drill.session.time.sleep") as sleep,
        ):
            session.send_keys("hello `weird` text")

        assert run.call_args_list == [
            call(
                [
                    "tmux",
                    "set-buffer",
                    "-b",
                    "drill-test-command-shape-input",
                    "hello `weird` text",
                ],
                check=True,
            ),
            call(
                [
                    "tmux",
                    "paste-buffer",
                    "-d",
                    "-b",
                    "drill-test-command-shape-input",
                    "-t",
                    "drill-test-command-shape",
                ],
                check=True,
            ),
            call(["tmux", "send-keys", "-t", "drill-test-command-shape", "Enter"], check=True),
        ]
        sleep.assert_called_once_with(0.1)

    def test_launch_command(self, tmp_path):
        session = TmuxSession(name="drill-test-launch", cols=80, rows=24)
        session.create()
        try:
            session.launch(["python3", "-c", "import time; time.sleep(30)"], cwd=str(tmp_path))
            time.sleep(0.5)
            assert session.is_process_alive()
        finally:
            session.kill()

    def test_send_special_key(self, tmp_path):
        """ctrl-c must interrupt a foreground process so the shell is usable again."""
        session = TmuxSession(name="drill-test-special", cols=80, rows=24)
        proof_file = tmp_path / "after-ctrl-c"
        session.create()
        try:
            session.send_keys("cat")  # blocks the shell until interrupted
            time.sleep(0.3)
            session.send_special_key("ctrl-c")
            time.sleep(0.3)
            # If the interrupt worked, this command reaches the shell.
            session.send_keys(f"touch {proof_file}")
            time.sleep(0.3)
            assert proof_file.exists()
        finally:
            session.kill()
|
||||
168
evals/tests/test_setup.py
Normal file
168
evals/tests/test_setup.py
Normal file
@@ -0,0 +1,168 @@
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from unittest.mock import call, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from drill.setup import clone_template, run_assertions
|
||||
from setup_helpers.base import create_base_repo
|
||||
from setup_helpers.worktree import (
|
||||
add_worktree,
|
||||
create_caller_consent_plan,
|
||||
detach_head,
|
||||
link_gemini_extension,
|
||||
symlink_superpowers,
|
||||
)
|
||||
from setup_helpers.spec_writing_blind_spot import create_spec_writing_blind_spot
|
||||
|
||||
|
||||
@pytest.fixture
def fixtures_dir():
    """Path to the shared evals fixtures directory (sibling of tests/)."""
    here = Path(__file__).parent
    return here.parent / "fixtures"
|
||||
|
||||
|
||||
@pytest.fixture
def work_dir(tmp_path):
    """Per-test scratch repo path (not created yet; helpers create it)."""
    return tmp_path / "test-repo"
|
||||
|
||||
|
||||
class TestCloneTemplate:
    """clone_template copies the template repo including its git history."""

    def test_clones_template_repo(self, fixtures_dir, work_dir):
        clone_template(fixtures_dir / "template-repo", work_dir)
        assert (work_dir / "package.json").exists()
        assert (work_dir / "src" / "index.js").exists()
        # The clone must carry over the template's commit history.
        log = subprocess.run(
            ["git", "log", "--oneline"],
            cwd=work_dir,
            capture_output=True,
            text=True,
        )
        assert "initial commit" in log.stdout
|
||||
|
||||
|
||||
class TestCreateBaseRepo:
    """create_base_repo produces a checkout on the main branch."""

    def test_creates_base_repo(self, fixtures_dir, work_dir):
        create_base_repo(work_dir, fixtures_dir / "template-repo")
        assert (work_dir / "package.json").exists()
        branch = subprocess.run(
            ["git", "branch", "--show-current"],
            cwd=work_dir,
            capture_output=True,
            text=True,
        )
        assert branch.stdout.strip() == "main"
|
||||
|
||||
|
||||
class TestWorktreeHelpers:
    """Scenario-setup helpers that manipulate git worktrees and tool links."""

    @staticmethod
    def _git(args, cwd):
        # Small wrapper for read-only git probes used by these tests.
        return subprocess.run(["git", *args], cwd=cwd, capture_output=True, text=True)

    def test_add_worktree(self, fixtures_dir, work_dir):
        create_base_repo(work_dir, fixtures_dir / "template-repo")
        wt_path = work_dir.parent / "feature-wt"
        add_worktree(work_dir, "feature-branch", str(wt_path))
        assert wt_path.exists()
        assert "feature-branch" in self._git(["worktree", "list"], work_dir).stdout

    def test_detach_head(self, fixtures_dir, work_dir):
        create_base_repo(work_dir, fixtures_dir / "template-repo")
        wt_path = work_dir.parent / "detached-wt"
        add_worktree(work_dir, "tmp-branch", str(wt_path))
        detach_head(str(wt_path))
        # A detached HEAD reports no current branch.
        assert self._git(["branch", "--show-current"], wt_path).stdout.strip() == ""

    def test_symlink_superpowers(self, fixtures_dir, work_dir, tmp_path):
        create_base_repo(work_dir, fixtures_dir / "template-repo")
        fake_sp = tmp_path / "superpowers" / "skills"
        fake_sp.mkdir(parents=True)
        symlink_superpowers(work_dir, str(tmp_path / "superpowers"))
        link = work_dir / ".agents" / "skills" / "superpowers"
        assert link.is_symlink()

    def test_link_gemini_extension_relinks_requested_root(self, work_dir, tmp_path):
        """Relink must uninstall the extension by its declared name, then link + consent."""
        work_dir.mkdir()
        fake_sp = tmp_path / "superpowers"
        (fake_sp / "skills" / "using-superpowers" / "references").mkdir(parents=True)
        (fake_sp / "gemini-extension.json").write_text('{"name": "custom-superpowers"}')

        with patch("setup_helpers.worktree.subprocess.run") as run:
            link_gemini_extension(work_dir, str(fake_sp))

        assert run.call_args_list == [
            call(["gemini", "extensions", "uninstall", "custom-superpowers"], capture_output=True),
            call(
                ["gemini", "extensions", "link", str(fake_sp)],
                capture_output=True,
                input="y\n",
                text=True,
                check=True,
            ),
        ]
        # GEMINI.md must point Gemini at the skill entry docs of the linked root.
        assert (work_dir / "GEMINI.md").read_text() == (
            f"@{fake_sp}/skills/using-superpowers/SKILL.md\n"
            f"@{fake_sp}/skills/using-superpowers/references/gemini-tools.md\n"
        )

    def test_create_caller_consent_plan(self, fixtures_dir, work_dir):
        create_base_repo(work_dir, fixtures_dir / "template-repo")
        create_caller_consent_plan(work_dir)

        plan = work_dir / "docs" / "superpowers" / "plans" / "custom-greeting.md"
        assert plan.exists()
        assert "REQUIRED SUB-SKILL" in plan.read_text()

        # The plan must be committed, leaving the working tree clean.
        assert self._git(["status", "--short"], work_dir).stdout.strip() == ""
|
||||
|
||||
|
||||
class TestSpecWritingBlindSpot:
    """The spec-writing blind-spot scenario builds a partial React-style repo."""

    def test_creates_repo_structure(self, tmp_path):
        workdir = tmp_path / "blind-spot-repo"
        create_spec_writing_blind_spot(workdir)

        components = workdir / "src" / "components"
        assert (components / "AdminPanel.tsx").exists()
        assert (components / "TeamOverview.tsx").exists()
        assert (workdir / "src" / "router.tsx").exists()
        assert (workdir / "CLAUDE.md").exists()
        # The blind spot: this component is deliberately missing.
        assert not (components / "ActivityFeed.tsx").exists()

        branch = subprocess.run(
            ["git", "branch", "--show-current"],
            cwd=workdir, capture_output=True, text=True,
        )
        assert branch.stdout.strip() == "main"

        # The scenario seeds a non-trivial history (at least a few commits).
        log = subprocess.run(
            ["git", "log", "--oneline"],
            cwd=workdir, capture_output=True, text=True,
        )
        assert log.stdout.count("\n") >= 3
|
||||
|
||||
|
||||
class TestRunAssertions:
    """run_assertions executes shell checks and raises on the first failure."""

    def test_passing_assertions(self, fixtures_dir, work_dir):
        create_base_repo(work_dir, fixtures_dir / "template-repo")
        checks = [
            "git rev-parse --is-inside-work-tree",
            "git branch --show-current | grep main",
        ]
        run_assertions(checks, work_dir)  # must not raise

    def test_failing_assertion_raises(self, fixtures_dir, work_dir):
        create_base_repo(work_dir, fixtures_dir / "template-repo")
        with pytest.raises(AssertionError, match="Setup assertion failed"):
            run_assertions(["git branch --show-current | grep nonexistent"], work_dir)
|
||||
54
evals/tests/test_stats.py
Normal file
54
evals/tests/test_stats.py
Normal file
@@ -0,0 +1,54 @@
|
||||
"""Tests for Wilson score confidence interval."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from drill.stats import wilson_ci
|
||||
|
||||
|
||||
class TestWilsonCI:
    """Bounds and edge-case behavior of the Wilson score interval."""

    def test_all_pass(self) -> None:
        low, high = wilson_ci(10, 10)
        # 10/10 still leaves real uncertainty on the lower bound.
        assert low > 0.69
        assert high == 1.0 or high > 0.99

    def test_all_fail(self) -> None:
        low, high = wilson_ci(0, 10)
        assert low < 0.01 or low == 0.0
        assert high < 0.31

    def test_half_pass(self) -> None:
        low, high = wilson_ci(5, 10)
        assert 0.18 < low < 0.25
        assert 0.75 < high < 0.82

    def test_zero_total(self) -> None:
        # Degenerate input: no runs at all collapses the interval to [0, 0].
        low, high = wilson_ci(0, 0)
        assert low == 0.0
        assert high == 0.0

    def test_single_pass(self) -> None:
        low, high = wilson_ci(1, 1)
        assert low > 0.0
        assert high <= 1.0

    def test_single_fail(self) -> None:
        low, high = wilson_ci(0, 1)
        assert low == 0.0 or low >= 0.0
        assert high < 1.0

    def test_large_sample(self) -> None:
        # 80/100 should give a reasonably tight interval around 0.8.
        low, high = wilson_ci(80, 100)
        assert 0.70 < low < 0.75
        assert 0.85 < high < 0.90

    def test_passed_greater_than_total_clamped(self) -> None:
        # Inconsistent input (passed > total) must be clamped, not crash.
        low, high = wilson_ci(12, 10)
        assert low > 0.0
        assert high <= 1.0

    def test_returns_tuple_of_floats(self) -> None:
        interval = wilson_ci(5, 10)
        assert isinstance(interval, tuple)
        assert len(interval) == 2
        assert isinstance(interval[0], float)
        assert isinstance(interval[1], float)
|
||||
202
evals/tests/test_sweep.py
Normal file
202
evals/tests/test_sweep.py
Normal file
@@ -0,0 +1,202 @@
|
||||
"""Tests for Sweep orchestrator."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from dataclasses import asdict
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
from drill.engine import Engine, RunResult
|
||||
from drill.sweep import RunGroup, RunStatus, Sweep, write_run_group
|
||||
|
||||
|
||||
class TestRunStatus:
    """RunStatus dataclass: defaults and dict serialization."""

    def test_pass_status(self) -> None:
        run = RunStatus(index=0, status="pass", duration=10.5)
        assert run.error is None  # error defaults to None for clean runs
        assert run.status == "pass"

    def test_error_status(self) -> None:
        run = RunStatus(index=2, status="error", duration=1.2, error="tmux crashed")
        assert run.error == "tmux crashed"

    def test_serializes_to_dict(self) -> None:
        serialized = asdict(RunStatus(index=0, status="pass", duration=10.5))
        assert serialized["index"] == 0
        assert serialized["status"] == "pass"
        assert serialized["duration"] == 10.5
        assert serialized["error"] is None
|
||||
|
||||
|
||||
class TestRunGroup:
    """RunGroup dataclass: the partial flag defaults off and can be set."""

    def test_creates_with_defaults(self) -> None:
        group = RunGroup(
            scenario="test",
            backend="claude",
            n=3,
            timestamp="2026-04-20T14-30-00",
            sweep_id="abc12345",
            runs=[],
        )
        assert group.partial is False

    def test_partial_flag(self) -> None:
        group = RunGroup(
            scenario="test",
            backend="claude",
            n=3,
            timestamp="2026-04-20T14-30-00",
            sweep_id="abc12345",
            runs=[RunStatus(index=0, status="pass", duration=10.0)],
            partial=True,
        )
        assert group.partial is True
        # n=3 requested but only one run recorded — hence partial.
        assert len(group.runs) == 1
|
||||
|
||||
|
||||
class TestWriteRunGroup:
    """write_run_group persists run-group.json with the expected shape."""

    @staticmethod
    def _load(tmp_path: Path) -> dict:
        return json.loads((tmp_path / "run-group.json").read_text())

    def test_writes_json(self, tmp_path: Path) -> None:
        group = RunGroup(
            scenario="test-scenario",
            backend="claude",
            n=2,
            timestamp="2026-04-20T14-30-00",
            sweep_id="abc12345",
            runs=[
                RunStatus(index=0, status="pass", duration=100.0),
                RunStatus(index=1, status="fail", duration=95.0),
            ],
        )
        write_run_group(group, tmp_path)
        assert (tmp_path / "run-group.json").exists()
        data = self._load(tmp_path)
        assert data["scenario"] == "test-scenario"
        assert data["sweep_id"] == "abc12345"
        assert data["partial"] is False
        assert len(data["runs"]) == 2
        assert data["runs"][0]["status"] == "pass"
        assert data["runs"][1]["status"] == "fail"

    def test_writes_partial(self, tmp_path: Path) -> None:
        group = RunGroup(
            scenario="test",
            backend="claude",
            n=5,
            timestamp="2026-04-20T14-30-00",
            sweep_id="abc12345",
            runs=[RunStatus(index=0, status="pass", duration=100.0)],
            partial=True,
        )
        write_run_group(group, tmp_path)
        data = self._load(tmp_path)
        assert data["partial"] is True
        assert len(data["runs"]) == 1

    def test_omits_null_errors(self, tmp_path: Path) -> None:
        """A None error must be dropped from the JSON, not written as null."""
        group = RunGroup(
            scenario="test",
            backend="claude",
            n=1,
            timestamp="2026-04-20T14-30-00",
            sweep_id="abc12345",
            runs=[RunStatus(index=0, status="pass", duration=50.0)],
        )
        write_run_group(group, tmp_path)
        assert "error" not in self._load(tmp_path)["runs"][0]
|
||||
|
||||
|
||||
class TestSweepIntegration:
    """End-to-end Sweep orchestration with a mocked Engine.run."""

    def test_full_sweep_writes_run_group(self, tmp_path: Path) -> None:
        """Test that Sweep creates run dirs and writes run-group.json."""
        # --- scenario definition -------------------------------------------
        scenario_file = tmp_path / "scenarios" / "test.yaml"
        scenario_file.parent.mkdir(parents=True)
        scenario_file.write_text(
            "scenario: test-scenario\n"
            "description: test\n"
            "user_posture: naive\n"
            "setup: {}\n"
            "turns:\n  - intent: do the thing\n"
            "limits:\n  max_turns: 5\n"
            "verify:\n  criteria:\n    - thing was done\n"
        )

        # --- backend definition (echo stand-in, never actually launched) ---
        backends_dir = tmp_path / "backends"
        backends_dir.mkdir()
        (backends_dir / "mock-backend.yaml").write_text(
            "name: mock-backend\n"
            "cli: echo\n"
            "args: []\n"
            "required_env: []\n"
            "hooks:\n  pre_run: []\n  post_run: []\n"
            "shutdown: /exit\n"
            "idle:\n  quiescence_seconds: 1\n  ready_pattern: '.'\n"
            "startup_timeout: 5\n"
            "terminal:\n  cols: 80\n  rows: 24\n"
            "session_logs: {}\n"
        )

        results_dir = tmp_path / "results"
        fixtures_dir = tmp_path / "fixtures"
        fixtures_dir.mkdir()

        # --- canned Engine output: a single passing criterion --------------
        fake_verdict = json.dumps(
            {
                "criteria": [
                    {
                        "criterion": "thing was done",
                        "verdict": "pass",
                        "evidence": "yes",
                        "rationale": "it was done",
                    }
                ],
                "observations": [],
                "summary": "ok",
            }
        )
        fake_result = RunResult(
            scenario="test-scenario",
            backend="mock-backend",
            timestamp="2026-04-20T14-30-00",
            session_log="log",
            filesystem_json='{"files": []}',
            tool_calls_jsonl='{"tool": "Bash"}',
            verdict_json=fake_verdict,
            meta={"actor_turns": 3},
        )

        sweep = Sweep(
            scenario_path=scenario_file,
            backend_names=["mock-backend"],
            backends_dir=backends_dir,
            fixtures_dir=fixtures_dir,
            results_dir=results_dir,
            n=3,
            sweep_id="test1234",
        )

        with patch.object(Engine, "run", return_value=fake_result):
            groups = sweep.run_all()

        # One backend → one group; n=3 → three passing runs, not partial.
        assert len(groups) == 1
        group = groups[0]
        assert group.scenario == "test-scenario"
        assert len(group.runs) == 3
        assert all(r.status == "pass" for r in group.runs)
        assert group.partial is False

        # Verify run-group.json was written under results/<scenario>/<backend>/.
        scenario_results = results_dir / "test-scenario" / "mock-backend"
        assert scenario_results.exists()
        group_dirs = list(scenario_results.iterdir())
        assert len(group_dirs) == 1
        rg_path = group_dirs[0] / "run-group.json"
        assert rg_path.exists()
        rg_data = json.loads(rg_path.read_text())
        assert rg_data["sweep_id"] == "test1234"
        assert len(rg_data["runs"]) == 3
|
||||
92
evals/tests/test_verifier.py
Normal file
92
evals/tests/test_verifier.py
Normal file
@@ -0,0 +1,92 @@
|
||||
from drill.verifier import CriterionResult, Verdict, Verifier
|
||||
|
||||
|
||||
class TestVerdict:
    """Verdict parsing, score string, and pass/fail aggregation."""

    @staticmethod
    def _verdict(criteria, observations, summary):
        return Verdict.model_validate(
            {"criteria": criteria, "observations": observations, "summary": summary}
        )

    def test_parse_valid_verdict(self):
        verdict = self._verdict(
            [
                {
                    "criterion": "Agent detected on main",
                    "verdict": "pass",
                    "evidence": "Terminal showed 'main branch detected'",
                    "rationale": "Agent correctly identified the branch",
                }
            ],
            ["Agent was very fast"],
            "Passed all checks",
        )
        assert len(verdict.criteria) == 1
        assert verdict.criteria[0].verdict == "pass"
        assert verdict.score == "1/1"

    def test_score_calculation(self):
        verdict = self._verdict(
            [
                {"criterion": "A", "verdict": "pass", "evidence": "e", "rationale": "r"},
                {"criterion": "B", "verdict": "fail", "evidence": "e", "rationale": "r"},
                {"criterion": "C", "verdict": "pass", "evidence": "e", "rationale": "r"},
            ],
            [],
            "Mixed results",
        )
        # Score counts passes over total; one failure sinks the verdict.
        assert verdict.score == "2/3"
        assert verdict.passed is False

    def test_all_pass(self):
        verdict = self._verdict(
            [
                {"criterion": "A", "verdict": "pass", "evidence": "e", "rationale": "r"},
            ],
            [],
            "Good",
        )
        assert verdict.passed is True
|
||||
|
||||
|
||||
class TestCriterionResultSource:
    """CriterionResult.source defaults to 'judge' and round-trips through JSON."""

    def test_default_source_is_judge(self):
        res = CriterionResult(
            criterion="test",
            verdict="pass",
            evidence="e",
            rationale="r",
        )
        assert res.source == "judge"

    def test_assertion_source(self):
        res = CriterionResult(
            criterion="test",
            verdict="fail",
            evidence="e",
            rationale="r",
            source="assertion",
        )
        assert res.source == "assertion"

    def test_backwards_compat_no_source_in_json(self):
        # Older verdict files lack "source"; parsing must default it to judge.
        payload = {"criterion": "A", "verdict": "pass", "evidence": "e", "rationale": "r"}
        res = CriterionResult.model_validate(payload)
        assert res.source == "judge"

    def test_source_serializes_to_json(self):
        res = CriterionResult(
            criterion="test",
            verdict="pass",
            evidence="e",
            rationale="r",
            source="assertion",
        )
        assert res.model_dump()["source"] == "assertion"
|
||||
|
||||
|
||||
class TestVerifierPrompt:
    """The judge's system prompt must mention its key output fields."""

    def test_builds_system_prompt(self):
        verifier = Verifier(model="claude-sonnet-4-6", temperature=0.0)
        prompt = verifier.build_system_prompt()
        lowered = prompt.lower()
        assert "criterion" in lowered
        assert "evidence" in lowered
        assert "JSON" in prompt  # case-sensitive: the literal token "JSON"
|
||||
Reference in New Issue
Block a user