mirror of
https://github.com/obra/superpowers.git
synced 2026-05-10 11:09:05 +08:00
Lift drill into evals/ at 013fcb8b7dbefd6d3fa4653493e5d2ec8e7f985b
rsync of obra/drill@013fcb8b7d into superpowers/evals/, excluding .git/, .venv/, results/, .env/, __pycache__/, *.egg-info/, .private-journal/. The drill repo is unaffected by this commit; archival is a separate manual step after this PR merges. Source SHA recorded at evals/.drill-source-sha for divergence detection.
This commit is contained in:
173
evals/tests/test_engine.py
Normal file
173
evals/tests/test_engine.py
Normal file
@@ -0,0 +1,173 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from drill.engine import RunResult, ScenarioConfig, VerifyConfig, snapshot_filesystem
|
||||
|
||||
|
||||
class TestVerifyConfig:
|
||||
def test_defaults(self):
|
||||
vc = VerifyConfig()
|
||||
assert vc.criteria == []
|
||||
assert vc.assertions == []
|
||||
assert vc.observe is False
|
||||
|
||||
def test_from_dict(self):
|
||||
vc = VerifyConfig(
|
||||
criteria=["test criterion"],
|
||||
assertions=["tool-called Read"],
|
||||
observe=True,
|
||||
)
|
||||
assert len(vc.criteria) == 1
|
||||
assert len(vc.assertions) == 1
|
||||
assert vc.observe is True
|
||||
|
||||
|
||||
class TestScenarioConfig:
|
||||
def test_loads_from_yaml(self, tmp_path):
|
||||
scenario_file = tmp_path / "test.yaml"
|
||||
scenario_file.write_text("""
|
||||
scenario: test-scenario
|
||||
description: "A test"
|
||||
user_posture: naive
|
||||
setup:
|
||||
helpers:
|
||||
- create_base_repo
|
||||
assertions:
|
||||
- "git rev-parse --is-inside-work-tree"
|
||||
turns:
|
||||
- intent: "Do the thing"
|
||||
limits:
|
||||
max_turns: 10
|
||||
turn_timeout: 60
|
||||
verify:
|
||||
criteria:
|
||||
- "Thing was done"
|
||||
assertions:
|
||||
- "tool-called Bash"
|
||||
observe: true
|
||||
""")
|
||||
config = ScenarioConfig.from_yaml(scenario_file)
|
||||
assert config.scenario == "test-scenario"
|
||||
assert config.user_posture == "naive"
|
||||
assert config.limits["max_turns"] == 10
|
||||
assert len(config.turns) == 1
|
||||
assert len(config.verify.criteria) == 1
|
||||
assert len(config.verify.assertions) == 1
|
||||
assert config.verify.observe is True
|
||||
|
||||
def test_loads_without_assertions(self, tmp_path):
|
||||
scenario_file = tmp_path / "test.yaml"
|
||||
scenario_file.write_text("""
|
||||
scenario: minimal
|
||||
verify:
|
||||
criteria:
|
||||
- "Something happened"
|
||||
""")
|
||||
config = ScenarioConfig.from_yaml(scenario_file)
|
||||
assert config.verify.assertions == []
|
||||
assert config.verify.observe is False
|
||||
|
||||
def test_loads_without_verify(self, tmp_path):
|
||||
scenario_file = tmp_path / "test.yaml"
|
||||
scenario_file.write_text("""
|
||||
scenario: bare-minimum
|
||||
""")
|
||||
config = ScenarioConfig.from_yaml(scenario_file)
|
||||
assert config.verify.criteria == []
|
||||
assert config.verify.assertions == []
|
||||
|
||||
|
||||
class TestSnapshotFilesystem:
|
||||
def test_captures_git_state(self, tmp_path):
|
||||
subprocess.run(["git", "init", "-b", "main"], cwd=tmp_path, capture_output=True)
|
||||
subprocess.run(
|
||||
["git", "commit", "--allow-empty", "-m", "init"], cwd=tmp_path, capture_output=True
|
||||
)
|
||||
snapshot = snapshot_filesystem(tmp_path)
|
||||
data = json.loads(snapshot)
|
||||
assert "git_status" in data
|
||||
assert "branch" in data
|
||||
assert "worktree_list" in data
|
||||
assert "files" in data
|
||||
|
||||
|
||||
class TestRunResult:
|
||||
def test_serializes_to_dir(self, tmp_path):
|
||||
result = RunResult(
|
||||
scenario="test",
|
||||
backend="claude",
|
||||
timestamp="2026-04-07T14-30-00",
|
||||
session_log="session output here",
|
||||
filesystem_json='{"files": []}',
|
||||
tool_calls_jsonl='{"tool": "Bash"}\n',
|
||||
verdict_json='{"criteria": [], "observations": [], "summary": "ok"}',
|
||||
meta={"backend": "claude", "duration_seconds": 42, "actor_turns": 5},
|
||||
)
|
||||
result.save(tmp_path)
|
||||
assert (tmp_path / "session.log").read_text() == "session output here"
|
||||
assert (tmp_path / "filesystem.json").exists()
|
||||
assert (tmp_path / "tool_calls.jsonl").exists()
|
||||
assert (tmp_path / "verdict.json").exists()
|
||||
assert (tmp_path / "meta.json").exists()
|
||||
|
||||
|
||||
class TestEngineAssertionIntegration:
|
||||
def test_run_result_save_splits_artifacts_and_verdict(self, tmp_path):
|
||||
result = RunResult(
|
||||
scenario="test",
|
||||
backend="claude",
|
||||
timestamp="2026-04-20T10-00-00",
|
||||
session_log="log here",
|
||||
filesystem_json='{"files": []}',
|
||||
tool_calls_jsonl='{"tool": "Bash"}\n',
|
||||
verdict_json='{"criteria": [], "observations": [], "summary": "ok"}',
|
||||
meta={"backend": "claude"},
|
||||
)
|
||||
result.save_artifacts(tmp_path)
|
||||
assert (tmp_path / "session.log").exists()
|
||||
assert (tmp_path / "filesystem.json").exists()
|
||||
assert (tmp_path / "tool_calls.jsonl").exists()
|
||||
assert not (tmp_path / "verdict.json").exists()
|
||||
assert not (tmp_path / "meta.json").exists()
|
||||
|
||||
result.save_verdict(tmp_path)
|
||||
assert (tmp_path / "verdict.json").exists()
|
||||
assert (tmp_path / "meta.json").exists()
|
||||
|
||||
|
||||
class TestEngineRunParams:
|
||||
def test_run_result_uses_custom_output_dir(self, tmp_path: Path) -> None:
|
||||
custom_dir = tmp_path / "custom" / "run-00"
|
||||
result = RunResult(
|
||||
scenario="test",
|
||||
backend="claude",
|
||||
timestamp="2026-04-20T10-00-00",
|
||||
session_log="log",
|
||||
filesystem_json='{"files": []}',
|
||||
tool_calls_jsonl='{"tool": "Bash"}\n',
|
||||
verdict_json='{"criteria": [], "observations": [], "summary": "ok"}',
|
||||
meta={"backend": "claude"},
|
||||
)
|
||||
result.save(custom_dir)
|
||||
assert (custom_dir / "session.log").read_text() == "log"
|
||||
assert (custom_dir / "verdict.json").exists()
|
||||
assert (custom_dir / "meta.json").exists()
|
||||
|
||||
def test_run_result_nested_dir_created(self, tmp_path: Path) -> None:
|
||||
deep_dir = tmp_path / "a" / "b" / "c" / "run-05"
|
||||
result = RunResult(
|
||||
scenario="test",
|
||||
backend="claude",
|
||||
timestamp="2026-04-20T10-00-00",
|
||||
session_log="log",
|
||||
filesystem_json='{"files": []}',
|
||||
tool_calls_jsonl='{"tool": "Bash"}\n',
|
||||
verdict_json='{"criteria": [], "observations": [], "summary": "ok"}',
|
||||
meta={"backend": "claude"},
|
||||
)
|
||||
result.save(deep_dir)
|
||||
assert deep_dir.exists()
|
||||
assert (deep_dir / "session.log").exists()
|
||||
Reference in New Issue
Block a user