mirror of
https://github.com/obra/superpowers.git
synced 2026-05-10 02:59:04 +08:00
rsync of obra/drill@013fcb8b7d into superpowers/evals/, excluding .git/, .venv/, results/, .env/, __pycache__/, *.egg-info/, .private-journal/. The drill repo is unaffected by this commit; archival is a separate manual step after this PR merges. Source SHA recorded at evals/.drill-source-sha for divergence detection.
174 lines
5.8 KiB
Python
174 lines
5.8 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
import subprocess
|
|
from pathlib import Path
|
|
|
|
from drill.engine import RunResult, ScenarioConfig, VerifyConfig, snapshot_filesystem
|
|
|
|
|
|
class TestVerifyConfig:
    """Construction checks for ``VerifyConfig``."""

    def test_defaults(self):
        """A ``VerifyConfig`` built with no arguments has empty lists and observe off."""
        default_config = VerifyConfig()
        assert default_config.criteria == []
        assert default_config.assertions == []
        assert default_config.observe is False

    def test_from_dict(self):
        """Explicit keyword values show up on the constructed instance."""
        fields = {
            "criteria": ["test criterion"],
            "assertions": ["tool-called Read"],
            "observe": True,
        }
        populated = VerifyConfig(**fields)
        assert len(populated.criteria) == 1
        assert len(populated.assertions) == 1
        assert populated.observe is True
|
|
|
|
|
|
class TestScenarioConfig:
    """Checks that ``ScenarioConfig.from_yaml`` loads scenario files from disk."""

    @staticmethod
    def _load(tmp_path, yaml_text):
        """Write *yaml_text* to ``test.yaml`` under *tmp_path* and load it."""
        target = tmp_path / "test.yaml"
        target.write_text(yaml_text)
        return ScenarioConfig.from_yaml(target)

    def test_loads_from_yaml(self, tmp_path):
        """A fully populated scenario file yields the expected field values."""
        # NOTE(review): the nesting of setup/limits/verify children is inferred
        # from the assertions below -- confirm against the upstream file.
        full_yaml = """
scenario: test-scenario
description: "A test"
user_posture: naive
setup:
  helpers:
    - create_base_repo
  assertions:
    - "git rev-parse --is-inside-work-tree"
turns:
  - intent: "Do the thing"
limits:
  max_turns: 10
  turn_timeout: 60
verify:
  criteria:
    - "Thing was done"
  assertions:
    - "tool-called Bash"
  observe: true
"""
        loaded = self._load(tmp_path, full_yaml)
        assert loaded.scenario == "test-scenario"
        assert loaded.user_posture == "naive"
        assert loaded.limits["max_turns"] == 10
        assert len(loaded.turns) == 1
        assert len(loaded.verify.criteria) == 1
        assert len(loaded.verify.assertions) == 1
        assert loaded.verify.observe is True

    def test_loads_without_assertions(self, tmp_path):
        """A ``verify`` section with only criteria leaves assertions empty and observe off."""
        criteria_only_yaml = """
scenario: minimal
verify:
  criteria:
    - "Something happened"
"""
        loaded = self._load(tmp_path, criteria_only_yaml)
        assert loaded.verify.assertions == []
        assert loaded.verify.observe is False

    def test_loads_without_verify(self, tmp_path):
        """A file with no ``verify`` section still exposes empty verify lists."""
        bare_yaml = """
scenario: bare-minimum
"""
        loaded = self._load(tmp_path, bare_yaml)
        assert loaded.verify.criteria == []
        assert loaded.verify.assertions == []
|
|
|
|
|
|
class TestSnapshotFilesystem:
    """Checks the JSON document returned by ``snapshot_filesystem``."""

    def test_captures_git_state(self, tmp_path):
        """A snapshot of a one-commit repository exposes the expected top-level keys."""
        # check=True: a failed git setup should fail the test here rather than
        # letting the snapshot run against a directory in an unexpected state.
        subprocess.run(
            ["git", "init", "-b", "main"],
            cwd=tmp_path,
            capture_output=True,
            check=True,
        )
        # Supply identity and signing settings per invocation so the commit does
        # not depend on the machine's global git configuration (e.g. clean CI).
        subprocess.run(
            [
                "git",
                "-c", "user.name=drill-tests",
                "-c", "user.email=drill-tests@example.invalid",
                "-c", "commit.gpgsign=false",
                "commit", "--allow-empty", "-m", "init",
            ],
            cwd=tmp_path,
            capture_output=True,
            check=True,
        )
        snapshot = snapshot_filesystem(tmp_path)
        data = json.loads(snapshot)
        assert "git_status" in data
        assert "branch" in data
        assert "worktree_list" in data
        assert "files" in data
|
|
|
|
|
|
class TestRunResult:
    """Checks that ``RunResult.save`` writes its files into a directory."""

    def test_serializes_to_dir(self, tmp_path):
        """``save`` writes the session log verbatim and creates the other four files."""
        fields = {
            "scenario": "test",
            "backend": "claude",
            "timestamp": "2026-04-07T14-30-00",
            "session_log": "session output here",
            "filesystem_json": '{"files": []}',
            "tool_calls_jsonl": '{"tool": "Bash"}\n',
            "verdict_json": '{"criteria": [], "observations": [], "summary": "ok"}',
            "meta": {"backend": "claude", "duration_seconds": 42, "actor_turns": 5},
        }
        run = RunResult(**fields)
        run.save(tmp_path)

        assert (tmp_path / "session.log").read_text() == "session output here"
        # The remaining files are only checked for existence, in the original order.
        for filename in (
            "filesystem.json",
            "tool_calls.jsonl",
            "verdict.json",
            "meta.json",
        ):
            assert (tmp_path / filename).exists()
|
|
|
|
|
|
class TestEngineAssertionIntegration:
    """Checks the two-phase save path of ``RunResult``."""

    def test_run_result_save_splits_artifacts_and_verdict(self, tmp_path):
        """``save_artifacts`` writes only artifacts; ``save_verdict`` adds verdict and meta."""
        artifact_files = ("session.log", "filesystem.json", "tool_calls.jsonl")
        verdict_files = ("verdict.json", "meta.json")

        run = RunResult(
            scenario="test",
            backend="claude",
            timestamp="2026-04-20T10-00-00",
            session_log="log here",
            filesystem_json='{"files": []}',
            tool_calls_jsonl='{"tool": "Bash"}\n',
            verdict_json='{"criteria": [], "observations": [], "summary": "ok"}',
            meta={"backend": "claude"},
        )

        # Phase one: artifacts present, verdict-side files not yet written.
        run.save_artifacts(tmp_path)
        for filename in artifact_files:
            assert (tmp_path / filename).exists()
        for filename in verdict_files:
            assert not (tmp_path / filename).exists()

        # Phase two: the verdict-side files appear.
        run.save_verdict(tmp_path)
        for filename in verdict_files:
            assert (tmp_path / filename).exists()
|
|
|
|
|
|
class TestEngineRunParams:
    """Checks ``RunResult.save`` against caller-chosen output directories."""

    @staticmethod
    def _build_result() -> RunResult:
        """Return the ``RunResult`` fixture shared by the tests in this class."""
        return RunResult(
            scenario="test",
            backend="claude",
            timestamp="2026-04-20T10-00-00",
            session_log="log",
            filesystem_json='{"files": []}',
            tool_calls_jsonl='{"tool": "Bash"}\n',
            verdict_json='{"criteria": [], "observations": [], "summary": "ok"}',
            meta={"backend": "claude"},
        )

    def test_run_result_uses_custom_output_dir(self, tmp_path: Path) -> None:
        """Saving into a not-yet-existing two-level directory writes log, verdict and meta."""
        custom_dir = tmp_path.joinpath("custom", "run-00")
        self._build_result().save(custom_dir)

        assert (custom_dir / "session.log").read_text() == "log"
        for filename in ("verdict.json", "meta.json"):
            assert (custom_dir / filename).exists()

    def test_run_result_nested_dir_created(self, tmp_path: Path) -> None:
        """Saving into a deeply nested path is expected to create the directory chain."""
        deep_dir = tmp_path.joinpath("a", "b", "c", "run-05")
        self._build_result().save(deep_dir)

        assert deep_dir.exists()
        assert (deep_dir / "session.log").exists()
|