mirror of
https://github.com/obra/superpowers.git
synced 2026-05-09 18:49:04 +08:00
rsync of obra/drill@013fcb8b7d into superpowers/evals/, excluding .git/, .venv/, results/, .env/, __pycache__/, *.egg-info/, .private-journal/. The drill repo is unaffected by this commit; archival is a separate manual step after this PR merges. Source SHA recorded at evals/.drill-source-sha for divergence detection.
93 lines
3.0 KiB
Python
93 lines
3.0 KiB
Python
from drill.verifier import CriterionResult, Verdict, Verifier
|
|
|
|
|
|
class TestVerdict:
    """Tests for the Verdict model: payload parsing, score string, pass flag."""

    def test_parse_valid_verdict(self):
        """A well-formed single-criterion payload validates and scores 1/1."""
        payload = {
            "criteria": [
                {
                    "criterion": "Agent detected on main",
                    "verdict": "pass",
                    "evidence": "Terminal showed 'main branch detected'",
                    "rationale": "Agent correctly identified the branch",
                }
            ],
            "observations": ["Agent was very fast"],
            "summary": "Passed all checks",
        }
        parsed = Verdict.model_validate(payload)
        assert len(parsed.criteria) == 1
        assert parsed.criteria[0].verdict == "pass"
        assert parsed.score == "1/1"

    def test_score_calculation(self):
        """Score counts passing criteria over total; any failure flips passed."""
        outcomes = [("A", "pass"), ("B", "fail"), ("C", "pass")]
        payload = {
            "criteria": [
                {"criterion": name, "verdict": result, "evidence": "e", "rationale": "r"}
                for name, result in outcomes
            ],
            "observations": [],
            "summary": "Mixed results",
        }
        parsed = Verdict.model_validate(payload)
        assert parsed.score == "2/3"
        assert parsed.passed is False

    def test_all_pass(self):
        """When every criterion passes, the verdict as a whole is passed."""
        payload = {
            "criteria": [
                {"criterion": "A", "verdict": "pass", "evidence": "e", "rationale": "r"},
            ],
            "observations": [],
            "summary": "Good",
        }
        parsed = Verdict.model_validate(payload)
        assert parsed.passed is True
|
|
|
|
|
|
class TestCriterionResultSource:
    """Tests for the CriterionResult.source field and its default/serialization."""

    def test_default_source_is_judge(self):
        """Omitting source at construction time yields the 'judge' default."""
        result = CriterionResult(
            criterion="test",
            verdict="pass",
            evidence="e",
            rationale="r",
        )
        assert result.source == "judge"

    def test_assertion_source(self):
        """An explicit source='assertion' is stored verbatim."""
        result = CriterionResult(
            criterion="test",
            verdict="fail",
            evidence="e",
            rationale="r",
            source="assertion",
        )
        assert result.source == "assertion"

    def test_backwards_compat_no_source_in_json(self):
        """Older JSON payloads without a source key still validate as 'judge'."""
        legacy_payload = {"criterion": "A", "verdict": "pass", "evidence": "e", "rationale": "r"}
        result = CriterionResult.model_validate(legacy_payload)
        assert result.source == "judge"

    def test_source_serializes_to_json(self):
        """model_dump round-trips the source field into the output dict."""
        result = CriterionResult(
            criterion="test",
            verdict="pass",
            evidence="e",
            rationale="r",
            source="assertion",
        )
        dumped = result.model_dump()
        assert dumped["source"] == "assertion"
|
|
|
|
|
|
class TestVerifierPrompt:
    """Tests for the Verifier's system-prompt construction."""

    def test_builds_system_prompt(self):
        """The prompt mentions criteria, evidence, and the JSON output format."""
        judge = Verifier(model="claude-sonnet-4-6", temperature=0.0)
        prompt = judge.build_system_prompt()
        lowered = prompt.lower()
        assert "criterion" in lowered
        assert "evidence" in lowered
        # "JSON" is checked case-sensitively: the prompt is expected to spell
        # out the output format in uppercase.
        assert "JSON" in prompt
|