Files
superpowers/evals/tests/test_stats.py
Jesse Vincent 3b412a3836 Lift drill into evals/ at 013fcb8b7dbefd6d3fa4653493e5d2ec8e7f985b
rsync of obra/drill@013fcb8b7d into superpowers/evals/, excluding
.git/, .venv/, results/, .env/, __pycache__/, *.egg-info/,
.private-journal/.

The drill repo is unaffected by this commit; archival is a separate
manual step after this PR merges.

Source SHA recorded at evals/.drill-source-sha for divergence
detection.
2026-05-06 15:47:39 -07:00

55 lines
1.4 KiB
Python

"""Tests for Wilson score confidence interval."""
from __future__ import annotations
from drill.stats import wilson_ci
class TestWilsonCI:
    """Checks on the ``(lo, hi)`` pair returned by ``wilson_ci``.

    Judging by the test names, the function is called as
    ``wilson_ci(passed, total)``.  Each test unpacks the result into a
    lower and an upper bound and compares them against fixed thresholds
    rather than exact values.
    """

    def test_all_pass(self) -> None:
        """10 of 10: lower bound above 0.69, upper bound above 0.99."""
        lo, hi = wilson_ci(10, 10)
        assert lo > 0.69
        # Threshold check; it is also satisfied when hi is exactly 1.0.
        assert hi > 0.99

    def test_all_fail(self) -> None:
        """0 of 10: lower bound below 0.01, upper bound below 0.31."""
        lo, hi = wilson_ci(0, 10)
        # Threshold check; it is also satisfied when lo is exactly 0.0.
        assert lo < 0.01
        assert hi < 0.31

    def test_half_pass(self) -> None:
        """5 of 10: lo falls in (0.18, 0.25) and hi in (0.75, 0.82)."""
        lo, hi = wilson_ci(5, 10)
        assert 0.18 < lo < 0.25
        assert 0.75 < hi < 0.82

    def test_zero_total(self) -> None:
        """0 of 0: both bounds are exactly 0.0."""
        lo, hi = wilson_ci(0, 0)
        assert lo == 0.0
        assert hi == 0.0

    def test_single_pass(self) -> None:
        """1 of 1: lower bound is positive and upper bound does not exceed 1."""
        lo, hi = wilson_ci(1, 1)
        assert lo > 0.0
        assert hi <= 1.0

    def test_single_fail(self) -> None:
        """0 of 1: lower bound is non-negative and upper bound is below 1."""
        lo, hi = wilson_ci(0, 1)
        # Non-negativity is the whole check; exact zero is one way to meet it.
        assert lo >= 0.0
        assert hi < 1.0

    def test_large_sample(self) -> None:
        """80 of 100: lo falls in (0.70, 0.75) and hi in (0.85, 0.90)."""
        lo, hi = wilson_ci(80, 100)
        assert 0.70 < lo < 0.75
        assert 0.85 < hi < 0.90

    def test_passed_greater_than_total_clamped(self) -> None:
        """12 of 10: an out-of-range count still yields lo > 0 and hi <= 1."""
        lo, hi = wilson_ci(12, 10)
        assert lo > 0.0
        assert hi <= 1.0

    def test_returns_tuple_of_floats(self) -> None:
        """The result is a 2-tuple whose elements are both ``float``."""
        result = wilson_ci(5, 10)
        assert isinstance(result, tuple)
        assert len(result) == 2
        assert isinstance(result[0], float)
        assert isinstance(result[1], float)