mirror of
https://github.com/obra/superpowers.git
synced 2026-05-10 19:19:03 +08:00
Lift drill into evals/ at 013fcb8b7dbefd6d3fa4653493e5d2ec8e7f985b
rsync of obra/drill@013fcb8b7d into superpowers/evals/, excluding .git/, .venv/, results/, .env/, __pycache__/, *.egg-info/, .private-journal/. The drill repo is unaffected by this commit; archival is a separate manual step after this PR merges. Source SHA recorded at evals/.drill-source-sha for divergence detection.
This commit is contained in:
committed by
Drew Ritter
parent
2e46e9590d
commit
3b412a3836
137
evals/drill/cli.py
Normal file
137
evals/drill/cli.py
Normal file
@@ -0,0 +1,137 @@
|
||||
"""Drill CLI: run, compare, list."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import secrets
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Directory two levels above this file. The default values of the
# --backends-dir, --scenarios-dir, --fixtures-dir and --results-dir options
# are built relative to it.
PROJECT_ROOT: Path = Path(__file__).parent.parent
|
||||
|
||||
# Load the ".env" file located directly under PROJECT_ROOT. This is a
# module-level call, so it runs when the module is imported, before any
# command body executes.
load_dotenv(PROJECT_ROOT / ".env")
|
||||
|
||||
|
||||
# Root command group; the `run`, `list` and `compare` subcommands attach to it
# through `@main.command(...)`. The docstring doubles as the --help text, and
# it is the only statement the body needs.
@click.group()
def main() -> None:
    """Drill: Superpowers skill compliance benchmark."""
|
||||
|
||||
|
||||
@main.command()
@click.argument("scenario")
@click.option("--backend", "-b", default=None, help="Backend name (e.g., claude, codex)")
@click.option("--models", "-m", default=None, help="Comma-separated backend names for sweep")
@click.option("--n", "n_runs", type=int, default=1, help="Number of repetitions per backend")
@click.option(
    "--backends-dir",
    type=click.Path(exists=True, path_type=Path),
    default=PROJECT_ROOT / "backends",
)
@click.option(
    "--scenarios-dir",
    type=click.Path(exists=True, path_type=Path),
    default=PROJECT_ROOT / "scenarios",
)
@click.option(
    "--fixtures-dir",
    type=click.Path(exists=True, path_type=Path),
    default=PROJECT_ROOT / "fixtures",
)
@click.option("--results-dir", type=click.Path(path_type=Path), default=PROJECT_ROOT / "results")
def run(
    scenario: str,
    backend: str | None,
    models: str | None,
    n_runs: int,
    backends_dir: Path,
    scenarios_dir: Path,
    fixtures_dir: Path,
    results_dir: Path,
) -> None:
    """Run a scenario against one or more backends."""
    # NOTE: the docstring above is shown verbatim by `--help`, so the detailed
    # documentation lives in comments instead.
    #
    # scenario      -- scenario name; resolved to <scenarios_dir>/<scenario>.yaml
    # backend       -- a single backend name (used only when --models is absent)
    # models        -- comma-separated backend names; wins over --backend
    # n_runs        -- repetitions per backend, must be >= 1
    # backends_dir, fixtures_dir, results_dir -- passed through to Sweep as-is
    #
    # Raises click.ClickException when n_runs < 1, when no backend can be
    # determined, or when the scenario file does not exist.
    from collections import Counter

    if n_runs < 1:
        raise click.ClickException("--n must be at least 1")

    if models:
        backend_names = [b.strip() for b in models.split(",") if b.strip()]
        # A value such as "," or " , " is truthy but names no backend; without
        # this check the sweep would silently run zero times.
        if not backend_names:
            raise click.ClickException("--models must name at least one backend")
    elif backend:
        backend_names = [backend]
    else:
        raise click.ClickException("Either --backend or --models is required")

    scenario_path = scenarios_dir / f"{scenario}.yaml"
    if not scenario_path.exists():
        raise click.ClickException(f"Scenario not found: {scenario_path}")

    # Short random identifier (8 hex characters) shared by every run of this
    # invocation; it is echoed below so it can be given to `compare --sweep`.
    sweep_id = secrets.token_hex(4)

    # Imported here rather than at module top, matching the other commands.
    from drill.sweep import Sweep

    sweep = Sweep(
        scenario_path=scenario_path,
        backend_names=backend_names,
        backends_dir=backends_dir,
        fixtures_dir=fixtures_dir,
        results_dir=results_dir,
        n=n_runs,
        sweep_id=sweep_id,
    )

    total = len(backend_names) * n_runs
    click.echo(
        f"Running {scenario} | backends: {', '.join(backend_names)} | "
        f"n={n_runs} | total runs: {total} | sweep: {sweep_id}"
    )

    groups = sweep.run_all()

    for group in groups:
        # One pass over the runs; Counter yields 0 for statuses that never occur.
        status_counts = Counter(r.status for r in group.runs)
        passed = status_counts["pass"]
        failed = status_counts["fail"]
        errored = status_counts["error"]
        click.echo(f"\n{group.backend}: {passed} passed, {failed} failed, {errored} errors")
        if group.partial:
            click.echo(" (interrupted — partial results)")
|
||||
|
||||
|
||||
@main.command("list")
@click.option(
    "--scenarios-dir",
    type=click.Path(exists=True, path_type=Path),
    default=PROJECT_ROOT / "scenarios",
)
def list_scenarios(scenarios_dir: Path) -> None:
    """List available scenarios."""
    # NOTE: the docstring above is shown verbatim by `--help`, so the detailed
    # documentation lives in comments instead.
    #
    # Prints one line per *.yaml file directly inside `scenarios_dir`, in
    # sorted path order: the file's "scenario" value (falling back to the file
    # stem) padded to 40 columns, followed by its "description" value.
    import yaml

    for f in sorted(scenarios_dir.glob("*.yaml")):
        # Explicit encoding so the result does not depend on the locale default.
        with open(f, encoding="utf-8") as fh:
            data = yaml.safe_load(fh)
        # safe_load returns None for an empty file and a list/scalar for a
        # non-mapping document; treat both as "no metadata" instead of
        # crashing on `.get`.
        if not isinstance(data, dict):
            data = {}
        # str() keeps the `:40s` format spec valid when the YAML value is not
        # a string (e.g. `scenario: 123`).
        name = str(data.get("scenario", f.stem))
        # `or ""` turns an explicit null description into an empty column
        # rather than the literal text "None".
        desc = data.get("description") or ""
        click.echo(f" {name:40s} {desc}")
|
||||
|
||||
|
||||
@main.command()
@click.argument("scenario")
@click.option("--sweep", "sweep_id", default=None, help="Filter by sweep ID")
@click.option(
    "--results-dir",
    type=click.Path(exists=True, path_type=Path),
    default=PROJECT_ROOT / "results",
)
def compare(scenario: str, sweep_id: str | None, results_dir: Path) -> None:
    """Compare results across backends for a scenario."""
    # Looks under <results_dir>/<scenario>, optionally filtered by sweep ID,
    # and echoes the formatted comparison. A missing directory and an empty
    # result set are reported with the same ClickException message.
    from drill.compare import format_compare_output, load_scenario_results

    target_dir = results_dir / scenario

    # Only attempt to load when the directory exists; otherwise fall through
    # to the shared "not found" error below.
    loaded = None
    if target_dir.exists():
        loaded = load_scenario_results(target_dir, sweep_id=sweep_id)

    if not loaded:
        raise click.ClickException(f"No results found for: {scenario}")

    report = format_compare_output(scenario, loaded)
    click.echo(report)
|
||||
Reference in New Issue
Block a user