mirror of
https://github.com/obra/superpowers.git
synced 2026-05-12 03:59:03 +08:00
evals: add pi backend
This commit is contained in:
@@ -43,6 +43,9 @@ uv run drill run spec-writing-blind-spot -b claude-opus-4-6 --n 5
|
|||||||
# Sweep across multiple backends
|
# Sweep across multiple backends
|
||||||
uv run drill run spec-writing-blind-spot --models claude-opus-4-6,claude-opus-4-7 --n 10
|
uv run drill run spec-writing-blind-spot --models claude-opus-4-6,claude-opus-4-7 --n 10
|
||||||
|
|
||||||
|
# Run against Pi, loading the local Superpowers package via -e ${SUPERPOWERS_ROOT}
|
||||||
|
uv run drill run triggering-writing-plans -b pi
|
||||||
|
|
||||||
# Compare results
|
# Compare results
|
||||||
uv run drill compare spec-writing-blind-spot
|
uv run drill compare spec-writing-blind-spot
|
||||||
|
|
||||||
@@ -72,6 +75,7 @@ uv run drill list
|
|||||||
| `codex` | Codex CLI | — |
|
| `codex` | Codex CLI | — |
|
||||||
| `gemini` | Gemini CLI | auto-gemini-3 |
|
| `gemini` | Gemini CLI | auto-gemini-3 |
|
||||||
| `gemini-2-5-flash` | Gemini CLI | gemini-2.5-flash |
|
| `gemini-2-5-flash` | Gemini CLI | gemini-2.5-flash |
|
||||||
|
| `pi` | Pi coding agent | configured Pi default |
|
||||||
|
|
||||||
## Project structure
|
## Project structure
|
||||||
|
|
||||||
|
|||||||
23
evals/backends/pi.yaml
Normal file
23
evals/backends/pi.yaml
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
name: pi
|
||||||
|
cli: pi
|
||||||
|
args:
|
||||||
|
- "-e"
|
||||||
|
- "${SUPERPOWERS_ROOT}"
|
||||||
|
required_env:
|
||||||
|
- SUPERPOWERS_ROOT
|
||||||
|
hooks:
|
||||||
|
pre_run: []
|
||||||
|
post_run: []
|
||||||
|
shutdown: "/quit"
|
||||||
|
idle:
|
||||||
|
quiescence_seconds: 5
|
||||||
|
ready_pattern: "."
|
||||||
|
busy_pattern: "esc to cancel|Thinking\\.\\.\\.|\\(esc to cancel[^)]*\\)|[⠇⠏⠋⠙⠹⠸⠼⠴⠦⠧⠶⠾⠽⠻⠿]"
|
||||||
|
max_busy_seconds: 1800
|
||||||
|
startup_timeout: 60
|
||||||
|
turn_timeout: 300
|
||||||
|
terminal:
|
||||||
|
cols: 200
|
||||||
|
rows: 50
|
||||||
|
session_logs:
|
||||||
|
pattern: "~/.pi/agent/sessions/**/*.jsonl"
|
||||||
@@ -71,7 +71,7 @@ class Backend:
|
|||||||
@property
def family(self) -> str:
    """Return the backend family used for log-dir / normalizer dispatch.

    A backend belongs to a family when its name equals the family name or
    begins with the family name followed by a hyphen; any other name maps
    to "other".
    """
    known_families = ("claude", "codex", "gemini", "pi")
    return next(
        (
            candidate
            for candidate in known_families
            if self.name == candidate or self.name.startswith(f"{candidate}-")
        ),
        "other",
    )
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ from drill.normalizer import (
|
|||||||
NORMALIZERS,
|
NORMALIZERS,
|
||||||
collect_new_logs,
|
collect_new_logs,
|
||||||
filter_codex_logs_by_cwd,
|
filter_codex_logs_by_cwd,
|
||||||
|
filter_pi_logs_by_cwd,
|
||||||
snapshot_log_dir,
|
snapshot_log_dir,
|
||||||
)
|
)
|
||||||
from drill.session import TmuxSession
|
from drill.session import TmuxSession
|
||||||
@@ -348,6 +349,11 @@ class Engine:
|
|||||||
# Project name is the workdir basename, lowercased
|
# Project name is the workdir basename, lowercased
|
||||||
project = workdir.resolve().name.lower()
|
project = workdir.resolve().name.lower()
|
||||||
return Path.home() / ".gemini" / "tmp" / project
|
return Path.home() / ".gemini" / "tmp" / project
|
||||||
|
elif self.backend.family == "pi":
|
||||||
|
# Pi stores sessions under ~/.pi/agent/sessions/<encoded-cwd>/.
|
||||||
|
# Return the root and filter by the session header cwd because
|
||||||
|
# multiple evals may run concurrently under the same tree.
|
||||||
|
return Path.home() / ".pi" / "agent" / "sessions"
|
||||||
pattern = self.backend.session_logs.get("pattern", "")
|
pattern = self.backend.session_logs.get("pattern", "")
|
||||||
if not pattern:
|
if not pattern:
|
||||||
return None
|
return None
|
||||||
@@ -363,6 +369,8 @@ class Engine:
|
|||||||
new_files = collect_new_logs(log_dir, snapshot)
|
new_files = collect_new_logs(log_dir, snapshot)
|
||||||
if self.backend.family == "codex":
|
if self.backend.family == "codex":
|
||||||
new_files = filter_codex_logs_by_cwd(new_files, str(workdir.resolve()))
|
new_files = filter_codex_logs_by_cwd(new_files, str(workdir.resolve()))
|
||||||
|
elif self.backend.family == "pi":
|
||||||
|
new_files = filter_pi_logs_by_cwd(new_files, str(workdir.resolve()))
|
||||||
normalizer = NORMALIZERS.get(self.backend.family)
|
normalizer = NORMALIZERS.get(self.backend.family)
|
||||||
if not normalizer:
|
if not normalizer:
|
||||||
return []
|
return []
|
||||||
|
|||||||
@@ -74,6 +74,23 @@ def filter_codex_logs_by_cwd(paths: list[Path], target_cwd: str) -> list[Path]:
|
|||||||
return matched
|
return matched
|
||||||
|
|
||||||
|
|
||||||
|
def filter_pi_logs_by_cwd(paths: list[Path], target_cwd: str) -> list[Path]:
    """Drop Pi sessions whose header cwd doesn't match target_cwd.

    Only the first line of each file is inspected. A file is kept when that
    line is a JSON object whose "type" is "session" and whose "cwd" equals
    ``target_cwd`` exactly. Files that cannot be read, cannot be decoded as
    UTF-8, are empty, or whose first line is not a JSON object are skipped
    rather than raising.

    Args:
        paths: Candidate Pi session log files.
        target_cwd: Working directory a session header must record to be kept.

    Returns:
        The matching paths, in the order they were given.
    """
    matched: list[Path] = []
    for path in paths:
        try:
            with path.open(encoding="utf-8") as f:
                header = json.loads(f.readline())
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError (bad or empty header)
            # and UnicodeDecodeError (file is not UTF-8 text).
            continue
        # Valid JSON that is not an object (list, string, number) has no
        # header fields to compare.
        if not isinstance(header, dict):
            continue
        if header.get("type") != "session":
            continue
        if header.get("cwd") == target_cwd:
            matched.append(path)
    return matched
|
||||||
|
|
||||||
|
|
||||||
def normalize_claude_logs(raw_content: str) -> list[dict[str, Any]]:
|
def normalize_claude_logs(raw_content: str) -> list[dict[str, Any]]:
|
||||||
"""Normalize Claude Code session logs.
|
"""Normalize Claude Code session logs.
|
||||||
|
|
||||||
@@ -155,6 +172,52 @@ def normalize_codex_logs(raw_content: str) -> list[dict[str, Any]]:
|
|||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
# Reverse mapping: Pi tool names → Claude Code canonical names
PI_TOOL_MAP: dict[str, str] = {
    "read": "Read",
    "write": "Write",
    "edit": "Edit",
    "bash": "Bash",
    "grep": "Grep",
    "find": "Glob",
    "ls": "Glob",
}


# Canonical names reported with source "native". Bash and any name not listed
# here are reported with source "shell".
PI_NATIVE_TOOLS = (set(PI_TOOL_MAP.values()) - {"Bash"}) | {"subagent", "todo", "manage_todo_list"}


def normalize_pi_logs(raw_content: str) -> list[dict[str, Any]]:
    """Normalize Pi JSONL session logs.

    Pi session files are JSONL entries. Assistant messages contain tool calls as
    content blocks: {"type": "toolCall", "name": "read", "arguments": {...}}.

    Lines that are blank, are not valid JSON, or do not have the expected
    shape (entry and message are objects, content is a list of objects) are
    skipped so one unexpected record does not discard the whole log.

    Args:
        raw_content: Full text of one Pi session log.

    Returns:
        One dict per assistant tool call, in log order, with keys "tool"
        (canonical name via PI_TOOL_MAP, or the raw name when unmapped),
        "args" (the block's "arguments", default {}), and "source"
        ("native" when the canonical name is in PI_NATIVE_TOOLS, else "shell").
    """
    results: list[dict[str, Any]] = []
    # Split on "\n" only: str.splitlines() would also break on separators
    # such as U+2028, which may appear unescaped inside a JSON string.
    for line in raw_content.strip().split("\n"):
        if not line.strip():
            continue
        try:
            entry = json.loads(line)
        except json.JSONDecodeError:
            continue
        if not isinstance(entry, dict) or entry.get("type") != "message":
            continue
        message = entry.get("message")
        if not isinstance(message, dict) or message.get("role") != "assistant":
            continue
        content = message.get("content")
        if not isinstance(content, list):
            # Plain-string or null content carries no tool-call blocks.
            continue
        for block in content:
            if not isinstance(block, dict) or block.get("type") != "toolCall":
                continue
            name = block.get("name", "")
            canonical = PI_TOOL_MAP.get(name, name)
            source = "native" if canonical in PI_NATIVE_TOOLS else "shell"
            results.append(
                {"tool": canonical, "args": block.get("arguments", {}), "source": source}
            )
    return results
|
||||||
|
|
||||||
|
|
||||||
# Reverse mapping: Gemini tool names → Claude Code canonical names
|
# Reverse mapping: Gemini tool names → Claude Code canonical names
|
||||||
GEMINI_TOOL_MAP: dict[str, str] = {
|
GEMINI_TOOL_MAP: dict[str, str] = {
|
||||||
"run_shell_command": "Bash",
|
"run_shell_command": "Bash",
|
||||||
@@ -225,4 +288,5 @@ NORMALIZERS: dict[str, Callable[[str], list[dict[str, Any]]]] = {
|
|||||||
"claude": normalize_claude_logs,
|
"claude": normalize_claude_logs,
|
||||||
"codex": normalize_codex_logs,
|
"codex": normalize_codex_logs,
|
||||||
"gemini": normalize_gemini_logs,
|
"gemini": normalize_gemini_logs,
|
||||||
|
"pi": normalize_pi_logs,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -44,6 +44,12 @@ class TestLoadBackend:
|
|||||||
assert flash_backend.family == "gemini"
|
assert flash_backend.family == "gemini"
|
||||||
assert flash_backend.model == "gemini-2.5-flash"
|
assert flash_backend.model == "gemini-2.5-flash"
|
||||||
|
|
||||||
|
def test_loads_pi_backend(self, backends_dir):
    """The pi backend definition loads with the expected name, CLI and family."""
    loaded = load_backend("pi", backends_dir)
    observed = (loaded.name, loaded.cli, loaded.family)
    assert observed == ("pi", "pi", "pi")
|
||||||
|
|
||||||
|
|
||||||
class TestBackendBuildCommand:
|
class TestBackendBuildCommand:
|
||||||
def test_claude_build_command(self, backends_dir, monkeypatch):
|
def test_claude_build_command(self, backends_dir, monkeypatch):
|
||||||
@@ -60,6 +66,12 @@ class TestBackendBuildCommand:
|
|||||||
cmd = backend.build_command("/tmp/workdir")
|
cmd = backend.build_command("/tmp/workdir")
|
||||||
assert cmd[0] == "codex"
|
assert cmd[0] == "codex"
|
||||||
|
|
||||||
|
def test_pi_build_command_loads_local_superpowers_package(self, backends_dir, monkeypatch):
    """SUPERPOWERS_ROOT from the environment is passed to pi through -e."""
    # The variable is set before the backend is loaded, as in the other
    # build-command tests.
    monkeypatch.setenv("SUPERPOWERS_ROOT", "/tmp/superpowers")
    command = load_backend("pi", backends_dir).build_command("/tmp/workdir")
    expected = ["pi", "-e", "/tmp/superpowers"]
    assert command == expected
|
||||||
|
|
||||||
|
|
||||||
class TestBackendEnvValidation:
|
class TestBackendEnvValidation:
|
||||||
def test_missing_env_raises(self, backends_dir, monkeypatch):
|
def test_missing_env_raises(self, backends_dir, monkeypatch):
|
||||||
@@ -125,6 +137,21 @@ class TestBackendFamily:
|
|||||||
backend = load_backend("codex", backends_dir)
|
backend = load_backend("codex", backends_dir)
|
||||||
assert backend.family == "codex"
|
assert backend.family == "codex"
|
||||||
|
|
||||||
|
def test_pi_backend_family(self):
    """A backend named exactly "pi" reports the "pi" family."""
    fields = {
        "name": "pi",
        "cli": "pi",
        "args": [],
        "required_env": [],
        "hooks": {"pre_run": [], "post_run": []},
        "shutdown": "/quit",
        "idle": {},
        "startup_timeout": 30,
        "terminal": {},
        "session_logs": {},
    }
    pi_backend = Backend(**fields)
    assert pi_backend.family == "pi"
|
||||||
|
|
||||||
def test_variant_name_preserves_family(self):
|
def test_variant_name_preserves_family(self):
|
||||||
backend = Backend(
|
backend = Backend(
|
||||||
name="claude-opus-4-6",
|
name="claude-opus-4-6",
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import json
|
|||||||
import subprocess
|
import subprocess
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from drill.engine import RunResult, ScenarioConfig, VerifyConfig, snapshot_filesystem
|
from drill.engine import Engine, RunResult, ScenarioConfig, VerifyConfig, snapshot_filesystem
|
||||||
|
|
||||||
|
|
||||||
class TestVerifyConfig:
|
class TestVerifyConfig:
|
||||||
@@ -138,6 +138,40 @@ class TestEngineAssertionIntegration:
|
|||||||
assert (tmp_path / "meta.json").exists()
|
assert (tmp_path / "meta.json").exists()
|
||||||
|
|
||||||
|
|
||||||
|
class TestEnginePiBackend:
|
||||||
|
def test_resolves_pi_session_log_root(self, tmp_path: Path) -> None:
|
||||||
|
scenario = tmp_path / "scenario.yaml"
|
||||||
|
scenario.write_text("scenario: test-pi\n")
|
||||||
|
backends = tmp_path / "backends"
|
||||||
|
backends.mkdir()
|
||||||
|
(backends / "pi.yaml").write_text(
|
||||||
|
"""
|
||||||
|
name: pi
|
||||||
|
cli: pi
|
||||||
|
args: []
|
||||||
|
required_env: []
|
||||||
|
hooks:
|
||||||
|
pre_run: []
|
||||||
|
post_run: []
|
||||||
|
shutdown: /quit
|
||||||
|
idle: {}
|
||||||
|
startup_timeout: 1
|
||||||
|
terminal: {}
|
||||||
|
session_logs:
|
||||||
|
pattern: ~/.pi/agent/sessions/**/*.jsonl
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
engine = Engine(
|
||||||
|
scenario_path=scenario,
|
||||||
|
backend_name="pi",
|
||||||
|
backends_dir=backends,
|
||||||
|
fixtures_dir=tmp_path,
|
||||||
|
results_dir=tmp_path,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert engine._resolve_log_dir(tmp_path) == Path.home() / ".pi" / "agent" / "sessions"
|
||||||
|
|
||||||
|
|
||||||
class TestEngineRunParams:
|
class TestEngineRunParams:
|
||||||
def test_run_result_uses_custom_output_dir(self, tmp_path: Path) -> None:
|
def test_run_result_uses_custom_output_dir(self, tmp_path: Path) -> None:
|
||||||
custom_dir = tmp_path / "custom" / "run-00"
|
custom_dir = tmp_path / "custom" / "run-00"
|
||||||
|
|||||||
@@ -3,9 +3,11 @@ import json
|
|||||||
from drill.normalizer import (
|
from drill.normalizer import (
|
||||||
collect_new_logs,
|
collect_new_logs,
|
||||||
filter_codex_logs_by_cwd,
|
filter_codex_logs_by_cwd,
|
||||||
|
filter_pi_logs_by_cwd,
|
||||||
normalize_claude_logs,
|
normalize_claude_logs,
|
||||||
normalize_codex_logs,
|
normalize_codex_logs,
|
||||||
normalize_gemini_logs,
|
normalize_gemini_logs,
|
||||||
|
normalize_pi_logs,
|
||||||
snapshot_log_dir,
|
snapshot_log_dir,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -137,6 +139,56 @@ class TestNormalizeCodexLogs:
|
|||||||
assert normalized[1]["source"] == "native"
|
assert normalized[1]["source"] == "native"
|
||||||
|
|
||||||
|
|
||||||
|
class TestNormalizePiLogs:
    """Pi session-log filtering by cwd and tool-call normalization."""

    def test_filter_by_cwd_keeps_matching_session_headers(self, tmp_path):
        """Only the file whose session header records the target cwd is kept."""
        target = "/tmp/drill-target"
        file_contents = {
            "match.jsonl": json.dumps({"type": "session", "cwd": target}) + "\n",
            "other.jsonl": json.dumps({"type": "session", "cwd": "/tmp/other"}) + "\n",
            "malformed.jsonl": "not json\n",
        }
        candidates = []
        for filename, text in file_contents.items():
            log_path = tmp_path / filename
            log_path.write_text(text)
            candidates.append(log_path)

        assert filter_pi_logs_by_cwd(candidates, target) == [tmp_path / "match.jsonl"]

    def test_normalizes_assistant_tool_calls_from_session_entries(self):
        """Assistant toolCall blocks become canonical tool records; text is ignored."""
        tool_calls = [
            {"type": "toolCall", "name": "read", "arguments": {"path": "README.md"}},
            {"type": "toolCall", "name": "bash", "arguments": {"command": "git status"}},
            {"type": "toolCall", "name": "subagent", "arguments": {"agent": "reviewer"}},
        ]
        header = {"type": "session", "cwd": "/tmp/project"}
        assistant_entry = {
            "type": "message",
            "message": {
                "role": "assistant",
                "content": [{"type": "text", "text": "I will inspect this."}, *tool_calls],
            },
        }
        raw_log = "\n".join(json.dumps(record) for record in (header, assistant_entry))

        expected = [
            {"tool": "Read", "args": {"path": "README.md"}, "source": "native"},
            {"tool": "Bash", "args": {"command": "git status"}, "source": "shell"},
            {"tool": "subagent", "args": {"agent": "reviewer"}, "source": "native"},
        ]
        assert normalize_pi_logs(raw_log) == expected
|
||||||
|
|
||||||
|
|
||||||
class TestNormalizeGeminiLogs:
|
class TestNormalizeGeminiLogs:
|
||||||
def test_normalizes_jsonl_tool_calls(self):
|
def test_normalizes_jsonl_tool_calls(self):
|
||||||
lines = [
|
lines = [
|
||||||
|
|||||||
Reference in New Issue
Block a user