From 7d06d7e4f0d847e1e18748d2a68f71e43f085170 Mon Sep 17 00:00:00 2001 From: Jesse Vincent Date: Thu, 7 May 2026 11:11:18 -0700 Subject: [PATCH] evals: add pi backend --- evals/README.md | 4 +++ evals/backends/pi.yaml | 23 ++++++++++++ evals/drill/backend.py | 2 +- evals/drill/engine.py | 8 +++++ evals/drill/normalizer.py | 64 ++++++++++++++++++++++++++++++++++ evals/tests/test_backend.py | 27 ++++++++++++++ evals/tests/test_engine.py | 36 ++++++++++++++++++- evals/tests/test_normalizer.py | 52 +++++++++++++++++++++++++++ 8 files changed, 214 insertions(+), 2 deletions(-) create mode 100644 evals/backends/pi.yaml diff --git a/evals/README.md b/evals/README.md index d7afb5cb..0b9e329c 100644 --- a/evals/README.md +++ b/evals/README.md @@ -43,6 +43,9 @@ uv run drill run spec-writing-blind-spot -b claude-opus-4-6 --n 5 # Sweep across multiple backends uv run drill run spec-writing-blind-spot --models claude-opus-4-6,claude-opus-4-7 --n 10 +# Run against Pi, loading the local Superpowers package via -e ${SUPERPOWERS_ROOT} +uv run drill run triggering-writing-plans -b pi + # Compare results uv run drill compare spec-writing-blind-spot @@ -72,6 +75,7 @@ uv run drill list | `codex` | Codex CLI | — | | `gemini` | Gemini CLI | auto-gemini-3 | | `gemini-2-5-flash` | Gemini CLI | gemini-2.5-flash | +| `pi` | Pi coding agent | configured Pi default | ## Project structure diff --git a/evals/backends/pi.yaml b/evals/backends/pi.yaml new file mode 100644 index 00000000..97115529 --- /dev/null +++ b/evals/backends/pi.yaml @@ -0,0 +1,23 @@ +name: pi +cli: pi +args: + - "-e" + - "${SUPERPOWERS_ROOT}" +required_env: + - SUPERPOWERS_ROOT +hooks: + pre_run: [] + post_run: [] +shutdown: "/quit" +idle: + quiescence_seconds: 5 + ready_pattern: "." 
+busy_pattern: "esc to cancel|Thinking\\.\\.\\.|\\(esc to cancel[^)]*\\)|[⠇⠏⠋⠙⠹⠸⠼⠴⠦⠧⠶⠾⠽⠻⠿]" +max_busy_seconds: 1800 +startup_timeout: 60 +turn_timeout: 300 +terminal: + cols: 200 + rows: 50 +session_logs: + pattern: "~/.pi/agent/sessions/**/*.jsonl" diff --git a/evals/drill/backend.py b/evals/drill/backend.py index 503cbdb1..06e6e9cf 100644 --- a/evals/drill/backend.py +++ b/evals/drill/backend.py @@ -71,7 +71,7 @@ class Backend: @property def family(self) -> str: """Normalize backend name to a family for log-dir / normalizer dispatch.""" - for fam in ("claude", "codex", "gemini"): + for fam in ("claude", "codex", "gemini", "pi"): if self.name == fam or self.name.startswith(f"{fam}-"): return fam return "other" diff --git a/evals/drill/engine.py b/evals/drill/engine.py index 026004d0..6d9e3620 100644 --- a/evals/drill/engine.py +++ b/evals/drill/engine.py @@ -21,6 +21,7 @@ from drill.normalizer import ( NORMALIZERS, collect_new_logs, filter_codex_logs_by_cwd, + filter_pi_logs_by_cwd, snapshot_log_dir, ) from drill.session import TmuxSession @@ -348,6 +349,11 @@ class Engine: # Project name is the workdir basename, lowercased project = workdir.resolve().name.lower() return Path.home() / ".gemini" / "tmp" / project + elif self.backend.family == "pi": + # Pi stores sessions in subdirectories under ~/.pi/agent/sessions/. + # Return the root and filter by the session header cwd because + # multiple evals may run concurrently under the same tree. 
+ return Path.home() / ".pi" / "agent" / "sessions" pattern = self.backend.session_logs.get("pattern", "") if not pattern: return None @@ -363,6 +369,8 @@ class Engine: new_files = collect_new_logs(log_dir, snapshot) if self.backend.family == "codex": new_files = filter_codex_logs_by_cwd(new_files, str(workdir.resolve())) + elif self.backend.family == "pi": + new_files = filter_pi_logs_by_cwd(new_files, str(workdir.resolve())) normalizer = NORMALIZERS.get(self.backend.family) if not normalizer: return [] diff --git a/evals/drill/normalizer.py b/evals/drill/normalizer.py index b88cbbc4..a1347dbd 100644 --- a/evals/drill/normalizer.py +++ b/evals/drill/normalizer.py @@ -74,6 +74,23 @@ def filter_codex_logs_by_cwd(paths: list[Path], target_cwd: str) -> list[Path]: return matched +def filter_pi_logs_by_cwd(paths: list[Path], target_cwd: str) -> list[Path]: + """Drop Pi sessions whose header cwd doesn't match target_cwd.""" + matched: list[Path] = [] + for path in paths: + try: + with path.open() as f: + first_line = f.readline() + entry = json.loads(first_line) + except (OSError, json.JSONDecodeError): + continue + if entry.get("type") != "session": + continue + if entry.get("cwd") == target_cwd: + matched.append(path) + return matched + + def normalize_claude_logs(raw_content: str) -> list[dict[str, Any]]: """Normalize Claude Code session logs. @@ -155,6 +172,52 @@ def normalize_codex_logs(raw_content: str) -> list[dict[str, Any]]: return results +# Reverse mapping: Pi tool names → Claude Code canonical names +PI_TOOL_MAP: dict[str, str] = { + "read": "Read", + "write": "Write", + "edit": "Edit", + "bash": "Bash", + "grep": "Grep", + "find": "Glob", + "ls": "Glob", +} + + +PI_NATIVE_TOOLS = (set(PI_TOOL_MAP.values()) - {"Bash"}) | {"subagent", "todo", "manage_todo_list"} + + +def normalize_pi_logs(raw_content: str) -> list[dict[str, Any]]: + """Normalize Pi JSONL session logs. + + Pi session files are JSONL entries. 
Assistant messages contain tool calls as + content blocks: {"type": "toolCall", "name": "read", "arguments": {...}}. + """ + results: list[dict[str, Any]] = [] + for line in raw_content.strip().split("\n"): + if not line.strip(): + continue + try: + entry = json.loads(line) + except json.JSONDecodeError: + continue + if entry.get("type") != "message": + continue + message = entry.get("message", {}) + if message.get("role") != "assistant": + continue + for block in message.get("content", []): + if block.get("type") != "toolCall": + continue + name = block.get("name", "") + canonical = PI_TOOL_MAP.get(name, name) + source = "native" if canonical in PI_NATIVE_TOOLS else "shell" + results.append( + {"tool": canonical, "args": block.get("arguments", {}), "source": source} + ) + return results + + # Reverse mapping: Gemini tool names → Claude Code canonical names GEMINI_TOOL_MAP: dict[str, str] = { "run_shell_command": "Bash", @@ -225,4 +288,5 @@ NORMALIZERS: dict[str, Callable[[str], list[dict[str, Any]]]] = { "claude": normalize_claude_logs, "codex": normalize_codex_logs, "gemini": normalize_gemini_logs, + "pi": normalize_pi_logs, } diff --git a/evals/tests/test_backend.py b/evals/tests/test_backend.py index f84742a1..fc0f8cf8 100644 --- a/evals/tests/test_backend.py +++ b/evals/tests/test_backend.py @@ -44,6 +44,12 @@ class TestLoadBackend: assert flash_backend.family == "gemini" assert flash_backend.model == "gemini-2.5-flash" + def test_loads_pi_backend(self, backends_dir): + backend = load_backend("pi", backends_dir) + assert backend.name == "pi" + assert backend.cli == "pi" + assert backend.family == "pi" + class TestBackendBuildCommand: def test_claude_build_command(self, backends_dir, monkeypatch): @@ -60,6 +66,12 @@ class TestBackendBuildCommand: cmd = backend.build_command("/tmp/workdir") assert cmd[0] == "codex" + def test_pi_build_command_loads_local_superpowers_package(self, backends_dir, monkeypatch): + monkeypatch.setenv("SUPERPOWERS_ROOT", 
"/tmp/superpowers") + backend = load_backend("pi", backends_dir) + cmd = backend.build_command("/tmp/workdir") + assert cmd == ["pi", "-e", "/tmp/superpowers"] + class TestBackendEnvValidation: def test_missing_env_raises(self, backends_dir, monkeypatch): @@ -125,6 +137,21 @@ class TestBackendFamily: backend = load_backend("codex", backends_dir) assert backend.family == "codex" + def test_pi_backend_family(self): + backend = Backend( + name="pi", + cli="pi", + args=[], + required_env=[], + hooks={"pre_run": [], "post_run": []}, + shutdown="/quit", + idle={}, + startup_timeout=30, + terminal={}, + session_logs={}, + ) + assert backend.family == "pi" + def test_variant_name_preserves_family(self): backend = Backend( name="claude-opus-4-6", diff --git a/evals/tests/test_engine.py b/evals/tests/test_engine.py index 7ee33028..0c116458 100644 --- a/evals/tests/test_engine.py +++ b/evals/tests/test_engine.py @@ -4,7 +4,7 @@ import json import subprocess from pathlib import Path -from drill.engine import RunResult, ScenarioConfig, VerifyConfig, snapshot_filesystem +from drill.engine import Engine, RunResult, ScenarioConfig, VerifyConfig, snapshot_filesystem class TestVerifyConfig: @@ -138,6 +138,40 @@ class TestEngineAssertionIntegration: assert (tmp_path / "meta.json").exists() +class TestEnginePiBackend: + def test_resolves_pi_session_log_root(self, tmp_path: Path) -> None: + scenario = tmp_path / "scenario.yaml" + scenario.write_text("scenario: test-pi\n") + backends = tmp_path / "backends" + backends.mkdir() + (backends / "pi.yaml").write_text( + """ +name: pi +cli: pi +args: [] +required_env: [] +hooks: + pre_run: [] + post_run: [] +shutdown: /quit +idle: {} +startup_timeout: 1 +terminal: {} +session_logs: + pattern: ~/.pi/agent/sessions/**/*.jsonl +""" + ) + engine = Engine( + scenario_path=scenario, + backend_name="pi", + backends_dir=backends, + fixtures_dir=tmp_path, + results_dir=tmp_path, + ) + + assert engine._resolve_log_dir(tmp_path) == Path.home() / ".pi" / 
"agent" / "sessions" + + class TestEngineRunParams: def test_run_result_uses_custom_output_dir(self, tmp_path: Path) -> None: custom_dir = tmp_path / "custom" / "run-00" diff --git a/evals/tests/test_normalizer.py b/evals/tests/test_normalizer.py index 41947c5a..10ea85f0 100644 --- a/evals/tests/test_normalizer.py +++ b/evals/tests/test_normalizer.py @@ -3,9 +3,11 @@ import json from drill.normalizer import ( collect_new_logs, filter_codex_logs_by_cwd, + filter_pi_logs_by_cwd, normalize_claude_logs, normalize_codex_logs, normalize_gemini_logs, + normalize_pi_logs, snapshot_log_dir, ) @@ -137,6 +139,56 @@ class TestNormalizeCodexLogs: assert normalized[1]["source"] == "native" +class TestNormalizePiLogs: + def test_filter_by_cwd_keeps_matching_session_headers(self, tmp_path): + target = "/tmp/drill-target" + match = tmp_path / "match.jsonl" + match.write_text(json.dumps({"type": "session", "cwd": target}) + "\n") + other = tmp_path / "other.jsonl" + other.write_text(json.dumps({"type": "session", "cwd": "/tmp/other"}) + "\n") + malformed = tmp_path / "malformed.jsonl" + malformed.write_text("not json\n") + + assert filter_pi_logs_by_cwd([match, other, malformed], target) == [match] + + def test_normalizes_assistant_tool_calls_from_session_entries(self): + lines = [ + json.dumps({"type": "session", "cwd": "/tmp/project"}), + json.dumps( + { + "type": "message", + "message": { + "role": "assistant", + "content": [ + {"type": "text", "text": "I will inspect this."}, + { + "type": "toolCall", + "name": "read", + "arguments": {"path": "README.md"}, + }, + { + "type": "toolCall", + "name": "bash", + "arguments": {"command": "git status"}, + }, + { + "type": "toolCall", + "name": "subagent", + "arguments": {"agent": "reviewer"}, + }, + ], + }, + } + ), + ] + + assert normalize_pi_logs("\n".join(lines)) == [ + {"tool": "Read", "args": {"path": "README.md"}, "source": "native"}, + {"tool": "Bash", "args": {"command": "git status"}, "source": "shell"}, + {"tool": "subagent", 
"args": {"agent": "reviewer"}, "source": "native"}, + ] + + class TestNormalizeGeminiLogs: def test_normalizes_jsonl_tool_calls(self): lines = [