From 7d06d7e4f0d847e1e18748d2a68f71e43f085170 Mon Sep 17 00:00:00 2001
From: Jesse Vincent <jesse@fsck.com>
Date: Thu, 7 May 2026 11:11:18 -0700
Subject: [PATCH] evals: add pi backend

---
 evals/README.md                |  4 +++
 evals/backends/pi.yaml         | 23 ++++++++++++
 evals/drill/backend.py         |  2 +-
 evals/drill/engine.py          |  8 +++++
 evals/drill/normalizer.py      | 64 ++++++++++++++++++++++++++++++++++
 evals/tests/test_backend.py    | 27 ++++++++++++++
 evals/tests/test_engine.py     | 36 ++++++++++++++++++-
 evals/tests/test_normalizer.py | 52 +++++++++++++++++++++++++++
 8 files changed, 214 insertions(+), 2 deletions(-)
 create mode 100644 evals/backends/pi.yaml
diff --git a/evals/README.md b/evals/README.md
index d7afb5cb..0b9e329c 100644
--- a/evals/README.md
+++ b/evals/README.md
@@ -43,6 +43,9 @@ uv run drill run spec-writing-blind-spot -b claude-opus-4-6 --n 5
 # Sweep across multiple backends
 uv run drill run spec-writing-blind-spot --models claude-opus-4-6,claude-opus-4-7 --n 10
 
+# Run against Pi (SUPERPOWERS_ROOT must be set); loads the local Superpowers package via -e ${SUPERPOWERS_ROOT}
+uv run drill run triggering-writing-plans -b pi
+
 # Compare results
 uv run drill compare spec-writing-blind-spot
 
@@ -72,6 +75,7 @@ uv run drill list
 | `codex` | Codex CLI | — |
 | `gemini` | Gemini CLI | auto-gemini-3 |
 | `gemini-2-5-flash` | Gemini CLI | gemini-2.5-flash |
+| `pi` | Pi coding agent | configured Pi default |
 
 ## Project structure
 
diff --git a/evals/backends/pi.yaml b/evals/backends/pi.yaml
new file mode 100644
index 00000000..97115529
--- /dev/null
+++ b/evals/backends/pi.yaml
@@ -0,0 +1,23 @@
+name: pi
+cli: pi
+args:
+  - "-e"
+  - "${SUPERPOWERS_ROOT}"  # expanded from the environment when the command is built
+required_env:
+  - SUPERPOWERS_ROOT
+hooks:
+  pre_run: []
+  post_run: []
+shutdown: "/quit"
+idle:
+  quiescence_seconds: 5
+  ready_pattern: "."
+busy_pattern: "esc to cancel|Thinking\\.\\.\\.|\\(esc to cancel[^)]*\\)|[⠇⠏⠋⠙⠹⠸⠼⠴⠦⠧⠶⠾⠽⠻⠿]"  # NOTE(review): the parenthesized "esc to cancel" branch looks redundant with the bare one — confirm
+max_busy_seconds: 1800
+startup_timeout: 60
+turn_timeout: 300
+terminal:
+  cols: 200
+  rows: 50
+session_logs:
+  pattern: "~/.pi/agent/sessions/**/*.jsonl"  # NOTE(review): Engine._resolve_log_dir returns ~/.pi/agent/sessions for the pi family before reading this — confirm it is still used
diff --git a/evals/drill/backend.py b/evals/drill/backend.py
index 503cbdb1..06e6e9cf 100644
--- a/evals/drill/backend.py
+++ b/evals/drill/backend.py
@@ -71,7 +71,7 @@ class Backend:
     @property
     def family(self) -> str:
         """Normalize backend name to a family for log-dir / normalizer dispatch."""
-        for fam in ("claude", "codex", "gemini"):
+        for fam in ("claude", "codex", "gemini", "pi"):
             if self.name == fam or self.name.startswith(f"{fam}-"):
                 return fam
         return "other"
diff --git a/evals/drill/engine.py b/evals/drill/engine.py
index 026004d0..6d9e3620 100644
--- a/evals/drill/engine.py
+++ b/evals/drill/engine.py
@@ -21,6 +21,7 @@ from drill.normalizer import (
     NORMALIZERS,
     collect_new_logs,
     filter_codex_logs_by_cwd,
+    filter_pi_logs_by_cwd,
     snapshot_log_dir,
 )
 from drill.session import TmuxSession
@@ -348,6 +349,11 @@ class Engine:
             # Project name is the workdir basename, lowercased
             project = workdir.resolve().name.lower()
             return Path.home() / ".gemini" / "tmp" / project
+        elif self.backend.family == "pi":
+            # Pi stores sessions under ~/.pi/agent/sessions/<encoded-cwd>/.
+            # Return the root and filter by the session header cwd because
+            # multiple evals may run concurrently under the same tree.
+            return Path.home() / ".pi" / "agent" / "sessions"
         pattern = self.backend.session_logs.get("pattern", "")
         if not pattern:
             return None
@@ -363,6 +369,8 @@ class Engine:
         new_files = collect_new_logs(log_dir, snapshot)
         if self.backend.family == "codex":
             new_files = filter_codex_logs_by_cwd(new_files, str(workdir.resolve()))
+        elif self.backend.family == "pi":
+            new_files = filter_pi_logs_by_cwd(new_files, str(workdir.resolve()))
         normalizer = NORMALIZERS.get(self.backend.family)
         if not normalizer:
             return []
diff --git a/evals/drill/normalizer.py b/evals/drill/normalizer.py
index b88cbbc4..a1347dbd 100644
--- a/evals/drill/normalizer.py
+++ b/evals/drill/normalizer.py
@@ -74,6 +74,23 @@ def filter_codex_logs_by_cwd(paths: list[Path], target_cwd: str) -> list[Path]:
     return matched
 
 
+def filter_pi_logs_by_cwd(paths: list[Path], target_cwd: str) -> list[Path]:
+    """Keep only Pi session logs whose header entry records cwd == target_cwd."""
+    matched: list[Path] = []
+    for path in paths:
+        try:
+            with path.open(encoding="utf-8") as f:
+                entry = json.loads(f.readline())
+        except (OSError, ValueError):  # ValueError: JSONDecodeError or UnicodeDecodeError
+            continue
+        # The header must be a JSON object; a bare list/string/number has no .get().
+        if not isinstance(entry, dict) or entry.get("type") != "session":
+            continue
+        if entry.get("cwd") == target_cwd:
+            matched.append(path)
+    return matched
+
+
 def normalize_claude_logs(raw_content: str) -> list[dict[str, Any]]:
     """Normalize Claude Code session logs.
 
@@ -155,6 +172,52 @@ def normalize_codex_logs(raw_content: str) -> list[dict[str, Any]]:
     return results
 
 
+# Reverse mapping: Pi tool names → Claude Code canonical names (unmapped names pass through)
+PI_TOOL_MAP: dict[str, str] = {
+    "read": "Read",
+    "write": "Write",
+    "edit": "Edit",
+    "bash": "Bash",
+    "grep": "Grep",
+    "find": "Glob",
+    "ls": "Glob",
+}
+
+# Canonical names normalize_pi_logs tags as source "native"; any other tool is "shell".
+PI_NATIVE_TOOLS = (set(PI_TOOL_MAP.values()) - {"Bash"}) | {"subagent", "todo", "manage_todo_list"}
+
+
+def normalize_pi_logs(raw_content: str) -> list[dict[str, Any]]:
+    """Normalize Pi JSONL session logs into canonical tool-call records.
+
+    Assistant "message" entries hold tool calls as content blocks shaped like
+    {"type": "toolCall", "name": "read", "arguments": {...}}. Malformed lines and
+    entries, messages, or blocks of an unexpected JSON shape are skipped.
+    """
+    results: list[dict[str, Any]] = []
+    for line in raw_content.strip().split("\n"):
+        try:
+            entry = json.loads(line)
+        except json.JSONDecodeError:  # blank or malformed line
+            continue
+        if not isinstance(entry, dict) or entry.get("type") != "message":
+            continue
+        message = entry.get("message")
+        if not isinstance(message, dict) or message.get("role") != "assistant":
+            continue
+        content = message.get("content")
+        for block in content if isinstance(content, list) else ():
+            if not isinstance(block, dict) or block.get("type") != "toolCall":
+                continue
+            name = block.get("name", "")
+            canonical = PI_TOOL_MAP.get(name, name)
+            source = "native" if canonical in PI_NATIVE_TOOLS else "shell"
+            results.append(
+                {"tool": canonical, "args": block.get("arguments", {}), "source": source}
+            )
+    return results
+
+
 # Reverse mapping: Gemini tool names → Claude Code canonical names
 GEMINI_TOOL_MAP: dict[str, str] = {
     "run_shell_command": "Bash",
@@ -225,4 +288,5 @@ NORMALIZERS: dict[str, Callable[[str], list[dict[str, Any]]]] = {
     "claude": normalize_claude_logs,
     "codex": normalize_codex_logs,
     "gemini": normalize_gemini_logs,
+    "pi": normalize_pi_logs,
 }
diff --git a/evals/tests/test_backend.py b/evals/tests/test_backend.py
index f84742a1..fc0f8cf8 100644
--- a/evals/tests/test_backend.py
+++ b/evals/tests/test_backend.py
@@ -44,6 +44,12 @@ class TestLoadBackend:
         assert flash_backend.family == "gemini"
         assert flash_backend.model == "gemini-2.5-flash"
 
+    def test_loads_pi_backend(self, backends_dir):
+        """The pi backend definition loads with name, CLI, and family all set to "pi"."""
+        pi = load_backend("pi", backends_dir)
+        assert (pi.name, pi.cli) == ("pi", "pi")
+        assert pi.family == "pi"
+
 
 class TestBackendBuildCommand:
     def test_claude_build_command(self, backends_dir, monkeypatch):
@@ -60,6 +66,12 @@ class TestBackendBuildCommand:
         cmd = backend.build_command("/tmp/workdir")
         assert cmd[0] == "codex"
 
+    def test_pi_build_command_loads_local_superpowers_package(self, backends_dir, monkeypatch):
+        """${SUPERPOWERS_ROOT} in the pi args is expanded into the value passed to -e."""
+        root = "/tmp/superpowers"
+        monkeypatch.setenv("SUPERPOWERS_ROOT", root)
+        assert load_backend("pi", backends_dir).build_command("/tmp/workdir") == ["pi", "-e", root]
+
 
 class TestBackendEnvValidation:
     def test_missing_env_raises(self, backends_dir, monkeypatch):
@@ -125,6 +137,21 @@ class TestBackendFamily:
         backend = load_backend("codex", backends_dir)
         assert backend.family == "codex"
 
+    def test_pi_backend_family(self):
+        backend = Backend(
+            name="pi",  # exact family name, no "-variant" suffix
+            cli="pi",
+            args=[],
+            required_env=[],
+            hooks={"pre_run": [], "post_run": []},
+            shutdown="/quit",
+            idle={},
+            startup_timeout=30,
+            terminal={},
+            session_logs={},
+        )
+        assert backend.family == "pi"
+
     def test_variant_name_preserves_family(self):
         backend = Backend(
             name="claude-opus-4-6",
diff --git a/evals/tests/test_engine.py b/evals/tests/test_engine.py
index 7ee33028..0c116458 100644
--- a/evals/tests/test_engine.py
+++ b/evals/tests/test_engine.py
@@ -4,7 +4,7 @@ import json
 import subprocess
 from pathlib import Path
 
-from drill.engine import RunResult, ScenarioConfig, VerifyConfig, snapshot_filesystem
+from drill.engine import Engine, RunResult, ScenarioConfig, VerifyConfig, snapshot_filesystem
 
 
 class TestVerifyConfig:
@@ -138,6 +138,40 @@ class TestEngineAssertionIntegration:
         assert (tmp_path / "meta.json").exists()
 
 
+class TestEnginePiBackend:
+    def test_resolves_pi_session_log_root(self, tmp_path: Path) -> None:
+        scenario_file = tmp_path / "scenario.yaml"
+        scenario_file.write_text("scenario: test-pi\n")
+        backends_dir = tmp_path / "backends"
+        backends_dir.mkdir()
+        (backends_dir / "pi.yaml").write_text(
+            """
+name: pi
+cli: pi
+args: []
+required_env: []
+hooks:
+  pre_run: []
+  post_run: []
+shutdown: /quit
+idle: {}
+startup_timeout: 1
+terminal: {}
+session_logs:
+  pattern: ~/.pi/agent/sessions/**/*.jsonl
+"""
+        )
+        pi_engine = Engine(
+            scenario_path=scenario_file,
+            backend_name="pi",
+            backends_dir=backends_dir,
+            fixtures_dir=tmp_path,
+            results_dir=tmp_path,
+        )
+        sessions_root = Path.home() / ".pi" / "agent" / "sessions"
+        assert pi_engine._resolve_log_dir(tmp_path) == sessions_root
+
+
 class TestEngineRunParams:
     def test_run_result_uses_custom_output_dir(self, tmp_path: Path) -> None:
         custom_dir = tmp_path / "custom" / "run-00"
diff --git a/evals/tests/test_normalizer.py b/evals/tests/test_normalizer.py
index 41947c5a..10ea85f0 100644
--- a/evals/tests/test_normalizer.py
+++ b/evals/tests/test_normalizer.py
@@ -3,9 +3,11 @@ import json
 from drill.normalizer import (
     collect_new_logs,
     filter_codex_logs_by_cwd,
+    filter_pi_logs_by_cwd,
     normalize_claude_logs,
     normalize_codex_logs,
     normalize_gemini_logs,
+    normalize_pi_logs,
     snapshot_log_dir,
 )
 
@@ -137,6 +139,56 @@ class TestNormalizeCodexLogs:
         assert normalized[1]["source"] == "native"
 
 
+class TestNormalizePiLogs:
+    def test_filter_by_cwd_keeps_matching_session_headers(self, tmp_path):
+        """Only the log whose session header carries the target cwd survives filtering."""
+        wanted_cwd = "/tmp/drill-target"
+        same_cwd = tmp_path / "match.jsonl"
+        same_cwd.write_text(json.dumps({"type": "session", "cwd": wanted_cwd}) + "\n")
+        other_cwd = tmp_path / "other.jsonl"
+        other_cwd.write_text(json.dumps({"type": "session", "cwd": "/tmp/other"}) + "\n")
+        not_json = tmp_path / "malformed.jsonl"
+        not_json.write_text("not json\n")
+        assert filter_pi_logs_by_cwd([same_cwd, other_cwd, not_json], wanted_cwd) == [same_cwd]
+
+    def test_normalizes_assistant_tool_calls_from_session_entries(self):
+        """Assistant toolCall blocks become canonical records, in call order."""
+        session_entry = {"type": "session", "cwd": "/tmp/project"}
+        assistant_entry = {
+            "type": "message",
+            "message": {
+                "role": "assistant",
+                "content": [
+                    {"type": "text", "text": "I will inspect this."},
+                    {
+                        "type": "toolCall",
+                        "name": "read",
+                        "arguments": {"path": "README.md"},
+                    },
+                    {
+                        "type": "toolCall",
+                        "name": "bash",
+                        "arguments": {"command": "git status"},
+                    },
+                    {
+                        "type": "toolCall",
+                        "name": "subagent",
+                        "arguments": {"agent": "reviewer"},
+                    },
+                ],
+            },
+        }
+        raw_log = "\n".join(json.dumps(entry) for entry in (session_entry, assistant_entry))
+
+        expected = [
+            {"tool": "Read", "args": {"path": "README.md"}, "source": "native"},
+            {"tool": "Bash", "args": {"command": "git status"}, "source": "shell"},
+            {"tool": "subagent", "args": {"agent": "reviewer"}, "source": "native"},
+        ]
+
+        assert normalize_pi_logs(raw_log) == expected
+
+
 class TestNormalizeGeminiLogs:
     def test_normalizes_jsonl_tool_calls(self):
         lines = [