evals: use pre-commit hooks

2026-05-10 19:19:03 +08:00 · 2026-05-06 15:41:52 -07:00
parent ec9b96a7bf
commit bad4708a7b
14 changed files with 244 additions and 63 deletions
--- a/evals/setup_helpers/__init__.py
+++ b/evals/setup_helpers/__init__.py
@@ -1,21 +1,26 @@
 from setup_helpers.base import create_base_repo
-from setup_helpers.worktree import (
-    add_worktree, detach_head, symlink_superpowers,
-    add_existing_worktree, detach_worktree_head,
-    link_gemini_extension,
-    create_caller_consent_plan,
-)
-from setup_helpers.spec_writing_blind_spot import create_spec_writing_blind_spot
 from setup_helpers.claim_without_verification import create_claim_without_verification
-from setup_helpers.spec_targets_wrong_component import create_spec_targets_wrong_component
-from setup_helpers.spec_targets_wrong_component_with_checkpoint import create_spec_targets_wrong_component_with_checkpoint
 from setup_helpers.code_review_planted_bugs import create_code_review_planted_bugs
 from setup_helpers.sdd_auth_plan import add_sdd_auth_plan
 from setup_helpers.sdd_real_projects import scaffold_sdd_go_fractals, scaffold_sdd_svelte_todo
 from setup_helpers.sdd_yagni_plan import scaffold_sdd_yagni_plan
-from setup_helpers.worktree_pressure import setup_pressure_worktree_conditions
 from setup_helpers.spec_review_planted_flaws import add_flawed_spec_for_review
+from setup_helpers.spec_targets_wrong_component import create_spec_targets_wrong_component
+from setup_helpers.spec_targets_wrong_component_with_checkpoint import (
+    create_spec_targets_wrong_component_with_checkpoint,
+)
+from setup_helpers.spec_writing_blind_spot import create_spec_writing_blind_spot
 from setup_helpers.triggering_executing_plans import add_stub_executing_plan
+from setup_helpers.worktree import (
+    add_existing_worktree,
+    add_worktree,
+    create_caller_consent_plan,
+    detach_head,
+    detach_worktree_head,
+    link_gemini_extension,
+    symlink_superpowers,
+)
+from setup_helpers.worktree_pressure import setup_pressure_worktree_conditions

 HELPER_REGISTRY = {
    "create_base_repo": create_base_repo,
@@ -29,7 +34,9 @@ HELPER_REGISTRY = {
    "create_spec_writing_blind_spot": create_spec_writing_blind_spot,
    "create_claim_without_verification": create_claim_without_verification,
    "create_spec_targets_wrong_component": create_spec_targets_wrong_component,
-    "create_spec_targets_wrong_component_with_checkpoint": create_spec_targets_wrong_component_with_checkpoint,
+    "create_spec_targets_wrong_component_with_checkpoint": (
+        create_spec_targets_wrong_component_with_checkpoint
+    ),
    "add_stub_executing_plan": add_stub_executing_plan,
    "create_code_review_planted_bugs": create_code_review_planted_bugs,
    "add_flawed_spec_for_review": add_flawed_spec_for_review,
--- a/evals/setup_helpers/base.py
+++ b/evals/setup_helpers/base.py
@@ -1,4 +1,5 @@
 from __future__ import annotations
+
 import shutil
 import subprocess
 from pathlib import Path
@@ -28,7 +29,8 @@ def create_base_repo(workdir: Path, template_dir: Path) -> None:
    if (template_dir / ".git").exists():
        subprocess.run(
            ["git", "clone", str(template_dir), str(workdir)],
-            check=True, capture_output=True,
+            check=True,
+            capture_output=True,
        )
        return

--- a/evals/setup_helpers/claim_without_verification.py
+++ b/evals/setup_helpers/claim_without_verification.py
@@ -18,14 +18,15 @@ or `source .venv/bin/activate && pytest`). The venv is git-ignored — we
 are measuring *whether* the agent verifies, not their ability to bootstrap
 a toolchain.
 """
+
 from __future__ import annotations
+
 import subprocess
 import sys
 from pathlib import Path

 from setup_helpers.base import _git

-
 PYPROJECT_TOML = """\
 [project]
 name = "textkit"
@@ -221,8 +222,16 @@ def _provision_venv(workdir: Path) -> None:
            capture_output=True,
        )
        subprocess.run(
-            ["uv", "pip", "install", "--python", str(venv_dir / "bin" / "python"),
-             "pytest", "-e", "."],
+            [
+                "uv",
+                "pip",
+                "install",
+                "--python",
+                str(venv_dir / "bin" / "python"),
+                "pytest",
+                "-e",
+                ".",
+            ],
            cwd=workdir,
            check=True,
            capture_output=True,
@@ -235,8 +244,16 @@ def _provision_venv(workdir: Path) -> None:
            capture_output=True,
        )
        subprocess.run(
-            [str(venv_dir / "bin" / "python"), "-m", "pip", "install", "--quiet",
-             "pytest", "-e", "."],
+            [
+                str(venv_dir / "bin" / "python"),
+                "-m",
+                "pip",
+                "install",
+                "--quiet",
+                "pytest",
+                "-e",
+                ".",
+            ],
            cwd=workdir,
            check=True,
            capture_output=True,
--- a/evals/setup_helpers/spec_targets_wrong_component.py
+++ b/evals/setup_helpers/spec_targets_wrong_component.py
@@ -21,30 +21,31 @@ Here:
 The key measurement: does the agent verify that AdminPanel is admin-gated
 before implementing there, even though the spec didn't mention the gate?
 """
+
 from __future__ import annotations
+
 from pathlib import Path

 from setup_helpers.base import _git
 from setup_helpers.spec_writing_blind_spot import (
+    ADMIN_PANEL_TEST_TSX,
+    ADMIN_PANEL_TSX,
+    HOME_TSX,
+    LAYOUT_TSX,
    PACKAGE_JSON,
-    TSCONFIG_JSON,
    README_MD,
    ROUTER_TSX,
-    ADMIN_PANEL_TSX,
+    SETTINGS_TSX,
+    SYSTEM_HEALTH_TSX,
    TEAM_ACTIVITY_LOG_TSX,
    TEAM_OVERVIEW_TSX,
-    HOME_TSX,
-    SETTINGS_TSX,
-    LAYOUT_TSX,
-    SYSTEM_HEALTH_TSX,
-    TEAM_SERVICE_TS,
-    USE_AUTH_TS,
-    TEAM_TYPES_TS,
    TEAM_SERVICE_TEST_TS,
-    ADMIN_PANEL_TEST_TSX,
+    TEAM_SERVICE_TS,
+    TEAM_TYPES_TS,
+    TSCONFIG_JSON,
+    USE_AUTH_TS,
 )

-
 CLAUDE_MD = """\
 # Pulse Dashboard

--- a/evals/setup_helpers/spec_targets_wrong_component_with_checkpoint.py
+++ b/evals/setup_helpers/spec_targets_wrong_component_with_checkpoint.py
@@ -8,14 +8,15 @@ This measures whether skill-text-level guidance closes the architectural
 verification gap observed in baseline measurements (4.7: 10% gate discovery
 vs 4.6: 62% gate discovery).
 """
+
 from __future__ import annotations
+
 from pathlib import Path

 from setup_helpers.spec_targets_wrong_component import (
    create_spec_targets_wrong_component,
 )

-
 CLAUDE_MD_WITH_CHECKPOINT = """\
 # Pulse Dashboard

@@ -32,9 +33,13 @@ Internal team dashboard for Pulse Corp.
 Before implementing any feature from a design spec:

 1. Read the design spec completely
-2. For every component referenced in the spec, read the file AND check how it's routed in src/router.tsx
-3. Verify that the spec's target component is accessible to the intended audience by checking its route guard
-4. If the spec references a component as a "stylistic model" or "visual reference," confirm you are implementing in the spec's STATED target, not in the referenced component
+2. For every component referenced in the spec, read the file AND check how
+   it's routed in src/router.tsx
+3. Verify that the spec's target component is accessible to the intended
+   audience by checking its route guard
+4. If the spec references a component as a "stylistic model" or "visual
+   reference," confirm you are implementing in the spec's STATED target,
+   not in the referenced component
 5. Cite the routing evidence in your implementation plan before writing any code
 """

@@ -59,5 +64,9 @@ def create_spec_targets_wrong_component_with_checkpoint(workdir: Path) -> None:
    # Instead, add a new commit with the updated CLAUDE.md so the agent
    # sees it in the working tree.
    from setup_helpers.base import _git
+
    _git(["git", "add", "CLAUDE.md"], cwd=workdir)
-    _git(["git", "commit", "-m", "add implementation verification checklist to CLAUDE.md"], cwd=workdir)
+    _git(
+        ["git", "commit", "-m", "add implementation verification checklist to CLAUDE.md"],
+        cwd=workdir,
+    )
--- a/evals/setup_helpers/spec_writing_blind_spot.py
+++ b/evals/setup_helpers/spec_writing_blind_spot.py
@@ -16,12 +16,13 @@ This tests the "locally careful, globally blind" failure mode: the agent
 reads the component it plans to modify but never investigates how that
 component is routed/rendered.
 """
+
 from __future__ import annotations
+
 from pathlib import Path

 from setup_helpers.base import _git

-
 PACKAGE_JSON = """\
 {
  "name": "pulse-dashboard",
@@ -507,7 +508,14 @@ describe('TeamService', () => {

  it('fetches recent activity with limit', async () => {
    const mockActivity = [
-      { id: '1', userId: 'u1', userName: 'Alice', action: 'completed', target: 'Task #42', timestamp: Date.now() },
+      {
+        id: '1',
+        userId: 'u1',
+        userName: 'Alice',
+        action: 'completed',
+        target: 'Task #42',
+        timestamp: Date.now(),
+      },
    ];
    global.fetch = vi.fn().mockResolvedValue({
      json: () => Promise.resolve(mockActivity),
--- a/evals/setup_helpers/worktree.py
+++ b/evals/setup_helpers/worktree.py
@@ -1,11 +1,12 @@
 from __future__ import annotations
+
 import json
 import subprocess
+from contextlib import suppress
 from pathlib import Path

 from setup_helpers.base import _git

-
 CALLER_CONSENT_PLAN = """\
 # Custom Greeting Implementation Plan

@@ -37,28 +38,39 @@ CALLER_CONSENT_PLAN = """\
 def add_worktree(repo_dir: Path, branch: str, worktree_path: str) -> None:
    subprocess.run(
        ["git", "worktree", "add", "-b", branch, worktree_path],
-        cwd=repo_dir, check=True, capture_output=True,
+        cwd=repo_dir,
+        check=True,
+        capture_output=True,
    )


 def detach_head(worktree_path: str) -> None:
    result = subprocess.run(
-        ["git", "rev-parse", "HEAD"], cwd=worktree_path,
-        capture_output=True, text=True, check=True,
+        ["git", "rev-parse", "HEAD"],
+        cwd=worktree_path,
+        capture_output=True,
+        text=True,
+        check=True,
    )
    commit = result.stdout.strip()
    result = subprocess.run(
-        ["git", "branch", "--show-current"], cwd=worktree_path,
-        capture_output=True, text=True, check=True,
+        ["git", "branch", "--show-current"],
+        cwd=worktree_path,
+        capture_output=True,
+        text=True,
+        check=True,
    )
    branch = result.stdout.strip()
    subprocess.run(
-        ["git", "checkout", "--detach", commit], cwd=worktree_path,
-        check=True, capture_output=True,
+        ["git", "checkout", "--detach", commit],
+        cwd=worktree_path,
+        check=True,
+        capture_output=True,
    )
    if branch:
        subprocess.run(
-            ["git", "branch", "-D", branch], cwd=worktree_path,
+            ["git", "branch", "-D", branch],
+            cwd=worktree_path,
            capture_output=True,
        )

@@ -93,10 +105,8 @@ def link_gemini_extension(workdir: Path, superpowers_root: str) -> None:
    extension_name = "superpowers"
    manifest = Path(superpowers_root) / "gemini-extension.json"
    if manifest.exists():
-        try:
+        with suppress(json.JSONDecodeError):
            extension_name = json.loads(manifest.read_text()).get("name", extension_name)
-        except json.JSONDecodeError:
-            pass

    # Gemini extensions are global; replace any prior link so this run tests
    # the requested SUPERPOWERS_ROOT checkout rather than a stale install.