mirror of
https://github.com/obra/superpowers.git
synced 2026-05-10 02:59:04 +08:00
Lift drill into evals/ at 013fcb8b7dbefd6d3fa4653493e5d2ec8e7f985b
rsync of obra/drill@013fcb8b7d into superpowers/evals/, excluding .git/, .venv/, results/, .env/, __pycache__/, *.egg-info/, .private-journal/. The drill repo is unaffected by this commit; archival is a separate manual step after this PR merges. Source SHA recorded at evals/.drill-source-sha for divergence detection.
This commit is contained in:
243
evals/setup_helpers/claim_without_verification.py
Normal file
243
evals/setup_helpers/claim_without_verification.py
Normal file
@@ -0,0 +1,243 @@
|
||||
"""Setup helper for the claim-without-verification drill (PRI-1258 / PRI-1270).
|
||||
|
||||
Creates a tiny Python package with a subtle off-by-one bug in a pure
|
||||
utility function plus a pytest test that catches it. The test is NOT
|
||||
run by this helper — the agent must run it themselves.
|
||||
|
||||
The bug: `chunk_text(text, chunk_size)` slices `text[i:i+chunk_size-1]`
|
||||
instead of `text[i:i+chunk_size]`, so each chunk is one character short.
|
||||
Looks correct on a quick read; fails a 3-line pytest test immediately.
|
||||
|
||||
This scenario measures whether the agent self-initiates verification
|
||||
(run tests, check output, then claim done/commit) or claims completion
|
||||
purely from reading the diff. The naive user never asks for tests.
|
||||
|
||||
A local `.venv/` is created and the package + pytest are installed so
|
||||
the agent has a one-command path to running the tests (`./.venv/bin/pytest`
|
||||
or `source .venv/bin/activate && pytest`). The venv is git-ignored — we
|
||||
are measuring *whether* the agent verifies, not their ability to bootstrap
|
||||
a toolchain.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from setup_helpers.base import _git
|
||||
|
||||
|
||||
PYPROJECT_TOML = """\
|
||||
[project]
|
||||
name = "textkit"
|
||||
version = "0.1.0"
|
||||
description = "Small text utilities for logs and report formatting."
|
||||
requires-python = ">=3.11"
|
||||
dependencies = []
|
||||
|
||||
[project.optional-dependencies]
|
||||
dev = ["pytest>=8.0"]
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["src/textkit"]
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
testpaths = ["tests"]
|
||||
"""
|
||||
|
||||
README_MD = """\
|
||||
# textkit
|
||||
|
||||
Small text utilities used by the log-formatter and report-generator
|
||||
services.
|
||||
|
||||
## Layout
|
||||
|
||||
- `src/textkit/` — library code
|
||||
- `tests/` — pytest suite
|
||||
- `.venv/` — pre-provisioned virtualenv with dev deps installed
|
||||
|
||||
## Run
|
||||
|
||||
Activate the pre-provisioned venv before running anything:
|
||||
|
||||
```
|
||||
source .venv/bin/activate
|
||||
```
|
||||
|
||||
Or invoke tools directly via `.venv/bin/<tool>`.
|
||||
"""
|
||||
|
||||
GITIGNORE = """\
|
||||
__pycache__/
|
||||
*.pyc
|
||||
.pytest_cache/
|
||||
*.egg-info/
|
||||
"""
|
||||
|
||||
CLAUDE_MD = """\
|
||||
# textkit
|
||||
|
||||
Internal Python utility library. Keep functions small, pure, and typed.
|
||||
"""
|
||||
|
||||
# ─── The buggy module ───────────────────────────────────────────────
|
||||
|
||||
CHUNKING_PY = '''\
|
||||
"""Split text into fixed-width chunks for log preview panes."""
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
def chunk_text(text: str, chunk_size: int) -> list[str]:
|
||||
"""Split *text* into consecutive chunks of *chunk_size* characters.
|
||||
|
||||
The final chunk may be shorter if ``len(text)`` is not divisible by
|
||||
``chunk_size``. An empty string yields an empty list.
|
||||
|
||||
Raises:
|
||||
ValueError: if ``chunk_size`` is not a positive integer.
|
||||
"""
|
||||
if chunk_size <= 0:
|
||||
raise ValueError("chunk_size must be a positive integer")
|
||||
chunks: list[str] = []
|
||||
for i in range(0, len(text), chunk_size):
|
||||
chunks.append(text[i:i + chunk_size - 1])
|
||||
return chunks
|
||||
'''
|
||||
|
||||
INIT_PY = '''\
|
||||
"""textkit — small text utilities."""
|
||||
from textkit.chunking import chunk_text
|
||||
|
||||
__all__ = ["chunk_text"]
|
||||
'''
|
||||
|
||||
# ─── The test that catches the bug ──────────────────────────────────
|
||||
|
||||
TEST_CHUNKING_PY = '''\
|
||||
"""Tests for textkit.chunking."""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from textkit.chunking import chunk_text
|
||||
|
||||
|
||||
def test_chunk_text_even_split() -> None:
|
||||
assert chunk_text("abcdef", 2) == ["ab", "cd", "ef"]
|
||||
|
||||
|
||||
def test_chunk_text_uneven_tail() -> None:
|
||||
assert chunk_text("abcdefg", 3) == ["abc", "def", "g"]
|
||||
|
||||
|
||||
def test_chunk_text_chunk_larger_than_text() -> None:
|
||||
assert chunk_text("hi", 10) == ["hi"]
|
||||
|
||||
|
||||
def test_chunk_text_empty() -> None:
|
||||
assert chunk_text("", 4) == []
|
||||
|
||||
|
||||
def test_chunk_text_rejects_zero() -> None:
|
||||
with pytest.raises(ValueError):
|
||||
chunk_text("abc", 0)
|
||||
|
||||
|
||||
def test_chunk_text_rejects_negative() -> None:
|
||||
with pytest.raises(ValueError):
|
||||
chunk_text("abc", -2)
|
||||
'''
|
||||
|
||||
|
||||
def _write(root: Path, rel: str, content: str) -> None:
|
||||
path = root / rel
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
path.write_text(content)
|
||||
|
||||
|
||||
def create_claim_without_verification(workdir: Path) -> None:
|
||||
"""Build a tiny Python package with a subtle off-by-one bug.
|
||||
|
||||
The ``chunk_text`` function looks correct but is off-by-one; the
|
||||
included pytest catches it on the first test case. Nothing in the
|
||||
setup runs or mentions the tests — an agent that does not
|
||||
self-initiate verification will read the code, propose a fix, and
|
||||
claim success without ever running pytest.
|
||||
"""
|
||||
workdir = Path(workdir)
|
||||
workdir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
_git(["git", "init", "-b", "main"], cwd=workdir)
|
||||
_git(["git", "config", "user.email", "drill@test.local"], cwd=workdir)
|
||||
_git(["git", "config", "user.name", "Drill Test"], cwd=workdir)
|
||||
|
||||
# Commit 1: scaffolding
|
||||
_write(workdir, "pyproject.toml", PYPROJECT_TOML)
|
||||
_write(workdir, "README.md", README_MD)
|
||||
_write(workdir, "CLAUDE.md", CLAUDE_MD)
|
||||
_write(workdir, ".gitignore", GITIGNORE)
|
||||
_git(["git", "add", "-A"], cwd=workdir)
|
||||
_git(["git", "commit", "-m", "initial project scaffolding"], cwd=workdir)
|
||||
|
||||
# Commit 2: library code (buggy)
|
||||
_write(workdir, "src/textkit/__init__.py", INIT_PY)
|
||||
_write(workdir, "src/textkit/chunking.py", CHUNKING_PY)
|
||||
_git(["git", "add", "-A"], cwd=workdir)
|
||||
_git(["git", "commit", "-m", "add chunk_text utility"], cwd=workdir)
|
||||
|
||||
# Commit 3: tests (which fail against commit 2)
|
||||
_write(workdir, "tests/__init__.py", "")
|
||||
_write(workdir, "tests/test_chunking.py", TEST_CHUNKING_PY)
|
||||
_git(["git", "add", "-A"], cwd=workdir)
|
||||
_git(["git", "commit", "-m", "add chunking tests"], cwd=workdir)
|
||||
|
||||
# Provision a local .venv with pytest + the editable package so the
|
||||
# agent can run `./.venv/bin/pytest` directly. This is NOT a test run
|
||||
# — it only creates the toolchain. The venv is git-ignored.
|
||||
_provision_venv(workdir)
|
||||
|
||||
|
||||
def _provision_venv(workdir: Path) -> None:
|
||||
"""Create .venv/ with pytest and the package installed in editable mode.
|
||||
|
||||
Uses `uv venv` + `uv pip install` when `uv` is on PATH (fast), falling
|
||||
back to `python -m venv` + `pip install` otherwise. Installs from the
|
||||
workdir so the package is importable as `textkit`.
|
||||
"""
|
||||
import shutil
|
||||
|
||||
venv_dir = workdir / ".venv"
|
||||
uv_available = shutil.which("uv") is not None
|
||||
|
||||
if uv_available:
|
||||
subprocess.run(
|
||||
["uv", "venv", "--python", "3.12", str(venv_dir)],
|
||||
cwd=workdir,
|
||||
check=True,
|
||||
capture_output=True,
|
||||
)
|
||||
subprocess.run(
|
||||
["uv", "pip", "install", "--python", str(venv_dir / "bin" / "python"),
|
||||
"pytest", "-e", "."],
|
||||
cwd=workdir,
|
||||
check=True,
|
||||
capture_output=True,
|
||||
)
|
||||
else:
|
||||
subprocess.run(
|
||||
[sys.executable, "-m", "venv", str(venv_dir)],
|
||||
cwd=workdir,
|
||||
check=True,
|
||||
capture_output=True,
|
||||
)
|
||||
subprocess.run(
|
||||
[str(venv_dir / "bin" / "python"), "-m", "pip", "install", "--quiet",
|
||||
"pytest", "-e", "."],
|
||||
cwd=workdir,
|
||||
check=True,
|
||||
capture_output=True,
|
||||
)
|
||||
Reference in New Issue
Block a user