Lift drill into evals/ at 013fcb8b7dbefd6d3fa4653493e5d2ec8e7f985b

rsync of obra/drill@013fcb8b7d into superpowers/evals/, excluding
.git/, .venv/, results/, .env/, __pycache__/, *.egg-info/,
.private-journal/.

The drill repo is unaffected by this commit; archival is a separate
manual step after this PR merges.

Source SHA recorded at evals/.drill-source-sha for divergence
detection.
This commit is contained in:
Jesse Vincent
2026-05-06 12:15:46 -07:00
committed by Drew Ritter
parent 2e46e9590d
commit 3b412a3836
124 changed files with 13806 additions and 0 deletions

View File

@@ -0,0 +1,54 @@
#!/usr/bin/env bash
# Ordering gate: a given Skill must have been invoked ahead of the first Bash
# call whose command matches a regex.
#
# Usage:   skill-before-tool-match <skill-name> <bash-command-regex>
# Example: skill-before-tool-match superpowers:verification-before-completion 'git[[:space:]]+commit'
#
# Reads tool_calls.jsonl from the current directory and compares the position
# of the first matching Skill call with the position of the first matching
# Bash call.
#
# Outcomes:
#   - No Bash command matches the regex         -> PASS (vacuous; nothing was gated).
#   - Bash matches but the Skill never appears  -> FAIL.
#   - First Skill call precedes first Bash hit  -> PASS.
#   - First Skill call follows first Bash hit   -> FAIL.
set -euo pipefail

if ! command -v jq >/dev/null; then
  echo "jq required"
  exit 127
fi

skill=$1
cmd_regex=$2
log="tool_calls.jsonl"

if [[ ! -s $log ]]; then
  echo "FAIL: tool_calls.jsonl missing or empty"
  exit 1
fi

# jq program: 0-based position of the first Skill call with name $name, else -1.
skill_query='to_entries | map(select(.value.tool == "Skill" and (.value.args.skill // "") == $name)) | first | (.key // -1)'
# jq program: 0-based position of the first Bash call whose command matches $re, else -1.
bash_query='to_entries | map(select(.value.tool == "Bash" and ((.value.args.command // "") | test($re)))) | first | (.key // -1)'

skill_pos=$(jq -s --arg name "$skill" "$skill_query" "$log")
bash_pos=$(jq -s --arg re "$cmd_regex" "$bash_query" "$log")

# Nothing to gate: the guarded command never ran.
if ((bash_pos < 0)); then
  echo "PASS: no Bash call matched /$cmd_regex/ — assertion is vacuous"
  exit 0
fi

# The guarded command ran without the Skill ever being invoked.
if ((skill_pos < 0)); then
  echo "FAIL: Bash /$cmd_regex/ fired at line $((bash_pos + 1)) but Skill($skill) never fired"
  exit 1
fi

# Both happened; the Skill has to be strictly earlier.
if ((skill_pos >= bash_pos)); then
  echo "FAIL: Skill($skill) at line $((skill_pos + 1)) fired after Bash /$cmd_regex/ at line $((bash_pos + 1))"
  exit 1
fi

echo "PASS: Skill($skill) at line $((skill_pos + 1)) before Bash /$cmd_regex/ at line $((bash_pos + 1))"
exit 0

32
evals/bin/skill-called Executable file
View File

@@ -0,0 +1,32 @@
#!/usr/bin/env bash
# Assert that the named superpowers Skill shows up at least once in the
# recorded tool calls (tool_calls.jsonl in the current directory).
#
# Usage:   skill-called <skill-name>
# Example: skill-called superpowers:systematic-debugging
#
# Shorthand for `tool-arg-match Skill '.skill == "<name>"'`, so scenario YAML
# can avoid nested jq quoting.
set -euo pipefail

if ! command -v jq >/dev/null; then
  echo "jq required"
  exit 127
fi

wanted=$1
log="tool_calls.jsonl"

if [[ ! -s $log ]]; then
  echo "FAIL: tool_calls.jsonl missing or empty"
  exit 1
fi

# Number of Skill calls whose args.skill equals the requested name.
hits=$(
  jq -s --arg name "$wanted" \
    'map(select(.tool == "Skill" and (.args.skill // "") == $name)) | length' \
    "$log"
)

if ((hits < 1)); then
  echo "FAIL: Skill($wanted) never called"
  exit 1
fi

echo "PASS: Skill($wanted) called $hits time(s)"
exit 0

17
evals/bin/tool-arg-match Executable file
View File

@@ -0,0 +1,17 @@
#!/usr/bin/env bash
# Verify that at least one call to a tool has args satisfying a jq filter.
#
# Usage:   tool-arg-match <tool-name> <jq-filter>
# Example: tool-arg-match Skill '.skill == "superpowers:systematic-debugging"'
#
# <jq-filter> is jq code evaluated with each matching call's .args as input,
# so it is spliced into the jq program on purpose. <tool-name> is data and is
# passed with --arg, so quotes or backslashes in it cannot alter the program.
#
# Reads tool_calls.jsonl from the current directory.
# Exit status: 0 = PASS; 1 = FAIL (including a missing/empty log or a filter
# that jq cannot evaluate); 127 = jq not installed.
set -euo pipefail
command -v jq >/dev/null || { echo "jq required"; exit 127; }
TOOL="$1"
FILTER="$2"
FILE="tool_calls.jsonl"

if [ ! -s "$FILE" ]; then
  echo "FAIL: tool_calls.jsonl missing or empty"
  exit 1
fi

# jq's diagnostics stay on stderr so a broken filter is reported as such
# rather than being disguised as "no calls match".
if ! MATCHES=$(
  jq -s --arg tool "$TOOL" \
    "[.[] | select(.tool == \$tool) | select(.args | $FILTER)] | length" \
    "$FILE"
); then
  echo "FAIL: jq could not evaluate filter against tool_calls.jsonl: $FILTER"
  exit 1
fi

if [ "$MATCHES" -gt 0 ]; then
  echo "PASS: $TOOL has $MATCHES call(s) matching filter"
  exit 0
else
  echo "FAIL: no $TOOL calls match filter: $FILTER"
  exit 1
fi

28
evals/bin/tool-before Executable file
View File

@@ -0,0 +1,28 @@
#!/usr/bin/env bash
# Verify the first call to one tool happens before the first call to another.
#
# Usage:   tool-before <tool-a> <tool-b>
#
# Semantics:
# - If either tool was never called, FAIL.
# - If the first <tool-a> call precedes the first <tool-b> call, PASS.
# - Otherwise FAIL.
#
# Reads tool_calls.jsonl from the current directory.
# Exit status: 0 = PASS; 1 = FAIL (including a missing, empty, or unreadable
# log); 127 = jq not installed.
set -euo pipefail
command -v jq >/dev/null || { echo "jq required"; exit 127; }
TOOL_A="$1"
TOOL_B="$2"
FILE="tool_calls.jsonl"

# Without this check a missing log made jq fail and, under `set -e`, the
# script exited with jq's status and no PASS/FAIL line at all.
if [ ! -s "$FILE" ]; then
  echo "FAIL: tool_calls.jsonl missing or empty"
  exit 1
fi

# Print the 0-based position of the first call to tool $1, or -1 if none.
# The name goes in via --arg so it is compared as data, never parsed as jq.
first_index() {
  jq -s --arg tool "$1" \
    'to_entries | map(select(.value.tool == $tool)) | first | (.key // -1)' \
    "$FILE"
}

IDX_A=$(first_index "$TOOL_A") || { echo "FAIL: jq could not process tool_calls.jsonl"; exit 1; }
IDX_B=$(first_index "$TOOL_B") || { echo "FAIL: jq could not process tool_calls.jsonl"; exit 1; }

if [ "$IDX_A" -lt 0 ]; then
  echo "FAIL: $TOOL_A never called"
  exit 1
fi
if [ "$IDX_B" -lt 0 ]; then
  echo "FAIL: $TOOL_B never called"
  exit 1
fi
if [ "$IDX_A" -lt "$IDX_B" ]; then
  echo "PASS: $TOOL_A (line $((IDX_A + 1))) before $TOOL_B (line $((IDX_B + 1)))"
  exit 0
else
  echo "FAIL: $TOOL_A at line $((IDX_A + 1)) occurred after $TOOL_B at line $((IDX_B + 1))"
  exit 1
fi

16
evals/bin/tool-called Executable file
View File

@@ -0,0 +1,16 @@
#!/usr/bin/env bash
# Verify a tool was called at least once.
#
# Usage:   tool-called <tool-name>
#
# Reads tool_calls.jsonl from the current directory.
# Exit status: 0 = PASS; 1 = FAIL (including a missing, empty, or unreadable
# log); 127 = jq not installed.
set -euo pipefail
command -v jq >/dev/null || { echo "jq required"; exit 127; }
TOOL="$1"
FILE="tool_calls.jsonl"

if [ ! -s "$FILE" ]; then
  echo "FAIL: tool_calls.jsonl missing or empty"
  exit 1
fi

# --arg keeps the tool name as data; a quote or backslash in it can no longer
# break the jq program. jq errors stay visible on stderr instead of being
# silently turned into a count of 0.
COUNT=$(jq -s --arg tool "$TOOL" '[.[] | select(.tool == $tool)] | length' "$FILE") \
  || { echo "FAIL: jq could not process tool_calls.jsonl"; exit 1; }

if [ "$COUNT" -gt 0 ]; then
  echo "PASS: $TOOL called $COUNT time(s)"
  exit 0
else
  echo "FAIL: $TOOL never called"
  exit 1
fi

27
evals/bin/tool-count Executable file
View File

@@ -0,0 +1,27 @@
#!/usr/bin/env bash
# Verify how many times a tool was called, compared against an expected number.
#
# Usage:   tool-count <tool-name> <op> <expected>
#   <op>       one of: eq, gt, gte, lt, lte
#   <expected> an integer
#
# Reads tool_calls.jsonl from the current directory. A missing or empty log
# counts as zero calls.
# Exit status: 0 = PASS; 1 = FAIL (including a log jq cannot process);
# 2 = bad operator or non-integer <expected>; 127 = jq not installed.
set -euo pipefail
command -v jq >/dev/null || { echo "jq required"; exit 127; }
TOOL="$1"
OP="$2"
EXPECTED="$3"
FILE="tool_calls.jsonl"

# Map the operator onto a `test` comparison so <expected> is only ever
# compared as a number and never evaluated as an arithmetic expression.
case "$OP" in
  eq) CMP=-eq ;;
  gt) CMP=-gt ;;
  gte) CMP=-ge ;;
  lt) CMP=-lt ;;
  lte) CMP=-le ;;
  *) echo "Unknown operator: $OP (expected: eq, gt, gte, lt, lte)"; exit 2 ;;
esac

if ! [[ "$EXPECTED" =~ ^-?[0-9]+$ ]]; then
  echo "Invalid expected count: $EXPECTED (expected: an integer)"
  exit 2
fi

# No log means no calls; a log that exists but cannot be processed is an
# error, not a count of 0 (which could satisfy eq 0 / lt / lte by accident).
COUNT=0
if [ -s "$FILE" ]; then
  # --arg keeps the tool name as data rather than splicing it into the program.
  COUNT=$(jq -s --arg tool "$TOOL" '[.[] | select(.tool == $tool)] | length' "$FILE") \
    || { echo "FAIL: jq could not process tool_calls.jsonl"; exit 1; }
fi

if [ "$COUNT" "$CMP" "$EXPECTED" ]; then
  echo "PASS: $TOOL called $COUNT time(s) ($OP $EXPECTED)"
  exit 0
else
  echo "FAIL: $TOOL called $COUNT time(s) (expected $OP $EXPECTED)"
  exit 1
fi

View File

@@ -0,0 +1,53 @@
#!/usr/bin/env bash
# Ordering gate between two regex-matched tool calls: the first call to
# <tool-a> whose args.command matches <earlier-regex> must come ahead of the
# first call to <tool-b> whose args.command matches <later-regex>.
#
# Usage:   tool-match-before-tool-match <tool-name> <earlier-regex> <tool-name> <later-regex>
# Example: tool-match-before-tool-match Bash 'pytest' Bash 'git[[:space:]]+commit'
#
# Reads tool_calls.jsonl from the current directory.
#
# Outcomes:
#   - Nothing matches the "later" regex               -> PASS (vacuous; the gated event never happened).
#   - "Later" matched but "earlier" never did         -> FAIL.
#   - First "earlier" match precedes first "later"    -> PASS.
#   - Otherwise                                       -> FAIL.
set -euo pipefail

if ! command -v jq >/dev/null; then
  echo "jq required"
  exit 127
fi

earlier_tool=$1
earlier_re=$2
later_tool=$3
later_re=$4
log="tool_calls.jsonl"

if [[ ! -s $log ]]; then
  echo "FAIL: tool_calls.jsonl missing or empty"
  exit 1
fi

# Print the 0-based position of the first call to tool $1 whose args.command
# matches regex $2, or -1 when there is no such call.
first_match_pos() {
  jq -s --arg tool "$1" --arg re "$2" \
    'to_entries | map(select(.value.tool == $tool and ((.value.args.command // "") | test($re)))) | first | (.key // -1)' \
    "$log"
}

earlier_pos=$(first_match_pos "$earlier_tool" "$earlier_re")
later_pos=$(first_match_pos "$later_tool" "$later_re")

# The gated call never happened, so there is nothing to order.
if ((later_pos < 0)); then
  echo "PASS: no $later_tool call matched /$later_re/ — assertion is vacuous"
  exit 0
fi

# The gated call happened with no prerequisite call anywhere in the log.
if ((earlier_pos < 0)); then
  echo "FAIL: $later_tool /$later_re/ fired at line $((later_pos + 1)) but no $earlier_tool /$earlier_re/ preceded it"
  exit 1
fi

# Both happened; the prerequisite has to be strictly earlier.
if ((earlier_pos >= later_pos)); then
  echo "FAIL: $earlier_tool /$earlier_re/ at line $((earlier_pos + 1)) fired after $later_tool /$later_re/ at line $((later_pos + 1))"
  exit 1
fi

echo "PASS: $earlier_tool /$earlier_re/ at line $((earlier_pos + 1)) before $later_tool /$later_re/ at line $((later_pos + 1))"
exit 0

16
evals/bin/tool-not-called Executable file
View File

@@ -0,0 +1,16 @@
#!/usr/bin/env bash
# Verify a tool was never called.
#
# Usage:   tool-not-called <tool-name>
#
# Reads tool_calls.jsonl from the current directory. A missing or empty log
# counts as zero calls, so the assertion passes.
# Exit status: 0 = PASS; 1 = FAIL (including a log jq cannot process);
# 127 = jq not installed.
set -euo pipefail
command -v jq >/dev/null || { echo "jq required"; exit 127; }
TOOL="$1"
FILE="tool_calls.jsonl"

# No log means no calls. A log that exists but cannot be processed must not
# be treated as "0 calls": that would turn a corrupt log into a false PASS.
COUNT=0
if [ -s "$FILE" ]; then
  # --arg keeps the tool name as data rather than splicing it into the program.
  COUNT=$(jq -s --arg tool "$TOOL" '[.[] | select(.tool == $tool)] | length' "$FILE") \
    || { echo "FAIL: jq could not process tool_calls.jsonl"; exit 1; }
fi

if [ "$COUNT" -eq 0 ]; then
  echo "PASS: $TOOL never called"
  exit 0
else
  echo "FAIL: $TOOL called $COUNT time(s) (expected 0)"
  exit 1
fi