mirror of
https://github.com/obra/superpowers.git
synced 2026-05-09 18:49:04 +08:00
Lift drill into evals/ at 013fcb8b7dbefd6d3fa4653493e5d2ec8e7f985b
rsync of obra/drill@013fcb8b7d into superpowers/evals/, excluding .git/, .venv/, results/, .env/, __pycache__/, *.egg-info/, .private-journal/. The drill repo is unaffected by this commit; archival is a separate manual step after this PR merges. Source SHA recorded at evals/.drill-source-sha for divergence detection.
This commit is contained in:
committed by
Drew Ritter
parent
2e46e9590d
commit
3b412a3836
54
evals/bin/skill-before-tool-match
Executable file
54
evals/bin/skill-before-tool-match
Executable file
@@ -0,0 +1,54 @@
|
||||
#!/usr/bin/env bash
# Ordering check: a named Skill must be invoked ahead of the first Bash call
# whose command matches a regex.
#
# Usage:   skill-before-tool-match <skill-name> <bash-command-regex>
# Example: skill-before-tool-match superpowers:verification-before-completion 'git[[:space:]]+commit'
#
# Input: tool_calls.jsonl in the current directory, slurped with `jq -s`.
#
# Outcomes:
#   - No Bash command matches the regex             -> PASS (vacuous: the gated
#                                                      event never occurred).
#   - A Bash command matches, Skill never invoked   -> FAIL.
#   - First Skill call precedes first Bash match    -> PASS.
#   - First Skill call follows first Bash match     -> FAIL.
#
# Exit status: 0 on PASS, 1 on FAIL, 127 when jq is not installed. If jq itself
# fails, `set -e` aborts the script with jq's status.
set -euo pipefail

if ! command -v jq >/dev/null; then
  echo "jq required"
  exit 127
fi

SKILL_NAME="$1"
BASH_REGEX="$2"
FILE="tool_calls.jsonl"

[ -s "$FILE" ] || {
  echo "FAIL: tool_calls.jsonl missing or empty"
  exit 1
}

# Each query emits the 0-based position of the first qualifying entry in the
# slurped log, or -1 when no entry qualifies.
skill_query='[to_entries[] | select(.value.tool == "Skill" and (.value.args.skill // "") == $name)] | first | (.key // -1)'
bash_query='[to_entries[] | select(.value.tool == "Bash" and ((.value.args.command // "") | test($re)))] | first | (.key // -1)'

SKILL_IDX=$(jq -s --arg name "$SKILL_NAME" "$skill_query" "$FILE")
BASH_IDX=$(jq -s --arg re "$BASH_REGEX" "$bash_query" "$FILE")

# Messages report 1-based positions, hence the "+ 1" on each index.
if (( BASH_IDX < 0 )); then
  echo "PASS: no Bash call matched /$BASH_REGEX/ — assertion is vacuous"
  exit 0
elif (( SKILL_IDX < 0 )); then
  echo "FAIL: Bash /$BASH_REGEX/ fired at line $((BASH_IDX + 1)) but Skill($SKILL_NAME) never fired"
  exit 1
elif (( SKILL_IDX < BASH_IDX )); then
  echo "PASS: Skill($SKILL_NAME) at line $((SKILL_IDX + 1)) before Bash /$BASH_REGEX/ at line $((BASH_IDX + 1))"
  exit 0
else
  echo "FAIL: Skill($SKILL_NAME) at line $((SKILL_IDX + 1)) fired after Bash /$BASH_REGEX/ at line $((BASH_IDX + 1))"
  exit 1
fi
|
||||
32
evals/bin/skill-called
Executable file
32
evals/bin/skill-called
Executable file
@@ -0,0 +1,32 @@
|
||||
#!/usr/bin/env bash
# Assert that a given superpowers Skill was invoked one or more times.
#
# Usage:   skill-called <skill-name>
# Example: skill-called superpowers:systematic-debugging
#
# Shorthand for `tool-arg-match Skill '.skill == "<name>"'`, which spares
# scenario YAML from embedding jq quoting.
#
# Exit status: 0 on PASS, 1 on FAIL, 127 when jq is not installed.
set -euo pipefail

if ! command -v jq >/dev/null; then
  echo "jq required"
  exit 127
fi

SKILL_NAME="$1"
FILE="tool_calls.jsonl"

[ -s "$FILE" ] || {
  echo "FAIL: tool_calls.jsonl missing or empty"
  exit 1
}

# Count the slurped log entries that are Skill calls for the requested name.
COUNT=$(jq -s --arg name "$SKILL_NAME" \
  'map(select(.tool == "Skill" and (.args.skill // "") == $name)) | length' \
  "$FILE")

if (( COUNT <= 0 )); then
  echo "FAIL: Skill($SKILL_NAME) never called"
  exit 1
fi

echo "PASS: Skill($SKILL_NAME) called $COUNT time(s)"
exit 0
|
||||
17
evals/bin/tool-arg-match
Executable file
17
evals/bin/tool-arg-match
Executable file
@@ -0,0 +1,17 @@
|
||||
#!/usr/bin/env bash
# Assert that at least one call to a tool has args satisfying a jq filter.
#
# Usage:   tool-arg-match <tool-name> <jq-filter>
# Example: tool-arg-match Skill '.skill == "superpowers:systematic-debugging"'
#
# <jq-filter> is a jq expression evaluated with each matching call's `.args`
# as its input; calls for which it produces a truthy result are counted.
#
# Input: tool_calls.jsonl in the current directory, slurped with `jq -s`.
# Exit status: 0 on PASS, 1 on FAIL, 2 on usage error, 127 when jq is missing.
set -euo pipefail
command -v jq >/dev/null || { echo "jq required"; exit 127; }

if [ "$#" -lt 2 ]; then
  echo "Usage: tool-arg-match <tool-name> <jq-filter>"
  exit 2
fi

TOOL="$1"
FILTER="$2"
FILE="tool_calls.jsonl"

# The tool name is handed to jq as data (--arg) so quotes or backslashes in it
# cannot corrupt the program. FILTER is jq source by design and is spliced in.
# Any jq failure (missing file, malformed JSON, invalid filter) counts as zero
# matches and is therefore reported as FAIL below.
MATCHES=$(jq -s --arg tool "$TOOL" \
  "[.[] | select(.tool == \$tool) | select(.args | $FILTER)] | length" \
  "$FILE" 2>/dev/null || echo 0)

if [ "$MATCHES" -gt 0 ]; then
  echo "PASS: $TOOL has $MATCHES call(s) matching filter"
  exit 0
else
  echo "FAIL: no $TOOL calls match filter: $FILTER"
  exit 1
fi
|
||||
28
evals/bin/tool-before
Executable file
28
evals/bin/tool-before
Executable file
@@ -0,0 +1,28 @@
|
||||
#!/usr/bin/env bash
# Assert that the first call to <tool-a> occurs before the first call to
# <tool-b>. Both tools must appear in the log; if either is absent the
# assertion fails.
#
# Usage: tool-before <tool-a> <tool-b>
#
# Input: tool_calls.jsonl in the current directory, slurped with `jq -s`.
# Exit status: 0 on PASS, 1 on FAIL, 2 on usage error, 127 when jq is missing.
set -euo pipefail
command -v jq >/dev/null || { echo "jq required"; exit 127; }

if [ "$#" -lt 2 ]; then
  echo "Usage: tool-before <tool-a> <tool-b>"
  exit 2
fi

TOOL_A="$1"
TOOL_B="$2"
FILE="tool_calls.jsonl"

# Without this guard a missing log makes jq fail, and `set -e` would end the
# script at the first assignment with no PASS/FAIL line at all.
if [ ! -s "$FILE" ]; then
  echo "FAIL: tool_calls.jsonl missing or empty"
  exit 1
fi

# Print the 0-based position of the first call to tool $1, or -1 if the tool
# never appears. The name is passed with --arg so it is treated as data.
first_index_of() {
  jq -s --arg tool "$1" \
    'to_entries | map(select(.value.tool == $tool)) | first | (.key // -1)' \
    "$FILE" 2>/dev/null
}

# Testing the assignments inside `if` keeps `set -e` from aborting silently
# when jq cannot process the log.
if ! IDX_A=$(first_index_of "$TOOL_A") || ! IDX_B=$(first_index_of "$TOOL_B"); then
  echo "FAIL: jq could not process $FILE"
  exit 1
fi

if [ "$IDX_A" -lt 0 ]; then
  echo "FAIL: $TOOL_A never called"
  exit 1
fi

if [ "$IDX_B" -lt 0 ]; then
  echo "FAIL: $TOOL_B never called"
  exit 1
fi

# Messages report 1-based positions, hence the "+ 1" on each index.
if [ "$IDX_A" -lt "$IDX_B" ]; then
  echo "PASS: $TOOL_A (line $((IDX_A + 1))) before $TOOL_B (line $((IDX_B + 1)))"
  exit 0
else
  echo "FAIL: $TOOL_A at line $((IDX_A + 1)) occurred after $TOOL_B at line $((IDX_B + 1))"
  exit 1
fi
|
||||
16
evals/bin/tool-called
Executable file
16
evals/bin/tool-called
Executable file
@@ -0,0 +1,16 @@
|
||||
#!/usr/bin/env bash
# Assert that a tool was called at least once.
#
# Usage: tool-called <tool-name>
#
# Input: tool_calls.jsonl in the current directory, slurped with `jq -s`.
# Exit status: 0 on PASS, 1 on FAIL, 2 on usage error, 127 when jq is missing.
set -euo pipefail
command -v jq >/dev/null || { echo "jq required"; exit 127; }

if [ "$#" -lt 1 ]; then
  echo "Usage: tool-called <tool-name>"
  exit 2
fi

TOOL="$1"
FILE="tool_calls.jsonl"

# The tool name is handed to jq as data (--arg) so quotes or backslashes in it
# cannot corrupt the program. Any jq failure (missing file, malformed JSON)
# counts as zero calls and is therefore reported as FAIL below.
COUNT=$(jq -s --arg tool "$TOOL" \
  '[.[] | select(.tool == $tool)] | length' \
  "$FILE" 2>/dev/null || echo 0)

if [ "$COUNT" -gt 0 ]; then
  echo "PASS: $TOOL called $COUNT time(s)"
  exit 0
else
  echo "FAIL: $TOOL never called"
  exit 1
fi
|
||||
27
evals/bin/tool-count
Executable file
27
evals/bin/tool-count
Executable file
@@ -0,0 +1,27 @@
|
||||
#!/usr/bin/env bash
# Assert that the number of calls to a tool satisfies a comparison.
#
# Usage:   tool-count <tool-name> <eq|gt|gte|lt|lte> <expected-count>
# Example: tool-count Bash lte 3
#
# Input: tool_calls.jsonl in the current directory, slurped with `jq -s`.
# A missing or empty log counts as zero calls.
# Exit status: 0 on PASS, 1 on FAIL, 2 on usage error (missing arguments,
# non-integer count, unknown operator), 127 when jq is missing.
set -euo pipefail
command -v jq >/dev/null || { echo "jq required"; exit 127; }

if [ "$#" -lt 3 ]; then
  echo "Usage: tool-count <tool-name> <eq|gt|gte|lt|lte> <expected-count>"
  exit 2
fi

TOOL="$1"
OP="$2"
EXPECTED="$3"
FILE="tool_calls.jsonl"

# Shell arithmetic evaluates arbitrary text as an expression (bare words are
# read as variable names), so only a plain integer may reach $(( )).
if ! [[ "$EXPECTED" =~ ^-?[0-9]+$ ]]; then
  echo "Expected count must be an integer: $EXPECTED"
  exit 2
fi

# Force base 10 so a zero-padded value such as 08 is not read as octal.
if [[ "$EXPECTED" == -* ]]; then
  EXPECTED_NUM=$(( -10#${EXPECTED#-} ))
else
  EXPECTED_NUM=$(( 10#$EXPECTED ))
fi

# The tool name is handed to jq as data (--arg). A log that exists but cannot
# be processed is an explicit FAIL: silently counting it as zero would let
# eq 0 / lt / lte comparisons pass on unreadable data.
if [ -s "$FILE" ]; then
  if ! COUNT=$(jq -s --arg tool "$TOOL" \
    '[.[] | select(.tool == $tool)] | length' \
    "$FILE" 2>/dev/null); then
    echo "FAIL: jq could not process $FILE"
    exit 1
  fi
else
  COUNT=0
fi

# Each arithmetic comparison yields 1 when true and 0 when false.
case "$OP" in
  eq)  TEST=$(( COUNT == EXPECTED_NUM )) ;;
  gt)  TEST=$(( COUNT > EXPECTED_NUM )) ;;
  gte) TEST=$(( COUNT >= EXPECTED_NUM )) ;;
  lt)  TEST=$(( COUNT < EXPECTED_NUM )) ;;
  lte) TEST=$(( COUNT <= EXPECTED_NUM )) ;;
  *) echo "Unknown operator: $OP (expected: eq, gt, gte, lt, lte)"; exit 2 ;;
esac

if [ "$TEST" -eq 1 ]; then
  echo "PASS: $TOOL called $COUNT time(s) ($OP $EXPECTED)"
  exit 0
else
  echo "FAIL: $TOOL called $COUNT time(s) (expected $OP $EXPECTED)"
  exit 1
fi
|
||||
53
evals/bin/tool-match-before-tool-match
Executable file
53
evals/bin/tool-match-before-tool-match
Executable file
@@ -0,0 +1,53 @@
|
||||
#!/usr/bin/env bash
# Ordering check between two regex-matched tool calls: the first call to the
# "earlier" tool whose `.args.command` matches its regex must come before the
# first call to the "later" tool whose `.args.command` matches its regex.
#
# Usage:   tool-match-before-tool-match <tool-name> <earlier-regex> <tool-name> <later-regex>
# Example: tool-match-before-tool-match Bash 'pytest' Bash 'git[[:space:]]+commit'
#
# Input: tool_calls.jsonl in the current directory, slurped with `jq -s`.
#
# Outcomes:
#   - No call matches the "later" pair              -> PASS (vacuous: the gated
#                                                      event never happened).
#   - "Later" matched, "earlier" never matched      -> FAIL.
#   - First "earlier" match precedes first "later"  -> PASS.
#   - Otherwise                                     -> FAIL.
#
# Exit status: 0 on PASS, 1 on FAIL, 127 when jq is not installed. If jq itself
# fails, `set -e` aborts the script with jq's status.
set -euo pipefail

if ! command -v jq >/dev/null; then
  echo "jq required"
  exit 127
fi

TOOL_A="$1"
REGEX_A="$2"
TOOL_B="$3"
REGEX_B="$4"
FILE="tool_calls.jsonl"

[ -s "$FILE" ] || {
  echo "FAIL: tool_calls.jsonl missing or empty"
  exit 1
}

# Emits the 0-based position of the first entry whose tool equals $tool and
# whose command (empty string when absent) matches $re, or -1 if none does.
first_match_query='[to_entries[] | select(.value.tool == $tool and ((.value.args.command // "") | test($re)))] | first | (.key // -1)'

# Run the shared query for tool $1 and regex $2 against the log.
first_match_index() {
  jq -s --arg tool "$1" --arg re "$2" "$first_match_query" "$FILE"
}

IDX_A=$(first_match_index "$TOOL_A" "$REGEX_A")
IDX_B=$(first_match_index "$TOOL_B" "$REGEX_B")

# Messages report 1-based positions, hence the "+ 1" on each index.
if (( IDX_B < 0 )); then
  echo "PASS: no $TOOL_B call matched /$REGEX_B/ — assertion is vacuous"
  exit 0
elif (( IDX_A < 0 )); then
  echo "FAIL: $TOOL_B /$REGEX_B/ fired at line $((IDX_B + 1)) but no $TOOL_A /$REGEX_A/ preceded it"
  exit 1
elif (( IDX_A < IDX_B )); then
  echo "PASS: $TOOL_A /$REGEX_A/ at line $((IDX_A + 1)) before $TOOL_B /$REGEX_B/ at line $((IDX_B + 1))"
  exit 0
else
  echo "FAIL: $TOOL_A /$REGEX_A/ at line $((IDX_A + 1)) fired after $TOOL_B /$REGEX_B/ at line $((IDX_B + 1))"
  exit 1
fi
|
||||
16
evals/bin/tool-not-called
Executable file
16
evals/bin/tool-not-called
Executable file
@@ -0,0 +1,16 @@
|
||||
#!/usr/bin/env bash
# Assert that a tool was never called.
#
# Usage: tool-not-called <tool-name>
#
# Input: tool_calls.jsonl in the current directory, slurped with `jq -s`.
# A missing or empty log counts as zero calls, so the assertion passes.
# Exit status: 0 on PASS, 1 on FAIL, 2 on usage error, 127 when jq is missing.
set -euo pipefail
command -v jq >/dev/null || { echo "jq required"; exit 127; }

if [ "$#" -lt 1 ]; then
  echo "Usage: tool-not-called <tool-name>"
  exit 2
fi

TOOL="$1"
FILE="tool_calls.jsonl"

# The tool name is handed to jq as data (--arg) so quotes or backslashes in it
# cannot corrupt the program. A log that exists but cannot be processed is an
# explicit FAIL: for a negative assertion, treating a jq error as "zero calls"
# would turn unreadable data into a PASS.
if [ -s "$FILE" ]; then
  if ! COUNT=$(jq -s --arg tool "$TOOL" \
    '[.[] | select(.tool == $tool)] | length' \
    "$FILE" 2>/dev/null); then
    echo "FAIL: jq could not process $FILE"
    exit 1
  fi
else
  COUNT=0
fi

if [ "$COUNT" -eq 0 ]; then
  echo "PASS: $TOOL never called"
  exit 0
else
  echo "FAIL: $TOOL called $COUNT time(s) (expected 0)"
  exit 1
fi
|
||||
Reference in New Issue
Block a user