mirror of
https://github.com/obra/superpowers.git
synced 2026-04-23 09:59:05 +08:00
Compare commits
41 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a7a8c08c02 | ||
|
|
80643c2604 | ||
|
|
95c6e16336 | ||
|
|
612fbcdd01 | ||
|
|
70bf3a9e7f | ||
|
|
c2125b41e3 | ||
|
|
ae0ef56b44 | ||
|
|
030a222af1 | ||
|
|
2a19be0b78 | ||
|
|
cd83439bb2 | ||
|
|
ed06dcbe27 | ||
|
|
baef5241aa | ||
|
|
28ba020000 | ||
|
|
a9b94ae5d7 | ||
|
|
5845b52747 | ||
|
|
3f725ff0d4 | ||
|
|
b57c27d815 | ||
|
|
718ec45d33 | ||
|
|
c7caee5647 | ||
|
|
1f611f5c0c | ||
|
|
2d7408d0c6 | ||
|
|
9464a51779 | ||
|
|
faa65e7163 | ||
|
|
24ca8cd9d5 | ||
|
|
8fbeca830a | ||
|
|
67de772d7f | ||
|
|
cf72863792 | ||
|
|
baa23b16bb | ||
|
|
0aba33be1c | ||
|
|
06310d6f5f | ||
|
|
dc11a093c3 | ||
|
|
fa946ae465 | ||
|
|
51a171cd14 | ||
|
|
466332f698 | ||
|
|
87afde2390 | ||
|
|
2a6a40fe10 | ||
|
|
97ce1f8fe0 | ||
|
|
e7e50ac947 | ||
|
|
5faddc4087 | ||
|
|
d2900eae0c | ||
|
|
4ac67830d9 |
@@ -9,7 +9,7 @@
|
||||
{
|
||||
"name": "superpowers",
|
||||
"description": "Core skills library for Claude Code: TDD, debugging, collaboration patterns, and proven techniques",
|
||||
"version": "3.6.2",
|
||||
"version": "4.0.1",
|
||||
"source": "./",
|
||||
"author": {
|
||||
"name": "Jesse Vincent",
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"name": "superpowers",
|
||||
"description": "Core skills library for Claude Code: TDD, debugging, collaboration patterns, and proven techniques",
|
||||
"version": "3.6.2",
|
||||
"version": "4.0.1",
|
||||
"author": {
|
||||
"name": "Jesse Vincent",
|
||||
"email": "jesse@fsck.com"
|
||||
|
||||
@@ -1,141 +0,0 @@
|
||||
{
|
||||
"permissions": {
|
||||
"allow": [
|
||||
"Read(//Users/jesse/.claude/plugins/cache/superpowers/skills/getting-started/**)",
|
||||
"Read(//Users/jesse/Downloads/**)",
|
||||
"Bash(~/.claude/plugins/cache/superpowers/skills/getting-started/list-skills)",
|
||||
"Bash(~/.claude/plugins/cache/superpowers/skills/getting-started/skills-search \"prompt\")",
|
||||
"Bash(~/.claude/plugins/cache/superpowers/skills/getting-started/skills-search \"communication\")",
|
||||
"Bash(~/.claude/plugins/cache/superpowers/skills/getting-started/skills-search \"interaction\")",
|
||||
"Read(//Users/jesse/.claude/plugins/cache/superpowers/skills/meta/testing-skills-with-subagents/**)",
|
||||
"Read(//Users/jesse/.claude/plugins/cache/superpowers/skills/collaboration/dispatching-parallel-agents/**)",
|
||||
"Read(//Users/jesse/.claude/plugins/cache/superpowers/skills/collaboration/requesting-code-review/**)",
|
||||
"Read(//Users/jesse/.claude/plugins/cache/superpowers/skills/collaboration/writing-plans/**)",
|
||||
"mcp__journal__search_journal",
|
||||
"Read(//Users/jesse/.claude/plugins/cache/superpowers/skills/meta/creating-skills/**)",
|
||||
"Read(//Users/jesse/.claude/plugins/cache/superpowers/skills/collaboration/brainstorming/**)",
|
||||
"Read(//Users/jesse/.claude/plugins/cache/superpowers/skills/**)",
|
||||
"Read(//Users/jesse/.claude/plugins/cache/**)",
|
||||
"mcp__journal__read_journal_entry",
|
||||
"Bash(/Users/jesse/git/superpowers/superpowers/skills/getting-started/list-skills)",
|
||||
"Bash(/Users/jesse/git/superpowers/superpowers/skills/getting-started/skills-search refactor)",
|
||||
"Read(//Users/jesse/Documents/GitHub/superpowers/**)",
|
||||
"Bash(${CLAUDE_PLUGIN_ROOT}/skills/getting-started/list-skills:*)",
|
||||
"Bash(/Users/jesse/Documents/GitHub/superpowers/superpowers/skills/getting-started/list-skills)",
|
||||
"Bash(/Users/jesse/Documents/GitHub/superpowers/superpowers/skills/getting-started/skills-search editing)",
|
||||
"Bash(list-skills brainstorm)",
|
||||
"Read(//Users/jesse/.claude/commands/**)",
|
||||
"Bash(git checkout:*)",
|
||||
"Bash(/Users/jesse/.claude/plugins/cache/superpowers/skills/getting-started/list-skills)",
|
||||
"Bash(ln:*)",
|
||||
"Bash(git add:*)",
|
||||
"Bash(git commit:*)",
|
||||
"Bash(git push:*)",
|
||||
"Read(//Users/jesse/.claude/plugins/**)",
|
||||
"Read(//Users/jesse/.claude/**)",
|
||||
"Bash(cat:*)",
|
||||
"Read(//Users/jesse/.superpowers/**)",
|
||||
"Bash(find:*)",
|
||||
"Read(//Users/jesse/.clank/**)",
|
||||
"Bash(./search-conversations:*)",
|
||||
"Bash(./skills/collaboration/remembering-conversations/tool/search-conversations:*)",
|
||||
"Bash(npm install)",
|
||||
"Bash(sqlite3:*)",
|
||||
"Bash(chmod:*)",
|
||||
"Bash(/Users/jesse/Documents/GitHub/superpowers/superpowers/skills/collaboration/remembering-conversations/tool/migrate-to-config.sh:*)",
|
||||
"Read(//Users/jesse/.config/superpowers/**)",
|
||||
"Bash(./index-conversations --help)",
|
||||
"Bash(./index-conversations:*)",
|
||||
"Bash(bc)",
|
||||
"Bash(bc:*)",
|
||||
"Bash(./scripts/find-skills)",
|
||||
"Bash(./scripts/run:*)",
|
||||
"Bash(./scripts/find-skills test)",
|
||||
"Bash(find-skills:*)",
|
||||
"Bash(/Users/jesse/.claude/plugins/cache/superpowers/scripts/find-skills refactor)",
|
||||
"Bash(mkdir:*)",
|
||||
"Bash(git worktree add:*)",
|
||||
"Bash([ -f package.json ])",
|
||||
"Bash(git worktree:*)",
|
||||
"Bash(gh repo create:*)",
|
||||
"Bash(git clone:*)",
|
||||
"Bash(gh repo view:*)",
|
||||
"Bash(test:*)",
|
||||
"Bash(git ls-tree:*)",
|
||||
"Bash(git rm:*)",
|
||||
"Bash(git mv:*)",
|
||||
"Bash(/Users/jesse/Documents/GitHub/superpowers/superpowers-skills/skills/using-skills/find-skills)",
|
||||
"Bash(tree:*)",
|
||||
"Bash(/Users/jesse/Documents/GitHub/superpowers/superpowers-skills/skills/using-skills/skill-run --help)",
|
||||
"Bash(echo:*)",
|
||||
"Bash(git log:*)",
|
||||
"Bash(git show:*)",
|
||||
"Bash(git diff-tree:*)",
|
||||
"Bash(bash:*)",
|
||||
"Bash(xargs ls:*)",
|
||||
"Bash(git rev-parse:*)",
|
||||
"Bash(git reset:*)",
|
||||
"Bash(./skills/using-skills/find-skills)",
|
||||
"Bash(git rebase:*)",
|
||||
"Bash(GIT_SEQUENCE_EDITOR=\"sed -i '' 's/^pick 683707a/edit 683707a/'\" git rebase:*)",
|
||||
"Bash(gh pr create:*)",
|
||||
"Bash(for:*)",
|
||||
"Bash(do [ -f \"$skill\" ])",
|
||||
"Bash(! grep -q \"^when_to_use:\" \"$skill\")",
|
||||
"Bash(done)",
|
||||
"Bash(gh issue view:*)",
|
||||
"Bash(gh pr view:*)",
|
||||
"Bash(gh pr diff:*)",
|
||||
"Bash(/Users/jesse/Documents/GitHub/superpowers/superpowers-skills/skills/using-skills/find-skills test)",
|
||||
"Bash(xargs -I {} bash -c 'dir=$(echo {} | sed \"\"\"\"s|/SKILL.md||\"\"\"\" | xargs basename); name=$(grep \"\"\"\"^name:\"\"\"\" {} | sed \"\"\"\"s/^name: //\"\"\"\"); echo \"\"\"\"$dir -> $name\"\"\"\"')",
|
||||
"mcp__obsidian-mcp-tools__fetch",
|
||||
"Skill(superpowers:using-git-worktrees)",
|
||||
"Skill(superpowers:subagent-driven-development)",
|
||||
"Bash(./test-raw.sh:*)",
|
||||
"Bash(./chrome-ws raw \"ws://localhost:9222/devtools/page/test\" '{\"\"id\"\":1,\"\"method\"\":\"\"Browser.getVersion\"\"}')",
|
||||
"Bash(./test-tabs.sh:*)",
|
||||
"Bash(curl:*)",
|
||||
"Bash(./chrome-ws tabs:*)",
|
||||
"Bash(./chrome-ws close:*)",
|
||||
"Bash(./chrome-ws raw:*)",
|
||||
"Bash(./chrome-ws new:*)",
|
||||
"Bash(./test-navigate.sh:*)",
|
||||
"Bash(./test-interact.sh:*)",
|
||||
"Bash(./test-extract.sh)",
|
||||
"Bash(./test-wait.sh:*)",
|
||||
"Bash(./test-e2e.sh:*)",
|
||||
"Bash(./chrome-ws extract:*)",
|
||||
"Bash(./chrome-ws screenshot:*)",
|
||||
"Bash(./chrome-ws start:*)",
|
||||
"Bash(./chrome-ws navigate:*)",
|
||||
"Bash(git init:*)",
|
||||
"Bash(git tag:*)",
|
||||
"Skill(example-skills:mcp-builder)",
|
||||
"Bash(npm run build)",
|
||||
"Bash(npm run clean)",
|
||||
"Bash(timeout 3s node dist/index.js)",
|
||||
"Bash(git -C /Users/jesse/Documents/GitHub/superpowers/superpowers-chrome ls-files .claude-plugin/marketplace.json)",
|
||||
"mcp__private-journal__read_journal_entry",
|
||||
"Bash(git pull:*)",
|
||||
"Skill(elements-of-style:writing-clearly-and-concisely)",
|
||||
"Bash(gh release list:*)",
|
||||
"Bash(gh release create:*)",
|
||||
"Read(//Users/jesse/git/superpowers/superpowers-marketplace/.claude-plugin/**)",
|
||||
"mcp__plugin_episodic-memory_episodic-memory__search",
|
||||
"Skill(superpowers:writing-skills)",
|
||||
"mcp__private-journal__process_thoughts",
|
||||
"Skill(superpowers:brainstorming)",
|
||||
"Skill(superpowers:using-superpowers)",
|
||||
"Skill(episodic-memory:remembering-conversations)",
|
||||
"Skill(superpowers-developing-for-claude-code:developing-claude-code-plugins)",
|
||||
"Skill(working-with-claude-code)"
|
||||
],
|
||||
"deny": [],
|
||||
"ask": [],
|
||||
"additionalDirectories": [
|
||||
"/Users/jesse/Documents/GitHub/superpowers/superpowers-skills/",
|
||||
"/Users/jesse/Documents/GitHub/superpowers/superpowers-marketplace",
|
||||
"/Users/jesse/Documents/GitHub/superpowers/using-chrome-directly/"
|
||||
]
|
||||
}
|
||||
}
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,2 +1,3 @@
|
||||
.worktrees/
|
||||
.private-journal/
|
||||
.claude/
|
||||
|
||||
24
README.md
24
README.md
@@ -85,7 +85,7 @@ Fetch and follow instructions from https://raw.githubusercontent.com/obra/superp
|
||||
|
||||
3. **writing-plans** - Activates with approved design. Breaks work into bite-sized tasks (2-5 minutes each). Every task has exact file paths, complete code, verification steps.
|
||||
|
||||
4. **subagent-driven-development** or **executing-plans** - Activates with plan. Dispatches fresh subagent per task (same session, fast iteration) or executes in batches (parallel session, human checkpoints).
|
||||
4. **subagent-driven-development** or **executing-plans** - Activates with plan. Dispatches fresh subagent per task with two-stage review (spec compliance, then code quality), or executes in batches with human checkpoints.
|
||||
|
||||
5. **test-driven-development** - Activates during implementation. Enforces RED-GREEN-REFACTOR: write failing test, watch it fail, write minimal code, watch it pass, commit. Deletes code written before tests.
|
||||
|
||||
@@ -100,15 +100,11 @@ Fetch and follow instructions from https://raw.githubusercontent.com/obra/superp
|
||||
### Skills Library
|
||||
|
||||
**Testing**
|
||||
- **test-driven-development** - RED-GREEN-REFACTOR cycle
|
||||
- **condition-based-waiting** - Async test patterns
|
||||
- **testing-anti-patterns** - Common pitfalls to avoid
|
||||
- **test-driven-development** - RED-GREEN-REFACTOR cycle (includes testing anti-patterns reference)
|
||||
|
||||
**Debugging**
|
||||
- **systematic-debugging** - 4-phase root cause process
|
||||
- **root-cause-tracing** - Find the real problem
|
||||
**Debugging**
|
||||
- **systematic-debugging** - 4-phase root cause process (includes root-cause-tracing, defense-in-depth, condition-based-waiting techniques)
|
||||
- **verification-before-completion** - Ensure it's actually fixed
|
||||
- **defense-in-depth** - Multiple validation layers
|
||||
|
||||
**Collaboration**
|
||||
- **brainstorming** - Socratic design refinement
|
||||
@@ -119,11 +115,10 @@ Fetch and follow instructions from https://raw.githubusercontent.com/obra/superp
|
||||
- **receiving-code-review** - Responding to feedback
|
||||
- **using-git-worktrees** - Parallel development branches
|
||||
- **finishing-a-development-branch** - Merge/PR decision workflow
|
||||
- **subagent-driven-development** - Fast iteration with quality gates
|
||||
- **subagent-driven-development** - Fast iteration with two-stage review (spec compliance, then code quality)
|
||||
|
||||
**Meta**
|
||||
- **writing-skills** - Create new skills following best practices
|
||||
- **testing-skills-with-subagents** - Validate skill quality
|
||||
**Meta**
|
||||
- **writing-skills** - Create new skills following best practices (includes testing methodology)
|
||||
- **using-superpowers** - Introduction to the skills system
|
||||
|
||||
## Philosophy
|
||||
@@ -141,9 +136,8 @@ Skills live directly in this repository. To contribute:
|
||||
|
||||
1. Fork the repository
|
||||
2. Create a branch for your skill
|
||||
3. Follow the `writing-skills` skill for creating new skills
|
||||
4. Use the `testing-skills-with-subagents` skill to validate quality
|
||||
5. Submit a PR
|
||||
3. Follow the `writing-skills` skill for creating and testing new skills
|
||||
4. Submit a PR
|
||||
|
||||
See `skills/writing-skills/SKILL.md` for the complete guide.
|
||||
|
||||
|
||||
104
RELEASE-NOTES.md
104
RELEASE-NOTES.md
@@ -1,5 +1,103 @@
|
||||
# Superpowers Release Notes
|
||||
|
||||
## v4.0.1 (2025-12-22)
|
||||
|
||||
### Fixes
|
||||
|
||||
**Clarified how to access skills in Claude Code**
|
||||
|
||||
Fixed a confusing pattern where Claude would invoke a skill via the Skill tool, then try to Read the skill file separately. The `using-superpowers` skill now explicitly states that the Skill tool loads skill content directly—no need to read files.
|
||||
|
||||
- Added "How to Access Skills" section to `using-superpowers`
|
||||
- Changed "read the skill" → "invoke the skill" in instructions
|
||||
- Updated slash commands to use fully qualified skill names (e.g., `superpowers:brainstorming`)
|
||||
|
||||
## v4.0.0 (2025-12-17)
|
||||
|
||||
### New Features
|
||||
|
||||
**Two-stage code review in subagent-driven-development**
|
||||
|
||||
Subagent workflows now use two separate review stages after each task:
|
||||
|
||||
1. **Spec compliance review** - Skeptical reviewer verifies implementation matches spec exactly. Catches missing requirements AND over-building. Won't trust implementer's report—reads actual code.
|
||||
|
||||
2. **Code quality review** - Only runs after spec compliance passes. Reviews for clean code, test coverage, maintainability.
|
||||
|
||||
This catches the common failure mode where code is well-written but doesn't match what was requested. Reviews are loops, not one-shot: if reviewer finds issues, implementer fixes them, then reviewer checks again.
|
||||
|
||||
Other subagent workflow improvements:
|
||||
- Controller provides full task text to workers (not file references)
|
||||
- Workers can ask clarifying questions before AND during work
|
||||
- Self-review checklist before reporting completion
|
||||
- Plan read once at start, extracted to TodoWrite
|
||||
|
||||
New prompt templates in `skills/subagent-driven-development/`:
|
||||
- `implementer-prompt.md` - Includes self-review checklist, encourages questions
|
||||
- `spec-reviewer-prompt.md` - Skeptical verification against requirements
|
||||
- `code-quality-reviewer-prompt.md` - Standard code review
|
||||
|
||||
**Debugging techniques consolidated with tools**
|
||||
|
||||
`systematic-debugging` now bundles supporting techniques and tools:
|
||||
- `root-cause-tracing.md` - Trace bugs backward through call stack
|
||||
- `defense-in-depth.md` - Add validation at multiple layers
|
||||
- `condition-based-waiting.md` - Replace arbitrary timeouts with condition polling
|
||||
- `find-polluter.sh` - Bisection script to find which test creates pollution
|
||||
- `condition-based-waiting-example.ts` - Complete implementation from real debugging session
|
||||
|
||||
**Testing anti-patterns reference**
|
||||
|
||||
`test-driven-development` now includes `testing-anti-patterns.md` covering:
|
||||
- Testing mock behavior instead of real behavior
|
||||
- Adding test-only methods to production classes
|
||||
- Mocking without understanding dependencies
|
||||
- Incomplete mocks that hide structural assumptions
|
||||
|
||||
**Skill test infrastructure**
|
||||
|
||||
Three new test frameworks for validating skill behavior:
|
||||
|
||||
`tests/skill-triggering/` - Validates skills trigger from naive prompts without explicit naming. Tests 6 skills to ensure descriptions alone are sufficient.
|
||||
|
||||
`tests/claude-code/` - Integration tests using `claude -p` for headless testing. Verifies skill usage via session transcript (JSONL) analysis. Includes `analyze-token-usage.py` for cost tracking.
|
||||
|
||||
`tests/subagent-driven-dev/` - End-to-end workflow validation with two complete test projects:
|
||||
- `go-fractals/` - CLI tool with Sierpinski/Mandelbrot (10 tasks)
|
||||
- `svelte-todo/` - CRUD app with localStorage and Playwright (12 tasks)
|
||||
|
||||
### Major Changes
|
||||
|
||||
**DOT flowcharts as executable specifications**
|
||||
|
||||
Rewrote key skills using DOT/GraphViz flowcharts as the authoritative process definition. Prose becomes supporting content.
|
||||
|
||||
**The Description Trap** (documented in `writing-skills`): Discovered that skill descriptions override flowchart content when descriptions contain workflow summaries. Claude follows the short description instead of reading the detailed flowchart. Fix: descriptions must be trigger-only ("Use when X") with no process details.
|
||||
|
||||
**Skill priority in using-superpowers**
|
||||
|
||||
When multiple skills apply, process skills (brainstorming, debugging) now explicitly come before implementation skills. "Build X" triggers brainstorming first, then domain skills.
|
||||
|
||||
**brainstorming trigger strengthened**
|
||||
|
||||
Description changed to imperative: "You MUST use this before any creative work—creating features, building components, adding functionality, or modifying behavior."
|
||||
|
||||
### Breaking Changes
|
||||
|
||||
**Skill consolidation** - Six standalone skills merged:
|
||||
- `root-cause-tracing`, `defense-in-depth`, `condition-based-waiting` → bundled in `systematic-debugging/`
|
||||
- `testing-skills-with-subagents` → bundled in `writing-skills/`
|
||||
- `testing-anti-patterns` → bundled in `test-driven-development/`
|
||||
- `sharing-skills` removed (obsolete)
|
||||
|
||||
### Other Improvements
|
||||
|
||||
- **render-graphs.js** - Tool to extract DOT diagrams from skills and render to SVG
|
||||
- **Rationalizations table** in using-superpowers - Scannable format including new entries: "I need more context first", "Let me explore first", "This feels productive"
|
||||
- **docs/testing.md** - Guide to testing skills with Claude Code integration tests
|
||||
|
||||
---
|
||||
|
||||
## v3.6.2 (2025-12-03)
|
||||
|
||||
### Fixed
|
||||
@@ -98,9 +196,9 @@
|
||||
- Updated terminology: "Superpowers skills" instead of "Core skills"
|
||||
|
||||
### Files Added
|
||||
- `codex/INSTALL.md` - Installation guide for Codex users
|
||||
- `codex/superpowers-bootstrap.md` - Bootstrap instructions with Codex adaptations
|
||||
- `scripts/superpowers-codex` - Unified Node.js executable with all functionality
|
||||
- `.codex/INSTALL.md` - Installation guide for Codex users
|
||||
- `.codex/superpowers-bootstrap.md` - Bootstrap instructions with Codex adaptations
|
||||
- `.codex/superpowers-codex` - Unified Node.js executable with all functionality
|
||||
|
||||
**Note:** Codex support is experimental. The integration provides core superpowers functionality but may require refinement based on user feedback.
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
---
|
||||
description: Interactive design refinement using Socratic method
|
||||
description: "You MUST use this before any creative work - creating features, building components, adding functionality, or modifying behavior. Explores requirements and design before implementation."
|
||||
---
|
||||
|
||||
Use and follow the brainstorming skill exactly as written
|
||||
Invoke the superpowers:brainstorming skill and follow it exactly as presented to you
|
||||
|
||||
@@ -2,4 +2,4 @@
|
||||
description: Execute plan in batches with review checkpoints
|
||||
---
|
||||
|
||||
Use the executing-plans skill exactly as written
|
||||
Invoke the superpowers:executing-plans skill and follow it exactly as presented to you
|
||||
|
||||
@@ -2,4 +2,4 @@
|
||||
description: Create detailed implementation plan with bite-sized tasks
|
||||
---
|
||||
|
||||
Use the writing-plans skill exactly as written
|
||||
Invoke the superpowers:writing-plans skill and follow it exactly as presented to you
|
||||
|
||||
711
docs/plans/2025-11-28-skills-improvements-from-user-feedback.md
Normal file
711
docs/plans/2025-11-28-skills-improvements-from-user-feedback.md
Normal file
@@ -0,0 +1,711 @@
|
||||
# Skills Improvements from User Feedback
|
||||
|
||||
**Date:** 2025-11-28
|
||||
**Status:** Draft
|
||||
**Source:** Two Claude instances using superpowers in real development scenarios
|
||||
|
||||
---
|
||||
|
||||
## Executive Summary
|
||||
|
||||
Two Claude instances provided detailed feedback from actual development sessions. Their feedback reveals **systematic gaps** in current skills that allowed preventable bugs to ship despite following the skills.
|
||||
|
||||
**Critical insight:** These are problem reports, not just solution proposals. The problems are real; the solutions need careful evaluation.
|
||||
|
||||
**Key themes:**
|
||||
1. **Verification gaps** - We verify operations succeed but not that they achieve intended outcomes
|
||||
2. **Process hygiene** - Background processes accumulate and interfere across subagents
|
||||
3. **Context optimization** - Subagents get too much irrelevant information
|
||||
4. **Self-reflection missing** - No prompt to critique own work before handoff
|
||||
5. **Mock safety** - Mocks can drift from interfaces without detection
|
||||
6. **Skill activation** - Skills exist but aren't being read/used
|
||||
|
||||
---
|
||||
|
||||
## Problems Identified
|
||||
|
||||
### Problem 1: Configuration Change Verification Gap
|
||||
|
||||
**What happened:**
|
||||
- Subagent tested "OpenAI integration"
|
||||
- Set `OPENAI_API_KEY` env var
|
||||
- Got status 200 responses
|
||||
- Reported "OpenAI integration working"
|
||||
- **BUT** response contained `"model": "claude-sonnet-4-20250514"` - was actually using Anthropic
|
||||
|
||||
**Root cause:**
|
||||
`verification-before-completion` checks operations succeed but not that outcomes reflect intended configuration changes.
|
||||
|
||||
**Impact:** High - False confidence in integration tests, bugs ship to production
|
||||
|
||||
**Example failure pattern:**
|
||||
- Switch LLM provider → verify status 200 but don't check model name
|
||||
- Enable feature flag → verify no errors but don't check feature is active
|
||||
- Change environment → verify deployment succeeds but don't check environment vars
|
||||
|
||||
---
|
||||
|
||||
### Problem 2: Background Process Accumulation
|
||||
|
||||
**What happened:**
|
||||
- Multiple subagents dispatched during session
|
||||
- Each started background server processes
|
||||
- Processes accumulated (4+ servers running)
|
||||
- Stale processes still bound to ports
|
||||
- Later E2E test hit stale server with wrong config
|
||||
- Confusing/incorrect test results
|
||||
|
||||
**Root cause:**
|
||||
Subagents are stateless - don't know about previous subagents' processes. No cleanup protocol.
|
||||
|
||||
**Impact:** Medium-High - Tests hit wrong server, false passes/failures, debugging confusion
|
||||
|
||||
---
|
||||
|
||||
### Problem 3: Context Bloat in Subagent Prompts
|
||||
|
||||
**What happened:**
|
||||
- Standard approach: give subagent full plan file to read
|
||||
- Experiment: give only task + pattern + file + verify command
|
||||
- Result: Faster, more focused, single-attempt completion more common
|
||||
|
||||
**Root cause:**
|
||||
Subagents waste tokens and attention on irrelevant plan sections.
|
||||
|
||||
**Impact:** Medium - Slower execution, more failed attempts
|
||||
|
||||
**What worked:**
|
||||
```
|
||||
You are adding a single E2E test to packnplay's test suite.
|
||||
|
||||
**Your task:** Add `TestE2E_FeaturePrivilegedMode` to `pkg/runner/e2e_test.go`
|
||||
|
||||
**What to test:** A local devcontainer feature that requests `"privileged": true`
|
||||
in its metadata should result in the container running with `--privileged` flag.
|
||||
|
||||
**Follow the exact pattern of TestE2E_FeatureOptionValidation** (at the end of the file)
|
||||
|
||||
**After writing, run:** `go test -v ./pkg/runner -run TestE2E_FeaturePrivilegedMode -timeout 5m`
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Problem 4: No Self-Reflection Before Handoff
|
||||
|
||||
**What happened:**
|
||||
- Added self-reflection prompt: "Look at your work with fresh eyes - what could be better?"
|
||||
- Implementer for Task 5 identified failing test was due to implementation bug, not test bug
|
||||
- Traced to line 99: `strings.Join(metadata.Entrypoint, " ")` creating invalid Docker syntax
|
||||
- Without self-reflection, would have just reported "test fails" without root cause
|
||||
|
||||
**Root cause:**
|
||||
Implementers don't naturally step back and critique their own work before reporting completion.
|
||||
|
||||
**Impact:** Medium - Bugs handed off to reviewer that implementer could have caught
|
||||
|
||||
---
|
||||
|
||||
### Problem 5: Mock-Interface Drift
|
||||
|
||||
**What happened:**
|
||||
```typescript
|
||||
// Interface defines close()
|
||||
interface PlatformAdapter {
|
||||
close(): Promise<void>;
|
||||
}
|
||||
|
||||
// Code (BUGGY) calls cleanup()
|
||||
await adapter.cleanup();
|
||||
|
||||
// Mock (MATCHES BUG) defines cleanup()
|
||||
vi.mock('web-adapter', () => ({
|
||||
WebAdapter: vi.fn().mockImplementation(() => ({
|
||||
cleanup: vi.fn().mockResolvedValue(undefined), // Wrong!
|
||||
})),
|
||||
}));
|
||||
```
|
||||
- Tests passed
|
||||
- Runtime crashed: "adapter.cleanup is not a function"
|
||||
|
||||
**Root cause:**
|
||||
Mock derived from what buggy code calls, not from interface definition. TypeScript can't catch inline mocks with wrong method names.
|
||||
|
||||
**Impact:** High - Tests give false confidence, runtime crashes
|
||||
|
||||
**Why testing-anti-patterns didn't prevent this:**
|
||||
The skill covers testing mock behavior and mocking without understanding, but not the specific pattern of "derive mock from interface, not implementation."
|
||||
|
||||
---
|
||||
|
||||
### Problem 6: Code Reviewer File Access
|
||||
|
||||
**What happened:**
|
||||
- Code reviewer subagent dispatched
|
||||
- Couldn't find test file: "The file doesn't appear to exist in the repository"
|
||||
- File actually exists
|
||||
- Reviewer didn't know to explicitly read it first
|
||||
|
||||
**Root cause:**
|
||||
Reviewer prompts don't include explicit file reading instructions.
|
||||
|
||||
**Impact:** Low-Medium - Reviews fail or incomplete
|
||||
|
||||
---
|
||||
|
||||
### Problem 7: Fix Workflow Latency
|
||||
|
||||
**What happened:**
|
||||
- Implementer identifies bug during self-reflection
|
||||
- Implementer knows the fix
|
||||
- Current workflow: report → I dispatch fixer → fixer fixes → I verify
|
||||
- Extra round-trip adds latency without adding value
|
||||
|
||||
**Root cause:**
|
||||
Rigid separation between implementer and fixer roles when implementer has already diagnosed.
|
||||
|
||||
**Impact:** Low - Latency, but no correctness issue
|
||||
|
||||
---
|
||||
|
||||
### Problem 8: Skills Not Being Read
|
||||
|
||||
**What happened:**
|
||||
- `testing-anti-patterns` skill exists
|
||||
- Neither human nor subagents read it before writing tests
|
||||
- Would have prevented some issues (though not all - see Problem 5)
|
||||
|
||||
**Root cause:**
|
||||
No enforcement that subagents read relevant skills. No prompt includes skill reading.
|
||||
|
||||
**Impact:** Medium - Skill investment wasted if not used
|
||||
|
||||
---
|
||||
|
||||
## Proposed Improvements
|
||||
|
||||
### 1. verification-before-completion: Add Configuration Change Verification
|
||||
|
||||
**Add new section:**
|
||||
|
||||
```markdown
|
||||
## Verifying Configuration Changes
|
||||
|
||||
When testing changes to configuration, providers, feature flags, or environment:
|
||||
|
||||
**Don't just verify the operation succeeded. Verify the output reflects the intended change.**
|
||||
|
||||
### Common Failure Pattern
|
||||
|
||||
Operation succeeds because *some* valid config exists, but it's not the config you intended to test.
|
||||
|
||||
### Examples
|
||||
|
||||
| Change | Insufficient | Required |
|
||||
|--------|-------------|----------|
|
||||
| Switch LLM provider | Status 200 | Response contains expected model name |
|
||||
| Enable feature flag | No errors | Feature behavior actually active |
|
||||
| Change environment | Deploy succeeds | Logs/vars reference new environment |
|
||||
| Set credentials | Auth succeeds | Authenticated user/context is correct |
|
||||
|
||||
### Gate Function
|
||||
|
||||
```
|
||||
BEFORE claiming configuration change works:
|
||||
|
||||
1. IDENTIFY: What should be DIFFERENT after this change?
|
||||
2. LOCATE: Where is that difference observable?
|
||||
- Response field (model name, user ID)
|
||||
- Log line (environment, provider)
|
||||
- Behavior (feature active/inactive)
|
||||
3. RUN: Command that shows the observable difference
|
||||
4. VERIFY: Output contains expected difference
|
||||
5. ONLY THEN: Claim configuration change works
|
||||
|
||||
Red flags:
|
||||
- "Request succeeded" without checking content
|
||||
- Checking status code but not response body
|
||||
- Verifying no errors but not positive confirmation
|
||||
```
|
||||
|
||||
**Why this works:**
|
||||
Forces verification of INTENT, not just operation success.
|
||||
|
||||
---
|
||||
|
||||
### 2. subagent-driven-development: Add Process Hygiene for E2E Tests
|
||||
|
||||
**Add new section:**
|
||||
|
||||
```markdown
|
||||
## Process Hygiene for E2E Tests
|
||||
|
||||
When dispatching subagents that start services (servers, databases, message queues):
|
||||
|
||||
### Problem
|
||||
|
||||
Subagents are stateless - they don't know about processes started by previous subagents. Background processes persist and can interfere with later tests.
|
||||
|
||||
### Solution
|
||||
|
||||
**Before dispatching E2E test subagent, include cleanup in prompt:**
|
||||
|
||||
```
|
||||
BEFORE starting any services:
|
||||
1. Kill existing processes: pkill -f "<service-pattern>" 2>/dev/null || true
|
||||
2. Wait for cleanup: sleep 1
|
||||
3. Verify port free: lsof -i :<port> && echo "ERROR: Port still in use" || echo "Port free"
|
||||
|
||||
AFTER tests complete:
|
||||
1. Kill the process you started
|
||||
2. Verify cleanup: pgrep -f "<service-pattern>" || echo "Cleanup successful"
|
||||
```
|
||||
|
||||
### Example
|
||||
|
||||
```
|
||||
Task: Run E2E test of API server
|
||||
|
||||
Prompt includes:
|
||||
"Before starting the server:
|
||||
- Kill any existing servers: pkill -f 'node.*server.js' 2>/dev/null || true
|
||||
- Verify port 3001 is free: lsof -i :3001 && exit 1 || echo 'Port available'
|
||||
|
||||
After tests:
|
||||
- Kill the server you started
|
||||
- Verify: pgrep -f 'node.*server.js' || echo 'Cleanup verified'"
|
||||
```
|
||||
|
||||
### Why This Matters
|
||||
|
||||
- Stale processes serve requests with wrong config
|
||||
- Port conflicts cause silent failures
|
||||
- Process accumulation slows system
|
||||
- Confusing test results (hitting wrong server)
|
||||
```
|
||||
|
||||
**Trade-off analysis:**
|
||||
- Adds boilerplate to prompts
|
||||
- But prevents very confusing debugging
|
||||
- Worth it for E2E test subagents
|
||||
|
||||
---
|
||||
|
||||
### 3. subagent-driven-development: Add Lean Context Option
|
||||
|
||||
**Modify Step 2: Execute Task with Subagent**
|
||||
|
||||
**Before:**
|
||||
```
|
||||
Read that task carefully from [plan-file].
|
||||
```
|
||||
|
||||
**After:**
|
||||
```
|
||||
## Context Approaches
|
||||
|
||||
**Full Plan (default):**
|
||||
Use when tasks are complex or have dependencies:
|
||||
```
|
||||
Read Task N from [plan-file] carefully.
|
||||
```
|
||||
|
||||
**Lean Context (for independent tasks):**
|
||||
Use when task is standalone and pattern-based:
|
||||
```
|
||||
You are implementing: [1-2 sentence task description]
|
||||
|
||||
File to modify: [exact path]
|
||||
Pattern to follow: [reference to existing function/test]
|
||||
What to implement: [specific requirement]
|
||||
Verification: [exact command to run]
|
||||
|
||||
[Do NOT include full plan file]
|
||||
```
|
||||
|
||||
**Use lean context when:**
|
||||
- Task follows existing pattern (add similar test, implement similar feature)
|
||||
- Task is self-contained (doesn't need context from other tasks)
|
||||
- Pattern reference is sufficient (e.g., "follow TestE2E_FeatureOptionValidation")
|
||||
|
||||
**Use full plan when:**
|
||||
- Task has dependencies on other tasks
|
||||
- Requires understanding of overall architecture
|
||||
- Complex logic that needs context
|
||||
```
|
||||
|
||||
**Example:**
|
||||
```
|
||||
Lean context prompt:
|
||||
|
||||
"You are adding a test for privileged mode in devcontainer features.
|
||||
|
||||
File: pkg/runner/e2e_test.go
|
||||
Pattern: Follow TestE2E_FeatureOptionValidation (at end of file)
|
||||
Test: Feature with `"privileged": true` in metadata results in `--privileged` flag
|
||||
Verify: go test -v ./pkg/runner -run TestE2E_FeaturePrivilegedMode -timeout 5m
|
||||
|
||||
Report: Implementation, test results, any issues."
|
||||
```
|
||||
|
||||
**Why this works:**
|
||||
Reduces token usage, increases focus, faster completion when appropriate.
|
||||
|
||||
---
|
||||
|
||||
### 4. subagent-driven-development: Add Self-Reflection Step
|
||||
|
||||
**Modify Step 2: Execute Task with Subagent**
|
||||
|
||||
**Add to prompt template:**
|
||||
|
||||
```
|
||||
When done, BEFORE reporting back:
|
||||
|
||||
Take a step back and review your work with fresh eyes.
|
||||
|
||||
Ask yourself:
|
||||
- Does this actually solve the task as specified?
|
||||
- Are there edge cases I didn't consider?
|
||||
- Did I follow the pattern correctly?
|
||||
- If tests are failing, what's the ROOT CAUSE (implementation bug vs test bug)?
|
||||
- What could be better about this implementation?
|
||||
|
||||
If you identify issues during this reflection, fix them now.
|
||||
|
||||
Then report:
|
||||
- What you implemented
|
||||
- Self-reflection findings (if any)
|
||||
- Test results
|
||||
- Files changed
|
||||
```
|
||||
|
||||
**Why this works:**
|
||||
Catches bugs implementer can find themselves before handoff. Documented case: identified entrypoint bug through self-reflection.
|
||||
|
||||
**Trade-off:**
|
||||
Adds ~30 seconds per task, but catches issues before review.
|
||||
|
||||
---
|
||||
|
||||
### 5. requesting-code-review: Add Explicit File Reading
|
||||
|
||||
**Modify the code-reviewer template:**
|
||||
|
||||
**Add at the beginning:**
|
||||
|
||||
```markdown
|
||||
## Files to Review
|
||||
|
||||
BEFORE analyzing, read these files:
|
||||
|
||||
1. [List specific files that changed in the diff]
|
||||
2. [Files referenced by changes but not modified]
|
||||
|
||||
Use Read tool to load each file.
|
||||
|
||||
If you cannot find a file:
|
||||
- Check exact path from diff
|
||||
- Try alternate locations
|
||||
- Report: "Cannot locate [path] - please verify file exists"
|
||||
|
||||
DO NOT proceed with review until you've read the actual code.
|
||||
```
|
||||
|
||||
**Why this works:**
|
||||
Explicit instruction prevents "file not found" issues.
|
||||
|
||||
---
|
||||
|
||||
### 6. testing-anti-patterns: Add Mock-Interface Drift Anti-Pattern
|
||||
|
||||
**Add new Anti-Pattern 6:**
|
||||
|
||||
```markdown
|
||||
## Anti-Pattern 6: Mocks Derived from Implementation
|
||||
|
||||
**The violation:**
|
||||
```typescript
|
||||
// Code (BUGGY) calls cleanup()
|
||||
await adapter.cleanup();
|
||||
|
||||
// Mock (MATCHES BUG) has cleanup()
|
||||
const mock = {
|
||||
cleanup: vi.fn().mockResolvedValue(undefined)
|
||||
};
|
||||
|
||||
// Interface (CORRECT) defines close()
|
||||
interface PlatformAdapter {
|
||||
close(): Promise<void>;
|
||||
}
|
||||
```
|
||||
|
||||
**Why this is wrong:**
|
||||
- Mock encodes the bug into the test
|
||||
- TypeScript can't catch inline mocks with wrong method names
|
||||
- Test passes because both code and mock are wrong
|
||||
- Runtime crashes when real object is used
|
||||
|
||||
**The fix:**
|
||||
```typescript
|
||||
// ✅ GOOD: Derive mock from interface
|
||||
|
||||
// Step 1: Open interface definition (PlatformAdapter)
|
||||
// Step 2: List methods defined there (close, initialize, etc.)
|
||||
// Step 3: Mock EXACTLY those methods
|
||||
|
||||
const mock = {
|
||||
initialize: vi.fn().mockResolvedValue(undefined),
|
||||
close: vi.fn().mockResolvedValue(undefined), // From interface!
|
||||
};
|
||||
|
||||
// Now test FAILS because code calls cleanup() which doesn't exist
|
||||
// That failure reveals the bug BEFORE runtime
|
||||
```
|
||||
|
||||
### Gate Function
|
||||
|
||||
```
|
||||
BEFORE writing any mock:
|
||||
|
||||
1. STOP - Do NOT look at the code under test yet
|
||||
2. FIND: The interface/type definition for the dependency
|
||||
3. READ: The interface file
|
||||
4. LIST: Methods defined in the interface
|
||||
5. MOCK: ONLY those methods with EXACTLY those names
|
||||
6. DO NOT: Look at what your code calls
|
||||
|
||||
IF your test fails because code calls something not in mock:
|
||||
✅ GOOD - The test found a bug in your code
|
||||
Fix the code to call the correct interface method
|
||||
NOT the mock
|
||||
|
||||
Red flags:
|
||||
- "I'll mock what the code calls"
|
||||
- Copying method names from implementation
|
||||
- Mock written without reading interface
|
||||
- "The test is failing so I'll add this method to the mock"
|
||||
```
|
||||
|
||||
**Detection:**
|
||||
|
||||
When you see runtime error "X is not a function" and tests pass:
|
||||
1. Check if X is mocked
|
||||
2. Compare mock methods to interface methods
|
||||
3. Look for method name mismatches
|
||||
```
|
||||
|
||||
**Why this works:**
|
||||
Directly addresses the failure pattern from feedback.
|
||||
|
||||
---
|
||||
|
||||
### 7. subagent-driven-development: Require Skills Reading for Test Subagents
|
||||
|
||||
**Add to prompt template when task involves testing:**
|
||||
|
||||
```markdown
|
||||
BEFORE writing any tests:
|
||||
|
||||
1. Read testing-anti-patterns skill:
|
||||
Use Skill tool: superpowers:testing-anti-patterns
|
||||
|
||||
2. Apply gate functions from that skill when:
|
||||
- Writing mocks
|
||||
- Adding methods to production classes
|
||||
- Mocking dependencies
|
||||
|
||||
This is NOT optional. Tests that violate anti-patterns will be rejected in review.
|
||||
```
|
||||
|
||||
**Why this works:**
|
||||
Ensures skills are actually used, not just exist.
|
||||
|
||||
**Trade-off:**
|
||||
Adds time to each task, but prevents entire classes of bugs.
|
||||
|
||||
---
|
||||
|
||||
### 8. subagent-driven-development: Allow Implementer to Fix Self-Identified Issues
|
||||
|
||||
**Modify Step 2:**
|
||||
|
||||
**Current:**
|
||||
```
|
||||
Subagent reports back with summary of work.
|
||||
```
|
||||
|
||||
**Proposed:**
|
||||
```
|
||||
Subagent performs self-reflection, then:
|
||||
|
||||
IF self-reflection identifies fixable issues:
|
||||
1. Fix the issues
|
||||
2. Re-run verification
|
||||
3. Report: "Initial implementation + self-reflection fix"
|
||||
|
||||
ELSE:
|
||||
Report: "Implementation complete"
|
||||
|
||||
Include in report:
|
||||
- Self-reflection findings
|
||||
- Whether fixes were applied
|
||||
- Final verification results
|
||||
```
|
||||
|
||||
**Why this works:**
|
||||
Reduces latency when implementer already knows the fix. Documented case: would have saved one round-trip for entrypoint bug.
|
||||
|
||||
**Trade-off:**
|
||||
Slightly more complex prompt, but faster end-to-end.
|
||||
|
||||
---
|
||||
|
||||
## Implementation Plan
|
||||
|
||||
### Phase 1: High-Impact, Low-Risk (Do First)
|
||||
|
||||
1. **verification-before-completion: Configuration change verification**
|
||||
- Clear addition, doesn't change existing content
|
||||
- Addresses high-impact problem (false confidence in tests)
|
||||
- File: `skills/verification-before-completion/SKILL.md`
|
||||
|
||||
2. **testing-anti-patterns: Mock-interface drift**
|
||||
- Adds new anti-pattern, doesn't modify existing
|
||||
- Addresses high-impact problem (runtime crashes)
|
||||
- File: `skills/testing-anti-patterns/SKILL.md`
|
||||
|
||||
3. **requesting-code-review: Explicit file reading**
|
||||
- Simple addition to template
|
||||
- Fixes concrete problem (reviewers can't find files)
|
||||
- File: `skills/requesting-code-review/SKILL.md`
|
||||
|
||||
### Phase 2: Moderate Changes (Test Carefully)
|
||||
|
||||
4. **subagent-driven-development: Process hygiene**
|
||||
- Adds new section, doesn't change workflow
|
||||
- Addresses medium-high impact (test reliability)
|
||||
- File: `skills/subagent-driven-development/SKILL.md`
|
||||
|
||||
5. **subagent-driven-development: Self-reflection**
|
||||
- Changes prompt template (higher risk)
|
||||
- But documented to catch bugs
|
||||
- File: `skills/subagent-driven-development/SKILL.md`
|
||||
|
||||
6. **subagent-driven-development: Skills reading requirement**
|
||||
- Adds prompt overhead
|
||||
- But ensures skills are actually used
|
||||
- File: `skills/subagent-driven-development/SKILL.md`
|
||||
|
||||
### Phase 3: Optimization (Validate First)
|
||||
|
||||
7. **subagent-driven-development: Lean context option**
|
||||
- Adds complexity (two approaches)
|
||||
- Needs validation that it doesn't cause confusion
|
||||
- File: `skills/subagent-driven-development/SKILL.md`
|
||||
|
||||
8. **subagent-driven-development: Allow implementer to fix**
|
||||
- Changes workflow (higher risk)
|
||||
- Optimization, not bug fix
|
||||
- File: `skills/subagent-driven-development/SKILL.md`
|
||||
|
||||
---
|
||||
|
||||
## Open Questions
|
||||
|
||||
1. **Lean context approach:**
|
||||
- Should we make it the default for pattern-based tasks?
|
||||
- How do we decide which approach to use?
|
||||
- Risk of being too lean and missing important context?
|
||||
|
||||
2. **Self-reflection:**
|
||||
- Will this slow down simple tasks significantly?
|
||||
- Should it only apply to complex tasks?
|
||||
- How do we prevent "reflection fatigue" where it becomes rote?
|
||||
|
||||
3. **Process hygiene:**
|
||||
- Should this be in subagent-driven-development or a separate skill?
|
||||
- Does it apply to other workflows beyond E2E tests?
|
||||
- How do we handle cases where process SHOULD persist (dev servers)?
|
||||
|
||||
4. **Skills reading enforcement:**
|
||||
- Should we require ALL subagents to read relevant skills?
|
||||
- How do we keep prompts from becoming too long?
|
||||
- Risk of over-documenting and losing focus?
|
||||
|
||||
---
|
||||
|
||||
## Success Metrics
|
||||
|
||||
How do we know these improvements work?
|
||||
|
||||
1. **Configuration verification:**
|
||||
- Zero instances of "test passed but wrong config was used"
|
||||
- Jesse doesn't say "that's not actually testing what you think"
|
||||
|
||||
2. **Process hygiene:**
|
||||
- Zero instances of "test hit wrong server"
|
||||
- No port conflict errors during E2E test runs
|
||||
|
||||
3. **Mock-interface drift:**
|
||||
- Zero instances of "tests pass but runtime crashes on missing method"
|
||||
- No method name mismatches between mocks and interfaces
|
||||
|
||||
4. **Self-reflection:**
|
||||
- Measurable: Do implementer reports include self-reflection findings?
|
||||
- Qualitative: Do fewer bugs make it to code review?
|
||||
|
||||
5. **Skills reading:**
|
||||
- Subagent reports reference skill gate functions
|
||||
- Fewer anti-pattern violations in code review
|
||||
|
||||
---
|
||||
|
||||
## Risks and Mitigations
|
||||
|
||||
### Risk: Prompt Bloat
|
||||
**Problem:** Adding all these requirements makes prompts overwhelming
|
||||
**Mitigation:**
|
||||
- Phase implementation (don't add everything at once)
|
||||
- Make some additions conditional (E2E hygiene only for E2E tests)
|
||||
- Consider templates for different task types
|
||||
|
||||
### Risk: Analysis Paralysis
|
||||
**Problem:** Too much reflection/verification slows execution
|
||||
**Mitigation:**
|
||||
- Keep gate functions quick (seconds, not minutes)
|
||||
- Make lean context opt-in initially
|
||||
- Monitor task completion times
|
||||
|
||||
### Risk: False Sense of Security
|
||||
**Problem:** Following checklist doesn't guarantee correctness
|
||||
**Mitigation:**
|
||||
- Emphasize gate functions are minimums, not maximums
|
||||
- Keep "use judgment" language in skills
|
||||
- Document that skills catch common failures, not all failures
|
||||
|
||||
### Risk: Skill Divergence
|
||||
**Problem:** Different skills give conflicting advice
|
||||
**Mitigation:**
|
||||
- Review changes across all skills for consistency
|
||||
- Document how skills interact (Integration sections)
|
||||
- Test with real scenarios before deployment
|
||||
|
||||
---
|
||||
|
||||
## Recommendation
|
||||
|
||||
**Proceed with Phase 1 immediately:**
|
||||
- verification-before-completion: Configuration change verification
|
||||
- testing-anti-patterns: Mock-interface drift
|
||||
- requesting-code-review: Explicit file reading
|
||||
|
||||
**Test Phase 2 with Jesse before finalizing:**
|
||||
- Get feedback on self-reflection impact
|
||||
- Validate process hygiene approach
|
||||
- Confirm skills reading requirement is worth overhead
|
||||
|
||||
**Hold Phase 3 pending validation:**
|
||||
- Lean context needs real-world testing
|
||||
- Implementer-fix workflow change needs careful evaluation
|
||||
|
||||
These changes address real problems documented by users while minimizing risk of making skills worse.
|
||||
303
docs/testing.md
Normal file
303
docs/testing.md
Normal file
@@ -0,0 +1,303 @@
|
||||
# Testing Superpowers Skills
|
||||
|
||||
This document describes how to test Superpowers skills, particularly the integration tests for complex skills like `subagent-driven-development`.
|
||||
|
||||
## Overview
|
||||
|
||||
Testing skills that involve subagents, workflows, and complex interactions requires running actual Claude Code sessions in headless mode and verifying their behavior through session transcripts.
|
||||
|
||||
## Test Structure
|
||||
|
||||
```
|
||||
tests/
|
||||
├── claude-code/
|
||||
│ ├── test-helpers.sh # Shared test utilities
|
||||
│ ├── test-subagent-driven-development-integration.sh
|
||||
│ ├── analyze-token-usage.py # Token analysis tool
|
||||
│ └── run-skill-tests.sh # Test runner (if exists)
|
||||
```
|
||||
|
||||
## Running Tests
|
||||
|
||||
### Integration Tests
|
||||
|
||||
Integration tests execute real Claude Code sessions with actual skills:
|
||||
|
||||
```bash
|
||||
# Run the subagent-driven-development integration test
|
||||
cd tests/claude-code
|
||||
./test-subagent-driven-development-integration.sh
|
||||
```
|
||||
|
||||
**Note:** Integration tests can take 10-30 minutes as they execute real implementation plans with multiple subagents.
|
||||
|
||||
### Requirements
|
||||
|
||||
- Must run from the **superpowers plugin directory** (not from temp directories)
|
||||
- Claude Code must be installed and available as `claude` command
|
||||
- Local dev marketplace must be enabled: `"superpowers@superpowers-dev": true` in `~/.claude/settings.json`
|
||||
|
||||
## Integration Test: subagent-driven-development
|
||||
|
||||
### What It Tests
|
||||
|
||||
The integration test verifies the `subagent-driven-development` skill correctly:
|
||||
|
||||
1. **Plan Loading**: Reads the plan once at the beginning
|
||||
2. **Full Task Text**: Provides complete task descriptions to subagents (doesn't make them read files)
|
||||
3. **Self-Review**: Ensures subagents perform self-review before reporting
|
||||
4. **Review Order**: Runs spec compliance review before code quality review
|
||||
5. **Review Loops**: Uses review loops when issues are found
|
||||
6. **Independent Verification**: Spec reviewer reads code independently, doesn't trust implementer reports
|
||||
|
||||
### How It Works
|
||||
|
||||
1. **Setup**: Creates a temporary Node.js project with a minimal implementation plan
|
||||
2. **Execution**: Runs Claude Code in headless mode with the skill
|
||||
3. **Verification**: Parses the session transcript (`.jsonl` file) to verify:
|
||||
- Skill tool was invoked
|
||||
- Subagents were dispatched (Task tool)
|
||||
- TodoWrite was used for tracking
|
||||
- Implementation files were created
|
||||
- Tests pass
|
||||
- Git commits show proper workflow
|
||||
4. **Token Analysis**: Shows token usage breakdown by subagent
|
||||
|
||||
### Test Output
|
||||
|
||||
```
|
||||
========================================
|
||||
Integration Test: subagent-driven-development
|
||||
========================================
|
||||
|
||||
Test project: /tmp/tmp.xyz123
|
||||
|
||||
=== Verification Tests ===
|
||||
|
||||
Test 1: Skill tool invoked...
|
||||
[PASS] subagent-driven-development skill was invoked
|
||||
|
||||
Test 2: Subagents dispatched...
|
||||
[PASS] 7 subagents dispatched
|
||||
|
||||
Test 3: Task tracking...
|
||||
[PASS] TodoWrite used 5 time(s)
|
||||
|
||||
Test 6: Implementation verification...
|
||||
[PASS] src/math.js created
|
||||
[PASS] add function exists
|
||||
[PASS] multiply function exists
|
||||
[PASS] test/math.test.js created
|
||||
[PASS] Tests pass
|
||||
|
||||
Test 7: Git commit history...
|
||||
[PASS] Multiple commits created (3 total)
|
||||
|
||||
Test 8: No extra features added...
|
||||
[PASS] No extra features added
|
||||
|
||||
=========================================
|
||||
Token Usage Analysis
|
||||
=========================================
|
||||
|
||||
Usage Breakdown:
|
||||
----------------------------------------------------------------------------------------------------
|
||||
Agent Description Msgs Input Output Cache Cost
|
||||
----------------------------------------------------------------------------------------------------
|
||||
main Main session (coordinator) 34 27 3,996 1,213,703 $ 4.09
|
||||
3380c209 implementing Task 1: Create Add Function 1 2 787 24,989 $ 0.09
|
||||
34b00fde implementing Task 2: Create Multiply Function 1 4 644 25,114 $ 0.09
|
||||
3801a732 reviewing whether an implementation matches... 1 5 703 25,742 $ 0.09
|
||||
4c142934 doing a final code review... 1 6 854 25,319 $ 0.09
|
||||
5f017a42 a code reviewer. Review Task 2... 1 6 504 22,949 $ 0.08
|
||||
a6b7fbe4 a code reviewer. Review Task 1... 1 6 515 22,534 $ 0.08
|
||||
f15837c0 reviewing whether an implementation matches... 1 6 416 22,485 $ 0.07
|
||||
----------------------------------------------------------------------------------------------------
|
||||
|
||||
TOTALS:
|
||||
Total messages: 41
|
||||
Input tokens: 62
|
||||
Output tokens: 8,419
|
||||
Cache creation tokens: 132,742
|
||||
Cache read tokens: 1,382,835
|
||||
|
||||
Total input (incl cache): 1,515,639
|
||||
Total tokens: 1,524,058
|
||||
|
||||
Estimated cost: $4.67
|
||||
(at $3/$15 per M tokens for input/output)
|
||||
|
||||
========================================
|
||||
Test Summary
|
||||
========================================
|
||||
|
||||
STATUS: PASSED
|
||||
```
|
||||
|
||||
## Token Analysis Tool
|
||||
|
||||
### Usage
|
||||
|
||||
Analyze token usage from any Claude Code session:
|
||||
|
||||
```bash
|
||||
python3 tests/claude-code/analyze-token-usage.py ~/.claude/projects/<project-dir>/<session-id>.jsonl
|
||||
```
|
||||
|
||||
### Finding Session Files
|
||||
|
||||
Session transcripts are stored in `~/.claude/projects/` with the working directory path encoded:
|
||||
|
||||
```bash
|
||||
# Example for /Users/jesse/Documents/GitHub/superpowers/superpowers
|
||||
SESSION_DIR="$HOME/.claude/projects/-Users-jesse-Documents-GitHub-superpowers-superpowers"
|
||||
|
||||
# Find recent sessions
|
||||
ls -lt "$SESSION_DIR"/*.jsonl | head -5
|
||||
```
|
||||
|
||||
### What It Shows
|
||||
|
||||
- **Main session usage**: Token usage by the coordinator (you or main Claude instance)
|
||||
- **Per-subagent breakdown**: Each Task invocation with:
|
||||
- Agent ID
|
||||
- Description (extracted from prompt)
|
||||
- Message count
|
||||
- Input/output tokens
|
||||
- Cache usage
|
||||
- Estimated cost
|
||||
- **Totals**: Overall token usage and cost estimate
|
||||
|
||||
### Understanding the Output
|
||||
|
||||
- **High cache reads**: Good - means prompt caching is working
|
||||
- **High input tokens on main**: Expected - coordinator has full context
|
||||
- **Similar costs per subagent**: Expected - each gets similar task complexity
|
||||
- **Cost per task**: Typical range is $0.05-$0.15 per subagent depending on task
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Skills Not Loading
|
||||
|
||||
**Problem**: Skill not found when running headless tests
|
||||
|
||||
**Solutions**:
|
||||
1. Ensure you're running FROM the superpowers directory: `cd /path/to/superpowers && tests/...`
|
||||
2. Check `~/.claude/settings.json` has `"superpowers@superpowers-dev": true` in `enabledPlugins`
|
||||
3. Verify skill exists in `skills/` directory
|
||||
|
||||
### Permission Errors
|
||||
|
||||
**Problem**: Claude blocked from writing files or accessing directories
|
||||
|
||||
**Solutions**:
|
||||
1. Use `--permission-mode bypassPermissions` flag
|
||||
2. Use `--add-dir /path/to/temp/dir` to grant access to test directories
|
||||
3. Check file permissions on test directories
|
||||
|
||||
### Test Timeouts
|
||||
|
||||
**Problem**: Test takes too long and times out
|
||||
|
||||
**Solutions**:
|
||||
1. Increase timeout: `timeout 1800 claude ...` (30 minutes)
|
||||
2. Check for infinite loops in skill logic
|
||||
3. Review subagent task complexity
|
||||
|
||||
### Session File Not Found
|
||||
|
||||
**Problem**: Can't find session transcript after test run
|
||||
|
||||
**Solutions**:
|
||||
1. Check the correct project directory in `~/.claude/projects/`
|
||||
2. Use `find ~/.claude/projects -name "*.jsonl" -mmin -60` to find recent sessions
|
||||
3. Verify test actually ran (check for errors in test output)
|
||||
|
||||
## Writing New Integration Tests
|
||||
|
||||
### Template
|
||||
|
||||
```bash
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
source "$SCRIPT_DIR/test-helpers.sh"
|
||||
|
||||
# Create test project
|
||||
TEST_PROJECT=$(create_test_project)
|
||||
trap "cleanup_test_project $TEST_PROJECT" EXIT
|
||||
|
||||
# Set up test files...
|
||||
cd "$TEST_PROJECT"
|
||||
|
||||
# Run Claude with skill
|
||||
PROMPT="Your test prompt here"
|
||||
cd "$SCRIPT_DIR/../.." && timeout 1800 claude -p "$PROMPT" \
|
||||
--allowed-tools=all \
|
||||
--add-dir "$TEST_PROJECT" \
|
||||
--permission-mode bypassPermissions \
|
||||
2>&1 | tee output.txt
|
||||
|
||||
# Find and analyze session
|
||||
WORKING_DIR_ESCAPED=$(echo "$SCRIPT_DIR/../.." | sed 's/\\//-/g' | sed 's/^-//')
|
||||
SESSION_DIR="$HOME/.claude/projects/$WORKING_DIR_ESCAPED"
|
||||
SESSION_FILE=$(find "$SESSION_DIR" -name "*.jsonl" -type f -mmin -60 | sort -r | head -1)
|
||||
|
||||
# Verify behavior by parsing session transcript
|
||||
if grep -q '"name":"Skill".*"skill":"your-skill-name"' "$SESSION_FILE"; then
|
||||
echo "[PASS] Skill was invoked"
|
||||
fi
|
||||
|
||||
# Show token analysis
|
||||
python3 "$SCRIPT_DIR/analyze-token-usage.py" "$SESSION_FILE"
|
||||
```
|
||||
|
||||
### Best Practices
|
||||
|
||||
1. **Always cleanup**: Use trap to cleanup temp directories
|
||||
2. **Parse transcripts**: Don't grep user-facing output - parse the `.jsonl` session file
|
||||
3. **Grant permissions**: Use `--permission-mode bypassPermissions` and `--add-dir`
|
||||
4. **Run from plugin dir**: Skills only load when running from the superpowers directory
|
||||
5. **Show token usage**: Always include token analysis for cost visibility
|
||||
6. **Test real behavior**: Verify actual files created, tests passing, commits made
|
||||
|
||||
## Session Transcript Format
|
||||
|
||||
Session transcripts are JSONL (JSON Lines) files where each line is a JSON object representing a message or tool result.
|
||||
|
||||
### Key Fields
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "assistant",
|
||||
"message": {
|
||||
"content": [...],
|
||||
"usage": {
|
||||
"input_tokens": 27,
|
||||
"output_tokens": 3996,
|
||||
"cache_read_input_tokens": 1213703
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Tool Results
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "user",
|
||||
"toolUseResult": {
|
||||
"agentId": "3380c209",
|
||||
"usage": {
|
||||
"input_tokens": 2,
|
||||
"output_tokens": 787,
|
||||
"cache_read_input_tokens": 24989
|
||||
},
|
||||
"prompt": "You are implementing Task 1...",
|
||||
"content": [{"type": "text", "text": "..."}]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
The `agentId` field links to subagent sessions, and the `usage` field contains token usage for that specific subagent invocation.
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: brainstorming
|
||||
description: Use when creating or developing, before writing code or implementation plans - refines rough ideas into fully-formed designs through collaborative questioning, alternative exploration, and incremental validation. Don't use during clear 'mechanical' processes
|
||||
description: "You MUST use this before any creative work - creating features, building components, adding functionality, or modifying behavior. Explores user intent, requirements and design before implementation."
|
||||
---
|
||||
|
||||
# Brainstorming Ideas Into Designs
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: dispatching-parallel-agents
|
||||
description: Use when facing 3+ independent failures that can be investigated without shared state or dependencies - dispatches multiple Claude agents to investigate and fix independent problems concurrently
|
||||
description: Use when facing 2+ independent tasks that can be worked on without shared state or sequential dependencies
|
||||
---
|
||||
|
||||
# Dispatching Parallel Agents
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: executing-plans
|
||||
description: Use when partner provides a complete implementation plan to execute in controlled batches with review checkpoints - loads plan, reviews critically, executes tasks in batches, reports for review between batches
|
||||
description: Use when you have a written implementation plan to execute in a separate session with review checkpoints
|
||||
---
|
||||
|
||||
# Executing Plans
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: requesting-code-review
|
||||
description: Use when completing tasks, implementing major features, or before merging to verify work meets requirements - dispatches superpowers:code-reviewer subagent to review implementation against plan or requirements before proceeding
|
||||
description: Use when completing tasks, implementing major features, or before merging to verify work meets requirements
|
||||
---
|
||||
|
||||
# Requesting Code Review
|
||||
|
||||
@@ -1,194 +0,0 @@
|
||||
---
|
||||
name: sharing-skills
|
||||
description: Use when you've developed a broadly useful skill and want to contribute it upstream via pull request - guides process of branching, committing, pushing, and creating PR to contribute skills back to upstream repository
|
||||
---
|
||||
|
||||
# Sharing Skills
|
||||
|
||||
## Overview
|
||||
|
||||
Contribute skills from your local branch back to the upstream repository.
|
||||
|
||||
**Workflow:** Branch → Edit/Create skill → Commit → Push → PR
|
||||
|
||||
## When to Share
|
||||
|
||||
**Share when:**
|
||||
- Skill applies broadly (not project-specific)
|
||||
- Pattern/technique others would benefit from
|
||||
- Well-tested and documented
|
||||
- Follows writing-skills guidelines
|
||||
|
||||
**Keep personal when:**
|
||||
- Project-specific or organization-specific
|
||||
- Experimental or unstable
|
||||
- Contains sensitive information
|
||||
- Too narrow/niche for general use
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- `gh` CLI installed and authenticated
|
||||
- Working directory is `~/.config/superpowers/skills/` (your local clone)
|
||||
- **REQUIRED:** Skill has been tested using writing-skills TDD process
|
||||
|
||||
## Sharing Workflow
|
||||
|
||||
### 1. Ensure You're on Main and Synced
|
||||
|
||||
```bash
|
||||
cd ~/.config/superpowers/skills/
|
||||
git checkout main
|
||||
git pull upstream main
|
||||
git push origin main # Push to your fork
|
||||
```
|
||||
|
||||
### 2. Create Feature Branch
|
||||
|
||||
```bash
|
||||
# Branch name: add-skillname-skill
|
||||
skill_name="your-skill-name"
|
||||
git checkout -b "add-${skill_name}-skill"
|
||||
```
|
||||
|
||||
### 3. Create or Edit Skill
|
||||
|
||||
```bash
|
||||
# Work on your skill in skills/
|
||||
# Create new skill or edit existing one
|
||||
# Skill should be in skills/category/skill-name/SKILL.md
|
||||
```
|
||||
|
||||
### 4. Commit Changes
|
||||
|
||||
```bash
|
||||
# Add and commit
|
||||
git add skills/your-skill-name/
|
||||
git commit -m "Add ${skill_name} skill
|
||||
|
||||
$(cat <<'EOF'
|
||||
Brief description of what this skill does and why it's useful.
|
||||
|
||||
Tested with: [describe testing approach]
|
||||
EOF
|
||||
)"
|
||||
```
|
||||
|
||||
### 5. Push to Your Fork
|
||||
|
||||
```bash
|
||||
git push -u origin "add-${skill_name}-skill"
|
||||
```
|
||||
|
||||
### 6. Create Pull Request
|
||||
|
||||
```bash
|
||||
# Create PR to upstream using gh CLI
|
||||
gh pr create \
|
||||
--repo upstream-org/upstream-repo \
|
||||
--title "Add ${skill_name} skill" \
|
||||
--body "$(cat <<'EOF'
|
||||
## Summary
|
||||
Brief description of the skill and what problem it solves.
|
||||
|
||||
## Testing
|
||||
Describe how you tested this skill (pressure scenarios, baseline tests, etc.).
|
||||
|
||||
## Context
|
||||
Any additional context about why this skill is needed and how it should be used.
|
||||
EOF
|
||||
)"
|
||||
```
|
||||
|
||||
## Complete Example
|
||||
|
||||
Here's a complete example of sharing a skill called "async-patterns":
|
||||
|
||||
```bash
|
||||
# 1. Sync with upstream
|
||||
cd ~/.config/superpowers/skills/
|
||||
git checkout main
|
||||
git pull upstream main
|
||||
git push origin main
|
||||
|
||||
# 2. Create branch
|
||||
git checkout -b "add-async-patterns-skill"
|
||||
|
||||
# 3. Create/edit the skill
|
||||
# (Work on skills/async-patterns/SKILL.md)
|
||||
|
||||
# 4. Commit
|
||||
git add skills/async-patterns/
|
||||
git commit -m "Add async-patterns skill
|
||||
|
||||
Patterns for handling asynchronous operations in tests and application code.
|
||||
|
||||
Tested with: Multiple pressure scenarios testing agent compliance."
|
||||
|
||||
# 5. Push
|
||||
git push -u origin "add-async-patterns-skill"
|
||||
|
||||
# 6. Create PR
|
||||
gh pr create \
|
||||
--repo upstream-org/upstream-repo \
|
||||
--title "Add async-patterns skill" \
|
||||
--body "## Summary
|
||||
Patterns for handling asynchronous operations correctly in tests and application code.
|
||||
|
||||
## Testing
|
||||
Tested with multiple application scenarios. Agents successfully apply patterns to new code.
|
||||
|
||||
## Context
|
||||
Addresses common async pitfalls like race conditions, improper error handling, and timing issues."
|
||||
```
|
||||
|
||||
## After PR is Merged
|
||||
|
||||
Once your PR is merged:
|
||||
|
||||
1. Sync your local main branch:
|
||||
```bash
|
||||
cd ~/.config/superpowers/skills/
|
||||
git checkout main
|
||||
git pull upstream main
|
||||
git push origin main
|
||||
```
|
||||
|
||||
2. Delete the feature branch:
|
||||
```bash
|
||||
git branch -d "add-${skill_name}-skill"
|
||||
git push origin --delete "add-${skill_name}-skill"
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**"gh: command not found"**
|
||||
- Install GitHub CLI: https://cli.github.com/
|
||||
- Authenticate: `gh auth login`
|
||||
|
||||
**"Permission denied (publickey)"**
|
||||
- Check SSH keys: `gh auth status`
|
||||
- Set up SSH: https://docs.github.com/en/authentication
|
||||
|
||||
**"Skill already exists"**
|
||||
- You're creating a modified version
|
||||
- Consider different skill name or coordinate with the skill's maintainer
|
||||
|
||||
**PR merge conflicts**
|
||||
- Rebase on latest upstream: `git fetch upstream && git rebase upstream/main`
|
||||
- Resolve conflicts
|
||||
- Force push: `git push -f origin your-branch`
|
||||
|
||||
## Multi-Skill Contributions
|
||||
|
||||
**Do NOT batch multiple skills in one PR.**
|
||||
|
||||
Each skill should:
|
||||
- Have its own feature branch
|
||||
- Have its own PR
|
||||
- Be independently reviewable
|
||||
|
||||
**Why?** Individual skills can be reviewed, iterated, and merged independently.
|
||||
|
||||
## Related Skills
|
||||
|
||||
- **writing-skills** - REQUIRED: How to create well-tested skills before sharing
|
||||
@@ -1,140 +1,159 @@
|
||||
---
|
||||
name: subagent-driven-development
|
||||
description: Use when executing implementation plans with independent tasks in the current session - dispatches fresh subagent for each task with code review between tasks, enabling fast iteration with quality gates
|
||||
description: Use when executing implementation plans with independent tasks in the current session
|
||||
---
|
||||
|
||||
# Subagent-Driven Development
|
||||
|
||||
Execute plan by dispatching fresh subagent per task, with code review after each.
|
||||
Execute plan by dispatching fresh subagent per task, with two-stage review after each: spec compliance review first, then code quality review.
|
||||
|
||||
**Core principle:** Fresh subagent per task + review between tasks = high quality, fast iteration
|
||||
**Core principle:** Fresh subagent per task + two-stage review (spec then quality) = high quality, fast iteration
|
||||
|
||||
## Overview
|
||||
## When to Use
|
||||
|
||||
```dot
|
||||
digraph when_to_use {
|
||||
"Have implementation plan?" [shape=diamond];
|
||||
"Tasks mostly independent?" [shape=diamond];
|
||||
"Stay in this session?" [shape=diamond];
|
||||
"subagent-driven-development" [shape=box];
|
||||
"executing-plans" [shape=box];
|
||||
"Manual execution or brainstorm first" [shape=box];
|
||||
|
||||
"Have implementation plan?" -> "Tasks mostly independent?" [label="yes"];
|
||||
"Have implementation plan?" -> "Manual execution or brainstorm first" [label="no"];
|
||||
"Tasks mostly independent?" -> "Stay in this session?" [label="yes"];
|
||||
"Tasks mostly independent?" -> "Manual execution or brainstorm first" [label="no - tightly coupled"];
|
||||
"Stay in this session?" -> "subagent-driven-development" [label="yes"];
|
||||
"Stay in this session?" -> "executing-plans" [label="no - parallel session"];
|
||||
}
|
||||
```
|
||||
|
||||
**vs. Executing Plans (parallel session):**
|
||||
- Same session (no context switch)
|
||||
- Fresh subagent per task (no context pollution)
|
||||
- Code review after each task (catch issues early)
|
||||
- Two-stage review after each task: spec compliance first, then code quality
|
||||
- Faster iteration (no human-in-loop between tasks)
|
||||
|
||||
**When to use:**
|
||||
- Staying in this session
|
||||
- Tasks are mostly independent
|
||||
- Want continuous progress with quality gates
|
||||
|
||||
**When NOT to use:**
|
||||
- Need to review plan first (use executing-plans)
|
||||
- Tasks are tightly coupled (manual execution better)
|
||||
- Plan needs revision (brainstorm first)
|
||||
|
||||
## The Process
|
||||
|
||||
### 1. Load Plan
|
||||
```dot
|
||||
digraph process {
|
||||
rankdir=TB;
|
||||
|
||||
Read plan file, create TodoWrite with all tasks.
|
||||
subgraph cluster_per_task {
|
||||
label="Per Task";
|
||||
"Dispatch implementer subagent (./implementer-prompt.md)" [shape=box];
|
||||
"Implementer subagent asks questions?" [shape=diamond];
|
||||
"Answer questions, provide context" [shape=box];
|
||||
"Implementer subagent implements, tests, commits, self-reviews" [shape=box];
|
||||
"Dispatch spec reviewer subagent (./spec-reviewer-prompt.md)" [shape=box];
|
||||
"Spec reviewer subagent confirms code matches spec?" [shape=diamond];
|
||||
"Implementer subagent fixes spec gaps" [shape=box];
|
||||
"Dispatch code quality reviewer subagent (./code-quality-reviewer-prompt.md)" [shape=box];
|
||||
"Code quality reviewer subagent approves?" [shape=diamond];
|
||||
"Implementer subagent fixes quality issues" [shape=box];
|
||||
"Mark task complete in TodoWrite" [shape=box];
|
||||
}
|
||||
|
||||
### 2. Execute Task with Subagent
|
||||
"Read plan, extract all tasks with full text, note context, create TodoWrite" [shape=box];
|
||||
"More tasks remain?" [shape=diamond];
|
||||
"Dispatch final code reviewer subagent for entire implementation" [shape=box];
|
||||
"Use superpowers:finishing-a-development-branch" [shape=box style=filled fillcolor=lightgreen];
|
||||
|
||||
For each task:
|
||||
|
||||
**Dispatch fresh subagent:**
|
||||
```
|
||||
Task tool (general-purpose):
|
||||
description: "Implement Task N: [task name]"
|
||||
prompt: |
|
||||
You are implementing Task N from [plan-file].
|
||||
|
||||
Read that task carefully. Your job is to:
|
||||
1. Implement exactly what the task specifies
|
||||
2. Write tests (following TDD if task says to)
|
||||
3. Verify implementation works
|
||||
4. Commit your work
|
||||
5. Report back
|
||||
|
||||
Work from: [directory]
|
||||
|
||||
Report: What you implemented, what you tested, test results, files changed, any issues
|
||||
"Read plan, extract all tasks with full text, note context, create TodoWrite" -> "Dispatch implementer subagent (./implementer-prompt.md)";
|
||||
"Dispatch implementer subagent (./implementer-prompt.md)" -> "Implementer subagent asks questions?";
|
||||
"Implementer subagent asks questions?" -> "Answer questions, provide context" [label="yes"];
|
||||
"Answer questions, provide context" -> "Dispatch implementer subagent (./implementer-prompt.md)";
|
||||
"Implementer subagent asks questions?" -> "Implementer subagent implements, tests, commits, self-reviews" [label="no"];
|
||||
"Implementer subagent implements, tests, commits, self-reviews" -> "Dispatch spec reviewer subagent (./spec-reviewer-prompt.md)";
|
||||
"Dispatch spec reviewer subagent (./spec-reviewer-prompt.md)" -> "Spec reviewer subagent confirms code matches spec?";
|
||||
"Spec reviewer subagent confirms code matches spec?" -> "Implementer subagent fixes spec gaps" [label="no"];
|
||||
"Implementer subagent fixes spec gaps" -> "Dispatch spec reviewer subagent (./spec-reviewer-prompt.md)" [label="re-review"];
|
||||
"Spec reviewer subagent confirms code matches spec?" -> "Dispatch code quality reviewer subagent (./code-quality-reviewer-prompt.md)" [label="yes"];
|
||||
"Dispatch code quality reviewer subagent (./code-quality-reviewer-prompt.md)" -> "Code quality reviewer subagent approves?";
|
||||
"Code quality reviewer subagent approves?" -> "Implementer subagent fixes quality issues" [label="no"];
|
||||
"Implementer subagent fixes quality issues" -> "Dispatch code quality reviewer subagent (./code-quality-reviewer-prompt.md)" [label="re-review"];
|
||||
"Code quality reviewer subagent approves?" -> "Mark task complete in TodoWrite" [label="yes"];
|
||||
"Mark task complete in TodoWrite" -> "More tasks remain?";
|
||||
"More tasks remain?" -> "Dispatch implementer subagent (./implementer-prompt.md)" [label="yes"];
|
||||
"More tasks remain?" -> "Dispatch final code reviewer subagent for entire implementation" [label="no"];
|
||||
"Dispatch final code reviewer subagent for entire implementation" -> "Use superpowers:finishing-a-development-branch";
|
||||
}
|
||||
```
|
||||
|
||||
**Subagent reports back** with summary of work.
|
||||
## Prompt Templates
|
||||
|
||||
### 3. Review Subagent's Work
|
||||
|
||||
**Dispatch code-reviewer subagent:**
|
||||
```
|
||||
Task tool (superpowers:code-reviewer):
|
||||
Use template at requesting-code-review/code-reviewer.md
|
||||
|
||||
WHAT_WAS_IMPLEMENTED: [from subagent's report]
|
||||
PLAN_OR_REQUIREMENTS: Task N from [plan-file]
|
||||
BASE_SHA: [commit before task]
|
||||
HEAD_SHA: [current commit]
|
||||
DESCRIPTION: [task summary]
|
||||
```
|
||||
|
||||
**Code reviewer returns:** Strengths, Issues (Critical/Important/Minor), Assessment
|
||||
|
||||
### 4. Apply Review Feedback
|
||||
|
||||
**If issues found:**
|
||||
- Fix Critical issues immediately
|
||||
- Fix Important issues before next task
|
||||
- Note Minor issues
|
||||
|
||||
**Dispatch follow-up subagent if needed:**
|
||||
```
|
||||
"Fix issues from code review: [list issues]"
|
||||
```
|
||||
|
||||
### 5. Mark Complete, Next Task
|
||||
|
||||
- Mark task as completed in TodoWrite
|
||||
- Move to next task
|
||||
- Repeat steps 2-5
|
||||
|
||||
### 6. Final Review
|
||||
|
||||
After all tasks complete, dispatch final code-reviewer:
|
||||
- Reviews entire implementation
|
||||
- Checks all plan requirements met
|
||||
- Validates overall architecture
|
||||
|
||||
### 7. Complete Development
|
||||
|
||||
After final review passes:
|
||||
- Announce: "I'm using the finishing-a-development-branch skill to complete this work."
|
||||
- **REQUIRED SUB-SKILL:** Use superpowers:finishing-a-development-branch
|
||||
- Follow that skill to verify tests, present options, execute choice
|
||||
- `./implementer-prompt.md` - Dispatch implementer subagent
|
||||
- `./spec-reviewer-prompt.md` - Dispatch spec compliance reviewer subagent
|
||||
- `./code-quality-reviewer-prompt.md` - Dispatch code quality reviewer subagent
|
||||
|
||||
## Example Workflow
|
||||
|
||||
```
|
||||
You: I'm using Subagent-Driven Development to execute this plan.
|
||||
|
||||
[Load plan, create TodoWrite]
|
||||
[Read plan file once: docs/plans/feature-plan.md]
|
||||
[Extract all 5 tasks with full text and context]
|
||||
[Create TodoWrite with all tasks]
|
||||
|
||||
Task 1: Hook installation script
|
||||
|
||||
[Dispatch implementation subagent]
|
||||
Subagent: Implemented install-hook with tests, 5/5 passing
|
||||
[Get Task 1 text and context (already extracted)]
|
||||
[Dispatch implementation subagent with full task text + context]
|
||||
|
||||
[Get git SHAs, dispatch code-reviewer]
|
||||
Reviewer: Strengths: Good test coverage. Issues: None. Ready.
|
||||
Implementer: "Before I begin - should the hook be installed at user or system level?"
|
||||
|
||||
You: "User level (~/.config/superpowers/hooks/)"
|
||||
|
||||
Implementer: "Got it. Implementing now..."
|
||||
[Later] Implementer:
|
||||
- Implemented install-hook command
|
||||
- Added tests, 5/5 passing
|
||||
- Self-review: Found I missed --force flag, added it
|
||||
- Committed
|
||||
|
||||
[Dispatch spec compliance reviewer]
|
||||
Spec reviewer: ✅ Spec compliant - all requirements met, nothing extra
|
||||
|
||||
[Get git SHAs, dispatch code quality reviewer]
|
||||
Code reviewer: Strengths: Good test coverage, clean. Issues: None. Approved.
|
||||
|
||||
[Mark Task 1 complete]
|
||||
|
||||
Task 2: Recovery modes
|
||||
|
||||
[Dispatch implementation subagent]
|
||||
Subagent: Added verify/repair, 8/8 tests passing
|
||||
[Get Task 2 text and context (already extracted)]
|
||||
[Dispatch implementation subagent with full task text + context]
|
||||
|
||||
[Dispatch code-reviewer]
|
||||
Reviewer: Strengths: Solid. Issues (Important): Missing progress reporting
|
||||
Implementer: [No questions, proceeds]
|
||||
Implementer:
|
||||
- Added verify/repair modes
|
||||
- 8/8 tests passing
|
||||
- Self-review: All good
|
||||
- Committed
|
||||
|
||||
[Dispatch fix subagent]
|
||||
Fix subagent: Added progress every 100 conversations
|
||||
[Dispatch spec compliance reviewer]
|
||||
Spec reviewer: ❌ Issues:
|
||||
- Missing: Progress reporting (spec says "report every 100 items")
|
||||
- Extra: Added --json flag (not requested)
|
||||
|
||||
[Verify fix, mark Task 2 complete]
|
||||
[Implementer fixes issues]
|
||||
Implementer: Removed --json flag, added progress reporting
|
||||
|
||||
[Spec reviewer reviews again]
|
||||
Spec reviewer: ✅ Spec compliant now
|
||||
|
||||
[Dispatch code quality reviewer]
|
||||
Code reviewer: Strengths: Solid. Issues (Important): Magic number (100)
|
||||
|
||||
[Implementer fixes]
|
||||
Implementer: Extracted PROGRESS_INTERVAL constant
|
||||
|
||||
[Code reviewer reviews again]
|
||||
Code reviewer: ✅ Approved
|
||||
|
||||
[Mark Task 2 complete]
|
||||
|
||||
...
|
||||
|
||||
@@ -151,23 +170,57 @@ Done!
|
||||
- Subagents follow TDD naturally
|
||||
- Fresh context per task (no confusion)
|
||||
- Parallel-safe (subagents don't interfere)
|
||||
- Subagent can ask questions (before AND during work)
|
||||
|
||||
**vs. Executing Plans:**
|
||||
- Same session (no handoff)
|
||||
- Continuous progress (no waiting)
|
||||
- Review checkpoints automatic
|
||||
|
||||
**Efficiency gains:**
|
||||
- No file reading overhead (controller provides full text)
|
||||
- Controller curates exactly what context is needed
|
||||
- Subagent gets complete information upfront
|
||||
- Questions surfaced before work begins (not after)
|
||||
|
||||
**Quality gates:**
|
||||
- Self-review catches issues before handoff
|
||||
- Two-stage review: spec compliance, then code quality
|
||||
- Review loops ensure fixes actually work
|
||||
- Spec compliance prevents over/under-building
|
||||
- Code quality ensures implementation is well-built
|
||||
|
||||
**Cost:**
|
||||
- More subagent invocations
|
||||
- More subagent invocations (implementer + 2 reviewers per task)
|
||||
- Controller does more prep work (extracting all tasks upfront)
|
||||
- Review loops add iterations
|
||||
- But catches issues early (cheaper than debugging later)
|
||||
|
||||
## Red Flags
|
||||
|
||||
**Never:**
|
||||
- Skip code review between tasks
|
||||
- Proceed with unfixed Critical issues
|
||||
- Skip reviews (spec compliance OR code quality)
|
||||
- Proceed with unfixed issues
|
||||
- Dispatch multiple implementation subagents in parallel (conflicts)
|
||||
- Implement without reading plan task
|
||||
- Make subagent read plan file (provide full text instead)
|
||||
- Skip scene-setting context (subagent needs to understand where task fits)
|
||||
- Ignore subagent questions (answer before letting them proceed)
|
||||
- Accept "close enough" on spec compliance (spec reviewer found issues = not done)
|
||||
- Skip review loops (reviewer found issues = implementer fixes = review again)
|
||||
- Let implementer self-review replace actual review (both are needed)
|
||||
- **Start code quality review before spec compliance is ✅** (wrong order)
|
||||
- Move to next task while either review has open issues
|
||||
|
||||
**If subagent asks questions:**
|
||||
- Answer clearly and completely
|
||||
- Provide additional context if needed
|
||||
- Don't rush them into implementation
|
||||
|
||||
**If reviewer finds issues:**
|
||||
- Implementer (same subagent) fixes them
|
||||
- Reviewer reviews again
|
||||
- Repeat until approved
|
||||
- Don't skip the re-review
|
||||
|
||||
**If subagent fails task:**
|
||||
- Dispatch fix subagent with specific instructions
|
||||
@@ -176,14 +229,12 @@ Done!
|
||||
## Integration
|
||||
|
||||
**Required workflow skills:**
|
||||
- **writing-plans** - REQUIRED: Creates the plan that this skill executes
|
||||
- **requesting-code-review** - REQUIRED: Review after each task (see Step 3)
|
||||
- **finishing-a-development-branch** - REQUIRED: Complete development after all tasks (see Step 7)
|
||||
- **superpowers:writing-plans** - Creates the plan this skill executes
|
||||
- **superpowers:requesting-code-review** - Code review template for reviewer subagents
|
||||
- **superpowers:finishing-a-development-branch** - Complete development after all tasks
|
||||
|
||||
**Subagents must use:**
|
||||
- **test-driven-development** - Subagents follow TDD for each task
|
||||
**Subagents should use:**
|
||||
- **superpowers:test-driven-development** - Subagents follow TDD for each task
|
||||
|
||||
**Alternative workflow:**
|
||||
- **executing-plans** - Use for parallel session instead of same-session execution
|
||||
|
||||
See code-reviewer template: requesting-code-review/code-reviewer.md
|
||||
- **superpowers:executing-plans** - Use for parallel session instead of same-session execution
|
||||
|
||||
@@ -0,0 +1,20 @@
|
||||
# Code Quality Reviewer Prompt Template
|
||||
|
||||
Use this template when dispatching a code quality reviewer subagent.
|
||||
|
||||
**Purpose:** Verify implementation is well-built (clean, tested, maintainable)
|
||||
|
||||
**Only dispatch after spec compliance review passes.**
|
||||
|
||||
```
|
||||
Task tool (superpowers:code-reviewer):
|
||||
Use template at requesting-code-review/code-reviewer.md
|
||||
|
||||
WHAT_WAS_IMPLEMENTED: [from implementer's report]
|
||||
PLAN_OR_REQUIREMENTS: Task N from [plan-file]
|
||||
BASE_SHA: [commit before task]
|
||||
HEAD_SHA: [current commit]
|
||||
DESCRIPTION: [task summary]
|
||||
```
|
||||
|
||||
**Code reviewer returns:** Strengths, Issues (Critical/Important/Minor), Assessment
|
||||
78
skills/subagent-driven-development/implementer-prompt.md
Normal file
78
skills/subagent-driven-development/implementer-prompt.md
Normal file
@@ -0,0 +1,78 @@
|
||||
# Implementer Subagent Prompt Template
|
||||
|
||||
Use this template when dispatching an implementer subagent.
|
||||
|
||||
```
|
||||
Task tool (general-purpose):
|
||||
description: "Implement Task N: [task name]"
|
||||
prompt: |
|
||||
You are implementing Task N: [task name]
|
||||
|
||||
## Task Description
|
||||
|
||||
[FULL TEXT of task from plan - paste it here, don't make subagent read file]
|
||||
|
||||
## Context
|
||||
|
||||
[Scene-setting: where this fits, dependencies, architectural context]
|
||||
|
||||
## Before You Begin
|
||||
|
||||
If you have questions about:
|
||||
- The requirements or acceptance criteria
|
||||
- The approach or implementation strategy
|
||||
- Dependencies or assumptions
|
||||
- Anything unclear in the task description
|
||||
|
||||
**Ask them now.** Raise any concerns before starting work.
|
||||
|
||||
## Your Job
|
||||
|
||||
Once you're clear on requirements:
|
||||
1. Implement exactly what the task specifies
|
||||
2. Write tests (following TDD if task says to)
|
||||
3. Verify implementation works
|
||||
4. Commit your work
|
||||
5. Self-review (see below)
|
||||
6. Report back
|
||||
|
||||
Work from: [directory]
|
||||
|
||||
**While you work:** If you encounter something unexpected or unclear, **ask questions**.
|
||||
It's always OK to pause and clarify. Don't guess or make assumptions.
|
||||
|
||||
## Before Reporting Back: Self-Review
|
||||
|
||||
Review your work with fresh eyes. Ask yourself:
|
||||
|
||||
**Completeness:**
|
||||
- Did I fully implement everything in the spec?
|
||||
- Did I miss any requirements?
|
||||
- Are there edge cases I didn't handle?
|
||||
|
||||
**Quality:**
|
||||
- Is this my best work?
|
||||
- Are names clear and accurate (match what things do, not how they work)?
|
||||
- Is the code clean and maintainable?
|
||||
|
||||
**Discipline:**
|
||||
- Did I avoid overbuilding (YAGNI)?
|
||||
- Did I only build what was requested?
|
||||
- Did I follow existing patterns in the codebase?
|
||||
|
||||
**Testing:**
|
||||
- Do tests actually verify behavior (not just mock behavior)?
|
||||
- Did I follow TDD if required?
|
||||
- Are tests comprehensive?
|
||||
|
||||
If you find issues during self-review, fix them now before reporting.
|
||||
|
||||
## Report Format
|
||||
|
||||
When done, report:
|
||||
- What you implemented
|
||||
- What you tested and test results
|
||||
- Files changed
|
||||
- Self-review findings (if any)
|
||||
- Any issues or concerns
|
||||
```
|
||||
61
skills/subagent-driven-development/spec-reviewer-prompt.md
Normal file
61
skills/subagent-driven-development/spec-reviewer-prompt.md
Normal file
@@ -0,0 +1,61 @@
|
||||
# Spec Compliance Reviewer Prompt Template
|
||||
|
||||
Use this template when dispatching a spec compliance reviewer subagent.
|
||||
|
||||
**Purpose:** Verify implementer built what was requested (nothing more, nothing less)
|
||||
|
||||
```
|
||||
Task tool (general-purpose):
|
||||
description: "Review spec compliance for Task N"
|
||||
prompt: |
|
||||
You are reviewing whether an implementation matches its specification.
|
||||
|
||||
## What Was Requested
|
||||
|
||||
[FULL TEXT of task requirements]
|
||||
|
||||
## What Implementer Claims They Built
|
||||
|
||||
[From implementer's report]
|
||||
|
||||
## CRITICAL: Do Not Trust the Report
|
||||
|
||||
The implementer finished suspiciously quickly. Their report may be incomplete,
|
||||
inaccurate, or optimistic. You MUST verify everything independently.
|
||||
|
||||
**DO NOT:**
|
||||
- Take their word for what they implemented
|
||||
- Trust their claims about completeness
|
||||
- Accept their interpretation of requirements
|
||||
|
||||
**DO:**
|
||||
- Read the actual code they wrote
|
||||
- Compare actual implementation to requirements line by line
|
||||
- Check for missing pieces they claimed to implement
|
||||
- Look for extra features they didn't mention
|
||||
|
||||
## Your Job
|
||||
|
||||
Read the implementation code and verify:
|
||||
|
||||
**Missing requirements:**
|
||||
- Did they implement everything that was requested?
|
||||
- Are there requirements they skipped or missed?
|
||||
- Did they claim something works but didn't actually implement it?
|
||||
|
||||
**Extra/unneeded work:**
|
||||
- Did they build things that weren't requested?
|
||||
- Did they over-engineer or add unnecessary features?
|
||||
- Did they add "nice to haves" that weren't in spec?
|
||||
|
||||
**Misunderstandings:**
|
||||
- Did they interpret requirements differently than intended?
|
||||
- Did they solve the wrong problem?
|
||||
- Did they implement the right feature but wrong way?
|
||||
|
||||
**Verify by reading code, not by trusting report.**
|
||||
|
||||
Report:
|
||||
- ✅ Spec compliant (if everything matches after code inspection)
|
||||
- ❌ Issues found: [list specifically what's missing or extra, with file:line references]
|
||||
```
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: systematic-debugging
|
||||
description: Use when encountering any bug, test failure, or unexpected behavior, before proposing fixes - four-phase framework (root cause investigation, pattern analysis, hypothesis testing, implementation) that ensures understanding before attempting solutions
|
||||
description: Use when encountering any bug, test failure, or unexpected behavior, before proposing fixes
|
||||
---
|
||||
|
||||
# Systematic Debugging
|
||||
@@ -111,7 +111,7 @@ You MUST complete each phase before proceeding to the next.
|
||||
|
||||
**WHEN error is deep in call stack:**
|
||||
|
||||
**REQUIRED SUB-SKILL:** Use superpowers:root-cause-tracing for backward tracing technique
|
||||
See `root-cause-tracing.md` in this directory for the complete backward tracing technique.
|
||||
|
||||
**Quick version:**
|
||||
- Where does bad value originate?
|
||||
@@ -176,7 +176,7 @@ You MUST complete each phase before proceeding to the next.
|
||||
- Automated test if possible
|
||||
- One-off test script if no framework
|
||||
- MUST have before fixing
|
||||
- **REQUIRED SUB-SKILL:** Use superpowers:test-driven-development for writing proper failing tests
|
||||
- Use the `superpowers:test-driven-development` skill for writing proper failing tests
|
||||
|
||||
2. **Implement Single Fix**
|
||||
- Address the root cause identified
|
||||
@@ -275,16 +275,17 @@ If systematic investigation reveals issue is truly environmental, timing-depende
|
||||
|
||||
**But:** 95% of "no root cause" cases are incomplete investigation.
|
||||
|
||||
## Integration with Other Skills
|
||||
## Supporting Techniques
|
||||
|
||||
**This skill requires using:**
|
||||
- **root-cause-tracing** - REQUIRED when error is deep in call stack (see Phase 1, Step 5)
|
||||
- **test-driven-development** - REQUIRED for creating failing test case (see Phase 4, Step 1)
|
||||
These techniques are part of systematic debugging and available in this directory:
|
||||
|
||||
**Complementary skills:**
|
||||
- **defense-in-depth** - Add validation at multiple layers after finding root cause
|
||||
- **condition-based-waiting** - Replace arbitrary timeouts identified in Phase 2
|
||||
- **verification-before-completion** - Verify fix worked before claiming success
|
||||
- **`root-cause-tracing.md`** - Trace bugs backward through call stack to find original trigger
|
||||
- **`defense-in-depth.md`** - Add validation at multiple layers after finding root cause
|
||||
- **`condition-based-waiting.md`** - Replace arbitrary timeouts with condition polling
|
||||
|
||||
**Related skills:**
|
||||
- **superpowers:test-driven-development** - For creating failing test case (Phase 4, Step 1)
|
||||
- **superpowers:verification-before-completion** - Verify fix worked before claiming success
|
||||
|
||||
## Real-World Impact
|
||||
|
||||
|
||||
@@ -1,8 +1,3 @@
|
||||
---
|
||||
name: condition-based-waiting
|
||||
description: Use when tests have race conditions, timing dependencies, or inconsistent pass/fail behavior - replaces arbitrary timeouts with condition polling to wait for actual state changes, eliminating flaky tests from timing guesses
|
||||
---
|
||||
|
||||
# Condition-Based Waiting
|
||||
|
||||
## Overview
|
||||
@@ -84,7 +79,7 @@ async function waitFor<T>(
|
||||
}
|
||||
```
|
||||
|
||||
See @example.ts for complete implementation with domain-specific helpers (`waitForEvent`, `waitForEventCount`, `waitForEventMatch`) from actual debugging session.
|
||||
See `condition-based-waiting-example.ts` in this directory for complete implementation with domain-specific helpers (`waitForEvent`, `waitForEventCount`, `waitForEventMatch`) from actual debugging session.
|
||||
|
||||
## Common Mistakes
|
||||
|
||||
@@ -1,8 +1,3 @@
|
||||
---
|
||||
name: defense-in-depth
|
||||
description: Use when invalid data causes failures deep in execution, requiring validation at multiple system layers - validates at every layer data passes through to make bugs structurally impossible
|
||||
---
|
||||
|
||||
# Defense-in-Depth Validation
|
||||
|
||||
## Overview
|
||||
@@ -1,8 +1,3 @@
|
||||
---
|
||||
name: root-cause-tracing
|
||||
description: Use when errors occur deep in execution and you need to trace back to find the original trigger - systematically traces bugs backward through call stack, adding instrumentation when needed, to identify source of invalid data or incorrect behavior
|
||||
---
|
||||
|
||||
# Root Cause Tracing
|
||||
|
||||
## Overview
|
||||
@@ -103,7 +98,7 @@ npm test 2>&1 | grep 'DEBUG git init'
|
||||
|
||||
If something appears during tests but you don't know which test:
|
||||
|
||||
Use the bisection script: @find-polluter.sh
|
||||
Use the bisection script `find-polluter.sh` in this directory:
|
||||
|
||||
```bash
|
||||
./find-polluter.sh '.git' 'src/**/*.test.ts'
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: test-driven-development
|
||||
description: Use when implementing any feature or bugfix, before writing implementation code - write the test first, watch it fail, write minimal code to pass; ensures tests actually verify behavior by requiring failure first
|
||||
description: Use when implementing any feature or bugfix, before writing implementation code
|
||||
---
|
||||
|
||||
# Test-Driven Development (TDD)
|
||||
@@ -354,6 +354,13 @@ Bug found? Write failing test reproducing it. Follow TDD cycle. Test proves fix
|
||||
|
||||
Never fix bugs without a test.
|
||||
|
||||
## Testing Anti-Patterns
|
||||
|
||||
When adding mocks or test utilities, read @testing-anti-patterns.md to avoid common pitfalls:
|
||||
- Testing mock behavior instead of real behavior
|
||||
- Adding test-only methods to production classes
|
||||
- Mocking without understanding dependencies
|
||||
|
||||
## Final Rule
|
||||
|
||||
```
|
||||
|
||||
@@ -1,10 +1,7 @@
|
||||
---
|
||||
name: testing-anti-patterns
|
||||
description: Use when writing or changing tests, adding mocks, or tempted to add test-only methods to production code - prevents testing mock behavior, production pollution with test-only methods, and mocking without understanding dependencies
|
||||
---
|
||||
|
||||
# Testing Anti-Patterns
|
||||
|
||||
**Load this reference when:** writing or changing tests, adding mocks, or tempted to add test-only methods to production code.
|
||||
|
||||
## Overview
|
||||
|
||||
Tests must verify real behavior, not mock behavior. Mocks are a means to isolate, not the thing being tested.
|
||||
@@ -1,101 +1,86 @@
|
||||
---
|
||||
name: using-superpowers
|
||||
description: Use when starting any conversation - establishes mandatory workflows for finding and using skills, including using Skill tool before announcing usage, following brainstorming before coding, and creating TodoWrite todos for checklists
|
||||
description: Use when starting any conversation - establishes how to find and use skills, requiring Skill tool invocation before ANY response including clarifying questions
|
||||
---
|
||||
|
||||
<EXTREMELY-IMPORTANT>
|
||||
If you think there is even a 1% chance a skill might apply to what you are doing, you ABSOLUTELY MUST read the skill.
|
||||
If you think there is even a 1% chance a skill might apply to what you are doing, you ABSOLUTELY MUST invoke the skill.
|
||||
|
||||
IF A SKILL APPLIES TO YOUR TASK, YOU DO NOT HAVE A CHOICE. YOU MUST USE IT.
|
||||
|
||||
This is not negotiable. This is not optional. You cannot rationalize your way out of this.
|
||||
</EXTREMELY-IMPORTANT>
|
||||
|
||||
# Getting Started with Skills
|
||||
## How to Access Skills
|
||||
|
||||
## MANDATORY FIRST RESPONSE PROTOCOL
|
||||
**In Claude Code:** Use the `Skill` tool. When you invoke a skill, its content is loaded and presented to you—follow it directly. Never use the Read tool on skill files.
|
||||
|
||||
Before responding to ANY user message, you MUST complete this checklist:
|
||||
**In other environments:** Check your platform's documentation for how skills are loaded.
|
||||
|
||||
1. ☐ List available skills in your mind
|
||||
2. ☐ Ask yourself: "Does ANY skill match this request?"
|
||||
3. ☐ If yes → Use the Skill tool to read and run the skill file
|
||||
4. ☐ Announce which skill you're using
|
||||
5. ☐ Follow the skill exactly
|
||||
# Using Skills
|
||||
|
||||
**Responding WITHOUT completing this checklist = automatic failure.**
|
||||
## The Rule
|
||||
|
||||
## Critical Rules
|
||||
**Check for skills BEFORE ANY RESPONSE.** This includes clarifying questions. Even 1% chance means invoke the Skill tool first.
|
||||
|
||||
1. **Follow mandatory workflows.** Brainstorming before coding. Check for relevant skills before ANY task.
|
||||
```dot
|
||||
digraph skill_flow {
|
||||
"User message received" [shape=doublecircle];
|
||||
"Might any skill apply?" [shape=diamond];
|
||||
"Invoke Skill tool" [shape=box];
|
||||
"Announce: 'Using [skill] to [purpose]'" [shape=box];
|
||||
"Has checklist?" [shape=diamond];
|
||||
"Create TodoWrite todo per item" [shape=box];
|
||||
"Follow skill exactly" [shape=box];
|
||||
"Respond (including clarifications)" [shape=doublecircle];
|
||||
|
||||
2. Execute skills with the Skill tool
|
||||
"User message received" -> "Might any skill apply?";
|
||||
"Might any skill apply?" -> "Invoke Skill tool" [label="yes, even 1%"];
|
||||
"Might any skill apply?" -> "Respond (including clarifications)" [label="definitely not"];
|
||||
"Invoke Skill tool" -> "Announce: 'Using [skill] to [purpose]'";
|
||||
"Announce: 'Using [skill] to [purpose]'" -> "Has checklist?";
|
||||
"Has checklist?" -> "Create TodoWrite todo per item" [label="yes"];
|
||||
"Has checklist?" -> "Follow skill exactly" [label="no"];
|
||||
"Create TodoWrite todo per item" -> "Follow skill exactly";
|
||||
}
|
||||
```
|
||||
|
||||
## Common Rationalizations That Mean You're About To Fail
|
||||
## Red Flags
|
||||
|
||||
If you catch yourself thinking ANY of these thoughts, STOP. You are rationalizing. Check for and use the skill.
|
||||
These thoughts mean STOP—you're rationalizing:
|
||||
|
||||
- "This is just a simple question" → WRONG. Questions are tasks. Check for skills.
|
||||
- "I can check git/files quickly" → WRONG. Files don't have conversation context. Check for skills.
|
||||
- "Let me gather information first" → WRONG. Skills tell you HOW to gather information. Check for skills.
|
||||
- "This doesn't need a formal skill" → WRONG. If a skill exists for it, use it.
|
||||
- "I remember this skill" → WRONG. Skills evolve. Run the current version.
|
||||
- "This doesn't count as a task" → WRONG. If you're taking action, it's a task. Check for skills.
|
||||
- "The skill is overkill for this" → WRONG. Skills exist because simple things become complex. Use it.
|
||||
- "I'll just do this one thing first" → WRONG. Check for skills BEFORE doing anything.
|
||||
| Thought | Reality |
|
||||
|---------|---------|
|
||||
| "This is just a simple question" | Questions are tasks. Check for skills. |
|
||||
| "I need more context first" | Skill check comes BEFORE clarifying questions. |
|
||||
| "Let me explore the codebase first" | Skills tell you HOW to explore. Check first. |
|
||||
| "I can check git/files quickly" | Files lack conversation context. Check for skills. |
|
||||
| "Let me gather information first" | Skills tell you HOW to gather information. |
|
||||
| "This doesn't need a formal skill" | If a skill exists, use it. |
|
||||
| "I remember this skill" | Skills evolve. Read current version. |
|
||||
| "This doesn't count as a task" | Action = task. Check for skills. |
|
||||
| "The skill is overkill" | Simple things become complex. Use it. |
|
||||
| "I'll just do this one thing first" | Check BEFORE doing anything. |
|
||||
| "This feels productive" | Undisciplined action wastes time. Skills prevent this. |
|
||||
|
||||
**Why:** Skills document proven techniques that save time and prevent mistakes. Not using available skills means repeating solved problems and making known errors.
|
||||
## Skill Priority
|
||||
|
||||
If a skill for your task exists, you must use it or you will fail at your task.
|
||||
When multiple skills could apply, use this order:
|
||||
|
||||
## Skills with Checklists
|
||||
1. **Process skills first** (brainstorming, debugging) - these determine HOW to approach the task
|
||||
2. **Implementation skills second** (frontend-design, mcp-builder) - these guide execution
|
||||
|
||||
If a skill has a checklist, YOU MUST create TodoWrite todos for EACH item.
|
||||
"Let's build X" → brainstorming first, then implementation skills.
|
||||
"Fix this bug" → debugging first, then domain-specific skills.
|
||||
|
||||
**Don't:**
|
||||
- Work through checklist mentally
|
||||
- Skip creating todos "to save time"
|
||||
- Batch multiple items into one todo
|
||||
- Mark complete without doing them
|
||||
## Skill Types
|
||||
|
||||
**Why:** Checklists without TodoWrite tracking = steps get skipped. Every time. The overhead of TodoWrite is tiny compared to the cost of missing steps.
|
||||
**Rigid** (TDD, debugging): Follow exactly. Don't adapt away discipline.
|
||||
|
||||
## Announcing Skill Usage
|
||||
**Flexible** (patterns): Adapt principles to context.
|
||||
|
||||
Before using a skill, announce that you are using it.
|
||||
"I'm using [Skill Name] to [what you're doing]."
|
||||
The skill itself tells you which.
|
||||
|
||||
**Examples:**
|
||||
- "I'm using the brainstorming skill to refine your idea into a design."
|
||||
- "I'm using the test-driven-development skill to implement this feature."
|
||||
## User Instructions
|
||||
|
||||
**Why:** Transparency helps your human partner understand your process and catch errors early. It also confirms you actually read the skill.
|
||||
|
||||
# About these skills
|
||||
|
||||
**Many skills contain rigid rules (TDD, debugging, verification).** Follow them exactly. Don't adapt away the discipline.
|
||||
|
||||
**Some skills are flexible patterns (architecture, naming).** Adapt core principles to your context.
|
||||
|
||||
The skill itself tells you which type it is.
|
||||
|
||||
## Instructions ≠ Permission to Skip Workflows
|
||||
|
||||
Your human partner's specific instructions describe WHAT to do, not HOW.
|
||||
|
||||
"Add X", "Fix Y" = the goal, NOT permission to skip brainstorming, TDD, or RED-GREEN-REFACTOR.
|
||||
|
||||
**Red flags:** "Instruction was specific" • "Seems simple" • "Workflow is overkill"
|
||||
|
||||
**Why:** Specific instructions mean clear requirements, which is when workflows matter MOST. Skipping process on "simple" tasks is how simple tasks become complex problems.
|
||||
|
||||
## Summary
|
||||
|
||||
**Starting any task:**
|
||||
1. If relevant skill exists → Use the skill
|
||||
3. Announce you're using it
|
||||
4. Follow what it says
|
||||
|
||||
**Skill has checklist?** TodoWrite for every item.
|
||||
|
||||
**Finding a relevant skill = mandatory to read and use it. Not optional.**
|
||||
Instructions say WHAT, not HOW. "Add X" or "Fix Y" doesn't mean skip workflows.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: writing-plans
|
||||
description: Use when design is complete and you need detailed implementation tasks for engineers with zero codebase context - creates comprehensive implementation plans with exact file paths, complete code examples, and verification steps assuming engineer has minimal domain knowledge
|
||||
description: Use when you have a spec or requirements for a multi-step task, before touching code
|
||||
---
|
||||
|
||||
# Writing Plans
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: writing-skills
|
||||
description: Use when creating new skills, editing existing skills, or verifying skills work before deployment - applies TDD to process documentation by testing with subagents before writing, iterating until bulletproof against rationalization
|
||||
description: Use when creating new skills, editing existing skills, or verifying skills work before deployment
|
||||
---
|
||||
|
||||
# Writing Skills
|
||||
@@ -95,15 +95,16 @@ skills/
|
||||
- Only two fields supported: `name` and `description`
|
||||
- Max 1024 characters total
|
||||
- `name`: Use letters, numbers, and hyphens only (no parentheses, special chars)
|
||||
- `description`: Third-person, includes BOTH what it does AND when to use it
|
||||
- `description`: Third-person, describes ONLY when to use (NOT what it does)
|
||||
- Start with "Use when..." to focus on triggering conditions
|
||||
- Include specific symptoms, situations, and contexts
|
||||
- **NEVER summarize the skill's process or workflow** (see CSO section for why)
|
||||
- Keep under 500 characters if possible
|
||||
|
||||
```markdown
|
||||
---
|
||||
name: Skill-Name-With-Hyphens
|
||||
description: Use when [specific triggering conditions and symptoms] - [what the skill does and how it helps, written in third person]
|
||||
description: Use when [specific triggering conditions and symptoms]
|
||||
---
|
||||
|
||||
# Skill Name
|
||||
@@ -143,7 +144,31 @@ Concrete results
|
||||
|
||||
**Purpose:** Claude reads description to decide which skills to load for a given task. Make it answer: "Should I read this skill right now?"
|
||||
|
||||
**Format:** Start with "Use when..." to focus on triggering conditions, then explain what it does
|
||||
**Format:** Start with "Use when..." to focus on triggering conditions
|
||||
|
||||
**CRITICAL: Description = When to Use, NOT What the Skill Does**
|
||||
|
||||
The description should ONLY describe triggering conditions. Do NOT summarize the skill's process or workflow in the description.
|
||||
|
||||
**Why this matters:** Testing revealed that when a description summarizes the skill's workflow, Claude may follow the description instead of reading the full skill content. A description saying "code review between tasks" caused Claude to do ONE review, even though the skill's flowchart clearly showed TWO reviews (spec compliance then code quality).
|
||||
|
||||
When the description was changed to just "Use when executing implementation plans with independent tasks" (no workflow summary), Claude correctly read the flowchart and followed the two-stage review process.
|
||||
|
||||
**The trap:** Descriptions that summarize workflow create a shortcut Claude will take. The skill body becomes documentation Claude skips.
|
||||
|
||||
```yaml
|
||||
# ❌ BAD: Summarizes workflow - Claude may follow this instead of reading skill
|
||||
description: Use when executing plans - dispatches subagent per task with code review between tasks
|
||||
|
||||
# ❌ BAD: Too much process detail
|
||||
description: Use for TDD - write test first, watch it fail, write minimal code, refactor
|
||||
|
||||
# ✅ GOOD: Just triggering conditions, no workflow summary
|
||||
description: Use when executing implementation plans with independent tasks in the current session
|
||||
|
||||
# ✅ GOOD: Triggering conditions only
|
||||
description: Use when implementing any feature or bugfix, before writing implementation code
|
||||
```
|
||||
|
||||
**Content:**
|
||||
- Use concrete triggers, symptoms, and situations that signal this skill applies
|
||||
@@ -151,6 +176,7 @@ Concrete results
|
||||
- Keep triggers technology-agnostic unless the skill itself is technology-specific
|
||||
- If skill is technology-specific, make that explicit in the trigger
|
||||
- Write in third person (injected into system prompt)
|
||||
- **NEVER summarize the skill's process or workflow**
|
||||
|
||||
```yaml
|
||||
# ❌ BAD: Too abstract, vague, doesn't include when to use
|
||||
@@ -162,11 +188,11 @@ description: I can help you with async tests when they're flaky
|
||||
# ❌ BAD: Mentions technology but skill isn't specific to it
|
||||
description: Use when tests use setTimeout/sleep and are flaky
|
||||
|
||||
# ✅ GOOD: Starts with "Use when", describes problem, then what it does
|
||||
description: Use when tests have race conditions, timing dependencies, or pass/fail inconsistently - replaces arbitrary timeouts with condition polling for reliable async tests
|
||||
# ✅ GOOD: Starts with "Use when", describes problem, no workflow
|
||||
description: Use when tests have race conditions, timing dependencies, or pass/fail inconsistently
|
||||
|
||||
# ✅ GOOD: Technology-specific skill with explicit trigger
|
||||
description: Use when using React Router and handling authentication redirects - provides patterns for protected routes and auth state management
|
||||
description: Use when using React Router and handling authentication redirects
|
||||
```
|
||||
|
||||
### 2. Keyword Coverage
|
||||
@@ -181,7 +207,7 @@ Use words Claude would search for:
|
||||
|
||||
**Use active voice, verb-first:**
|
||||
- ✅ `creating-skills` not `skill-creation`
|
||||
- ✅ `testing-skills-with-subagents` not `subagent-skill-testing`
|
||||
- ✅ `condition-based-waiting` not `async-test-helpers`
|
||||
|
||||
### 4. Token Efficiency (Critical)
|
||||
|
||||
@@ -288,6 +314,12 @@ digraph when_flowchart {
|
||||
|
||||
See @graphviz-conventions.dot for graphviz style rules.
|
||||
|
||||
**Visualizing for your human partner:** Use `render-graphs.js` in this directory to render a skill's flowcharts to SVG:
|
||||
```bash
|
||||
./render-graphs.js ../some-skill # Each diagram separately
|
||||
./render-graphs.js ../some-skill --combine # All diagrams in one SVG
|
||||
```
|
||||
|
||||
## Code Examples
|
||||
|
||||
**One excellent example beats many mediocre ones**
|
||||
@@ -520,7 +552,7 @@ Run same scenarios WITH skill. Agent should now comply.
|
||||
|
||||
Agent found new rationalization? Add explicit counter. Re-test until bulletproof.
|
||||
|
||||
**REQUIRED SUB-SKILL:** Use superpowers:testing-skills-with-subagents for the complete testing methodology:
|
||||
**Testing methodology:** See @testing-skills-with-subagents.md for the complete testing methodology:
|
||||
- How to write pressure scenarios
|
||||
- Pressure types (time, sunk cost, authority, exhaustion)
|
||||
- Plugging holes systematically
|
||||
|
||||
@@ -10,7 +10,7 @@ For conceptual background on how Skills work, see the [Skills overview](/en/docs
|
||||
|
||||
### Concise is key
|
||||
|
||||
The [context window](/en/docs/build-with-claude/context-windows) is a public good. Your Skill shares the context window with everything else Claude needs to know, including:
|
||||
The [context window](https://platform.claude.com/docs/en/build-with-claude/context-windows) is a public good. Your Skill shares the context window with everything else Claude needs to know, including:
|
||||
|
||||
* The system prompt
|
||||
* Conversation history
|
||||
|
||||
168
skills/writing-skills/render-graphs.js
Executable file
168
skills/writing-skills/render-graphs.js
Executable file
@@ -0,0 +1,168 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
/**
|
||||
* Render graphviz diagrams from a skill's SKILL.md to SVG files.
|
||||
*
|
||||
* Usage:
|
||||
* ./render-graphs.js <skill-directory> # Render each diagram separately
|
||||
* ./render-graphs.js <skill-directory> --combine # Combine all into one diagram
|
||||
*
|
||||
* Extracts all ```dot blocks from SKILL.md and renders to SVG.
|
||||
* Useful for helping your human partner visualize the process flows.
|
||||
*
|
||||
* Requires: graphviz (dot) installed on system
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const { execSync } = require('child_process');
|
||||
|
||||
function extractDotBlocks(markdown) {
|
||||
const blocks = [];
|
||||
const regex = /```dot\n([\s\S]*?)```/g;
|
||||
let match;
|
||||
|
||||
while ((match = regex.exec(markdown)) !== null) {
|
||||
const content = match[1].trim();
|
||||
|
||||
// Extract digraph name
|
||||
const nameMatch = content.match(/digraph\s+(\w+)/);
|
||||
const name = nameMatch ? nameMatch[1] : `graph_${blocks.length + 1}`;
|
||||
|
||||
blocks.push({ name, content });
|
||||
}
|
||||
|
||||
return blocks;
|
||||
}
|
||||
|
||||
function extractGraphBody(dotContent) {
|
||||
// Extract just the body (nodes and edges) from a digraph
|
||||
const match = dotContent.match(/digraph\s+\w+\s*\{([\s\S]*)\}/);
|
||||
if (!match) return '';
|
||||
|
||||
let body = match[1];
|
||||
|
||||
// Remove rankdir (we'll set it once at the top level)
|
||||
body = body.replace(/^\s*rankdir\s*=\s*\w+\s*;?\s*$/gm, '');
|
||||
|
||||
return body.trim();
|
||||
}
|
||||
|
||||
function combineGraphs(blocks, skillName) {
|
||||
const bodies = blocks.map((block, i) => {
|
||||
const body = extractGraphBody(block.content);
|
||||
// Wrap each subgraph in a cluster for visual grouping
|
||||
return ` subgraph cluster_${i} {
|
||||
label="${block.name}";
|
||||
${body.split('\n').map(line => ' ' + line).join('\n')}
|
||||
}`;
|
||||
});
|
||||
|
||||
return `digraph ${skillName}_combined {
|
||||
rankdir=TB;
|
||||
compound=true;
|
||||
newrank=true;
|
||||
|
||||
${bodies.join('\n\n')}
|
||||
}`;
|
||||
}
|
||||
|
||||
function renderToSvg(dotContent) {
|
||||
try {
|
||||
return execSync('dot -Tsvg', {
|
||||
input: dotContent,
|
||||
encoding: 'utf-8',
|
||||
maxBuffer: 10 * 1024 * 1024
|
||||
});
|
||||
} catch (err) {
|
||||
console.error('Error running dot:', err.message);
|
||||
if (err.stderr) console.error(err.stderr.toString());
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
function main() {
|
||||
const args = process.argv.slice(2);
|
||||
const combine = args.includes('--combine');
|
||||
const skillDirArg = args.find(a => !a.startsWith('--'));
|
||||
|
||||
if (!skillDirArg) {
|
||||
console.error('Usage: render-graphs.js <skill-directory> [--combine]');
|
||||
console.error('');
|
||||
console.error('Options:');
|
||||
console.error(' --combine Combine all diagrams into one SVG');
|
||||
console.error('');
|
||||
console.error('Example:');
|
||||
console.error(' ./render-graphs.js ../subagent-driven-development');
|
||||
console.error(' ./render-graphs.js ../subagent-driven-development --combine');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const skillDir = path.resolve(skillDirArg);
|
||||
const skillFile = path.join(skillDir, 'SKILL.md');
|
||||
const skillName = path.basename(skillDir).replace(/-/g, '_');
|
||||
|
||||
if (!fs.existsSync(skillFile)) {
|
||||
console.error(`Error: ${skillFile} not found`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
// Check if dot is available
|
||||
try {
|
||||
execSync('which dot', { encoding: 'utf-8' });
|
||||
} catch {
|
||||
console.error('Error: graphviz (dot) not found. Install with:');
|
||||
console.error(' brew install graphviz # macOS');
|
||||
console.error(' apt install graphviz # Linux');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const markdown = fs.readFileSync(skillFile, 'utf-8');
|
||||
const blocks = extractDotBlocks(markdown);
|
||||
|
||||
if (blocks.length === 0) {
|
||||
console.log('No ```dot blocks found in', skillFile);
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
console.log(`Found ${blocks.length} diagram(s) in ${path.basename(skillDir)}/SKILL.md`);
|
||||
|
||||
const outputDir = path.join(skillDir, 'diagrams');
|
||||
if (!fs.existsSync(outputDir)) {
|
||||
fs.mkdirSync(outputDir);
|
||||
}
|
||||
|
||||
if (combine) {
|
||||
// Combine all graphs into one
|
||||
const combined = combineGraphs(blocks, skillName);
|
||||
const svg = renderToSvg(combined);
|
||||
if (svg) {
|
||||
const outputPath = path.join(outputDir, `${skillName}_combined.svg`);
|
||||
fs.writeFileSync(outputPath, svg);
|
||||
console.log(` Rendered: ${skillName}_combined.svg`);
|
||||
|
||||
// Also write the dot source for debugging
|
||||
const dotPath = path.join(outputDir, `${skillName}_combined.dot`);
|
||||
fs.writeFileSync(dotPath, combined);
|
||||
console.log(` Source: ${skillName}_combined.dot`);
|
||||
} else {
|
||||
console.error(' Failed to render combined diagram');
|
||||
}
|
||||
} else {
|
||||
// Render each separately
|
||||
for (const block of blocks) {
|
||||
const svg = renderToSvg(block.content);
|
||||
if (svg) {
|
||||
const outputPath = path.join(outputDir, `${block.name}.svg`);
|
||||
fs.writeFileSync(outputPath, svg);
|
||||
console.log(` Rendered: ${block.name}.svg`);
|
||||
} else {
|
||||
console.error(` Failed: ${block.name}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`\nOutput: ${outputDir}/`);
|
||||
}
|
||||
|
||||
main();
|
||||
@@ -1,10 +1,7 @@
|
||||
---
|
||||
name: testing-skills-with-subagents
|
||||
description: Use when creating or editing skills, before deployment, to verify they work under pressure and resist rationalization - applies RED-GREEN-REFACTOR cycle to process documentation by running baseline without skill, writing to address failures, iterating to close loopholes
|
||||
---
|
||||
|
||||
# Testing Skills With Subagents
|
||||
|
||||
**Load this reference when:** creating or editing skills, before deployment, to verify they work under pressure and resist rationalization.
|
||||
|
||||
## Overview
|
||||
|
||||
**Testing skills is just TDD applied to process documentation.**
|
||||
158
tests/claude-code/README.md
Normal file
158
tests/claude-code/README.md
Normal file
@@ -0,0 +1,158 @@
|
||||
# Claude Code Skills Tests
|
||||
|
||||
Automated tests for superpowers skills using Claude Code CLI.
|
||||
|
||||
## Overview
|
||||
|
||||
This test suite verifies that skills are loaded correctly and Claude follows them as expected. Tests invoke Claude Code in headless mode (`claude -p`) and verify the behavior.
|
||||
|
||||
## Requirements
|
||||
|
||||
- Claude Code CLI installed and in PATH (`claude --version` should work)
|
||||
- Local superpowers plugin installed (see main README for installation)
|
||||
|
||||
## Running Tests
|
||||
|
||||
### Run all fast tests (recommended):
|
||||
```bash
|
||||
./run-skill-tests.sh
|
||||
```
|
||||
|
||||
### Run integration tests (slow, 10-30 minutes):
|
||||
```bash
|
||||
./run-skill-tests.sh --integration
|
||||
```
|
||||
|
||||
### Run specific test:
|
||||
```bash
|
||||
./run-skill-tests.sh --test test-subagent-driven-development.sh
|
||||
```
|
||||
|
||||
### Run with verbose output:
|
||||
```bash
|
||||
./run-skill-tests.sh --verbose
|
||||
```
|
||||
|
||||
### Set custom timeout:
|
||||
```bash
|
||||
./run-skill-tests.sh --timeout 1800 # 30 minutes for integration tests
|
||||
```
|
||||
|
||||
## Test Structure
|
||||
|
||||
### test-helpers.sh
|
||||
Common functions for skills testing:
|
||||
- `run_claude "prompt" [timeout]` - Run Claude with prompt
|
||||
- `assert_contains output pattern name` - Verify pattern exists
|
||||
- `assert_not_contains output pattern name` - Verify pattern absent
|
||||
- `assert_count output pattern count name` - Verify exact count
|
||||
- `assert_order output pattern_a pattern_b name` - Verify order
|
||||
- `create_test_project` - Create temp test directory
|
||||
- `create_test_plan project_dir` - Create sample plan file
|
||||
|
||||
### Test Files
|
||||
|
||||
Each test file:
|
||||
1. Sources `test-helpers.sh`
|
||||
2. Runs Claude Code with specific prompts
|
||||
3. Verifies expected behavior using assertions
|
||||
4. Returns 0 on success, non-zero on failure
|
||||
|
||||
## Example Test
|
||||
|
||||
```bash
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
source "$SCRIPT_DIR/test-helpers.sh"
|
||||
|
||||
echo "=== Test: My Skill ==="
|
||||
|
||||
# Ask Claude about the skill
|
||||
output=$(run_claude "What does the my-skill skill do?" 30)
|
||||
|
||||
# Verify response
|
||||
assert_contains "$output" "expected behavior" "Skill describes behavior"
|
||||
|
||||
echo "=== All tests passed ==="
|
||||
```
|
||||
|
||||
## Current Tests
|
||||
|
||||
### Fast Tests (run by default)
|
||||
|
||||
#### test-subagent-driven-development.sh
|
||||
Tests skill content and requirements (~2 minutes):
|
||||
- Skill loading and accessibility
|
||||
- Workflow ordering (spec compliance before code quality)
|
||||
- Self-review requirements documented
|
||||
- Plan reading efficiency documented
|
||||
- Spec compliance reviewer skepticism documented
|
||||
- Review loops documented
|
||||
- Task context provision documented
|
||||
|
||||
### Integration Tests (use --integration flag)
|
||||
|
||||
#### test-subagent-driven-development-integration.sh
|
||||
Full workflow execution test (~10-30 minutes):
|
||||
- Creates real test project with Node.js setup
|
||||
- Creates implementation plan with 2 tasks
|
||||
- Executes plan using subagent-driven-development
|
||||
- Verifies actual behaviors:
|
||||
- Plan read once at start (not per task)
|
||||
- Full task text provided in subagent prompts
|
||||
- Subagents perform self-review before reporting
|
||||
- Spec compliance review happens before code quality
|
||||
- Spec reviewer reads code independently
|
||||
- Working implementation is produced
|
||||
- Tests pass
|
||||
- Proper git commits created
|
||||
|
||||
**What it tests:**
|
||||
- The workflow actually works end-to-end
|
||||
- Our improvements are actually applied
|
||||
- Subagents follow the skill correctly
|
||||
- Final code is functional and tested
|
||||
|
||||
## Adding New Tests
|
||||
|
||||
1. Create new test file: `test-<skill-name>.sh`
|
||||
2. Source test-helpers.sh
|
||||
3. Write tests using `run_claude` and assertions
|
||||
4. Add to test list in `run-skill-tests.sh`
|
||||
5. Make executable: `chmod +x test-<skill-name>.sh`
|
||||
|
||||
## Timeout Considerations
|
||||
|
||||
- Default timeout: 5 minutes per test
|
||||
- Claude Code may take time to respond
|
||||
- Adjust with `--timeout` if needed
|
||||
- Tests should be focused to avoid long runs
|
||||
|
||||
## Debugging Failed Tests
|
||||
|
||||
With `--verbose`, you'll see full Claude output:
|
||||
```bash
|
||||
./run-skill-tests.sh --verbose --test test-subagent-driven-development.sh
|
||||
```
|
||||
|
||||
Without verbose, only failures show output.
|
||||
|
||||
## CI/CD Integration
|
||||
|
||||
To run in CI:
|
||||
```bash
|
||||
# Run with explicit timeout for CI environments
|
||||
./run-skill-tests.sh --timeout 900
|
||||
|
||||
# Exit code 0 = success, non-zero = failure
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
- Tests verify skill *instructions*, not full execution
|
||||
- Full workflow tests would be very slow
|
||||
- Focus on verifying key skill requirements
|
||||
- Tests should be deterministic
|
||||
- Avoid testing implementation details
|
||||
168
tests/claude-code/analyze-token-usage.py
Executable file
168
tests/claude-code/analyze-token-usage.py
Executable file
@@ -0,0 +1,168 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Analyze token usage from Claude Code session transcripts.
|
||||
Breaks down usage by main session and individual subagents.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
def analyze_main_session(filepath):
|
||||
"""Analyze a session file and return token usage broken down by agent."""
|
||||
main_usage = {
|
||||
'input_tokens': 0,
|
||||
'output_tokens': 0,
|
||||
'cache_creation': 0,
|
||||
'cache_read': 0,
|
||||
'messages': 0
|
||||
}
|
||||
|
||||
# Track usage per subagent
|
||||
subagent_usage = defaultdict(lambda: {
|
||||
'input_tokens': 0,
|
||||
'output_tokens': 0,
|
||||
'cache_creation': 0,
|
||||
'cache_read': 0,
|
||||
'messages': 0,
|
||||
'description': None
|
||||
})
|
||||
|
||||
with open(filepath, 'r') as f:
|
||||
for line in f:
|
||||
try:
|
||||
data = json.loads(line)
|
||||
|
||||
# Main session assistant messages
|
||||
if data.get('type') == 'assistant' and 'message' in data:
|
||||
main_usage['messages'] += 1
|
||||
msg_usage = data['message'].get('usage', {})
|
||||
main_usage['input_tokens'] += msg_usage.get('input_tokens', 0)
|
||||
main_usage['output_tokens'] += msg_usage.get('output_tokens', 0)
|
||||
main_usage['cache_creation'] += msg_usage.get('cache_creation_input_tokens', 0)
|
||||
main_usage['cache_read'] += msg_usage.get('cache_read_input_tokens', 0)
|
||||
|
||||
# Subagent tool results
|
||||
if data.get('type') == 'user' and 'toolUseResult' in data:
|
||||
result = data['toolUseResult']
|
||||
if 'usage' in result and 'agentId' in result:
|
||||
agent_id = result['agentId']
|
||||
usage = result['usage']
|
||||
|
||||
# Get description from prompt if available
|
||||
if subagent_usage[agent_id]['description'] is None:
|
||||
prompt = result.get('prompt', '')
|
||||
# Extract first line as description
|
||||
first_line = prompt.split('\n')[0] if prompt else f"agent-{agent_id}"
|
||||
if first_line.startswith('You are '):
|
||||
first_line = first_line[8:] # Remove "You are "
|
||||
subagent_usage[agent_id]['description'] = first_line[:60]
|
||||
|
||||
subagent_usage[agent_id]['messages'] += 1
|
||||
subagent_usage[agent_id]['input_tokens'] += usage.get('input_tokens', 0)
|
||||
subagent_usage[agent_id]['output_tokens'] += usage.get('output_tokens', 0)
|
||||
subagent_usage[agent_id]['cache_creation'] += usage.get('cache_creation_input_tokens', 0)
|
||||
subagent_usage[agent_id]['cache_read'] += usage.get('cache_read_input_tokens', 0)
|
||||
except:
|
||||
pass
|
||||
|
||||
return main_usage, dict(subagent_usage)
|
||||
|
||||
def format_tokens(n):
|
||||
"""Format token count with thousands separators."""
|
||||
return f"{n:,}"
|
||||
|
||||
def calculate_cost(usage, input_cost_per_m=3.0, output_cost_per_m=15.0):
|
||||
"""Calculate estimated cost in dollars."""
|
||||
total_input = usage['input_tokens'] + usage['cache_creation'] + usage['cache_read']
|
||||
input_cost = total_input * input_cost_per_m / 1_000_000
|
||||
output_cost = usage['output_tokens'] * output_cost_per_m / 1_000_000
|
||||
return input_cost + output_cost
|
||||
|
||||
def main():
|
||||
if len(sys.argv) < 2:
|
||||
print("Usage: analyze-token-usage.py <session-file.jsonl>")
|
||||
sys.exit(1)
|
||||
|
||||
main_session_file = sys.argv[1]
|
||||
|
||||
if not Path(main_session_file).exists():
|
||||
print(f"Error: Session file not found: {main_session_file}")
|
||||
sys.exit(1)
|
||||
|
||||
# Analyze the session
|
||||
main_usage, subagent_usage = analyze_main_session(main_session_file)
|
||||
|
||||
print("=" * 100)
|
||||
print("TOKEN USAGE ANALYSIS")
|
||||
print("=" * 100)
|
||||
print()
|
||||
|
||||
# Print breakdown
|
||||
print("Usage Breakdown:")
|
||||
print("-" * 100)
|
||||
print(f"{'Agent':<15} {'Description':<35} {'Msgs':>5} {'Input':>10} {'Output':>10} {'Cache':>10} {'Cost':>8}")
|
||||
print("-" * 100)
|
||||
|
||||
# Main session
|
||||
cost = calculate_cost(main_usage)
|
||||
print(f"{'main':<15} {'Main session (coordinator)':<35} "
|
||||
f"{main_usage['messages']:>5} "
|
||||
f"{format_tokens(main_usage['input_tokens']):>10} "
|
||||
f"{format_tokens(main_usage['output_tokens']):>10} "
|
||||
f"{format_tokens(main_usage['cache_read']):>10} "
|
||||
f"${cost:>7.2f}")
|
||||
|
||||
# Subagents (sorted by agent ID)
|
||||
for agent_id in sorted(subagent_usage.keys()):
|
||||
usage = subagent_usage[agent_id]
|
||||
cost = calculate_cost(usage)
|
||||
desc = usage['description'] or f"agent-{agent_id}"
|
||||
print(f"{agent_id:<15} {desc:<35} "
|
||||
f"{usage['messages']:>5} "
|
||||
f"{format_tokens(usage['input_tokens']):>10} "
|
||||
f"{format_tokens(usage['output_tokens']):>10} "
|
||||
f"{format_tokens(usage['cache_read']):>10} "
|
||||
f"${cost:>7.2f}")
|
||||
|
||||
print("-" * 100)
|
||||
|
||||
# Calculate totals
|
||||
total_usage = {
|
||||
'input_tokens': main_usage['input_tokens'],
|
||||
'output_tokens': main_usage['output_tokens'],
|
||||
'cache_creation': main_usage['cache_creation'],
|
||||
'cache_read': main_usage['cache_read'],
|
||||
'messages': main_usage['messages']
|
||||
}
|
||||
|
||||
for usage in subagent_usage.values():
|
||||
total_usage['input_tokens'] += usage['input_tokens']
|
||||
total_usage['output_tokens'] += usage['output_tokens']
|
||||
total_usage['cache_creation'] += usage['cache_creation']
|
||||
total_usage['cache_read'] += usage['cache_read']
|
||||
total_usage['messages'] += usage['messages']
|
||||
|
||||
total_input = total_usage['input_tokens'] + total_usage['cache_creation'] + total_usage['cache_read']
|
||||
total_tokens = total_input + total_usage['output_tokens']
|
||||
total_cost = calculate_cost(total_usage)
|
||||
|
||||
print()
|
||||
print("TOTALS:")
|
||||
print(f" Total messages: {format_tokens(total_usage['messages'])}")
|
||||
print(f" Input tokens: {format_tokens(total_usage['input_tokens'])}")
|
||||
print(f" Output tokens: {format_tokens(total_usage['output_tokens'])}")
|
||||
print(f" Cache creation tokens: {format_tokens(total_usage['cache_creation'])}")
|
||||
print(f" Cache read tokens: {format_tokens(total_usage['cache_read'])}")
|
||||
print()
|
||||
print(f" Total input (incl cache): {format_tokens(total_input)}")
|
||||
print(f" Total tokens: {format_tokens(total_tokens)}")
|
||||
print()
|
||||
print(f" Estimated cost: ${total_cost:.2f}")
|
||||
print(" (at $3/$15 per M tokens for input/output)")
|
||||
print()
|
||||
print("=" * 100)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
187
tests/claude-code/run-skill-tests.sh
Executable file
187
tests/claude-code/run-skill-tests.sh
Executable file
@@ -0,0 +1,187 @@
|
||||
#!/usr/bin/env bash
|
||||
# Test runner for Claude Code skills
|
||||
# Tests skills by invoking Claude Code CLI and verifying behavior
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
cd "$SCRIPT_DIR"
|
||||
|
||||
echo "========================================"
|
||||
echo " Claude Code Skills Test Suite"
|
||||
echo "========================================"
|
||||
echo ""
|
||||
echo "Repository: $(cd ../.. && pwd)"
|
||||
echo "Test time: $(date)"
|
||||
echo "Claude version: $(claude --version 2>/dev/null || echo 'not found')"
|
||||
echo ""
|
||||
|
||||
# Check if Claude Code is available
|
||||
if ! command -v claude &> /dev/null; then
|
||||
echo "ERROR: Claude Code CLI not found"
|
||||
echo "Install Claude Code first: https://code.claude.com"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Parse command line arguments
|
||||
VERBOSE=false
|
||||
SPECIFIC_TEST=""
|
||||
TIMEOUT=300 # Default 5 minute timeout per test
|
||||
RUN_INTEGRATION=false
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--verbose|-v)
|
||||
VERBOSE=true
|
||||
shift
|
||||
;;
|
||||
--test|-t)
|
||||
SPECIFIC_TEST="$2"
|
||||
shift 2
|
||||
;;
|
||||
--timeout)
|
||||
TIMEOUT="$2"
|
||||
shift 2
|
||||
;;
|
||||
--integration|-i)
|
||||
RUN_INTEGRATION=true
|
||||
shift
|
||||
;;
|
||||
--help|-h)
|
||||
echo "Usage: $0 [options]"
|
||||
echo ""
|
||||
echo "Options:"
|
||||
echo " --verbose, -v Show verbose output"
|
||||
echo " --test, -t NAME Run only the specified test"
|
||||
echo " --timeout SECONDS Set timeout per test (default: 300)"
|
||||
echo " --integration, -i Run integration tests (slow, 10-30 min)"
|
||||
echo " --help, -h Show this help"
|
||||
echo ""
|
||||
echo "Tests:"
|
||||
echo " test-subagent-driven-development.sh Test skill loading and requirements"
|
||||
echo ""
|
||||
echo "Integration Tests (use --integration):"
|
||||
echo " test-subagent-driven-development-integration.sh Full workflow execution"
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
echo "Unknown option: $1"
|
||||
echo "Use --help for usage information"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# List of skill tests to run (fast unit tests)
|
||||
tests=(
|
||||
"test-subagent-driven-development.sh"
|
||||
)
|
||||
|
||||
# Integration tests (slow, full execution)
|
||||
integration_tests=(
|
||||
"test-subagent-driven-development-integration.sh"
|
||||
)
|
||||
|
||||
# Add integration tests if requested
|
||||
if [ "$RUN_INTEGRATION" = true ]; then
|
||||
tests+=("${integration_tests[@]}")
|
||||
fi
|
||||
|
||||
# Filter to specific test if requested
|
||||
if [ -n "$SPECIFIC_TEST" ]; then
|
||||
tests=("$SPECIFIC_TEST")
|
||||
fi
|
||||
|
||||
# Track results
|
||||
passed=0
|
||||
failed=0
|
||||
skipped=0
|
||||
|
||||
# Run each test
|
||||
for test in "${tests[@]}"; do
|
||||
echo "----------------------------------------"
|
||||
echo "Running: $test"
|
||||
echo "----------------------------------------"
|
||||
|
||||
test_path="$SCRIPT_DIR/$test"
|
||||
|
||||
if [ ! -f "$test_path" ]; then
|
||||
echo " [SKIP] Test file not found: $test"
|
||||
skipped=$((skipped + 1))
|
||||
continue
|
||||
fi
|
||||
|
||||
if [ ! -x "$test_path" ]; then
|
||||
echo " Making $test executable..."
|
||||
chmod +x "$test_path"
|
||||
fi
|
||||
|
||||
start_time=$(date +%s)
|
||||
|
||||
if [ "$VERBOSE" = true ]; then
|
||||
if timeout "$TIMEOUT" bash "$test_path"; then
|
||||
end_time=$(date +%s)
|
||||
duration=$((end_time - start_time))
|
||||
echo ""
|
||||
echo " [PASS] $test (${duration}s)"
|
||||
passed=$((passed + 1))
|
||||
else
|
||||
exit_code=$?
|
||||
end_time=$(date +%s)
|
||||
duration=$((end_time - start_time))
|
||||
echo ""
|
||||
if [ $exit_code -eq 124 ]; then
|
||||
echo " [FAIL] $test (timeout after ${TIMEOUT}s)"
|
||||
else
|
||||
echo " [FAIL] $test (${duration}s)"
|
||||
fi
|
||||
failed=$((failed + 1))
|
||||
fi
|
||||
else
|
||||
# Capture output for non-verbose mode
|
||||
if output=$(timeout "$TIMEOUT" bash "$test_path" 2>&1); then
|
||||
end_time=$(date +%s)
|
||||
duration=$((end_time - start_time))
|
||||
echo " [PASS] (${duration}s)"
|
||||
passed=$((passed + 1))
|
||||
else
|
||||
exit_code=$?
|
||||
end_time=$(date +%s)
|
||||
duration=$((end_time - start_time))
|
||||
if [ $exit_code -eq 124 ]; then
|
||||
echo " [FAIL] (timeout after ${TIMEOUT}s)"
|
||||
else
|
||||
echo " [FAIL] (${duration}s)"
|
||||
fi
|
||||
echo ""
|
||||
echo " Output:"
|
||||
echo "$output" | sed 's/^/ /'
|
||||
failed=$((failed + 1))
|
||||
fi
|
||||
fi
|
||||
|
||||
echo ""
|
||||
done
|
||||
|
||||
# Print summary
|
||||
echo "========================================"
|
||||
echo " Test Results Summary"
|
||||
echo "========================================"
|
||||
echo ""
|
||||
echo " Passed: $passed"
|
||||
echo " Failed: $failed"
|
||||
echo " Skipped: $skipped"
|
||||
echo ""
|
||||
|
||||
if [ "$RUN_INTEGRATION" = false ] && [ ${#integration_tests[@]} -gt 0 ]; then
|
||||
echo "Note: Integration tests were not run (they take 10-30 minutes)."
|
||||
echo "Use --integration flag to run full workflow execution tests."
|
||||
echo ""
|
||||
fi
|
||||
|
||||
if [ $failed -gt 0 ]; then
|
||||
echo "STATUS: FAILED"
|
||||
exit 1
|
||||
else
|
||||
echo "STATUS: PASSED"
|
||||
exit 0
|
||||
fi
|
||||
202
tests/claude-code/test-helpers.sh
Executable file
202
tests/claude-code/test-helpers.sh
Executable file
@@ -0,0 +1,202 @@
|
||||
#!/usr/bin/env bash
|
||||
# Helper functions for Claude Code skill tests
|
||||
|
||||
# Run Claude Code with a prompt and capture output
|
||||
# Usage: run_claude "prompt text" [timeout_seconds] [allowed_tools]
|
||||
run_claude() {
|
||||
local prompt="$1"
|
||||
local timeout="${2:-60}"
|
||||
local allowed_tools="${3:-}"
|
||||
local output_file=$(mktemp)
|
||||
|
||||
# Build command
|
||||
local cmd="claude -p \"$prompt\""
|
||||
if [ -n "$allowed_tools" ]; then
|
||||
cmd="$cmd --allowed-tools=$allowed_tools"
|
||||
fi
|
||||
|
||||
# Run Claude in headless mode with timeout
|
||||
if timeout "$timeout" bash -c "$cmd" > "$output_file" 2>&1; then
|
||||
cat "$output_file"
|
||||
rm -f "$output_file"
|
||||
return 0
|
||||
else
|
||||
local exit_code=$?
|
||||
cat "$output_file" >&2
|
||||
rm -f "$output_file"
|
||||
return $exit_code
|
||||
fi
|
||||
}
|
||||
|
||||
# Check if output contains a pattern
|
||||
# Usage: assert_contains "output" "pattern" "test name"
|
||||
assert_contains() {
|
||||
local output="$1"
|
||||
local pattern="$2"
|
||||
local test_name="${3:-test}"
|
||||
|
||||
if echo "$output" | grep -q "$pattern"; then
|
||||
echo " [PASS] $test_name"
|
||||
return 0
|
||||
else
|
||||
echo " [FAIL] $test_name"
|
||||
echo " Expected to find: $pattern"
|
||||
echo " In output:"
|
||||
echo "$output" | sed 's/^/ /'
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Check if output does NOT contain a pattern
|
||||
# Usage: assert_not_contains "output" "pattern" "test name"
|
||||
assert_not_contains() {
|
||||
local output="$1"
|
||||
local pattern="$2"
|
||||
local test_name="${3:-test}"
|
||||
|
||||
if echo "$output" | grep -q "$pattern"; then
|
||||
echo " [FAIL] $test_name"
|
||||
echo " Did not expect to find: $pattern"
|
||||
echo " In output:"
|
||||
echo "$output" | sed 's/^/ /'
|
||||
return 1
|
||||
else
|
||||
echo " [PASS] $test_name"
|
||||
return 0
|
||||
fi
|
||||
}
|
||||
|
||||
# Check if output matches a count
|
||||
# Usage: assert_count "output" "pattern" expected_count "test name"
|
||||
assert_count() {
|
||||
local output="$1"
|
||||
local pattern="$2"
|
||||
local expected="$3"
|
||||
local test_name="${4:-test}"
|
||||
|
||||
local actual=$(echo "$output" | grep -c "$pattern" || echo "0")
|
||||
|
||||
if [ "$actual" -eq "$expected" ]; then
|
||||
echo " [PASS] $test_name (found $actual instances)"
|
||||
return 0
|
||||
else
|
||||
echo " [FAIL] $test_name"
|
||||
echo " Expected $expected instances of: $pattern"
|
||||
echo " Found $actual instances"
|
||||
echo " In output:"
|
||||
echo "$output" | sed 's/^/ /'
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Check if pattern A appears before pattern B
|
||||
# Usage: assert_order "output" "pattern_a" "pattern_b" "test name"
|
||||
assert_order() {
|
||||
local output="$1"
|
||||
local pattern_a="$2"
|
||||
local pattern_b="$3"
|
||||
local test_name="${4:-test}"
|
||||
|
||||
# Get line numbers where patterns appear
|
||||
local line_a=$(echo "$output" | grep -n "$pattern_a" | head -1 | cut -d: -f1)
|
||||
local line_b=$(echo "$output" | grep -n "$pattern_b" | head -1 | cut -d: -f1)
|
||||
|
||||
if [ -z "$line_a" ]; then
|
||||
echo " [FAIL] $test_name: pattern A not found: $pattern_a"
|
||||
return 1
|
||||
fi
|
||||
|
||||
if [ -z "$line_b" ]; then
|
||||
echo " [FAIL] $test_name: pattern B not found: $pattern_b"
|
||||
return 1
|
||||
fi
|
||||
|
||||
if [ "$line_a" -lt "$line_b" ]; then
|
||||
echo " [PASS] $test_name (A at line $line_a, B at line $line_b)"
|
||||
return 0
|
||||
else
|
||||
echo " [FAIL] $test_name"
|
||||
echo " Expected '$pattern_a' before '$pattern_b'"
|
||||
echo " But found A at line $line_a, B at line $line_b"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Create a temporary test project directory
|
||||
# Usage: test_project=$(create_test_project)
|
||||
create_test_project() {
|
||||
local test_dir=$(mktemp -d)
|
||||
echo "$test_dir"
|
||||
}
|
||||
|
||||
# Cleanup test project
|
||||
# Usage: cleanup_test_project "$test_dir"
|
||||
cleanup_test_project() {
|
||||
local test_dir="$1"
|
||||
if [ -d "$test_dir" ]; then
|
||||
rm -rf "$test_dir"
|
||||
fi
|
||||
}
|
||||
|
||||
# Create a simple plan file for testing
|
||||
# Usage: create_test_plan "$project_dir" "$plan_name"
|
||||
create_test_plan() {
|
||||
local project_dir="$1"
|
||||
local plan_name="${2:-test-plan}"
|
||||
local plan_file="$project_dir/docs/plans/$plan_name.md"
|
||||
|
||||
mkdir -p "$(dirname "$plan_file")"
|
||||
|
||||
cat > "$plan_file" <<'EOF'
|
||||
# Test Implementation Plan
|
||||
|
||||
## Task 1: Create Hello Function
|
||||
|
||||
Create a simple hello function that returns "Hello, World!".
|
||||
|
||||
**File:** `src/hello.js`
|
||||
|
||||
**Implementation:**
|
||||
```javascript
|
||||
export function hello() {
|
||||
return "Hello, World!";
|
||||
}
|
||||
```
|
||||
|
||||
**Tests:** Write a test that verifies the function returns the expected string.
|
||||
|
||||
**Verification:** `npm test`
|
||||
|
||||
## Task 2: Create Goodbye Function
|
||||
|
||||
Create a goodbye function that takes a name and returns a goodbye message.
|
||||
|
||||
**File:** `src/goodbye.js`
|
||||
|
||||
**Implementation:**
|
||||
```javascript
|
||||
export function goodbye(name) {
|
||||
return `Goodbye, ${name}!`;
|
||||
}
|
||||
```
|
||||
|
||||
**Tests:** Write tests for:
|
||||
- Default name
|
||||
- Custom name
|
||||
- Edge cases (empty string, null)
|
||||
|
||||
**Verification:** `npm test`
|
||||
EOF
|
||||
|
||||
echo "$plan_file"
|
||||
}
|
||||
|
||||
# Export functions for use in tests
|
||||
export -f run_claude
|
||||
export -f assert_contains
|
||||
export -f assert_not_contains
|
||||
export -f assert_count
|
||||
export -f assert_order
|
||||
export -f create_test_project
|
||||
export -f cleanup_test_project
|
||||
export -f create_test_plan
|
||||
314
tests/claude-code/test-subagent-driven-development-integration.sh
Executable file
314
tests/claude-code/test-subagent-driven-development-integration.sh
Executable file
@@ -0,0 +1,314 @@
|
||||
#!/usr/bin/env bash
|
||||
# Integration Test: subagent-driven-development workflow
|
||||
# Actually executes a plan and verifies the new workflow behaviors
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
source "$SCRIPT_DIR/test-helpers.sh"
|
||||
|
||||
echo "========================================"
|
||||
echo " Integration Test: subagent-driven-development"
|
||||
echo "========================================"
|
||||
echo ""
|
||||
echo "This test executes a real plan using the skill and verifies:"
|
||||
echo " 1. Plan is read once (not per task)"
|
||||
echo " 2. Full task text provided to subagents"
|
||||
echo " 3. Subagents perform self-review"
|
||||
echo " 4. Spec compliance review before code quality"
|
||||
echo " 5. Review loops when issues found"
|
||||
echo " 6. Spec reviewer reads code independently"
|
||||
echo ""
|
||||
echo "WARNING: This test may take 10-30 minutes to complete."
|
||||
echo ""
|
||||
|
||||
# Create test project
|
||||
TEST_PROJECT=$(create_test_project)
|
||||
echo "Test project: $TEST_PROJECT"
|
||||
|
||||
# Trap to cleanup
|
||||
trap "cleanup_test_project $TEST_PROJECT" EXIT
|
||||
|
||||
# Set up minimal Node.js project
|
||||
cd "$TEST_PROJECT"
|
||||
|
||||
cat > package.json <<'EOF'
|
||||
{
|
||||
"name": "test-project",
|
||||
"version": "1.0.0",
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"test": "node --test"
|
||||
}
|
||||
}
|
||||
EOF
|
||||
|
||||
mkdir -p src test docs/plans
|
||||
|
||||
# Create a simple implementation plan
|
||||
cat > docs/plans/implementation-plan.md <<'EOF'
|
||||
# Test Implementation Plan
|
||||
|
||||
This is a minimal plan to test the subagent-driven-development workflow.
|
||||
|
||||
## Task 1: Create Add Function
|
||||
|
||||
Create a function that adds two numbers.
|
||||
|
||||
**File:** `src/math.js`
|
||||
|
||||
**Requirements:**
|
||||
- Function named `add`
|
||||
- Takes two parameters: `a` and `b`
|
||||
- Returns the sum of `a` and `b`
|
||||
- Export the function
|
||||
|
||||
**Implementation:**
|
||||
```javascript
|
||||
export function add(a, b) {
|
||||
return a + b;
|
||||
}
|
||||
```
|
||||
|
||||
**Tests:** Create `test/math.test.js` that verifies:
|
||||
- `add(2, 3)` returns `5`
|
||||
- `add(0, 0)` returns `0`
|
||||
- `add(-1, 1)` returns `0`
|
||||
|
||||
**Verification:** `npm test`
|
||||
|
||||
## Task 2: Create Multiply Function
|
||||
|
||||
Create a function that multiplies two numbers.
|
||||
|
||||
**File:** `src/math.js` (add to existing file)
|
||||
|
||||
**Requirements:**
|
||||
- Function named `multiply`
|
||||
- Takes two parameters: `a` and `b`
|
||||
- Returns the product of `a` and `b`
|
||||
- Export the function
|
||||
- DO NOT add any extra features (like power, divide, etc.)
|
||||
|
||||
**Implementation:**
|
||||
```javascript
|
||||
export function multiply(a, b) {
|
||||
return a * b;
|
||||
}
|
||||
```
|
||||
|
||||
**Tests:** Add to `test/math.test.js`:
|
||||
- `multiply(2, 3)` returns `6`
|
||||
- `multiply(0, 5)` returns `0`
|
||||
- `multiply(-2, 3)` returns `-6`
|
||||
|
||||
**Verification:** `npm test`
|
||||
EOF
|
||||
|
||||
# Initialize git repo
|
||||
git init --quiet
|
||||
git config user.email "test@test.com"
|
||||
git config user.name "Test User"
|
||||
git add .
|
||||
git commit -m "Initial commit" --quiet
|
||||
|
||||
echo ""
|
||||
echo "Project setup complete. Starting execution..."
|
||||
echo ""
|
||||
|
||||
# Run Claude with subagent-driven-development
|
||||
# Capture full output to analyze
|
||||
OUTPUT_FILE="$TEST_PROJECT/claude-output.txt"
|
||||
|
||||
# Create prompt file
|
||||
cat > "$TEST_PROJECT/prompt.txt" <<'EOF'
|
||||
I want you to execute the implementation plan at docs/plans/implementation-plan.md using the subagent-driven-development skill.
|
||||
|
||||
IMPORTANT: Follow the skill exactly. I will be verifying that you:
|
||||
1. Read the plan once at the beginning
|
||||
2. Provide full task text to subagents (don't make them read files)
|
||||
3. Ensure subagents do self-review before reporting
|
||||
4. Run spec compliance review before code quality review
|
||||
5. Use review loops when issues are found
|
||||
|
||||
Begin now. Execute the plan.
|
||||
EOF
|
||||
|
||||
# Note: We use a longer timeout since this is integration testing
|
||||
# Use --allowed-tools to enable tool usage in headless mode
|
||||
# IMPORTANT: Run from superpowers directory so local dev skills are available
|
||||
PROMPT="Change to directory $TEST_PROJECT and then execute the implementation plan at docs/plans/implementation-plan.md using the subagent-driven-development skill.
|
||||
|
||||
IMPORTANT: Follow the skill exactly. I will be verifying that you:
|
||||
1. Read the plan once at the beginning
|
||||
2. Provide full task text to subagents (don't make them read files)
|
||||
3. Ensure subagents do self-review before reporting
|
||||
4. Run spec compliance review before code quality review
|
||||
5. Use review loops when issues are found
|
||||
|
||||
Begin now. Execute the plan."
|
||||
|
||||
echo "Running Claude (output will be shown below and saved to $OUTPUT_FILE)..."
|
||||
echo "================================================================================"
|
||||
cd "$SCRIPT_DIR/../.." && timeout 1800 claude -p "$PROMPT" --allowed-tools=all --add-dir "$TEST_PROJECT" --permission-mode bypassPermissions 2>&1 | tee "$OUTPUT_FILE" || {
|
||||
echo ""
|
||||
echo "================================================================================"
|
||||
echo "EXECUTION FAILED (exit code: $?)"
|
||||
exit 1
|
||||
}
|
||||
echo "================================================================================"
|
||||
|
||||
echo ""
|
||||
echo "Execution complete. Analyzing results..."
|
||||
echo ""
|
||||
|
||||
# Find the session transcript
|
||||
# Session files are in ~/.claude/projects/-<working-dir>/<session-id>.jsonl
|
||||
WORKING_DIR_ESCAPED=$(echo "$SCRIPT_DIR/../.." | sed 's/\//-/g' | sed 's/^-//')
|
||||
SESSION_DIR="$HOME/.claude/projects/$WORKING_DIR_ESCAPED"
|
||||
|
||||
# Find the most recent session file (created during this test run)
|
||||
SESSION_FILE=$(find "$SESSION_DIR" -name "*.jsonl" -type f -mmin -60 2>/dev/null | sort -r | head -1)
|
||||
|
||||
if [ -z "$SESSION_FILE" ]; then
|
||||
echo "ERROR: Could not find session transcript file"
|
||||
echo "Looked in: $SESSION_DIR"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Analyzing session transcript: $(basename "$SESSION_FILE")"
|
||||
echo ""
|
||||
|
||||
# Verification tests
|
||||
FAILED=0
|
||||
|
||||
echo "=== Verification Tests ==="
|
||||
echo ""
|
||||
|
||||
# Test 1: Skill was invoked
|
||||
echo "Test 1: Skill tool invoked..."
|
||||
if grep -q '"name":"Skill".*"skill":"superpowers:subagent-driven-development"' "$SESSION_FILE"; then
|
||||
echo " [PASS] subagent-driven-development skill was invoked"
|
||||
else
|
||||
echo " [FAIL] Skill was not invoked"
|
||||
FAILED=$((FAILED + 1))
|
||||
fi
|
||||
echo ""
|
||||
|
||||
# Test 2: Subagents were used (Task tool)
|
||||
echo "Test 2: Subagents dispatched..."
|
||||
task_count=$(grep -c '"name":"Task"' "$SESSION_FILE" || echo "0")
|
||||
if [ "$task_count" -ge 2 ]; then
|
||||
echo " [PASS] $task_count subagents dispatched"
|
||||
else
|
||||
echo " [FAIL] Only $task_count subagent(s) dispatched (expected >= 2)"
|
||||
FAILED=$((FAILED + 1))
|
||||
fi
|
||||
echo ""
|
||||
|
||||
# Test 3: TodoWrite was used for tracking
|
||||
echo "Test 3: Task tracking..."
|
||||
todo_count=$(grep -c '"name":"TodoWrite"' "$SESSION_FILE" || echo "0")
|
||||
if [ "$todo_count" -ge 1 ]; then
|
||||
echo " [PASS] TodoWrite used $todo_count time(s) for task tracking"
|
||||
else
|
||||
echo " [FAIL] TodoWrite not used"
|
||||
FAILED=$((FAILED + 1))
|
||||
fi
|
||||
echo ""
|
||||
|
||||
# Test 6: Implementation actually works
|
||||
echo "Test 6: Implementation verification..."
|
||||
if [ -f "$TEST_PROJECT/src/math.js" ]; then
|
||||
echo " [PASS] src/math.js created"
|
||||
|
||||
if grep -q "export function add" "$TEST_PROJECT/src/math.js"; then
|
||||
echo " [PASS] add function exists"
|
||||
else
|
||||
echo " [FAIL] add function missing"
|
||||
FAILED=$((FAILED + 1))
|
||||
fi
|
||||
|
||||
if grep -q "export function multiply" "$TEST_PROJECT/src/math.js"; then
|
||||
echo " [PASS] multiply function exists"
|
||||
else
|
||||
echo " [FAIL] multiply function missing"
|
||||
FAILED=$((FAILED + 1))
|
||||
fi
|
||||
else
|
||||
echo " [FAIL] src/math.js not created"
|
||||
FAILED=$((FAILED + 1))
|
||||
fi
|
||||
|
||||
if [ -f "$TEST_PROJECT/test/math.test.js" ]; then
|
||||
echo " [PASS] test/math.test.js created"
|
||||
else
|
||||
echo " [FAIL] test/math.test.js not created"
|
||||
FAILED=$((FAILED + 1))
|
||||
fi
|
||||
|
||||
# Try running tests
|
||||
if cd "$TEST_PROJECT" && npm test > test-output.txt 2>&1; then
|
||||
echo " [PASS] Tests pass"
|
||||
else
|
||||
echo " [FAIL] Tests failed"
|
||||
cat test-output.txt
|
||||
FAILED=$((FAILED + 1))
|
||||
fi
|
||||
echo ""
|
||||
|
||||
# Test 7: Git commits show proper workflow
|
||||
echo "Test 7: Git commit history..."
|
||||
commit_count=$(git -C "$TEST_PROJECT" log --oneline | wc -l)
|
||||
if [ "$commit_count" -gt 2 ]; then # Initial + at least 2 task commits
|
||||
echo " [PASS] Multiple commits created ($commit_count total)"
|
||||
else
|
||||
echo " [FAIL] Too few commits ($commit_count, expected >2)"
|
||||
FAILED=$((FAILED + 1))
|
||||
fi
|
||||
echo ""
|
||||
|
||||
# Test 8: Check for extra features (spec compliance should catch)
|
||||
echo "Test 8: No extra features added (spec compliance)..."
|
||||
if grep -q "export function divide\|export function power\|export function subtract" "$TEST_PROJECT/src/math.js" 2>/dev/null; then
|
||||
echo " [WARN] Extra features found (spec review should have caught this)"
|
||||
# Not failing on this as it tests reviewer effectiveness
|
||||
else
|
||||
echo " [PASS] No extra features added"
|
||||
fi
|
||||
echo ""
|
||||
|
||||
# Token Usage Analysis
|
||||
echo "========================================="
|
||||
echo " Token Usage Analysis"
|
||||
echo "========================================="
|
||||
echo ""
|
||||
python3 "$SCRIPT_DIR/analyze-token-usage.py" "$SESSION_FILE"
|
||||
echo ""
|
||||
|
||||
# Summary
|
||||
echo "========================================"
|
||||
echo " Test Summary"
|
||||
echo "========================================"
|
||||
echo ""
|
||||
|
||||
if [ $FAILED -eq 0 ]; then
|
||||
echo "STATUS: PASSED"
|
||||
echo "All verification tests passed!"
|
||||
echo ""
|
||||
echo "The subagent-driven-development skill correctly:"
|
||||
echo " ✓ Reads plan once at start"
|
||||
echo " ✓ Provides full task text to subagents"
|
||||
echo " ✓ Enforces self-review"
|
||||
echo " ✓ Runs spec compliance before code quality"
|
||||
echo " ✓ Spec reviewer verifies independently"
|
||||
echo " ✓ Produces working implementation"
|
||||
exit 0
|
||||
else
|
||||
echo "STATUS: FAILED"
|
||||
echo "Failed $FAILED verification tests"
|
||||
echo ""
|
||||
echo "Output saved to: $OUTPUT_FILE"
|
||||
echo ""
|
||||
echo "Review the output to see what went wrong."
|
||||
exit 1
|
||||
fi
|
||||
139
tests/claude-code/test-subagent-driven-development.sh
Executable file
139
tests/claude-code/test-subagent-driven-development.sh
Executable file
@@ -0,0 +1,139 @@
|
||||
#!/usr/bin/env bash
|
||||
# Test: subagent-driven-development skill
|
||||
# Verifies that the skill is loaded and follows correct workflow
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
source "$SCRIPT_DIR/test-helpers.sh"
|
||||
|
||||
echo "=== Test: subagent-driven-development skill ==="
|
||||
echo ""
|
||||
|
||||
# Test 1: Verify skill can be loaded
|
||||
echo "Test 1: Skill loading..."
|
||||
|
||||
output=$(run_claude "What is the subagent-driven-development skill? Describe its key steps briefly." 30)
|
||||
|
||||
if assert_contains "$output" "subagent-driven-development" "Skill is recognized"; then
|
||||
: # pass
|
||||
else
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if assert_contains "$output" "Load Plan\|read.*plan\|extract.*tasks" "Mentions loading plan"; then
|
||||
: # pass
|
||||
else
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo ""
|
||||
|
||||
# Test 2: Verify skill describes correct workflow order
|
||||
echo "Test 2: Workflow ordering..."
|
||||
|
||||
output=$(run_claude "In the subagent-driven-development skill, what comes first: spec compliance review or code quality review? Be specific about the order." 30)
|
||||
|
||||
if assert_order "$output" "spec.*compliance" "code.*quality" "Spec compliance before code quality"; then
|
||||
: # pass
|
||||
else
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo ""
|
||||
|
||||
# Test 3: Verify self-review is mentioned
|
||||
echo "Test 3: Self-review requirement..."
|
||||
|
||||
output=$(run_claude "Does the subagent-driven-development skill require implementers to do self-review? What should they check?" 30)
|
||||
|
||||
if assert_contains "$output" "self-review\|self review" "Mentions self-review"; then
|
||||
: # pass
|
||||
else
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if assert_contains "$output" "completeness\|Completeness" "Checks completeness"; then
|
||||
: # pass
|
||||
else
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo ""
|
||||
|
||||
# Test 4: Verify plan is read once
|
||||
echo "Test 4: Plan reading efficiency..."
|
||||
|
||||
output=$(run_claude "In subagent-driven-development, how many times should the controller read the plan file? When does this happen?" 30)
|
||||
|
||||
if assert_contains "$output" "once\|one time\|single" "Read plan once"; then
|
||||
: # pass
|
||||
else
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if assert_contains "$output" "Step 1\|beginning\|start\|Load Plan" "Read at beginning"; then
|
||||
: # pass
|
||||
else
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo ""
|
||||
|
||||
# Test 5: Verify spec compliance reviewer is skeptical
|
||||
echo "Test 5: Spec compliance reviewer mindset..."
|
||||
|
||||
output=$(run_claude "What is the spec compliance reviewer's attitude toward the implementer's report in subagent-driven-development?" 30)
|
||||
|
||||
if assert_contains "$output" "not trust\|don't trust\|skeptical\|verify.*independently\|suspiciously" "Reviewer is skeptical"; then
|
||||
: # pass
|
||||
else
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if assert_contains "$output" "read.*code\|inspect.*code\|verify.*code" "Reviewer reads code"; then
|
||||
: # pass
|
||||
else
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo ""
|
||||
|
||||
# Test 6: Verify review loops
|
||||
echo "Test 6: Review loop requirements..."
|
||||
|
||||
output=$(run_claude "In subagent-driven-development, what happens if a reviewer finds issues? Is it a one-time review or a loop?" 30)
|
||||
|
||||
if assert_contains "$output" "loop\|again\|repeat\|until.*approved\|until.*compliant" "Review loops mentioned"; then
|
||||
: # pass
|
||||
else
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if assert_contains "$output" "implementer.*fix\|fix.*issues" "Implementer fixes issues"; then
|
||||
: # pass
|
||||
else
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo ""
|
||||
|
||||
# Test 7: Verify full task text is provided
|
||||
echo "Test 7: Task context provision..."
|
||||
|
||||
output=$(run_claude "In subagent-driven-development, how does the controller provide task information to the implementer subagent? Does it make them read a file or provide it directly?" 30)
|
||||
|
||||
if assert_contains "$output" "provide.*directly\|full.*text\|paste\|include.*prompt" "Provides text directly"; then
|
||||
: # pass
|
||||
else
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if assert_not_contains "$output" "read.*file\|open.*file" "Doesn't make subagent read file"; then
|
||||
: # pass
|
||||
else
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo ""
|
||||
|
||||
echo "=== All subagent-driven-development skill tests passed ==="
|
||||
@@ -0,0 +1,8 @@
|
||||
I have 4 independent test failures happening in different modules:
|
||||
|
||||
1. tests/auth/login.test.ts - "should redirect after login" is failing
|
||||
2. tests/api/users.test.ts - "should return user list" returns 500
|
||||
3. tests/components/Button.test.tsx - snapshot mismatch
|
||||
4. tests/utils/date.test.ts - timezone handling broken
|
||||
|
||||
These are unrelated issues in different parts of the codebase. Can you investigate all of them?
|
||||
1
tests/skill-triggering/prompts/executing-plans.txt
Normal file
1
tests/skill-triggering/prompts/executing-plans.txt
Normal file
@@ -0,0 +1 @@
|
||||
I have a plan document at docs/plans/2024-01-15-auth-system.md that needs to be executed. Please implement it.
|
||||
@@ -0,0 +1,3 @@
|
||||
I just finished implementing the user authentication feature. All the code is committed. Can you review the changes before I merge to main?
|
||||
|
||||
The commits are between abc123 and def456.
|
||||
11
tests/skill-triggering/prompts/systematic-debugging.txt
Normal file
11
tests/skill-triggering/prompts/systematic-debugging.txt
Normal file
@@ -0,0 +1,11 @@
|
||||
The tests are failing with this error:
|
||||
|
||||
```
|
||||
FAIL src/utils/parser.test.ts
|
||||
● Parser › should handle nested objects
|
||||
TypeError: Cannot read property 'value' of undefined
|
||||
at parse (src/utils/parser.ts:42:18)
|
||||
at Object.<anonymous> (src/utils/parser.test.ts:28:20)
|
||||
```
|
||||
|
||||
Can you figure out what's going wrong and fix it?
|
||||
@@ -0,0 +1,7 @@
|
||||
I need to add a new feature to validate email addresses. It should:
|
||||
- Check that there's an @ symbol
|
||||
- Check that there's at least one character before the @
|
||||
- Check that there's a dot in the domain part
|
||||
- Return true/false
|
||||
|
||||
Can you implement this?
|
||||
10
tests/skill-triggering/prompts/writing-plans.txt
Normal file
10
tests/skill-triggering/prompts/writing-plans.txt
Normal file
@@ -0,0 +1,10 @@
|
||||
Here's the spec for our new authentication system:
|
||||
|
||||
Requirements:
|
||||
- Users can register with email/password
|
||||
- Users can log in and receive a JWT token
|
||||
- Protected routes require valid JWT
|
||||
- Tokens expire after 24 hours
|
||||
- Support password reset via email
|
||||
|
||||
We need to implement this. There are multiple steps involved - user model, auth routes, middleware, email service integration.
|
||||
60
tests/skill-triggering/run-all.sh
Executable file
60
tests/skill-triggering/run-all.sh
Executable file
@@ -0,0 +1,60 @@
|
||||
#!/bin/bash
|
||||
# Run all skill triggering tests
|
||||
# Usage: ./run-all.sh
|
||||
|
||||
set -e
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROMPTS_DIR="$SCRIPT_DIR/prompts"
|
||||
|
||||
SKILLS=(
|
||||
"systematic-debugging"
|
||||
"test-driven-development"
|
||||
"writing-plans"
|
||||
"dispatching-parallel-agents"
|
||||
"executing-plans"
|
||||
"requesting-code-review"
|
||||
)
|
||||
|
||||
echo "=== Running Skill Triggering Tests ==="
|
||||
echo ""
|
||||
|
||||
PASSED=0
|
||||
FAILED=0
|
||||
RESULTS=()
|
||||
|
||||
for skill in "${SKILLS[@]}"; do
|
||||
prompt_file="$PROMPTS_DIR/${skill}.txt"
|
||||
|
||||
if [ ! -f "$prompt_file" ]; then
|
||||
echo "⚠️ SKIP: No prompt file for $skill"
|
||||
continue
|
||||
fi
|
||||
|
||||
echo "Testing: $skill"
|
||||
|
||||
if "$SCRIPT_DIR/run-test.sh" "$skill" "$prompt_file" 3 2>&1 | tee /tmp/skill-test-$skill.log; then
|
||||
PASSED=$((PASSED + 1))
|
||||
RESULTS+=("✅ $skill")
|
||||
else
|
||||
FAILED=$((FAILED + 1))
|
||||
RESULTS+=("❌ $skill")
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "---"
|
||||
echo ""
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "=== Summary ==="
|
||||
for result in "${RESULTS[@]}"; do
|
||||
echo " $result"
|
||||
done
|
||||
echo ""
|
||||
echo "Passed: $PASSED"
|
||||
echo "Failed: $FAILED"
|
||||
|
||||
if [ $FAILED -gt 0 ]; then
|
||||
exit 1
|
||||
fi
|
||||
88
tests/skill-triggering/run-test.sh
Executable file
88
tests/skill-triggering/run-test.sh
Executable file
@@ -0,0 +1,88 @@
|
||||
#!/bin/bash
|
||||
# Test skill triggering with naive prompts
|
||||
# Usage: ./run-test.sh <skill-name> <prompt-file>
|
||||
#
|
||||
# Tests whether Claude triggers a skill based on a natural prompt
|
||||
# (without explicitly mentioning the skill)
|
||||
|
||||
set -e
|
||||
|
||||
SKILL_NAME="$1"
|
||||
PROMPT_FILE="$2"
|
||||
MAX_TURNS="${3:-3}"
|
||||
|
||||
if [ -z "$SKILL_NAME" ] || [ -z "$PROMPT_FILE" ]; then
|
||||
echo "Usage: $0 <skill-name> <prompt-file> [max-turns]"
|
||||
echo "Example: $0 systematic-debugging ./test-prompts/debugging.txt"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Get the directory where this script lives (should be tests/skill-triggering)
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
# Get the superpowers plugin root (two levels up from tests/skill-triggering)
|
||||
PLUGIN_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
|
||||
|
||||
TIMESTAMP=$(date +%s)
|
||||
OUTPUT_DIR="/tmp/superpowers-tests/${TIMESTAMP}/skill-triggering/${SKILL_NAME}"
|
||||
mkdir -p "$OUTPUT_DIR"
|
||||
|
||||
# Read prompt from file
|
||||
PROMPT=$(cat "$PROMPT_FILE")
|
||||
|
||||
echo "=== Skill Triggering Test ==="
|
||||
echo "Skill: $SKILL_NAME"
|
||||
echo "Prompt file: $PROMPT_FILE"
|
||||
echo "Max turns: $MAX_TURNS"
|
||||
echo "Output dir: $OUTPUT_DIR"
|
||||
echo ""
|
||||
|
||||
# Copy prompt for reference
|
||||
cp "$PROMPT_FILE" "$OUTPUT_DIR/prompt.txt"
|
||||
|
||||
# Run Claude
|
||||
LOG_FILE="$OUTPUT_DIR/claude-output.json"
|
||||
cd "$OUTPUT_DIR"
|
||||
|
||||
echo "Plugin dir: $PLUGIN_DIR"
|
||||
echo "Running claude -p with naive prompt..."
|
||||
timeout 300 claude -p "$PROMPT" \
|
||||
--plugin-dir "$PLUGIN_DIR" \
|
||||
--dangerously-skip-permissions \
|
||||
--max-turns "$MAX_TURNS" \
|
||||
--output-format stream-json \
|
||||
> "$LOG_FILE" 2>&1 || true
|
||||
|
||||
echo ""
|
||||
echo "=== Results ==="
|
||||
|
||||
# Check if skill was triggered (look for Skill tool invocation)
|
||||
# In stream-json, tool invocations have "name":"Skill" (not "tool":"Skill")
|
||||
# Match either "skill":"skillname" or "skill":"namespace:skillname"
|
||||
SKILL_PATTERN='"skill":"([^"]*:)?'"${SKILL_NAME}"'"'
|
||||
if grep -q '"name":"Skill"' "$LOG_FILE" && grep -qE "$SKILL_PATTERN" "$LOG_FILE"; then
|
||||
echo "✅ PASS: Skill '$SKILL_NAME' was triggered"
|
||||
TRIGGERED=true
|
||||
else
|
||||
echo "❌ FAIL: Skill '$SKILL_NAME' was NOT triggered"
|
||||
TRIGGERED=false
|
||||
fi
|
||||
|
||||
# Show what skills WERE triggered
|
||||
echo ""
|
||||
echo "Skills triggered in this run:"
|
||||
grep -o '"skill":"[^"]*"' "$LOG_FILE" 2>/dev/null | sort -u || echo " (none)"
|
||||
|
||||
# Show first assistant message
|
||||
echo ""
|
||||
echo "First assistant response (truncated):"
|
||||
grep '"type":"assistant"' "$LOG_FILE" | head -1 | jq -r '.message.content[0].text // .message.content' 2>/dev/null | head -c 500 || echo " (could not extract)"
|
||||
|
||||
echo ""
|
||||
echo "Full log: $LOG_FILE"
|
||||
echo "Timestamp: $TIMESTAMP"
|
||||
|
||||
if [ "$TRIGGERED" = "true" ]; then
|
||||
exit 0
|
||||
else
|
||||
exit 1
|
||||
fi
|
||||
81
tests/subagent-driven-dev/go-fractals/design.md
Normal file
81
tests/subagent-driven-dev/go-fractals/design.md
Normal file
@@ -0,0 +1,81 @@
|
||||
# Go Fractals CLI - Design
|
||||
|
||||
## Overview
|
||||
|
||||
A command-line tool that generates ASCII art fractals. Supports two fractal types with configurable output.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
# Sierpinski triangle
|
||||
fractals sierpinski --size 32 --depth 5
|
||||
|
||||
# Mandelbrot set
|
||||
fractals mandelbrot --width 80 --height 24 --iterations 100
|
||||
|
||||
# Custom character
|
||||
fractals sierpinski --size 16 --char '#'
|
||||
|
||||
# Help
|
||||
fractals --help
|
||||
fractals sierpinski --help
|
||||
```
|
||||
|
||||
## Commands
|
||||
|
||||
### `sierpinski`
|
||||
|
||||
Generates a Sierpinski triangle using recursive subdivision.
|
||||
|
||||
Flags:
|
||||
- `--size` (default: 32) - Width of the triangle base in characters
|
||||
- `--depth` (default: 5) - Recursion depth
|
||||
- `--char` (default: '*') - Character to use for filled points
|
||||
|
||||
Output: Triangle printed to stdout, one line per row.
|
||||
|
||||
### `mandelbrot`
|
||||
|
||||
Renders the Mandelbrot set as ASCII art. Maps iteration count to characters.
|
||||
|
||||
Flags:
|
||||
- `--width` (default: 80) - Output width in characters
|
||||
- `--height` (default: 24) - Output height in characters
|
||||
- `--iterations` (default: 100) - Maximum iterations for escape calculation
|
||||
- `--char` (default: gradient) - Single character, or omit for gradient " .:-=+*#%@"
|
||||
|
||||
Output: Rectangle printed to stdout.
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
cmd/
|
||||
fractals/
|
||||
main.go # Entry point, CLI setup
|
||||
internal/
|
||||
sierpinski/
|
||||
sierpinski.go # Algorithm
|
||||
sierpinski_test.go
|
||||
mandelbrot/
|
||||
mandelbrot.go # Algorithm
|
||||
mandelbrot_test.go
|
||||
cli/
|
||||
root.go # Root command, help
|
||||
sierpinski.go # Sierpinski subcommand
|
||||
mandelbrot.go # Mandelbrot subcommand
|
||||
```
|
||||
|
||||
## Dependencies
|
||||
|
||||
- Go 1.21+
|
||||
- `github.com/spf13/cobra` for CLI
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
1. `fractals --help` shows usage
|
||||
2. `fractals sierpinski` outputs a recognizable triangle
|
||||
3. `fractals mandelbrot` outputs a recognizable Mandelbrot set
|
||||
4. `--size`, `--width`, `--height`, `--depth`, `--iterations` flags work
|
||||
5. `--char` customizes output character
|
||||
6. Invalid inputs produce clear error messages
|
||||
7. All tests pass
|
||||
172
tests/subagent-driven-dev/go-fractals/plan.md
Normal file
172
tests/subagent-driven-dev/go-fractals/plan.md
Normal file
@@ -0,0 +1,172 @@
|
||||
# Go Fractals CLI - Implementation Plan
|
||||
|
||||
Execute this plan using the `superpowers:subagent-driven-development` skill.
|
||||
|
||||
## Context
|
||||
|
||||
Building a CLI tool that generates ASCII fractals. See `design.md` for full specification.
|
||||
|
||||
## Tasks
|
||||
|
||||
### Task 1: Project Setup
|
||||
|
||||
Create the Go module and directory structure.
|
||||
|
||||
**Do:**
|
||||
- Initialize `go.mod` with module name `github.com/superpowers-test/fractals`
|
||||
- Create directory structure: `cmd/fractals/`, `internal/sierpinski/`, `internal/mandelbrot/`, `internal/cli/`
|
||||
- Create minimal `cmd/fractals/main.go` that prints "fractals cli"
|
||||
- Add `github.com/spf13/cobra` dependency
|
||||
|
||||
**Verify:**
|
||||
- `go build ./cmd/fractals` succeeds
|
||||
- `./fractals` prints "fractals cli"
|
||||
|
||||
---
|
||||
|
||||
### Task 2: CLI Framework with Help
|
||||
|
||||
Set up Cobra root command with help output.
|
||||
|
||||
**Do:**
|
||||
- Create `internal/cli/root.go` with root command
|
||||
- Configure help text showing available subcommands
|
||||
- Wire root command into `main.go`
|
||||
|
||||
**Verify:**
|
||||
- `./fractals --help` shows usage with "sierpinski" and "mandelbrot" listed as available commands
|
||||
- `./fractals` (no args) shows help
|
||||
|
||||
---
|
||||
|
||||
### Task 3: Sierpinski Algorithm
|
||||
|
||||
Implement the Sierpinski triangle generation algorithm.
|
||||
|
||||
**Do:**
|
||||
- Create `internal/sierpinski/sierpinski.go`
|
||||
- Implement `Generate(size, depth int, char rune) []string` that returns lines of the triangle
|
||||
- Use recursive midpoint subdivision algorithm
|
||||
- Create `internal/sierpinski/sierpinski_test.go` with tests:
|
||||
- Small triangle (size=4, depth=2) matches expected output
|
||||
- Size=1 returns single character
|
||||
- Depth=0 returns filled triangle
|
||||
|
||||
**Verify:**
|
||||
- `go test ./internal/sierpinski/...` passes
|
||||
|
||||
---
|
||||
|
||||
### Task 4: Sierpinski CLI Integration
|
||||
|
||||
Wire the Sierpinski algorithm to a CLI subcommand.
|
||||
|
||||
**Do:**
|
||||
- Create `internal/cli/sierpinski.go` with `sierpinski` subcommand
|
||||
- Add flags: `--size` (default 32), `--depth` (default 5), `--char` (default '*')
|
||||
- Call `sierpinski.Generate()` and print result to stdout
|
||||
|
||||
**Verify:**
|
||||
- `./fractals sierpinski` outputs a triangle
|
||||
- `./fractals sierpinski --size 16 --depth 3` outputs smaller triangle
|
||||
- `./fractals sierpinski --help` shows flag documentation
|
||||
|
||||
---
|
||||
|
||||
### Task 5: Mandelbrot Algorithm
|
||||
|
||||
Implement the Mandelbrot set ASCII renderer.
|
||||
|
||||
**Do:**
|
||||
- Create `internal/mandelbrot/mandelbrot.go`
|
||||
- Implement `Render(width, height, maxIter int, char string) []string`
|
||||
- Map complex plane region (-2.5 to 1.0 real, -1.0 to 1.0 imaginary) to output dimensions
|
||||
- Map iteration count to character gradient " .:-=+*#%@" (or single char if provided)
|
||||
- Create `internal/mandelbrot/mandelbrot_test.go` with tests:
|
||||
- Output dimensions match requested width/height
|
||||
- Known point inside set (0,0) maps to max-iteration character
|
||||
- Known point outside set (2,0) maps to low-iteration character
|
||||
|
||||
**Verify:**
|
||||
- `go test ./internal/mandelbrot/...` passes
|
||||
|
||||
---
|
||||
|
||||
### Task 6: Mandelbrot CLI Integration
|
||||
|
||||
Wire the Mandelbrot algorithm to a CLI subcommand.
|
||||
|
||||
**Do:**
|
||||
- Create `internal/cli/mandelbrot.go` with `mandelbrot` subcommand
|
||||
- Add flags: `--width` (default 80), `--height` (default 24), `--iterations` (default 100), `--char` (default "")
|
||||
- Call `mandelbrot.Render()` and print result to stdout
|
||||
|
||||
**Verify:**
|
||||
- `./fractals mandelbrot` outputs recognizable Mandelbrot set
|
||||
- `./fractals mandelbrot --width 40 --height 12` outputs smaller version
|
||||
- `./fractals mandelbrot --help` shows flag documentation
|
||||
|
||||
---
|
||||
|
||||
### Task 7: Character Set Configuration
|
||||
|
||||
Ensure `--char` flag works consistently across both commands.
|
||||
|
||||
**Do:**
|
||||
- Verify Sierpinski `--char` flag passes character to algorithm
|
||||
- For Mandelbrot, `--char` should use single character instead of gradient
|
||||
- Add tests for custom character output
|
||||
|
||||
**Verify:**
|
||||
- `./fractals sierpinski --char '#'` uses '#' character
|
||||
- `./fractals mandelbrot --char '.'` uses '.' for all filled points
|
||||
- Tests pass
|
||||
|
||||
---
|
||||
|
||||
### Task 8: Input Validation and Error Handling
|
||||
|
||||
Add validation for invalid inputs.
|
||||
|
||||
**Do:**
|
||||
- Sierpinski: size must be > 0, depth must be >= 0
|
||||
- Mandelbrot: width/height must be > 0, iterations must be > 0
|
||||
- Return clear error messages for invalid inputs
|
||||
- Add tests for error cases
|
||||
|
||||
**Verify:**
|
||||
- `./fractals sierpinski --size 0` prints error, exits non-zero
|
||||
- `./fractals mandelbrot --width -1` prints error, exits non-zero
|
||||
- Error messages are clear and helpful
|
||||
|
||||
---
|
||||
|
||||
### Task 9: Integration Tests
|
||||
|
||||
Add integration tests that invoke the CLI.
|
||||
|
||||
**Do:**
|
||||
- Create `cmd/fractals/main_test.go` or `test/integration_test.go`
|
||||
- Test full CLI invocation for both commands
|
||||
- Verify output format and exit codes
|
||||
- Test error cases return non-zero exit
|
||||
|
||||
**Verify:**
|
||||
- `go test ./...` passes all tests including integration tests
|
||||
|
||||
---
|
||||
|
||||
### Task 10: README
|
||||
|
||||
Document usage and examples.
|
||||
|
||||
**Do:**
|
||||
- Create `README.md` with:
|
||||
- Project description
|
||||
- Installation: `go install ./cmd/fractals`
|
||||
- Usage examples for both commands
|
||||
- Example output (small samples)
|
||||
|
||||
**Verify:**
|
||||
- README accurately describes the tool
|
||||
- Examples in README actually work
|
||||
45
tests/subagent-driven-dev/go-fractals/scaffold.sh
Executable file
45
tests/subagent-driven-dev/go-fractals/scaffold.sh
Executable file
@@ -0,0 +1,45 @@
|
||||
#!/bin/bash
|
||||
# Scaffold the Go Fractals test project
|
||||
# Usage: ./scaffold.sh /path/to/target/directory
|
||||
|
||||
set -e
|
||||
|
||||
TARGET_DIR="${1:?Usage: $0 <target-directory>}"
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
|
||||
# Create target directory
|
||||
mkdir -p "$TARGET_DIR"
|
||||
cd "$TARGET_DIR"
|
||||
|
||||
# Initialize git repo
|
||||
git init
|
||||
|
||||
# Copy design and plan
|
||||
cp "$SCRIPT_DIR/design.md" .
|
||||
cp "$SCRIPT_DIR/plan.md" .
|
||||
|
||||
# Create .claude settings to allow reads/writes in this directory
|
||||
mkdir -p .claude
|
||||
cat > .claude/settings.local.json << 'SETTINGS'
|
||||
{
|
||||
"permissions": {
|
||||
"allow": [
|
||||
"Read(**)",
|
||||
"Edit(**)",
|
||||
"Write(**)",
|
||||
"Bash(go:*)",
|
||||
"Bash(mkdir:*)",
|
||||
"Bash(git:*)"
|
||||
]
|
||||
}
|
||||
}
|
||||
SETTINGS
|
||||
|
||||
# Create initial commit
|
||||
git add .
|
||||
git commit -m "Initial project setup with design and plan"
|
||||
|
||||
echo "Scaffolded Go Fractals project at: $TARGET_DIR"
|
||||
echo ""
|
||||
echo "To run the test:"
|
||||
echo " claude -p \"Execute this plan using superpowers:subagent-driven-development. Plan: $TARGET_DIR/plan.md\" --plugin-dir /path/to/superpowers"
|
||||
105
tests/subagent-driven-dev/run-test.sh
Executable file
105
tests/subagent-driven-dev/run-test.sh
Executable file
@@ -0,0 +1,105 @@
|
||||
#!/bin/bash
|
||||
# Run a subagent-driven-development test
|
||||
# Usage: ./run-test.sh <test-name> [--plugin-dir <path>]
|
||||
#
|
||||
# Example:
|
||||
# ./run-test.sh go-fractals
|
||||
# ./run-test.sh svelte-todo --plugin-dir /path/to/superpowers
|
||||
|
||||
set -e
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
TEST_NAME="${1:?Usage: $0 <test-name> [--plugin-dir <path>]}"
|
||||
shift
|
||||
|
||||
# Parse optional arguments
|
||||
PLUGIN_DIR=""
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--plugin-dir)
|
||||
PLUGIN_DIR="$2"
|
||||
shift 2
|
||||
;;
|
||||
*)
|
||||
echo "Unknown option: $1"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Default plugin dir to parent of tests directory
|
||||
if [[ -z "$PLUGIN_DIR" ]]; then
|
||||
PLUGIN_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
|
||||
fi
|
||||
|
||||
# Verify test exists
|
||||
TEST_DIR="$SCRIPT_DIR/$TEST_NAME"
|
||||
if [[ ! -d "$TEST_DIR" ]]; then
|
||||
echo "Error: Test '$TEST_NAME' not found at $TEST_DIR"
|
||||
echo "Available tests:"
|
||||
ls -1 "$SCRIPT_DIR" | grep -v '\.sh$' | grep -v '\.md$'
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Create timestamped output directory
|
||||
TIMESTAMP=$(date +%s)
|
||||
OUTPUT_BASE="/tmp/superpowers-tests/$TIMESTAMP/subagent-driven-development"
|
||||
OUTPUT_DIR="$OUTPUT_BASE/$TEST_NAME"
|
||||
mkdir -p "$OUTPUT_DIR"
|
||||
|
||||
echo "=== Subagent-Driven Development Test ==="
|
||||
echo "Test: $TEST_NAME"
|
||||
echo "Output: $OUTPUT_DIR"
|
||||
echo "Plugin: $PLUGIN_DIR"
|
||||
echo ""
|
||||
|
||||
# Scaffold the project
|
||||
echo ">>> Scaffolding project..."
|
||||
"$TEST_DIR/scaffold.sh" "$OUTPUT_DIR/project"
|
||||
echo ""
|
||||
|
||||
# Prepare the prompt
|
||||
PLAN_PATH="$OUTPUT_DIR/project/plan.md"
|
||||
PROMPT="Execute this plan using superpowers:subagent-driven-development. The plan is at: $PLAN_PATH"
|
||||
|
||||
# Run Claude with JSON output for token tracking
|
||||
LOG_FILE="$OUTPUT_DIR/claude-output.json"
|
||||
echo ">>> Running Claude..."
|
||||
echo "Prompt: $PROMPT"
|
||||
echo "Log file: $LOG_FILE"
|
||||
echo ""
|
||||
|
||||
# Run claude and capture output
|
||||
# Using stream-json to get token usage stats
|
||||
# --dangerously-skip-permissions for automated testing (subagents don't inherit parent settings)
|
||||
cd "$OUTPUT_DIR/project"
|
||||
claude -p "$PROMPT" \
|
||||
--plugin-dir "$PLUGIN_DIR" \
|
||||
--dangerously-skip-permissions \
|
||||
--output-format stream-json \
|
||||
> "$LOG_FILE" 2>&1 || true
|
||||
|
||||
# Extract final stats
|
||||
echo ""
|
||||
echo ">>> Test complete"
|
||||
echo "Project directory: $OUTPUT_DIR/project"
|
||||
echo "Claude log: $LOG_FILE"
|
||||
echo ""
|
||||
|
||||
# Show token usage if available
|
||||
if command -v jq &> /dev/null; then
|
||||
echo ">>> Token usage:"
|
||||
# Extract usage from the last message with usage info
|
||||
jq -s '[.[] | select(.type == "result")] | last | .usage' "$LOG_FILE" 2>/dev/null || echo "(could not parse usage)"
|
||||
echo ""
|
||||
fi
|
||||
|
||||
echo ">>> Next steps:"
|
||||
echo "1. Review the project: cd $OUTPUT_DIR/project"
|
||||
echo "2. Review Claude's log: less $LOG_FILE"
|
||||
echo "3. Check if tests pass:"
|
||||
if [[ "$TEST_NAME" == "go-fractals" ]]; then
|
||||
echo " cd $OUTPUT_DIR/project && go test ./..."
|
||||
elif [[ "$TEST_NAME" == "svelte-todo" ]]; then
|
||||
echo " cd $OUTPUT_DIR/project && npm test && npx playwright test"
|
||||
fi
|
||||
70
tests/subagent-driven-dev/svelte-todo/design.md
Normal file
70
tests/subagent-driven-dev/svelte-todo/design.md
Normal file
@@ -0,0 +1,70 @@
|
||||
# Svelte Todo List - Design
|
||||
|
||||
## Overview
|
||||
|
||||
A simple todo list application built with Svelte. Supports creating, completing, and deleting todos with localStorage persistence.
|
||||
|
||||
## Features
|
||||
|
||||
- Add new todos
|
||||
- Mark todos as complete/incomplete
|
||||
- Delete todos
|
||||
- Filter by: All / Active / Completed
|
||||
- Clear all completed todos
|
||||
- Persist to localStorage
|
||||
- Show count of remaining items
|
||||
|
||||
## User Interface
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────┐
|
||||
│ Svelte Todos │
|
||||
├─────────────────────────────────────────┤
|
||||
│ [________________________] [Add] │
|
||||
├─────────────────────────────────────────┤
|
||||
│ [ ] Buy groceries [x] │
|
||||
│ [✓] Walk the dog [x] │
|
||||
│ [ ] Write code [x] │
|
||||
├─────────────────────────────────────────┤
|
||||
│ 2 items left │
|
||||
│ [All] [Active] [Completed] [Clear ✓] │
|
||||
└─────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## Components
|
||||
|
||||
```
|
||||
src/
|
||||
App.svelte # Main app, state management
|
||||
lib/
|
||||
TodoInput.svelte # Text input + Add button
|
||||
TodoList.svelte # List container
|
||||
TodoItem.svelte # Single todo with checkbox, text, delete
|
||||
FilterBar.svelte # Filter buttons + clear completed
|
||||
store.ts # Svelte store for todos
|
||||
storage.ts # localStorage persistence
|
||||
```
|
||||
|
||||
## Data Model
|
||||
|
||||
```typescript
|
||||
interface Todo {
|
||||
id: string; // UUID
|
||||
text: string; // Todo text
|
||||
completed: boolean;
|
||||
}
|
||||
|
||||
type Filter = 'all' | 'active' | 'completed';
|
||||
```
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
1. Can add a todo by typing and pressing Enter or clicking Add
|
||||
2. Can toggle todo completion by clicking checkbox
|
||||
3. Can delete a todo by clicking X button
|
||||
4. Filter buttons show correct subset of todos
|
||||
5. "X items left" shows count of incomplete todos
|
||||
6. "Clear completed" removes all completed todos
|
||||
7. Todos persist across page refresh (localStorage)
|
||||
8. Empty state shows helpful message
|
||||
9. All tests pass
|
||||
222
tests/subagent-driven-dev/svelte-todo/plan.md
Normal file
222
tests/subagent-driven-dev/svelte-todo/plan.md
Normal file
@@ -0,0 +1,222 @@
|
||||
# Svelte Todo List - Implementation Plan
|
||||
|
||||
Execute this plan using the `superpowers:subagent-driven-development` skill.
|
||||
|
||||
## Context
|
||||
|
||||
Building a todo list app with Svelte. See `design.md` for full specification.
|
||||
|
||||
## Tasks
|
||||
|
||||
### Task 1: Project Setup
|
||||
|
||||
Create the Svelte project with Vite.
|
||||
|
||||
**Do:**
|
||||
- Run `npm create vite@latest . -- --template svelte-ts`
|
||||
- Install dependencies with `npm install`
|
||||
- Verify dev server works
|
||||
- Clean up default Vite template content from App.svelte
|
||||
|
||||
**Verify:**
|
||||
- `npm run dev` starts server
|
||||
- App shows minimal "Svelte Todos" heading
|
||||
- `npm run build` succeeds
|
||||
|
||||
---
|
||||
|
||||
### Task 2: Todo Store
|
||||
|
||||
Create the Svelte store for todo state management.
|
||||
|
||||
**Do:**
|
||||
- Create `src/lib/store.ts`
|
||||
- Define `Todo` interface with id, text, completed
|
||||
- Create writable store with initial empty array
|
||||
- Export functions: `addTodo(text)`, `toggleTodo(id)`, `deleteTodo(id)`, `clearCompleted()`
|
||||
- Create `src/lib/store.test.ts` with tests for each function
|
||||
|
||||
**Verify:**
|
||||
- Tests pass: `npm run test` (install vitest if needed)
|
||||
|
||||
---
|
||||
|
||||
### Task 3: localStorage Persistence
|
||||
|
||||
Add persistence layer for todos.
|
||||
|
||||
**Do:**
|
||||
- Create `src/lib/storage.ts`
|
||||
- Implement `loadTodos(): Todo[]` and `saveTodos(todos: Todo[])`
|
||||
- Handle JSON parse errors gracefully (return empty array)
|
||||
- Integrate with store: load on init, save on change
|
||||
- Add tests for load/save/error handling
|
||||
|
||||
**Verify:**
|
||||
- Tests pass
|
||||
- Manual test: add todo, refresh page, todo persists
|
||||
|
||||
---
|
||||
|
||||
### Task 4: TodoInput Component
|
||||
|
||||
Create the input component for adding todos.
|
||||
|
||||
**Do:**
|
||||
- Create `src/lib/TodoInput.svelte`
|
||||
- Text input bound to local state
|
||||
- Add button calls `addTodo()` and clears input
|
||||
- Enter key also submits
|
||||
- Disable Add button when input is empty
|
||||
- Add component tests
|
||||
|
||||
**Verify:**
|
||||
- Tests pass
|
||||
- Component renders input and button
|
||||
|
||||
---
|
||||
|
||||
### Task 5: TodoItem Component
|
||||
|
||||
Create the single todo item component.
|
||||
|
||||
**Do:**
|
||||
- Create `src/lib/TodoItem.svelte`
|
||||
- Props: `todo: Todo`
|
||||
- Checkbox toggles completion (calls `toggleTodo`)
|
||||
- Text with strikethrough when completed
|
||||
- Delete button (X) calls `deleteTodo`
|
||||
- Add component tests
|
||||
|
||||
**Verify:**
|
||||
- Tests pass
|
||||
- Component renders checkbox, text, delete button
|
||||
|
||||
---
|
||||
|
||||
### Task 6: TodoList Component
|
||||
|
||||
Create the list container component.
|
||||
|
||||
**Do:**
|
||||
- Create `src/lib/TodoList.svelte`
|
||||
- Props: `todos: Todo[]`
|
||||
- Renders TodoItem for each todo
|
||||
- Shows "No todos yet" when empty
|
||||
- Add component tests
|
||||
|
||||
**Verify:**
|
||||
- Tests pass
|
||||
- Component renders list of TodoItems
|
||||
|
||||
---
|
||||
|
||||
### Task 7: FilterBar Component
|
||||
|
||||
Create the filter and status bar component.
|
||||
|
||||
**Do:**
|
||||
- Create `src/lib/FilterBar.svelte`
|
||||
- Props: `todos: Todo[]`, `filter: Filter`, `onFilterChange: (f: Filter) => void`
|
||||
- Show count: "X items left" (incomplete count)
|
||||
- Three filter buttons: All, Active, Completed
|
||||
- Active filter is visually highlighted
|
||||
- "Clear completed" button (hidden when no completed todos)
|
||||
- Add component tests
|
||||
|
||||
**Verify:**
|
||||
- Tests pass
|
||||
- Component renders count, filters, clear button
|
||||
|
||||
---
|
||||
|
||||
### Task 8: App Integration
|
||||
|
||||
Wire all components together in App.svelte.
|
||||
|
||||
**Do:**
|
||||
- Import all components and store
|
||||
- Add filter state (default: 'all')
|
||||
- Compute filtered todos based on filter state
|
||||
- Render: heading, TodoInput, TodoList, FilterBar
|
||||
- Pass appropriate props to each component
|
||||
|
||||
**Verify:**
|
||||
- App renders all components
|
||||
- Adding todos works
|
||||
- Toggling works
|
||||
- Deleting works
|
||||
|
||||
---
|
||||
|
||||
### Task 9: Filter Functionality
|
||||
|
||||
Ensure filtering works end-to-end.
|
||||
|
||||
**Do:**
|
||||
- Verify filter buttons change displayed todos
|
||||
- 'all' shows all todos
|
||||
- 'active' shows only incomplete todos
|
||||
- 'completed' shows only completed todos
|
||||
- Clear completed removes completed todos and resets filter if needed
|
||||
- Add integration tests
|
||||
|
||||
**Verify:**
|
||||
- Filter tests pass
|
||||
- Manual verification of all filter states
|
||||
|
||||
---
|
||||
|
||||
### Task 10: Styling and Polish
|
||||
|
||||
Add CSS styling for usability.
|
||||
|
||||
**Do:**
|
||||
- Style the app to match the design mockup
|
||||
- Completed todos have strikethrough and muted color
|
||||
- Active filter button is highlighted
|
||||
- Input has focus styles
|
||||
- Delete button appears on hover (or always on mobile)
|
||||
- Responsive layout
|
||||
|
||||
**Verify:**
|
||||
- App is visually usable
|
||||
- Styles don't break functionality
|
||||
|
||||
---
|
||||
|
||||
### Task 11: End-to-End Tests
|
||||
|
||||
Add Playwright tests for full user flows.
|
||||
|
||||
**Do:**
|
||||
- Install Playwright: `npm init playwright@latest`
|
||||
- Create `tests/todo.spec.ts`
|
||||
- Test flows:
|
||||
- Add a todo
|
||||
- Complete a todo
|
||||
- Delete a todo
|
||||
- Filter todos
|
||||
- Clear completed
|
||||
- Persistence (add, reload, verify)
|
||||
|
||||
**Verify:**
|
||||
- `npx playwright test` passes
|
||||
|
||||
---
|
||||
|
||||
### Task 12: README
|
||||
|
||||
Document the project.
|
||||
|
||||
**Do:**
|
||||
- Create `README.md` with:
|
||||
- Project description
|
||||
- Setup: `npm install`
|
||||
- Development: `npm run dev`
|
||||
- Testing: `npm test` and `npx playwright test`
|
||||
- Build: `npm run build`
|
||||
|
||||
**Verify:**
|
||||
- README accurately describes the project
|
||||
- Instructions work
|
||||
46
tests/subagent-driven-dev/svelte-todo/scaffold.sh
Executable file
46
tests/subagent-driven-dev/svelte-todo/scaffold.sh
Executable file
@@ -0,0 +1,46 @@
|
||||
#!/bin/bash
|
||||
# Scaffold the Svelte Todo test project
|
||||
# Usage: ./scaffold.sh /path/to/target/directory
|
||||
|
||||
set -e
|
||||
|
||||
TARGET_DIR="${1:?Usage: $0 <target-directory>}"
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
|
||||
# Create target directory
|
||||
mkdir -p "$TARGET_DIR"
|
||||
cd "$TARGET_DIR"
|
||||
|
||||
# Initialize git repo
|
||||
git init
|
||||
|
||||
# Copy design and plan
|
||||
cp "$SCRIPT_DIR/design.md" .
|
||||
cp "$SCRIPT_DIR/plan.md" .
|
||||
|
||||
# Create .claude settings to allow reads/writes in this directory
|
||||
mkdir -p .claude
|
||||
cat > .claude/settings.local.json << 'SETTINGS'
|
||||
{
|
||||
"permissions": {
|
||||
"allow": [
|
||||
"Read(**)",
|
||||
"Edit(**)",
|
||||
"Write(**)",
|
||||
"Bash(npm:*)",
|
||||
"Bash(npx:*)",
|
||||
"Bash(mkdir:*)",
|
||||
"Bash(git:*)"
|
||||
]
|
||||
}
|
||||
}
|
||||
SETTINGS
|
||||
|
||||
# Create initial commit
|
||||
git add .
|
||||
git commit -m "Initial project setup with design and plan"
|
||||
|
||||
echo "Scaffolded Svelte Todo project at: $TARGET_DIR"
|
||||
echo ""
|
||||
echo "To run the test:"
|
||||
echo " claude -p \"Execute this plan using superpowers:subagent-driven-development. Plan: $TARGET_DIR/plan.md\" --plugin-dir /path/to/superpowers"
|
||||
Reference in New Issue
Block a user