# superpowers/evals/scenarios/codex-tool-mapping-comprehension.yaml

# Scenario identity: a short name plus a human-readable summary of what the
# run measures (whether the agent reports the Codex mapping for Claude Code
# Task result collection).
# NOTE(review): the accepted values of user_posture and their effect are
# defined by the eval harness, which is not visible in this file — confirm.
scenario: codex-tool-mapping-comprehension
description: >
  Measures whether a Codex agent correctly reports the Superpowers Codex
  mapping for Claude Code Task result collection.
user_posture: spec-aware

# Fixture preparation for the scenario.
# NOTE(review): create_base_repo is a harness-provided helper; what exactly it
# creates is not visible here — confirm against the helper's definition.
setup:
  helpers:
    - create_base_repo
  assertions:
    # The working directory must be inside a git work tree.
    - 'git rev-parse --is-inside-work-tree'
    # The mapping file that the first turn asks the agent to read must exist.
    - 'test -f .agents/skills/superpowers/using-superpowers/references/codex-tools.md'

# Instructions for whoever drives the Codex session (presumably the harness's
# simulated user — confirm against the eval runner). Each intent is a folded
# scalar: line breaks inside it fold to spaces, and a blank line becomes a
# single line break.
turns:
  # Turn 1: deliver the quoted prompt verbatim. The prompt tells the agent to
  # read codex-tools.md and return one JSON object with the keys
  # task_returns_result and wait_tool_scope; those key names are what the
  # verify assertions grep for in session.log.
  - intent: >
      When Codex is at its input prompt, type this exact message and press
      Enter. Do not type a placeholder, a period, or a shorter paraphrase:

      "Read .agents/skills/superpowers/using-superpowers/references/codex-tools.md.
      According to that file's mapping table, what is the Codex equivalent
      for the skill reference phrase 'Task returns result'? Do not perform
      any subagent workflow. Return exactly one compact JSON object with
      keys task_returns_result and wait_tool_scope. The task_returns_result
      value must be exactly the mapped tool name. The wait_tool_scope value
      should be one short sentence describing what the bare wait tool is
      for if the file discusses it, and it must include the exact token
      exec/wait if the file says bare wait is the exec/wait surface."
  # Follow-up: how to handle a clarifying question, and the stop condition
  # (a JSON object containing both keys has been returned).
  - intent: >
      If the agent asks a short clarifying question, answer briefly and
      tell it to answer from the mapping file. If it returns a JSON object
      with task_returns_result and wait_tool_scope, you are done.

# Upper bounds for a single run of this scenario.
# NOTE(review): turn_timeout is presumably in seconds — confirm against the
# eval harness, which defines the unit.
limits:
  max_turns: 8
  turn_timeout: 180

# Post-run checks. `assertions` are shell commands run against session.log;
# `criteria` are natural-language statements (presumably scored by a grader —
# confirm against the eval harness).
# NOTE(review): the effect of `observe: true` is defined by the harness and is
# not visible in this file.
verify:
  assertions:
    # session.log must contain "task_returns_result": "wait_agent"
    # (optional whitespace around the colon).
    - "grep -Eq '\"task_returns_result\"[[:space:]]*:[[:space:]]*\"wait_agent\"' session.log"
    # session.log must NOT contain "task_returns_result": "wait", i.e. the
    # bare wait tool given as the mapped tool name.
    - "! grep -Eq '\"task_returns_result\"[[:space:]]*:[[:space:]]*\"wait\"' session.log"
    # The "wait_tool_scope" key must be followed on the same line by the
    # token exec/wait. grep matches line by line, so `.*` already stays
    # within one line. A `[^\n]` bracket expression must not be used here:
    # inside a POSIX bracket the backslash is literal, so it would exclude
    # the characters `\` and `n` and reject valid answers containing them.
    - "grep -Eq '\"wait_tool_scope\".*exec/wait' session.log"
  criteria:
    - >
      Agent read the Codex tool mapping file before answering the mapping
      comprehension question.
    - >
      Agent answered that Task returns result maps to wait_agent.
    - >
      Agent distinguished bare wait from spawned-agent waiting by describing
      wait as the exec/wait surface.
  observe: true