From ec9b96a7bf5496a67b9449c5da6592839c808fe1 Mon Sep 17 00:00:00 2001 From: Drew Ritter Date: Wed, 6 May 2026 15:09:59 -0700 Subject: [PATCH] evals: add Gemini 2.5 Flash backend --- evals/README.md | 3 ++- evals/backends/gemini-2-5-flash.yaml | 23 +++++++++++++++++++++++ evals/backends/gemini.yaml | 2 +- evals/tests/test_backend.py | 11 +++++++++++ 4 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 evals/backends/gemini-2-5-flash.yaml diff --git a/evals/README.md b/evals/README.md index c74985ae..1a97d60a 100644 --- a/evals/README.md +++ b/evals/README.md @@ -64,7 +64,8 @@ uv run drill list | `claude-opus-4-6-1m` | Claude Code | opus-4-6 (1M context) | | `claude-opus-4-7-1m` | Claude Code | opus-4-7 (1M context) | | `codex` | Codex CLI | — | -| `gemini` | Gemini CLI | — | +| `gemini` | Gemini CLI | auto-gemini-3 | +| `gemini-2-5-flash` | Gemini CLI | gemini-2.5-flash | ## Project structure diff --git a/evals/backends/gemini-2-5-flash.yaml b/evals/backends/gemini-2-5-flash.yaml new file mode 100644 index 00000000..7a4e7218 --- /dev/null +++ b/evals/backends/gemini-2-5-flash.yaml @@ -0,0 +1,23 @@ +name: gemini-2-5-flash +cli: gemini +args: + - "--yolo" + - "-m" + - "gemini-2.5-flash" +required_env: [] +hooks: + pre_run: + - link_gemini_extension + post_run: [] +shutdown: "/exit" +idle: + quiescence_seconds: 5 + ready_pattern: "Type your message|^\\s*>" +busy_pattern: "Thinking\\.\\.\\.|Executing" +startup_timeout: 60 +turn_timeout: 300 +terminal: + cols: 200 + rows: 50 +session_logs: + pattern: "~/.gemini/tmp/*/chats/session-*.json" diff --git a/evals/backends/gemini.yaml b/evals/backends/gemini.yaml index 252c5efe..54d0979d 100644 --- a/evals/backends/gemini.yaml +++ b/evals/backends/gemini.yaml @@ -3,7 +3,7 @@ cli: gemini args: - "--yolo" - "-m" - - "gemini-2.5-flash" + - "auto-gemini-3" required_env: [] hooks: pre_run: diff --git a/evals/tests/test_backend.py b/evals/tests/test_backend.py index ac1d0343..f84742a1 100644 --- a/evals/tests/test_backend.py +++ b/evals/tests/test_backend.py @@ -33,6 +33,17 @@ class TestLoadBackend: assert backend.family == "claude" assert backend.model == "claude-opus-4-6" + def test_loads_gemini_default_and_flash_variant(self, backends_dir): + backend = load_backend("gemini", backends_dir) + assert backend.name == "gemini" + assert backend.family == "gemini" + assert backend.model == "auto-gemini-3" + + flash_backend = load_backend("gemini-2-5-flash", backends_dir) + assert flash_backend.name == "gemini-2-5-flash" + assert flash_backend.family == "gemini" + assert flash_backend.model == "gemini-2.5-flash" + class TestBackendBuildCommand: def test_claude_build_command(self, backends_dir, monkeypatch):