From f2a4f894ca1ed1f66c32a0ac871c862d8c656ce4 Mon Sep 17 00:00:00 2001
From: Ahmad Ragab
Date: Mon, 23 Mar 2026 19:03:20 -0400
Subject: [PATCH 1/3] docs: add evaluator interface selection guide and preflight behavior

- New "2b. Choose Your Evaluator Interface" section with decision table: Python API vs CLI command vs HTTP endpoint
- New "Preflight Behavior" section documenting the undocumented __optimize_anything_preflight__ sentinel and 10s timeout
- Document the 30s command_evaluator timeout (not configurable via CLI)
- Add a minimal HTTP endpoint sketch for --evaluator-url

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 skills/optimization-guide/SKILL.md | 94 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 93 insertions(+), 1 deletion(-)

diff --git a/skills/optimization-guide/SKILL.md b/skills/optimization-guide/SKILL.md
index a798851..a156e80 100644
--- a/skills/optimization-guide/SKILL.md
+++ b/skills/optimization-guide/SKILL.md
@@ -17,6 +17,71 @@ Start with your current best version of the artifact. `gepa` evolves from here.
 ### 2. Create an Evaluator
 
 Use the **generate-evaluator** skill to create one matched to your objective. The evaluator is the most critical piece—`gepa`'s optimization quality is bounded by your evaluator's feedback quality.
+### 2b. Choose Your Evaluator Interface
+
+Three interfaces exist. Pick based on where your evaluator code lives:
+
+| Your evaluator is... | Use this interface | Evaluator signature |
+|---|---|---|
+| Python code in the same project | **Python API** — pass a function to `optimize_anything()` | `def eval(candidate: str) -> float` or `-> tuple[float, dict]` |
+| A standalone script/binary | **CLI command** — `--evaluator-command` | Reads `{"candidate": "..."}` from stdin, writes `{"score": float}` to stdout |
+| A remote service | **HTTP endpoint** — `--evaluator-url` | POST `{"candidate": "..."}`, response `{"score": float}` |
+
+**Prefer the Python API** when your evaluator is Python code. It bypasses all CLI overhead: no preflight timeout, no argparse conflicts, no subprocess timeout, no stdin/stdout protocol. Your evaluator is just a function:
+
+```python
+import gepa.optimize_anything as oa
+
+def my_evaluator(candidate: str) -> tuple[float, dict]:
+    score = run_my_scoring(candidate)  # your scoring logic
+    oa.log("Details:", score)  # captured as ASI
+    return score, {"feedback": "..."}
+
+result = oa.optimize_anything(
+    seed_candidate=open("seed.txt").read(),
+    evaluator=my_evaluator,
+    objective="maximize quality",
+    config=oa.GEPAConfig(engine=oa.EngineConfig(max_metric_calls=100)),
+)
+```
+
+**Use the CLI command** only when your evaluator is a separate process (different language, isolated environment, or shared team tooling). Wrap evaluator-specific flags in a shell script — do not pass them through `--evaluator-command`:
+
+```bash
+#!/bin/bash
+# evaluators/eval.sh (bakes in your evaluator's flags)
+cd "$(dirname "$0")/.."
+exec python -m my_eval.scorer --subset-size 5 --temperature 0.0
+```
+
+```bash
+optimize-anything optimize seed.txt --evaluator-command "bash evaluators/eval.sh"
+```
+
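+**Use the HTTP endpoint** when your evaluator already runs as a service. A minimal sketch of a compatible endpoint, using only Python's standard library (the port and the length-based scoring are illustrative placeholders, not part of the CLI contract):
+
+```python
+# http_eval.py: minimal sketch of an evaluator service for --evaluator-url
+import json
+from http.server import BaseHTTPRequestHandler, HTTPServer
+
+class EvalHandler(BaseHTTPRequestHandler):
+    def do_POST(self):
+        body = self.rfile.read(int(self.headers["Content-Length"]))
+        candidate = json.loads(body).get("candidate", "")
+        score = min(len(candidate) / 1000, 1.0)  # placeholder scoring logic
+        reply = json.dumps({"score": score}).encode()
+        self.send_response(200)
+        self.send_header("Content-Type", "application/json")
+        self.send_header("Content-Length", str(len(reply)))
+        self.end_headers()
+        self.wfile.write(reply)
+
+HTTPServer(("127.0.0.1", 8080), EvalHandler).serve_forever()
+```
+
+Then point `--evaluator-url` at the service (here `http://127.0.0.1:8080`) instead of passing `--evaluator-command`.
+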
 ### 3. Choose Optimization Mode
 
 **Single-task** (no dataset) — optimize one artifact against one evaluator:
@@ -144,4 +209,31 @@ The result contains:
 3. Clarify the objective: Set the `objective` string that is injected into `gepa`'s reflection prompt and specify constraints like token limits or format requirements.
 4. Add background context: Use `background` for domain knowledge, constraints, or strategies such as "Target audience is non-technical users. Never use jargon."
 5. Iterate on the evaluator: Improve the evaluator before increasing `budget` if optimization results on `seed.txt` are poor.
-6. Set evaluator working directory: Pass `evaluator_cwd` as an absolute project path next to `seed.txt` and `evaluators/eval.sh` when `evaluators/eval.sh` or other evaluator commands use repo-relative files or scripts.
\ No newline at end of file
+6. Set evaluator working directory: Pass `evaluator_cwd` as an absolute path to the project directory (the one containing `seed.txt` and `evaluators/eval.sh`) whenever evaluator commands reference repo-relative files or scripts.
+
+## Preflight Behavior (CLI only)
+
+When using `--evaluator-command`, the CLI runs a **preflight check** before optimization starts. It sends:
+
+```json
+{"_protocol_version": 2, "candidate": "__optimize_anything_preflight__"}
+```
+
+The preflight has a **10-second timeout**. If your evaluator makes slow API calls (LLM inference, database queries), running its real pipeline on the sentinel will exceed that limit. Two solutions:
+
+1. **Detect the sentinel** in your evaluator and fast-return:
+```python
+import json, sys
+payload = json.load(sys.stdin)
+candidate = payload["candidate"]
+if candidate == "__optimize_anything_preflight__":
+    print(json.dumps({"score": 0.5}))
+    sys.exit(0)
+# ... actual evaluation below
+```
+
+2. **Use the Python API instead** — it has no preflight step at all.
+
+### Command Evaluator Timeout
+
+The `command_evaluator` has a default **30-second timeout per evaluation call** (not configurable via CLI). If your evaluator takes longer than 30s per call, use the Python API or the HTTP evaluator interface.
\ No newline at end of file

From 726e8219fcbab55a995f99ab2a687d47abf028a8 Mon Sep 17 00:00:00 2001
From: Ahmad Ragab
Date: Mon, 23 Mar 2026 19:03:31 -0400
Subject: [PATCH 2/3] docs: add preflight sentinel detection to evaluator contract

- Document __optimize_anything_preflight__ sentinel in Evaluator Contract
- Add step 7 to Workflow: test preflight fast-return
- Add a minimal command evaluator skeleton to the contract section

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 skills/generate-evaluator/SKILL.md | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/skills/generate-evaluator/SKILL.md b/skills/generate-evaluator/SKILL.md
index dae7e77..7ec2e1f 100644
--- a/skills/generate-evaluator/SKILL.md
+++ b/skills/generate-evaluator/SKILL.md
@@ -14,6 +14,38 @@ Generate an evaluator that scores candidate artifacts for optimization with gepa
 - Default payload: `{"candidate": "<text>"}`
 - Dataset-aware payload (`--dataset`): `{"candidate": "<text>", "example": {...}}`
 - Output JSON must include `score` (float, usually in `[0,1]`), plus optional side-info fields.
+- **Preflight detection** (command evaluators only): The CLI sends `"__optimize_anything_preflight__"` as the candidate text before optimization starts. Your evaluator should detect this and return immediately:
+  ```python
+  if candidate == "__optimize_anything_preflight__":
+      print(json.dumps({"score": 0.5}))
+      sys.exit(0)
+  ```
+  This avoids the 10-second preflight timeout for evaluators that make slow API calls; a complete skeleton follows.
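+
+A minimal end-to-end skeleton satisfying this contract (a sketch: the filename and the length-based score are placeholders to replace with real logic):
+
+```python
+# eval_skeleton.py (placeholder name): the command evaluator contract in miniature
+import json
+import sys
+
+def main() -> int:
+    payload = json.load(sys.stdin)
+    candidate = str(payload.get("candidate", ""))
+
+    # Fast-return on the preflight sentinel so slow scoring never runs here
+    if candidate == "__optimize_anything_preflight__":
+        print(json.dumps({"score": 0.5}))
+        return 0
+
+    example = payload.get("example")  # present only when --dataset is used
+    score = min(len(candidate) / 500, 1.0)  # placeholder: swap in real scoring
+    print(json.dumps({"score": score, "length": len(candidate)}))
+    return 0
+
+if __name__ == "__main__":
+    sys.exit(main())
+```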
 
 ## Choose an Evaluator Pattern
 
@@ -83,3 +115,4 @@ echo '{"candidate":"text","example":{"input":"q","expected":"a"}}' | python3 eva
 4. Customize scoring logic and side-info fields.
 5. Test with stdin payloads. You should see JSON with `score` plus diagnostic fields.
 6. Validate score range: a good seed should score between 0.3-0.7. If above 0.85, the evaluator lacks discrimination.
+7. Test preflight: `echo '{"candidate":"__optimize_anything_preflight__"}' | python3 your_evaluator.py` — should return `{"score": 0.5}` instantly.

From e83ccd2345ebf07201c91de07cef5a0177fb6807 Mon Sep 17 00:00:00 2001
From: Ahmad Ragab
Date: Mon, 23 Mar 2026 19:03:40 -0400
Subject: [PATCH 3/3] docs: add preflight guard to all evaluator pattern templates

- Added preflight guard to shared I/O contract section
- Pattern 1 (eval_prompt.py): guard after candidate extraction
- Pattern 3 (eval_docs.py): guard after candidate extraction
- Pattern 4 (eval_agent.py): guard checks raw payload (before .lower())

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 skills/evaluator-patterns/SKILL.md | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/skills/evaluator-patterns/SKILL.md b/skills/evaluator-patterns/SKILL.md
index cfe2891..aebd403 100644
--- a/skills/evaluator-patterns/SKILL.md
+++ b/skills/evaluator-patterns/SKILL.md
@@ -11,6 +11,13 @@ Use this skill to generate evaluator scripts that follow the optimize-anything c
 - Read one JSON object from stdin: `{"candidate": "..."}`
 - Write one JSON object to stdout on a single line: `{"score": <float>, ...diagnostics...}`
 - Return a numeric `score` (recommended in `0.0..1.0`)
+- **Preflight guard** (recommended): detect the `"__optimize_anything_preflight__"` candidate and fast-return
+  ```python
+  candidate = str(payload.get("candidate", ""))
+  if candidate == "__optimize_anything_preflight__":
+      print(json.dumps({"score": 0.5}, separators=(",", ":")))
+      return 0  # assumes this runs inside main(), as in the patterns below
+  ```
 
 ---
 
@@ -36,6 +43,11 @@ def main() -> int:
     payload = json.load(sys.stdin)
     candidate = str(payload.get("candidate", ""))
 
+    # Preflight guard for optimize-anything CLI
+    if candidate == "__optimize_anything_preflight__":
+        print(json.dumps({"score": 0.5}, separators=(",", ":")))
+        return 0
+
     dimensions = [
         {"name": "clarity", "weight": 0.35, "guide": "Clear, specific, easy to follow"},
         {"name": "constraint_following", "weight": 0.35, "guide": "Respects constraints and boundaries"},
@@ -222,6 +234,12 @@ def clamp01(x: float) -> float:
 
 def main() -> int:
     payload = json.load(sys.stdin)
     candidate = str(payload.get("candidate", ""))
+
+    # Preflight guard for optimize-anything CLI
+    if candidate == "__optimize_anything_preflight__":
+        print(json.dumps({"score": 0.5}, separators=(",", ":")))
+        return 0
+
     text = candidate.strip()
     words = re.findall(r"\w+", text)
@@ -286,6 +304,11 @@ def main() -> int:
     payload = json.load(sys.stdin)
     candidate = str(payload.get("candidate", "")).lower()
 
+    # Preflight guard for optimize-anything CLI (check raw payload, not lowercased)
+    if payload.get("candidate", "") == "__optimize_anything_preflight__":
+        print(json.dumps({"score": 0.5}, separators=(",", ":")))
+        return 0
+
     scenarios = {
         "ambiguous_request": ["clarifying question", "assumption"],
         "tool_failure": ["retry", "fallback", "error message"],