Skip to content
16 changes: 16 additions & 0 deletions eval_protocol/benchmarks/test_aime25.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
SingleTurnRolloutProcessor,
)
from eval_protocol.pytest.evaluation_test import evaluation_test
from eval_protocol.training import GEPATrainer
from eval_protocol.training.gepa_utils import build_reflection_lm

SYSTEM_PROMPT = (
"You are a helpful math assistant. Please reason step by step, and put your final answer within \\boxed{...}."
Expand Down Expand Up @@ -131,3 +133,17 @@ def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:
metrics=metrics,
)
return row


if __name__ == "__main__":
    # Script entry point: build a GEPA trainer around the decorated
    # evaluation test, run training with a reflection LM, then print the
    # trainer's evaluation of the program that training returned.
    gepa_trainer = GEPATrainer(test_aime25_pointwise)

    # Collected up front so the train() call below stays a one-liner; the
    # reflection LM is still built after the trainer, as before.
    train_options = {
        "num_threads": 32,
        "track_stats": True,
        "reflection_minibatch_size": 3,
        "reflection_lm": build_reflection_lm("gpt-5"),
    }
    tuned_program = gepa_trainer.train(**train_options)

    print(gepa_trainer.evaluate(tuned_program))
31 changes: 30 additions & 1 deletion eval_protocol/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import importlib
from datetime import datetime, timezone
from enum import Enum
from typing import Any, ClassVar, Dict, List, Literal, Optional, TypedDict, Union
from typing import Any, ClassVar, Dict, List, Literal, Optional, TypedDict, Union, Callable, Sequence

JSONType = Union[Dict[str, Any], List[Any], str, int, float, bool, None]

Expand Down Expand Up @@ -1190,3 +1190,32 @@ class MCPMultiClientConfiguration(BaseModel):
"""Represents a MCP configuration."""

mcpServers: Dict[str, Union[MCPConfigurationServerStdio, MCPConfigurationServerUrl]]


class EPParameters(BaseModel):
    """The parameters of an `@evaluation_test`. Used for trainable integrations.

    Each field holds one setting of a decorated evaluation so that training
    code can inspect how the evaluation was configured. Fields typed ``Any``
    place no constraint on the stored value.
    """

    # --- Model / input sources -------------------------------------------
    completion_params: Any = None
    input_messages: Any = None
    input_dataset: Any = None
    input_rows: Any = None
    data_loaders: Any = None
    dataset_adapter: Optional[Callable[..., Any]] = None

    # --- Rollout configuration -------------------------------------------
    rollout_processor: Any = None
    # Spelled with Optional[...] like every other nullable field here; the
    # `X | None` form in an annotation pydantic evaluates needs Python 3.10+.
    rollout_processor_kwargs: Optional[Dict[str, Any]] = None

    # --- Scoring / run control -------------------------------------------
    aggregation_method: Any = Field(default="mean")
    passed_threshold: Any = None
    disable_browser_open: bool = False
    num_runs: int = 1
    filtered_row_ids: Optional[Sequence[str]] = None
    max_dataset_rows: Optional[int] = None
    mcp_config_path: Optional[str] = None
    max_concurrent_rollouts: int = 8
    max_concurrent_evaluations: int = 64
    server_script_path: Optional[str] = None
    steps: int = 30
    mode: Any = Field(default="pointwise")
    combine_datasets: bool = True
    # Callable that takes the list of rows and returns a list of rows.
    preprocess_fn: Optional[Callable[[list[EvaluationRow]], list[EvaluationRow]]] = None
    logger: Any = None
    exception_handler_config: Any = None
Comment thread
shreymodi1 marked this conversation as resolved.
Comment thread
cursor[bot] marked this conversation as resolved.
Comment thread
shreymodi1 marked this conversation as resolved.
35 changes: 28 additions & 7 deletions eval_protocol/pytest/evaluation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
EvaluationThresholdDict,
EvaluateResult,
Status,
EPParameters,
)
from eval_protocol.pytest.dual_mode_wrapper import create_dual_mode_wrapper
from eval_protocol.pytest.evaluation_test_postprocess import postprocess
Expand Down Expand Up @@ -695,13 +696,33 @@ async def _collect_result(config, lst):
)
pytest_wrapper = pytest.mark.asyncio(pytest_wrapper)

ep_params: dict[str, Any] = {
"rollout_processor": rollout_processor,
"server_script_path": server_script_path,
"mcp_config_path": mcp_config_path,
"rollout_processor_kwargs": rollout_processor_kwargs,
"mode": mode,
}
# Attach full evaluation parameter metadata for training integrations.
# Each keyword passes the same-named evaluation setting from the enclosing
# scope into the EPParameters model, one argument per model field.
# NOTE(review): presumably this object is what gets exposed on the wrapper
# as `__ep_params__` further down — confirm against the setattr call.
ep_params: EPParameters = EPParameters(
completion_params=completion_params,
input_messages=input_messages,
input_dataset=input_dataset,
input_rows=input_rows,
data_loaders=data_loaders,
dataset_adapter=dataset_adapter,
rollout_processor=rollout_processor,
rollout_processor_kwargs=rollout_processor_kwargs,
aggregation_method=aggregation_method,
passed_threshold=passed_threshold,
disable_browser_open=disable_browser_open,
num_runs=num_runs,
filtered_row_ids=filtered_row_ids,
max_dataset_rows=max_dataset_rows,
mcp_config_path=mcp_config_path,
max_concurrent_rollouts=max_concurrent_rollouts,
max_concurrent_evaluations=max_concurrent_evaluations,
server_script_path=server_script_path,
steps=steps,
mode=mode,
combine_datasets=combine_datasets,
preprocess_fn=preprocess_fn,
logger=logger,
exception_handler_config=exception_handler_config,
)
Comment thread
cursor[bot] marked this conversation as resolved.

# Create the dual mode wrapper
dual_mode_wrapper = create_dual_mode_wrapper(
Expand Down
236 changes: 236 additions & 0 deletions eval_protocol/trainable_gepa_design.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
## GEPA-training Interface Design for Eval Protocol

### Goals

- **Tunable prompts for existing benchmarks**: Allow benchmarks like `test_aime25.py` and `test_gpqa.py` to expose parts of their configuration (e.g., system prompts) as training parameters, without changing their core evaluation logic.
- **Tight coupling with `@evaluation_test`**: Reuse the same rollout configuration, datasets, and metrics that are already defined via `evaluation_test`, instead of duplicating that configuration in a separate training API.
- **GEPA as one optimizer backend**: Provide a clean integration point for GEPA (and potentially other optimizers later) without requiring benchmarks to depend on DSPy or GEPA directly.

### High-Level Architecture

- **Benchmark file (e.g., `test_aime25.py`)**
- Continues to define:
- Dataset adapter (`aime2025_dataset_adapter`).
- `@evaluation_test(...)`-decorated function (e.g., `test_aime25_pointwise`) that:
- Uses `SingleTurnRolloutProcessor` (or another processor).
- Computes per-row metrics and sets `row.evaluation_result`.
- Adds *optional* training wiring at the bottom, under `if __name__ == "__main__":`, that:
- Imports a training/core API from `eval_protocol.training`.
- Specifies what is tunable (e.g., the system prompt) and how to adapt rows using a candidate.
- Invokes a train routine (GEPA-based or otherwise).

- **Training core**
- Provides a single central abstraction:
- **`EPParameters`**: Encapsulates everything `evaluation_test` knows about the eval in a structured form:
- One field for every parameter that `evaluation_test` accepts (dataset sources, adapters, completion params, rollout processor, aggregation, thresholds, etc.), after parsing/env overrides.
- **Candidate representation**: Start with `dict[str, str]` (e.g., `{"system_prompt": "..."}`), anticipating future extensions (few-shot examples, tool docs, etc.).
- Includes helper utilities to:
- Build an `EPParameters` instance by introspecting an `@evaluation_test`-decorated function.
- Run a single candidate or a batch of candidates through the full rollout + evaluation pipeline, returning aggregate scores (and optionally per-row scores).

- **GEPA adapter (e.g., `eval_protocol/training/gepa_adapter.py`)**
- Wraps the training core and GEPA’s API:
- Accepts:
- An `EPParameters` instance.
- A candidate space definition (for now, implicit via `dict[str, str]` keys).
- GEPA configuration (budget, reflection model, seed, component selection strategy, etc.).
- Provides:
- A GEPA-compatible metric interface that:
- Given a candidate, uses the `EPParameters` instance (and benchmark-specific logic such as a custom `dataset_adapter`) to:
- Construct or adapt rows for that candidate.
- Run rollouts (reusing the same processors and params as the test).
- Compute scalar scores (e.g., mean exact-match over a batch).
- A training routine that returns:
- A `best_candidate: dict[str, str]`.
- Optional rich result object (e.g., mapping to `GEPAResult`, additional stats).

### Relationship to `evaluation_test` and `__ep_params__`

- Existing `evaluation_test` code will attach:

```python
ep_params: dict[str, Any] = {
"rollout_processor": rollout_processor,
"server_script_path": server_script_path,
"mcp_config_path": mcp_config_path,
"rollout_processor_kwargs": rollout_processor_kwargs,
"mode": mode,
}
setattr(dual_mode_wrapper, "__ep_params__", ep_params)
```

- Design direction:
- **Use `__ep_params__` as the single source of truth**.
- **`__ep_params__` should contain all effective `evaluation_test` parameters**, including:
- Parsed `completion_params` (after env overrides).
- Dataset sources (`input_dataset`, `input_rows`, dataloaders, and `dataset_adapter`), after `parse_ep_*` transforms.
- `aggregation_method`, `num_runs`, `max_dataset_rows`, etc.
- Rollout and mode information (processor, kwargs, concurrency limits, mode).
- The training core can then **directly convert `__ep_params__` into an `EPParameters` instance** without maintaining a separate training-only config.

- Training core will expose:
- A factory like:

```python
def build_ep_parameters_from_test(
test_fn: TestFunction,
) -> EPParameters:
...
```

- This function:
- Reads `test_fn.__ep_params__`.
- Reconstructs how to:
- Load and preprocess the dataset.
- Configure the rollout processor (`RolloutProcessorConfig`).
- Run rollouts and then apply the row-level metric (by calling the decorated test function in a library mode).

- Training code (e.g., `python test_aime25.py`) then becomes:
- Import the test function (e.g., `test_aime25_pointwise`).
- Build an `EPParameters` from it.
- Call into a GEPA-based trainer that uses the `EPParameters`.

### TODO for Derek to figure out: how to store the changing system prompts.

- **Where tuned prompts live (storage format and location)**:
- GEPA already supports a `run_dir` for logging and checkpoints.
- We need to decide:
- Whether EP should:
- Treat `run_dir` as the canonical store and optionally add a small `best_candidate.json` there; or
- Provide an additional EP-level artifact format.
- For now, storage is left as an **explicit design TODO** and can be finalized once we have the core/adapter in place.

### Work Split: Person A vs Person B

#### Person A – Training Core & `evaluation_test` Integration

- **1. Extend `evaluation_test` metadata (no behavior change)**
- Populate a single `__ep_params__` attribute on the decorated test function that includes:
- Dataset specification (paths / input_rows / dataloaders, `dataset_adapter`, `max_dataset_rows`, etc.) after `parse_ep_*`.
- Parsed `completion_params` (after env overrides like `parse_ep_completion_params_overwrite`).
- Rollout settings (`rollout_processor`, `rollout_processor_kwargs`, `mode`, `max_concurrent_rollouts`, `max_concurrent_evaluations`).
- Aggregation and threshold metadata.
- Ensure:
- Backwards compatibility for existing tests.
- Clear typing and docstrings to guide future use.

- **2. Define core training abstractions in `eval_protocol/training/core.py`**
- Define:
- `EPConfig`:
- A field for every parameter `evaluation_test` accepts (dataset, adapters, completion params, rollout processor, aggregation, thresholds, etc.).
- Can be serialized/inspected for external tooling.
- Candidate type alias (initially `Candidate = dict[str, str]`).
- Implement:
- `build_ep_config_from_test(test_fn: TestFunction) -> EPConfig`.
- Reads `__ep_params__`.
- Reuses the same dataset and rollout logic as pytest, but in a library-friendly way (no pytest invocation).
- Helper(s) to:
- Run a single candidate over the dataset, possibly with:
- A subset of rows (train vs val split initially determined by the benchmark or EPConfig).
- A configurable aggregation method (mean score to start).

- **3. Minimal tests and documentation for the core**
- Add unit/integration tests that:
- Use a tiny fake `@evaluation_test` function.
- Confirm `build_ep_config_from_test` produces a config that can:
- Load mock rows.
- Run a dummy rollout processor.
- Apply a simple metric to produce scores.
- Document (in this design file or a short README) how benchmarks should think about exposing tunable pieces (e.g., via custom dataset adapters or other wiring).

#### Person B – GEPA Adapter & Benchmark Wiring

- **4. Implement GEPA integration in `eval_protocol/training/gepa_adapter.py`**
- Define a small adapter API, e.g.:

```python
class GEPATrainer:
def __init__(self, spec: TrainingBenchmarkSpec, inject_fn: InjectFn, ...gepa_config...):
...

def train(self) -> tuple[Candidate, Any]:
"""Run GEPA and return best candidate plus optional rich result."""
```

- Inside, implement:
- Conversion from `(spec, inject_fn)` into a GEPA metric:
- For each candidate:
- Clone or map the base dataset rows, applying `inject_fn(candidate, row)`.
- Use the spec’s rollout runner + metric runner to compute per-example and aggregate scores.
- Return the aggregate score (and optional textual feedback) to GEPA.
- The call to `gepa.optimize(...)` with:
- `seed_candidate` constructed from the baseline configuration (e.g., default system prompt).
- Budget configuration (max metric calls / auto presets).
- Reflection config (reflection LM or other knobs) passed in via constructor.
- Mapping from `GEPAResult` (or equivalent) back into:
- `best_candidate: Candidate`.
- Optional rich result object (e.g., exposing Pareto-front stats).

- **5. Wire a first benchmark: AIME 2025**
- In `eval_protocol/benchmarks/test_aime25.py`:
- Factor the row-scoring logic inside `test_aime25_pointwise` into a **reusable metric function** (pure function that sets `row.evaluation_result` given a rolled-out row).
- Decide how candidates should influence the evaluation:
- For example, by making the dataset adapter or message-construction logic candidate-aware (e.g., changing the system prompt).
- Add an `if __name__ == "__main__":` block that:
- Imports `test_aime25_pointwise` and builds an `EPConfig` via `build_ep_config_from_test`.
- Instantiates `GEPATrainer` with:
- The `EPConfig`.
- Initial GEPA config (budget, reflection model placeholder, seed).
- Calls `trainer.train()` and prints/logs the resulting `best_candidate` for now.
- Keep storage of tuned prompts as a TODO/extension point to be resolved later.

- **6. Optional second benchmark: GPQA**
- Repeat step 5 for `test_gpqa.py`:
- Identify what’s tunable (system prompt, possibly chain-of-thought instructions).
- Extract metric logic into a reusable function.
- Add candidate-aware wiring (e.g., via dataset adapters) and an optional `__main__` entrypoint calling the same GEPA trainer.
- This will validate that:
- The abstractions generalize across tasks.
- No DSPy/GEPA-specific imports leak into benchmark files (other than a small, well-defined training API).

### Coordination Notes

- **Order of work**
- Person A should go first (or in parallel up to the point where `EPConfig` and `build_ep_config_from_test` are usable).
- Person B can stub against interfaces and adjust once Person A’s core is available.
- **Integration checkpoints**
- After Person A lands the core + tests:
- Person B wires AIME with a very simple “optimizer” (even random search) to smoke-test the path before hooking up real GEPA.
- After GEPA integration works for AIME:
- Decide on the canonical way to treat GEPA’s `run_dir` and/or additional artifacts for tuned prompts.
- Optionally add a small helper that knows how to “run evaluation once with best GEPA candidate” for CI workflows.


future:

This is how GEPA defines its evaluation metric:

def metric(
gold: Example,
pred: Prediction,
trace: Optional[DSPyTrace] = None,
pred_name: Optional[str] = None,
pred_trace: Optional[DSPyTrace] = None,
) -> float | ScoreWithFeedback:
"""
This function is called with the following arguments:
- gold: The gold example.
- pred: The predicted output.
- trace: Optional. The trace of the program's execution.
- pred_name: Optional. The name of the target predictor currently being optimized by GEPA, for which
the feedback is being requested.
- pred_trace: Optional. The trace of the target predictor's execution GEPA is seeking feedback for.

Note the `pred_name` and `pred_trace` arguments. During optimization, GEPA will call the metric to obtain
feedback for individual predictors being optimized. GEPA provides the name of the predictor in `pred_name`
and the sub-trace (of the trace) corresponding to the predictor in `pred_trace`.
If available at the predictor level, the metric should return {'score': float, 'feedback': str} corresponding
to the predictor.
If not available at the predictor level, the metric can also return a text feedback at the program level
(using just the gold, pred and trace).
If no feedback is returned, GEPA will use a simple text feedback consisting of just the score:
f"This trajectory got a score of {score}."
"""
...

Ideally, there would be a generic way to turn an `evaluation_test` into a metric of this form.
3 changes: 3 additions & 0 deletions eval_protocol/training/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Explicit relative import: Python 3 has no implicit relative imports, so a
# bare `from gepa_adapter import ...` only resolves if this package's own
# directory happens to be on sys.path.
from .gepa_adapter import GEPATrainer
Comment thread
cursor[bot] marked this conversation as resolved.
Outdated

__all__ = ["GEPATrainer"]
Loading
Loading