Skip to content
31 changes: 30 additions & 1 deletion eval_protocol/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import importlib
from datetime import datetime, timezone
from enum import Enum
from typing import Any, ClassVar, Dict, List, Literal, Optional, TypedDict, Union
from typing import Any, ClassVar, Dict, List, Literal, Optional, TypedDict, Union, Callable, Sequence

JSONType = Union[Dict[str, Any], List[Any], str, int, float, bool, None]

Expand Down Expand Up @@ -1190,3 +1190,32 @@ class MCPMultiClientConfiguration(BaseModel):
"""Represents a MCP configuration."""

mcpServers: Dict[str, Union[MCPConfigurationServerStdio, MCPConfigurationServerUrl]]


class EPParameters(BaseModel):
    """The parameters of an `@evaluation_test`. Used for trainable integrations.

    Each field carries the same name as the `evaluation_test` value it is
    populated from, so training code can read the evaluation setup from a
    single structured object. Most fields are typed `Any`, which means the
    model records them as given rather than validating their shape.
    """

    # Completion parameters and dataset sources / adapters.
    completion_params: Any = None
    input_messages: Any = None
    input_dataset: Any = None
    input_rows: Any = None
    data_loaders: Any = None
    dataset_adapter: Optional[Callable[..., Any]] = None

    # Rollout processor and its keyword arguments.
    rollout_processor: Any = None
    # Spelled with Optional[...] like every other optional field in this model,
    # so the annotation also evaluates on interpreters without `X | None` support.
    rollout_processor_kwargs: Optional[Dict[str, Any]] = None

    # Aggregation and threshold metadata.
    aggregation_method: Any = Field(default="mean")
    passed_threshold: Any = None

    # Run controls and row selection.
    disable_browser_open: bool = False
    num_runs: int = 1
    filtered_row_ids: Optional[Sequence[str]] = None
    max_dataset_rows: Optional[int] = None

    # MCP / server wiring and concurrency limits.
    mcp_config_path: Optional[str] = None
    max_concurrent_rollouts: int = 8
    max_concurrent_evaluations: int = 64
    server_script_path: Optional[str] = None
    steps: int = 30

    # Evaluation mode and dataset handling.
    mode: Any = Field(default="pointwise")
    combine_datasets: bool = True
    preprocess_fn: Optional[Callable[[list[EvaluationRow]], list[EvaluationRow]]] = None

    # Logging and exception-handling configuration, stored as provided.
    logger: Any = None
    exception_handler_config: Any = None
Comment thread
shreymodi1 marked this conversation as resolved.
Comment thread
cursor[bot] marked this conversation as resolved.
Comment thread
shreymodi1 marked this conversation as resolved.
35 changes: 28 additions & 7 deletions eval_protocol/pytest/evaluation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
EvaluationThresholdDict,
EvaluateResult,
Status,
EPParameters,
)
from eval_protocol.pytest.dual_mode_wrapper import create_dual_mode_wrapper
from eval_protocol.pytest.evaluation_test_postprocess import postprocess
Expand Down Expand Up @@ -695,13 +696,33 @@ async def _collect_result(config, lst):
)
pytest_wrapper = pytest.mark.asyncio(pytest_wrapper)

ep_params: dict[str, Any] = {
"rollout_processor": rollout_processor,
"server_script_path": server_script_path,
"mcp_config_path": mcp_config_path,
"rollout_processor_kwargs": rollout_processor_kwargs,
"mode": mode,
}
# Attach full evaluation parameter metadata for training integrations.
# Every keyword below is forwarded under the same name as the local value it
# comes from, so the resulting EPParameters model mirrors those values one-to-one.
# NOTE(review): presumably this object is what ends up stored as `__ep_params__`
# on the wrapper created below — confirm against the code that follows.
ep_params: EPParameters = EPParameters(
completion_params=completion_params,
input_messages=input_messages,
input_dataset=input_dataset,
input_rows=input_rows,
data_loaders=data_loaders,
dataset_adapter=dataset_adapter,
rollout_processor=rollout_processor,
rollout_processor_kwargs=rollout_processor_kwargs,
aggregation_method=aggregation_method,
passed_threshold=passed_threshold,
disable_browser_open=disable_browser_open,
num_runs=num_runs,
filtered_row_ids=filtered_row_ids,
max_dataset_rows=max_dataset_rows,
mcp_config_path=mcp_config_path,
max_concurrent_rollouts=max_concurrent_rollouts,
max_concurrent_evaluations=max_concurrent_evaluations,
server_script_path=server_script_path,
steps=steps,
mode=mode,
combine_datasets=combine_datasets,
preprocess_fn=preprocess_fn,
logger=logger,
exception_handler_config=exception_handler_config,
)
Comment thread
cursor[bot] marked this conversation as resolved.

# Create the dual mode wrapper
dual_mode_wrapper = create_dual_mode_wrapper(
Expand Down
201 changes: 201 additions & 0 deletions eval_protocol/trainable_gepa_design.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
## GEPA-training Interface Design for Eval Protocol

### Goals

- **Tunable prompts for existing benchmarks**: Allow benchmarks like `test_aime25.py` and `test_gpqa.py` to expose parts of their configuration (e.g., system prompts) as training parameters, without changing their core evaluation logic.
- **Tight coupling with `@evaluation_test`**: Reuse the same rollout configuration, datasets, and metrics that are already defined via `evaluation_test`, instead of duplicating that configuration in a separate training API.
- **GEPA as one optimizer backend**: Provide a clean integration point for GEPA (and potentially other optimizers later) without requiring benchmarks to depend on DSPy or GEPA directly.

### High-Level Architecture

- **Benchmark file (e.g., `test_aime25.py`)**
- Continues to define:
- Dataset adapter (`aime2025_dataset_adapter`).
- `@evaluation_test(...)`-decorated function (e.g., `test_aime25_pointwise`) that:
- Uses `SingleTurnRolloutProcessor` (or another processor).
- Computes per-row metrics and sets `row.evaluation_result`.
- Adds *optional* training wiring at the bottom, under `if __name__ == "__main__":`, that:
- Imports a training/core API from `eval_protocol.training`.
- Specifies what is tunable (e.g., the system prompt) and how to adapt rows using a candidate.
- Invokes a train routine (GEPA-based or otherwise).

- **Training core**
- Provides a single central abstraction:
- **`EPParameters`**: Encapsulates everything `evaluation_test` knows about the eval in a structured form:
- One field for every parameter that `evaluation_test` accepts (dataset sources, adapters, completion params, rollout processor, aggregation, thresholds, etc.), after parsing/env overrides.
- **Candidate representation**: Start with `dict[str, str]` (e.g., `{"system_prompt": "..."}`), anticipating future extensions (few-shot examples, tool docs, etc.).
- Includes helper utilities to:
- Build an `EPParameters` instance by introspecting an `@evaluation_test`-decorated function.
- Run a single candidate or a batch of candidates through the full rollout + evaluation pipeline, returning aggregate scores (and optionally per-row scores).

- **GEPA adapter (e.g., `eval_protocol/training/gepa_adapter.py`)**
- Wraps the training core and GEPA’s API:
- Accepts:
- An `EPParameters` instance.
- A candidate space definition (for now, implicit via `dict[str, str]` keys).
- GEPA configuration (budget, reflection model, seed, component selection strategy, etc.).
- Provides:
- A GEPA-compatible metric interface that:
- Given a candidate, uses `EPParameters` (and benchmark-specific logic such as a custom `dataset_adapter`) to:
- Construct or adapt rows for that candidate.
- Run rollouts (reusing the same processors and params as the test).
- Compute scalar scores (e.g., mean exact-match over a batch).
- A training routine that returns:
- A `best_candidate: dict[str, str]`.
- Optional rich result object (e.g., mapping to `GEPAResult`, additional stats).

### Relationship to `evaluation_test` and `__ep_params__`

- Before this change, `evaluation_test` attached only a small dict of rollout-related settings:

```python
ep_params: dict[str, Any] = {
"rollout_processor": rollout_processor,
"server_script_path": server_script_path,
"mcp_config_path": mcp_config_path,
"rollout_processor_kwargs": rollout_processor_kwargs,
"mode": mode,
}
setattr(dual_mode_wrapper, "__ep_params__", ep_params)
```

- Design direction:
- **Use `__ep_params__` as the single source of truth**.
- **`__ep_params__` should contain all effective `evaluation_test` parameters**, including:
- Parsed `completion_params` (after env overrides).
- Dataset sources (`input_dataset`, `input_rows`, dataloaders, and `dataset_adapter`), after `parse_ep_*` transforms.
- `aggregation_method`, `num_runs`, `max_dataset_rows`, etc.
- Rollout and mode information (processor, kwargs, concurrency limits, mode).
- The training core can then **directly convert `__ep_params__` into an `EPParameters` instance** without maintaining a separate training-only config.

- The training core will expose:
- A factory like:

```python
def build_ep_parameters_from_test(
test_fn: TestFunction,
) -> EPParameters:
...
```

- This function:
- Reads `test_fn.__ep_params__`.
- Reconstructs how to:
- Load and preprocess the dataset.
- Configure the rollout processor (`RolloutProcessorConfig`).
- Run rollouts and then apply the row-level metric (by calling the decorated test function in a library mode).

- Training code (e.g., `python test_aime25.py`) then becomes:
- Import the test function (e.g., `test_aime25_pointwise`).
- Build an `EPParameters` from it.
- Call into a GEPA-based trainer that uses the `EPParameters`.

### Open question (owner: Derek): how to store the evolving system prompts

- **Where tuned prompts live (storage format and location)**:
- GEPA already supports a `run_dir` for logging and checkpoints.
- We need to decide:
- Whether EP should:
- Treat `run_dir` as the canonical store and optionally add a small `best_candidate.json` there; or
- Provide an additional EP-level artifact format.
- For now, storage is left as an **explicit design TODO** and can be finalized once we have the core/adapter in place.

### Work Split: Person A vs Person B

#### Person A – Training Core & `evaluation_test` Integration

- **1. Extend `evaluation_test` metadata (no behavior change)**
- Populate a single `__ep_params__` attribute (an `EPParameters` model) on the decorated test function that includes:
- Dataset specification (paths / input_rows / dataloaders, `dataset_adapter`, `max_dataset_rows`, etc.) after `parse_ep_*`.
- Parsed `completion_params` (after env overrides like `parse_ep_completion_params_overwrite`).
- Rollout settings (`rollout_processor`, `rollout_processor_kwargs`, `mode`, `max_concurrent_rollouts`, `max_concurrent_evaluations`).
- Aggregation and threshold metadata.
- Ensure:
- Backwards compatibility for existing tests.
- Clear typing and docstrings to guide future use.

- **2. Define core training abstractions in `eval_protocol/training/core.py`**
- Define:
- `EPParameters`:
- A field for every parameter `evaluation_test` accepts (dataset, adapters, completion params, rollout processor, aggregation, thresholds, etc.).
- Can be serialized/inspected for external tooling.
- Candidate type alias (initially `Candidate = dict[str, str]`).
- Implement:
- `build_ep_parameters_from_test(test_fn: TestFunction) -> EPParameters`.
- Reads `__ep_params__`.
- Reuses the same dataset and rollout logic as pytest, but in a library-friendly way (no pytest invocation).
- Helper(s) to:
- Run a single candidate over the dataset, possibly with:
- A subset of rows (train vs val split initially determined by the benchmark or `EPParameters`).
- A configurable aggregation method (mean score to start).

- **3. Minimal tests and documentation for the core**
- Add unit/integration tests that:
- Use a tiny fake `@evaluation_test` function.
- Confirm `build_ep_parameters_from_test` produces an `EPParameters` instance that can:
- Load mock rows.
- Run a dummy rollout processor.
- Apply a simple metric to produce scores.
- Document (in this design file or a short README) how benchmarks should think about exposing tunable pieces (e.g., via custom dataset adapters or other wiring).

#### Person B – GEPA Adapter & Benchmark Wiring

- **4. Implement GEPA integration in `eval_protocol/training/gepa_adapter.py`**
- Define a small adapter API, e.g.:

```python
class GEPATrainer:
def __init__(self, spec: TrainingBenchmarkSpec, inject_fn: InjectFn, ...gepa_config...):
...

def train(self) -> tuple[Candidate, Any]:
"""Run GEPA and return best candidate plus optional rich result."""
```

- Inside, implement:
- Conversion from `(spec, inject_fn)` into a GEPA metric:
- For each candidate:
- Clone or map the base dataset rows, applying `inject_fn(candidate, row)`.
- Use the spec’s rollout runner + metric runner to compute per-example and aggregate scores.
- Return the aggregate score (and optional textual feedback) to GEPA.
- The call to `gepa.optimize(...)` with:
- `seed_candidate` constructed from the baseline configuration (e.g., default system prompt).
- Budget configuration (max metric calls / auto presets).
- Reflection config (reflection LM or other knobs) passed in via constructor.
- Mapping from `GEPAResult` (or equivalent) back into:
- `best_candidate: Candidate`.
- Optional rich result object (e.g., exposing Pareto-front stats).

- **5. Wire a first benchmark: AIME 2025**
- In `eval_protocol/benchmarks/test_aime25.py`:
- Factor the row-scoring logic inside `test_aime25_pointwise` into a **reusable metric function** (pure function that sets `row.evaluation_result` given a rolled-out row).
- Decide how candidates should influence the evaluation:
- For example, by making the dataset adapter or message-construction logic candidate-aware (e.g., changing the system prompt).
- Add a `if __name__ == "__main__":` block that:
- Imports `test_aime25_pointwise` and builds an `EPParameters` instance via `build_ep_parameters_from_test`.
- Instantiates `GEPATrainer` with:
- The `EPParameters` instance.
- Initial GEPA config (budget, reflection model placeholder, seed).
- Calls `trainer.train()` and prints/logs the resulting `best_candidate` for now.
- Keep storage of tuned prompts as a TODO/extension point to be resolved later.

- **6. Optional second benchmark: GPQA**
- Repeat step 5 for `test_gpqa.py`:
- Identify what’s tunable (system prompt, possibly chain-of-thought instructions).
- Extract metric logic into a reusable function.
- Add candidate-aware wiring (e.g., via dataset adapters) and an optional `__main__` entrypoint calling the same GEPA trainer.
- This will validate that:
- The abstractions generalize across tasks.
- No DSPy/GEPA-specific imports leak into benchmark files (other than a small, well-defined training API).

### Coordination Notes

- **Order of work**
- Person A should go first (or in parallel up to the point where `EPParameters` and `build_ep_parameters_from_test` are usable).
- Person B can stub against interfaces and adjust once Person A’s core is available.
- **Integration checkpoints**
- After Person A lands the core + tests:
- Person B wires AIME with a very simple “optimizer” (even random search) to smoke-test the path before hooking up real GEPA.
- After GEPA integration works for AIME:
- Decide on the canonical way to treat GEPA’s `run_dir` and/or additional artifacts for tuned prompts.
- Optionally add a small helper that knows how to “run evaluation once with best GEPA candidate” for CI workflows.
19 changes: 19 additions & 0 deletions eval_protocol/training/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from typing import Any

from eval_protocol.models import EPParameters


def build_ep_parameters_from_test(test_fn: Any) -> EPParameters:
    """
    Build an `EPParameters` instance from an `@evaluation_test`-decorated function.

    The decorator is responsible for attaching a `__ep_params__` attribute that
    contains all effective evaluation parameters after parsing/env overrides.
    This helper hands back that attached object as-is; it does not copy or
    re-validate it.

    Args:
        test_fn: The object expected to carry a `__ep_params__` attribute.

    Returns:
        Whatever is stored in `test_fn.__ep_params__`.

    Raises:
        ValueError: If `test_fn` has no `__ep_params__` attribute.
    """
    try:
        # One attribute access instead of a hasattr() check followed by getattr().
        return test_fn.__ep_params__
    except AttributeError:
        # Callers get the same ValueError as before; the AttributeError is an
        # implementation detail, so it is kept out of the reported chain.
        raise ValueError(
            "The provided test function does not have `__ep_params__` attached. "
            "Ensure it is decorated with `@evaluation_test` from eval_protocol.pytest."
        ) from None
32 changes: 32 additions & 0 deletions tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
Message,
MetricResult,
StepOutput,
EPParameters,
)


Expand Down Expand Up @@ -721,3 +722,34 @@ def test_message_dump_for_chat_completion_request():
assert "weight" not in dictionary
assert "reasoning_content" not in dictionary
assert dictionary["content"] == "Hello, how are you?"


def test_ep_parameters_defaults():
    """A bare `EPParameters()` exposes the expected default for each core field."""
    defaults = EPParameters()

    # None and boolean defaults are checked by identity.
    assert defaults.completion_params is None
    assert defaults.disable_browser_open is False
    assert defaults.combine_datasets is True

    # Numeric and string defaults are compared by value.
    assert (
        defaults.num_runs,
        defaults.max_concurrent_rollouts,
        defaults.max_concurrent_evaluations,
    ) == (1, 8, 64)
    assert defaults.mode == "pointwise"


def test_ep_parameters_accepts_arbitrary_types():
    """Callables and a logger passed to EPParameters come back as the very same objects."""

    def passthrough_rows(rows):
        return rows

    def noop_adapter(*args, **kwargs):
        return None

    test_logger = logging.getLogger("ep-params-test")

    # Keyed by field name so construction and verification share one table.
    supplied = {
        "dataset_adapter": noop_adapter,
        "preprocess_fn": passthrough_rows,
        "logger": test_logger,
    }
    params = EPParameters(**supplied)

    for field_name, expected in supplied.items():
        assert getattr(params, field_name) is expected
32 changes: 32 additions & 0 deletions tests/test_training_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import pytest

from eval_protocol.models import EPParameters
from eval_protocol.training.utils import build_ep_parameters_from_test


def test_build_ep_parameters_from_test_returns_attached_model():
    """The helper hands back the exact EPParameters object stored on the function."""
    attached = EPParameters(num_runs=3, completion_params={"model": "gpt-4"})

    def decorated_stand_in() -> None:
        pass

    # Mimic what the decorator does: hang the model on the function object.
    setattr(decorated_stand_in, "__ep_params__", attached)

    built = build_ep_parameters_from_test(decorated_stand_in)

    assert built is attached
    assert (built.num_runs, built.completion_params) == (3, {"model": "gpt-4"})


def test_build_ep_parameters_from_test_missing_attr_raises():
    """A function without `__ep_params__` makes the helper raise ValueError naming the attribute."""

    def undecorated() -> None:
        pass

    # `match` searches the exception text, so this also checks that the
    # message mentions the missing attribute.
    with pytest.raises(ValueError, match="__ep_params__"):
        build_ep_parameters_from_test(undecorated)
Loading