eval-protocol · shreymodi1 · Dec 17, 2025 · Dec 6, 2025 · Dec 6, 2025 · Dec 6, 2025
diff --git a/eval_protocol/models.py b/eval_protocol/models.py
@@ -3,7 +3,7 @@
 import importlib
 from datetime import datetime, timezone
 from enum import Enum
-from typing import Any, ClassVar, Dict, List, Literal, Optional, TypedDict, Union
+from typing import Any, ClassVar, Dict, List, Literal, Optional, TypedDict, Union, Callable, Sequence
 
 JSONType = Union[Dict[str, Any], List[Any], str, int, float, bool, None]
 
@@ -1190,3 +1190,32 @@ class MCPMultiClientConfiguration(BaseModel):
     """Represents a MCP configuration."""
 
     mcpServers: Dict[str, Union[MCPConfigurationServerStdio, MCPConfigurationServerUrl]]
+
+
+class EPParameters(BaseModel):
+    """The parameters of an `@evaluation_test`, held as one model. Used for trainable integrations."""
+    # Fields annotated `Any` declare no narrower type here.
+    completion_params: Any = None
+    input_messages: Any = None
+    input_dataset: Any = None
+    input_rows: Any = None
+    data_loaders: Any = None
+    dataset_adapter: Optional[Callable[..., Any]] = None
+    rollout_processor: Any = None
+    rollout_processor_kwargs: Optional[Dict[str, Any]] = None  # Optional[...] like the sibling fields, not `X | None`
+    aggregation_method: Any = Field(default="mean")
+    passed_threshold: Any = None
+    disable_browser_open: bool = False
+    num_runs: int = 1
+    filtered_row_ids: Optional[Sequence[str]] = None
+    max_dataset_rows: Optional[int] = None
+    mcp_config_path: Optional[str] = None
+    max_concurrent_rollouts: int = 8
+    max_concurrent_evaluations: int = 64
+    server_script_path: Optional[str] = None
+    steps: int = 30
+    mode: Any = Field(default="pointwise")
+    combine_datasets: bool = True
+    preprocess_fn: Optional[Callable[[list[EvaluationRow]], list[EvaluationRow]]] = None
+    logger: Any = None
+    exception_handler_config: Any = None
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
@@ -21,6 +21,7 @@
     EvaluationThresholdDict,
     EvaluateResult,
     Status,
+    EPParameters,
 )
 from eval_protocol.pytest.dual_mode_wrapper import create_dual_mode_wrapper
 from eval_protocol.pytest.evaluation_test_postprocess import postprocess
@@ -695,13 +696,33 @@ async def _collect_result(config, lst):
         )
         pytest_wrapper = pytest.mark.asyncio(pytest_wrapper)
 
-        ep_params: dict[str, Any] = {
-            "rollout_processor": rollout_processor,
-            "server_script_path": server_script_path,
-            "mcp_config_path": mcp_config_path,
-            "rollout_processor_kwargs": rollout_processor_kwargs,
-            "mode": mode,
-        }
+        # Attach full evaluation parameter metadata for training integrations
+        ep_params: EPParameters = EPParameters(
+            completion_params=completion_params,
+            input_messages=input_messages,
+            input_dataset=input_dataset,
+            input_rows=input_rows,
+            data_loaders=data_loaders,
+            dataset_adapter=dataset_adapter,
+            rollout_processor=rollout_processor,
+            rollout_processor_kwargs=rollout_processor_kwargs,
+            aggregation_method=aggregation_method,
+            passed_threshold=passed_threshold,
+            disable_browser_open=disable_browser_open,
+            num_runs=num_runs,
+            filtered_row_ids=filtered_row_ids,
+            max_dataset_rows=max_dataset_rows,
+            mcp_config_path=mcp_config_path,
+            max_concurrent_rollouts=max_concurrent_rollouts,
+            max_concurrent_evaluations=max_concurrent_evaluations,
+            server_script_path=server_script_path,
+            steps=steps,
+            mode=mode,
+            combine_datasets=combine_datasets,
+            preprocess_fn=preprocess_fn,
+            logger=logger,
+            exception_handler_config=exception_handler_config,
+        )
 
         # Create the dual mode wrapper
         dual_mode_wrapper = create_dual_mode_wrapper(

diff --git a/eval_protocol/trainable_gepa_design.md b/eval_protocol/trainable_gepa_design.md
@@ -0,0 +1,201 @@
+## GEPA-training Interface Design for Eval Protocol
+
+### Goals
+
+- **Tunable prompts for existing benchmarks**: Allow benchmarks like `test_aime25.py` and `test_gpqa.py` to expose parts of their configuration (e.g., system prompts) as training parameters, without changing their core evaluation logic.
+- **Tight coupling with `@evaluation_test`**: Reuse the same rollout configuration, datasets, and metrics that are already defined via `evaluation_test`, instead of duplicating that configuration in a separate training API.
+- **GEPA as one optimizer backend**: Provide a clean integration point for GEPA (and potentially other optimizers later) without requiring benchmarks to depend on DSPy or GEPA directly.
+
+### High-Level Architecture
+
+- **Benchmark file (e.g., `test_aime25.py`)**
+  - Continues to define:
+    - Dataset adapter (`aime2025_dataset_adapter`).
+    - `@evaluation_test(...)`-decorated function (e.g., `test_aime25_pointwise`) that:
+      - Uses `SingleTurnRolloutProcessor` (or another processor).
+      - Computes per-row metrics and sets `row.evaluation_result`.
+  - Adds *optional* training wiring at the bottom, under `if __name__ == "__main__":`, that:
+    - Imports a training/core API from `eval_protocol.training`.
+    - Specifies what is tunable (e.g., the system prompt) and how to adapt rows using a candidate.
+    - Invokes a train routine (GEPA-based or otherwise).
+
+- **Training core**
+  - Provides two central abstractions:
+    - **`EPParameters`**: Encapsulates everything `evaluation_test` knows about the eval in a structured form:
+      - One field for every parameter that `evaluation_test` accepts (dataset sources, adapters, completion params, rollout processor, aggregation, thresholds, etc.), after parsing/env overrides.
+    - **Candidate representation**: Start with `dict[str, str]` (e.g., `{"system_prompt": "..."}`), anticipating future extensions (few-shot examples, tool docs, etc.).
+  - Includes helper utilities to:
+    - Build an `EPParameters` instance by introspecting an `@evaluation_test`-decorated function.
+    - Run a single candidate or a batch of candidates through the full rollout + evaluation pipeline, returning aggregate scores (and optionally per-row scores).
+
+- **GEPA adapter (e.g., `eval_protocol/training/gepa_adapter.py`)**
+  - Wraps the training core and GEPA’s API:
+    - Accepts:
+      - An `EPParameters` instance.
+      - A candidate space definition (for now, implicit via `dict[str, str]` keys).
+      - GEPA configuration (budget, reflection model, seed, component selection strategy, etc.).
+    - Provides:
+      - A GEPA-compatible metric interface that:
+        - Given a candidate, uses `EPParameters` (and benchmark-specific logic such as a custom `dataset_adapter`) to:
+          - Construct or adapt rows for that candidate.
+          - Run rollouts (reusing the same processors and params as the test).
+          - Compute scalar scores (e.g., mean exact-match over a batch).
+      - A training routine that returns:
+        - A `best_candidate: dict[str, str]`.
+        - Optional rich result object (e.g., mapping to `GEPAResult`, additional stats).
+
+### Relationship to `evaluation_test` and `__ep_params__`
+
+- The existing `evaluation_test` code already attaches a small metadata dict:
+
+```python
+ep_params: dict[str, Any] = {
+    "rollout_processor": rollout_processor,
+    "server_script_path": server_script_path,
+    "mcp_config_path": mcp_config_path,
+    "rollout_processor_kwargs": rollout_processor_kwargs,
+    "mode": mode,
+}
+setattr(dual_mode_wrapper, "__ep_params__", ep_params)
+```
+
+- Design direction:
+  - **Use `__ep_params__` as the single source of truth**.
+  - **`__ep_params__` should contain all effective `evaluation_test` parameters**, including:
+    - Parsed `completion_params` (after env overrides).
+    - Dataset sources (`input_dataset`, `input_rows`, dataloaders, and `dataset_adapter`), after `parse_ep_*` transforms.
+    - `aggregation_method`, `num_runs`, `max_dataset_rows`, etc.
+    - Rollout and mode information (processor, kwargs, concurrency limits, mode).
+  - The training core can then **directly convert `__ep_params__` into an `EPParameters` instance** without maintaining a separate training-only config.
+
+- The training core will expose:
+  - A factory like:
+
+    ```python
+    def build_ep_parameters_from_test(
+        test_fn: TestFunction,
+    ) -> EPParameters:
+        ...
+    ```
+
+  - This function:
+    - Reads `test_fn.__ep_params__`.
+    - Reconstructs how to:
+      - Load and preprocess the dataset.
+      - Configure the rollout processor (`RolloutProcessorConfig`).
+      - Run rollouts and then apply the row-level metric (by calling the decorated test function in a library mode).
+
+- Training code (e.g., `python test_aime25.py`) then becomes:
+  - Import the test function (e.g., `test_aime25_pointwise`).
+  - Build an `EPParameters` from it.
+  - Call into a GEPA-based trainer that uses the `EPParameters`.
+
+### Open TODO (owner: Derek): how to store the tuned system prompts
+
+- **Where tuned prompts live (storage format and location)**:
+  - GEPA already supports a `run_dir` for logging and checkpoints.
+  - We need to decide:
+    - Whether EP should:
+      - Treat `run_dir` as the canonical store and optionally add a small `best_candidate.json` there; or
+      - Provide an additional EP-level artifact format.
+  - For now, storage is left as an **explicit design TODO** and can be finalized once we have the core/adapter in place.
+
+### Work Split: Person A vs Person B
+
+#### Person A – Training Core & `evaluation_test` Integration
+
+- **1. Extend `evaluation_test` metadata (no behavior change)**
+  - Populate a single `__ep_params__` attribute on the decorated test function that includes:
+    - Dataset specification (paths / input_rows / dataloaders, `dataset_adapter`, `max_dataset_rows`, etc.) after `parse_ep_*`.
+    - Parsed `completion_params` (after env overrides like `parse_ep_completion_params_overwrite`).
+    - Rollout settings (`rollout_processor`, `rollout_processor_kwargs`, `mode`, `max_concurrent_rollouts`, `max_concurrent_evaluations`).
+    - Aggregation and threshold metadata.
+  - Ensure:
+    - Backwards compatibility for existing tests.
+    - Clear typing and docstrings to guide future use.
+
+- **2. Define core training abstractions in `eval_protocol/training/core.py`**
+  - Define:
+    - `EPParameters`:
+      - A field for every parameter `evaluation_test` accepts (dataset, adapters, completion params, rollout processor, aggregation, thresholds, etc.).
+      - Can be serialized/inspected for external tooling.
+    - Candidate type alias (initially `Candidate = dict[str, str]`).
+  - Implement:
+    - `build_ep_parameters_from_test(test_fn: TestFunction) -> EPParameters`.
+      - Reads `__ep_params__`.
+      - Reuses the same dataset and rollout logic as pytest, but in a library-friendly way (no pytest invocation).
+  - Helper(s) to:
+    - Run a single candidate over the dataset, possibly with:
+      - A subset of rows (train vs val split initially determined by the benchmark or `EPParameters`).
+      - A configurable aggregation method (mean score to start).
+
+- **3. Minimal tests and documentation for the core**
+  - Add unit/integration tests that:
+    - Use a tiny fake `@evaluation_test` function.
+    - Confirm `build_ep_parameters_from_test` produces parameters that can:
+      - Load mock rows.
+      - Run a dummy rollout processor.
+      - Apply a simple metric to produce scores.
+  - Document (in this design file or a short README) how benchmarks should think about exposing tunable pieces (e.g., via custom dataset adapters or other wiring).
+
+#### Person B – GEPA Adapter & Benchmark Wiring
+
+- **4. Implement GEPA integration in `eval_protocol/training/gepa_adapter.py`**
+  - Define a small adapter API, e.g.:
+
+```python
+class GEPATrainer:
+    def __init__(self, spec: EPParameters, inject_fn: InjectFn, **gepa_config):
+        ...
+
+    def train(self) -> tuple[Candidate, Any]:
+        """Run GEPA and return best candidate plus optional rich result."""
+```
+
+  - Inside, implement:
+    - Conversion from `(spec, inject_fn)` into a GEPA metric:
+      - For each candidate:
+        - Clone or map the base dataset rows, applying `inject_fn(candidate, row)`.
+        - Use the spec’s rollout runner + metric runner to compute per-example and aggregate scores.
+        - Return the aggregate score (and optional textual feedback) to GEPA.
+    - The call to `gepa.optimize(...)` with:
+      - `seed_candidate` constructed from the baseline configuration (e.g., default system prompt).
+      - Budget configuration (max metric calls / auto presets).
+      - Reflection config (reflection LM or other knobs) passed in via constructor.
+    - Mapping from `GEPAResult` (or equivalent) back into:
+      - `best_candidate: Candidate`.
+      - Optional rich result object (e.g., exposing Pareto-front stats).
+
+- **5. Wire a first benchmark: AIME 2025**
+  - In `eval_protocol/benchmarks/test_aime25.py`:
+    - Factor the row-scoring logic inside `test_aime25_pointwise` into a **reusable metric function** (pure function that sets `row.evaluation_result` given a rolled-out row).
+    - Decide how candidates should influence the evaluation:
+      - For example, by making the dataset adapter or message-construction logic candidate-aware (e.g., changing the system prompt).
+    - Add an `if __name__ == "__main__":` block that:
+      - Imports `test_aime25_pointwise` and builds an `EPParameters` via `build_ep_parameters_from_test`.
+      - Instantiates `GEPATrainer` with:
+        - The `EPParameters`.
+        - Initial GEPA config (budget, reflection model placeholder, seed).
+      - Calls `trainer.train()` and prints/logs the resulting `best_candidate` for now.
+    - Keep storage of tuned prompts as a TODO/extension point to be resolved later.
+
+- **6. Optional second benchmark: GPQA**
+  - Repeat step 5 for `test_gpqa.py`:
+    - Identify what’s tunable (system prompt, possibly chain-of-thought instructions).
+    - Extract metric logic into a reusable function.
+    - Add candidate-aware wiring (e.g., via dataset adapters) and an optional `__main__` entrypoint calling the same GEPA trainer.
+  - This will validate that:
+    - The abstractions generalize across tasks.
+    - No DSPy/GEPA-specific imports leak into benchmark files (other than a small, well-defined training API).
+
+### Coordination Notes
+
+- **Order of work**
+  - Person A should go first (or in parallel up to the point where `EPParameters` and `build_ep_parameters_from_test` are usable).
+  - Person B can stub against interfaces and adjust once Person A’s core is available.
+- **Integration checkpoints**
+  - After Person A lands the core + tests:
+    - Person B wires AIME with a very simple “optimizer” (even random search) to smoke-test the path before hooking up real GEPA.
+  - After GEPA integration works for AIME:
+    - Decide on the canonical way to treat GEPA’s `run_dir` and/or additional artifacts for tuned prompts.
+    - Optionally add a small helper that knows how to “run evaluation once with best GEPA candidate” for CI workflows.
diff --git a/eval_protocol/training/utils.py b/eval_protocol/training/utils.py
@@ -0,0 +1,19 @@
+from typing import Any
+
+from eval_protocol.models import EPParameters
+
+
+def build_ep_parameters_from_test(test_fn: Any) -> EPParameters:
+    """Return the parameters stored on an `@evaluation_test`-decorated function.
+
+    The value is read from `test_fn.__ep_params__`, which the decorator is
+    expected to attach; a `ValueError` is raised when it is missing.
+    """
+    not_attached = object()  # private sentinel: distinguishes "missing" from any stored value
+    ep_params = getattr(test_fn, "__ep_params__", not_attached)
+    if ep_params is not_attached:
+        raise ValueError(
+            "The provided test function does not have `__ep_params__` attached. "
+            "Ensure it is decorated with `@evaluation_test` from eval_protocol.pytest."
+        )
+    return ep_params
diff --git a/tests/test_models.py b/tests/test_models.py
@@ -11,6 +11,7 @@
     Message,
     MetricResult,
     StepOutput,
+    EPParameters,
 )
 
 
@@ -721,3 +722,34 @@ def test_message_dump_for_chat_completion_request():
     assert "weight" not in dictionary
     assert "reasoning_content" not in dictionary
     assert dictionary["content"] == "Hello, how are you?"
+
+
+def test_ep_parameters_defaults():
+    """A bare `EPParameters()` should expose the expected defaults for its core fields."""
+    defaults = EPParameters()
+
+    # Identity checks for the None/bool defaults, equality for the rest.
+    assert defaults.completion_params is None
+    assert defaults.disable_browser_open is False
+    assert defaults.combine_datasets is True
+    assert (defaults.num_runs, defaults.mode) == (1, "pointwise")
+    assert defaults.max_concurrent_rollouts == 8
+    assert defaults.max_concurrent_evaluations == 64
+
+
+def test_ep_parameters_accepts_arbitrary_types():
+    """EPParameters should keep callables and a logger by identity, without copying them."""
+    import logging  # local import: this test must not rely on the module's import block
+
+    def dummy_preprocess(rows):
+        return rows
+
+    def dummy_adapter(*args, **kwargs):
+        return None
+
+    logger = logging.getLogger("ep-params-test")
+    params = EPParameters(dataset_adapter=dummy_adapter, preprocess_fn=dummy_preprocess, logger=logger)
+
+    assert params.dataset_adapter is dummy_adapter
+    assert params.preprocess_fn is dummy_preprocess
+    assert params.logger is logger
diff --git a/tests/test_training_utils.py b/tests/test_training_utils.py
@@ -0,0 +1,32 @@
+import pytest
+
+from eval_protocol.models import EPParameters
+from eval_protocol.training.utils import build_ep_parameters_from_test
+
+
+def test_build_ep_parameters_from_test_returns_attached_model():
+    """The helper should hand back the very `EPParameters` object attached to the function."""
+
+    def decorated_stub() -> None:
+        pass
+
+    attached = EPParameters(num_runs=3, completion_params={"model": "gpt-4"})
+    setattr(decorated_stub, "__ep_params__", attached)
+
+    returned = build_ep_parameters_from_test(decorated_stub)
+
+    # Same object, not a copy, so its field values are the ones set above.
+    assert returned is attached
+    assert (returned.num_runs, returned.completion_params) == (3, {"model": "gpt-4"})
+
+
+def test_build_ep_parameters_from_test_missing_attr_raises():
+    """A function without `__ep_params__` should make the helper raise `ValueError`."""
+
+    def undecorated_stub() -> None:
+        pass
+
+    # `match` runs a regex search over the error message, so this also checks
+    # that the message names the missing attribute.
+    with pytest.raises(ValueError, match="__ep_params__"):
+        build_ep_parameters_from_test(undecorated_stub)