evaluation with aggregated scores

benjibc · benjibc · commit 4aa9e5c40a2f · 2025-08-10T05:24:48.000Z
diff --git a/development/RUNNING_EVALUATIONS.md b/development/RUNNING_EVALUATIONS.md
@@ -0,0 +1,80 @@
+# Running AIME/GPQA Evaluations in CI and Locally
+
+This guide explains how to run the AIME2025 and GPQA evaluations using the
+pytest-based `evaluation_test` decorator, how to control dataset size and
+concurrency, how to select effort presets, and how to print/persist results
+for CI dashboards/artifacts.
+
+## Objectives
+- Simple pass/fail: ensure evaluation configs don’t regress.
+- Comparable metrics: capture aggregated accuracy across runs/rows.
+- CI-friendly outputs: print summary lines to logs and save JSON artifacts.
+
+## Prerequisites
+- `FIREWORKS_API_KEY` set in the environment
+- Install SDK: `pip install -e ".[dev]"` (quoted so shells such as zsh do not glob the brackets)
+
+## Controls
+- Row limit
+  - Default `max_dataset_rows=2` in each test decorator for quick CI.
+  - Override centrally: `pytest --ep-max-rows=all` or `--ep-max-rows=50`.
+- Concurrency
+  - Set `max_concurrent_rollouts` in the decorator (recommend 4 for production Fireworks).
+- Repeats
+  - Set `num_runs` in the decorator (e.g., 4).
+- Effort (Fireworks reasoning)
+  - Provide `{"reasoning": {"effort": "low|medium|high"}}` in the test’s `rollout_input_params`.
+  - The default rollout forwards it via LiteLLM `extra_body`.
+
+## Printing & Persisting Results
+- Flags:
+  - `--ep-print-summary`: print concise summary lines at the end of each eval (pytest captures stdout by default; add `-s` if the lines do not appear)
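+  - `--ep-summary-json=PATH`: write a JSON summary with `suite`, `model`, `agg_score`, `num_runs`, `rows`, and `timestamp`, plus `agg_ci_low`/`agg_ci_high` and `metrics_agg` when available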
+- Example GitHub Actions snippet:
+```yaml
+- name: Run AIME low effort (full)
+  run: |
+    cd python-sdk
+    pytest --ep-max-rows=all --ep-print-summary \
+      --ep-summary-json=outputs/aime_low.json \
+      -q examples/aime2025_chat_completion/tests/test_evaluation.py::test_aime2025_pointwise -q
+- name: Upload AIME results
+  uses: actions/upload-artifact@v4
+  with:
+    name: aime2025-low-summary
+    path: python-sdk/outputs/aime_low.json
+```
+
+## Examples
+### AIME (Low Effort, Full, Repeats=4, Concurrency=4)
+```bash
+cd python-sdk
+pytest --ep-max-rows=all --ep-print-summary \
+  --ep-summary-json=outputs/aime_low.json \
+  -q examples/aime2025_chat_completion/tests/test_evaluation.py::test_aime2025_pointwise -q
+```
+Expected:
+- Terminal summary: `EP Summary | suite=test_aime2025_pointwise model=... agg=0.530 ci95=[...,...] runs=4 rows=...` (the `ci95` field is printed only for mean aggregation with at least two scores)
+- JSON artifact at `outputs/aime_low.json`
+- For `.../gpt-oss-120b`, the low-effort aggregated score should be roughly 0.50 or higher when repeated
+
+For medium/high effort, add `{"reasoning": {"effort": "medium|high"}}` to
+`rollout_input_params` in the test decorator and rerun with a different JSON path.
+
+### GPQA (Diamond, Low Effort)
+```bash
+cd python-sdk
+pytest --ep-max-rows=all --ep-print-summary \
+  --ep-summary-json=outputs/gpqa_low.json \
+  -q examples/gpqa/tests/test_evaluation.py -q
+```
+Adjust repeats/concurrency/effort in the test decorator similarly to AIME.
+
+## Pass/Fail Signals
+- If `threshold_of_success` is set in a test, the test fails when the aggregated score is below the threshold.
+- Summary printing and artifact writing happen before the threshold check, so they occur either way; without a threshold, the run succeeds for CI.
+
+## Tips
+- Use `--ep-max-rows` for toggling quick checks vs full evaluations without editing tests.
+- Upload JSON artifacts for dashboards and historical comparisons.
+- Keep concurrency conservative (e.g., 4) to avoid rate limiting.
diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -2,6 +2,7 @@
 from typing import List
 
 from litellm import acompletion
+import litellm
 from openai.types.chat.chat_completion_message import ChatCompletionMessageToolCall
 
 from eval_protocol.dataset_logger import default_logger
@@ -14,6 +15,15 @@ async def default_single_turn_rollout_processor(
 ) -> List[EvaluationRow]:
     """Generate a single response from any supported model provider using LiteLLM."""
 
+    # Explicitly disable LiteLLM caching to avoid reused responses across runs
+    try:
+        litellm.cache = None
+        # Some versions expose a helper; ignore if unavailable
+        if hasattr(litellm, "disable_cache"):
+            litellm.disable_cache()  # type: ignore[call-arg]
+    except Exception:
+        pass
+
     async def process_row(row: EvaluationRow) -> EvaluationRow:
         """Process a single row asynchronously."""
         if len(row.messages) == 0:
@@ -22,6 +32,11 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
         messages_payload = [{"role": m.role, "content": m.content} for m in row.messages]
 
         request_params = {"model": config.model, "messages": messages_payload, **config.input_params}
+        # Allow passing reasoning effort to Fireworks via LiteLLM using extra_body
+        # Expected: config.input_params may contain {"reasoning": {"effort": "low|medium|high"}}
+        if "reasoning" in config.input_params:
+            request_params.setdefault("extra_body", {})
+            request_params["extra_body"]["reasoning"] = config.input_params["reasoning"]
 
         if row.tools is not None:
             request_params["tools"] = row.tools
@@ -57,8 +72,15 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
         default_logger.log(row)
         return row
 
-    # Process all rows concurrently
-    tasks = [process_row(row) for row in rows]
+    # Process rows with bounded concurrency if configured
+    max_concurrent = getattr(config, "max_concurrent_rollouts", 8) or 8
+    semaphore = asyncio.Semaphore(max_concurrent)
+
+    async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow:
+        async with semaphore:
+            return await process_row(r)
+
+    tasks = [_sem_wrapper(row) for row in rows]
     dataset = list(await asyncio.gather(*tasks))
 
     return dataset
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
@@ -1,6 +1,8 @@
 import inspect
 import os
-import os
+import copy
+import math
+import statistics
 from typing import Any, Callable, Dict, List, Optional
 
 import pytest
@@ -91,11 +93,11 @@ def decorator(
         if mode == "pointwise":
             # Pointwise mode: function should accept messages and other row-level params
             if "row" not in sig.parameters:
-                raise ValueError(f"In pointwise mode, your eval function must have a parameter named 'row'")
+                raise ValueError("In pointwise mode, your eval function must have a parameter named 'row'")
 
             # validate that "Row" is of type EvaluationRow
             if sig.parameters["row"].annotation is not EvaluationRow:
-                raise ValueError(f"In pointwise mode, the 'row' parameter must be of type EvaluationRow")
+                raise ValueError("In pointwise mode, the 'row' parameter must be of type EvaluationRow")
 
             # validate that the function has a return type of EvaluationRow
             if sig.return_annotation is not EvaluationRow:
@@ -107,7 +109,7 @@ def decorator(
 
             # validate that "Rows" is of type List[EvaluationRow]
             if sig.parameters["rows"].annotation is not List[EvaluationRow]:
-                raise ValueError(f"In batch mode, the 'rows' parameter must be of type List[EvaluationRow]")
+                raise ValueError("In batch mode, the 'rows' parameter must be of type List[EvaluationRow")
 
             # validate that the function has a return type of List[EvaluationRow]
             if sig.return_annotation is not List[EvaluationRow]:
@@ -150,7 +152,13 @@ def generate_combinations():
             combinations = []
 
             # Handle optional parameters with defaults
-            datasets: List[Optional[DatasetPathParam]] = input_dataset if input_dataset is not None else [None]  # type: ignore
+            # Treat multiple dataset paths as a single combined dataset rather than
+            # parameterizing over each path separately. This produces one summary
+            # that reflects the aggregate of all provided files (e.g., AIME I+II).
+            if input_dataset is not None:
+                datasets: List[Optional[List[DatasetPathParam]]] = [input_dataset]  # type: ignore
+            else:
+                datasets = [None]
             params: List[Optional[RolloutInputParam]] = rollout_input_params if rollout_input_params is not None else [None]  # type: ignore
             # Apply EP_MAX_DATASET_ROWS to input_messages to uniformly control row count when messages are provided
             if input_messages is not None and isinstance(input_messages, list):
@@ -222,7 +230,15 @@ def wrapper_body(**kwargs):
                     # Handle dataset loading
                     data: List[EvaluationRow] = []
                     if "dataset_path" in kwargs and kwargs["dataset_path"] is not None:
-                        data_jsonl = load_jsonl(kwargs["dataset_path"])
+                        ds_arg = kwargs["dataset_path"]
+                        # Support either a single path or a list of paths; if a list is provided,
+                        # concatenate the rows from each file in order.
+                        if isinstance(ds_arg, list):
+                            data_jsonl = []
+                            for p in ds_arg:
+                                data_jsonl.extend(load_jsonl(p))
+                        else:
+                            data_jsonl = load_jsonl(ds_arg)
                         # Apply env override for max rows if present
                         effective_max_rows = _parse_ep_max_rows(max_dataset_rows)
                         if effective_max_rows is not None:
@@ -270,7 +286,7 @@ def wrapper_body(**kwargs):
                         row.pid = os.getpid()
                         default_logger.log(row)
 
-                    # Now run the rollout processor with metadata-initialized data
+                    # Prepare rollout processor config once; we will generate fresh outputs per run
                     config = RolloutProcessorConfig(
                         model=model_name,
                         input_params=input_params,
@@ -279,9 +295,12 @@ def wrapper_body(**kwargs):
                         server_script_path=server_script_path,
                         steps=steps,
                     )
-                    input_dataset = execute_function(rollout_processor, rows=data, config=config)
 
                     for _ in range(num_runs):
+                        # Regenerate outputs each run by deep-copying the pristine dataset
+                        # so model responses are not reused across runs.
+                        fresh_rows = [copy.deepcopy(r) for r in data]
+                        input_dataset = execute_function(rollout_processor, rows=fresh_rows, config=config)
                         if mode == "pointwise":
                             # Pointwise mode: apply the evaluator function to each row
                             for row in input_dataset:
@@ -323,6 +342,23 @@ def wrapper_body(**kwargs):
                     scores = [r.evaluation_result.score for r in all_results if r.evaluation_result]
                     agg_score = aggregate(scores, aggregation_method)
 
+                    # Compute 95% confidence interval for mean aggregation
+                    # TODO bchen: remove after Derek has his stuff
+                    ci_low: float | None = None
+                    ci_high: float | None = None
+                    if aggregation_method == "mean":
+                        n = len(scores)
+                        if n >= 2:
+                            try:
+                                sample_std = statistics.stdev(scores)
+                                se = sample_std / math.sqrt(n)
+                                margin = 1.96 * se
+                                ci_low = float(max(0.0, (agg_score or 0.0) - margin)) if agg_score is not None else None
+                                ci_high = float(min(1.0, (agg_score or 0.0) + margin)) if agg_score is not None else None
+                            except Exception:
+                                ci_low = None
+                                ci_high = None
+
                     # Determine if the evaluation passed based on threshold
                     passed = None
                     if threshold_of_success is not None:
@@ -335,6 +371,86 @@ def wrapper_body(**kwargs):
                             r.eval_metadata.passed = passed
                         default_logger.log(r)
 
+                    # Optional: print and/or persist a summary artifact for CI
+                    try:
+                        should_print = os.getenv("EP_PRINT_SUMMARY") == "1"
+                        summary_path = os.getenv("EP_SUMMARY_JSON")
+                        suite_name = test_func.__name__
+                        model_used = model_name
+                        total_rows = len(all_results)
+                        summary_obj = {
+                            "suite": suite_name,
+                            "model": model_used,
+                            "agg_score": float(agg_score) if agg_score is not None else None,
+                            "num_runs": num_runs,
+                            "rows": total_rows,
+                        }
+                        if ci_low is not None and ci_high is not None:
+                            summary_obj["agg_ci_low"] = ci_low
+                            summary_obj["agg_ci_high"] = ci_high
+
+                        # Aggregate per-metric mean and 95% CI when available
+                        metrics_summary: Dict[str, Dict[str, float]] = {}
+                        from collections import defaultdict
+                        metric_scores: Dict[str, list] = defaultdict(list)
+                        for r in all_results:
+                            if r.evaluation_result and r.evaluation_result.metrics:
+                                for m_name, m_res in r.evaluation_result.metrics.items():
+                                    if m_res is not None and getattr(m_res, "score", None) is not None:
+                                        metric_scores[m_name].append(m_res.score)
+                        for m_name, vals in metric_scores.items():
+                            if len(vals) == 0:
+                                continue
+                            m_mean = sum(vals) / len(vals)
+                            m_low = None
+                            m_high = None
+                            if len(vals) >= 2:
+                                try:
+                                    m_std = statistics.stdev(vals)
+                                    m_se = m_std / math.sqrt(len(vals))
+                                    m_margin = 1.96 * m_se
+                                    m_low = max(0.0, m_mean - m_margin)
+                                    m_high = min(1.0, m_mean + m_margin)
+                                except Exception:
+                                    m_low = None
+                                    m_high = None
+                            entry: Dict[str, float] = {"mean": float(m_mean)}
+                            if m_low is not None and m_high is not None:
+                                entry["ci_low"] = float(m_low)
+                                entry["ci_high"] = float(m_high)
+                            metrics_summary[m_name] = entry
+                        if metrics_summary:
+                            summary_obj["metrics_agg"] = metrics_summary
+                        if should_print:
+                            if ci_low is not None and ci_high is not None:
+                                print(
+                                    f"EP Summary | suite={suite_name} model={model_used} agg={summary_obj['agg_score']:.3f} ci95=[{ci_low:.3f},{ci_high:.3f}] runs={num_runs} rows={total_rows}"
+                                )
+                            else:
+                                print(
+                                    f"EP Summary | suite={suite_name} model={model_used} agg={summary_obj['agg_score']:.3f} runs={num_runs} rows={total_rows}"
+                                )
+                            # Print per-metric aggregations concisely (only names present)
+                            if metrics_summary:
+                                parts = []
+                                for m_name, entry in metrics_summary.items():
+                                    if "ci_low" in entry and "ci_high" in entry:
+                                        parts.append(f"{m_name}={entry['mean']:.3f} ci95=[{entry['ci_low']:.3f},{entry['ci_high']:.3f}]")
+                                    else:
+                                        parts.append(f"{m_name}={entry['mean']:.3f}")
+                                print(f"EP Metrics | " + ", ".join(parts))
+                        if summary_path:
+                            import json, pathlib, time
+
+                            p = pathlib.Path(summary_path)
+                            p.parent.mkdir(parents=True, exist_ok=True)
+                            summary_obj["timestamp"] = int(time.time())
+                            with p.open("w", encoding="utf-8") as f:
+                                json.dump(summary_obj, f)
+                    except Exception:
+                        # Do not fail evaluation if summary writing fails
+                        pass
+
                     # Check threshold after logging
                     if threshold_of_success is not None and not passed:
                         assert (
diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py
@@ -29,6 +29,22 @@ def pytest_addoption(parser: pytest.Parser) -> None:
             "Pass an integer (e.g., 2, 50) or 'all' for no limit."
         ),
     )
+    group.addoption(
+        "--ep-print-summary",
+        action="store_true",
+        default=False,
+        help=(
+            "Print a concise summary line (suite/model/effort/agg score) at the end of each evaluation_test."
+        ),
+    )
+    group.addoption(
+        "--ep-summary-json",
+        action="store",
+        default=None,
+        help=(
+            "Write a JSON summary artifact at the given path (e.g., ./outputs/aime_low.json)."
+        ),
+    )
 
 
 def _normalize_max_rows(val: Optional[str]) -> Optional[str]:
@@ -51,4 +67,11 @@ def pytest_configure(config: pytest.Config) -> None:
     if norm is not None:
         os.environ["EP_MAX_DATASET_ROWS"] = norm
 
+    if config.getoption("--ep-print-summary"):
+        os.environ["EP_PRINT_SUMMARY"] = "1"
+
+    summary_json_path = config.getoption("--ep-summary-json")
+    if summary_json_path:
+        os.environ["EP_SUMMARY_JSON"] = summary_json_path
+
 
diff --git a/examples/aime2025_chat_completion/tests/test_evaluation.py b/examples/aime2025_chat_completion/tests/test_evaluation.py
@@ -61,12 +61,13 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
         "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
     ],
     dataset_adapter=aime2025_dataset_adapter,
-    rollout_input_params=[{"temperature": 0.0, "max_tokens": 1024}],
+    rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}, {}, {"extra_body": {"reasoning_effort": "high"}}],
     rollout_processor=default_single_turn_rollout_processor,
     aggregation_method="mean",
     threshold_of_success=None,
-    num_runs=1,
+    num_runs=2,
     max_dataset_rows=2,
+    max_concurrent_rollouts=4,
     mode="pointwise",
 )
 def test_aime2025_pointwise(row: EvaluationRow) -> EvaluationRow: