Add dataset adapter support in evaluation_test and new test cases (#83)

Dylan Huang · web-flow · commit 6725f565dcf4 · 2025-07-31T22:03:28.000-07:00
- Add helper function `gsm8k_to_evaluation_row` that transforms GSM8K dataset entries into evaluation rows.
diff --git a/eval_protocol/pytest_utils.py b/eval_protocol/pytest_utils.py
@@ -74,6 +74,7 @@ def evaluation_test(
     model: List[ModelParam],
     input_messages: Optional[List[InputMessagesParam]] = None,
     input_dataset: Optional[List[DatasetPathParam]] = None,
+    dataset_adapter: Optional[Callable[[List[Dict[str, Any]]], Dataset]] = lambda x: x,
     input_params: Optional[List[InputParam]] = None,
     rollout_processor: Callable[
         [EvaluationRow, ModelParam, InputParam], List[EvaluationRow]
@@ -90,8 +91,13 @@ def evaluation_test(
 
     Args:
         model: Model identifiers to query.
-        input_messages: Messages to send to the model.
-        input_dataset: Paths to JSONL datasets.
+        input_messages: Messages to send to the model. This is useful if you
+            don't have a dataset but can hard-code the messages.
+        input_dataset: Paths to JSONL datasets. This is useful if you have a
+            dataset already. Provide a dataset_adapter to convert the input dataset
+            to a list of EvaluationRows if you have a custom dataset format.
+        dataset_adapter: Function to convert the loaded dataset rows into a list of EvaluationRows.
+            Useful for custom dataset formats; defaults to the identity function (rows pass through as-is).
         input_params: Generation parameters for the model.
         rollout_processor: Function used to perform the rollout.
         aggregation_method: How to aggregate scores across rows.
@@ -240,16 +246,9 @@ def wrapper_body(**kwargs):
                     data = load_jsonl(kwargs["dataset_path"])
                     if max_dataset_rows is not None:
                         data = data[:max_dataset_rows]
-                    input_dataset = []
-                    for entry in data:
-                        user_query = entry.get("user_query") or entry.get("prompt")
-                        if not user_query:
-                            continue
-                        messages = [Message(role="user", content=user_query)]
-                        row = EvaluationRow(
-                            messages=messages,
-                            ground_truth=entry.get("ground_truth_for_eval"),
-                        )
+                    data = dataset_adapter(data)
+                    input_dataset: List[EvaluationRow] = []
+                    for row in data:
                         processed = rollout_processor(
                             row, model=model_name, input_params=kwargs.get("input_params") or {}
                         )
diff --git a/tests/pytest/helper/gsm8k_to_evaluation_row.py b/tests/pytest/helper/gsm8k_to_evaluation_row.py
@@ -0,0 +1,12 @@
+from typing import Any, Dict, List
+
+from eval_protocol.models import EvaluationRow, Message
+
+
+def gsm8k_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
+    return [
+        EvaluationRow(
+            messages=[Message(role="user", content=row["user_query"])], ground_truth=row["ground_truth_for_eval"]
+        )
+        for row in data
+    ]
diff --git a/tests/pytest/test_pytest_async.py b/tests/pytest/test_pytest_async.py
diff --git a/tests/pytest/test_pytest_input_messages.py b/tests/pytest/test_pytest_input_messages.py
diff --git a/tests/pytest/test_pytest_math_example.py b/tests/pytest/test_pytest_math_example.py
@@ -1,9 +1,11 @@
 from eval_protocol.pytest_utils import evaluate, evaluation_test
 from examples.math_example.main import evaluate as math_evaluate
+from tests.pytest.helper.gsm8k_to_evaluation_row import gsm8k_to_evaluation_row
 
 
 @evaluation_test(
     input_dataset=["development/gsm8k_sample.jsonl"],
+    dataset_adapter=gsm8k_to_evaluation_row,
     model=["accounts/fireworks/models/kimi-k2-instruct"],
     input_params=[{"temperature": 0.0}],
     max_dataset_rows=5,
diff --git a/tests/pytest/test_pytest_math_format_length.py b/tests/pytest/test_pytest_math_format_length.py
@@ -1,9 +1,11 @@
 from eval_protocol.pytest_utils import evaluate, evaluation_test
 from examples.math_with_format_and_length.main import evaluate as math_fl_evaluate
+from tests.pytest.helper.gsm8k_to_evaluation_row import gsm8k_to_evaluation_row
 
 
 @evaluation_test(
     input_dataset=["development/gsm8k_sample.jsonl"],
+    dataset_adapter=gsm8k_to_evaluation_row,
     model=["accounts/fireworks/models/kimi-k2-instruct"],
     input_params=[{"temperature": 0.0}],
     max_dataset_rows=5,