Refactor evaluation_test function to enforce parameter type validation for pointwise and batch modes, updating tests to use 'rows' instead of 'input_dataset' for consistency.

Dylan Huang · Dylan Huang · commit 1722e0f0bd63 · 2025-08-03T12:39:03.000-07:00
diff --git a/eval_protocol/pytest/pytest_utils.py b/eval_protocol/pytest/pytest_utils.py
@@ -204,15 +204,21 @@ def decorator(
             if "row" not in sig.parameters:
                 raise ValueError(f"In pointwise mode, your eval function must have a parameter named 'row'")
 
+            # validate that the 'row' parameter is annotated as EvaluationRow
+            if sig.parameters["row"].annotation is not EvaluationRow:
+                raise ValueError(f"In pointwise mode, the 'row' parameter must be of type EvaluationRow")
+
             # validate that the function has a return type of EvaluationRow
             if sig.return_annotation is not EvaluationRow:
                 raise ValueError("In pointwise mode, your eval function must return an EvaluationRow instance")
         else:
             # Batch mode: function should accept input_dataset and model
-            if "input_dataset" not in sig.parameters:
-                raise ValueError("In batch mode, your eval function must have a parameter named 'input_dataset'")
-            if "model" not in sig.parameters:
-                raise ValueError("In batch mode, your eval function must have a parameter named 'model'")
+            if "rows" not in sig.parameters:
+                raise ValueError("In batch mode, your eval function must have a parameter named 'rows'")
+
+            # validate that the 'rows' parameter is annotated as List[EvaluationRow]
+            if sig.parameters["rows"].annotation is not List[EvaluationRow]:
+                raise ValueError(f"In batch mode, the 'rows' parameter must be of type List[EvaluationRow]")
 
             # validate that the function has a return type of List[EvaluationRow]
             if sig.return_annotation is not List[EvaluationRow]:
@@ -227,7 +233,7 @@ def execute_with_params(
         ):
             kwargs = {}
             if input_dataset is not None:
-                kwargs["input_dataset"] = list(input_dataset)
+                kwargs["rows"] = input_dataset
             if input_params is not None:
                 kwargs["input_params"] = input_params
             if model is not None:
diff --git a/tests/pytest/test_markdown_highlighting.py b/tests/pytest/test_markdown_highlighting.py
@@ -4,7 +4,6 @@
 This test demonstrates how to check if model responses contain the required number of highlighted sections.
 """
 
-import json
 import re
 from typing import Any, Dict, List, Optional
 
@@ -69,8 +68,8 @@ def markdown_format_evaluate(messages: List[Message], ground_truth: Optional[str
     rollout_processor=default_single_turn_rollout_processor,
     num_runs=1,
 )
-def test_markdown_highlighting_evaluation(input_dataset, input_params, model) -> List[EvaluationRow]:
+def test_markdown_highlighting_evaluation(rows: List[EvaluationRow]) -> List[EvaluationRow]:
     """
     Test markdown highlighting validation using batch mode with evaluate().
     """
-    return evaluate(input_dataset, markdown_format_evaluate)
+    return evaluate(rows, markdown_format_evaluate)
diff --git a/tests/pytest/test_pytest_async.py b/tests/pytest/test_pytest_async.py
@@ -1,21 +1,21 @@
 from typing import List
 
-from eval_protocol.models import EvaluationRow
+from eval_protocol.models import EvaluationRow, Message
 from eval_protocol.pytest import evaluation_test
 from examples.math_example.main import evaluate as math_evaluate
 
 
 @evaluation_test(
     input_messages=[
         [
-            {"role": "user", "content": "What is the capital of France?"},
+            Message(role="user", content="What is the capital of France?"),
         ],
         [
-            {"role": "user", "content": "What is the capital of the moon?"},
+            Message(role="user", content="What is the capital of the moon?"),
         ],
     ],
     model=["accounts/fireworks/models/kimi-k2-instruct"],
 )
-async def test_pytest_async(input_dataset: List[EvaluationRow], model) -> List[EvaluationRow]:
+async def test_pytest_async(rows: List[EvaluationRow]) -> List[EvaluationRow]:
     """Run math evaluation on sample dataset using pytest interface."""
-    return input_dataset
+    return rows
diff --git a/tests/pytest/test_pytest_default_agent_rollout_processor.py b/tests/pytest/test_pytest_default_agent_rollout_processor.py
@@ -1,24 +1,24 @@
 from datetime import datetime
 from typing import List
 
-from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.models import Message, EvaluationRow
 from eval_protocol.pytest import default_agent_rollout_processor, evaluation_test
 
 
 @evaluation_test(
     input_messages=[
         [
-            {
-                "role": "user",
-                "content": "Can you give a summary of the past week in the 'general, model-requests, bug-reports, questions, and feature-requests' channels. For EVERY message or thread has not been resolved, please list them at the end of your response in a table. Be sure to include the exact message, severity, and current status so far. Current Date & Time: {current_date_time}".format(
+            Message(
+                role="user",
+                content="Can you give a summary of the past week in the 'general, model-requests, bug-reports, questions, and feature-requests' channels. For EVERY message or thread has not been resolved, please list them at the end of your response in a table. Be sure to include the exact message, severity, and current status so far. Current Date & Time: {current_date_time}".format(
                     current_date_time=datetime.now().strftime("%B %d, %Y at %I:%M %p")
                 ),
-            }
+            )
         ]
     ],
     rollout_processor=default_agent_rollout_processor,
     model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"],
 )
-def test_pytest_default_agent_rollout_processor(input_dataset: List[EvaluationRow], model) -> List[EvaluationRow]:
+def test_pytest_default_agent_rollout_processor(rows: List[EvaluationRow]) -> List[EvaluationRow]:
     """Run math evaluation on sample dataset using pytest interface."""
-    return input_dataset
+    return rows
diff --git a/tests/pytest/test_pytest_input_messages.py b/tests/pytest/test_pytest_input_messages.py
@@ -1,18 +1,18 @@
 from typing import List
 
-from eval_protocol.models import EvaluationRow
+from eval_protocol.models import Message, EvaluationRow
 from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
 
 
 @evaluation_test(
     input_messages=[
         [
-            {"role": "user", "content": "What is the capital of France?"},
+            Message(role="user", content="What is the capital of France?"),
         ]
     ],
     model=["accounts/fireworks/models/kimi-k2-instruct"],
     rollout_processor=default_single_turn_rollout_processor,
 )
-def test_input_messages_in_decorator(input_dataset: List[EvaluationRow], model) -> List[EvaluationRow]:
+def test_input_messages_in_decorator(rows: List[EvaluationRow]) -> List[EvaluationRow]:
     """Run math evaluation on sample dataset using pytest interface."""
-    return input_dataset
+    return rows
diff --git a/tests/pytest/test_pytest_math_example.py b/tests/pytest/test_pytest_math_example.py
@@ -14,6 +14,6 @@
     threshold_of_success=0.0,
     rollout_processor=default_single_turn_rollout_processor,
 )
-def test_math_dataset(input_dataset, input_params, model) -> List[EvaluationRow]:
+def test_math_dataset(rows: List[EvaluationRow]) -> List[EvaluationRow]:
     """Run math evaluation on sample dataset using pytest interface."""
-    return evaluate(input_dataset, math_evaluate)
+    return evaluate(rows, math_evaluate)
diff --git a/tests/pytest/test_pytest_math_format_length.py b/tests/pytest/test_pytest_math_format_length.py
@@ -14,6 +14,6 @@
     threshold_of_success=0.0,
     rollout_processor=default_single_turn_rollout_processor,
 )
-def test_math_format_length_dataset(input_dataset, input_params, model) -> List[EvaluationRow]:
+def test_math_format_length_dataset(rows: List[EvaluationRow]) -> List[EvaluationRow]:
     """Run math with format and length evaluation on sample dataset."""
-    return evaluate(input_dataset, math_fl_evaluate)
+    return evaluate(rows, math_fl_evaluate)
diff --git a/tests/pytest/test_pytest_word_count_example.py b/tests/pytest/test_pytest_word_count_example.py
@@ -1,4 +1,3 @@
-from typing import List
 from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
 from eval_protocol.models import EvaluateResult, MetricResult, EvaluationRow
 from tests.pytest.helper.word_count_to_evaluation_row import word_count_to_evaluation_row

Original file line number	Diff line number	Diff line change
`@@ -14,6 +14,6 @@`
`14`	`14`	`threshold_of_success=0.0,`
`15`	`15`	`rollout_processor=default_single_turn_rollout_processor,`
`16`	`16`	`)`
`17`		`-def test_math_dataset(input_dataset, input_params, model) -> List[EvaluationRow]:`
	`17`	`+def test_math_dataset(rows: List[EvaluationRow]) -> List[EvaluationRow]:`
`18`	`18`	`"""Run math evaluation on sample dataset using pytest interface."""`
`19`		`- return evaluate(input_dataset, math_evaluate)`
	`19`	`+ return evaluate(rows, math_evaluate)`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,3 @@`
`1`		`-from typing import List`
`2`	`1`	`from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test`
`3`	`2`	`from eval_protocol.models import EvaluateResult, MetricResult, EvaluationRow`
`4`	`3`	`from tests.pytest.helper.word_count_to_evaluation_row import word_count_to_evaluation_row`