Added task fields (query, metadata, protocol) to the report in Benchmark, with opt-in `include_task` logging in the result loggers.

cemde · cemde · commit acffbbb24645 · 2026-03-12T00:52:05.000+01:00
diff --git a/maseval/core/benchmark.py b/maseval/core/benchmark.py
@@ -1235,6 +1235,11 @@ def _execute_task_repetition(
             "traces": execution_traces,
             "config": execution_configs,
             "eval": eval_results,
+            "task": {
+                "query": task.query,
+                "metadata": dict(task.metadata),
+                "protocol": task.protocol.to_dict(),
+            },
         }
 
         # Clear registry after task repetition completes
diff --git a/maseval/core/callbacks/result_logger.py b/maseval/core/callbacks/result_logger.py
@@ -62,6 +62,7 @@ def __init__(
         include_traces: bool = True,
         include_config: bool = True,
         include_eval: bool = True,
+        include_task: bool = False,
         validate_on_completion: bool = True,
     ):
         """Initialize the result logger.
@@ -70,12 +71,15 @@ def __init__(
             include_traces: If True, include execution traces in logged results
             include_config: If True, include configuration in logged results
             include_eval: If True, include evaluation results in logged results
+            include_task: If True, include task data (query, metadata, protocol)
+                in logged results
             validate_on_completion: If True, validate all iterations were logged at end
         """
         super().__init__()
         self.include_traces = include_traces
         self.include_config = include_config
         self.include_eval = include_eval
+        self.include_task = include_task
         self.validate_on_completion = validate_on_completion
 
         # Tracking for validation
@@ -173,6 +177,9 @@ def _filter_report(self, report: Dict) -> Dict:
         if self.include_eval and "eval" in report:
             filtered["eval"] = report["eval"]
 
+        if self.include_task and "task" in report:
+            filtered["task"] = report["task"]
+
         return filtered
 
     def _report_validation_errors(self) -> None:
@@ -306,6 +313,7 @@ def __init__(
         include_traces: bool = True,
         include_config: bool = True,
         include_eval: bool = True,
+        include_task: bool = False,
         validate_on_completion: bool = True,
     ):
         """Initialize the file logger.
@@ -322,12 +330,15 @@ def __init__(
             include_traces: If True, include execution traces in logged results
             include_config: If True, include configuration in logged results
             include_eval: If True, include evaluation results in logged results
+            include_task: If True, include task data (query, metadata, protocol)
+                in logged results
             validate_on_completion: If True, validate all iterations were logged
         """
         super().__init__(
             include_traces=include_traces,
             include_config=include_config,
             include_eval=include_eval,
+            include_task=include_task,
             validate_on_completion=validate_on_completion,
         )
 
@@ -518,6 +529,7 @@ def _write_metadata(self) -> None:
             "include_traces": self.include_traces,
             "include_config": self.include_config,
             "include_eval": self.include_eval,
+            "include_task": self.include_task,
             "validation_enabled": self.validate_on_completion,
         }
 
diff --git a/maseval/core/task.py b/maseval/core/task.py
@@ -52,6 +52,20 @@ class TaskProtocol:
     priority: int = 0
     tags: Dict[str, Any] = field(default_factory=dict)
 
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to a plain dictionary intended for JSON serialization.
+
+        Returns:
+            All fields; enum as ``.value``, ``tags`` shallow-copied with values unconverted.
+        """
+        return {
+            "timeout_seconds": self.timeout_seconds,
+            "timeout_action": self.timeout_action.value,  # enum member -> its underlying value
+            "max_retries": self.max_retries,
+            "priority": self.priority,
+            "tags": dict(self.tags),  # shallow copy: nested tag values stay shared with the protocol
+        }
+
 
 class FrozenDict(dict):
     """A dict subclass that raises ``TaskFrozenError`` on any mutation attempt.
diff --git a/tests/test_core/test_callbacks/test_result_logger.py b/tests/test_core/test_callbacks/test_result_logger.py
@@ -174,6 +174,43 @@ def test_filter_report_status_and_error_absent(self):
         assert filtered["status"] is None
         assert filtered["error"] is None
 
+    def test_filter_report_includes_task_when_enabled(self):
+        """Test that ``_filter_report`` keeps the ``task`` entry when include_task is True."""
+        logger = MockResultLogger(include_task=True)
+
+        report = {
+            "task_id": "task_0",
+            "repeat_idx": 0,
+            "traces": {},
+            "config": {},
+            "eval": {},
+            "task": {
+                "query": "What is 2+2?",
+                "metadata": {"difficulty": "easy"},
+                "protocol": {"timeout_seconds": None, "timeout_action": "skip", "max_retries": 0, "priority": 0, "tags": {}},
+            },
+        }
+
+        filtered = logger._filter_report(report)
+
+        assert "task" in filtered
+        assert filtered["task"]["query"] == "What is 2+2?"
+        assert filtered["task"]["metadata"] == {"difficulty": "easy"}  # NOTE(review): "protocol" is not asserted
+
+    def test_filter_report_excludes_task_by_default(self):
+        """Test that ``_filter_report`` drops the ``task`` entry for a logger built with default arguments."""
+        logger = MockResultLogger()
+
+        report = {
+            "task_id": "task_0",
+            "repeat_idx": 0,
+            "task": {"query": "What is 2+2?", "metadata": {}, "protocol": {}},
+        }
+
+        filtered = logger._filter_report(report)
+
+        assert "task" not in filtered
+
     def test_filter_report_partial_included(self):
         """Test report filtering with only some fields included."""
         logger = MockResultLogger(include_traces=False, include_config=True, include_eval=False)
diff --git a/tests/test_core/test_task_protocol.py b/tests/test_core/test_task_protocol.py
@@ -67,6 +67,46 @@ def test_tags_isolation(self):
 
         assert "key" not in p2.tags
 
+    def test_to_dict_defaults(self):
+        """to_dict on a default-constructed protocol should return every field with its default value."""
+        protocol = TaskProtocol()
+        result = protocol.to_dict()
+
+        assert result == {
+            "timeout_seconds": None,
+            "timeout_action": "skip",
+            "max_retries": 0,
+            "priority": 0,
+            "tags": {},
+        }
+
+    def test_to_dict_custom_values(self):
+        """to_dict should return custom field values, with the enum member given as its string value."""
+        protocol = TaskProtocol(
+            timeout_seconds=60.0,
+            timeout_action=TimeoutAction.RETRY,
+            max_retries=3,
+            priority=10,
+            tags={"category": "hard"},
+        )
+        result = protocol.to_dict()
+
+        assert result == {
+            "timeout_seconds": 60.0,
+            "timeout_action": "retry",
+            "max_retries": 3,
+            "priority": 10,
+            "tags": {"category": "hard"},
+        }
+
+    def test_to_dict_returns_new_dict(self):
+        """to_dict should copy ``tags`` so that editing the returned dict leaves the protocol unchanged."""
+        protocol = TaskProtocol(tags={"key": "value"})
+        result = protocol.to_dict()
+
+        result["tags"]["key"] = "modified"  # top-level key only; nested tag values are not exercised
+        assert protocol.tags["key"] == "value"
+
 
 @pytest.mark.core
 class TestTaskWithProtocol:
diff --git a/uv.lock b/uv.lock