Fix ResultLogger dropping status and error fields (#38)

cemde · web-flow · commit a4ec486ccdbf · 2026-03-03T15:27:35.000Z
* Fix ResultLogger._filter_report() dropping status and error fields
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -95,6 +95,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+- `ResultLogger._filter_report()` now includes `status` and `error` fields in persisted results, so saved logs can distinguish successful runs from infrastructure failures. Report schema is now consistent across success and failure paths (`error` is always present, `None` on success). (PR: #38)
 - GAIA2: Various fixes for faithful reproduction of ARE reference results — scenario lifecycle, data loading, evaluation flow, multi-turn notification handling, tool filtering, default agent fidelity, and simulation time management (PR: #30)
 - MultiAgentBench: Corrected domain mappings, added missing werewolf/minecraft support, fixed environment constructors, added result summarization matching MARBLE's evaluation pipeline (PR: #30)
 - Tau2: Fixed telecom domain schema to match tau2-bench, added agent/user state synchronization and deterministic network simulation, fixed initialization flow and tool result serialization (PR: #30)
diff --git a/maseval/core/benchmark.py b/maseval/core/benchmark.py
@@ -1226,20 +1226,17 @@ def _execute_task_repetition(
             # Task execution failed, so skip evaluation
             eval_results = None
 
-        # 5. Build report
+        # 5. Build report — all keys always present for consistent schema
         report: Dict[str, Any] = {
             "task_id": str(task.id),
             "repeat_idx": repeat_idx,
             "status": execution_status.value,
+            "error": error_info,
             "traces": execution_traces,
             "config": execution_configs,
             "eval": eval_results,
         }
 
-        # Add error info if present
-        if error_info is not None:
-            report["error"] = error_info
-
         # Clear registry after task repetition completes
         self.clear_registry()
 
diff --git a/maseval/core/callbacks/result_logger.py b/maseval/core/callbacks/result_logger.py
@@ -160,6 +160,8 @@ def _filter_report(self, report: Dict) -> Dict:
         filtered = {
             "task_id": report.get("task_id"),
             "repeat_idx": report.get("repeat_idx"),
+            "status": report.get("status"),
+            "error": report.get("error"),
         }
 
         if self.include_traces and "traces" in report:
diff --git a/tests/test_core/test_benchmark/test_benchmark_lifecycle.py b/tests/test_core/test_benchmark/test_benchmark_lifecycle.py
@@ -562,7 +562,7 @@ def test_successful_task_has_success_status(self):
 
         assert len(reports) == 1
         assert reports[0]["status"] == TaskExecutionStatus.SUCCESS.value
-        assert "error" not in reports[0]
+        assert reports[0]["error"] is None
         assert reports[0]["eval"] is not None
 
     def test_default_failure_flags(self):
diff --git a/tests/test_core/test_callbacks/test_result_logger.py b/tests/test_core/test_callbacks/test_result_logger.py
@@ -134,6 +134,46 @@ def test_filter_report_all_included(self):
         assert "config" in filtered
         assert "eval" in filtered
 
+    def test_filter_report_preserves_status_and_error(self):
+        """Test that status and error fields are always included in filtered reports.
+
+        These are core metadata fields (like task_id and repeat_idx) that must
+        always be present so persisted results can distinguish successes from failures.
+        """
+        logger = MockResultLogger(include_traces=False, include_config=False, include_eval=False)
+
+        report = {
+            "task_id": "task_0",
+            "repeat_idx": 0,
+            "status": "agent_error",
+            "error": {"type": "AgentError", "message": "Tool call failed"},
+            "traces": {"agent": "trace_data"},
+            "config": {"model": "gpt-4"},
+            "eval": {"score": 0.0},
+        }
+
+        filtered = logger._filter_report(report)
+
+        assert filtered["status"] == "agent_error"
+        assert filtered["error"] == {"type": "AgentError", "message": "Tool call failed"}
+        assert "traces" not in filtered
+        assert "config" not in filtered
+        assert "eval" not in filtered
+
+    def test_filter_report_status_and_error_absent(self):
+        """Test that missing status/error fields result in None values."""
+        logger = MockResultLogger()
+
+        report = {
+            "task_id": "task_0",
+            "repeat_idx": 0,
+        }
+
+        filtered = logger._filter_report(report)
+
+        assert filtered["status"] is None
+        assert filtered["error"] is None
+
     def test_filter_report_partial_included(self):
         """Test report filtering with only some fields included."""
         logger = MockResultLogger(include_traces=False, include_config=True, include_eval=False)

Original file line number	Diff line number	Diff line change
`@@ -160,6 +160,8 @@ def _filter_report(self, report: Dict) -> Dict:`
`160`	`160`	`filtered = {`
`161`	`161`	`"task_id": report.get("task_id"),`
`162`	`162`	`"repeat_idx": report.get("repeat_idx"),`
	`163`	`+ "status": report.get("status"),`
	`164`	`+ "error": report.get("error"),`
`163`	`165`	`}`
`164`	`166`
`165`	`167`	`if self.include_traces and "traces" in report:`