Fix ResultLogger._filter_report() dropping status and error fields

cemde · cemde · commit fdc889712726 · 2026-03-03T14:22:44.000Z
The _filter_report() method copied only task_id and repeat_idx into the
filtered output, silently dropping the status and error fields. This made
it impossible to distinguish successful runs from failures in persisted
results. status and error are core metadata fields, not optional bulk data
controlled by the include_* flags, so they are now always copied.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -95,6 +95,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+- `ResultLogger._filter_report()` now always includes the `status` and `error` fields in persisted results, so saved logs can distinguish successful runs from failed ones (PR: #PR_NUMBER_PLACEHOLDER)
 - GAIA2: Various fixes for faithful reproduction of ARE reference results — scenario lifecycle, data loading, evaluation flow, multi-turn notification handling, tool filtering, default agent fidelity, and simulation time management (PR: #30)
 - MultiAgentBench: Corrected domain mappings, added missing werewolf/minecraft support, fixed environment constructors, added result summarization matching MARBLE's evaluation pipeline (PR: #30)
 - Tau2: Fixed telecom domain schema to match tau2-bench, added agent/user state synchronization and deterministic network simulation, fixed initialization flow and tool result serialization (PR: #30)
diff --git a/maseval/core/callbacks/result_logger.py b/maseval/core/callbacks/result_logger.py
@@ -160,6 +160,8 @@ def _filter_report(self, report: Dict) -> Dict:
         filtered = {
             "task_id": report.get("task_id"),
             "repeat_idx": report.get("repeat_idx"),
+            "status": report.get("status"),
+            "error": report.get("error"),
         }
 
         if self.include_traces and "traces" in report:
diff --git a/tests/test_core/test_callbacks/test_result_logger.py b/tests/test_core/test_callbacks/test_result_logger.py
@@ -134,6 +134,46 @@ def test_filter_report_all_included(self):
         assert "config" in filtered
         assert "eval" in filtered
 
+    def test_filter_report_preserves_status_and_error(self):
+        """Test that status and error fields are always included in filtered reports.
+
+        These are core metadata fields (like task_id and repeat_idx) that must
+        always be present so persisted results can distinguish successes from failures.
+        """
+        logger = MockResultLogger(include_traces=False, include_config=False, include_eval=False)
+
+        report = {
+            "task_id": "task_0",
+            "repeat_idx": 0,
+            "status": "agent_error",
+            "error": {"type": "AgentError", "message": "Tool call failed"},
+            "traces": {"agent": "trace_data"},
+            "config": {"model": "gpt-4"},
+            "eval": {"score": 0.0},
+        }
+
+        filtered = logger._filter_report(report)
+
+        assert filtered["status"] == "agent_error"
+        assert filtered["error"] == {"type": "AgentError", "message": "Tool call failed"}
+        assert "traces" not in filtered
+        assert "config" not in filtered
+        assert "eval" not in filtered
+
+    def test_filter_report_status_and_error_absent(self):
+        """Test that missing status/error fields result in None values."""
+        logger = MockResultLogger()
+
+        report = {
+            "task_id": "task_0",
+            "repeat_idx": 0,
+        }
+
+        filtered = logger._filter_report(report)
+
+        assert filtered["status"] is None
+        assert filtered["error"] is None
+
     def test_filter_report_partial_included(self):
         """Test report filtering with only some fields included."""
         logger = MockResultLogger(include_traces=False, include_config=True, include_eval=False)

Original file line number	Diff line number	Diff line change
`@@ -160,6 +160,8 @@ def _filter_report(self, report: Dict) -> Dict:`
`160`	`160`	`filtered = {`
`161`	`161`	`"task_id": report.get("task_id"),`
`162`	`162`	`"repeat_idx": report.get("repeat_idx"),`
	`163`	`+ "status": report.get("status"),`
	`164`	`+ "error": report.get("error"),`
`163`	`165`	`}`
`164`	`166`
`165`	`167`	`if self.include_traces and "traces" in report:`