Skip to content

Commit a4ec486

Browse files
authored
Fix ResultLogger dropping status and error fields (#38)
* Fix ResultLogger._filter_report() dropping status and error fields
1 parent 388653f commit a4ec486

5 files changed

Lines changed: 46 additions & 6 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -95,6 +95,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
9595

9696
### Fixed
9797

98+
- `ResultLogger._filter_report()` now includes `status` and `error` fields in persisted results, so saved logs can distinguish successful runs from infrastructure failures. Report schema is now consistent across success and failure paths (`error` is always present, `None` on success). (PR: #38)
9899
- GAIA2: Various fixes for faithful reproduction of ARE reference results — scenario lifecycle, data loading, evaluation flow, multi-turn notification handling, tool filtering, default agent fidelity, and simulation time management (PR: #30)
99100
- MultiAgentBench: Corrected domain mappings, added missing werewolf/minecraft support, fixed environment constructors, added result summarization matching MARBLE's evaluation pipeline (PR: #30)
100101
- Tau2: Fixed telecom domain schema to match tau2-bench, added agent/user state synchronization and deterministic network simulation, fixed initialization flow and tool result serialization (PR: #30)

maseval/core/benchmark.py

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1226,20 +1226,17 @@ def _execute_task_repetition(
12261226
# Task execution failed, so skip evaluation
12271227
eval_results = None
12281228

1229-
# 5. Build report
1229+
# 5. Build report — all keys always present for consistent schema
12301230
report: Dict[str, Any] = {
12311231
"task_id": str(task.id),
12321232
"repeat_idx": repeat_idx,
12331233
"status": execution_status.value,
1234+
"error": error_info,
12341235
"traces": execution_traces,
12351236
"config": execution_configs,
12361237
"eval": eval_results,
12371238
}
12381239

1239-
# Add error info if present
1240-
if error_info is not None:
1241-
report["error"] = error_info
1242-
12431240
# Clear registry after task repetition completes
12441241
self.clear_registry()
12451242

maseval/core/callbacks/result_logger.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -160,6 +160,8 @@ def _filter_report(self, report: Dict) -> Dict:
160160
filtered = {
161161
"task_id": report.get("task_id"),
162162
"repeat_idx": report.get("repeat_idx"),
163+
"status": report.get("status"),
164+
"error": report.get("error"),
163165
}
164166

165167
if self.include_traces and "traces" in report:

tests/test_core/test_benchmark/test_benchmark_lifecycle.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -562,7 +562,7 @@ def test_successful_task_has_success_status(self):
562562

563563
assert len(reports) == 1
564564
assert reports[0]["status"] == TaskExecutionStatus.SUCCESS.value
565-
assert "error" not in reports[0]
565+
assert reports[0]["error"] is None
566566
assert reports[0]["eval"] is not None
567567

568568
def test_default_failure_flags(self):

tests/test_core/test_callbacks/test_result_logger.py

Lines changed: 40 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -134,6 +134,46 @@ def test_filter_report_all_included(self):
134134
assert "config" in filtered
135135
assert "eval" in filtered
136136

137+
def test_filter_report_preserves_status_and_error(self):
138+
"""Test that status and error fields are always included in filtered reports.
139+
140+
These are core metadata fields (like task_id and repeat_idx) that must
141+
always be present so persisted results can distinguish successes from failures.
142+
"""
143+
logger = MockResultLogger(include_traces=False, include_config=False, include_eval=False)
144+
145+
report = {
146+
"task_id": "task_0",
147+
"repeat_idx": 0,
148+
"status": "agent_error",
149+
"error": {"type": "AgentError", "message": "Tool call failed"},
150+
"traces": {"agent": "trace_data"},
151+
"config": {"model": "gpt-4"},
152+
"eval": {"score": 0.0},
153+
}
154+
155+
filtered = logger._filter_report(report)
156+
157+
assert filtered["status"] == "agent_error"
158+
assert filtered["error"] == {"type": "AgentError", "message": "Tool call failed"}
159+
assert "traces" not in filtered
160+
assert "config" not in filtered
161+
assert "eval" not in filtered
162+
163+
def test_filter_report_status_and_error_absent(self):
164+
"""Test that missing status/error fields result in None values."""
165+
logger = MockResultLogger()
166+
167+
report = {
168+
"task_id": "task_0",
169+
"repeat_idx": 0,
170+
}
171+
172+
filtered = logger._filter_report(report)
173+
174+
assert filtered["status"] is None
175+
assert filtered["error"] is None
176+
137177
def test_filter_report_partial_included(self):
138178
"""Test report filtering with only some fields included."""
139179
logger = MockResultLogger(include_traces=False, include_config=True, include_eval=False)

0 commit comments

Comments
 (0)