Skip to content

Commit e522a9a

Browse files
committed
MACS now uses TaskExecutionStatus
1 parent d49f1bb commit e522a9a

4 files changed

Lines changed: 77 additions & 81 deletions

File tree

maseval/benchmark/macs/macs.py

Lines changed: 16 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,7 @@ def get_model_adapter(self, model_id, **kwargs):
5353
MessageHistory,
5454
ModelAdapter,
5555
Task,
56+
TaskExecutionStatus,
5657
ToolInvocationHistory,
5758
ToolLLMSimulator,
5859
User,
@@ -64,6 +65,17 @@ def get_model_adapter(self, model_id, **kwargs):
6465
from maseval.core.tracing import TraceableMixin
6566

6667

68+
# Statuses where agent is accountable (included in scoring)
69+
# Note: task_timeout is included - timeouts count as failures in MACS
70+
SCOREABLE_STATUSES = frozenset(
71+
{
72+
TaskExecutionStatus.SUCCESS.value,
73+
TaskExecutionStatus.AGENT_ERROR.value,
74+
TaskExecutionStatus.TASK_TIMEOUT.value,
75+
}
76+
)
77+
78+
6779
# =============================================================================
6880
# Tool
6981
# =============================================================================
@@ -987,29 +999,14 @@ def compute_benchmark_metrics(results: List[Dict[str, Any]]) -> Dict[str, Any]:
987999
- excluded: Dict with counts of excluded tasks by category
9881000
- status_counts: Dict with counts of each status type
9891001
"""
990-
# Status values that indicate infrastructure failures (not agent's fault)
991-
INFRASTRUCTURE_STATUSES = {
992-
"environment_error",
993-
"user_error",
994-
"unknown_execution_error",
995-
"evaluation_failed",
996-
"setup_failed",
997-
}
998-
9991002
if not results:
10001003
return {
10011004
"total_tasks": 0,
10021005
"scored_tasks": 0,
10031006
"successful_tasks": 0,
10041007
"success_rate": 0.0,
10051008
"mean_metrics": {},
1006-
"excluded": {
1007-
"environment_error": 0,
1008-
"user_error": 0,
1009-
"unknown_execution_error": 0,
1010-
"evaluation_failed": 0,
1011-
"setup_failed": 0,
1012-
},
1009+
"excluded": {},
10131010
"status_counts": {},
10141011
}
10151012

@@ -1019,14 +1016,14 @@ def compute_benchmark_metrics(results: List[Dict[str, Any]]) -> Dict[str, Any]:
10191016
successful_tasks = 0
10201017
scored_tasks = 0
10211018
status_counts: Dict[str, int] = {}
1022-
excluded_counts: Dict[str, int] = {s: 0 for s in INFRASTRUCTURE_STATUSES}
1019+
excluded_counts: Dict[str, int] = {}
10231020

10241021
for res in results:
10251022
status = res.get("status", "unknown")
10261023
status_counts[status] = status_counts.get(status, 0) + 1
10271024

1028-
# Skip infrastructure failures from scoring
1029-
if status in INFRASTRUCTURE_STATUSES:
1025+
# Skip infrastructure failures from scoring (use module-level SCOREABLE_STATUSES)
1026+
if status not in SCOREABLE_STATUSES:
10301027
excluded_counts[status] = excluded_counts.get(status, 0) + 1
10311028
continue
10321029

maseval/benchmark/tau2/evaluator.py

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,23 @@
2323
from enum import Enum
2424
from typing import Any, Dict, List, Optional
2525

26-
from maseval import Evaluator, Task
26+
from maseval import Evaluator, Task, TaskExecutionStatus
2727

2828
from maseval.benchmark.tau2.environment import Tau2Environment, get_environment_constructor
2929
from maseval.benchmark.tau2.utils import compare_tool_calls
3030

3131

32+
# Statuses where agent is accountable (included in scoring)
33+
# Note: task_timeout is included - timeouts count as failures in tau2
34+
SCOREABLE_STATUSES = frozenset(
35+
{
36+
TaskExecutionStatus.SUCCESS.value,
37+
TaskExecutionStatus.AGENT_ERROR.value,
38+
TaskExecutionStatus.TASK_TIMEOUT.value,
39+
}
40+
)
41+
42+
3243
class RewardType(str, Enum):
3344
"""Types of rewards that can be computed.
3445
@@ -447,21 +458,14 @@ def compute_benchmark_metrics(results: List[Dict[str, Any]]) -> Dict[str, Any]:
447458
"""Compute summary metrics across all benchmark results.
448459
449460
Infrastructure errors are excluded from scoring metrics.
461+
Uses SCOREABLE_STATUSES to determine which results count toward agent score.
450462
451463
Args:
452464
results: List of result dicts from benchmark.run()
453465
454466
Returns:
455467
Dict with success_rate, mean_reward, pass_at_k, status_counts
456468
"""
457-
INFRASTRUCTURE_STATUSES = {
458-
"environment_error",
459-
"user_error",
460-
"unknown_execution_error",
461-
"evaluation_failed",
462-
"setup_failed",
463-
}
464-
465469
if not results:
466470
return {
467471
"total_tasks": 0,
@@ -482,8 +486,8 @@ def compute_benchmark_metrics(results: List[Dict[str, Any]]) -> Dict[str, Any]:
482486
status = res.get("status", "unknown")
483487
status_counts[status] = status_counts.get(status, 0) + 1
484488

485-
if status in INFRASTRUCTURE_STATUSES:
486-
continue
489+
if status not in SCOREABLE_STATUSES:
490+
continue # Skip infrastructure errors
487491

488492
scored_tasks += 1
489493
evals = res.get("eval") or []
@@ -529,7 +533,7 @@ def compute_pass_at_k(
529533
task_results: Dict[str, List[bool]] = {}
530534
for res in results:
531535
task_id = res.get("task_id", "")
532-
if res.get("status") not in {"success", "agent_error"}:
536+
if res.get("status") not in SCOREABLE_STATUSES:
533537
continue # Skip infrastructure errors
534538

535539
evals = res.get("eval") or []
@@ -615,7 +619,7 @@ def compute_pass_hat_k(
615619
task_results: Dict[str, List[bool]] = {}
616620
for res in results:
617621
task_id = res.get("task_id", "")
618-
if res.get("status") not in {"success", "agent_error"}:
622+
if res.get("status") not in SCOREABLE_STATUSES:
619623
continue # Skip infrastructure errors
620624

621625
evals = res.get("eval") or []

tests/test_benchmarks/test_macs/test_macs_benchmark.py

Lines changed: 32 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -365,18 +365,12 @@ def test_empty_results(self):
365365
assert result["successful_tasks"] == 0
366366
assert result["success_rate"] == 0.0
367367
assert result["mean_metrics"] == {}
368-
assert result["excluded"] == {
369-
"environment_error": 0,
370-
"user_error": 0,
371-
"unknown_execution_error": 0,
372-
"evaluation_failed": 0,
373-
"setup_failed": 0,
374-
}
368+
assert result["excluded"] == {}
375369
assert result["status_counts"] == {}
376370

377371
def test_single_successful_result(self):
378372
"""Single successful result counted."""
379-
results = [{"status": "completed", "eval": [{"overall_gsr": 1.0, "user_gsr": 1.0, "system_gsr": 1.0}]}]
373+
results = [{"status": "success", "eval": [{"overall_gsr": 1.0, "user_gsr": 1.0, "system_gsr": 1.0}]}]
380374

381375
metrics = compute_benchmark_metrics(results)
382376

@@ -387,7 +381,7 @@ def test_single_successful_result(self):
387381

388382
def test_single_failed_result(self):
389383
"""Single failed result counted."""
390-
results = [{"status": "completed", "eval": [{"overall_gsr": 0.0, "user_gsr": 0.0, "system_gsr": 0.0}]}]
384+
results = [{"status": "success", "eval": [{"overall_gsr": 0.0, "user_gsr": 0.0, "system_gsr": 0.0}]}]
391385

392386
metrics = compute_benchmark_metrics(results)
393387

@@ -399,9 +393,9 @@ def test_single_failed_result(self):
399393
def test_multiple_results(self):
400394
"""Multiple results aggregated correctly."""
401395
results = [
402-
{"status": "completed", "eval": [{"overall_gsr": 1.0}]}, # Success
403-
{"status": "completed", "eval": [{"overall_gsr": 0.0}]}, # Fail
404-
{"status": "completed", "eval": [{"overall_gsr": 1.0}]}, # Success
396+
{"status": "success", "eval": [{"overall_gsr": 1.0}]}, # Success
397+
{"status": "success", "eval": [{"overall_gsr": 0.0}]}, # Fail
398+
{"status": "success", "eval": [{"overall_gsr": 1.0}]}, # Success
405399
]
406400

407401
metrics = compute_benchmark_metrics(results)
@@ -414,10 +408,10 @@ def test_multiple_results(self):
414408
def test_success_rate_calculation(self):
415409
"""success_rate = successful/scored (not total)."""
416410
results = [
417-
{"status": "completed", "eval": [{"overall_gsr": 1.0}]},
418-
{"status": "completed", "eval": [{"overall_gsr": 1.0}]},
419-
{"status": "completed", "eval": [{"overall_gsr": 0.0}]},
420-
{"status": "completed", "eval": [{"overall_gsr": 0.0}]},
411+
{"status": "success", "eval": [{"overall_gsr": 1.0}]},
412+
{"status": "success", "eval": [{"overall_gsr": 1.0}]},
413+
{"status": "success", "eval": [{"overall_gsr": 0.0}]},
414+
{"status": "success", "eval": [{"overall_gsr": 0.0}]},
421415
]
422416

423417
metrics = compute_benchmark_metrics(results)
@@ -427,8 +421,8 @@ def test_success_rate_calculation(self):
427421
def test_mean_metrics_calculation(self):
428422
"""Mean of numeric metrics computed."""
429423
results = [
430-
{"status": "completed", "eval": [{"overall_gsr": 1.0, "partial_gsr": 0.8}]},
431-
{"status": "completed", "eval": [{"overall_gsr": 0.0, "partial_gsr": 0.4}]},
424+
{"status": "success", "eval": [{"overall_gsr": 1.0, "partial_gsr": 0.8}]},
425+
{"status": "success", "eval": [{"overall_gsr": 0.0, "partial_gsr": 0.4}]},
432426
]
433427

434428
metrics = compute_benchmark_metrics(results)
@@ -439,9 +433,9 @@ def test_mean_metrics_calculation(self):
439433
def test_handles_missing_eval(self):
440434
"""Handles results with no eval key."""
441435
results = [
442-
{"status": "completed", "eval": [{"overall_gsr": 1.0}]},
443-
{"status": "completed", "no_eval_key": True}, # Missing eval
444-
{"status": "completed", "eval": None}, # None eval
436+
{"status": "success", "eval": [{"overall_gsr": 1.0}]},
437+
{"status": "success", "no_eval_key": True}, # Missing eval
438+
{"status": "success", "eval": None}, # None eval
445439
]
446440

447441
metrics = compute_benchmark_metrics(results)
@@ -454,7 +448,7 @@ def test_handles_non_numeric_values(self):
454448
"""Non-numeric values in eval are ignored for mean."""
455449
results = [
456450
{
457-
"status": "completed",
451+
"status": "success",
458452
"eval": [
459453
{
460454
"overall_gsr": 1.0,
@@ -475,23 +469,23 @@ def test_handles_non_numeric_values(self):
475469
def test_excludes_environment_errors_from_scoring(self):
476470
"""Environment errors are excluded from scoring."""
477471
results = [
478-
{"status": "completed", "eval": [{"overall_gsr": 1.0}]},
472+
{"status": "success", "eval": [{"overall_gsr": 1.0}]},
479473
{"status": "environment_error", "eval": None}, # Should be excluded
480-
{"status": "completed", "eval": [{"overall_gsr": 0.0}]},
474+
{"status": "success", "eval": [{"overall_gsr": 0.0}]},
481475
]
482476

483477
metrics = compute_benchmark_metrics(results)
484478

485479
assert metrics["total_tasks"] == 3
486-
assert metrics["scored_tasks"] == 2 # Only completed tasks
480+
assert metrics["scored_tasks"] == 2 # Only success tasks
487481
assert metrics["successful_tasks"] == 1
488482
assert metrics["success_rate"] == 0.5 # 1/2, not 1/3
489483
assert metrics["excluded"]["environment_error"] == 1
490484

491485
def test_excludes_user_errors_from_scoring(self):
492486
"""User simulator errors are excluded from scoring."""
493487
results = [
494-
{"status": "completed", "eval": [{"overall_gsr": 1.0}]},
488+
{"status": "success", "eval": [{"overall_gsr": 1.0}]},
495489
{"status": "user_error", "eval": None},
496490
]
497491

@@ -500,14 +494,14 @@ def test_excludes_user_errors_from_scoring(self):
500494
assert metrics["total_tasks"] == 2
501495
assert metrics["scored_tasks"] == 1
502496
assert metrics["successful_tasks"] == 1
503-
assert metrics["success_rate"] == 1.0 # Only the completed one
497+
assert metrics["success_rate"] == 1.0 # Only the success one
504498
assert metrics["excluded"]["user_error"] == 1
505499

506500
def test_excludes_unknown_errors_from_scoring(self):
507501
"""Unknown execution errors are excluded from scoring."""
508502
results = [
509503
{"status": "unknown_execution_error", "eval": None},
510-
{"status": "completed", "eval": [{"overall_gsr": 0.0}]},
504+
{"status": "success", "eval": [{"overall_gsr": 0.0}]},
511505
]
512506

513507
metrics = compute_benchmark_metrics(results)
@@ -521,7 +515,7 @@ def test_excludes_setup_failed_from_scoring(self):
521515
"""Setup failures are excluded from scoring."""
522516
results = [
523517
{"status": "setup_failed", "eval": None},
524-
{"status": "completed", "eval": [{"overall_gsr": 1.0}]},
518+
{"status": "success", "eval": [{"overall_gsr": 1.0}]},
525519
]
526520

527521
metrics = compute_benchmark_metrics(results)
@@ -534,21 +528,21 @@ def test_excludes_evaluation_failed_from_scoring(self):
534528
"""Evaluation failures are excluded from scoring."""
535529
results = [
536530
{"status": "evaluation_failed", "eval": None},
537-
{"status": "completed", "eval": [{"overall_gsr": 1.0}]},
531+
{"status": "success", "eval": [{"overall_gsr": 1.0}]},
538532
]
539533

540534
metrics = compute_benchmark_metrics(results)
541535

542536
assert metrics["total_tasks"] == 2
543537
assert metrics["scored_tasks"] == 1
544-
assert metrics["success_rate"] == 1.0 # Only the completed one
538+
assert metrics["success_rate"] == 1.0 # Only the success one
545539
assert metrics["excluded"]["evaluation_failed"] == 1
546540

547541
def test_includes_agent_errors_in_scoring(self):
548542
"""Agent errors ARE included in scoring (agent's fault)."""
549543
results = [
550544
{"status": "agent_error", "eval": [{"overall_gsr": 0.0}]},
551-
{"status": "completed", "eval": [{"overall_gsr": 1.0}]},
545+
{"status": "success", "eval": [{"overall_gsr": 1.0}]},
552546
]
553547

554548
metrics = compute_benchmark_metrics(results)
@@ -561,23 +555,23 @@ def test_includes_agent_errors_in_scoring(self):
561555
def test_status_counts_tracked(self):
562556
"""Status counts are tracked for all tasks."""
563557
results = [
564-
{"status": "completed", "eval": [{"overall_gsr": 1.0}]},
565-
{"status": "completed", "eval": [{"overall_gsr": 0.0}]},
558+
{"status": "success", "eval": [{"overall_gsr": 1.0}]},
559+
{"status": "success", "eval": [{"overall_gsr": 0.0}]},
566560
{"status": "agent_error", "eval": None},
567561
{"status": "environment_error", "eval": None},
568562
]
569563

570564
metrics = compute_benchmark_metrics(results)
571565

572-
assert metrics["status_counts"]["completed"] == 2
566+
assert metrics["status_counts"]["success"] == 2
573567
assert metrics["status_counts"]["agent_error"] == 1
574568
assert metrics["status_counts"]["environment_error"] == 1
575569

576570
def test_mixed_errors_comprehensive(self):
577571
"""Comprehensive test with various error types."""
578572
results = [
579-
{"status": "completed", "eval": [{"overall_gsr": 1.0, "accuracy": 0.9}]},
580-
{"status": "completed", "eval": [{"overall_gsr": 0.0, "accuracy": 0.3}]},
573+
{"status": "success", "eval": [{"overall_gsr": 1.0, "accuracy": 0.9}]},
574+
{"status": "success", "eval": [{"overall_gsr": 0.0, "accuracy": 0.3}]},
581575
{"status": "agent_error", "eval": [{"overall_gsr": 0.0, "accuracy": 0.0}]},
582576
{"status": "environment_error", "eval": None}, # Excluded
583577
{"status": "user_error", "eval": None}, # Excluded
@@ -588,7 +582,7 @@ def test_mixed_errors_comprehensive(self):
588582
metrics = compute_benchmark_metrics(results)
589583

590584
assert metrics["total_tasks"] == 7
591-
assert metrics["scored_tasks"] == 3 # completed(2) + agent_error(1)
585+
assert metrics["scored_tasks"] == 3 # success(2) + agent_error(1)
592586
assert metrics["successful_tasks"] == 1
593587
assert metrics["success_rate"] == pytest.approx(1 / 3)
594588
assert metrics["mean_metrics"]["accuracy"] == pytest.approx((0.9 + 0.3 + 0.0) / 3)

0 commit comments

Comments
 (0)