@@ -365,18 +365,12 @@ def test_empty_results(self):
365365 assert result ["successful_tasks" ] == 0
366366 assert result ["success_rate" ] == 0.0
367367 assert result ["mean_metrics" ] == {}
368- assert result ["excluded" ] == {
369- "environment_error" : 0 ,
370- "user_error" : 0 ,
371- "unknown_execution_error" : 0 ,
372- "evaluation_failed" : 0 ,
373- "setup_failed" : 0 ,
374- }
368+ assert result ["excluded" ] == {}
375369 assert result ["status_counts" ] == {}
376370
377371 def test_single_successful_result (self ):
378372 """Single successful result counted."""
379- results = [{"status" : "completed " , "eval" : [{"overall_gsr" : 1.0 , "user_gsr" : 1.0 , "system_gsr" : 1.0 }]}]
373+ results = [{"status" : "success " , "eval" : [{"overall_gsr" : 1.0 , "user_gsr" : 1.0 , "system_gsr" : 1.0 }]}]
380374
381375 metrics = compute_benchmark_metrics (results )
382376
@@ -387,7 +381,7 @@ def test_single_successful_result(self):
387381
388382 def test_single_failed_result (self ):
389383 """Single failed result counted."""
390- results = [{"status" : "completed " , "eval" : [{"overall_gsr" : 0.0 , "user_gsr" : 0.0 , "system_gsr" : 0.0 }]}]
384+ results = [{"status" : "success " , "eval" : [{"overall_gsr" : 0.0 , "user_gsr" : 0.0 , "system_gsr" : 0.0 }]}]
391385
392386 metrics = compute_benchmark_metrics (results )
393387
@@ -399,9 +393,9 @@ def test_single_failed_result(self):
399393 def test_multiple_results (self ):
400394 """Multiple results aggregated correctly."""
401395 results = [
402- {"status" : "completed " , "eval" : [{"overall_gsr" : 1.0 }]}, # Success
403- {"status" : "completed " , "eval" : [{"overall_gsr" : 0.0 }]}, # Fail
404- {"status" : "completed " , "eval" : [{"overall_gsr" : 1.0 }]}, # Success
396+ {"status" : "success " , "eval" : [{"overall_gsr" : 1.0 }]}, # Success
397+ {"status" : "success " , "eval" : [{"overall_gsr" : 0.0 }]}, # Fail
398+ {"status" : "success " , "eval" : [{"overall_gsr" : 1.0 }]}, # Success
405399 ]
406400
407401 metrics = compute_benchmark_metrics (results )
@@ -414,10 +408,10 @@ def test_multiple_results(self):
414408 def test_success_rate_calculation (self ):
415409 """success_rate = successful/scored (not total)."""
416410 results = [
417- {"status" : "completed " , "eval" : [{"overall_gsr" : 1.0 }]},
418- {"status" : "completed " , "eval" : [{"overall_gsr" : 1.0 }]},
419- {"status" : "completed " , "eval" : [{"overall_gsr" : 0.0 }]},
420- {"status" : "completed " , "eval" : [{"overall_gsr" : 0.0 }]},
411+ {"status" : "success " , "eval" : [{"overall_gsr" : 1.0 }]},
412+ {"status" : "success " , "eval" : [{"overall_gsr" : 1.0 }]},
413+ {"status" : "success " , "eval" : [{"overall_gsr" : 0.0 }]},
414+ {"status" : "success " , "eval" : [{"overall_gsr" : 0.0 }]},
421415 ]
422416
423417 metrics = compute_benchmark_metrics (results )
@@ -427,8 +421,8 @@ def test_success_rate_calculation(self):
427421 def test_mean_metrics_calculation (self ):
428422 """Mean of numeric metrics computed."""
429423 results = [
430- {"status" : "completed " , "eval" : [{"overall_gsr" : 1.0 , "partial_gsr" : 0.8 }]},
431- {"status" : "completed " , "eval" : [{"overall_gsr" : 0.0 , "partial_gsr" : 0.4 }]},
424+ {"status" : "success " , "eval" : [{"overall_gsr" : 1.0 , "partial_gsr" : 0.8 }]},
425+ {"status" : "success " , "eval" : [{"overall_gsr" : 0.0 , "partial_gsr" : 0.4 }]},
432426 ]
433427
434428 metrics = compute_benchmark_metrics (results )
@@ -439,9 +433,9 @@ def test_mean_metrics_calculation(self):
439433 def test_handles_missing_eval (self ):
440434 """Handles results with no eval key."""
441435 results = [
442- {"status" : "completed " , "eval" : [{"overall_gsr" : 1.0 }]},
443- {"status" : "completed " , "no_eval_key" : True }, # Missing eval
444- {"status" : "completed " , "eval" : None }, # None eval
436+ {"status" : "success " , "eval" : [{"overall_gsr" : 1.0 }]},
437+ {"status" : "success " , "no_eval_key" : True }, # Missing eval
438+ {"status" : "success " , "eval" : None }, # None eval
445439 ]
446440
447441 metrics = compute_benchmark_metrics (results )
@@ -454,7 +448,7 @@ def test_handles_non_numeric_values(self):
454448 """Non-numeric values in eval are ignored for mean."""
455449 results = [
456450 {
457- "status" : "completed " ,
451+ "status" : "success " ,
458452 "eval" : [
459453 {
460454 "overall_gsr" : 1.0 ,
@@ -475,23 +469,23 @@ def test_handles_non_numeric_values(self):
475469 def test_excludes_environment_errors_from_scoring (self ):
476470 """Environment errors are excluded from scoring."""
477471 results = [
478- {"status" : "completed " , "eval" : [{"overall_gsr" : 1.0 }]},
472+ {"status" : "success " , "eval" : [{"overall_gsr" : 1.0 }]},
479473 {"status" : "environment_error" , "eval" : None }, # Should be excluded
480- {"status" : "completed " , "eval" : [{"overall_gsr" : 0.0 }]},
474+ {"status" : "success " , "eval" : [{"overall_gsr" : 0.0 }]},
481475 ]
482476
483477 metrics = compute_benchmark_metrics (results )
484478
485479 assert metrics ["total_tasks" ] == 3
486- assert metrics ["scored_tasks" ] == 2 # Only completed tasks
480+ assert metrics ["scored_tasks" ] == 2 # Only success tasks
487481 assert metrics ["successful_tasks" ] == 1
488482 assert metrics ["success_rate" ] == 0.5 # 1/2, not 1/3
489483 assert metrics ["excluded" ]["environment_error" ] == 1
490484
491485 def test_excludes_user_errors_from_scoring (self ):
492486 """User simulator errors are excluded from scoring."""
493487 results = [
494- {"status" : "completed " , "eval" : [{"overall_gsr" : 1.0 }]},
488+ {"status" : "success " , "eval" : [{"overall_gsr" : 1.0 }]},
495489 {"status" : "user_error" , "eval" : None },
496490 ]
497491
@@ -500,14 +494,14 @@ def test_excludes_user_errors_from_scoring(self):
500494 assert metrics ["total_tasks" ] == 2
501495 assert metrics ["scored_tasks" ] == 1
502496 assert metrics ["successful_tasks" ] == 1
503- assert metrics ["success_rate" ] == 1.0 # Only the completed one
497+ assert metrics ["success_rate" ] == 1.0 # Only the success one
504498 assert metrics ["excluded" ]["user_error" ] == 1
505499
506500 def test_excludes_unknown_errors_from_scoring (self ):
507501 """Unknown execution errors are excluded from scoring."""
508502 results = [
509503 {"status" : "unknown_execution_error" , "eval" : None },
510- {"status" : "completed " , "eval" : [{"overall_gsr" : 0.0 }]},
504+ {"status" : "success " , "eval" : [{"overall_gsr" : 0.0 }]},
511505 ]
512506
513507 metrics = compute_benchmark_metrics (results )
@@ -521,7 +515,7 @@ def test_excludes_setup_failed_from_scoring(self):
521515 """Setup failures are excluded from scoring."""
522516 results = [
523517 {"status" : "setup_failed" , "eval" : None },
524- {"status" : "completed " , "eval" : [{"overall_gsr" : 1.0 }]},
518+ {"status" : "success " , "eval" : [{"overall_gsr" : 1.0 }]},
525519 ]
526520
527521 metrics = compute_benchmark_metrics (results )
@@ -534,21 +528,21 @@ def test_excludes_evaluation_failed_from_scoring(self):
534528 """Evaluation failures are excluded from scoring."""
535529 results = [
536530 {"status" : "evaluation_failed" , "eval" : None },
537- {"status" : "completed " , "eval" : [{"overall_gsr" : 1.0 }]},
531+ {"status" : "success " , "eval" : [{"overall_gsr" : 1.0 }]},
538532 ]
539533
540534 metrics = compute_benchmark_metrics (results )
541535
542536 assert metrics ["total_tasks" ] == 2
543537 assert metrics ["scored_tasks" ] == 1
544- assert metrics ["success_rate" ] == 1.0 # Only the completed one
538+ assert metrics ["success_rate" ] == 1.0 # Only the success one
545539 assert metrics ["excluded" ]["evaluation_failed" ] == 1
546540
547541 def test_includes_agent_errors_in_scoring (self ):
548542 """Agent errors ARE included in scoring (agent's fault)."""
549543 results = [
550544 {"status" : "agent_error" , "eval" : [{"overall_gsr" : 0.0 }]},
551- {"status" : "completed " , "eval" : [{"overall_gsr" : 1.0 }]},
545+ {"status" : "success " , "eval" : [{"overall_gsr" : 1.0 }]},
552546 ]
553547
554548 metrics = compute_benchmark_metrics (results )
@@ -561,23 +555,23 @@ def test_includes_agent_errors_in_scoring(self):
561555 def test_status_counts_tracked (self ):
562556 """Status counts are tracked for all tasks."""
563557 results = [
564- {"status" : "completed " , "eval" : [{"overall_gsr" : 1.0 }]},
565- {"status" : "completed " , "eval" : [{"overall_gsr" : 0.0 }]},
558+ {"status" : "success " , "eval" : [{"overall_gsr" : 1.0 }]},
559+ {"status" : "success " , "eval" : [{"overall_gsr" : 0.0 }]},
566560 {"status" : "agent_error" , "eval" : None },
567561 {"status" : "environment_error" , "eval" : None },
568562 ]
569563
570564 metrics = compute_benchmark_metrics (results )
571565
572- assert metrics ["status_counts" ]["completed " ] == 2
566+ assert metrics ["status_counts" ]["success " ] == 2
573567 assert metrics ["status_counts" ]["agent_error" ] == 1
574568 assert metrics ["status_counts" ]["environment_error" ] == 1
575569
576570 def test_mixed_errors_comprehensive (self ):
577571 """Comprehensive test with various error types."""
578572 results = [
579- {"status" : "completed " , "eval" : [{"overall_gsr" : 1.0 , "accuracy" : 0.9 }]},
580- {"status" : "completed " , "eval" : [{"overall_gsr" : 0.0 , "accuracy" : 0.3 }]},
573+ {"status" : "success " , "eval" : [{"overall_gsr" : 1.0 , "accuracy" : 0.9 }]},
574+ {"status" : "success " , "eval" : [{"overall_gsr" : 0.0 , "accuracy" : 0.3 }]},
581575 {"status" : "agent_error" , "eval" : [{"overall_gsr" : 0.0 , "accuracy" : 0.0 }]},
582576 {"status" : "environment_error" , "eval" : None }, # Excluded
583577 {"status" : "user_error" , "eval" : None }, # Excluded
@@ -588,7 +582,7 @@ def test_mixed_errors_comprehensive(self):
588582 metrics = compute_benchmark_metrics (results )
589583
590584 assert metrics ["total_tasks" ] == 7
591- assert metrics ["scored_tasks" ] == 3 # completed (2) + agent_error(1)
585+ assert metrics ["scored_tasks" ] == 3 # success (2) + agent_error(1)
592586 assert metrics ["successful_tasks" ] == 1
593587 assert metrics ["success_rate" ] == pytest .approx (1 / 3 )
594588 assert metrics ["mean_metrics" ]["accuracy" ] == pytest .approx ((0.9 + 0.3 + 0.0 ) / 3 )
0 commit comments