@@ -954,14 +954,13 @@ def load_benchmark_data(
954954 benchmark = FiveADayBenchmark (
955955 callbacks = [logger ],
956956 seed = args .seed , # Use benchmark's seeding system
957- fail_on_setup_error = True ,
958- fail_on_task_error = True ,
959- fail_on_evaluation_error = True ,
957+ fail_on_setup_error = False ,
958+ fail_on_task_error = False ,
959+ fail_on_evaluation_error = False ,
960960 )
961961 results = benchmark .run (tasks = tasks , agent_data = agent_configs )
962962
963963 # --- Usage summary ---
964- from collections import defaultdict
965964 from maseval import TokenUsage
966965
967966 def _fmt_usage (usage ):
@@ -972,33 +971,17 @@ def _fmt_usage(usage):
972971 parts .append (f"units={ dict (usage .units )} " )
973972 return " " .join (parts )
974973
974+ reporter = UsageReporter .from_reports (results )
975+
975976 print ("\n --- Usage Summary ---" )
976- total = benchmark .usage
977- print (f"Total: { _fmt_usage (total )} " )
978-
979- # Group components by category
980- if benchmark .usage_by_component :
981- by_category : Dict [str , Dict [str , object ]] = defaultdict (dict )
982- for key , usage in benchmark .usage_by_component .items ():
983- category , name = key .split (":" , 1 )
984- by_category [category ][name ] = usage
985-
986- for category in ["agents" , "models" , "tools" , "simulators" , "callbacks" ]:
987- if category not in by_category :
988- continue
989- print (f"\n { category .capitalize ()} :" )
990- for name , usage in by_category [category ].items ():
991- print (f" { name :<35} { _fmt_usage (usage )} " )
992-
993- # Print any remaining categories not in the standard list
994- for category , components in by_category .items ():
995- if category in {"agents" , "models" , "tools" , "simulators" , "callbacks" }:
996- continue
997- print (f"\n { category .capitalize ()} :" )
998- for name , usage in components .items ():
999- print (f" { name :<35} { _fmt_usage (usage )} " )
977+ print (f"Total: { _fmt_usage (reporter .total ())} " )
978+
979+ by_component = reporter .by_component ()
980+ if by_component :
981+ print ("\n By component:" )
982+ for key , usage in by_component .items ():
983+ print (f" { key :<35} { _fmt_usage (usage )} " )
1000984
1001- reporter = UsageReporter .from_reports (results )
1002985 by_task = reporter .by_task ()
1003986 if by_task :
1004987 print ("\n Per-task:" )
0 commit comments