@@ -131,10 +131,11 @@ def check_completeness_and_validity(
131131 if task not in run_results :
132132 missing_tasks .append (task )
133133 else :
134- # Check for retryable errors
134+ # Check for retryable errors only if the task did not succeed
135135 meta = run_results [task ]
136+ success = bool (meta .get ("execution_result" , {}).get ("success" , False ))
136137 error_msg = meta .get ("execution_result" , {}).get ("error_message" , "" )
137- if error_msg and is_retryable_error (error_msg ):
138+ if ( not success ) and error_msg and is_retryable_error (error_msg ):
138139 invalid_tasks .append (f"{ task } : { error_msg [:50 ]} ..." )
139140
140141 if missing_tasks :
@@ -188,6 +189,9 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
188189 actual_model_name : Optional [str ] = None
189190 # If cost info is not present in metas, leave as None
190191 per_run_cost : Optional [float ] = None
192+ # Model-level flags (to be inferred from meta.json)
193+ is_open_source_model : Optional [bool ] = None
194+ is_reasoning_model : Optional [bool ] = None
191195
192196 # For pass@1 per-run statistics across all services
193197 pass1_rates_per_run_overall : List [float ] = []
@@ -239,6 +243,12 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
239243 if isinstance (possible_cost , (int , float )):
240244 per_run_cost = float (possible_cost )
241245
246+ # capture model flags if present
247+ if is_open_source_model is None and "is_open_source_model" in meta :
248+ is_open_source_model = bool (meta .get ("is_open_source_model" ))
249+ if is_reasoning_model is None and "is_reasoning_model" in meta :
250+ is_reasoning_model = bool (meta .get ("is_reasoning_model" ))
251+
242252 pass1_rates_per_run_overall .append (round (successes_this_run / total_tasks , 6 ))
243253
244254 # Compute pass@k and pass^k across tasks (overall)
@@ -303,6 +313,8 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
303313 "per_run_output_tokens" : per_run_output_tokens ,
304314 "per_run_cost" : computed_per_run_cost if computed_per_run_cost is not None else (per_run_cost if per_run_cost is not None else None ),
305315 "actual_model_name" : actual_model_name or "" ,
316+ "is_open_source_model" : (is_open_source_model if is_open_source_model is not None else False ),
317+ "is_reasoning_model" : (is_reasoning_model if is_reasoning_model is not None else False ),
306318 "pass@1" : {
307319 "avg" : round (avg_pass1 , 4 ),
308320 "std" : round (std_pass1 , 4 ),
@@ -414,6 +426,8 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
414426 "per_run_output_tokens" : s_per_run_output_tokens ,
415427 "per_run_cost" : s_computed_per_run_cost if s_computed_per_run_cost is not None else (per_run_cost if per_run_cost is not None else None ),
416428 "actual_model_name" : actual_model_name or "" ,
429+ "is_open_source_model" : (is_open_source_model if is_open_source_model is not None else False ),
430+ "is_reasoning_model" : (is_reasoning_model if is_reasoning_model is not None else False ),
417431 "pass@1" : {
418432 "avg" : round (s_mean , 4 ),
419433 "std" : round (s_std , 4 ),
0 commit comments