@@ -240,7 +240,7 @@ async def run_judgment_async(
240240 return {"model" : model_name , "games" : games }
241241
242242
243- def calculate_bootstrap_scores (judgments : List [Dict [str , Any ]]) -> tuple [float , float , float ]:
243+ def calculate_bootstrap_scores (judgments : List [Dict [str , Any ]]) -> Optional [ tuple [float , float , float ] ]:
244244 """
245245 Calculate bootstrap confidence intervals for Arena-Hard-Auto style judgments.
246246
@@ -251,8 +251,8 @@ def calculate_bootstrap_scores(judgments: List[Dict[str, Any]]) -> tuple[float,
251251 judgments: List of judgment dicts, each containing "games" with two rounds of scores
252252
253253 Returns:
254- tuple: (mean_score, lower_5th_percentile, upper_95th_percentile)
255- Returns (0.0, 0.0, 0.0) if no valid scores found
254+ Optional[tuple]: (mean_score, lower_5th_percentile, upper_95th_percentile)
255+ Returns None if no valid scores found
256256 """
257257 # Extract scores from judgments
258258 scores_data = []
@@ -265,7 +265,7 @@ def calculate_bootstrap_scores(judgments: List[Dict[str, Any]]) -> tuple[float,
265265 scores_data .append (score )
266266
267267 if not scores_data :
268- return 0.0 , 0.0 , 0.0
268+ return None
269269
270270 # Create DataFrame (single column of scores)
271271 battles = pd .DataFrame ({"score" : scores_data })
0 commit comments