Remove manual confidence-interval (CI) standard error approximation (anti-pattern)

xzrderek · xzrderek · commit 0b91461a95b8 · 2025-09-15T12:20:51.000-07:00
diff --git a/eval_protocol/quickstart/llm_judge.py b/eval_protocol/quickstart/llm_judge.py
@@ -111,12 +111,13 @@ async def run_judgment(row):
     print(f"✅ Generated {len(judgments)} valid judgments")
 
     # Calculate bootstrap scores
-    mean_score, lower_score, upper_score = calculate_bootstrap_scores(judgments)
-
-    if mean_score == 0.0:
+    result = calculate_bootstrap_scores(judgments)
+    if not result:
         print("❌ No valid scores extracted")
         return rows
 
+    mean_score, lower_score, upper_score = result
+
     # Print leaderboard
     print("\n##### LLM Judge Results (90th percentile CI) #####")
 
@@ -128,9 +129,6 @@ async def run_judgment(row):
     for row in rows:
         if row.evaluation_result:
             row.evaluation_result.score = mean_score
-            row.evaluation_result.standard_error = (upper_score - lower_score) / (
-                2 * 1.645
-            )  # Standard error approximation from 90% CI
 
     # Optional, push scores back to Langfuse. Note that one score per model will be pushed back onto same trace.
     push_scores_to_langfuse(rows, model_name, mean_score)
diff --git a/eval_protocol/quickstart/utils.py b/eval_protocol/quickstart/utils.py
@@ -240,7 +240,7 @@ async def run_judgment_async(
     return {"model": model_name, "games": games}
 
 
-def calculate_bootstrap_scores(judgments: List[Dict[str, Any]]) -> tuple[float, float, float]:
+def calculate_bootstrap_scores(judgments: List[Dict[str, Any]]) -> Optional[tuple[float, float, float]]:
     """
     Calculate bootstrap confidence intervals for Arena-Hard-Auto style judgments.
 
@@ -251,8 +251,8 @@ def calculate_bootstrap_scores(judgments: List[Dict[str, Any]]) -> tuple[float,
         judgments: List of judgment dicts, each containing "games" with two rounds of scores
 
     Returns:
-        tuple: (mean_score, lower_5th_percentile, upper_95th_percentile)
-               Returns (0.0, 0.0, 0.0) if no valid scores found
+        Optional[tuple]: (mean_score, lower_5th_percentile, upper_95th_percentile)
+                        Returns None if no valid scores found
     """
     # Extract scores from judgments
     scores_data = []
@@ -265,7 +265,7 @@ def calculate_bootstrap_scores(judgments: List[Dict[str, Any]]) -> tuple[float,
                 scores_data.append(score)
 
     if not scores_data:
-        return 0.0, 0.0, 0.0
+        return None
 
     # Create DataFrame (single column of scores)
     battles = pd.DataFrame({"score": scores_data})