Skip to content

Commit 0b91461

Browse files
committed
remove manual CI (anti-pattern)
1 parent 668652d commit 0b91461

2 files changed

Lines changed: 8 additions & 10 deletions

File tree

eval_protocol/quickstart/llm_judge.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -111,12 +111,13 @@ async def run_judgment(row):
111111
print(f"✅ Generated {len(judgments)} valid judgments")
112112

113113
# Calculate bootstrap scores
114-
mean_score, lower_score, upper_score = calculate_bootstrap_scores(judgments)
115-
116-
if mean_score == 0.0:
114+
result = calculate_bootstrap_scores(judgments)
115+
if not result:
117116
print("❌ No valid scores extracted")
118117
return rows
119118

119+
mean_score, lower_score, upper_score = result
120+
120121
# Print leaderboard
121122
print("\n##### LLM Judge Results (90th percentile CI) #####")
122123

@@ -128,9 +129,6 @@ async def run_judgment(row):
128129
for row in rows:
129130
if row.evaluation_result:
130131
row.evaluation_result.score = mean_score
131-
row.evaluation_result.standard_error = (upper_score - lower_score) / (
132-
2 * 1.645
133-
) # Standard error approximation from 90% CI
134132

135133
# Optionally push scores back to Langfuse. Note that one score per model will be pushed onto the same trace.
136134
push_scores_to_langfuse(rows, model_name, mean_score)

eval_protocol/quickstart/utils.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ async def run_judgment_async(
240240
return {"model": model_name, "games": games}
241241

242242

243-
def calculate_bootstrap_scores(judgments: List[Dict[str, Any]]) -> tuple[float, float, float]:
243+
def calculate_bootstrap_scores(judgments: List[Dict[str, Any]]) -> Optional[tuple[float, float, float]]:
244244
"""
245245
Calculate bootstrap confidence intervals for Arena-Hard-Auto style judgments.
246246
@@ -251,8 +251,8 @@ def calculate_bootstrap_scores(judgments: List[Dict[str, Any]]) -> tuple[float,
251251
judgments: List of judgment dicts, each containing "games" with two rounds of scores
252252
253253
Returns:
254-
tuple: (mean_score, lower_5th_percentile, upper_95th_percentile)
255-
Returns (0.0, 0.0, 0.0) if no valid scores found
254+
Optional[tuple]: (mean_score, lower_5th_percentile, upper_95th_percentile)
255+
Returns None if no valid scores found
256256
"""
257257
# Extract scores from judgments
258258
scores_data = []
@@ -265,7 +265,7 @@ def calculate_bootstrap_scores(judgments: List[Dict[str, Any]]) -> tuple[float,
265265
scores_data.append(score)
266266

267267
if not scores_data:
268-
return 0.0, 0.0, 0.0
268+
return None
269269

270270
# Create DataFrame (single column of scores)
271271
battles = pd.DataFrame({"score": scores_data})

0 commit comments

Comments
 (0)