-
Notifications
You must be signed in to change notification settings - Fork 17
Expand file tree
/
Copy pathllm_judge.py
More file actions
90 lines (73 loc) · 3.33 KB
/
Copy pathllm_judge.py
File metadata and controls
90 lines (73 loc) · 3.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
"""
Default LLM judge for Eval Protocol. Inspired by Arena-Hard-Auto.
"""
from typing import Optional
from eval_protocol.models import EvaluationRow, EvaluateResult, MetricResult
from eval_protocol.adapters.base import BaseAdapter
from eval_protocol.quickstart.utils import (
JUDGE_CONFIGS,
LABEL_TO_SCORE,
serialize_message,
run_single_judgment,
)
from openai import AsyncOpenAI
async def aha_judge(
    row: EvaluationRow, judge_name: str = "kimi-k2p5", adapter: Optional[BaseAdapter] = None
) -> EvaluationRow:
    """
    LLM Judge evaluation using Arena-Hard-Auto style pairwise comparisons for a single row.

    Compares the model response against the ground truth using an LLM judge:
    1. Builds the question from the serialized ``messages[:-1]``.
    2. Compares ``messages[-1]`` (new model response) with ``ground_truth``
       (baseline response).
    3. Runs two judgment rounds with the answer order swapped (baseline first,
       then model response first) to reduce position bias.
    4. Stores the averaged score plus the per-round scores on the row.

    A row is marked invalid (``is_score_valid=False``, score ``0.0``, empty
    metrics) when either round returns no result, returns no score label, or
    returns a label that is not present in ``LABEL_TO_SCORE``.

    Args:
        row: Single EvaluationRow object with messages, ground_truth, and tools.
        judge_name: Key into ``JUDGE_CONFIGS`` selecting the judge configuration.
        adapter: Optional adapter; when provided, ``upload_score`` is called on
            it for rows whose score is valid.

    Returns:
        The same row object. If ``row.messages`` is empty it is returned
        untouched; otherwise ``row.evaluation_result`` is replaced with the
        outcome of the two judgment rounds.

    Raises:
        KeyError: If ``judge_name`` is not a key of ``JUDGE_CONFIGS``.
    """
    if not row.messages:
        # Nothing to judge; leave the row exactly as it was received.
        return row

    judge_config = JUDGE_CONFIGS[judge_name]

    # Extract question and answers.
    question_text = "\n".join([serialize_message(msg) for msg in row.messages[:-1]])
    # NOTE(review): a missing ground truth becomes the literal string "None"
    # here and is still sent to the judge as the baseline answer - confirm
    # that this is intended.
    model_a_answer = str(row.ground_truth)
    model_b_answer = serialize_message(row.messages[-1])

    # The client is only needed for the two judge calls, so it is closed
    # before any scoring work happens.
    async with AsyncOpenAI(api_key=judge_config.get("api_key"), base_url=judge_config.get("base_url")) as client:
        # Run two judgment rounds in sequence (A vs B, then B vs A).
        result1 = await run_single_judgment(
            question_text, model_a_answer, model_b_answer, row.tools, judge_config, client
        )
        result2 = await run_single_judgment(
            question_text, model_b_answer, model_a_answer, row.tools, judge_config, client
        )

    label1 = result1.get("score") if result1 else None
    label2 = result2.get("score") if result2 else None

    if not label1 or not label2:
        # If either judgment failed, mark as invalid (don't include in distribution).
        final_score = 0.0
        reason = "Failed to get judgment scores"
        metrics = {}
        is_score_valid = False
    elif label1 not in LABEL_TO_SCORE or label2 not in LABEL_TO_SCORE:
        # The judge produced a label with no numeric mapping. Treat it like any
        # other failed judgment instead of letting a KeyError abort the run.
        final_score = 0.0
        reason = f"Unrecognized judgment label(s): Round 1: {label1}, Round 2: {label2}"
        metrics = {}
        is_score_valid = False
    else:
        # Convert judgment labels to numerical scores. Round 1 passed the
        # baseline as the first answer, so its mapped score is flipped with
        # `1 - x`; round 2 passed the model response first and is used as-is.
        # (Presumably LABEL_TO_SCORE is expressed from the first answer's
        # point of view - verify against quickstart.utils.)
        game1_score = 1 - LABEL_TO_SCORE[label1]
        game2_score = LABEL_TO_SCORE[label2]
        final_score = (game1_score + game2_score) / 2
        reason = f"LLM Judge comparison: Round 1: {label1}, Round 2: {label2}"
        metrics = {
            "round1_judgment": MetricResult(score=game1_score, reason=result1["judgment"]),
            "round2_judgment": MetricResult(score=game2_score, reason=result2["judgment"]),
        }
        is_score_valid = True

    row.evaluation_result = EvaluateResult(
        score=final_score,
        reason=reason,
        metrics=metrics,
        is_score_valid=is_score_valid,
    )

    # Upload score to adapter if provided; invalid scores are never uploaded.
    if adapter and row.evaluation_result and row.evaluation_result.is_score_valid:
        model_name = row.input_metadata.completion_params.get("model", "unknown_model")
        adapter.upload_score(row, model_name)

    return row