Skip to content

Commit 02dbd00

Browse files
committed
Add position bias detection and Cohen's kappa to llm_judge
Extends compute_agreement.py with compute_position_bias(), compute_cohens_kappa(), and interpret_kappa() functions that work with the standard _pair.jsonl output format. Modifies show_result.py to print a one-line consistency summary at the bottom of pairwise results and adds --show-consistency flag for detailed metrics (bias rate, direction, kappa, Landis & Koch). Includes 13 unit tests with synthetic data in tests/test_consistency_metrics.py.
1 parent 587d5cf commit 02dbd00

File tree

3 files changed

+273
-4
lines changed

3 files changed

+273
-4
lines changed

fastchat/llm_judge/compute_agreement.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,97 @@ def get_mt_bench_agreement(data, judge1, judge2, ban):
9898
raise Exception("Unsupported judges.")
9999

100100

101+
def compute_position_bias(g1_winners, g2_winners):
    """Compute position bias rate and direction from paired judgments.

    Each index holds the judge's verdict for the same question with the two
    answers presented in opposite orders; a verdict that flips with the
    presentation order indicates position bias.

    Args:
        g1_winners: list of winners from game 1 (original order).
        g2_winners: list of winners from game 2 (swapped order).

    Returns:
        bias_rate: fraction of pairs where g1 and g2 disagree.
        direction: "first" if judge favors position A, "second" if position B,
            "none" if balanced or no bias detected.

    Raises:
        ValueError: if the two lists have different lengths. (Previously the
            extra entries were silently dropped by zip while len(g1_winners)
            was still used as the denominator, skewing bias_rate.)
    """
    if len(g1_winners) != len(g2_winners):
        raise ValueError(
            "g1_winners and g2_winners must have the same length, got "
            f"{len(g1_winners)} and {len(g2_winners)}"
        )
    total = len(g1_winners)
    if total == 0:
        # No data: report "no bias" rather than dividing by zero.
        return 0.0, "none"

    disagree = 0
    favor_first = 0  # judge picked position A in both orderings
    favor_second = 0  # judge picked position B in both orderings
    for g1, g2 in zip(g1_winners, g2_winners):
        if g1 != g2:
            disagree += 1
            # In game 2 the models swap positions, so g1 == "model_1" and
            # g2 == "model_2" means the judge chose position A both times.
            if g1 == "model_1" and g2 == "model_2":
                favor_first += 1
            elif g1 == "model_2" and g2 == "model_1":
                favor_second += 1

    bias_rate = disagree / total
    if favor_first > favor_second:
        direction = "first"
    elif favor_second > favor_first:
        direction = "second"
    else:
        direction = "none"

    return bias_rate, direction
139+
140+
141+
def compute_cohens_kappa(g1_winners, g2_winners):
    """Compute Cohen's kappa treating game1 and game2 as two raters.

    The category set is derived from the data itself (typically "model_1",
    "model_2", and "tie"), so any unexpected label still contributes to the
    chance-agreement term instead of being silently ignored. For the three
    standard labels this is identical to a hardcoded category list.

    Args:
        g1_winners: list of winners from game 1.
        g2_winners: list of winners from game 2.

    Returns:
        kappa: Cohen's kappa coefficient. 0.0 for empty input; 1.0 when both
            raters always emit the same single category (kappa is 0/0 there,
            and observed agreement is perfect).

    Raises:
        ValueError: if the two lists have different lengths.
    """
    from collections import Counter  # local import keeps module deps unchanged

    if len(g1_winners) != len(g2_winners):
        raise ValueError(
            "g1_winners and g2_winners must have the same length, got "
            f"{len(g1_winners)} and {len(g2_winners)}"
        )
    total = len(g1_winners)
    if total == 0:
        return 0.0

    # Observed agreement.
    agree = sum(1 for g1, g2 in zip(g1_winners, g2_winners) if g1 == g2)
    p_o = agree / total

    # Expected agreement by chance, over every category actually observed.
    counts1 = Counter(g1_winners)
    counts2 = Counter(g2_winners)
    p_e = sum(
        (counts1[cat] / total) * (counts2[cat] / total)
        for cat in counts1.keys() | counts2.keys()
    )

    # p_e can only reach 1.0 when both raters are point masses on the same
    # category, in which case p_o is also 1.0; >= guards float round-off.
    if p_e >= 1.0:
        return 1.0

    return (p_o - p_e) / (1.0 - p_e)
174+
175+
176+
def interpret_kappa(kappa):
    """Interpret kappa using the Landis & Koch scale.

    Each (upper_bound, label) pair covers kappa values strictly below the
    bound; values of 0.81 and above read "almost perfect".
    """
    scale = (
        (0.0, "poor"),
        (0.21, "slight"),
        (0.41, "fair"),
        (0.61, "moderate"),
        (0.81, "substantial"),
    )
    for upper_bound, label in scale:
        if kappa < upper_bound:
            return label
    return "almost perfect"
190+
191+
101192
def run_mt_bench_agreement(judges, votefiles):
102193
# votes[i]: List of votes
103194
votes = []

fastchat/llm_judge/show_result.py

Lines changed: 69 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@
55
import argparse
66
import pandas as pd
77

8+
from fastchat.llm_judge.compute_agreement import (
9+
compute_position_bias,
10+
compute_cohens_kappa,
11+
interpret_kappa,
12+
)
13+
814

915
def display_result_single(args):
1016
if args.input_file is None:
@@ -91,6 +97,57 @@ def display_result_pairwise(args):
9197
# print(df.sort_values(by="loss_rate", ascending=True))
9298
print(df.sort_values(by="win_rate_adjusted", ascending=False))
9399

100+
# Print one-line consistency summary
101+
g1_winners = df_all["g1_winner"].tolist()
102+
g2_winners = df_all["g2_winner"].tolist()
103+
bias_rate, _ = compute_position_bias(g1_winners, g2_winners)
104+
kappa = compute_cohens_kappa(g1_winners, g2_winners)
105+
print(
106+
f"\n[Consistency] position bias: {bias_rate:.1%}, "
107+
f"Cohen's kappa: {kappa:.3f} ({interpret_kappa(kappa)})"
108+
)
109+
110+
111+
def display_consistency_metrics(args):
    """Display detailed position bias and Cohen's kappa metrics.

    Reads the pairwise judgment file (either ``args.input_file`` or the
    default ``data/<bench>/model_judgment/<judge>_pair.jsonl`` path), drops
    rows where either game errored, and prints agreement rate, position bias
    rate/direction, and Cohen's kappa with its Landis & Koch interpretation.

    Args:
        args: parsed argparse namespace with ``input_file``, ``bench_name``,
            and ``judge_model`` attributes.
    """
    if args.input_file is None:
        input_file = (
            f"data/{args.bench_name}/model_judgment/{args.judge_model}_pair.jsonl"
        )
    else:
        input_file = args.input_file

    print(f"Input file: {input_file}")
    df_all = pd.read_json(input_file, lines=True)
    # Keep only pairs where both games produced a usable verdict.
    df_all = df_all[(df_all["g1_winner"] != "error") & (df_all["g2_winner"] != "error")]

    g1_winners = df_all["g1_winner"].tolist()
    g2_winners = df_all["g2_winner"].tolist()
    total = len(g1_winners)

    if total == 0:
        print("No valid judgments found.")
        return

    # Position bias
    bias_rate, direction = compute_position_bias(g1_winners, g2_winners)

    # Cohen's kappa
    kappa = compute_cohens_kappa(g1_winners, g2_winners)
    interpretation = interpret_kappa(kappa)

    # Raw agreement between the two orderings (no chance correction).
    agree = sum(1 for g1, g2 in zip(g1_winners, g2_winners) if g1 == g2)
    agree_rate = agree / total

    # Plain string: the original used an f-string with no placeholders.
    print("\n########## Consistency Metrics ##########")
    print(f"Total pairs: {total}")
    print(f"Agreement rate: {agree_rate:.1%} ({agree}/{total})")
    print(f"Position bias rate: {bias_rate:.1%}")
    print(f"Position bias direction: {direction}")
    print(f"Cohen's kappa: {kappa:.3f}")
    print(f"Interpretation: {interpretation} (Landis & Koch)")
150+
94151

95152
if __name__ == "__main__":
96153
parser = argparse.ArgumentParser()
@@ -117,14 +174,22 @@ def display_result_pairwise(args):
117174
"`single` runs single answer grading."
118175
),
119176
)
177+
parser.add_argument(
178+
"--show-consistency",
179+
action="store_true",
180+
help="Show detailed position bias and Cohen's kappa metrics.",
181+
)
120182
args = parser.parse_args()
121183

122-
if args.mode == "single":
184+
if args.show_consistency:
185+
display_consistency_metrics(args)
186+
elif args.mode == "single":
123187
display_result_func = display_result_single
188+
print(f"Mode: {args.mode}")
189+
display_result_func(args)
124190
else:
125191
if args.mode == "pairwise-all":
126192
args.baseline_model = None
127193
display_result_func = display_result_pairwise
128-
129-
print(f"Mode: {args.mode}")
130-
display_result_func(args)
194+
print(f"Mode: {args.mode}")
195+
display_result_func(args)

tests/test_consistency_metrics.py

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
"""Unit tests for position bias and Cohen's kappa metrics."""
2+
import unittest
3+
4+
from fastchat.llm_judge.compute_agreement import (
5+
compute_position_bias,
6+
compute_cohens_kappa,
7+
interpret_kappa,
8+
)
9+
10+
11+
class TestPositionBias(unittest.TestCase):
    """Tests for compute_position_bias on synthetic winner lists."""

    def test_no_bias(self):
        """Identical verdicts in both orderings -> zero bias."""
        winners = ["model_1", "model_2", "tie"]
        self.assertEqual(
            compute_position_bias(winners, list(winners)), (0.0, "none")
        )

    def test_full_first_position_bias(self):
        """Judge always picks whichever answer is shown first."""
        result = compute_position_bias(["model_1"] * 3, ["model_2"] * 3)
        self.assertEqual(result, (1.0, "first"))

    def test_full_second_position_bias(self):
        """Judge always picks whichever answer is shown second."""
        result = compute_position_bias(["model_2"] * 2, ["model_1"] * 2)
        self.assertEqual(result, (1.0, "second"))

    def test_mixed_bias(self):
        """Half the pairs flip, one in each direction -> no net direction."""
        game1 = ["model_1", "model_2", "model_1", "model_2"]
        game2 = ["model_1", "model_2", "model_2", "model_1"]
        rate, direction = compute_position_bias(game1, game2)
        self.assertAlmostEqual(rate, 0.5)
        # Flips cancel: one favors position A, one favors position B.
        self.assertEqual(direction, "none")

    def test_empty_input(self):
        """Empty judgment lists report zero bias and no direction."""
        self.assertEqual(compute_position_bias([], []), (0.0, "none"))
49+
50+
51+
class TestCohensKappa(unittest.TestCase):
    """Tests for compute_cohens_kappa on synthetic rater data."""

    def test_perfect_agreement(self):
        """Identical verdict lists give kappa of exactly 1."""
        verdicts = ["model_1", "model_2", "tie", "model_1"]
        self.assertAlmostEqual(
            compute_cohens_kappa(verdicts, list(verdicts)), 1.0
        )

    def test_no_agreement_disjoint(self):
        """Disjoint categories: p_o = 0 and p_e = 0, so kappa = 0."""
        kappa = compute_cohens_kappa(["model_1"] * 3, ["model_2"] * 3)
        self.assertAlmostEqual(kappa, 0.0)

    def test_below_chance_agreement(self):
        """Systematic disagreement across all categories drives kappa < 0."""
        rater_a = ["model_1", "model_2", "tie", "model_1", "model_2", "tie"]
        rater_b = ["model_2", "tie", "model_1", "model_2", "tie", "model_1"]
        self.assertLess(compute_cohens_kappa(rater_a, rater_b), 0.0)

    def test_chance_agreement(self):
        """Observed agreement equal to chance agreement yields kappa ~0."""
        # p_o = 2/4 = 0.5 and p_e = 0.5 * 0.5 + 0.5 * 0.5 = 0.5.
        rater_a = ["model_1", "model_2", "model_1", "model_2"]
        rater_b = ["model_2", "model_1", "model_1", "model_2"]
        self.assertAlmostEqual(compute_cohens_kappa(rater_a, rater_b), 0.0)

    def test_empty_input(self):
        """Empty input defaults to kappa of 0."""
        self.assertEqual(compute_cohens_kappa([], []), 0.0)

    def test_all_same_category(self):
        """Both raters always emitting one shared category counts as 1."""
        self.assertAlmostEqual(
            compute_cohens_kappa(["tie"] * 3, ["tie"] * 3), 1.0
        )
93+
94+
95+
class TestInterpretKappa(unittest.TestCase):
    """Tests for the Landis & Koch interpretation of kappa values."""

    def test_scale(self):
        """One representative value from each band of the scale."""
        cases = [
            (-0.1, "poor"),
            (0.1, "slight"),
            (0.3, "fair"),
            (0.5, "moderate"),
            (0.7, "substantial"),
            (0.9, "almost perfect"),
        ]
        for value, label in cases:
            self.assertEqual(interpret_kappa(value), label)

    def test_boundaries(self):
        """Band edges belong to the higher band (lower bound inclusive)."""
        cases = [
            (0.0, "slight"),
            (0.21, "fair"),
            (0.41, "moderate"),
            (0.61, "substantial"),
            (0.81, "almost perfect"),
        ]
        for value, label in cases:
            self.assertEqual(interpret_kappa(value), label)
110+
111+
112+
# Allow running this test module directly: ``python tests/test_consistency_metrics.py``.
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)