Skip to content

Commit f63d334

Browse files
authored
test: add compare-results regression coverage
1 parent a78b2c5 commit f63d334

2 files changed

Lines changed: 156 additions & 4 deletions

File tree

scripts/compare_results.py

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,27 @@ def _safe_get(data: dict, *keys, default=None):
4242
return current
4343

4444

45+
def _quality_metric(data: dict, *keys, default=None):
    """Read a quality metric from the nested or the legacy flat layout.

    The nested path ``data["quality"][keys[0]][keys[1]]...`` is tried first
    via ``_safe_get``. If that lookup hands back the ``default`` object, the
    metric is looked up directly under ``data["quality"]`` (flat layout):

    * a single key is looked up unchanged;
    * ``("pairwise", <name>)`` is looked up as ``<name>``;
    * ``("absolute_scores", "router_overall")`` is looked up as
      ``router_mean_score`` and ``("absolute_scores", "baseline_overall")``
      as ``baseline_mean_score``.

    Args:
        data: Parsed results document.
        *keys: Path of the metric below the ``"quality"`` section.
        default: Value returned when the metric is found in neither layout.

    Returns:
        The metric value, or ``default`` when it cannot be found.
    """
    value = _safe_get(data, "quality", *keys, default=default)
    if value is not default:
        return value

    # A missing, null, or otherwise non-dict section has no flat metrics to
    # offer; bail out instead of calling .get() on it.
    quality = data.get("quality") if isinstance(data, dict) else None
    if not isinstance(quality, dict) or not keys:
        return default

    if len(keys) == 1:
        return quality.get(keys[0], default)

    # Flat-layout names for metrics that were renamed when they moved into
    # the nested "absolute_scores" section.
    legacy_renames = {
        ("absolute_scores", "router_overall"): "router_mean_score",
        ("absolute_scores", "baseline_overall"): "baseline_mean_score",
    }
    legacy_key = legacy_renames.get(keys)

    # Pairwise rates kept their names; in the flat layout they simply sit
    # directly under "quality" instead of under "quality" -> "pairwise".
    if legacy_key is None and len(keys) == 2 and keys[0] == "pairwise":
        legacy_key = keys[1]

    if legacy_key is None:
        return default
    return quality.get(legacy_key, default)
64+
65+
4566
def _fmt_delta(a, b, unit="", lower_is_better=True):
4667
"""Format a delta with direction indicator."""
4768
if a is None or b is None:
@@ -98,11 +119,26 @@ def _add(category, metric, val_a, val_b, unit="", lower_is_better=True):
98119

99120
# Quality
100121
for metric_key in ["router_win_rate", "baseline_win_rate", "tie_rate"]:
101-
val_a = _safe_get(run_a, "quality", metric_key)
102-
val_b = _safe_get(run_b, "quality", metric_key)
122+
val_a = _quality_metric(run_a, "pairwise", metric_key)
123+
val_b = _quality_metric(run_b, "pairwise", metric_key)
103124
better = metric_key == "router_win_rate" # higher router wins is better
104125
_add("Quality", metric_key, val_a, val_b, "", lower_is_better=not better)
105126

127+
for metric_key in ["router_overall", "baseline_overall"]:
128+
val_a = _quality_metric(run_a, "absolute_scores", metric_key)
129+
val_b = _quality_metric(run_b, "absolute_scores", metric_key)
130+
_add("Quality", metric_key, val_a, val_b, "", lower_is_better=False)
131+
132+
cat_a = _safe_get(run_a, "quality", "win_rate_by_category", default={}) or {}
133+
cat_b = _safe_get(run_b, "quality", "win_rate_by_category", default={}) or {}
134+
for category in sorted(set(cat_a) | set(cat_b)):
135+
for metric_key in ["router_win_rate", "baseline_win_rate", "tie_rate"]:
136+
val_a = _safe_get(cat_a.get(category, {}), metric_key)
137+
val_b = _safe_get(cat_b.get(category, {}), metric_key)
138+
better = metric_key == "router_win_rate"
139+
_add("Quality by Category", f"{category} {metric_key}", val_a, val_b, "",
140+
lower_is_better=not better)
141+
106142
# Requests
107143
for endpoint in ["model_router", "baseline"]:
108144
req_a = _safe_get(run_a, endpoint, "total_requests")
@@ -178,12 +214,12 @@ def _add(category, metric, val_a, val_b, unit="", lower_is_better=True):
178214
graders = foundry.get("grader_summary", {})
179215

180216
# Quality: map local quality scores to Foundry graders
181-
local_router_score = _safe_get(local, "quality", "router_mean_score")
217+
local_router_score = _quality_metric(local, "absolute_scores", "router_overall")
182218
foundry_router_score = _safe_get(graders, "quality_absolute_router", "mean")
183219
_add("Quality", "router_absolute_score", local_router_score, foundry_router_score, "",
184220
lower_is_better=False)
185221

186-
local_baseline_score = _safe_get(local, "quality", "baseline_mean_score")
222+
local_baseline_score = _quality_metric(local, "absolute_scores", "baseline_overall")
187223
foundry_baseline_score = _safe_get(graders, "quality_absolute_baseline", "mean")
188224
_add("Quality", "baseline_absolute_score", local_baseline_score, foundry_baseline_score, "",
189225
lower_is_better=False)

tests/test_compare_results.py

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
"""Regression tests for scripts.compare_results."""
2+
3+
from __future__ import annotations
4+
5+
import importlib.util
6+
from pathlib import Path
7+
8+
9+
def _load_compare_results_module():
    """Import ``scripts/compare_results.py`` by file path and return the module.

    The script is resolved relative to this file: two levels up from the
    resolved path of this file, then ``scripts/compare_results.py``. It is
    executed under the module name ``compare_results``.
    """
    base_dir = Path(__file__).resolve().parents[1]
    target = base_dir / "scripts" / "compare_results.py"

    module_spec = importlib.util.spec_from_file_location("compare_results", target)
    loaded = importlib.util.module_from_spec(module_spec)

    loader = module_spec.loader
    assert loader is not None
    loader.exec_module(loaded)
    return loaded
16+
17+
18+
def _row_map(rows):
19+
return {(row["category"], row["metric"]): row for row in rows}
20+
21+
22+
def _base_run():
23+
return {
24+
"model_router": {
25+
"total_requests": 10,
26+
"latency": {"mean_ms": 100.0, "p90_ms": 120.0, "p99_ms": 140.0},
27+
"cost": {"estimated_cost_usd": 1.0},
28+
},
29+
"baseline": {
30+
"total_requests": 10,
31+
"latency": {"mean_ms": 150.0, "p90_ms": 180.0, "p99_ms": 210.0},
32+
"cost": {"estimated_cost_usd": 2.0},
33+
},
34+
}
35+
36+
37+
def test_compare_reads_nested_quality_metrics():
    """Check that nested ``quality`` sections show up in the comparison rows.

    Both runs carry ``pairwise``, ``absolute_scores`` and a single ``math``
    entry under ``win_rate_by_category``; the assertions spot-check the
    overall rows and the per-category rows.
    """

    def _rates(router, baseline, tie):
        return {
            "router_win_rate": router,
            "baseline_win_rate": baseline,
            "tie_rate": tie,
        }

    def _run(pairwise, router_overall, baseline_overall, math_rates):
        quality = {
            "pairwise": pairwise,
            "absolute_scores": {
                "router_overall": router_overall,
                "baseline_overall": baseline_overall,
            },
            "win_rate_by_category": {"math": math_rates},
        }
        return {**_base_run(), "quality": quality}

    compare_results = _load_compare_results_module()
    first = _run(_rates(0.4, 0.2, 0.4), 4.2, 3.8, _rates(1.0, 0.0, 0.0))
    second = _run(_rates(0.5, 0.3, 0.2), 4.5, 4.0, _rates(0.5, 0.5, 0.0))

    rows = _row_map(compare_results.compare(first, second, "run-a", "run-b"))

    # Overall quality rows.
    router_wins = rows[("Quality", "router_win_rate")]
    assert router_wins["run_a"] == 0.4
    assert router_wins["run_b"] == 0.5
    assert rows[("Quality", "baseline_overall")]["run_a"] == 3.8
    assert rows[("Quality", "router_overall")]["run_b"] == 4.5

    # Per-category rows are keyed as "<category> <metric>".
    assert rows[("Quality by Category", "math router_win_rate")]["run_a"] == 1.0
    assert rows[("Quality by Category", "math baseline_win_rate")]["run_b"] == 0.5
88+
89+
90+
def test_compare_keeps_legacy_flat_quality_metrics():
    """Check that flat ``quality`` sections still feed the comparison rows.

    Both runs store win rates and ``*_mean_score`` values directly under
    ``quality``; the assertions expect them under the ``Quality`` category,
    with the mean scores reported as ``router_overall`` / ``baseline_overall``.
    """

    def _flat_run(router_wins, baseline_wins, ties, router_mean, baseline_mean):
        flat_quality = {
            "router_win_rate": router_wins,
            "baseline_win_rate": baseline_wins,
            "tie_rate": ties,
            "router_mean_score": router_mean,
            "baseline_mean_score": baseline_mean,
        }
        return {**_base_run(), "quality": flat_quality}

    compare_results = _load_compare_results_module()
    first = _flat_run(0.4, 0.2, 0.4, 4.1, 3.7)
    second = _flat_run(0.6, 0.1, 0.3, 4.4, 3.9)

    rows = _row_map(compare_results.compare(first, second, "run-a", "run-b"))

    # Win rates keep their names.
    assert rows[("Quality", "router_win_rate")]["run_a"] == 0.4
    assert rows[("Quality", "baseline_win_rate")]["run_b"] == 0.1

    # Mean scores are expected under the newer "*_overall" metric names.
    assert rows[("Quality", "router_overall")]["run_a"] == 4.1
    assert rows[("Quality", "baseline_overall")]["run_b"] == 3.9

0 commit comments

Comments
 (0)