Skip to content

Commit ecf0aa2

Browse files
authored
Merge pull request #4 from microsoft-foundry/copilot/fix-quality-metrics-n-a
Fix compare_results quality metric parsing for judge-enabled runs
2 parents 1e3a3ce + 171786d commit ecf0aa2

2 files changed

Lines changed: 166 additions & 4 deletions

File tree

scripts/compare_results.py

Lines changed: 49 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@ def _load_results(path: Path) -> dict:
3131
return json.load(f)
3232

3333

34+
# Unique sentinel passed as a lookup ``default`` so that "nothing found" can be
# told apart from any real value by an identity check.
_MISSING = object()
35+
36+
3437
def _safe_get(data: dict, *keys, default=None):
3538
"""Safely traverse nested dict keys."""
3639
current = data
@@ -42,6 +45,31 @@ def _safe_get(data: dict, *keys, default=None):
4245
return current
4346

4447

48+
def _quality_metric(data: dict, *keys, default=None):
    """Look up a quality metric under ``data["quality"]``.

    The path ``("quality", *keys)`` is tried first through ``_safe_get``.
    When that lookup comes back as missing, the flat layout is consulted
    instead: a single key is read straight from the quality dict, a
    ``("pairwise", name)`` path falls back to ``name``, and the two
    ``absolute_scores`` paths fall back to their ``*_mean_score`` keys.

    Args:
        data: Parsed results mapping.
        *keys: Path components below ``"quality"``.
        default: Value returned when no layout provides the metric.

    Returns:
        The metric value, or ``default`` if it cannot be found.
    """
    nested = _safe_get(data, "quality", *keys, default=_MISSING)
    if nested is not _MISSING:
        return nested

    section = data.get("quality")
    if not isinstance(section, dict):
        return default

    if len(keys) == 1:
        flat_key = keys[0]
    elif keys[0] == "pairwise" and len(keys) == 2:
        flat_key = keys[1]
    else:
        # Legacy flat names for the absolute scores.
        flat_key = {
            ("absolute_scores", "router_overall"): "router_mean_score",
            ("absolute_scores", "baseline_overall"): "baseline_mean_score",
        }.get(keys)
        if not flat_key:
            return default

    return section.get(flat_key, default)
71+
72+
4573
def _fmt_delta(a, b, unit="", lower_is_better=True):
4674
"""Format a delta with direction indicator."""
4775
if a is None or b is None:
@@ -98,11 +126,28 @@ def _add(category, metric, val_a, val_b, unit="", lower_is_better=True):
98126

99127
# Quality
100128
for metric_key in ["router_win_rate", "baseline_win_rate", "tie_rate"]:
101-
val_a = _safe_get(run_a, "quality", metric_key)
102-
val_b = _safe_get(run_b, "quality", metric_key)
129+
val_a = _quality_metric(run_a, "pairwise", metric_key)
130+
val_b = _quality_metric(run_b, "pairwise", metric_key)
103131
better = metric_key == "router_win_rate" # higher router wins is better
104132
_add("Quality", metric_key, val_a, val_b, "", lower_is_better=not better)
105133

134+
for metric_key in ["router_overall", "baseline_overall"]:
135+
val_a = _quality_metric(run_a, "absolute_scores", metric_key)
136+
val_b = _quality_metric(run_b, "absolute_scores", metric_key)
137+
_add("Quality", metric_key, val_a, val_b, "", lower_is_better=False)
138+
139+
_cat_a = _safe_get(run_a, "quality", "win_rate_by_category")
140+
cat_a = _cat_a if isinstance(_cat_a, dict) else {}
141+
_cat_b = _safe_get(run_b, "quality", "win_rate_by_category")
142+
cat_b = _cat_b if isinstance(_cat_b, dict) else {}
143+
for category in sorted(set(cat_a) | set(cat_b)):
144+
for metric_key in ["router_win_rate", "baseline_win_rate", "tie_rate"]:
145+
val_a = _safe_get(cat_a.get(category, {}), metric_key)
146+
val_b = _safe_get(cat_b.get(category, {}), metric_key)
147+
better = metric_key == "router_win_rate"
148+
_add("Quality by Category", f"{category} {metric_key}", val_a, val_b, "",
149+
lower_is_better=not better)
150+
106151
# Requests
107152
for endpoint in ["model_router", "baseline"]:
108153
req_a = _safe_get(run_a, endpoint, "total_requests")
@@ -178,12 +223,12 @@ def _add(category, metric, val_a, val_b, unit="", lower_is_better=True):
178223
graders = foundry.get("grader_summary", {})
179224

180225
# Quality: map local quality scores to Foundry graders
181-
local_router_score = _safe_get(local, "quality", "router_mean_score")
226+
local_router_score = _quality_metric(local, "absolute_scores", "router_overall")
182227
foundry_router_score = _safe_get(graders, "quality_absolute_router", "mean")
183228
_add("Quality", "router_absolute_score", local_router_score, foundry_router_score, "",
184229
lower_is_better=False)
185230

186-
local_baseline_score = _safe_get(local, "quality", "baseline_mean_score")
231+
local_baseline_score = _quality_metric(local, "absolute_scores", "baseline_overall")
187232
foundry_baseline_score = _safe_get(graders, "quality_absolute_baseline", "mean")
188233
_add("Quality", "baseline_absolute_score", local_baseline_score, foundry_baseline_score, "",
189234
lower_is_better=False)

tests/test_compare_results.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
"""Regression tests for scripts.compare_results."""
2+
3+
from __future__ import annotations
4+
5+
import importlib.util
6+
from pathlib import Path
7+
8+
9+
def _load_compare_results_module():
    """Load ``scripts/compare_results.py`` by file path and return the module.

    The script is located relative to this test file (two directory levels
    up, then ``scripts/``) and executed as a module named
    ``compare_results``.
    """
    repo_root = Path(__file__).resolve().parent.parent
    target = repo_root / "scripts" / "compare_results.py"

    module_spec = importlib.util.spec_from_file_location("compare_results", target)
    assert module_spec is not None

    loaded = importlib.util.module_from_spec(module_spec)
    assert module_spec.loader is not None

    module_spec.loader.exec_module(loaded)
    return loaded
17+
18+
19+
def _row_map(rows):
20+
return {(row["category"], row["metric"]): row for row in rows}
21+
22+
23+
def _base_run():
24+
return {
25+
"model_router": {
26+
"total_requests": 10,
27+
"latency": {"mean_ms": 100.0, "p90_ms": 120.0, "p99_ms": 140.0},
28+
"cost": {"estimated_cost_usd": 1.0},
29+
},
30+
"baseline": {
31+
"total_requests": 10,
32+
"latency": {"mean_ms": 150.0, "p90_ms": 180.0, "p99_ms": 210.0},
33+
"cost": {"estimated_cost_usd": 2.0},
34+
},
35+
}
36+
37+
38+
def test_compare_reads_nested_quality_metrics():
    """compare() reports pairwise, absolute and per-category quality values
    when they are stored in the nested ``quality`` layout."""
    compare_results = _load_compare_results_module()

    quality_a = {
        "pairwise": {
            "router_win_rate": 0.4,
            "baseline_win_rate": 0.2,
            "tie_rate": 0.4,
        },
        "absolute_scores": {
            "router_overall": 4.2,
            "baseline_overall": 3.8,
        },
        "win_rate_by_category": {
            "math": {
                "router_win_rate": 1.0,
                "baseline_win_rate": 0.0,
                "tie_rate": 0.0,
            },
        },
    }
    quality_b = {
        "pairwise": {
            "router_win_rate": 0.5,
            "baseline_win_rate": 0.3,
            "tie_rate": 0.2,
        },
        "absolute_scores": {
            "router_overall": 4.5,
            "baseline_overall": 4.0,
        },
        "win_rate_by_category": {
            "math": {
                "router_win_rate": 0.5,
                "baseline_win_rate": 0.5,
                "tie_rate": 0.0,
            },
        },
    }
    run_a = {**_base_run(), "quality": quality_a}
    run_b = {**_base_run(), "quality": quality_b}

    by_key = _row_map(compare_results.compare(run_a, run_b, "run-a", "run-b"))

    # (row key, column, expected value) — checked in the listed order.
    expectations = [
        (("Quality", "router_win_rate"), "run_a", 0.4),
        (("Quality", "router_win_rate"), "run_b", 0.5),
        (("Quality", "baseline_overall"), "run_a", 3.8),
        (("Quality", "router_overall"), "run_b", 4.5),
        (("Quality by Category", "math router_win_rate"), "run_a", 1.0),
        (("Quality by Category", "math baseline_win_rate"), "run_b", 0.5),
    ]
    for row_key, column, expected in expectations:
        assert by_key[row_key][column] == expected
89+
90+
91+
def test_compare_keeps_legacy_flat_quality_metrics():
    """compare() still reads quality values stored flat under ``quality``,
    including the ``*_mean_score`` keys surfaced as ``*_overall`` rows."""
    compare_results = _load_compare_results_module()

    flat_quality_a = {
        "router_win_rate": 0.4,
        "baseline_win_rate": 0.2,
        "tie_rate": 0.4,
        "router_mean_score": 4.1,
        "baseline_mean_score": 3.7,
    }
    flat_quality_b = {
        "router_win_rate": 0.6,
        "baseline_win_rate": 0.1,
        "tie_rate": 0.3,
        "router_mean_score": 4.4,
        "baseline_mean_score": 3.9,
    }
    run_a = {**_base_run(), "quality": flat_quality_a}
    run_b = {**_base_run(), "quality": flat_quality_b}

    by_key = _row_map(compare_results.compare(run_a, run_b, "run-a", "run-b"))

    # (row key, column, expected value) — checked in the listed order.
    expectations = [
        (("Quality", "router_win_rate"), "run_a", 0.4),
        (("Quality", "baseline_win_rate"), "run_b", 0.1),
        (("Quality", "router_overall"), "run_a", 4.1),
        (("Quality", "baseline_overall"), "run_b", 3.9),
    ]
    for row_key, column, expected in expectations:
        assert by_key[row_key][column] == expected

0 commit comments

Comments
 (0)