@@ -42,6 +42,27 @@ def _safe_get(data: dict, *keys, default=None):
4242 return current
4343
4444
def _quality_metric(data: dict, *keys, default=None):
    """Read a quality metric, tolerating nested and flat ``quality`` layouts.

    Lookup order:

    1. The nested path ``data["quality"][keys[0]][keys[1]]...`` via
       ``_safe_get``.
    2. A flat fallback directly under ``data["quality"]``:

       * a single key is looked up as-is;
       * ``("pairwise", name)`` falls back to ``name`` (the flat location the
         pairwise win/tie rates were read from before the nested layout);
       * ``("absolute_scores", "router_overall")`` and
         ``("absolute_scores", "baseline_overall")`` fall back to
         ``"router_mean_score"`` / ``"baseline_mean_score"``.

    Args:
        data: Parsed results mapping. Anything that is not a dict, or whose
            ``"quality"`` entry is not a dict, yields ``default`` from the
            fallback step instead of raising.
        *keys: Path components below ``"quality"``.
        default: Value returned when the metric cannot be found.

    Returns:
        The metric value, or ``default`` when no layout provides it.
    """
    value = _safe_get(data, "quality", *keys, default=default)
    # Identity check: anything other than the sentinel counts as "found".
    if value is not default:
        return value

    # ``"quality": null`` (or a non-dict payload) must not crash the fallback.
    quality = data.get("quality") if isinstance(data, dict) else None
    if not isinstance(quality, dict):
        return default

    if len(keys) == 1:
        return quality.get(keys[0], default)

    if len(keys) == 2 and keys[0] == "pairwise":
        # Flat layout stored pairwise rates under the same metric name.
        return quality.get(keys[1], default)

    legacy_map = {
        ("absolute_scores", "router_overall"): "router_mean_score",
        ("absolute_scores", "baseline_overall"): "baseline_mean_score",
    }
    legacy_key = legacy_map.get(keys)
    if legacy_key:
        return quality.get(legacy_key, default)

    return default
64+
65+
4566def _fmt_delta (a , b , unit = "" , lower_is_better = True ):
4667 """Format a delta with direction indicator."""
4768 if a is None or b is None :
@@ -98,11 +119,26 @@ def _add(category, metric, val_a, val_b, unit="", lower_is_better=True):
98119
99120 # Quality
100121 for metric_key in ["router_win_rate" , "baseline_win_rate" , "tie_rate" ]:
101- val_a = _safe_get (run_a , "quality " , metric_key )
102- val_b = _safe_get (run_b , "quality " , metric_key )
122+ val_a = _quality_metric (run_a , "pairwise " , metric_key )
123+ val_b = _quality_metric (run_b , "pairwise " , metric_key )
103124 better = metric_key == "router_win_rate" # higher router wins is better
104125 _add ("Quality" , metric_key , val_a , val_b , "" , lower_is_better = not better )
105126
127+ for metric_key in ["router_overall" , "baseline_overall" ]:
128+ val_a = _quality_metric (run_a , "absolute_scores" , metric_key )
129+ val_b = _quality_metric (run_b , "absolute_scores" , metric_key )
130+ _add ("Quality" , metric_key , val_a , val_b , "" , lower_is_better = False )
131+
132+ cat_a = _safe_get (run_a , "quality" , "win_rate_by_category" , default = {}) or {}
133+ cat_b = _safe_get (run_b , "quality" , "win_rate_by_category" , default = {}) or {}
134+ for category in sorted (set (cat_a ) | set (cat_b )):
135+ for metric_key in ["router_win_rate" , "baseline_win_rate" , "tie_rate" ]:
136+ val_a = _safe_get (cat_a .get (category , {}), metric_key )
137+ val_b = _safe_get (cat_b .get (category , {}), metric_key )
138+ better = metric_key == "router_win_rate"
139+ _add ("Quality by Category" , f"{ category } { metric_key } " , val_a , val_b , "" ,
140+ lower_is_better = not better )
141+
106142 # Requests
107143 for endpoint in ["model_router" , "baseline" ]:
108144 req_a = _safe_get (run_a , endpoint , "total_requests" )
@@ -178,12 +214,12 @@ def _add(category, metric, val_a, val_b, unit="", lower_is_better=True):
178214 graders = foundry .get ("grader_summary" , {})
179215
180216 # Quality: map local quality scores to Foundry graders
181- local_router_score = _safe_get (local , "quality " , "router_mean_score " )
217+ local_router_score = _quality_metric (local , "absolute_scores " , "router_overall " )
182218 foundry_router_score = _safe_get (graders , "quality_absolute_router" , "mean" )
183219 _add ("Quality" , "router_absolute_score" , local_router_score , foundry_router_score , "" ,
184220 lower_is_better = False )
185221
186- local_baseline_score = _safe_get (local , "quality " , "baseline_mean_score " )
222+ local_baseline_score = _quality_metric (local , "absolute_scores " , "baseline_overall " )
187223 foundry_baseline_score = _safe_get (graders , "quality_absolute_baseline" , "mean" )
188224 _add ("Quality" , "baseline_absolute_score" , local_baseline_score , foundry_baseline_score , "" ,
189225 lower_is_better = False )
0 commit comments