@@ -31,6 +31,9 @@ def _load_results(path: Path) -> dict:
3131 return json .load (f )
3232
3333
# Sentinel passed as the `default` to _safe_get by _quality_metric, which then
# checks identity against it to tell "key not found" apart from a stored value.
34+ _MISSING = object ()
35+
36+
3437def _safe_get (data : dict , * keys , default = None ):
3538 """Safely traverse nested dict keys."""
3639 current = data
@@ -42,6 +45,31 @@ def _safe_get(data: dict, *keys, default=None):
4245 return current
4346
4447
def _quality_metric(data: dict, *keys, default=None):
    """Read a quality metric from nested or legacy flat results.json layouts.

    Lookup order:

    1. The nested path ``"quality" -> keys[0] -> keys[1] -> ...`` resolved
       through ``_safe_get``.
    2. A flat key directly under the ``"quality"`` section: the key itself
       when a single key was requested, or the metric name when the request
       was ``("pairwise", <metric>)``.
    3. A renamed flat key for ``absolute_scores`` metrics
       (``router_overall`` -> ``router_mean_score``,
       ``baseline_overall`` -> ``baseline_mean_score``).

    Args:
        data: Parsed results mapping. It is only ever accessed through
            ``_safe_get``, so tolerance of unexpected shapes is whatever
            that helper provides.
        *keys: Path components below the ``"quality"`` section.
        default: Value returned when none of the layouts has the metric.

    Returns:
        The value from the first layout that provides it, else ``default``.
    """
    value = _safe_get(data, "quality", *keys, default=_MISSING)
    if value is not _MISSING:
        return value

    # Fetch the section through _safe_get instead of data.get(...) so the
    # fallback accepts the same inputs as the nested lookup above.
    quality = _safe_get(data, "quality")
    if not isinstance(quality, dict) or not keys:
        # No usable section, or no key to look up (guards keys[0] below).
        return default

    if len(keys) == 1:
        return quality.get(keys[0], default)
    if len(keys) == 2 and keys[0] == "pairwise":
        # Legacy layout stored pairwise rates directly under "quality".
        return quality.get(keys[1], default)

    # Legacy layout used different flat names for the absolute scores.
    legacy_key = {
        ("absolute_scores", "router_overall"): "router_mean_score",
        ("absolute_scores", "baseline_overall"): "baseline_mean_score",
    }.get(keys)
    if legacy_key is None:
        return default
    return quality.get(legacy_key, default)
71+
72+
4573def _fmt_delta (a , b , unit = "" , lower_is_better = True ):
4674 """Format a delta with direction indicator."""
4775 if a is None or b is None :
@@ -98,11 +126,28 @@ def _add(category, metric, val_a, val_b, unit="", lower_is_better=True):
98126
99127 # Quality
100128 for metric_key in ["router_win_rate" , "baseline_win_rate" , "tie_rate" ]:
101- val_a = _safe_get (run_a , "quality " , metric_key )
102- val_b = _safe_get (run_b , "quality " , metric_key )
129+ val_a = _quality_metric (run_a , "pairwise " , metric_key )
130+ val_b = _quality_metric (run_b , "pairwise " , metric_key )
103131 better = metric_key == "router_win_rate" # higher router wins is better
104132 _add ("Quality" , metric_key , val_a , val_b , "" , lower_is_better = not better )
105133
134+ for metric_key in ["router_overall" , "baseline_overall" ]:
135+ val_a = _quality_metric (run_a , "absolute_scores" , metric_key )
136+ val_b = _quality_metric (run_b , "absolute_scores" , metric_key )
137+ _add ("Quality" , metric_key , val_a , val_b , "" , lower_is_better = False )
138+
139+ _cat_a = _safe_get (run_a , "quality" , "win_rate_by_category" )
140+ cat_a = _cat_a if isinstance (_cat_a , dict ) else {}
141+ _cat_b = _safe_get (run_b , "quality" , "win_rate_by_category" )
142+ cat_b = _cat_b if isinstance (_cat_b , dict ) else {}
143+ for category in sorted (set (cat_a ) | set (cat_b )):
144+ for metric_key in ["router_win_rate" , "baseline_win_rate" , "tie_rate" ]:
145+ val_a = _safe_get (cat_a .get (category , {}), metric_key )
146+ val_b = _safe_get (cat_b .get (category , {}), metric_key )
147+ better = metric_key == "router_win_rate"
148+ _add ("Quality by Category" , f"{ category } { metric_key } " , val_a , val_b , "" ,
149+ lower_is_better = not better )
150+
106151 # Requests
107152 for endpoint in ["model_router" , "baseline" ]:
108153 req_a = _safe_get (run_a , endpoint , "total_requests" )
@@ -178,12 +223,12 @@ def _add(category, metric, val_a, val_b, unit="", lower_is_better=True):
178223 graders = foundry .get ("grader_summary" , {})
179224
180225 # Quality: map local quality scores to Foundry graders
181- local_router_score = _safe_get (local , "quality " , "router_mean_score " )
226+ local_router_score = _quality_metric (local , "absolute_scores " , "router_overall " )
182227 foundry_router_score = _safe_get (graders , "quality_absolute_router" , "mean" )
183228 _add ("Quality" , "router_absolute_score" , local_router_score , foundry_router_score , "" ,
184229 lower_is_better = False )
185230
186- local_baseline_score = _safe_get (local , "quality " , "baseline_mean_score " )
231+ local_baseline_score = _quality_metric (local , "absolute_scores " , "baseline_overall " )
187232 foundry_baseline_score = _safe_get (graders , "quality_absolute_baseline" , "mean" )
188233 _add ("Quality" , "baseline_absolute_score" , local_baseline_score , foundry_baseline_score , "" ,
189234 lower_is_better = False )
0 commit comments