1+ {
2+ "generated_at" : " 2026-03-04T23:08:01.932836+00:00" ,
3+ "source" : " runs/official/_raw" ,
4+ "canonical_pairing_rule" : " latest valid per side per (model, task_id); one pair per task" ,
5+ "sensitivity_pairing_rule" : " count-matched per (model, task_id), newest-first within side" ,
6+ "valid_filter" : " output_tokens > 0 and agent_execution_seconds >= 10" ,
7+ "records_scanned" : 7703 ,
8+ "latest_task" : {
9+ "all_pairs" : {
10+ "pair_count" : 502 ,
11+ "model_summary" : {
12+ "haiku" : {
13+ "pairs" : 392 ,
14+ "baseline_total_cost_usd" : 174.00601425 ,
15+ "mcp_total_cost_usd" : 198.0197303500001 ,
16+ "baseline_avg_cost_usd" : 0.44389289349489797 ,
17+ "mcp_avg_cost_usd" : 0.5051523733418369 ,
18+ "delta_cost_usd" : 24.013716100000096 ,
19+ "pct_delta_cost_of_means" : 13.800509254524297 ,
20+ "input_ratio_mcp_over_baseline" : 0.8004334702678465
21+ },
22+ "sonnet" : {
23+ "pairs" : 9 ,
24+ "baseline_total_cost_usd" : 13.34744245 ,
25+ "mcp_total_cost_usd" : 12.55560455 ,
26+ "baseline_avg_cost_usd" : 1.4830491611111112 ,
27+ "mcp_avg_cost_usd" : 1.3950671722222223 ,
28+ "delta_cost_usd" : -0.7918379000000009 ,
29+ "pct_delta_cost_of_means" : -5.932506567953033 ,
30+ "input_ratio_mcp_over_baseline" : 0.24737631184407796
31+ },
32+ "opus" : {
33+ "pairs" : 101 ,
34+ "baseline_total_cost_usd" : 5653.1065816499995 ,
35+ "mcp_total_cost_usd" : 9119.03358425 ,
36+ "baseline_avg_cost_usd" : 55.97135229356435 ,
37+ "mcp_avg_cost_usd" : 90.28746123019803 ,
38+ "delta_cost_usd" : 3465.9270026000013 ,
39+ "pct_delta_cost_of_means" : 61.31013014773878 ,
40+ "input_ratio_mcp_over_baseline" : 1.2365269473317915
41+ }
42+ },
43+ "size_summary" : {
44+ "haiku_by_context_length" : {
45+ "100k-1m" : {
46+ "pairs" : 98 ,
47+ "baseline_avg_cost_usd" : 0.35356851377551024 ,
48+ "mcp_avg_cost_usd" : 0.4815632173469387 ,
49+ "delta_avg_cost_usd" : 0.12799470357142845 ,
50+ "pct_delta_cost_of_means" : 36.200820656981804
51+ },
52+ "<100k" : {
53+ "pairs" : 222 ,
54+ "baseline_avg_cost_usd" : 0.22787664121621623 ,
55+ "mcp_avg_cost_usd" : 0.21461094842342338 ,
56+ "delta_avg_cost_usd" : -0.013265692792792837 ,
57+ "pct_delta_cost_of_means" : -5.821435984834422
58+ },
59+ "unknown" : {
60+ "pairs" : 72 ,
61+ "baseline_avg_cost_usd" : 1.232884521527778 ,
62+ "mcp_avg_cost_usd" : 1.4330958958333335 ,
63+ "delta_avg_cost_usd" : 0.2002113743055555 ,
64+ "pct_delta_cost_of_means" : 16.239264165426913
65+ }
66+ },
67+ "haiku_by_files_count" : {
68+ "10-100" : {
69+ "pairs" : 91 ,
70+ "baseline_avg_cost_usd" : 0.35155932912087917 ,
71+ "mcp_avg_cost_usd" : 0.4929996104395605 ,
72+ "delta_avg_cost_usd" : 0.1414402813186813 ,
73+ "pct_delta_cost_of_means" : 40.23226511222773
74+ },
75+ "<10" : {
76+ "pairs" : 168 ,
77+ "baseline_avg_cost_usd" : 0.2537777163690476 ,
78+ "mcp_avg_cost_usd" : 0.2341883002976191 ,
79+ "delta_avg_cost_usd" : -0.01958941607142851 ,
80+ "pct_delta_cost_of_means" : -7.719123787425552
81+ },
82+ "unknown" : {
83+ "pairs" : 133 ,
84+ "baseline_avg_cost_usd" : 0.7472139770676692 ,
85+ "mcp_avg_cost_usd" : 0.855737829699248 ,
86+ "delta_avg_cost_usd" : 0.10852385263157878 ,
87+ "pct_delta_cost_of_means" : 14.52379853190977
88+ }
89+ }
90+ }
91+ },
92+ "valid_only" : {
93+ "pair_count" : 497 ,
94+ "model_summary" : {
95+ "haiku" : {
96+ "pairs" : 392 ,
97+ "baseline_total_cost_usd" : 287.4485963499998 ,
98+ "mcp_total_cost_usd" : 200.74471865000007 ,
99+ "baseline_avg_cost_usd" : 0.7332872355867341 ,
100+ "mcp_avg_cost_usd" : 0.512103874107143 ,
101+ "delta_cost_usd" : -86.70387769999971 ,
102+ "pct_delta_cost_of_means" : -30.163263554235055 ,
103+ "input_ratio_mcp_over_baseline" : 0.3431840458617894
104+ },
105+ "sonnet" : {
106+ "pairs" : 9 ,
107+ "baseline_total_cost_usd" : 13.34744245 ,
108+ "mcp_total_cost_usd" : 12.55560455 ,
109+ "baseline_avg_cost_usd" : 1.4830491611111112 ,
110+ "mcp_avg_cost_usd" : 1.3950671722222223 ,
111+ "delta_cost_usd" : -0.7918379000000009 ,
112+ "pct_delta_cost_of_means" : -5.932506567953033 ,
113+ "input_ratio_mcp_over_baseline" : 0.24737631184407796
114+ },
115+ "opus" : {
116+ "pairs" : 96 ,
117+ "baseline_total_cost_usd" : 5654.35168975 ,
118+ "mcp_total_cost_usd" : 9109.5925864 ,
119+ "baseline_avg_cost_usd" : 58.89949676822917 ,
120+ "mcp_avg_cost_usd" : 94.89158944166667 ,
121+ "delta_cost_usd" : 3455.2408966499997 ,
122+ "pct_delta_cost_of_means" : 61.10764038454724 ,
123+ "input_ratio_mcp_over_baseline" : 1.2363819844806747
124+ }
125+ },
126+ "size_summary" : {
127+ "haiku_by_context_length" : {
128+ "100k-1m" : {
129+ "pairs" : 98 ,
130+ "baseline_avg_cost_usd" : 1.483237409693878 ,
131+ "mcp_avg_cost_usd" : 0.5093692204081632 ,
132+ "delta_avg_cost_usd" : -0.9738681892857148 ,
133+ "pct_delta_cost_of_means" : -65.65828119766135
134+ },
135+ "<100k" : {
136+ "pairs" : 222 ,
137+ "baseline_avg_cost_usd" : 0.23492972049549551 ,
138+ "mcp_avg_cost_usd" : 0.21461094842342338 ,
139+ "delta_avg_cost_usd" : -0.020318772072072132 ,
140+ "pct_delta_cost_of_means" : -8.648872534823326
141+ },
142+ "unknown" : {
143+ "pairs" : 72 ,
144+ "baseline_avg_cost_usd" : 1.2491240590277777 ,
145+ "mcp_avg_cost_usd" : 1.4330958958333335 ,
146+ "delta_avg_cost_usd" : 0.18397183680555568 ,
147+ "pct_delta_cost_of_means" : 14.728067678781652
148+ }
149+ },
150+ "haiku_by_files_count" : {
151+ "10-100" : {
152+ "pairs" : 91 ,
153+ "baseline_avg_cost_usd" : 1.5676902109890112 ,
154+ "mcp_avg_cost_usd" : 0.5063856565934066 ,
155+ "delta_avg_cost_usd" : -1.0613045543956046 ,
156+ "pct_delta_cost_of_means" : -67.69861462144729
157+ },
158+ "<10" : {
159+ "pairs" : 168 ,
160+ "baseline_avg_cost_usd" : 0.26154956279761904 ,
161+ "mcp_avg_cost_usd" : 0.24315769375000004 ,
162+ "delta_avg_cost_usd" : -0.018391869047619025 ,
163+ "pct_delta_cost_of_means" : -7.031886748688709
164+ },
165+ "unknown" : {
166+ "pairs" : 133 ,
167+ "baseline_avg_cost_usd" : 0.7582591022556393 ,
168+ "mcp_avg_cost_usd" : 0.855737829699248 ,
169+ "delta_avg_cost_usd" : 0.09747872744360878 ,
170+ "pct_delta_cost_of_means" : 12.855596082346121
171+ }
172+ }
173+ }
174+ }
175+ },
176+ "count_matched" : {
177+ "all_pairs" : {
178+ "pair_count" : 3390 ,
179+ "model_summary" : {
180+ "haiku" : {
181+ "pairs" : 3275 ,
182+ "baseline_total_cost_usd" : 1838.4468086999973 ,
183+ "mcp_total_cost_usd" : 1427.1376392500022 ,
184+ "baseline_avg_cost_usd" : 0.5613578041832052 ,
185+ "mcp_avg_cost_usd" : 0.43576721809160374 ,
186+ "delta_cost_usd" : -411.3091694499951 ,
187+ "pct_delta_cost_of_means" : -22.372644533612597 ,
188+ "input_ratio_mcp_over_baseline" : 0.5995627052572636
189+ },
190+ "sonnet" : {
191+ "pairs" : 9 ,
192+ "baseline_total_cost_usd" : 13.34744245 ,
193+ "mcp_total_cost_usd" : 12.55560455 ,
194+ "baseline_avg_cost_usd" : 1.4830491611111112 ,
195+ "mcp_avg_cost_usd" : 1.3950671722222223 ,
196+ "delta_cost_usd" : -0.7918379000000009 ,
197+ "pct_delta_cost_of_means" : -5.932506567953033 ,
198+ "input_ratio_mcp_over_baseline" : 0.24737631184407796
199+ },
200+ "opus" : {
201+ "pairs" : 106 ,
202+ "baseline_total_cost_usd" : 5964.47455875 ,
203+ "mcp_total_cost_usd" : 9588.241008750001 ,
204+ "baseline_avg_cost_usd" : 56.26862791273585 ,
205+ "mcp_avg_cost_usd" : 90.45510385613208 ,
206+ "delta_cost_usd" : 3623.766450000001 ,
207+ "pct_delta_cost_of_means" : 60.75583715390094 ,
208+ "input_ratio_mcp_over_baseline" : 1.243122149086989
209+ }
210+ },
211+ "size_summary" : {
212+ "haiku_by_context_length" : {
213+ "100k-1m" : {
214+ "pairs" : 473 ,
215+ "baseline_avg_cost_usd" : 2.1341574990486274 ,
216+ "mcp_avg_cost_usd" : 1.2989662008456662 ,
217+ "delta_avg_cost_usd" : -0.8351912982029611 ,
218+ "pct_delta_cost_of_means" : -39.134473372995004
219+ },
220+ "<100k" : {
221+ "pairs" : 1104 ,
222+ "baseline_avg_cost_usd" : 0.22202567758152175 ,
223+ "mcp_avg_cost_usd" : 0.21036057576992792 ,
224+ "delta_avg_cost_usd" : -0.011665101811593855 ,
225+ "pct_delta_cost_of_means" : -5.253942669451261
226+ },
227+ "unknown" : {
228+ "pairs" : 1698 ,
229+ "baseline_avg_cost_usd" : 0.34385981366313306 ,
230+ "mcp_avg_cost_usd" : 0.3418660486454651 ,
231+ "delta_avg_cost_usd" : -0.0019937650176679615 ,
232+ "pct_delta_cost_of_means" : -0.5798191409541076
233+ }
234+ },
235+ "haiku_by_files_count" : {
236+ "10-100" : {
237+ "pairs" : 445 ,
238+ "baseline_avg_cost_usd" : 2.2411581459550582 ,
239+ "mcp_avg_cost_usd" : 1.348002051460674 ,
240+ "delta_avg_cost_usd" : -0.8931560944943839 ,
241+ "pct_delta_cost_of_means" : -39.85243505043996
242+ },
243+ "<10" : {
244+ "pairs" : 880 ,
245+ "baseline_avg_cost_usd" : 0.2421480316477273 ,
246+ "mcp_avg_cost_usd" : 0.2284137342613639 ,
247+ "delta_avg_cost_usd" : -0.013734297386363408 ,
248+ "pct_delta_cost_of_means" : -5.671860015919449
249+ },
250+ "unknown" : {
251+ "pairs" : 1950 ,
252+ "baseline_avg_cost_usd" : 0.3220723927692305 ,
253+ "mcp_avg_cost_usd" : 0.3211654565128203 ,
254+ "delta_avg_cost_usd" : -0.0009069362564101678 ,
255+ "pct_delta_cost_of_means" : -0.28159391390618627
256+ }
257+ }
258+ }
259+ },
260+ "valid_only" : {
261+ "pair_count" : 3358 ,
262+ "model_summary" : {
263+ "haiku" : {
264+ "pairs" : 3250 ,
265+ "baseline_total_cost_usd" : 1836.0302942999972 ,
266+ "mcp_total_cost_usd" : 1424.4097463000019 ,
267+ "baseline_avg_cost_usd" : 0.5649323982461529 ,
268+ "mcp_avg_cost_usd" : 0.4382799219384621 ,
269+ "delta_cost_usd" : -411.6205479999953 ,
270+ "pct_delta_cost_of_means" : -22.419049907721124 ,
271+ "input_ratio_mcp_over_baseline" : 0.5995818403986736
272+ },
273+ "sonnet" : {
274+ "pairs" : 9 ,
275+ "baseline_total_cost_usd" : 13.34744245 ,
276+ "mcp_total_cost_usd" : 12.55560455 ,
277+ "baseline_avg_cost_usd" : 1.4830491611111112 ,
278+ "mcp_avg_cost_usd" : 1.3950671722222223 ,
279+ "delta_cost_usd" : -0.7918379000000009 ,
280+ "pct_delta_cost_of_means" : -5.932506567953033 ,
281+ "input_ratio_mcp_over_baseline" : 0.24737631184407796
282+ },
283+ "opus" : {
284+ "pairs" : 99 ,
285+ "baseline_total_cost_usd" : 5963.191190750001 ,
286+ "mcp_total_cost_usd" : 9567.017012400001 ,
287+ "baseline_avg_cost_usd" : 60.23425445202021 ,
288+ "mcp_avg_cost_usd" : 96.63653547878789 ,
289+ "delta_cost_usd" : 3603.8258216500008 ,
290+ "pct_delta_cost_of_means" : 60.43451746508133 ,
291+ "input_ratio_mcp_over_baseline" : 1.2429818699306676
292+ }
293+ },
294+ "size_summary" : {
295+ "haiku_by_context_length" : {
296+ "100k-1m" : {
297+ "pairs" : 463 ,
298+ "baseline_avg_cost_usd" : 2.178298606803457 ,
299+ "mcp_avg_cost_usd" : 1.3272076822894172 ,
300+ "delta_avg_cost_usd" : -0.85109092451404 ,
301+ "pct_delta_cost_of_means" : -39.07136155969786
302+ },
303+ "<100k" : {
304+ "pairs" : 1101 ,
305+ "baseline_avg_cost_usd" : 0.2228427490463215 ,
306+ "mcp_avg_cost_usd" : 0.21056418301544086 ,
307+ "delta_avg_cost_usd" : -0.01227856603088064 ,
308+ "pct_delta_cost_of_means" : -5.509968838307744
309+ },
310+ "unknown" : {
311+ "pairs" : 1686 ,
312+ "baseline_avg_cost_usd" : 0.34527175127520754 ,
313+ "mcp_avg_cost_usd" : 0.3428715444246736 ,
314+ "delta_avg_cost_usd" : -0.0024002068505339464 ,
315+ "pct_delta_cost_of_means" : -0.6951645599934442
316+ }
317+ },
318+ "haiku_by_files_count" : {
319+ "10-100" : {
320+ "pairs" : 436 ,
321+ "baseline_avg_cost_usd" : 2.285346634977066 ,
322+ "mcp_avg_cost_usd" : 1.3760253596330274 ,
323+ "delta_avg_cost_usd" : -0.9093212753440388 ,
324+ "pct_delta_cost_of_means" : -39.789205778543256
325+ },
326+ "<10" : {
327+ "pairs" : 876 ,
328+ "baseline_avg_cost_usd" : 0.24325373042237447 ,
329+ "mcp_avg_cost_usd" : 0.22899221004566234 ,
330+ "delta_avg_cost_usd" : -0.014261520376712116 ,
331+ "pct_delta_cost_of_means" : -5.862816718966268
332+ },
333+ "unknown" : {
334+ "pairs" : 1938 ,
335+ "baseline_avg_cost_usd" : 0.3232863228070173 ,
336+ "mcp_avg_cost_usd" : 0.3219120296697624 ,
337+ "delta_avg_cost_usd" : -0.0013742931372548641 ,
338+ "pct_delta_cost_of_means" : -0.42510092147487466
339+ }
340+ }
341+ }
342+ }
343+ }
344+ }
0 commit comments