Skip to content

Commit 184c693

Browse files
committed
Add canonical paired MCP cost analysis and model/size figure
1 parent f87e015 commit 184c693

File tree

6 files changed

+3115
-2
lines changed

6 files changed

+3115
-2
lines changed
Lines changed: 344 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,344 @@
1+
{
2+
"generated_at": "2026-03-04T23:08:01.932836+00:00",
3+
"source": "runs/official/_raw",
4+
"canonical_pairing_rule": "latest valid per side per (model, task_id); one pair per task",
5+
"sensitivity_pairing_rule": "count-matched per (model, task_id), newest-first within side",
6+
"valid_filter": "output_tokens > 0 and agent_execution_seconds >= 10",
7+
"records_scanned": 7703,
8+
"latest_task": {
9+
"all_pairs": {
10+
"pair_count": 502,
11+
"model_summary": {
12+
"haiku": {
13+
"pairs": 392,
14+
"baseline_total_cost_usd": 174.00601425,
15+
"mcp_total_cost_usd": 198.0197303500001,
16+
"baseline_avg_cost_usd": 0.44389289349489797,
17+
"mcp_avg_cost_usd": 0.5051523733418369,
18+
"delta_cost_usd": 24.013716100000096,
19+
"pct_delta_cost_of_means": 13.800509254524297,
20+
"input_ratio_mcp_over_baseline": 0.8004334702678465
21+
},
22+
"sonnet": {
23+
"pairs": 9,
24+
"baseline_total_cost_usd": 13.34744245,
25+
"mcp_total_cost_usd": 12.55560455,
26+
"baseline_avg_cost_usd": 1.4830491611111112,
27+
"mcp_avg_cost_usd": 1.3950671722222223,
28+
"delta_cost_usd": -0.7918379000000009,
29+
"pct_delta_cost_of_means": -5.932506567953033,
30+
"input_ratio_mcp_over_baseline": 0.24737631184407796
31+
},
32+
"opus": {
33+
"pairs": 101,
34+
"baseline_total_cost_usd": 5653.1065816499995,
35+
"mcp_total_cost_usd": 9119.03358425,
36+
"baseline_avg_cost_usd": 55.97135229356435,
37+
"mcp_avg_cost_usd": 90.28746123019803,
38+
"delta_cost_usd": 3465.9270026000013,
39+
"pct_delta_cost_of_means": 61.31013014773878,
40+
"input_ratio_mcp_over_baseline": 1.2365269473317915
41+
}
42+
},
43+
"size_summary": {
44+
"haiku_by_context_length": {
45+
"100k-1m": {
46+
"pairs": 98,
47+
"baseline_avg_cost_usd": 0.35356851377551024,
48+
"mcp_avg_cost_usd": 0.4815632173469387,
49+
"delta_avg_cost_usd": 0.12799470357142845,
50+
"pct_delta_cost_of_means": 36.200820656981804
51+
},
52+
"<100k": {
53+
"pairs": 222,
54+
"baseline_avg_cost_usd": 0.22787664121621623,
55+
"mcp_avg_cost_usd": 0.21461094842342338,
56+
"delta_avg_cost_usd": -0.013265692792792837,
57+
"pct_delta_cost_of_means": -5.821435984834422
58+
},
59+
"unknown": {
60+
"pairs": 72,
61+
"baseline_avg_cost_usd": 1.232884521527778,
62+
"mcp_avg_cost_usd": 1.4330958958333335,
63+
"delta_avg_cost_usd": 0.2002113743055555,
64+
"pct_delta_cost_of_means": 16.239264165426913
65+
}
66+
},
67+
"haiku_by_files_count": {
68+
"10-100": {
69+
"pairs": 91,
70+
"baseline_avg_cost_usd": 0.35155932912087917,
71+
"mcp_avg_cost_usd": 0.4929996104395605,
72+
"delta_avg_cost_usd": 0.1414402813186813,
73+
"pct_delta_cost_of_means": 40.23226511222773
74+
},
75+
"<10": {
76+
"pairs": 168,
77+
"baseline_avg_cost_usd": 0.2537777163690476,
78+
"mcp_avg_cost_usd": 0.2341883002976191,
79+
"delta_avg_cost_usd": -0.01958941607142851,
80+
"pct_delta_cost_of_means": -7.719123787425552
81+
},
82+
"unknown": {
83+
"pairs": 133,
84+
"baseline_avg_cost_usd": 0.7472139770676692,
85+
"mcp_avg_cost_usd": 0.855737829699248,
86+
"delta_avg_cost_usd": 0.10852385263157878,
87+
"pct_delta_cost_of_means": 14.52379853190977
88+
}
89+
}
90+
}
91+
},
92+
"valid_only": {
93+
"pair_count": 497,
94+
"model_summary": {
95+
"haiku": {
96+
"pairs": 392,
97+
"baseline_total_cost_usd": 287.4485963499998,
98+
"mcp_total_cost_usd": 200.74471865000007,
99+
"baseline_avg_cost_usd": 0.7332872355867341,
100+
"mcp_avg_cost_usd": 0.512103874107143,
101+
"delta_cost_usd": -86.70387769999971,
102+
"pct_delta_cost_of_means": -30.163263554235055,
103+
"input_ratio_mcp_over_baseline": 0.3431840458617894
104+
},
105+
"sonnet": {
106+
"pairs": 9,
107+
"baseline_total_cost_usd": 13.34744245,
108+
"mcp_total_cost_usd": 12.55560455,
109+
"baseline_avg_cost_usd": 1.4830491611111112,
110+
"mcp_avg_cost_usd": 1.3950671722222223,
111+
"delta_cost_usd": -0.7918379000000009,
112+
"pct_delta_cost_of_means": -5.932506567953033,
113+
"input_ratio_mcp_over_baseline": 0.24737631184407796
114+
},
115+
"opus": {
116+
"pairs": 96,
117+
"baseline_total_cost_usd": 5654.35168975,
118+
"mcp_total_cost_usd": 9109.5925864,
119+
"baseline_avg_cost_usd": 58.89949676822917,
120+
"mcp_avg_cost_usd": 94.89158944166667,
121+
"delta_cost_usd": 3455.2408966499997,
122+
"pct_delta_cost_of_means": 61.10764038454724,
123+
"input_ratio_mcp_over_baseline": 1.2363819844806747
124+
}
125+
},
126+
"size_summary": {
127+
"haiku_by_context_length": {
128+
"100k-1m": {
129+
"pairs": 98,
130+
"baseline_avg_cost_usd": 1.483237409693878,
131+
"mcp_avg_cost_usd": 0.5093692204081632,
132+
"delta_avg_cost_usd": -0.9738681892857148,
133+
"pct_delta_cost_of_means": -65.65828119766135
134+
},
135+
"<100k": {
136+
"pairs": 222,
137+
"baseline_avg_cost_usd": 0.23492972049549551,
138+
"mcp_avg_cost_usd": 0.21461094842342338,
139+
"delta_avg_cost_usd": -0.020318772072072132,
140+
"pct_delta_cost_of_means": -8.648872534823326
141+
},
142+
"unknown": {
143+
"pairs": 72,
144+
"baseline_avg_cost_usd": 1.2491240590277777,
145+
"mcp_avg_cost_usd": 1.4330958958333335,
146+
"delta_avg_cost_usd": 0.18397183680555568,
147+
"pct_delta_cost_of_means": 14.728067678781652
148+
}
149+
},
150+
"haiku_by_files_count": {
151+
"10-100": {
152+
"pairs": 91,
153+
"baseline_avg_cost_usd": 1.5676902109890112,
154+
"mcp_avg_cost_usd": 0.5063856565934066,
155+
"delta_avg_cost_usd": -1.0613045543956046,
156+
"pct_delta_cost_of_means": -67.69861462144729
157+
},
158+
"<10": {
159+
"pairs": 168,
160+
"baseline_avg_cost_usd": 0.26154956279761904,
161+
"mcp_avg_cost_usd": 0.24315769375000004,
162+
"delta_avg_cost_usd": -0.018391869047619025,
163+
"pct_delta_cost_of_means": -7.031886748688709
164+
},
165+
"unknown": {
166+
"pairs": 133,
167+
"baseline_avg_cost_usd": 0.7582591022556393,
168+
"mcp_avg_cost_usd": 0.855737829699248,
169+
"delta_avg_cost_usd": 0.09747872744360878,
170+
"pct_delta_cost_of_means": 12.855596082346121
171+
}
172+
}
173+
}
174+
}
175+
},
176+
"count_matched": {
177+
"all_pairs": {
178+
"pair_count": 3390,
179+
"model_summary": {
180+
"haiku": {
181+
"pairs": 3275,
182+
"baseline_total_cost_usd": 1838.4468086999973,
183+
"mcp_total_cost_usd": 1427.1376392500022,
184+
"baseline_avg_cost_usd": 0.5613578041832052,
185+
"mcp_avg_cost_usd": 0.43576721809160374,
186+
"delta_cost_usd": -411.3091694499951,
187+
"pct_delta_cost_of_means": -22.372644533612597,
188+
"input_ratio_mcp_over_baseline": 0.5995627052572636
189+
},
190+
"sonnet": {
191+
"pairs": 9,
192+
"baseline_total_cost_usd": 13.34744245,
193+
"mcp_total_cost_usd": 12.55560455,
194+
"baseline_avg_cost_usd": 1.4830491611111112,
195+
"mcp_avg_cost_usd": 1.3950671722222223,
196+
"delta_cost_usd": -0.7918379000000009,
197+
"pct_delta_cost_of_means": -5.932506567953033,
198+
"input_ratio_mcp_over_baseline": 0.24737631184407796
199+
},
200+
"opus": {
201+
"pairs": 106,
202+
"baseline_total_cost_usd": 5964.47455875,
203+
"mcp_total_cost_usd": 9588.241008750001,
204+
"baseline_avg_cost_usd": 56.26862791273585,
205+
"mcp_avg_cost_usd": 90.45510385613208,
206+
"delta_cost_usd": 3623.766450000001,
207+
"pct_delta_cost_of_means": 60.75583715390094,
208+
"input_ratio_mcp_over_baseline": 1.243122149086989
209+
}
210+
},
211+
"size_summary": {
212+
"haiku_by_context_length": {
213+
"100k-1m": {
214+
"pairs": 473,
215+
"baseline_avg_cost_usd": 2.1341574990486274,
216+
"mcp_avg_cost_usd": 1.2989662008456662,
217+
"delta_avg_cost_usd": -0.8351912982029611,
218+
"pct_delta_cost_of_means": -39.134473372995004
219+
},
220+
"<100k": {
221+
"pairs": 1104,
222+
"baseline_avg_cost_usd": 0.22202567758152175,
223+
"mcp_avg_cost_usd": 0.21036057576992792,
224+
"delta_avg_cost_usd": -0.011665101811593855,
225+
"pct_delta_cost_of_means": -5.253942669451261
226+
},
227+
"unknown": {
228+
"pairs": 1698,
229+
"baseline_avg_cost_usd": 0.34385981366313306,
230+
"mcp_avg_cost_usd": 0.3418660486454651,
231+
"delta_avg_cost_usd": -0.0019937650176679615,
232+
"pct_delta_cost_of_means": -0.5798191409541076
233+
}
234+
},
235+
"haiku_by_files_count": {
236+
"10-100": {
237+
"pairs": 445,
238+
"baseline_avg_cost_usd": 2.2411581459550582,
239+
"mcp_avg_cost_usd": 1.348002051460674,
240+
"delta_avg_cost_usd": -0.8931560944943839,
241+
"pct_delta_cost_of_means": -39.85243505043996
242+
},
243+
"<10": {
244+
"pairs": 880,
245+
"baseline_avg_cost_usd": 0.2421480316477273,
246+
"mcp_avg_cost_usd": 0.2284137342613639,
247+
"delta_avg_cost_usd": -0.013734297386363408,
248+
"pct_delta_cost_of_means": -5.671860015919449
249+
},
250+
"unknown": {
251+
"pairs": 1950,
252+
"baseline_avg_cost_usd": 0.3220723927692305,
253+
"mcp_avg_cost_usd": 0.3211654565128203,
254+
"delta_avg_cost_usd": -0.0009069362564101678,
255+
"pct_delta_cost_of_means": -0.28159391390618627
256+
}
257+
}
258+
}
259+
},
260+
"valid_only": {
261+
"pair_count": 3358,
262+
"model_summary": {
263+
"haiku": {
264+
"pairs": 3250,
265+
"baseline_total_cost_usd": 1836.0302942999972,
266+
"mcp_total_cost_usd": 1424.4097463000019,
267+
"baseline_avg_cost_usd": 0.5649323982461529,
268+
"mcp_avg_cost_usd": 0.4382799219384621,
269+
"delta_cost_usd": -411.6205479999953,
270+
"pct_delta_cost_of_means": -22.419049907721124,
271+
"input_ratio_mcp_over_baseline": 0.5995818403986736
272+
},
273+
"sonnet": {
274+
"pairs": 9,
275+
"baseline_total_cost_usd": 13.34744245,
276+
"mcp_total_cost_usd": 12.55560455,
277+
"baseline_avg_cost_usd": 1.4830491611111112,
278+
"mcp_avg_cost_usd": 1.3950671722222223,
279+
"delta_cost_usd": -0.7918379000000009,
280+
"pct_delta_cost_of_means": -5.932506567953033,
281+
"input_ratio_mcp_over_baseline": 0.24737631184407796
282+
},
283+
"opus": {
284+
"pairs": 99,
285+
"baseline_total_cost_usd": 5963.191190750001,
286+
"mcp_total_cost_usd": 9567.017012400001,
287+
"baseline_avg_cost_usd": 60.23425445202021,
288+
"mcp_avg_cost_usd": 96.63653547878789,
289+
"delta_cost_usd": 3603.8258216500008,
290+
"pct_delta_cost_of_means": 60.43451746508133,
291+
"input_ratio_mcp_over_baseline": 1.2429818699306676
292+
}
293+
},
294+
"size_summary": {
295+
"haiku_by_context_length": {
296+
"100k-1m": {
297+
"pairs": 463,
298+
"baseline_avg_cost_usd": 2.178298606803457,
299+
"mcp_avg_cost_usd": 1.3272076822894172,
300+
"delta_avg_cost_usd": -0.85109092451404,
301+
"pct_delta_cost_of_means": -39.07136155969786
302+
},
303+
"<100k": {
304+
"pairs": 1101,
305+
"baseline_avg_cost_usd": 0.2228427490463215,
306+
"mcp_avg_cost_usd": 0.21056418301544086,
307+
"delta_avg_cost_usd": -0.01227856603088064,
308+
"pct_delta_cost_of_means": -5.509968838307744
309+
},
310+
"unknown": {
311+
"pairs": 1686,
312+
"baseline_avg_cost_usd": 0.34527175127520754,
313+
"mcp_avg_cost_usd": 0.3428715444246736,
314+
"delta_avg_cost_usd": -0.0024002068505339464,
315+
"pct_delta_cost_of_means": -0.6951645599934442
316+
}
317+
},
318+
"haiku_by_files_count": {
319+
"10-100": {
320+
"pairs": 436,
321+
"baseline_avg_cost_usd": 2.285346634977066,
322+
"mcp_avg_cost_usd": 1.3760253596330274,
323+
"delta_avg_cost_usd": -0.9093212753440388,
324+
"pct_delta_cost_of_means": -39.789205778543256
325+
},
326+
"<10": {
327+
"pairs": 876,
328+
"baseline_avg_cost_usd": 0.24325373042237447,
329+
"mcp_avg_cost_usd": 0.22899221004566234,
330+
"delta_avg_cost_usd": -0.014261520376712116,
331+
"pct_delta_cost_of_means": -5.862816718966268
332+
},
333+
"unknown": {
334+
"pairs": 1938,
335+
"baseline_avg_cost_usd": 0.3232863228070173,
336+
"mcp_avg_cost_usd": 0.3219120296697624,
337+
"delta_avg_cost_usd": -0.0013742931372548641,
338+
"pct_delta_cost_of_means": -0.42510092147487466
339+
}
340+
}
341+
}
342+
}
343+
}
344+
}
122 KB
Loading

0 commit comments

Comments
 (0)