2323import glob
2424import os
2525import random
26+ import resource
2627import shutil
2728import statistics
2829import subprocess
2930import sys
3031import time
32+ from collections .abc import Callable
3133from concurrent .futures import ThreadPoolExecutor , as_completed
34+ from typing import Any
35+
36+
def winsorized_paired_stats(
    diffs: list[float], *, trim_frac: float = 0.1, conf: float = 0.95
) -> dict[str, float]:
    """Robust summary of a list of per-round paired differences.

    Point estimate: trimmed mean (drop ``trim_frac`` of values from each end), so a
    single outlier round cannot drag the estimate.

    Error bar: the Tukey-McLaughlin standard error of the trimmed mean, built from the
    *Winsorized* variance. The tails are clamped to the boundary kept-value rather than
    deleted -- deleting them and taking the ordinary variance of the survivors would
    understate the error bar (it would measure only how calm the middle is, discarding
    the fact that the tails were wild). The ``(1 - 2*trim_frac)`` divisor rescales for
    the compression Winsorizing introduces.

    A normal-approx critical value is used (fine for the n>=~30 runs this is used with).

    Args:
        diffs: Per-round differences; must be non-empty. The input is not modified.
        trim_frac: Fraction of values trimmed from *each* end; must be >= 0. The number
            actually trimmed per side is ``int(len(diffs) * trim_frac)``.
        conf: Two-sided confidence level of the interval, strictly between 0 and 1.

    Returns:
        A dict with keys:
          * ``"est"``: the trimmed mean,
          * ``"median"``: the median of all values (untrimmed),
          * ``"ci"``: the half-width of the ``conf``-level confidence interval,
          * ``"kept"``: the number of values that survived trimming (as a float).
        When fewer than two values are available, or fewer than two would survive the
        trimming, no spread can be estimated: ``"est"`` is then the plain mean of all
        values, ``"ci"`` is 0.0 and ``"kept"`` is the full sample size.

    Raises:
        statistics.StatisticsError: (a ``ValueError`` subclass) if ``diffs`` is empty,
            ``trim_frac`` is negative or NaN, or ``conf`` is not in the open interval
            (0, 1).
    """
    # Validate up front: a negative trim_frac could otherwise truncate to "trim nothing"
    # while still inflating the divisor below (silently narrowing the error bar), and a
    # conf outside (0, 1) could produce a zero or negative half-width.
    if not diffs:
        raise statistics.StatisticsError(
            "winsorized_paired_stats requires at least one difference"
        )
    if not trim_frac >= 0:  # written this way so that NaN is rejected too
        raise statistics.StatisticsError(f"trim_frac must be >= 0, got {trim_frac!r}")
    if not 0 < conf < 1:
        raise statistics.StatisticsError(
            f"conf must be strictly between 0 and 1, got {conf!r}"
        )

    n = len(diffs)
    s = sorted(diffs)
    g = int(n * trim_frac)  # number trimmed from each end
    median = statistics.median(s)

    # Too few points (before or after trimming) to estimate any spread.
    if n < 2 or n - 2 * g < 2:
        return {"est": statistics.mean(s), "median": median, "ci": 0.0, "kept": float(n)}

    kept = s[g : n - g]
    est = statistics.mean(kept)
    # Winsorize: clamp the g smallest up to kept[0], the g largest down to kept[-1].
    wins = [kept[0]] * g + kept + [kept[-1]] * g
    wvar = statistics.variance(wins)  # sample Winsorized variance (df = n-1)
    # When trim_frac >= 0.5 the guard above has already returned, so the divisor
    # (1 - 2*trim_frac) is strictly positive here.
    se = (wvar**0.5) / ((1 - 2 * trim_frac) * (n**0.5))
    # Two-sided critical value of the standard normal for the requested level.
    z = statistics.NormalDist().inv_cdf(0.5 + conf / 2)
    return {"est": est, "median": median, "ci": z * se, "kept": float(len(kept))}
3270
3371
3472def heading (s : str ) -> None :
@@ -81,14 +119,23 @@ def edit_python_file(fnam: str) -> None:
81119
82120
83121def run_benchmark (
84- compiled_dir : str , check_dir : str , * , incremental : bool , code : str | None , foreign : bool | None
122+ compiled_dir : str ,
123+ check_dir : str ,
124+ * ,
125+ incremental : bool ,
126+ code : str | None ,
127+ foreign : bool | None ,
128+ metric : str = "wall" ,
129+ workers1 : bool = False ,
85130) -> float :
86131 cache_dir = os .path .join (compiled_dir , ".mypy_cache" )
87132 if os .path .isdir (cache_dir ) and not incremental :
88133 shutil .rmtree (cache_dir )
89134 env = os .environ .copy ()
90135 env ["PYTHONPATH" ] = os .path .abspath (compiled_dir )
91136 env ["PYTHONHASHSEED" ] = "1"
137+ if workers1 :
138+ env ["MYPY_NUM_WORKERS" ] = "1"
92139 abschk = os .path .abspath (check_dir )
93140 cmd = [sys .executable , "-m" , "mypy" ]
94141 if code :
@@ -103,13 +150,26 @@ def run_benchmark(
103150 # Update a few files to force non-trivial incremental run
104151 edit_python_file (os .path .join (abschk , "mypy/__main__.py" ))
105152 edit_python_file (os .path .join (abschk , "mypy/test/testcheck.py" ))
106- t0 = time .time ()
153+ stopwatch_func : Callable [[], Any ]
154+ delta_func : Callable [[Any , Any ], Any ]
155+ if metric == "wall" :
156+ stopwatch_func = lambda : time .time ()
157+ delta_func = lambda t0 , t1 : t1 - t0
158+ elif metric == "cpu" :
159+ # NOTE: CPU time (user+sys) is far less sensitive than wall-clock to
160+ # background interference
161+ stopwatch_func = lambda : resource .getrusage (resource .RUSAGE_CHILDREN )
162+ delta_func = lambda r0 , r1 : (r1 .ru_utime - r0 .ru_utime ) + (r1 .ru_stime - r0 .ru_stime )
163+ else :
164+ raise AssertionError (f"Unrecognized metric: { metric !r} " )
165+ v0 = stopwatch_func () # capture
107166 # Ignore errors, since some commits being measured may generate additional errors.
108167 if foreign :
109168 subprocess .run (cmd , cwd = check_dir , env = env )
110169 else :
111170 subprocess .run (cmd , cwd = compiled_dir , env = env )
112- return time .time () - t0
171+ v1 = stopwatch_func () # capture
172+ return delta_func (v0 , v1 )
113173
114174
115175def main () -> None :
@@ -145,6 +205,41 @@ def main() -> None:
145205 type = int ,
146206 help = "set number of measurements to perform (default=15)" ,
147207 )
208+ parser .add_argument (
209+ "--warmup-runs" ,
210+ metavar = "N" ,
211+ default = 1 ,
212+ type = int ,
213+ help = "set number of leading warmup runs to discard (default=1)" ,
214+ )
215+ parser .add_argument (
216+ "--cache-binaries" ,
217+ default = False ,
218+ action = "store_true" ,
219+ help = "cache each commit's compiled clone under "
220+ + "<script_dir>/perf_compare/binaries/<commit> and restore from there on later runs, "
221+ + "skipping the ~5-min clone+compile. Off by default so it doesn't silently consume "
222+ + "disk. Caveat: the cache is keyed by the commit string you pass, so reuse stable SHAs "
223+ + "(a moving ref like a branch name or HEAD can serve a stale build -- delete the cache "
224+ + "dir if in doubt)." ,
225+ )
226+ parser .add_argument (
227+ "--metric" ,
228+ choices = ["wall" , "cpu" ],
229+ default = "wall" ,
230+ help = "quantity to measure per run: 'wall' (wall-clock, default) or 'cpu' (user+sys "
231+ + "CPU time of the type-check process). 'cpu' is much less sensitive to background "
232+ + "interference and scheduling, so it tightens the per-run distribution." ,
233+ )
234+ parser .add_argument (
235+ "--workers1" ,
236+ default = False ,
237+ action = "store_true" ,
238+ help = "run selfcheck with a single mypy worker (MYPY_NUM_WORKERS=1) to "
239+ + "decrease variance in measurements. "
240+ + "Strongly recommended when --metric=cpu. "
241+ + "When omitted, uses mypy's default worker count." ,
242+ )
148243 parser .add_argument (
149244 "-j" ,
150245 metavar = "N" ,
@@ -178,20 +273,39 @@ def main() -> None:
178273 dont_setup : bool = args .dont_setup
179274 multi_file : bool = args .multi_file
180275 commits = args .commit
181- num_runs : int = args .num_runs + 1
276+ baseline_commit : str = commits [0 ]
277+ warmup_runs : int = args .warmup_runs
278+ measurement_runs : int = args .num_runs
279+ num_runs : int = measurement_runs + warmup_runs
182280 max_workers : int = args .j
183281 code : str | None = args .c
184282 foreign_repo : str | None = args .r
283+ metric : str = args .metric
284+ workers1 : bool = args .workers1
285+ cache_binaries : bool = args .cache_binaries
185286
186287 if not (os .path .isdir (".git" ) and os .path .isdir ("mypyc" )):
187288 sys .exit ("error: You must run this script from the mypy repo root" )
188289
290+ archive_root = os .path .join (
291+ os .path .dirname (os .path .abspath (__file__ )), "perf_compare" , "binaries"
292+ )
293+
189294 target_dirs = []
295+ dirs_to_compile = []
190296 for i , commit in enumerate (commits ):
191297 target_dir = f"mypy.{ i } .tmpdir"
192298 target_dirs .append (target_dir )
193299 if not dont_setup :
194- clone (target_dir , commit )
300+ archive = os .path .join (archive_root , commit )
301+ if cache_binaries and os .path .isdir (archive ):
302+ print (f"restore: copying { archive } -> { target_dir } (skipping clone+compile)" )
303+ if os .path .isdir (target_dir ):
304+ shutil .rmtree (target_dir )
305+ shutil .copytree (archive , target_dir , symlinks = True )
306+ else :
307+ clone (target_dir , commit )
308+ dirs_to_compile .append (target_dir )
195309
196310 if foreign_repo :
197311 check_dir = "mypy.foreign.tmpdir"
@@ -202,27 +316,32 @@ def main() -> None:
202316 if not dont_setup :
203317 clone (check_dir , commits [0 ])
204318
205- if not dont_setup :
319+ if not dont_setup and dirs_to_compile :
206320 heading ("Compiling mypy" )
207321 print ("(This will take a while...)" )
208322
209323 with ThreadPoolExecutor (max_workers = max_workers ) as executor :
210324 futures = [
211- executor .submit (build_mypy , target_dir , multi_file ) for target_dir in target_dirs
325+ executor .submit (build_mypy , target_dir , multi_file )
326+ for target_dir in dirs_to_compile
212327 ]
213328 for future in as_completed (futures ):
214329 future .result ()
215330
216- print (f"Finished compiling mypy ({ len (commits )} builds)" )
331+ print (f"Finished compiling mypy ({ len (dirs_to_compile )} builds)" )
332+ elif not dont_setup :
333+ print ("All targets restored from archive; skipping compile step." )
217334
218- heading ("Performing measurements" )
335+ workers_desc = "workers: 1" if workers1 else "workers: default"
336+ key_options_desc = f"(metric: { metric } -time, { workers_desc } )"
337+ heading (f"Performing measurements { key_options_desc } " )
219338
220339 results : dict [str , list [float ]] = {}
221340 for n in range (num_runs ):
222- if n == 0 :
223- print ("Warmup..." )
341+ if n < warmup_runs :
342+ print (f "Warmup { n + 1 } / { warmup_runs } ..." )
224343 else :
225- print (f"Run { n } /{ num_runs - 1 } ..." )
344+ print (f"Run { n - warmup_runs + 1 } /{ num_runs - warmup_runs } ..." )
226345 items = list (enumerate (commits ))
227346 random .shuffle (items )
228347 for i , commit in items :
@@ -232,26 +351,56 @@ def main() -> None:
232351 incremental = incremental ,
233352 code = code ,
234353 foreign = bool (foreign_repo ),
354+ metric = metric ,
355+ workers1 = workers1 ,
235356 )
236- # Don't record the first warm-up run
237- if n > 0 :
357+ # Don't record the leading warm-up runs
358+ if n >= warmup_runs :
238359 print (f"{ commit } : t={ tt :.3f} s" )
239360 results .setdefault (commit , []).append (tt )
240361
241362 print ()
242- heading ("Results" )
243- first = - 1.0
363+ heading (f"Results { key_options_desc } " )
364+ first_mean = - 1.0
365+ first_median = - 1.0
244366 for commit in commits :
245- tt = statistics .mean (results [commit ])
367+ mean = statistics .mean (results [commit ])
368+ median = statistics .median (results [commit ])
246369 # pstdev (instead of stdev) is used here primarily to accommodate the case where num_runs=1
247370 s = statistics .pstdev (results [commit ]) if len (results [commit ]) > 1 else 0
248- if first < 0 :
249- delta = "0.0%"
250- first = tt
371+ if first_mean < 0 :
372+ delta_mean = "0.0%"
373+ first_mean = mean
374+ delta_median = "0.0%"
375+ first_median = median
251376 else :
252- d = (tt / first ) - 1
253- delta = f"{ d :+.1%} "
254- print (f"{ commit :<25} { tt :.3f} s ({ delta } ) | stdev { s :.3f} s " )
377+ d1 = (mean / first_mean ) - 1
378+ delta_mean = f"{ d1 :+.1%} "
379+ d2 = (median / first_median ) - 1
380+ delta_median = f"{ d2 :+.1%} "
381+ print (
382+ f"{ commit :<25} mean { mean :.3f} s ({ delta_mean } ) | stdev { s :.3f} s | "
383+ f"median { median :.3f} s ({ delta_median } )"
384+ )
385+
386+ # Paired per-round differences vs the baseline commit. Each round runs every commit
387+ # once, so results[commit][k] is round k for every commit -- the differences are
388+ # already matched. Differencing cancels round-level common-mode noise (a throttle or
389+ # background-process spike that round slows every commit together), which is the bulk
390+ # of the variance on a laptop. See winsorized_paired_stats for the robust estimator.
391+ base_runs = results [baseline_commit ]
392+ base_center = statistics .median (base_runs )
393+ heading (f"Paired deltas vs { baseline_commit } (per-round diffs; median +/- 95% CI)" )
394+ for commit in commits :
395+ if commit == baseline_commit :
396+ print (f"{ commit :<25} baseline" )
397+ continue
398+ diffs = [c - b for c , b in zip (results [commit ], base_runs )]
399+ st = winsorized_paired_stats (diffs )
400+ ci_ms = st ["ci" ] * 1000
401+ median_ms = st ["median" ] * 1000
402+ pct = (st ["median" ] / base_center * 100 ) if base_center else 0.0
403+ print (f"{ commit :<25} median { median_ms :+7.1f} ms +/-{ ci_ms :4.1f} ({ pct :+.2f} %)" )
255404
256405 t = int (time .time () - whole_program_time_0 )
257406 total_time_taken_formatted = ", " .join (
@@ -264,6 +413,18 @@ def main() -> None:
264413 total_time_taken_formatted ,
265414 )
266415
416+ # Archive compiled clones before cleanup, keyed by commit, so later runs can
417+ # restore them instead of recompiling. Skip if destination already exists.
418+ if cache_binaries :
419+ os .makedirs (archive_root , exist_ok = True )
420+ for target_dir , commit in zip (target_dirs , commits ):
421+ dest = os .path .join (archive_root , commit )
422+ if os .path .isdir (dest ):
423+ print (f"archive: { dest } already exists, skipping" )
424+ else :
425+ print (f"archive: copying { target_dir } -> { dest } " )
426+ shutil .copytree (target_dir , dest , symlinks = True )
427+
267428 shutil .rmtree (check_dir )
268429 for target_dir in target_dirs :
269430 shutil .rmtree (target_dir )
0 commit comments