Skip to content

Commit cb0ae01

Browse files
davidfstr and claude committed
misc/perf_compare.py: Add options/behaviors to reduce measured variance
Specifically: * The median is now reported in addition to the existing mean and stdev; the median is significantly more resistant to skew from outliers than the mean. * --metric {wall,cpu} (default wall): Enables measuring CPU time rather than wall-clock time. CPU-time measurement has roughly half the coefficient of variation of wall-clock measurement at an equal run count. * --workers1: Forces MYPY_NUM_WORKERS=1 (rather than the default of 4) to cut CPU scheduling variance. Strongly recommended when using --metric cpu. * --warmup-runs N (default 1): Configurable number of leading cold runs to discard. Previously this was always 1. Higher warmup counts reduce the cold-start outliers that skew the reported mean. * A new "Paired deltas vs <first commit>" section is added to the report, showing per-round paired differences against the first commit. Pairing cancels round-level common-mode noise, which reduces variance. Reported as median +/- 95% CI. Also: * --cache-binaries (default false): Caches each commit's compiled clone to avoid a ~5 min recompile whenever the same commit is compared more than once. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 52de0c7 commit cb0ae01

2 files changed

Lines changed: 187 additions & 23 deletions

File tree

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,6 @@ test_capi
6060
test_capi
6161
/mypyc/lib-rt/build/
6262
/mypyc/lib-rt/*.so
63+
64+
# perf_compare.py --cache-binaries cache
65+
/misc/perf_compare/

misc/perf_compare.py

Lines changed: 184 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,50 @@
2323
import glob
2424
import os
2525
import random
26+
import resource
2627
import shutil
2728
import statistics
2829
import subprocess
2930
import sys
3031
import time
32+
from collections.abc import Callable
3133
from concurrent.futures import ThreadPoolExecutor, as_completed
34+
from typing import Any
35+
36+
37+
def winsorized_paired_stats(
    diffs: list[float], *, trim_frac: float = 0.1, conf: float = 0.95
) -> dict[str, float]:
    """Robust summary of a list of per-round paired differences.

    Point estimate: trimmed mean (drop ``int(n * trim_frac)`` values from each end),
    so a single outlier round cannot drag the estimate.

    Error bar: a Tukey-McLaughlin style standard error of the trimmed mean, built
    from the *Winsorized* variance. The tails are clamped to the boundary kept-value
    rather than deleted -- deleting them and taking the ordinary variance of the
    survivors would understate the error bar (it would measure only how calm the
    middle is, discarding the fact that the tails were wild). The divisor rescales
    for the compression Winsorizing introduces; it uses the fraction of values
    *actually* kept, ``(n - 2*g) / n``, so that when nothing is trimmed (``g == 0``)
    the result is exactly the ordinary standard error of the mean.

    A normal-approximation critical value is used, which is only a good
    approximation for reasonably large ``n``.

    Args:
        diffs: Per-round paired differences. Must be non-empty
            (``statistics.median`` raises ``StatisticsError`` on empty data).
        trim_frac: Nominal fraction of values to trim from *each* end.
        conf: Two-sided confidence level used for the CI half-width.

    Returns:
        A dict with keys:
          * ``"est"``: trimmed mean of ``diffs`` (plain mean in the fallback case).
          * ``"median"``: median of ``diffs``.
          * ``"ci"``: half-width of the ``conf``-level confidence interval around
            ``"est"`` (not around the median); ``0.0`` in the fallback case.
          * ``"kept"``: number of values that survived trimming, as a float.

        The fallback case applies when fewer than two values would remain after
        trimming, so no variance can be computed.
    """
    n = len(diffs)
    s = sorted(diffs)
    g = int(n * trim_frac)  # number trimmed from each end
    median = statistics.median(s)
    if n < 2 or n - 2 * g < 2:
        # Too few values for a variance: report the plain mean with no error bar.
        return {"est": statistics.mean(s), "median": median, "ci": 0.0, "kept": float(n)}
    kept = s[g : n - g]
    est = statistics.mean(kept)
    # Winsorize: clamp the g smallest up to kept[0], the g largest down to kept[-1].
    wins = [kept[0]] * g + kept + [kept[-1]] * g
    wvar = statistics.variance(wins)  # sample Winsorized variance (df = n-1)
    # Rescale by the fraction actually kept rather than the nominal 1 - 2*trim_frac:
    # g is floored, so the nominal value over-corrects whenever n*trim_frac is not
    # an integer (and inflates the error bar even when g == 0).
    kept_frac = len(kept) / n
    se = (wvar**0.5) / (kept_frac * (n**0.5))
    z = statistics.NormalDist().inv_cdf(0.5 + conf / 2)
    return {"est": est, "median": median, "ci": z * se, "kept": float(len(kept))}
3270

3371

3472
def heading(s: str) -> None:
@@ -81,14 +119,23 @@ def edit_python_file(fnam: str) -> None:
81119

82120

83121
def run_benchmark(
84-
compiled_dir: str, check_dir: str, *, incremental: bool, code: str | None, foreign: bool | None
122+
compiled_dir: str,
123+
check_dir: str,
124+
*,
125+
incremental: bool,
126+
code: str | None,
127+
foreign: bool | None,
128+
metric: str = "wall",
129+
workers1: bool = False,
85130
) -> float:
86131
cache_dir = os.path.join(compiled_dir, ".mypy_cache")
87132
if os.path.isdir(cache_dir) and not incremental:
88133
shutil.rmtree(cache_dir)
89134
env = os.environ.copy()
90135
env["PYTHONPATH"] = os.path.abspath(compiled_dir)
91136
env["PYTHONHASHSEED"] = "1"
137+
if workers1:
138+
env["MYPY_NUM_WORKERS"] = "1"
92139
abschk = os.path.abspath(check_dir)
93140
cmd = [sys.executable, "-m", "mypy"]
94141
if code:
@@ -103,13 +150,26 @@ def run_benchmark(
103150
# Update a few files to force non-trivial incremental run
104151
edit_python_file(os.path.join(abschk, "mypy/__main__.py"))
105152
edit_python_file(os.path.join(abschk, "mypy/test/testcheck.py"))
106-
t0 = time.time()
153+
stopwatch_func: Callable[[], Any]
154+
delta_func: Callable[[Any, Any], Any]
155+
if metric == "wall":
156+
stopwatch_func = lambda: time.time()
157+
delta_func = lambda t0, t1: t1 - t0
158+
elif metric == "cpu":
159+
# NOTE: CPU time (user+sys) is far less sensitive than wall-clock to
160+
# background interference
161+
stopwatch_func = lambda: resource.getrusage(resource.RUSAGE_CHILDREN)
162+
delta_func = lambda r0, r1: (r1.ru_utime - r0.ru_utime) + (r1.ru_stime - r0.ru_stime)
163+
else:
164+
raise AssertionError(f"Unrecognized metric: {metric!r}")
165+
v0 = stopwatch_func() # capture
107166
# Ignore errors, since some commits being measured may generate additional errors.
108167
if foreign:
109168
subprocess.run(cmd, cwd=check_dir, env=env)
110169
else:
111170
subprocess.run(cmd, cwd=compiled_dir, env=env)
112-
return time.time() - t0
171+
v1 = stopwatch_func() # capture
172+
return delta_func(v0, v1)
113173

114174

115175
def main() -> None:
@@ -145,6 +205,41 @@ def main() -> None:
145205
type=int,
146206
help="set number of measurements to perform (default=15)",
147207
)
208+
parser.add_argument(
209+
"--warmup-runs",
210+
metavar="N",
211+
default=1,
212+
type=int,
213+
help="set number of leading warmup runs to discard (default=1)",
214+
)
215+
parser.add_argument(
216+
"--cache-binaries",
217+
default=False,
218+
action="store_true",
219+
help="cache each commit's compiled clone under "
220+
+ "<script_dir>/perf_compare/binaries/<commit> and restore from there on later runs, "
221+
+ "skipping the ~5-min clone+compile. Off by default so it doesn't silently consume "
222+
+ "disk. Caveat: the cache is keyed by the commit string you pass, so reuse stable SHAs "
223+
+ "(a moving ref like a branch name or HEAD can serve a stale build -- delete the cache "
224+
+ "dir if in doubt).",
225+
)
226+
parser.add_argument(
227+
"--metric",
228+
choices=["wall", "cpu"],
229+
default="wall",
230+
help="quantity to measure per run: 'wall' (wall-clock, default) or 'cpu' (user+sys "
231+
+ "CPU time of the type-check process). 'cpu' is much less sensitive to background "
232+
+ "interference and scheduling, so it tightens the per-run distribution.",
233+
)
234+
parser.add_argument(
235+
"--workers1",
236+
default=False,
237+
action="store_true",
238+
help="run selfcheck with a single mypy worker (MYPY_NUM_WORKERS=1) to "
239+
+ "decrease variance in measurements. "
240+
+ "Strongly recommended when --metric=cpu. "
241+
+ "When omitted, uses mypy's default worker count.",
242+
)
148243
parser.add_argument(
149244
"-j",
150245
metavar="N",
@@ -178,20 +273,39 @@ def main() -> None:
178273
dont_setup: bool = args.dont_setup
179274
multi_file: bool = args.multi_file
180275
commits = args.commit
181-
num_runs: int = args.num_runs + 1
276+
baseline_commit: str = commits[0]
277+
warmup_runs: int = args.warmup_runs
278+
measurement_runs: int = args.num_runs
279+
num_runs: int = measurement_runs + warmup_runs
182280
max_workers: int = args.j
183281
code: str | None = args.c
184282
foreign_repo: str | None = args.r
283+
metric: str = args.metric
284+
workers1: bool = args.workers1
285+
cache_binaries: bool = args.cache_binaries
185286

186287
if not (os.path.isdir(".git") and os.path.isdir("mypyc")):
187288
sys.exit("error: You must run this script from the mypy repo root")
188289

290+
archive_root = os.path.join(
291+
os.path.dirname(os.path.abspath(__file__)), "perf_compare", "binaries"
292+
)
293+
189294
target_dirs = []
295+
dirs_to_compile = []
190296
for i, commit in enumerate(commits):
191297
target_dir = f"mypy.{i}.tmpdir"
192298
target_dirs.append(target_dir)
193299
if not dont_setup:
194-
clone(target_dir, commit)
300+
archive = os.path.join(archive_root, commit)
301+
if cache_binaries and os.path.isdir(archive):
302+
print(f"restore: copying {archive} -> {target_dir} (skipping clone+compile)")
303+
if os.path.isdir(target_dir):
304+
shutil.rmtree(target_dir)
305+
shutil.copytree(archive, target_dir, symlinks=True)
306+
else:
307+
clone(target_dir, commit)
308+
dirs_to_compile.append(target_dir)
195309

196310
if foreign_repo:
197311
check_dir = "mypy.foreign.tmpdir"
@@ -202,27 +316,32 @@ def main() -> None:
202316
if not dont_setup:
203317
clone(check_dir, commits[0])
204318

205-
if not dont_setup:
319+
if not dont_setup and dirs_to_compile:
206320
heading("Compiling mypy")
207321
print("(This will take a while...)")
208322

209323
with ThreadPoolExecutor(max_workers=max_workers) as executor:
210324
futures = [
211-
executor.submit(build_mypy, target_dir, multi_file) for target_dir in target_dirs
325+
executor.submit(build_mypy, target_dir, multi_file)
326+
for target_dir in dirs_to_compile
212327
]
213328
for future in as_completed(futures):
214329
future.result()
215330

216-
print(f"Finished compiling mypy ({len(commits)} builds)")
331+
print(f"Finished compiling mypy ({len(dirs_to_compile)} builds)")
332+
elif not dont_setup:
333+
print("All targets restored from archive; skipping compile step.")
217334

218-
heading("Performing measurements")
335+
workers_desc = "workers: 1" if workers1 else "workers: default"
336+
key_options_desc = f"(metric: {metric}-time, {workers_desc})"
337+
heading(f"Performing measurements {key_options_desc}")
219338

220339
results: dict[str, list[float]] = {}
221340
for n in range(num_runs):
222-
if n == 0:
223-
print("Warmup...")
341+
if n < warmup_runs:
342+
print(f"Warmup {n + 1}/{warmup_runs}...")
224343
else:
225-
print(f"Run {n}/{num_runs - 1}...")
344+
print(f"Run {n - warmup_runs + 1}/{num_runs - warmup_runs}...")
226345
items = list(enumerate(commits))
227346
random.shuffle(items)
228347
for i, commit in items:
@@ -232,26 +351,56 @@ def main() -> None:
232351
incremental=incremental,
233352
code=code,
234353
foreign=bool(foreign_repo),
354+
metric=metric,
355+
workers1=workers1,
235356
)
236-
# Don't record the first warm-up run
237-
if n > 0:
357+
# Don't record the leading warm-up runs
358+
if n >= warmup_runs:
238359
print(f"{commit}: t={tt:.3f}s")
239360
results.setdefault(commit, []).append(tt)
240361

241362
print()
242-
heading("Results")
243-
first = -1.0
363+
heading(f"Results {key_options_desc}")
364+
first_mean = -1.0
365+
first_median = -1.0
244366
for commit in commits:
245-
tt = statistics.mean(results[commit])
367+
mean = statistics.mean(results[commit])
368+
median = statistics.median(results[commit])
246369
# pstdev (instead of stdev) is used here primarily to accommodate the case where num_runs=1
247370
s = statistics.pstdev(results[commit]) if len(results[commit]) > 1 else 0
248-
if first < 0:
249-
delta = "0.0%"
250-
first = tt
371+
if first_mean < 0:
372+
delta_mean = "0.0%"
373+
first_mean = mean
374+
delta_median = "0.0%"
375+
first_median = median
251376
else:
252-
d = (tt / first) - 1
253-
delta = f"{d:+.1%}"
254-
print(f"{commit:<25} {tt:.3f}s ({delta}) | stdev {s:.3f}s ")
377+
d1 = (mean / first_mean) - 1
378+
delta_mean = f"{d1:+.1%}"
379+
d2 = (median / first_median) - 1
380+
delta_median = f"{d2:+.1%}"
381+
print(
382+
f"{commit:<25} mean {mean:.3f}s ({delta_mean}) | stdev {s:.3f}s | "
383+
f"median {median:.3f}s ({delta_median})"
384+
)
385+
386+
# Paired per-round differences vs the baseline commit. Each round runs every commit
387+
# once, so results[commit][k] is round k for every commit -- the differences are
388+
# already matched. Differencing cancels round-level common-mode noise (a throttle or
389+
# background-process spike that round slows every commit together), which is the bulk
390+
# of the variance on a laptop. See winsorized_paired_stats for the robust estimator.
391+
base_runs = results[baseline_commit]
392+
base_center = statistics.median(base_runs)
393+
heading(f"Paired deltas vs {baseline_commit} (per-round diffs; median +/- 95% CI)")
394+
for commit in commits:
395+
if commit == baseline_commit:
396+
print(f"{commit:<25} baseline")
397+
continue
398+
diffs = [c - b for c, b in zip(results[commit], base_runs)]
399+
st = winsorized_paired_stats(diffs)
400+
ci_ms = st["ci"] * 1000
401+
median_ms = st["median"] * 1000
402+
pct = (st["median"] / base_center * 100) if base_center else 0.0
403+
print(f"{commit:<25} median {median_ms:+7.1f}ms +/-{ci_ms:4.1f} ({pct:+.2f}%)")
255404

256405
t = int(time.time() - whole_program_time_0)
257406
total_time_taken_formatted = ", ".join(
@@ -264,6 +413,18 @@ def main() -> None:
264413
total_time_taken_formatted,
265414
)
266415

416+
# Archive compiled clones before cleanup, keyed by commit, so later runs can
417+
# restore them instead of recompiling. Skip if destination already exists.
418+
if cache_binaries:
419+
os.makedirs(archive_root, exist_ok=True)
420+
for target_dir, commit in zip(target_dirs, commits):
421+
dest = os.path.join(archive_root, commit)
422+
if os.path.isdir(dest):
423+
print(f"archive: {dest} already exists, skipping")
424+
else:
425+
print(f"archive: copying {target_dir} -> {dest}")
426+
shutil.copytree(target_dir, dest, symlinks=True)
427+
267428
shutil.rmtree(check_dir)
268429
for target_dir in target_dirs:
269430
shutil.rmtree(target_dir)

0 commit comments

Comments
 (0)