Commit bc17823
feat: print a local comparison report after each walltime run
The result files written by each run were previously only used for CI
uploads. This commit puts them to use locally as well: on a second run,
pytest-codspeed finds the most recent prior .codspeed/results_*.json and
prints a short regression/improvement summary to the terminal. The
comparison is skipped when --codspeed-profile-folder is set or in
non-walltime modes. Implements the TODO left in plugin.py.

Tests: test_comparison.py (unit) and test_comparison_integration.py
(pytester end-to-end).
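For a sense of what the new summary looks like, here is a sketch that drives the new module directly. The benchmark names and timings are invented for illustration; only the layout follows `print_comparison_report` from the diff below.

```python
# Illustrative only: names and timings are made up; the layout follows
# print_comparison_report() from the new comparison module.
from pathlib import Path

from pytest_codspeed.comparison import (
    BenchmarkDiff,
    ComparisonReport,
    print_comparison_report,
)

report = ComparisonReport(
    regressions=(BenchmarkDiff("tests/test_perf.py::test_parse", 1_000.0, 1_300.0),),
    improvements=(BenchmarkDiff("tests/test_perf.py::test_dump", 2_000.0, 1_500.0),),
    unchanged=(),
    new_benchmarks=("tests/test_perf.py::test_new",),
    removed_benchmarks=(),
)
print_comparison_report(report, Path(".codspeed/results_1700000000000.json"))
# Prints roughly:
#   CodSpeed local comparison (vs results_1700000000000.json)
#   ✗ Regressions (1)
#     test_parse    1.00µs →  1.30µs  +30.0%
#   ✓ Improvements (1)
#     test_dump     2.00µs →  1.50µs  -25.0%
#   + New (1)
#     test_new
#   2 compared · 1 regression(s) · 1 improvement(s)
```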

4 files changed: 729 additions, 2 deletions


src/pytest_codspeed/comparison.py (new file, 249 additions)

```python
"""Local baseline comparison between consecutive CodSpeed runs.

Implements the feature planned in ``plugin.py``::

    # Storing the results will be later used for features such as
    # local comparison between runs.

Only walltime runs produce per-benchmark statistics (``mean_ns``).
Simulation/analysis runs do not include a ``benchmarks`` key in their
result JSON, so comparisons are silently skipped for those modes.
"""

from __future__ import annotations

import json
from dataclasses import dataclass
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from pathlib import Path
    from typing import Any

# A benchmark is considered regressed / improved only when the relative
# change exceeds these thresholds. Below 5% the measurement noise of a
# single local run is too high to draw reliable conclusions.
_REGRESSION_THRESHOLD = 0.05
_IMPROVEMENT_THRESHOLD = 0.05


# ---------------------------------------------------------------------------
# Public data classes
# ---------------------------------------------------------------------------


@dataclass(frozen=True)
class BenchmarkDiff:
    """Performance delta for a single benchmark between two runs."""

    name: str
    baseline_mean_ns: float
    current_mean_ns: float

    @property
    def change_ratio(self) -> float:
        """Signed relative change.

        Positive → slower (regression).
        Negative → faster (improvement).
        """
        return (self.current_mean_ns - self.baseline_mean_ns) / self.baseline_mean_ns

    @property
    def change_pct(self) -> str:
        sign = "+" if self.change_ratio >= 0 else ""
        return f"{sign}{self.change_ratio * 100:.1f}%"

    @property
    def is_regression(self) -> bool:
        return self.change_ratio > _REGRESSION_THRESHOLD

    @property
    def is_improvement(self) -> bool:
        return self.change_ratio < -_IMPROVEMENT_THRESHOLD


@dataclass(frozen=True)
class ComparisonReport:
    """Full comparison report between a baseline run and the current run."""

    regressions: tuple[BenchmarkDiff, ...]
    improvements: tuple[BenchmarkDiff, ...]
    unchanged: tuple[BenchmarkDiff, ...]
    new_benchmarks: tuple[str, ...]
    removed_benchmarks: tuple[str, ...]

    @property
    def has_changes(self) -> bool:
        return bool(
            self.regressions
            or self.improvements
            or self.new_benchmarks
            or self.removed_benchmarks
        )

    @property
    def total_compared(self) -> int:
        return len(self.regressions) + len(self.improvements) + len(self.unchanged)


# ---------------------------------------------------------------------------
# File discovery
# ---------------------------------------------------------------------------


def find_baseline(results_dir: Path, current_path: Path) -> Path | None:
    """Return the most recent ``results_*.json`` that is not *current_path*.

    Files are ranked by modification time (most recent first). The filename
    itself encodes a millisecond timestamp (``results_{ms}.json``) so mtime
    and filename order are equivalent in practice; mtime is simpler to sort.

    Returns ``None`` when the directory contains no prior run.
    """
    candidates = sorted(
        (p for p in results_dir.glob("results_*.json") if p != current_path),
        key=lambda p: p.stat().st_mtime,
        reverse=True,
    )
    return candidates[0] if candidates else None


# ---------------------------------------------------------------------------
# Comparison logic
# ---------------------------------------------------------------------------


def _extract_benchmarks(data: dict[str, Any]) -> dict[str, float]:
    """Return ``{uri: mean_ns}`` from a parsed results JSON.

    Benchmarks without a ``stats.mean_ns`` field (e.g. simulation-mode
    stubs) are silently ignored.
    """
    result: dict[str, float] = {}
    for bench in data.get("benchmarks", []):
        stats = bench.get("stats") or {}
        mean_ns = stats.get("mean_ns")
        if mean_ns is not None:
            result[bench["uri"]] = float(mean_ns)
    return result


def compare_results(baseline_path: Path, current_path: Path) -> ComparisonReport:
    """Compare two CodSpeed result files and return a :class:`ComparisonReport`.

    Args:
        baseline_path: Path to the older ``results_*.json`` file.
        current_path: Path to the newly written ``results_*.json`` file.

    Returns:
        A :class:`ComparisonReport` classifying every benchmark as regressed,
        improved, unchanged, new, or removed.
    """
    baseline = _extract_benchmarks(json.loads(baseline_path.read_text()))
    current = _extract_benchmarks(json.loads(current_path.read_text()))

    regressions: list[BenchmarkDiff] = []
    improvements: list[BenchmarkDiff] = []
    unchanged: list[BenchmarkDiff] = []
    new_benchmarks: list[str] = []

    for uri, current_mean in current.items():
        if uri not in baseline:
            new_benchmarks.append(uri)
            continue
        diff = BenchmarkDiff(
            name=uri,
            baseline_mean_ns=baseline[uri],
            current_mean_ns=current_mean,
        )
        if diff.is_regression:
            regressions.append(diff)
        elif diff.is_improvement:
            improvements.append(diff)
        else:
            unchanged.append(diff)

    removed_benchmarks = [uri for uri in baseline if uri not in current]

    return ComparisonReport(
        # Sort regressions worst-first, improvements best-first.
        regressions=tuple(
            sorted(regressions, key=lambda d: d.change_ratio, reverse=True)
        ),
        improvements=tuple(sorted(improvements, key=lambda d: d.change_ratio)),
        unchanged=tuple(unchanged),
        new_benchmarks=tuple(new_benchmarks),
        removed_benchmarks=tuple(removed_benchmarks),
    )


# ---------------------------------------------------------------------------
# Terminal output
# ---------------------------------------------------------------------------


def _format_ns(ns: float) -> str:
    """Format a nanosecond duration as a human-readable string."""
    if ns >= 1_000_000_000:
        return f"{ns / 1_000_000_000:.2f}s"
    if ns >= 1_000_000:
        return f"{ns / 1_000_000:.2f}ms"
    if ns >= 1_000:
        return f"{ns / 1_000:.2f}µs"
    return f"{ns:.0f}ns"


def _short_name(uri: str) -> str:
    """Return the test function name part of a benchmark URI."""
    return uri.split("::")[-1] if "::" in uri else uri


def print_comparison_report(report: ComparisonReport, baseline_path: Path) -> None:
    """Print a human-readable comparison report to stdout.

    Produces no output when no benchmarks were compared (e.g. simulation
    mode) to avoid polluting CI logs with empty sections.
    """
    if report.total_compared == 0 and not report.new_benchmarks:
        return

    print(f"\n CodSpeed local comparison (vs {baseline_path.name})")
    print(" " + "─" * 62)

    if report.regressions:
        print(f"\n ✗ Regressions ({len(report.regressions)})")
        for diff in report.regressions:
            print(
                f"   {_short_name(diff.name):<42}"
                f" {_format_ns(diff.baseline_mean_ns):>8}"
                f" → {_format_ns(diff.current_mean_ns):>8}"
                f"  {diff.change_pct}"
            )

    if report.improvements:
        print(f"\n ✓ Improvements ({len(report.improvements)})")
        for diff in report.improvements:
            print(
                f"   {_short_name(diff.name):<42}"
                f" {_format_ns(diff.baseline_mean_ns):>8}"
                f" → {_format_ns(diff.current_mean_ns):>8}"
                f"  {diff.change_pct}"
            )

    if report.new_benchmarks:
        print(f"\n + New ({len(report.new_benchmarks)})")
        for uri in report.new_benchmarks:
            print(f"   {_short_name(uri)}")

    if report.removed_benchmarks:
        print(f"\n - Removed ({len(report.removed_benchmarks)})")
        for uri in report.removed_benchmarks:
            print(f"   {_short_name(uri)}")

    print(
        f"\n {report.total_compared} compared"
        f" · {len(report.regressions)} regression(s)"
        f" · {len(report.improvements)} improvement(s)"
    )
    print()
```
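A minimal end-to-end sketch of the module above, run from a project root: it writes two result files in the same shape `_extract_benchmarks` consumes (paths and timings here are invented), then resolves and compares the baseline the way `pytest_sessionfinish` now does.

```python
import json
import time
from pathlib import Path

from pytest_codspeed.comparison import (
    compare_results,
    find_baseline,
    print_comparison_report,
)

results_dir = Path(".codspeed")
results_dir.mkdir(exist_ok=True)

def write_results(mean_ns: float) -> Path:
    # Same shape _extract_benchmarks() reads: benchmarks[].uri + stats.mean_ns.
    path = results_dir / f"results_{time.time() * 1000:.0f}.json"
    path.write_text(json.dumps({
        "benchmarks": [
            {"uri": "tests/test_perf.py::test_sum", "stats": {"mean_ns": mean_ns}}
        ]
    }))
    return path

baseline = write_results(mean_ns=1_000.0)
time.sleep(0.01)  # distinct timestamp in both filename and mtime
current = write_results(mean_ns=1_200.0)  # 20% slower, above the 5% threshold

assert find_baseline(results_dir, current) == baseline
report = compare_results(baseline, current)
assert report.regressions[0].change_pct == "+20.0%"
print_comparison_report(report, baseline)  # lists test_sum under "✗ Regressions"
```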

src/pytest_codspeed/plugin.py (13 additions, 2 deletions)

```diff
@@ -28,6 +28,7 @@
 )
 
 from . import __version__
+from .comparison import compare_results, find_baseline, print_comparison_report
 
 if TYPE_CHECKING:
     from typing import Any, Callable, ParamSpec, TypeVar
@@ -304,8 +305,6 @@ def pytest_sessionfinish(session: pytest.Session, exitstatus):
         result_path = plugin.profile_folder / "results" / f"{os.getpid()}.json"
     else:
         # Default to a .codspeed folder in the root of the project.
-        # Storing the results will be later used for features such as
-        # local comparison between runs.
         result_path = (
             session.config.rootpath / f".codspeed/results_{time() * 1000:.0f}.json"
         )
@@ -316,6 +315,18 @@ def pytest_sessionfinish(session: pytest.Session, exitstatus):
         (result_path.parent / ".gitignore").write_text("*\n")
     result_path.write_text(json.dumps(data, indent=2))
 
+    # Local baseline comparison — only walltime runs carry per-benchmark
+    # statistics (mean_ns). Simulation / memory runs skip this silently.
+    if not plugin.profile_folder and plugin.mode == MeasurementMode.WallTime:
+        baseline_path = find_baseline(result_path.parent, result_path)
+        if baseline_path is not None:
+            try:
+                report = compare_results(baseline_path, result_path)
+                print_comparison_report(report, baseline_path)
+            except Exception:
+                # Never let comparison errors break a test run.
+                pass
+
 
 class BenchmarkFixture:
     """The fixture that can be used to benchmark a function."""
```
