Commit cc19261

feat: hash benchmark return values to detect correctness regressions
Adds a --codspeed-capture-output flag. When set, each walltime run hashes the return value of the benchmarked function (pickle + sha256, with a repr fallback) and stores it in the result JSON alongside mean_ns. On the second run, the local comparison checks whether the hash changed; if it did, the report flags the benchmark with "! output changed" and counts correctness warnings in the footer.

This closes the gap in optimisation loops where a suggestion improves performance but silently alters the function's output. The score formula is exposed in eval_harness.py: score = max(0, perf_gain) when the output is unchanged, 0 when the output changed, and nan when capture was not enabled.
1 parent bc17823 commit cc19261

7 files changed

Lines changed: 490 additions & 29 deletions
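
For orientation, the walltime result JSON now carries the digest next to the timing stats. A minimal sketch of one benchmark entry as the comparison code reads it; the values are illustrative, and real entries contain the full config and stats blocks:

    benchmark_entry = {
        "name": "test_sort",
        "uri": "tests/test_sort.py::test_sort",
        "stats": {"mean_ns": 1200.0},        # used for the perf delta
        "output_hash": "9a5f3c2b1d4e6a70",   # 16-char sha256 prefix; None/absent without the flag
    }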

src/pytest_codspeed/comparison.py

Lines changed: 62 additions & 26 deletions
@@ -14,7 +14,7 @@
 
 import json
 from dataclasses import dataclass
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, NamedTuple
 
 if TYPE_CHECKING:
     from pathlib import Path
@@ -27,6 +27,11 @@
 _IMPROVEMENT_THRESHOLD = 0.05
 
 
+class _BenchmarkData(NamedTuple):
+    mean_ns: float
+    output_hash: str | None
+
+
 # ---------------------------------------------------------------------------
 # Public data classes
 # ---------------------------------------------------------------------------
@@ -54,6 +59,9 @@ def change_pct(self) -> str:
         sign = "+" if self.change_ratio >= 0 else ""
         return f"{sign}{self.change_ratio * 100:.1f}%"
 
+    output_changed: bool | None = None
+    """None when one or both runs were collected without --codspeed-capture-output."""
+
     @property
     def is_regression(self) -> bool:
         return self.change_ratio > _REGRESSION_THRESHOLD
@@ -114,18 +122,21 @@ def find_baseline(results_dir: Path, current_path: Path) -> Path | None:
 # ---------------------------------------------------------------------------
 
 
-def _extract_benchmarks(data: dict[str, Any]) -> dict[str, float]:
-    """Return ``{uri: mean_ns}`` from a parsed results JSON.
+def _extract_benchmarks(data: dict[str, Any]) -> dict[str, _BenchmarkData]:
+    """Return ``{uri: _BenchmarkData}`` from a parsed results JSON.
 
     Benchmarks without a ``stats.mean_ns`` field (e.g. simulation-mode
     stubs) are silently ignored.
     """
-    result: dict[str, float] = {}
+    result: dict[str, _BenchmarkData] = {}
     for bench in data.get("benchmarks", []):
         stats = bench.get("stats") or {}
         mean_ns = stats.get("mean_ns")
         if mean_ns is not None:
-            result[bench["uri"]] = float(mean_ns)
+            result[bench["uri"]] = _BenchmarkData(
+                mean_ns=float(mean_ns),
+                output_hash=bench.get("output_hash"),
+            )
     return result
 
 
@@ -148,14 +159,25 @@ def compare_results(baseline_path: Path, current_path: Path) -> ComparisonReport
     unchanged: list[BenchmarkDiff] = []
     new_benchmarks: list[str] = []
 
-    for uri, current_mean in current.items():
+    for uri, current_data in current.items():
         if uri not in baseline:
             new_benchmarks.append(uri)
             continue
+        baseline_data = baseline[uri]
+        if (
+            baseline_data.output_hash is not None
+            and current_data.output_hash is not None
+        ):
+            output_changed: bool | None = (
+                baseline_data.output_hash != current_data.output_hash
+            )
+        else:
+            output_changed = None
         diff = BenchmarkDiff(
             name=uri,
-            baseline_mean_ns=baseline[uri],
-            current_mean_ns=current_mean,
+            baseline_mean_ns=baseline_data.mean_ns,
+            current_mean_ns=current_data.mean_ns,
+            output_changed=output_changed,
         )
         if diff.is_regression:
             regressions.append(diff)
@@ -199,6 +221,35 @@ def _short_name(uri: str) -> str:
     return uri.split("::")[-1] if "::" in uri else uri
 
 
+def _fmt_diff(diff: BenchmarkDiff) -> str:
+    line = (
+        f" {_short_name(diff.name):<42}"
+        f" {_format_ns(diff.baseline_mean_ns):>8}"
+        f" → {_format_ns(diff.current_mean_ns):>8}"
+        f" {diff.change_pct}"
+    )
+    if diff.output_changed is True:
+        line += " ! output changed"
+    return line
+
+
+def _print_footer(report: ComparisonReport) -> None:
+    correctness_warnings = sum(
+        1
+        for d in (*report.regressions, *report.improvements, *report.unchanged)
+        if d.output_changed is True
+    )
+    footer = (
+        f"\n {report.total_compared} compared"
+        f" · {len(report.regressions)} regression(s)"
+        f" · {len(report.improvements)} improvement(s)"
+    )
+    if correctness_warnings:
+        footer += f" · {correctness_warnings} correctness warning(s)"
+    print(footer)
+    print()
+
+
 def print_comparison_report(report: ComparisonReport, baseline_path: Path) -> None:
     """Print a human-readable comparison report to stdout.
 
@@ -214,22 +265,12 @@ def print_comparison_report(report: ComparisonReport, baseline_path: Path) -> None:
     if report.regressions:
         print(f"\n ✗ Regressions ({len(report.regressions)})")
         for diff in report.regressions:
-            print(
-                f" {_short_name(diff.name):<42}"
-                f" {_format_ns(diff.baseline_mean_ns):>8}"
-                f" → {_format_ns(diff.current_mean_ns):>8}"
-                f" {diff.change_pct}"
-            )
+            print(_fmt_diff(diff))
 
     if report.improvements:
         print(f"\n ✓ Improvements ({len(report.improvements)})")
         for diff in report.improvements:
-            print(
-                f" {_short_name(diff.name):<42}"
-                f" {_format_ns(diff.baseline_mean_ns):>8}"
-                f" → {_format_ns(diff.current_mean_ns):>8}"
-                f" {diff.change_pct}"
-            )
+            print(_fmt_diff(diff))
 
     if report.new_benchmarks:
         print(f"\n + New ({len(report.new_benchmarks)})")
@@ -241,9 +282,4 @@ def print_comparison_report(report: ComparisonReport, baseline_path: Path) -> None:
         for uri in report.removed_benchmarks:
             print(f" {_short_name(uri)}")
 
-    print(
-        f"\n {report.total_compared} compared"
-        f" · {len(report.regressions)} regression(s)"
-        f" · {len(report.improvements)} improvement(s)"
-    )
-    print()
+    _print_footer(report)
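
For a local two-run workflow, the new pieces compose roughly as below. This is a sketch; the result-file paths are illustrative and not defined by this commit:

    from pathlib import Path

    from pytest_codspeed.comparison import compare_results, print_comparison_report

    baseline_path = Path("results/baseline.json")  # illustrative paths
    current_path = Path("results/current.json")

    report = compare_results(baseline_path, current_path)
    print_comparison_report(report, baseline_path)

    # output_changed is only True/False when both runs used --codspeed-capture-output
    broken = [
        d.name
        for d in (*report.regressions, *report.improvements, *report.unchanged)
        if d.output_changed is True
    ]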

src/pytest_codspeed/config.py

Lines changed: 4 additions & 0 deletions
@@ -22,6 +22,7 @@ class CodSpeedConfig:
     warmup_time_ns: int | None = None
     max_time_ns: int | None = None
     max_rounds: int | None = None
+    capture_output: bool = False
 
     @classmethod
     def from_pytest_config(cls, config: pytest.Config) -> CodSpeedConfig:
@@ -35,6 +36,9 @@ def from_pytest_config(cls, config: pytest.Config) -> CodSpeedConfig:
             warmup_time_ns=warmup_time_ns,
             max_rounds=config.getoption("--codspeed-max-rounds", None),
             max_time_ns=max_time_ns,
+            capture_output=bool(
+                config.getoption("--codspeed-capture-output", False)
+            ),
         )
 

src/pytest_codspeed/eval_harness.py

Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
+"""Scoring utilities for evaluating LLM-generated optimisation suggestions.
+
+After two consecutive walltime runs with ``--codspeed-capture-output``, the
+local comparison report includes both a performance delta and a correctness
+signal (``output_changed``). This module turns those two signals into a
+single scalar score that can be used to rank or accept/reject suggestions
+automatically.
+
+Score semantics
+---------------
+* ``1.0`` - 100 % faster, output unchanged (theoretical maximum).
+* ``0.33`` - 33 % faster, output unchanged.
+* ``0.0`` - any speedup, but output changed (correctness broken).
+* ``nan`` - run was not collected with ``--codspeed-capture-output``;
+  correctness is unknown, score cannot be computed.
+
+Usage in an optimisation loop::
+
+    from pytest_codspeed.comparison import compare_results
+    from pytest_codspeed.eval_harness import compute_score, EvalReport
+
+    report = compare_results(baseline_path, current_path)
+    eval_report = EvalReport.from_comparison(report)
+    for entry in eval_report.entries:
+        print(entry.name, entry.score)
+"""
+
+from __future__ import annotations
+
+import math
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from pytest_codspeed.comparison import BenchmarkDiff, ComparisonReport
+
+
+def compute_score(perf_gain: float, output_changed: bool | None) -> float:
+    """Return a scalar quality score for a single optimisation suggestion.
+
+    Args:
+        perf_gain: Relative performance improvement, positive = faster.
+            Computed as ``-diff.change_ratio`` (so +0.33 means 33 %
+            faster).
+        output_changed: ``True`` if the function's return value changed,
+            ``False`` if it stayed the same, ``None`` if the
+            capture flag was not used.
+
+    Returns:
+        * ``0.0`` when ``output_changed is True`` -- correctness is broken.
+        * ``float('nan')`` when ``output_changed is None`` -- unknown.
+        * ``max(0.0, perf_gain)`` otherwise -- reward speed, ignore slowdowns.
+    """
+    if output_changed is True:
+        return 0.0
+    if output_changed is None:
+        return float("nan")
+    return max(0.0, perf_gain)
+
+
+@dataclass(frozen=True)
+class EvalEntry:
+    """Score for a single benchmark in an optimisation suggestion."""
+
+    name: str
+    perf_gain: float
+    output_changed: bool | None
+    score: float
+
+    @property
+    def is_acceptable(self) -> bool:
+        """True when the suggestion improves perf without breaking correctness."""
+        return not math.isnan(self.score) and self.score > 0.0
+
+    @property
+    def correctness_unknown(self) -> bool:
+        return self.output_changed is None
+
+
+@dataclass(frozen=True)
+class EvalReport:
+    """Aggregated evaluation of an optimisation suggestion across all benchmarks."""
+
+    entries: tuple[EvalEntry, ...]
+
+    @classmethod
+    def from_comparison(cls, report: ComparisonReport) -> EvalReport:
+        """Build an :class:`EvalReport` from a :class:`ComparisonReport`."""
+        all_diffs: tuple[BenchmarkDiff, ...] = (
+            *report.regressions,
+            *report.improvements,
+            *report.unchanged,
+        )
+        entries = tuple(
+            EvalEntry(
+                name=diff.name,
+                perf_gain=-diff.change_ratio,
+                output_changed=diff.output_changed,
+                score=compute_score(-diff.change_ratio, diff.output_changed),
+            )
+            for diff in all_diffs
+        )
+        return cls(entries=entries)
+
+    @property
+    def acceptable(self) -> tuple[EvalEntry, ...]:
+        return tuple(e for e in self.entries if e.is_acceptable)
+
+    @property
+    def correctness_broken(self) -> tuple[EvalEntry, ...]:
+        return tuple(e for e in self.entries if e.output_changed is True)
+
+    @property
+    def correctness_unknown(self) -> tuple[EvalEntry, ...]:
+        return tuple(e for e in self.entries if e.correctness_unknown)
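
A few worked values for compute_score, restating the semantics above (a quick sketch, not part of the commit):

    import math

    from pytest_codspeed.eval_harness import compute_score

    assert compute_score(0.33, output_changed=False) == 0.33     # 33 % faster, output intact
    assert compute_score(0.50, output_changed=True) == 0.0       # faster but output changed
    assert compute_score(-0.10, output_changed=False) == 0.0     # slowdowns never score below 0
    assert math.isnan(compute_score(0.20, output_changed=None))  # capture flag not used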

src/pytest_codspeed/instruments/walltime.py

Lines changed: 35 additions & 3 deletions
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
+import hashlib
 import os
+import pickle
 import warnings
 from dataclasses import asdict, dataclass
 from math import ceil
@@ -146,13 +148,25 @@ def from_list(
         )
 
 
+def _hash_output(value: object) -> str | None:
+    """Return a short hex digest of *value*, or ``None`` if un-serialisable."""
+    try:
+        raw = pickle.dumps(value, protocol=pickle.HIGHEST_PROTOCOL)
+    except Exception:
+        try:
+            raw = repr(value).encode()
+        except Exception:
+            return None
+    return hashlib.sha256(raw).hexdigest()[:16]
+
+
 @dataclass
 class Benchmark:
     name: str
     uri: str
-
     config: BenchmarkConfig
     stats: BenchmarkStats
+    output_hash: str | None = None
 
 
 class WallTimeInstrument(Instrument):
@@ -200,6 +214,9 @@ def __codspeed_root_frame__() -> T:
 
         # Compute the actual result of the function
         out = __codspeed_root_frame__()
+        output_hash = (
+            _hash_output(out) if self.config.capture_output else None
+        )
 
         # Warmup
         times_per_round_ns: list[float] = []
@@ -258,7 +275,13 @@ def __codspeed_root_frame__() -> T:
         )
 
         self.benchmarks.append(
-            Benchmark(name=name, uri=uri, config=benchmark_config, stats=stats)
+            Benchmark(
+                name=name,
+                uri=uri,
+                config=benchmark_config,
+                stats=stats,
+                output_hash=output_hash,
+            )
         )
         return out

@@ -316,11 +339,20 @@ def __codspeed_root_frame__(*args, **kwargs) -> T:
         # Compute the actual result of the function
         args, kwargs = pedantic_options.setup_and_get_args_kwargs()
        out = __codspeed_root_frame__(*args, **kwargs)
+        output_hash = (
+            _hash_output(out) if self.config.capture_output else None
+        )
         if pedantic_options.teardown is not None:
             pedantic_options.teardown(*args, **kwargs)
 
         self.benchmarks.append(
-            Benchmark(name=name, uri=uri, config=benchmark_config, stats=stats)
+            Benchmark(
+                name=name,
+                uri=uri,
+                config=benchmark_config,
+                stats=stats,
+                output_hash=output_hash,
+            )
         )
         return out
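
In isolation, the helper behaves as below, assuming _hash_output is importable from pytest_codspeed.instruments.walltime (a sketch, not shipped with the commit):

    from pytest_codspeed.instruments.walltime import _hash_output

    # Equal picklable outputs produce equal 16-character digests...
    assert _hash_output([1, 2, 3]) == _hash_output([1, 2, 3])
    # ...and a changed output produces a different digest.
    assert _hash_output([1, 2, 3]) != _hash_output([1, 2, 4])

    # Unpicklable values fall back to repr(); note that the repr of objects like
    # generators embeds a memory address, so such hashes are not stable across runs.
    gen = (x for x in range(3))
    assert _hash_output(gen) is not None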

src/pytest_codspeed/plugin.py

Lines changed: 9 additions & 0 deletions
@@ -80,6 +80,15 @@ def pytest_addoption(parser: pytest.Parser):
             ", only for walltime mode"
         ),
     )
+    group.addoption(
+        "--codspeed-capture-output",
+        action="store_true",
+        default=False,
+        help=(
+            "Hash the return value of each benchmark and store it in the "
+            "result file, enabling correctness checks in local comparisons"
+        ),
+    )
 
 
 @dataclass(unsafe_hash=True)
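
With the option registered, a local walltime run such as pytest --codspeed --codspeed-capture-output hashes whatever the benchmarked callable returns. A hypothetical test to make that concrete; the value passed back through the benchmark fixture is what ends up hashed:

    # test_transform.py -- hypothetical example
    def transform(values: list[int]) -> list[int]:
        return sorted(v * 2 for v in values)


    def test_transform(benchmark):
        # benchmark() times the callable and passes its return value through;
        # with --codspeed-capture-output that return value is pickled and hashed.
        result = benchmark(transform, [3, 1, 2])
        assert result == [2, 4, 6]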
