Skip to content

Commit 5d8be1f

Browse files
committed
reorganized tables and added test data to support test_compare_jmh.py script
1 parent 5360364 commit 5d8be1f

3 files changed

Lines changed: 330 additions & 26 deletions

File tree

.github/scripts/compare-jmh.py

Lines changed: 80 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,70 @@ def compute_discriminators(
317317
return out
318318

319319

320+
# Visual markers used in rendered output. Arrow direction tracks
# performance rather than the raw number: "performance went up" =
# improvement (less time / less memory per op), so a regression points down.
ARROW_REGRESS = "\u2b07\ufe0f"  # ⬇️
ARROW_IMPROVE = "\u2b06\ufe0f"  # ⬆️
ARROW_NOISE = "\u2796"  # ➖

# Per-metric dots. DOT_IMPROVE is also used by build_markdown as the
# headline marker when nothing crossed the threshold.
DOT_REGRESS = "\U0001f534"  # 🔴
DOT_IMPROVE = "\U0001f7e2"  # 🟢
328+
329+
330+
def _row_arrow(r: Row, threshold: float) -> str:
    """Choose the summary arrow shown next to a row.

    A regression on any metric takes precedence over improvements on the
    other metrics of the same row; a row with neither gets the neutral
    marker.
    """
    # First pass: a single regressed metric is enough to flag the row.
    for d in r.deltas:
        if d.regression(threshold):
            return ARROW_REGRESS
    # Second pass only runs when nothing regressed.
    for d in r.deltas:
        if d.improvement(threshold):
            return ARROW_IMPROVE
    return ARROW_NOISE
336+
337+
338+
def _short_delta(d: MetricDelta, threshold: float) -> str:
    """Format one metric for the brief bullet list.

    Yields an empty string when the metric has no delta or stayed within
    the threshold, so the caller can filter it out. A regression is
    prefixed with 🔴 and wrapped in bold; an improvement is prefixed with
    🟢 and left unbolded.
    """
    if d.delta_pct is None:
        return ""
    # "<label> <delta>" is shared by both the regression and improvement forms.
    summary = f"{d.metric.label} {fmt_delta(d.delta_pct)}"
    if d.regression(threshold):
        return f"{DOT_REGRESS} **{summary}**"
    if not d.improvement(threshold):
        return ""
    return f"{DOT_IMPROVE} {summary}"
354+
355+
356+
def _stats_cell(r: Row, threshold: float) -> str:
    """Render the joined Stats cell for one row in the detail table.

    Each metric occupies one `<br>`-separated line of the form
    ``<label> <baseline> → <current> (<delta>)``.

    * Regressions are 🔴-tagged, with the label and the delta in bold.
    * Improvements are 🟢-tagged, with only the label in bold.
    * Metrics inside the threshold are rendered without a marker.
    * Metrics without a computable delta show "(—)" in place of the
      delta, so the cell still shows which dimension is missing. How a
      missing score itself is displayed is decided by ``fmt_score``.

    Args:
        r: Row whose ``deltas`` are rendered, in order.
        threshold: Percentage passed to each delta's ``regression`` /
            ``improvement`` check.

    Returns:
        The markdown/HTML fragment for the table cell; an empty string
        when the row has no deltas.
    """
    lines: List[str] = []
    for d in r.deltas:
        label = d.metric.label
        # Baseline and current are formatted the same way in every branch,
        # so build the "before → after" span once. The arrow separator keeps
        # the two scores from running together in the rendered cell.
        base = fmt_score(d.baseline, d.baseline_err, d.unit)
        curr = fmt_score(d.current, d.current_err, d.unit)
        span = f"{base} → {curr}"

        if d.delta_pct is None:
            # No delta to classify: show a placeholder instead of a percentage.
            lines.append(f"{label} {span} (—)")
            continue

        delta = fmt_delta(d.delta_pct)
        if d.regression(threshold):
            lines.append(f"{DOT_REGRESS} **{label}** {span} (**{delta}**)")
        elif d.improvement(threshold):
            lines.append(f"{DOT_IMPROVE} **{label}** {span} ({delta})")
        else:
            lines.append(f"{label} {span} ({delta})")
    return "<br>".join(lines)
382+
383+
320384
def build_markdown(
321385
rows: List[Row],
322386
only_current: List[Key],
@@ -341,14 +405,14 @@ def build_markdown(
341405
out: List[str] = ["<!-- jmh-benchmark-comparison -->"]
342406
if regressions:
343407
out.append(
344-
f"## JMH benchmark comparison — {regressions} regression(s) over {threshold:g}%"
408+
f"## {DOT_REGRESS} JMH benchmark comparison — {regressions} regression(s) over {threshold:g}%"
345409
)
346410
elif improvements:
347411
out.append(
348-
f"## JMH benchmark comparison — no regressions, {improvements} improvement(s) over {threshold:g}%"
412+
f"## {DOT_IMPROVE} JMH benchmark comparison — no regressions, {improvements} improvement(s) over {threshold:g}%"
349413
)
350414
else:
351-
out.append(f"## JMH benchmark comparison — no changes over {threshold:g}%")
415+
out.append(f"## {DOT_IMPROVE} JMH benchmark comparison — no changes over {threshold:g}%")
352416
out.append("")
353417

354418
if repo and baseline_run_id and current_run_id:
@@ -376,21 +440,13 @@ def bucket(r: Row) -> int:
376440
for r in rows:
377441
bench, _ = r.key
378442
disc = discriminators.get(r.key, "")
379-
b = bucket(r)
380-
icon = "❌" if b == 0 else "✅"
381-
382-
# In the brief view, only mention metrics that actually crossed
383-
# the threshold — keeps noisy rows to a single line.
384-
bits: List[str] = []
385-
for d in r.deltas:
386-
if d.delta_pct is None:
387-
continue
388-
if d.regression(threshold):
389-
bits.append(f"**{d.metric.label} {fmt_delta(d.delta_pct)}**")
390-
elif d.improvement(threshold):
391-
bits.append(f"{d.metric.label} {fmt_delta(d.delta_pct)}")
392-
393-
line = f"- {icon} `{short_bench(bench)}`"
443+
arrow = _row_arrow(r, threshold)
444+
445+
# Only mention metrics that actually crossed the threshold
446+
# in the brief view — keeps noisy rows to a single line.
447+
bits = [s for s in (_short_delta(d, threshold) for d in r.deltas) if s]
448+
449+
line = f"- {arrow} `{short_bench(bench)}`"
394450
if disc:
395451
line += f" `[{disc}]`"
396452
if bits:
@@ -410,16 +466,14 @@ def bucket(r: Row) -> int:
410466
"</summary>"
411467
)
412468
out.append("")
413-
header = "| Benchmark | Params | " + " | ".join(m.label for m in METRICS) + " | Status |"
414-
sep = "|---|---|" + "|".join(["---"] * len(METRICS)) + "|---|"
415-
out.append(header)
416-
out.append(sep)
469+
out.append("| Benchmark | Stats |")
470+
out.append("|---|---|")
417471
for r in rows:
418472
bench, params = r.key
419-
cells = " | ".join(d.cell() for d in r.deltas)
420-
out.append(
421-
f"| `{short_bench(bench)}` | {params or '—'} | {cells} | {r.status(threshold)} |"
422-
)
473+
bench_cell = f"`{short_bench(bench)}`"
474+
if params:
475+
bench_cell += f"<br><sub>{params}</sub>"
476+
out.append(f"| {bench_cell} | {_stats_cell(r, threshold)} |")
423477
out.append("")
424478
out.append("</details>")
425479
out.append("")
Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
#!/usr/bin/env python3
2+
"""End-to-end tests for `compare-jmh.py`.
3+
4+
Runs `compare-jmh.py` against every scenario in `test_data/` via
5+
subprocess and asserts on both the rendered markdown and the
6+
`--summary-output` counters. Designed to be run locally
7+
(`python3 .github/scripts/test_compare_jmh.py`) and from CI without
8+
extra dependencies — only the standard library is used.
9+
"""
10+
11+
from __future__ import annotations
12+
13+
import os
14+
import shutil
15+
import subprocess
16+
import sys
17+
import tempfile
18+
import unittest
19+
from pathlib import Path
20+
from typing import Dict, List
21+
22+
HERE = Path(__file__).resolve().parent
23+
SCRIPT = HERE / "compare-jmh.py"
24+
DATA = HERE / "test_data"
25+
26+
27+
def _parse_summary(path: Path) -> Dict[str, str]:
28+
out: Dict[str, str] = {}
29+
for line in path.read_text(encoding="utf-8").splitlines():
30+
if "=" in line:
31+
k, v = line.split("=", 1)
32+
out[k.strip()] = v.strip()
33+
return out
34+
35+
36+
def _run_compare(
    case: str,
    *,
    threshold: float = 10.0,
    extra_args: List[str] | None = None,
) -> Dict[str, object]:
    """Run compare-jmh.py on one fixture directory and collect its outputs.

    The script is launched with the current interpreter and writes its
    markdown and summary files into a scratch directory that is removed
    before this function returns.

    Returns a dict with the keys ``returncode``, ``stdout``, ``stderr``,
    ``markdown`` (empty string if the script wrote no markdown file) and
    ``summary`` (empty dict if the script wrote no summary file).
    """
    fixture = DATA / case
    assert fixture.is_dir(), f"missing test fixture: {fixture}"

    workdir = Path(tempfile.mkdtemp(prefix=f"cmp-jmh-{case}-"))
    try:
        markdown_path = workdir / "out.md"
        summary_path = workdir / "summary.env"

        argv = [sys.executable, str(SCRIPT)]
        argv += ["--baseline", str(fixture / "baseline")]
        argv += ["--current", str(fixture / "current")]
        argv += ["--threshold-pct", str(threshold)]
        argv += ["--output", str(markdown_path)]
        argv += ["--summary-output", str(summary_path)]
        # Caller-supplied flags go last.
        argv += extra_args or []

        completed = subprocess.run(argv, capture_output=True, text=True)

        # Either output file may be absent (e.g. the script failed early),
        # so fall back to empty values instead of raising.
        if markdown_path.exists():
            markdown = markdown_path.read_text(encoding="utf-8")
        else:
            markdown = ""
        counters = _parse_summary(summary_path) if summary_path.exists() else {}

        return {
            "returncode": completed.returncode,
            "stdout": completed.stdout,
            "stderr": completed.stderr,
            "markdown": markdown,
            "summary": counters,
        }
    finally:
        shutil.rmtree(workdir, ignore_errors=True)
80+
81+
82+
# Pre-computed marker strings — keep in sync with compare-jmh.py.
# These are redefined here (the script itself is only ever run as a
# subprocess), so a change to the script's markers must be mirrored below.
ARROW_REGRESS = "\u2b07\ufe0f"  # ⬇️
ARROW_IMPROVE = "\u2b06\ufe0f"  # ⬆️
ARROW_NOISE = "\u2796"  # ➖
DOT_REGRESS = "\U0001f534"  # 🔴
DOT_IMPROVE = "\U0001f7e2"  # 🟢
88+
89+
90+
class CompareJmhTest(unittest.TestCase):
    """One test per scenario directory under test_data/.

    Each test checks some combination of the headline wording, the
    arrow/dot markers in the rendered markdown, and the counters written
    to the `--summary-output` file.
    """

    def _run_ok(self, case: str) -> Dict[str, object]:
        """Run a fixture with default settings and require exit status 0."""
        outcome = _run_compare(case)
        self.assertEqual(outcome["returncode"], 0)
        return outcome

    def test_all_improvements(self) -> None:
        outcome = self._run_ok("all_improvements")
        markdown = outcome["markdown"]
        self.assertIn("no regressions", markdown)
        for marker in (ARROW_IMPROVE, DOT_IMPROVE):
            self.assertIn(marker, markdown)
        for marker in (ARROW_REGRESS, DOT_REGRESS):
            self.assertNotIn(marker, markdown)
        counters = outcome["summary"]
        self.assertEqual(counters.get("regressions"), "0")
        self.assertEqual(counters.get("improvements"), "2")
        self.assertEqual(counters.get("matched"), "2")

    def test_all_regressions(self) -> None:
        outcome = self._run_ok("all_regressions")
        markdown = outcome["markdown"]
        self.assertIn("regression(s) over", markdown)
        for marker in (ARROW_REGRESS, DOT_REGRESS):
            self.assertIn(marker, markdown)
        # The metric label of a regression is wrapped in markdown bold.
        self.assertIn("**Time", markdown)
        counters = outcome["summary"]
        self.assertEqual(counters.get("regressions"), "2")
        # No row is "purely improved", so the improvement counter stays 0.
        self.assertEqual(counters.get("improvements"), "0")

    def test_mixed(self) -> None:
        outcome = self._run_ok("mixed")
        markdown = outcome["markdown"]
        # Both directions must show up, as arrows and as dots.
        for marker in (ARROW_REGRESS, ARROW_IMPROVE, DOT_REGRESS, DOT_IMPROVE):
            self.assertIn(marker, markdown)
        # The two `queryV2` variants of the same benchmark are told apart
        # by a `[limit=…]` discriminator suffix.
        for suffix in ("[limit=10000]", "[limit=100000]"):
            self.assertIn(suffix, markdown)
        # A benchmark with a unique name must NOT get a discriminator.
        self.assertNotIn("`JDBCQuery.selectJDBCV2` `[", markdown)
        counters = outcome["summary"]
        self.assertEqual(counters.get("matched"), "4")
        self.assertGreater(int(counters.get("regressions", "0")), 0)

    def test_no_alloc(self) -> None:
        # With no `gc.alloc.rate.norm` anywhere, the script must still
        # compare Time and print a diagnostic on stderr.
        outcome = self._run_ok("no_alloc")
        self.assertIn("no `gc.alloc.rate.norm`", outcome["stderr"])
        markdown = outcome["markdown"]
        # The Time regression is still detected and 🔴-tagged…
        self.assertIn(DOT_REGRESS, markdown)
        # …while Alloc/op is still listed even though there is no
        # baseline/current data to classify it with.
        self.assertIn("Alloc/op", markdown)
        # In the detail table the alloc entry falls back to "(—)".
        self.assertIn("Alloc/op — → — (—)", markdown)
        counters = outcome["summary"]
        self.assertEqual(counters.get("matched"), "2")

    def test_noise_only(self) -> None:
        outcome = self._run_ok("noise_only")
        markdown = outcome["markdown"]
        self.assertIn("no changes over 10%", markdown)
        # Rows are expected to carry the neutral arrow…
        self.assertIn(ARROW_NOISE, markdown)
        # …and no red dot may appear anywhere.
        self.assertNotIn(DOT_REGRESS, markdown)
        # The 🟢 dot is part of the "OK" headline, so DOT_IMPROVE is
        # deliberately not asserted absent here.
        counters = outcome["summary"]
        self.assertEqual(counters.get("regressions"), "0")
        self.assertEqual(counters.get("improvements"), "0")

    def test_only_in_pr(self) -> None:
        outcome = self._run_ok("only_in_pr")
        markdown = outcome["markdown"]
        self.assertIn("Benchmarks only in PR run", markdown)
        self.assertIn("QueryClient.queryV3New", markdown)
        counters = outcome["summary"]
        # Exactly one benchmark is shared between the two sides.
        self.assertEqual(counters.get("matched"), "1")

    def test_only_in_baseline(self) -> None:
        outcome = self._run_ok("only_in_baseline")
        markdown = outcome["markdown"]
        self.assertIn("Benchmarks only in baseline run", markdown)
        self.assertIn("QueryClient.queryV0Removed", markdown)
        counters = outcome["summary"]
        self.assertEqual(counters.get("matched"), "1")

    def test_empty_intersection(self) -> None:
        outcome = self._run_ok("empty_intersection")
        markdown = outcome["markdown"]
        self.assertIn("_No benchmarks matched between baseline and PR._", markdown)
        # Both one-sided sections are still rendered.
        self.assertIn("Benchmarks only in PR run", markdown)
        self.assertIn("Benchmarks only in baseline run", markdown)
        counters = outcome["summary"]
        for counter in ("matched", "regressions", "improvements"):
            self.assertEqual(counters.get(counter), "0")

    def test_threshold_knob(self) -> None:
        # The same fixture flips from "regression" to "ok" once the
        # threshold is widened past the largest delta.
        tight = _run_compare("all_regressions", threshold=10.0)
        loose = _run_compare("all_regressions", threshold=200.0)
        self.assertGreater(int(tight["summary"]["regressions"]), 0)
        self.assertEqual(loose["summary"]["regressions"], "0")
        self.assertIn("no changes", loose["markdown"])
213+
214+
215+
if __name__ == "__main__":
    # verbosity=2 is the programmatic form of `-v`: each scenario name is
    # printed as it runs, so failures are obvious in CI logs.
    unittest.main(verbosity=2)
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# `compare-jmh.py` test fixtures
2+
3+
Each subdirectory is a self-contained scenario for `compare-jmh.py`. The
4+
layout is always:
5+
6+
```
7+
<case>/
8+
baseline/jmh-results-baseline.json
9+
current/jmh-results-current.json
10+
```
11+
12+
`compare-jmh.py` discovers result files by globbing for
13+
`jmh-results-*.json` under the `--baseline` and `--current` directories,
14+
so any filename starting with `jmh-results-` works.
15+
16+
JSON records mirror the structure produced by JMH 1.37's
17+
`ResultFormatType.JSON`: an array of objects with `benchmark`, `params`,
18+
`primaryMetric.{score,scoreError,scoreUnit}`, and optionally
19+
`secondaryMetrics["gc.alloc.rate.norm"]`.
20+
21+
| Case | What it covers |
22+
|---|---|
23+
| `all_improvements` | Multiple benchmarks where both Time and Alloc/op fall well below the threshold; report should be all ⬆️ / 🟢 with no failure. |
24+
| `all_regressions` | Multiple benchmarks where Time and/or Alloc/op rise well above the threshold; report should be ⬇️ / 🔴, script flags every row as `REGRESSION`, summary `regressions > 0`. |
25+
| `mixed` | A blend of regressions, improvements, and within-noise rows including multiple variants of the same benchmark — verifies the bucket ordering and the param-discriminator (`[limit=…]`) logic. |
26+
| `no_alloc` | Records with no `gc.alloc.rate.norm` at all; verifies the script falls back to Time-only and doesn't render `🔴`/`🟢` markers for Alloc/op when the key is absent (plus prints the diagnostic warning). |
27+
| `noise_only` | All deltas are inside ±10%; report should be the 🟢 "no changes" header and every row should carry the ➖ neutral arrow. |
28+
| `only_in_pr` | A benchmark appears in `current` but not in `baseline`; verifies the "Benchmarks only in PR run" `<details>` block. |
29+
| `only_in_baseline` | The mirror case — verifies the "Benchmarks only in baseline run" block. |
30+
| `empty_intersection` | `baseline` and `current` contain different sets of benchmarks so no rows are matched; verifies the "_No benchmarks matched_" path. |
31+
32+
The companion runner `test_compare_jmh.py` exercises every case and
33+
checks both the rendered markdown and the `--summary-output` counters.

0 commit comments

Comments
 (0)