Skip to content

Commit 63ca2ac

Browse files
authored
Merge pull request #2850 from ClickHouse/05/11/26/benchmark_compare_results
fix compare result
2 parents ec4ea2f + c0bd381 commit 63ca2ac

2 files changed

Lines changed: 149 additions & 24 deletions

File tree

.github/scripts/compare-jmh.py

Lines changed: 138 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,11 @@
1111
mean sampled latency per op; it's our best available proxy for CPU
1212
work since no dedicated CPU profiler is configured in
1313
`BenchmarkRunner`.
14-
* `Alloc/op` — `secondaryMetrics["·gc.alloc.rate.norm"]`, populated by
14+
* `Alloc/op` — `secondaryMetrics["gc.alloc.rate.norm"]`, populated by
1515
JMH's `GCProfiler`. This is bytes allocated per benchmark op and is
16-
the standard, low-noise JMH memory metric.
16+
the standard, low-noise JMH memory metric. (Note: JMH 1.37 stores the
17+
key with no prefix; some visualizer tools display it as
18+
`·gc.alloc.rate.norm` but the raw JSON does not contain that dot.)
1719
1820
Both metrics are "lower is better", so a positive delta indicates the
1921
PR is worse than the baseline. A run is considered failed when **any**
@@ -30,6 +32,7 @@
3032
import json
3133
import os
3234
import sys
35+
from collections import defaultdict
3336
from dataclasses import dataclass
3437
from typing import Any, Callable, Dict, List, Optional, Tuple
3538

@@ -41,9 +44,25 @@
4144
# ---------------------------------------------------------------------------
4245

4346
# JMH's `GCProfiler` reports allocation rate normalised per op under this
44-
# secondary metric key (the leading char is U+00B7 MIDDLE DOT, not a regular
45-
# dot — that's JMH's convention for profiler-emitted metrics).
46-
ALLOC_NORM_KEY = "\u00b7gc.alloc.rate.norm"
47+
# secondary metric key. Verified against jmh-core 1.37 sources: the key
48+
# is the literal `gc.alloc.rate.norm` with no prefix. Some visualizers
49+
# render it as `·gc.alloc.rate.norm`, which is a display convention, not
50+
# what JMH writes to JSON.
51+
ALLOC_NORM_KEY = "gc.alloc.rate.norm"
52+
53+
# Tolerated prefix-bearing variants we'll fall back to if the canonical
54+
# key ever moves. Lets a future JMH (or a non-Oracle JVM profiler that
55+
# decorates the label) keep working without us being aware.
56+
ALLOC_NORM_VARIANTS = (
57+
ALLOC_NORM_KEY,
58+
"\u00b7gc.alloc.rate.norm", # pre-emptive middle-dot variant
59+
"+gc.alloc.rate.norm",
60+
)
61+
62+
# Set to True the first time we fail to find any allocation data; used
63+
# to emit a one-time diagnostic listing the secondary keys present so
64+
# the cause is obvious in workflow logs.
65+
_alloc_warn_printed = False
4766

4867

4968
@dataclass(frozen=True)
@@ -69,13 +88,35 @@ def _secondary(record: Dict[str, Any], key: str) -> Optional[Dict[str, Any]]:
6988
return val if isinstance(val, dict) else None
7089

7190

91+
def _alloc(record: Dict[str, Any]) -> Optional[Dict[str, Any]]:
92+
"""Pull the GC allocation-per-op metric, tolerating prefix variants.
93+
94+
Falls back to a suffix match so even an unrecognised prefix (e.g. a
95+
third-party JMH profiler that decorates the label) still works."""
96+
global _alloc_warn_printed
97+
sm = record.get("secondaryMetrics") or {}
98+
for k in ALLOC_NORM_VARIANTS:
99+
v = sm.get(k)
100+
if isinstance(v, dict):
101+
return v
102+
# Last-resort suffix match.
103+
for k, v in sm.items():
104+
if isinstance(v, dict) and k.lower().endswith("gc.alloc.rate.norm"):
105+
return v
106+
if sm and not _alloc_warn_printed:
107+
print(
108+
"warn: no `gc.alloc.rate.norm` (or prefix-variant) found in "
109+
"secondaryMetrics. Available keys: "
110+
+ ", ".join(sorted(sm.keys())),
111+
file=sys.stderr,
112+
)
113+
_alloc_warn_printed = True
114+
return None
115+
116+
72117
METRICS: List[Metric] = [
73118
Metric(id="time", label="Time", extract=_primary),
74-
Metric(
75-
id="alloc",
76-
label="Alloc/op",
77-
extract=lambda r: _secondary(r, ALLOC_NORM_KEY),
78-
),
119+
Metric(id="alloc", label="Alloc/op", extract=_alloc),
79120
]
80121

81122

@@ -247,6 +288,35 @@ def build_rows(
247288
# ---------------------------------------------------------------------------
248289

249290

291+
def compute_discriminators(
292+
rows: List[Row], current: Dict[Key, Dict[str, Any]]
293+
) -> Dict[Key, str]:
294+
"""For each row, return a short string of only the params that differ
295+
between rows sharing the same fully-qualified benchmark name.
296+
297+
When a benchmark is run with only one combination of params, its
298+
discriminator is the empty string (no need to disambiguate).
299+
"""
300+
by_bench: Dict[str, List[Row]] = defaultdict(list)
301+
for r in rows:
302+
bench, _ = r.key
303+
by_bench[bench].append(r)
304+
305+
out: Dict[Key, str] = {}
306+
for bench, group in by_bench.items():
307+
if len(group) <= 1:
308+
out[group[0].key] = ""
309+
continue
310+
params_dicts = [(current.get(r.key) or {}).get("params") or {} for r in group]
311+
all_keys = sorted({k for p in params_dicts for k in p.keys()})
312+
varying = [
313+
k for k in all_keys if len({p.get(k) for p in params_dicts}) > 1
314+
]
315+
for r, p in zip(group, params_dicts):
316+
out[r.key] = ", ".join(f"{k}={p[k]}" for k in varying if k in p)
317+
return out
318+
319+
250320
def build_markdown(
251321
rows: List[Row],
252322
only_current: List[Key],
@@ -266,14 +336,19 @@ def build_markdown(
266336
if not any(d.regression(threshold) for d in r.deltas)
267337
and any(d.improvement(threshold) for d in r.deltas)
268338
)
339+
discriminators = compute_discriminators(rows, current)
269340

270341
out: List[str] = ["<!-- jmh-benchmark-comparison -->"]
271342
if regressions:
272-
out.append(f"## ❌ JMH benchmark comparison — {regressions} regression(s) over {threshold:g}%")
343+
out.append(
344+
f"## ❌ JMH benchmark comparison — {regressions} regression(s) over {threshold:g}%"
345+
)
273346
elif improvements:
274-
out.append(f"## ✅ JMH benchmark comparison — {improvements} improvement(s) over {threshold:g}%")
347+
out.append(
348+
f"## ✅ JMH benchmark comparison — no regressions, {improvements} improvement(s) over {threshold:g}%"
349+
)
275350
else:
276-
out.append(f"## JMH benchmark comparison — no changes over {threshold:g}%")
351+
out.append(f"## JMH benchmark comparison — no changes over {threshold:g}%")
277352
out.append("")
278353

279354
if repo and baseline_run_id and current_run_id:
@@ -285,15 +360,56 @@ def build_markdown(
285360
)
286361
out.append("")
287362

288-
out.append(
289-
f"Threshold: **±{threshold:g}%**. "
290-
f"Metrics: **Time** (`primaryMetric.score`, `SampleTime` — proxy for CPU work) and "
291-
f"**Alloc/op** (`{ALLOC_NORM_KEY}`, GC allocations per op — memory pressure). "
292-
"Both are lower-is-better, so a positive Δ% means the PR is worse than baseline."
293-
)
294-
out.append("")
363+
# ---- brief per-benchmark summary --------------------------------------
364+
if rows:
365+
# Stable order: regressions first, then improvements, then noise.
366+
# Within each bucket, biggest |Δ| first.
367+
def bucket(r: Row) -> int:
368+
if any(d.regression(threshold) for d in r.deltas):
369+
return 0
370+
if any(d.improvement(threshold) for d in r.deltas):
371+
return 1
372+
return 2
373+
374+
rows = sorted(rows, key=lambda r: (bucket(r), -r.sort_key()))
375+
376+
for r in rows:
377+
bench, _ = r.key
378+
disc = discriminators.get(r.key, "")
379+
b = bucket(r)
380+
icon = "❌" if b == 0 else "✅"
381+
382+
# In the brief view, only mention metrics that actually crossed
383+
# the threshold — keeps noisy rows to a single line.
384+
bits: List[str] = []
385+
for d in r.deltas:
386+
if d.delta_pct is None:
387+
continue
388+
if d.regression(threshold):
389+
bits.append(f"**{d.metric.label} {fmt_delta(d.delta_pct)}**")
390+
elif d.improvement(threshold):
391+
bits.append(f"{d.metric.label} {fmt_delta(d.delta_pct)}")
392+
393+
line = f"- {icon} `{short_bench(bench)}`"
394+
if disc:
395+
line += f" `[{disc}]`"
396+
if bits:
397+
line += " — " + ", ".join(bits)
398+
out.append(line)
399+
out.append("")
400+
else:
401+
out.append("_No benchmarks matched between baseline and PR._")
402+
out.append("")
295403

404+
# ---- detailed table (collapsed) ---------------------------------------
296405
if rows:
406+
out.append(
407+
f"<details><summary>Detailed metrics "
408+
f"(threshold ±{threshold:g}%, Time = <code>primaryMetric.score</code> from <code>SampleTime</code>, "
409+
f"Alloc/op = <code>{ALLOC_NORM_KEY}</code>; positive Δ% = worse than baseline)"
410+
"</summary>"
411+
)
412+
out.append("")
297413
header = "| Benchmark | Params | " + " | ".join(m.label for m in METRICS) + " | Status |"
298414
sep = "|---|---|" + "|".join(["---"] * len(METRICS)) + "|---|"
299415
out.append(header)
@@ -305,8 +421,7 @@ def build_markdown(
305421
f"| `{short_bench(bench)}` | {params or '—'} | {cells} | {r.status(threshold)} |"
306422
)
307423
out.append("")
308-
else:
309-
out.append("_No benchmarks matched between baseline and PR._")
424+
out.append("</details>")
310425
out.append("")
311426

312427
if only_current:
@@ -316,7 +431,7 @@ def build_markdown(
316431
bench, params = k
317432
rec = current[k]
318433
time_d = _primary(rec) or {}
319-
alloc_d = _secondary(rec, ALLOC_NORM_KEY) or {}
434+
alloc_d = _alloc(rec) or {}
320435
out.append(
321436
f"- `{short_bench(bench)}` ({params or '—'}): "
322437
f"time={fmt_score(_float(time_d, 'score'), _float(time_d, 'scoreError'), time_d.get('scoreUnit', ''))}, "

.github/workflows/benchmarks.yml

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,16 @@ jobs:
121121
with:
122122
ref: ${{ env.CHC_BRANCH }}
123123

124+
# The benchmark code runs against the PR's working tree (see the
125+
# PR checkout below), but the comparison tooling lives in this
126+
# workflow's contract on `main`. Stash a copy now so the compare
127+
# step still works even when the PR branch was forked before
128+
# `.github/scripts/compare-jmh.py` existed.
129+
- name: Stash comparison tooling from main
130+
run: |
131+
mkdir -p "$RUNNER_TEMP/jmh-tools"
132+
cp -v .github/scripts/compare-jmh.py "$RUNNER_TEMP/jmh-tools/"
133+
124134
- name: Check out PR
125135
if: steps.pr.outputs.number != ''
126136
run: |
@@ -196,7 +206,7 @@ jobs:
196206
if: steps.pr.outputs.number != '' && steps.baseline.outputs.found == 'true'
197207
continue-on-error: true
198208
run: |
199-
python3 .github/scripts/compare-jmh.py \
209+
python3 "$RUNNER_TEMP/jmh-tools/compare-jmh.py" \
200210
--baseline baseline-results \
201211
--current performance \
202212
--baseline-run-id "${{ steps.baseline.outputs.run_id }}" \

0 commit comments

Comments
 (0)