Skip to content

Commit 940e837

Browse files
committed
Changed PR report
1 parent 305cf0f commit 940e837

1 file changed

Lines changed: 87 additions & 12 deletions

File tree

.github/scripts/compare-jmh.py

Lines changed: 87 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import json
3131
import os
3232
import sys
33+
from collections import defaultdict
3334
from dataclasses import dataclass
3435
from typing import Any, Callable, Dict, List, Optional, Tuple
3536

@@ -247,6 +248,35 @@ def build_rows(
247248
# ---------------------------------------------------------------------------
248249

249250

251+
def compute_discriminators(
    rows: List[Row], current: Dict[Key, Dict[str, Any]]
) -> Dict[Key, str]:
    """Return, per row key, a short label of the params that tell it apart.

    Rows are grouped by the first element of ``row.key`` (the benchmark
    name). Within a group, only the params whose values differ between the
    rows of that group are kept; they are rendered as ``name=value`` pairs,
    joined by ``", "`` in sorted name order.

    Args:
        rows: Rows to label. Each must expose a two-element ``key`` whose
            first element is the benchmark name.
        current: Mapping from row key to a result entry. The entry's
            ``"params"`` dict supplies the param values; a missing/empty
            entry or missing/empty ``"params"`` counts as "no params".
            Param values are collected into a set, so they must be hashable.

    Returns:
        A dict mapping every row key to its discriminator string. A
        benchmark with a single row gets ``""`` (nothing to disambiguate),
        as does a row that carries none of the varying params.
    """
    by_bench: Dict[str, List[Row]] = defaultdict(list)
    for row in rows:
        bench, _ = row.key
        by_bench[bench].append(row)

    out: Dict[Key, str] = {}
    # Only the grouped rows matter from here on; the benchmark name itself
    # is not part of the discriminator.
    for group in by_bench.values():
        if len(group) <= 1:
            out[group[0].key] = ""
            continue

        # Tolerate a missing entry, a None entry, or a missing/None "params".
        params_dicts = [
            (current.get(row.key) or {}).get("params") or {} for row in group
        ]
        all_names = sorted({name for params in params_dicts for name in params})
        # A param that is absent from some rows shows up as None there, so
        # it counts as varying too.
        varying = [
            name
            for name in all_names
            if len({params.get(name) for params in params_dicts}) > 1
        ]
        for row, params in zip(group, params_dicts):
            out[row.key] = ", ".join(
                f"{name}={params[name]}" for name in varying if name in params
            )
    return out
278+
279+
250280
def build_markdown(
251281
rows: List[Row],
252282
only_current: List[Key],
@@ -266,14 +296,19 @@ def build_markdown(
266296
if not any(d.regression(threshold) for d in r.deltas)
267297
and any(d.improvement(threshold) for d in r.deltas)
268298
)
299+
discriminators = compute_discriminators(rows, current)
269300

270301
out: List[str] = ["<!-- jmh-benchmark-comparison -->"]
271302
if regressions:
272-
out.append(f"## ❌ JMH benchmark comparison — {regressions} regression(s) over {threshold:g}%")
303+
out.append(
304+
f"## ❌ JMH benchmark comparison — {regressions} regression(s) over {threshold:g}%"
305+
)
273306
elif improvements:
274-
out.append(f"## ✅ JMH benchmark comparison — {improvements} improvement(s) over {threshold:g}%")
307+
out.append(
308+
f"## ✅ JMH benchmark comparison — no regressions, {improvements} improvement(s) over {threshold:g}%"
309+
)
275310
else:
276-
out.append(f"## JMH benchmark comparison — no changes over {threshold:g}%")
311+
out.append(f"## JMH benchmark comparison — no changes over {threshold:g}%")
277312
out.append("")
278313

279314
if repo and baseline_run_id and current_run_id:
@@ -285,15 +320,56 @@ def build_markdown(
285320
)
286321
out.append("")
287322

288-
out.append(
289-
f"Threshold: **±{threshold:g}%**. "
290-
f"Metrics: **Time** (`primaryMetric.score`, `SampleTime` — proxy for CPU work) and "
291-
f"**Alloc/op** (`{ALLOC_NORM_KEY}`, GC allocations per op — memory pressure). "
292-
"Both are lower-is-better, so a positive Δ% means the PR is worse than baseline."
293-
)
294-
out.append("")
323+
# ---- brief per-benchmark summary --------------------------------------
324+
if rows:
325+
# Stable order: regressions first, then improvements, then noise.
326+
# Within each bucket, biggest |Δ| first.
327+
def bucket(r: Row) -> int:
328+
if any(d.regression(threshold) for d in r.deltas):
329+
return 0
330+
if any(d.improvement(threshold) for d in r.deltas):
331+
return 1
332+
return 2
333+
334+
rows = sorted(rows, key=lambda r: (bucket(r), -r.sort_key()))
295335

336+
for r in rows:
337+
bench, _ = r.key
338+
disc = discriminators.get(r.key, "")
339+
b = bucket(r)
340+
icon = "❌" if b == 0 else "✅"
341+
342+
# In the brief view, only mention metrics that actually crossed
343+
# the threshold — keeps noisy rows to a single line.
344+
bits: List[str] = []
345+
for d in r.deltas:
346+
if d.delta_pct is None:
347+
continue
348+
if d.regression(threshold):
349+
bits.append(f"**{d.metric.label} {fmt_delta(d.delta_pct)}**")
350+
elif d.improvement(threshold):
351+
bits.append(f"{d.metric.label} {fmt_delta(d.delta_pct)}")
352+
353+
line = f"- {icon} `{short_bench(bench)}`"
354+
if disc:
355+
line += f" `[{disc}]`"
356+
if bits:
357+
line += " — " + ", ".join(bits)
358+
out.append(line)
359+
out.append("")
360+
else:
361+
out.append("_No benchmarks matched between baseline and PR._")
362+
out.append("")
363+
364+
# ---- detailed table (collapsed) ---------------------------------------
296365
if rows:
366+
out.append(
367+
f"<details><summary>Detailed metrics "
368+
f"(threshold ±{threshold:g}%, Time = <code>primaryMetric.score</code> from <code>SampleTime</code>, "
369+
f"Alloc/op = <code>{ALLOC_NORM_KEY}</code>; positive Δ% = worse than baseline)"
370+
"</summary>"
371+
)
372+
out.append("")
297373
header = "| Benchmark | Params | " + " | ".join(m.label for m in METRICS) + " | Status |"
298374
sep = "|---|---|" + "|".join(["---"] * len(METRICS)) + "|---|"
299375
out.append(header)
@@ -305,8 +381,7 @@ def build_markdown(
305381
f"| `{short_bench(bench)}` | {params or '—'} | {cells} | {r.status(threshold)} |"
306382
)
307383
out.append("")
308-
else:
309-
out.append("_No benchmarks matched between baseline and PR._")
384+
out.append("</details>")
310385
out.append("")
311386

312387
if only_current:

0 commit comments

Comments
 (0)