1111 mean sampled latency per op; it's our best available proxy for CPU
1212 work since no dedicated CPU profiler is configured in
1313 `BenchmarkRunner`.
14- * `Alloc/op` — `secondaryMetrics["· gc.alloc.rate.norm"]`, populated by
14+ * `Alloc/op` — `secondaryMetrics["gc.alloc.rate.norm"]`, populated by
1515 JMH's `GCProfiler`. This is bytes allocated per benchmark op and is
16- the standard, low-noise JMH memory metric.
16+ the standard, low-noise JMH memory metric. (Note: JMH 1.37 stores the
17+ key with no prefix; some visualizer tools display it as
18+ `·gc.alloc.rate.norm` but the raw JSON does not contain that dot.)
1719
1820Both metrics are "lower is better", so a positive delta indicates the
1921PR is worse than the baseline. A run is considered failed when **any**
3032import json
3133import os
3234import sys
35+ from collections import defaultdict
3336from dataclasses import dataclass
3437from typing import Any , Callable , Dict , List , Optional , Tuple
3538
4144# ---------------------------------------------------------------------------
4245
4346# JMH's `GCProfiler` reports allocation rate normalised per op under this
44- # secondary metric key (the leading char is U+00B7 MIDDLE DOT, not a regular
45- # dot — that's JMH's convention for profiler-emitted metrics).
46- ALLOC_NORM_KEY = "\u00b7 gc.alloc.rate.norm"
47+ # secondary metric key. Verified against jmh-core 1.37 sources: the key
48+ # is the literal `gc.alloc.rate.norm` with no prefix. Some visualizers
49+ # render it as `·gc.alloc.rate.norm`, which is a display convention, not
50+ # what JMH writes to JSON.
51+ ALLOC_NORM_KEY = "gc.alloc.rate.norm"
52+
53+ # Tolerated prefix-bearing variants we'll fall back to if the canonical
54+ # key ever moves. Lets a future JMH (or a non-Oracle JVM profiler that
55+ # decorates the label) keep working without us being aware.
56+ ALLOC_NORM_VARIANTS = (
57+ ALLOC_NORM_KEY ,
58+ "\u00b7 gc.alloc.rate.norm" , # pre-emptive middle-dot variant
59+ "+gc.alloc.rate.norm" ,
60+ )
61+
62+ # Set to True the first time we fail to find any allocation data; used
63+ # to emit a one-time diagnostic listing the secondary keys present so
64+ # the cause is obvious in workflow logs.
65+ _alloc_warn_printed = False
4766
4867
4968@dataclass (frozen = True )
@@ -69,13 +88,35 @@ def _secondary(record: Dict[str, Any], key: str) -> Optional[Dict[str, Any]]:
6988 return val if isinstance (val , dict ) else None
7089
7190
91+ def _alloc (record : Dict [str , Any ]) -> Optional [Dict [str , Any ]]:
92+ """Pull the GC allocation-per-op metric, tolerating prefix variants.
93+
94+ Falls back to a suffix match so even an unrecognised prefix (e.g. a
95+ third-party JMH profiler that decorates the label) still works."""
96+ global _alloc_warn_printed
97+ sm = record .get ("secondaryMetrics" ) or {}
98+ for k in ALLOC_NORM_VARIANTS :
99+ v = sm .get (k )
100+ if isinstance (v , dict ):
101+ return v
102+ # Last-resort suffix match.
103+ for k , v in sm .items ():
104+ if isinstance (v , dict ) and k .lower ().endswith ("gc.alloc.rate.norm" ):
105+ return v
106+ if sm and not _alloc_warn_printed :
107+ print (
108+ "warn: no `gc.alloc.rate.norm` (or prefix-variant) found in "
109+ "secondaryMetrics. Available keys: "
110+ + ", " .join (sorted (sm .keys ())),
111+ file = sys .stderr ,
112+ )
113+ _alloc_warn_printed = True
114+ return None
115+
116+
72117METRICS : List [Metric ] = [
73118 Metric (id = "time" , label = "Time" , extract = _primary ),
74- Metric (
75- id = "alloc" ,
76- label = "Alloc/op" ,
77- extract = lambda r : _secondary (r , ALLOC_NORM_KEY ),
78- ),
119+ Metric (id = "alloc" , label = "Alloc/op" , extract = _alloc ),
79120]
80121
81122
@@ -247,6 +288,35 @@ def build_rows(
247288# ---------------------------------------------------------------------------
248289
249290
291+ def compute_discriminators (
292+ rows : List [Row ], current : Dict [Key , Dict [str , Any ]]
293+ ) -> Dict [Key , str ]:
294+ """For each row, return a short string of only the params that differ
295+ between rows sharing the same fully-qualified benchmark name.
296+
297+ When a benchmark is run with only one combination of params, its
298+ discriminator is the empty string (no need to disambiguate).
299+ """
300+ by_bench : Dict [str , List [Row ]] = defaultdict (list )
301+ for r in rows :
302+ bench , _ = r .key
303+ by_bench [bench ].append (r )
304+
305+ out : Dict [Key , str ] = {}
306+ for bench , group in by_bench .items ():
307+ if len (group ) <= 1 :
308+ out [group [0 ].key ] = ""
309+ continue
310+ params_dicts = [(current .get (r .key ) or {}).get ("params" ) or {} for r in group ]
311+ all_keys = sorted ({k for p in params_dicts for k in p .keys ()})
312+ varying = [
313+ k for k in all_keys if len ({p .get (k ) for p in params_dicts }) > 1
314+ ]
315+ for r , p in zip (group , params_dicts ):
316+ out [r .key ] = ", " .join (f"{ k } ={ p [k ]} " for k in varying if k in p )
317+ return out
318+
319+
250320def build_markdown (
251321 rows : List [Row ],
252322 only_current : List [Key ],
@@ -266,14 +336,19 @@ def build_markdown(
266336 if not any (d .regression (threshold ) for d in r .deltas )
267337 and any (d .improvement (threshold ) for d in r .deltas )
268338 )
339+ discriminators = compute_discriminators (rows , current )
269340
270341 out : List [str ] = ["<!-- jmh-benchmark-comparison -->" ]
271342 if regressions :
272- out .append (f"## ❌ JMH benchmark comparison — { regressions } regression(s) over { threshold :g} %" )
343+ out .append (
344+ f"## ❌ JMH benchmark comparison — { regressions } regression(s) over { threshold :g} %"
345+ )
273346 elif improvements :
274- out .append (f"## ✅ JMH benchmark comparison — { improvements } improvement(s) over { threshold :g} %" )
347+ out .append (
348+ f"## ✅ JMH benchmark comparison — no regressions, { improvements } improvement(s) over { threshold :g} %"
349+ )
275350 else :
276- out .append (f"## JMH benchmark comparison — no changes over { threshold :g} %" )
351+ out .append (f"## ✅ JMH benchmark comparison — no changes over { threshold :g} %" )
277352 out .append ("" )
278353
279354 if repo and baseline_run_id and current_run_id :
@@ -285,15 +360,56 @@ def build_markdown(
285360 )
286361 out .append ("" )
287362
288- out .append (
289- f"Threshold: **±{ threshold :g} %**. "
290- f"Metrics: **Time** (`primaryMetric.score`, `SampleTime` — proxy for CPU work) and "
291- f"**Alloc/op** (`{ ALLOC_NORM_KEY } `, GC allocations per op — memory pressure). "
292- "Both are lower-is-better, so a positive Δ% means the PR is worse than baseline."
293- )
294- out .append ("" )
363+ # ---- brief per-benchmark summary --------------------------------------
364+ if rows :
365+ # Stable order: regressions first, then improvements, then noise.
366+ # Within each bucket, biggest |Δ| first.
367+ def bucket (r : Row ) -> int :
368+ if any (d .regression (threshold ) for d in r .deltas ):
369+ return 0
370+ if any (d .improvement (threshold ) for d in r .deltas ):
371+ return 1
372+ return 2
373+
374+ rows = sorted (rows , key = lambda r : (bucket (r ), - r .sort_key ()))
375+
376+ for r in rows :
377+ bench , _ = r .key
378+ disc = discriminators .get (r .key , "" )
379+ b = bucket (r )
380+ icon = "❌" if b == 0 else "✅"
381+
382+ # In the brief view, only mention metrics that actually crossed
383+ # the threshold — keeps noisy rows to a single line.
384+ bits : List [str ] = []
385+ for d in r .deltas :
386+ if d .delta_pct is None :
387+ continue
388+ if d .regression (threshold ):
389+ bits .append (f"**{ d .metric .label } { fmt_delta (d .delta_pct )} **" )
390+ elif d .improvement (threshold ):
391+ bits .append (f"{ d .metric .label } { fmt_delta (d .delta_pct )} " )
392+
393+ line = f"- { icon } `{ short_bench (bench )} `"
394+ if disc :
395+ line += f" `[{ disc } ]`"
396+ if bits :
397+ line += " — " + ", " .join (bits )
398+ out .append (line )
399+ out .append ("" )
400+ else :
401+ out .append ("_No benchmarks matched between baseline and PR._" )
402+ out .append ("" )
295403
404+ # ---- detailed table (collapsed) ---------------------------------------
296405 if rows :
406+ out .append (
407+ f"<details><summary>Detailed metrics "
408+ f"(threshold ±{ threshold :g} %, Time = <code>primaryMetric.score</code> from <code>SampleTime</code>, "
409+ f"Alloc/op = <code>{ ALLOC_NORM_KEY } </code>; positive Δ% = worse than baseline)"
410+ "</summary>"
411+ )
412+ out .append ("" )
297413 header = "| Benchmark | Params | " + " | " .join (m .label for m in METRICS ) + " | Status |"
298414 sep = "|---|---|" + "|" .join (["---" ] * len (METRICS )) + "|---|"
299415 out .append (header )
@@ -305,8 +421,7 @@ def build_markdown(
305421 f"| `{ short_bench (bench )} ` | { params or '—' } | { cells } | { r .status (threshold )} |"
306422 )
307423 out .append ("" )
308- else :
309- out .append ("_No benchmarks matched between baseline and PR._" )
424+ out .append ("</details>" )
310425 out .append ("" )
311426
312427 if only_current :
@@ -316,7 +431,7 @@ def build_markdown(
316431 bench , params = k
317432 rec = current [k ]
318433 time_d = _primary (rec ) or {}
319- alloc_d = _secondary (rec , ALLOC_NORM_KEY ) or {}
434+ alloc_d = _alloc (rec ) or {}
320435 out .append (
321436 f"- `{ short_bench (bench )} ` ({ params or '—' } ): "
322437 f"time={ fmt_score (_float (time_d , 'score' ), _float (time_d , 'scoreError' ), time_d .get ('scoreUnit' , '' ))} , "
0 commit comments