3030import json
3131import os
3232import sys
33+ from collections import defaultdict
3334from dataclasses import dataclass
3435from typing import Any , Callable , Dict , List , Optional , Tuple
3536
@@ -247,6 +248,35 @@ def build_rows(
247248# ---------------------------------------------------------------------------
248249
249250
def compute_discriminators(
    rows: List[Row], current: Dict[Key, Dict[str, Any]]
) -> Dict[Key, str]:
    """Map each row's key to a label built from its distinguishing params.

    Rows are grouped by the benchmark part of their key.  Inside a group, a
    param counts as distinguishing when its value (``None`` if a row lacks
    it) is not the same for every row of the group.  A row's label lists its
    own distinguishing params as ``name=value`` pairs, ordered by param name
    and joined with ``", "``.

    A benchmark represented by a single row always gets the empty string,
    since there is nothing to tell apart.  Params are read from
    ``current[key]["params"]``; a missing or falsy entry is treated as
    "no params".
    """
    # Group rows by benchmark name, keeping first-seen order of both the
    # benchmarks and the rows within each benchmark.
    grouped: Dict[str, List[Row]] = {}
    for row in rows:
        bench_name, _ = row.key
        grouped.setdefault(bench_name, []).append(row)

    labels: Dict[Key, str] = {}
    for members in grouped.values():
        if len(members) < 2:
            labels[members[0].key] = ""
            continue

        # One params dict per row, aligned with ``members``.
        per_row_params = []
        for row in members:
            entry = current.get(row.key) or {}
            per_row_params.append(entry.get("params") or {})

        # Every param name used by at least one row of the group.
        names = set()
        for params in per_row_params:
            names.update(params)

        # Keep only the names whose value is not uniform across the group.
        differing = []
        for name in sorted(names):
            distinct = {params.get(name) for params in per_row_params}
            if len(distinct) > 1:
                differing.append(name)

        for row, params in zip(members, per_row_params):
            parts = [
                f"{name}={params[name]}" for name in differing if name in params
            ]
            labels[row.key] = ", ".join(parts)
    return labels
278+
279+
250280def build_markdown (
251281 rows : List [Row ],
252282 only_current : List [Key ],
@@ -266,14 +296,19 @@ def build_markdown(
266296 if not any (d .regression (threshold ) for d in r .deltas )
267297 and any (d .improvement (threshold ) for d in r .deltas )
268298 )
299+ discriminators = compute_discriminators (rows , current )
269300
270301 out : List [str ] = ["<!-- jmh-benchmark-comparison -->" ]
271302 if regressions :
272- out .append (f"## ❌ JMH benchmark comparison — { regressions } regression(s) over { threshold :g} %" )
303+ out .append (
304+ f"## ❌ JMH benchmark comparison — { regressions } regression(s) over { threshold :g} %"
305+ )
273306 elif improvements :
274- out .append (f"## ✅ JMH benchmark comparison — { improvements } improvement(s) over { threshold :g} %" )
307+ out .append (
308+ f"## ✅ JMH benchmark comparison — no regressions, { improvements } improvement(s) over { threshold :g} %"
309+ )
275310 else :
276- out .append (f"## JMH benchmark comparison — no changes over { threshold :g} %" )
311+ out .append (f"## ✅ JMH benchmark comparison — no changes over { threshold :g} %" )
277312 out .append ("" )
278313
279314 if repo and baseline_run_id and current_run_id :
@@ -285,15 +320,56 @@ def build_markdown(
285320 )
286321 out .append ("" )
287322
288- out .append (
289- f"Threshold: **±{ threshold :g} %**. "
290- f"Metrics: **Time** (`primaryMetric.score`, `SampleTime` — proxy for CPU work) and "
291- f"**Alloc/op** (`{ ALLOC_NORM_KEY } `, GC allocations per op — memory pressure). "
292- "Both are lower-is-better, so a positive Δ% means the PR is worse than baseline."
293- )
294- out .append ("" )
323+ # ---- brief per-benchmark summary --------------------------------------
324+ if rows :
325+ # Stable order: regressions first, then improvements, then noise.
326+ # Within each bucket, biggest |Δ| first.
327+ def bucket (r : Row ) -> int :
328+ if any (d .regression (threshold ) for d in r .deltas ):
329+ return 0
330+ if any (d .improvement (threshold ) for d in r .deltas ):
331+ return 1
332+ return 2
333+
334+ rows = sorted (rows , key = lambda r : (bucket (r ), - r .sort_key ()))
295335
336+ for r in rows :
337+ bench , _ = r .key
338+ disc = discriminators .get (r .key , "" )
339+ b = bucket (r )
340+ icon = "❌" if b == 0 else "✅"
341+
342+ # In the brief view, only mention metrics that actually crossed
343+ # the threshold — keeps noisy rows to a single line.
344+ bits : List [str ] = []
345+ for d in r .deltas :
346+ if d .delta_pct is None :
347+ continue
348+ if d .regression (threshold ):
349+ bits .append (f"**{ d .metric .label } { fmt_delta (d .delta_pct )} **" )
350+ elif d .improvement (threshold ):
351+ bits .append (f"{ d .metric .label } { fmt_delta (d .delta_pct )} " )
352+
353+ line = f"- { icon } `{ short_bench (bench )} `"
354+ if disc :
355+ line += f" `[{ disc } ]`"
356+ if bits :
357+ line += " — " + ", " .join (bits )
358+ out .append (line )
359+ out .append ("" )
360+ else :
361+ out .append ("_No benchmarks matched between baseline and PR._" )
362+ out .append ("" )
363+
364+ # ---- detailed table (collapsed) ---------------------------------------
296365 if rows :
366+ out .append (
367+ f"<details><summary>Detailed metrics "
368+ f"(threshold ±{ threshold :g} %, Time = <code>primaryMetric.score</code> from <code>SampleTime</code>, "
369+ f"Alloc/op = <code>{ ALLOC_NORM_KEY } </code>; positive Δ% = worse than baseline)"
370+ "</summary>"
371+ )
372+ out .append ("" )
297373 header = "| Benchmark | Params | " + " | " .join (m .label for m in METRICS ) + " | Status |"
298374 sep = "|---|---|" + "|" .join (["---" ] * len (METRICS )) + "|---|"
299375 out .append (header )
@@ -305,8 +381,7 @@ def build_markdown(
305381 f"| `{ short_bench (bench )} ` | { params or '—' } | { cells } | { r .status (threshold )} |"
306382 )
307383 out .append ("" )
308- else :
309- out .append ("_No benchmarks matched between baseline and PR._" )
384+ out .append ("</details>" )
310385 out .append ("" )
311386
312387 if only_current :
0 commit comments