reorganized tables and added test data to support test_compare_jmh.py script

chernser · chernser · commit 5d8be1fa7697 · 2026-05-12T10:00:11.000-07:00
diff --git a/.github/scripts/compare-jmh.py b/.github/scripts/compare-jmh.py
@@ -317,6 +317,70 @@ def compute_discriminators(
     return out
 
 
+# Visual markers used in rendered output. Arrows show which way performance
+# moved: up = improvement (less time / less memory per op), down = regression.
+ARROW_REGRESS = "\u2b07\ufe0f"  # ⬇️
+ARROW_IMPROVE = "\u2b06\ufe0f"  # ⬆️
+ARROW_NOISE = "\u2796"          # ➖
+
+DOT_REGRESS = "\U0001f534"      # 🔴
+DOT_IMPROVE = "\U0001f7e2"      # 🟢
+
+
+def _row_arrow(r: Row, threshold: float) -> str:
+    if any(d.regression(threshold) for d in r.deltas):
+        return ARROW_REGRESS
+    if any(d.improvement(threshold) for d in r.deltas):
+        return ARROW_IMPROVE
+    return ARROW_NOISE
+
+
+def _short_delta(d: MetricDelta, threshold: float) -> str:
+    """One-line metric delta for the brief bullet list.
+
+    Returns "" for noise (caller drops it). Regressions are 🔴-marked and
+    bold so they stand out on a packed PR comment; improvements are
+    🟢-marked but unbolded.
+    """
+    if d.delta_pct is None:
+        return ""
+    label = d.metric.label
+    delta = fmt_delta(d.delta_pct)
+    if d.regression(threshold):
+        return f"{DOT_REGRESS} **{label} {delta}**"
+    if d.improvement(threshold):
+        return f"{DOT_IMPROVE} {label} {delta}"
+    return ""
+
+
+def _stats_cell(r: Row, threshold: float) -> str:
+    """Render the joined Stats cell for one row in the detail table.
+
+    Each metric occupies a `<br>`-separated line. Regressions are 🔴-tagged
+    with a bold label and bold delta; improvements are 🟢-tagged with a bold
+    label only. Metrics whose delta cannot be computed are rendered unmarked
+    with a "(—)" delta so the cell still shows which dimension is missing.
+    """
+    lines: List[str] = []
+    for d in r.deltas:
+        label = d.metric.label
+        if d.delta_pct is None:
+            base = fmt_score(d.baseline, d.baseline_err, d.unit)
+            curr = fmt_score(d.current, d.current_err, d.unit)
+            lines.append(f"{label} {base} → {curr} (—)")
+            continue
+        base = fmt_score(d.baseline, d.baseline_err, d.unit)
+        curr = fmt_score(d.current, d.current_err, d.unit)
+        delta = fmt_delta(d.delta_pct)
+        if d.regression(threshold):
+            lines.append(f"{DOT_REGRESS} **{label}** {base} → {curr} (**{delta}**)")
+        elif d.improvement(threshold):
+            lines.append(f"{DOT_IMPROVE} **{label}** {base} → {curr} ({delta})")
+        else:
+            lines.append(f"{label} {base} → {curr} ({delta})")
+    return "<br>".join(lines)
+
+
 def build_markdown(
     rows: List[Row],
     only_current: List[Key],
@@ -341,14 +405,14 @@ def build_markdown(
     out: List[str] = ["<!-- jmh-benchmark-comparison -->"]
     if regressions:
         out.append(
-            f"## ❌ JMH benchmark comparison — {regressions} regression(s) over {threshold:g}%"
+            f"## {DOT_REGRESS} JMH benchmark comparison — {regressions} regression(s) over {threshold:g}%"
         )
     elif improvements:
         out.append(
-            f"## ✅ JMH benchmark comparison — no regressions, {improvements} improvement(s) over {threshold:g}%"
+            f"## {DOT_IMPROVE} JMH benchmark comparison — no regressions, {improvements} improvement(s) over {threshold:g}%"
         )
     else:
-        out.append(f"## ✅ JMH benchmark comparison — no changes over {threshold:g}%")
+        out.append(f"## {DOT_IMPROVE} JMH benchmark comparison — no changes over {threshold:g}%")
     out.append("")
 
     if repo and baseline_run_id and current_run_id:
@@ -376,21 +440,13 @@ def bucket(r: Row) -> int:
         for r in rows:
             bench, _ = r.key
             disc = discriminators.get(r.key, "")
-            b = bucket(r)
-            icon = "❌" if b == 0 else "✅"
-
-            # In the brief view, only mention metrics that actually crossed
-            # the threshold — keeps noisy rows to a single line.
-            bits: List[str] = []
-            for d in r.deltas:
-                if d.delta_pct is None:
-                    continue
-                if d.regression(threshold):
-                    bits.append(f"**{d.metric.label} {fmt_delta(d.delta_pct)}**")
-                elif d.improvement(threshold):
-                    bits.append(f"{d.metric.label} {fmt_delta(d.delta_pct)}")
-
-            line = f"- {icon} `{short_bench(bench)}`"
+            arrow = _row_arrow(r, threshold)
+
+            # Only mention metrics that actually crossed the threshold
+            # in the brief view — keeps noisy rows to a single line.
+            bits = [s for s in (_short_delta(d, threshold) for d in r.deltas) if s]
+
+            line = f"- {arrow} `{short_bench(bench)}`"
             if disc:
                 line += f" `[{disc}]`"
             if bits:
@@ -410,16 +466,14 @@ def bucket(r: Row) -> int:
             "</summary>"
         )
         out.append("")
-        header = "| Benchmark | Params | " + " | ".join(m.label for m in METRICS) + " | Status |"
-        sep = "|---|---|" + "|".join(["---"] * len(METRICS)) + "|---|"
-        out.append(header)
-        out.append(sep)
+        out.append("| Benchmark | Stats |")
+        out.append("|---|---|")
         for r in rows:
             bench, params = r.key
-            cells = " | ".join(d.cell() for d in r.deltas)
-            out.append(
-                f"| `{short_bench(bench)}` | {params or '—'} | {cells} | {r.status(threshold)} |"
-            )
+            bench_cell = f"`{short_bench(bench)}`"
+            if params:
+                bench_cell += f"<br><sub>{params}</sub>"
+            out.append(f"| {bench_cell} | {_stats_cell(r, threshold)} |")
         out.append("")
         out.append("</details>")
         out.append("")
diff --git a/.github/scripts/test_compare_jmh.py b/.github/scripts/test_compare_jmh.py
@@ -0,0 +1,217 @@
+#!/usr/bin/env python3
+"""End-to-end tests for `compare-jmh.py`.
+
+Runs `compare-jmh.py` against every scenario in `test_data/` via
+subprocess and asserts on both the rendered markdown and the
+`--summary-output` counters. Designed to be run locally
+(`python3 .github/scripts/test_compare_jmh.py`) and from CI without
+extra dependencies — only the standard library is used.
+"""
+
+from __future__ import annotations
+
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+import unittest
+from pathlib import Path
+from typing import Dict, List
+
+HERE = Path(__file__).resolve().parent
+SCRIPT = HERE / "compare-jmh.py"
+DATA = HERE / "test_data"
+
+
+def _parse_summary(path: Path) -> Dict[str, str]:
+    out: Dict[str, str] = {}
+    for line in path.read_text(encoding="utf-8").splitlines():
+        if "=" in line:
+            k, v = line.split("=", 1)
+            out[k.strip()] = v.strip()
+    return out
+
+
+def _run_compare(
+    case: str,
+    *,
+    threshold: float = 10.0,
+    extra_args: List[str] | None = None,
+) -> Dict[str, object]:
+    """Invoke compare-jmh.py against a fixture; return its outputs.
+
+    Returns a dict with: returncode, stdout, stderr, markdown, summary.
+    """
+    case_dir = DATA / case
+    assert case_dir.is_dir(), f"missing test fixture: {case_dir}"
+
+    tmp = Path(tempfile.mkdtemp(prefix=f"cmp-jmh-{case}-"))
+    try:
+        out_md = tmp / "out.md"
+        summary = tmp / "summary.env"
+        cmd = [
+            sys.executable,
+            str(SCRIPT),
+            "--baseline",
+            str(case_dir / "baseline"),
+            "--current",
+            str(case_dir / "current"),
+            "--threshold-pct",
+            str(threshold),
+            "--output",
+            str(out_md),
+            "--summary-output",
+            str(summary),
+        ]
+        if extra_args:
+            cmd.extend(extra_args)
+        proc = subprocess.run(cmd, capture_output=True, text=True)
+        result: Dict[str, object] = {
+            "returncode": proc.returncode,
+            "stdout": proc.stdout,
+            "stderr": proc.stderr,
+            "markdown": out_md.read_text(encoding="utf-8") if out_md.exists() else "",
+            "summary": _parse_summary(summary) if summary.exists() else {},
+        }
+        return result
+    finally:
+        shutil.rmtree(tmp, ignore_errors=True)
+
+
+# Pre-computed marker strings — keep in sync with compare-jmh.py.
+ARROW_REGRESS = "\u2b07\ufe0f"
+ARROW_IMPROVE = "\u2b06\ufe0f"
+ARROW_NOISE = "\u2796"
+DOT_REGRESS = "\U0001f534"
+DOT_IMPROVE = "\U0001f7e2"
+
+
+class CompareJmhTest(unittest.TestCase):
+    """Each case in test_data/ has a dedicated test asserting on the
+    headline shape, the bullet markers, and the summary counters."""
+
+    def test_all_improvements(self) -> None:
+        r = _run_compare("all_improvements")
+        self.assertEqual(r["returncode"], 0)
+        md = r["markdown"]
+        self.assertIn("no regressions", md)
+        self.assertIn(ARROW_IMPROVE, md)
+        self.assertIn(DOT_IMPROVE, md)
+        self.assertNotIn(ARROW_REGRESS, md)
+        self.assertNotIn(DOT_REGRESS, md)
+        s = r["summary"]
+        self.assertEqual(s.get("regressions"), "0")
+        self.assertEqual(s.get("improvements"), "2")
+        self.assertEqual(s.get("matched"), "2")
+
+    def test_all_regressions(self) -> None:
+        r = _run_compare("all_regressions")
+        self.assertEqual(r["returncode"], 0)
+        md = r["markdown"]
+        self.assertIn("regression(s) over", md)
+        self.assertIn(ARROW_REGRESS, md)
+        self.assertIn(DOT_REGRESS, md)
+        # Bold markdown around the metric label.
+        self.assertIn("**Time", md)
+        s = r["summary"]
+        self.assertEqual(s.get("regressions"), "2")
+        # No row "purely improved" → improvements should be 0.
+        self.assertEqual(s.get("improvements"), "0")
+
+    def test_mixed(self) -> None:
+        r = _run_compare("mixed")
+        self.assertEqual(r["returncode"], 0)
+        md = r["markdown"]
+        # Both directions present.
+        self.assertIn(ARROW_REGRESS, md)
+        self.assertIn(ARROW_IMPROVE, md)
+        self.assertIn(DOT_REGRESS, md)
+        self.assertIn(DOT_IMPROVE, md)
+        # Discriminator suffix `[limit=…]` appears for the two
+        # `queryV2` variants of the same benchmark.
+        self.assertIn("[limit=10000]", md)
+        self.assertIn("[limit=100000]", md)
+        # The unique-named benchmark must NOT get a discriminator.
+        self.assertNotIn("`JDBCQuery.selectJDBCV2` `[", md)
+        s = r["summary"]
+        self.assertEqual(s.get("matched"), "4")
+        self.assertGreater(int(s.get("regressions", "0")), 0)
+
+    def test_no_alloc(self) -> None:
+        # No `gc.alloc.rate.norm` present anywhere → script must
+        # still compare Time and emit a diagnostic on stderr.
+        r = _run_compare("no_alloc")
+        self.assertEqual(r["returncode"], 0)
+        self.assertIn("no `gc.alloc.rate.norm`", r["stderr"])
+        md = r["markdown"]
+        # Time regression should still be detected and 🔴-tagged…
+        self.assertIn(DOT_REGRESS, md)
+        # …but no Alloc/op metric is ever 🟢/🔴 because we have no
+        # baseline/current data for it.
+        self.assertIn("Alloc/op", md)
+        # The detail-table cell should fall back to "(—)" for alloc.
+        self.assertIn("Alloc/op — → — (—)", md)
+        s = r["summary"]
+        self.assertEqual(s.get("matched"), "2")
+
+    def test_noise_only(self) -> None:
+        r = _run_compare("noise_only")
+        self.assertEqual(r["returncode"], 0)
+        md = r["markdown"]
+        self.assertIn("no changes over 10%", md)
+        # Every row should be on the noise arrow…
+        self.assertIn(ARROW_NOISE, md)
+        # …and there should be no red dots anywhere.
+        self.assertNotIn(DOT_REGRESS, md)
+        # 🟢 *is* in the header for the OK case, so don't assert on
+        # DOT_IMPROVE alone.
+        s = r["summary"]
+        self.assertEqual(s.get("regressions"), "0")
+        self.assertEqual(s.get("improvements"), "0")
+
+    def test_only_in_pr(self) -> None:
+        r = _run_compare("only_in_pr")
+        self.assertEqual(r["returncode"], 0)
+        md = r["markdown"]
+        self.assertIn("Benchmarks only in PR run", md)
+        self.assertIn("QueryClient.queryV3New", md)
+        s = r["summary"]
+        # one shared row matched.
+        self.assertEqual(s.get("matched"), "1")
+
+    def test_only_in_baseline(self) -> None:
+        r = _run_compare("only_in_baseline")
+        self.assertEqual(r["returncode"], 0)
+        md = r["markdown"]
+        self.assertIn("Benchmarks only in baseline run", md)
+        self.assertIn("QueryClient.queryV0Removed", md)
+        s = r["summary"]
+        self.assertEqual(s.get("matched"), "1")
+
+    def test_empty_intersection(self) -> None:
+        r = _run_compare("empty_intersection")
+        self.assertEqual(r["returncode"], 0)
+        md = r["markdown"]
+        self.assertIn("_No benchmarks matched between baseline and PR._", md)
+        # Both unique-side sections still appear as <details>.
+        self.assertIn("Benchmarks only in PR run", md)
+        self.assertIn("Benchmarks only in baseline run", md)
+        s = r["summary"]
+        self.assertEqual(s.get("matched"), "0")
+        self.assertEqual(s.get("regressions"), "0")
+        self.assertEqual(s.get("improvements"), "0")
+
+    def test_threshold_knob(self) -> None:
+        # The same fixture flips from "regression" to "ok" when the
+        # threshold is widened past the largest delta.
+        strict = _run_compare("all_regressions", threshold=10.0)
+        lenient = _run_compare("all_regressions", threshold=200.0)
+        self.assertGreater(int(strict["summary"]["regressions"]), 0)
+        self.assertEqual(lenient["summary"]["regressions"], "0")
+        self.assertIn("no changes", lenient["markdown"])
+
+
+if __name__ == "__main__":
+    # `-v` prints each scenario name so failures are obvious in CI logs.
+    unittest.main(verbosity=2)
diff --git a/.github/scripts/test_data/README.md b/.github/scripts/test_data/README.md
@@ -0,0 +1,33 @@
+# `compare-jmh.py` test fixtures
+
+Each subdirectory is a self-contained scenario for `compare-jmh.py`. The
+layout is always:
+
+```
+<case>/
+  baseline/jmh-results-baseline.json
+  current/jmh-results-current.json
+```
+
+`compare-jmh.py` discovers result files by globbing for
+`jmh-results-*.json` under the `--baseline` and `--current` directories,
+so any filename starting with `jmh-results-` works.
+
+JSON records mirror the structure produced by JMH 1.37's
+`ResultFormatType.JSON`: an array of objects with `benchmark`, `params`,
+`primaryMetric.{score,scoreError,scoreUnit}`, and optionally
+`secondaryMetrics["gc.alloc.rate.norm"]`.
+
+| Case | What it covers |
+|---|---|
+| `all_improvements` | Multiple benchmarks where both Time and Alloc/op drop by well more than the threshold; report should be all ⬆️ / 🟢 with no failure. |
+| `all_regressions` | Multiple benchmarks where Time and/or Alloc/op rise by well more than the threshold; report should be ⬇️ / 🔴, script flags every row as `REGRESSION`, summary `regressions > 0`. |
+| `mixed` | A blend of regressions, improvements, and within-noise rows including multiple variants of the same benchmark — verifies the bucket ordering and the param-discriminator (`[limit=…]`) logic. |
+| `no_alloc` | Records with no `gc.alloc.rate.norm` at all; verifies the script falls back to a Time-only comparison (Time changes are still `🔴`/`🟢`-marked), renders Alloc/op as `— → — (—)` without a marker, and prints the diagnostic warning on stderr. |
+| `noise_only` | All deltas are inside ±10%; report should have the 🟢 "no changes" header and every row should carry the ➖ neutral arrow. |
+| `only_in_pr` | A benchmark appears in `current` but not in `baseline`; verifies the "Benchmarks only in PR run" `<details>` block. |
+| `only_in_baseline` | The mirror case — verifies the "Benchmarks only in baseline run" block. |
+| `empty_intersection` | `baseline` and `current` contain different sets of benchmarks so no rows are matched; verifies the "_No benchmarks matched_" path. |
+
+The companion runner `test_compare_jmh.py` exercises every case and
+checks both the rendered markdown and the `--summary-output` counters.