Skip to content

Commit c6b95e1

Browse files
committed
Merge branch 'main' into 04/16/26/jdbc_override_settings
2 parents 37880d4 + e9a4219 commit c6b95e1

36 files changed

Lines changed: 5636 additions & 3360 deletions

File tree

.github/scripts/compare-jmh.py

Lines changed: 588 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
#!/usr/bin/env python3
2+
"""End-to-end tests for `compare-jmh.py`.
3+
4+
Runs `compare-jmh.py` against every scenario in `test_data/` via
5+
subprocess and asserts on both the rendered markdown and the
6+
`--summary-output` counters. Designed to be run locally
7+
(`python3 .github/scripts/test_compare_jmh.py`) and from CI without
8+
extra dependencies — only the standard library is used.
9+
"""
10+
11+
from __future__ import annotations
12+
13+
import os
14+
import shutil
15+
import subprocess
16+
import sys
17+
import tempfile
18+
import unittest
19+
from pathlib import Path
20+
from typing import Dict, List
21+
22+
# Absolute directory that contains this test module. The script under test
# and its fixture tree are both located relative to it.
HERE = Path(__file__).resolve().parents[0]
SCRIPT = HERE.joinpath("compare-jmh.py")
DATA = HERE.joinpath("test_data")
25+
26+
27+
def _parse_summary(path: Path) -> Dict[str, str]:
    """Read a ``KEY=VALUE`` summary file into a dict.

    Lines without an ``=`` are ignored. Every other line is split at its
    first ``=`` and surrounding whitespace is stripped from both the key
    and the value. When a key repeats, the last occurrence wins.
    """
    text = path.read_text(encoding="utf-8")
    pairs = (
        entry.partition("=")
        for entry in text.splitlines()
        if "=" in entry
    )
    return {key.strip(): value.strip() for key, _, value in pairs}
34+
35+
36+
def _run_compare(
    case: str,
    *,
    threshold: float = 10.0,
    extra_args: List[str] | None = None,
) -> Dict[str, object]:
    """Run compare-jmh.py on one fixture directory and collect its outputs.

    The script is launched with the current interpreter as a subprocess,
    pointed at ``<case>/baseline`` and ``<case>/current`` under ``DATA``,
    and told to write its markdown report and summary file into a fresh
    temporary directory that is removed before returning.

    Args:
        case: Name of the fixture sub-directory under ``DATA``.
        threshold: Value passed to ``--threshold-pct``.
        extra_args: Additional command-line arguments appended verbatim.

    Returns:
        A dict with the keys ``returncode``, ``stdout``, ``stderr``,
        ``markdown`` (report text, or ``""`` if no report was written) and
        ``summary`` (parsed summary file, or ``{}`` if none was written).
    """
    fixture = DATA / case
    assert fixture.is_dir(), f"missing test fixture: {fixture}"

    workdir = Path(tempfile.mkdtemp(prefix=f"cmp-jmh-{case}-"))
    try:
        report_path = workdir / "out.md"
        summary_path = workdir / "summary.env"

        argv = [sys.executable, str(SCRIPT)]
        argv += ["--baseline", str(fixture / "baseline")]
        argv += ["--current", str(fixture / "current")]
        argv += ["--threshold-pct", str(threshold)]
        argv += ["--output", str(report_path)]
        argv += ["--summary-output", str(summary_path)]
        argv.extend(extra_args or [])

        completed = subprocess.run(argv, capture_output=True, text=True)

        # The output files must be read before the ``finally`` clause
        # deletes the temporary directory.
        if report_path.exists():
            markdown = report_path.read_text(encoding="utf-8")
        else:
            markdown = ""
        if summary_path.exists():
            counters = _parse_summary(summary_path)
        else:
            counters = {}

        outputs: Dict[str, object] = {
            "returncode": completed.returncode,
            "stdout": completed.stdout,
            "stderr": completed.stderr,
            "markdown": markdown,
            "summary": counters,
        }
        return outputs
    finally:
        # Best-effort cleanup: never let a failed delete mask a test result.
        shutil.rmtree(workdir, ignore_errors=True)
80+
81+
82+
# Marker glyphs the tests look for in the rendered report. They are
# duplicated here as literals, so keep them in sync with compare-jmh.py.

# Coloured dots.
DOT_REGRESS = "\U0001f534"  # 🔴
DOT_IMPROVE = "\U0001f7e2"  # 🟢

# Direction arrows.
ARROW_REGRESS = "\u2b07\ufe0f"  # ⬇️
ARROW_IMPROVE = "\u2b06\ufe0f"  # ⬆️
ARROW_NOISE = "\u2796"  # ➖
88+
89+
90+
class CompareJmhTest(unittest.TestCase):
    """End-to-end checks, one test per fixture directory in test_data/.

    Every test runs the script through ``_run_compare`` and asserts on the
    headline shape, the bullet markers, and the summary counters.
    """

    def test_all_improvements(self) -> None:
        r = _run_compare("all_improvements")
        self.assertEqual(r["returncode"], 0)
        md = r["markdown"]
        self.assertIn("no regressions", md)
        self.assertIn(ARROW_IMPROVE, md)
        self.assertIn(DOT_IMPROVE, md)
        self.assertNotIn(ARROW_REGRESS, md)
        self.assertNotIn(DOT_REGRESS, md)
        s = r["summary"]
        self.assertEqual(s.get("regressions"), "0")
        self.assertEqual(s.get("improvements"), "2")
        self.assertEqual(s.get("matched"), "2")

    def test_all_regressions(self) -> None:
        r = _run_compare("all_regressions")
        self.assertEqual(r["returncode"], 0)
        md = r["markdown"]
        self.assertIn("regression(s) over", md)
        self.assertIn(ARROW_REGRESS, md)
        self.assertIn(DOT_REGRESS, md)
        # Bold markdown around the metric label.
        self.assertIn("**Time", md)
        s = r["summary"]
        self.assertEqual(s.get("regressions"), "2")
        # No row "purely improved" → improvements should be 0.
        self.assertEqual(s.get("improvements"), "0")

    def test_mixed(self) -> None:
        r = _run_compare("mixed")
        self.assertEqual(r["returncode"], 0)
        md = r["markdown"]
        # Both directions present.
        self.assertIn(ARROW_REGRESS, md)
        self.assertIn(ARROW_IMPROVE, md)
        self.assertIn(DOT_REGRESS, md)
        self.assertIn(DOT_IMPROVE, md)
        # Discriminator suffix `[limit=…]` appears for the two
        # `queryV2` variants of the same benchmark.
        self.assertIn("[limit=10000]", md)
        self.assertIn("[limit=100000]", md)
        # The unique-named benchmark must NOT get a discriminator.
        self.assertNotIn("`JDBCQuery.selectJDBCV2` `[", md)
        s = r["summary"]
        self.assertEqual(s.get("matched"), "4")
        self.assertGreater(int(s.get("regressions", "0")), 0)

    def test_no_alloc(self) -> None:
        # No `gc.alloc.rate.norm` present anywhere → script must
        # still compare Time and emit a diagnostic on stderr.
        r = _run_compare("no_alloc")
        self.assertEqual(r["returncode"], 0)
        self.assertIn("no `gc.alloc.rate.norm`", r["stderr"])
        md = r["markdown"]
        # Time regression should still be detected and 🔴-tagged…
        self.assertIn(DOT_REGRESS, md)
        # …but no Alloc/op metric is ever 🟢/🔴 because we have no
        # baseline/current data for it.
        self.assertIn("Alloc/op", md)
        # The detail-table cell should fall back to "(—)" for alloc.
        self.assertIn("Alloc/op — → — (—)", md)
        s = r["summary"]
        self.assertEqual(s.get("matched"), "2")

    def test_noise_only(self) -> None:
        r = _run_compare("noise_only")
        self.assertEqual(r["returncode"], 0)
        md = r["markdown"]
        self.assertIn("no changes over 10%", md)
        # Every row should be on the noise arrow…
        self.assertIn(ARROW_NOISE, md)
        # …and there should be no red/green dots anywhere.
        self.assertNotIn(DOT_REGRESS, md)
        # 🟢 *is* in the header for the OK case, so don't assert on
        # DOT_IMPROVE alone.
        s = r["summary"]
        self.assertEqual(s.get("regressions"), "0")
        self.assertEqual(s.get("improvements"), "0")

    def test_only_in_pr(self) -> None:
        r = _run_compare("only_in_pr")
        self.assertEqual(r["returncode"], 0)
        md = r["markdown"]
        self.assertIn("Benchmarks only in PR run", md)
        self.assertIn("QueryClient.queryV3New", md)
        s = r["summary"]
        # one shared row matched.
        self.assertEqual(s.get("matched"), "1")

    def test_only_in_baseline(self) -> None:
        r = _run_compare("only_in_baseline")
        self.assertEqual(r["returncode"], 0)
        md = r["markdown"]
        self.assertIn("Benchmarks only in baseline run", md)
        self.assertIn("QueryClient.queryV0Removed", md)
        s = r["summary"]
        self.assertEqual(s.get("matched"), "1")

    def test_empty_intersection(self) -> None:
        r = _run_compare("empty_intersection")
        self.assertEqual(r["returncode"], 0)
        md = r["markdown"]
        self.assertIn("_No benchmarks matched between baseline and PR._", md)
        # Both unique-side sections still appear as <details>.
        self.assertIn("Benchmarks only in PR run", md)
        self.assertIn("Benchmarks only in baseline run", md)
        s = r["summary"]
        self.assertEqual(s.get("matched"), "0")
        self.assertEqual(s.get("regressions"), "0")
        self.assertEqual(s.get("improvements"), "0")

    def test_threshold_knob(self) -> None:
        # The same fixture flips from "regression" to "ok" when the
        # threshold is widened past the largest delta.
        strict = _run_compare("all_regressions", threshold=10.0)
        lenient = _run_compare("all_regressions", threshold=200.0)
        # Check the exit codes first, like every other test here, so a
        # crashed run is reported as such rather than as a KeyError on a
        # missing summary counter.
        self.assertEqual(strict["returncode"], 0)
        self.assertEqual(lenient["returncode"], 0)
        strict_summary = strict["summary"]
        lenient_summary = lenient["summary"]
        self.assertGreater(int(strict_summary.get("regressions", "0")), 0)
        self.assertEqual(lenient_summary.get("regressions"), "0")
        self.assertIn("no changes", lenient["markdown"])
213+
214+
215+
if __name__ == "__main__":
    # Verbosity 2 lists every scenario by name, so a failing case is easy
    # to spot in CI logs.
    unittest.main(verbosity=2)
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# `compare-jmh.py` test fixtures
2+
3+
Each subdirectory is a self-contained scenario for `compare-jmh.py`. The
4+
layout is always:
5+
6+
```
7+
<case>/
8+
baseline/jmh-results-baseline.json
9+
current/jmh-results-current.json
10+
```
11+
12+
`compare-jmh.py` discovers result files by globbing for
13+
`jmh-results-*.json` under the `--baseline` and `--current` directories,
14+
so any filename starting with `jmh-results-` works.
15+
16+
JSON records mirror the structure produced by JMH 1.37's
17+
`ResultFormatType.JSON`: an array of objects with `benchmark`, `params`,
18+
`primaryMetric.{score,scoreError,scoreUnit}`, and optionally
19+
`secondaryMetrics["gc.alloc.rate.norm"]`.
20+
21+
| Case | What it covers |
22+
|---|---|
23+
| `all_improvements` | Multiple benchmarks where both Time and Alloc/op drop by well more than the threshold; report should be all ⬆️ / 🟢 with no failure. |
24+
| `all_regressions` | Multiple benchmarks where Time and/or Alloc/op rise by well more than the threshold; report should be ⬇️ / 🔴, script flags every row as `REGRESSION`, summary `regressions > 0`. |
25+
| `mixed` | A blend of regressions, improvements, and within-noise rows including multiple variants of the same benchmark — verifies the bucket ordering and the param-discriminator (`[limit=…]`) logic. |
26+
| `no_alloc` | Records with no `gc.alloc.rate.norm` at all; verifies the script falls back to Time-only (Time regressions are still flagged `🔴`), doesn't render `🔴`/`🟢` markers for Alloc/op when the key is absent, and prints the diagnostic warning. |
27+
| `noise_only` | All deltas are inside ±10%; report should be the ✅ "no changes" header and every row should carry the ➖ neutral arrow. |
28+
| `only_in_pr` | A benchmark appears in `current` but not in `baseline`; verifies the "Benchmarks only in PR run" `<details>` block. |
29+
| `only_in_baseline` | The mirror case — verifies the "Benchmarks only in baseline run" block. |
30+
| `empty_intersection` | `baseline` and `current` contain different sets of benchmarks so no rows are matched; verifies the "_No benchmarks matched_" path. |
31+
32+
The companion runner `test_compare_jmh.py` exercises every case and
33+
checks both the rendered markdown and the `--summary-output` counters.
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
[
2+
{
3+
"benchmark": "com.clickhouse.benchmark.clients.QueryClient.queryV2",
4+
"mode": "sample",
5+
"params": {"limit": "10000"},
6+
"primaryMetric": {"score": 20.0, "scoreError": 0.5, "scoreUnit": "ms/op"},
7+
"secondaryMetrics": {
8+
"gc.alloc.rate.norm": {"score": 2048.0, "scoreError": 12.0, "scoreUnit": "B/op"}
9+
}
10+
},
11+
{
12+
"benchmark": "com.clickhouse.benchmark.clients.InsertClient.insertV2",
13+
"mode": "sample",
14+
"params": {"limit": "10000"},
15+
"primaryMetric": {"score": 80.0, "scoreError": 1.5, "scoreUnit": "ms/op"},
16+
"secondaryMetrics": {
17+
"gc.alloc.rate.norm": {"score": 4096.0, "scoreError": 24.0, "scoreUnit": "B/op"}
18+
}
19+
}
20+
]
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
[
2+
{
3+
"benchmark": "com.clickhouse.benchmark.clients.QueryClient.queryV2",
4+
"mode": "sample",
5+
"params": {"limit": "10000"},
6+
"primaryMetric": {"score": 16.0, "scoreError": 0.4, "scoreUnit": "ms/op"},
7+
"secondaryMetrics": {
8+
"gc.alloc.rate.norm": {"score": 1536.0, "scoreError": 10.0, "scoreUnit": "B/op"}
9+
}
10+
},
11+
{
12+
"benchmark": "com.clickhouse.benchmark.clients.InsertClient.insertV2",
13+
"mode": "sample",
14+
"params": {"limit": "10000"},
15+
"primaryMetric": {"score": 64.0, "scoreError": 1.3, "scoreUnit": "ms/op"},
16+
"secondaryMetrics": {
17+
"gc.alloc.rate.norm": {"score": 3200.0, "scoreError": 20.0, "scoreUnit": "B/op"}
18+
}
19+
}
20+
]
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
[
2+
{
3+
"benchmark": "com.clickhouse.benchmark.clients.QueryClient.queryV2",
4+
"mode": "sample",
5+
"params": {"limit": "10000"},
6+
"primaryMetric": {"score": 20.0, "scoreError": 0.5, "scoreUnit": "ms/op"},
7+
"secondaryMetrics": {
8+
"gc.alloc.rate.norm": {"score": 2048.0, "scoreError": 12.0, "scoreUnit": "B/op"}
9+
}
10+
},
11+
{
12+
"benchmark": "com.clickhouse.benchmark.clients.InsertClient.insertV2",
13+
"mode": "sample",
14+
"params": {"limit": "10000"},
15+
"primaryMetric": {"score": 80.0, "scoreError": 1.5, "scoreUnit": "ms/op"},
16+
"secondaryMetrics": {
17+
"gc.alloc.rate.norm": {"score": 4096.0, "scoreError": 24.0, "scoreUnit": "B/op"}
18+
}
19+
}
20+
]
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
[
2+
{
3+
"benchmark": "com.clickhouse.benchmark.clients.QueryClient.queryV2",
4+
"mode": "sample",
5+
"params": {"limit": "10000"},
6+
"primaryMetric": {"score": 26.0, "scoreError": 0.6, "scoreUnit": "ms/op"},
7+
"secondaryMetrics": {
8+
"gc.alloc.rate.norm": {"score": 3300.0, "scoreError": 18.0, "scoreUnit": "B/op"}
9+
}
10+
},
11+
{
12+
"benchmark": "com.clickhouse.benchmark.clients.InsertClient.insertV2",
13+
"mode": "sample",
14+
"params": {"limit": "10000"},
15+
"primaryMetric": {"score": 100.0, "scoreError": 1.8, "scoreUnit": "ms/op"},
16+
"secondaryMetrics": {
17+
"gc.alloc.rate.norm": {"score": 5400.0, "scoreError": 30.0, "scoreUnit": "B/op"}
18+
}
19+
}
20+
]
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
[
2+
{
3+
"benchmark": "com.clickhouse.benchmark.clients.QueryClient.queryOld",
4+
"mode": "sample",
5+
"params": {"limit": "10000"},
6+
"primaryMetric": {"score": 20.0, "scoreError": 0.5, "scoreUnit": "ms/op"},
7+
"secondaryMetrics": {
8+
"gc.alloc.rate.norm": {"score": 2048.0, "scoreError": 12.0, "scoreUnit": "B/op"}
9+
}
10+
}
11+
]
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
[
2+
{
3+
"benchmark": "com.clickhouse.benchmark.clients.QueryClient.queryNew",
4+
"mode": "sample",
5+
"params": {"limit": "10000"},
6+
"primaryMetric": {"score": 18.0, "scoreError": 0.5, "scoreUnit": "ms/op"},
7+
"secondaryMetrics": {
8+
"gc.alloc.rate.norm": {"score": 1900.0, "scoreError": 11.0, "scoreUnit": "B/op"}
9+
}
10+
}
11+
]
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
[
2+
{
3+
"benchmark": "com.clickhouse.benchmark.clients.QueryClient.queryV2",
4+
"mode": "sample",
5+
"params": {"datasetSourceName": "file://default.csv", "limit": "10000"},
6+
"primaryMetric": {"score": 20.0, "scoreError": 0.5, "scoreUnit": "ms/op"},
7+
"secondaryMetrics": {
8+
"gc.alloc.rate.norm": {"score": 2048.0, "scoreError": 12.0, "scoreUnit": "B/op"}
9+
}
10+
},
11+
{
12+
"benchmark": "com.clickhouse.benchmark.clients.QueryClient.queryV2",
13+
"mode": "sample",
14+
"params": {"datasetSourceName": "file://default.csv", "limit": "100000"},
15+
"primaryMetric": {"score": 180.0, "scoreError": 2.0, "scoreUnit": "ms/op"},
16+
"secondaryMetrics": {
17+
"gc.alloc.rate.norm": {"score": 16384.0, "scoreError": 96.0, "scoreUnit": "B/op"}
18+
}
19+
},
20+
{
21+
"benchmark": "com.clickhouse.benchmark.clients.InsertClient.insertV2",
22+
"mode": "sample",
23+
"params": {"limit": "10000"},
24+
"primaryMetric": {"score": 80.0, "scoreError": 1.5, "scoreUnit": "ms/op"},
25+
"secondaryMetrics": {
26+
"gc.alloc.rate.norm": {"score": 4096.0, "scoreError": 24.0, "scoreUnit": "B/op"}
27+
}
28+
},
29+
{
30+
"benchmark": "com.clickhouse.benchmark.clients.JDBCQuery.selectJDBCV2",
31+
"mode": "sample",
32+
"params": {"limit": "10000"},
33+
"primaryMetric": {"score": 30.0, "scoreError": 0.7, "scoreUnit": "ms/op"},
34+
"secondaryMetrics": {
35+
"gc.alloc.rate.norm": {"score": 6144.0, "scoreError": 36.0, "scoreUnit": "B/op"}
36+
}
37+
}
38+
]

0 commit comments

Comments
 (0)