Skip to content

Commit f97cb0c

Browse files
Reduce benchmark diff noise by requiring a minimum absolute delta for regressions (#4423)
Benchmark PR comments were over-reporting regressions for changes that should be performance-neutral, mostly due to large % swings on tiny sub-millisecond metrics. This updates benchmark comparison to suppress those false positives while preserving meaningful regression detection. - **Regression significance logic** - Added `isNotableMetricChange` in `packages/benchmark/src/compare.ts`. - A metric is now considered notable only when **both** conditions hold: - `abs(percentChange) >= threshold` (existing behavior) - `abs(changeMs) >= 1ms` (new guard) - `hasNotableChanges` now uses the same gate, aligning failure signaling with comment output. - **PR comment and summary filtering** - Updated `packages/benchmark/src/format-comment.ts` to apply the same significance rule for: - top-level regression count/table - change indicators in markdown and console summaries - Result: noisy tiny metrics (e.g., `0.05ms -> 0.06ms`) no longer inflate regression counts. - **Targeted benchmark tests** - Added `packages/benchmark/test/compare.test.ts` to cover: - high-% but tiny absolute changes are excluded - custom `minChangeMs` behavior - regression summary excludes metrics below the minimum absolute threshold ```ts export function isNotableMetricChange( metric: MetricComparison, threshold = 5, minChangeMs = 1, ): boolean { return Math.abs(metric.percentChange) >= threshold && Math.abs(metric.change) >= minChangeMs; } ``` --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: timotheeguerin <1031227+timotheeguerin@users.noreply.github.com> Co-authored-by: Timothee Guerin <tiguerin@microsoft.com>
1 parent a35f3b1 commit f97cb0c

4 files changed

Lines changed: 85 additions & 11 deletions

File tree

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
changeKind: fix
3+
packages:
4+
- "@azure-tools/typespec-benchmark"
5+
---
6+
7+
Reduce noisy benchmark comparison regressions by requiring both the percentage threshold and a minimum absolute runtime delta (1ms) before a metric is flagged as notable.

packages/benchmark/src/compare.ts

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import type { BenchmarkResult, ComparisonResult, MetricComparison, RuntimeStats } from "./types.js";
22

33
const DEFAULT_THRESHOLD = 5; // percent
4+
const DEFAULT_MIN_CHANGE_MS = 1;
45

56
export interface CompareOptions {
67
/** Percentage threshold for highlighting changes (default: 5%). */
@@ -13,6 +14,14 @@ function createMetric(label: string, baseline: number, current: number): MetricC
1314
return { label, baseline, current, change, percentChange };
1415
}
1516

17+
export function isNotableMetricChange(
18+
metric: MetricComparison,
19+
threshold: number = DEFAULT_THRESHOLD,
20+
minChangeMs: number = DEFAULT_MIN_CHANGE_MS,
21+
): boolean {
22+
return Math.abs(metric.percentChange) >= threshold && Math.abs(metric.change) >= minChangeMs;
23+
}
24+
1625
function extractRuntimeMetrics(
1726
baselineRuntime: RuntimeStats,
1827
currentRuntime: RuntimeStats,
@@ -145,7 +154,7 @@ export function hasNotableChanges(
145154
): boolean {
146155
for (const comp of comparisons) {
147156
for (const m of comp.metrics) {
148-
if (Math.abs(m.percentChange) >= threshold) {
157+
if (isNotableMetricChange(m, threshold)) {
149158
return true;
150159
}
151160
}

packages/benchmark/src/format-comment.ts

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1+
import { isNotableMetricChange } from "./compare.js";
12
import type { BenchmarkResult, ComparisonResult, MetricComparison, RuntimeStats } from "./types.js";
23

34
const DEFAULT_THRESHOLD = 5;
5+
const DEFAULT_MIN_CHANGE_MS = 1;
46

57
function formatMs(ms: number): string {
68
if (ms >= 1000) return `${(ms / 1000).toFixed(2)}s`;
@@ -32,9 +34,10 @@ function formatMsColored(ms: number, thresholds: readonly [number, number]): str
3234
return `${timeIndicator(ms, thresholds)} ${formatMs(ms)}`;
3335
}
3436

35-
function changeIndicator(percentChange: number, threshold: number): string {
36-
if (percentChange >= threshold) return "🔴";
37-
if (percentChange <= -threshold) return "🟢";
37+
function changeIndicator(metric: MetricComparison, threshold: number): string {
38+
if (!isNotableMetricChange(metric, threshold, DEFAULT_MIN_CHANGE_MS)) return "";
39+
if (metric.percentChange >= threshold) return "🔴";
40+
if (metric.percentChange <= -threshold) return "🟢";
3841
return "";
3942
}
4043

@@ -161,7 +164,10 @@ export function formatPrComment(
161164

162165
// Average metrics across all specs
163166
const averaged = averageComparisonMetrics(comparisons);
164-
const regressions = averaged.filter((m) => m.percentChange >= threshold);
167+
const regressions = averaged.filter(
168+
(m) =>
169+
m.percentChange >= threshold && isNotableMetricChange(m, threshold, DEFAULT_MIN_CHANGE_MS),
170+
);
165171

166172
// Top-level summary: show regressions prominently, otherwise a simple ok message
167173
if (regressions.length === 0) {
@@ -173,8 +179,7 @@ export function formatPrComment(
173179
lines.push("| Metric | Baseline | Current | Change |");
174180
lines.push("|--------|----------|---------|--------|");
175181
for (const m of regressions) {
176-
const changeStr =
177-
`${formatPercent(m.percentChange)} ${changeIndicator(m.percentChange, threshold)}`.trim();
182+
const changeStr = `${formatPercent(m.percentChange)} ${changeIndicator(m, threshold)}`.trim();
178183
const th = thresholdsFor(m.label);
179184
lines.push(
180185
`| ${displayLabel(m.label)} | ${formatMsColored(m.baseline, th)} | ${formatMsColored(m.current, th)} | ${changeStr} |`,
@@ -192,8 +197,7 @@ export function formatPrComment(
192197
lines.push("| Metric | Baseline | Current | Change |");
193198
lines.push("|--------|----------|---------|--------|");
194199
for (const m of averaged) {
195-
const changeStr =
196-
`${formatPercent(m.percentChange)} ${changeIndicator(m.percentChange, threshold)}`.trim();
200+
const changeStr = `${formatPercent(m.percentChange)} ${changeIndicator(m, threshold)}`.trim();
197201
const th = thresholdsFor(m.label);
198202
lines.push(
199203
`| ${displayLabel(m.label)} | ${formatMsColored(m.baseline, th)} | ${formatMsColored(m.current, th)} | ${changeStr} |`,
@@ -255,7 +259,7 @@ export function formatConsoleSummary(
255259

256260
lines.push("\nBenchmark comparison (averaged across specs):");
257261
for (const m of averaged) {
258-
const indicator = changeIndicator(m.percentChange, threshold);
262+
const indicator = changeIndicator(m, threshold);
259263
const label = isSubMetric(m.label) ? ` ${m.label}` : m.label;
260264
lines.push(
261265
` ${label.padEnd(50)} ${formatMs(m.baseline).padStart(10)}${formatMs(m.current).padStart(10)} ${formatPercent(m.percentChange).padStart(8)} ${indicator}`,
@@ -319,7 +323,7 @@ export function formatComparisonSummary(
319323
lines.push("| Metric | Baseline | Current | Change |");
320324
lines.push("|--------|----------|---------|--------|");
321325
for (const m of averaged) {
322-
const indicator = changeIndicator(m.percentChange, threshold);
326+
const indicator = changeIndicator(m, threshold);
323327
const changeStr = `${formatPercent(m.percentChange)} ${indicator}`.trim();
324328
const th = thresholdsFor(m.label);
325329
lines.push(
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import { expect, it } from "vitest";
2+
import { hasNotableChanges, isNotableMetricChange } from "../src/compare.js";
3+
import { formatPrComment } from "../src/format-comment.js";
4+
import type { ComparisonResult, MetricComparison } from "../src/types.js";
5+
6+
function createMetric(label: string, baseline: number, current: number): MetricComparison {
7+
const change = current - baseline;
8+
const percentChange = baseline === 0 ? (current === 0 ? 0 : 100) : (change / baseline) * 100;
9+
return { label, baseline, current, change, percentChange };
10+
}
11+
12+
function createComparison(metrics: MetricComparison[]): ComparisonResult {
13+
return {
14+
specName: "sample",
15+
metrics,
16+
complexity: {
17+
createdTypes: { baseline: 1, current: 1 },
18+
finishedTypes: { baseline: 1, current: 1 },
19+
},
20+
};
21+
}
22+
23+
it("ignores tiny absolute changes even when percent change is high", () => {
24+
const tinyMetric = createMetric("linter/rule", 0.05, 0.06);
25+
expect(isNotableMetricChange(tinyMetric, 5)).toBe(false);
26+
});
27+
28+
it("respects a custom minimum absolute threshold", () => {
29+
const metric = createMetric("checker", 100, 100.6);
30+
expect(isNotableMetricChange(metric, 0.5, 0.5)).toBe(true);
31+
expect(isNotableMetricChange(metric, 0.5, 1)).toBe(false);
32+
});
33+
34+
it("detects notable changes when percent and absolute deltas are both large enough", () => {
35+
const notableMetric = createMetric("checker", 100, 106);
36+
const comparisons = [createComparison([notableMetric])];
37+
expect(hasNotableChanges(comparisons, 5)).toBe(true);
38+
});
39+
40+
it("excludes metrics below minimum absolute threshold from regression summary", () => {
41+
const comparisons = [
42+
createComparison([
43+
createMetric("linter/noisy-rule", 0.05, 0.06),
44+
createMetric("checker", 100, 106),
45+
]),
46+
];
47+
48+
const comment = formatPrComment(comparisons, "baseline123", "current123", { threshold: 5 });
49+
const topSummary = comment.split("<details>")[0];
50+
51+
expect(topSummary).toContain("⚠️ **1 metric(s) regressed** above the +5% threshold:");
52+
expect(topSummary).toContain("| checker |");
53+
expect(topSummary).not.toContain("linter/noisy-rule");
54+
});

0 commit comments

Comments
 (0)