Reduce benchmark diff noise by requiring a minimum absolute delta for regressions (#4423)

Copilot · timotheeguerin · web-flow · commit f97cb0cb7512 · 2026-05-14T20:06:54.000Z
Benchmark PR comments were over-reporting regressions for changes that
should be performance-neutral, mostly due to large % swings on tiny
sub-millisecond metrics. This updates benchmark comparison to suppress
those false positives while preserving meaningful regression detection.

- **Regression significance logic**
  - Added `isNotableMetricChange` in `packages/benchmark/src/compare.ts`.
  - A metric is now considered notable only when **both** conditions hold:
    - `abs(percentChange) >= threshold` (existing behavior)
    - `abs(changeMs) >= 1ms` (new guard)
  - `hasNotableChanges` now uses the same gate, aligning failure signaling
    with comment output.

- **PR comment and summary filtering**
  - Updated `packages/benchmark/src/format-comment.ts` to apply the same
    significance rule for:
    - top-level regression count/table
    - change indicators in markdown and console summaries
  - Result: noisy tiny metrics (e.g., `0.05ms -> 0.06ms`) no longer
    inflate regression counts.

- **Targeted benchmark tests**
  - Added `packages/benchmark/test/compare.test.ts` to cover:
    - high-% but tiny absolute changes are excluded
    - custom `minChangeMs` behavior
    - regression summary excludes metrics below the minimum absolute
      threshold

```ts
export function isNotableMetricChange(
  metric: MetricComparison,
  threshold = 5,
  minChangeMs = 1,
): boolean {
  return Math.abs(metric.percentChange) >= threshold && Math.abs(metric.change) >= minChangeMs;
}
```

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: timotheeguerin <1031227+timotheeguerin@users.noreply.github.com>
Co-authored-by: Timothee Guerin <tiguerin@microsoft.com>
diff --git a/.chronus/changes/fix-benchmark-diff-noise-2026-5-13-22-0-0.md b/.chronus/changes/fix-benchmark-diff-noise-2026-5-13-22-0-0.md
@@ -0,0 +1,7 @@
+---
+changeKind: fix
+packages:
+  - "@azure-tools/typespec-benchmark"
+---
+
+Reduce noisy benchmark comparison regressions by requiring both the percentage threshold and a minimum absolute runtime delta (1ms) before a metric is flagged as notable.
diff --git a/packages/benchmark/src/compare.ts b/packages/benchmark/src/compare.ts
@@ -1,6 +1,7 @@
 import type { BenchmarkResult, ComparisonResult, MetricComparison, RuntimeStats } from "./types.js";
 
 const DEFAULT_THRESHOLD = 5; // percent
+const DEFAULT_MIN_CHANGE_MS = 1; // milliseconds
 
 export interface CompareOptions {
   /** Percentage threshold for highlighting changes (default: 5%). */
@@ -13,6 +14,14 @@ function createMetric(label: string, baseline: number, current: number): MetricC
   return { label, baseline, current, change, percentChange };
 }
 
+export function isNotableMetricChange( // true only when the change clears BOTH the relative and the absolute floor
+  metric: MetricComparison, // read for its percentChange and change fields
+  threshold: number = DEFAULT_THRESHOLD, // inclusive floor for |percentChange|
+  minChangeMs: number = DEFAULT_MIN_CHANGE_MS, // inclusive floor for |change|
+): boolean {
+  return Math.abs(metric.percentChange) >= threshold && Math.abs(metric.change) >= minChangeMs;
+}
+
 function extractRuntimeMetrics(
   baselineRuntime: RuntimeStats,
   currentRuntime: RuntimeStats,
@@ -145,7 +154,7 @@ export function hasNotableChanges(
 ): boolean {
   for (const comp of comparisons) {
     for (const m of comp.metrics) {
-      if (Math.abs(m.percentChange) >= threshold) {
+      if (isNotableMetricChange(m, threshold)) {
         return true;
       }
     }
diff --git a/packages/benchmark/src/format-comment.ts b/packages/benchmark/src/format-comment.ts
@@ -1,6 +1,8 @@
+import { isNotableMetricChange } from "./compare.js";
 import type { BenchmarkResult, ComparisonResult, MetricComparison, RuntimeStats } from "./types.js";
 
 const DEFAULT_THRESHOLD = 5;
+const DEFAULT_MIN_CHANGE_MS = 1; // NOTE(review): same name and value as the constant in compare.ts — keep the two in sync
 
 function formatMs(ms: number): string {
   if (ms >= 1000) return `${(ms / 1000).toFixed(2)}s`;
@@ -32,9 +34,10 @@ function formatMsColored(ms: number, thresholds: readonly [number, number]): str
   return `${timeIndicator(ms, thresholds)} ${formatMs(ms)}`;
 }
 
-function changeIndicator(percentChange: number, threshold: number): string {
-  if (percentChange >= threshold) return "🔴";
-  if (percentChange <= -threshold) return "🟢";
+function changeIndicator(metric: MetricComparison, threshold: number): string { // emoji marker for a notable change, "" otherwise
+  if (!isNotableMetricChange(metric, threshold, DEFAULT_MIN_CHANGE_MS)) return ""; // no marker unless the percent and absolute floors are both met
+  if (metric.percentChange >= threshold) return "🔴"; // increase at or above +threshold
+  if (metric.percentChange <= -threshold) return "🟢"; // decrease at or below -threshold
   return "";
 }
 
@@ -161,7 +164,10 @@ export function formatPrComment(
 
   // Average metrics across all specs
   const averaged = averageComparisonMetrics(comparisons);
-  const regressions = averaged.filter((m) => m.percentChange >= threshold);
+  const regressions = averaged.filter(
+    (m) =>
+      m.percentChange >= threshold && isNotableMetricChange(m, threshold, DEFAULT_MIN_CHANGE_MS),
+  );
 
   // Top-level summary: show regressions prominently, otherwise a simple ok message
   if (regressions.length === 0) {
@@ -173,8 +179,7 @@ export function formatPrComment(
     lines.push("| Metric | Baseline | Current | Change |");
     lines.push("|--------|----------|---------|--------|");
     for (const m of regressions) {
-      const changeStr =
-        `${formatPercent(m.percentChange)} ${changeIndicator(m.percentChange, threshold)}`.trim();
+      const changeStr = `${formatPercent(m.percentChange)} ${changeIndicator(m, threshold)}`.trim();
       const th = thresholdsFor(m.label);
       lines.push(
         `| ${displayLabel(m.label)} | ${formatMsColored(m.baseline, th)} | ${formatMsColored(m.current, th)} | ${changeStr} |`,
@@ -192,8 +197,7 @@ export function formatPrComment(
   lines.push("| Metric | Baseline | Current | Change |");
   lines.push("|--------|----------|---------|--------|");
   for (const m of averaged) {
-    const changeStr =
-      `${formatPercent(m.percentChange)} ${changeIndicator(m.percentChange, threshold)}`.trim();
+    const changeStr = `${formatPercent(m.percentChange)} ${changeIndicator(m, threshold)}`.trim();
     const th = thresholdsFor(m.label);
     lines.push(
       `| ${displayLabel(m.label)} | ${formatMsColored(m.baseline, th)} | ${formatMsColored(m.current, th)} | ${changeStr} |`,
@@ -255,7 +259,7 @@ export function formatConsoleSummary(
 
   lines.push("\nBenchmark comparison (averaged across specs):");
   for (const m of averaged) {
-    const indicator = changeIndicator(m.percentChange, threshold);
+    const indicator = changeIndicator(m, threshold);
     const label = isSubMetric(m.label) ? `  ${m.label}` : m.label;
     lines.push(
       `  ${label.padEnd(50)} ${formatMs(m.baseline).padStart(10)} → ${formatMs(m.current).padStart(10)}  ${formatPercent(m.percentChange).padStart(8)} ${indicator}`,
@@ -319,7 +323,7 @@ export function formatComparisonSummary(
   lines.push("| Metric | Baseline | Current | Change |");
   lines.push("|--------|----------|---------|--------|");
   for (const m of averaged) {
-    const indicator = changeIndicator(m.percentChange, threshold);
+    const indicator = changeIndicator(m, threshold);
     const changeStr = `${formatPercent(m.percentChange)} ${indicator}`.trim();
     const th = thresholdsFor(m.label);
     lines.push(
diff --git a/packages/benchmark/test/compare.test.ts b/packages/benchmark/test/compare.test.ts
@@ -0,0 +1,54 @@
+import { expect, it } from "vitest";
+import { hasNotableChanges, isNotableMetricChange } from "../src/compare.js";
+import { formatPrComment } from "../src/format-comment.js";
+import type { ComparisonResult, MetricComparison } from "../src/types.js";
+
+function createMetric(label: string, baseline: number, current: number): MetricComparison { // test helper: derive change and percentChange from two values
+  const change = current - baseline;
+  const percentChange = baseline === 0 ? (current === 0 ? 0 : 100) : (change / baseline) * 100; // zero baseline: 0 if unchanged, otherwise a flat 100 instead of dividing by 0
+  return { label, baseline, current, change, percentChange };
+}
+
+function createComparison(metrics: MetricComparison[]): ComparisonResult { // test helper: wrap metrics in a ComparisonResult for a spec named "sample"
+  return {
+    specName: "sample",
+    metrics,
+    complexity: { // baseline and current counts are identical here; only `metrics` vary between tests
+      createdTypes: { baseline: 1, current: 1 },
+      finishedTypes: { baseline: 1, current: 1 },
+    },
+  };
+}
+
+it("ignores tiny absolute changes even when percent change is high", () => {
+  const tinyMetric = createMetric("linter/rule", 0.05, 0.06); // about +20% relative, but only about +0.01 absolute
+  expect(isNotableMetricChange(tinyMetric, 5)).toBe(false); // minChangeMs is left at its default, which the tiny delta does not reach
+});
+
+it("respects a custom minimum absolute threshold", () => {
+  const metric = createMetric("checker", 100, 100.6); // roughly +0.6 absolute and +0.6%
+  expect(isNotableMetricChange(metric, 0.5, 0.5)).toBe(true); // both floors at 0.5 are cleared
+  expect(isNotableMetricChange(metric, 0.5, 1)).toBe(false); // same percent floor, but the delta is below minChangeMs = 1
+});
+
+it("detects notable changes when percent and absolute deltas are both large enough", () => {
+  const notableMetric = createMetric("checker", 100, 106); // +6 absolute, +6%
+  const comparisons = [createComparison([notableMetric])];
+  expect(hasNotableChanges(comparisons, 5)).toBe(true); // one notable metric is enough for a true result
+});
+
+it("excludes metrics below minimum absolute threshold from regression summary", () => {
+  const comparisons = [
+    createComparison([
+      createMetric("linter/noisy-rule", 0.05, 0.06), // about +20%, but only about +0.01 absolute
+      createMetric("checker", 100, 106), // +6% and +6 absolute
+    ]),
+  ];
+
+  const comment = formatPrComment(comparisons, "baseline123", "current123", { threshold: 5 });
+  const topSummary = comment.split("<details>")[0]; // keep only the text before the first "<details>" marker
+
+  expect(topSummary).toContain("⚠️ **1 metric(s) regressed** above the +5% threshold:");
+  expect(topSummary).toContain("| checker |");
+  expect(topSummary).not.toContain("linter/noisy-rule");
+});