Stabilize benchmark scoring with robust runtime aggregation, deterministic spec ordering, and higher CI sampling (#4402)

Copilot · timotheeguerin · web-flow · commit fe097e536f25 · 2026-05-12T19:54:45.000Z
Benchmark output was overly sensitive to run-to-run jitter, causing
inconsistent values even when commits did not change benchmark-relevant
code. This updates the benchmark engine to produce a more stable central
value per metric.

- **Aggregation model update**
- Replaced simple arithmetic averaging of runtime samples with an
outlier-resistant estimator:
    - **5+ iterations**: trimmed mean (drop min/max)
    - **1–4 iterations**: median
- Applied consistently across top-level runtime stages and nested
per-validator / per-rule / per-emitter metrics.

- **Deterministic execution order**
- Spec discovery now sorts directories before execution to remove
ordering variance from run output.

- **CI sampling update**
- Increased benchmark workflow measured iterations from **5** to **15**
(warmup remains **1**) to align with higher sample-count benchmarking
recommendations and reduce noise in comparisons.

- **Benchmark package coverage/docs**
  - Added focused unit tests for aggregation behavior.
  - Documented the new aggregation strategy in benchmark README.

```ts
// New runtime aggregation behavior
export function aggregateDurations(values: number[]): number {
  const sorted = [...values].sort((a, b) => a - b);
  if (sorted.length >= 5) return average(sorted.slice(1, -1)); // trimmed mean
  return median(sorted); // small-sample robust center
}
```

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: timotheeguerin <1031227+timotheeguerin@users.noreply.github.com>
Co-authored-by: Timothee Guerin <tiguerin@microsoft.com>
diff --git a/.chronus/changes/fix-benchmark-robust-aggregation-2026-5-8-14-50-0.md b/.chronus/changes/fix-benchmark-robust-aggregation-2026-5-8-14-50-0.md
@@ -0,0 +1,7 @@
+---
+changeKind: fix
+packages:
+  - "@azure-tools/typespec-benchmark"
+---
+
+Improve benchmark result stability by using outlier-resistant runtime aggregation (trimmed mean for 5+ iterations and median for smaller samples), run specs in a deterministic order, and increase CI benchmark measured iterations from 5 to 15 for stronger statistical confidence.
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -41,7 +41,7 @@ jobs:
         run: |
           node packages/benchmark/dist/src/cli.js run \
             --specs-dir packages/benchmark/specs \
-            --iterations 5 \
+            --iterations 15 \
             --warmup 1 \
             --commit ${{ github.sha }} \
             --output /tmp/benchmark-results.json
diff --git a/packages/benchmark/README.md b/packages/benchmark/README.md
@@ -6,8 +6,9 @@ Performance benchmarking tool for TypeSpec Azure compilation. Tracks compilation
 
 1. **Benchmark runner** compiles dedicated TypeSpec specs using the compiler's programmatic API
 2. The compiler provides built-in `Stats` data including per-stage timing and per-linter-rule breakdown
-3. Results are stored as JSON — on CI, they're saved to the `benchmark-data` branch
-4. PR comments show a comparison table highlighting performance changes
+3. Runtime metrics are aggregated with an outlier-resistant estimator (trimmed mean for 5+ samples, median for smaller sample sizes)
+4. Results are stored as JSON — on CI, they're saved to the `benchmark-data` branch
+5. PR comments show a comparison table highlighting performance changes
 
 ## Local usage
 
diff --git a/packages/benchmark/package.json b/packages/benchmark/package.json
@@ -15,6 +15,7 @@
   "scripts": {
     "clean": "rimraf ./dist",
     "build": "tsc -p tsconfig.build.json",
+    "test": "vitest run",
     "watch": "tsc -p tsconfig.build.json --watch",
     "bench": "node ./dist/src/cli.js run",
     "bench:compare": "node ./dist/src/cli.js compare"
@@ -35,6 +36,7 @@
   "devDependencies": {
     "@types/node": "catalog:",
     "rimraf": "catalog:",
-    "typescript": "catalog:"
+    "typescript": "catalog:",
+    "vitest": "catalog:"
   }
 }
diff --git a/packages/benchmark/src/aggregate.ts b/packages/benchmark/src/aggregate.ts
@@ -0,0 +1,23 @@
+/**
+ * Aggregate timing samples with outlier resistance; throws on an empty array.
+ * - 5+ samples: trimmed mean (drop the single lowest and single highest sample)
+ * - 1-4 samples: median (mean of the two middle samples when the count is even)
+ */
+export function aggregateDurations(values: number[]): number {
+  if (values.length === 0) {
+    throw new Error("No values to aggregate");
+  }
+
+  const sorted = [...values].sort((a, b) => a - b); // sort a copy; the input array is left untouched
+  if (sorted.length >= 5) {
+    const trimmed = sorted.slice(1, -1); // exactly one sample is removed from each end, whatever the sample count
+    return trimmed.reduce((sum, value) => sum + value, 0) / trimmed.length;
+  }
+
+  const middle = Math.floor(sorted.length / 2);
+  if (sorted.length % 2 === 1) {
+    return sorted[middle];
+  }
+
+  return (sorted[middle - 1] + sorted[middle]) / 2;
+}
diff --git a/packages/benchmark/src/run.ts b/packages/benchmark/src/run.ts
@@ -4,6 +4,7 @@ import { execSync } from "child_process";
 import { readdir } from "fs/promises";
 import os from "os";
 import { join, resolve } from "path";
+import { aggregateDurations } from "./aggregate.js";
 import type {
   BenchmarkResult,
   RunnerInfo,
@@ -31,7 +32,10 @@ export interface RunOptions {
 /** Discover benchmark spec directories under the given path. */
 async function discoverSpecs(specsDir: string, filter?: string[]): Promise<string[]> {
   const entries = await readdir(specsDir, { withFileTypes: true });
-  const dirs = entries.filter((e) => e.isDirectory()).map((e) => e.name);
+  const dirs = entries
+    .filter((e) => e.isDirectory())
+    .map((e) => e.name)
+    .sort((a, b) => a.localeCompare(b));
   if (filter && filter.length > 0) {
     return dirs.filter((d) => filter.includes(d));
   }
@@ -98,9 +102,8 @@ function averageStats(statsList: Stats[]): Stats {
 }
 
 function averageRuntimeStats(runtimes: RuntimeStats[]): RuntimeStats {
-  const n = runtimes.length;
-  const avg = (accessor: (r: RuntimeStats) => number) =>
-    runtimes.reduce((s, r) => s + accessor(r), 0) / n;
+  const aggregate = (accessor: (r: RuntimeStats) => number) =>
+    aggregateDurations(runtimes.map((r) => accessor(r)));
 
   // Average validation
   const validatorKeys = new Set<string>();
@@ -111,7 +114,7 @@ function averageRuntimeStats(runtimes: RuntimeStats[]): RuntimeStats {
   }
   const validators: Record<string, number> = {};
   for (const k of validatorKeys) {
-    validators[k] = runtimes.reduce((s, r) => s + (r.validation.validators[k] ?? 0), 0) / n;
+    validators[k] = aggregateDurations(runtimes.map((r) => r.validation.validators[k] ?? 0));
   }
 
   // Average linter rules
@@ -123,7 +126,7 @@ function averageRuntimeStats(runtimes: RuntimeStats[]): RuntimeStats {
   }
   const rules: Record<string, number> = {};
   for (const k of ruleKeys) {
-    rules[k] = runtimes.reduce((s, r) => s + (r.linter.rules[k] ?? 0), 0) / n;
+    rules[k] = aggregateDurations(runtimes.map((r) => r.linter.rules[k] ?? 0));
   }
 
   // Average emitters
@@ -146,29 +149,29 @@ function averageRuntimeStats(runtimes: RuntimeStats[]): RuntimeStats {
     }
     const steps: Record<string, number> = {};
     for (const k of stepKeys) {
-      steps[k] = runtimes.reduce((s, r) => s + (r.emit.emitters[name]?.steps[k] ?? 0), 0) / n;
+      steps[k] = aggregateDurations(runtimes.map((r) => r.emit.emitters[name]?.steps[k] ?? 0));
     }
     emitters[name] = {
-      total: runtimes.reduce((s, r) => s + (r.emit.emitters[name]?.total ?? 0), 0) / n,
+      total: aggregateDurations(runtimes.map((r) => r.emit.emitters[name]?.total ?? 0)),
       steps,
     };
   }
 
   return {
-    total: avg((r) => r.total),
-    loader: avg((r) => r.loader),
-    resolver: avg((r) => r.resolver),
-    checker: avg((r) => r.checker),
+    total: aggregate((r) => r.total),
+    loader: aggregate((r) => r.loader),
+    resolver: aggregate((r) => r.resolver),
+    checker: aggregate((r) => r.checker),
     validation: {
-      total: avg((r) => r.validation.total),
+      total: aggregate((r) => r.validation.total),
       validators,
     },
     linter: {
-      total: avg((r) => r.linter.total),
+      total: aggregate((r) => r.linter.total),
       rules,
     },
     emit: {
-      total: avg((r) => r.emit.total),
+      total: aggregate((r) => r.emit.total),
       emitters,
     },
   };
diff --git a/packages/benchmark/test/aggregate.test.ts b/packages/benchmark/test/aggregate.test.ts
@@ -0,0 +1,12 @@
+import { expect, it } from "vitest";
+import { aggregateDurations } from "../src/aggregate.js";
+
+it("aggregateDurations uses trimmed mean for 5+ samples", () => {
+  const actual = aggregateDurations([100, 101, 102, 103, 1000]); // 100 and 1000 are dropped
+  expect(actual).toBe(102); // mean of 101, 102, 103
+});
+
+it("aggregateDurations uses median for fewer than 5 samples", () => {
+  const actual = aggregateDurations([100, 101, 1000]);
+  expect(actual).toBe(101); // middle of the three sorted samples
+});
diff --git a/packages/benchmark/vitest.config.ts b/packages/benchmark/vitest.config.ts
@@ -0,0 +1,4 @@
+import { defineConfig, mergeConfig } from "vitest/config";
+import { defaultTypeSpecVitestConfig } from "../../core/vitest.config";
+
+export default mergeConfig(defaultTypeSpecVitestConfig, defineConfig({}));
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml