Skip to content

Commit fe097e5

Browse files
Stabilize benchmark scoring with robust runtime aggregation, deterministic spec ordering, and higher CI sampling (#4402)
Benchmark output was overly sensitive to run-to-run jitter, causing inconsistent values even when commits did not change benchmark-relevant code. This updates the benchmark engine to produce a more stable central value per metric.

- **Aggregation model update**
  - Replaced simple arithmetic averaging of runtime samples with an outlier-resistant estimator:
    - **5+ iterations**: trimmed mean (drop min/max)
    - **1–4 iterations**: median
  - Applied consistently across top-level runtime stages and nested per-validator / per-rule / per-emitter metrics.
- **Deterministic execution order**
  - Spec discovery now sorts directories before execution to remove ordering variance from run output.
- **CI sampling update**
  - Increased benchmark workflow measured iterations from **5** to **15** (warmup remains **1**) to align with higher sample-count benchmarking recommendations and reduce noise in comparisons.
- **Benchmark package coverage/docs**
  - Added focused unit tests for aggregation behavior.
  - Documented the new aggregation strategy in benchmark README.

```ts
// New runtime aggregation behavior
export function aggregateDurations(values: number[]): number {
  const sorted = [...values].sort((a, b) => a - b);
  if (sorted.length >= 5) return average(sorted.slice(1, -1)); // trimmed mean
  return median(sorted); // small-sample robust center
}
```

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: timotheeguerin <1031227+timotheeguerin@users.noreply.github.com>
Co-authored-by: Timothee Guerin <tiguerin@microsoft.com>
1 parent 5f7f1ef commit fe097e5

9 files changed

Lines changed: 74 additions & 19 deletions

File tree

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
changeKind: fix
3+
packages:
4+
- "@azure-tools/typespec-benchmark"
5+
---
6+
7+
Improve benchmark result stability by using outlier-resistant runtime aggregation (trimmed mean for 5+ iterations and median for smaller samples), running specs in a deterministic order, and increasing CI benchmark measured iterations from 5 to 15 for stronger statistical confidence.

.github/workflows/benchmark.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ jobs:
4141
run: |
4242
node packages/benchmark/dist/src/cli.js run \
4343
--specs-dir packages/benchmark/specs \
44-
--iterations 5 \
44+
--iterations 15 \
4545
--warmup 1 \
4646
--commit ${{ github.sha }} \
4747
--output /tmp/benchmark-results.json

packages/benchmark/README.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,9 @@ Performance benchmarking tool for TypeSpec Azure compilation. Tracks compilation
66

77
1. **Benchmark runner** compiles dedicated TypeSpec specs using the compiler's programmatic API
88
2. The compiler provides built-in `Stats` data including per-stage timing and per-linter-rule breakdown
9-
3. Results are stored as JSON — on CI, they're saved to the `benchmark-data` branch
10-
4. PR comments show a comparison table highlighting performance changes
9+
3. Runtime metrics are aggregated with an outlier-resistant estimator (trimmed mean for 5+ samples, median for smaller sample sizes)
10+
4. Results are stored as JSON — on CI, they're saved to the `benchmark-data` branch
11+
5. PR comments show a comparison table highlighting performance changes
1112

1213
## Local usage
1314

packages/benchmark/package.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
"scripts": {
1616
"clean": "rimraf ./dist",
1717
"build": "tsc -p tsconfig.build.json",
18+
"test": "vitest run",
1819
"watch": "tsc -p tsconfig.build.json --watch",
1920
"bench": "node ./dist/src/cli.js run",
2021
"bench:compare": "node ./dist/src/cli.js compare"
@@ -35,6 +36,7 @@
3536
"devDependencies": {
3637
"@types/node": "catalog:",
3738
"rimraf": "catalog:",
38-
"typescript": "catalog:"
39+
"typescript": "catalog:",
40+
"vitest": "catalog:"
3941
}
4042
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
/**
2+
* Aggregate timing samples with outlier resistance.
3+
* - 5+ samples: trimmed mean (drop the single lowest and single highest sample, regardless of sample count)
4+
* - 1-4 samples: median
5+
*/
6+
export function aggregateDurations(values: number[]): number {
7+
if (values.length === 0) {
8+
throw new Error("No values to aggregate");
9+
}
10+
11+
const sorted = [...values].sort((a, b) => a - b);
12+
if (sorted.length >= 5) {
13+
const trimmed = sorted.slice(1, -1);
14+
return trimmed.reduce((sum, value) => sum + value, 0) / trimmed.length;
15+
}
16+
17+
const middle = Math.floor(sorted.length / 2);
18+
if (sorted.length % 2 === 1) {
19+
return sorted[middle];
20+
}
21+
22+
return (sorted[middle - 1] + sorted[middle]) / 2;
23+
}

packages/benchmark/src/run.ts

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import { execSync } from "child_process";
44
import { readdir } from "fs/promises";
55
import os from "os";
66
import { join, resolve } from "path";
7+
import { aggregateDurations } from "./aggregate.js";
78
import type {
89
BenchmarkResult,
910
RunnerInfo,
@@ -31,7 +32,10 @@ export interface RunOptions {
3132
/** Discover benchmark spec directories under the given path. */
3233
async function discoverSpecs(specsDir: string, filter?: string[]): Promise<string[]> {
3334
const entries = await readdir(specsDir, { withFileTypes: true });
34-
const dirs = entries.filter((e) => e.isDirectory()).map((e) => e.name);
35+
const dirs = entries
36+
.filter((e) => e.isDirectory())
37+
.map((e) => e.name)
38+
.sort((a, b) => a.localeCompare(b));
3539
if (filter && filter.length > 0) {
3640
return dirs.filter((d) => filter.includes(d));
3741
}
@@ -98,9 +102,8 @@ function averageStats(statsList: Stats[]): Stats {
98102
}
99103

100104
function averageRuntimeStats(runtimes: RuntimeStats[]): RuntimeStats {
101-
const n = runtimes.length;
102-
const avg = (accessor: (r: RuntimeStats) => number) =>
103-
runtimes.reduce((s, r) => s + accessor(r), 0) / n;
105+
const aggregate = (accessor: (r: RuntimeStats) => number) =>
106+
aggregateDurations(runtimes.map((r) => accessor(r)));
104107

105108
// Average validation
106109
const validatorKeys = new Set<string>();
@@ -111,7 +114,7 @@ function averageRuntimeStats(runtimes: RuntimeStats[]): RuntimeStats {
111114
}
112115
const validators: Record<string, number> = {};
113116
for (const k of validatorKeys) {
114-
validators[k] = runtimes.reduce((s, r) => s + (r.validation.validators[k] ?? 0), 0) / n;
117+
validators[k] = aggregateDurations(runtimes.map((r) => r.validation.validators[k] ?? 0));
115118
}
116119

117120
// Average linter rules
@@ -123,7 +126,7 @@ function averageRuntimeStats(runtimes: RuntimeStats[]): RuntimeStats {
123126
}
124127
const rules: Record<string, number> = {};
125128
for (const k of ruleKeys) {
126-
rules[k] = runtimes.reduce((s, r) => s + (r.linter.rules[k] ?? 0), 0) / n;
129+
rules[k] = aggregateDurations(runtimes.map((r) => r.linter.rules[k] ?? 0));
127130
}
128131

129132
// Average emitters
@@ -146,29 +149,29 @@ function averageRuntimeStats(runtimes: RuntimeStats[]): RuntimeStats {
146149
}
147150
const steps: Record<string, number> = {};
148151
for (const k of stepKeys) {
149-
steps[k] = runtimes.reduce((s, r) => s + (r.emit.emitters[name]?.steps[k] ?? 0), 0) / n;
152+
steps[k] = aggregateDurations(runtimes.map((r) => r.emit.emitters[name]?.steps[k] ?? 0));
150153
}
151154
emitters[name] = {
152-
total: runtimes.reduce((s, r) => s + (r.emit.emitters[name]?.total ?? 0), 0) / n,
155+
total: aggregateDurations(runtimes.map((r) => r.emit.emitters[name]?.total ?? 0)),
153156
steps,
154157
};
155158
}
156159

157160
return {
158-
total: avg((r) => r.total),
159-
loader: avg((r) => r.loader),
160-
resolver: avg((r) => r.resolver),
161-
checker: avg((r) => r.checker),
161+
total: aggregate((r) => r.total),
162+
loader: aggregate((r) => r.loader),
163+
resolver: aggregate((r) => r.resolver),
164+
checker: aggregate((r) => r.checker),
162165
validation: {
163-
total: avg((r) => r.validation.total),
166+
total: aggregate((r) => r.validation.total),
164167
validators,
165168
},
166169
linter: {
167-
total: avg((r) => r.linter.total),
170+
total: aggregate((r) => r.linter.total),
168171
rules,
169172
},
170173
emit: {
171-
total: avg((r) => r.emit.total),
174+
total: aggregate((r) => r.emit.total),
172175
emitters,
173176
},
174177
};
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import { expect, it } from "vitest";
2+
import { aggregateDurations } from "../src/aggregate.js";
3+
4+
it("aggregateDurations uses trimmed mean for 5+ samples", () => {
5+
const actual = aggregateDurations([100, 101, 102, 103, 1000]);
6+
expect(actual).toBe(102);
7+
});
8+
9+
it("aggregateDurations uses median for fewer than 5 samples", () => {
10+
const actual = aggregateDurations([100, 101, 1000]);
11+
expect(actual).toBe(101);
12+
});
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
import { defineConfig, mergeConfig } from "vitest/config";
2+
import { defaultTypeSpecVitestConfig } from "../../core/vitest.config";
3+
4+
export default mergeConfig(defaultTypeSpecVitestConfig, defineConfig({}));

pnpm-lock.yaml

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)