From 468b5e6660ffbfffc3df417a501710404a58b882 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 11 Jun 2026 21:43:44 +0000 Subject: [PATCH 1/7] Implement benchmark variance and rolling baseline Co-authored-by: timotheeguerin <1031227+timotheeguerin@users.noreply.github.com> --- .github/workflows/benchmark.yml | 21 ++- packages/benchmark/README.md | 11 +- packages/benchmark/src/cli.ts | 18 +++ packages/benchmark/src/format-comment.ts | 30 +++- packages/benchmark/src/run.ts | 53 ++++++- packages/benchmark/src/statistics.ts | 62 ++++++++ packages/benchmark/src/types.ts | 23 +++ packages/benchmark/src/upload-pr-comment.ts | 162 +++++++++++++++++++- packages/benchmark/test/compare.test.ts | 13 ++ packages/benchmark/test/statistics.test.ts | 17 ++ 10 files changed, 390 insertions(+), 20 deletions(-) create mode 100644 packages/benchmark/src/statistics.ts create mode 100644 packages/benchmark/test/statistics.test.ts diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index a34ffd6882..2cb66c2905 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -19,6 +19,11 @@ on: required: false type: string default: "benchmark-data" + runner: + description: "Runner label (for stable runs prefer self-hosted or larger dedicated runner)" + required: false + type: string + default: "ubuntu-latest" permissions: contents: write @@ -30,7 +35,7 @@ concurrency: jobs: benchmark: name: Run Benchmarks - runs-on: ubuntu-latest + runs-on: ${{ github.event_name == 'workflow_dispatch' && inputs.runner || vars.BENCHMARK_RUNNER || 'ubuntu-latest' }} env: TYPESPEC_VS_CI_BUILD: true TYPESPEC_SKIP_WEBSITE_BUILD: true @@ -41,6 +46,8 @@ jobs: fetch-depth: 0 - uses: ./.github/actions/setup + with: + node-version: 24.11.1 - name: Install dependencies run: pnpm install @@ -54,8 +61,8 @@ jobs: node packages/benchmark/dist/src/cli.js backfill \ --from ${{ inputs.backfill_from }} \ 
--specs-dir packages/benchmark/specs \ - --iterations 15 \ - --warmup 1 \ + --iterations 25 \ + --warmup 3 \ --branch ${{ inputs.branch }} \ --push @@ -64,8 +71,11 @@ jobs: run: | node packages/benchmark/dist/src/cli.js run \ --specs-dir packages/benchmark/specs \ - --iterations 15 \ - --warmup 1 \ + --iterations 25 \ + --warmup 3 \ + --noise-cv-threshold 0.08 \ + --max-reruns 1 \ + --rerun-iterations 10 \ --commit ${{ github.sha }} \ --output /tmp/benchmark-results.json @@ -84,6 +94,7 @@ jobs: node packages/benchmark/dist/src/cli.js upload-pr-comment \ --results /tmp/benchmark-results.json \ --pr-number ${{ github.event.number }} \ + --baseline-window 20 \ --output-dir /tmp/benchmark-artifacts - name: Upload benchmark comment diff --git a/packages/benchmark/README.md b/packages/benchmark/README.md index 4b88a730f7..f2a031a0e7 100644 --- a/packages/benchmark/README.md +++ b/packages/benchmark/README.md @@ -7,8 +7,11 @@ Performance benchmarking tool for TypeSpec Azure compilation. Tracks compilation 1. **Benchmark runner** compiles dedicated TypeSpec specs using the compiler's programmatic API 2. The compiler provides built-in `Stats` data including per-stage timing and per-linter-rule breakdown 3. Runtime metrics are aggregated with an outlier-resistant estimator (trimmed mean for 5+ samples, median for smaller sample sizes) -4. Results are stored as JSON — on CI, they're saved to the `benchmark-data` branch -5. PR comments show a comparison table highlighting performance changes +4. Per-spec variability (standard deviation and coefficient of variation) is captured from raw iterations +5. Optional noise-gating can auto-run extra iterations when variance is high +6. PR baseline can be built from a rolling window of recent `main` results instead of only `latest.json` +7. Results are stored as JSON — on CI, they're saved to the `benchmark-data` branch +8. 
PR comments show a comparison table highlighting performance changes ## Local usage @@ -25,6 +28,9 @@ node packages/benchmark/dist/src/cli.js run --output results.json node packages/benchmark/dist/src/cli.js run \ --iterations 3 \ --warmup 1 \ + --noise-cv-threshold 0.08 \ + --max-reruns 1 \ + --rerun-iterations 5 \ --specs azure-core-dataplane,azure-arm-resource-manager \ --output results.json ``` @@ -78,6 +84,7 @@ The `.github/workflows/benchmark.yml` workflow: - **On push to `main`**: Runs benchmarks and stores results to the `benchmark-data` branch via the `store-results` CLI command - **On pull requests**: Runs benchmarks, fetches the baseline, compares, and generates a PR comment via the `upload-pr-comment` CLI command +- Benchmark PR baselines are generated from a rolling window of recent `main` runs when `results/history.json` is available, with fallback to `results/latest.json` ### Data storage diff --git a/packages/benchmark/src/cli.ts b/packages/benchmark/src/cli.ts index ab3670b362..96b76fce10 100644 --- a/packages/benchmark/src/cli.ts +++ b/packages/benchmark/src/cli.ts @@ -38,6 +38,11 @@ Run options: --specs-dir Directory containing benchmark specs (default: built-in specs) --iterations Number of measured iterations (default: 5) --warmup Number of warmup iterations (default: 1) + --noise-cv-threshold + Rerun when total-runtime coefficient of variation is above this value (e.g. 
0.08 = 8%) + --max-reruns Max rerun cycles when noise gate triggers (default: 0) + --rerun-iterations + Extra measured iterations per rerun (default: same as --iterations) --specs Comma-separated list of specific specs to run --commit Git commit SHA to record --output Output file for results JSON (default: stdout) @@ -64,6 +69,7 @@ Upload-pr-comment options: --pr-number Pull request number --output-dir Output directory for artifacts --branch Branch name for fetching baseline (default: benchmark-data) + --baseline-window Number of recent main results to build rolling baseline (default: 20) --threshold Percent threshold for notable changes (default: 5) Backfill options: @@ -127,6 +133,14 @@ async function runCommand(args: Record): Promise { const specs = args["specs"]?.split(","); const commit = args["commit"]; const outputFile = args["output"]; + const noiseCvThreshold = + args["noise-cv-threshold"] !== undefined + ? parseFloat(args["noise-cv-threshold"]) + : undefined; + const maxReruns = args["max-reruns"] ? parseInt(args["max-reruns"], 10) : undefined; + const rerunIterations = args["rerun-iterations"] + ? parseInt(args["rerun-iterations"], 10) + : undefined; const result = await runBenchmarks({ specsDir, @@ -134,6 +148,9 @@ async function runCommand(args: Record): Promise { warmup, specs, commit, + noiseCvThreshold, + maxReruns, + rerunIterations, }); await outputResult(JSON.stringify(result, null, 2), outputFile); @@ -205,6 +222,7 @@ function uploadPrCommentCommand(args: Record): void { outputDir, branch: args["branch"], threshold: args["threshold"] ? parseFloat(args["threshold"]) : undefined, + baselineWindow: args["baseline-window"] ? 
parseInt(args["baseline-window"], 10) : undefined, }); } diff --git a/packages/benchmark/src/format-comment.ts b/packages/benchmark/src/format-comment.ts index f41ce7fae2..5d91c4e72d 100644 --- a/packages/benchmark/src/format-comment.ts +++ b/packages/benchmark/src/format-comment.ts @@ -46,6 +46,13 @@ function formatPercent(pct: number): string { return `${sign}${pct.toFixed(1)}%`; } +function formatRef(value: string): string { + if (/^[a-f0-9]{7,40}$/i.test(value)) { + return value.slice(0, 7); + } + return value; +} + // ── Metric flattening helpers ────────────────────────────────────────────── interface FlatMetric { @@ -192,7 +199,7 @@ export function formatPrComment( const specNames = comparisons.map((c) => c.specName).join(", "); lines.push("
"); lines.push( - `Full details – comparing ${currentCommit.slice(0, 7)} vs baseline ${baselineCommit.slice(0, 7)}\n`, + `Full details – comparing ${formatRef(currentCommit)} vs baseline ${formatRef(baselineCommit)}\n`, ); lines.push("| Metric | Baseline | Current | Change |"); lines.push("|--------|----------|---------|--------|"); @@ -273,7 +280,7 @@ export function formatConsoleSummary( export function formatRunSummary(result: BenchmarkResult): string { const lines: string[] = []; lines.push("## ⚡ Benchmark Results\n"); - lines.push(`**Commit:** \`${result.commit.slice(0, 7)}\` `); + lines.push(`**Commit:** \`${formatRef(result.commit)}\` `); lines.push(`**Date:** ${result.timestamp} `); lines.push( `**Runner:** ${result.runner.os}, Node ${result.runner.nodeVersion}, ${result.runner.arch}\n`, @@ -300,6 +307,23 @@ export function formatRunSummary(result: BenchmarkResult): string { lines.push(""); lines.push(`> Averaged across ${specs.length} specs (${specNames.join(", ")}).`); + const specVariability = Object.entries(result.specs) + .map(([specName, spec]) => ({ + specName, + variability: spec.variability?.total, + })) + .filter((x) => x.variability !== undefined) + .sort((a, b) => (b.variability?.cv ?? 0) - (a.variability?.cv ?? 
0)); + + if (specVariability.length > 0) { + lines.push("> Total runtime variability (CV) per spec:"); + for (const entry of specVariability) { + const variability = entry.variability!; + lines.push( + `> - ${entry.specName}: ${(variability.cv * 100).toFixed(1)}% (n=${variability.sampleCount}, σ=${formatMs(variability.stdDev)})`, + ); + } + } lines.push(LEGEND); return lines.join("\n"); @@ -315,7 +339,7 @@ export function formatComparisonSummary( const lines: string[] = []; lines.push("## ⚡ Benchmark Comparison\n"); lines.push( - `Comparing [\`${currentCommit.slice(0, 7)}\`] against baseline [\`${baselineCommit.slice(0, 7)}\`]\n`, + `Comparing [\`${formatRef(currentCommit)}\`] against baseline [\`${formatRef(baselineCommit)}\`]\n`, ); const averaged = averageComparisonMetrics(comparisons); diff --git a/packages/benchmark/src/run.ts b/packages/benchmark/src/run.ts index 37eac8af8d..830c93e14c 100644 --- a/packages/benchmark/src/run.ts +++ b/packages/benchmark/src/run.ts @@ -5,8 +5,10 @@ import { readdir } from "fs/promises"; import os from "os"; import { join, resolve } from "path"; import { aggregateDurations } from "./aggregate.js"; +import { summarize } from "./statistics.js"; import type { BenchmarkResult, + NoiseGateInfo, RunnerInfo, RuntimeStats, SpecBenchmarkResult, @@ -27,6 +29,12 @@ export interface RunOptions { specs?: string[]; /** Git commit SHA to record. */ commit?: string; + /** If set, rerun a spec when total-runtime coefficient of variation exceeds threshold. */ + noiseCvThreshold?: number; + /** Max number of rerun cycles for noisy specs. */ + maxReruns?: number; + /** Number of additional measured iterations on each rerun (default: iterations). */ + rerunIterations?: number; } /** Discover benchmark spec directories under the given path. */ @@ -208,6 +216,9 @@ export async function runBenchmarks(options: RunOptions): Promise = {}; + const noiseCvThreshold = options.noiseCvThreshold; + const maxReruns = options.maxReruns ?? 
0; + const rerunIterations = options.rerunIterations ?? iterations; for (const specName of specNames) { const specDir = join(specsDir, specName); @@ -227,14 +238,52 @@ export async function runBenchmarks(options: RunOptions): Promise 0 && rerunIterations > 0) { + for (let rerun = 0; rerun < maxReruns; rerun++) { + const totalSummary = summarize(rawIterations.map((x) => x.runtime.total)); + if (totalSummary.cv <= noiseCvThreshold) { + break; + } + + rerunsPerformed++; + console.log( + ` Noise gate triggered (CV ${(totalSummary.cv * 100).toFixed(1)}% > ${(noiseCvThreshold * 100).toFixed(1)}%), running ${rerunIterations} extra iteration(s)...`, + ); + for (let i = 0; i < rerunIterations; i++) { + console.log(` Rerun iteration ${i + 1}/${rerunIterations}...`); + const stats = await compileSpec(specDir); + rawIterations.push(stats); + } + } + } + + const totalSummary = summarize(rawIterations.map((x) => x.runtime.total)); + const noiseGateInfo: NoiseGateInfo | undefined = + noiseCvThreshold === undefined + ? 
undefined + : { + thresholdCv: noiseCvThreshold, + maxReruns, + rerunIterations, + rerunsPerformed, + triggered: rerunsPerformed > 0, + }; + specs[specName] = { name: specName, - iterations, + iterations: rawIterations.length, stats: averageStats(rawIterations), rawIterations, + variability: { + total: totalSummary, + noiseGate: noiseGateInfo, + }, }; - console.log(` Total: ${specs[specName].stats.runtime.total.toFixed(1)}ms (avg)`); + console.log( + ` Total: ${specs[specName].stats.runtime.total.toFixed(1)}ms (avg), CV ${(totalSummary.cv * 100).toFixed(1)}%`, + ); } const commit = getGitCommit(options.commit); diff --git a/packages/benchmark/src/statistics.ts b/packages/benchmark/src/statistics.ts new file mode 100644 index 0000000000..dc0b5a5f35 --- /dev/null +++ b/packages/benchmark/src/statistics.ts @@ -0,0 +1,62 @@ +export interface DistributionStats { + mean: number; + median: number; + stdDev: number; + cv: number; + min: number; + max: number; + sampleCount: number; +} + +export function mean(values: number[]): number { + if (values.length === 0) return 0; + return values.reduce((sum, value) => sum + value, 0) / values.length; +} + +export function median(values: number[]): number { + if (values.length === 0) return 0; + const sorted = [...values].sort((a, b) => a - b); + const middle = Math.floor(sorted.length / 2); + if (sorted.length % 2 === 1) { + return sorted[middle]; + } + return (sorted[middle - 1] + sorted[middle]) / 2; +} + +export function stdDev(values: number[]): number { + if (values.length < 2) return 0; + const avg = mean(values); + const variance = values.reduce((sum, value) => sum + (value - avg) ** 2, 0) / values.length; + return Math.sqrt(variance); +} + +export function coefficientOfVariation(values: number[]): number { + if (values.length === 0) return 0; + const avg = mean(values); + if (avg === 0) return 0; + return stdDev(values) / avg; +} + +export function summarize(values: number[]): DistributionStats { + if (values.length === 0) 
{ + return { + mean: 0, + median: 0, + stdDev: 0, + cv: 0, + min: 0, + max: 0, + sampleCount: 0, + }; + } + + return { + mean: mean(values), + median: median(values), + stdDev: stdDev(values), + cv: coefficientOfVariation(values), + min: Math.min(...values), + max: Math.max(...values), + sampleCount: values.length, + }; +} diff --git a/packages/benchmark/src/types.ts b/packages/benchmark/src/types.ts index caf284c27c..8bf91f1120 100644 --- a/packages/benchmark/src/types.ts +++ b/packages/benchmark/src/types.ts @@ -37,6 +37,24 @@ export interface RuntimeStats { }; } +export interface MetricVariability { + mean: number; + median: number; + stdDev: number; + cv: number; + min: number; + max: number; + sampleCount: number; +} + +export interface NoiseGateInfo { + thresholdCv: number; + maxReruns: number; + rerunIterations: number; + rerunsPerformed: number; + triggered: boolean; +} + /** Benchmark result for a single spec across multiple iterations. */ export interface SpecBenchmarkResult { /** The spec name (directory name). */ @@ -47,6 +65,11 @@ export interface SpecBenchmarkResult { stats: Stats; /** Per-iteration raw stats. */ rawIterations: Stats[]; + /** Variability summary for measured iterations. */ + variability?: { + total: MetricVariability; + noiseGate?: NoiseGateInfo; + }; } /** Complete benchmark result set. 
*/ diff --git a/packages/benchmark/src/upload-pr-comment.ts b/packages/benchmark/src/upload-pr-comment.ts index 80a5b08cf4..99541bc56d 100644 --- a/packages/benchmark/src/upload-pr-comment.ts +++ b/packages/benchmark/src/upload-pr-comment.ts @@ -2,14 +2,16 @@ import { execSync } from "node:child_process"; import { appendFileSync, existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs"; import { join, resolve } from "node:path"; +import { aggregateDurations } from "./aggregate.js"; import { compareBenchmarks, hasNotableChanges } from "./compare.js"; import { formatComparisonSummary, formatConsoleSummary, formatPrComment, } from "./format-comment.js"; -import type { BenchmarkResult } from "./types.js"; +import type { BenchmarkResult, RuntimeStats, SpecBenchmarkResult } from "./types.js"; import { DEFAULT_BRANCH } from "./utils.js"; +import type { HistoryData } from "./generate-history.js"; export interface UploadPrCommentOptions { /** Path to the current benchmark results JSON file. */ @@ -22,9 +24,132 @@ export interface UploadPrCommentOptions { branch?: string; /** Percent threshold for notable changes. */ threshold?: number; + /** Number of latest entries to use for rolling baseline. */ + baselineWindow?: number; } -function fetchBaseline(branch: string): BenchmarkResult | undefined { +interface BaselineResult { + baseline: BenchmarkResult; + label: string; +} + +function unflattenRuntime(flat: Record): RuntimeStats { + const runtime: RuntimeStats = { + total: flat["total"] ?? 0, + loader: flat["loader"] ?? 0, + resolver: flat["resolver"] ?? 0, + checker: flat["checker"] ?? 0, + validation: { total: flat["validation"] ?? 0, validators: {} }, + linter: { total: flat["linter"] ?? 0, rules: {} }, + emit: { total: flat["emit"] ?? 
0, emitters: {} }, + }; + + for (const [label, value] of Object.entries(flat)) { + if (label.startsWith("validation/")) { + runtime.validation.validators[label.replace("validation/", "")] = value; + continue; + } + if (label.startsWith("linter/")) { + runtime.linter.rules[label.replace("linter/", "")] = value; + continue; + } + if (!label.startsWith("emit/")) { + continue; + } + + const parts = label.split("/"); + if (parts.length < 2) { + continue; + } + + const emitterName = parts[1]; + runtime.emit.emitters[emitterName] ??= { total: 0, steps: {} }; + if (parts.length === 2) { + runtime.emit.emitters[emitterName].total = value; + } else if (parts.length > 2) { + const stepName = parts.slice(2).join("/"); + runtime.emit.emitters[emitterName].steps[stepName] = value; + } + } + + return runtime; +} + +function aggregateSpecFromHistory( + specName: string, + entries: HistoryData["entries"], + currentSpec: SpecBenchmarkResult, +): SpecBenchmarkResult | undefined { + const samplesByMetric = new Map(); + for (const entry of entries) { + const metrics = entry.specMetrics[specName]; + if (!metrics) continue; + for (const [label, value] of Object.entries(metrics)) { + const samples = samplesByMetric.get(label); + if (samples) { + samples.push(value); + } else { + samplesByMetric.set(label, [value]); + } + } + } + + if (samplesByMetric.size === 0) { + return undefined; + } + + const aggregated: Record = {}; + for (const [label, samples] of samplesByMetric) { + aggregated[label] = aggregateDurations(samples); + } + + return { + ...currentSpec, + stats: { + ...currentSpec.stats, + runtime: unflattenRuntime(aggregated), + }, + }; +} + +function buildRollingBaseline( + history: HistoryData, + current: BenchmarkResult, + baselineWindow: number, +): BaselineResult | undefined { + const window = Math.max(1, baselineWindow); + const entries = history.entries.slice(-window); + if (entries.length === 0) { + return undefined; + } + + const specs: Record = {}; + for (const [specName, 
currentSpec] of Object.entries(current.specs)) { + const rollingSpec = aggregateSpecFromHistory(specName, entries, currentSpec); + if (!rollingSpec) { + continue; + } + specs[specName] = rollingSpec; + } + + if (Object.keys(specs).length === 0) { + return undefined; + } + + const firstCommit = entries[0]?.commit.slice(0, 7) ?? "unknown"; + const lastCommit = entries[entries.length - 1]?.commit.slice(0, 7) ?? "unknown"; + return { + baseline: { + ...current, + commit: `rolling:${firstCommit}..${lastCommit}`, + timestamp: new Date().toISOString(), + specs, + }, + label: `rolling baseline (${entries.length} main run${entries.length > 1 ? "s" : ""})`, + }; +} + +function fetchBaseline(branch: string, current: BenchmarkResult, baselineWindow: number): BaselineResult | undefined { try { const hasRemote = (() => { try { @@ -40,11 +165,28 @@ function fetchBaseline(branch: string): BenchmarkResult | undefined { } execSync(`git fetch origin ${branch}`, { stdio: "ignore" }); - const content = execSync(`git show origin/${branch}:results/latest.json`, { + try { + const historyContent = execSync(`git show origin/${branch}:results/history.json`, { + encoding: "utf-8", + maxBuffer: 50_000_000, + }); + const history = JSON.parse(historyContent) as HistoryData; + const rollingBaseline = buildRollingBaseline(history, current, baselineWindow); + if (rollingBaseline) { + return rollingBaseline; + } + } catch { + // ignore and fallback to latest.json + } + + const latestContent = execSync(`git show origin/${branch}:results/latest.json`, { encoding: "utf-8", maxBuffer: 50_000_000, }); - return JSON.parse(content) as BenchmarkResult; + return { + baseline: JSON.parse(latestContent) as BenchmarkResult, + label: "latest main benchmark", + }; } catch { return undefined; } @@ -64,25 +206,29 @@ export function uploadPrComment(options: UploadPrCommentOptions): void { const { resultsFile, prNumber, outputDir } = options; const branch = options.branch ?? 
DEFAULT_BRANCH; const threshold = options.threshold; + const baselineWindow = options.baselineWindow ?? 20; if (!existsSync(resultsFile)) { throw new Error(`Results file not found: ${resultsFile}`); } const current = JSON.parse(readFileSync(resolve(resultsFile), "utf-8")) as BenchmarkResult; - const baseline = fetchBaseline(branch); + const baselineResult = fetchBaseline(branch, current, baselineWindow); mkdirSync(outputDir, { recursive: true }); let commentMarkdown: string; let githubSummary: string | undefined; - if (baseline) { + if (baselineResult) { + const { baseline, label } = baselineResult; const comparisons = compareBenchmarks(baseline, current, { threshold }); - commentMarkdown = formatPrComment(comparisons, baseline.commit, current.commit, { threshold }); + commentMarkdown = formatPrComment(comparisons, `${baseline.commit} (${label})`, current.commit, { + threshold, + }); githubSummary = formatComparisonSummary( comparisons, - baseline.commit, + `${baseline.commit} (${label})`, current.commit, threshold, ); diff --git a/packages/benchmark/test/compare.test.ts b/packages/benchmark/test/compare.test.ts index 1344cac452..b2c86ecd9e 100644 --- a/packages/benchmark/test/compare.test.ts +++ b/packages/benchmark/test/compare.test.ts @@ -52,3 +52,16 @@ it("excludes metrics below minimum absolute threshold from regression summary", expect(topSummary).toContain("| checker |"); expect(topSummary).not.toContain("linter/noisy-rule"); }); + +it("keeps descriptive baseline labels in comments", () => { + const comparisons = [createComparison([createMetric("checker", 100, 99)])]; + const comment = formatPrComment( + comparisons, + "rolling:abc1234..def5678 (rolling baseline (20 main runs))", + "1234567890abcdef", + { threshold: 5 }, + ); + + expect(comment).toContain("rolling baseline (20 main runs)"); + expect(comment).toContain("1234567"); +}); diff --git a/packages/benchmark/test/statistics.test.ts b/packages/benchmark/test/statistics.test.ts new file mode 100644 
index 0000000000..406434d5c0 --- /dev/null +++ b/packages/benchmark/test/statistics.test.ts @@ -0,0 +1,17 @@ +import { expect, it } from "vitest"; +import { coefficientOfVariation, summarize } from "../src/statistics.js"; + +it("computes coefficient of variation", () => { + const cv = coefficientOfVariation([100, 102, 98, 100]); + expect(cv).toBeGreaterThan(0); + expect(cv).toBeLessThan(0.02); +}); + +it("summarizes value distributions", () => { + const summary = summarize([90, 100, 110]); + expect(summary.sampleCount).toBe(3); + expect(summary.mean).toBe(100); + expect(summary.median).toBe(100); + expect(summary.min).toBe(90); + expect(summary.max).toBe(110); +}); From e6f105943180f5ba36f1884069f2cb9d7b776c64 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 11 Jun 2026 21:50:35 +0000 Subject: [PATCH 2/7] Address benchmark review follow-ups Co-authored-by: timotheeguerin <1031227+timotheeguerin@users.noreply.github.com> --- packages/benchmark/src/format-comment.ts | 1 + packages/benchmark/src/statistics.ts | 2 +- packages/benchmark/src/upload-pr-comment.ts | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/packages/benchmark/src/format-comment.ts b/packages/benchmark/src/format-comment.ts index 5d91c4e72d..311ca57542 100644 --- a/packages/benchmark/src/format-comment.ts +++ b/packages/benchmark/src/format-comment.ts @@ -47,6 +47,7 @@ function formatPercent(pct: number): string { } function formatRef(value: string): string { + // Commit hashes are shortened for readability while descriptive labels are kept intact. 
if (/^[a-f0-9]{7,40}$/i.test(value)) { return value.slice(0, 7); } diff --git a/packages/benchmark/src/statistics.ts b/packages/benchmark/src/statistics.ts index dc0b5a5f35..64b3cbf081 100644 --- a/packages/benchmark/src/statistics.ts +++ b/packages/benchmark/src/statistics.ts @@ -26,7 +26,7 @@ export function median(values: number[]): number { export function stdDev(values: number[]): number { if (values.length < 2) return 0; const avg = mean(values); - const variance = values.reduce((sum, value) => sum + (value - avg) ** 2, 0) / values.length; + const variance = values.reduce((sum, value) => sum + (value - avg) ** 2, 0) / (values.length - 1); return Math.sqrt(variance); } diff --git a/packages/benchmark/src/upload-pr-comment.ts b/packages/benchmark/src/upload-pr-comment.ts index 99541bc56d..ab15c2b65b 100644 --- a/packages/benchmark/src/upload-pr-comment.ts +++ b/packages/benchmark/src/upload-pr-comment.ts @@ -141,7 +141,7 @@ function buildRollingBaseline( return { baseline: { ...current, - commit: `rolling:${firstCommit}..${lastCommit}`, + commit: `rolling-baseline-${firstCommit}-${lastCommit}`, timestamp: new Date().toISOString(), specs, }, From c226b3522b6dbac1665c9bc663d2f2659f6b235c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 15 Jun 2026 15:20:11 +0000 Subject: [PATCH 3/7] Fix benchmark formatting and spellcheck follow-up Co-authored-by: timotheeguerin <1031227+timotheeguerin@users.noreply.github.com> --- packages/benchmark/src/cli.ts | 4 +--- packages/benchmark/src/upload-pr-comment.ts | 23 ++++++++++++++------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/packages/benchmark/src/cli.ts b/packages/benchmark/src/cli.ts index 96b76fce10..82380f99cf 100644 --- a/packages/benchmark/src/cli.ts +++ b/packages/benchmark/src/cli.ts @@ -134,9 +134,7 @@ async function runCommand(args: Record): Promise { const commit = args["commit"]; const outputFile = args["output"]; const 
noiseCvThreshold = - args["noise-cv-threshold"] !== undefined - ? parseFloat(args["noise-cv-threshold"]) - : undefined; + args["noise-cv-threshold"] !== undefined ? parseFloat(args["noise-cv-threshold"]) : undefined; const maxReruns = args["max-reruns"] ? parseInt(args["max-reruns"], 10) : undefined; const rerunIterations = args["rerun-iterations"] ? parseInt(args["rerun-iterations"], 10) diff --git a/packages/benchmark/src/upload-pr-comment.ts b/packages/benchmark/src/upload-pr-comment.ts index ab15c2b65b..53e447bcc2 100644 --- a/packages/benchmark/src/upload-pr-comment.ts +++ b/packages/benchmark/src/upload-pr-comment.ts @@ -9,9 +9,9 @@ import { formatConsoleSummary, formatPrComment, } from "./format-comment.js"; +import type { HistoryData } from "./generate-history.js"; import type { BenchmarkResult, RuntimeStats, SpecBenchmarkResult } from "./types.js"; import { DEFAULT_BRANCH } from "./utils.js"; -import type { HistoryData } from "./generate-history.js"; export interface UploadPrCommentOptions { /** Path to the current benchmark results JSON file. */ @@ -33,7 +33,7 @@ interface BaselineResult { label: string; } -function unflattenRuntime(flat: Record): RuntimeStats { +function expandRuntimeMetrics(flat: Record): RuntimeStats { const runtime: RuntimeStats = { total: flat["total"] ?? 0, loader: flat["loader"] ?? 
0, @@ -107,7 +107,7 @@ function aggregateSpecFromHistory( ...currentSpec, stats: { ...currentSpec.stats, - runtime: unflattenRuntime(aggregated), + runtime: expandRuntimeMetrics(aggregated), }, }; } @@ -149,7 +149,11 @@ function buildRollingBaseline( }; } -function fetchBaseline(branch: string, current: BenchmarkResult, baselineWindow: number): BaselineResult | undefined { +function fetchBaseline( + branch: string, + current: BenchmarkResult, + baselineWindow: number, +): BaselineResult | undefined { try { const hasRemote = (() => { try { @@ -223,9 +227,14 @@ export function uploadPrComment(options: UploadPrCommentOptions): void { if (baselineResult) { const { baseline, label } = baselineResult; const comparisons = compareBenchmarks(baseline, current, { threshold }); - commentMarkdown = formatPrComment(comparisons, `${baseline.commit} (${label})`, current.commit, { - threshold, - }); + commentMarkdown = formatPrComment( + comparisons, + `${baseline.commit} (${label})`, + current.commit, + { + threshold, + }, + ); githubSummary = formatComparisonSummary( comparisons, `${baseline.commit} (${label})`, From 093bcfe208880a8ca402e5a71c0712c77b1e844e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 15 Jun 2026 16:04:47 +0000 Subject: [PATCH 4/7] Fix benchmark workflow Node engine mismatch Co-authored-by: timotheeguerin <1031227+timotheeguerin@users.noreply.github.com> --- .github/workflows/benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 2cb66c2905..101902c7a1 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -47,7 +47,7 @@ jobs: - uses: ./.github/actions/setup with: - node-version: 24.11.1 + node-version: 24.15.0 - name: Install dependencies run: pnpm install From 1248489ff41548260897805964c5b6c74e39e492 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" 
<198982749+Copilot@users.noreply.github.com> Date: Wed, 24 Jun 2026 15:31:14 +0000 Subject: [PATCH 5/7] Increase benchmark workflow Node heap limit to prevent OOM Co-authored-by: timotheeguerin <1031227+timotheeguerin@users.noreply.github.com> --- .github/workflows/benchmark.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 101902c7a1..72b3b9646a 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -58,7 +58,7 @@ jobs: - name: Run backfill if: ${{ github.event_name == 'workflow_dispatch' && inputs.backfill_from }} run: | - node packages/benchmark/dist/src/cli.js backfill \ + node --max-old-space-size=6144 packages/benchmark/dist/src/cli.js backfill \ --from ${{ inputs.backfill_from }} \ --specs-dir packages/benchmark/specs \ --iterations 25 \ @@ -69,7 +69,7 @@ jobs: - name: Run benchmarks if: ${{ github.event_name != 'workflow_dispatch' || !inputs.backfill_from }} run: | - node packages/benchmark/dist/src/cli.js run \ + node --max-old-space-size=6144 packages/benchmark/dist/src/cli.js run \ --specs-dir packages/benchmark/specs \ --iterations 25 \ --warmup 3 \ From e2284a1aede647dc92a854c94b39e2761760de93 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 24 Jun 2026 17:12:10 +0000 Subject: [PATCH 6/7] Isolate benchmark iterations to prevent benchmark-run OOM Co-authored-by: timotheeguerin <1031227+timotheeguerin@users.noreply.github.com> --- packages/benchmark/src/compile-once.ts | 52 ++++++++++++++++++ packages/benchmark/src/run.ts | 73 ++++++++++++-------------- 2 files changed, 87 insertions(+), 38 deletions(-) create mode 100644 packages/benchmark/src/compile-once.ts diff --git a/packages/benchmark/src/compile-once.ts b/packages/benchmark/src/compile-once.ts new file mode 100644 index 0000000000..29c80afc61 --- /dev/null +++ b/packages/benchmark/src/compile-once.ts @@ -0,0 
+1,52 @@ +/* eslint-disable no-console */ +import { compile, NodeHost, resolveCompilerOptions } from "@typespec/compiler"; +import { join } from "path"; +import type { Stats } from "./types.js"; + +async function compileSpec(specDir: string): Promise { + const mainFile = join(specDir, "main.tsp"); + const [options, diagnostics] = await resolveCompilerOptions(NodeHost, { + entrypoint: mainFile, + cwd: specDir, + }); + if (diagnostics.length > 0) { + const msgs = diagnostics.map((d: any) => ` ${d.message}`).join("\n"); + console.warn(`Warnings resolving options for ${specDir}:\n${msgs}`); + } + + const program = await compile(NodeHost, mainFile, { + ...options, + outputDir: join(specDir, "tsp-output"), + }); + + if (program.hasError()) { + const errorDiags = program.diagnostics + .filter((d: any) => d.severity === "error") + .map((d: any) => ` ${d.message}`) + .join("\n"); + throw new Error(`Compilation failed for ${specDir}:\n${errorDiags}`); + } + + const stats = (program as any).stats as Stats; + stats.runtime.total = + (stats.runtime.loader ?? 0) + + (stats.runtime.resolver ?? 0) + + (stats.runtime.checker ?? 0) + + (stats.runtime.validation?.total ?? 0) + + (stats.runtime.linter?.total ?? 
0); + return stats; +} + +async function main() { + const specDir = process.argv[2]; + if (!specDir) { + throw new Error("Missing spec directory"); + } + const stats = await compileSpec(specDir); + process.stdout.write(JSON.stringify(stats)); +} + +main().catch((error: Error) => { + console.error(error.message); + process.exit(1); +}); diff --git a/packages/benchmark/src/run.ts b/packages/benchmark/src/run.ts index 830c93e14c..8d7fb53947 100644 --- a/packages/benchmark/src/run.ts +++ b/packages/benchmark/src/run.ts @@ -1,9 +1,9 @@ /* eslint-disable no-console */ -import { compile, NodeHost, resolveCompilerOptions } from "@typespec/compiler"; -import { execSync } from "child_process"; +import { execSync, spawn } from "child_process"; import { readdir } from "fs/promises"; import os from "os"; import { join, resolve } from "path"; +import { fileURLToPath } from "url"; import { aggregateDurations } from "./aggregate.js"; import { summarize } from "./statistics.js"; import type { @@ -50,44 +50,41 @@ async function discoverSpecs(specsDir: string, filter?: string[]): Promise { - const mainFile = join(specDir, "main.tsp"); - const [options, diagnostics] = await resolveCompilerOptions(NodeHost, { - entrypoint: mainFile, - cwd: specDir, - }); - if (diagnostics.length > 0) { - const msgs = diagnostics.map((d: any) => ` ${d.message}`).join("\n"); - console.warn(` Warnings resolving options for ${specDir}:\n${msgs}`); - } +const compileOncePath = fileURLToPath(new URL("./compile-once.js", import.meta.url)); - const program = await compile(NodeHost, mainFile, { - ...options, - outputDir: join(specDir, "tsp-output"), +/** Compile a single spec in an isolated process and return its stats. 
*/ +async function compileSpec(specDir: string): Promise<Stats> { + return await new Promise((resolveResult, reject) => { + const child = spawn(process.execPath, [compileOncePath, specDir], { + stdio: ["ignore", "pipe", "pipe"], + }); + let stdout = ""; + let stderr = ""; + child.stdout.on("data", (chunk) => { + stdout += chunk.toString(); + }); + child.stderr.on("data", (chunk) => { + stderr += chunk.toString(); + }); + child.on("error", (error) => { + reject(error); + }); + child.on("close", (code) => { + if (code !== 0) { + reject(new Error(stderr.trim() || `Compilation process exited with code ${code}`)); + return; + } + try { + resolveResult(JSON.parse(stdout) as Stats); + } catch (error) { + reject( + new Error( + `Failed to parse benchmark stats output: ${error instanceof Error ? error.message : String(error)}`, + ), + ); + } + }); }); - - if (program.hasError()) { - const errorDiags = program.diagnostics - .filter((d: any) => d.severity === "error") - .map((d: any) => ` ${d.message}`) - .join("\n"); - throw new Error(`Compilation failed for ${specDir}:\n${errorDiags}`); - } - - // program.stats is @internal but available at runtime - const stats = (program as any).stats as Stats; - - // Recompute total without the emit stage so that adding more emitters - // does not inflate the "compilation" total metric. - stats.runtime.total = - (stats.runtime.loader ?? 0) + - (stats.runtime.resolver ?? 0) + - (stats.runtime.checker ?? 0) + - (stats.runtime.validation?.total ?? 0) + - (stats.runtime.linter?.total ?? 0); - - return stats; } /** Average multiple Stats objects. 
*/ From 3f2a5f9ad8bad12c89f6d9a1fdd02182741e5091 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 24 Jun 2026 17:17:47 +0000 Subject: [PATCH 7/7] Address benchmark review feedback on process output handling Co-authored-by: timotheeguerin <1031227+timotheeguerin@users.noreply.github.com> --- packages/benchmark/src/compile-once.ts | 2 +- packages/benchmark/src/run.ts | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/benchmark/src/compile-once.ts b/packages/benchmark/src/compile-once.ts index 29c80afc61..35fe31bebc 100644 --- a/packages/benchmark/src/compile-once.ts +++ b/packages/benchmark/src/compile-once.ts @@ -11,7 +11,7 @@ async function compileSpec(specDir: string): Promise<Stats> { }); if (diagnostics.length > 0) { const msgs = diagnostics.map((d: any) => ` ${d.message}`).join("\n"); - console.warn(`Warnings resolving options for ${specDir}:\n${msgs}`); + process.stderr.write(`Warnings resolving options for ${specDir}:\n${msgs}\n`); } const program = await compile(NodeHost, mainFile, { diff --git a/packages/benchmark/src/run.ts b/packages/benchmark/src/run.ts index 8d7fb53947..039a8982de 100644 --- a/packages/benchmark/src/run.ts +++ b/packages/benchmark/src/run.ts @@ -54,7 +54,7 @@ const compileOncePath = fileURLToPath(new URL("./compile-once.js", import.meta.u /** Compile a single spec in an isolated process and return its stats. */ async function compileSpec(specDir: string): Promise<Stats> { - return await new Promise((resolveResult, reject) => { + return await new Promise((resolve, reject) => { const child = spawn(process.execPath, [compileOncePath, specDir], { stdio: ["ignore", "pipe", "pipe"], }); @@ -75,7 +75,7 @@ async function compileSpec(specDir: string): Promise<Stats> { return; } try { - resolveResult(JSON.parse(stdout) as Stats); + resolve(JSON.parse(stdout) as Stats); } catch (error) { reject( new Error(