From 468b5e6660ffbfffc3df417a501710404a58b882 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 11 Jun 2026 21:43:44 +0000
Subject: [PATCH 1/7] Implement benchmark variance and rolling baseline

Co-authored-by: timotheeguerin <1031227+timotheeguerin@users.noreply.github.com>
---
 .github/workflows/benchmark.yml             |  21 ++-
 packages/benchmark/README.md                |  11 +-
 packages/benchmark/src/cli.ts               |  18 +++
 packages/benchmark/src/format-comment.ts    |  30 +++-
 packages/benchmark/src/run.ts               |  53 ++++++-
 packages/benchmark/src/statistics.ts        |  62 ++++++++
 packages/benchmark/src/types.ts             |  23 +++
 packages/benchmark/src/upload-pr-comment.ts | 162 +++++++++++++++++++-
 packages/benchmark/test/compare.test.ts     |  13 ++
 packages/benchmark/test/statistics.test.ts  |  17 ++
 10 files changed, 390 insertions(+), 20 deletions(-)
 create mode 100644 packages/benchmark/src/statistics.ts
 create mode 100644 packages/benchmark/test/statistics.test.ts

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index a34ffd6882..2cb66c2905 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -19,6 +19,11 @@ on:
         required: false
         type: string
         default: "benchmark-data"
+      runner:
+        description: "Runner label (for stable runs prefer self-hosted or larger dedicated runner)"
+        required: false
+        type: string
+        default: "ubuntu-latest"
 
 permissions:
   contents: write
@@ -30,7 +35,7 @@ concurrency:
 jobs:
   benchmark:
     name: Run Benchmarks
-    runs-on: ubuntu-latest
+    runs-on: ${{ github.event_name == 'workflow_dispatch' && inputs.runner || vars.BENCHMARK_RUNNER || 'ubuntu-latest' }}
     env:
       TYPESPEC_VS_CI_BUILD: true
       TYPESPEC_SKIP_WEBSITE_BUILD: true
@@ -41,6 +46,8 @@ jobs:
           fetch-depth: 0
 
       - uses: ./.github/actions/setup
+        with:
+          node-version: 24.11.1
 
       - name: Install dependencies
         run: pnpm install
@@ -54,8 +61,8 @@ jobs:
           node packages/benchmark/dist/src/cli.js backfill \
             --from ${{ inputs.backfill_from }} \
             --specs-dir packages/benchmark/specs \
-            --iterations 15 \
-            --warmup 1 \
+            --iterations 25 \
+            --warmup 3 \
             --branch ${{ inputs.branch }} \
             --push
 
@@ -64,8 +71,11 @@ jobs:
         run: |
           node packages/benchmark/dist/src/cli.js run \
             --specs-dir packages/benchmark/specs \
-            --iterations 15 \
-            --warmup 1 \
+            --iterations 25 \
+            --warmup 3 \
+            --noise-cv-threshold 0.08 \
+            --max-reruns 1 \
+            --rerun-iterations 10 \
             --commit ${{ github.sha }} \
             --output /tmp/benchmark-results.json
 
@@ -84,6 +94,7 @@ jobs:
           node packages/benchmark/dist/src/cli.js upload-pr-comment \
             --results /tmp/benchmark-results.json \
             --pr-number ${{ github.event.number }} \
+            --baseline-window 20 \
             --output-dir /tmp/benchmark-artifacts
 
       - name: Upload benchmark comment
diff --git a/packages/benchmark/README.md b/packages/benchmark/README.md
index 4b88a730f7..f2a031a0e7 100644
--- a/packages/benchmark/README.md
+++ b/packages/benchmark/README.md
@@ -7,8 +7,11 @@ Performance benchmarking tool for TypeSpec Azure compilation. Tracks compilation
 1. **Benchmark runner** compiles dedicated TypeSpec specs using the compiler's programmatic API
 2. The compiler provides built-in `Stats` data including per-stage timing and per-linter-rule breakdown
 3. Runtime metrics are aggregated with an outlier-resistant estimator (trimmed mean for 5+ samples, median for smaller sample sizes)
-4. Results are stored as JSON — on CI, they're saved to the `benchmark-data` branch
-5. PR comments show a comparison table highlighting performance changes
+4. Per-spec variability (standard deviation and coefficient of variation) is captured from raw iterations
+5. Optional noise gating automatically runs extra iterations for a spec when the coefficient of variation of its total runtime exceeds a configured threshold
+6. The PR baseline can be built from a rolling window of recent `main` results instead of only `latest.json`
+7. Results are stored as JSON — on CI, they're saved to the `benchmark-data` branch
+8. PR comments show a comparison table highlighting performance changes
 
 ## Local usage
 
@@ -25,6 +28,9 @@ node packages/benchmark/dist/src/cli.js run --output results.json
 node packages/benchmark/dist/src/cli.js run \
   --iterations 3 \
   --warmup 1 \
+  --noise-cv-threshold 0.08 \
+  --max-reruns 1 \
+  --rerun-iterations 5 \
   --specs azure-core-dataplane,azure-arm-resource-manager \
   --output results.json
 ```
@@ -78,6 +84,7 @@ The `.github/workflows/benchmark.yml` workflow:
 
 - **On push to `main`**: Runs benchmarks and stores results to the `benchmark-data` branch via the `store-results` CLI command
 - **On pull requests**: Runs benchmarks, fetches the baseline, compares, and generates a PR comment via the `upload-pr-comment` CLI command
+- Benchmark PR baselines are generated from a rolling window of recent `main` runs when `results/history.json` is available, with fallback to `results/latest.json`
 
 ### Data storage
 
diff --git a/packages/benchmark/src/cli.ts b/packages/benchmark/src/cli.ts
index ab3670b362..96b76fce10 100644
--- a/packages/benchmark/src/cli.ts
+++ b/packages/benchmark/src/cli.ts
@@ -38,6 +38,11 @@ Run options:
   --specs-dir <dir>     Directory containing benchmark specs (default: built-in specs)
   --iterations <n>      Number of measured iterations (default: 5)
   --warmup <n>          Number of warmup iterations (default: 1)
+  --noise-cv-threshold <n>
+                        Rerun when total-runtime coefficient of variation is above this value (e.g. 0.08 = 8%)
+  --max-reruns <n>      Max rerun cycles when noise gate triggers (default: 0)
+  --rerun-iterations <n>
+                        Extra measured iterations per rerun (default: same as --iterations)
   --specs <name,...>    Comma-separated list of specific specs to run
   --commit <sha>        Git commit SHA to record
   --output <file>       Output file for results JSON (default: stdout)
@@ -64,6 +69,7 @@ Upload-pr-comment options:
   --pr-number <n>       Pull request number
   --output-dir <dir>    Output directory for artifacts
   --branch <name>       Branch name for fetching baseline (default: benchmark-data)
+  --baseline-window <n> Number of recent main results to build rolling baseline (default: 20)
   --threshold <n>       Percent threshold for notable changes (default: 5)
 
 Backfill options:
@@ -127,6 +133,14 @@ async function runCommand(args: Record<string, string>): Promise<void> {
   const specs = args["specs"]?.split(",");
   const commit = args["commit"];
   const outputFile = args["output"];
+  const noiseCvThreshold =
+    args["noise-cv-threshold"] !== undefined
+      ? parseFloat(args["noise-cv-threshold"])
+      : undefined;
+  const maxReruns = args["max-reruns"] ? parseInt(args["max-reruns"], 10) : undefined;
+  const rerunIterations = args["rerun-iterations"]
+    ? parseInt(args["rerun-iterations"], 10)
+    : undefined;
 
   const result = await runBenchmarks({
     specsDir,
@@ -134,6 +148,9 @@ async function runCommand(args: Record<string, string>): Promise<void> {
     warmup,
     specs,
     commit,
+    noiseCvThreshold,
+    maxReruns,
+    rerunIterations,
   });
 
   await outputResult(JSON.stringify(result, null, 2), outputFile);
@@ -205,6 +222,7 @@ function uploadPrCommentCommand(args: Record<string, string>): void {
     outputDir,
     branch: args["branch"],
     threshold: args["threshold"] ? parseFloat(args["threshold"]) : undefined,
+    baselineWindow: args["baseline-window"] ? parseInt(args["baseline-window"], 10) : undefined,
   });
 }
 
diff --git a/packages/benchmark/src/format-comment.ts b/packages/benchmark/src/format-comment.ts
index f41ce7fae2..5d91c4e72d 100644
--- a/packages/benchmark/src/format-comment.ts
+++ b/packages/benchmark/src/format-comment.ts
@@ -46,6 +46,13 @@ function formatPercent(pct: number): string {
   return `${sign}${pct.toFixed(1)}%`;
 }
 
+function formatRef(value: string): string {
+  if (/^[a-f0-9]{7,40}$/i.test(value)) {
+    return value.slice(0, 7);
+  }
+  return value;
+}
+
 // ── Metric flattening helpers ──────────────────────────────────────────────
 
 interface FlatMetric {
@@ -192,7 +199,7 @@ export function formatPrComment(
   const specNames = comparisons.map((c) => c.specName).join(", ");
   lines.push("<details>");
   lines.push(
-    `<summary>Full details – comparing <code>${currentCommit.slice(0, 7)}</code> vs baseline <code>${baselineCommit.slice(0, 7)}</code></summary>\n`,
+    `<summary>Full details – comparing <code>${formatRef(currentCommit)}</code> vs baseline <code>${formatRef(baselineCommit)}</code></summary>\n`,
   );
   lines.push("| Metric | Baseline | Current | Change |");
   lines.push("|--------|----------|---------|--------|");
@@ -273,7 +280,7 @@ export function formatConsoleSummary(
 export function formatRunSummary(result: BenchmarkResult): string {
   const lines: string[] = [];
   lines.push("## ⚡ Benchmark Results\n");
-  lines.push(`**Commit:** \`${result.commit.slice(0, 7)}\`  `);
+  lines.push(`**Commit:** \`${formatRef(result.commit)}\`  `);
   lines.push(`**Date:** ${result.timestamp}  `);
   lines.push(
     `**Runner:** ${result.runner.os}, Node ${result.runner.nodeVersion}, ${result.runner.arch}\n`,
@@ -300,6 +307,23 @@ export function formatRunSummary(result: BenchmarkResult): string {
 
   lines.push("");
   lines.push(`> Averaged across ${specs.length} specs (${specNames.join(", ")}).`);
+  const specVariability = Object.entries(result.specs)
+    .map(([specName, spec]) => ({
+      specName,
+      variability: spec.variability?.total,
+    }))
+    .filter((x) => x.variability !== undefined)
+    .sort((a, b) => (b.variability?.cv ?? 0) - (a.variability?.cv ?? 0));
+
+  if (specVariability.length > 0) {
+    lines.push("> Total runtime variability (CV) per spec:");
+    for (const entry of specVariability) {
+      const variability = entry.variability!;
+      lines.push(
+        `> - ${entry.specName}: ${(variability.cv * 100).toFixed(1)}% (n=${variability.sampleCount}, σ=${formatMs(variability.stdDev)})`,
+      );
+    }
+  }
   lines.push(LEGEND);
 
   return lines.join("\n");
@@ -315,7 +339,7 @@ export function formatComparisonSummary(
   const lines: string[] = [];
   lines.push("## ⚡ Benchmark Comparison\n");
   lines.push(
-    `Comparing [\`${currentCommit.slice(0, 7)}\`] against baseline [\`${baselineCommit.slice(0, 7)}\`]\n`,
+    `Comparing [\`${formatRef(currentCommit)}\`] against baseline [\`${formatRef(baselineCommit)}\`]\n`,
   );
 
   const averaged = averageComparisonMetrics(comparisons);
diff --git a/packages/benchmark/src/run.ts b/packages/benchmark/src/run.ts
index 37eac8af8d..830c93e14c 100644
--- a/packages/benchmark/src/run.ts
+++ b/packages/benchmark/src/run.ts
@@ -5,8 +5,10 @@ import { readdir } from "fs/promises";
 import os from "os";
 import { join, resolve } from "path";
 import { aggregateDurations } from "./aggregate.js";
+import { summarize } from "./statistics.js";
 import type {
   BenchmarkResult,
+  NoiseGateInfo,
   RunnerInfo,
   RuntimeStats,
   SpecBenchmarkResult,
@@ -27,6 +29,12 @@ export interface RunOptions {
   specs?: string[];
   /** Git commit SHA to record. */
   commit?: string;
+  /** If set, rerun a spec when total-runtime coefficient of variation exceeds threshold. */
+  noiseCvThreshold?: number;
+  /** Max number of rerun cycles for noisy specs. */
+  maxReruns?: number;
+  /** Number of additional measured iterations on each rerun (default: iterations). */
+  rerunIterations?: number;
 }
 
 /** Discover benchmark spec directories under the given path. */
@@ -208,6 +216,9 @@ export async function runBenchmarks(options: RunOptions): Promise<BenchmarkResul
   );
 
   const specs: Record<string, SpecBenchmarkResult> = {};
+  const noiseCvThreshold = options.noiseCvThreshold;
+  const maxReruns = options.maxReruns ?? 0;
+  const rerunIterations = options.rerunIterations ?? iterations;
 
   for (const specName of specNames) {
     const specDir = join(specsDir, specName);
@@ -227,14 +238,52 @@ export async function runBenchmarks(options: RunOptions): Promise<BenchmarkResul
       rawIterations.push(stats);
     }
 
+    let rerunsPerformed = 0;
+    if (noiseCvThreshold !== undefined && maxReruns > 0 && rerunIterations > 0) {
+      for (let rerun = 0; rerun < maxReruns; rerun++) {
+        const totalSummary = summarize(rawIterations.map((x) => x.runtime.total));
+        if (totalSummary.cv <= noiseCvThreshold) {
+          break;
+        }
+
+        rerunsPerformed++;
+        console.log(
+          `    Noise gate triggered (CV ${(totalSummary.cv * 100).toFixed(1)}% > ${(noiseCvThreshold * 100).toFixed(1)}%), running ${rerunIterations} extra iteration(s)...`,
+        );
+        for (let i = 0; i < rerunIterations; i++) {
+          console.log(`    Rerun iteration ${i + 1}/${rerunIterations}...`);
+          const stats = await compileSpec(specDir);
+          rawIterations.push(stats);
+        }
+      }
+    }
+
+    const totalSummary = summarize(rawIterations.map((x) => x.runtime.total));
+    const noiseGateInfo: NoiseGateInfo | undefined =
+      noiseCvThreshold === undefined
+        ? undefined
+        : {
+            thresholdCv: noiseCvThreshold,
+            maxReruns,
+            rerunIterations,
+            rerunsPerformed,
+            triggered: rerunsPerformed > 0,
+          };
+
     specs[specName] = {
       name: specName,
-      iterations,
+      iterations: rawIterations.length,
       stats: averageStats(rawIterations),
       rawIterations,
+      variability: {
+        total: totalSummary,
+        noiseGate: noiseGateInfo,
+      },
     };
 
-    console.log(`    Total: ${specs[specName].stats.runtime.total.toFixed(1)}ms (avg)`);
+    console.log(
+      `    Total: ${specs[specName].stats.runtime.total.toFixed(1)}ms (avg), CV ${(totalSummary.cv * 100).toFixed(1)}%`,
+    );
   }
 
   const commit = getGitCommit(options.commit);
diff --git a/packages/benchmark/src/statistics.ts b/packages/benchmark/src/statistics.ts
new file mode 100644
index 0000000000..dc0b5a5f35
--- /dev/null
+++ b/packages/benchmark/src/statistics.ts
@@ -0,0 +1,62 @@
+export interface DistributionStats { // summary statistics for one list of numeric samples, as produced by summarize()
+  mean: number; // arithmetic mean of the samples
+  median: number; // middle sorted value (average of the two middle values for an even count)
+  stdDev: number; // standard deviation of the samples
+  cv: number; // coefficient of variation: stdDev / mean, reported as 0 when the mean is 0
+  min: number; // smallest sample
+  max: number; // largest sample
+  sampleCount: number; // number of samples summarized; all fields are 0 when this is 0
+}
+
+export function mean(values: number[]): number { // arithmetic mean of `values`
+  if (values.length === 0) return 0; // empty input yields 0 instead of NaN from 0 / 0
+  return values.reduce((sum, value) => sum + value, 0) / values.length;
+}
+
+export function median(values: number[]): number { // median of `values`; 0 for an empty list
+  if (values.length === 0) return 0;
+  const sorted = [...values].sort((a, b) => a - b); // numeric sort on a copy, so the caller's array is not mutated
+  const middle = Math.floor(sorted.length / 2);
+  if (sorted.length % 2 === 1) {
+    return sorted[middle]; // odd count: the single middle element
+  }
+  return (sorted[middle - 1] + sorted[middle]) / 2; // even count: average of the two middle elements
+}
+
+export function stdDev(values: number[]): number {
+  if (values.length < 2) return 0;
+  const avg = mean(values);
+  const variance = values.reduce((sum, value) => sum + (value - avg) ** 2, 0) / values.length;
+  return Math.sqrt(variance);
+}
+
+export function coefficientOfVariation(values: number[]): number { // stdDev(values) / mean(values), as a ratio (0.08 = 8%)
+  if (values.length === 0) return 0;
+  const avg = mean(values);
+  if (avg === 0) return 0; // avoid dividing by zero when the samples average to 0
+  return stdDev(values) / avg;
+}
+
+export function summarize(values: number[]): DistributionStats { // computes every DistributionStats field for `values`
+  if (values.length === 0) { // no samples: report zeros (Math.min()/Math.max() with no arguments would give ±Infinity)
+    return {
+      mean: 0,
+      median: 0,
+      stdDev: 0,
+      cv: 0,
+      min: 0,
+      max: 0,
+      sampleCount: 0,
+    };
+  }
+
+  return {
+    mean: mean(values),
+    median: median(values),
+    stdDev: stdDev(values),
+    cv: coefficientOfVariation(values),
+    min: Math.min(...values), // spread passes every sample as a separate argument
+    max: Math.max(...values),
+    sampleCount: values.length,
+  };
+}
diff --git a/packages/benchmark/src/types.ts b/packages/benchmark/src/types.ts
index caf284c27c..8bf91f1120 100644
--- a/packages/benchmark/src/types.ts
+++ b/packages/benchmark/src/types.ts
@@ -37,6 +37,24 @@ export interface RuntimeStats {
   };
 }
 
+export interface MetricVariability { // distribution summary of one metric across measured iterations
+  mean: number; // arithmetic mean of the samples
+  median: number; // middle value of the sorted samples
+  stdDev: number; // standard deviation of the samples
+  cv: number; // coefficient of variation (stdDev / mean), as a ratio rather than a percentage
+  min: number; // smallest sample
+  max: number; // largest sample
+  sampleCount: number; // number of samples the summary was computed from
+}
+
+export interface NoiseGateInfo { // records how the noise gate was configured and what it did for one spec
+  thresholdCv: number; // CV of total runtime above which extra iterations are run
+  maxReruns: number; // upper bound on rerun cycles
+  rerunIterations: number; // extra measured iterations added by each rerun cycle
+  rerunsPerformed: number; // rerun cycles that actually ran
+  triggered: boolean; // true when at least one rerun cycle ran
+}
+
 /** Benchmark result for a single spec across multiple iterations. */
 export interface SpecBenchmarkResult {
   /** The spec name (directory name). */
@@ -47,6 +65,11 @@ export interface SpecBenchmarkResult {
   stats: Stats;
   /** Per-iteration raw stats. */
   rawIterations: Stats[];
+  /** Variability summary for measured iterations. */
+  variability?: {
+    total: MetricVariability;
+    noiseGate?: NoiseGateInfo;
+  };
 }
 
 /** Complete benchmark result set. */
diff --git a/packages/benchmark/src/upload-pr-comment.ts b/packages/benchmark/src/upload-pr-comment.ts
index 80a5b08cf4..99541bc56d 100644
--- a/packages/benchmark/src/upload-pr-comment.ts
+++ b/packages/benchmark/src/upload-pr-comment.ts
@@ -2,14 +2,16 @@
 import { execSync } from "node:child_process";
 import { appendFileSync, existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
 import { join, resolve } from "node:path";
+import { aggregateDurations } from "./aggregate.js";
 import { compareBenchmarks, hasNotableChanges } from "./compare.js";
 import {
   formatComparisonSummary,
   formatConsoleSummary,
   formatPrComment,
 } from "./format-comment.js";
-import type { BenchmarkResult } from "./types.js";
+import type { BenchmarkResult, RuntimeStats, SpecBenchmarkResult } from "./types.js";
 import { DEFAULT_BRANCH } from "./utils.js";
+import type { HistoryData } from "./generate-history.js";
 
 export interface UploadPrCommentOptions {
   /** Path to the current benchmark results JSON file. */
@@ -22,9 +24,132 @@ export interface UploadPrCommentOptions {
   branch?: string;
   /** Percent threshold for notable changes. */
   threshold?: number;
+  /** Number of latest entries to use for rolling baseline. */
+  baselineWindow?: number;
 }
 
-function fetchBaseline(branch: string): BenchmarkResult | undefined {
+interface BaselineResult {
+  baseline: BenchmarkResult;
+  label: string;
+}
+
+function unflattenRuntime(flat: Record<string, number>): RuntimeStats {
+  const runtime: RuntimeStats = {
+    total: flat["total"] ?? 0,
+    loader: flat["loader"] ?? 0,
+    resolver: flat["resolver"] ?? 0, // a stage missing from `flat` defaults to 0
+    checker: flat["checker"] ?? 0,
+    validation: { total: flat["validation"] ?? 0, validators: {} }, // nested maps start empty and are filled by the loop below
+    linter: { total: flat["linter"] ?? 0, rules: {} },
+    emit: { total: flat["emit"] ?? 0, emitters: {} },
+  };
+
+  for (const [label, value] of Object.entries(flat)) { // route "group/name" labels into the nested maps
+    if (label.startsWith("validation/")) {
+      runtime.validation.validators[label.replace("validation/", "")] = value; // string-pattern replace removes only the first match, i.e. the prefix
+      continue;
+    }
+    if (label.startsWith("linter/")) {
+      runtime.linter.rules[label.replace("linter/", "")] = value;
+      continue;
+    }
+    if (!label.startsWith("emit/")) { // every other label is ignored here; the stage totals were read above
+      continue;
+    }
+
+    const parts = label.split("/");
+    if (parts.length < 2) { // defensive: a label starting with "emit/" always splits into at least two parts
+      continue;
+    }
+
+    const emitterName = parts[1];
+    runtime.emit.emitters[emitterName] ??= { total: 0, steps: {} }; // create the emitter bucket the first time it is seen
+    if (parts.length === 2) {
+      runtime.emit.emitters[emitterName].total = value; // "emit/<emitter>" is the emitter's own total
+    } else if (parts.length > 2) {
+      const stepName = parts.slice(2).join("/"); // "emit/<emitter>/<step>": the step name keeps any further slashes
+      runtime.emit.emitters[emitterName].steps[stepName] = value;
+    }
+  }
+
+  return runtime; // nested RuntimeStats rebuilt from the flat label -> value map
+}
+
+function aggregateSpecFromHistory(
+  specName: string,
+  entries: HistoryData["entries"],
+  currentSpec: SpecBenchmarkResult,
+): SpecBenchmarkResult | undefined { // baseline for one spec aggregated over `entries`; undefined when no entry has metrics for it
+  const samplesByMetric = new Map<string, number[]>(); // metric label -> one sample per history entry that has it
+  for (const entry of entries) {
+    const metrics = entry.specMetrics[specName];
+    if (!metrics) continue; // this history entry has no data for the spec
+    for (const [label, value] of Object.entries(metrics)) {
+      const samples = samplesByMetric.get(label);
+      if (samples) {
+        samples.push(value);
+      } else {
+        samplesByMetric.set(label, [value]); // first sample seen for this label
+      }
+    }
+  }
+
+  if (samplesByMetric.size === 0) { // no entry contributed any metric for this spec
+    return undefined;
+  }
+
+  const aggregated: Record<string, number> = {};
+  for (const [label, samples] of samplesByMetric) {
+    aggregated[label] = aggregateDurations(samples); // collapse each label's samples into a single value
+  }
+
+  return { // NOTE(review): only stats.runtime comes from history; every other field is copied from the current result
+    ...currentSpec,
+    stats: {
+      ...currentSpec.stats,
+      runtime: unflattenRuntime(aggregated),
+    },
+  };
+}
+
+function buildRollingBaseline(
+  history: HistoryData,
+  current: BenchmarkResult,
+  baselineWindow: number,
+): BaselineResult | undefined { // synthesizes a baseline from the tail of the history; undefined when nothing usable is found
+  const window = Math.max(1, baselineWindow); // clamp to at least 1; NOTE(review): a NaN window stays NaN and slice() then keeps every entry
+  const entries = history.entries.slice(-window); // last `window` entries — presumably the most recent runs; confirm history ordering
+  if (entries.length === 0) {
+    return undefined;
+  }
+
+  const specs: Record<string, SpecBenchmarkResult> = {};
+  for (const [specName, currentSpec] of Object.entries(current.specs)) { // only specs present in the current result are considered
+    const rollingSpec = aggregateSpecFromHistory(specName, entries, currentSpec);
+    if (!rollingSpec) { // spec has no metrics in the selected entries, so it gets no baseline
+      continue;
+    }
+    specs[specName] = rollingSpec;
+  }
+
+  if (Object.keys(specs).length === 0) { // none of the current specs appear in the selected entries
+    return undefined;
+  }
+
+  const firstCommit = entries[0]?.commit.slice(0, 7) ?? "unknown"; // 7-character abbreviation of the first entry's commit
+  const lastCommit = entries[entries.length - 1]?.commit.slice(0, 7) ?? "unknown";
+  return {
+    baseline: {
+      ...current,
+      commit: `rolling:${firstCommit}..${lastCommit}`,
+      timestamp: new Date().toISOString(),
+      specs,
+    },
+    label: `rolling baseline (${entries.length} main run${entries.length > 1 ? "s" : ""})`,
+  };
+}
+
+function fetchBaseline(branch: string, current: BenchmarkResult, baselineWindow: number): BaselineResult | undefined {
   try {
     const hasRemote = (() => {
       try {
@@ -40,11 +165,28 @@ function fetchBaseline(branch: string): BenchmarkResult | undefined {
     }
 
     execSync(`git fetch origin ${branch}`, { stdio: "ignore" });
-    const content = execSync(`git show origin/${branch}:results/latest.json`, {
+    try {
+      const historyContent = execSync(`git show origin/${branch}:results/history.json`, {
+        encoding: "utf-8",
+        maxBuffer: 50_000_000,
+      });
+      const history = JSON.parse(historyContent) as HistoryData;
+      const rollingBaseline = buildRollingBaseline(history, current, baselineWindow);
+      if (rollingBaseline) {
+        return rollingBaseline;
+      }
+    } catch {
+      // ignore and fallback to latest.json
+    }
+
+    const latestContent = execSync(`git show origin/${branch}:results/latest.json`, {
       encoding: "utf-8",
       maxBuffer: 50_000_000,
     });
-    return JSON.parse(content) as BenchmarkResult;
+    return {
+      baseline: JSON.parse(latestContent) as BenchmarkResult,
+      label: "latest main benchmark",
+    };
   } catch {
     return undefined;
   }
@@ -64,25 +206,29 @@ export function uploadPrComment(options: UploadPrCommentOptions): void {
   const { resultsFile, prNumber, outputDir } = options;
   const branch = options.branch ?? DEFAULT_BRANCH;
   const threshold = options.threshold;
+  const baselineWindow = options.baselineWindow ?? 20;
 
   if (!existsSync(resultsFile)) {
     throw new Error(`Results file not found: ${resultsFile}`);
   }
 
   const current = JSON.parse(readFileSync(resolve(resultsFile), "utf-8")) as BenchmarkResult;
-  const baseline = fetchBaseline(branch);
+  const baselineResult = fetchBaseline(branch, current, baselineWindow);
 
   mkdirSync(outputDir, { recursive: true });
 
   let commentMarkdown: string;
   let githubSummary: string | undefined;
 
-  if (baseline) {
+  if (baselineResult) {
+    const { baseline, label } = baselineResult;
     const comparisons = compareBenchmarks(baseline, current, { threshold });
-    commentMarkdown = formatPrComment(comparisons, baseline.commit, current.commit, { threshold });
+    commentMarkdown = formatPrComment(comparisons, `${baseline.commit} (${label})`, current.commit, {
+      threshold,
+    });
     githubSummary = formatComparisonSummary(
       comparisons,
-      baseline.commit,
+      `${baseline.commit} (${label})`,
       current.commit,
       threshold,
     );
diff --git a/packages/benchmark/test/compare.test.ts b/packages/benchmark/test/compare.test.ts
index 1344cac452..b2c86ecd9e 100644
--- a/packages/benchmark/test/compare.test.ts
+++ b/packages/benchmark/test/compare.test.ts
@@ -52,3 +52,16 @@ it("excludes metrics below minimum absolute threshold from regression summary",
   expect(topSummary).toContain("| checker |");
   expect(topSummary).not.toContain("linter/noisy-rule");
 });
+
+it("keeps descriptive baseline labels in comments", () => { // baseline refs that are not bare hashes must not be truncated
+  const comparisons = [createComparison([createMetric("checker", 100, 99)])];
+  const comment = formatPrComment(
+    comparisons,
+    "rolling:abc1234..def5678 (rolling baseline (20 main runs))", // baseline ref: descriptive text, not a bare hex hash
+    "1234567890abcdef", // current ref: 16 hex characters, expected to be shortened to the first 7
+    { threshold: 5 },
+  );
+
+  expect(comment).toContain("rolling baseline (20 main runs)"); // label survives intact
+  expect(comment).toContain("<code>1234567</code>"); // hash-like ref is abbreviated
+});
diff --git a/packages/benchmark/test/statistics.test.ts b/packages/benchmark/test/statistics.test.ts
new file mode 100644
index 0000000000..406434d5c0
--- /dev/null
+++ b/packages/benchmark/test/statistics.test.ts
@@ -0,0 +1,17 @@
+import { expect, it } from "vitest";
+import { coefficientOfVariation, summarize } from "../src/statistics.js";
+
+it("computes coefficient of variation", () => { // tightly clustered samples around 100 give a small, non-zero CV
+  const cv = coefficientOfVariation([100, 102, 98, 100]);
+  expect(cv).toBeGreaterThan(0); // the samples are not all equal
+  expect(cv).toBeLessThan(0.02); // spread is under 2% of the mean
+});
+
+it("summarizes value distributions", () => { // checks the summary fields for a symmetric three-sample input
+  const summary = summarize([90, 100, 110]);
+  expect(summary.sampleCount).toBe(3);
+  expect(summary.mean).toBe(100); // (90 + 100 + 110) / 3
+  expect(summary.median).toBe(100); // middle of the sorted samples
+  expect(summary.min).toBe(90);
+  expect(summary.max).toBe(110);
+});

From e6f105943180f5ba36f1884069f2cb9d7b776c64 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 11 Jun 2026 21:50:35 +0000
Subject: [PATCH 2/7] Address benchmark review follow-ups

Co-authored-by: timotheeguerin <1031227+timotheeguerin@users.noreply.github.com>
---
 packages/benchmark/src/format-comment.ts    | 1 +
 packages/benchmark/src/statistics.ts        | 2 +-
 packages/benchmark/src/upload-pr-comment.ts | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/packages/benchmark/src/format-comment.ts b/packages/benchmark/src/format-comment.ts
index 5d91c4e72d..311ca57542 100644
--- a/packages/benchmark/src/format-comment.ts
+++ b/packages/benchmark/src/format-comment.ts
@@ -47,6 +47,7 @@ function formatPercent(pct: number): string {
 }
 
 function formatRef(value: string): string {
+  // Commit hashes are shortened for readability while descriptive labels are kept intact.
   if (/^[a-f0-9]{7,40}$/i.test(value)) {
     return value.slice(0, 7);
   }
diff --git a/packages/benchmark/src/statistics.ts b/packages/benchmark/src/statistics.ts
index dc0b5a5f35..64b3cbf081 100644
--- a/packages/benchmark/src/statistics.ts
+++ b/packages/benchmark/src/statistics.ts
@@ -26,7 +26,7 @@ export function median(values: number[]): number {
 export function stdDev(values: number[]): number {
   if (values.length < 2) return 0;
   const avg = mean(values);
-  const variance = values.reduce((sum, value) => sum + (value - avg) ** 2, 0) / values.length;
+  const variance = values.reduce((sum, value) => sum + (value - avg) ** 2, 0) / (values.length - 1);
   return Math.sqrt(variance);
 }
 
diff --git a/packages/benchmark/src/upload-pr-comment.ts b/packages/benchmark/src/upload-pr-comment.ts
index 99541bc56d..ab15c2b65b 100644
--- a/packages/benchmark/src/upload-pr-comment.ts
+++ b/packages/benchmark/src/upload-pr-comment.ts
@@ -141,7 +141,7 @@ function buildRollingBaseline(
   return {
     baseline: {
       ...current,
-      commit: `rolling:${firstCommit}..${lastCommit}`,
+      commit: `rolling-baseline-${firstCommit}-${lastCommit}`,
       timestamp: new Date().toISOString(),
       specs,
     },

From c226b3522b6dbac1665c9bc663d2f2659f6b235c Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 15 Jun 2026 15:20:11 +0000
Subject: [PATCH 3/7] Fix benchmark formatting and spellcheck follow-up

Co-authored-by: timotheeguerin <1031227+timotheeguerin@users.noreply.github.com>
---
 packages/benchmark/src/cli.ts               |  4 +---
 packages/benchmark/src/upload-pr-comment.ts | 23 ++++++++++++++-------
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/packages/benchmark/src/cli.ts b/packages/benchmark/src/cli.ts
index 96b76fce10..82380f99cf 100644
--- a/packages/benchmark/src/cli.ts
+++ b/packages/benchmark/src/cli.ts
@@ -134,9 +134,7 @@ async function runCommand(args: Record<string, string>): Promise<void> {
   const commit = args["commit"];
   const outputFile = args["output"];
   const noiseCvThreshold =
-    args["noise-cv-threshold"] !== undefined
-      ? parseFloat(args["noise-cv-threshold"])
-      : undefined;
+    args["noise-cv-threshold"] !== undefined ? parseFloat(args["noise-cv-threshold"]) : undefined;
   const maxReruns = args["max-reruns"] ? parseInt(args["max-reruns"], 10) : undefined;
   const rerunIterations = args["rerun-iterations"]
     ? parseInt(args["rerun-iterations"], 10)
diff --git a/packages/benchmark/src/upload-pr-comment.ts b/packages/benchmark/src/upload-pr-comment.ts
index ab15c2b65b..53e447bcc2 100644
--- a/packages/benchmark/src/upload-pr-comment.ts
+++ b/packages/benchmark/src/upload-pr-comment.ts
@@ -9,9 +9,9 @@ import {
   formatConsoleSummary,
   formatPrComment,
 } from "./format-comment.js";
+import type { HistoryData } from "./generate-history.js";
 import type { BenchmarkResult, RuntimeStats, SpecBenchmarkResult } from "./types.js";
 import { DEFAULT_BRANCH } from "./utils.js";
-import type { HistoryData } from "./generate-history.js";
 
 export interface UploadPrCommentOptions {
   /** Path to the current benchmark results JSON file. */
@@ -33,7 +33,7 @@ interface BaselineResult {
   label: string;
 }
 
-function unflattenRuntime(flat: Record<string, number>): RuntimeStats {
+function expandRuntimeMetrics(flat: Record<string, number>): RuntimeStats {
   const runtime: RuntimeStats = {
     total: flat["total"] ?? 0,
     loader: flat["loader"] ?? 0,
@@ -107,7 +107,7 @@ function aggregateSpecFromHistory(
     ...currentSpec,
     stats: {
       ...currentSpec.stats,
-      runtime: unflattenRuntime(aggregated),
+      runtime: expandRuntimeMetrics(aggregated),
     },
   };
 }
@@ -149,7 +149,11 @@ function buildRollingBaseline(
   };
 }
 
-function fetchBaseline(branch: string, current: BenchmarkResult, baselineWindow: number): BaselineResult | undefined {
+function fetchBaseline(
+  branch: string,
+  current: BenchmarkResult,
+  baselineWindow: number,
+): BaselineResult | undefined {
   try {
     const hasRemote = (() => {
       try {
@@ -223,9 +227,14 @@ export function uploadPrComment(options: UploadPrCommentOptions): void {
   if (baselineResult) {
     const { baseline, label } = baselineResult;
     const comparisons = compareBenchmarks(baseline, current, { threshold });
-    commentMarkdown = formatPrComment(comparisons, `${baseline.commit} (${label})`, current.commit, {
-      threshold,
-    });
+    commentMarkdown = formatPrComment(
+      comparisons,
+      `${baseline.commit} (${label})`,
+      current.commit,
+      {
+        threshold,
+      },
+    );
     githubSummary = formatComparisonSummary(
       comparisons,
       `${baseline.commit} (${label})`,

From 093bcfe208880a8ca402e5a71c0712c77b1e844e Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 15 Jun 2026 16:04:47 +0000
Subject: [PATCH 4/7] Fix benchmark workflow Node engine mismatch

Co-authored-by: timotheeguerin <1031227+timotheeguerin@users.noreply.github.com>
---
 .github/workflows/benchmark.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 2cb66c2905..101902c7a1 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -47,7 +47,7 @@ jobs:
 
       - uses: ./.github/actions/setup
         with:
-          node-version: 24.11.1
+          node-version: 24.15.0
 
       - name: Install dependencies
         run: pnpm install

From 1248489ff41548260897805964c5b6c74e39e492 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 24 Jun 2026 15:31:14 +0000
Subject: [PATCH 5/7] Increase benchmark workflow Node heap limit to prevent
 OOM

Co-authored-by: timotheeguerin <1031227+timotheeguerin@users.noreply.github.com>
---
 .github/workflows/benchmark.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 101902c7a1..72b3b9646a 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -58,7 +58,7 @@ jobs:
       - name: Run backfill
         if: ${{ github.event_name == 'workflow_dispatch' && inputs.backfill_from }}
         run: |
-          node packages/benchmark/dist/src/cli.js backfill \
+          node --max-old-space-size=6144 packages/benchmark/dist/src/cli.js backfill \
             --from ${{ inputs.backfill_from }} \
             --specs-dir packages/benchmark/specs \
             --iterations 25 \
@@ -69,7 +69,7 @@ jobs:
       - name: Run benchmarks
         if: ${{ github.event_name != 'workflow_dispatch' || !inputs.backfill_from }}
         run: |
-          node packages/benchmark/dist/src/cli.js run \
+          node --max-old-space-size=6144 packages/benchmark/dist/src/cli.js run \
             --specs-dir packages/benchmark/specs \
             --iterations 25 \
             --warmup 3 \

From e2284a1aede647dc92a854c94b39e2761760de93 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 24 Jun 2026 17:12:10 +0000
Subject: [PATCH 6/7] Isolate benchmark iterations to prevent benchmark-run OOM

Co-authored-by: timotheeguerin <1031227+timotheeguerin@users.noreply.github.com>
---
 packages/benchmark/src/compile-once.ts | 52 ++++++++++++++++++
 packages/benchmark/src/run.ts          | 73 ++++++++++++--------------
 2 files changed, 87 insertions(+), 38 deletions(-)
 create mode 100644 packages/benchmark/src/compile-once.ts

diff --git a/packages/benchmark/src/compile-once.ts b/packages/benchmark/src/compile-once.ts
new file mode 100644
index 0000000000..29c80afc61
--- /dev/null
+++ b/packages/benchmark/src/compile-once.ts
@@ -0,0 +1,52 @@
+/* eslint-disable no-console */
+import { compile, NodeHost, resolveCompilerOptions } from "@typespec/compiler";
+import { join } from "path";
+import type { Stats } from "./types.js";
+
+async function compileSpec(specDir: string): Promise<Stats> {
+  const mainFile = join(specDir, "main.tsp");
+  const [options, diagnostics] = await resolveCompilerOptions(NodeHost, {
+    entrypoint: mainFile,
+    cwd: specDir,
+  });
+  if (diagnostics.length > 0) {
+    const msgs = diagnostics.map((d: any) => `  ${d.message}`).join("\n");
+    console.warn(`Warnings resolving options for ${specDir}:\n${msgs}`);
+  }
+
+  const program = await compile(NodeHost, mainFile, {
+    ...options,
+    outputDir: join(specDir, "tsp-output"),
+  });
+
+  if (program.hasError()) {
+    const errorDiags = program.diagnostics
+      .filter((d: any) => d.severity === "error")
+      .map((d: any) => `  ${d.message}`)
+      .join("\n");
+    throw new Error(`Compilation failed for ${specDir}:\n${errorDiags}`);
+  }
+
+  const stats = (program as any).stats as Stats;
+  stats.runtime.total =
+    (stats.runtime.loader ?? 0) +
+    (stats.runtime.resolver ?? 0) +
+    (stats.runtime.checker ?? 0) +
+    (stats.runtime.validation?.total ?? 0) +
+    (stats.runtime.linter?.total ?? 0);
+  return stats;
+}
+
+async function main() {
+  const specDir = process.argv[2];
+  if (!specDir) {
+    throw new Error("Missing spec directory");
+  }
+  const stats = await compileSpec(specDir);
+  process.stdout.write(JSON.stringify(stats));
+}
+
+main().catch((error: Error) => {
+  console.error(error.message);
+  process.exit(1);
+});
diff --git a/packages/benchmark/src/run.ts b/packages/benchmark/src/run.ts
index 830c93e14c..8d7fb53947 100644
--- a/packages/benchmark/src/run.ts
+++ b/packages/benchmark/src/run.ts
@@ -1,9 +1,9 @@
 /* eslint-disable no-console */
-import { compile, NodeHost, resolveCompilerOptions } from "@typespec/compiler";
-import { execSync } from "child_process";
+import { execSync, spawn } from "child_process";
 import { readdir } from "fs/promises";
 import os from "os";
 import { join, resolve } from "path";
+import { fileURLToPath } from "url";
 import { aggregateDurations } from "./aggregate.js";
 import { summarize } from "./statistics.js";
 import type {
@@ -50,44 +50,41 @@ async function discoverSpecs(specsDir: string, filter?: string[]): Promise<strin
   return dirs;
 }
 
-/** Compile a single spec and return its stats. */
-async function compileSpec(specDir: string): Promise<Stats> {
-  const mainFile = join(specDir, "main.tsp");
-  const [options, diagnostics] = await resolveCompilerOptions(NodeHost, {
-    entrypoint: mainFile,
-    cwd: specDir,
-  });
-  if (diagnostics.length > 0) {
-    const msgs = diagnostics.map((d: any) => `  ${d.message}`).join("\n");
-    console.warn(`  Warnings resolving options for ${specDir}:\n${msgs}`);
-  }
+const compileOncePath = fileURLToPath(new URL("./compile-once.js", import.meta.url));
 
-  const program = await compile(NodeHost, mainFile, {
-    ...options,
-    outputDir: join(specDir, "tsp-output"),
+/** Compile a single spec in an isolated process and return its stats. */
+async function compileSpec(specDir: string): Promise<Stats> {
+  return await new Promise<Stats>((resolveResult, reject) => {
+    const child = spawn(process.execPath, [compileOncePath, specDir], {
+      stdio: ["ignore", "pipe", "pipe"],
+    });
+    let stdout = "";
+    let stderr = "";
+    child.stdout.on("data", (chunk) => {
+      stdout += chunk.toString();
+    });
+    child.stderr.on("data", (chunk) => {
+      stderr += chunk.toString();
+    });
+    child.on("error", (error) => {
+      reject(error);
+    });
+    child.on("close", (code) => {
+      if (code !== 0) {
+        reject(new Error(stderr.trim() || `Compilation process exited with code ${code}`));
+        return;
+      }
+      try {
+        resolveResult(JSON.parse(stdout) as Stats);
+      } catch (error) {
+        reject(
+          new Error(
+            `Failed to parse benchmark stats output: ${error instanceof Error ? error.message : String(error)}`,
+          ),
+        );
+      }
+    });
   });
-
-  if (program.hasError()) {
-    const errorDiags = program.diagnostics
-      .filter((d: any) => d.severity === "error")
-      .map((d: any) => `  ${d.message}`)
-      .join("\n");
-    throw new Error(`Compilation failed for ${specDir}:\n${errorDiags}`);
-  }
-
-  // program.stats is @internal but available at runtime
-  const stats = (program as any).stats as Stats;
-
-  // Recompute total without the emit stage so that adding more emitters
-  // does not inflate the "compilation" total metric.
-  stats.runtime.total =
-    (stats.runtime.loader ?? 0) +
-    (stats.runtime.resolver ?? 0) +
-    (stats.runtime.checker ?? 0) +
-    (stats.runtime.validation?.total ?? 0) +
-    (stats.runtime.linter?.total ?? 0);
-
-  return stats;
 }
 
 /** Average multiple Stats objects. */

From 3f2a5f9ad8bad12c89f6d9a1fdd02182741e5091 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 24 Jun 2026 17:17:47 +0000
Subject: [PATCH 7/7] Address benchmark review feedback on process output
 handling

Co-authored-by: timotheeguerin <1031227+timotheeguerin@users.noreply.github.com>
---
 packages/benchmark/src/compile-once.ts | 2 +-
 packages/benchmark/src/run.ts          | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/packages/benchmark/src/compile-once.ts b/packages/benchmark/src/compile-once.ts
index 29c80afc61..35fe31bebc 100644
--- a/packages/benchmark/src/compile-once.ts
+++ b/packages/benchmark/src/compile-once.ts
@@ -11,7 +11,7 @@ async function compileSpec(specDir: string): Promise<Stats> {
   });
   if (diagnostics.length > 0) {
     const msgs = diagnostics.map((d: any) => `  ${d.message}`).join("\n");
-    console.warn(`Warnings resolving options for ${specDir}:\n${msgs}`);
+    process.stderr.write(`Warnings resolving options for ${specDir}:\n${msgs}\n`);
   }
 
   const program = await compile(NodeHost, mainFile, {
diff --git a/packages/benchmark/src/run.ts b/packages/benchmark/src/run.ts
index 8d7fb53947..039a8982de 100644
--- a/packages/benchmark/src/run.ts
+++ b/packages/benchmark/src/run.ts
@@ -54,7 +54,7 @@ const compileOncePath = fileURLToPath(new URL("./compile-once.js", import.meta.u
 
 /** Compile a single spec in an isolated process and return its stats. */
 async function compileSpec(specDir: string): Promise<Stats> {
-  return await new Promise<Stats>((resolveResult, reject) => {
+  return await new Promise<Stats>((resolve, reject) => {
     const child = spawn(process.execPath, [compileOncePath, specDir], {
       stdio: ["ignore", "pipe", "pipe"],
     });
@@ -75,7 +75,7 @@ async function compileSpec(specDir: string): Promise<Stats> {
         return;
       }
       try {
-        resolveResult(JSON.parse(stdout) as Stats);
+        resolve(JSON.parse(stdout) as Stats);
       } catch (error) {
         reject(
           new Error(