diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index a34ffd6882..72b3b9646a 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -19,6 +19,11 @@ on: required: false type: string default: "benchmark-data" + runner: + description: "Runner label (for stable runs prefer self-hosted or larger dedicated runner)" + required: false + type: string + default: "ubuntu-latest" permissions: contents: write @@ -30,7 +35,7 @@ concurrency: jobs: benchmark: name: Run Benchmarks - runs-on: ubuntu-latest + runs-on: ${{ github.event_name == 'workflow_dispatch' && inputs.runner || vars.BENCHMARK_RUNNER || 'ubuntu-latest' }} env: TYPESPEC_VS_CI_BUILD: true TYPESPEC_SKIP_WEBSITE_BUILD: true @@ -41,6 +46,8 @@ jobs: fetch-depth: 0 - uses: ./.github/actions/setup + with: + node-version: 24.15.0 - name: Install dependencies run: pnpm install @@ -51,21 +58,24 @@ jobs: - name: Run backfill if: ${{ github.event_name == 'workflow_dispatch' && inputs.backfill_from }} run: | - node packages/benchmark/dist/src/cli.js backfill \ + node --max-old-space-size=6144 packages/benchmark/dist/src/cli.js backfill \ --from ${{ inputs.backfill_from }} \ --specs-dir packages/benchmark/specs \ - --iterations 15 \ - --warmup 1 \ + --iterations 25 \ + --warmup 3 \ --branch ${{ inputs.branch }} \ --push - name: Run benchmarks if: ${{ github.event_name != 'workflow_dispatch' || !inputs.backfill_from }} run: | - node packages/benchmark/dist/src/cli.js run \ + node --max-old-space-size=6144 packages/benchmark/dist/src/cli.js run \ --specs-dir packages/benchmark/specs \ - --iterations 15 \ - --warmup 1 \ + --iterations 25 \ + --warmup 3 \ + --noise-cv-threshold 0.08 \ + --max-reruns 1 \ + --rerun-iterations 10 \ --commit ${{ github.sha }} \ --output /tmp/benchmark-results.json @@ -84,6 +94,7 @@ jobs: node packages/benchmark/dist/src/cli.js upload-pr-comment \ --results /tmp/benchmark-results.json \ --pr-number ${{ github.event.number }} \ + 
--baseline-window 20 \ --output-dir /tmp/benchmark-artifacts - name: Upload benchmark comment diff --git a/packages/benchmark/README.md b/packages/benchmark/README.md index 4b88a730f7..f2a031a0e7 100644 --- a/packages/benchmark/README.md +++ b/packages/benchmark/README.md @@ -7,8 +7,11 @@ Performance benchmarking tool for TypeSpec Azure compilation. Tracks compilation 1. **Benchmark runner** compiles dedicated TypeSpec specs using the compiler's programmatic API 2. The compiler provides built-in `Stats` data including per-stage timing and per-linter-rule breakdown 3. Runtime metrics are aggregated with an outlier-resistant estimator (trimmed mean for 5+ samples, median for smaller sample sizes) -4. Results are stored as JSON — on CI, they're saved to the `benchmark-data` branch -5. PR comments show a comparison table highlighting performance changes +4. Per-spec variability (standard deviation and coefficient of variation) is captured from raw iterations +5. Optional noise-gating can auto-run extra iterations when variance is high +6. PR baseline can be built from a rolling window of recent `main` results instead of only `latest.json` +7. Results are stored as JSON — on CI, they're saved to the `benchmark-data` branch +8. 
PR comments show a comparison table highlighting performance changes ## Local usage @@ -25,6 +28,9 @@ node packages/benchmark/dist/src/cli.js run --output results.json node packages/benchmark/dist/src/cli.js run \ --iterations 3 \ --warmup 1 \ + --noise-cv-threshold 0.08 \ + --max-reruns 1 \ + --rerun-iterations 5 \ --specs azure-core-dataplane,azure-arm-resource-manager \ --output results.json ``` @@ -78,6 +84,7 @@ The `.github/workflows/benchmark.yml` workflow: - **On push to `main`**: Runs benchmarks and stores results to the `benchmark-data` branch via the `store-results` CLI command - **On pull requests**: Runs benchmarks, fetches the baseline, compares, and generates a PR comment via the `upload-pr-comment` CLI command +- Benchmark PR baselines are generated from a rolling window of recent `main` runs when `results/history.json` is available, with fallback to `results/latest.json` ### Data storage diff --git a/packages/benchmark/src/cli.ts b/packages/benchmark/src/cli.ts index ab3670b362..82380f99cf 100644 --- a/packages/benchmark/src/cli.ts +++ b/packages/benchmark/src/cli.ts @@ -38,6 +38,11 @@ Run options: --specs-dir Directory containing benchmark specs (default: built-in specs) --iterations Number of measured iterations (default: 5) --warmup Number of warmup iterations (default: 1) + --noise-cv-threshold + Rerun when total-runtime coefficient of variation is above this value (e.g. 
0.08 = 8%) + --max-reruns Max rerun cycles when noise gate triggers (default: 0) + --rerun-iterations + Extra measured iterations per rerun (default: same as --iterations) --specs Comma-separated list of specific specs to run --commit Git commit SHA to record --output Output file for results JSON (default: stdout) @@ -64,6 +69,7 @@ Upload-pr-comment options: --pr-number Pull request number --output-dir Output directory for artifacts --branch Branch name for fetching baseline (default: benchmark-data) + --baseline-window Number of recent main results to build rolling baseline (default: 20) --threshold Percent threshold for notable changes (default: 5) Backfill options: @@ -127,6 +133,12 @@ async function runCommand(args: Record): Promise { const specs = args["specs"]?.split(","); const commit = args["commit"]; const outputFile = args["output"]; + const noiseCvThreshold = + args["noise-cv-threshold"] !== undefined ? parseFloat(args["noise-cv-threshold"]) : undefined; + const maxReruns = args["max-reruns"] ? parseInt(args["max-reruns"], 10) : undefined; + const rerunIterations = args["rerun-iterations"] + ? parseInt(args["rerun-iterations"], 10) + : undefined; const result = await runBenchmarks({ specsDir, @@ -134,6 +146,9 @@ async function runCommand(args: Record): Promise { warmup, specs, commit, + noiseCvThreshold, + maxReruns, + rerunIterations, }); await outputResult(JSON.stringify(result, null, 2), outputFile); @@ -205,6 +220,7 @@ function uploadPrCommentCommand(args: Record): void { outputDir, branch: args["branch"], threshold: args["threshold"] ? parseFloat(args["threshold"]) : undefined, + baselineWindow: args["baseline-window"] ? 
parseInt(args["baseline-window"], 10) : undefined, }); } diff --git a/packages/benchmark/src/compile-once.ts b/packages/benchmark/src/compile-once.ts new file mode 100644 index 0000000000..35fe31bebc --- /dev/null +++ b/packages/benchmark/src/compile-once.ts @@ -0,0 +1,52 @@ +/* eslint-disable no-console */ +import { compile, NodeHost, resolveCompilerOptions } from "@typespec/compiler"; +import { join } from "path"; +import type { Stats } from "./types.js"; + +async function compileSpec(specDir: string): Promise { + const mainFile = join(specDir, "main.tsp"); + const [options, diagnostics] = await resolveCompilerOptions(NodeHost, { + entrypoint: mainFile, + cwd: specDir, + }); + if (diagnostics.length > 0) { + const msgs = diagnostics.map((d: any) => ` ${d.message}`).join("\n"); + process.stderr.write(`Warnings resolving options for ${specDir}:\n${msgs}\n`); + } + + const program = await compile(NodeHost, mainFile, { + ...options, + outputDir: join(specDir, "tsp-output"), + }); + + if (program.hasError()) { + const errorDiags = program.diagnostics + .filter((d: any) => d.severity === "error") + .map((d: any) => ` ${d.message}`) + .join("\n"); + throw new Error(`Compilation failed for ${specDir}:\n${errorDiags}`); + } + + const stats = (program as any).stats as Stats; + stats.runtime.total = + (stats.runtime.loader ?? 0) + + (stats.runtime.resolver ?? 0) + + (stats.runtime.checker ?? 0) + + (stats.runtime.validation?.total ?? 0) + + (stats.runtime.linter?.total ?? 
/**
 * Matches a bare commit hash (7-40 hex characters), optionally followed by a
 * parenthesised label such as `" (latest main benchmark)"`.
 * Group 1 is the hash, group 2 (when present) is the whitespace plus label.
 */
const COMMIT_REF_PATTERN = /^([a-f0-9]{7,40})(\s+\(.*\))?$/is;

/**
 * Render a commit reference for display.
 *
 * - A bare commit hash is shortened to its first 7 characters.
 * - A hash followed by a parenthesised label (`"<sha> (<label>)"`) has only
 *   the hash shortened; the label is kept verbatim.
 * - Any other value (for example a descriptive label that does not start with
 *   a hash) is returned unchanged.
 *
 * @param value Commit hash, `"<hash> (<label>)"`, or free-form label.
 * @returns The display form of the reference.
 */
function formatRef(value: string): string {
  // The pattern is anchored at both ends, so `replace` leaves non-matching input untouched.
  return value.replace(
    COMMIT_REF_PATTERN,
    (_whole: string, hash: string, label: string | undefined) => hash.slice(0, 7) + (label ?? ""),
  );
}
"); lines.push( - `Full details – comparing ${currentCommit.slice(0, 7)} vs baseline ${baselineCommit.slice(0, 7)}\n`, + `Full details – comparing ${formatRef(currentCommit)} vs baseline ${formatRef(baselineCommit)}\n`, ); lines.push("| Metric | Baseline | Current | Change |"); lines.push("|--------|----------|---------|--------|"); @@ -273,7 +281,7 @@ export function formatConsoleSummary( export function formatRunSummary(result: BenchmarkResult): string { const lines: string[] = []; lines.push("## ⚡ Benchmark Results\n"); - lines.push(`**Commit:** \`${result.commit.slice(0, 7)}\` `); + lines.push(`**Commit:** \`${formatRef(result.commit)}\` `); lines.push(`**Date:** ${result.timestamp} `); lines.push( `**Runner:** ${result.runner.os}, Node ${result.runner.nodeVersion}, ${result.runner.arch}\n`, @@ -300,6 +308,23 @@ export function formatRunSummary(result: BenchmarkResult): string { lines.push(""); lines.push(`> Averaged across ${specs.length} specs (${specNames.join(", ")}).`); + const specVariability = Object.entries(result.specs) + .map(([specName, spec]) => ({ + specName, + variability: spec.variability?.total, + })) + .filter((x) => x.variability !== undefined) + .sort((a, b) => (b.variability?.cv ?? 0) - (a.variability?.cv ?? 
0)); + + if (specVariability.length > 0) { + lines.push("> Total runtime variability (CV) per spec:"); + for (const entry of specVariability) { + const variability = entry.variability!; + lines.push( + `> - ${entry.specName}: ${(variability.cv * 100).toFixed(1)}% (n=${variability.sampleCount}, σ=${formatMs(variability.stdDev)})`, + ); + } + } lines.push(LEGEND); return lines.join("\n"); @@ -315,7 +340,7 @@ export function formatComparisonSummary( const lines: string[] = []; lines.push("## ⚡ Benchmark Comparison\n"); lines.push( - `Comparing [\`${currentCommit.slice(0, 7)}\`] against baseline [\`${baselineCommit.slice(0, 7)}\`]\n`, + `Comparing [\`${formatRef(currentCommit)}\`] against baseline [\`${formatRef(baselineCommit)}\`]\n`, ); const averaged = averageComparisonMetrics(comparisons); diff --git a/packages/benchmark/src/run.ts b/packages/benchmark/src/run.ts index 37eac8af8d..039a8982de 100644 --- a/packages/benchmark/src/run.ts +++ b/packages/benchmark/src/run.ts @@ -1,12 +1,14 @@ /* eslint-disable no-console */ -import { compile, NodeHost, resolveCompilerOptions } from "@typespec/compiler"; -import { execSync } from "child_process"; +import { execSync, spawn } from "child_process"; import { readdir } from "fs/promises"; import os from "os"; import { join, resolve } from "path"; +import { fileURLToPath } from "url"; import { aggregateDurations } from "./aggregate.js"; +import { summarize } from "./statistics.js"; import type { BenchmarkResult, + NoiseGateInfo, RunnerInfo, RuntimeStats, SpecBenchmarkResult, @@ -27,6 +29,12 @@ export interface RunOptions { specs?: string[]; /** Git commit SHA to record. */ commit?: string; + /** If set, rerun a spec when total-runtime coefficient of variation exceeds threshold. */ + noiseCvThreshold?: number; + /** Max number of rerun cycles for noisy specs. */ + maxReruns?: number; + /** Number of additional measured iterations on each rerun (default: iterations). 
/** Absolute path of the sibling `compile-once.js` script that performs one compilation. */
const compileOncePath = fileURLToPath(new URL("./compile-once.js", import.meta.url));

/**
 * Compile a single spec in an isolated Node.js process and return its stats.
 *
 * The child runs `compile-once.js` with `specDir` as its only argument and is
 * expected to print the stats as a single JSON document on stdout.
 *
 * @param specDir Directory of the spec to compile.
 * @returns The stats parsed from the child's stdout.
 * @throws Error when the child cannot be spawned, exits with a non-zero code,
 *   is terminated by a signal, or prints output that is not valid JSON.
 */
async function compileSpec(specDir: string): Promise<Stats> {
  return await new Promise<Stats>((resolveStats, rejectStats) => {
    // `spawn` does not inherit the parent's Node/V8 flags (only `fork` does by default).
    // Forward them so options such as --max-old-space-size passed to the CLI also
    // apply to the process that actually performs the compilation.
    const child = spawn(process.execPath, [...process.execArgv, compileOncePath, specDir], {
      stdio: ["ignore", "pipe", "pipe"],
    });

    // Decode at the stream level so a multi-byte character split across two
    // chunks is not corrupted by per-chunk Buffer#toString().
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");

    let stdout = "";
    let stderr = "";
    child.stdout.on("data", (chunk: string) => {
      stdout += chunk;
    });
    child.stderr.on("data", (chunk: string) => {
      stderr += chunk;
    });

    child.on("error", (error) => {
      rejectStats(error);
    });

    child.on("close", (code, signal) => {
      if (code !== 0) {
        // `code` is null when the child was killed by a signal.
        const reason =
          code === null
            ? `Compilation process was terminated by signal ${signal}`
            : `Compilation process exited with code ${code}`;
        rejectStats(new Error(stderr.trim() || reason));
        return;
      }

      // The child reports non-fatal warnings on stderr; surface them instead of dropping them.
      if (stderr.trim() !== "") {
        console.warn(stderr.trimEnd());
      }

      try {
        resolveStats(JSON.parse(stdout) as Stats);
      } catch (error) {
        rejectStats(
          new Error(
            `Failed to parse benchmark stats output: ${error instanceof Error ? error.message : String(error)}`,
          ),
        );
      }
    });
  });
}
/** Summary statistics describing a sample of numeric measurements. */
export interface DistributionStats {
  /** Arithmetic mean (0 for an empty sample). */
  mean: number;
  /** Median (0 for an empty sample). */
  median: number;
  /** Sample standard deviation, using the n-1 denominator (0 for fewer than two samples). */
  stdDev: number;
  /** Coefficient of variation: `stdDev / mean` (0 when the mean is 0). */
  cv: number;
  /** Smallest value (0 for an empty sample). */
  min: number;
  /** Largest value (0 for an empty sample). */
  max: number;
  /** Number of values in the sample. */
  sampleCount: number;
}

/** Sample standard deviation of `values` around an already computed mean `avg`. */
function sampleStdDev(values: readonly number[], avg: number): number {
  if (values.length < 2) return 0;
  const squaredDeviations = values.reduce((sum, value) => sum + (value - avg) ** 2, 0);
  return Math.sqrt(squaredDeviations / (values.length - 1));
}

/**
 * Arithmetic mean of `values`.
 * @returns The mean, or 0 for an empty array.
 */
export function mean(values: readonly number[]): number {
  if (values.length === 0) return 0;
  return values.reduce((sum, value) => sum + value, 0) / values.length;
}

/**
 * Median of `values`; the input array is not modified.
 * @returns The middle value (average of the two middle values for an even
 *   count), or 0 for an empty array.
 */
export function median(values: readonly number[]): number {
  if (values.length === 0) return 0;
  const sorted = [...values].sort((a, b) => a - b);
  const middle = Math.floor(sorted.length / 2);
  if (sorted.length % 2 === 1) {
    return sorted[middle];
  }
  return (sorted[middle - 1] + sorted[middle]) / 2;
}

/**
 * Sample standard deviation (n-1 denominator) of `values`.
 * @returns The standard deviation, or 0 when fewer than two values are given.
 */
export function stdDev(values: readonly number[]): number {
  return sampleStdDev(values, mean(values));
}

/**
 * Coefficient of variation (`stdDev / mean`) of `values`.
 * @returns The ratio, or 0 when the array is empty or its mean is 0.
 */
export function coefficientOfVariation(values: readonly number[]): number {
  if (values.length === 0) return 0;
  const avg = mean(values);
  if (avg === 0) return 0;
  return sampleStdDev(values, avg) / avg;
}

/**
 * Compute all summary statistics for `values`.
 * @returns A {@link DistributionStats}; every field is 0 for an empty array.
 */
export function summarize(values: readonly number[]): DistributionStats {
  const sampleCount = values.length;
  if (sampleCount === 0) {
    return { mean: 0, median: 0, stdDev: 0, cv: 0, min: 0, max: 0, sampleCount: 0 };
  }

  // Compute the mean and deviation once and derive the remaining fields from them.
  const avg = mean(values);
  const deviation = sampleStdDev(values, avg);

  // Loop instead of Math.min(...values): spreading a very large array as call
  // arguments can exceed the engine's argument limit and throw a RangeError.
  let min = Infinity;
  let max = -Infinity;
  for (const value of values) {
    min = Math.min(min, value);
    max = Math.max(max, value);
  }

  return {
    mean: avg,
    median: median(values),
    stdDev: deviation,
    cv: avg === 0 ? 0 : deviation / avg,
    min,
    max,
    sampleCount,
  };
}
/** Window used when the requested size is not a number; matches the documented CLI default. */
const FALLBACK_BASELINE_WINDOW = 20;

/**
 * Rebuild a nested runtime-stats object from flat `label -> duration` metrics.
 *
 * Recognised labels:
 * - `total`, `loader`, `resolver`, `checker`, `validation`, `linter`, `emit`: stage totals (default 0)
 * - `validation/<name>`: per-validator duration
 * - `linter/<name>`: per-rule duration
 * - `emit/<emitter>`: emitter total
 * - `emit/<emitter>/<step...>`: emitter step; the step name keeps any further `/`
 *
 * Labels with an empty name after the prefix (e.g. `emit/`) and labels with an
 * unknown prefix are ignored.
 */
function expandRuntimeMetrics(flat: Record<string, number>): RuntimeStats {
  const validators: Record<string, number> = {};
  const rules: Record<string, number> = {};
  const emitters: Record<string, { total: number; steps: Record<string, number> }> = {};

  for (const [label, value] of Object.entries(flat)) {
    const separator = label.indexOf("/");
    if (separator === -1) {
      continue; // stage totals are read directly when the result is assembled
    }
    const stage = label.slice(0, separator);
    const name = label.slice(separator + 1);
    if (name === "") {
      continue;
    }

    if (stage === "validation") {
      validators[name] = value;
    } else if (stage === "linter") {
      rules[name] = value;
    } else if (stage === "emit") {
      const stepSeparator = name.indexOf("/");
      const emitterName = stepSeparator === -1 ? name : name.slice(0, stepSeparator);
      if (emitterName === "") {
        continue;
      }
      const emitter = (emitters[emitterName] ??= { total: 0, steps: {} });
      if (stepSeparator === -1) {
        emitter.total = value;
      } else {
        const stepName = name.slice(stepSeparator + 1);
        if (stepName !== "") {
          emitter.steps[stepName] = value;
        }
      }
    }
  }

  return {
    total: flat["total"] ?? 0,
    loader: flat["loader"] ?? 0,
    resolver: flat["resolver"] ?? 0,
    checker: flat["checker"] ?? 0,
    validation: { total: flat["validation"] ?? 0, validators },
    linter: { total: flat["linter"] ?? 0, rules },
    emit: { total: flat["emit"] ?? 0, emitters },
  };
}

/**
 * Build a baseline spec result by aggregating one spec's metrics across history entries.
 *
 * Each metric label is aggregated independently with `aggregateDurations`.
 * Entries that do not contain the spec, and values that are not finite
 * numbers, are skipped.
 *
 * @returns A copy of `currentSpec` whose `stats.runtime` is replaced by the
 *   aggregated metrics, or `undefined` when no entry has data for the spec.
 */
function aggregateSpecFromHistory(
  specName: string,
  entries: HistoryData["entries"],
  currentSpec: SpecBenchmarkResult,
): SpecBenchmarkResult | undefined {
  const samplesByMetric = new Map<string, number[]>();
  for (const entry of entries) {
    // Tolerate entries without `specMetrics` so one malformed entry does not
    // throw and discard the whole rolling baseline.
    const metrics = entry.specMetrics?.[specName];
    if (!metrics) continue;
    for (const [label, value] of Object.entries(metrics)) {
      // JSON can carry null for a metric; keep such values out of the aggregate.
      if (typeof value !== "number" || !Number.isFinite(value)) continue;
      const samples = samplesByMetric.get(label);
      if (samples) {
        samples.push(value);
      } else {
        samplesByMetric.set(label, [value]);
      }
    }
  }

  if (samplesByMetric.size === 0) {
    return undefined;
  }

  const aggregated: Record<string, number> = {};
  for (const [label, samples] of samplesByMetric) {
    aggregated[label] = aggregateDurations(samples);
  }

  // NOTE(review): every field other than `stats.runtime` (e.g. `rawIterations`,
  // `variability`, `iterations`) is inherited from the current run, not from
  // history - confirm the comparison only reads `stats.runtime`.
  return {
    ...currentSpec,
    stats: {
      ...currentSpec.stats,
      runtime: expandRuntimeMetrics(aggregated),
    },
  };
}

/**
 * Build a synthetic baseline from the last `baselineWindow` history entries.
 *
 * Only specs present in `current` are considered. The window is clamped to at
 * least 1, and a NaN window falls back to {@link FALLBACK_BASELINE_WINDOW}
 * rather than silently selecting the entire history.
 *
 * NOTE(review): assumes `history.entries` is ordered oldest to newest, since the
 * window is taken from the end of the array - confirm against the history writer.
 *
 * @returns The baseline and a human-readable label, or `undefined` when the
 *   history is empty or has no data for any current spec.
 */
function buildRollingBaseline(
  history: HistoryData,
  current: BenchmarkResult,
  baselineWindow: number,
): BaselineResult | undefined {
  const window = Number.isNaN(baselineWindow)
    ? FALLBACK_BASELINE_WINDOW
    : Math.max(1, Math.floor(baselineWindow));
  const entries = history.entries.slice(-window);
  if (entries.length === 0) {
    return undefined;
  }

  const specs: Record<string, SpecBenchmarkResult> = {};
  for (const [specName, currentSpec] of Object.entries(current.specs)) {
    const rollingSpec = aggregateSpecFromHistory(specName, entries, currentSpec);
    if (rollingSpec) {
      specs[specName] = rollingSpec;
    }
  }

  if (Object.keys(specs).length === 0) {
    return undefined;
  }

  const firstCommit = entries[0]?.commit.slice(0, 7) ?? "unknown";
  const lastCommit = entries[entries.length - 1]?.commit.slice(0, 7) ?? "unknown";
  return {
    baseline: {
      ...current,
      commit: `rolling-baseline-${firstCommit}-${lastCommit}`,
      timestamp: new Date().toISOString(),
      specs,
    },
    label: `rolling baseline (${entries.length} main run${entries.length > 1 ? "s" : ""})`,
  };
}
UploadPrCommentOptions): void { const { resultsFile, prNumber, outputDir } = options; const branch = options.branch ?? DEFAULT_BRANCH; const threshold = options.threshold; + const baselineWindow = options.baselineWindow ?? 20; if (!existsSync(resultsFile)) { throw new Error(`Results file not found: ${resultsFile}`); } const current = JSON.parse(readFileSync(resolve(resultsFile), "utf-8")) as BenchmarkResult; - const baseline = fetchBaseline(branch); + const baselineResult = fetchBaseline(branch, current, baselineWindow); mkdirSync(outputDir, { recursive: true }); let commentMarkdown: string; let githubSummary: string | undefined; - if (baseline) { + if (baselineResult) { + const { baseline, label } = baselineResult; const comparisons = compareBenchmarks(baseline, current, { threshold }); - commentMarkdown = formatPrComment(comparisons, baseline.commit, current.commit, { threshold }); + commentMarkdown = formatPrComment( + comparisons, + `${baseline.commit} (${label})`, + current.commit, + { + threshold, + }, + ); githubSummary = formatComparisonSummary( comparisons, - baseline.commit, + `${baseline.commit} (${label})`, current.commit, threshold, ); diff --git a/packages/benchmark/test/compare.test.ts b/packages/benchmark/test/compare.test.ts index 1344cac452..b2c86ecd9e 100644 --- a/packages/benchmark/test/compare.test.ts +++ b/packages/benchmark/test/compare.test.ts @@ -52,3 +52,16 @@ it("excludes metrics below minimum absolute threshold from regression summary", expect(topSummary).toContain("| checker |"); expect(topSummary).not.toContain("linter/noisy-rule"); }); + +it("keeps descriptive baseline labels in comments", () => { + const comparisons = [createComparison([createMetric("checker", 100, 99)])]; + const comment = formatPrComment( + comparisons, + "rolling:abc1234..def5678 (rolling baseline (20 main runs))", + "1234567890abcdef", + { threshold: 5 }, + ); + + expect(comment).toContain("rolling baseline (20 main runs)"); + 
expect(comment).toContain("1234567"); +}); diff --git a/packages/benchmark/test/statistics.test.ts b/packages/benchmark/test/statistics.test.ts new file mode 100644 index 0000000000..406434d5c0 --- /dev/null +++ b/packages/benchmark/test/statistics.test.ts @@ -0,0 +1,17 @@ +import { expect, it } from "vitest"; +import { coefficientOfVariation, summarize } from "../src/statistics.js"; + +it("computes coefficient of variation", () => { + const cv = coefficientOfVariation([100, 102, 98, 100]); + expect(cv).toBeGreaterThan(0); + expect(cv).toBeLessThan(0.02); +}); + +it("summarizes value distributions", () => { + const summary = summarize([90, 100, 110]); + expect(summary.sampleCount).toBe(3); + expect(summary.mean).toBe(100); + expect(summary.median).toBe(100); + expect(summary.min).toBe(90); + expect(summary.max).toBe(110); +});