Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .github/workflows/benchmarks-report.yml
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,21 @@ jobs:
echo "number=$NUMBER" >> "$GITHUB_OUTPUT"
echo "Resolved PR number: $NUMBER"

# Walk prior successful bench runs on this PR's branch and build a
# per-iteration history. The reporter merges this with bench-history.json
# (main commits) so peak attribution spans BOTH main AND this PR's
# iterations — an agent sees "iteration 3 was the best on update-10th;
# your current iteration regressed from that."
- name: Fetch PR iteration history
  env:
    GH_TOKEN: ${{ steps.bot-token.outputs.token }}
    # Context values are handed to the script through the environment
    # instead of being expanded inline in `run:`. A branch name is chosen
    # by the PR author, so expanding it directly into the shell script
    # would let a crafted name break out of the quotes and run arbitrary
    # commands in a step that holds the bot token.
    HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }}
    REPO: ${{ github.repository }}
    CURRENT_RUN_ID: ${{ github.event.workflow_run.id }}
  run: |
    node tools/bench-reporter/fetch-pr-history.js \
      --branch "$HEAD_BRANCH" \
      --repo "$REPO" \
      --current-run-id "$CURRENT_RUN_ID" \
      --out pr-history.json

- name: Generate report
env:
STARTED: ${{ github.event.workflow_run.created_at }}
Expand All @@ -98,6 +113,7 @@ jobs:
--run-id '${{ github.event.workflow_run.id }}' \
--base-ref 'main' \
--repo '${{ github.repository }}' \
--pr-history pr-history.json \
--wall-clock "$WALL_CLOCK" \
--out bench-report

Expand Down
155 changes: 155 additions & 0 deletions tools/bench-reporter/fetch-pr-history.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
#!/usr/bin/env node
/*
Fetch prior bench results from this PR's branch to build a per-iteration
history. Walks completed Benchmarks workflow runs, downloads their
results-* artifacts, extracts per-metric absolute CIs, and outputs a
pr-history.json in the same schema as bench-history.json.

The reporter merges this PR-iteration history with bench-history.json
(main-commit history) to compute cross-run peak attribution. An agent
iterating on a perf branch sees: "iteration 3 was the best on
update-10th; your current iteration regressed from that."

Usage:
node fetch-pr-history.js \
--branch <name> PR head branch name
--repo <owner/name> GitHub repo slug
--current-run-id <id> exclude this run from history (it's the one we're reporting)
--max-runs <n> cap at N most recent prior runs (default: 20)
--out <path> output path (default: ./pr-history.json)

Requires `gh` CLI authenticated (GITHUB_TOKEN or GH_TOKEN in env).
*/

import { execSync } from 'node:child_process';
import fs from 'node:fs';
import path from 'node:path';

// ---------------------------------------------------------------------------
// Script body: resolve CLI options, list prior successful Benchmarks runs on
// the PR branch, download each run's results-* artifacts, and write the
// collected per-run metrics to the output file.
// ---------------------------------------------------------------------------

const args = parseArgs(process.argv.slice(2));
const branch = required(args, 'branch');
const repo = required(args, 'repo');
const currentRunId = args['current-run-id'] ?? '';
const maxRuns = Number(args['max-runs'] ?? 20);
const outPath = args.out ?? './pr-history.json';

// Reject values that would otherwise be interpolated as `--limit NaN` (or a
// zero / negative limit) further down.
if (!Number.isInteger(maxRuns) || maxRuns < 1) {
  console.error(`Invalid --max-runs: ${args['max-runs']} (expected a positive integer)`);
  process.exit(1);
}

// Single-quote a value for a POSIX shell command line. The commands below are
// passed to a shell as one string, and the branch name is chosen by the PR
// author, so it must never be interpolated unquoted.
const shellQuote = (value) => `'${String(value).replace(/'/g, "'\\''")}'`;

// List Benchmarks workflow runs for this branch only. Filtering on the gh
// side keeps the `--limit` window from being filled by runs of other
// branches, which would otherwise hide this PR's own history on a busy repo.
// The limit is doubled to leave room for the current run and for
// non-successful runs, both of which are dropped below.
const runsRaw = exec(
  `gh run list --repo ${shellQuote(repo)} --branch ${shellQuote(branch)} `
  + `--workflow=benchmarks.yml --limit ${maxRuns * 2} `
  + `--json databaseId,conclusion,headBranch,headSha,displayTitle,createdAt`,
);
const allRuns = JSON.parse(runsRaw);
const prRuns = allRuns
  .filter((r) => r.headBranch === branch && r.conclusion === 'success')
  // Compare as strings: the CLI argument is a string, databaseId is numeric.
  .filter((r) => String(r.databaseId) !== String(currentRunId))
  .slice(0, maxRuns);

console.log(`Found ${prRuns.length} prior successful bench runs on ${branch}`);

const commits = [];
for (const run of prRuns) {
  const dir = fs.mkdtempSync('/tmp/pr-hist-');
  try {
    try {
      exec(
        `gh run download ${shellQuote(run.databaseId)} --repo ${shellQuote(repo)} `
        + `--pattern "results-*" --dir ${shellQuote(dir)}`,
      );
    }
    catch {
      // Best effort: a run whose artifacts cannot be downloaded is skipped
      // rather than failing the whole history fetch.
      console.log(` Skip ${run.databaseId} (artifact download failed)`);
      continue;
    }

    const metrics = loadMetrics(dir);
    if (Object.keys(metrics).length === 0) {
      console.log(` Skip ${run.databaseId} (no metrics)`);
      continue;
    }

    commits.push({
      sha: run.headSha,
      msg: run.displayTitle,
      parent_sha: '',
      timestamp: run.createdAt,
      pr: null,
      metrics,
    });
    console.log(` ${run.headSha.slice(0, 7)} — ${Object.keys(metrics).length} metrics`);
  }
  finally {
    // Remove the per-run scratch directory on every path (success, skip or
    // throw) so downloaded artifacts do not pile up under /tmp.
    fs.rmSync(dir, { recursive: true, force: true });
  }
}

// Chronological order (oldest first) so peak-index → bisect-candidates
// after peak produces a causal timeline.
commits.sort((a, b) => a.timestamp.localeCompare(b.timestamp));

fs.writeFileSync(outPath, JSON.stringify({ schema_version: 1, commits }, null, 2) + '\n');
console.log(`Wrote ${commits.length} entries to ${outPath}`);

/**
 * Walk a results directory and extract one { ci, mean_ms } entry per
 * metric from every `*.json` file that has a top-level `benchmarks` array.
 *
 * Only benchmarks whose name (the part before ` [`) is `this-change` are
 * used. `ci` is the rounded [low, high] pair from `bm.mean`, and `mean_ms`
 * is the rounded midpoint of that interval. The original note says this
 * mirrors the extraction in append-history.js.
 *
 * Extraction is best effort: unreadable or unparsable files, JSON documents
 * that are not objects with a `benchmarks` array, null entries, and entries
 * without finite numeric `mean.low` / `mean.high` are skipped instead of
 * aborting the run.
 *
 * @param {string} dir - Root directory to scan recursively.
 * @returns {Object<string, {ci: number[], mean_ms: number}>} Metrics keyed
 *   by `bm.measurement.name`, falling back to `bm.name`. When several
 *   benchmarks share a key, the last one visited wins.
 */
function loadMetrics(dir) {
  const out = {};
  for (const entry of walk(dir)) {
    if (!entry.endsWith('.json')) { continue; }
    let data;
    try {
      data = JSON.parse(fs.readFileSync(entry, 'utf8'));
    }
    catch {
      // Not valid JSON (or unreadable) — not a results file we can use.
      continue;
    }
    // `null` and primitives are valid JSON documents, so guard the property
    // access instead of assuming an object.
    if (!Array.isArray(data?.benchmarks)) { continue; }

    for (const bm of data.benchmarks) {
      const label = typeof bm?.name === 'string' ? bm.name : '';
      if (label.split(' [')[0] !== 'this-change') { continue; }

      const low = bm.mean?.low;
      const high = bm.mean?.high;
      // Skip entries without a usable interval; rounding a non-number
      // would throw and discard every other metric with it.
      if (!Number.isFinite(low) || !Number.isFinite(high)) { continue; }

      const metricName = bm.measurement?.name ?? bm.name;
      out[metricName] = {
        ci: [round4(low), round4(high)],
        mean_ms: round4((low + high) / 2),
      };
    }
  }
  return out;
}

/**
 * Round a number to at most four decimal places.
 *
 * @param {number} n - Value to round.
 * @returns {number} `n` formatted with four fixed decimals and converted
 *   back to a number (so trailing zeros disappear).
 */
function round4(n) {
  const fixed = n.toFixed(4);
  return Number(fixed);
}

/**
 * Lazily yield the path of every non-directory entry under `dir`,
 * descending into subdirectories as they are encountered.
 *
 * @param {string} dir - Directory to traverse.
 * @yields {string} Path of each non-directory entry, built by joining the
 *   entry name onto the directory it was found in.
 */
function* walk(dir) {
  const entries = fs.readdirSync(dir, { withFileTypes: true });
  for (const entry of entries) {
    const entryPath = path.join(dir, entry.name);
    if (!entry.isDirectory()) {
      yield entryPath;
      continue;
    }
    yield* walk(entryPath);
  }
}

/**
 * Run a command line synchronously and return what it printed to stdout.
 *
 * stdin is ignored; stdout and stderr are both piped, so nothing is echoed
 * to this process's own streams.
 *
 * @param {string} cmd - Command line handed to `execSync`.
 * @returns {string} The command's stdout decoded as UTF-8.
 */
function exec(cmd) {
  const options = {
    encoding: 'utf8',
    stdio: ['ignore', 'pipe', 'pipe'],
  };
  return execSync(cmd, options);
}

/**
 * Parse `--key value` / `--flag` style arguments into a plain object.
 *
 * A `--key` followed by a token that does not start with `--` takes that
 * token as its string value; otherwise the key is set to `true`. Tokens
 * that do not start with `--` and were not consumed as a value are ignored.
 *
 * @param {string[]} argv - Raw argument tokens.
 * @returns {Object<string, string|boolean>} Parsed options keyed by name
 *   without the leading dashes.
 */
function parseArgs(argv) {
  const parsed = {};
  let idx = 0;
  while (idx < argv.length) {
    const token = argv[idx];
    idx += 1;
    if (!token.startsWith('--')) { continue; }

    const name = token.slice(2);
    const candidate = argv[idx];
    const hasValue = candidate !== undefined && !candidate.startsWith('--');
    if (hasValue) {
      parsed[name] = candidate;
      // Step over the value so it is not re-read as an option.
      idx += 1;
    }
    else {
      parsed[name] = true;
    }
  }
  return parsed;
}

/**
 * Return the value of a mandatory option, terminating the process when it
 * was not supplied.
 *
 * @param {Object} args - Parsed options.
 * @param {string} key - Option name without the leading dashes.
 * @returns {*} The option's value. When the key is undefined, an error is
 *   printed and the process exits with status 1.
 */
function required(args, key) {
  const value = args[key];
  if (value !== undefined) { return value; }

  console.error(`Missing required --${key}`);
  process.exit(1);
  return value;
}
30 changes: 29 additions & 1 deletion tools/bench-reporter/reporter.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
--repo-root <dir> filesystem root for resolving bench sources (default: cwd)
--wall-clock <seconds> total bench run duration — footer metadata
--history <path> bench-history.json path (default: <repo-root>/bench-history.json)
--pr-history <path> PR-iteration history from fetch-pr-history.js (merged with --history)
--out <dir> output directory (default: ./bench-report)

Cross-run taxonomy (WIN / TIED-PEAK / REOPENED) engages automatically once
Expand All @@ -42,6 +43,7 @@ const repo = args.repo ?? process.env.GITHUB_REPOSITORY ?? '';
const repoRoot = args['repo-root'] ?? process.cwd();
const wallClockSec = args['wall-clock'] ? Number(args['wall-clock']) : null;
const historyPath = args.history ?? path.join(repoRoot, 'bench-history.json');
const prHistoryPath = args['pr-history'] ?? '';
const outDir = args.out ?? './bench-report';

const NOISE_FLOOR = 2; // percent — matches autoSampleConditions
Expand Down Expand Up @@ -88,7 +90,9 @@ function expectedNoisePp(meanMs) {
}

const benchDirs = findBenchDirs(repoRoot);
const history = loadHistory(historyPath);
const mainHistory = loadHistory(historyPath);
const prHistory = loadHistory(prHistoryPath);
const history = mergeHistories(mainHistory, prHistory);
const metrics = loadAllMetrics(resultsDir);
const report = buildReport(metrics);
const markdown = renderMarkdown(report);
Expand Down Expand Up @@ -670,6 +674,30 @@ function formatWallClock(sec) {
return m > 0 ? `${m}m${s.toString().padStart(2, '0')}s` : `${s}s`;
}

/**
 * Combine the main-commit history with the PR-iteration history into one
 * timeline ordered by timestamp.
 *
 * Commits from `mainHist` are taken first, then those from `prHist`; when a
 * SHA occurs more than once only its first occurrence is kept. Entries
 * without a timestamp sort as the empty string.
 *
 * @param {?{commits?: Object[]}} mainHist - History loaded from bench-history.json, or a falsy value.
 * @param {?{commits?: Object[]}} prHist - History loaded from pr-history.json, or a falsy value.
 * @returns {?{schema_version: number, commits: Object[]}} The merged
 *   history, or null when both inputs are falsy.
 */
function mergeHistories(mainHist, prHist) {
  if (!(mainHist || prHist)) { return null; }

  // A Map keeps insertion order, so the first commit seen for each SHA wins
  // and main-history entries take precedence over PR ones.
  const firstBySha = new Map();
  for (const hist of [mainHist, prHist]) {
    for (const commit of hist?.commits ?? []) {
      if (!firstBySha.has(commit.sha)) {
        firstBySha.set(commit.sha, commit);
      }
    }
  }

  // Oldest first, so later consumers see commits in causal sequence.
  const timeline = [...firstBySha.values()];
  timeline.sort((left, right) => {
    const leftStamp = left.timestamp ?? '';
    const rightStamp = right.timestamp ?? '';
    return leftStamp.localeCompare(rightStamp);
  });

  return { schema_version: 1, commits: timeline };
}

/**
* Load bench-history.json. Returns null if missing/empty/invalid — the
* reporter's D3b features all gracefully degrade on a null history.
Expand Down
Loading