Semantic-Org · jlukic · Apr 18, 2026 · Apr 18, 2026
diff --git a/.github/workflows/benchmarks-report.yml b/.github/workflows/benchmarks-report.yml
@@ -84,6 +84,21 @@ jobs:
           echo "number=$NUMBER" >> "$GITHUB_OUTPUT"
           echo "Resolved PR number: $NUMBER"
 
+      # Walk prior successful bench runs on this PR's branch and build a
+      # per-iteration history. The reporter merges this with bench-history.json
+      # (main commits) so peak attribution spans BOTH main AND this PR's
+      # iterations — an agent sees "iteration 3 was the best on update-10th;
+      # your current iteration regressed from that."
+      #
+      # The head branch name is chosen by the PR author, so it is handed to the
+      # script through env instead of being expanded into the script text:
+      # `${{ }}` is substituted before the shell parses the script, and a
+      # branch name containing a quote could otherwise run arbitrary commands
+      # in a step that holds GH_TOKEN.
+      - name: Fetch PR iteration history
+        env:
+          GH_TOKEN: ${{ steps.bot-token.outputs.token }}
+          HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }}
+          REPO: ${{ github.repository }}
+          CURRENT_RUN_ID: ${{ github.event.workflow_run.id }}
+        run: |
+          node tools/bench-reporter/fetch-pr-history.js \
+            --branch "$HEAD_BRANCH" \
+            --repo "$REPO" \
+            --current-run-id "$CURRENT_RUN_ID" \
+            --out pr-history.json
+
       - name: Generate report
         env:
           STARTED: ${{ github.event.workflow_run.created_at }}
@@ -98,6 +113,7 @@ jobs:
             --run-id '${{ github.event.workflow_run.id }}' \
             --base-ref 'main' \
             --repo '${{ github.repository }}' \
+            --pr-history pr-history.json \
             --wall-clock "$WALL_CLOCK" \
             --out bench-report
 

diff --git a/tools/bench-reporter/fetch-pr-history.js b/tools/bench-reporter/fetch-pr-history.js
@@ -0,0 +1,155 @@
+#!/usr/bin/env node
+/*
+  Fetch prior bench results from this PR's branch to build a per-iteration
+  history. Walks completed Benchmarks workflow runs, downloads their
+  results-* artifacts, extracts per-metric absolute CIs, and outputs a
+  pr-history.json in the same schema as bench-history.json.
+
+  The reporter merges this PR-iteration history with bench-history.json
+  (main-commit history) to compute cross-run peak attribution. An agent
+  iterating on a perf branch sees: "iteration 3 was the best on
+  update-10th; your current iteration regressed from that."
+
+  Usage:
+    node fetch-pr-history.js \
+      --branch <name>           PR head branch name
+      --repo <owner/name>       GitHub repo slug
+      --current-run-id <id>     exclude this run from history (it's the one we're reporting)
+      --max-runs <n>            cap at N most recent prior runs (default: 20)
+      --out <path>              output path (default: ./pr-history.json)
+
+  Requires `gh` CLI authenticated (GITHUB_TOKEN or GH_TOKEN in env).
+*/
+
+import { execFileSync, execSync } from 'node:child_process';
+import fs from 'node:fs';
+import path from 'node:path';
+
+const args = parseArgs(process.argv.slice(2));
+const branch = required(args, 'branch');
+const repo = required(args, 'repo');
+const currentRunId = args['current-run-id'] ?? '';
+const maxRuns = Number(args['max-runs'] ?? 20);
+const outPath = args.out ?? './pr-history.json';
+
+// List successful Benchmarks workflow runs on this branch.
+const runsRaw = exec(
+  `gh run list --repo "${repo}" --workflow=benchmarks.yml --limit ${maxRuns * 2} `
+    + `--json databaseId,conclusion,headBranch,headSha,displayTitle,createdAt`,
+);
+const allRuns = JSON.parse(runsRaw);
+const prRuns = allRuns
+  .filter((r) => r.headBranch === branch && r.conclusion === 'success')
+  .filter((r) => String(r.databaseId) !== String(currentRunId))
+  .slice(0, maxRuns);
+
+console.log(`Found ${prRuns.length} prior successful bench runs on ${branch}`);
+
+const commits = [];
+for (const run of prRuns) {
+  const dir = fs.mkdtempSync('/tmp/pr-hist-');
+  try {
+    exec(
+      `gh run download ${run.databaseId} --repo "${repo}" `
+        + `--pattern "results-*" --dir "${dir}"`,
+    );
+  }
+  catch {
+    console.log(`  Skip ${run.databaseId} (artifact download failed)`);
+    continue;
+  }
+
+  const metrics = loadMetrics(dir);
+  if (Object.keys(metrics).length === 0) {
+    console.log(`  Skip ${run.databaseId} (no metrics)`);
+    continue;
+  }
+
+  commits.push({
+    sha: run.headSha,
+    msg: run.displayTitle,
+    parent_sha: '',
+    timestamp: run.createdAt,
+    pr: null,
+    metrics,
+  });
+  console.log(`  ${run.headSha.slice(0, 7)} — ${Object.keys(metrics).length} metrics`);
+}
+
+// Chronological order (oldest first) so peak-index → bisect-candidates
+// after peak produces a causal timeline.
+commits.sort((a, b) => a.timestamp.localeCompare(b.timestamp));
+
+fs.writeFileSync(outPath, JSON.stringify({ schema_version: 1, commits }, null, 2) + '\n');
+console.log(`Wrote ${commits.length} entries to ${outPath}`);
+
+/**
+ * Walk a results directory and extract one { ci, mean_ms } entry per
+ * metric. Uses the `this-change` absolute CI — same extraction logic
+ * as append-history.js.
+ */
+function loadMetrics(dir) {
+  const out = {};
+  for (const entry of walk(dir)) {
+    if (!entry.endsWith('.json')) { continue; }
+    let data;
+    try {
+      data = JSON.parse(fs.readFileSync(entry, 'utf8'));
+    }
+    catch {
+      continue;
+    }
+    if (!Array.isArray(data.benchmarks)) { continue; }
+
+    for (const bm of data.benchmarks) {
+      const source = (bm.name ?? '').split(' [')[0];
+      if (source !== 'this-change') { continue; }
+      const metricName = bm.measurement?.name ?? bm.name;
+      if (!bm.mean) { continue; }
+      out[metricName] = {
+        ci: [round4(bm.mean.low), round4(bm.mean.high)],
+        mean_ms: round4((bm.mean.low + bm.mean.high) / 2),
+      };
+    }
+  }
+  return out;
+}
+
+function round4(n) {
+  return Number(n.toFixed(4));
+}
+
+function* walk(dir) {
+  for (const ent of fs.readdirSync(dir, { withFileTypes: true })) {
+    const full = path.join(dir, ent.name);
+    if (ent.isDirectory()) { yield* walk(full); }
+    else { yield full; }
+  }
+}
+
+function exec(cmd) {
+  return execSync(cmd, { encoding: 'utf8', stdio: ['ignore', 'pipe', 'pipe'] });
+}
+
+function parseArgs(argv) {
+  const out = {};
+  for (let i = 0; i < argv.length; i++) {
+    if (!argv[i].startsWith('--')) { continue; }
+    const key = argv[i].slice(2);
+    const next = argv[i + 1];
+    if (next === undefined || next.startsWith('--')) { out[key] = true; }
+    else {
+      out[key] = next;
+      i++;
+    }
+  }
+  return out;
+}
+
+function required(args, key) {
+  if (args[key] === undefined) {
+    console.error(`Missing required --${key}`);
+    process.exit(1);
+  }
+  return args[key];
+}
diff --git a/tools/bench-reporter/reporter.js b/tools/bench-reporter/reporter.js
@@ -17,6 +17,7 @@
       --repo-root <dir>       filesystem root for resolving bench sources (default: cwd)
       --wall-clock <seconds>  total bench run duration — footer metadata
       --history <path>        bench-history.json path (default: <repo-root>/bench-history.json)
+      --pr-history <path>     PR-iteration history from fetch-pr-history.js (merged with --history)
       --out <dir>             output directory (default: ./bench-report)
 
   Cross-run taxonomy (WIN / TIED-PEAK / REOPENED) engages automatically once
@@ -42,6 +43,7 @@ const repo = args.repo ?? process.env.GITHUB_REPOSITORY ?? '';
 const repoRoot = args['repo-root'] ?? process.cwd();
 const wallClockSec = args['wall-clock'] ? Number(args['wall-clock']) : null;
 const historyPath = args.history ?? path.join(repoRoot, 'bench-history.json');
+const prHistoryPath = args['pr-history'] ?? '';
 const outDir = args.out ?? './bench-report';
 
 const NOISE_FLOOR = 2; // percent — matches autoSampleConditions
@@ -88,7 +90,9 @@ function expectedNoisePp(meanMs) {
 }
 
 const benchDirs = findBenchDirs(repoRoot);
-const history = loadHistory(historyPath);
+const mainHistory = loadHistory(historyPath);
+const prHistory = loadHistory(prHistoryPath);
+const history = mergeHistories(mainHistory, prHistory);
 const metrics = loadAllMetrics(resultsDir);
 const report = buildReport(metrics);
 const markdown = renderMarkdown(report);
@@ -670,6 +674,30 @@ function formatWallClock(sec) {
   return m > 0 ? `${m}m${s.toString().padStart(2, '0')}s` : `${s}s`;
 }
 
+/**
+ * Merge main-commit history (bench-history.json) with PR-iteration
+ * history (pr-history.json) into a single timeline sorted by timestamp.
+ * Peak attribution then looks across BOTH main AND this PR's iterations
+ * to find the best-ever CI per metric.
+ */
+function mergeHistories(mainHist, prHist) {
+  if (!mainHist && !prHist) { return null; }
+  const commits = [
+    ...(mainHist?.commits ?? []),
+    ...(prHist?.commits ?? []),
+  ];
+  // Deduplicate by SHA (same commit shouldn't appear twice)
+  const seen = new Set();
+  const deduped = commits.filter((c) => {
+    if (seen.has(c.sha)) { return false; }
+    seen.add(c.sha);
+    return true;
+  });
+  // Chronological order so bisect candidates are in causal sequence
+  deduped.sort((a, b) => (a.timestamp ?? '').localeCompare(b.timestamp ?? ''));
+  return { schema_version: 1, commits: deduped };
+}
+
 /**
  * Load bench-history.json. Returns null if missing/empty/invalid — the
  * reporter's D3b features all gracefully degrade on a null history.