Skip to content

Commit 8f33fab

Browse files
authored
Build: Bench reporter surfaces wins (#180)
1 parent 88c5638 commit 8f33fab

3 files changed

Lines changed: 299 additions & 38 deletions

File tree

tools/ci/bench/reporter/fetch-pr-history.js

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,17 +66,26 @@ for (const run of prRuns) {
6666
continue;
6767
}
6868

69+
// Did this iteration's commit actually touch packages source? GitHub's
70+
// `paths:` filter triggers benches on any commit in a PR whose overall
71+
// diff includes packages/**, so harness-only commits ride along when
72+
// earlier commits in the PR moved packages. Filtering here keeps those
73+
// commits out of bisect/credit candidate suggestions downstream.
74+
const touchesPackages = commitTouchesPackages(repo, run.headSha);
75+
6976
commits.push({
7077
sha: run.headSha,
7178
msg: run.displayTitle,
7279
parent_sha: '',
7380
timestamp: run.createdAt,
7481
pr: null,
82+
touches_packages: touchesPackages,
7583
metrics,
7684
});
7785
console.log(
7886
` ${run.headSha.slice(0, 7)}${Object.keys(metrics).length} metrics`
79-
+ (baselineSha ? ` @ baseline ${baselineSha.slice(0, 7)}` : ''),
87+
+ (baselineSha ? ` @ baseline ${baselineSha.slice(0, 7)}` : '')
88+
+ (touchesPackages ? '' : ' (harness-only)'),
8089
);
8190
}
8291

@@ -91,6 +100,24 @@ function exec(cmd) {
91100
return execSync(cmd, { encoding: 'utf8', stdio: ['ignore', 'pipe', 'pipe'] });
92101
}
93102

103+
/**
104+
* Query the GitHub API for the commit's changed-file list and return true
105+
* when any path is under `packages/`. Defaults to true on API failure so
106+
* an unreachable commit doesn't silently disappear from candidate lists.
107+
*/
108+
function commitTouchesPackages(repoSlug, sha) {
109+
try {
110+
const filesRaw = exec(
111+
`gh api repos/${repoSlug}/commits/${sha} --jq '.files[].filename'`,
112+
);
113+
const files = filesRaw.split('\n').filter(Boolean);
114+
return files.some((f) => f.startsWith('packages/'));
115+
}
116+
catch {
117+
return true;
118+
}
119+
}
120+
94121
function parseArgs(argv) {
95122
const out = {};
96123
for (let i = 0; i < argv.length; i++) {

tools/ci/bench/reporter/reporter.js

Lines changed: 82 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,39 @@ const BISECT_MARKDOWN_MAX = 3;
103103
// long-running PR.
104104
const DRIFT_THRESHOLD_PP = 5;
105105

106+
// Cross-iteration peak-attribution sections. Both REOPENED ("regression")
107+
// and WIN ("win") render the same shape: heading, description, table of
108+
// metric / current / peak / vs peak / candidates, with drift footnotes when
109+
// peak and current had different baselines and main moved enough to confound
110+
// the comparison. Only the framing (status filter, sort direction, copy,
111+
// delta wording) differs.
112+
const PEAK_SECTIONS = {
113+
regression: {
114+
status: 'REOPENED',
115+
headingPrefix: '📜 Regressions from peak',
116+
description:
117+
`These metrics were better on a prior iteration than they are now. The peak's percent-delta vs its baseline dominates current's percent-delta vs its baseline — not attributable to per-sample noise. Bisect candidates are the commits between the peak iteration and HEAD; nearest-to-peak is usually the best bet.`,
118+
columnHeader: '| metric | current | peak | vs peak | bisect candidates |',
119+
// Largest % regression first (descending on signed delta).
120+
sortSign: -1,
121+
formatDelta: (delta) =>
122+
delta > 0
123+
? `regressed +${delta.toFixed(0)}%`
124+
: `${delta.toFixed(0)}%`,
125+
},
126+
win: {
127+
status: 'WIN',
128+
headingPrefix: '🏆 New peaks',
129+
description:
130+
`These metrics reached a new best in this iteration — current's percent-delta vs its baseline dominates the prior peak's percent-delta vs its baseline. Credit candidates are the commits between the prior peak and HEAD; nearest-to-current is usually the cause.`,
131+
columnHeader: '| metric | current | prior peak | vs prior peak | credit candidates |',
132+
// Most-improved first. delta_from_peak_pct is negative for WIN, so
133+
// ascending sort surfaces the best.
134+
sortSign: 1,
135+
formatDelta: (delta) => `improved ${Math.abs(delta).toFixed(0)}%`,
136+
},
137+
};
138+
106139
/**
107140
* Expected percent-change CI width for an unresolved CI given the bench's
108141
* absolute duration. Derived from the standard-error-of-the-difference of
@@ -324,6 +357,10 @@ function renderMarkdown(report) {
324357
`🔍 ${unsureTotal} unsure`,
325358
`⚪ ${noChange.length} no change`,
326359
];
360+
const winCount = report.history_summary?.WIN ?? 0;
361+
if (winCount > 0) {
362+
resultsParts.push(`🏆 ${winCount} new peak${winCount === 1 ? '' : 's'}`);
363+
}
327364
const reopenedCount = report.history_summary?.REOPENED ?? 0;
328365
if (reopenedCount > 0) {
329366
resultsParts.push(`📜 ${reopenedCount} reopened`);
@@ -343,8 +380,11 @@ function renderMarkdown(report) {
343380
renderFasterSlowerSection(lines, slower, 'slower', report);
344381
}
345382

383+
// ─── New peaks (cross-run; auto-expanded when present) ───────────────
384+
renderPeakSection(lines, report, 'win');
385+
346386
// ─── Regressions from peak (cross-run; auto-expanded when present) ───
347-
renderRegressionsFromPeak(lines, report);
387+
renderPeakSection(lines, report, 'regression');
348388

349389
// ─── No Change (always collapsed) ────────────────────────────────────
350390
if (noChange.length > 0) {
@@ -429,33 +469,31 @@ function renderMarkdown(report) {
429469
}
430470

431471
/**
432-
* Append a "Regressions from peak" section when one or more metrics are
433-
* REOPENED (current pct-delta dominated by a prior iteration's pct-delta).
434-
* Actionable signal: the metric was once better and this PR — or a commit
435-
* before it — gave that improvement back.
472+
* Append a cross-iteration peak-attribution section. `kind === 'regression'`
473+
* surfaces REOPENED metrics (peak dominates current); `kind === 'win'`
474+
* surfaces WIN metrics (current dominates peak). Drift footnotes fire when
475+
* peak and current had different baselines and main moved enough to confound
476+
* the comparison. Symmetric across both kinds because false-blame and
477+
* false-credit are the same kind of attribution failure in opposite directions.
436478
*
437-
* Surface units are within-session percent-deltas (the pct-delta this run
438-
* achieved vs its baseline; the pct-delta peak achieved vs ITS baseline).
439-
* `delta_from_peak_pct` is the difference between those two midpoints in
440-
* percentage points (pp). Drift footnotes fire when peak and current had
441-
* different baselines and main moved enough on the metric to plausibly
442-
* confound the comparison.
479+
* Surface units: within-session percent-deltas vs each iteration's baseline.
480+
* `delta_from_peak_pct` is the difference between those midpoints, rendered
481+
* in the same `%` unit as the table cells.
443482
*/
444-
function renderRegressionsFromPeak(lines, report) {
445-
const reopened = report.metrics.filter((m) => m.history_status === 'REOPENED');
446-
if (reopened.length === 0) { return; }
483+
function renderPeakSection(lines, report, kind) {
484+
const config = PEAK_SECTIONS[kind];
485+
const rows = report.metrics.filter((m) => m.history_status === config.status);
486+
if (rows.length === 0) { return; }
447487

448-
const sorted = [...reopened].sort(
449-
(a, b) => (b.delta_from_peak_pct ?? 0) - (a.delta_from_peak_pct ?? 0),
488+
const sorted = [...rows].sort(
489+
(a, b) => config.sortSign * ((a.delta_from_peak_pct ?? 0) - (b.delta_from_peak_pct ?? 0)),
450490
);
451491

452-
lines.push(`#### 📜 Regressions from peak (${reopened.length})`);
492+
lines.push(`#### ${config.headingPrefix} (${rows.length})`);
453493
lines.push('');
454-
lines.push(
455-
`These metrics were better on a prior iteration than they are now. The peak's percent-delta vs its baseline dominates current's percent-delta vs its baseline — not attributable to per-sample noise. Bisect candidates are the commits between the peak iteration and HEAD; nearest-to-peak is usually the best bet.`,
456-
);
494+
lines.push(config.description);
457495
lines.push('');
458-
lines.push('| metric | current | peak | vs peak | bisect candidates |');
496+
lines.push(config.columnHeader);
459497
lines.push('|---|---|---|---|---|');
460498

461499
const flagged = [];
@@ -464,18 +502,10 @@ function renderRegressionsFromPeak(lines, report) {
464502
const currentStr = formatSignedPct(mid(m.percent_change_ci));
465503
const peakStr = formatSignedPct(mid(m.peak.percent_delta_ci));
466504
const peakLink = commitOrPrLink(m.peak, report.repo);
467-
const deltaStr = m.delta_from_peak_pct > 0
468-
? `regressed +${m.delta_from_peak_pct.toFixed(0)}pp`
469-
: `${m.delta_from_peak_pct.toFixed(0)}pp`;
470-
const bisectMd = (m.bisect_candidates ?? [])
471-
.slice(0, BISECT_MARKDOWN_MAX)
472-
.map((c) => commitOrPrLink(c, report.repo))
473-
.join(', ');
474-
const bisectCell = m.bisect_candidates && m.bisect_candidates.length > BISECT_MARKDOWN_MAX
475-
? `${bisectMd} +${m.bisect_candidates.length - BISECT_MARKDOWN_MAX} more`
476-
: bisectMd || '—';
477-
478-
// Fires on threshold breach or chain-gap (magnitude unavailable).
505+
const deltaStr = config.formatDelta(m.delta_from_peak_pct);
506+
const candCell = formatCandidateCell(m.bisect_candidates, report.repo);
507+
508+
// Drift fires on threshold breach or chain-gap (magnitude unavailable).
479509
let driftFlag = '';
480510
if (m.drift?.detected) {
481511
const mag = m.drift.magnitude;
@@ -494,9 +524,7 @@ function renderRegressionsFromPeak(lines, report) {
494524
}
495525

496526
lines.push(
497-
`| ${
498-
metricLink(m, report)
499-
} | ${currentStr}${driftFlag} | ${peakStr} @ ${peakLink} | ${deltaStr} | ${bisectCell} |`,
527+
`| ${metricLink(m, report)} | ${currentStr}${driftFlag} | ${peakStr} @ ${peakLink} | ${deltaStr} | ${candCell} |`,
500528
);
501529
}
502530
lines.push('');
@@ -513,6 +541,23 @@ function formatSignedPct(pct) {
513541
return pct > 0 ? `+${pct.toFixed(0)}%` : `${pct.toFixed(0)}%`;
514542
}
515543

544+
/**
545+
* Render the comma-joined candidate cell for peak-attribution tables, with
546+
* an overflow suffix when the list exceeds BISECT_MARKDOWN_MAX. Same shape
547+
* for bisect (regression) and credit (win) sides. Only the column heading
548+
* differs at the call site.
549+
*/
550+
function formatCandidateCell(candidates, repo) {
551+
if (!candidates || candidates.length === 0) { return '—'; }
552+
const md = candidates
553+
.slice(0, BISECT_MARKDOWN_MAX)
554+
.map((c) => commitOrPrLink(c, repo))
555+
.join(', ');
556+
return candidates.length > BISECT_MARKDOWN_MAX
557+
? `${md} +${candidates.length - BISECT_MARKDOWN_MAX} more`
558+
: md;
559+
}
560+
516561
function formatDriftFootnote({ idx, metric, drift, currentBaseline, peakBaseline }) {
517562
const peakSha = peakBaseline ? peakBaseline.slice(0, 7) : '?';
518563
const currentSha = currentBaseline ? currentBaseline.slice(0, 7) : '?';
@@ -887,7 +932,7 @@ function computeHistoryStatus(metric, peakHist, driftHist) {
887932
*
888933
* Returns one of:
889934
* { detected: false } — baselines match (or one/both unknown)
890-
* { detected: true, magnitude: N, chain_len: K, missing: M } — quantified; N in pp, positive = main got slower
935+
* { detected: true, magnitude: N, chain_len: K, missing: M } — quantified. N in %, positive = main got slower
891936
* { detected: true, magnitude: null, chain_len: K, missing: M } — chain partially or wholly unwalkable
892937
*
893938
* Combines multiplicatively: ∏(1 + pct_i) − 1.

0 commit comments

Comments
 (0)