Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 28 additions & 1 deletion tools/ci/bench/reporter/fetch-pr-history.js
Original file line number Diff line number Diff line change
Expand Up @@ -66,17 +66,26 @@ for (const run of prRuns) {
continue;
}

// Did this iteration's commit actually touch packages source? GitHub's
// `paths:` filter triggers benches on any commit in a PR whose overall
// diff includes packages/**, so harness-only commits ride along when
// earlier commits in the PR moved packages. Filtering here keeps those
// commits out of bisect/credit candidate suggestions downstream.
const touchesPackages = commitTouchesPackages(repo, run.headSha);

commits.push({
sha: run.headSha,
msg: run.displayTitle,
parent_sha: '',
timestamp: run.createdAt,
pr: null,
touches_packages: touchesPackages,
metrics,
});
console.log(
` ${run.headSha.slice(0, 7)} — ${Object.keys(metrics).length} metrics`
+ (baselineSha ? ` @ baseline ${baselineSha.slice(0, 7)}` : ''),
+ (baselineSha ? ` @ baseline ${baselineSha.slice(0, 7)}` : '')
+ (touchesPackages ? '' : ' (harness-only)'),
);
}

Expand All @@ -91,6 +100,24 @@ function exec(cmd) {
return execSync(cmd, { encoding: 'utf8', stdio: ['ignore', 'pipe', 'pipe'] });
}

/**
 * Report whether a commit changed any file under `packages/`.
 *
 * Asks the GitHub API (via `gh api`, run through this file's `exec` helper)
 * for the commit's changed-file list and checks each newline-separated path.
 * `--paginate` is passed because the commit endpoint returns the file list
 * in pages; without it a `packages/` path beyond the first page would be
 * missed and the commit would be mislabelled as harness-only.
 *
 * @param {string} repoSlug - Repository slug interpolated into the API path
 *   `repos/<repoSlug>/commits/<sha>` (presumably `owner/name`).
 * @param {string} sha - SHA of the commit to inspect.
 * @returns {boolean} True when any changed path starts with `packages/`.
 *   Also true when the lookup fails, so an unreachable commit doesn't
 *   silently disappear from candidate lists.
 */
function commitTouchesPackages(repoSlug, sha) {
  try {
    const filesRaw = exec(
      `gh api --paginate repos/${repoSlug}/commits/${sha} --jq '.files[].filename'`,
    );
    const files = filesRaw.split('\n').filter(Boolean);
    return files.some((f) => f.startsWith('packages/'));
  }
  catch (err) {
    // Fail open, but leave a trace: the caller cannot otherwise tell a real
    // "touches packages" answer from a failed lookup.
    console.warn(
      `  ${sha} — could not list changed files (${err?.message ?? err}); assuming it touches packages`,
    );
    return true;
  }
}

function parseArgs(argv) {
const out = {};
for (let i = 0; i < argv.length; i++) {
Expand Down
119 changes: 82 additions & 37 deletions tools/ci/bench/reporter/reporter.js
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,39 @@ const BISECT_MARKDOWN_MAX = 3;
// long-running PR.
const DRIFT_THRESHOLD_PP = 5;

// Cross-iteration peak-attribution sections. Both REOPENED ("regression")
// and WIN ("win") render the same shape: heading, description, table of
// metric / current / peak / vs peak / candidates, with drift footnotes when
// peak and current had different baselines and main moved enough to confound
// the comparison. Only the framing (status filter, sort direction, copy,
// delta wording) differs.
//
// Per-kind fields:
//   status        — `history_status` value a metric must have to be listed.
//   headingPrefix — section heading text (the row count is appended later).
//   description   — paragraph rendered under the heading.
//   columnHeader  — markdown table header row.
//   sortSign      — multiplier applied to an ascending delta comparison.
//   formatDelta   — renders the "vs peak" cell from `delta_from_peak_pct`.
//                   A missing or non-finite delta renders as '—' rather than
//                   throwing or printing "NaN%".
const PEAK_SECTIONS = {
  regression: {
    status: 'REOPENED',
    headingPrefix: '📜 Regressions from peak',
    description:
      `These metrics were better on a prior iteration than they are now. The peak's percent-delta vs its baseline dominates current's percent-delta vs its baseline — not attributable to per-sample noise. Bisect candidates are the commits between the peak iteration and HEAD; nearest-to-peak is usually the best bet.`,
    columnHeader: '| metric | current | peak | vs peak | bisect candidates |',
    // Largest % regression first (descending on signed delta).
    sortSign: -1,
    formatDelta: (delta) => {
      if (!Number.isFinite(delta)) { return '—'; }
      return delta > 0
        ? `regressed +${delta.toFixed(0)}%`
        : `${delta.toFixed(0)}%`;
    },
  },
  win: {
    status: 'WIN',
    headingPrefix: '🏆 New peaks',
    description:
      `These metrics reached a new best in this iteration — current's percent-delta vs its baseline dominates the prior peak's percent-delta vs its baseline. Credit candidates are the commits between the prior peak and HEAD; nearest-to-current is usually the cause.`,
    columnHeader: '| metric | current | prior peak | vs prior peak | credit candidates |',
    // Most-improved first. delta_from_peak_pct is negative for WIN, so
    // ascending sort surfaces the best.
    sortSign: 1,
    formatDelta: (delta) => (
      Number.isFinite(delta)
        ? `improved ${Math.abs(delta).toFixed(0)}%`
        : '—'
    ),
  },
};

/**
* Expected percent-change CI width for an unresolved CI given the bench's
* absolute duration. Derived from the standard-error-of-the-difference of
Expand Down Expand Up @@ -324,6 +357,10 @@ function renderMarkdown(report) {
`🔍 ${unsureTotal} unsure`,
`⚪ ${noChange.length} no change`,
];
const winCount = report.history_summary?.WIN ?? 0;
if (winCount > 0) {
resultsParts.push(`🏆 ${winCount} new peak${winCount === 1 ? '' : 's'}`);
}
const reopenedCount = report.history_summary?.REOPENED ?? 0;
if (reopenedCount > 0) {
resultsParts.push(`📜 ${reopenedCount} reopened`);
Expand All @@ -343,8 +380,11 @@ function renderMarkdown(report) {
renderFasterSlowerSection(lines, slower, 'slower', report);
}

// ─── New peaks (cross-run; auto-expanded when present) ───────────────
renderPeakSection(lines, report, 'win');

// ─── Regressions from peak (cross-run; auto-expanded when present) ───
renderRegressionsFromPeak(lines, report);
renderPeakSection(lines, report, 'regression');

// ─── No Change (always collapsed) ────────────────────────────────────
if (noChange.length > 0) {
Expand Down Expand Up @@ -429,33 +469,31 @@ function renderMarkdown(report) {
}

/**
* Append a "Regressions from peak" section when one or more metrics are
* REOPENED (current pct-delta dominated by a prior iteration's pct-delta).
* Actionable signal: the metric was once better and this PR — or a commit
* before it — gave that improvement back.
* Append a cross-iteration peak-attribution section. `kind === 'regression'`
* surfaces REOPENED metrics (peak dominates current); `kind === 'win'`
* surfaces WIN metrics (current dominates peak). Drift footnotes fire when
* peak and current had different baselines and main moved enough to confound
* the comparison. Symmetric across both kinds because false-blame and
* false-credit are the same kind of attribution failure in opposite directions.
*
* Surface units are within-session percent-deltas (the pct-delta this run
* achieved vs its baseline; the pct-delta peak achieved vs ITS baseline).
* `delta_from_peak_pct` is the difference between those two midpoints in
* percentage points (pp). Drift footnotes fire when peak and current had
* different baselines and main moved enough on the metric to plausibly
* confound the comparison.
* Surface units: within-session percent-deltas vs each iteration's baseline.
* `delta_from_peak_pct` is the difference between those midpoints, rendered
* in the same `%` unit as the table cells.
*/
function renderRegressionsFromPeak(lines, report) {
const reopened = report.metrics.filter((m) => m.history_status === 'REOPENED');
if (reopened.length === 0) { return; }
function renderPeakSection(lines, report, kind) {
const config = PEAK_SECTIONS[kind];
const rows = report.metrics.filter((m) => m.history_status === config.status);
if (rows.length === 0) { return; }

const sorted = [...reopened].sort(
(a, b) => (b.delta_from_peak_pct ?? 0) - (a.delta_from_peak_pct ?? 0),
const sorted = [...rows].sort(
(a, b) => config.sortSign * ((a.delta_from_peak_pct ?? 0) - (b.delta_from_peak_pct ?? 0)),
);

lines.push(`#### 📜 Regressions from peak (${reopened.length})`);
lines.push(`#### ${config.headingPrefix} (${rows.length})`);
lines.push('');
lines.push(
`These metrics were better on a prior iteration than they are now. The peak's percent-delta vs its baseline dominates current's percent-delta vs its baseline — not attributable to per-sample noise. Bisect candidates are the commits between the peak iteration and HEAD; nearest-to-peak is usually the best bet.`,
);
lines.push(config.description);
lines.push('');
lines.push('| metric | current | peak | vs peak | bisect candidates |');
lines.push(config.columnHeader);
lines.push('|---|---|---|---|---|');

const flagged = [];
Expand All @@ -464,18 +502,10 @@ function renderRegressionsFromPeak(lines, report) {
const currentStr = formatSignedPct(mid(m.percent_change_ci));
const peakStr = formatSignedPct(mid(m.peak.percent_delta_ci));
const peakLink = commitOrPrLink(m.peak, report.repo);
const deltaStr = m.delta_from_peak_pct > 0
? `regressed +${m.delta_from_peak_pct.toFixed(0)}pp`
: `${m.delta_from_peak_pct.toFixed(0)}pp`;
const bisectMd = (m.bisect_candidates ?? [])
.slice(0, BISECT_MARKDOWN_MAX)
.map((c) => commitOrPrLink(c, report.repo))
.join(', ');
const bisectCell = m.bisect_candidates && m.bisect_candidates.length > BISECT_MARKDOWN_MAX
? `${bisectMd} +${m.bisect_candidates.length - BISECT_MARKDOWN_MAX} more`
: bisectMd || '—';

// Fires on threshold breach or chain-gap (magnitude unavailable).
const deltaStr = config.formatDelta(m.delta_from_peak_pct);
const candCell = formatCandidateCell(m.bisect_candidates, report.repo);

// Drift fires on threshold breach or chain-gap (magnitude unavailable).
let driftFlag = '';
if (m.drift?.detected) {
const mag = m.drift.magnitude;
Expand All @@ -494,9 +524,7 @@ function renderRegressionsFromPeak(lines, report) {
}

lines.push(
`| ${
metricLink(m, report)
} | ${currentStr}${driftFlag} | ${peakStr} @ ${peakLink} | ${deltaStr} | ${bisectCell} |`,
`| ${metricLink(m, report)} | ${currentStr}${driftFlag} | ${peakStr} @ ${peakLink} | ${deltaStr} | ${candCell} |`,
);
}
lines.push('');
Expand All @@ -513,6 +541,23 @@ function formatSignedPct(pct) {
return pct > 0 ? `+${pct.toFixed(0)}%` : `${pct.toFixed(0)}%`;
}

/**
 * Build the candidates cell of a peak-attribution table row.
 *
 * Links at most BISECT_MARKDOWN_MAX candidates (each via `commitOrPrLink`),
 * joins them with ", ", and appends a "+N more" suffix when the list is
 * longer than that. A missing or empty list renders as '—'. The bisect
 * (regression) and credit (win) tables share this cell; only their column
 * headings differ.
 *
 * @param {Array|null|undefined} candidates - Candidate entries to link.
 * @param {*} repo - Passed through unchanged to `commitOrPrLink`.
 * @returns {string} Markdown for the table cell.
 */
function formatCandidateCell(candidates, repo) {
  const isEmpty = !candidates || candidates.length === 0;
  if (isEmpty) { return '—'; }

  const links = [];
  for (const candidate of candidates.slice(0, BISECT_MARKDOWN_MAX)) {
    links.push(commitOrPrLink(candidate, repo));
  }
  const shown = links.join(', ');

  if (candidates.length > BISECT_MARKDOWN_MAX) {
    const hidden = candidates.length - BISECT_MARKDOWN_MAX;
    return `${shown} +${hidden} more`;
  }
  return shown;
}

function formatDriftFootnote({ idx, metric, drift, currentBaseline, peakBaseline }) {
const peakSha = peakBaseline ? peakBaseline.slice(0, 7) : '?';
const currentSha = currentBaseline ? currentBaseline.slice(0, 7) : '?';
Expand Down Expand Up @@ -887,7 +932,7 @@ function computeHistoryStatus(metric, peakHist, driftHist) {
*
* Returns one of:
* { detected: false } — baselines match (or one/both unknown)
* { detected: true, magnitude: N, chain_len: K, missing: M } — quantified; N in pp, positive = main got slower
* { detected: true, magnitude: N, chain_len: K, missing: M } — quantified. N in %, positive = main got slower
* { detected: true, magnitude: null, chain_len: K, missing: M } — chain partially or wholly unwalkable
*
* Combines multiplicatively: ∏(1 + pct_i) − 1.
Expand Down
Loading
Loading