Skip to content

Commit 84e1a5f

Browse files
committed
ci(bench): add per-PR perf canary for extractor/graph/native changes
Adds .github/workflows/perf-canary.yml — a path-filtered workflow that fires on PRs touching src/extractors/, src/domain/graph/, or crates/** and runs only the incremental-benchmark suite (full build + no-op + 1-file rebuild, both engines). Catches the class of regressions that accumulated invisibly across the Phase 8.x PRs and were only detected at v3.12.0 publish time. The regression guard gains BENCH_CANARY=1 mode: raises thresholds to 50%/100%/150% (standard/noisy/WASM) and skips the build, query, and resolution suites — only incremental checks run. This absorbs shared-runner timing variance while still blocking catastrophic regressions (+98% full build, +1827% 1-file rebuild from v3.12.0). Closes #1433
1 parent 66fc899 commit 84e1a5f

2 files changed

Lines changed: 150 additions & 10 deletions

File tree

.github/workflows/perf-canary.yml

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
name: Perf Canary
2+
3+
# Lightweight per-PR build-time regression gate for PRs that touch the
4+
# extractor, graph-builder, or native Rust layers — the parts of the codebase
5+
# that caused the v3.12.0 regressions (+1827% 1-file rebuild, +98% full build).
6+
#
7+
# Only the incremental-benchmark suite is run (full build + no-op + 1-file
8+
# rebuild for both engines). The regression guard uses BENCH_CANARY=1 mode,
9+
# which applies a 50% base threshold (100% for noisy metrics, 150% for WASM timings) instead of the full suite's 25% — enough
10+
# to catch catastrophic regressions while tolerating CI runner variance.
11+
#
12+
# This is intentionally separate from the full pre-publish-benchmark job in
13+
# ci.yml, which runs unconditionally on every PR and measures the complete
14+
# suite. The canary completes in roughly 5–10 minutes; the full suite takes
15+
# 20–60 minutes.
16+
17+
on:
18+
pull_request:
19+
paths:
20+
- "src/extractors/**"
21+
- "src/domain/graph/**"
22+
- "crates/**"
23+
- "scripts/benchmark.ts"
24+
- "scripts/incremental-benchmark.ts"
25+
- "scripts/lib/bench-config.ts"
26+
- "scripts/lib/fork-engine.ts"
27+
28+
concurrency:
29+
group: perf-canary-${{ github.ref }}
30+
cancel-in-progress: true
31+
32+
jobs:
33+
perf-canary:
34+
name: Perf canary (incremental tiers)
35+
runs-on: ubuntu-latest
36+
env:
37+
CODEGRAPH_FAST_SKIP_DIAG: "1"
38+
39+
steps:
40+
- uses: actions/checkout@v6
41+
with:
42+
fetch-depth: 0
43+
44+
- uses: actions/setup-node@v6
45+
with:
46+
node-version: "22"
47+
cache: "npm"
48+
49+
- name: Setup Rust
50+
uses: dtolnay/rust-toolchain@stable
51+
52+
- name: Rust cache
53+
uses: Swatinem/rust-cache@v2
54+
with:
55+
workspaces: crates/codegraph-core
56+
57+
- name: Install napi-rs CLI
58+
timeout-minutes: 5
59+
run: npm install -g @napi-rs/cli@3
60+
61+
- name: Build native addon
62+
working-directory: crates/codegraph-core
63+
run: napi build --release
64+
65+
- name: Install dependencies
66+
timeout-minutes: 20
67+
shell: bash
68+
run: |
69+
for attempt in 1 2 3; do
70+
npm install && break
71+
if [ "$attempt" -lt 3 ]; then
72+
echo "::warning::npm install attempt $attempt failed, retrying in 15s..."
73+
sleep 15
74+
else
75+
echo "::error::npm install failed after 3 attempts"
76+
exit 1
77+
fi
78+
done
79+
80+
- name: Install native addon over published binary
81+
run: node scripts/ci-install-native.mjs
82+
83+
# Build dist/ so benchmarks load the same compiled JS that ships to npm,
84+
# matching the methodology used by the full pre-publish-benchmark gate.
85+
- name: Build TypeScript
86+
run: npm run build
87+
88+
- name: Run incremental benchmark
89+
timeout-minutes: 15
90+
run: |
91+
STRIP_FLAG=$(node -e "const [M]=process.versions.node.split('.').map(Number); console.log(M>=23?'--strip-types':'--experimental-strip-types')")
92+
node $STRIP_FLAG --import ./scripts/ts-resolve-loader.js scripts/incremental-benchmark.ts --version dev --dist > incremental-canary-result.json
93+
94+
- name: Update incremental report
95+
run: |
96+
STRIP_FLAG=$(node -e "const [M]=process.versions.node.split('.').map(Number); console.log(M>=23?'--strip-types':'--experimental-strip-types')")
97+
node $STRIP_FLAG scripts/update-incremental-report.ts incremental-canary-result.json
98+
99+
- name: Regression guard (50% threshold)
100+
env:
101+
RUN_REGRESSION_GUARD: "1"
102+
BENCH_CANARY: "1"
103+
run: npm run test:regression-guard
104+
105+
- name: Upload canary result
106+
if: always()
107+
uses: actions/upload-artifact@v7
108+
with:
109+
name: incremental-canary-result
110+
path: incremental-canary-result.json
111+
if-no-files-found: warn

tests/benchmarks/regression-guard.test.ts

Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,16 @@ import { describe, expect, test } from 'vitest';
1616

1717
// ── Configuration ────────────────────────────────────────────────────────
1818

19+
/**
20+
* When BENCH_CANARY=1, only incremental-benchmark checks run and all timing
21+
* thresholds are raised (50% standard / 100% noisy / 150% WASM). This mode is used by the per-PR perf-canary
22+
* workflow (.github/workflows/perf-canary.yml) which runs only on PRs
23+
* touching src/extractors/, src/domain/graph/, or crates/. The looser
24+
* threshold absorbs CI runner variance while still catching the class of
25+
* catastrophic regressions that hit v3.12.0 (+98%/+1827%).
26+
*/
27+
const BENCH_CANARY = process.env.BENCH_CANARY === '1';
28+
1929
/**
2030
* Maximum allowed regression (as a fraction, e.g. 0.25 = 25%).
2131
*
@@ -26,8 +36,10 @@ import { describe, expect, test } from 'vitest';
2636
*
2737
* Genuinely high-variance sub-30ms metrics get a wider tolerance via
2838
* `NOISY_METRICS` below — see that set's docstring for rationale.
39+
*
40+
* In BENCH_CANARY mode this is overridden to 0.5 (50%) — see above.
2941
*/
30-
const REGRESSION_THRESHOLD = 0.25;
42+
const REGRESSION_THRESHOLD = BENCH_CANARY ? 0.5 : 0.25;
3143

3244
/**
3345
* Wider regression threshold applied to metrics in NOISY_METRICS.
@@ -41,8 +53,11 @@ const REGRESSION_THRESHOLD = 0.25;
4153
* Keeping the global threshold at 25% means a regression in the 30–100ms
4254
* range is still caught (e.g. 50ms→63ms = +26%, flagged), while sub-30ms
4355
* metrics in this set get the wider 50% allowance.
56+
*
57+
* In BENCH_CANARY mode this is overridden to 1.0 (100%) — the canary's
58+
* purpose is to catch gross regressions (+50%+), not sub-30ms jitter.
4459
*/
45-
const NOISY_METRIC_THRESHOLD = 0.5;
60+
const NOISY_METRIC_THRESHOLD = BENCH_CANARY ? 1.0 : 0.5;
4661

4762
/**
4863
* Metric labels treated as high-variance and given the NOISY_METRIC_THRESHOLD
@@ -86,8 +101,12 @@ const NOISY_METRICS = new Set<string>(['No-op rebuild', '1-file rebuild', 'fnDep
86101
* v3.0.1–3.4.0), which 75% still flags, while absorbing the ≤71% shared-runner
87102
* jitter. Size metrics (DB bytes/file) are engine-independent and excluded from
88103
* this widening via SIZE_METRICS below — they keep the strict threshold.
104+
*
105+
* In BENCH_CANARY mode this is overridden to 1.5 (150%) — the canary targets
106+
* gross regressions only, and WASM incremental metrics have extreme variance
107+
* on shared runners.
89108
*/
90-
const WASM_TIMING_THRESHOLD = 0.75;
109+
const WASM_TIMING_THRESHOLD = BENCH_CANARY ? 1.5 : 0.75;
91110

92111
/**
93112
* Metric labels that measure size/count rather than wall-clock time. These are
@@ -608,6 +627,10 @@ interface IncrementalEntry {
608627
// in the default `npm test` run so docs commits that merge already-recorded
609628
// regressed history into main don't trigger false failures — by then the
610629
// release has already passed the gate.
630+
//
631+
// When BENCH_CANARY=1 (set by .github/workflows/perf-canary.yml), only the
632+
// incremental-benchmark suite runs and thresholds are raised (50%/100%/150%) — see
633+
// the BENCH_CANARY constant above.
611634
const RUN_REGRESSION_GUARD = process.env.RUN_REGRESSION_GUARD === '1';
612635

613636
describe.runIf(RUN_REGRESSION_GUARD)('Benchmark regression guard', () => {
@@ -627,7 +650,9 @@ describe.runIf(RUN_REGRESSION_GUARD)('Benchmark regression guard', () => {
627650
// Warn when KNOWN_REGRESSIONS entries are stale (more than 1 minor version
628651
// behind the current package version). This makes the stale-exemption
629652
// problem self-detecting rather than requiring manual bookkeeping.
630-
test('KNOWN_REGRESSIONS entries are not stale', () => {
653+
// Skipped in canary mode — this check is maintenance-only and irrelevant
654+
// for a lightweight build-time regression gate.
655+
test.skipIf(BENCH_CANARY)('KNOWN_REGRESSIONS entries are not stale', () => {
631656
// eslint-disable-next-line @typescript-eslint/no-require-imports
632657
const pkgVersion: string = JSON.parse(
633658
fs.readFileSync(path.join(ROOT, 'package.json'), 'utf8'),
@@ -656,18 +681,22 @@ describe.runIf(RUN_REGRESSION_GUARD)('Benchmark regression guard', () => {
656681
).toBe(0);
657682
});
658683

659-
// Validate newest-first ordering assumption for all history arrays
660-
test('build history is sorted newest-first', () => {
684+
// Validate newest-first ordering assumption for all history arrays.
685+
// Build/query ordering checks are skipped in canary mode (only incremental
686+
// history is updated by the canary workflow).
687+
test.skipIf(BENCH_CANARY)('build history is sorted newest-first', () => {
661688
assertNewestFirst(buildHistory, 'Build benchmark');
662689
});
663-
test('query history is sorted newest-first', () => {
690+
test.skipIf(BENCH_CANARY)('query history is sorted newest-first', () => {
664691
assertNewestFirst(queryHistory, 'Query benchmark');
665692
});
666693
test('incremental history is sorted newest-first', () => {
667694
assertNewestFirst(incrementalHistory, 'Incremental benchmark');
668695
});
669696

670-
describe('build benchmarks', () => {
697+
// In canary mode only the incremental suite runs — build/query/resolution
698+
// benchmarks are not measured by the perf-canary workflow.
699+
describe.skipIf(BENCH_CANARY)('build benchmarks', () => {
671700
for (const engineKey of ['native', 'wasm'] as const) {
672701
const pair = findLatestPair(buildHistory, (e) => e[engineKey] != null);
673702
if (!pair) continue;
@@ -700,7 +729,7 @@ describe.runIf(RUN_REGRESSION_GUARD)('Benchmark regression guard', () => {
700729
});
701730
});
702731

703-
describe('query benchmarks', () => {
732+
describe.skipIf(BENCH_CANARY)('query benchmarks', () => {
704733
for (const engineKey of ['native', 'wasm'] as const) {
705734
const pair = findLatestPair(queryHistory, (e) => e[engineKey] != null);
706735
if (!pair) continue;
@@ -803,7 +832,7 @@ describe.runIf(RUN_REGRESSION_GUARD)('Benchmark regression guard', () => {
803832
});
804833
});
805834

806-
describe('resolution benchmarks', () => {
835+
describe.skipIf(BENCH_CANARY)('resolution benchmarks', () => {
807836
/**
808837
* Resolution precision/recall regression thresholds.
809838
* These are percentage-point drops (not relative %) because resolution

0 commit comments

Comments
 (0)