ci(bench): add per-PR perf canary for extractor/graph/native changes (#1488)

carlos-alm · web-flow · commit 65bd28ae19b9 · 2026-06-13T15:05:16.000-06:00
* chore: gitignore napi-generated artifacts in crates/codegraph-core * chore(tests): remove unused biome suppression in visitor.test.ts * fix(titan-run): sync --start-from enum and phase-timestamp list with actual phases * fix(hooks): track Bash file modifications via before/after git status diff Adds snapshot-pre-bash.sh (PreToolUse Bash) + track-bash-writes.sh (PostToolUse Bash): the pre-hook captures git status --porcelain to a per-worktree temp file before each Bash call; the post-hook diffs the before/after state and appends newly modified or created files to .claude/session-edits.log. This closes the gap where files written by sed -i, printf redirects, tee, heredocs, or build tools (Cargo.lock, lockfiles) were never recorded, causing guard-git.sh to emit false-positive BLOCKED errors. Closes #1457 * chore(native): remove dead code (unused var, method, variant, fields) - clojure.rs: annotate lifetime-anchor assignment to silence false-positive - cfg.rs: remove never-called start_line_of method - complexity.rs: remove never-constructed NotHandled variant; convert irrefutable if-let patterns to plain let destructures - dataflow.rs: remove never-read callee fields from CallReturn/Destructured - incremental.rs: remove never-read lang field from CacheEntry cargo check and cargo clippy both clean after these changes. * refactor(native): extract emit_pts_alias_edges params into PtsAliasCtx struct * fix(wasm): sort call targets by confidence before emit to match native engine * fix(bench): add 2 warmup runs and raise INCREMENTAL_RUNS to 5 for incremental tiers * ci(bench): add per-PR perf canary for extractor/graph/native changes Adds .github/workflows/perf-canary.yml — a path-filtered workflow that fires on PRs touching src/extractors/, src/domain/graph/, or crates/** and runs only the incremental-benchmark suite (full build + no-op + 1-file rebuild, both engines). 
Catches the class of regressions that accumulated invisibly across the Phase 8.x PRs and were only detected at v3.12.0 publish time. The regression guard gains BENCH_CANARY=1 mode: raises thresholds to 50%/100%/150% (standard/noisy/WASM) and skips the build, query, and resolution suites — only incremental checks run. This absorbs shared-runner timing variance while still blocking catastrophic regressions (+98% full build, +1827% 1-file rebuild from v3.12.0). Closes #1433 * ci(bench): add permissions block, self-referential path filters, drop excess fetch-depth - Add `permissions: {}` to lock down token scope for the read-only canary - Add `scripts/update-incremental-report.ts` and `tests/benchmarks/regression-guard.test.ts` to path filter so PRs that modify the canary machinery itself also trigger the canary - Remove `fetch-depth: 0` (full history not needed; canary compares against committed benchmark data, not git refs) - Align `node-version: 22` with the integer format used in ci.yml
diff --git a/.github/workflows/perf-canary.yml b/.github/workflows/perf-canary.yml
@@ -0,0 +1,113 @@
+name: Perf Canary
+
+# Lightweight per-PR build-time regression gate for PRs that touch the
+# extractor, graph-builder, or native Rust layers — the parts of the codebase
+# that caused the v3.12.0 regressions (+1827% 1-file rebuild, +98% full build).
+#
+# Only the incremental-benchmark suite is run (full build + no-op + 1-file
+# rebuild for both engines). The regression guard uses BENCH_CANARY=1 mode,
+# which doubles every timing threshold (25%→50% standard, 50%→100% noisy,
+# 75%→150% WASM) — enough to catch catastrophic regressions despite CI variance.
+#
+# This is intentionally separate from the full pre-publish-benchmark job in
+# ci.yml, which runs unconditionally on every PR and measures the complete
+# suite. The canary completes in roughly 5–10 minutes; the full suite takes
+# 20–60 minutes.
+
+on:
+  pull_request:
+    paths:
+      - "src/extractors/**"
+      - "src/domain/graph/**"
+      - "crates/**"
+      - "scripts/benchmark.ts"
+      - "scripts/incremental-benchmark.ts"
+      - "scripts/lib/bench-config.ts"
+      - "scripts/lib/fork-engine.ts"
+      - "scripts/update-incremental-report.ts"
+      - "tests/benchmarks/regression-guard.test.ts"
+
+permissions: {}
+
+concurrency:
+  group: perf-canary-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  perf-canary:
+    name: Perf canary (incremental tiers)
+    runs-on: ubuntu-latest
+    env:
+      CODEGRAPH_FAST_SKIP_DIAG: "1"
+
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: actions/setup-node@v6
+        with:
+          node-version: 22
+          cache: "npm"
+
+      - name: Setup Rust
+        uses: dtolnay/rust-toolchain@stable
+
+      - name: Rust cache
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: crates/codegraph-core
+
+      - name: Install napi-rs CLI
+        timeout-minutes: 5
+        run: npm install -g @napi-rs/cli@3
+
+      - name: Build native addon
+        working-directory: crates/codegraph-core
+        run: napi build --release
+
+      - name: Install dependencies
+        timeout-minutes: 20
+        shell: bash
+        run: |
+          for attempt in 1 2 3; do
+            npm install && break
+            if [ "$attempt" -lt 3 ]; then
+              echo "::warning::npm install attempt $attempt failed, retrying in 15s..."
+              sleep 15
+            else
+              echo "::error::npm install failed after 3 attempts"
+              exit 1
+            fi
+          done
+
+      - name: Install native addon over published binary
+        run: node scripts/ci-install-native.mjs
+
+      # Build dist/ so benchmarks load the same compiled JS that ships to npm,
+      # matching the methodology used by the full pre-publish-benchmark gate.
+      - name: Build TypeScript
+        run: npm run build
+
+      - name: Run incremental benchmark
+        timeout-minutes: 15
+        run: |
+          STRIP_FLAG=$(node -e "const [M]=process.versions.node.split('.').map(Number); console.log(M>=23?'--strip-types':'--experimental-strip-types')")
+          node $STRIP_FLAG --import ./scripts/ts-resolve-loader.js scripts/incremental-benchmark.ts --version dev --dist > incremental-canary-result.json
+
+      - name: Update incremental report
+        run: |
+          STRIP_FLAG=$(node -e "const [M]=process.versions.node.split('.').map(Number); console.log(M>=23?'--strip-types':'--experimental-strip-types')")
+          node $STRIP_FLAG scripts/update-incremental-report.ts incremental-canary-result.json
+
+      - name: Regression guard (50% threshold)
+        env:
+          RUN_REGRESSION_GUARD: "1"
+          BENCH_CANARY: "1"
+        run: npm run test:regression-guard
+
+      - name: Upload canary result
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: incremental-canary-result
+          path: incremental-canary-result.json
+          if-no-files-found: warn
diff --git a/tests/benchmarks/regression-guard.test.ts b/tests/benchmarks/regression-guard.test.ts
@@ -16,6 +16,16 @@ import { describe, expect, test } from 'vitest';
 
 // ── Configuration ────────────────────────────────────────────────────────
 
+/**
+ * When BENCH_CANARY=1, only incremental-benchmark checks run and every timing
+ * threshold is doubled: 50% standard, 100% noisy, 150% WASM. This mode is used
+ * by the per-PR perf-canary workflow (.github/workflows/perf-canary.yml), which
+ * runs on PRs touching src/extractors/, src/domain/graph/, crates/, or the
+ * benchmark machinery itself. The looser thresholds absorb CI runner variance
+ * while still catching catastrophic regressions like v3.12.0 (+98%/+1827%).
+ */
+const BENCH_CANARY = process.env.BENCH_CANARY === '1';
+
 /**
  * Maximum allowed regression (as a fraction, e.g. 0.25 = 25%).
  *
@@ -26,8 +36,10 @@ import { describe, expect, test } from 'vitest';
  *
  * Genuinely high-variance sub-30ms metrics get a wider tolerance via
  * `NOISY_METRICS` below — see that set's docstring for rationale.
+ *
+ * In BENCH_CANARY mode this is overridden to 0.5 (50%) — see above.
  */
-const REGRESSION_THRESHOLD = 0.25;
+const REGRESSION_THRESHOLD = BENCH_CANARY ? 0.5 : 0.25;
 
 /**
  * Wider regression threshold applied to metrics in NOISY_METRICS.
@@ -41,8 +53,11 @@ const REGRESSION_THRESHOLD = 0.25;
  * Keeping the global threshold at 25% means a regression in the 30–100ms
  * range is still caught (e.g. 50ms→63ms = +26%, flagged), while sub-30ms
  * metrics in this set get the wider 50% allowance.
+ *
+ * In BENCH_CANARY mode this is overridden to 1.0 (100%) — the canary's purpose
+ * is to catch gross regressions (a doubling or worse here), not sub-30ms jitter.
  */
-const NOISY_METRIC_THRESHOLD = 0.5;
+const NOISY_METRIC_THRESHOLD = BENCH_CANARY ? 1.0 : 0.5;
 
 /**
  * Metric labels treated as high-variance and given the NOISY_METRIC_THRESHOLD
@@ -86,8 +101,12 @@ const NOISY_METRICS = new Set<string>(['No-op rebuild', '1-file rebuild', 'fnDep
  * v3.0.1–3.4.0), which 75% still flags, while absorbing the ≤71% shared-runner
  * jitter. Size metrics (DB bytes/file) are engine-independent and excluded from
  * this widening via SIZE_METRICS below — they keep the strict threshold.
+ *
+ * In BENCH_CANARY mode this is overridden to 1.5 (150%) — the canary targets
+ * gross regressions only, and WASM incremental metrics have extreme variance
+ * on shared runners.
  */
-const WASM_TIMING_THRESHOLD = 0.75;
+const WASM_TIMING_THRESHOLD = BENCH_CANARY ? 1.5 : 0.75;
 
 /**
  * Metric labels that measure size/count rather than wall-clock time. These are
@@ -622,6 +641,10 @@ interface IncrementalEntry {
 // in the default `npm test` run so docs commits that merge already-recorded
 // regressed history into main don't trigger false failures — by then the
 // release has already passed the gate.
+//
+// When BENCH_CANARY=1 (set by .github/workflows/perf-canary.yml), only the
+// incremental-benchmark suite runs and thresholds are doubled (50%/100%/150%
+// for standard/noisy/WASM) — see the BENCH_CANARY constant above.
 const RUN_REGRESSION_GUARD = process.env.RUN_REGRESSION_GUARD === '1';
 
 describe.runIf(RUN_REGRESSION_GUARD)('Benchmark regression guard', () => {
@@ -641,7 +664,9 @@ describe.runIf(RUN_REGRESSION_GUARD)('Benchmark regression guard', () => {
   // Warn when KNOWN_REGRESSIONS entries are stale (more than 1 minor version
   // behind the current package version).  This makes the stale-exemption
   // problem self-detecting rather than requiring manual bookkeeping.
-  test('KNOWN_REGRESSIONS entries are not stale', () => {
+  // Skipped in canary mode — this check is maintenance-only and irrelevant
+  // for a lightweight build-time regression gate.
+  test.skipIf(BENCH_CANARY)('KNOWN_REGRESSIONS entries are not stale', () => {
     // eslint-disable-next-line @typescript-eslint/no-require-imports
     const pkgVersion: string = JSON.parse(
       fs.readFileSync(path.join(ROOT, 'package.json'), 'utf8'),
@@ -670,18 +695,22 @@ describe.runIf(RUN_REGRESSION_GUARD)('Benchmark regression guard', () => {
     ).toBe(0);
   });
 
-  // Validate newest-first ordering assumption for all history arrays
-  test('build history is sorted newest-first', () => {
+  // Validate newest-first ordering assumption for all history arrays.
+  // Build/query ordering checks are skipped in canary mode (only incremental
+  // history is updated by the canary workflow).
+  test.skipIf(BENCH_CANARY)('build history is sorted newest-first', () => {
     assertNewestFirst(buildHistory, 'Build benchmark');
   });
-  test('query history is sorted newest-first', () => {
+  test.skipIf(BENCH_CANARY)('query history is sorted newest-first', () => {
     assertNewestFirst(queryHistory, 'Query benchmark');
   });
   test('incremental history is sorted newest-first', () => {
     assertNewestFirst(incrementalHistory, 'Incremental benchmark');
   });
 
-  describe('build benchmarks', () => {
+  // In canary mode only the incremental suite runs — build/query/resolution
+  // benchmarks are not measured by the perf-canary workflow.
+  describe.skipIf(BENCH_CANARY)('build benchmarks', () => {
     for (const engineKey of ['native', 'wasm'] as const) {
       const pair = findLatestPair(buildHistory, (e) => e[engineKey] != null);
       if (!pair) continue;
@@ -714,7 +743,7 @@ describe.runIf(RUN_REGRESSION_GUARD)('Benchmark regression guard', () => {
     });
   });
 
-  describe('query benchmarks', () => {
+  describe.skipIf(BENCH_CANARY)('query benchmarks', () => {
     for (const engineKey of ['native', 'wasm'] as const) {
       const pair = findLatestPair(queryHistory, (e) => e[engineKey] != null);
       if (!pair) continue;
@@ -817,7 +846,7 @@ describe.runIf(RUN_REGRESSION_GUARD)('Benchmark regression guard', () => {
     });
   });
 
-  describe('resolution benchmarks', () => {
+  describe.skipIf(BENCH_CANARY)('resolution benchmarks', () => {
     /**
      * Resolution precision/recall regression thresholds.
      * These are percentage-point drops (not relative %) because resolution