From 9cac6cff126c6737c4dbda3397e3973816730dcb Mon Sep 17 00:00:00 2001
From: Justin Gordon <justin@shakacode.com>
Date: Fri, 17 Apr 2026 21:20:55 -1000
Subject: [PATCH 1/4] ci: make Bencher main-regression alert a warning, not a
 hard fail

Single-run alerts on GitHub-hosted runners are dominated by environmental
noise: the set of (benchmark, measure) pairs flagged varies almost entirely
across consecutive pushes on main (Jaccard ~0 between adjacent runs), while
every non-docs commit fails the gate. Tracking issue #3116 has accumulated
auto-regression comments with no actionable signal.

Retain the Bencher dashboard upload, the auto-opened tracking issue, and the
job-summary report (steps 7a/7b). Downgrade step 7c from `exit 1` to a
`::warning::` annotation so main stops being permanently red. Re-enable a
hard gate later when thresholds/sample-size tolerate the runner noise.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/benchmark.yml | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index e26de8ba49..6f6ed0012a 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -789,15 +789,17 @@ jobs:
           fi
 
       # ============================================
-      # STEP 7c: FAIL WORKFLOW ON MAIN REGRESSION
+      # STEP 7c: WARN ON MAIN REGRESSION
       # ============================================
-      # Only fail on main — PR benchmarks are informational (triggered by 'benchmark' label).
-      # Regressions on PRs are surfaced via Bencher report comments, not workflow failures.
-      - name: Fail workflow if Bencher detected regression on main
+      # Regressions are surfaced via the Bencher dashboard, the auto-opened tracking
+      # issue (Step 7b), and the job-summary report. Do not fail the workflow: single-run
+      # alerts on GitHub-hosted runners are dominated by environmental noise (alert sets
+      # across consecutive runs have ~0 overlap), so blocking main produces churn without
+      # signal. Re-enable a hard gate once thresholds/sample-size tolerate that noise.
+      - name: Warn if Bencher detected regression on main
         if: github.event_name == 'push' && github.ref == 'refs/heads/main' && env.BENCHER_EXIT_CODE != '0'
         run: |
-          echo "Bencher detected a regression (exit code: ${BENCHER_EXIT_CODE:-1})"
-          exit "${BENCHER_EXIT_CODE:-1}"
+          echo "::warning::Bencher flagged a regression on main (exit ${BENCHER_EXIT_CODE:-1}). See tracking issue #3116 and the Bencher dashboard."
 
       # ============================================
       # STEP 8: WORKFLOW COMPLETION

From e39e00ba6ea4b1a7b817a9255099f8f9c98b81f2 Mon Sep 17 00:00:00 2001
From: Justin Gordon <justin@shakacode.com>
Date: Fri, 17 Apr 2026 22:01:40 -1000
Subject: [PATCH 2/4] ci: still fail main on operational Bencher errors, polish
 warning

Distinguish regression alerts from operational failures so the warn-only
policy on main only applies to genuine performance signals:

- Set BENCHER_HAS_ALERT=1 when bencher stderr mentions alert/threshold/
  boundary indicators; leave it 0 for auth/API/network/CLI failures.
- Step 7b (auto-open issue) and step 7c (::warning::) now require
  BENCHER_HAS_ALERT == 1 so operational errors do not create bogus
  regression issues.
- New step 7d fails the workflow on non-regression Bencher errors on
  main, preserving the original safety net for a broken benchmark
  pipeline (missing uploads, bad credentials, Bencher outage).
- Step 7c warning no longer hardcodes a specific issue number
  (#3116 will churn as issues are closed/reopened) and now includes
  the workflow run URL for a direct clickable link.

Addresses review feedback on #3168.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/benchmark.yml | 44 ++++++++++++++++++++++++++-------
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 6f6ed0012a..822d436f31 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -650,11 +650,19 @@ jobs:
             echo "::warning::Bencher baseline not found for start-point hash '$START_POINT_HASH' — regression comparison unavailable for this run"
             BENCHER_EXIT_CODE=0
           fi
+
+          # Distinguish regression alerts from operational failures (auth/API/network/CLI)
+          # so that main can warn-only on the former while still failing hard on the latter.
+          BENCHER_HAS_ALERT=0
+          if [ $BENCHER_EXIT_CODE -ne 0 ] && grep -qiE "alert|threshold violation|boundary violation" "$BENCHER_STDERR"; then
+            BENCHER_HAS_ALERT=1
+          fi
           rm -f "$BENCHER_STDERR"
 
-          # Export exit code early so downstream alerting steps (7b/7c) always see it,
-          # even if the post-processing below (step summary, PR comments) fails.
+          # Export exit code and alert flag early so downstream steps (7b/7c/7d) always
+          # see them, even if the post-processing below (step summary, PR comments) fails.
           echo "BENCHER_EXIT_CODE=$BENCHER_EXIT_CODE" >> "$GITHUB_ENV"
+          echo "BENCHER_HAS_ALERT=$BENCHER_HAS_ALERT" >> "$GITHUB_ENV"
 
           # Post report to job summary and PR comment(s) if there's HTML output
           if [ -s bench_results/bencher_report.html ]; then
@@ -703,7 +711,7 @@ jobs:
       # STEP 7b: ALERT ON MAIN REGRESSION
       # ============================================
       - name: Create GitHub Issue for main regression
-        if: github.event_name == 'push' && github.ref == 'refs/heads/main' && env.BENCHER_EXIT_CODE != '0'
+        if: github.event_name == 'push' && github.ref == 'refs/heads/main' && env.BENCHER_EXIT_CODE != '0' && env.BENCHER_HAS_ALERT == '1'
         env:
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
@@ -792,14 +800,32 @@ jobs:
       # STEP 7c: WARN ON MAIN REGRESSION
       # ============================================
       # Regressions are surfaced via the Bencher dashboard, the auto-opened tracking
-      # issue (Step 7b), and the job-summary report. Do not fail the workflow: single-run
-      # alerts on GitHub-hosted runners are dominated by environmental noise (alert sets
-      # across consecutive runs have ~0 overlap), so blocking main produces churn without
-      # signal. Re-enable a hard gate once thresholds/sample-size tolerate that noise.
+      # issue (Step 7b), and the job-summary report. Do not fail the workflow on regression
+      # alerts: single-run alerts on GitHub-hosted runners are dominated by environmental
+      # noise (alert sets across consecutive runs have ~0 overlap), so blocking main
+      # produces churn without signal. Re-enable a hard gate once thresholds/sample-size
+      # tolerate that noise. Operational errors (auth/API/network/CLI) are handled in
+      # step 7d and still fail the workflow.
       - name: Warn if Bencher detected regression on main
-        if: github.event_name == 'push' && github.ref == 'refs/heads/main' && env.BENCHER_EXIT_CODE != '0'
+        if: github.event_name == 'push' && github.ref == 'refs/heads/main' && env.BENCHER_EXIT_CODE != '0' && env.BENCHER_HAS_ALERT == '1'
+        env:
+          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+        run: |
+          echo "::warning::Bencher flagged a regression on main (exit ${BENCHER_EXIT_CODE:-1}). See the open regression issue (label: performance-regression), the Bencher dashboard, and the workflow run: ${RUN_URL}"
+
+      # ============================================
+      # STEP 7d: FAIL ON NON-REGRESSION BENCHER ERRORS
+      # ============================================
+      # If bencher exited non-zero but stderr did not contain regression alert
+      # indicators, this is an operational failure (auth/API/network/CLI) rather than
+      # a performance signal. These should still fail the workflow on main so a broken
+      # benchmark pipeline (missing uploads, bad credentials, Bencher outage, etc.) is
+      # not silently hidden behind a warning annotation.
+      - name: Fail on non-regression Bencher error on main
+        if: github.event_name == 'push' && github.ref == 'refs/heads/main' && env.BENCHER_EXIT_CODE != '0' && env.BENCHER_HAS_ALERT != '1'
         run: |
-          echo "::warning::Bencher flagged a regression on main (exit ${BENCHER_EXIT_CODE:-1}). See tracking issue #3116 and the Bencher dashboard."
+          echo "::error::Bencher exited ${BENCHER_EXIT_CODE:-1} on main with no regression alert in stderr — this indicates an operational failure (auth/API/network/CLI), not a performance regression. Check the logs above."
+          exit 1
 
       # ============================================
       # STEP 8: WORKFLOW COMPLETION

From 12fa8f2ab07a0a273c2d2709dc880f326c4de0a6 Mon Sep 17 00:00:00 2001
From: Justin Gordon <justin@shakacode.com>
Date: Sat, 18 Apr 2026 12:21:12 -1000
Subject: [PATCH 3/4] ci: tighten Bencher alert detection to word-bounded
 "Alert[s]"

Narrows BENCHER_HAS_ALERT from case-insensitive "alert" to word-bounded
case-sensitive "Alert" / "Alerts" so that lowercase "alert" appearing in
bencher URL paths (e.g., "/v0/.../alerts") or CLI text cannot misclassify
an operational failure as a regression and let main go green.

"threshold violation" and "boundary violation" remain case-insensitive
since those specific phrases are unlikely to appear outside real alert
output.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/benchmark.yml | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 822d436f31..71b1e5eb01 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -653,8 +653,14 @@ jobs:
 
           # Distinguish regression alerts from operational failures (auth/API/network/CLI)
           # so that main can warn-only on the former while still failing hard on the latter.
+          # Match Bencher's actual alert output (e.g., "🚨 2 Alerts", "Alert detected") via
+          # a word-bounded, case-sensitive "Alert[s]" — a bare case-insensitive "alert" would
+          # also match lowercase "alert" in URL paths (e.g., "/v0/.../alerts") or CLI text,
+          # which could misclassify an operational failure as a regression and let main go green.
           BENCHER_HAS_ALERT=0
-          if [ $BENCHER_EXIT_CODE -ne 0 ] && grep -qiE "alert|threshold violation|boundary violation" "$BENCHER_STDERR"; then
+          if [ $BENCHER_EXIT_CODE -ne 0 ] && \
+             { grep -qE "\bAlerts?\b" "$BENCHER_STDERR" || \
+               grep -qiE "threshold violation|boundary violation" "$BENCHER_STDERR"; }; then
             BENCHER_HAS_ALERT=1
           fi
           rm -f "$BENCHER_STDERR"

From 6f2bb80154b3b68a29607a626b51f1f848fc2003 Mon Sep 17 00:00:00 2001
From: Justin Gordon <justin@shakacode.com>
Date: Sat, 18 Apr 2026 23:40:49 -1000
Subject: [PATCH 4/4] ci: pin Bencher CLI, broaden alert match, preserve exit
 code

Addresses follow-up review feedback on the Bencher main-regression gate:

- Pin bencherdev/bencher to v0.6.2 and document the expected stderr
  phrases ("Alert[s]", "threshold violation", "boundary violation") so a
  silent CLI upgrade can't flip the alert classification.
- Switch the BENCHER_HAS_ALERT grep from case-sensitive `\bAlerts?\b` to
  case-insensitive `\bAlerts?\b` to stay consistent with the 404 guard
  and avoid misclassifying a lowercase `alert` variant as operational.
  Word boundaries still prevent matches against URL paths like
  `/v0/.../alerts`, so there's no new false-positive risk.
- Propagate the original Bencher exit code from step 7d (`exit
  "${BENCHER_EXIT_CODE:-1}"` instead of bare `exit 1`) to preserve
  diagnostic context in CI logs.
- Drop the redundant `rm -f "$BENCHER_STDERR"` now that the
  `trap 'rm -f' EXIT` registered above handles cleanup.
---
 .github/workflows/benchmark.yml | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 71b1e5eb01..5fda166e19 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -157,7 +157,12 @@ jobs:
       # ============================================
 
       - name: Install Bencher CLI
-        uses: bencherdev/bencher@main
+        # Pinned: step 7a's alert-detection heuristic greps stderr for
+        # "Alert[s]" / "threshold violation" / "boundary violation". Those strings
+        # are not a documented API contract, so upgrading the CLI requires
+        # re-verifying that the expected phrases still appear in alert output.
+        # Expected stderr phrases at v0.6.2: "🚨 N Alerts", "Alert detected".
+        uses: bencherdev/bencher@v0.6.2
 
       - name: Add tools directory to PATH
         run: |
@@ -654,16 +659,18 @@ jobs:
           # Distinguish regression alerts from operational failures (auth/API/network/CLI)
           # so that main can warn-only on the former while still failing hard on the latter.
           # Match Bencher's actual alert output (e.g., "🚨 2 Alerts", "Alert detected") via
-          # a word-bounded, case-sensitive "Alert[s]" — a bare case-insensitive "alert" would
-          # also match lowercase "alert" in URL paths (e.g., "/v0/.../alerts") or CLI text,
-          # which could misclassify an operational failure as a regression and let main go green.
+          # a word-bounded, case-insensitive "Alert[s]". The word boundary (\b) already
+          # prevents matching lowercase "alerts" inside URL paths (e.g., "/v0/.../alerts"),
+          # so the case-insensitive flag adds robustness against future CLI wording changes
+          # (e.g., lowercase "1 alert") without reintroducing the URL-path false positive.
           BENCHER_HAS_ALERT=0
           if [ $BENCHER_EXIT_CODE -ne 0 ] && \
-             { grep -qE "\bAlerts?\b" "$BENCHER_STDERR" || \
+             { grep -qiE "\bAlerts?\b" "$BENCHER_STDERR" || \
                grep -qiE "threshold violation|boundary violation" "$BENCHER_STDERR"; }; then
             BENCHER_HAS_ALERT=1
           fi
-          rm -f "$BENCHER_STDERR"
+          # Cleanup is also handled by the `trap 'rm -f "$BENCHER_STDERR"' EXIT` above,
+          # so we don't need an explicit rm here.
 
           # Export exit code and alert flag early so downstream steps (7b/7c/7d) always
           # see them, even if the post-processing below (step summary, PR comments) fails.
@@ -831,7 +838,8 @@ jobs:
         if: github.event_name == 'push' && github.ref == 'refs/heads/main' && env.BENCHER_EXIT_CODE != '0' && env.BENCHER_HAS_ALERT != '1'
         run: |
           echo "::error::Bencher exited ${BENCHER_EXIT_CODE:-1} on main with no regression alert in stderr — this indicates an operational failure (auth/API/network/CLI), not a performance regression. Check the logs above."
-          exit 1
+          # Preserve the original Bencher exit code in CI logs for diagnostic context.
+          exit "${BENCHER_EXIT_CODE:-1}"
 
       # ============================================
       # STEP 8: WORKFLOW COMPLETION