shakacode · justin808 · Apr 20, 2026 · Apr 18, 2026 · Apr 18, 2026 · Apr 18, 2026
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -157,7 +157,12 @@ jobs:
       # ============================================
 
       - name: Install Bencher CLI
-        uses: bencherdev/bencher@main
+        # Pinned: step 7a's alert-detection heuristic greps stderr for
+        # "Alert[s]" / "threshold violation" / "boundary violation". Those strings
+        # are not a documented API contract, so upgrading the CLI requires
+        # re-verifying that the expected phrases still appear in alert output.
+        # Expected stderr phrases at v0.6.2: "🚨 N Alerts", "Alert detected".
+        uses: bencherdev/bencher@v0.6.2
-        uses: bencherdev/bencher@v0.6.2
+        uses: bencherdev/bencher@v0.6.2  # TODO: pin to SHA once verified, e.g. bencherdev/bencher@<sha>
-        uses: bencherdev/bencher@v0.6.2
+        uses: bencherdev/bencher@v0.6.2  # TODO: pin to SHA once verified, e.g. bencherdev/bencher@<sha>
 
       - name: Add tools directory to PATH
         run: |
@@ -650,11 +655,27 @@ jobs:
             echo "::warning::Bencher baseline not found for start-point hash '$START_POINT_HASH' — regression comparison unavailable for this run"
             BENCHER_EXIT_CODE=0
           fi
-          rm -f "$BENCHER_STDERR"
 
-          # Export exit code early so downstream alerting steps (7b/7c) always see it,
-          # even if the post-processing below (step summary, PR comments) fails.
+          # Distinguish regression alerts from operational failures (auth/API/network/CLI)
+          # so that main can warn-only on the former while still failing hard on the latter.
+          # Match Bencher's actual alert output (e.g., "🚨 2 Alerts", "Alert detected") via
+          # a word-bounded, case-insensitive "Alert[s]". The word boundary (\b) already
+          # prevents matching lowercase "alerts" inside URL paths (e.g., "/v0/.../alerts"),
+          # so the case-insensitive flag adds robustness against future CLI wording changes
+          # (e.g., lowercase "1 alert") without reintroducing the URL-path false positive.
-          # Match Bencher's actual alert output (e.g., "🚨 2 Alerts", "Alert detected") via
-          # a word-bounded, case-insensitive "Alert[s]". The word boundary (\b) already
-          # prevents matching lowercase "alerts" inside URL paths (e.g., "/v0/.../alerts"),
-          # so the case-insensitive flag adds robustness against future CLI wording changes
-          # (e.g., lowercase "1 alert") without reintroducing the URL-path false positive.
+          # Match Bencher's actual alert output (e.g., "🚨 2 Alerts", "Alert detected") via
+          # a word-bounded, case-insensitive "Alert[s]". Note: \b does NOT prevent matching
+          # "alerts" inside URL paths like "/v0/alerts" (/ is \W so a word boundary exists
+          # before "a"). False positives are unlikely in practice because Bencher does not
+          # emit raw API endpoint paths in its stderr alert output.
-          # Match Bencher's actual alert output (e.g., "🚨 2 Alerts", "Alert detected") via
-          # a word-bounded, case-insensitive "Alert[s]". The word boundary (\b) already
-          # prevents matching lowercase "alerts" inside URL paths (e.g., "/v0/.../alerts"),
-          # so the case-insensitive flag adds robustness against future CLI wording changes
-          # (e.g., lowercase "1 alert") without reintroducing the URL-path false positive.
+          # Match Bencher's actual alert output (e.g., "🚨 2 Alerts", "Alert detected") via
+          # a word-bounded, case-insensitive "Alert[s]". Note: \b does NOT prevent matching
+          # "alerts" inside URL paths like "/v0/alerts" (/ is \W so a word boundary exists
+          # before "a"). False positives are unlikely in practice because Bencher does not
+          # emit raw API endpoint paths in its stderr alert output.
+          BENCHER_HAS_ALERT=0
+          if [ $BENCHER_EXIT_CODE -ne 0 ] && \
+             { grep -qiE "\bAlerts?\b" "$BENCHER_STDERR" || \
+               grep -qiE "threshold violation|boundary violation" "$BENCHER_STDERR"; }; then
+            BENCHER_HAS_ALERT=1
-          BENCHER_HAS_ALERT=0
-          if [ $BENCHER_EXIT_CODE -ne 0 ] && \
-             { grep -qE "\bAlerts?\b" "$BENCHER_STDERR" || \
-               grep -qiE "threshold violation|boundary violation" "$BENCHER_STDERR"; }; then
-            BENCHER_HAS_ALERT=1
+          BENCHER_HAS_ALERT=0
+          if [ $BENCHER_EXIT_CODE -ne 0 ] && \
+             { grep -qiE "\bAlerts?\b" "$BENCHER_STDERR" || \
+               grep -qiE "threshold violation|boundary violation" "$BENCHER_STDERR"; }; then
+            BENCHER_HAS_ALERT=1
-          BENCHER_HAS_ALERT=0
-          if [ $BENCHER_EXIT_CODE -ne 0 ] && \
-             { grep -qE "\bAlerts?\b" "$BENCHER_STDERR" || \
-               grep -qiE "threshold violation|boundary violation" "$BENCHER_STDERR"; }; then
-            BENCHER_HAS_ALERT=1
+          BENCHER_HAS_ALERT=0
+          if [ $BENCHER_EXIT_CODE -ne 0 ] && \
+             { grep -qiE "\bAlerts?\b" "$BENCHER_STDERR" || \
+               grep -qiE "threshold violation|boundary violation" "$BENCHER_STDERR"; }; then
+            BENCHER_HAS_ALERT=1
+          fi
-          # Distinguish regression alerts from operational failures (auth/API/network/CLI)
-          # so that main can warn-only on the former while still failing hard on the latter.
-          # Match Bencher's actual alert output (e.g., "🚨 2 Alerts", "Alert detected") via
-          # a word-bounded, case-insensitive "Alert[s]". The word boundary (\b) already
-          # prevents matching lowercase "alerts" inside URL paths (e.g., "/v0/.../alerts"),
-          # so the case-insensitive flag adds robustness against future CLI wording changes
-          # (e.g., lowercase "1 alert") without reintroducing the URL-path false positive.
-          BENCHER_HAS_ALERT=0
-          if [ $BENCHER_EXIT_CODE -ne 0 ] && \
-             { grep -qiE "\bAlerts?\b" "$BENCHER_STDERR" || \
-               grep -qiE "threshold violation|boundary violation" "$BENCHER_STDERR"; }; then
-            BENCHER_HAS_ALERT=1
-          fi
+          # Match Bencher's pinned v0.6.2 alert output (e.g., "🚨 2 Alerts", "Alert detected")
+          # with a case-sensitive "Alert[s]" so lowercase URL paths such as "/v0/.../alerts"
+          # in operational errors are not classified as performance regressions.
+          BENCHER_HAS_ALERT=0
+          if [ $BENCHER_EXIT_CODE -ne 0 ] && \
+             { grep -qE "\bAlerts?\b" "$BENCHER_STDERR" || \
+               grep -qiE "threshold violation|boundary violation" "$BENCHER_STDERR"; }; then
+            BENCHER_HAS_ALERT=1
+          fi
-          # Distinguish regression alerts from operational failures (auth/API/network/CLI)
-          # so that main can warn-only on the former while still failing hard on the latter.
-          # Match Bencher's actual alert output (e.g., "🚨 2 Alerts", "Alert detected") via
-          # a word-bounded, case-insensitive "Alert[s]". The word boundary (\b) already
-          # prevents matching lowercase "alerts" inside URL paths (e.g., "/v0/.../alerts"),
-          # so the case-insensitive flag adds robustness against future CLI wording changes
-          # (e.g., lowercase "1 alert") without reintroducing the URL-path false positive.
-          BENCHER_HAS_ALERT=0
-          if [ $BENCHER_EXIT_CODE -ne 0 ] && \
-             { grep -qiE "\bAlerts?\b" "$BENCHER_STDERR" || \
-               grep -qiE "threshold violation|boundary violation" "$BENCHER_STDERR"; }; then
-            BENCHER_HAS_ALERT=1
-          fi
+          # Match Bencher's pinned v0.6.2 alert output (e.g., "🚨 2 Alerts", "Alert detected")
+          # with a case-sensitive "Alert[s]" so lowercase URL paths such as "/v0/.../alerts"
+          # in operational errors are not classified as performance regressions.
+          BENCHER_HAS_ALERT=0
+          if [ $BENCHER_EXIT_CODE -ne 0 ] && \
+             { grep -qE "\bAlerts?\b" "$BENCHER_STDERR" || \
+               grep -qiE "threshold violation|boundary violation" "$BENCHER_STDERR"; }; then
+            BENCHER_HAS_ALERT=1
+          fi
+          # Cleanup is also handled by the `trap 'rm -f "$BENCHER_STDERR"' EXIT` above,
+          # so we don't need an explicit rm here.
+
+          # Export exit code and alert flag early so downstream steps (7b/7c/7d) always
+          # see them, even if the post-processing below (step summary, PR comments) fails.
           echo "BENCHER_EXIT_CODE=$BENCHER_EXIT_CODE" >> "$GITHUB_ENV"
+          echo "BENCHER_HAS_ALERT=$BENCHER_HAS_ALERT" >> "$GITHUB_ENV"
 
           # Post report to job summary and PR comment(s) if there's HTML output
           if [ -s bench_results/bencher_report.html ]; then
@@ -703,7 +724,7 @@ jobs:
       # STEP 7b: ALERT ON MAIN REGRESSION
       # ============================================
       - name: Create GitHub Issue for main regression
-        if: github.event_name == 'push' && github.ref == 'refs/heads/main' && env.BENCHER_EXIT_CODE != '0'
+        if: github.event_name == 'push' && github.ref == 'refs/heads/main' && env.BENCHER_EXIT_CODE != '0' && env.BENCHER_HAS_ALERT == '1'
         env:
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
@@ -789,14 +810,35 @@ jobs:
           fi
 
       # ============================================
-      # STEP 7c: FAIL WORKFLOW ON MAIN REGRESSION
+      # STEP 7c: WARN ON MAIN REGRESSION
+      # ============================================
+      # Regressions are surfaced via the Bencher dashboard, the auto-opened tracking
+      # issue (Step 7b), and the job-summary report. Do not fail the workflow on regression
+      # alerts: single-run alerts on GitHub-hosted runners are dominated by environmental
+      # noise (alert sets across consecutive runs have ~0 overlap), so blocking main
+      # produces churn without signal. Re-enable a hard gate once thresholds/sample-size
+      # tolerate that noise. Operational errors (auth/API/network/CLI) are handled in
+      # step 7d and still fail the workflow.
+      - name: Warn if Bencher detected regression on main
+        if: github.event_name == 'push' && github.ref == 'refs/heads/main' && env.BENCHER_EXIT_CODE != '0' && env.BENCHER_HAS_ALERT == '1'
+        env:
+          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+        run: |
+          echo "::warning::Bencher flagged a regression on main (exit ${BENCHER_EXIT_CODE:-1}). See the open regression issue (label: performance-regression), the Bencher dashboard, and the workflow run: ${RUN_URL}"
+
+      # ============================================
+      # STEP 7d: FAIL ON NON-REGRESSION BENCHER ERRORS
       # ============================================
-      # Only fail on main — PR benchmarks are informational (triggered by 'benchmark' label).
-      # Regressions on PRs are surfaced via Bencher report comments, not workflow failures.
-      - name: Fail workflow if Bencher detected regression on main
-        if: github.event_name == 'push' && github.ref == 'refs/heads/main' && env.BENCHER_EXIT_CODE != '0'
+      # If bencher exited non-zero but stderr did not contain regression alert
+      # indicators, this is an operational failure (auth/API/network/CLI) rather than
+      # a performance signal. These should still fail the workflow on main so a broken
+      # benchmark pipeline (missing uploads, bad credentials, Bencher outage, etc.) is
+      # not silently hidden behind a warning annotation.
+      - name: Fail on non-regression Bencher error on main
+        if: github.event_name == 'push' && github.ref == 'refs/heads/main' && env.BENCHER_EXIT_CODE != '0' && env.BENCHER_HAS_ALERT != '1'
         run: |
-          echo "Bencher detected a regression (exit code: ${BENCHER_EXIT_CODE:-1})"
+          echo "::error::Bencher exited ${BENCHER_EXIT_CODE:-1} on main with no regression alert in stderr — this indicates an operational failure (auth/API/network/CLI), not a performance regression. Check the logs above."
+          # Preserve the original Bencher exit code in CI logs for diagnostic context.
           exit "${BENCHER_EXIT_CODE:-1}"
 
       # ============================================