From 9cac6cff126c6737c4dbda3397e3973816730dcb Mon Sep 17 00:00:00 2001 From: Justin Gordon Date: Fri, 17 Apr 2026 21:20:55 -1000 Subject: [PATCH 1/4] ci: make Bencher main-regression alert a warning, not a hard fail Single-run alerts on GitHub-hosted runners are dominated by environmental noise: the set of (benchmark, measure) pairs flagged varies almost entirely across consecutive pushes on main (Jaccard ~0 between adjacent runs), while every non-docs commit fails the gate. Tracking issue #3116 has accumulated auto-regression comments with no actionable signal. Retain the Bencher dashboard upload, the auto-opened tracking issue, and the job-summary report (steps 7a/7b). Downgrade step 7c from `exit 1` to a `::warning::` annotation so main stops being permanently red. Re-enable a hard gate later when thresholds/sample-size tolerate the runner noise. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/benchmark.yml | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index e26de8ba49..6f6ed0012a 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -789,15 +789,17 @@ jobs: fi # ============================================ - # STEP 7c: FAIL WORKFLOW ON MAIN REGRESSION + # STEP 7c: WARN ON MAIN REGRESSION # ============================================ - # Only fail on main — PR benchmarks are informational (triggered by 'benchmark' label). - # Regressions on PRs are surfaced via Bencher report comments, not workflow failures. - - name: Fail workflow if Bencher detected regression on main + # Regressions are surfaced via the Bencher dashboard, the auto-opened tracking + # issue (Step 7b), and the job-summary report. Do not fail the workflow: single-run + # alerts on GitHub-hosted runners are dominated by environmental noise (alert sets + # across consecutive runs have ~0 overlap), so blocking main produces churn without + # signal. Re-enable a hard gate once thresholds/sample-size tolerate that noise. + - name: Warn if Bencher detected regression on main if: github.event_name == 'push' && github.ref == 'refs/heads/main' && env.BENCHER_EXIT_CODE != '0' run: | - echo "Bencher detected a regression (exit code: ${BENCHER_EXIT_CODE:-1})" - exit "${BENCHER_EXIT_CODE:-1}" + echo "::warning::Bencher flagged a regression on main (exit ${BENCHER_EXIT_CODE:-1}). See tracking issue #3116 and the Bencher dashboard." # ============================================ # STEP 8: WORKFLOW COMPLETION From e39e00ba6ea4b1a7b817a9255099f8f9c98b81f2 Mon Sep 17 00:00:00 2001 From: Justin Gordon Date: Fri, 17 Apr 2026 22:01:40 -1000 Subject: [PATCH 2/4] ci: still fail main on operational Bencher errors, polish warning Distinguish regression alerts from operational failures so the warn-only policy on main only applies to genuine performance signals: - Set BENCHER_HAS_ALERT=1 when bencher stderr mentions alert/threshold/ boundary indicators; leave it 0 for auth/API/network/CLI failures. - Step 7b (auto-open issue) and step 7c (::warning::) now require BENCHER_HAS_ALERT == 1 so operational errors do not create bogus regression issues. - New step 7d fails the workflow on non-regression Bencher errors on main, preserving the original safety net for a broken benchmark pipeline (missing uploads, bad credentials, Bencher outage). - Step 7c warning no longer hardcodes a specific issue number (#3116 will churn as issues are closed/reopened) and now includes the workflow run URL for a direct clickable link. Addresses review feedback on #3168. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/benchmark.yml | 44 ++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 6f6ed0012a..822d436f31 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -650,11 +650,19 @@ jobs: echo "::warning::Bencher baseline not found for start-point hash '$START_POINT_HASH' — regression comparison unavailable for this run" BENCHER_EXIT_CODE=0 fi + + # Distinguish regression alerts from operational failures (auth/API/network/CLI) + # so that main can warn-only on the former while still failing hard on the latter. + BENCHER_HAS_ALERT=0 + if [ $BENCHER_EXIT_CODE -ne 0 ] && grep -qiE "alert|threshold violation|boundary violation" "$BENCHER_STDERR"; then + BENCHER_HAS_ALERT=1 + fi rm -f "$BENCHER_STDERR" - # Export exit code early so downstream alerting steps (7b/7c) always see it, - # even if the post-processing below (step summary, PR comments) fails. + # Export exit code and alert flag early so downstream steps (7b/7c/7d) always + # see them, even if the post-processing below (step summary, PR comments) fails. echo "BENCHER_EXIT_CODE=$BENCHER_EXIT_CODE" >> "$GITHUB_ENV" + echo "BENCHER_HAS_ALERT=$BENCHER_HAS_ALERT" >> "$GITHUB_ENV" # Post report to job summary and PR comment(s) if there's HTML output if [ -s bench_results/bencher_report.html ]; then @@ -703,7 +711,7 @@ jobs: # STEP 7b: ALERT ON MAIN REGRESSION # ============================================ - name: Create GitHub Issue for main regression - if: github.event_name == 'push' && github.ref == 'refs/heads/main' && env.BENCHER_EXIT_CODE != '0' + if: github.event_name == 'push' && github.ref == 'refs/heads/main' && env.BENCHER_EXIT_CODE != '0' && env.BENCHER_HAS_ALERT == '1' env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | @@ -792,14 +800,32 @@ jobs: # STEP 7c: WARN ON MAIN REGRESSION # ============================================ # Regressions are surfaced via the Bencher dashboard, the auto-opened tracking - # issue (Step 7b), and the job-summary report. Do not fail the workflow: single-run - # alerts on GitHub-hosted runners are dominated by environmental noise (alert sets - # across consecutive runs have ~0 overlap), so blocking main produces churn without - # signal. Re-enable a hard gate once thresholds/sample-size tolerate that noise. + # issue (Step 7b), and the job-summary report. Do not fail the workflow on regression + # alerts: single-run alerts on GitHub-hosted runners are dominated by environmental + # noise (alert sets across consecutive runs have ~0 overlap), so blocking main + # produces churn without signal. Re-enable a hard gate once thresholds/sample-size + # tolerate that noise. Operational errors (auth/API/network/CLI) are handled in + # step 7d and still fail the workflow. - name: Warn if Bencher detected regression on main - if: github.event_name == 'push' && github.ref == 'refs/heads/main' && env.BENCHER_EXIT_CODE != '0' + if: github.event_name == 'push' && github.ref == 'refs/heads/main' && env.BENCHER_EXIT_CODE != '0' && env.BENCHER_HAS_ALERT == '1' + env: + RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + run: | + echo "::warning::Bencher flagged a regression on main (exit ${BENCHER_EXIT_CODE:-1}). See the open regression issue (label: performance-regression), the Bencher dashboard, and the workflow run: ${RUN_URL}" + + # ============================================ + # STEP 7d: FAIL ON NON-REGRESSION BENCHER ERRORS + # ============================================ + # If bencher exited non-zero but stderr did not contain regression alert + # indicators, this is an operational failure (auth/API/network/CLI) rather than + # a performance signal. These should still fail the workflow on main so a broken + # benchmark pipeline (missing uploads, bad credentials, Bencher outage, etc.) is + # not silently hidden behind a warning annotation. + - name: Fail on non-regression Bencher error on main + if: github.event_name == 'push' && github.ref == 'refs/heads/main' && env.BENCHER_EXIT_CODE != '0' && env.BENCHER_HAS_ALERT != '1' run: | - echo "::warning::Bencher flagged a regression on main (exit ${BENCHER_EXIT_CODE:-1}). See tracking issue #3116 and the Bencher dashboard." + echo "::error::Bencher exited ${BENCHER_EXIT_CODE:-1} on main with no regression alert in stderr — this indicates an operational failure (auth/API/network/CLI), not a performance regression. Check the logs above." + exit 1 # ============================================ # STEP 8: WORKFLOW COMPLETION From 12fa8f2ab07a0a273c2d2709dc880f326c4de0a6 Mon Sep 17 00:00:00 2001 From: Justin Gordon Date: Sat, 18 Apr 2026 12:21:12 -1000 Subject: [PATCH 3/4] ci: tighten Bencher alert detection to word-bounded "Alert[s]" Narrows BENCHER_HAS_ALERT from case-insensitive "alert" to word-bounded case-sensitive "Alert" / "Alerts" so that lowercase "alert" appearing in bencher URL paths (e.g., "/v0/.../alerts") or CLI text cannot misclassify an operational failure as a regression and let main go green. "threshold violation" and "boundary violation" remain case-insensitive since those specific phrases are unlikely to appear outside real alert output. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/benchmark.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 822d436f31..71b1e5eb01 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -653,8 +653,14 @@ jobs: # Distinguish regression alerts from operational failures (auth/API/network/CLI) # so that main can warn-only on the former while still failing hard on the latter. + # Match Bencher's actual alert output (e.g., "🚨 2 Alerts", "Alert detected") via + # a word-bounded, case-sensitive "Alert[s]" — a bare case-insensitive "alert" would + # also match lowercase "alert" in URL paths (e.g., "/v0/.../alerts") or CLI text, + # which could misclassify an operational failure as a regression and let main go green. BENCHER_HAS_ALERT=0 - if [ $BENCHER_EXIT_CODE -ne 0 ] && grep -qiE "alert|threshold violation|boundary violation" "$BENCHER_STDERR"; then + if [ $BENCHER_EXIT_CODE -ne 0 ] && \ + { grep -qE "\bAlerts?\b" "$BENCHER_STDERR" || \ + grep -qiE "threshold violation|boundary violation" "$BENCHER_STDERR"; }; then BENCHER_HAS_ALERT=1 fi rm -f "$BENCHER_STDERR" From 6f2bb80154b3b68a29607a626b51f1f848fc2003 Mon Sep 17 00:00:00 2001 From: Justin Gordon Date: Sat, 18 Apr 2026 23:40:49 -1000 Subject: [PATCH 4/4] ci: pin Bencher CLI, broaden alert match, preserve exit code Addresses follow-up review feedback on the Bencher main-regression gate: - Pin bencherdev/bencher to v0.6.2 and document the expected stderr phrases ("Alert[s]", "threshold violation", "boundary violation") so a silent CLI upgrade can't flip the alert classification. - Switch the BENCHER_HAS_ALERT grep from case-sensitive `\bAlerts?\b` to case-insensitive `\bAlerts?\b` to stay consistent with the 404 guard and avoid misclassifying a lowercase `alert` variant as operational. Word boundaries still prevent matches against URL paths like `/v0/.../alerts`, so there's no new false-positive risk. - Propagate the original Bencher exit code from step 7d (`exit "${BENCHER_EXIT_CODE:-1}"` instead of bare `exit 1`) to preserve diagnostic context in CI logs. - Drop the redundant `rm -f "$BENCHER_STDERR"` now that the `trap 'rm -f' EXIT` registered above handles cleanup. --- .github/workflows/benchmark.yml | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 71b1e5eb01..5fda166e19 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -157,7 +157,12 @@ jobs: # ============================================ - name: Install Bencher CLI - uses: bencherdev/bencher@main + # Pinned: step 7a's alert-detection heuristic greps stderr for + # "Alert[s]" / "threshold violation" / "boundary violation". Those strings + # are not a documented API contract, so upgrading the CLI requires + # re-verifying that the expected phrases still appear in alert output. + # Expected stderr phrases at v0.6.2: "🚨 N Alerts", "Alert detected". + uses: bencherdev/bencher@v0.6.2 - name: Add tools directory to PATH run: | @@ -654,16 +659,18 @@ jobs: # Distinguish regression alerts from operational failures (auth/API/network/CLI) # so that main can warn-only on the former while still failing hard on the latter. # Match Bencher's actual alert output (e.g., "🚨 2 Alerts", "Alert detected") via - # a word-bounded, case-sensitive "Alert[s]" — a bare case-insensitive "alert" would - # also match lowercase "alert" in URL paths (e.g., "/v0/.../alerts") or CLI text, - # which could misclassify an operational failure as a regression and let main go green. + # a word-bounded, case-insensitive "Alert[s]". The word boundary (\b) already + # prevents matching lowercase "alerts" inside URL paths (e.g., "/v0/.../alerts"), + # so the case-insensitive flag adds robustness against future CLI wording changes + # (e.g., lowercase "1 alert") without reintroducing the URL-path false positive. BENCHER_HAS_ALERT=0 if [ $BENCHER_EXIT_CODE -ne 0 ] && \ - { grep -qE "\bAlerts?\b" "$BENCHER_STDERR" || \ + { grep -qiE "\bAlerts?\b" "$BENCHER_STDERR" || \ grep -qiE "threshold violation|boundary violation" "$BENCHER_STDERR"; }; then BENCHER_HAS_ALERT=1 fi - rm -f "$BENCHER_STDERR" + # Cleanup is also handled by the `trap 'rm -f "$BENCHER_STDERR"' EXIT` above, + # so we don't need an explicit rm here. # Export exit code and alert flag early so downstream steps (7b/7c/7d) always # see them, even if the post-processing below (step summary, PR comments) fails. @@ -831,7 +838,8 @@ jobs: if: github.event_name == 'push' && github.ref == 'refs/heads/main' && env.BENCHER_EXIT_CODE != '0' && env.BENCHER_HAS_ALERT != '1' run: | echo "::error::Bencher exited ${BENCHER_EXIT_CODE:-1} on main with no regression alert in stderr — this indicates an operational failure (auth/API/network/CLI), not a performance regression. Check the logs above." - exit 1 + # Preserve the original Bencher exit code in CI logs for diagnostic context. + exit "${BENCHER_EXIT_CODE:-1}" # ============================================ # STEP 8: WORKFLOW COMPLETION