ci: Fix bencher.dev thresholds (#459)

samuelburnham · web-flow · commit 4b0af6cb9ea8 · 2026-06-30T14:42:33.000-04:00
diff --git a/.github/actions/bencher-track/action.yml b/.github/actions/bencher-track/action.yml
@@ -1,13 +1,18 @@
 name: Track benchmarks on bencher
 description: >-
-  Run `bencher run` for the `ix` project, windowing deterministic "stepping"
-  measures to commits since the last lean-toolchain bump (and dropping them on
-  the bump commit via --thresholds-reset, so a toolchain step lands without a
-  spurious alert). The calling job must check out with fetch-depth: 0.
+  Run `bencher run` for the `ix` project. Every measure's baseline is windowed
+  to the data points recorded since the workload's `baseline-reset-<workload>`
+  tag, so a toolchain bump (or any permanent shift) is recalibrated by moving
+  that one tag — via the !bencher-thresholds-reset PR comment or the
+  bench-thresholds-reset workflow. The calling job must check out with
+  fetch-depth: 0 and fetch-tags: true (the tag is read here).
 
 inputs:
   testbed:
-    description: Bencher testbed name.
+    description: Bencher testbed slug.
+    required: true
+  workload:
+    description: Workload key for the `baseline-reset-<workload>` tag (e.g. ix-compile, aiur).
     required: true
   file:
     description: Bencher Metric Format JSON file to upload.
@@ -18,56 +23,52 @@ inputs:
   github-token:
     description: GITHUB_TOKEN, for the --github-actions comment.
     required: true
-  always-thresholds:
-    description: >-
-      Raw `--threshold-measure …` flags for measures tracked on every commit
-      (non-stepping). Word-split, so keep each flag a single token.
-    required: false
-    default: ""
-  stepping-measures:
+  thresholds:
     description: >-
-      Space-separated measure names that step on a toolchain bump (e.g.
-      "file-size constants" or "fft-cost"). Windowed to commits-since-bump and
-      omitted on the bump commit.
-    required: false
-    default: ""
+      `--threshold-*` flags for every tracked measure. Use the literal token
+      `__WINDOW__` in place of each `--threshold-max-sample-size` value; it is
+      replaced with the data-point count since the baseline anchor. Word-split,
+      so keep each flag a single token.
+    required: true
 
 runs:
   using: composite
   steps:
-    - uses: bencherdev/bencher@v0.6.7
+    - uses: bencherdev/bencher@v0.6.8
     - shell: bash
       env:
         BENCHER_KEY: ${{ inputs.key }}
         GH_TOKEN: ${{ inputs.github-token }}
         TESTBED: ${{ inputs.testbed }}
+        WORKLOAD: ${{ inputs.workload }}
         FILE: ${{ inputs.file }}
-        ALWAYS_THRESHOLDS: ${{ inputs.always-thresholds }}
-        STEPPING_MEASURES: ${{ inputs.stepping-measures }}
+        THRESHOLDS: ${{ inputs.thresholds }}
       run: |
-        # Sample only commits since lean-toolchain last changed, so a stepping
-        # measure's baseline window never straddles a toolchain bump.
-        last_bump=$(git log -1 --format=%H -- lean-toolchain)
-        sample=$(git rev-list --count "${last_bump}..HEAD")
-        [ "$sample" -gt 64 ] && sample=64
-        echo "commits since last lean-toolchain bump (capped 64): $sample"
-        # On the bump commit (sample=0) omit the stepping measures entirely;
-        # --thresholds-reset then clears their models so the step doesn't alert.
-        # Between bumps these measures are deterministic, so pin them exactly
-        # (0% on both sides): any change in either direction is a real circuit
-        # or closure change, not noise, and should alert.
-        step_thresholds=()
-        if [ "$sample" -gt 0 ]; then
-          for m in $STEPPING_MEASURES; do
-            step_thresholds+=(
-              --threshold-measure "$m"
-              --threshold-test percentage
-              --threshold-max-sample-size "$sample"
-              --threshold-upper-boundary 0
-              --threshold-lower-boundary 0
-            )
-          done
+        # Baseline window: data points for this benchmark since the workload's
+        # `baseline-reset-<workload>` tag (moved on a toolchain bump or any
+        # permanent shift). The same window applies to every measure. We count
+        # actual data points, not commits, so the baseline never reaches past
+        # the anchor (a failed run produces no metric, so a commit count would
+        # overshoot). The matrix uploads one report per benchmark, so count this
+        # benchmark's reports (its name is the upload's top-level key). The ix
+        # project is public, so no token is needed. With no tag yet, fall back
+        # to Bencher's 64-point window.
+        sample=64
+        if reset_epoch=$(git log -1 --format=%ct "baseline-reset-$WORKLOAD" 2>/dev/null); then
+          bench=$(jq -r '[keys[]][0]' "$FILE")
+          sample=$(curl -fsS "https://api.bencher.dev/v0/projects/ix/reports?branch=${GITHUB_REF_NAME}&testbed=${TESTBED}&start_time=$((reset_epoch * 1000))&per_page=255" \
+            | jq --arg b "$bench" '[.[] | select([.results[]?[]?.benchmark.name] | index($b))] | length')
         fi
+        sample=${sample:-0}
+        [ "$sample" -gt 64 ] && sample=64
+        echo "baseline window for '$WORKLOAD': $sample data point(s)"
+
+        # No data points since the reset → omit thresholds so the reset commit
+        # lands without an alert (--thresholds-reset clears stale models); they
+        # re-arm next run. Otherwise substitute the window into each measure.
+        thresholds=()
+        [ "$sample" -gt 0 ] && thresholds=( ${THRESHOLDS//__WINDOW__/$sample} )
+
         bencher run \
           --project ix \
           --key "$BENCHER_KEY" \
@@ -76,7 +77,6 @@ runs:
           --testbed "$TESTBED" \
           --adapter json \
           --github-actions "$GH_TOKEN" \
-          $ALWAYS_THRESHOLDS \
-          "${step_thresholds[@]}" \
+          "${thresholds[@]}" \
           --thresholds-reset \
           --file "$FILE"
diff --git a/.github/workflows/bench-main.yml b/.github/workflows/bench-main.yml
@@ -81,6 +81,7 @@ jobs:
       - uses: actions/checkout@v6
         with:
           fetch-depth: 0
+          fetch-tags: true   # bencher-track reads the baseline-reset tag
       - uses: actions/cache/restore@v5
         with:
           path: ~/.local/bin
@@ -138,24 +139,30 @@ jobs:
             }
           }
           EOF
-      # Upload compile metrics. file-size/constants step on a toolchain bump, so
-      # they're the bump-windowed stepping measures; compile-time/throughput ride
-      # their normal 64-window (throughput's regression is a drop, hence its
-      # lower boundary).
+      # Upload compile metrics. Every measure shares the per-workload baseline
+      # window (data points since the ix-compile reset tag): file-size/constants
+      # are deterministic, pinned exactly (0/0); compile-time rides a 5% upper
+      # bound; throughput a 5% lower bound (its regression is a drop).
       - uses: ./.github/actions/bencher-track
         with:
-          testbed: warp-ubuntu-x64-32x
+          testbed: ix-compile-x64-32x
+          workload: ix-compile
           file: benchmark.json
           key: ${{ secrets.BENCHER_API_KEY }}
           github-token: ${{ secrets.GITHUB_TOKEN }}
-          stepping-measures: file-size constants
-          always-thresholds: |
+          thresholds: |
             --threshold-measure compile-time --threshold-test percentage
-            --threshold-max-sample-size 64 --threshold-upper-boundary 0.05
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.05
             --threshold-lower-boundary _
             --threshold-measure throughput --threshold-test percentage
-            --threshold-max-sample-size 64 --threshold-upper-boundary _
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary _
             --threshold-lower-boundary 0.05
+            --threshold-measure file-size --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0
+            --threshold-lower-boundary 0
+            --threshold-measure constants --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0
+            --threshold-lower-boundary 0
 
   # Restore each matrix job's `.ixe` from the cache and run the Aiur execute + prove
   # benchmark over selected constants. No compiler run here.
@@ -174,7 +181,8 @@ jobs:
     steps:
       - uses: actions/checkout@v6
         with:
-          fetch-depth: 0   # full history for the toolchain-bump sample-size query
+          fetch-depth: 0     # full history for the baseline-anchor lookup
+          fetch-tags: true   # bencher-track reads the baseline-reset tag
       - uses: actions/cache/restore@v5
         with:
           path: ~/.local/bin
@@ -230,30 +238,37 @@ jobs:
             map_values(to_entries | map({(.key): {value: .value}}) | add)
           ' results.json > aiur.json
           cat aiur.json
-      # Upload Aiur metrics. fft-cost and constants are deterministic and step on a
-      # toolchain bump (a different stdlib changes the constants' circuits and
-      # closure sizes), so they're the bump-windowed stepping measures — same
-      # treatment compile gives file-size/constants. prove-time/execute-time,
+      # Upload Aiur metrics. Every measure shares the per-workload baseline
+      # window (data points since the aiur reset tag). constants is deterministic
+      # → pinned exactly (0/0). fft-cost is deterministic but only ever drops on
+      # a real Aiur win, so it rides an upper-only 5% bound (flag a regression,
+      # let wins through) rather than a hard pin. prove-time/execute-time,
       # peak-rss (texray's proving RSS high-water mark), and throughput
-      # (constants/prove-time, like compile's, where a drop is the regression)
-      # are noisy wall-clock and ride their normal 64-window.
+      # (constants/prove-time, where a drop is the regression) are noisy
+      # wall-clock and ride percentage bounds.
       - uses: ./.github/actions/bencher-track
         with:
           testbed: aiur-typecheck-x64-32x
+          workload: aiur
           file: aiur.json
           key: ${{ secrets.BENCHER_API_KEY }}
           github-token: ${{ secrets.GITHUB_TOKEN }}
-          stepping-measures: fft-cost constants
-          always-thresholds: |
+          thresholds: |
+            --threshold-measure constants --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0
+            --threshold-lower-boundary 0
+            --threshold-measure fft-cost --threshold-test percentage
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.05
+            --threshold-lower-boundary _
             --threshold-measure prove-time --threshold-test percentage
-            --threshold-max-sample-size 64 --threshold-upper-boundary 0.10
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
             --threshold-lower-boundary _
             --threshold-measure execute-time --threshold-test percentage
-            --threshold-max-sample-size 64 --threshold-upper-boundary 0.10
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
             --threshold-lower-boundary _
             --threshold-measure peak-rss --threshold-test percentage
-            --threshold-max-sample-size 64 --threshold-upper-boundary 0.10
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
             --threshold-lower-boundary _
             --threshold-measure throughput --threshold-test percentage
-            --threshold-max-sample-size 64 --threshold-upper-boundary _
+            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary _
             --threshold-lower-boundary 0.10
diff --git a/.github/workflows/bench-thresholds-reset.yml b/.github/workflows/bench-thresholds-reset.yml
@@ -0,0 +1,82 @@
+name: Bencher thresholds reset
+
+# Move `baseline-reset-<workload>` tag(s) so bencher-track re-windows those
+# measures from a chosen commit — after a toolchain bump or any permanent shift.
+# Other workloads are unaffected; historical data stays in Bencher, only the
+# alert baseline is re-anchored. Two ways to trigger:
+#
+#   - Merged PR carrying a maintainer comment `!bencher-thresholds-reset <workload|all>`:
+#     fires automatically on merge, anchored at the merge commit — no need to
+#     race a manual run afterward. A bare command (no workload) is ignored.
+#   - workflow_dispatch: reset the chosen workload at a given commit (default
+#     HEAD); does not read PR contents. For ad-hoc or bootstrap resets.
+#
+# pull_request_target runs in the base-repo context with a write token (works
+# for fork PRs too). Safe here because the job runs no PR-provided code — only
+# `gh api` calls against the trusted event payload.
+on:
+  workflow_dispatch:
+    inputs:
+      workload:
+        description: Workload baseline to reset
+        required: true
+        type: choice
+        options: [ix-compile, aiur, all]
+      sha:
+        description: "Commit to anchor to (default: HEAD)"
+        required: false
+  pull_request_target:
+    types: [closed]
+
+permissions:
+  contents: write
+  pull-requests: write
+
+jobs:
+  reset:
+    if: github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true
+    runs-on: ubuntu-latest
+    env:
+      GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      REPO: ${{ github.repository }}
+      EVENT: ${{ github.event_name }}
+      INPUT_WORKLOAD: ${{ inputs.workload }}
+      INPUT_SHA: ${{ inputs.sha }}
+      HEAD_SHA: ${{ github.sha }}
+      PR: ${{ github.event.pull_request.number }}
+      MERGE_SHA: ${{ github.event.pull_request.merge_commit_sha }}
+    steps:
+      - run: |
+          valid="ix-compile aiur"
+          if [ "$EVENT" = workflow_dispatch ]; then
+            # Reset the chosen workload(s) at the given commit; no PR scan.
+            sha="${INPUT_SHA:-$HEAD_SHA}"
+            [ "$INPUT_WORKLOAD" = all ] && workloads="$valid" || workloads="$INPUT_WORKLOAD"
+          else
+            # Reset workloads named in maintainer `!bencher-thresholds-reset`
+            # comments, anchored at the merge commit. Arg required, so a bare
+            # command matches nothing and is ignored.
+            sha="$MERGE_SHA"
+            workloads=$(gh api "repos/$REPO/issues/$PR/comments" --paginate \
+              --jq '.[] | select(.author_association | IN("OWNER","MEMBER","COLLABORATOR")) | .body' \
+              | grep -oiE '!bencher-thresholds-reset[[:space:]]+[a-z0-9 -]+' \
+              | sed -E 's/^!bencher-thresholds-reset[[:space:]]+//I' \
+              | tr '[:upper:] ' '[:lower:]\n' | sort -u)
+            echo "$workloads" | grep -qx all && workloads="$valid"
+          fi
+
+          done_list=""
+          for w in $workloads; do
+            case " $valid " in *" $w "*) ;; *) echo "skip unknown workload: $w"; continue ;; esac
+            tag="baseline-reset-$w"
+            echo "Anchoring $tag -> $sha"
+            gh api -X PATCH "repos/$REPO/git/refs/tags/$tag" -f sha="$sha" -F force=true \
+              || gh api -X POST "repos/$REPO/git/refs" -f ref="refs/tags/$tag" -f sha="$sha"
+            done_list="$done_list $w"
+          done
+          [ -z "$done_list" ] && { echo "Nothing to reset."; exit 0; }
+
+          # Confirm on the PR for the merge-triggered path.
+          [ "$EVENT" = pull_request_target ] && gh pr comment "$PR" --repo "$REPO" \
+            --body "♻️ Baseline reset to \`${sha:0:7}\` for:$done_list"
+          true
diff --git a/Benchmarks/Typecheck.lean b/Benchmarks/Typecheck.lean
@@ -59,7 +59,7 @@ limit; bound a run with an external `timeout` if needed.
 The JSON is a neutral, flat shape (`{ "<name>": { "constants": …, "fft-cost": …,
 "execute-time": …, "prove-time": …, "throughput": … } }`, where `prove-time` and
 `throughput` appear only for proven constants); any bencher-specific reshaping
-is the caller's job (see `.github/workflows/aiur-bench.yml`).
+is the caller's job (see `.github/workflows/bench-main.yml`).
 -/
 
 open Lean (Json Name)