diff --git a/.github/actions/bencher-track/action.yml b/.github/actions/bencher-track/action.yml index 93d6dc25..d37d179d 100644 --- a/.github/actions/bencher-track/action.yml +++ b/.github/actions/bencher-track/action.yml @@ -1,13 +1,18 @@ name: Track benchmarks on bencher description: >- - Run `bencher run` for the `ix` project, windowing deterministic "stepping" - measures to commits since the last lean-toolchain bump (and dropping them on - the bump commit via --thresholds-reset, so a toolchain step lands without a - spurious alert). The calling job must check out with fetch-depth: 0. + Run `bencher run` for the `ix` project. Every measure's baseline is windowed + to the data points recorded since the workload's `baseline-reset-` + tag, so a toolchain bump (or any permanent shift) is recalibrated by moving + that one tag — via the !bencher-thresholds-reset PR comment or the + bench-thresholds-reset workflow. The calling job must check out with + fetch-depth: 0 and fetch-tags: true (the tag is read here). inputs: testbed: - description: Bencher testbed name. + description: Bencher testbed slug. + required: true + workload: + description: Workload key for the `baseline-reset-` tag (e.g. ix-compile, aiur). required: true file: description: Bencher Metric Format JSON file to upload. @@ -18,56 +23,52 @@ inputs: github-token: description: GITHUB_TOKEN, for the --github-actions comment. required: true - always-thresholds: - description: >- - Raw `--threshold-measure …` flags for measures tracked on every commit - (non-stepping). Word-split, so keep each flag a single token. - required: false - default: "" - stepping-measures: + thresholds: description: >- - Space-separated measure names that step on a toolchain bump (e.g. - "file-size constants" or "fft-cost"). Windowed to commits-since-bump and - omitted on the bump commit. - required: false - default: "" + `--threshold-*` flags for every tracked measure. Use the literal token + `__WINDOW__` in place of each `--threshold-max-sample-size` value; it is + replaced with the data-point count since the baseline anchor. Word-split, + so keep each flag a single token. + required: true runs: using: composite steps: - - uses: bencherdev/bencher@v0.6.7 + - uses: bencherdev/bencher@v0.6.8 - shell: bash env: BENCHER_KEY: ${{ inputs.key }} GH_TOKEN: ${{ inputs.github-token }} TESTBED: ${{ inputs.testbed }} + WORKLOAD: ${{ inputs.workload }} FILE: ${{ inputs.file }} - ALWAYS_THRESHOLDS: ${{ inputs.always-thresholds }} - STEPPING_MEASURES: ${{ inputs.stepping-measures }} + THRESHOLDS: ${{ inputs.thresholds }} run: | - # Sample only commits since lean-toolchain last changed, so a stepping - # measure's baseline window never straddles a toolchain bump. - last_bump=$(git log -1 --format=%H -- lean-toolchain) - sample=$(git rev-list --count "${last_bump}..HEAD") - [ "$sample" -gt 64 ] && sample=64 - echo "commits since last lean-toolchain bump (capped 64): $sample" - # On the bump commit (sample=0) omit the stepping measures entirely; - # --thresholds-reset then clears their models so the step doesn't alert. - # Between bumps these measures are deterministic, so pin them exactly - # (0% on both sides): any change in either direction is a real circuit - # or closure change, not noise, and should alert. - step_thresholds=() - if [ "$sample" -gt 0 ]; then - for m in $STEPPING_MEASURES; do - step_thresholds+=( - --threshold-measure "$m" - --threshold-test percentage - --threshold-max-sample-size "$sample" - --threshold-upper-boundary 0 - --threshold-lower-boundary 0 - ) - done + # Baseline window: data points for this benchmark since the workload's + # `baseline-reset-` tag (moved on a toolchain bump or any + # permanent shift). The same window applies to every measure. We count + # actual data points, not commits, so the baseline never reaches past + # the anchor (a failed run produces no metric, so a commit count would + # overshoot). The matrix uploads one report per benchmark, so count this + # benchmark's reports (its name is the upload's top-level key). The ix + # project is public, so no token is needed. With no tag yet, fall back + # to Bencher's 64-point window. + sample=64 + if reset_epoch=$(git log -1 --format=%ct "baseline-reset-$WORKLOAD" 2>/dev/null); then + bench=$(jq -r '[keys[]][0]' "$FILE") + sample=$(curl -fsS "https://api.bencher.dev/v0/projects/ix/reports?branch=${GITHUB_REF_NAME}&testbed=${TESTBED}&start_time=$((reset_epoch * 1000))&per_page=255" \ + | jq --arg b "$bench" '[.[] | select([.results[]?[]?.benchmark.name] | index($b))] | length') fi + sample=${sample:-0} + [ "$sample" -gt 64 ] && sample=64 + echo "baseline window for '$WORKLOAD': $sample data point(s)" + + # No data points since the reset → omit thresholds so the reset commit + # lands without an alert (--thresholds-reset clears stale models); they + # re-arm next run. Otherwise substitute the window into each measure. + thresholds=() + [ "$sample" -gt 0 ] && thresholds=( ${THRESHOLDS//__WINDOW__/$sample} ) + bencher run \ --project ix \ --key "$BENCHER_KEY" \ @@ -76,7 +77,6 @@ runs: --testbed "$TESTBED" \ --adapter json \ --github-actions "$GH_TOKEN" \ - $ALWAYS_THRESHOLDS \ - "${step_thresholds[@]}" \ + "${thresholds[@]}" \ --thresholds-reset \ --file "$FILE" diff --git a/.github/workflows/aiur-bench.yml b/.github/workflows/bench-main.yml similarity index 81% rename from .github/workflows/aiur-bench.yml rename to .github/workflows/bench-main.yml index 5f94ff99..9b645594 100644 --- a/.github/workflows/aiur-bench.yml +++ b/.github/workflows/bench-main.yml @@ -81,6 +81,7 @@ jobs: - uses: actions/checkout@v6 with: fetch-depth: 0 + fetch-tags: true # bencher-track reads the baseline-reset tag - uses: actions/cache/restore@v5 with: path: ~/.local/bin @@ -138,24 +139,30 @@ jobs: } } EOF - # Upload compile metrics. file-size/constants step on a toolchain bump, so - # they're the bump-windowed stepping measures; compile-time/throughput ride - # their normal 64-window (throughput's regression is a drop, hence its - # lower boundary). + # Upload compile metrics. Every measure shares the per-workload baseline + # window (data points since the ix-compile reset tag): file-size/constants + # are deterministic, pinned exactly (0/0); compile-time rides a 5% upper + # bound; throughput a 5% lower bound (its regression is a drop). - uses: ./.github/actions/bencher-track with: - testbed: warp-ubuntu-x64-32x + testbed: ix-compile-x64-32x + workload: ix-compile file: benchmark.json key: ${{ secrets.BENCHER_API_KEY }} github-token: ${{ secrets.GITHUB_TOKEN }} - stepping-measures: file-size constants - always-thresholds: | + thresholds: | --threshold-measure compile-time --threshold-test percentage - --threshold-max-sample-size 64 --threshold-upper-boundary 0.05 + --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.05 --threshold-lower-boundary _ --threshold-measure throughput --threshold-test percentage - --threshold-max-sample-size 64 --threshold-upper-boundary _ + --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary _ --threshold-lower-boundary 0.05 + --threshold-measure file-size --threshold-test percentage + --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0 + --threshold-lower-boundary 0 + --threshold-measure constants --threshold-test percentage + --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0 + --threshold-lower-boundary 0 # Restore each matrix job's `.ixe` from the cache and run the Aiur execute + prove # benchmark over selected constants. No compiler run here. @@ -174,7 +181,8 @@ jobs: steps: - uses: actions/checkout@v6 with: - fetch-depth: 0 # full history for the toolchain-bump sample-size query + fetch-depth: 0 # full history for the baseline-anchor lookup + fetch-tags: true # bencher-track reads the baseline-reset tag - uses: actions/cache/restore@v5 with: path: ~/.local/bin @@ -230,30 +238,37 @@ jobs: map_values(to_entries | map({(.key): {value: .value}}) | add) ' results.json > aiur.json cat aiur.json - # Upload Aiur metrics. fft-cost and constants are deterministic and step on a - # toolchain bump (a different stdlib changes the constants' circuits and - # closure sizes), so they're the bump-windowed stepping measures — same - # treatment compile gives file-size/constants. prove-time/execute-time, + # Upload Aiur metrics. Every measure shares the per-workload baseline + # window (data points since the aiur reset tag). constants is deterministic + # → pinned exactly (0/0). fft-cost is deterministic but only ever drops on + # a real Aiur win, so it rides an upper-only 5% bound (flag a regression, + # let wins through) rather than a hard pin. prove-time/execute-time, # peak-rss (texray's proving RSS high-water mark), and throughput - # (constants/prove-time, like compile's, where a drop is the regression) - # are noisy wall-clock and ride their normal 64-window. + # (constants/prove-time, where a drop is the regression) are noisy + # wall-clock and ride percentage bounds. - uses: ./.github/actions/bencher-track with: testbed: aiur-typecheck-x64-32x + workload: aiur file: aiur.json key: ${{ secrets.BENCHER_API_KEY }} github-token: ${{ secrets.GITHUB_TOKEN }} - stepping-measures: fft-cost constants - always-thresholds: | + thresholds: | + --threshold-measure constants --threshold-test percentage + --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0 + --threshold-lower-boundary 0 + --threshold-measure fft-cost --threshold-test percentage + --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.05 + --threshold-lower-boundary _ --threshold-measure prove-time --threshold-test percentage - --threshold-max-sample-size 64 --threshold-upper-boundary 0.10 + --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10 --threshold-lower-boundary _ --threshold-measure execute-time --threshold-test percentage - --threshold-max-sample-size 64 --threshold-upper-boundary 0.10 + --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10 --threshold-lower-boundary _ --threshold-measure peak-rss --threshold-test percentage - --threshold-max-sample-size 64 --threshold-upper-boundary 0.10 + --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10 --threshold-lower-boundary _ --threshold-measure throughput --threshold-test percentage - --threshold-max-sample-size 64 --threshold-upper-boundary _ + --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary _ --threshold-lower-boundary 0.10 diff --git a/.github/workflows/bench-thresholds-reset.yml b/.github/workflows/bench-thresholds-reset.yml new file mode 100644 index 00000000..1f8be388 --- /dev/null +++ b/.github/workflows/bench-thresholds-reset.yml @@ -0,0 +1,82 @@ +name: Bencher thresholds reset + +# Move `baseline-reset-` tag(s) so bencher-track re-windows those +# measures from a chosen commit — after a toolchain bump or any permanent shift. +# Other workloads are unaffected; historical data stays in Bencher, only the +# alert baseline is re-anchored. Two ways to trigger: +# +# - Merged PR carrying a maintainer comment `!bencher-thresholds-reset `: +# fires automatically on merge, anchored at the merge commit — no need to +# race a manual run afterward. A bare command (no workload) is ignored. +# - workflow_dispatch: reset the chosen workload at a given commit (default +# HEAD); does not read PR contents. For ad-hoc or bootstrap resets. +# +# pull_request_target runs in the base-repo context with a write token (works +# for fork PRs too). Safe here because the job runs no PR-provided code — only +# `gh api` calls against the trusted event payload. +on: + workflow_dispatch: + inputs: + workload: + description: Workload baseline to reset + required: true + type: choice + options: [ix-compile, aiur, all] + sha: + description: "Commit to anchor to (default: HEAD)" + required: false + pull_request_target: + types: [closed] + +permissions: + contents: write + pull-requests: write + +jobs: + reset: + if: github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true + runs-on: ubuntu-latest + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + REPO: ${{ github.repository }} + EVENT: ${{ github.event_name }} + INPUT_WORKLOAD: ${{ inputs.workload }} + INPUT_SHA: ${{ inputs.sha }} + HEAD_SHA: ${{ github.sha }} + PR: ${{ github.event.pull_request.number }} + MERGE_SHA: ${{ github.event.pull_request.merge_commit_sha }} + steps: + - run: | + valid="ix-compile aiur" + if [ "$EVENT" = workflow_dispatch ]; then + # Reset the chosen workload(s) at the given commit; no PR scan. + sha="${INPUT_SHA:-$HEAD_SHA}" + [ "$INPUT_WORKLOAD" = all ] && workloads="$valid" || workloads="$INPUT_WORKLOAD" + else + # Reset workloads named in maintainer `!bencher-thresholds-reset` + # comments, anchored at the merge commit. Arg required, so a bare + # command matches nothing and is ignored. + sha="$MERGE_SHA" + workloads=$(gh api "repos/$REPO/issues/$PR/comments" --paginate \ + --jq '.[] | select(.author_association | IN("OWNER","MEMBER","COLLABORATOR")) | .body' \ + | grep -oiE '!bencher-thresholds-reset[[:space:]]+[a-z0-9 -]+' \ + | sed -E 's/^!bencher-thresholds-reset[[:space:]]+//I' \ + | tr '[:upper:] ' '[:lower:]\n' | sort -u) + echo "$workloads" | grep -qx all && workloads="$valid" + fi + + done_list="" + for w in $workloads; do + case " $valid " in *" $w "*) ;; *) echo "skip unknown workload: $w"; continue ;; esac + tag="baseline-reset-$w" + echo "Anchoring $tag -> $sha" + gh api -X PATCH "repos/$REPO/git/refs/tags/$tag" -f sha="$sha" -F force=true \ + || gh api -X POST "repos/$REPO/git/refs" -f ref="refs/tags/$tag" -f sha="$sha" + done_list="$done_list $w" + done + [ -z "$done_list" ] && { echo "Nothing to reset."; exit 0; } + + # Confirm on the PR for the merge-triggered path. + [ "$EVENT" = pull_request_target ] && gh pr comment "$PR" --repo "$REPO" \ + --body "♻️ Baseline reset to \`${sha:0:7}\` for:$done_list" + true diff --git a/Benchmarks/Typecheck.lean b/Benchmarks/Typecheck.lean index ce4c64ca..1906378b 100644 --- a/Benchmarks/Typecheck.lean +++ b/Benchmarks/Typecheck.lean @@ -59,7 +59,7 @@ limit; bound a run with an external `timeout` if needed. The JSON is a neutral, flat shape (`{ "": { "constants": …, "fft-cost": …, "execute-time": …, "prove-time": …, "throughput": … } }`, where `prove-time` and `throughput` appear only for proven constants); any bencher-specific reshaping -is the caller's job (see `.github/workflows/aiur-bench.yml`). +is the caller's job (see `.github/workflows/bench-main.yml`). -/ open Lean (Json Name)