|
| 1 | +name: Aiur benchmarks |
| 2 | + |
| 3 | +# One workflow, two benchmarks per library env, on every push to main: |
| 4 | +# 1. compile job — `ix compile` the Lean env to a `.ixe` (compile-throughput |
| 5 | +# metrics) and cache the `.ixe`. |
| 6 | +# 2. prove job — restore that `.ixe` from the cache (no recompile) and |
| 7 | +# STARK-check selected constants over it via bench-typecheck |
| 8 | +# (Aiur execute + prove metrics). |
| 9 | +# The prove job reuses the exact `.ixe` the compile job built, so the compiler |
| 10 | +# runs once. Compile and prove report to separate bencher testbeds so each one's |
| 11 | +# `--thresholds-reset` only touches its own measures. |
| 12 | + |
| 13 | +on: |
| 14 | + push: |
| 15 | + branches: main |
| 16 | + workflow_dispatch: |
| 17 | + |
| 18 | +permissions: |
| 19 | + contents: read |
| 20 | + checks: write |
| 21 | + |
| 22 | +env: |
| 23 | + COMPILE_DIR: Benchmarks/Compile |
| 24 | + |
| 25 | +jobs: |
| 26 | + # Build + stage the `ix` and `bench-typecheck` binaries once, then restore |
| 27 | + # them on the (more expensive) matrix runners. |
| 28 | + build: |
| 29 | + runs-on: ubuntu-latest |
| 30 | + steps: |
| 31 | + - uses: actions/checkout@v6 |
| 32 | + # Pinned Rust toolchain + cargo cache (~/.cargo + target/, via the action's |
| 33 | + # built-in rust-cache), so `lake build`'s cargo step doesn't recompile the |
| 34 | + # Plonky3/multi-stark deps from scratch on every run. |
| 35 | + - uses: actions-rust-lang/setup-rust-toolchain@v1 |
| 36 | + # `.cargo/config.toml` sets `-Ctarget-cpu=native`, and the binary is built |
| 37 | + # once here (then only restored on the warp runners), so this build host's |
| 38 | + # CPU fixes the instruction set baked in — incl. AVX-512, a big Plonky3 |
| 39 | + # speedup. Log it so a benchmark shift can be traced to a build-CPU change. |
| 40 | + - name: Log build CPU |
| 41 | + run: | |
| 42 | + lscpu |
| 43 | + grep -qw avx512f /proc/cpuinfo \ |
| 44 | + && echo "AVX-512F: present (compiled into the binary)" \ |
| 45 | + || echo "AVX-512F: absent" |
| 46 | + - uses: leanprover/lean-action@v1 |
| 47 | + with: |
| 48 | + auto-config: false |
| 49 | + build: true |
| 50 | + build-args: "ix --wfail -v" |
| 51 | + - run: | |
| 52 | + mkdir -p ~/.local/bin |
| 53 | + echo | lake run install # copies ix -> ~/.local/bin/ix |
| 54 | + lake build bench-typecheck |
| 55 | + cp .lake/build/bin/bench-typecheck ~/.local/bin/bench-typecheck |
| 56 | + chmod +x ~/.local/bin/bench-typecheck |
| 57 | + - uses: actions/cache/save@v5 |
| 58 | + with: |
| 59 | + path: ~/.local/bin |
| 60 | + key: aiur-bench-bins-${{ github.sha }} |
| 61 | + |
| 62 | + # Compile each library env to a `.ixe` and track compile throughput. Caches |
| 63 | + # the `.ixe` (keyed by sha + matrix job) for the prove job to consume. |
| 64 | + compile: |
| 65 | + needs: build |
| 66 | + runs-on: warp-ubuntu-latest-x64-32x |
| 67 | + strategy: |
| 68 | + fail-fast: false |
| 69 | + matrix: |
| 70 | + bench: [InitStd, Lean, Mathlib, FLT] # Add FC if updated to latest Lean |
| 71 | + include: |
| 72 | + - bench: Mathlib |
| 73 | + mathlib: true |
| 74 | + - bench: FLT |
| 75 | + cache_pkg: flt |
| 76 | + mathlib: true |
| 77 | + # - bench: FC |
| 78 | + # cache_pkg: formal_conjectures |
| 79 | + # mathlib: true |
| 80 | + steps: |
| 81 | + - uses: actions/checkout@v6 |
| 82 | + with: |
| 83 | + fetch-depth: 0 |
| 84 | + - uses: actions/cache/restore@v5 |
| 85 | + with: |
| 86 | + path: ~/.local/bin |
| 87 | + key: aiur-bench-bins-${{ github.sha }} |
| 88 | + - run: echo "$HOME/.local/bin" >> $GITHUB_PATH |
| 89 | + # FC's library env lives in a sibling `${COMPILE_DIR}FC` package dir, so |
| 90 | + # point COMPILE_DIR there for the FC matrix job. |
| 91 | + # - if: matrix.bench == 'FC' |
| 92 | + # run: echo "COMPILE_DIR=${{ env.COMPILE_DIR }}FC" | tee -a $GITHUB_ENV |
| 93 | + # Download elan; fetch the mathlib cache only for benches that import |
| 94 | + # Mathlib (Mathlib, FLT) — InitStd/Lean would otherwise pull it for |
| 95 | + # nothing, since the shared Benchmarks/Compile package depends on mathlib. |
| 96 | + - uses: leanprover/lean-action@v1 |
| 97 | + with: |
| 98 | + lake-package-directory: ${{ env.COMPILE_DIR }} |
| 99 | + auto-config: false |
| 100 | + use-github-cache: false |
| 101 | + use-mathlib-cache: ${{ matrix.mathlib && 'true' || 'false' }} |
| 102 | + # FLT and FC take a few minutes to rebuild, so cache their build artifacts. |
| 103 | + - if: matrix.cache_pkg |
| 104 | + uses: actions/cache@v5 |
| 105 | + with: |
| 106 | + path: ${{ env.COMPILE_DIR }}/.lake/packages/${{ matrix.cache_pkg }}/.lake/build |
| 107 | + key: ${{ matrix.cache_pkg }}-${{ runner.os }}-${{ runner.arch }}-${{ hashFiles(format('{0}/lean-toolchain', env.COMPILE_DIR)) }}-${{ hashFiles(format('{0}/lake-manifest.json', env.COMPILE_DIR)) }} |
| 108 | + # No `--wfail` here: formal-conjectures (FC) emits a copyright-notice |
| 109 | + # warning that must not fail the build. |
| 110 | + - run: lake build Compile${{ matrix.bench }} |
| 111 | + working-directory: ${{ env.COMPILE_DIR }} |
| 112 | + # Serialize the env to a `.ixe` and emit the `##benchmark##` line. |
| 113 | + - name: Run ix compile |
| 114 | + run: | |
| 115 | + ix compile ${{ env.COMPILE_DIR }}/Compile${{ matrix.bench }}.lean \ |
| 116 | + --out ${{ matrix.bench }}.ixe 2>&1 | tee output.txt |
| 117 | + # Cache the `.ixe` for the prove job (reused, never recompiled there). |
| 118 | + # Only the matrix jobs the prove job consumes, to stay under the repo cache limit. |
| 119 | + - if: matrix.bench == 'InitStd' || matrix.bench == 'Mathlib' |
| 120 | + uses: actions/cache/save@v5 |
| 121 | + with: |
| 122 | + path: ${{ matrix.bench }}.ixe |
| 123 | + key: aiur-ixe-${{ github.sha }}-${{ matrix.bench }} |
| 124 | + - name: Generate compile benchmark JSON |
| 125 | + run: | |
| 126 | + line=$(grep '^##benchmark##' output.txt) |
| 127 | + elapsed_s=$(echo "$line" | awk '{printf "%.3f", $2 / 1000}') |
| 128 | + bytes=$(echo "$line" | awk '{print $3}') |
| 129 | + constants=$(echo "$line" | awk '{print $4}') |
| 130 | + throughput=$(echo "$line" | awk '{if ($2 > 0) printf "%.2f", $4 * 1000 / $2; else print 0}') |
| 131 | + cat > benchmark.json <<EOF |
| 132 | + { |
| 133 | + "${{ matrix.bench }}": { |
| 134 | + "compile-time": {"value": ${elapsed_s}}, |
| 135 | + "file-size": {"value": ${bytes}}, |
| 136 | + "throughput": {"value": ${throughput}}, |
| 137 | + "constants": {"value": ${constants}} |
| 138 | + } |
| 139 | + } |
| 140 | + EOF |
| 141 | + # Upload compile metrics. file-size/constants step on a toolchain bump, so |
| 142 | + # they're the bump-windowed stepping measures; compile-time/throughput ride |
| 143 | + # their normal 64-window (throughput's regression is a drop, hence its |
| 144 | + # lower boundary). |
| 145 | + - uses: ./.github/actions/bencher-track |
| 146 | + with: |
| 147 | + testbed: warp-ubuntu-x64-32x |
| 148 | + file: benchmark.json |
| 149 | + key: ${{ secrets.BENCHER_API_KEY }} |
| 150 | + github-token: ${{ secrets.GITHUB_TOKEN }} |
| 151 | + stepping-measures: file-size constants |
| 152 | + always-thresholds: | |
| 153 | + --threshold-measure compile-time --threshold-test percentage |
| 154 | + --threshold-max-sample-size 64 --threshold-upper-boundary 0.05 |
| 155 | + --threshold-lower-boundary _ |
| 156 | + --threshold-measure throughput --threshold-test percentage |
| 157 | + --threshold-max-sample-size 64 --threshold-upper-boundary _ |
| 158 | + --threshold-lower-boundary 0.05 |
| 159 | +
|
| 160 | + # Restore each matrix job's `.ixe` from the cache and run the Aiur execute + prove |
| 161 | + # benchmark over selected constants. No compiler run here. |
| 162 | + prove: |
| 163 | + needs: compile |
| 164 | + runs-on: warp-ubuntu-latest-x64-32x |
| 165 | + timeout-minutes: 60 |
| 166 | + strategy: |
| 167 | + fail-fast: false |
| 168 | + matrix: |
| 169 | + include: |
| 170 | + - bench: InitStd |
| 171 | + consts: Nat.add_comm Nat.sub_le_of_le_add String.append Array.append_assoc |
| 172 | + - bench: Mathlib |
| 173 | + consts: Nat.factorial Nat.Coprime Nat.Prime.two_le |
| 174 | + steps: |
| 175 | + - uses: actions/checkout@v6 |
| 176 | + with: |
| 177 | + fetch-depth: 0 # full history for the toolchain-bump sample-size query |
| 178 | + - uses: actions/cache/restore@v5 |
| 179 | + with: |
| 180 | + path: ~/.local/bin |
| 181 | + key: aiur-bench-bins-${{ github.sha }} |
| 182 | + - run: echo "$HOME/.local/bin" >> $GITHUB_PATH |
| 183 | + # Provision the toolchain so the bench-typecheck binary finds libleanshared |
| 184 | + # (no package build). use-github-cache off: nothing to cache here, and |
| 185 | + # parallel matrix jobs share a key and would race the cache save. |
| 186 | + - uses: leanprover/lean-action@v1 |
| 187 | + with: |
| 188 | + auto-config: false |
| 189 | + build: false |
| 190 | + use-github-cache: false |
| 191 | + # Pull the `.ixe` the compile job built — do NOT recompile it here. |
| 192 | + - uses: actions/cache/restore@v5 |
| 193 | + with: |
| 194 | + path: ${{ matrix.bench }}.ixe |
| 195 | + key: aiur-ixe-${{ github.sha }}-${{ matrix.bench }} |
| 196 | + fail-on-cache-miss: true |
| 197 | + # Run each constant in its own process so a clean failure or timeout drops |
| 198 | + # only that constant from the report. NB: a constant heavy enough to OOM |
| 199 | + # the runner host still cancels the whole job (an OOM SIGKILL of the host |
| 200 | + # is uncatchable here), so every listed constant must fit in runner RAM. |
| 201 | + # RAM is tracked via tracing-texray's machine-readable streaming lines |
| 202 | + # (`[texray] <span> peak-rss-bytes=<N> (<X.YZ MiB>)`, emitted as each |
| 203 | + # span closes): we parse the raw byte integer (awk's `$2+0` stops at |
| 204 | + # the first non-digit, ignoring the parenthesized human suffix) and |
| 205 | + # fold the max over spans in as `peak-rss` (bytes), the proving RSS |
| 206 | + # high-water mark. |
| 207 | + - name: Run Aiur typecheck benchmark |
| 208 | + run: | |
| 209 | + measure() { |
| 210 | + local c="$1" rss |
| 211 | + timeout 20m bench-typecheck --ixe ${{ matrix.bench }}.ixe "$c" \ |
| 212 | + --json "res-$c.json" --texray 2>"tx-$c.log" \ |
| 213 | + || echo "warning: $c failed (OOM/timeout); dropping it from this report" |
| 214 | + rss=$(awk -F'peak-rss-bytes=' 'NF>1 && $2+0>max {max=$2+0} END {if (max>0) print max}' "tx-$c.log") |
| 215 | + if [ -f "res-$c.json" ] && [ -n "$rss" ] && [ "$rss" -gt 0 ]; then |
| 216 | + jq --argjson rss "$rss" 'map_values(. + {"peak-rss": $rss})' \ |
| 217 | + "res-$c.json" > "res-$c.json.tmp" && mv "res-$c.json.tmp" "res-$c.json" || true |
| 218 | + fi |
| 219 | + } |
| 220 | + for c in ${{ matrix.consts }}; do measure "$c"; done |
| 221 | + # Merge the per-constant results; if none produced anything, emit `{}`. |
| 222 | + jq -s 'reduce .[] as $o ({}; . + $o)' res-*.json > results.json 2>/dev/null \ |
| 223 | + || echo '{}' > results.json |
| 224 | + [ -s results.json ] || echo '{}' > results.json |
| 225 | + # Wrap each metric value as { "value": v } for Bencher Metric Format. |
| 226 | + # bench-typecheck already emits slug keys (constants, fft-cost, |
| 227 | + # execute-time, prove-time, throughput = constants/prove-time); peak-rss |
| 228 | + # is injected above. |
| 229 | + jq ' |
| 230 | + map_values(to_entries | map({(.key): {value: .value}}) | add) |
| 231 | + ' results.json > aiur.json |
| 232 | + cat aiur.json |
| 233 | + # Upload Aiur metrics. fft-cost and constants are deterministic and step on a |
| 234 | + # toolchain bump (a different stdlib changes the constants' circuits and |
| 235 | + # closure sizes), so they're the bump-windowed stepping measures — same |
| 236 | + # treatment compile gives file-size/constants. prove-time/execute-time, |
| 237 | + # peak-rss (texray's proving RSS high-water mark), and throughput |
| 238 | + # (constants/prove-time, like compile's, where a drop is the regression) |
| 239 | + # are noisy wall-clock and ride their normal 64-window. |
| 240 | + - uses: ./.github/actions/bencher-track |
| 241 | + with: |
| 242 | + testbed: aiur-typecheck-x64-32x |
| 243 | + file: aiur.json |
| 244 | + key: ${{ secrets.BENCHER_API_KEY }} |
| 245 | + github-token: ${{ secrets.GITHUB_TOKEN }} |
| 246 | + stepping-measures: fft-cost constants |
| 247 | + always-thresholds: | |
| 248 | + --threshold-measure prove-time --threshold-test percentage |
| 249 | + --threshold-max-sample-size 64 --threshold-upper-boundary 0.10 |
| 250 | + --threshold-lower-boundary _ |
| 251 | + --threshold-measure execute-time --threshold-test percentage |
| 252 | + --threshold-max-sample-size 64 --threshold-upper-boundary 0.10 |
| 253 | + --threshold-lower-boundary _ |
| 254 | + --threshold-measure peak-rss --threshold-test percentage |
| 255 | + --threshold-max-sample-size 64 --threshold-upper-boundary 0.10 |
| 256 | + --threshold-lower-boundary _ |
| 257 | + --threshold-measure throughput --threshold-test percentage |
| 258 | + --threshold-max-sample-size 64 --threshold-upper-boundary _ |
| 259 | + --threshold-lower-boundary 0.10 |
0 commit comments