ix/.github/workflows/bench-main.yml at main · argumentcomputer/ix · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
name: Aiur benchmarks

# One workflow, two benchmarks per library env, on every push to main:
#   1. compile job — `ix compile` the Lean env to a `.ixe` (compile-throughput
#                    metrics) and cache the `.ixe`.
#   2. prove job   — restore that `.ixe` from the cache (no recompile) and
#                    STARK-check selected constants over it via bench-typecheck
#                    (Aiur execute + prove metrics).
# The prove job reuses the exact `.ixe` the compile job built, so the compiler
# runs once. Compile and prove report to separate bencher testbeds so each one's
# `--thresholds-reset` only touches its own measures.

on:
  push:
    branches: main
  workflow_dispatch:

permissions:
  contents: read
  checks: write

env:
  COMPILE_DIR: Benchmarks/Compile

jobs:
  # Build + stage the `ix` and `bench-typecheck` binaries once, then restore
  # them on the (more expensive) matrix runners.
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      # Pinned Rust toolchain + cargo cache (~/.cargo + target/, via the action's
      # built-in rust-cache), so `lake build`'s cargo step doesn't recompile the
      # Plonky3/multi-stark deps from scratch on every run.
      - uses: actions-rust-lang/setup-rust-toolchain@v1
      # `.cargo/config.toml` sets `-Ctarget-cpu=native`, and the binary is built
      # once here (then only restored on the warp runners), so this build host's
      # CPU fixes the instruction set baked in — incl. AVX-512, a big Plonky3
      # speedup. Log it so a benchmark shift can be traced to a build-CPU change.
      - name: Log build CPU
        run: |
          lscpu
          grep -qw avx512f /proc/cpuinfo \
            && echo "AVX-512F: present (compiled into the binary)" \
            || echo "AVX-512F: absent"
      - uses: leanprover/lean-action@v1
        with:
          auto-config: false
          build: true
          build-args: "ix --wfail -v"
      - run: |
          mkdir -p ~/.local/bin
          echo | lake run install            # copies ix -> ~/.local/bin/ix
          lake build bench-typecheck
          cp .lake/build/bin/bench-typecheck ~/.local/bin/bench-typecheck
          chmod +x ~/.local/bin/bench-typecheck
      - uses: actions/cache/save@v5
        with:
          path: ~/.local/bin
          key: aiur-bench-bins-${{ github.sha }}

  # Compile each library env to a `.ixe` and track compile throughput. Caches
  # the `.ixe` (keyed by sha + matrix job) for the prove job to consume.
  compile:
    needs: build
    runs-on: warp-ubuntu-latest-x64-32x
    strategy:
      fail-fast: false
      matrix:
        bench: [InitStd, Lean, Mathlib, FLT] # Add FC if updated to latest Lean
        include:
          - bench: Mathlib
            mathlib: true
          - bench: FLT
            cache_pkg: flt
            mathlib: true
          # - bench: FC
          #   cache_pkg: formal_conjectures
          #   mathlib: true
    steps:
      - uses: actions/checkout@v6
        with:
          fetch-depth: 0
          fetch-tags: true   # bencher-track reads the bencher-thresholds-reset tag
      - uses: actions/cache/restore@v5
        with:
          path: ~/.local/bin
          key: aiur-bench-bins-${{ github.sha }}
      - run: echo "$HOME/.local/bin" >> $GITHUB_PATH
      # FC's library env lives in a sibling `${COMPILE_DIR}FC` package dir, so
      # point COMPILE_DIR there for the FC matrix job.
      # - if: matrix.bench == 'FC'
      #   run: echo "COMPILE_DIR=${{ env.COMPILE_DIR }}FC" | tee -a $GITHUB_ENV
      # Download elan; fetch the mathlib cache only for benches that import
      # Mathlib (Mathlib, FLT) — InitStd/Lean would otherwise pull it for
      # nothing, since the shared Benchmarks/Compile package depends on mathlib.
      - uses: leanprover/lean-action@v1
        with:
          lake-package-directory: ${{ env.COMPILE_DIR }}
          auto-config: false
          use-github-cache: false
          use-mathlib-cache: ${{ matrix.mathlib && 'true' || 'false' }}
      # FLT and FC take a few minutes to rebuild, so cache their build artifacts.
      - if: matrix.cache_pkg
        uses: actions/cache@v5
        with:
          path: ${{ env.COMPILE_DIR }}/.lake/packages/${{ matrix.cache_pkg }}/.lake/build
          key: ${{ matrix.cache_pkg }}-${{ runner.os }}-${{ runner.arch }}-${{ hashFiles(format('{0}/lean-toolchain', env.COMPILE_DIR)) }}-${{ hashFiles(format('{0}/lake-manifest.json', env.COMPILE_DIR)) }}
      # No `--wfail` here: formal-conjectures (FC) emits a copyright-notice
      # warning that must not fail the build.
      - run: lake build Compile${{ matrix.bench }}
        working-directory: ${{ env.COMPILE_DIR }}
      # Serialize the env to a `.ixe` and emit the `##benchmark##` line.
      - name: Run ix compile
        run: |
          ix compile ${{ env.COMPILE_DIR }}/Compile${{ matrix.bench }}.lean \
            --out ${{ matrix.bench }}.ixe 2>&1 | tee output.txt
      # Cache the `.ixe` for the prove job (reused, never recompiled there).
      # Only the matrix jobs the prove job consumes, to stay under the repo cache limit.
      - if: matrix.bench == 'InitStd' || matrix.bench == 'Mathlib'
        uses: actions/cache/save@v5
        with:
          path: ${{ matrix.bench }}.ixe
          key: aiur-ixe-${{ github.sha }}-${{ matrix.bench }}
      - name: Generate compile benchmark JSON
        run: |
          line=$(grep '^##benchmark##' output.txt)
          elapsed_s=$(echo "$line" | awk '{printf "%.3f", $2 / 1000}')
          bytes=$(echo "$line" | awk '{print $3}')
          constants=$(echo "$line" | awk '{print $4}')
          throughput=$(echo "$line" | awk '{if ($2 > 0) printf "%.2f", $4 * 1000 / $2; else print 0}')
          cat > benchmark.json <<EOF
          {
            "${{ matrix.bench }}": {
              "compile-time": {"value": ${elapsed_s}},
              "file-size": {"value": ${bytes}},
              "throughput": {"value": ${throughput}},
              "constants": {"value": ${constants}}
            }
          }
          EOF
      # Upload compile metrics. Every measure shares the per-workload baseline
      # window (data points since the ix-compile reset tag): file-size/constants
      # are deterministic, pinned exactly (0/0); compile-time rides a 5% upper
      # bound; throughput a 5% lower bound (its regression is a drop).
      - uses: ./.github/actions/bencher-track
        with:
          testbed: ix-compile-x64-32x
          workload: ix-compile
          file: benchmark.json
          key: ${{ secrets.BENCHER_API_KEY }}
          github-token: ${{ secrets.GITHUB_TOKEN }}
          thresholds: |
            --threshold-measure compile-time --threshold-test percentage
            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.05
            --threshold-lower-boundary _
            --threshold-measure throughput --threshold-test percentage
            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary _
            --threshold-lower-boundary 0.05
            --threshold-measure file-size --threshold-test percentage
            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0
            --threshold-lower-boundary 0
            --threshold-measure constants --threshold-test percentage
            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0
            --threshold-lower-boundary 0

  # Restore each matrix job's `.ixe` from the cache and run the Aiur execute + prove
  # benchmark over selected constants. No compiler run here.
  prove:
    needs: compile
    runs-on: warp-ubuntu-latest-x64-32x
    timeout-minutes: 60
    strategy:
      fail-fast: false
      matrix:
        include:
          - bench: InitStd
            consts: Nat.add_comm Nat.sub_le_of_le_add String.append Array.append_assoc
          - bench: Mathlib
            consts: Nat.factorial Nat.Coprime Nat.Prime.two_le
    steps:
      - uses: actions/checkout@v6
        with:
          fetch-depth: 0     # full history for the baseline-anchor lookup
          fetch-tags: true   # bencher-track reads the bencher-thresholds-reset tag
      - uses: actions/cache/restore@v5
        with:
          path: ~/.local/bin
          key: aiur-bench-bins-${{ github.sha }}
      - run: echo "$HOME/.local/bin" >> $GITHUB_PATH
      # Provision the toolchain so the bench-typecheck binary finds libleanshared
      # (no package build). use-github-cache off: nothing to cache here, and
      # parallel matrix jobs share a key and would race the cache save.
      - uses: leanprover/lean-action@v1
        with:
          auto-config: false
          build: false
          use-github-cache: false
      # Pull the `.ixe` the compile job built — do NOT recompile it here.
      - uses: actions/cache/restore@v5
        with:
          path: ${{ matrix.bench }}.ixe
          key: aiur-ixe-${{ github.sha }}-${{ matrix.bench }}
          fail-on-cache-miss: true
      # Run each constant in its own process so a clean failure or timeout drops
      # only that constant from the report. NB: a constant heavy enough to OOM
      # the runner host still cancels the whole job (an OOM SIGKILL of the host
      # is uncatchable here), so every listed constant must fit in runner RAM.
      # RAM is tracked via tracing-texray's machine-readable streaming lines
      # (`[texray] <span> peak-rss-bytes=<N> (<X.YZ MiB>)`, emitted as each
      # span closes): we parse the raw byte integer (awk's `$2+0` stops at
      # the first non-digit, ignoring the parenthesized human suffix) and
      # fold the max over spans in as `peak-rss` (bytes), the proving RSS
      # high-water mark.
      - name: Run Aiur typecheck benchmark
        run: |
          measure() {
            local c="$1" rss
            timeout 20m bench-typecheck --ixe ${{ matrix.bench }}.ixe "$c" \
              --json "res-$c.json" --texray 2>"tx-$c.log" \
              || echo "warning: $c failed (OOM/timeout); dropping it from this report"
            rss=$(awk -F'peak-rss-bytes=' 'NF>1 && $2+0>max {max=$2+0} END {if (max>0) print max}' "tx-$c.log")
            if [ -f "res-$c.json" ] && [ -n "$rss" ] && [ "$rss" -gt 0 ]; then
              jq --argjson rss "$rss" 'map_values(. + {"peak-rss": $rss})' \
                "res-$c.json" > "res-$c.json.tmp" && mv "res-$c.json.tmp" "res-$c.json" || true
            fi
          }
          for c in ${{ matrix.consts }}; do measure "$c"; done
          # Merge the per-constant results; if none produced anything, emit `{}`.
          jq -s 'reduce .[] as $o ({}; . + $o)' res-*.json > results.json 2>/dev/null \
            || echo '{}' > results.json
          [ -s results.json ] || echo '{}' > results.json
          # Wrap each metric value as { "value": v } for Bencher Metric Format.
          # bench-typecheck already emits slug keys (constants, fft-cost,
          # execute-time, prove-time, throughput = constants/prove-time); peak-rss
          # is injected above.
          jq '
            map_values(to_entries | map({(.key): {value: .value}}) | add)
          ' results.json > aiur.json
          cat aiur.json
      # Upload Aiur metrics. Every measure shares the per-workload baseline
      # window (data points since the aiur reset tag). constants is deterministic
      # → pinned exactly (0/0). fft-cost is deterministic but only ever drops on
      # a real Aiur win, so it rides an upper-only 5% bound (flag a regression,
      # let wins through) rather than a hard pin. prove-time/execute-time,
      # peak-rss (texray's proving RSS high-water mark), and throughput
      # (constants/prove-time, where a drop is the regression) are noisy
      # wall-clock and ride percentage bounds.
      - uses: ./.github/actions/bencher-track
        with:
          testbed: aiur-typecheck-x64-32x
          workload: aiur
          file: aiur.json
          key: ${{ secrets.BENCHER_API_KEY }}
          github-token: ${{ secrets.GITHUB_TOKEN }}
          thresholds: |
            --threshold-measure constants --threshold-test percentage
            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0
            --threshold-lower-boundary 0
            --threshold-measure fft-cost --threshold-test percentage
            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.05
            --threshold-lower-boundary _
            --threshold-measure prove-time --threshold-test percentage
            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
            --threshold-lower-boundary _
            --threshold-measure execute-time --threshold-test percentage
            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
            --threshold-lower-boundary _
            --threshold-measure peak-rss --threshold-test percentage
            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
            --threshold-lower-boundary _
            --threshold-measure throughput --threshold-test percentage
            --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary _
            --threshold-lower-boundary 0.10