8181 - uses : actions/checkout@v6
8282 with :
8383 fetch-depth : 0
84+ fetch-tags : true # bencher-track reads the baseline-reset tag
8485 - uses : actions/cache/restore@v5
8586 with :
8687 path : ~/.local/bin
@@ -138,24 +139,30 @@ jobs:
138139 }
139140 }
140141 EOF
141- # Upload compile metrics. file-size/constants step on a toolchain bump, so
142- # they're the bump-windowed stepping measures; compile-time/throughput ride
143- # their normal 64-window (throughput's regression is a drop, hence its
144- # lower boundary).
142+ # Upload compile metrics. Every measure shares the per-workload baseline
143+ # window (data points since the ix-compile reset tag): file-size/constants
144+ # are deterministic, pinned exactly (0/0); compile-time rides a 5% upper
145+ # bound; throughput a 5% lower bound (its regression is a drop).
145146 - uses : ./.github/actions/bencher-track
146147 with :
147- testbed : warp-ubuntu-x64-32x
148+ testbed : ix-compile-x64-32x
149+ workload : ix-compile
148150 file : benchmark.json
149151 key : ${{ secrets.BENCHER_API_KEY }}
150152 github-token : ${{ secrets.GITHUB_TOKEN }}
151- stepping-measures : file-size constants
152- always-thresholds : |
153+ thresholds : |
153154 --threshold-measure compile-time --threshold-test percentage
154- --threshold-max-sample-size 64 --threshold-upper-boundary 0.05
155+ --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.05
155156 --threshold-lower-boundary _
156157 --threshold-measure throughput --threshold-test percentage
157- --threshold-max-sample-size 64 --threshold-upper-boundary _
158+ --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary _
158159 --threshold-lower-boundary 0.05
160+ --threshold-measure file-size --threshold-test percentage
161+ --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0
162+ --threshold-lower-boundary 0
163+ --threshold-measure constants --threshold-test percentage
164+ --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0
165+ --threshold-lower-boundary 0
159166
160167 # Restore each matrix job's `.ixe` from the cache and run the Aiur execute + prove
161168 # benchmark over selected constants. No compiler run here.
@@ -174,7 +181,8 @@ jobs:
174181 steps :
175182 - uses : actions/checkout@v6
176183 with :
177- fetch-depth : 0 # full history for the toolchain-bump sample-size query
184+ fetch-depth : 0 # full history for the baseline-anchor lookup
185+ fetch-tags : true # bencher-track reads the baseline-reset tag
178186 - uses : actions/cache/restore@v5
179187 with :
180188 path : ~/.local/bin
@@ -230,30 +238,37 @@ jobs:
230238 map_values(to_entries | map({(.key): {value: .value}}) | add)
231239 ' results.json > aiur.json
232240 cat aiur.json
233- # Upload Aiur metrics. fft-cost and constants are deterministic and step on a
234- # toolchain bump (a different stdlib changes the constants' circuits and
235- # closure sizes), so they're the bump-windowed stepping measures — same
236- # treatment compile gives file-size/constants. prove-time/execute-time,
241+ # Upload Aiur metrics. Every measure shares the per-workload baseline
242+ # window (data points since the aiur reset tag). constants is deterministic
243+ # → pinned exactly (0/0). fft-cost is deterministic but only ever drops on
244+ # a real Aiur win, so it rides an upper-only 5% bound (flag a regression,
245+ # let wins through) rather than a hard pin. prove-time/execute-time,
237246 # peak-rss (texray's proving RSS high-water mark), and throughput
238- # (constants/prove-time, like compile's, where a drop is the regression)
239- # are noisy wall-clock and ride their normal 64-window.
247+ # (constants/prove-time, where a drop is the regression) are noisy
248+ # wall-clock and ride percentage bounds.
240249 - uses : ./.github/actions/bencher-track
241250 with :
242251 testbed : aiur-typecheck-x64-32x
252+ workload : aiur
243253 file : aiur.json
244254 key : ${{ secrets.BENCHER_API_KEY }}
245255 github-token : ${{ secrets.GITHUB_TOKEN }}
246- stepping-measures : fft-cost constants
247- always-thresholds : |
256+ thresholds : |
257+ --threshold-measure constants --threshold-test percentage
258+ --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0
259+ --threshold-lower-boundary 0
260+ --threshold-measure fft-cost --threshold-test percentage
261+ --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.05
262+ --threshold-lower-boundary _
248263 --threshold-measure prove-time --threshold-test percentage
249- --threshold-max-sample-size 64 --threshold-upper-boundary 0.10
264+ --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
250265 --threshold-lower-boundary _
251266 --threshold-measure execute-time --threshold-test percentage
252- --threshold-max-sample-size 64 --threshold-upper-boundary 0.10
267+ --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
253268 --threshold-lower-boundary _
254269 --threshold-measure peak-rss --threshold-test percentage
255- --threshold-max-sample-size 64 --threshold-upper-boundary 0.10
270+ --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary 0.10
256271 --threshold-lower-boundary _
257272 --threshold-measure throughput --threshold-test percentage
258- --threshold-max-sample-size 64 --threshold-upper-boundary _
273+ --threshold-max-sample-size __WINDOW__ --threshold-upper-boundary _
259274 --threshold-lower-boundary 0.10
0 commit comments