feat(ci3): run uploadable benchmarks on a dedicated on-demand instance

charlielye · charlielye · commit e33bb2faa1ab · 2026-06-12T16:14:25.000Z
Spot diversification means build instances land on variable EC2 types, so the
in-build benchmark phase produced hardware-dependent numbers that tripped the
105% regression alert. Move the uploadable benchmark series onto a dedicated,
fixed, on-demand m6a.16xlarge for stable single-run results.

- bootstrap.sh: drop the inline `bench` from ci-full/ci-full-no-test-cache. In
  build_and_test (full builds only), once the build completes: on uploadable
  runs (SHOULD_UPLOAD_BENCHMARKS=1) launch the dedicated box via `./ci.sh bench`
  as a backgrounded, colored, denoised job (waited on, non-fatally, before
  return); otherwise append bench_cmds to the test stream so benches run as
  ordinary tests under contention — a breakage check, no upload. New `ci-bench`
  mode = cache-hit `make full` + `bench` (no test engine), uploading the
  existing bench-<treehash> cache key.
- ci.sh: new `bench` launcher — AWS_INSTANCE=m6a.16xlarge NO_SPOT=1 pins a fixed
  on-demand type (CPUS unneeded; AWS_INSTANCE bypasses pool sizing).
- bench_engine: drop the 8-core OS isolation / HT-disable / pinning. The box is
  dedicated, so benches run against the full machine honouring per-bench CPUS via
  the strict scheduler (which already defaults to nproc/2 without BENCH_CPU_COUNT).
- ci3_labels_to_env.sh: scope SHOULD_UPLOAD_BENCHMARKS to merge-queue->next (it
  now also gates the dedicated box). bootstrap_ec2: pass it through to the instance.

Results reach the GA upload step unchanged via the bench-<treehash> cache key
(ci3_success.sh `gh-bench`). Requires the ci3-build-instance-role launch perms
(separate iac PR) before enabling.

Expect a one-time baseline shift in bench/next (different machine + no isolation).
diff --git a/.github/ci3_labels_to_env.sh b/.github/ci3_labels_to_env.sh
@@ -149,8 +149,11 @@ function main {
   echo "CI_MODE=$ci_mode" >> $GITHUB_ENV
   echo "CI mode: $ci_mode"
 
-  # Determine if benchmarks should be uploaded (merge-queue, full, or full-no-test-cache modes)
-  if [[ "$ci_mode" == "merge-queue" || "$ci_mode" == "merge-queue-heavy" || "$ci_mode" == "full" || "$ci_mode" == "full-no-test-cache" ]]; then
+  # Only the canonical "about to land on next" series produces uploadable benchmark
+  # numbers. This flag now also gates spinning up the dedicated on-demand bench box
+  # (build_and_test reads it on the instance), so keep it scoped to merge-queue->next.
+  # Other full/merge-queue runs run benches inline as a breakage check, no upload.
+  if [[ ("$ci_mode" == "merge-queue" || "$ci_mode" == "merge-queue-heavy") && "$target_branch" == "next" ]]; then
     echo "SHOULD_UPLOAD_BENCHMARKS=1" >> $GITHUB_ENV
   fi
 
diff --git a/bootstrap.sh b/bootstrap.sh
@@ -435,6 +435,18 @@ function build_and_test {
       start_txes
       make noir-projects-txe-tests
 
+      # Benches (full builds only). For uploadable runs we want stable numbers, so
+      # launch a dedicated fixed on-demand instance to run them (backgrounded and
+      # logged like the test engine, waited on below). Otherwise the benches just
+      # become tests, run by this engine under contention as a breakage check.
+      if [ "$1" == full ]; then
+        if [ "${SHOULD_UPLOAD_BENCHMARKS:-0}" == 1 ]; then
+          setsid color_prefix "bench" "denoise './ci.sh bench'" & bench_pid=$!
+        else
+          bench_cmds >> $test_cmds_file
+        fi
+      fi
+
       # Signal tests complete, handled by parallel -E STOP.
       echo STOP >> $test_cmds_file
     fi
@@ -447,6 +459,13 @@ function build_and_test {
 
   stop_txes
 
+  # Wait for the dedicated bench instance, if one was launched. Non-fatal: bench
+  # infra shouldn't block the run — a failure just means no fresh numbers to upload.
+  if [ -n "${bench_pid:-}" ]; then
+    echo "Waiting for dedicated bench run..."
+    wait "$bench_pid" || echo_stderr "Dedicated bench run failed (non-fatal)."
+  fi
+
   return 0
 }
 
@@ -750,13 +769,22 @@ case "$cmd" in
     export USE_TEST_CACHE=1
     export CI_FULL=1
     build_and_test full
-    bench
     ;;
   "ci-full-no-test-cache")
     export CI=1
     export USE_TEST_CACHE=0
     export CI_FULL=1
     build_and_test full
+    ;;
+  "ci-bench")
+    # Run on a dedicated, fixed, on-demand instance (launched by the build
+    # instance via './ci.sh bench') for stable benchmark numbers. The build is a
+    # near-instant cache pull, as the launching build instance already populated
+    # the cache for this commit. No test engine; bench uploads bench-<treehash>.
+    export CI=1
+    export CI_FULL=1
+    prep
+    make full
     bench
     ;;
   "ci-chonk-input-update")
diff --git a/ci.sh b/ci.sh
@@ -122,6 +122,14 @@ case "$cmd" in
     # GitHub status check name is unchanged.
     multi_job_run "x-$cmd amd64 ci-$cmd"
     ;;
+  bench)
+    # Launched by the build instance on uploadable runs to produce stable benchmark
+    # numbers on a dedicated, fixed, on-demand instance. AWS_INSTANCE pins the exact
+    # type (bypasses spot pool diversification); NO_SPOT forces on-demand. CI_DASHBOARD
+    # and PARENT_LOG_ID are inherited from the launching run so it nests as a sibling job.
+    AWS_INSTANCE=m6a.16xlarge NO_SPOT=1 JOB_ID=bench INSTANCE_POSTFIX=bench \
+      bootstrap_ec2 "./bootstrap.sh ci-bench"
+    ;;
   socket-fix)
     export CI_DASHBOARD="prs"
     export JOB_ID="x-socket-fix"
diff --git a/ci3/bench_engine b/ci3/bench_engine
@@ -1,57 +1,10 @@
 #!/usr/bin/env bash
 # Uses strict scheduling to run benchmarks in parallel on their own cpus.
 # For benchmarks that can't be parallelized, runs them one at a time to avoid resource contention.
-# Isolates benchmark CPUs from OS and pins all other processes to non-bench CPUs to avoid interference.
 NO_CD=1 source $(git rev-parse --show-toplevel)/ci3/source
 
 bench_cmds_file=$1
 
-function isolate_bench_cpus {
-  [ "$CI" -eq 0 ] && return
-
-  # CPU layout assumption: physical cores are 0..N/2-1, hyperthreads are N/2..N-1.
-  local total_cpus=$(nproc)
-  local total_physical=$((total_cpus / 2))
-  local os_reserve=8
-  local bench_count=$((total_physical - os_reserve))
-
-  # Disable hyperthread siblings of benchmark cores (N/2 .. N/2+bench_count-1).
-  # OS cores' hyperthreads (N/2+bench_count .. N-1) stay on for extra OS capacity.
-  for cpu in $(seq $total_physical $((total_physical + bench_count - 1))); do
-    sudo sh -c "echo 0 > /sys/devices/system/cpu/cpu$cpu/online" 2>/dev/null || true
-  done
-
-  # Pin all container processes to OS CPUs so they can't land on benchmark cores.
-  # exec_test's taskset overrides this for each benchmark with its allocated CPUs.
-  local os_cpu_list="$bench_count-$((total_physical - 1)),$((total_physical + bench_count))-$((total_cpus - 1))"
-  echo "Pinning container processes to OS CPUs ($os_cpu_list)..."
-  for pid in $(ps -eo pid= 2>/dev/null); do
-    taskset -apc "$os_cpu_list" $pid &>/dev/null || true
-  done
-
-  export BENCH_CPU_COUNT=$bench_count
-
-  echo "Benchmark CPU isolation: CPUs 0-$((bench_count - 1)) ($bench_count cores, hyperthreads off) for benchmarks."
-  echo "OS CPUs: $os_cpu_list."
-}
-
-function unisolate_bench_cpus {
-  [ "$CI" -eq 0 ] && return
-
-  echo "Re-enabling all CPUs..."
-  local total_cpus=$(nproc --all)
-  for cpu in $(seq 1 $((total_cpus - 1))); do
-    sudo sh -c "echo 1 > /sys/devices/system/cpu/cpu$cpu/online" 2>/dev/null || true
-  done
-  # Unpin all processes (were pinned to OS CPUs during bench).
-  for pid in $(ps -eo pid= 2>/dev/null); do
-    taskset -apc 0-$((total_cpus - 1)) $pid &>/dev/null || true
-  done
-  echo "All CPUs re-enabled. Online CPUs: $(nproc)"
-}
-
-isolate_bench_cpus
-
 # Clean up old benchmark outputs to avoid confusion with new results.
 find . -type d -iname bench-out | xargs rm -rf
 
@@ -67,5 +20,3 @@ if [ -n "$serial_cmds" ]; then
     run_test_cmd "$cmd"
   done <<< "$serial_cmds"
 fi
-
-unisolate_bench_cpus
diff --git a/ci3/bootstrap_ec2 b/ci3/bootstrap_ec2
@@ -367,6 +367,7 @@ start_build() {
     -e GITHUB_ACTOR=${GITHUB_ACTOR:-} \
     -e EC2_INSTANCE_TYPE=${EC2_INSTANCE_TYPE:-unknown} \
     -e EC2_SPOT=${EC2_SPOT:-unknown} \
+    -e SHOULD_UPLOAD_BENCHMARKS=${SHOULD_UPLOAD_BENCHMARKS:-} \
     -e AZTEC_TOOLCHAIN_DEFAULT_MAJOR_VERSION=${AZTEC_TOOLCHAIN_DEFAULT_MAJOR_VERSION:-} \
     -e DRY_RUN=${DRY_RUN:-} \
     --pids-limit=65536 \