fix(compile-many): split cores across stage-2 workers (drop jobs=1 hardcode) (#336)

zackees · web-flow · commit 29531b6bc857 · 2026-05-31T07:03:07.000-07:00
diff --git a/crates/fbuild-build/src/compile_many.rs b/crates/fbuild-build/src/compile_many.rs
@@ -84,6 +84,30 @@ pub fn default_sketch_jobs() -> usize {
         .max(1)
 }
 
+/// Compile parallelism to give each stage-2 worker, splitting the host's
+/// available cores across `sketch_jobs` workers.
+///
+/// The original `jobs=1` hardcoding assumed stage-2 workers compile a
+/// single TU (sketch.cpp) against a pre-built framework archive. In
+/// practice consumers stage each sketch in its own project dir (so two
+/// sketches with different `.ino` content can build in parallel), and
+/// each project dir has its own `.fbuild/build/<env>/<profile>/` — which
+/// means stage 2 rebuilds the framework from scratch per sketch. With
+/// `jobs=1` that framework rebuild is serial inside each worker, and the
+/// per-sketch wall time becomes "sum of framework TU times" instead of
+/// "max of framework TU times". See FastLED/fbuild#335.
+///
+/// Splitting cores across workers keeps total in-flight compile slots
+/// at roughly `cores` so we don't oversubscribe small runners — each
+/// worker gets `max(1, cores / sketch_jobs)`.
+pub fn stage2_jobs_per_worker(sketch_jobs: usize) -> usize {
+    let cores = std::thread::available_parallelism()
+        .map(|n| n.get())
+        .unwrap_or(1)
+        .max(1);
+    (cores / sketch_jobs.max(1)).max(1)
+}
+
 /// Request parameters for [`compile_many`].
 #[derive(Debug, Clone)]
 pub struct CompileManyRequest {
@@ -496,6 +520,12 @@ fn run_stage2(
     let results_slot: Vec<Mutex<Option<SketchResult>>> =
         results.iter_mut().map(|_| Mutex::new(None)).collect();
 
+    // Split available cores across stage-2 workers so each worker has
+    // real compile parallelism. See `stage2_jobs_per_worker` for why the
+    // old `jobs=1` hardcoding was a 2-3x regression vs the single-build
+    // path on cold cache — FastLED/fbuild#335.
+    let jobs_per_worker = stage2_jobs_per_worker(cap);
+
     std::thread::scope(|scope| {
         let handles: Vec<_> = (0..cap)
             .map(|_| {
@@ -513,10 +543,7 @@ fn run_stage2(
                         env_name: env_name.clone(),
                         platform,
                         profile,
-                        // Per-sketch work is single-TU; framework archives
-                        // are already pre-built, so jobs=1 keeps memory
-                        // per worker minimal.
-                        jobs: 1,
+                        jobs: jobs_per_worker,
                         verbose,
                         stage: Stage::Stage2Sketch,
                         pio_env: pio_env.clone(),
diff --git a/crates/fbuild-build/tests/compile_many_two_stage.rs b/crates/fbuild-build/tests/compile_many_two_stage.rs
@@ -14,7 +14,8 @@ use std::sync::{Arc, Barrier, Mutex};
 use std::time::Duration;
 
 use fbuild_build::compile_many::{
-    compile_many_with, CompileManyRequest, SketchBuildInputs, SketchBuilder, SketchResult, Stage,
+    compile_many_with, stage2_jobs_per_worker, CompileManyRequest, SketchBuildInputs,
+    SketchBuilder, SketchResult, Stage,
 };
 use fbuild_core::BuildProfile;
 
@@ -200,12 +201,16 @@ fn stage1_runs_exactly_once_and_stage2_handles_the_rest() {
         "sketch stage called once per remaining sketch"
     );
 
-    // Stage-1 honors framework_jobs; stage-2 always passes jobs=1.
+    // Stage-1 honors framework_jobs; stage-2 derives per-worker jobs
+    // from the host's core budget split across `sketch_jobs` workers
+    // (#335). With `sketch_jobs=4` here, every stage-2 worker must see
+    // the same `stage2_jobs_per_worker(4)` value the dispatcher passed.
     assert_eq!(stage1[0].3, 1, "framework_jobs=1 forwarded to stage 1");
+    let expected_stage2_jobs = stage2_jobs_per_worker(4);
     for (_, _, _, jobs) in &stage2 {
         assert_eq!(
-            *jobs, 1,
-            "stage 2 workers always invoke orchestrator with jobs=1"
+            *jobs, expected_stage2_jobs,
+            "stage-2 workers must receive jobs=stage2_jobs_per_worker(sketch_jobs)"
         );
     }
 
@@ -343,3 +348,75 @@ fn single_sketch_runs_only_stage1() {
     assert_eq!(result.stage2_count, 0);
     assert_eq!(result.results[0].stage, Stage::Stage1Framework);
 }
+
+/// AC: stage-2 workers must keep total in-flight compile slots at roughly the
+/// host's core count — never <1 per worker, never >cores in total while
+/// `sketch_jobs <= cores`. The previous `jobs=1` hardcoding (FastLED/fbuild#335)
+/// capped each worker to a single compile thread even when the orchestrator was
+/// re-building the framework inside the worker, producing a ~2x slowdown vs. a
+/// single `fbuild build` on a 16-core host. This locks the new core-split
+/// behavior so a future regression to `jobs=1` (or to an oversubscribing default
+/// like `cores` per worker) gets caught at unit-test speed instead of in CI.
+#[test]
+fn stage2_jobs_per_worker_splits_cores_across_workers() {
+    let cores = std::thread::available_parallelism()
+        .map(|n| n.get())
+        .unwrap_or(1)
+        .max(1);
+
+    // Always >= 1, never zero — the hot path multiplies this in, so a
+    // bug that produced 0 would silently turn every stage-2 build into a
+    // no-op.
+    for sketch_jobs in [1usize, 2, 3, 4, 8, 16, 32] {
+        let per = stage2_jobs_per_worker(sketch_jobs);
+        assert!(
+            per >= 1,
+            "stage2_jobs_per_worker({sketch_jobs}) returned {per}, expected >= 1"
+        );
+    }
+
+    // sketch_jobs=1 should grant the worker the whole core budget so
+    // serial-batch invocations don't regress vs `fbuild build` (which
+    // uses the same effective parallelism). On a single-core box this
+    // still resolves to 1 — `available_parallelism()` never reports below 1.
+    let lone_worker = stage2_jobs_per_worker(1);
+    assert_eq!(
+        lone_worker,
+        cores.max(1),
+        "with one stage-2 worker the worker should own the full core budget"
+    );
+
+    // Sum across all stage-2 workers must stay within the host's cores
+    // — the whole point of splitting is to avoid oversubscription. We
+    // tolerate a small undershoot when sketch_jobs doesn't divide cores
+    // evenly (each worker still gets a floor of 1).
+    for sketch_jobs in [2usize, 4, 8] {
+        let per = stage2_jobs_per_worker(sketch_jobs);
+        let total = per * sketch_jobs;
+        // Floor of 1 per worker means total can exceed cores when
+        // sketch_jobs > cores; bound only when sketch_jobs <= cores.
+        if sketch_jobs <= cores {
+            assert!(
+                total <= cores,
+                "{sketch_jobs} workers × {per} jobs = {total} exceeds {cores} cores"
+            );
+        }
+    }
+
+    // sketch_jobs > cores ⇒ each worker still gets a floor of 1 — better
+    // for the worker to spawn no extra parallelism than to stall at 0.
+    let many = stage2_jobs_per_worker(cores * 8);
+    assert_eq!(
+        many, 1,
+        "oversubscribed sketch_jobs should clamp per-worker jobs to the floor of 1"
+    );
+
+    // sketch_jobs=0 must not divide-by-zero — the public API takes
+    // Option<usize> but the helper takes usize, and callers in the wild
+    // can hand us a literal 0 (rounding, off-by-one). Treat 0 as 1.
+    assert_eq!(
+        stage2_jobs_per_worker(0),
+        cores.max(1),
+        "stage2_jobs_per_worker(0) must not panic and should treat 0 as 1"
+    );
+}