Skip to content

Commit 29531b6

Browse files
authored
fix(compile-many): split cores across stage-2 workers (drop jobs=1 hardcode) (#336)
1 parent d6e16c9 commit 29531b6

2 files changed

Lines changed: 112 additions & 8 deletions

File tree

crates/fbuild-build/src/compile_many.rs

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,30 @@ pub fn default_sketch_jobs() -> usize {
8484
.max(1)
8585
}
8686

87+
/// Compile parallelism to give each stage-2 worker, splitting the host's
88+
/// available cores across `sketch_jobs` workers.
89+
///
90+
/// The original `jobs=1` hardcoding assumed stage-2 workers compile a
91+
/// single TU (sketch.cpp) against a pre-built framework archive. In
92+
/// practice consumers stage each sketch in its own project dir (so two
93+
/// sketches with different `.ino` content can build in parallel), and
94+
/// each project dir has its own `.fbuild/build/<env>/<profile>/` — which
95+
/// means stage 2 rebuilds the framework from scratch per sketch. With
96+
/// `jobs=1` that framework rebuild is serial inside each worker, and the
97+
/// per-sketch wall time becomes "sum of framework TU times" instead of
98+
/// "max of framework TU times". See FastLED/fbuild#335.
99+
///
100+
/// Splitting cores across workers keeps total in-flight compile slots
101+
/// at roughly `cores` so we don't oversubscribe small runners — each
102+
/// worker gets `max(1, cores / sketch_jobs)`.
103+
pub fn stage2_jobs_per_worker(sketch_jobs: usize) -> usize {
104+
let cores = std::thread::available_parallelism()
105+
.map(|n| n.get())
106+
.unwrap_or(1)
107+
.max(1);
108+
(cores / sketch_jobs.max(1)).max(1)
109+
}
110+
87111
/// Request parameters for [`compile_many`].
88112
#[derive(Debug, Clone)]
89113
pub struct CompileManyRequest {
@@ -496,6 +520,12 @@ fn run_stage2(
496520
let results_slot: Vec<Mutex<Option<SketchResult>>> =
497521
results.iter_mut().map(|_| Mutex::new(None)).collect();
498522

523+
// Split available cores across stage-2 workers so each worker has
524+
// real compile parallelism. See `stage2_jobs_per_worker` for why the
525+
// old `jobs=1` hardcoding was a 2-3x regression vs the single-build
526+
// path on cold cache — FastLED/fbuild#335.
527+
let jobs_per_worker = stage2_jobs_per_worker(cap);
528+
499529
std::thread::scope(|scope| {
500530
let handles: Vec<_> = (0..cap)
501531
.map(|_| {
@@ -513,10 +543,7 @@ fn run_stage2(
513543
env_name: env_name.clone(),
514544
platform,
515545
profile,
516-
// Per-sketch work is single-TU; framework archives
517-
// are already pre-built, so jobs=1 keeps memory
518-
// per worker minimal.
519-
jobs: 1,
546+
jobs: jobs_per_worker,
520547
verbose,
521548
stage: Stage::Stage2Sketch,
522549
pio_env: pio_env.clone(),

crates/fbuild-build/tests/compile_many_two_stage.rs

Lines changed: 81 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ use std::sync::{Arc, Barrier, Mutex};
1414
use std::time::Duration;
1515

1616
use fbuild_build::compile_many::{
17-
compile_many_with, CompileManyRequest, SketchBuildInputs, SketchBuilder, SketchResult, Stage,
17+
compile_many_with, stage2_jobs_per_worker, CompileManyRequest, SketchBuildInputs,
18+
SketchBuilder, SketchResult, Stage,
1819
};
1920
use fbuild_core::BuildProfile;
2021

@@ -200,12 +201,16 @@ fn stage1_runs_exactly_once_and_stage2_handles_the_rest() {
200201
"sketch stage called once per remaining sketch"
201202
);
202203

203-
// Stage-1 honors framework_jobs; stage-2 always passes jobs=1.
204+
// Stage-1 honors framework_jobs; stage-2 derives per-worker jobs
205+
// from the host's core budget split across `sketch_jobs` workers
206+
// (#335). With `sketch_jobs=4` here, every stage-2 worker must see
207+
// the same `stage2_jobs_per_worker(4)` value the dispatcher passed.
204208
assert_eq!(stage1[0].3, 1, "framework_jobs=1 forwarded to stage 1");
209+
let expected_stage2_jobs = stage2_jobs_per_worker(4);
205210
for (_, _, _, jobs) in &stage2 {
206211
assert_eq!(
207-
*jobs, 1,
208-
"stage 2 workers always invoke orchestrator with jobs=1"
212+
*jobs, expected_stage2_jobs,
213+
"stage-2 workers must receive jobs=stage2_jobs_per_worker(sketch_jobs)"
209214
);
210215
}
211216

@@ -343,3 +348,75 @@ fn single_sketch_runs_only_stage1() {
343348
assert_eq!(result.stage2_count, 0);
344349
assert_eq!(result.results[0].stage, Stage::Stage1Framework);
345350
}
351+
352+
/// AC: stage-2 workers must keep total in-flight compile slots at roughly
353+
/// the host's core count — never <1, never silently >cores. The previous
354+
/// `jobs=1` hardcoding (FastLED/fbuild#335) capped each worker to a single
355+
/// compile thread even when the orchestrator was re-building the framework
356+
/// inside the worker, producing a ~2x slowdown vs. a single `fbuild build`
357+
/// on a 16-core host. This locks the new core-split behavior so a future
358+
/// regression to `jobs=1` (or to an oversubscribing default like
359+
/// `cores` per worker) gets caught at unit-test speed instead of in CI.
360+
#[test]
361+
fn stage2_jobs_per_worker_splits_cores_across_workers() {
362+
let cores = std::thread::available_parallelism()
363+
.map(|n| n.get())
364+
.unwrap_or(1)
365+
.max(1);
366+
367+
// Always >= 1, never zero — the hot path multiplies this in, so a
368+
// bug that produced 0 would silently turn every stage-2 build into a
369+
// no-op.
370+
for sketch_jobs in [1usize, 2, 3, 4, 8, 16, 32] {
371+
let per = stage2_jobs_per_worker(sketch_jobs);
372+
assert!(
373+
per >= 1,
374+
"stage2_jobs_per_worker({sketch_jobs}) returned {per}, expected >= 1"
375+
);
376+
}
377+
378+
// sketch_jobs=1 should grant the worker the whole core budget so
379+
// serial-batch invocations don't regress vs `fbuild build` (which
380+
// uses the same effective parallelism). On a single-core box this
381+
// still resolves to 1 — `available_parallelism()` is the floor.
382+
let lone_worker = stage2_jobs_per_worker(1);
383+
assert_eq!(
384+
lone_worker,
385+
cores.max(1),
386+
"with one stage-2 worker the worker should own the full core budget"
387+
);
388+
389+
// Sum across all stage-2 workers must stay within the host's cores
390+
// — the whole point of splitting is to avoid oversubscription. We
391+
// tolerate a small undershoot when cores doesn't divide sketch_jobs
392+
// evenly (each worker still gets a floor of 1).
393+
for sketch_jobs in [2usize, 4, 8] {
394+
let per = stage2_jobs_per_worker(sketch_jobs);
395+
let total = per * sketch_jobs;
396+
// Floor of 1 per worker means total can exceed cores when
397+
// sketch_jobs > cores; bound only when sketch_jobs <= cores.
398+
if sketch_jobs <= cores {
399+
assert!(
400+
total <= cores,
401+
"{sketch_jobs} workers × {per} jobs = {total} exceeds {cores} cores"
402+
);
403+
}
404+
}
405+
406+
// sketch_jobs > cores ⇒ each worker still gets a floor of 1 — better
407+
// for the worker to spawn no extra parallelism than to stall at 0.
408+
let many = stage2_jobs_per_worker(cores * 8);
409+
assert_eq!(
410+
many, 1,
411+
"oversubscribed sketch_jobs should clamp per-worker jobs to the floor of 1"
412+
);
413+
414+
// sketch_jobs=0 must not divide-by-zero — the public API takes
415+
// Option<usize> but the helper takes usize, and callers in the wild
416+
// can hand us a literal 0 (rounding, off-by-one). Treat 0 as 1.
417+
assert_eq!(
418+
stage2_jobs_per_worker(0),
419+
cores.max(1),
420+
"stage2_jobs_per_worker(0) must not panic and should treat 0 as 1"
421+
);
422+
}

0 commit comments

Comments
 (0)