Skip to content

Commit 17f47f3

Browse files
authored
fix(distributed): gate per-worker SwiftShader probe to worker 0 only (#956)
After #916 moved `assertSwiftShader` from `renderChunk()`'s eager probe session into `executeWorkerTask`, every parallel worker began running its own `chrome://gpu` / canvas-WebGL probe. At `chunkWorkerCount=6` (the texture-launch bench at chunks=3, i.e. c=3) that's 6 concurrent CDP page-loads per chunk × 3 chunks = 18 simultaneous probes. Bench data on dev (12 producer pods × 22 vCPU) showed the c=3 worst-case wall-clock at 67.3s, 24.7s above the c=6 worst case (42.6s) — `pod_total` inflates from 100s to 147s uniformly across all three chunks in each slow iteration, which is the signature of cluster-level CDP contention rather than within-pod contention. Workers within a chunk share the same Chrome binary, flags, and OS/driver state on a single pod, so worker 0's success is representative for the rest. Gate the probe via `shouldVerifyWorkerGpu(workerId, config)` so only worker 0 navigates to the probe page; workers 1..N-1 skip it. The fail-fast contract still holds at the chunk level (worker 0 still aborts the chunk if SwiftShader didn't load) — just without the concurrent CDP traffic. Expected wall-clock impact: the c=3 worst case should drop from ~67s to roughly the c=6 worst case (~42-44s). c=6 (3 workers/pod) and c=8 (2 workers/pod) should see smaller wins; c=12 (1 worker/pod, sequential branch) is unaffected. Closes #955.
1 parent 7354d61 commit 17f47f3

3 files changed

Lines changed: 63 additions & 15 deletions

File tree

packages/engine/src/services/parallelCoordinator.test.ts

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
import { describe, it, expect } from "vitest";
2-
import { calculateOptimalWorkers, distributeFrames } from "./parallelCoordinator.js";
2+
import {
3+
calculateOptimalWorkers,
4+
distributeFrames,
5+
shouldVerifyWorkerGpu,
6+
} from "./parallelCoordinator.js";
7+
import type { EngineConfig } from "../config.js";
38

49
describe("distributeFrames", () => {
510
it("distributes frames evenly across workers", () => {
@@ -68,3 +73,29 @@ describe("calculateOptimalWorkers", () => {
6873
expect(workers).toBe(4);
6974
});
7075
});
76+
77+
describe("shouldVerifyWorkerGpu", () => {
78+
const softwareConfig: Partial<EngineConfig> = { browserGpuMode: "software" };
79+
80+
it("returns true for worker 0 when GPU mode is software", () => {
81+
expect(shouldVerifyWorkerGpu(0, softwareConfig)).toBe(true);
82+
});
83+
84+
it("returns false for non-zero workers when GPU mode is software", () => {
85+
expect(shouldVerifyWorkerGpu(1, softwareConfig)).toBe(false);
86+
expect(shouldVerifyWorkerGpu(5, softwareConfig)).toBe(false);
87+
expect(shouldVerifyWorkerGpu(17, softwareConfig)).toBe(false);
88+
});
89+
90+
it("returns false for any worker when GPU mode is not software", () => {
91+
expect(shouldVerifyWorkerGpu(0, { browserGpuMode: "hardware" } as Partial<EngineConfig>)).toBe(
92+
false,
93+
);
94+
expect(shouldVerifyWorkerGpu(0, {})).toBe(false);
95+
});
96+
97+
it("returns false when config is undefined", () => {
98+
expect(shouldVerifyWorkerGpu(0, undefined)).toBe(false);
99+
expect(shouldVerifyWorkerGpu(3, undefined)).toBe(false);
100+
});
101+
});

packages/engine/src/services/parallelCoordinator.ts

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,16 @@ export function distributeFrames(
181181
return tasks;
182182
}
183183

184+
/**
185+
* Decide whether a parallel worker should run the per-worker SwiftShader
186+
* assertion. Gated to worker 0 only: workers within a chunk share the same
187+
* Chrome binary, flags, and OS/driver state, so one verification per chunk
188+
* is sufficient. See `heygen-com/hyperframes#955`.
189+
*/
190+
export function shouldVerifyWorkerGpu(workerId: number, config?: Partial<EngineConfig>): boolean {
191+
return config?.browserGpuMode === "software" && workerId === 0;
192+
}
193+
184194
async function executeWorkerTask(
185195
task: WorkerTask,
186196
serverUrl: string,
@@ -207,17 +217,22 @@ async function executeWorkerTask(
207217
createBeforeCaptureHook(),
208218
config,
209219
);
210-
// Per-worker SwiftShader assertion: when the caller declares
211-
// `browserGpuMode: "software"`, every worker session must verify Chrome's
212-
// WebGL backend is actually SwiftShader before the first frame. Hosts
213-
// that fall back to a hardware GL backend (or silently fail to load
220+
// Per-worker SwiftShader assertion, gated to worker 0 only.
221+
// When `browserGpuMode: "software"` is declared, the chunk's GL backend
222+
// must be verified as SwiftShader before the first frame — a host that
223+
// falls back to a hardware GL backend (or silently fails to load
214224
// SwiftShader) would otherwise produce non-deterministic pixels and
215-
// break the distributed byte-identical-retry contract — the parallel
216-
// branch wouldn't catch it via the pre-warmup probe (renderChunk now
217-
// skips that when chunkWorkerCount > 1). The canvas-based reader works
218-
// on both regular Chrome and chrome-headless-shell (which serves
219-
// `chrome://gpu` as an empty document).
220-
if (config?.browserGpuMode === "software") {
225+
// break the distributed byte-identical-retry contract. Running this
226+
// probe on every worker means N concurrent navigations to a WebGL
227+
// probe page per chunk; with `chunkWorkerCount=6` × 3 chunks, that's
228+
// 18 simultaneous CDP page-loads, which inflated c=3 worst-case wall
229+
// by ~24s vs c=6/c=8 on the texture-launch bench. Workers in the same
230+
// chunk share the same Chrome binary, flags, and OS/driver state, so
231+
// worker 0's success is representative — gate it there and skip the
232+
// rest. See `heygen-com/hyperframes#955` for the bench data and the
233+
// pre-warmup probe interaction (which `renderChunk` already skips
234+
// when `chunkWorkerCount > 1`).
235+
if (shouldVerifyWorkerGpu(task.workerId, config)) {
221236
await assertSwiftShader(session.page, readWebGlVendorInfoFromCanvas);
222237
}
223238
await initializeSession(session);

packages/producer/src/services/distributed/renderChunk.ts

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -469,10 +469,12 @@ export async function renderChunk(
469469
// Resolve worker count up-front so we can decide whether to bother
470470
// pre-warming a probe session at all. The parallel branch
471471
// (chunkWorkerCount > 1) closes the probe immediately and creates fresh
472-
// per-worker sessions; `executeWorkerTask` now runs its own
473-
// `assertSwiftShader` against each worker session (gated on
474-
// `cfg.browserGpuMode === "software"`), so the safety contract holds
475-
// without the eager pre-probe.
472+
// per-worker sessions; `executeWorkerTask` runs `assertSwiftShader`
473+
// on worker 0 only (gated on `cfg.browserGpuMode === "software"`), so
474+
// the safety contract holds without the eager pre-probe and without
475+
// every worker concurrently navigating to the GL probe page. See
476+
// `heygen-com/hyperframes#955` for the worst-case wall regression that
477+
// motivated gating the probe to worker 0.
476478
//
477479
// Capture-cost calibration based on shader transitions / renderModeHints
478480
// is not threaded through to chunks yet; the in-process renderer's

0 commit comments

Comments
 (0)