-
-
Notifications
You must be signed in to change notification settings - Fork 632
Fix infinite fork loop when node renderer worker fails to bind port #2843
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
10200bc
fcf1119
d09b98e
ea800d5
c1c8a3f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,6 +10,7 @@ import { buildConfig, Config, logSanitizedConfig } from './shared/configBuilder. | |
| import restartWorkers from './master/restartWorkers.js'; | ||
| import * as errorReporter from './shared/errorReporter.js'; | ||
| import { getLicenseStatus } from './shared/licenseValidator.js'; | ||
| import { isWorkerStartupFailureMessage, type WorkerStartupFailureMessage } from './shared/workerMessages.js'; | ||
|
|
||
| const MILLISECONDS_IN_MINUTE = 60000; | ||
| // How often to scan for orphaned upload directories. | ||
|
|
@@ -77,21 +78,44 @@ export default function masterRun(runningConfig?: Partial<Config>) { | |
| })(); | ||
| }, ORPHAN_CLEANUP_INTERVAL_MS); | ||
|
|
||
| let isAbortingForStartupFailure = false; | ||
| let fatalStartupFailure: { workerId: number; failure: WorkerStartupFailureMessage } | null = null; | ||
|
|
||
| for (let i = 0; i < workersCount; i += 1) { | ||
| cluster.fork(); | ||
| } | ||
|
|
||
| cluster.on('message', (worker, message) => { | ||
| if (!isWorkerStartupFailureMessage(message) || isAbortingForStartupFailure) return; | ||
|
|
||
| isAbortingForStartupFailure = true; | ||
| fatalStartupFailure = { workerId: worker.id, failure: message }; | ||
| }); | ||
|
|
||
| // Listen for dying workers: | ||
| cluster.on('exit', (worker) => { | ||
| if (worker.isScheduledRestart) { | ||
| log.info('Restarting worker #%d on schedule', worker.id); | ||
| } else { | ||
| // TODO: Track last rendering request per worker.id | ||
| // TODO: Consider blocking a given rendering request if it kills a worker more than X times | ||
| const msg = `Worker ${worker.id} died UNEXPECTEDLY :(, restarting`; | ||
| cluster.fork(); | ||
| return; | ||
| } | ||
|
|
||
| if (isAbortingForStartupFailure) { | ||
| const failure = fatalStartupFailure?.failure; | ||
| const msg = | ||
| failure?.code === 'EADDRINUSE' | ||
| ? `Node renderer startup failed: port ${failure.port} is already in use` | ||
| : `Node renderer startup failed in worker ${worker.id}: ${failure?.message || `exit code ${worker.process.exitCode}`}`; | ||
|
|
||
| errorReporter.message(msg); | ||
| process.exit(1); | ||
| return; | ||
|
Comment on lines
+111
to
+112
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| } | ||
|
Comment on lines
+88
to
113
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Node.js cluster does not guarantee that a worker's `message` event is delivered to the master before its `exit` event. In practice this is rare because the worker only calls `process.exit()` after the failure message has been sent. One mitigation would be to listen for the worker's IPC channel `disconnect` event before acting on `exit`. |
||
| // Replace the dead worker: | ||
|
|
||
| // TODO: Track last rendering request per worker.id | ||
| // TODO: Consider blocking a given rendering request if it kills a worker more than X times | ||
| const msg = `Worker ${worker.id} died UNEXPECTEDLY :(, restarting`; | ||
| errorReporter.message(msg); | ||
| cluster.fork(); | ||
| }); | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
| export const WORKER_STARTUP_FAILURE = 'NODE_RENDERER_WORKER_STARTUP_FAILURE' as const; | ||
|
|
||
| export interface WorkerStartupFailureMessage { | ||
| type: typeof WORKER_STARTUP_FAILURE; | ||
| stage: 'listen'; | ||
| code?: string; | ||
| errno?: number; | ||
| syscall?: string; | ||
| host: string; | ||
| port: number; | ||
| message: string; | ||
| } | ||
|
|
||
| export function isWorkerStartupFailureMessage(value: unknown): value is WorkerStartupFailureMessage { | ||
| return ( | ||
| typeof value === 'object' && | ||
| value !== null && | ||
| (value as { type?: string }).type === WORKER_STARTUP_FAILURE | ||
| ); | ||
|
Comment on lines
+14
to
+19
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Validate the full IPC payload shape in the type guard.
🛡️ Proposed fix export function isWorkerStartupFailureMessage(value: unknown): value is WorkerStartupFailureMessage {
- return (
- typeof value === 'object' &&
- value !== null &&
- (value as { type?: string }).type === WORKER_STARTUP_FAILURE
- );
+ if (typeof value !== 'object' || value === null) {
+ return false;
+ }
+
+ const message = value as Partial<WorkerStartupFailureMessage>;
+ return (
+ message.type === WORKER_STARTUP_FAILURE &&
+ message.stage === 'listen' &&
+ typeof message.host === 'string' &&
+ typeof message.port === 'number' &&
+ typeof message.message === 'string' &&
+ (message.code === undefined || typeof message.code === 'string') &&
+ (message.errno === undefined || typeof message.errno === 'number') &&
+ (message.syscall === undefined || typeof message.syscall === 'string')
+ );
}🤖 Prompt for AI Agents |
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,165 @@ | ||
| import { | ||
| WORKER_STARTUP_FAILURE, | ||
| isWorkerStartupFailureMessage, | ||
| type WorkerStartupFailureMessage, | ||
| } from '../src/shared/workerMessages'; | ||
|
|
||
| /** | ||
| * These tests verify that the master process correctly handles the | ||
| * WORKER_STARTUP_FAILURE IPC message by aborting instead of reforking, | ||
| * while preserving the existing behavior for scheduled restarts and | ||
| * runtime crashes. | ||
| * | ||
| * We test the decision logic in isolation by simulating the cluster events | ||
| * that the master process listens to, rather than forking real workers. | ||
| */ | ||
|
|
||
| function buildStartupFailureMessage( | ||
| overrides: Partial<WorkerStartupFailureMessage> = {}, | ||
| ): WorkerStartupFailureMessage { | ||
| return { | ||
| type: WORKER_STARTUP_FAILURE, | ||
| stage: 'listen', | ||
| code: 'EADDRINUSE', | ||
| errno: -48, | ||
| syscall: 'listen', | ||
| host: 'localhost', | ||
| port: 3800, | ||
| message: 'listen EADDRINUSE: address already in use :::3800', | ||
| ...overrides, | ||
| }; | ||
| } | ||
|
|
||
| describe('master startup failure handling', () => { | ||
| let isAbortingForStartupFailure: boolean; | ||
| let fatalStartupFailure: { workerId: number; failure: WorkerStartupFailureMessage } | null; | ||
| let forkCount: number; | ||
| let exitCode: number | null; | ||
| let reportedMessages: string[]; | ||
|
|
||
| // Simulated handlers matching master.ts logic | ||
| function handleMessage(worker: { id: number }, message: unknown) { | ||
| if (!isWorkerStartupFailureMessage(message) || isAbortingForStartupFailure) return; | ||
|
|
||
| isAbortingForStartupFailure = true; | ||
| fatalStartupFailure = { workerId: worker.id, failure: message }; | ||
| } | ||
|
|
||
| function handleExit(worker: { | ||
| id: number; | ||
| isScheduledRestart?: boolean; | ||
| process: { exitCode: number | null }; | ||
| }) { | ||
| if (worker.isScheduledRestart) { | ||
| forkCount += 1; // cluster.fork() | ||
| return; | ||
| } | ||
|
|
||
| if (isAbortingForStartupFailure) { | ||
| const failure = fatalStartupFailure?.failure; | ||
| const msg = | ||
| failure?.code === 'EADDRINUSE' | ||
| ? `Node renderer startup failed: port ${failure.port} is already in use` | ||
| : `Node renderer startup failed in worker ${worker.id}: ${failure?.message || `exit code ${worker.process.exitCode}`}`; | ||
|
|
||
| reportedMessages.push(msg); | ||
| exitCode = 1; | ||
| return; | ||
| } | ||
|
|
||
| const msg = `Worker ${worker.id} died UNEXPECTEDLY :(, restarting`; | ||
| reportedMessages.push(msg); | ||
| forkCount += 1; // cluster.fork() | ||
| } | ||
|
Comment on lines
+41
to
+73
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These tests duplicate
🤖 Prompt for AI Agents |
||
|
|
||
| beforeEach(() => { | ||
| isAbortingForStartupFailure = false; | ||
| fatalStartupFailure = null; | ||
| forkCount = 0; | ||
| exitCode = null; | ||
| reportedMessages = []; | ||
| }); | ||
|
|
||
| it('aborts with clear message on EADDRINUSE startup failure', () => { | ||
| const worker = { id: 1, process: { exitCode: 1 } }; | ||
| const failure = buildStartupFailureMessage(); | ||
|
|
||
| // Worker sends startup failure message | ||
| handleMessage(worker, failure); | ||
| expect(isAbortingForStartupFailure).toBe(true); | ||
|
|
||
| // Worker exits | ||
| handleExit(worker); | ||
|
|
||
| expect(forkCount).toBe(0); // No refork | ||
| expect(exitCode).toBe(1); | ||
| expect(reportedMessages).toEqual(['Node renderer startup failed: port 3800 is already in use']); | ||
| }); | ||
|
|
||
| it('aborts with generic message on non-EADDRINUSE startup failure', () => { | ||
| const worker = { id: 2, process: { exitCode: 1 } }; | ||
| const failure = buildStartupFailureMessage({ | ||
| code: 'EACCES', | ||
| message: 'listen EACCES: permission denied 0.0.0.0:80', | ||
| }); | ||
|
|
||
| handleMessage(worker, failure); | ||
| handleExit(worker); | ||
|
|
||
| expect(forkCount).toBe(0); | ||
| expect(exitCode).toBe(1); | ||
| expect(reportedMessages).toEqual([ | ||
| 'Node renderer startup failed in worker 2: listen EACCES: permission denied 0.0.0.0:80', | ||
| ]); | ||
| }); | ||
|
|
||
| it('deduplicates multiple workers failing simultaneously', () => { | ||
| const worker1 = { id: 1, process: { exitCode: 1 } }; | ||
| const worker2 = { id: 2, process: { exitCode: 1 } }; | ||
| const failure = buildStartupFailureMessage(); | ||
|
|
||
| // Both workers send failure messages | ||
| handleMessage(worker1, failure); | ||
| handleMessage(worker2, failure); | ||
|
|
||
| // Only the first one is recorded | ||
| expect(fatalStartupFailure?.workerId).toBe(1); | ||
|
|
||
| // First worker exit triggers abort | ||
| handleExit(worker1); | ||
| expect(exitCode).toBe(1); | ||
| expect(forkCount).toBe(0); | ||
| }); | ||
|
|
||
| it('reforks on scheduled restart', () => { | ||
| const worker = { id: 1, isScheduledRestart: true, process: { exitCode: 0 } }; | ||
|
|
||
| handleExit(worker); | ||
|
|
||
| expect(forkCount).toBe(1); | ||
| expect(exitCode).toBeNull(); | ||
| expect(reportedMessages).toHaveLength(0); | ||
| }); | ||
|
|
||
| it('reforks on unexpected runtime crash without startup failure', () => { | ||
| const worker = { id: 3, process: { exitCode: 1 } }; | ||
|
|
||
| // No startup failure message sent — this is a runtime crash | ||
| handleExit(worker); | ||
|
|
||
| expect(forkCount).toBe(1); | ||
| expect(exitCode).toBeNull(); | ||
| expect(reportedMessages).toEqual(['Worker 3 died UNEXPECTEDLY :(, restarting']); | ||
| }); | ||
|
|
||
| it('ignores non-startup-failure messages', () => { | ||
| const worker = { id: 1 }; | ||
|
|
||
| handleMessage(worker, { type: 'SOME_OTHER_MESSAGE' }); | ||
| handleMessage(worker, 'a string message'); | ||
| handleMessage(worker, null); | ||
|
|
||
| expect(isAbortingForStartupFailure).toBe(false); | ||
| expect(fatalStartupFailure).toBeNull(); | ||
| }); | ||
| }); | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use the recorded failing worker ID in the generic error path.
Once `isAbortingForStartupFailure` is set, the next `exit` event is not guaranteed to come from the same worker that sent the IPC message. Using `worker.id` here can misreport which worker actually failed even though `fatalStartupFailure.workerId` already captured it.
🐛 Proposed fix
  if (isAbortingForStartupFailure) {
    const failure = fatalStartupFailure?.failure;
+   const failedWorkerId = fatalStartupFailure?.workerId ?? worker.id;
    const msg =
      failure?.code === 'EADDRINUSE'
        ? `Node renderer startup failed: port ${failure.port} is already in use`
-       : `Node renderer startup failed in worker ${worker.id}: ${failure?.message || `exit code ${worker.process.exitCode}`}`;
+       : `Node renderer startup failed in worker ${failedWorkerId}: ${failure?.message || `exit code ${worker.process.exitCode}`}`;
    errorReporter.message(msg);
    process.exit(1);
📝 Committable suggestion
🤖 Prompt for AI Agents