From a48cb3d3a0171df35a72c863c0ab896631b7b1fb Mon Sep 17 00:00:00 2001
From: Justin Gordon
Date: Sat, 28 Mar 2026 13:27:12 -1000
Subject: [PATCH 01/20] Fix infinite fork loop when node renderer worker fails
 to bind port

When a worker fails during app.listen() (e.g. EADDRINUSE), the master
previously reforked unconditionally, causing an infinite fork/crash loop.

Workers now send a WORKER_STARTUP_FAILURE IPC message to the master before
exiting. The master sets an abort flag and exits with a clear error message
instead of reforking. Scheduled restarts and runtime crashes continue to
refork as before.

Co-Authored-By: Yassa-hue
Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .github/workflows/claude-code-review.yml |   8 +
 .../src/master.ts                        |  34 +++-
 .../src/shared/workerMessages.ts         |  20 +++
 .../src/worker.ts                        |  19 ++
 .../tests/masterStartupFailure.test.ts   | 165 ++++++++++++++++++
 .../tests/workerStartupFailure.test.ts   | 119 +++++++++++++
 6 files changed, 360 insertions(+), 5 deletions(-)
 create mode 100644 packages/react-on-rails-pro-node-renderer/src/shared/workerMessages.ts
 create mode 100644 packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts
 create mode 100644 packages/react-on-rails-pro-node-renderer/tests/workerStartupFailure.test.ts

diff --git a/.github/workflows/claude-code-review.yml b/.github/workflows/claude-code-review.yml
index cf507b65b0..7adfc97f0d 100644
--- a/.github/workflows/claude-code-review.yml
+++ b/.github/workflows/claude-code-review.yml
@@ -7,6 +7,8 @@ on:
 jobs:
   claude-review:
     runs-on: ubuntu-latest
+    env:
+      HAS_CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN != '' }}
     permissions:
       contents: read
       pull-requests: write
@@ -21,6 +23,7 @@ jobs:
 
       - name: Run Claude Code Review
         id: claude-review
+        if: env.HAS_CLAUDE_CODE_OAUTH_TOKEN == 'true'
        uses: anthropics/claude-code-action@v1
         with:
           claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
@@ -43,3 +46,8 @@ jobs:
           claude_args: |
             --allowedTools "mcp__github_inline_comment__create_inline_comment,Bash(gh pr comment:*),Bash(gh pr diff:*),Bash(gh pr view:*)"
+
+      - name: Skip Claude review when token is unavailable
+        if: env.HAS_CLAUDE_CODE_OAUTH_TOKEN != 'true'
+        run: |
+          echo "Skipping Claude Code Review because CLAUDE_CODE_OAUTH_TOKEN is not available for this workflow context."

diff --git a/packages/react-on-rails-pro-node-renderer/src/master.ts b/packages/react-on-rails-pro-node-renderer/src/master.ts
index 3030d1480d..f8be3d5a26 100644
--- a/packages/react-on-rails-pro-node-renderer/src/master.ts
+++ b/packages/react-on-rails-pro-node-renderer/src/master.ts
@@ -10,6 +10,7 @@ import { buildConfig, Config, logSanitizedConfig } from './shared/configBuilder.
 import restartWorkers from './master/restartWorkers.js';
 import * as errorReporter from './shared/errorReporter.js';
 import { getLicenseStatus } from './shared/licenseValidator.js';
+import { isWorkerStartupFailureMessage, type WorkerStartupFailureMessage } from './shared/workerMessages.js';
 
 const MILLISECONDS_IN_MINUTE = 60000;
 // How often to scan for orphaned upload directories.
@@ -77,21 +78,44 @@ export default function masterRun(runningConfig?: Partial<Config>) {
     })();
   }, ORPHAN_CLEANUP_INTERVAL_MS);
 
+  let isAbortingForStartupFailure = false;
+  let fatalStartupFailure: { workerId: number; failure: WorkerStartupFailureMessage } | null = null;
+
   for (let i = 0; i < workersCount; i += 1) {
     cluster.fork();
   }
 
+  cluster.on('message', (worker, message) => {
+    if (!isWorkerStartupFailureMessage(message) || isAbortingForStartupFailure) return;
+
+    isAbortingForStartupFailure = true;
+    fatalStartupFailure = { workerId: worker.id, failure: message };
+  });
+
   // Listen for dying workers:
   cluster.on('exit', (worker) => {
     if (worker.isScheduledRestart) {
       log.info('Restarting worker #%d on schedule', worker.id);
-    } else {
-      // TODO: Track last rendering request per worker.id
-      // TODO: Consider blocking a given rendering request if it kills a worker more than X times
-      const msg = `Worker ${worker.id} died UNEXPECTEDLY :(, restarting`;
-      errorReporter.message(msg);
-    }
-    // Replace the dead worker:
+      cluster.fork();
+      return;
+    }
+
+    if (isAbortingForStartupFailure) {
+      const failure = fatalStartupFailure?.failure;
+      const msg =
+        failure?.code === 'EADDRINUSE'
+          ? `Node renderer startup failed: port ${failure.port} is already in use`
+          : `Node renderer startup failed in worker ${worker.id}: ${failure?.message || `exit code ${worker.process.exitCode}`}`;
+      errorReporter.message(msg);
+      process.exit(1);
+      return;
+    }
+
+    // TODO: Track last rendering request per worker.id
+    // TODO: Consider blocking a given rendering request if it kills a worker more than X times
+    const msg = `Worker ${worker.id} died UNEXPECTEDLY :(, restarting`;
+    errorReporter.message(msg);
     cluster.fork();
   });

diff --git a/packages/react-on-rails-pro-node-renderer/src/shared/workerMessages.ts b/packages/react-on-rails-pro-node-renderer/src/shared/workerMessages.ts
new file mode 100644
index 0000000000..e5d14dbe36
--- /dev/null
+++ b/packages/react-on-rails-pro-node-renderer/src/shared/workerMessages.ts
@@ -0,0 +1,20 @@
+export const WORKER_STARTUP_FAILURE = 'NODE_RENDERER_WORKER_STARTUP_FAILURE' as const;
+
+export interface WorkerStartupFailureMessage {
+  type: typeof WORKER_STARTUP_FAILURE;
+  stage: 'listen';
+  code?: string;
+  errno?: number;
+  syscall?: string;
+  host: string;
+  port: number;
+  message: string;
+}
+
+export function isWorkerStartupFailureMessage(value: unknown): value is WorkerStartupFailureMessage {
+  return (
+    typeof value === 'object' &&
+    value !== null &&
+    (value as { type?: string }).type === WORKER_STARTUP_FAILURE
+  );
+}

diff --git a/packages/react-on-rails-pro-node-renderer/src/worker.ts b/packages/react-on-rails-pro-node-renderer/src/worker.ts
index eb2d0ec9f9..90b9d83a99 100644
--- a/packages/react-on-rails-pro-node-renderer/src/worker.ts
+++ b/packages/react-on-rails-pro-node-renderer/src/worker.ts
@@ -23,6 +23,7 @@ import {
   type ProvidedNewBundle,
 } from './worker/handleRenderRequest.js';
 import handleGracefulShutdown from './worker/handleGracefulShutdown.js';
+import { WORKER_STARTUP_FAILURE, type WorkerStartupFailureMessage } from './shared/workerMessages.js';
 import {
   badRequestResponseResult,
   errorResponseResult,
@@ -511,6 +512,24 @@ export default function run(config: Partial<Config>) {
   app.listen({ port, host }, (err, address) => {
     if (err) {
       log.error({ err, host, port }, 'Node renderer failed to start');
+
+      if (cluster.isWorker && process.send) {
+        const startupFailure: WorkerStartupFailureMessage = {
+          type: WORKER_STARTUP_FAILURE,
+          stage: 'listen',
+          code: (err as NodeJS.ErrnoException).code,
+          errno: (err as NodeJS.ErrnoException).errno,
+          syscall: (err as NodeJS.ErrnoException).syscall,
+          host,
+          port,
+          message: err.message,
+        };
+        process.send(startupFailure, undefined, undefined, () => {
+          process.exit(1);
+        });
+        return;
+      }
+
       process.exit(1);
     }
     const workerName = worker ? `worker #${worker.id}` : 'master (single-process)';

diff --git a/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts b/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts
new file mode 100644
index 0000000000..1d3daa374b
--- /dev/null
+++ b/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts
@@ -0,0 +1,165 @@
+import {
+  WORKER_STARTUP_FAILURE,
+  isWorkerStartupFailureMessage,
+  type WorkerStartupFailureMessage,
+} from '../src/shared/workerMessages';
+
+/**
+ * These tests verify that the master process correctly handles the
+ * WORKER_STARTUP_FAILURE IPC message by aborting instead of reforking,
+ * while preserving the existing behavior for scheduled restarts and
+ * runtime crashes.
+ *
+ * We test the decision logic in isolation by simulating the cluster events
+ * that the master process listens to, rather than forking real workers.
+ */
+
+function buildStartupFailureMessage(
+  overrides: Partial<WorkerStartupFailureMessage> = {},
+): WorkerStartupFailureMessage {
+  return {
+    type: WORKER_STARTUP_FAILURE,
+    stage: 'listen',
+    code: 'EADDRINUSE',
+    errno: -48,
+    syscall: 'listen',
+    host: 'localhost',
+    port: 3800,
+    message: 'listen EADDRINUSE: address already in use :::3800',
+    ...overrides,
+  };
+}
+
+describe('master startup failure handling', () => {
+  let isAbortingForStartupFailure: boolean;
+  let fatalStartupFailure: { workerId: number; failure: WorkerStartupFailureMessage } | null;
+  let forkCount: number;
+  let exitCode: number | null;
+  let reportedMessages: string[];
+
+  // Simulated handlers matching master.ts logic
+  function handleMessage(worker: { id: number }, message: unknown) {
+    if (!isWorkerStartupFailureMessage(message) || isAbortingForStartupFailure) return;
+
+    isAbortingForStartupFailure = true;
+    fatalStartupFailure = { workerId: worker.id, failure: message };
+  }
+
+  function handleExit(worker: {
+    id: number;
+    isScheduledRestart?: boolean;
+    process: { exitCode: number | null };
+  }) {
+    if (worker.isScheduledRestart) {
+      forkCount += 1; // cluster.fork()
+      return;
+    }
+
+    if (isAbortingForStartupFailure) {
+      const failure = fatalStartupFailure?.failure;
+      const msg =
+        failure?.code === 'EADDRINUSE'
+          ? `Node renderer startup failed: port ${failure.port} is already in use`
+          : `Node renderer startup failed in worker ${worker.id}: ${failure?.message || `exit code ${worker.process.exitCode}`}`;
+
+      reportedMessages.push(msg);
+      exitCode = 1;
+      return;
+    }
+
+    const msg = `Worker ${worker.id} died UNEXPECTEDLY :(, restarting`;
+    reportedMessages.push(msg);
+    forkCount += 1; // cluster.fork()
+  }
+
+  beforeEach(() => {
+    isAbortingForStartupFailure = false;
+    fatalStartupFailure = null;
+    forkCount = 0;
+    exitCode = null;
+    reportedMessages = [];
+  });
+
+  it('aborts with clear message on EADDRINUSE startup failure', () => {
+    const worker = { id: 1, process: { exitCode: 1 } };
+    const failure = buildStartupFailureMessage();
+
+    // Worker sends startup failure message
+    handleMessage(worker, failure);
+    expect(isAbortingForStartupFailure).toBe(true);
+
+    // Worker exits
+    handleExit(worker);
+
+    expect(forkCount).toBe(0); // No refork
+    expect(exitCode).toBe(1);
+    expect(reportedMessages).toEqual(['Node renderer startup failed: port 3800 is already in use']);
+  });
+
+  it('aborts with generic message on non-EADDRINUSE startup failure', () => {
+    const worker = { id: 2, process: { exitCode: 1 } };
+    const failure = buildStartupFailureMessage({
+      code: 'EACCES',
+      message: 'listen EACCES: permission denied 0.0.0.0:80',
+    });
+
+    handleMessage(worker, failure);
+    handleExit(worker);
+
+    expect(forkCount).toBe(0);
+    expect(exitCode).toBe(1);
+    expect(reportedMessages).toEqual([
+      'Node renderer startup failed in worker 2: listen EACCES: permission denied 0.0.0.0:80',
+    ]);
+  });
+
+  it('deduplicates multiple workers failing simultaneously', () => {
+    const worker1 = { id: 1, process: { exitCode: 1 } };
+    const worker2 = { id: 2, process: { exitCode: 1 } };
+    const failure = buildStartupFailureMessage();
+
+    // Both workers send failure messages
+    handleMessage(worker1, failure);
+    handleMessage(worker2, failure);
+
+    // Only the first one is recorded
+    expect(fatalStartupFailure?.workerId).toBe(1);
+
+    // First worker exit triggers abort
+    handleExit(worker1);
+    expect(exitCode).toBe(1);
+    expect(forkCount).toBe(0);
+  });
+
+  it('reforks on scheduled restart', () => {
+    const worker = { id: 1, isScheduledRestart: true, process: { exitCode: 0 } };
+
+    handleExit(worker);
+
+    expect(forkCount).toBe(1);
+    expect(exitCode).toBeNull();
+    expect(reportedMessages).toHaveLength(0);
+  });
+
+  it('reforks on unexpected runtime crash without startup failure', () => {
+    const worker = { id: 3, process: { exitCode: 1 } };
+
+    // No startup failure message sent — this is a runtime crash
+    handleExit(worker);
+
+    expect(forkCount).toBe(1);
+    expect(exitCode).toBeNull();
+    expect(reportedMessages).toEqual(['Worker 3 died UNEXPECTEDLY :(, restarting']);
+  });
+
+  it('ignores non-startup-failure messages', () => {
+    const worker = { id: 1 };
+
+    handleMessage(worker, { type: 'SOME_OTHER_MESSAGE' });
+    handleMessage(worker, 'a string message');
+    handleMessage(worker, null);
+
+    expect(isAbortingForStartupFailure).toBe(false);
+    expect(fatalStartupFailure).toBeNull();
+  });
+});

diff --git a/packages/react-on-rails-pro-node-renderer/tests/workerStartupFailure.test.ts b/packages/react-on-rails-pro-node-renderer/tests/workerStartupFailure.test.ts
new file mode 100644
index 0000000000..4e5b7d185e
--- /dev/null
+++ b/packages/react-on-rails-pro-node-renderer/tests/workerStartupFailure.test.ts
@@ -0,0 +1,119 @@
+import cluster from 'cluster';
+import {
+  WORKER_STARTUP_FAILURE,
+  isWorkerStartupFailureMessage,
+  type WorkerStartupFailureMessage,
+} from '../src/shared/workerMessages';
+
+describe('isWorkerStartupFailureMessage', () => {
+  it('returns true for a valid startup failure message', () => {
+    const msg: WorkerStartupFailureMessage = {
+      type: WORKER_STARTUP_FAILURE,
+      stage: 'listen',
+      code: 'EADDRINUSE',
+      errno: -48,
+      syscall: 'listen',
+      host: 'localhost',
+      port: 3800,
+      message: 'listen EADDRINUSE: address already in use :::3800',
+    };
+    expect(isWorkerStartupFailureMessage(msg)).toBe(true);
+  });
+
+  it('returns false for null', () => {
+    expect(isWorkerStartupFailureMessage(null)).toBe(false);
+  });
+
+  it('returns false for a string', () => {
+    expect(isWorkerStartupFailureMessage('hello')).toBe(false);
+  });
+
+  it('returns false for an object with a different type', () => {
+    expect(isWorkerStartupFailureMessage({ type: 'OTHER' })).toBe(false);
+  });
+
+  it('returns false for an object without type', () => {
+    expect(isWorkerStartupFailureMessage({ stage: 'listen' })).toBe(false);
+  });
+});
+
+describe('worker startup failure IPC', () => {
+  const originalSend = process.send;
+  const originalExit = process.exit;
+  const originalIsWorker = cluster.isWorker;
+
+  afterEach(() => {
+    process.send = originalSend;
+    process.exit = originalExit;
+    Object.defineProperty(cluster, 'isWorker', { value: originalIsWorker, writable: true });
+  });
+
+  it('sends WORKER_STARTUP_FAILURE message in clustered mode', () => {
+    // Simulate a cluster worker environment
+    Object.defineProperty(cluster, 'isWorker', { value: true, writable: true });
+    const sentMessages: unknown[] = [];
+    const exitCalls: number[] = [];
+
+    process.send = ((msg: unknown, _handle?: unknown, _options?: unknown, callback?: () => void) => {
+      sentMessages.push(msg);
+      if (callback) callback();
+      return true;
+    }) as typeof process.send;
+
+    process.exit = ((code?: number) => {
+      exitCalls.push(code ?? 0);
+    }) as typeof process.exit;
+
+    // Simulate what the worker does on listen error
+    const err = Object.assign(new Error('listen EADDRINUSE: address already in use :::3800'), {
+      code: 'EADDRINUSE',
+      errno: -48,
+      syscall: 'listen',
+    });
+    const host = 'localhost';
+    const port = 3800;
+
+    const startupFailure: WorkerStartupFailureMessage = {
+      type: WORKER_STARTUP_FAILURE,
+      stage: 'listen',
+      code: err.code,
+      errno: err.errno,
+      syscall: err.syscall,
+      host,
+      port,
+      message: err.message,
+    };
+
+    process.send!(startupFailure, undefined, undefined, () => {
+      process.exit(1);
+    });
+
+    expect(sentMessages).toHaveLength(1);
+    expect(isWorkerStartupFailureMessage(sentMessages[0])).toBe(true);
+    expect((sentMessages[0] as WorkerStartupFailureMessage).code).toBe('EADDRINUSE');
+    expect(exitCalls).toEqual([1]);
+  });
+
+  it('exits without IPC in single-process mode', () => {
+    Object.defineProperty(cluster, 'isWorker', { value: false, writable: true });
+    process.send = undefined;
+
+    const exitCalls: number[] = [];
+    process.exit = ((code?: number) => {
+      exitCalls.push(code ?? 0);
+    }) as typeof process.exit;
+
+    // In single-process mode, cluster.isWorker is false and process.send is undefined
+    if (cluster.isWorker && process.send) {
+      // Should not reach here
+      process.send({ type: WORKER_STARTUP_FAILURE }, undefined, undefined, () => {
+        process.exit(1);
+      });
+      return;
+    }
+
+    process.exit(1);
+
+    expect(exitCalls).toEqual([1]);
+  });
+});

From a861a6eb1baf401ed224a2aa299f60c0896b65c3 Mon Sep 17 00:00:00 2001
From: Justin Gordon
Date: Sat, 28 Mar 2026 17:49:51 -1000
Subject: [PATCH 02/20] Revert claude-code-review.yml to match default branch

The workflow modifications (env check, conditional step, skip step) cause a
401 OIDC token validation failure because GitHub requires workflow content
to match the default branch for id-token: write permissions. Reverting to
unblock CI; workflow changes should be merged to main separately.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .github/workflows/claude-code-review.yml | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/.github/workflows/claude-code-review.yml b/.github/workflows/claude-code-review.yml
index 7adfc97f0d..cf507b65b0 100644
--- a/.github/workflows/claude-code-review.yml
+++ b/.github/workflows/claude-code-review.yml
@@ -7,8 +7,6 @@ on:
 jobs:
   claude-review:
     runs-on: ubuntu-latest
-    env:
-      HAS_CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN != '' }}
     permissions:
       contents: read
       pull-requests: write
@@ -23,7 +21,6 @@ jobs:
 
       - name: Run Claude Code Review
         id: claude-review
-        if: env.HAS_CLAUDE_CODE_OAUTH_TOKEN == 'true'
         uses: anthropics/claude-code-action@v1
         with:
           claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
@@ -46,8 +43,3 @@ jobs:
           claude_args: |
             --allowedTools "mcp__github_inline_comment__create_inline_comment,Bash(gh pr comment:*),Bash(gh pr diff:*),Bash(gh pr view:*)"
-
-      - name: Skip Claude review when token is unavailable
-        if: env.HAS_CLAUDE_CODE_OAUTH_TOKEN != 'true'
-        run: |
-          echo "Skipping Claude Code Review because CLAUDE_CODE_OAUTH_TOKEN is not available for this workflow context."
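
A note before the next patch: PATCH 03 moves the master's cluster.on('message')
registration ahead of the fork loop and tightens the runtime validation of the
failure payload. The following is a rough standalone sketch of that
listener-before-fork pattern, assuming only Node's built-in cluster API — it is
not the renderer's actual code, and the FAILURE_TYPE constant and two-worker
count are illustrative:

    import cluster from 'cluster';

    // Illustrative message type; the real renderer uses the
    // WORKER_STARTUP_FAILURE constant from shared/workerMessages.
    const FAILURE_TYPE = 'STARTUP_FAILURE';

    if (!cluster.isWorker) {
      // Attach the listener before forking, so the master is guaranteed to
      // be listening by the time any worker can report a startup failure.
      cluster.on('message', (worker, message) => {
        if ((message as { type?: string }).type === FAILURE_TYPE) {
          console.error(`worker ${worker.id} failed to start:`, message);
        }
      });

      for (let i = 0; i < 2; i += 1) {
        cluster.fork();
      }
    } else {
      // A worker that cannot start reports over IPC, then exits once the
      // message has been handed off to the channel.
      process.send?.({ type: FAILURE_TYPE, message: 'could not bind port' }, undefined, undefined, () => {
        process.exit(1);
      });
    }
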
From 4f82dd796703a804f9a1cf9d9a918fc0bf5f6364 Mon Sep 17 00:00:00 2001
From: Justin Gordon
Date: Sat, 28 Mar 2026 18:38:14 -1000
Subject: [PATCH 03/20] Fix startup-failure listener ordering and failure
 payload checks

---
 .../src/master.ts                | 11 ++++++-----
 .../src/shared/workerMessages.ts | 15 ++++++++++++---
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/packages/react-on-rails-pro-node-renderer/src/master.ts b/packages/react-on-rails-pro-node-renderer/src/master.ts
index f8be3d5a26..3432365784 100644
--- a/packages/react-on-rails-pro-node-renderer/src/master.ts
+++ b/packages/react-on-rails-pro-node-renderer/src/master.ts
@@ -81,10 +81,6 @@ export default function masterRun(runningConfig?: Partial<Config>) {
   let isAbortingForStartupFailure = false;
   let fatalStartupFailure: { workerId: number; failure: WorkerStartupFailureMessage } | null = null;
 
-  for (let i = 0; i < workersCount; i += 1) {
-    cluster.fork();
-  }
-
   cluster.on('message', (worker, message) => {
     if (!isWorkerStartupFailureMessage(message) || isAbortingForStartupFailure) return;
 
@@ -92,6 +88,10 @@ export default function masterRun(runningConfig?: Partial<Config>) {
     fatalStartupFailure = { workerId: worker.id, failure: message };
   });
 
+  for (let i = 0; i < workersCount; i += 1) {
+    cluster.fork();
+  }
+
   // Listen for dying workers:
   cluster.on('exit', (worker) => {
     if (worker.isScheduledRestart) {
@@ -102,10 +102,11 @@ export default function masterRun(runningConfig?: Partial<Config>) {
 
     if (isAbortingForStartupFailure) {
       const failure = fatalStartupFailure?.failure;
+      const failedWorkerId = fatalStartupFailure?.workerId ?? worker.id;
       const msg =
         failure?.code === 'EADDRINUSE'
           ? `Node renderer startup failed: port ${failure.port} is already in use`
-          : `Node renderer startup failed in worker ${worker.id}: ${failure?.message || `exit code ${worker.process.exitCode}`}`;
+          : `Node renderer startup failed in worker ${failedWorkerId}: ${failure?.message || `exit code ${worker.process.exitCode}`}`;
 
       errorReporter.message(msg);
       process.exit(1);

diff --git a/packages/react-on-rails-pro-node-renderer/src/shared/workerMessages.ts b/packages/react-on-rails-pro-node-renderer/src/shared/workerMessages.ts
index e5d14dbe36..8bd66214a9 100644
--- a/packages/react-on-rails-pro-node-renderer/src/shared/workerMessages.ts
+++ b/packages/react-on-rails-pro-node-renderer/src/shared/workerMessages.ts
@@ -12,9 +12,18 @@ export interface WorkerStartupFailureMessage {
 }
 
 export function isWorkerStartupFailureMessage(value: unknown): value is WorkerStartupFailureMessage {
+  if (typeof value !== 'object' || value === null) {
+    return false;
+  }
+
+  const message = value as Partial<WorkerStartupFailureMessage>;
+
   return (
-    typeof value === 'object' &&
-    value !== null &&
-    (value as { type?: string }).type === WORKER_STARTUP_FAILURE
+    message.type === WORKER_STARTUP_FAILURE &&
+    message.stage === 'listen' &&
+    typeof message.host === 'string' &&
+    typeof message.port === 'number' &&
+    !Number.isNaN(message.port) &&
+    typeof message.message === 'string'
   );
 }

From a090fee97141240d7d6dfe123abed916a60509f8 Mon Sep 17 00:00:00 2001
From: Justin Gordon
Date: Sat, 28 Mar 2026 19:03:06 -1000
Subject: [PATCH 04/20] Harden worker startup-failure IPC exit path

---
 .../src/worker.ts | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/packages/react-on-rails-pro-node-renderer/src/worker.ts b/packages/react-on-rails-pro-node-renderer/src/worker.ts
index 90b9d83a99..046237b614 100644
--- a/packages/react-on-rails-pro-node-renderer/src/worker.ts
+++ b/packages/react-on-rails-pro-node-renderer/src/worker.ts
@@ -524,9 +524,24 @@ export default function run(config: Partial<Config>) {
           port,
           message: err.message,
         };
-        process.send(startupFailure, undefined, undefined, () => {
+        const exitFallbackTimer = setTimeout(() => {
           process.exit(1);
-        });
+        }, 500);
+        exitFallbackTimer.unref();
+
+        try {
+          process.send(startupFailure, undefined, undefined, (sendErr) => {
+            clearTimeout(exitFallbackTimer);
+            if (sendErr) {
+              log.warn({ err: sendErr }, 'Failed to send startup failure message to master');
+            }
+            process.exit(1);
+          });
+        } catch (sendErr) {
+          clearTimeout(exitFallbackTimer);
+          log.warn({ err: sendErr }, 'Failed to send startup failure message to master');
+          process.exit(1);
+        }
         return;
       }

From 5440452e72ec2c34ef76e646c425f031dd052fed Mon Sep 17 00:00:00 2001
From: Justin Gordon
Date: Sat, 28 Mar 2026 22:18:36 -1000
Subject: [PATCH 05/20] Prevent regressions in node renderer startup-failure
 abort handling

---
 .../src/worker.ts                           |   2 +
 .../tests/masterStartupFailure.test.ts      | 104 +++++++++++-
 .../repro-node-renderer-startup-failure.mjs | 148 ++++++++++++++++++
 3 files changed, 252 insertions(+), 2 deletions(-)
 create mode 100755 scripts/repro-node-renderer-startup-failure.mjs

diff --git a/packages/react-on-rails-pro-node-renderer/src/worker.ts b/packages/react-on-rails-pro-node-renderer/src/worker.ts
index 046237b614..a821911e4b 100644
--- a/packages/react-on-rails-pro-node-renderer/src/worker.ts
+++ b/packages/react-on-rails-pro-node-renderer/src/worker.ts
@@ -524,6 +524,8 @@ export default function run(config: Partial<Config>) {
           port,
           message: err.message,
         };
+        // If the IPC callback never fires because the channel is already broken,
+        // force the worker to exit instead of hanging during startup failure.
         const exitFallbackTimer = setTimeout(() => {
           process.exit(1);
         }, 500);

diff --git a/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts b/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts
index 1d3daa374b..ec2823db6e 100644
--- a/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts
+++ b/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts
@@ -10,8 +10,10 @@ import {
  * while preserving the existing behavior for scheduled restarts and
  * runtime crashes.
  *
- * We test the decision logic in isolation by simulating the cluster events
- * that the master process listens to, rather than forking real workers.
+ * Most cases exercise the decision logic in isolation by simulating the
+ * cluster events that the master process listens to. We also keep one thin
+ * wiring test that runs masterRun() against a mocked cluster so regressions
+ * in the actual listener registration path are caught too.
  */
 
 function buildStartupFailureMessage(
@@ -30,6 +32,104 @@ function buildStartupFailureMessage(
   };
 }
 
+describe('masterRun wiring', () => {
+  afterEach(() => {
+    jest.restoreAllMocks();
+    jest.resetModules();
+  });
+
+  it('registers the startup-failure listener before forking and aborts without reforking', () => {
+    const mockOperations: string[] = [];
+    const mockClusterHandlers: Record<string, (...args: unknown[]) => void> = {};
+    const mockFork = jest.fn(() => {
+      mockOperations.push('fork');
+      return {};
+    });
+    const mockCluster = {
+      on: jest.fn((event: string, handler: (...args: unknown[]) => void) => {
+        mockOperations.push(`on:${event}`);
+        mockClusterHandlers[event] = handler;
+        return mockCluster;
+      }),
+      fork: mockFork,
+    };
+    const mockErrorReporterMessage = jest.fn();
+    const mockLog = {
+      info: jest.fn(),
+      warn: jest.fn(),
+      error: jest.fn(),
+      fatal: jest.fn(),
+    };
+    const mockBuildConfig = jest.fn(() => ({
+      workersCount: 2,
+      allWorkersRestartInterval: undefined,
+      delayBetweenIndividualWorkerRestarts: undefined,
+      gracefulWorkerRestartTimeout: 0,
+      serverBundleCachePath: '/tmp/react-on-rails-pro-node-renderer-bundles',
+    }));
+    const mockLogSanitizedConfig = jest.fn();
+    const mockGetLicenseStatus = jest.fn(() => 'valid');
+    const setIntervalSpy = jest.spyOn(global, 'setInterval').mockReturnValue(0 as unknown as NodeJS.Timeout);
+    const processExitSpy = jest.spyOn(process, 'exit').mockImplementation(((code?: number) => {
+      throw new Error(`process.exit:${code}`);
+    }) as typeof process.exit);
+
+    jest.doMock('cluster', () => ({
+      __esModule: true,
+      default: mockCluster,
+    }));
+    jest.doMock('../src/shared/log', () => ({
+      __esModule: true,
+      default: mockLog,
+    }));
+    jest.doMock('../src/shared/errorReporter', () => ({
+      __esModule: true,
+      message: mockErrorReporterMessage,
+      error: jest.fn(),
+      addMessageNotifier: jest.fn(),
+      addErrorNotifier: jest.fn(),
+      addNotifier: jest.fn(),
+    }));
+    jest.doMock('../src/shared/configBuilder', () => ({
+      __esModule: true,
+      buildConfig: mockBuildConfig,
+      logSanitizedConfig: mockLogSanitizedConfig,
+    }));
+    jest.doMock('../src/shared/licenseValidator', () => ({
+      __esModule: true,
+      getLicenseStatus: mockGetLicenseStatus,
+    }));
+    jest.doMock('../src/master/restartWorkers', () => ({
+      __esModule: true,
+      default: jest.fn(),
+    }));
+
+    let masterRun: typeof import('../src/master').default;
+    jest.isolateModules(() => {
+      // eslint-disable-next-line global-require
+      masterRun = require('../src/master').default as typeof import('../src/master').default;
+    });
+
+    masterRun();
+
+    expect(mockOperations).toEqual(['on:message', 'fork', 'fork', 'on:exit']);
+    expect(mockFork).toHaveBeenCalledTimes(2);
+    expect(setIntervalSpy).toHaveBeenCalledTimes(1);
+
+    const failure = buildStartupFailureMessage();
+    const worker = { id: 1, process: { exitCode: 1 } };
+
+    mockClusterHandlers.message(worker, failure);
+
+    expect(() => mockClusterHandlers.exit(worker)).toThrow('process.exit:1');
+    expect(mockErrorReporterMessage).toHaveBeenCalledWith(
+      'Node renderer startup failed: port 3800 is already in use',
+    );
+    expect(processExitSpy).toHaveBeenCalledWith(1);
+    expect(mockFork).toHaveBeenCalledTimes(2);
+  });
+});
+
 describe('master startup failure handling', () => {
   let isAbortingForStartupFailure: boolean;
   let fatalStartupFailure: { workerId: number; failure: WorkerStartupFailureMessage } | null;

diff --git a/scripts/repro-node-renderer-startup-failure.mjs b/scripts/repro-node-renderer-startup-failure.mjs
new file mode 100755
index 0000000000..025d040e4c
--- /dev/null
+++ b/scripts/repro-node-renderer-startup-failure.mjs
@@ -0,0 +1,148 @@
+#!/usr/bin/env node
+
+import net from 'node:net';
+import path from 'node:path';
+import { spawn, spawnSync } from 'node:child_process';
+import process from 'node:process';
+
+const RENDERER_PACKAGE = 'react-on-rails-pro-node-renderer';
+const RENDERER_ENTRY = path.join(
+  process.cwd(),
+  'packages',
+  'react-on-rails-pro-node-renderer',
+  'lib',
+  'default-node-renderer.js',
+);
+
+const args = new Set(process.argv.slice(2));
+const skipBuild = args.has('--skip-build');
+const timeoutMs = Number(process.env.REPRO_TIMEOUT_MS || '8000');
+
+if (Number.isNaN(timeoutMs) || timeoutMs <= 0) {
+  console.error(`Invalid REPRO_TIMEOUT_MS value: ${process.env.REPRO_TIMEOUT_MS}`);
+  process.exit(1);
+}
+
+if (!skipBuild) {
+  console.log(`Building ${RENDERER_PACKAGE}...`);
+  const buildResult = spawnSync('pnpm', ['--filter', RENDERER_PACKAGE, 'build'], {
+    stdio: 'inherit',
+    env: process.env,
+  });
+
+  if (buildResult.status !== 0) {
+    process.exit(buildResult.status ?? 1);
+  }
+}
+
+const blocker = net.createServer();
+
+const closeBlocker = async () => {
+  await new Promise((resolve) => {
+    blocker.close(() => resolve(undefined));
+  });
+};
+
+blocker.listen(0, '127.0.0.1', () => {
+  const address = blocker.address();
+  if (!address || typeof address === 'string') {
+    console.error('Failed to allocate a blocker port for reproduction');
+    process.exit(1);
+  }
+
+  const blockedPort = address.port;
+  const startTime = Date.now();
+  let timedOut = false;
+
+  const renderer = spawn('node', [RENDERER_ENTRY], {
+    env: {
+      ...process.env,
+      NODE_ENV: 'test',
+      RENDERER_HOST: '127.0.0.1',
+      RENDERER_PORT: String(blockedPort),
+      RENDERER_WORKERS_COUNT: '2',
+      RENDERER_LOG_LEVEL: 'info',
+      RENDERER_LOG_HTTP_LEVEL: 'error',
+    },
+    stdio: ['ignore', 'pipe', 'pipe'],
+  });
+
+  let output = '';
+  renderer.stdout.on('data', (chunk) => {
+    output += chunk.toString();
+  });
+  renderer.stderr.on('data', (chunk) => {
+    output += chunk.toString();
+  });
+
+  const timeout = setTimeout(() => {
+    timedOut = true;
+    renderer.kill('SIGKILL');
+  }, timeoutMs);
+
+  renderer.on('error', async (err) => {
+    clearTimeout(timeout);
+    await closeBlocker();
+    console.error('Failed to spawn node renderer:', err);
+    process.exit(1);
+  });
+
+  renderer.on('exit', async (code, signal) => {
+    clearTimeout(timeout);
+    await closeBlocker();
+
+    const durationMs = Date.now() - startTime;
+    const restartMessages = (output.match(/died UNEXPECTEDLY :\(, restarting/g) || []).length;
+    const startupFailureMessage = new RegExp(
+      `Node renderer startup failed: port ${blockedPort} is already in use`,
+    ).test(output);
+
+    const checks = {
+      exitedWithoutTimeout: !timedOut,
+      exitCodeIsOne: code === 1,
+      noSignal: signal === null,
+      startupFailureMessage,
+      noUnexpectedRestartLoop: restartMessages === 0,
+    };
+
+    const passed = Object.values(checks).every(Boolean);
+
+    if (passed) {
+      console.log('PASS: startup failure path aborts cleanly with no refork loop.');
+      console.log(
+        JSON.stringify(
+          {
+            blockedPort,
+            durationMs,
+            code,
+            signal,
+            restartMessages,
+          },
+          null,
+          2,
+        ),
+      );
+      process.exit(0);
+    }
+
+    console.error('FAIL: startup failure reproduction did not match expected fixed behavior.');
+    console.error(
+      JSON.stringify(
+        {
+          blockedPort,
+          durationMs,
+          code,
+          signal,
+          timedOut,
+          restartMessages,
+          checks,
+        },
+        null,
+        2,
+      ),
+    );
+    console.error('--- renderer output (first 200 lines) ---');
+    console.error(output.split('\n').slice(0, 200).join('\n'));
+    process.exit(1);
+  });
+});

From c4ce2ea5a91547961bf9fe084ad3577c73de5545 Mon Sep 17 00:00:00 2001
From: Yassa-hue
Date: Sun, 29 Mar 2026 20:52:19 +0200
Subject: [PATCH 06/20] Remove unnecessary 500ms fallback timer from worker
 IPC exit path

process.send with a callback always invokes the callback (success or async
failure), and the try/catch handles the synchronous throw case. The
arbitrary timer added complexity without covering a real failure mode.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 packages/react-on-rails-pro-node-renderer/src/worker.ts | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/packages/react-on-rails-pro-node-renderer/src/worker.ts b/packages/react-on-rails-pro-node-renderer/src/worker.ts
index a821911e4b..509820fe1a 100644
--- a/packages/react-on-rails-pro-node-renderer/src/worker.ts
+++ b/packages/react-on-rails-pro-node-renderer/src/worker.ts
@@ -524,23 +524,14 @@ export default function run(config: Partial<Config>) {
           port,
           message: err.message,
         };
-        // If the IPC callback never fires because the channel is already broken,
-        // force the worker to exit instead of hanging during startup failure.
-        const exitFallbackTimer = setTimeout(() => {
-          process.exit(1);
-        }, 500);
-        exitFallbackTimer.unref();
-
         try {
           process.send(startupFailure, undefined, undefined, (sendErr) => {
-            clearTimeout(exitFallbackTimer);
             if (sendErr) {
               log.warn({ err: sendErr }, 'Failed to send startup failure message to master');
             }
             process.exit(1);
           });
         } catch (sendErr) {
-          clearTimeout(exitFallbackTimer);
           log.warn({ err: sendErr }, 'Failed to send startup failure message to master');
           process.exit(1);
         }

From 1be836d45df439c2dceeeceac5349fa9436ded51 Mon Sep 17 00:00:00 2001
From: Yassa-hue
Date: Sun, 29 Mar 2026 21:03:30 +0200
Subject: [PATCH 07/20] Remove redundant startup-failure repro script

The unit and wiring tests in masterStartupFailure.test.ts already cover the
same behavior. A standalone script outside the test framework adds
maintenance burden without additional coverage.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .../repro-node-renderer-startup-failure.mjs | 148 ------------------
 1 file changed, 148 deletions(-)
 delete mode 100755 scripts/repro-node-renderer-startup-failure.mjs

diff --git a/scripts/repro-node-renderer-startup-failure.mjs b/scripts/repro-node-renderer-startup-failure.mjs
deleted file mode 100755
index 025d040e4c..0000000000
--- a/scripts/repro-node-renderer-startup-failure.mjs
+++ /dev/null
@@ -1,148 +0,0 @@
-#!/usr/bin/env node
-
-import net from 'node:net';
-import path from 'node:path';
-import { spawn, spawnSync } from 'node:child_process';
-import process from 'node:process';
-
-const RENDERER_PACKAGE = 'react-on-rails-pro-node-renderer';
-const RENDERER_ENTRY = path.join(
-  process.cwd(),
-  'packages',
-  'react-on-rails-pro-node-renderer',
-  'lib',
-  'default-node-renderer.js',
-);
-
-const args = new Set(process.argv.slice(2));
-const skipBuild = args.has('--skip-build');
-const timeoutMs = Number(process.env.REPRO_TIMEOUT_MS || '8000');
-
-if (Number.isNaN(timeoutMs) || timeoutMs <= 0) {
-  console.error(`Invalid REPRO_TIMEOUT_MS value: ${process.env.REPRO_TIMEOUT_MS}`);
-  process.exit(1);
-}
-
-if (!skipBuild) {
-  console.log(`Building ${RENDERER_PACKAGE}...`);
-  const buildResult = spawnSync('pnpm', ['--filter', RENDERER_PACKAGE, 'build'], {
-    stdio: 'inherit',
-    env: process.env,
-  });
-
-  if (buildResult.status !== 0) {
-    process.exit(buildResult.status ?? 1);
-  }
-}
-
-const blocker = net.createServer();
-
-const closeBlocker = async () => {
-  await new Promise((resolve) => {
-    blocker.close(() => resolve(undefined));
-  });
-};
-
-blocker.listen(0, '127.0.0.1', () => {
-  const address = blocker.address();
-  if (!address || typeof address === 'string') {
-    console.error('Failed to allocate a blocker port for reproduction');
-    process.exit(1);
-  }
-
-  const blockedPort = address.port;
-  const startTime = Date.now();
-  let timedOut = false;
-
-  const renderer = spawn('node', [RENDERER_ENTRY], {
-    env: {
-      ...process.env,
-      NODE_ENV: 'test',
-      RENDERER_HOST: '127.0.0.1',
-      RENDERER_PORT: String(blockedPort),
-      RENDERER_WORKERS_COUNT: '2',
-      RENDERER_LOG_LEVEL: 'info',
-      RENDERER_LOG_HTTP_LEVEL: 'error',
-    },
-    stdio: ['ignore', 'pipe', 'pipe'],
-  });
-
-  let output = '';
-  renderer.stdout.on('data', (chunk) => {
-    output += chunk.toString();
-  });
-  renderer.stderr.on('data', (chunk) => {
-    output += chunk.toString();
-  });
-
-  const timeout = setTimeout(() => {
-    timedOut = true;
-    renderer.kill('SIGKILL');
-  }, timeoutMs);
-
-  renderer.on('error', async (err) => {
-    clearTimeout(timeout);
-    await closeBlocker();
-    console.error('Failed to spawn node renderer:', err);
-    process.exit(1);
-  });
-
-  renderer.on('exit', async (code, signal) => {
-    clearTimeout(timeout);
-    await closeBlocker();
-
-    const durationMs = Date.now() - startTime;
-    const restartMessages = (output.match(/died UNEXPECTEDLY :\(, restarting/g) || []).length;
-    const startupFailureMessage = new RegExp(
-      `Node renderer startup failed: port ${blockedPort} is already in use`,
-    ).test(output);
-
-    const checks = {
-      exitedWithoutTimeout: !timedOut,
-      exitCodeIsOne: code === 1,
-      noSignal: signal === null,
-      startupFailureMessage,
-      noUnexpectedRestartLoop: restartMessages === 0,
-    };
-
-    const passed = Object.values(checks).every(Boolean);
-
-    if (passed) {
-      console.log('PASS: startup failure path aborts cleanly with no refork loop.');
-      console.log(
-        JSON.stringify(
-          {
-            blockedPort,
-            durationMs,
-            code,
-            signal,
-            restartMessages,
-          },
-          null,
-          2,
-        ),
-      );
-      process.exit(0);
-    }
-
-    console.error('FAIL: startup failure reproduction did not match expected fixed behavior.');
-    console.error(
-      JSON.stringify(
-        {
-          blockedPort,
-          durationMs,
-          code,
-          signal,
-          timedOut,
-          restartMessages,
-          checks,
-        },
-        null,
-        2,
-      ),
-    );
-    console.error('--- renderer output (first 200 lines) ---');
-    console.error(output.split('\n').slice(0, 200).join('\n'));
-    process.exit(1);
-  });
-});

From 49c0ff0bedc5c4119dcb9db55353bd2441785dc6 Mon Sep 17 00:00:00 2001
From: Yassa-hue
Date: Sun, 29 Mar 2026 21:14:22 +0200
Subject: [PATCH 08/20] Address review feedback: remove dead return, include
 host in error, fix test drift

- Remove unreachable return after process.exit(1) in worker IPC path
- Include host in EADDRINUSE message (e.g. "localhost:3800 is already in use")
"localhost:3800 is already in use") for easier cross-referencing with ss/lsof - Fix handleExit in logic tests to use failedWorkerId matching actual master.ts code Co-Authored-By: Claude Opus 4.6 (1M context) --- packages/react-on-rails-pro-node-renderer/src/master.ts | 2 +- packages/react-on-rails-pro-node-renderer/src/worker.ts | 1 - .../tests/masterStartupFailure.test.ts | 9 +++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/packages/react-on-rails-pro-node-renderer/src/master.ts b/packages/react-on-rails-pro-node-renderer/src/master.ts index 3432365784..0eb0976161 100644 --- a/packages/react-on-rails-pro-node-renderer/src/master.ts +++ b/packages/react-on-rails-pro-node-renderer/src/master.ts @@ -105,7 +105,7 @@ export default function masterRun(runningConfig?: Partial) { const failedWorkerId = fatalStartupFailure?.workerId ?? worker.id; const msg = failure?.code === 'EADDRINUSE' - ? `Node renderer startup failed: port ${failure.port} is already in use` + ? `Node renderer startup failed: ${failure.host}:${failure.port} is already in use` : `Node renderer startup failed in worker ${failedWorkerId}: ${failure?.message || `exit code ${worker.process.exitCode}`}`; errorReporter.message(msg); diff --git a/packages/react-on-rails-pro-node-renderer/src/worker.ts b/packages/react-on-rails-pro-node-renderer/src/worker.ts index 509820fe1a..b9c9c361b3 100644 --- a/packages/react-on-rails-pro-node-renderer/src/worker.ts +++ b/packages/react-on-rails-pro-node-renderer/src/worker.ts @@ -535,7 +535,6 @@ export default function run(config: Partial) { log.warn({ err: sendErr }, 'Failed to send startup failure message to master'); process.exit(1); } - return; } process.exit(1); diff --git a/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts b/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts index ec2823db6e..9882368f8f 100644 --- a/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts +++ b/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts @@ -123,7 +123,7 @@ describe('masterRun wiring', () => { expect(() => mockClusterHandlers.exit(worker)).toThrow('process.exit:1'); expect(mockErrorReporterMessage).toHaveBeenCalledWith( - 'Node renderer startup failed: port 3800 is already in use', + 'Node renderer startup failed: localhost:3800 is already in use', ); expect(processExitSpy).toHaveBeenCalledWith(1); expect(mockFork).toHaveBeenCalledTimes(2); @@ -157,10 +157,11 @@ describe('master startup failure handling', () => { if (isAbortingForStartupFailure) { const failure = fatalStartupFailure?.failure; + const failedWorkerId = fatalStartupFailure?.workerId ?? worker.id; const msg = failure?.code === 'EADDRINUSE' - ? `Node renderer startup failed: port ${failure.port} is already in use` - : `Node renderer startup failed in worker ${worker.id}: ${failure?.message || `exit code ${worker.process.exitCode}`}`; + ? 
`Node renderer startup failed: ${failure.host}:${failure.port} is already in use` + : `Node renderer startup failed in worker ${failedWorkerId}: ${failure?.message || `exit code ${worker.process.exitCode}`}`; reportedMessages.push(msg); exitCode = 1; @@ -193,7 +194,7 @@ describe('master startup failure handling', () => { expect(forkCount).toBe(0); // No refork expect(exitCode).toBe(1); - expect(reportedMessages).toEqual(['Node renderer startup failed: port 3800 is already in use']); + expect(reportedMessages).toEqual(['Node renderer startup failed: localhost:3800 is already in use']); }); it('aborts with generic message on non-EADDRINUSE startup failure', () => { From 9c6aaf0b30b546e80574c3e31e2ba07e6ed2810a Mon Sep 17 00:00:00 2001 From: Justin Gordon Date: Sun, 29 Mar 2026 15:08:50 -1000 Subject: [PATCH 09/20] Prevent startup-failure IPC loss before worker exit --- .../src/master.ts | 1 - .../src/worker.ts | 2 + .../tests/masterStartupFailure.test.ts | 194 ++++++++++-------- 3 files changed, 107 insertions(+), 90 deletions(-) diff --git a/packages/react-on-rails-pro-node-renderer/src/master.ts b/packages/react-on-rails-pro-node-renderer/src/master.ts index 0eb0976161..92d1f2297d 100644 --- a/packages/react-on-rails-pro-node-renderer/src/master.ts +++ b/packages/react-on-rails-pro-node-renderer/src/master.ts @@ -110,7 +110,6 @@ export default function masterRun(runningConfig?: Partial) { errorReporter.message(msg); process.exit(1); - return; } // TODO: Track last rendering request per worker.id diff --git a/packages/react-on-rails-pro-node-renderer/src/worker.ts b/packages/react-on-rails-pro-node-renderer/src/worker.ts index b9c9c361b3..0889347cd1 100644 --- a/packages/react-on-rails-pro-node-renderer/src/worker.ts +++ b/packages/react-on-rails-pro-node-renderer/src/worker.ts @@ -531,9 +531,11 @@ export default function run(config: Partial) { } process.exit(1); }); + return; } catch (sendErr) { log.warn({ err: sendErr }, 'Failed to send startup failure message to master'); process.exit(1); + return; } } diff --git a/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts b/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts index 9882368f8f..844f3eb18c 100644 --- a/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts +++ b/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts @@ -38,96 +38,112 @@ describe('masterRun wiring', () => { jest.resetModules(); }); - it('registers the startup-failure listener before forking and aborts without reforking', () => { - const mockOperations: string[] = []; - const mockClusterHandlers: Record void> = {}; - const mockFork = jest.fn(() => { - mockOperations.push('fork'); - return {}; - }); - const mockCluster = { - on: jest.fn((event: string, handler: (...args: unknown[]) => void) => { - mockOperations.push(`on:${event}`); - mockClusterHandlers[event] = handler; - return mockCluster; + it.each([ + { + testName: 'EADDRINUSE startup failure', + failure: buildStartupFailureMessage(), + expectedMessage: 'Node renderer startup failed: localhost:3800 is already in use', + }, + { + testName: 'generic startup failure', + failure: buildStartupFailureMessage({ + code: 'EACCES', + message: 'listen EACCES: permission denied 0.0.0.0:80', }), - fork: mockFork, - }; - const mockErrorReporterMessage = jest.fn(); - const mockLog = { - info: jest.fn(), - warn: jest.fn(), - error: jest.fn(), - fatal: jest.fn(), - }; - const mockBuildConfig = jest.fn(() => ({ - workersCount: 2, - 
allWorkersRestartInterval: undefined, - delayBetweenIndividualWorkerRestarts: undefined, - gracefulWorkerRestartTimeout: 0, - serverBundleCachePath: '/tmp/react-on-rails-pro-node-renderer-bundles', - })); - const mockLogSanitizedConfig = jest.fn(); - const mockGetLicenseStatus = jest.fn(() => 'valid'); - const setIntervalSpy = jest.spyOn(global, 'setInterval').mockReturnValue(0 as unknown as NodeJS.Timeout); - const processExitSpy = jest.spyOn(process, 'exit').mockImplementation(((code?: number) => { - throw new Error(`process.exit:${code}`); - }) as typeof process.exit); - - jest.doMock('cluster', () => ({ - __esModule: true, - default: mockCluster, - })); - jest.doMock('../src/shared/log', () => ({ - __esModule: true, - default: mockLog, - })); - jest.doMock('../src/shared/errorReporter', () => ({ - __esModule: true, - message: mockErrorReporterMessage, - error: jest.fn(), - addMessageNotifier: jest.fn(), - addErrorNotifier: jest.fn(), - addNotifier: jest.fn(), - })); - jest.doMock('../src/shared/configBuilder', () => ({ - __esModule: true, - buildConfig: mockBuildConfig, - logSanitizedConfig: mockLogSanitizedConfig, - })); - jest.doMock('../src/shared/licenseValidator', () => ({ - __esModule: true, - getLicenseStatus: mockGetLicenseStatus, - })); - jest.doMock('../src/master/restartWorkers', () => ({ - __esModule: true, - default: jest.fn(), - })); - - let masterRun: typeof import('../src/master').default; - jest.isolateModules(() => { - // eslint-disable-next-line global-require - masterRun = require('../src/master').default as typeof import('../src/master').default; - }); - - masterRun(); - - expect(mockOperations).toEqual(['on:message', 'fork', 'fork', 'on:exit']); - expect(mockFork).toHaveBeenCalledTimes(2); - expect(setIntervalSpy).toHaveBeenCalledTimes(1); - - const failure = buildStartupFailureMessage(); - const worker = { id: 1, process: { exitCode: 1 } }; - - mockClusterHandlers.message(worker, failure); - - expect(() => mockClusterHandlers.exit(worker)).toThrow('process.exit:1'); - expect(mockErrorReporterMessage).toHaveBeenCalledWith( - 'Node renderer startup failed: localhost:3800 is already in use', - ); - expect(processExitSpy).toHaveBeenCalledWith(1); - expect(mockFork).toHaveBeenCalledTimes(2); - }); + expectedMessage: + 'Node renderer startup failed in worker 1: listen EACCES: permission denied 0.0.0.0:80', + }, + ])( + 'registers listener before forking and aborts without reforking on $testName', + ({ failure, expectedMessage }) => { + const mockOperations: string[] = []; + const mockClusterHandlers: Record void> = {}; + const mockFork = jest.fn(() => { + mockOperations.push('fork'); + return {}; + }); + const mockCluster = { + on: jest.fn((event: string, handler: (...args: unknown[]) => void) => { + mockOperations.push(`on:${event}`); + mockClusterHandlers[event] = handler; + return mockCluster; + }), + fork: mockFork, + }; + const mockErrorReporterMessage = jest.fn(); + const mockLog = { + info: jest.fn(), + warn: jest.fn(), + error: jest.fn(), + fatal: jest.fn(), + }; + const mockBuildConfig = jest.fn(() => ({ + workersCount: 2, + allWorkersRestartInterval: undefined, + delayBetweenIndividualWorkerRestarts: undefined, + gracefulWorkerRestartTimeout: 0, + serverBundleCachePath: '/tmp/react-on-rails-pro-node-renderer-bundles', + })); + const mockLogSanitizedConfig = jest.fn(); + const mockGetLicenseStatus = jest.fn(() => 'valid'); + const setIntervalSpy = jest + .spyOn(global, 'setInterval') + .mockReturnValue(0 as unknown as NodeJS.Timeout); + const processExitSpy = 
jest.spyOn(process, 'exit').mockImplementation(((code?: number) => { + throw new Error(`process.exit:${code}`); + }) as typeof process.exit); + + jest.doMock('cluster', () => ({ + __esModule: true, + default: mockCluster, + })); + jest.doMock('../src/shared/log', () => ({ + __esModule: true, + default: mockLog, + })); + jest.doMock('../src/shared/errorReporter', () => ({ + __esModule: true, + message: mockErrorReporterMessage, + error: jest.fn(), + addMessageNotifier: jest.fn(), + addErrorNotifier: jest.fn(), + addNotifier: jest.fn(), + })); + jest.doMock('../src/shared/configBuilder', () => ({ + __esModule: true, + buildConfig: mockBuildConfig, + logSanitizedConfig: mockLogSanitizedConfig, + })); + jest.doMock('../src/shared/licenseValidator', () => ({ + __esModule: true, + getLicenseStatus: mockGetLicenseStatus, + })); + jest.doMock('../src/master/restartWorkers', () => ({ + __esModule: true, + default: jest.fn(), + })); + + let masterRun: typeof import('../src/master').default; + jest.isolateModules(() => { + // eslint-disable-next-line global-require + masterRun = require('../src/master').default as typeof import('../src/master').default; + }); + + masterRun(); + + expect(mockOperations).toEqual(['on:message', 'fork', 'fork', 'on:exit']); + expect(mockFork).toHaveBeenCalledTimes(2); + expect(setIntervalSpy).toHaveBeenCalledTimes(1); + + const worker = { id: 1, process: { exitCode: 1 } }; + mockClusterHandlers.message(worker, failure); + + expect(() => mockClusterHandlers.exit(worker)).toThrow('process.exit:1'); + expect(mockErrorReporterMessage).toHaveBeenCalledWith(expectedMessage); + expect(processExitSpy).toHaveBeenCalledWith(1); + expect(mockFork).toHaveBeenCalledTimes(2); + }, + ); }); describe('master startup failure handling', () => { From 05851ed451e99abfded96729d25e3484945dd95e Mon Sep 17 00:00:00 2001 From: Justin Gordon Date: Sun, 29 Mar 2026 19:54:10 -1000 Subject: [PATCH 10/20] Harden startup-failure abort path and align tests with production wiring --- .../src/master.ts | 12 +- .../src/worker.ts | 82 ++-- .../tests/masterStartupFailure.test.ts | 387 ++++++++---------- .../tests/workerStartupFailure.test.ts | 116 +++--- 4 files changed, 288 insertions(+), 309 deletions(-) diff --git a/packages/react-on-rails-pro-node-renderer/src/master.ts b/packages/react-on-rails-pro-node-renderer/src/master.ts index 92d1f2297d..64e00717a5 100644 --- a/packages/react-on-rails-pro-node-renderer/src/master.ts +++ b/packages/react-on-rails-pro-node-renderer/src/master.ts @@ -100,9 +100,8 @@ export default function masterRun(runningConfig?: Partial) { return; } - if (isAbortingForStartupFailure) { - const failure = fatalStartupFailure?.failure; - const failedWorkerId = fatalStartupFailure?.workerId ?? worker.id; + if (isAbortingForStartupFailure && fatalStartupFailure) { + const { failure, workerId: failedWorkerId } = fatalStartupFailure; const msg = failure?.code === 'EADDRINUSE' ? 
          ? `Node renderer startup failed: ${failure.host}:${failure.port} is already in use`
@@ -110,7 +109,14 @@ export default function masterRun(runningConfig?: Partial<Config>) {
 
       errorReporter.message(msg);
       process.exit(1);
+      return;
+    }
+
+    if (isAbortingForStartupFailure) {
+      errorReporter.message('Node renderer startup failed before failure details were captured');
+      process.exit(1);
+      return;
     }
 
     // TODO: Track last rendering request per worker.id

diff --git a/packages/react-on-rails-pro-node-renderer/src/worker.ts b/packages/react-on-rails-pro-node-renderer/src/worker.ts
index 0889347cd1..6cc8615664 100644
--- a/packages/react-on-rails-pro-node-renderer/src/worker.ts
+++ b/packages/react-on-rails-pro-node-renderer/src/worker.ts
@@ -202,6 +202,57 @@ const extractBodyArrayField = (
   return undefined;
 };
 
+type StartupListenErrorHandlerOptions = {
+  err: Error;
+  host: string;
+  port: number;
+  isWorker?: boolean;
+  send?: NodeJS.Process['send'];
+  exit?: NodeJS.Process['exit'];
+};
+
+export function handleStartupListenError({
+  err,
+  host,
+  port,
+  isWorker = cluster.isWorker,
+  send,
+  exit,
+}: StartupListenErrorHandlerOptions) {
+  const sendFn = send ?? process.send?.bind(process);
+  const exitFn = exit ?? ((code?: number) => process.exit(code));
+
+  log.error({ err, host, port }, 'Node renderer failed to start');
+
+  if (isWorker && sendFn) {
+    const startupFailure: WorkerStartupFailureMessage = {
+      type: WORKER_STARTUP_FAILURE,
+      stage: 'listen',
+      code: (err as NodeJS.ErrnoException).code,
+      errno: (err as NodeJS.ErrnoException).errno,
+      syscall: (err as NodeJS.ErrnoException).syscall,
+      host,
+      port,
+      message: err.message,
+    };
+    try {
+      sendFn(startupFailure, undefined, undefined, (sendErr) => {
+        if (sendErr) {
+          log.warn({ err: sendErr }, 'Failed to send startup failure message to master');
+        }
+        exitFn(1);
+      });
+      return;
+    } catch (sendErr) {
+      log.warn({ err: sendErr }, 'Failed to send startup failure message to master');
+      exitFn(1);
+      return;
+    }
+  }
+
+  exitFn(1);
+}
+
 export default function run(config: Partial<Config>) {
   // Store config in app state. From now it can be loaded by any module using
   // getConfig():
@@ -511,35 +562,8 @@ export default function run(config: Partial<Config>) {
   if (workersCount === 0 || cluster.isWorker) {
     app.listen({ port, host }, (err, address) => {
       if (err) {
-        log.error({ err, host, port }, 'Node renderer failed to start');
-
-        if (cluster.isWorker && process.send) {
-          const startupFailure: WorkerStartupFailureMessage = {
-            type: WORKER_STARTUP_FAILURE,
-            stage: 'listen',
-            code: (err as NodeJS.ErrnoException).code,
-            errno: (err as NodeJS.ErrnoException).errno,
-            syscall: (err as NodeJS.ErrnoException).syscall,
-            host,
-            port,
-            message: err.message,
-          };
-          try {
-            process.send(startupFailure, undefined, undefined, (sendErr) => {
-              if (sendErr) {
-                log.warn({ err: sendErr }, 'Failed to send startup failure message to master');
-              }
-              process.exit(1);
-            });
-            return;
-          } catch (sendErr) {
-            log.warn({ err: sendErr }, 'Failed to send startup failure message to master');
-            process.exit(1);
-            return;
-          }
-        }
-
-        process.exit(1);
+        handleStartupListenError({ err, host, port });
+        return;
       }
       const workerName = worker ? `worker #${worker.id}` : 'master (single-process)';
       log.info({ workerName, address }, 'Node renderer listening');

diff --git a/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts b/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts
index 844f3eb18c..60a190e815 100644
--- a/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts
+++ b/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts
@@ -1,20 +1,20 @@
-import {
-  WORKER_STARTUP_FAILURE,
-  isWorkerStartupFailureMessage,
-  type WorkerStartupFailureMessage,
-} from '../src/shared/workerMessages';
+import { WORKER_STARTUP_FAILURE, type WorkerStartupFailureMessage } from '../src/shared/workerMessages';
 
-/**
- * These tests verify that the master process correctly handles the
- * WORKER_STARTUP_FAILURE IPC message by aborting instead of reforking,
- * while preserving the existing behavior for scheduled restarts and
- * runtime crashes.
- *
- * Most cases exercise the decision logic in isolation by simulating the
- * cluster events that the master process listens to. We also keep one thin
- * wiring test that runs masterRun() against a mocked cluster so regressions
- * in the actual listener registration path are caught too.
- */
+type MockWorker = {
+  id: number;
+  isScheduledRestart?: boolean;
+  process: { exitCode: number | null };
+};
+
+type ClusterHandlers = {
+  message: (worker: MockWorker, message: unknown) => void;
+  exit: (worker: MockWorker) => void;
+};
+
+type MockCluster = {
+  on: jest.Mock<MockCluster, [string, (...args: unknown[]) => void]>;
+  fork: jest.Mock;
+};
 
 function buildStartupFailureMessage(
   overrides: Partial<WorkerStartupFailureMessage> = {},
@@ -30,7 +30,102 @@ function buildStartupFailureMessage(
   };
 }
 
+function setupMasterRunHarness() {
+  const operations: string[] = [];
+  const clusterHandlers: Partial<ClusterHandlers> = {};
+  const mockFork = jest.fn(() => {
+    operations.push('fork');
+    return {};
+  });
+  const mockCluster = {} as MockCluster;
+  mockCluster.on = jest.fn((event: string, handler: (...args: unknown[]) => void) => {
+    operations.push(`on:${event}`);
+    if (event === 'message') {
+      clusterHandlers.message = handler as ClusterHandlers['message'];
+    } else if (event === 'exit') {
+      clusterHandlers.exit = handler as ClusterHandlers['exit'];
+    }
+    return mockCluster;
+  });
+  mockCluster.fork = mockFork;
+  const mockErrorReporterMessage = jest.fn();
+  const mockLog = {
+    info: jest.fn(),
+    warn: jest.fn(),
+    error: jest.fn(),
+    fatal: jest.fn(),
+  };
+  const mockBuildConfig = jest.fn(() => ({
+    workersCount: 2,
+    allWorkersRestartInterval: undefined,
+    delayBetweenIndividualWorkerRestarts: undefined,
+    gracefulWorkerRestartTimeout: 0,
+    serverBundleCachePath: '/tmp/react-on-rails-pro-node-renderer-bundles',
+  }));
+  const mockLogSanitizedConfig = jest.fn();
+  const mockGetLicenseStatus = jest.fn(() => 'valid');
+  const setIntervalSpy = jest.spyOn(global, 'setInterval').mockReturnValue(0 as unknown as NodeJS.Timeout);
+  const processExitSpy = jest.spyOn(process, 'exit').mockImplementation(((code?: number) => {
+    throw new Error(`process.exit:${code}`);
+  }) as typeof process.exit);
+
+  jest.doMock('cluster', () => ({
+    __esModule: true,
+    default: mockCluster,
+  }));
+  jest.doMock('../src/shared/log', () => ({
+    __esModule: true,
+    default: mockLog,
+  }));
+  jest.doMock('../src/shared/errorReporter', () => ({
+    __esModule: true,
+    message: mockErrorReporterMessage,
+    error: jest.fn(),
+    addMessageNotifier: jest.fn(),
+    addErrorNotifier: jest.fn(),
+    addNotifier: jest.fn(),
+  }));
+  jest.doMock('../src/shared/configBuilder', () => ({
+    __esModule: true,
+    buildConfig: mockBuildConfig,
+    logSanitizedConfig: mockLogSanitizedConfig,
+  }));
+  jest.doMock('../src/shared/licenseValidator', () => ({
+    __esModule: true,
+    getLicenseStatus: mockGetLicenseStatus,
+  }));
+  jest.doMock('../src/master/restartWorkers', () => ({
+    __esModule: true,
+    default: jest.fn(),
+  }));
+
+  let masterRun: typeof import('../src/master').default | undefined;
+  jest.isolateModules(() => {
+    // eslint-disable-next-line global-require
+    masterRun = require('../src/master').default as typeof import('../src/master').default;
+  });
+
+  if (!masterRun) {
+    throw new Error('Failed to load masterRun');
+  }
+
+  masterRun();
+
+  if (!clusterHandlers.message || !clusterHandlers.exit) {
+    throw new Error('Failed to register cluster handlers');
+  }
+
+  return {
+    operations,
+    clusterHandlers: clusterHandlers as ClusterHandlers,
+    mockFork,
+    mockErrorReporterMessage,
+    setIntervalSpy,
+    processExitSpy,
+  };
+}
+
-describe('masterRun wiring', () => {
+describe('master startup failure handling via masterRun wiring', () => {
   afterEach(() => {
     jest.restoreAllMocks();
     jest.resetModules();
@@ -41,96 +136,90 @@ describe('masterRun wiring', () => {
   it.each([
     {
       testName: 'EADDRINUSE startup failure',
+      failureWorker: { id: 1, process: { exitCode: 1 } },
+      exitingWorker: { id: 1, process: { exitCode: 1 } },
       failure: buildStartupFailureMessage(),
       expectedMessage: 'Node renderer startup failed: localhost:3800 is already in use',
     },
     {
-      testName: 'generic startup failure',
+      testName: 'generic startup failure from one worker while another exits first',
+      failureWorker: { id: 2, process: { exitCode: 1 } },
+      exitingWorker: { id: 1, process: { exitCode: 1 } },
       failure: buildStartupFailureMessage({
         code: 'EACCES',
         message: 'listen EACCES: permission denied 0.0.0.0:80',
       }),
       expectedMessage:
-        'Node renderer startup failed in worker 1: listen EACCES: permission denied 0.0.0.0:80',
+        'Node renderer startup failed in worker 2: listen EACCES: permission denied 0.0.0.0:80',
     },
-  ])(
-    'registers listener before forking and aborts without reforking on $testName',
-    ({ failure, expectedMessage }) => {
+  ])('registers listeners before forking and aborts without reforking on $testName', (scenario) => {
+    const harness = setupMasterRunHarness();
+
+    expect(harness.operations).toEqual(['on:message', 'fork', 'fork', 'on:exit']);
+    expect(harness.mockFork).toHaveBeenCalledTimes(2);
+    expect(harness.setIntervalSpy).toHaveBeenCalledTimes(1);
+
+    harness.clusterHandlers.message(scenario.failureWorker, scenario.failure);

-describe('master startup failure handling', () => {
-  let isAbortingForStartupFailure: boolean;
-  let fatalStartupFailure: { workerId: number; failure: WorkerStartupFailureMessage } | null;
-  let forkCount: number;
-  let exitCode: number | null;
-  let reportedMessages: string[];
-
-  // Simulated handlers matching master.ts logic
-  function handleMessage(worker: { id: number }, message: unknown) {
-    if (!isWorkerStartupFailureMessage(message) || isAbortingForStartupFailure) return;
-
-    isAbortingForStartupFailure = true;
-    fatalStartupFailure = { workerId: worker.id, failure: message };
-  }
-
-  function handleExit(worker: {
-    id: number;
-    isScheduledRestart?: boolean;
-    process: { exitCode: number | null };
-  }) {
-    if (worker.isScheduledRestart) {
-      forkCount += 1; // cluster.fork()
-      return;
-    }
-
-    if (isAbortingForStartupFailure) {
-      const failure = fatalStartupFailure?.failure;
-      const failedWorkerId = fatalStartupFailure?.workerId ?? worker.id;
-      const msg =
-        failure?.code === 'EADDRINUSE'
-          ?
`Node renderer startup failed: ${failure.host}:${failure.port} is already in use` - : `Node renderer startup failed in worker ${failedWorkerId}: ${failure?.message || `exit code ${worker.process.exitCode}`}`; - - reportedMessages.push(msg); - exitCode = 1; - return; - } - - const msg = `Worker ${worker.id} died UNEXPECTEDLY :(, restarting`; - reportedMessages.push(msg); - forkCount += 1; // cluster.fork() - } - - beforeEach(() => { - isAbortingForStartupFailure = false; - fatalStartupFailure = null; - forkCount = 0; - exitCode = null; - reportedMessages = []; + expect(() => harness.clusterHandlers.exit(scenario.exitingWorker)).toThrow('process.exit:1'); + expect(harness.mockErrorReporterMessage).toHaveBeenCalledWith(scenario.expectedMessage); + expect(harness.processExitSpy).toHaveBeenCalledWith(1); + expect(harness.mockFork).toHaveBeenCalledTimes(2); }); - it('aborts with clear message on EADDRINUSE startup failure', () => { - const worker = { id: 1, process: { exitCode: 1 } }; - const failure = buildStartupFailureMessage(); - - // Worker sends startup failure message - handleMessage(worker, failure); - expect(isAbortingForStartupFailure).toBe(true); - - // Worker exits - handleExit(worker); - - expect(forkCount).toBe(0); // No refork - expect(exitCode).toBe(1); - expect(reportedMessages).toEqual(['Node renderer startup failed: localhost:3800 is already in use']); - }); - - it('aborts with generic message on non-EADDRINUSE startup failure', () => { - const worker = { id: 2, process: { exitCode: 1 } }; - const failure = buildStartupFailureMessage({ + it('keeps the first startup-failure details when multiple workers report failures', () => { + const harness = setupMasterRunHarness(); + const firstFailure = buildStartupFailureMessage({ code: 'EACCES', message: 'listen EACCES: permission denied 0.0.0.0:80', }); + const secondFailure = buildStartupFailureMessage({ + code: 'ECONNREFUSED', + message: 'listen ECONNREFUSED: connection refused 127.0.0.1:3800', + }); - handleMessage(worker, failure); - handleExit(worker); - - expect(forkCount).toBe(0); - expect(exitCode).toBe(1); - expect(reportedMessages).toEqual([ - 'Node renderer startup failed in worker 2: listen EACCES: permission denied 0.0.0.0:80', - ]); - }); - - it('deduplicates multiple workers failing simultaneously', () => { - const worker1 = { id: 1, process: { exitCode: 1 } }; - const worker2 = { id: 2, process: { exitCode: 1 } }; - const failure = buildStartupFailureMessage(); - - // Both workers send failure messages - handleMessage(worker1, failure); - handleMessage(worker2, failure); - - // Only the first one is recorded - expect(fatalStartupFailure?.workerId).toBe(1); + harness.clusterHandlers.message({ id: 1, process: { exitCode: 1 } }, firstFailure); + harness.clusterHandlers.message({ id: 2, process: { exitCode: 1 } }, secondFailure); - // First worker exit triggers abort - handleExit(worker1); - expect(exitCode).toBe(1); - expect(forkCount).toBe(0); + expect(() => harness.clusterHandlers.exit({ id: 2, process: { exitCode: 1 } })).toThrow('process.exit:1'); + expect(harness.mockErrorReporterMessage).toHaveBeenCalledWith( + 'Node renderer startup failed in worker 1: listen EACCES: permission denied 0.0.0.0:80', + ); }); - it('reforks on scheduled restart', () => { - const worker = { id: 1, isScheduledRestart: true, process: { exitCode: 0 } }; + it('restarts scheduled-restart workers without reporting an error', () => { + const harness = setupMasterRunHarness(); + const worker: MockWorker = { id: 1, isScheduledRestart: true, 
process: { exitCode: 0 } }; - handleExit(worker); + harness.clusterHandlers.exit(worker); - expect(forkCount).toBe(1); - expect(exitCode).toBeNull(); - expect(reportedMessages).toHaveLength(0); + expect(harness.mockFork).toHaveBeenCalledTimes(3); + expect(harness.mockErrorReporterMessage).not.toHaveBeenCalled(); + expect(harness.processExitSpy).not.toHaveBeenCalled(); }); - it('reforks on unexpected runtime crash without startup failure', () => { - const worker = { id: 3, process: { exitCode: 1 } }; + it('reforks on unexpected runtime crash when no startup failure was received', () => { + const harness = setupMasterRunHarness(); - // No startup failure message sent — this is a runtime crash - handleExit(worker); + harness.clusterHandlers.exit({ id: 3, process: { exitCode: 1 } }); - expect(forkCount).toBe(1); - expect(exitCode).toBeNull(); - expect(reportedMessages).toEqual(['Worker 3 died UNEXPECTEDLY :(, restarting']); + expect(harness.mockErrorReporterMessage).toHaveBeenCalledWith( + 'Worker 3 died UNEXPECTEDLY :(, restarting', + ); + expect(harness.mockFork).toHaveBeenCalledTimes(3); + expect(harness.processExitSpy).not.toHaveBeenCalled(); }); - it('ignores non-startup-failure messages', () => { - const worker = { id: 1 }; + it('ignores malformed startup-failure messages and treats exit as runtime crash', () => { + const harness = setupMasterRunHarness(); - handleMessage(worker, { type: 'SOME_OTHER_MESSAGE' }); - handleMessage(worker, 'a string message'); - handleMessage(worker, null); + harness.clusterHandlers.message({ id: 1, process: { exitCode: 1 } }, { type: WORKER_STARTUP_FAILURE }); + harness.clusterHandlers.exit({ id: 1, process: { exitCode: 1 } }); - expect(isAbortingForStartupFailure).toBe(false); - expect(fatalStartupFailure).toBeNull(); + expect(harness.mockErrorReporterMessage).toHaveBeenCalledWith( + 'Worker 1 died UNEXPECTEDLY :(, restarting', + ); + expect(harness.mockFork).toHaveBeenCalledTimes(3); + expect(harness.processExitSpy).not.toHaveBeenCalled(); }); }); diff --git a/packages/react-on-rails-pro-node-renderer/tests/workerStartupFailure.test.ts b/packages/react-on-rails-pro-node-renderer/tests/workerStartupFailure.test.ts index 4e5b7d185e..5e4cc9cbc3 100644 --- a/packages/react-on-rails-pro-node-renderer/tests/workerStartupFailure.test.ts +++ b/packages/react-on-rails-pro-node-renderer/tests/workerStartupFailure.test.ts @@ -1,9 +1,9 @@ -import cluster from 'cluster'; import { WORKER_STARTUP_FAILURE, isWorkerStartupFailureMessage, type WorkerStartupFailureMessage, } from '../src/shared/workerMessages'; +import { handleStartupListenError } from '../src/worker'; describe('isWorkerStartupFailureMessage', () => { it('returns true for a valid startup failure message', () => { @@ -37,82 +37,88 @@ describe('isWorkerStartupFailureMessage', () => { }); }); -describe('worker startup failure IPC', () => { - const originalSend = process.send; - const originalExit = process.exit; - const originalIsWorker = cluster.isWorker; +describe('worker startup listen error handling', () => { + const buildListenError = () => + Object.assign(new Error('listen EADDRINUSE: address already in use :::3800'), { + code: 'EADDRINUSE', + errno: -48, + syscall: 'listen', + }); afterEach(() => { - process.send = originalSend; - process.exit = originalExit; - Object.defineProperty(cluster, 'isWorker', { value: originalIsWorker, writable: true }); + jest.restoreAllMocks(); }); - it('sends WORKER_STARTUP_FAILURE message in clustered mode', () => { - // Simulate a cluster worker environment - 
Object.defineProperty(cluster, 'isWorker', { value: true, writable: true }); + it('sends WORKER_STARTUP_FAILURE message in clustered mode via the production handler', () => { const sentMessages: unknown[] = []; const exitCalls: number[] = []; - - process.send = ((msg: unknown, _handle?: unknown, _options?: unknown, callback?: () => void) => { + const send = ((msg: unknown, _handle?: unknown, _options?: unknown, callback?: () => void) => { sentMessages.push(msg); - if (callback) callback(); + callback?.(); return true; - }) as typeof process.send; - - process.exit = ((code?: number) => { + }) as NodeJS.Process['send']; + const exit = ((code?: number) => { exitCalls.push(code ?? 0); - }) as typeof process.exit; + }) as NodeJS.Process['exit']; - // Simulate what the worker does on listen error - const err = Object.assign(new Error('listen EADDRINUSE: address already in use :::3800'), { - code: 'EADDRINUSE', - errno: -48, - syscall: 'listen', + handleStartupListenError({ + err: buildListenError(), + host: 'localhost', + port: 3800, + isWorker: true, + send, + exit, }); - const host = 'localhost'; - const port = 3800; - const startupFailure: WorkerStartupFailureMessage = { + expect(sentMessages).toHaveLength(1); + expect(isWorkerStartupFailureMessage(sentMessages[0])).toBe(true); + expect(sentMessages[0]).toMatchObject({ type: WORKER_STARTUP_FAILURE, stage: 'listen', - code: err.code, - errno: err.errno, - syscall: err.syscall, - host, - port, - message: err.message, - }; + host: 'localhost', + port: 3800, + code: 'EADDRINUSE', + }); + expect(exitCalls).toEqual([1]); + }); - process.send!(startupFailure, undefined, undefined, () => { - process.exit(1); + it('exits without IPC in single-process mode via the production handler', () => { + const send = jest.fn(); + const exitCalls: number[] = []; + const exit = ((code?: number) => { + exitCalls.push(code ?? 0); + }) as NodeJS.Process['exit']; + + handleStartupListenError({ + err: buildListenError(), + host: 'localhost', + port: 3800, + isWorker: false, + send: send as unknown as NodeJS.Process['send'], + exit, }); - expect(sentMessages).toHaveLength(1); - expect(isWorkerStartupFailureMessage(sentMessages[0])).toBe(true); - expect((sentMessages[0] as WorkerStartupFailureMessage).code).toBe('EADDRINUSE'); + expect(send).not.toHaveBeenCalled(); expect(exitCalls).toEqual([1]); }); - it('exits without IPC in single-process mode', () => { - Object.defineProperty(cluster, 'isWorker', { value: false, writable: true }); - process.send = undefined; - + it('exits when process.send throws synchronously', () => { + const send = (() => { + throw new Error('ERR_IPC_CHANNEL_CLOSED'); + }) as NodeJS.Process['send']; const exitCalls: number[] = []; - process.exit = ((code?: number) => { + const exit = ((code?: number) => { exitCalls.push(code ?? 
0); - }) as typeof process.exit; - - // In single-process mode, cluster.isWorker is false and process.send is undefined - if (cluster.isWorker && process.send) { - // Should not reach here - process.send({ type: WORKER_STARTUP_FAILURE }, undefined, undefined, () => { - process.exit(1); - }); - return; - } - - process.exit(1); + }) as NodeJS.Process['exit']; + + handleStartupListenError({ + err: buildListenError(), + host: 'localhost', + port: 3800, + isWorker: true, + send, + exit, + }); expect(exitCalls).toEqual([1]); }); From 7bd1ee9880433b6cba9cc7e626d842d4bd8ba63c Mon Sep 17 00:00:00 2001 From: Justin Gordon Date: Mon, 30 Mar 2026 22:31:08 -1000 Subject: [PATCH 11/20] Address review: disconnect workers on abort, warn on missing IPC, remove dead code - Disconnect all live workers before process.exit(1) so they release ports instead of becoming orphans - Log a warning when a cluster worker has no IPC channel and cannot notify master of startup failure - Remove unreachable second isAbortingForStartupFailure branch - Remove misleading optional chaining where failure is known non-null - Add test for isWorker=true with no IPC channel Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/master.ts | 13 ++++----- .../src/worker.ts | 4 +++ .../tests/masterStartupFailure.test.ts | 4 +++ .../tests/workerStartupFailure.test.ts | 27 +++++++++++++++++++ 4 files changed, 40 insertions(+), 8 deletions(-) diff --git a/packages/react-on-rails-pro-node-renderer/src/master.ts b/packages/react-on-rails-pro-node-renderer/src/master.ts index 64e00717a5..4d55c410b6 100644 --- a/packages/react-on-rails-pro-node-renderer/src/master.ts +++ b/packages/react-on-rails-pro-node-renderer/src/master.ts @@ -103,17 +103,14 @@ export default function masterRun(runningConfig?: Partial) { if (isAbortingForStartupFailure && fatalStartupFailure) { const { failure, workerId: failedWorkerId } = fatalStartupFailure; const msg = - failure?.code === 'EADDRINUSE' + failure.code === 'EADDRINUSE' ? `Node renderer startup failed: ${failure.host}:${failure.port} is already in use` - : `Node renderer startup failed in worker ${failedWorkerId}: ${failure?.message || `exit code ${worker.process.exitCode}`}`; + : `Node renderer startup failed in worker ${failedWorkerId}: ${failure.message || `exit code ${worker.process.exitCode}`}`; errorReporter.message(msg); - process.exit(1); - return; - } - - if (isAbortingForStartupFailure) { - errorReporter.message('Node renderer startup failed before failure details were captured'); + // Disconnect all live workers so they release their ports before the master exits. + // Without this, successfully-started workers become orphans still holding the port. 
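+      // cluster.disconnect() asks every live worker to close its servers and drop its IPC channel.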
+ cluster.disconnect(); process.exit(1); return; } diff --git a/packages/react-on-rails-pro-node-renderer/src/worker.ts b/packages/react-on-rails-pro-node-renderer/src/worker.ts index 6cc8615664..ea0a58243f 100644 --- a/packages/react-on-rails-pro-node-renderer/src/worker.ts +++ b/packages/react-on-rails-pro-node-renderer/src/worker.ts @@ -224,6 +224,10 @@ export function handleStartupListenError({ log.error({ err, host, port }, 'Node renderer failed to start'); + if (isWorker && !sendFn) { + log.warn('Cluster worker has no IPC channel; cannot notify master of startup failure'); + } + if (isWorker && sendFn) { const startupFailure: WorkerStartupFailureMessage = { type: WORKER_STARTUP_FAILURE, diff --git a/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts b/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts index 60a190e815..69e533e74b 100644 --- a/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts +++ b/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts @@ -14,6 +14,7 @@ type ClusterHandlers = { type MockCluster = { on: jest.Mock void]>; fork: jest.Mock; + disconnect: jest.Mock; }; function buildStartupFailureMessage( @@ -40,6 +41,7 @@ function setupMasterRunHarness() { return {}; }); const mockCluster = {} as MockCluster; + mockCluster.disconnect = jest.fn(); mockCluster.on = jest.fn((event: string, handler: (...args: unknown[]) => void) => { operations.push(`on:${event}`); if (event === 'message') { @@ -121,6 +123,7 @@ function setupMasterRunHarness() { operations, clusterHandlers: clusterHandlers as ClusterHandlers, mockFork, + mockCluster, mockErrorReporterMessage, setIntervalSpy, processExitSpy, @@ -163,6 +166,7 @@ describe('master startup failure handling via masterRun wiring', () => { expect(() => harness.clusterHandlers.exit(scenario.exitingWorker)).toThrow('process.exit:1'); expect(harness.mockErrorReporterMessage).toHaveBeenCalledWith(scenario.expectedMessage); + expect(harness.mockCluster.disconnect).toHaveBeenCalledTimes(1); expect(harness.processExitSpy).toHaveBeenCalledWith(1); expect(harness.mockFork).toHaveBeenCalledTimes(2); }); diff --git a/packages/react-on-rails-pro-node-renderer/tests/workerStartupFailure.test.ts b/packages/react-on-rails-pro-node-renderer/tests/workerStartupFailure.test.ts index 5e4cc9cbc3..ddff10f8c6 100644 --- a/packages/react-on-rails-pro-node-renderer/tests/workerStartupFailure.test.ts +++ b/packages/react-on-rails-pro-node-renderer/tests/workerStartupFailure.test.ts @@ -102,6 +102,33 @@ describe('worker startup listen error handling', () => { expect(exitCalls).toEqual([1]); }); + it('logs a warning and exits when cluster worker has no IPC channel', () => { + // Temporarily remove process.send so the fallback `send ?? process.send?.bind(process)` + // resolves to undefined, simulating a worker whose IPC channel has been destroyed. + const originalSend = process.send; + delete (process as { send?: typeof process.send }).send; + + const exitCalls: number[] = []; + const exit = ((code?: number) => { + exitCalls.push(code ?? 
0); + }) as NodeJS.Process['exit']; + + try { + handleStartupListenError({ + err: buildListenError(), + host: 'localhost', + port: 3800, + isWorker: true, + send: undefined, + exit, + }); + + expect(exitCalls).toEqual([1]); + } finally { + process.send = originalSend; + } + }); + it('exits when process.send throws synchronously', () => { const send = (() => { throw new Error('ERR_IPC_CHANNEL_CLOSED'); From fd704f06e4b68dad946adfa107267b7613eff401 Mon Sep 17 00:00:00 2001 From: Justin Gordon Date: Mon, 30 Mar 2026 22:45:41 -1000 Subject: [PATCH 12/20] Address review: async disconnect, IPC timeout fallback, abort-first exit handling - Make cluster.disconnect() wait via callback before process.exit(1) so workers actually release their ports before the master exits - Add 2s setTimeout fallback with .unref() in worker IPC send path to prevent indefinite hang when the IPC callback is never invoked - Move startup-failure abort check before scheduled-restart check in the exit handler so no workers are reforked once abort is decided - Guard against duplicate errorReporter.message calls when multiple workers exit during the abort sequence - Tighten port validation in type guard (integer, 0-65535 range) - Reorder message handler condition to short-circuit the type-guard when already aborting - Add tests for IPC timeout fallback, duplicate exit guard, scheduled restart during abort, and port validation edge cases Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/master.ts | 44 ++++++--- .../src/shared/workerMessages.ts | 4 +- .../src/worker.ts | 19 ++-- .../tests/masterStartupFailure.test.ts | 39 +++++++- .../tests/workerStartupFailure.test.ts | 98 +++++++++++++++++++ 5 files changed, 179 insertions(+), 25 deletions(-) diff --git a/packages/react-on-rails-pro-node-renderer/src/master.ts b/packages/react-on-rails-pro-node-renderer/src/master.ts index 4d55c410b6..0d018681e9 100644 --- a/packages/react-on-rails-pro-node-renderer/src/master.ts +++ b/packages/react-on-rails-pro-node-renderer/src/master.ts @@ -80,9 +80,12 @@ export default function masterRun(runningConfig?: Partial) { let isAbortingForStartupFailure = false; let fatalStartupFailure: { workerId: number; failure: WorkerStartupFailureMessage } | null = null; + let hasInitiatedShutdown = false; cluster.on('message', (worker, message) => { - if (!isWorkerStartupFailureMessage(message) || isAbortingForStartupFailure) return; + // Check the abort flag first to short-circuit the type-guard on every + // ordinary IPC message once we are already aborting. + if (isAbortingForStartupFailure || !isWorkerStartupFailureMessage(message)) return; isAbortingForStartupFailure = true; fatalStartupFailure = { workerId: worker.id, failure: message }; @@ -94,24 +97,33 @@ export default function masterRun(runningConfig?: Partial) { // Listen for dying workers: cluster.on('exit', (worker) => { - if (worker.isScheduledRestart) { - log.info('Restarting worker #%d on schedule', worker.id); - cluster.fork(); + // Once a startup failure has been detected, abort regardless of whether + // this particular exit was from the failing worker, a scheduled restart, + // or an unrelated crash. Don't fork any more workers. + if (isAbortingForStartupFailure && fatalStartupFailure) { + if (!hasInitiatedShutdown) { + hasInitiatedShutdown = true; + // Note: the exiting worker may differ from the one that sent the + // failure message if multiple workers exit in rapid succession. + // We always report the first failure received. 
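+        // EADDRINUSE gets a dedicated message naming the contested host:port;
+        // every other failure code falls through to the generic per-worker message below.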
+ const { failure, workerId: failedWorkerId } = fatalStartupFailure; + const msg = + failure.code === 'EADDRINUSE' + ? `Node renderer startup failed: ${failure.host}:${failure.port} is already in use` + : `Node renderer startup failed in worker ${failedWorkerId}: ${failure.message || `exit code ${worker.process.exitCode}`}`; + + errorReporter.message(msg); + // Disconnect all live workers so they release their ports before the + // master exits. cluster.disconnect() is async — pass process.exit as + // the callback so we wait for workers to actually disconnect. + cluster.disconnect(() => process.exit(1)); + } return; } - if (isAbortingForStartupFailure && fatalStartupFailure) { - const { failure, workerId: failedWorkerId } = fatalStartupFailure; - const msg = - failure.code === 'EADDRINUSE' - ? `Node renderer startup failed: ${failure.host}:${failure.port} is already in use` - : `Node renderer startup failed in worker ${failedWorkerId}: ${failure.message || `exit code ${worker.process.exitCode}`}`; - - errorReporter.message(msg); - // Disconnect all live workers so they release their ports before the master exits. - // Without this, successfully-started workers become orphans still holding the port. - cluster.disconnect(); - process.exit(1); + if (worker.isScheduledRestart) { + log.info('Restarting worker #%d on schedule', worker.id); + cluster.fork(); return; } diff --git a/packages/react-on-rails-pro-node-renderer/src/shared/workerMessages.ts b/packages/react-on-rails-pro-node-renderer/src/shared/workerMessages.ts index 8bd66214a9..8ba0e10677 100644 --- a/packages/react-on-rails-pro-node-renderer/src/shared/workerMessages.ts +++ b/packages/react-on-rails-pro-node-renderer/src/shared/workerMessages.ts @@ -23,7 +23,9 @@ export function isWorkerStartupFailureMessage(value: unknown): value is WorkerSt message.stage === 'listen' && typeof message.host === 'string' && typeof message.port === 'number' && - !Number.isNaN(message.port) && + Number.isInteger(message.port) && + message.port >= 0 && + message.port <= 65535 && typeof message.message === 'string' ); } diff --git a/packages/react-on-rails-pro-node-renderer/src/worker.ts b/packages/react-on-rails-pro-node-renderer/src/worker.ts index ea0a58243f..b3e532d573 100644 --- a/packages/react-on-rails-pro-node-renderer/src/worker.ts +++ b/packages/react-on-rails-pro-node-renderer/src/worker.ts @@ -240,15 +240,22 @@ export function handleStartupListenError({ message: err.message, }; try { - sendFn(startupFailure, undefined, undefined, (sendErr) => { - if (sendErr) { - log.warn({ err: sendErr }, 'Failed to send startup failure message to master'); - } + let exited = false; + const doExit = (sendErr?: Error | null) => { + if (exited) return; + exited = true; + if (sendErr) log.warn({ err: sendErr }, 'Failed to send startup failure message to master'); exitFn(1); - }); + }; + sendFn(startupFailure, undefined, undefined, doExit); + // Safety net: if the IPC channel is half-broken the callback may never + // fire, leaving this worker alive indefinitely. Force exit after a timeout. 
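+      // The timer is unref()'d below, so it never keeps the process alive on its own;
+      // it only acts as a deadline for the callback-driven exit path.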
+ const IPC_SEND_TIMEOUT_MS = 2000; + const timer = setTimeout(() => doExit(), IPC_SEND_TIMEOUT_MS); + if (typeof timer.unref === 'function') timer.unref(); return; } catch (sendErr) { - log.warn({ err: sendErr }, 'Failed to send startup failure message to master'); + log.warn({ err: sendErr as Error }, 'Failed to send startup failure message to master'); exitFn(1); return; } diff --git a/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts b/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts index 69e533e74b..697aa27b2d 100644 --- a/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts +++ b/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts @@ -14,7 +14,7 @@ type ClusterHandlers = { type MockCluster = { on: jest.Mock void]>; fork: jest.Mock; - disconnect: jest.Mock; + disconnect: jest.Mock void]>; }; function buildStartupFailureMessage( @@ -41,7 +41,9 @@ function setupMasterRunHarness() { return {}; }); const mockCluster = {} as MockCluster; - mockCluster.disconnect = jest.fn(); + mockCluster.disconnect = jest.fn((callback?: () => void) => { + if (callback) callback(); + }); mockCluster.on = jest.fn((event: string, handler: (...args: unknown[]) => void) => { operations.push(`on:${event}`); if (event === 'message') { @@ -191,6 +193,39 @@ describe('master startup failure handling via masterRun wiring', () => { ); }); + it('reports error only once when multiple workers exit during abort', () => { + const harness = setupMasterRunHarness(); + // Use a disconnect mock that does NOT invoke the callback, so process.exit + // is not called and subsequent exit events can be observed. + harness.mockCluster.disconnect.mockImplementation(() => {}); + + harness.clusterHandlers.message({ id: 1, process: { exitCode: 1 } }, buildStartupFailureMessage()); + + // First exit triggers the error report and disconnect. + harness.clusterHandlers.exit({ id: 1, process: { exitCode: 1 } }); + expect(harness.mockErrorReporterMessage).toHaveBeenCalledTimes(1); + expect(harness.mockCluster.disconnect).toHaveBeenCalledTimes(1); + + // Second worker exit during abort — no duplicate report, no refork. + harness.clusterHandlers.exit({ id: 2, process: { exitCode: 1 } }); + expect(harness.mockErrorReporterMessage).toHaveBeenCalledTimes(1); + expect(harness.mockCluster.disconnect).toHaveBeenCalledTimes(1); + expect(harness.mockFork).toHaveBeenCalledTimes(2); + }); + + it('does not refork a scheduled-restart worker when aborting for startup failure', () => { + const harness = setupMasterRunHarness(); + + harness.clusterHandlers.message({ id: 1, process: { exitCode: 1 } }, buildStartupFailureMessage()); + + // A scheduled-restart worker exiting during abort should NOT be reforked. 
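+    // setupMasterRunHarness() forks twice at startup; the assertions below verify the count stays at 2.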
+ expect(() => + harness.clusterHandlers.exit({ id: 2, isScheduledRestart: true, process: { exitCode: 0 } }), + ).toThrow('process.exit:1'); + expect(harness.mockFork).toHaveBeenCalledTimes(2); + expect(harness.mockErrorReporterMessage).toHaveBeenCalledTimes(1); + }); + it('restarts scheduled-restart workers without reporting an error', () => { const harness = setupMasterRunHarness(); const worker: MockWorker = { id: 1, isScheduledRestart: true, process: { exitCode: 0 } }; diff --git a/packages/react-on-rails-pro-node-renderer/tests/workerStartupFailure.test.ts b/packages/react-on-rails-pro-node-renderer/tests/workerStartupFailure.test.ts index ddff10f8c6..61e9a3a37a 100644 --- a/packages/react-on-rails-pro-node-renderer/tests/workerStartupFailure.test.ts +++ b/packages/react-on-rails-pro-node-renderer/tests/workerStartupFailure.test.ts @@ -35,6 +35,42 @@ describe('isWorkerStartupFailureMessage', () => { it('returns false for an object without type', () => { expect(isWorkerStartupFailureMessage({ stage: 'listen' })).toBe(false); }); + + it('returns false for a non-integer port', () => { + expect( + isWorkerStartupFailureMessage({ + type: WORKER_STARTUP_FAILURE, + stage: 'listen', + host: 'localhost', + port: 3800.5, + message: 'some error', + }), + ).toBe(false); + }); + + it('returns false for an out-of-range port', () => { + expect( + isWorkerStartupFailureMessage({ + type: WORKER_STARTUP_FAILURE, + stage: 'listen', + host: 'localhost', + port: 70000, + message: 'some error', + }), + ).toBe(false); + }); + + it('returns false for a negative port', () => { + expect( + isWorkerStartupFailureMessage({ + type: WORKER_STARTUP_FAILURE, + stage: 'listen', + host: 'localhost', + port: -1, + message: 'some error', + }), + ).toBe(false); + }); }); describe('worker startup listen error handling', () => { @@ -149,4 +185,66 @@ describe('worker startup listen error handling', () => { expect(exitCalls).toEqual([1]); }); + + it('exits via timeout fallback when IPC callback is never invoked', () => { + jest.useFakeTimers(); + const exitCalls: number[] = []; + // send accepts the message but never invokes the callback, simulating a + // half-broken IPC channel. + const send = ((_msg: unknown, _handle?: unknown, _options?: unknown, _callback?: () => void) => + true) as NodeJS.Process['send']; + const exit = ((code?: number) => { + exitCalls.push(code ?? 0); + }) as NodeJS.Process['exit']; + + handleStartupListenError({ + err: buildListenError(), + host: 'localhost', + port: 3800, + isWorker: true, + send, + exit, + }); + + // Callback was never invoked, so exit hasn't been called yet. + expect(exitCalls).toEqual([]); + + // Advance past the IPC_SEND_TIMEOUT_MS (2000ms) fallback timer. + jest.advanceTimersByTime(2000); + expect(exitCalls).toEqual([1]); + + jest.useRealTimers(); + }); + + it('does not exit twice when callback fires after timeout', () => { + jest.useFakeTimers(); + const exitCalls: number[] = []; + let savedCallback: (() => void) | undefined; + const send = ((_msg: unknown, _handle?: unknown, _options?: unknown, callback?: () => void) => { + savedCallback = callback; + return true; + }) as NodeJS.Process['send']; + const exit = ((code?: number) => { + exitCalls.push(code ?? 0); + }) as NodeJS.Process['exit']; + + handleStartupListenError({ + err: buildListenError(), + host: 'localhost', + port: 3800, + isWorker: true, + send, + exit, + }); + + // Timeout fires first. + jest.advanceTimersByTime(2000); + expect(exitCalls).toEqual([1]); + + // Late callback — should be a no-op. 
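+    // savedCallback is the doExit callback; its internal 'exited' flag makes this late call a no-op.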
+ savedCallback?.(); + expect(exitCalls).toEqual([1]); + + jest.useRealTimers(); + }); }); From 11cbe99871bb546684f52e735f1e71351243f621 Mon Sep 17 00:00:00 2001 From: Justin Gordon Date: Sat, 4 Apr 2026 21:14:52 -1000 Subject: [PATCH 13/20] Harden startup-failure exit ordering and IPC error visibility --- .../src/master.ts | 59 +++++++++++-------- .../src/worker.ts | 6 +- .../tests/masterStartupFailure.test.ts | 31 +++++++++- 3 files changed, 68 insertions(+), 28 deletions(-) diff --git a/packages/react-on-rails-pro-node-renderer/src/master.ts b/packages/react-on-rails-pro-node-renderer/src/master.ts index 0d018681e9..d1efaad2fb 100644 --- a/packages/react-on-rails-pro-node-renderer/src/master.ts +++ b/packages/react-on-rails-pro-node-renderer/src/master.ts @@ -82,6 +82,30 @@ export default function masterRun(runningConfig?: Partial) { let fatalStartupFailure: { workerId: number; failure: WorkerStartupFailureMessage } | null = null; let hasInitiatedShutdown = false; + const abortForStartupFailure = (exitingWorker: cluster.Worker): boolean => { + if (!(isAbortingForStartupFailure && fatalStartupFailure)) return false; + + if (!hasInitiatedShutdown) { + hasInitiatedShutdown = true; + // Note: the exiting worker may differ from the one that sent the + // failure message if multiple workers exit in rapid succession. + // We always report the first failure received. + const { failure, workerId: failedWorkerId } = fatalStartupFailure; + const msg = + failure.code === 'EADDRINUSE' + ? `Node renderer startup failed: ${failure.host}:${failure.port} is already in use` + : `Node renderer startup failed in worker ${failedWorkerId}: ${failure.message || `exit code ${exitingWorker.process.exitCode}`}`; + + errorReporter.message(msg); + // Disconnect all live workers so they release their ports before the + // master exits. cluster.disconnect() is async — pass process.exit as + // the callback so we wait for workers to actually disconnect. + cluster.disconnect(() => process.exit(1)); + } + + return true; + }; + cluster.on('message', (worker, message) => { // Check the abort flag first to short-circuit the type-guard on every // ordinary IPC message once we are already aborting. @@ -100,24 +124,7 @@ export default function masterRun(runningConfig?: Partial) { // Once a startup failure has been detected, abort regardless of whether // this particular exit was from the failing worker, a scheduled restart, // or an unrelated crash. Don't fork any more workers. - if (isAbortingForStartupFailure && fatalStartupFailure) { - if (!hasInitiatedShutdown) { - hasInitiatedShutdown = true; - // Note: the exiting worker may differ from the one that sent the - // failure message if multiple workers exit in rapid succession. - // We always report the first failure received. - const { failure, workerId: failedWorkerId } = fatalStartupFailure; - const msg = - failure.code === 'EADDRINUSE' - ? `Node renderer startup failed: ${failure.host}:${failure.port} is already in use` - : `Node renderer startup failed in worker ${failedWorkerId}: ${failure.message || `exit code ${worker.process.exitCode}`}`; - - errorReporter.message(msg); - // Disconnect all live workers so they release their ports before the - // master exits. cluster.disconnect() is async — pass process.exit as - // the callback so we wait for workers to actually disconnect. 
- cluster.disconnect(() => process.exit(1)); - } + if (abortForStartupFailure(worker)) { return; } @@ -127,11 +134,17 @@ export default function masterRun(runningConfig?: Partial) { return; } - // TODO: Track last rendering request per worker.id - // TODO: Consider blocking a given rendering request if it kills a worker more than X times - const msg = `Worker ${worker.id} died UNEXPECTEDLY :(, restarting`; - errorReporter.message(msg); - cluster.fork(); + // Give in-flight startup-failure IPC messages one event-loop turn to be + // processed before classifying this as an ordinary runtime crash. + setImmediate(() => { + if (abortForStartupFailure(worker)) return; + + // TODO: Track last rendering request per worker.id + // TODO: Consider blocking a given rendering request if it kills a worker more than X times + const msg = `Worker ${worker.id} died UNEXPECTEDLY :(, restarting`; + errorReporter.message(msg); + cluster.fork(); + }); }); // Schedule regular restarts of workers diff --git a/packages/react-on-rails-pro-node-renderer/src/worker.ts b/packages/react-on-rails-pro-node-renderer/src/worker.ts index b3e532d573..134a0cea5d 100644 --- a/packages/react-on-rails-pro-node-renderer/src/worker.ts +++ b/packages/react-on-rails-pro-node-renderer/src/worker.ts @@ -225,7 +225,7 @@ export function handleStartupListenError({ log.error({ err, host, port }, 'Node renderer failed to start'); if (isWorker && !sendFn) { - log.warn('Cluster worker has no IPC channel; cannot notify master of startup failure'); + log.error('Cluster worker has no IPC channel; cannot notify master of startup failure'); } if (isWorker && sendFn) { @@ -244,7 +244,7 @@ export function handleStartupListenError({ const doExit = (sendErr?: Error | null) => { if (exited) return; exited = true; - if (sendErr) log.warn({ err: sendErr }, 'Failed to send startup failure message to master'); + if (sendErr) log.error({ err: sendErr }, 'Failed to send startup failure message to master'); exitFn(1); }; sendFn(startupFailure, undefined, undefined, doExit); @@ -255,7 +255,7 @@ export function handleStartupListenError({ if (typeof timer.unref === 'function') timer.unref(); return; } catch (sendErr) { - log.warn({ err: sendErr as Error }, 'Failed to send startup failure message to master'); + log.error({ err: sendErr as Error }, 'Failed to send startup failure message to master'); exitFn(1); return; } diff --git a/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts b/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts index 697aa27b2d..795e458a32 100644 --- a/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts +++ b/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts @@ -132,6 +132,12 @@ function setupMasterRunHarness() { }; } +async function waitForSetImmediate() { + await new Promise((resolve) => { + setImmediate(resolve); + }); +} + describe('master startup failure handling via masterRun wiring', () => { afterEach(() => { jest.restoreAllMocks(); @@ -237,10 +243,30 @@ describe('master startup failure handling via masterRun wiring', () => { expect(harness.processExitSpy).not.toHaveBeenCalled(); }); - it('reforks on unexpected runtime crash when no startup failure was received', () => { + it('waits one tick for startup-failure IPC before classifying an unexpected crash', async () => { + const harness = setupMasterRunHarness(); + harness.mockCluster.disconnect.mockImplementation(() => {}); + const worker = { id: 1, process: { exitCode: 1 } }; + 
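+    // The no-op disconnect above keeps process.exit from firing, so the deferred classification below is observable.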
+ // Exit arrives first. + harness.clusterHandlers.exit(worker); + // Startup-failure message arrives before the deferred crash classification runs. + harness.clusterHandlers.message(worker, buildStartupFailureMessage()); + await waitForSetImmediate(); + + expect(harness.mockErrorReporterMessage).toHaveBeenCalledWith( + 'Node renderer startup failed: localhost:3800 is already in use', + ); + expect(harness.mockCluster.disconnect).toHaveBeenCalledTimes(1); + expect(harness.mockFork).toHaveBeenCalledTimes(2); + expect(harness.processExitSpy).not.toHaveBeenCalled(); + }); + + it('reforks on unexpected runtime crash when no startup failure was received', async () => { const harness = setupMasterRunHarness(); harness.clusterHandlers.exit({ id: 3, process: { exitCode: 1 } }); + await waitForSetImmediate(); expect(harness.mockErrorReporterMessage).toHaveBeenCalledWith( 'Worker 3 died UNEXPECTEDLY :(, restarting', @@ -249,11 +275,12 @@ describe('master startup failure handling via masterRun wiring', () => { expect(harness.processExitSpy).not.toHaveBeenCalled(); }); - it('ignores malformed startup-failure messages and treats exit as runtime crash', () => { + it('ignores malformed startup-failure messages and treats exit as runtime crash', async () => { const harness = setupMasterRunHarness(); harness.clusterHandlers.message({ id: 1, process: { exitCode: 1 } }, { type: WORKER_STARTUP_FAILURE }); harness.clusterHandlers.exit({ id: 1, process: { exitCode: 1 } }); + await waitForSetImmediate(); expect(harness.mockErrorReporterMessage).toHaveBeenCalledWith( 'Worker 1 died UNEXPECTEDLY :(, restarting', From e68d9377bac7210a1ec989cd145e2789153febd6 Mon Sep 17 00:00:00 2001 From: Justin Gordon Date: Sat, 4 Apr 2026 22:21:00 -1000 Subject: [PATCH 14/20] Address review: remove dead message fallback, document stage extensibility - Remove unreachable `|| exit code` fallback in abort error message since failure.message is always a string (validated by type guard) - Remove now-unused exitingWorker param from abortForStartupFailure - Add comment documenting that stage: 'listen' is the only supported stage and how to extend for pre-listen failures Co-Authored-By: Claude Opus 4.6 (1M context) --- packages/react-on-rails-pro-node-renderer/src/master.ts | 8 ++++---- .../src/shared/workerMessages.ts | 3 +++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/packages/react-on-rails-pro-node-renderer/src/master.ts b/packages/react-on-rails-pro-node-renderer/src/master.ts index d1efaad2fb..34d8c05dc7 100644 --- a/packages/react-on-rails-pro-node-renderer/src/master.ts +++ b/packages/react-on-rails-pro-node-renderer/src/master.ts @@ -82,7 +82,7 @@ export default function masterRun(runningConfig?: Partial) { let fatalStartupFailure: { workerId: number; failure: WorkerStartupFailureMessage } | null = null; let hasInitiatedShutdown = false; - const abortForStartupFailure = (exitingWorker: cluster.Worker): boolean => { + const abortForStartupFailure = (): boolean => { if (!(isAbortingForStartupFailure && fatalStartupFailure)) return false; if (!hasInitiatedShutdown) { @@ -94,7 +94,7 @@ export default function masterRun(runningConfig?: Partial) { const msg = failure.code === 'EADDRINUSE' ? 
`Node renderer startup failed: ${failure.host}:${failure.port} is already in use` - : `Node renderer startup failed in worker ${failedWorkerId}: ${failure.message || `exit code ${exitingWorker.process.exitCode}`}`; + : `Node renderer startup failed in worker ${failedWorkerId}: ${failure.message}`; errorReporter.message(msg); // Disconnect all live workers so they release their ports before the @@ -124,7 +124,7 @@ export default function masterRun(runningConfig?: Partial) { // Once a startup failure has been detected, abort regardless of whether // this particular exit was from the failing worker, a scheduled restart, // or an unrelated crash. Don't fork any more workers. - if (abortForStartupFailure(worker)) { + if (abortForStartupFailure()) { return; } @@ -137,7 +137,7 @@ export default function masterRun(runningConfig?: Partial) { // Give in-flight startup-failure IPC messages one event-loop turn to be // processed before classifying this as an ordinary runtime crash. setImmediate(() => { - if (abortForStartupFailure(worker)) return; + if (abortForStartupFailure()) return; // TODO: Track last rendering request per worker.id // TODO: Consider blocking a given rendering request if it kills a worker more than X times diff --git a/packages/react-on-rails-pro-node-renderer/src/shared/workerMessages.ts b/packages/react-on-rails-pro-node-renderer/src/shared/workerMessages.ts index 8ba0e10677..6b8e1c1bc4 100644 --- a/packages/react-on-rails-pro-node-renderer/src/shared/workerMessages.ts +++ b/packages/react-on-rails-pro-node-renderer/src/shared/workerMessages.ts @@ -18,6 +18,9 @@ export function isWorkerStartupFailureMessage(value: unknown): value is WorkerSt const message = value as Partial; + // stage: 'listen' is the only supported stage today. To handle pre-listen + // failures (e.g. plugin registration), add a new stage value here and + // update the master handler accordingly. return ( message.type === WORKER_STARTUP_FAILURE && message.stage === 'listen' && From 08b9f0b4e77d641743c40c889f88cc23f437a1f4 Mon Sep 17 00:00:00 2001 From: Justin Gordon Date: Sat, 4 Apr 2026 23:31:45 -1000 Subject: [PATCH 15/20] Add hard-deadline timer to cluster.disconnect() in abort path Prevents the master from hanging indefinitely if a worker is stuck (leaked handle, blocking syscall) during startup-failure shutdown. Follows the same timeout pattern used in restartWorkers.ts. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../react-on-rails-pro-node-renderer/src/master.ts | 14 +++++++++++--- .../tests/masterStartupFailure.test.ts | 4 ++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/packages/react-on-rails-pro-node-renderer/src/master.ts b/packages/react-on-rails-pro-node-renderer/src/master.ts index 34d8c05dc7..4c804263d8 100644 --- a/packages/react-on-rails-pro-node-renderer/src/master.ts +++ b/packages/react-on-rails-pro-node-renderer/src/master.ts @@ -98,9 +98,17 @@ export default function masterRun(runningConfig?: Partial) { errorReporter.message(msg); // Disconnect all live workers so they release their ports before the - // master exits. cluster.disconnect() is async — pass process.exit as - // the callback so we wait for workers to actually disconnect. - cluster.disconnect(() => process.exit(1)); + // master exits. cluster.disconnect() is async — the callback fires + // once every worker has disconnected. A hard-deadline timer guarantees + // the master still exits if a worker is stuck (leaked handle, blocking + // syscall, etc.), following the same pattern as restartWorkers.ts. 
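+          // Both exit paths use code 1; the disconnect callback clears the deadline timer before exiting.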
+ const MASTER_SHUTDOWN_TIMEOUT_MS = 5000; + const shutdownTimer = setTimeout(() => process.exit(1), MASTER_SHUTDOWN_TIMEOUT_MS); + if (typeof shutdownTimer.unref === 'function') shutdownTimer.unref(); + cluster.disconnect(() => { + clearTimeout(shutdownTimer); + process.exit(1); + }); } return true; diff --git a/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts b/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts index 795e458a32..c3cf8ddb5b 100644 --- a/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts +++ b/packages/react-on-rails-pro-node-renderer/tests/masterStartupFailure.test.ts @@ -71,6 +71,9 @@ function setupMasterRunHarness() { const mockLogSanitizedConfig = jest.fn(); const mockGetLicenseStatus = jest.fn(() => 'valid'); const setIntervalSpy = jest.spyOn(global, 'setInterval').mockReturnValue(0 as unknown as NodeJS.Timeout); + const setTimeoutSpy = jest + .spyOn(global, 'setTimeout') + .mockReturnValue({ unref: jest.fn() } as unknown as NodeJS.Timeout); const processExitSpy = jest.spyOn(process, 'exit').mockImplementation(((code?: number) => { throw new Error(`process.exit:${code}`); }) as typeof process.exit); @@ -128,6 +131,7 @@ function setupMasterRunHarness() { mockCluster, mockErrorReporterMessage, setIntervalSpy, + setTimeoutSpy, processExitSpy, }; } From 07c1f14421103b22024c2a0efc30319c0621addb Mon Sep 17 00:00:00 2001 From: Justin Gordon Date: Sun, 5 Apr 2026 17:00:56 -1000 Subject: [PATCH 16/20] Update CHANGELOG with fork loop port bind fix for PR 2881 Co-Authored-By: Claude Opus 4.6 (1M context) --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 340a4c9be7..15bc157680 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,7 @@ After a release, run `/update-changelog` in Claude Code to analyze commits, writ #### Fixed - **[Pro]** **Fixed TanStack Router SSR hydration mismatches in the async path**: Client hydration now restores server match data before first render, uses `RouterProvider` directly to match the server-rendered tree, and stops the post-hydration load when a custom `router.options.hydrate` callback fails instead of continuing with partially hydrated client state. [PR 2932](https://github.com/shakacode/react_on_rails/pull/2932) by [justin808](https://github.com/justin808). +- **[Pro] Fixed infinite fork loop when node renderer worker fails to bind port**: When a worker failed during `app.listen()` (e.g., `EADDRINUSE`), the master previously reforked unconditionally, causing an infinite fork/crash loop that consumed CPU and filled logs. Workers now send a `WORKER_STARTUP_FAILURE` IPC message to the master before exiting; the master sets an abort flag and exits with a clear error message instead of reforking. Scheduled restarts and runtime crashes continue to refork as before. [PR 2881](https://github.com/shakacode/react_on_rails/pull/2881) by [justin808](https://github.com/justin808). ### [16.6.0.rc.0] - 2026-04-01 From a867ec997503f3a644e70e0cc15d1e783f32c7eb Mon Sep 17 00:00:00 2001 From: Justin Gordon Date: Sun, 5 Apr 2026 18:17:08 -1000 Subject: [PATCH 17/20] Extract handleStartupListenError to dedicated module Move handleStartupListenError and its options type from worker.ts into src/worker/startupErrorHandler.ts so the production entry point no longer exports a test-only surface. Tests now import from the utility module directly. 
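For reviewers: only the import line in worker.ts changes; the listen-error
call site from the earlier refactor is untouched, so usage still looks like

    import { handleStartupListenError } from './worker/startupErrorHandler.js';

    app.listen({ port, host }, (err, address) => {
      if (err) {
        handleStartupListenError({ err, host, port });
        return;
      }
      // ... startup logging unchanged
    });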
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/worker.ts | 64 +----------------- .../src/worker/startupErrorHandler.ts | 65 +++++++++++++++++++ .../tests/workerStartupFailure.test.ts | 2 +- 3 files changed, 67 insertions(+), 64 deletions(-) create mode 100644 packages/react-on-rails-pro-node-renderer/src/worker/startupErrorHandler.ts diff --git a/packages/react-on-rails-pro-node-renderer/src/worker.ts b/packages/react-on-rails-pro-node-renderer/src/worker.ts index 134a0cea5d..a08a7c91e4 100644 --- a/packages/react-on-rails-pro-node-renderer/src/worker.ts +++ b/packages/react-on-rails-pro-node-renderer/src/worker.ts @@ -23,7 +23,7 @@ import { type ProvidedNewBundle, } from './worker/handleRenderRequest.js'; import handleGracefulShutdown from './worker/handleGracefulShutdown.js'; -import { WORKER_STARTUP_FAILURE, type WorkerStartupFailureMessage } from './shared/workerMessages.js'; +import { handleStartupListenError } from './worker/startupErrorHandler.js'; import { badRequestResponseResult, errorResponseResult, @@ -202,68 +202,6 @@ const extractBodyArrayField = ( return undefined; }; -type StartupListenErrorHandlerOptions = { - err: Error; - host: string; - port: number; - isWorker?: boolean; - send?: NodeJS.Process['send']; - exit?: NodeJS.Process['exit']; -}; - -export function handleStartupListenError({ - err, - host, - port, - isWorker = cluster.isWorker, - send, - exit, -}: StartupListenErrorHandlerOptions) { - const sendFn = send ?? process.send?.bind(process); - const exitFn = exit ?? ((code?: number) => process.exit(code)); - - log.error({ err, host, port }, 'Node renderer failed to start'); - - if (isWorker && !sendFn) { - log.error('Cluster worker has no IPC channel; cannot notify master of startup failure'); - } - - if (isWorker && sendFn) { - const startupFailure: WorkerStartupFailureMessage = { - type: WORKER_STARTUP_FAILURE, - stage: 'listen', - code: (err as NodeJS.ErrnoException).code, - errno: (err as NodeJS.ErrnoException).errno, - syscall: (err as NodeJS.ErrnoException).syscall, - host, - port, - message: err.message, - }; - try { - let exited = false; - const doExit = (sendErr?: Error | null) => { - if (exited) return; - exited = true; - if (sendErr) log.error({ err: sendErr }, 'Failed to send startup failure message to master'); - exitFn(1); - }; - sendFn(startupFailure, undefined, undefined, doExit); - // Safety net: if the IPC channel is half-broken the callback may never - // fire, leaving this worker alive indefinitely. Force exit after a timeout. - const IPC_SEND_TIMEOUT_MS = 2000; - const timer = setTimeout(() => doExit(), IPC_SEND_TIMEOUT_MS); - if (typeof timer.unref === 'function') timer.unref(); - return; - } catch (sendErr) { - log.error({ err: sendErr as Error }, 'Failed to send startup failure message to master'); - exitFn(1); - return; - } - } - - exitFn(1); -} - export default function run(config: Partial) { // Store config in app state. 
From now it can be loaded by any module using // getConfig(): diff --git a/packages/react-on-rails-pro-node-renderer/src/worker/startupErrorHandler.ts b/packages/react-on-rails-pro-node-renderer/src/worker/startupErrorHandler.ts new file mode 100644 index 0000000000..3d2aee8cf3 --- /dev/null +++ b/packages/react-on-rails-pro-node-renderer/src/worker/startupErrorHandler.ts @@ -0,0 +1,65 @@ +import cluster from 'cluster'; +import log from '../shared/log.js'; +import { WORKER_STARTUP_FAILURE, type WorkerStartupFailureMessage } from '../shared/workerMessages.js'; + +export type StartupListenErrorHandlerOptions = { + err: Error; + host: string; + port: number; + isWorker?: boolean; + send?: NodeJS.Process['send']; + exit?: NodeJS.Process['exit']; +}; + +export function handleStartupListenError({ + err, + host, + port, + isWorker = cluster.isWorker, + send, + exit, +}: StartupListenErrorHandlerOptions) { + const sendFn = send ?? process.send?.bind(process); + const exitFn = exit ?? ((code?: number) => process.exit(code)); + + log.error({ err, host, port }, 'Node renderer failed to start'); + + if (isWorker && !sendFn) { + log.error('Cluster worker has no IPC channel; cannot notify master of startup failure'); + } + + if (isWorker && sendFn) { + const startupFailure: WorkerStartupFailureMessage = { + type: WORKER_STARTUP_FAILURE, + stage: 'listen', + code: (err as NodeJS.ErrnoException).code, + errno: (err as NodeJS.ErrnoException).errno, + syscall: (err as NodeJS.ErrnoException).syscall, + host, + port, + message: err.message, + }; + try { + let exited = false; + const doExit = (sendErr?: Error | null) => { + if (exited) return; + exited = true; + if (sendErr) log.error({ err: sendErr }, 'Failed to send startup failure message to master'); + exitFn(1); + }; + sendFn(startupFailure, undefined, undefined, doExit); + // Safety net: if the IPC channel is half-broken the callback may never + // fire, leaving this worker alive indefinitely. Force exit after a timeout. 
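+      // For comparison: the master's hard shutdown deadline (MASTER_SHUTDOWN_TIMEOUT_MS in master.ts) is 5000ms.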
From 9d48d81a103ad24f2e782f9b4ef9010eea0ed700 Mon Sep 17 00:00:00 2001
From: Justin Gordon
Date: Sun, 5 Apr 2026 19:28:01 -1000
Subject: [PATCH 18/20] Validate renderer port range to prevent startup
 refork loops

---
 .../src/shared/configBuilder.ts            | 13 ++++++++++
 .../src/worker/startupErrorHandler.ts      | 11 ++++-------
 .../tests/configBuilder.test.ts            | 24 +++++++++++++++++++
 3 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/packages/react-on-rails-pro-node-renderer/src/shared/configBuilder.ts b/packages/react-on-rails-pro-node-renderer/src/shared/configBuilder.ts
index 3ede56a3e9..875a8db123 100644
--- a/packages/react-on-rails-pro-node-renderer/src/shared/configBuilder.ts
+++ b/packages/react-on-rails-pro-node-renderer/src/shared/configBuilder.ts
@@ -142,6 +142,13 @@ function logLevel(level: string): LevelWithSilent {
   }
 }
 
+function validatePort(port: number): string | null {
+  if (!Number.isInteger(port) || !Number.isFinite(port) || port < 0 || port > 65535) {
+    return `RENDERER_PORT must be an integer between 0 and 65535. Received: ${String(port)}`;
+  }
+  return null;
+}
+
 function normalizedRuntimeEnvs() {
   return [env.RAILS_ENV, env.NODE_ENV]
     .filter((value): value is string => Boolean(value))
@@ -380,6 +387,12 @@ export function buildConfig(providedUserConfig?: Partial<Config>): Config {
     }
   });
 
+  const portValidationError = validatePort(config.port);
+  if (portValidationError) {
+    log.error(portValidationError);
+    process.exit(1);
+  }
+
   if (
     'honeybadgerApiKey' in config ||
     'sentryDsn' in config ||
diff --git a/packages/react-on-rails-pro-node-renderer/src/worker/startupErrorHandler.ts b/packages/react-on-rails-pro-node-renderer/src/worker/startupErrorHandler.ts
index 3d2aee8cf3..54556475cc 100644
--- a/packages/react-on-rails-pro-node-renderer/src/worker/startupErrorHandler.ts
+++ b/packages/react-on-rails-pro-node-renderer/src/worker/startupErrorHandler.ts
@@ -26,9 +26,8 @@ export function handleStartupListenError({
 
   if (isWorker && !sendFn) {
     log.error('Cluster worker has no IPC channel; cannot notify master of startup failure');
-  }
-
-  if (isWorker && sendFn) {
+    exitFn(1);
+  } else if (isWorker) {
     const startupFailure: WorkerStartupFailureMessage = {
       type: WORKER_STARTUP_FAILURE,
       stage: 'listen',
@@ -53,13 +52,11 @@ export function handleStartupListenError({
       const IPC_SEND_TIMEOUT_MS = 2000;
       const timer = setTimeout(() => doExit(), IPC_SEND_TIMEOUT_MS);
       if (typeof timer.unref === 'function') timer.unref();
-      return;
     } catch (sendErr) {
       log.error({ err: sendErr as Error }, 'Failed to send startup failure message to master');
       exitFn(1);
-      return;
     }
+  } else {
+    exitFn(1);
   }
-
-  exitFn(1);
 }
diff --git a/packages/react-on-rails-pro-node-renderer/tests/configBuilder.test.ts b/packages/react-on-rails-pro-node-renderer/tests/configBuilder.test.ts
index 277f1295c2..4a6bf26fec 100644
--- a/packages/react-on-rails-pro-node-renderer/tests/configBuilder.test.ts
+++ b/packages/react-on-rails-pro-node-renderer/tests/configBuilder.test.ts
@@ -1,6 +1,7 @@
 describe('configBuilder', () => {
   const envVarsToRestore = [
     'RENDERER_HOST',
+    'RENDERER_PORT',
     'NODE_ENV',
     'RENDERER_PASSWORD',
     'RAILS_ENV',
@@ -113,6 +114,29 @@ describe('configBuilder', () => {
     expect(finalSettings.password).toBe('');
   });
 
+  describe('port validation', () => {
+    it('throws when configured port is outside the valid TCP range', () => {
+      process.env.NODE_ENV = 'development';
+      process.env.RAILS_ENV = 'development';
+      const processExit = mockProcessExit();
+      const { buildConfig, error } = loadConfigBuilderWithMockedLogger();
+
+      expect(() => buildConfig({ port: 70000 })).toThrow('process.exit: 1');
+      expect(processExit).toHaveBeenCalledWith(1);
+      expect(error).toHaveBeenCalledWith(
+        'RENDERER_PORT must be an integer between 0 and 65535. Received: 70000',
+      );
+    });
+
+    it('allows port 0 for ephemeral-port test setups', () => {
+      process.env.NODE_ENV = 'development';
+      process.env.RAILS_ENV = 'development';
+      const { buildConfig } = loadConfigBuilderWithMockedLogger();
+
+      expect(buildConfig({ port: 0 }).port).toBe(0);
+    });
+  });
+
   describe('password validation in production-like environments', () => {
     it('throws when no password is set in production', () => {
       process.env.NODE_ENV = 'production';
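A standalone spot check of the validatePort semantics introduced above; the function body is copied from the diff, and the expected results are shown in the comments:

    function validatePort(port: number): string | null {
      if (!Number.isInteger(port) || !Number.isFinite(port) || port < 0 || port > 65535) {
        return `RENDERER_PORT must be an integer between 0 and 65535. Received: ${String(port)}`;
      }
      return null;
    }

    console.log(validatePort(3800));  // null: a typical renderer port passes
    console.log(validatePort(0));     // null: 0 asks the OS for an ephemeral port
    console.log(validatePort(65536)); // error string: above the 16-bit TCP range
    console.log(validatePort(80.5));  // error string: fractional values fail Number.isInteger
    console.log(validatePort(NaN));   // error string: NaN also fails Number.isInteger

Note that Number.isInteger already returns false for NaN and Infinity, so the !Number.isFinite clause is redundant, though harmless.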
From 97784b9100f138f420d24fb3d7a8264133365209 Mon Sep 17 00:00:00 2001
From: Justin Gordon
Date: Sun, 5 Apr 2026 19:43:07 -1000
Subject: [PATCH 19/20] Fix startupErrorHandler sendFn narrowing for TypeScript

---
 .../src/worker/startupErrorHandler.ts      | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/packages/react-on-rails-pro-node-renderer/src/worker/startupErrorHandler.ts b/packages/react-on-rails-pro-node-renderer/src/worker/startupErrorHandler.ts
index 54556475cc..622775e9fa 100644
--- a/packages/react-on-rails-pro-node-renderer/src/worker/startupErrorHandler.ts
+++ b/packages/react-on-rails-pro-node-renderer/src/worker/startupErrorHandler.ts
@@ -24,10 +24,13 @@ export function handleStartupListenError({
 
   log.error({ err, host, port }, 'Node renderer failed to start');
 
-  if (isWorker && !sendFn) {
-    log.error('Cluster worker has no IPC channel; cannot notify master of startup failure');
-    exitFn(1);
-  } else if (isWorker) {
+  if (isWorker) {
+    if (!sendFn) {
+      log.error('Cluster worker has no IPC channel; cannot notify master of startup failure');
+      exitFn(1);
+      return;
+    }
+
     const startupFailure: WorkerStartupFailureMessage = {
       type: WORKER_STARTUP_FAILURE,
       stage: 'listen',
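Why the restructure above was needed, reduced to a minimal reproduction (standalone sketch, not from the codebase): TypeScript narrows a variable only from conditions that test it directly, so the failed `isWorker && !sendFn` check does not prove sendFn is defined inside the `else if (isWorker)` branch.

    declare const isWorker: boolean;
    declare const sendFn: ((message: unknown) => boolean) | undefined;

    if (isWorker && !sendFn) {
      // handle the missing IPC channel
    } else if (isWorker) {
      // sendFn is still `... | undefined` here: the compiler does not combine
      // "the first condition failed" with "isWorker is true" across variables,
      // so calling sendFn(...) in this branch is a compile error.
    }

    if (isWorker) {
      if (!sendFn) {
        // handle the missing IPC channel, then return early
      } else {
        sendFn({ ok: true }); // OK: the direct !sendFn guard narrows the type
      }
    }

The nested guard with an early return gives the compiler a direct test on sendFn, which is exactly the shape the patch adopts.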
From ca6ac0162ff988bafcd6bb38dd8ba342f32317e4 Mon Sep 17 00:00:00 2001
From: Justin Gordon
Date: Sun, 5 Apr 2026 20:41:43 -1000
Subject: [PATCH 20/20] Coerce string port to number before validation in
 configBuilder
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The validatePort function uses Number.isInteger() which returns false
for string values like "3800". When users pass env-derived port values
(e.g. `port: env.RENDERER_PORT || 3800`), the env var is a string,
causing the validation to reject valid ports and crash the renderer.

This was the root cause of all Pro integration test failures in CI —
the node renderer exited immediately with "RENDERER_PORT must be an
integer between 0 and 65535. Received: 3800".

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .../src/shared/configBuilder.ts            |  5 +++++
 .../tests/configBuilder.test.ts            | 10 ++++++++++
 2 files changed, 15 insertions(+)

diff --git a/packages/react-on-rails-pro-node-renderer/src/shared/configBuilder.ts b/packages/react-on-rails-pro-node-renderer/src/shared/configBuilder.ts
index 875a8db123..949673bf03 100644
--- a/packages/react-on-rails-pro-node-renderer/src/shared/configBuilder.ts
+++ b/packages/react-on-rails-pro-node-renderer/src/shared/configBuilder.ts
@@ -387,6 +387,11 @@ export function buildConfig(providedUserConfig?: Partial<Config>): Config {
     }
   });
 
+  // Coerce port to a number — user configs frequently pass env-derived strings
+  // (e.g. `port: env.RENDERER_PORT || 3800` yields the string "3800").
+  // eslint-disable-next-line @typescript-eslint/no-unnecessary-type-conversion -- runtime value may be string despite the type
+  config.port = Number(config.port);
+
   const portValidationError = validatePort(config.port);
   if (portValidationError) {
     log.error(portValidationError);
diff --git a/packages/react-on-rails-pro-node-renderer/tests/configBuilder.test.ts b/packages/react-on-rails-pro-node-renderer/tests/configBuilder.test.ts
index 4a6bf26fec..3ebd53a775 100644
--- a/packages/react-on-rails-pro-node-renderer/tests/configBuilder.test.ts
+++ b/packages/react-on-rails-pro-node-renderer/tests/configBuilder.test.ts
@@ -135,6 +135,16 @@ describe('configBuilder', () => {
 
       expect(buildConfig({ port: 0 }).port).toBe(0);
     });
+
+    it('coerces a string port from env vars to a number', () => {
+      process.env.NODE_ENV = 'development';
+      process.env.RAILS_ENV = 'development';
+      const { buildConfig } = loadConfigBuilderWithMockedLogger();
+
+      // Simulates `port: env.RENDERER_PORT || 3800` where env var is the string "3800"
+      const config = buildConfig({ port: '3800' as unknown as number });
+      expect(config.port).toBe(3800);
+    });
   });
 
   describe('password validation in production-like environments', () => {
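A standalone illustration of the coercion bug described in the last commit message; expected output is shown in the comments, and RENDERER_PORT stands in for any env-derived string value:

    // The runtime value can be a string even though the Config type says number.
    const fromEnv: unknown = process.env.RENDERER_PORT || 3800; // often the string "3800"

    // Number.isInteger performs no coercion, so a numeric string is rejected:
    console.log(Number.isInteger(fromEnv)); // false when fromEnv is "3800"

    // Coercing first, as buildConfig now does, restores the intended behavior:
    const port = Number(fromEnv);
    console.log(Number.isInteger(port) && port >= 0 && port <= 65535); // true for "3800"

    // Garbage still fails after coercion, so invalid input is still rejected:
    console.log(Number.isInteger(Number('not-a-port'))); // false, since Number(...) is NaN

One caveat worth flagging in review: Number('') is 0, so an empty RENDERER_PORT coerces to 0 and passes validation as a request for an ephemeral port rather than failing loudly.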