Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
a48cb3d
Fix infinite fork loop when node renderer worker fails to bind port
justin808 Mar 28, 2026
a861a6e
Revert claude-code-review.yml to match default branch
justin808 Mar 29, 2026
4f82dd7
Fix startup-failure listener ordering and failure payload checks
justin808 Mar 29, 2026
a090fee
Harden worker startup-failure IPC exit path
justin808 Mar 29, 2026
5440452
Prevent regressions in node renderer startup-failure abort handling
justin808 Mar 29, 2026
c4ce2ea
Remove unnecessary 500ms fallback timer from worker IPC exit path
yassa-tamer Mar 29, 2026
1be836d
Remove redundant startup-failure repro script
yassa-tamer Mar 29, 2026
49c0ff0
Address review feedback: remove dead return, include host in error, f…
yassa-tamer Mar 29, 2026
9c6aaf0
Prevent startup-failure IPC loss before worker exit
justin808 Mar 30, 2026
05851ed
Harden startup-failure abort path and align tests with production wiring
justin808 Mar 30, 2026
7bd1ee9
Address review: disconnect workers on abort, warn on missing IPC, rem…
justin808 Mar 31, 2026
fd704f0
Address review: async disconnect, IPC timeout fallback, abort-first e…
justin808 Mar 31, 2026
11cbe99
Harden startup-failure exit ordering and IPC error visibility
justin808 Apr 5, 2026
e68d937
Address review: remove dead message fallback, document stage extensib…
justin808 Apr 5, 2026
08b9f0b
Add hard-deadline timer to cluster.disconnect() in abort path
justin808 Apr 5, 2026
07c1f14
Update CHANGELOG with fork loop port bind fix for PR 2881
justin808 Apr 6, 2026
a867ec9
Extract handleStartupListenError to dedicated module
justin808 Apr 6, 2026
9d48d81
Validate renderer port range to prevent startup refork loops
justin808 Apr 6, 2026
97784b9
Fix startupErrorHandler sendFn narrowing for TypeScript
justin808 Apr 6, 2026
ca6ac01
Coerce string port to number before validation in configBuilder
justin808 Apr 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ After a release, run `/update-changelog` in Claude Code to analyze commits, writ
#### Fixed

- **[Pro]** **Fixed TanStack Router SSR hydration mismatches in the async path**: Client hydration now restores server match data before first render, uses `RouterProvider` directly to match the server-rendered tree, and stops the post-hydration load when a custom `router.options.hydrate` callback fails instead of continuing with partially hydrated client state. [PR 2932](https://github.com/shakacode/react_on_rails/pull/2932) by [justin808](https://github.com/justin808).
- **[Pro] Fixed infinite fork loop when node renderer worker fails to bind port**: When a worker failed during `app.listen()` (e.g., `EADDRINUSE`), the master previously reforked unconditionally, causing an infinite fork/crash loop that consumed CPU and filled logs. Workers now send a `WORKER_STARTUP_FAILURE` IPC message to the master before exiting; the master sets an abort flag and exits with a clear error message instead of reforking. Scheduled restarts and runtime crashes continue to refork as before. [PR 2881](https://github.com/shakacode/react_on_rails/pull/2881) by [justin808](https://github.com/justin808).

### [16.6.0.rc.0] - 2026-04-01

Expand Down
68 changes: 64 additions & 4 deletions packages/react-on-rails-pro-node-renderer/src/master.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import { buildConfig, Config, logSanitizedConfig } from './shared/configBuilder.
import restartWorkers from './master/restartWorkers.js';
import * as errorReporter from './shared/errorReporter.js';
import { getLicenseStatus } from './shared/licenseValidator.js';
import { isWorkerStartupFailureMessage, type WorkerStartupFailureMessage } from './shared/workerMessages.js';

const MILLISECONDS_IN_MINUTE = 60000;
// How often to scan for orphaned upload directories.
Expand Down Expand Up @@ -77,22 +78,81 @@ export default function masterRun(runningConfig?: Partial<Config>) {
})();
}, ORPHAN_CLEANUP_INTERVAL_MS);

let isAbortingForStartupFailure = false;
let fatalStartupFailure: { workerId: number; failure: WorkerStartupFailureMessage } | null = null;
let hasInitiatedShutdown = false;

const abortForStartupFailure = (): boolean => {
Comment thread
justin808 marked this conversation as resolved.
if (!(isAbortingForStartupFailure && fatalStartupFailure)) return false;

if (!hasInitiatedShutdown) {
hasInitiatedShutdown = true;
// Note: the exiting worker may differ from the one that sent the
// failure message if multiple workers exit in rapid succession.
// We always report the first failure received.
const { failure, workerId: failedWorkerId } = fatalStartupFailure;
const msg =
failure.code === 'EADDRINUSE'
? `Node renderer startup failed: ${failure.host}:${failure.port} is already in use`
: `Node renderer startup failed in worker ${failedWorkerId}: ${failure.message}`;

errorReporter.message(msg);
// Disconnect all live workers so they release their ports before the
// master exits. cluster.disconnect() is async — the callback fires
// once every worker has disconnected. A hard-deadline timer guarantees
// the master still exits if a worker is stuck (leaked handle, blocking
// syscall, etc.), following the same pattern as restartWorkers.ts.
const MASTER_SHUTDOWN_TIMEOUT_MS = 5000;
const shutdownTimer = setTimeout(() => process.exit(1), MASTER_SHUTDOWN_TIMEOUT_MS);
Comment thread
justin808 marked this conversation as resolved.
if (typeof shutdownTimer.unref === 'function') shutdownTimer.unref();
cluster.disconnect(() => {
clearTimeout(shutdownTimer);
process.exit(1);
});
}

return true;
};

cluster.on('message', (worker, message) => {
// Check the abort flag first to short-circuit the type-guard on every
// ordinary IPC message once we are already aborting.
if (isAbortingForStartupFailure || !isWorkerStartupFailureMessage(message)) return;

isAbortingForStartupFailure = true;
fatalStartupFailure = { workerId: worker.id, failure: message };
});
Comment thread
justin808 marked this conversation as resolved.
Comment thread
justin808 marked this conversation as resolved.

for (let i = 0; i < workersCount; i += 1) {
Comment thread
justin808 marked this conversation as resolved.
cluster.fork();
}

// Listen for dying workers:
cluster.on('exit', (worker) => {
Comment thread
justin808 marked this conversation as resolved.
Comment thread
justin808 marked this conversation as resolved.
Comment thread
justin808 marked this conversation as resolved.
Comment on lines 126 to 131
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor ordering note: the exit listener is registered after workers are forked. A worker that fails synchronously and exits before the master reaches cluster.on('exit', …) (line 131) would be silently missed — no refork, no abort. In practice this is vanishingly unlikely (the forked process needs its own event loop to start and crash), but registering the listener before cluster.fork() (as is already done for cluster.on('message', …)) would make the intent clearer and eliminate the gap entirely.

// Once a startup failure has been detected, abort regardless of whether
// this particular exit was from the failing worker, a scheduled restart,
// or an unrelated crash. Don't fork any more workers.
if (abortForStartupFailure()) {
return;
}
Comment thread
cursor[bot] marked this conversation as resolved.

if (worker.isScheduledRestart) {
log.info('Restarting worker #%d on schedule', worker.id);
} else {
cluster.fork();
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a narrow timing gap here: the abortForStartupFailure() check at the top of the exit handler runs synchronously, but the IPC message that sets isAbortingForStartupFailure = true might not have been processed yet. A worker could be a scheduled restart AND the startup failure message could be in flight.

In that window, this branch forks a new replacement worker even though the master is about to abort. The new worker will fail with the same EADDRINUSE, send another WORKER_STARTUP_FAILURE (which is ignored since the flag is already set), and exit — the master still shuts down correctly. It's not a correctness bug, but it produces one spurious extra fork in the EADDRINUSE-during-scheduled-restart scenario.

If this edge case matters, wrapping the scheduled-restart fork in a setImmediate (same as the crash path below) would close the gap. Given how unlikely the overlap is in practice this is low priority.

return;
}
Comment on lines 139 to +143
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The isScheduledRestart branch forks synchronously without the setImmediate deferred check that the runtime-crash branch uses. This creates an asymmetric race window: if a scheduled-restart exit fires while a WORKER_STARTUP_FAILURE IPC message is in-flight (sent but not yet processed), the master will fork a new replacement worker before setting the abort flag.

The new worker will likely fail with the same error and exit, triggering the abort on its exit — so this isn't a correctness failure (the master will still eventually abort), but it does transiently over-fork by one worker during the startup-failure shutdown sequence.

To close the window consistently, apply the same setImmediate deferral here:

Suggested change
if (worker.isScheduledRestart) {
log.info('Restarting worker #%d on schedule', worker.id);
} else {
cluster.fork();
return;
}
if (worker.isScheduledRestart) {
setImmediate(() => {
if (abortForStartupFailure()) return;
log.info('Restarting worker #%d on schedule', worker.id);
cluster.fork();
});
return;
}

The existing test 'does not refork a scheduled-restart worker when aborting for startup failure' already asserts the desired outcome for the synchronous case; adding a parallel async test (IPC arrives after the exit event) would cover this path.

Comment on lines 139 to +143
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The setImmediate deferral that gives in-flight IPC messages one tick to arrive only applies to the non-scheduled-restart branch. If a scheduled-restart worker happens to exit in the same very-narrow window that a startup-failure IPC message is in-flight (i.e. isAbortingForStartupFailure is still false at the synchronous check on line 135 but the message arrives before the next tick), this branch will fork a new worker before the abort kicks in.

In practice this race is almost impossible — scheduled restarts fire minutes after startup — but it's worth a comment so future readers don't wonder why the parity is missing:

Suggested change
if (worker.isScheduledRestart) {
log.info('Restarting worker #%d on schedule', worker.id);
} else {
cluster.fork();
return;
}
if (worker.isScheduledRestart) {
log.info('Restarting worker #%d on schedule', worker.id);
// Scheduled restarts happen well after startup (order of minutes), so the
// startup-failure window has long closed; no setImmediate deferral needed.
cluster.fork();
return;
}


// Give in-flight startup-failure IPC messages one event-loop turn to be
// processed before classifying this as an ordinary runtime crash.
setImmediate(() => {
Comment thread
justin808 marked this conversation as resolved.
Comment thread
justin808 marked this conversation as resolved.
if (abortForStartupFailure()) return;

// TODO: Track last rendering request per worker.id
// TODO: Consider blocking a given rendering request if it kills a worker more than X times
const msg = `Worker ${worker.id} died UNEXPECTEDLY :(, restarting`;
errorReporter.message(msg);
}
// Replace the dead worker:
cluster.fork();
cluster.fork();
});
});

// Schedule regular restarts of workers
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,13 @@ function logLevel(level: string): LevelWithSilent {
}
}

function validatePort(port: number): string | null {
if (!Number.isInteger(port) || !Number.isFinite(port) || port < 0 || port > 65535) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The !Number.isFinite(port) check is redundant here — Number.isInteger already returns false for NaN, Infinity, and -Infinity, so the finite check never catches anything the integer check doesn't.

Suggested change
if (!Number.isInteger(port) || !Number.isFinite(port) || port < 0 || port > 65535) {
if (!Number.isInteger(port) || port < 0 || port > 65535) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

!Number.isFinite(port) is redundant here — Number.isInteger already returns false for NaN, Infinity, and -Infinity, so the finite check can never add a failure that isInteger didn't already catch.

Suggested change
if (!Number.isInteger(port) || !Number.isFinite(port) || port < 0 || port > 65535) {
if (!Number.isInteger(port) || port < 0 || port > 65535) {

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dead condition in port validation function

Low Severity

The validatePort condition !Number.isInteger(port) || !Number.isFinite(port) contains a dead sub-expression. Number.isInteger already implies Number.isFinite — every integer is finite, and Number.isInteger returns false for NaN, Infinity, and -Infinity. So when !Number.isInteger(port) is false (the only time !Number.isFinite is even evaluated), port is guaranteed to be finite, making !Number.isFinite(port) always false at that point. The !Number.isFinite check never affects the result and misleads readers into thinking there is an input class that Number.isInteger alone would miss.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 97784b9. Configure here.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

!Number.isFinite(port) is redundant here. Number.isInteger() already returns false for NaN, Infinity, and -Infinity, so the isFinite guard adds no coverage.

Suggested change
if (!Number.isInteger(port) || !Number.isFinite(port) || port < 0 || port > 65535) {
if (!Number.isInteger(port) || port < 0 || port > 65535) {

return `RENDERER_PORT must be an integer between 0 and 65535. Received: ${String(port)}`;
}
return null;
}

function normalizedRuntimeEnvs() {
return [env.RAILS_ENV, env.NODE_ENV]
.filter((value): value is string => Boolean(value))
Expand Down Expand Up @@ -380,6 +387,17 @@ export function buildConfig(providedUserConfig?: Partial<Config>): Config {
}
});

// Coerce port to a number — user configs frequently pass env-derived strings
// (e.g. `port: env.RENDERER_PORT || 3800` yields the string "3800").
// eslint-disable-next-line @typescript-eslint/no-unnecessary-type-conversion -- runtime value may be string despite the type
config.port = Number(config.port);

const portValidationError = validatePort(config.port);
if (portValidationError) {
log.error(portValidationError);
process.exit(1);
}

if (
'honeybadgerApiKey' in config ||
'sentryDsn' in config ||
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
export const WORKER_STARTUP_FAILURE = 'NODE_RENDERER_WORKER_STARTUP_FAILURE' as const;

export interface WorkerStartupFailureMessage {
type: typeof WORKER_STARTUP_FAILURE;
stage: 'listen';
code?: string;
errno?: number;
syscall?: string;
host: string;
port: number;
message: string;
}

export function isWorkerStartupFailureMessage(value: unknown): value is WorkerStartupFailureMessage {
if (typeof value !== 'object' || value === null) {
return false;
}

const message = value as Partial<WorkerStartupFailureMessage>;

// stage: 'listen' is the only supported stage today. To handle pre-listen
// failures (e.g. plugin registration), add a new stage value here and
// update the master handler accordingly.
return (
message.type === WORKER_STARTUP_FAILURE &&
message.stage === 'listen' &&
Comment thread
justin808 marked this conversation as resolved.
typeof message.host === 'string' &&
Comment thread
justin808 marked this conversation as resolved.
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Handle undefined host in startup-failure message guard

This guard drops startup-failure IPC messages unless host is a string, but buildConfig can still propagate host: undefined from JS callers (for example host: process.env.RENDERER_HOST when unset). In that case, workers still send WORKER_STARTUP_FAILURE on app.listen errors, yet the master rejects the payload here and treats exits as ordinary crashes, re-entering the fork/crash loop this change is meant to prevent (notably on EADDRINUSE). Either normalize/validate host before workers start or allow missing host values in this guard.

Useful? React with 👍 / 👎.

typeof message.port === 'number' &&
Number.isInteger(message.port) &&
message.port >= 0 &&
message.port <= 65535 &&
Comment thread
justin808 marked this conversation as resolved.
typeof message.message === 'string'
);
Comment thread
justin808 marked this conversation as resolved.
}
5 changes: 3 additions & 2 deletions packages/react-on-rails-pro-node-renderer/src/worker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import {
type ProvidedNewBundle,
} from './worker/handleRenderRequest.js';
import handleGracefulShutdown from './worker/handleGracefulShutdown.js';
import { handleStartupListenError } from './worker/startupErrorHandler.js';
import {
badRequestResponseResult,
errorResponseResult,
Expand Down Expand Up @@ -510,8 +511,8 @@ export default function run(config: Partial<Config>) {
if (workersCount === 0 || cluster.isWorker) {
app.listen({ port, host }, (err, address) => {
if (err) {
log.error({ err, host, port }, 'Node renderer failed to start');
process.exit(1);
handleStartupListenError({ err, host, port });
return;
}
const workerName = worker ? `worker #${worker.id}` : 'master (single-process)';
log.info({ workerName, address }, 'Node renderer listening');
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import cluster from 'cluster';
import log from '../shared/log.js';
import { WORKER_STARTUP_FAILURE, type WorkerStartupFailureMessage } from '../shared/workerMessages.js';

export type StartupListenErrorHandlerOptions = {
err: Error;
host: string;
port: number;
isWorker?: boolean;
send?: NodeJS.Process['send'];
exit?: NodeJS.Process['exit'];
};

export function handleStartupListenError({
err,
host,
port,
isWorker = cluster.isWorker,
send,
exit,
}: StartupListenErrorHandlerOptions) {
const sendFn = send ?? process.send?.bind(process);
const exitFn = exit ?? ((code?: number) => process.exit(code));

log.error({ err, host, port }, 'Node renderer failed to start');

if (isWorker) {
if (!sendFn) {
log.error('Cluster worker has no IPC channel; cannot notify master of startup failure');
exitFn(1);
return;
}

const startupFailure: WorkerStartupFailureMessage = {
type: WORKER_STARTUP_FAILURE,
stage: 'listen',
code: (err as NodeJS.ErrnoException).code,
errno: (err as NodeJS.ErrnoException).errno,
syscall: (err as NodeJS.ErrnoException).syscall,
host,
port,
message: err.message,
};
try {
let exited = false;
const doExit = (sendErr?: Error | null) => {
if (exited) return;
exited = true;
if (sendErr) log.error({ err: sendErr }, 'Failed to send startup failure message to master');
exitFn(1);
};
sendFn(startupFailure, undefined, undefined, doExit);
// Safety net: if the IPC channel is half-broken the callback may never
// fire, leaving this worker alive indefinitely. Force exit after a timeout.
const IPC_SEND_TIMEOUT_MS = 2000;
const timer = setTimeout(() => doExit(), IPC_SEND_TIMEOUT_MS);
if (typeof timer.unref === 'function') timer.unref();
Comment on lines +46 to +57
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The timeout timer is never cleared when doExit fires via the IPC callback in the normal path. In production this is harmless (the process exits before the timer fires), and in tests the exited guard makes doExit idempotent — but clearing the timer explicitly inside doExit would be cleaner and avoids leaving a dangling handle:

Suggested change
const doExit = (sendErr?: Error | null) => {
if (exited) return;
exited = true;
if (sendErr) log.error({ err: sendErr }, 'Failed to send startup failure message to master');
exitFn(1);
};
sendFn(startupFailure, undefined, undefined, doExit);
// Safety net: if the IPC channel is half-broken the callback may never
// fire, leaving this worker alive indefinitely. Force exit after a timeout.
const IPC_SEND_TIMEOUT_MS = 2000;
const timer = setTimeout(() => doExit(), IPC_SEND_TIMEOUT_MS);
if (typeof timer.unref === 'function') timer.unref();
let exited = false;
let timer: NodeJS.Timeout | undefined;
const doExit = (sendErr?: Error | null) => {
if (exited) return;
exited = true;
clearTimeout(timer);
if (sendErr) log.error({ err: sendErr }, 'Failed to send startup failure message to master');
exitFn(1);
};
sendFn(startupFailure, undefined, undefined, doExit);
// Safety net: if the IPC channel is half-broken the callback may never
// fire, leaving this worker alive indefinitely. Force exit after a timeout.
const IPC_SEND_TIMEOUT_MS = 2000;
timer = setTimeout(() => doExit(), IPC_SEND_TIMEOUT_MS);
if (typeof timer.unref === 'function') timer.unref();

} catch (sendErr) {
log.error({ err: sendErr as Error }, 'Failed to send startup failure message to master');
exitFn(1);
}
} else {
exitFn(1);
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
describe('configBuilder', () => {
const envVarsToRestore = [
'RENDERER_HOST',
'RENDERER_PORT',
'NODE_ENV',
'RENDERER_PASSWORD',
'RAILS_ENV',
Expand Down Expand Up @@ -113,6 +114,39 @@ describe('configBuilder', () => {
expect(finalSettings.password).toBe('<EMPTY STRING>');
});

describe('port validation', () => {
it('throws when configured port is outside the valid TCP range', () => {
process.env.NODE_ENV = 'development';
process.env.RAILS_ENV = 'development';
const processExit = mockProcessExit();
const { buildConfig, error } = loadConfigBuilderWithMockedLogger();

expect(() => buildConfig({ port: 70000 })).toThrow('process.exit: 1');
expect(processExit).toHaveBeenCalledWith(1);
expect(error).toHaveBeenCalledWith(
'RENDERER_PORT must be an integer between 0 and 65535. Received: 70000',
);
});

it('allows port 0 for ephemeral-port test setups', () => {
process.env.NODE_ENV = 'development';
process.env.RAILS_ENV = 'development';
const { buildConfig } = loadConfigBuilderWithMockedLogger();

expect(buildConfig({ port: 0 }).port).toBe(0);
});

it('coerces a string port from env vars to a number', () => {
process.env.NODE_ENV = 'development';
process.env.RAILS_ENV = 'development';
const { buildConfig } = loadConfigBuilderWithMockedLogger();

// Simulates `port: env.RENDERER_PORT || 3800` where env var is the string "3800"
const config = buildConfig({ port: '3800' as unknown as number });
expect(config.port).toBe(3800);
});
});

describe('password validation in production-like environments', () => {
it('throws when no password is set in production', () => {
process.env.NODE_ENV = 'production';
Expand Down
Loading
Loading