shakacode · justin808 · Apr 7, 2026 · Mar 28, 2026 · Mar 29, 2026 · Mar 29, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -31,6 +31,7 @@ After a release, run `/update-changelog` in Claude Code to analyze commits, writ
 #### Fixed
 
 - **[Pro]** **Fixed TanStack Router SSR hydration mismatches in the async path**: Client hydration now restores server match data before first render, uses `RouterProvider` directly to match the server-rendered tree, and stops the post-hydration load when a custom `router.options.hydrate` callback fails instead of continuing with partially hydrated client state. [PR 2932](https://github.com/shakacode/react_on_rails/pull/2932) by [justin808](https://github.com/justin808).
+- **[Pro]** **Fixed infinite fork loop when node renderer worker fails to bind port**: When a worker failed during `app.listen()` (e.g., `EADDRINUSE`), the master previously reforked unconditionally, causing an infinite fork/crash loop that consumed CPU and filled logs. Workers now send a `WORKER_STARTUP_FAILURE` IPC message to the master before exiting; the master sets an abort flag and exits with a clear error message instead of reforking. Scheduled restarts and runtime crashes continue to refork as before. [PR 2881](https://github.com/shakacode/react_on_rails/pull/2881) by [justin808](https://github.com/justin808).
 
 ### [16.6.0.rc.0] - 2026-04-01
 

diff --git a/packages/react-on-rails-pro-node-renderer/src/master.ts b/packages/react-on-rails-pro-node-renderer/src/master.ts
@@ -10,6 +10,7 @@ import { buildConfig, Config, logSanitizedConfig } from './shared/configBuilder.
 import restartWorkers from './master/restartWorkers.js';
 import * as errorReporter from './shared/errorReporter.js';
 import { getLicenseStatus } from './shared/licenseValidator.js';
+import { isWorkerStartupFailureMessage, type WorkerStartupFailureMessage } from './shared/workerMessages.js';
 
 const MILLISECONDS_IN_MINUTE = 60000;
 // How often to scan for orphaned upload directories.
@@ -77,22 +78,81 @@ export default function masterRun(runningConfig?: Partial<Config>) {
     })();
   }, ORPHAN_CLEANUP_INTERVAL_MS);
 
+  let isAbortingForStartupFailure = false;
+  let fatalStartupFailure: { workerId: number; failure: WorkerStartupFailureMessage } | null = null;
+  let hasInitiatedShutdown = false;
+
+  /**
+   * Starts master shutdown if a worker has reported a startup failure.
+   *
+   * @returns `true` when a startup failure has been recorded (the caller must
+   *   not fork a replacement worker), `false` otherwise. The shutdown sequence
+   *   itself is started only by the first call that returns `true`.
+   */
+  const abortForStartupFailure = (): boolean => {
+    if (!(isAbortingForStartupFailure && fatalStartupFailure)) return false;
+
+    if (!hasInitiatedShutdown) {
+      hasInitiatedShutdown = true;
+      // Note: the exiting worker may differ from the one that sent the
+      // failure message if multiple workers exit in rapid succession.
+      // We always report the first failure received.
+      const { failure, workerId: failedWorkerId } = fatalStartupFailure;
+      const msg =
+        failure.code === 'EADDRINUSE'
+          ? `Node renderer startup failed: ${failure.host}:${failure.port} is already in use`
+          : `Node renderer startup failed in worker ${failedWorkerId}: ${failure.message}`;
+
+      errorReporter.message(msg);
+      // Disconnect all live workers so they release their ports before the
+      // master exits. cluster.disconnect() is async — the callback fires
+      // once every worker has disconnected. A hard-deadline timer guarantees
+      // the master still exits if a worker is stuck (leaked handle, blocking
+      // syscall, etc.), following the same pattern as restartWorkers.ts.
+      const MASTER_SHUTDOWN_TIMEOUT_MS = 5000;
+      const shutdownTimer = setTimeout(() => process.exit(1), MASTER_SHUTDOWN_TIMEOUT_MS);
+      if (typeof shutdownTimer.unref === 'function') shutdownTimer.unref();
+      cluster.disconnect(() => {
+        clearTimeout(shutdownTimer);
+        process.exit(1);
+      });
+    }
+
+    // A failure is on record: tell the caller to stop forking.
+    return true;
+  };
+
+  // Record the first startup failure any worker reports over IPC; later
+  // reports are ignored so the first failure is the one that gets reported.
+  cluster.on('message', (worker, message) => {
+    // Check the abort flag first to short-circuit the type-guard on every
+    // ordinary IPC message once we are already aborting.
+    if (isAbortingForStartupFailure || !isWorkerStartupFailureMessage(message)) return;
+
+    isAbortingForStartupFailure = true;
+    fatalStartupFailure = { workerId: worker.id, failure: message };
+  });
+
   for (let i = 0; i < workersCount; i += 1) {
     cluster.fork();
   }
 
   // Listen for dying workers:
   cluster.on('exit', (worker) => {
+    // Once a startup failure has been detected, abort regardless of whether
+    // this particular exit was from the failing worker, a scheduled restart,
+    // or an unrelated crash. Don't fork any more workers.
+    if (abortForStartupFailure()) {
+      return;
+    }
+
     if (worker.isScheduledRestart) {
       log.info('Restarting worker #%d on schedule', worker.id);
-    } else {
+      // Scheduled restarts happen well after startup (order of minutes), so the
+      // startup-failure window has long closed; no setImmediate deferral needed.
+      cluster.fork();
+      return;
+    }
+
+    // Give in-flight startup-failure IPC messages one event-loop turn to be
+    // processed before classifying this as an ordinary runtime crash.
+    setImmediate(() => {
+      if (abortForStartupFailure()) return;
+
       // TODO: Track last rendering request per worker.id
       // TODO: Consider blocking a given rendering request if it kills a worker more than X times
       const msg = `Worker ${worker.id} died UNEXPECTEDLY :(, restarting`;
       errorReporter.message(msg);
-    }
-    // Replace the dead worker:
-    cluster.fork();
+      cluster.fork();
+    });
   });
 
   // Schedule regular restarts of workers

diff --git a/packages/react-on-rails-pro-node-renderer/src/shared/configBuilder.ts b/packages/react-on-rails-pro-node-renderer/src/shared/configBuilder.ts
@@ -142,6 +142,13 @@ function logLevel(level: string): LevelWithSilent {
   }
 }
 
+/**
+ * Validates a renderer port number.
+ *
+ * @param port - Port value to check.
+ * @returns An error message when `port` is not an integer in the range
+ *   0–65535, otherwise `null`.
+ */
+function validatePort(port: number): string | null {
+  // Number.isInteger already rejects NaN and ±Infinity, so a separate
+  // Number.isFinite check would be redundant.
+  if (!Number.isInteger(port) || port < 0 || port > 65535) {
+    return `RENDERER_PORT must be an integer between 0 and 65535. Received: ${String(port)}`;
+  }
+  return null;
+}
+
 function normalizedRuntimeEnvs() {
   return [env.RAILS_ENV, env.NODE_ENV]
     .filter((value): value is string => Boolean(value))
@@ -380,6 +387,17 @@ export function buildConfig(providedUserConfig?: Partial<Config>): Config {
     }
   });
 
+  // Coerce port to a number — user configs frequently pass env-derived strings
+  // (e.g. `port: env.RENDERER_PORT || 3800` yields the string "3800").
+  // eslint-disable-next-line @typescript-eslint/no-unnecessary-type-conversion -- runtime value may be string despite the type
+  config.port = Number(config.port);
+
+  const portValidationError = validatePort(config.port);
+  if (portValidationError) {
+    log.error(portValidationError);
+    process.exit(1);
+  }
+
   if (
     'honeybadgerApiKey' in config ||
     'sentryDsn' in config ||

diff --git a/packages/react-on-rails-pro-node-renderer/src/shared/workerMessages.ts b/packages/react-on-rails-pro-node-renderer/src/shared/workerMessages.ts
@@ -0,0 +1,34 @@
+/** Value of the `type` field that marks a worker startup-failure IPC message. */
+export const WORKER_STARTUP_FAILURE = 'NODE_RENDERER_WORKER_STARTUP_FAILURE' as const;
+
+/**
+ * IPC payload describing a worker startup failure.
+ *
+ * `code`, `errno` and `syscall` are optional error details; `host` and `port`
+ * identify the address involved and `message` carries the error text.
+ */
+export interface WorkerStartupFailureMessage {
+  type: typeof WORKER_STARTUP_FAILURE;
+  stage: 'listen';
+  code?: string;
+  errno?: number;
+  syscall?: string;
+  host: string;
+  port: number;
+  message: string;
+}
+
+/**
+ * Type guard for {@link WorkerStartupFailureMessage}.
+ *
+ * Checks the required fields only (`type`, `stage`, `host`, `port`, `message`);
+ * the optional `code`, `errno` and `syscall` fields are not inspected.
+ *
+ * @param value - Arbitrary value, e.g. a message received over IPC.
+ * @returns `true` when `value` has the shape of a startup-failure message.
+ */
+export function isWorkerStartupFailureMessage(value: unknown): value is WorkerStartupFailureMessage {
+  if (value === null || typeof value !== 'object') return false;
+
+  const { type, stage, host, port, message } = value as Partial<WorkerStartupFailureMessage>;
+
+  // stage: 'listen' is the only supported stage today. To handle pre-listen
+  // failures (e.g. plugin registration), add a new stage value here and
+  // update the master handler accordingly.
+  if (type !== WORKER_STARTUP_FAILURE || stage !== 'listen') return false;
+  if (typeof host !== 'string' || typeof message !== 'string') return false;
+
+  // The port must be an integer in the range 0–65535.
+  return typeof port === 'number' && Number.isInteger(port) && port >= 0 && port <= 65535;
+}
diff --git a/packages/react-on-rails-pro-node-renderer/src/worker.ts b/packages/react-on-rails-pro-node-renderer/src/worker.ts
@@ -23,6 +23,7 @@ import {
   type ProvidedNewBundle,
 } from './worker/handleRenderRequest.js';
 import handleGracefulShutdown from './worker/handleGracefulShutdown.js';
+import { handleStartupListenError } from './worker/startupErrorHandler.js';
 import {
   badRequestResponseResult,
   errorResponseResult,
@@ -510,8 +511,8 @@ export default function run(config: Partial<Config>) {
   if (workersCount === 0 || cluster.isWorker) {
     app.listen({ port, host }, (err, address) => {
       if (err) {
-        log.error({ err, host, port }, 'Node renderer failed to start');
-        process.exit(1);
+        handleStartupListenError({ err, host, port });
+        return;
       }
       const workerName = worker ? `worker #${worker.id}` : 'master (single-process)';
       log.info({ workerName, address }, 'Node renderer listening');

diff --git a/packages/react-on-rails-pro-node-renderer/src/worker/startupErrorHandler.ts b/packages/react-on-rails-pro-node-renderer/src/worker/startupErrorHandler.ts
@@ -0,0 +1,65 @@
+import cluster from 'cluster';
+import log from '../shared/log.js';
+import { WORKER_STARTUP_FAILURE, type WorkerStartupFailureMessage } from '../shared/workerMessages.js';
+
+/**
+ * Options for `handleStartupListenError`.
+ *
+ * `err`, `host` and `port` describe the failed listen attempt. `isWorker`,
+ * `send` and `exit` are optional overrides for `cluster.isWorker`,
+ * `process.send` and `process.exit` respectively.
+ */
+export type StartupListenErrorHandlerOptions = {
+  err: Error;
+  host: string;
+  port: number;
+  isWorker?: boolean;
+  send?: NodeJS.Process['send'];
+  exit?: NodeJS.Process['exit'];
+};
+
+/**
+ * Handles an error reported by `app.listen()` during renderer startup.
+ *
+ * Always logs the error. Outside a cluster worker it then exits with code 1.
+ * Inside a cluster worker it sends a `WORKER_STARTUP_FAILURE` message to the
+ * master and exits with code 1 once the send callback fires, the send throws,
+ * or the safety timeout elapses — whichever happens first.
+ *
+ * @param options - Error details plus optional overrides for `isWorker`,
+ *   `send` and `exit` (defaults: `cluster.isWorker`, `process.send`,
+ *   `process.exit`).
+ */
+export function handleStartupListenError({
+  err,
+  host,
+  port,
+  isWorker = cluster.isWorker,
+  send,
+  exit,
+}: StartupListenErrorHandlerOptions) {
+  const sendFn = send ?? process.send?.bind(process);
+  const exitFn = exit ?? ((code?: number) => process.exit(code));
+
+  log.error({ err, host, port }, 'Node renderer failed to start');
+
+  if (isWorker) {
+    if (!sendFn) {
+      log.error('Cluster worker has no IPC channel; cannot notify master of startup failure');
+      exitFn(1);
+      return;
+    }
+
+    const startupFailure: WorkerStartupFailureMessage = {
+      type: WORKER_STARTUP_FAILURE,
+      stage: 'listen',
+      code: (err as NodeJS.ErrnoException).code,
+      errno: (err as NodeJS.ErrnoException).errno,
+      syscall: (err as NodeJS.ErrnoException).syscall,
+      host,
+      port,
+      message: err.message,
+    };
+
+    let exited = false;
+    let timer: NodeJS.Timeout | undefined;
+    // Exits at most once, whether triggered by the send callback or the timeout.
+    const doExit = (sendErr?: Error | null) => {
+      if (exited) return;
+      exited = true;
+      clearTimeout(timer);
+      if (sendErr) log.error({ err: sendErr }, 'Failed to send startup failure message to master');
+      exitFn(1);
+    };
+
+    // Safety net: if the IPC channel is half-broken the callback may never
+    // fire, leaving this worker alive indefinitely. Force exit after a timeout.
+    // Armed before sending so that a callback invoked synchronously can clear it.
+    const IPC_SEND_TIMEOUT_MS = 2000;
+    timer = setTimeout(() => doExit(), IPC_SEND_TIMEOUT_MS);
+    if (typeof timer.unref === 'function') timer.unref();
+
+    try {
+      sendFn(startupFailure, undefined, undefined, doExit);
+    } catch (sendErr) {
+      // Synchronous send failure: the callback will not run, so exit here.
+      clearTimeout(timer);
+      log.error({ err: sendErr as Error }, 'Failed to send startup failure message to master');
+      exitFn(1);
+    }
+  } else {
+    exitFn(1);
+  }
+}
diff --git a/packages/react-on-rails-pro-node-renderer/tests/configBuilder.test.ts b/packages/react-on-rails-pro-node-renderer/tests/configBuilder.test.ts
@@ -1,6 +1,7 @@
 describe('configBuilder', () => {
   const envVarsToRestore = [
     'RENDERER_HOST',
+    'RENDERER_PORT',
     'NODE_ENV',
     'RENDERER_PASSWORD',
     'RAILS_ENV',
@@ -113,6 +114,39 @@ describe('configBuilder', () => {
     expect(finalSettings.password).toBe('<EMPTY STRING>');
   });
 
+  describe('port validation', () => {
+    it('throws when configured port is outside the valid TCP range', () => {
+      process.env.NODE_ENV = 'development';
+      process.env.RAILS_ENV = 'development';
+      const processExit = mockProcessExit();
+      const { buildConfig, error } = loadConfigBuilderWithMockedLogger();
+
+      expect(() => buildConfig({ port: 70000 })).toThrow('process.exit: 1');
+      expect(processExit).toHaveBeenCalledWith(1);
+      expect(error).toHaveBeenCalledWith(
+        'RENDERER_PORT must be an integer between 0 and 65535. Received: 70000',
+      );
+    });
+
+    it('allows port 0 for ephemeral-port test setups', () => {
+      process.env.NODE_ENV = 'development';
+      process.env.RAILS_ENV = 'development';
+      const { buildConfig } = loadConfigBuilderWithMockedLogger();
+
+      expect(buildConfig({ port: 0 }).port).toBe(0);
+    });
+
+    it('coerces a string port from env vars to a number', () => {
+      process.env.NODE_ENV = 'development';
+      process.env.RAILS_ENV = 'development';
+      const { buildConfig } = loadConfigBuilderWithMockedLogger();
+
+      // Simulates `port: env.RENDERER_PORT || 3800` where env var is the string "3800"
+      const config = buildConfig({ port: '3800' as unknown as number });
+      expect(config.port).toBe(3800);
+    });
+  });
+
   describe('password validation in production-like environments', () => {
     it('throws when no password is set in production', () => {
       process.env.NODE_ENV = 'production';