[https://nvbugs/6336747][fix] Fail fast when executor worker stalls

2ez4bz · 2ez4bz · commit 1832ab34b46e · 2026-06-23T16:00:25.000-07:00
* Why?

A stuck or disconnected executor worker left the proxy blocked
indefinitely: the request queue uses an unbounded send HWM with no
send timeout, so request_queue.put -> socket.send never returned once
the worker stopped draining, and the error monitor never tripped. In
CI this could surface as a ~1h hang ending in an opaque timeout kill.
The stall itself is non-deterministic and not yet root-caused.

* What?

Make the failure fast and legible instead:

- Bound request submission: poll the socket for send-readiness and
check worker liveness, raising RequestError if the worker has not
accepted the request within a timeout.
- Add a progress watchdog to the error monitor that marks the worker
stalled and aborts in-flight requests when no result arrives while
requests are outstanding.
- Honor the previously-ignored timeout in GenerationResult.result()
and bound the per-request wait in the VideoMME evaluator.
- On a detected stall, signal the worker (SIGUSR1/faulthandler) to
dump all thread stacks so the next occurrence is diagnosable.

This mitigates the hang and captures worker state; it does not fix
the underlying intermittent stall.

Signed-off-by: William Zhang <133824995+2ez4bz@users.noreply.github.com>
diff --git a/tensorrt_llm/executor/ipc.py b/tensorrt_llm/executor/ipc.py
@@ -181,6 +181,22 @@ def poll(self, timeout: int) -> bool:
         else:
             return False
 
+    def poll_send(self, timeout: float) -> bool:
+        """Return True if a message can be sent within *timeout* seconds.
+
+        Args:
+            timeout (float): Timeout in seconds; truncated to whole milliseconds for the poll.
+
+        Returns:
+            bool: True if the socket reported POLLOUT within the timeout, otherwise False. For a
+            PAIR socket whose peer has disconnected, the socket enters mute state and this returns
+            `False`, letting callers detect a dead peer rather than blocking forever in `send()`.
+        """
+        self.setup_lazily()
+        self._check_thread_safety()
+        return bool(
+            self.socket.poll(timeout=int(timeout * 1000), flags=zmq.POLLOUT))
+
     def put(self, obj: Any, routing_id: Optional[bytes] = None):
         self.setup_lazily()
         self._check_thread_safety()
diff --git a/tensorrt_llm/executor/proxy.py b/tensorrt_llm/executor/proxy.py
@@ -16,7 +16,9 @@
 import concurrent.futures
 import json
 import os
+import signal
 import threading
+import time
 import weakref
 from queue import Empty
 from typing import Dict, List, Optional, Union
@@ -122,6 +124,23 @@ def __init__(
 
         self._results: Dict[int, GenerationResult] = {}
 
+        # --- liveness / stall detection state ---
+        # Time of the last sign of worker progress (a request submitted or a result received). The
+        # error monitor uses this to detect a worker that has silently stopped servicing requests.
+        self._last_progress_time = time.monotonic()
+        # Max time to wait for the worker to accept a submitted request before declaring it
+        # dead/stalled. With an unbounded send HWM the send only blocks when the worker disconnected
+        # or stopped draining.
+        self._submit_timeout_secs = float(
+            os.environ.get("TLLM_EXECUTOR_SUBMIT_TIMEOUT_SECS", "300"))
+        # Max time with requests in flight but no result before treating the worker as stalled.
+        self._stall_timeout_secs = float(
+            os.environ.get("TLLM_EXECUTOR_STALL_TIMEOUT_SECS", "300"))
+        # PID of the leader worker process, learned from the init handshake; used to request a
+        # thread-stack dump (SIGUSR1) when a stall is detected. Stays `None` for remote/out-of-host
+        # worker sessions.
+        self._worker_pid: Optional[int] = None
+
         self.model_world_size = model_world_size
 
         _llm_args = worker_kwargs.get("llm_args", None)
@@ -262,13 +281,50 @@ def _error_monitor_loop(self) -> None:
                 self._drain_error_queue()
                 if self._fatal_error is not None:
                     return
+
+                # Progress watchdog: a worker that silently stops servicing requests (no result for
+                # a long time while requests are in flight) is treated as a fatal stall so callers
+                # fail fast instead of hanging indefinitely.
+                if self._results and (
+                        time.monotonic() -
+                        self._last_progress_time) > self._stall_timeout_secs:
+                    logger.error(
+                        f"Error monitor: no result progress for {self._stall_timeout_secs:.2f}s "
+                        f"with {len(self._results)} request(s) in flight; "
+                        "treating worker as stalled.")
+                    self._maybe_dump_worker_traceback()
+                    self._set_fatal_error(
+                        RuntimeError(
+                            f"Worker stalled: no result for {self._stall_timeout_secs:.2f}s "
+                            f"with {len(self._results)} request(s) in flight."))
+                    self.pre_shutdown()
+                    return
             except Exception as exc:
                 logger.debug(f"Error monitor: unexpected exception (ignored): "
                              f"{exc!r}")
 
             # Wait up to 5s, but wake immediately if _shutdown_event is set
             self._shutdown_event.wait(timeout=5.0)
 
+    def _maybe_dump_worker_traceback(self) -> None:
+        """Best-effort: ask the worker process to dump all thread stacks.
+
+        The worker registers a SIGUSR1 handler (faulthandler) that dumps every thread's traceback to
+        its stderr / traceback file; that state is otherwise lost when the process is killed at
+        teardown. No-op if `_worker_pid` is None or SIGUSR1 is unavailable; `OSError` from `os.kill`
+        is only debug-logged. Works only for a same-host worker (the `MpiPoolSession` case).
+        """
+        pid = self._worker_pid
+        if pid is None or not hasattr(signal, "SIGUSR1"):
+            return
+        try:
+            os.kill(pid, signal.SIGUSR1)
+            logger.error(
+                f"Sent SIGUSR1 to worker pid {pid} requesting a thread-stack "
+                f"dump for stall diagnosis.")
+        except OSError as e:
+            logger.debug(f"Could not signal worker pid {pid}: {e!r}")
+
     def _setup_queues(self) -> WorkerCommIpcAddrs:
 
         self.request_queue = IpcQueue(is_server=True,
@@ -319,6 +375,9 @@ def dispatch_result_task(self) -> bool:
         if (res := self.result_queue.get()) is None:
             return False  # shutdown the thread
 
+        # A result arrived: the worker is making progress.
+        self._last_progress_time = time.monotonic()
+
         async_queues = []
         event_loop = None
 
@@ -421,7 +480,8 @@ def mpi_done_callback(future: concurrent.futures.Future):
 
         while True:
             if self.worker_init_status_queue.poll(1):
-                ready_signal, error_trace = self.worker_init_status_queue.get()
+                ready_signal, ready_payload = self.worker_init_status_queue.get(
+                )
                 # Send ACK to the worker
                 self.worker_init_status_queue.put("ACK")
                 logger.info("get signal from executor worker")
@@ -432,11 +492,18 @@ def mpi_done_callback(future: concurrent.futures.Future):
             self._handle_background_error()
 
         if ready_signal != GenerationExecutorProxy.READY_SIGNAL:
-            logger.error(f"Executor worker initialization error: {error_trace}")
+            # On the error path the payload is the worker's traceback string.
+            logger.error(
+                f"Executor worker initialization error: {ready_payload}")
             self.mpi_session.shutdown_abort(reason=ready_signal)
             raise RuntimeError(
                 "Executor worker returned error") from ready_signal
 
+        # On success the worker sends its PID as the payload so we can signal it (SIGUSR1 ->
+        # thread-stack dump) if it later stalls.
+        if isinstance(ready_payload, int):
+            self._worker_pid = ready_payload
+
     def _abort_all_requests(self):
         # The results can be finished during this loop, so self._results may be changed.
         for result in list(self._results.values()):
@@ -549,12 +616,41 @@ def submit(self, request: GenerationRequest) -> GenerationResult:
         self._results[request.id] = result
 
         with nvtx_range_debug("request_queue.put"):
-            self.request_queue.put(request)
+            self._submit_request(request)
 
         self._handle_background_error()
 
         return result
 
+    def _submit_request(self, request: GenerationRequest) -> None:
+        """Send *request* to the worker, waiting at most `_submit_timeout_secs` to become sendable.
+
+        Raises RequestError if the worker exits or the deadline passes before the socket is writable.
+        """
+        # With an unbounded send HWM, `socket.send` only blocks when the worker has disconnected
+        # (PAIR mute state) or has stopped draining. So poll for send-readiness in 1s slices and,
+        # between slices, surface recorded errors and check worker liveness. Only the deadline path
+        # records a fatal error and calls `pre_shutdown()`; the worker-exit path just raises.
+        deadline = time.monotonic() + self._submit_timeout_secs
+        while not self.request_queue.poll_send(timeout=1.0):
+            # Surface any error already recorded by the monitor / callbacks.
+            self._handle_background_error()
+            if self._check_mpi_futures():
+                raise RequestError(
+                    "Executor worker exited before the request could be "
+                    "submitted.")
+            if time.monotonic() >= deadline:
+                self._maybe_dump_worker_traceback()
+                err = RequestError(
+                    f"Worker did not accept request {request.id} within "
+                    f"{self._submit_timeout_secs:.0f}s; it appears stalled or "
+                    f"disconnected.")
+                self._set_fatal_error(err)
+                self.pre_shutdown()
+                raise err
+        self._last_progress_time = time.monotonic()
+        self.request_queue.put(request)
+
     def collective_rpc(
         self,
         method: str,
diff --git a/tensorrt_llm/executor/result.py b/tensorrt_llm/executor/result.py
@@ -965,7 +965,12 @@ def _handle_ray_response(self, response: Any):
         return response
 
     def _result_step(self, timeout: Optional[float] = None):
-        response = self.queue.get()
+        try:
+            response = self.queue.get(timeout=timeout)
+        except Empty:
+            raise TimeoutError(
+                f"Request {self.request_id} timed out after {timeout}s "
+                f"waiting for a response from the executor worker.")
         self._handle_response(response)
 
     async def _aresult_step(self):
diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py
@@ -1,5 +1,8 @@
+import faulthandler
 import gc
 import os
+import signal
+import sys
 import threading
 import time
 import traceback
@@ -185,6 +188,30 @@ def _print_stacks():
                                                daemon=True)
         print_stacks_thread.start()
 
+    # Install a faulthandler so the worker's thread stacks can be captured on demand (SIGUSR1, e.g.
+    # triggered by the proxy's stall watchdog) or on a fatal signal. A stalled worker's state is
+    # otherwise lost when it is killed at teardown. Optionally dump to a durable file (when worker
+    # stderr is not captured) via TLLM_WORKER_TRACEBACK_DIR.
+    faulthandler.enable()
+    _traceback_file = sys.stderr
+    _traceback_dir = os.getenv("TLLM_WORKER_TRACEBACK_DIR")
+    if _traceback_dir:
+        try:
+            os.makedirs(_traceback_dir, exist_ok=True)
+            _traceback_file = open(
+                os.path.join(_traceback_dir,
+                             f"worker_traceback_{os.getpid()}.log"), "a")
+        except OSError:
+            _traceback_file = sys.stderr
+    if hasattr(signal, "SIGUSR1"):
+        try:
+            faulthandler.register(signal.SIGUSR1,
+                                  file=_traceback_file,
+                                  all_threads=True,
+                                  chain=False)
+        except (ValueError, OSError):
+            pass
+
     mpi_comm().barrier()
 
     if llm_args is not None and llm_args.env_overrides:
@@ -335,8 +362,9 @@ def notify_proxy_threads_to_quit():
                 else:
                     worker.set_result_queue(result_queue)
 
-                # Send ready signal with confirmation
-                ready_msg = (ready_signal, None)
+                # Send ready signal with confirmation. The payload carries the worker PID so the
+                # proxy can signal it (SIGUSR1 -> thread-stack dump) if it later stalls.
+                ready_msg = (ready_signal, os.getpid())
                 if not worker_init_status_queue.notify_with_retry(ready_msg):
                     logger.warning(
                         "Failed to deliver ready signal to proxy, continuing anyway"
diff --git a/tests/integration/defs/accuracy/video_mme.py b/tests/integration/defs/accuracy/video_mme.py
@@ -151,7 +151,13 @@ def evaluate(
                     streaming=streaming,
                 )
             )
-        outputs = [future.result() for future in tqdm(futures, desc="Fetching responses")]
+        # Bound the per-request wait so a stalled/dead worker fails the test fast instead of hanging
+        # until the outer CI timeout. No healthy single request should come close to this budget.
+        result_timeout = 300.0
+        outputs = [
+            future.result(timeout=result_timeout)
+            for future in tqdm(futures, desc="Fetching responses")
+        ]
 
         if self.output_dir:
             dump_inference_results(self.output_dir, outputs, getattr(llm, "tokenizer", None))

Original file line number	Diff line number	Diff line change
`@@ -151,7 +151,13 @@ def evaluate(`
`151`	`151`	`streaming=streaming,`
`152`	`152`	`)`
`153`	`153`	`)`
`154`		`- outputs = [future.result() for future in tqdm(futures, desc="Fetching responses")]`
	`154`	`+ # Bound the per-request wait so a stalled/dead worker fails the test fast instead of hanging`
	`155`	`+ # until the outer CI timeout. No healthy single request should come close to this budget.`
	`156`	`+ result_timeout = 300.0`
	`157`	`+ outputs = [`
	`158`	`+ future.result(timeout=result_timeout)`
	`159`	`+ for future in tqdm(futures, desc="Fetching responses")`
	`160`	`+ ]`
`155`	`161`
`156`	`162`	`if self.output_dir:`
`157`	`163`	`dump_inference_results(self.output_dir, outputs, getattr(llm, "tokenizer", None))`