1616import concurrent .futures
1717import json
1818import os
19+ import signal
1920import threading
21+ import time
2022import weakref
2123from queue import Empty
2224from typing import Dict , List , Optional , Union
@@ -122,6 +124,23 @@ def __init__(
122124
123125 self ._results : Dict [int , GenerationResult ] = {}
124126
127+ # --- liveness / stall detection state ---
128+ # Time of the last sign of worker progress (a request submitted or a result received). The
129+ # error monitor uses this to detect a worker that has silently stopped servicing requests.
130+ self ._last_progress_time = time .monotonic ()
131+ # Max time to wait for the worker to accept a submitted request before declaring it
132+ # dead/stalled. With an unbounded send HWM the send only blocks when the worker disconnected
133+ # or stopped draining.
134+ self ._submit_timeout_secs = float (
135+ os .environ .get ("TLLM_EXECUTOR_SUBMIT_TIMEOUT_SECS" , "300" ))
136+ # Max time with requests in flight but no result before treating the worker as stalled.
137+ self ._stall_timeout_secs = float (
138+ os .environ .get ("TLLM_EXECUTOR_STALL_TIMEOUT_SECS" , "300" ))
139+ # PID of the leader worker process, learned from the init handshake; used to request a
140+ # thread-stack dump (SIGUSR1) when a stall is detected. Stays `None` for remote/out-of-host
141+ # worker sessions.
142+ self ._worker_pid : Optional [int ] = None
143+
125144 self .model_world_size = model_world_size
126145
127146 _llm_args = worker_kwargs .get ("llm_args" , None )
@@ -262,13 +281,50 @@ def _error_monitor_loop(self) -> None:
262281 self ._drain_error_queue ()
263282 if self ._fatal_error is not None :
264283 return
284+
285+ # Progress watchdog: a worker that silently stops servicing requests (no result for
286+ # a long time while requests are in flight) is treated as a fatal stall so callers
287+ # fail fast instead of hanging indefinitely.
288+ if self ._results and (
289+ time .monotonic () -
290+ self ._last_progress_time ) > self ._stall_timeout_secs :
291+ logger .error (
292+ f"Error monitor: no result progress for { self ._stall_timeout_secs :.2f} s "
293+ f"with { len (self ._results )} request(s) in flight; "
294+ "treating worker as stalled." )
295+ self ._maybe_dump_worker_traceback ()
296+ self ._set_fatal_error (
297+ RuntimeError (
298+ f"Worker stalled: no result for { self ._stall_timeout_secs :.2f} s "
299+ f"with { len (self ._results )} request(s) in flight." ))
300+ self .pre_shutdown ()
301+ return
265302 except Exception as exc :
266303 logger .debug (f"Error monitor: unexpected exception (ignored): "
267304 f"{ exc !r} " )
268305
269306 # Wait up to 5s, but wake immediately if _shutdown_event is set
270307 self ._shutdown_event .wait (timeout = 5.0 )
271308
309+ def _maybe_dump_worker_traceback (self ) -> None :
310+ """Best-effort: ask the worker process to dump all thread stacks.
311+
312+ The worker registers a SIGUSR1 handler (faulthandler) that writes a full traceback of every
313+ thread to its stderr / traceback file. This can be useful for diagnosing a stalled worker,
314+ whose state is otherwise lost when the process is killed at teardown. Only works when the
315+ worker runs on the same host (the in-process `MpiPoolSession` case).
316+ """
317+ pid = self ._worker_pid
318+ if pid is None or not hasattr (signal , "SIGUSR1" ):
319+ return
320+ try :
321+ os .kill (pid , signal .SIGUSR1 )
322+ logger .error (
323+ f"Sent SIGUSR1 to worker pid { pid } requesting a thread-stack "
324+ f"dump for stall diagnosis." )
325+ except OSError as e :
326+ logger .debug (f"Could not signal worker pid { pid } : { e !r} " )
327+
272328 def _setup_queues (self ) -> WorkerCommIpcAddrs :
273329
274330 self .request_queue = IpcQueue (is_server = True ,
@@ -319,6 +375,9 @@ def dispatch_result_task(self) -> bool:
319375 if (res := self .result_queue .get ()) is None :
320376 return False # shutdown the thread
321377
378+ # A result arrived: the worker is making progress.
379+ self ._last_progress_time = time .monotonic ()
380+
322381 async_queues = []
323382 event_loop = None
324383
@@ -421,7 +480,8 @@ def mpi_done_callback(future: concurrent.futures.Future):
421480
422481 while True :
423482 if self .worker_init_status_queue .poll (1 ):
424- ready_signal , error_trace = self .worker_init_status_queue .get ()
483+ ready_signal , ready_payload = self .worker_init_status_queue .get (
484+ )
425485 # Send ACK to the worker
426486 self .worker_init_status_queue .put ("ACK" )
427487 logger .info ("get signal from executor worker" )
@@ -432,11 +492,18 @@ def mpi_done_callback(future: concurrent.futures.Future):
432492 self ._handle_background_error ()
433493
434494 if ready_signal != GenerationExecutorProxy .READY_SIGNAL :
435- logger .error (f"Executor worker initialization error: { error_trace } " )
495+ # On the error path the payload is the worker's traceback string.
496+ logger .error (
497+ f"Executor worker initialization error: { ready_payload } " )
436498 self .mpi_session .shutdown_abort (reason = ready_signal )
437499 raise RuntimeError (
438500 "Executor worker returned error" ) from ready_signal
439501
502+ # On success the worker sends its PID as the payload so we can signal it (SIGUSR1 ->
503+ # thread-stack dump) if it later stalls.
504+ if isinstance (ready_payload , int ):
505+ self ._worker_pid = ready_payload
506+
440507 def _abort_all_requests (self ):
441508 # The results can be finished during this loop, so self._results may be changed.
442509 for result in list (self ._results .values ()):
@@ -549,12 +616,41 @@ def submit(self, request: GenerationRequest) -> GenerationResult:
549616 self ._results [request .id ] = result
550617
551618 with nvtx_range_debug ("request_queue.put" ):
552- self .request_queue . put (request )
619+ self ._submit_request (request )
553620
554621 self ._handle_background_error ()
555622
556623 return result
557624
625+ def _submit_request (self , request : GenerationRequest ) -> None :
626+ """Send a request to the worker with a bounded wait.
627+
628+ This is so a dead or stuck worker surfaces as a fast error instead of blocking forever.
629+ """
630+ # With an unbounded send HWM, `socket.send` only blocks when the worker has disconnected
631+ # (PAIR mute state) or has stopped draining. We poll for send-readiness and, while we cannot
632+ # send, check whether the worker has died or a fatal error was recorded, giving up after
633+ # `_submit_timeout_secs`.
634+ deadline = time .monotonic () + self ._submit_timeout_secs
635+ while not self .request_queue .poll_send (timeout = 1.0 ):
636+ # Surface any error already recorded by the monitor / callbacks.
637+ self ._handle_background_error ()
638+ if self ._check_mpi_futures ():
639+ raise RequestError (
640+ "Executor worker exited before the request could be "
641+ "submitted." )
642+ if time .monotonic () >= deadline :
643+ self ._maybe_dump_worker_traceback ()
644+ err = RequestError (
645+ f"Worker did not accept request { request .id } within "
646+ f"{ self ._submit_timeout_secs :.0f} s; it appears stalled or "
647+ f"disconnected." )
648+ self ._set_fatal_error (err )
649+ self .pre_shutdown ()
650+ raise err
651+ self ._last_progress_time = time .monotonic ()
652+ self .request_queue .put (request )
653+
558654 def collective_rpc (
559655 self ,
560656 method : str ,
0 commit comments