3636from ..llmapi .utils import (AsyncQueue , ManagedThread , _SyncQueue ,
3737 enable_llm_debug , logger_debug , print_colored )
3838from .executor import GenerationExecutor
39+ from .heartbeat import worker_heartbeat_timed_out
3940from .ipc import FusedIpcQueue , IpcQueue
4041from .postproc_worker import PostprocWorker , PostprocWorkerConfig
4142from .request import CancellingRequest , GenerationRequest
4243from .result import GenerationResult , IterationResult
4344from .rpc import RPCClient
4445from .rpc .rpc_common import RPCError , get_unique_ipc_addr
4546from .utils import (ErrorResponse , RequestError , WorkerCommIpcAddrs ,
46- create_mpi_comm_session , get_spawn_proxy_process_env ,
47- is_llm_response , print_alive_threads )
47+ WorkerHeartbeat , create_mpi_comm_session ,
48+ get_spawn_proxy_process_env , is_llm_response ,
49+ print_alive_threads )
4850from .worker import GenerationExecutorWorker , worker_main
4951
5052__all__ = [
@@ -125,17 +127,28 @@ def __init__(
125127 self ._results : Dict [int , GenerationResult ] = {}
126128
127129 # --- liveness / stall detection state ---
128- # Time of the last sign of worker progress (a request submitted or a result received). The
129- # error monitor uses this to detect a worker that has silently stopped servicing requests.
130- self ._last_progress_time = time .monotonic ()
130+ # Time of the last result and worker heartbeat. Long non-streaming requests can legitimately
131+ # go quiet on the result queue, so fatal stall detection is based on heartbeats from the
132+ # worker response-polling thread rather than result traffic.
133+ self ._last_result_time = time .monotonic ()
134+ self ._last_worker_heartbeat_time = self ._last_result_time
131135 # Max time to wait for the worker to accept a submitted request before declaring it
132136 # dead/stalled. With an unbounded send HWM the send only blocks when the worker disconnected
133137 # or stopped draining.
134138 self ._submit_timeout_secs = float (
135139 os .environ .get ("TLLM_EXECUTOR_SUBMIT_TIMEOUT_SECS" , "300" ))
136- # Max time with requests in flight but no result before treating the worker as stalled.
137- self ._stall_timeout_secs = float (
138- os .environ .get ("TLLM_EXECUTOR_STALL_TIMEOUT_SECS" , "300" ))
140+ # Max time with requests in flight but no worker heartbeat before treating the worker as
141+ # stalled. The legacy env var is still accepted as an alias for compatibility.
142+ self ._heartbeat_timeout_secs = float (
143+ os .environ .get (
144+ "TLLM_EXECUTOR_HEARTBEAT_TIMEOUT_SECS" ,
145+ os .environ .get ("TLLM_EXECUTOR_STALL_TIMEOUT_SECS" , "300" )))
146+ # Warn about long result-quiet periods without killing the worker. This is diagnostic only:
147+ # a healthy heartbeat means the worker is still polling responses.
148+ self ._result_quiet_warning_secs = float (
149+ os .environ .get ("TLLM_EXECUTOR_RESULT_QUIET_WARNING_SECS" ,
150+ str (self ._heartbeat_timeout_secs )))
151+ self ._last_result_quiet_warning_time = self ._last_result_time
139152 # PID of the leader worker process, learned from the init handshake; used to request a
140153 # thread-stack dump (SIGUSR1) when a stall is detected. Stays `None` for remote/out-of-host
141154 # worker sessions.
@@ -282,23 +295,34 @@ def _error_monitor_loop(self) -> None:
282295 if self ._fatal_error is not None :
283296 return
284297
285- # Progress watchdog: a worker that silently stops servicing requests (no result for
286- # a long time while requests are in flight) is treated as a fatal stall so callers
287- # fail fast instead of hanging indefinitely.
288- if self ._results and (
289- time .monotonic () -
290- self ._last_progress_time ) > self ._stall_timeout_secs :
298+ try :
299+ self ._drain_heartbeat_queue ()
300+ except Exception as exc :
301+ logger .warning (
302+ "Error monitor: failed to drain worker heartbeat "
303+ f"queue; continuing timeout check: { exc !r} " )
304+
305+ # Heartbeat watchdog: this checks liveness of the worker's response-polling thread,
306+ # not generation forward progress. Lack of result traffic alone is not fatal because
307+ # long non-streaming requests can be healthy but silent.
308+ if worker_heartbeat_timed_out (
309+ has_inflight_requests = bool (self ._results ),
310+ now = time .monotonic (),
311+ last_heartbeat_time = self ._last_worker_heartbeat_time ,
312+ timeout_secs = self ._heartbeat_timeout_secs ):
291313 logger .error (
292- f"Error monitor: no result progress for { self ._stall_timeout_secs :.2f} s "
314+ f"Error monitor: no worker heartbeat for { self ._heartbeat_timeout_secs :.2f} s "
293315 f"with { len (self ._results )} request(s) in flight; "
294316 "treating worker as stalled." )
295317 self ._maybe_dump_worker_traceback ()
296318 self ._set_fatal_error (
297319 RuntimeError (
298- f"Worker stalled: no result for { self ._stall_timeout_secs :.2f} s "
320+ f"Worker stalled: no heartbeat for { self ._heartbeat_timeout_secs :.2f} s "
299321 f"with { len (self ._results )} request(s) in flight." ))
300322 self .pre_shutdown ()
301323 return
324+
325+ self ._maybe_log_result_quiet_warning ()
302326 except Exception as exc :
303327 logger .debug (f"Error monitor: unexpected exception (ignored): "
304328 f"{ exc !r} " )
@@ -325,6 +349,30 @@ def _maybe_dump_worker_traceback(self) -> None:
325349 except OSError as e :
326350 logger .debug (f"Could not signal worker pid { pid } : { e !r} " )
327351
def _drain_heartbeat_queue(self) -> None:
    """Consume every pending worker heartbeat and refresh proxy-side liveness state.

    Keeps reading from ``self.heartbeat_queue`` for as long as ``poll(0)``
    reports a pending message. For each ``WorkerHeartbeat`` received:

    * ``self._last_worker_heartbeat_time`` is set to the proxy's own
      ``time.monotonic()`` reading at the moment of draining (no timestamp
      is read from the message itself);
    * if the heartbeat comes from rank 0, its ``pid`` is stored in
      ``self._worker_pid``.

    Messages of any other type are read and dropped.
    """
    # NOTE(review): poll(0) is presumably a non-blocking readiness check -- confirm
    # against IpcQueue.poll.
    while self.heartbeat_queue.poll(0):
        message = self.heartbeat_queue.get()
        if not isinstance(message, WorkerHeartbeat):
            # Not a heartbeat: discard it and keep draining.
            continue
        self._last_worker_heartbeat_time = time.monotonic()
        if message.rank == 0:
            self._worker_pid = message.pid
360+
def _maybe_log_result_quiet_warning(self) -> None:
    """Emit a throttled warning when in-flight requests have produced no result lately.

    Diagnostic only: this method logs and updates
    ``self._last_result_quiet_warning_time``; it never fails or stops anything.

    A warning is logged only when all of the following hold:

    * at least one request is in flight (``self._results`` is non-empty);
    * more than ``self._result_quiet_warning_secs`` has elapsed since
      ``self._last_result_time``;
    * more than ``self._result_quiet_warning_secs`` has elapsed since the
      previous warning, so a long quiet period yields at most one warning
      per threshold interval.
    """
    if not self._results:
        return

    now = time.monotonic()
    quiet_secs = now - self._last_result_time
    if quiet_secs <= self._result_quiet_warning_secs:
        return

    # Throttle: do not repeat the warning more often than once per threshold.
    if (now - self._last_result_quiet_warning_time
        ) <= self._result_quiet_warning_secs:
        return

    # Report the measured quiet period rather than the configured threshold,
    # so repeated warnings show how long the silence has actually lasted.
    logger.warning(
        f"No result emitted for {quiet_secs:.2f}s "
        f"with {len(self._results)} request(s) in flight, but worker "
        "heartbeats are still arriving.")
    self._last_result_quiet_warning_time = now
375+
328376 def _setup_queues (self ) -> WorkerCommIpcAddrs :
329377
330378 self .request_queue = IpcQueue (is_server = True ,
@@ -342,6 +390,9 @@ def _setup_queues(self) -> WorkerCommIpcAddrs:
342390 socket_type = zmq .PULL
343391 if self .enable_postprocess_parallel else zmq .PAIR ,
344392 name = "proxy_result_queue" )
393+ self .heartbeat_queue = IpcQueue (is_server = True ,
394+ socket_type = zmq .PULL ,
395+ name = "proxy_heartbeat_queue" )
345396 self ._resource_governor_queue = IpcQueue (
346397 is_server = True , name = "proxy_resource_governor_queue"
347398 ) if self ._enable_resource_governor else None
@@ -350,6 +401,7 @@ def _setup_queues(self) -> WorkerCommIpcAddrs:
350401 request_queue_addr = self .request_queue .address ,
351402 worker_init_status_queue_addr = self .worker_init_status_queue .address ,
352403 result_queue_addr = self .result_queue .address ,
404+ heartbeat_queue_addr = self .heartbeat_queue .address ,
353405 resource_governor_queue_addr = self ._resource_governor_queue .address
354406 if self ._resource_governor_queue is not None else None ,
355407 )
@@ -383,7 +435,7 @@ def dispatch_result_task(self) -> bool:
383435 return False # shutdown the thread
384436
385437 # A result arrived: the worker is making progress.
386- self ._last_progress_time = time .monotonic ()
438+ self ._last_result_time = time .monotonic ()
387439
388440 async_queues = []
389441 event_loop = None
@@ -633,6 +685,7 @@ def shutdown(self):
633685 self .request_queue .close ()
634686 self .worker_init_status_queue .close ()
635687 self .result_queue .close ()
688+ self .heartbeat_queue .close ()
636689 if self ._resource_governor_queue is not None :
637690 self ._resource_governor_queue .close ()
638691
@@ -663,6 +716,9 @@ def submit(self, request: GenerationRequest) -> GenerationResult:
663716 executor = self ,
664717 disaggregated_params = request .disaggregated_params ,
665718 logprob_params = logprob_params )
719+ now = time .monotonic ()
720+ self ._last_result_time = now
721+ self ._last_worker_heartbeat_time = now
666722 self ._results [request .id ] = result
667723
668724 with nvtx_range_debug ("request_queue.put" ):
@@ -698,7 +754,9 @@ def _submit_request(self, request: GenerationRequest) -> None:
698754 self ._set_fatal_error (err )
699755 self .pre_shutdown ()
700756 raise err
701- self ._last_progress_time = time .monotonic ()
757+ now = time .monotonic ()
758+ self ._last_result_time = now
759+ self ._last_worker_heartbeat_time = now
702760 self .request_queue .put (request )
703761
704762 def collective_rpc (
0 commit comments