heartbeat

2ez4bz · 2ez4bz · commit d2d75cc17023 · 2026-06-24T10:47:20.000-07:00
Signed-off-by: William Zhang <133824995+2ez4bz@users.noreply.github.com>
diff --git a/tensorrt_llm/executor/base_worker.py b/tensorrt_llm/executor/base_worker.py
@@ -17,6 +17,7 @@
 import gc
 import json
 import os
+import time
 import weakref
 from pathlib import Path
 from queue import Queue
@@ -50,7 +51,7 @@
 from .result import (GenerationResult, LogProbsResult, ResponseWrapper,
                      compute_logprobs, get_metrics_dict)
 from .utils import (ErrorResponse, IntraProcessQueue, RequestError,
-                    is_llm_response)
+                    WorkerHeartbeat, is_llm_response)
 
 if TYPE_CHECKING:
     from ..disaggregated_params import DisaggregatedParams
@@ -118,6 +119,7 @@ def __init__(
 
         self.engine = None
         self.result_queue: Optional[IpcQueue] = None
+        self.heartbeat_queue: Optional[IpcQueue] = None
         self.postproc_queues: Optional[List[IpcQueue]] = None
         self.rank = mpi_rank()
         self.global_rank = global_mpi_rank()
@@ -345,6 +347,10 @@ def set_result_queue(self, queue):
         assert self.postproc_queues is None
         self.result_queue = queue
 
+    def set_heartbeat_queue(self, queue):
+        """Set the IPC queue for sending liveness heartbeats to the proxy (None disables them)."""
+        self.heartbeat_queue = queue
+
     def set_postproc_queues(self, queues: List["IpcQueue"]):
         """ Set the IPC queues for feeding post-processing processes. """
         assert self.result_queue is None
@@ -904,6 +910,11 @@ def __init__(self, worker: "BaseWorker"):
         self.enable_postprocprocess_parallel = self.worker.enable_postprocess_parallel
         # The error responses when submit request failed will be put here
         self.temp_error_responses = Queue()
+        self._heartbeat_interval_secs = float(
+            os.environ.get("TLLM_EXECUTOR_HEARTBEAT_INTERVAL_SECS", "1"))
+        self._last_heartbeat_time = 0.0
+        self._heartbeat_pid = os.getpid()
+        self._heartbeat_rank = mpi_rank()
 
     def responses_handler(self, responses: List[tllm.Response]):
         HandlerKind = AwaitResponseHelper.HandlerKind
@@ -971,8 +982,21 @@ def __call__(self, timeout: Optional[float] = None) -> bool:
         error = getattr(self.worker.engine, "_event_loop_error", None)
         if error is not None:
             return self._broadcast_event_loop_error(error)
+        self._send_heartbeat()
         return True
 
+    def _send_heartbeat(self) -> None:
+        """Push a rate-limited liveness heartbeat to the proxy, if a queue is set."""
+        queue = self.worker.heartbeat_queue
+        if queue is None:
+            return
+        now = time.monotonic()
+        if (now - self._last_heartbeat_time) < self._heartbeat_interval_secs:
+            return
+        self._last_heartbeat_time = now
+        beat = WorkerHeartbeat(pid=self._heartbeat_pid, rank=self._heartbeat_rank)
+        queue.put_noblock(beat, retry=1)
+
     def _broadcast_event_loop_error(self, error: BaseException) -> bool:
         """Wake every pending ``GenerationResult`` after an event-loop crash.
 
diff --git a/tensorrt_llm/executor/heartbeat.py b/tensorrt_llm/executor/heartbeat.py
@@ -0,0 +1,25 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def worker_heartbeat_timed_out(
+    *,
+    has_inflight_requests: bool,
+    now: float,
+    last_heartbeat_time: float,
+    timeout_secs: float,
+) -> bool:
+    """Return True if requests are in flight and the last heartbeat is older than timeout_secs."""
+    return has_inflight_requests and (now - last_heartbeat_time) > timeout_secs
diff --git a/tensorrt_llm/executor/proxy.py b/tensorrt_llm/executor/proxy.py
@@ -36,15 +36,17 @@
 from ..llmapi.utils import (AsyncQueue, ManagedThread, _SyncQueue,
                             enable_llm_debug, logger_debug, print_colored)
 from .executor import GenerationExecutor
+from .heartbeat import worker_heartbeat_timed_out
 from .ipc import FusedIpcQueue, IpcQueue
 from .postproc_worker import PostprocWorker, PostprocWorkerConfig
 from .request import CancellingRequest, GenerationRequest
 from .result import GenerationResult, IterationResult
 from .rpc import RPCClient
 from .rpc.rpc_common import RPCError, get_unique_ipc_addr
 from .utils import (ErrorResponse, RequestError, WorkerCommIpcAddrs,
-                    create_mpi_comm_session, get_spawn_proxy_process_env,
-                    is_llm_response, print_alive_threads)
+                    WorkerHeartbeat, create_mpi_comm_session,
+                    get_spawn_proxy_process_env, is_llm_response,
+                    print_alive_threads)
 from .worker import GenerationExecutorWorker, worker_main
 
 __all__ = [
@@ -125,17 +127,28 @@ def __init__(
         self._results: Dict[int, GenerationResult] = {}
 
         # --- liveness / stall detection state ---
-        # Time of the last sign of worker progress (a request submitted or a result received). The
-        # error monitor uses this to detect a worker that has silently stopped servicing requests.
-        self._last_progress_time = time.monotonic()
+        # Time of the last result and worker heartbeat. Long non-streaming requests can legitimately
+        # go quiet on the result queue, so fatal stall detection is based on heartbeats from the
+        # worker response-polling thread rather than result traffic.
+        self._last_result_time = time.monotonic()
+        self._last_worker_heartbeat_time = self._last_result_time
         # Max time to wait for the worker to accept a submitted request before declaring it
         # dead/stalled. With an unbounded send HWM the send only blocks when the worker disconnected
         # or stopped draining.
         self._submit_timeout_secs = float(
             os.environ.get("TLLM_EXECUTOR_SUBMIT_TIMEOUT_SECS", "300"))
-        # Max time with requests in flight but no result before treating the worker as stalled.
-        self._stall_timeout_secs = float(
-            os.environ.get("TLLM_EXECUTOR_STALL_TIMEOUT_SECS", "300"))
+        # Max time with requests in flight but no worker heartbeat before treating the worker as
+        # stalled. The legacy env var is still accepted as an alias for compatibility.
+        self._heartbeat_timeout_secs = float(
+            os.environ.get(
+                "TLLM_EXECUTOR_HEARTBEAT_TIMEOUT_SECS",
+                os.environ.get("TLLM_EXECUTOR_STALL_TIMEOUT_SECS", "300")))
+        # Warn about long result-quiet periods without killing the worker. This is diagnostic only:
+        # a healthy heartbeat means the worker is still polling responses.
+        self._result_quiet_warning_secs = float(
+            os.environ.get("TLLM_EXECUTOR_RESULT_QUIET_WARNING_SECS",
+                           str(self._heartbeat_timeout_secs)))
+        self._last_result_quiet_warning_time = self._last_result_time
         # PID of the leader worker process, learned from the init handshake; used to request a
         # thread-stack dump (SIGUSR1) when a stall is detected. Stays `None` for remote/out-of-host
         # worker sessions.
@@ -282,23 +295,34 @@ def _error_monitor_loop(self) -> None:
                 if self._fatal_error is not None:
                     return
 
-                # Progress watchdog: a worker that silently stops servicing requests (no result for
-                # a long time while requests are in flight) is treated as a fatal stall so callers
-                # fail fast instead of hanging indefinitely.
-                if self._results and (
-                        time.monotonic() -
-                        self._last_progress_time) > self._stall_timeout_secs:
+                try:
+                    self._drain_heartbeat_queue()
+                except Exception as exc:
+                    logger.warning(
+                        "Error monitor: failed to drain worker heartbeat "
+                        f"queue; continuing timeout check: {exc!r}")
+
+                # Heartbeat watchdog: this checks liveness of the worker's response-polling thread,
+                # not generation forward progress. Lack of result traffic alone is not fatal because
+                # long non-streaming requests can be healthy but silent.
+                if worker_heartbeat_timed_out(
+                        has_inflight_requests=bool(self._results),
+                        now=time.monotonic(),
+                        last_heartbeat_time=self._last_worker_heartbeat_time,
+                        timeout_secs=self._heartbeat_timeout_secs):
                     logger.error(
-                        f"Error monitor: no result progress for {self._stall_timeout_secs:.2f}s "
+                        f"Error monitor: no worker heartbeat for {self._heartbeat_timeout_secs:.2f}s "
                         f"with {len(self._results)} request(s) in flight; "
                         "treating worker as stalled.")
                     self._maybe_dump_worker_traceback()
                     self._set_fatal_error(
                         RuntimeError(
-                            f"Worker stalled: no result for {self._stall_timeout_secs:.2f}s "
+                            f"Worker stalled: no heartbeat for {self._heartbeat_timeout_secs:.2f}s "
                             f"with {len(self._results)} request(s) in flight."))
                     self.pre_shutdown()
                     return
+
+                self._maybe_log_result_quiet_warning()
             except Exception as exc:
                 logger.debug(f"Error monitor: unexpected exception (ignored): "
                              f"{exc!r}")
@@ -325,6 +349,30 @@ def _maybe_dump_worker_traceback(self) -> None:
         except OSError as e:
             logger.debug(f"Could not signal worker pid {pid}: {e!r}")
 
+    def _drain_heartbeat_queue(self) -> None:
+        """Drain queued heartbeats, refreshing the liveness timestamp and rank 0's worker PID."""
+        while self.heartbeat_queue.poll(0):
+            heartbeat = self.heartbeat_queue.get()
+            if isinstance(heartbeat, WorkerHeartbeat):
+                self._last_worker_heartbeat_time = time.monotonic()
+                if heartbeat.rank == 0:
+                    self._worker_pid = heartbeat.pid
+
+    def _maybe_log_result_quiet_warning(self) -> None:
+        """Warn (rate-limited) that results have gone quiet, reporting the measured quiet time."""
+        if not self._results:
+            return
+        now = time.monotonic()
+        quiet_secs = now - self._last_result_time
+        since_last_warning = now - self._last_result_quiet_warning_time
+        if min(quiet_secs, since_last_warning) <= self._result_quiet_warning_secs:
+            return
+        logger.warning(
+            f"No result emitted for {quiet_secs:.2f}s "
+            f"with {len(self._results)} request(s) in flight, but worker "
+            "heartbeats are still arriving.")
+        self._last_result_quiet_warning_time = now
+
     def _setup_queues(self) -> WorkerCommIpcAddrs:
 
         self.request_queue = IpcQueue(is_server=True,
@@ -342,6 +390,9 @@ def _setup_queues(self) -> WorkerCommIpcAddrs:
             socket_type=zmq.PULL
             if self.enable_postprocess_parallel else zmq.PAIR,
             name="proxy_result_queue")
+        self.heartbeat_queue = IpcQueue(is_server=True,
+                                        socket_type=zmq.PULL,
+                                        name="proxy_heartbeat_queue")
         self._resource_governor_queue = IpcQueue(
             is_server=True, name="proxy_resource_governor_queue"
         ) if self._enable_resource_governor else None
@@ -350,6 +401,7 @@ def _setup_queues(self) -> WorkerCommIpcAddrs:
             request_queue_addr=self.request_queue.address,
             worker_init_status_queue_addr=self.worker_init_status_queue.address,
             result_queue_addr=self.result_queue.address,
+            heartbeat_queue_addr=self.heartbeat_queue.address,
             resource_governor_queue_addr=self._resource_governor_queue.address
             if self._resource_governor_queue is not None else None,
         )
@@ -383,7 +435,7 @@ def dispatch_result_task(self) -> bool:
             return False  # shutdown the thread
 
         # A result arrived: the worker is making progress.
-        self._last_progress_time = time.monotonic()
+        self._last_result_time = time.monotonic()
 
         async_queues = []
         event_loop = None
@@ -633,6 +685,7 @@ def shutdown(self):
         self.request_queue.close()
         self.worker_init_status_queue.close()
         self.result_queue.close()
+        self.heartbeat_queue.close()
         if self._resource_governor_queue is not None:
             self._resource_governor_queue.close()
 
@@ -663,6 +716,9 @@ def submit(self, request: GenerationRequest) -> GenerationResult:
             executor=self,
             disaggregated_params=request.disaggregated_params,
             logprob_params=logprob_params)
+        now = time.monotonic()
+        self._last_result_time = now
+        self._last_worker_heartbeat_time = now
         self._results[request.id] = result
 
         with nvtx_range_debug("request_queue.put"):
@@ -698,7 +754,9 @@ def _submit_request(self, request: GenerationRequest) -> None:
                 self._set_fatal_error(err)
                 self.pre_shutdown()
                 raise err
-        self._last_progress_time = time.monotonic()
+        now = time.monotonic()
+        self._last_result_time = now
+        self._last_worker_heartbeat_time = now
         self.request_queue.put(request)
 
     def collective_rpc(
diff --git a/tensorrt_llm/executor/utils.py b/tensorrt_llm/executor/utils.py
@@ -30,12 +30,12 @@ class LlmLauncherEnvs(StrEnum):
 
 
 def get_spawn_proxy_process_ipc_addr_env() -> str | None:
-    ''' Get the IPC address for the spawn proxy process dynamically. '''
+    """Get the IPC address for the spawn proxy process dynamically."""
     return os.getenv(LlmLauncherEnvs.TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR)
 
 
 def get_spawn_proxy_process_ipc_hmac_key_env() -> bytes:
-    ''' Get the HMAC key for the spawn proxy process dynamically. '''
+    """Get the HMAC key for the spawn proxy process dynamically."""
     key = os.getenv("TLLM_SPAWN_PROXY_PROCESS_IPC_HMAC_KEY")
     assert key is not None, (
         f"{LlmLauncherEnvs.TLLM_SPAWN_PROXY_PROCESS_IPC_HMAC_KEY} is not set. "
@@ -44,7 +44,7 @@ def get_spawn_proxy_process_ipc_hmac_key_env() -> bytes:
 
 
 def get_spawn_proxy_process_env() -> bool:
-    ''' Get the environment variable for the spawn proxy process dynamically. '''
+    """Get the environment variable for the spawn proxy process dynamically."""
     return os.getenv(LlmLauncherEnvs.TLLM_SPAWN_PROXY_PROCESS) == "1"
 
 
@@ -77,7 +77,7 @@ def has_event_loop() -> bool:
 
 
 class RequestError(RuntimeError):
-    ''' The error raised when the request is failed. '''
+    """The error raised when the request is failed."""
 
 
 class ProcessPoolExecutorSession(MpiSession):
@@ -113,8 +113,15 @@ class ErrorResponse(NamedTuple):
     request_id: int
 
 
+class WorkerHeartbeat(NamedTuple):
+    """A liveness pulse from the worker process to the proxy."""
+
+    pid: int  # OS process ID of the sender (os.getpid() where the heartbeat is built)
+    rank: int  # MPI rank of the sender (mpi_rank() where the heartbeat is built)
+
+
 class IntraProcessQueue:
-    ''' A Queue-like container for IPC within the same process. '''
+    """A Queue-like container for IPC within the same process."""
 
     def __init__(self):
         self.queue = Queue()
@@ -149,11 +156,12 @@ def poll(self, timeout=None) -> bool:
 
 
 class WorkerCommIpcAddrs(NamedTuple):
-    ''' IPC addresses (str) and HMAC keys (bytes) for communication with the worker processes. '''
+    """IPC addresses (str) and HMAC keys (bytes) for communication with the worker processes."""
     request_queue_addr: tuple[str, Optional[bytes]]
     worker_init_status_queue_addr: tuple[str, Optional[bytes]]
     result_queue_addr: tuple[str, Optional[bytes]]
     resource_governor_queue_addr: Optional[tuple[str, Optional[bytes]]] = None
+    heartbeat_queue_addr: Optional[tuple[str, Optional[bytes]]] = None
 
 
 def is_llm_response(instance):
diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py
@@ -229,6 +229,7 @@ def _print_stacks():
 
     result_queue: Optional[IpcQueue] = None
     result_queues: Optional[List[IpcQueue]] = None
+    heartbeat_queue: Optional[IpcQueue] = None
     resource_governor_queue: Optional[IpcQueue] = None
 
     postproc_worker_config = postproc_worker_config or PostprocWorkerConfig()
@@ -263,6 +264,12 @@ def _print_stacks():
             is_server=False,
             name="worker_resource_governor_queue"
         ) if worker_queues.resource_governor_queue_addr else None
+        heartbeat_queue = IpcQueue(
+            worker_queues.heartbeat_queue_addr,
+            is_server=False,
+            socket_type=zmq.PUSH,
+            name="worker_heartbeat_queue"
+        ) if worker_queues.heartbeat_queue_addr else None
 
         if postproc_worker_config.enabled:
             # IPC queues for sending inputs to the postprocess parallel
@@ -361,6 +368,8 @@ def notify_proxy_threads_to_quit():
                     worker.set_postproc_queues(result_queues)
                 else:
                     worker.set_result_queue(result_queue)
+                if heartbeat_queue is not None:
+                    worker.set_heartbeat_queue(heartbeat_queue)
 
                 # Send ready signal with confirmation. The payload carries the worker PID so the
                 # proxy can signal it (SIGUSR1 -> thread-stack dump) if it later stalls.
diff --git a/tensorrt_llm/serialization.py b/tensorrt_llm/serialization.py
@@ -82,7 +82,8 @@
         "GenerationResult", "GenerationResultBase", "IterationResult",
         "Logprob", "LogProbsResult", "ResponseWrapper"
     ],
-    "tensorrt_llm.executor.utils": ["ErrorResponse", "WorkerCommIpcAddrs"],
+    "tensorrt_llm.executor.utils":
+    ["ErrorResponse", "WorkerCommIpcAddrs", "WorkerHeartbeat"],
     "tensorrt_llm.executor.worker": ["GenerationExecutorWorker", "worker_main"],
     "tensorrt_llm.llmapi.llm_args": [
         "_ModelFormatKind", "_ParallelConfig", "CalibConfig",
diff --git a/tests/unittest/executor/test_fatal_error_health_check.py b/tests/unittest/executor/test_fatal_error_health_check.py