NVIDIA
diff --git a/‎tensorrt_llm/_torch/alltoall_watchdog.py‎
Lines changed: 100 additions & 16 deletions b/‎tensorrt_llm/_torch/alltoall_watchdog.py‎
Lines changed: 100 additions & 16 deletions
diff --git a/‎tensorrt_llm/_torch/distributed/moe_alltoall.py‎
Lines changed: 25 additions & 3 deletions b/‎tensorrt_llm/_torch/distributed/moe_alltoall.py‎
Lines changed: 25 additions & 3 deletions
diff --git a/‎tensorrt_llm/_torch/modules/fused_moe/communication/communication_factory.py‎
Lines changed: 13 additions & 0 deletions b/‎tensorrt_llm/_torch/modules/fused_moe/communication/communication_factory.py‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎tensorrt_llm/_torch/modules/fused_moe/communication/nvlink_one_sided.py‎
Lines changed: 9 additions & 2 deletions b/‎tensorrt_llm/_torch/modules/fused_moe/communication/nvlink_one_sided.py‎
Lines changed: 9 additions & 2 deletions
diff --git a/‎tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py‎
Lines changed: 7 additions & 0 deletions b/‎tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py‎
Lines changed: 7 additions & 0 deletions
@@ -31,8 +31,13 @@
 
 import torch
 
+from tensorrt_llm._utils import prefer_pinned
 from tensorrt_llm.logger import logger as tllm_logger
 
+# Default number of seconds a watched AlltoAll phase may run before the
+# watchdog reports a timeout.
+DEFAULT_ALLTOALL_WATCHDOG_TIMEOUT_S = 5.0
+# Default watchdog poll interval in seconds. It is also the default bound on
+# how long the workspace reader waits for a device-to-host flag copy.
+DEFAULT_ALLTOALL_WATCHDOG_POLL_INTERVAL_S = 0.1
+# Placeholder for a completion flag that has not been read yet. Flags are read
+# from the workspace as int32 values, so this constant cannot collide with a
+# value that was actually read.
+UNKNOWN_COMPLETION_FLAG = -(2**63)
+
 
 class CompletionFlagReader(Protocol):
     """Reads one phase's rank-local completion flag row."""
@@ -51,6 +56,10 @@ def mark_failed(self, rank: int) -> bool:
         """Mark ``rank`` failed and return whether state changed."""
 
 
+class CompletionFlagReadTimeout(TimeoutError):
+    """Raised when the host watchdog cannot read completion flags in time.
+
+    The workspace reader raises this when the device-to-host copy of the
+    flags does not complete within its copy timeout. The watchdog loop
+    catches it and falls back to the most recently observed flags.
+    """
+
+
 @dataclass(frozen=True)
 class AlltoAllWatchdogTimeout:
     """Details emitted when an AlltoAll phase times out."""
@@ -61,6 +70,7 @@ class AlltoAllWatchdogTimeout:
     missing_ranks: tuple[int, ...]
     marked_failed_ranks: tuple[int, ...]
     elapsed_s: float
+    poll_timed_out: bool = False
 
 
 @dataclass(frozen=True)
@@ -81,6 +91,7 @@ def __init__(
         ep_size: int,
         dispatch_completion_flags_offset: int,
         combine_completion_flags_offset: int,
+        device_copy_timeout_s: float = DEFAULT_ALLTOALL_WATCHDOG_POLL_INTERVAL_S,
     ) -> None:
         if workspace.dim() != 2:
             raise ValueError("workspace must be a 2D tensor [ep_size, size_per_rank]")
@@ -97,11 +108,50 @@ def __init__(
             "dispatch": int(dispatch_completion_flags_offset),
             "combine": int(combine_completion_flags_offset),
         }
+        self._device_copy_timeout_s = float(device_copy_timeout_s)
+        self._copy_stream: torch.cuda.Stream | None = None
+        self._retired_copies: list[tuple[torch.Tensor, torch.cuda.Event]] = []
+        if workspace.device.type == "cuda":
+            self._copy_stream = torch.cuda.Stream(device=workspace.device)
+
+    def _prune_retired_copies(self) -> None:
+        """Forget abandoned host copies whose CUDA event has completed.
+
+        Entries whose event is still pending are retained, presumably so the
+        destination host buffer stays referenced while the asynchronous copy
+        may still write into it.
+        """
+        still_pending = [
+            (buffer, copy_done)
+            for buffer, copy_done in self._retired_copies
+            if not copy_done.query()
+        ]
+        self._retired_copies = still_pending
+
+    def _read_cuda_flags(self, flags: torch.Tensor) -> tuple[int, ...]:
+        """Copy ``flags`` to host memory on the private copy stream and return them.
+
+        The copy is issued asynchronously and its completion event is polled
+        rather than synchronized on, so the wait is bounded by
+        ``self._device_copy_timeout_s``.
+
+        Args:
+            flags: Completion-flag tensor to read; ``read_completion_flags``
+                passes an int32 view of this rank's workspace row.
+
+        Returns:
+            The flag values as a tuple of Python ints.
+
+        Raises:
+            CompletionFlagReadTimeout: If the copy event has not completed
+                before the deadline.
+        """
+        assert self._copy_stream is not None
+        # Drop bookkeeping for earlier timed-out copies that have since finished.
+        self._prune_retired_copies()
+
+        host_flags = torch.empty(
+            (self._ep_size,),
+            dtype=torch.int32,
+            device="cpu",
+            pin_memory=prefer_pinned(),
+        )
+        event = torch.cuda.Event(blocking=False)
+        # Enqueue the copy and its completion marker on the dedicated copy stream.
+        with torch.cuda.device(flags.device), torch.cuda.stream(self._copy_stream):
+            host_flags.copy_(flags.detach(), non_blocking=True)
+            event.record(self._copy_stream)
+
+        deadline_s = time.monotonic() + self._device_copy_timeout_s
+        while not event.query():
+            remaining_s = deadline_s - time.monotonic()
+            if remaining_s <= 0:
+                # The event is still pending, so keep the buffer and event
+                # referenced until a later prune sees the event complete.
+                self._retired_copies.append((host_flags, event))
+                raise CompletionFlagReadTimeout(
+                    "timed out copying AlltoAll completion flags to host"
+                )
+            # Sleep at most 1 ms between event polls.
+            time.sleep(min(remaining_s, 0.001))
+
+        return tuple(int(v) for v in host_flags.tolist())
 
     def read_completion_flags(self, phase: str) -> tuple[int, ...]:
+        """Return this rank's completion flags for ``phase`` as Python ints.
+
+        ``phase`` must be a key of ``self._offsets`` ("dispatch" or "combine").
+        CUDA-resident flags go through the time-bounded copy in
+        ``_read_cuda_flags``; any other non-CPU tensor uses a plain ``.cpu()``
+        copy.
+        """
         offset = self._offsets[phase]
+        # One 4-byte (int32) flag per EP rank.
         end = offset + self._ep_size * 4
         flags = self._workspace[self._ep_rank, offset:end].view(torch.int32)
+        if flags.device.type == "cuda":
+            return self._read_cuda_flags(flags)
         if flags.device.type != "cpu":
             flags = flags.detach().cpu()
         return tuple(int(v) for v in flags.tolist())
@@ -123,8 +173,8 @@ def __init__(
         ep_size: int,
         ep_rank: int,
         completion_reader: CompletionFlagReader,
-        timeout_s: float,
-        poll_interval_s: float = 0.05,
+        timeout_s: float = DEFAULT_ALLTOALL_WATCHDOG_TIMEOUT_S,
+        poll_interval_s: float = DEFAULT_ALLTOALL_WATCHDOG_POLL_INTERVAL_S,
         health: Optional[EPGroupHealthLike] = None,
         on_timeout: Optional[Callable[[AlltoAllWatchdogTimeout], None]] = None,
     ) -> None:
@@ -160,8 +210,8 @@ def from_workspace(
         metainfo_index: Mapping[str, int],
         ep_rank: int,
         ep_size: int,
-        timeout_s: float,
-        poll_interval_s: float = 0.05,
+        timeout_s: float = DEFAULT_ALLTOALL_WATCHDOG_TIMEOUT_S,
+        poll_interval_s: float = DEFAULT_ALLTOALL_WATCHDOG_POLL_INTERVAL_S,
         health: Optional[EPGroupHealthLike] = None,
         on_timeout: Optional[Callable[[AlltoAllWatchdogTimeout], None]] = None,
     ) -> "AlltoAllWatchdog":
@@ -178,6 +228,7 @@ def from_workspace(
             ep_size=ep_size,
             dispatch_completion_flags_offset=dispatch_offset,
             combine_completion_flags_offset=combine_offset,
+            device_copy_timeout_s=poll_interval_s,
         )
         return cls(
             ep_size=ep_size,
@@ -288,11 +339,18 @@ def _missing_ranks(
             if observed_flags[rank] != watch.expected_flag
         )
 
-    def _handle_timeout(self, watch: _CollectiveWatch, observed_flags: tuple[int, ...]) -> None:
+    def _handle_timeout(
+        self,
+        watch: _CollectiveWatch,
+        observed_flags: tuple[int, ...],
+        *,
+        poll_timed_out: bool = False,
+    ) -> None:
         elapsed_s = time.monotonic() - watch.start_s
         missing_ranks = self._missing_ranks(watch, observed_flags)
         marked_failed: list[int] = []
-        if self._health is not None:
+        has_known_flags = UNKNOWN_COMPLETION_FLAG not in observed_flags
+        if self._health is not None and (has_known_flags or not poll_timed_out):
             for rank in missing_ranks:
                 if rank == self._ep_rank:
                     continue
@@ -306,20 +364,37 @@ def _handle_timeout(self, watch: _CollectiveWatch, observed_flags: tuple[int, ..
             missing_ranks=missing_ranks,
             marked_failed_ranks=tuple(marked_failed),
             elapsed_s=elapsed_s,
+            poll_timed_out=poll_timed_out,
         )
-        tllm_logger.warning(
-            "AlltoAll watchdog timeout on rank %d during %s: expected flag %d, "
-            "missing ranks %s, observed flags %s",
-            self._ep_rank,
-            watch.phase,
-            watch.expected_flag,
-            list(missing_ranks),
-            list(observed_flags),
-        )
+        if poll_timed_out:
+            tllm_logger.error(
+                "AlltoAll watchdog could not read completion flags on rank %d "
+                "during %s before timeout %.3fs; expected flag %d, active "
+                "ranks %s, observed flags %s, marked ranks %s",
+                self._ep_rank,
+                watch.phase,
+                elapsed_s,
+                watch.expected_flag,
+                list(self._active_ranks(watch.active_mask)),
+                list(observed_flags),
+                list(marked_failed),
+            )
+        else:
+            tllm_logger.warning(
+                "AlltoAll watchdog timeout on rank %d during %s: expected flag %d, "
+                "missing ranks %s, observed flags %s",
+                self._ep_rank,
+                watch.phase,
+                watch.expected_flag,
+                list(missing_ranks),
+                list(observed_flags),
+            )
         if self._on_timeout is not None:
             self._on_timeout(event)
 
     def _run(self) -> None:
+        last_observed_flags = tuple(UNKNOWN_COMPLETION_FLAG for _ in range(self._ep_size))
+        poll_timed_out = False
         while True:
             with self._cv:
                 while not self._queue and not self._stopping:
@@ -337,6 +412,11 @@ def _run(self) -> None:
                         f"completion reader returned {len(observed_flags)} flags; "
                         f"expected ep_size={self._ep_size}"
                     )
+                last_observed_flags = observed_flags
+                poll_timed_out = False
+            except CompletionFlagReadTimeout:
+                observed_flags = last_observed_flags
+                poll_timed_out = True
             except BaseException as exc:  # noqa: BLE001 - keep watchdog failures visible.
                 with self._cv:
                     self._last_error = exc
@@ -350,16 +430,20 @@ def _run(self) -> None:
                     if self._queue and self._queue[0] is watch:
                         self._queue.popleft()
                     self._cv.notify_all()
+                last_observed_flags = tuple(UNKNOWN_COMPLETION_FLAG for _ in range(self._ep_size))
+                poll_timed_out = False
                 continue
 
             if time.monotonic() - watch.start_s >= self._timeout_s:
-                self._handle_timeout(watch, observed_flags)
+                self._handle_timeout(watch, observed_flags, poll_timed_out=poll_timed_out)
                 with self._cv:
                     # The GPU stream is no longer trustworthy once a collective
                     # times out. Drop queued follow-on phases so they do not
                     # produce duplicate or misleading reports.
                     self._queue.clear()
                     self._cv.notify_all()
+                last_observed_flags = tuple(UNKNOWN_COMPLETION_FLAG for _ in range(self._ep_size))
+                poll_timed_out = False
                 continue
 
             with self._cv:
 
@@ -8,14 +8,17 @@
 # ruff: noqa: E501
 
 import os
+import sys
 from dataclasses import dataclass
 from typing import Callable, Dict, Optional
 
 import torch
 
 from tensorrt_llm._mnnvl_utils import MnnvlMemory
-from tensorrt_llm._torch.alltoall_watchdog import (AlltoAllWatchdog,
-                                                   AlltoAllWatchdogTimeout)
+from tensorrt_llm._torch.alltoall_watchdog import (
+    DEFAULT_ALLTOALL_WATCHDOG_POLL_INTERVAL_S,
+    DEFAULT_ALLTOALL_WATCHDOG_TIMEOUT_S, AlltoAllWatchdog,
+    AlltoAllWatchdogTimeout)
 from tensorrt_llm.bindings import internal as _tllm_internal
 from tensorrt_llm.logger import logger as tllm_logger
 from tensorrt_llm.mapping import Mapping
@@ -130,7 +133,8 @@ def __init__(
         num_experts: Optional[int] = None,
         ep_group_health=None,
         alltoall_watchdog_timeout_s: Optional[float] = None,
-        alltoall_watchdog_poll_interval_s: float = 0.05,
+        alltoall_watchdog_poll_interval_s:
+        float = DEFAULT_ALLTOALL_WATCHDOG_POLL_INTERVAL_S,
         alltoall_watchdog_on_timeout: Optional[Callable[
             [AlltoAllWatchdogTimeout], None]] = None,
     ):
@@ -228,8 +232,12 @@ def __init__(
         # Internal state
         self._state: _A2AState = _A2AState()
         self.ep_group_health = ep_group_health
+        self._destroyed = False
         self._watchdog_flag_generation = 0
         self._alltoall_watchdog: AlltoAllWatchdog | None = None
+        if (alltoall_watchdog_timeout_s is None
+                and self.ep_group_health is not None):
+            alltoall_watchdog_timeout_s = DEFAULT_ALLTOALL_WATCHDOG_TIMEOUT_S
         if alltoall_watchdog_timeout_s is not None:
             self._watchdog_flag_generation = self._read_current_flag_val()
             self._alltoall_watchdog = AlltoAllWatchdog.from_workspace(
@@ -244,6 +252,20 @@ def __init__(
                 on_timeout=alltoall_watchdog_on_timeout,
             )
 
+    def destroy(self) -> None:
+        """Stop the background watchdog owned by this wrapper, at most once.
+
+        Attributes are looked up with ``getattr`` defaults, so the method also
+        works on an instance whose ``__init__`` did not set them.
+        """
+        already_destroyed = getattr(self, "_destroyed", False)
+        if not already_destroyed:
+            self._destroyed = True
+            owned_watchdog = getattr(self, "_alltoall_watchdog", None)
+            if owned_watchdog is None:
+                return
+            owned_watchdog.stop(timeout_s=1.0)
+            self._alltoall_watchdog = None
+
+    def __del__(self) -> None:
+        """Release watchdog resources when the wrapper is garbage collected.
+
+        Cleanup is skipped while the interpreter is finalizing.
+        """
+        if sys.is_finalizing():
+            return
+        self.destroy()
+
     def _read_current_flag_val(self) -> int:
         flag_val_offset = self.metainfo[
             self._METAINFO_INDEX["FLAG_VAL_OFFSET_INDEX"]].item()
 
@@ -28,6 +28,7 @@
 from tensorrt_llm._torch.model_config import ModelConfig
 from tensorrt_llm.logger import logger
 
+from ..wide_ep_ft import get_wide_ep_ft_options
 from .allgather_reducescatter import AllGatherReduceScatter
 from .base import Communication
 from .deep_ep import DeepEP
@@ -133,6 +134,9 @@ def create_strategy(
 
         try:
             enable_eplb = model_config.moe_load_balancer is not None
+            ep_group_health, watchdog_timeout_s, watchdog_poll_interval_s = get_wide_ep_ft_options(
+                model_config
+            )
             strategy = NVLinkOneSided(
                 mapping,
                 num_slots,
@@ -143,6 +147,9 @@ def create_strategy(
                 dtype=act_dtype,
                 num_experts=num_experts if enable_eplb else None,
                 use_low_precision_combine=use_low_precision_combine,
+                ep_group_health=ep_group_health,
+                alltoall_watchdog_timeout_s=watchdog_timeout_s,
+                alltoall_watchdog_poll_interval_s=watchdog_poll_interval_s,
             )
             logger.info("Selected communication strategy: NVLinkOneSided")
             return strategy
@@ -285,6 +292,9 @@ def _create_forced_method(
                 )
         elif method in ["NVLINK_ONE_SIDED"]:
             enable_eplb = model_config.moe_load_balancer is not None
+            ep_group_health, watchdog_timeout_s, watchdog_poll_interval_s = get_wide_ep_ft_options(
+                model_config
+            )
             return NVLinkOneSided(
                 mapping,
                 num_slots,
@@ -295,6 +305,9 @@ def _create_forced_method(
                 dtype=act_dtype,
                 num_experts=num_experts if enable_eplb else None,
                 use_low_precision_combine=use_low_precision_combine,
+                ep_group_health=ep_group_health,
+                alltoall_watchdog_timeout_s=watchdog_timeout_s,
+                alltoall_watchdog_poll_interval_s=watchdog_poll_interval_s,
             )
         elif method == "DEEPEP":
             return DeepEP(
 
@@ -30,7 +30,12 @@
 import torch
 
 from tensorrt_llm._mnnvl_utils import MnnvlMemory
-from tensorrt_llm._torch.alltoall_watchdog import AlltoAllWatchdog, AlltoAllWatchdogTimeout
+from tensorrt_llm._torch.alltoall_watchdog import (
+    DEFAULT_ALLTOALL_WATCHDOG_POLL_INTERVAL_S,
+    DEFAULT_ALLTOALL_WATCHDOG_TIMEOUT_S,
+    AlltoAllWatchdog,
+    AlltoAllWatchdogTimeout,
+)
 from tensorrt_llm.bindings import internal as _tllm_internal
 from tensorrt_llm.logger import logger as tllm_logger
 from tensorrt_llm.mapping import Mapping
@@ -154,7 +159,7 @@ def __init__(
         use_low_precision_combine: bool = False,
         ep_group_health=None,
         alltoall_watchdog_timeout_s: Optional[float] = None,
-        alltoall_watchdog_poll_interval_s: float = 0.05,
+        alltoall_watchdog_poll_interval_s: float = DEFAULT_ALLTOALL_WATCHDOG_POLL_INTERVAL_S,
         alltoall_watchdog_on_timeout: Optional[Callable[[AlltoAllWatchdogTimeout], None]] = None,
     ):
         """
@@ -314,6 +319,8 @@ def __init__(
         self.ep_group_health = ep_group_health
         self._watchdog_flag_generation = 0
         self._alltoall_watchdog: AlltoAllWatchdog | None = None
+        if alltoall_watchdog_timeout_s is None and self.ep_group_health is not None:
+            alltoall_watchdog_timeout_s = DEFAULT_ALLTOALL_WATCHDOG_TIMEOUT_S
         if alltoall_watchdog_timeout_s is not None:
             self._watchdog_flag_generation = self._read_current_flag_val()
             self._alltoall_watchdog = AlltoAllWatchdog.from_workspace(
 
@@ -20,6 +20,7 @@
                       Fp4QuantizedTensor)
 from .interface import AlltoallMethodType, MoE
 from .quantization import UnquantizedFusedMoEMethod
+from .wide_ep_ft import get_wide_ep_ft_options
 
 # isort: off
 from .quantization import (
@@ -324,6 +325,8 @@ def __init__(
                         dtype,
                         self.num_experts if self.layer_load_balancer else None,
                     )
+                    ep_group_health, watchdog_timeout_s, watchdog_poll_interval_s = (
+                        get_wide_ep_ft_options(model_config))
 
                     self.moe_a2a = MoeAlltoAll(
                         mapping=self.mapping,
@@ -333,6 +336,10 @@ def __init__(
                         workspace_size_per_rank=workspace_size,
                         num_experts=self.num_experts
                         if self.layer_load_balancer else None,
+                        ep_group_health=ep_group_health,
+                        alltoall_watchdog_timeout_s=watchdog_timeout_s,
+                        alltoall_watchdog_poll_interval_s=
+                        watchdog_poll_interval_s,
                     )
                 elif self.alltoall_method_type == AlltoallMethodType.DeepEP or self.alltoall_method_type == AlltoallMethodType.DeepEPLowLatency:
                     raise NotImplementedError(