fix: address AlltoAll watchdog review comments

chienchunhung · chienchunhung · commit 3ed8a62d79e8 · 2026-06-24T14:02:21.000-07:00
Signed-off-by: Chien-Chun Hung <2679986+chienchunhung@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu
@@ -741,7 +741,7 @@ __device__ void vectorized_combine_impl(T* dst_typed_base, int size_per_token, i
         {
             int target_rank = ptrs.topk_target_ranks[local_token_idx * TOP_K + k];
             int dst_idx = ptrs.topk_send_indices[local_token_idx * TOP_K + k];
-            if (dst_idx < 0)
+            if (dst_idx < 0 || !is_rank_active(ptrs.active_rank_mask, target_rank))
             {
                 acc[k].fill(0.0f);
                 continue;
@@ -766,8 +766,12 @@ __device__ void vectorized_combine_impl(T* dst_typed_base, int size_per_token, i
 #pragma unroll
         for (int k = 0; k < TOP_K; ++k)
         {
-            if (ptrs.topk_send_indices[local_token_idx * TOP_K + k] < 0)
+            int target_rank = ptrs.topk_target_ranks[local_token_idx * TOP_K + k];
+            int dst_idx = ptrs.topk_send_indices[local_token_idx * TOP_K + k];
+            if (dst_idx < 0 || !is_rank_active(ptrs.active_rank_mask, target_rank))
+            {
                 continue; // acc[k] already holds 0.0f from fill() above
+            }
 #pragma unroll
             for (int j = elems_per_vec - 1; j >= 0; --j)
                 acc[k][j] = static_cast<float>(reinterpret_cast<InT const*>(&acc[k])[j]);
diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h
@@ -93,8 +93,8 @@ struct CombineKernelPointers
     int const* topk_send_indices; // dst index per k, -1 for duplicates
 
     // Active-rank bitmask: see DispatchKernelPointers::active_rank_mask. Combine skips flag
-    // writes/waits to/from masked peers; per-token accumulation uses topk_send_indices[k] < 0
-    // (set by dispatch) to skip dead-targeted slots, so no explicit mask check is needed there.
+    // writes/waits to/from masked peers and also skips per-token accumulation for ranks that
+    // become inactive between dispatch and combine.
     uint64_t active_rank_mask[kRankMaskWords];
 };
 
diff --git a/tensorrt_llm/_torch/alltoall_watchdog.py b/tensorrt_llm/_torch/alltoall_watchdog.py
@@ -26,8 +26,9 @@
 import threading
 import time
 from collections import deque
+from collections.abc import Callable, Mapping, Sequence
 from dataclasses import dataclass
-from typing import Callable, Deque, Mapping, Optional, Protocol, Sequence
+from typing import Protocol
 
 import torch
 
@@ -110,6 +111,8 @@ def __init__(
         }
         self._device_copy_timeout_s = float(device_copy_timeout_s)
         self._copy_stream: torch.cuda.Stream | None = None
+        self._host_flags: torch.Tensor | None = None
+        self._copy_event: torch.cuda.Event | None = None
         self._retired_copies: list[tuple[torch.Tensor, torch.cuda.Event]] = []
         if workspace.device.type == "cuda":
             self._copy_stream = torch.cuda.Stream(device=workspace.device)
@@ -123,13 +126,17 @@ def _read_cuda_flags(self, flags: torch.Tensor) -> tuple[int, ...]:
         assert self._copy_stream is not None
         self._prune_retired_copies()
 
-        host_flags = torch.empty(
-            (self._ep_size,),
-            dtype=torch.int32,
-            device="cpu",
-            pin_memory=prefer_pinned(),
-        )
-        event = torch.cuda.Event(blocking=False)
+        if self._host_flags is None:
+            self._host_flags = torch.empty(
+                (self._ep_size,),
+                dtype=torch.int32,
+                device="cpu",
+                pin_memory=prefer_pinned(),
+            )
+        if self._copy_event is None:
+            self._copy_event = torch.cuda.Event(blocking=False)
+        host_flags = self._host_flags
+        event = self._copy_event
         with torch.cuda.device(flags.device), torch.cuda.stream(self._copy_stream):
             host_flags.copy_(flags.detach(), non_blocking=True)
             event.record(self._copy_stream)
@@ -139,6 +146,8 @@ def _read_cuda_flags(self, flags: torch.Tensor) -> tuple[int, ...]:
             remaining_s = deadline_s - time.monotonic()
             if remaining_s <= 0:
                 self._retired_copies.append((host_flags, event))
+                self._host_flags = None
+                self._copy_event = None
                 raise CompletionFlagReadTimeout(
                     "timed out copying AlltoAll completion flags to host"
                 )
@@ -175,8 +184,8 @@ def __init__(
         completion_reader: CompletionFlagReader,
         timeout_s: float = DEFAULT_ALLTOALL_WATCHDOG_TIMEOUT_S,
         poll_interval_s: float = DEFAULT_ALLTOALL_WATCHDOG_POLL_INTERVAL_S,
-        health: Optional[EPGroupHealthLike] = None,
-        on_timeout: Optional[Callable[[AlltoAllWatchdogTimeout], None]] = None,
+        health: EPGroupHealthLike | None = None,
+        on_timeout: Callable[[AlltoAllWatchdogTimeout], None] | None = None,
     ) -> None:
         if ep_size <= 0:
             raise ValueError(f"ep_size must be > 0, got {ep_size}")
@@ -196,7 +205,7 @@ def __init__(
         self._on_timeout = on_timeout
 
         self._cv = threading.Condition()
-        self._queue: Deque[_CollectiveWatch] = deque()
+        self._queue: deque[_CollectiveWatch] = deque()
         self._closed = False
         self._stopping = False
         self._thread: threading.Thread | None = None
@@ -213,8 +222,8 @@ def from_workspace(
         ep_size: int,
         timeout_s: float = DEFAULT_ALLTOALL_WATCHDOG_TIMEOUT_S,
         poll_interval_s: float = DEFAULT_ALLTOALL_WATCHDOG_POLL_INTERVAL_S,
-        health: Optional[EPGroupHealthLike] = None,
-        on_timeout: Optional[Callable[[AlltoAllWatchdogTimeout], None]] = None,
+        health: EPGroupHealthLike | None = None,
+        on_timeout: Callable[[AlltoAllWatchdogTimeout], None] | None = None,
     ) -> "AlltoAllWatchdog":
         """Build a watchdog from the MoE AlltoAll workspace and metainfo."""
         dispatch_offset = int(
@@ -421,7 +430,7 @@ def _run(self) -> None:
             except CompletionFlagReadTimeout:
                 observed_flags = last_observed_flags
                 poll_timed_out = True
-            except BaseException as exc:  # noqa: BLE001 - keep watchdog failures visible.
+            except Exception as exc:  # noqa: BLE001 - keep watchdog failures visible.
                 with self._cv:
                     self._last_error = exc
                     self._queue.clear()
diff --git a/tensorrt_llm/_torch/distributed/moe_alltoall.py b/tensorrt_llm/_torch/distributed/moe_alltoall.py
@@ -9,6 +9,7 @@
 
 import os
 import sys
+import threading
 from dataclasses import dataclass
 from typing import Callable, Dict, Optional
 
@@ -212,6 +213,8 @@ def __init__(
                 "mnnvl_mem": mnnvl_mem,
                 "workspace": workspace,
                 "metainfo": metainfo,
+                "watchdog_flag_generation": 0,
+                "watchdog_flag_generation_lock": threading.Lock(),
             }
         else:
             assert self._WORKSPACE[
@@ -229,17 +232,20 @@ def __init__(
         self.mnnvl_mem = self._WORKSPACE["mnnvl_mem"]
         self.workspace = self._WORKSPACE["workspace"]
         self.metainfo = self._WORKSPACE["metainfo"]
+        if "watchdog_flag_generation_lock" not in self._WORKSPACE:
+            self._WORKSPACE["watchdog_flag_generation_lock"] = threading.Lock()
+            self._WORKSPACE[
+                "watchdog_flag_generation"] = self._read_current_flag_val()
         # Internal state
         self._state: _A2AState = _A2AState()
         self.ep_group_health = ep_group_health
         self._destroyed = False
-        self._watchdog_flag_generation = 0
         self._alltoall_watchdog: AlltoAllWatchdog | None = None
         if (alltoall_watchdog_timeout_s is None
                 and self.ep_group_health is not None):
             alltoall_watchdog_timeout_s = DEFAULT_ALLTOALL_WATCHDOG_TIMEOUT_S
         if alltoall_watchdog_timeout_s is not None:
-            self._watchdog_flag_generation = self._read_current_flag_val()
+            self._sync_watchdog_flag_generation()
             self._alltoall_watchdog = AlltoAllWatchdog.from_workspace(
                 workspace=self.workspace,
                 metainfo=self.metainfo,
@@ -276,6 +282,25 @@ def _read_current_flag_val(self) -> int:
             flag_val = flag_val.detach().cpu()
         return int(flag_val.item())
 
+    def _sync_watchdog_flag_generation(self) -> None:
+        workspace_state = self._WORKSPACE
+        assert workspace_state is not None
+        lock = workspace_state["watchdog_flag_generation_lock"]
+        with lock:
+            workspace_state["watchdog_flag_generation"] = max(
+                int(workspace_state["watchdog_flag_generation"]),
+                self._read_current_flag_val(),
+            )
+
+    def _next_watchdog_flag_generation(self) -> int:
+        workspace_state = self._WORKSPACE
+        assert workspace_state is not None
+        lock = workspace_state["watchdog_flag_generation_lock"]
+        with lock:
+            workspace_state["watchdog_flag_generation"] = (
+                int(workspace_state["watchdog_flag_generation"]) + 1)
+            return int(workspace_state["watchdog_flag_generation"])
+
     def _get_active_rank_mask_tensor(
             self,
             active_rank_mask: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
@@ -302,10 +327,9 @@ def _watch_collective(self, phase: str,
                           active_rank_mask: Optional[torch.Tensor]) -> None:
         if self._alltoall_watchdog is None:
             return
-        self._watchdog_flag_generation += 1
         self._alltoall_watchdog.watch(
             phase=phase,
-            expected_flag=self._watchdog_flag_generation,
+            expected_flag=self._next_watchdog_flag_generation(),
             active_mask=self._active_mask_int(active_rank_mask),
         )
 
diff --git a/tensorrt_llm/_torch/modules/fused_moe/communication/nvlink_one_sided.py b/tensorrt_llm/_torch/modules/fused_moe/communication/nvlink_one_sided.py
@@ -25,6 +25,7 @@
 """
 
 import os
+import threading
 from typing import Callable, Dict, List, Optional, Tuple
 
 import torch
@@ -287,6 +288,8 @@ def __init__(
                 "mnnvl_mem": mnnvl_mem,
                 "workspace": workspace,
                 "metainfo": metainfo,
+                "watchdog_flag_generation": 0,
+                "watchdog_flag_generation_lock": threading.Lock(),
             }
             NVLinkOneSided._WORKSPACES[self._workspace_key] = workspace_state
         else:
@@ -312,17 +315,20 @@ def __init__(
             NVLinkOneSided._WORKSPACE_REFCOUNTS.get(self._workspace_key, 0) + 1
         )
         self._destroyed = False
+        self._workspace_state = workspace_state
         self.mnnvl_mem = workspace_state["mnnvl_mem"]
         self.workspace = workspace_state["workspace"]
         self.moe_a2a_metainfo = workspace_state["metainfo"]
         self.max_num_tokens_per_rank = workspace_state["max_num_tokens_per_rank"]
+        if "watchdog_flag_generation_lock" not in workspace_state:
+            workspace_state["watchdog_flag_generation_lock"] = threading.Lock()
+            workspace_state["watchdog_flag_generation"] = self._read_current_flag_val()
         self.ep_group_health = ep_group_health
-        self._watchdog_flag_generation = 0
         self._alltoall_watchdog: AlltoAllWatchdog | None = None
         if alltoall_watchdog_timeout_s is None and self.ep_group_health is not None:
             alltoall_watchdog_timeout_s = DEFAULT_ALLTOALL_WATCHDOG_TIMEOUT_S
         if alltoall_watchdog_timeout_s is not None:
-            self._watchdog_flag_generation = self._read_current_flag_val()
+            self._sync_watchdog_flag_generation()
             self._alltoall_watchdog = AlltoAllWatchdog.from_workspace(
                 workspace=self.workspace,
                 metainfo=self.moe_a2a_metainfo,
@@ -354,6 +360,22 @@ def _read_current_flag_val(self) -> int:
             flag_val = flag_val.detach().cpu()
         return int(flag_val.item())
 
+    def _sync_watchdog_flag_generation(self) -> None:
+        lock = self._workspace_state["watchdog_flag_generation_lock"]
+        with lock:
+            self._workspace_state["watchdog_flag_generation"] = max(
+                int(self._workspace_state["watchdog_flag_generation"]),
+                self._read_current_flag_val(),
+            )
+
+    def _next_watchdog_flag_generation(self) -> int:
+        lock = self._workspace_state["watchdog_flag_generation_lock"]
+        with lock:
+            self._workspace_state["watchdog_flag_generation"] = (
+                int(self._workspace_state["watchdog_flag_generation"]) + 1
+            )
+            return int(self._workspace_state["watchdog_flag_generation"])
+
     def _get_active_rank_mask_tensor(
         self, active_rank_mask: Optional[torch.Tensor]
     ) -> Optional[torch.Tensor]:
@@ -374,10 +396,9 @@ def _active_mask_int(self, active_rank_mask: Optional[torch.Tensor]) -> Optional
     def _watch_collective(self, phase: str, active_rank_mask: Optional[torch.Tensor]) -> None:
         if self._alltoall_watchdog is None:
             return
-        self._watchdog_flag_generation += 1
         self._alltoall_watchdog.watch(
             phase=phase,
-            expected_flag=self._watchdog_flag_generation,
+            expected_flag=self._next_watchdog_flag_generation(),
             active_mask=self._active_mask_int(active_rank_mask),
         )
 
@@ -424,6 +445,7 @@ def destroy(self):
         self.mnnvl_mem = None
         self.workspace = None
         self.moe_a2a_metainfo = None
+        self._workspace_state = None
         self._dispatch_state = {"phase": "destroyed"}
 
     def is_workload_feasible(self, all_rank_num_tokens: List[int], num_chunks: int) -> bool:
diff --git a/tests/unittest/_torch/modules/moe/test_moe_comm.py b/tests/unittest/_torch/modules/moe/test_moe_comm.py
diff --git a/tests/unittest/_torch/modules/test_alltoall_watchdog.py b/tests/unittest/_torch/modules/test_alltoall_watchdog.py