[None][feat] WideEP FT: add active_rank_mask to NVLink AlltoAll kernels

chienchunhung · chienchunhung · commit ea9466eed686 · 2026-06-22T10:57:08.000-07:00
Eliminates the infinite-spin AlltoAll hang that turns a single GPU failure in a Wide-EP group into a 5-minute HangDetector fire + full restart. The dispatch and combine kernels now take a uint64[2] bitmask of currently-alive EP ranks; dead ranks are skipped on every completion-flag write/wait, peer recv_counter store, EPLB stats write, and per-token routing decision (dead-targeted slots collapse to the same -1 sentinel combine already uses for duplicates).

The mask is optional on both torch ops; omitting it (or passing all-ones) produces bit-identical output to the pre-change kernel. kMaxRanks is bumped 64 -> 128 to cover NVL72 with headroom; kRankMaskWords = 2 names the kernel ABI explicitly.

Tests cover (a) all-ones mask matches no-mask bit-for-bit, and (b) one rank masked dead -> surviving ranks complete dispatch+combine without hang, dead-targeted topk slots dropped, in tests/unittest/_torch/multi_gpu/test_moe_a2a_rank_mask.py.

Signed-off-by: Chien-Chun Hung <2679986+chienchunhung@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu
@@ -210,6 +210,14 @@ __device__ __forceinline__ int compute_target_rank_id(int expert_id, int base, i
     return remainder + (expert_id - split) / base;
 }
 
+// Returns true iff bit `rank` is set in `mask`, an array of uint64 words laid out so that
+// word 0 holds ranks 0..63, word 1 holds ranks 64..127, and so on.
+__device__ __forceinline__ bool is_rank_active(uint64_t const* mask, int rank)
+{
+    uint64_t const word = mask[rank >> 6];      // rank / 64 selects the word
+    return ((word >> (rank & 63)) & 1ULL) != 0; // rank % 64 selects the bit inside it
+}
+
 // ============================================================================
 // Helper Functions for Vectorized Memory Operations
 // ============================================================================
@@ -432,7 +440,12 @@ __global__ void moeA2ADispatchKernel(int32_t const* token_selected_experts, // [
             // Supports the non-divisible case where num_experts % ep_size != 0.
             int target_rank = compute_target_rank_id(expert_id, ep_base, ep_remainder);
 
-            if (already_copied & (1ULL << target_rank))
+            // Skip duplicates AND dead ranks: both produce the same -1 sentinel that combine
+            // checks via topk_send_indices[k] < 0. A token whose only target is dead is dropped
+            // from this collective; higher-layer logic (EPLB redistribution) is responsible
+            // for re-routing such tokens on subsequent iterations.
+            bool const target_dead = !is_rank_active(ptrs.active_rank_mask, target_rank);
+            if ((already_copied & (1ULL << target_rank)) || target_dead)
             {
                 if (thread_idx == 0)
                 {
@@ -511,20 +524,26 @@ __global__ void moeA2ADispatchKernel(int32_t const* token_selected_experts, // [
 
         if (is_last_token)
         {
-// Store send_counters to recv_counters
+// Store send_counters to recv_counters.
+// Skip masked target ranks: their symmetric memory may be inaccessible.
 #pragma unroll 1 // No unroll as one iter is typically enough
             for (int target_rank = lane_id; target_rank < ep_size; target_rank += warpSize)
             {
+                if (!is_rank_active(ptrs.active_rank_mask, target_rank))
+                    continue;
                 int send_count = ptrs.send_counters[target_rank];
                 ptrs.recv_counters[target_rank][rank_id] = send_count;
             }
 
             if constexpr (ENABLE_EPLB)
             {
                 // Write local stats into peer buffers before the release fence below.
+                // Skip masked target ranks for the same reason as above.
 #pragma unroll 1
                 for (int target_rank = 0; target_rank < ep_size; ++target_rank)
                 {
+                    if (!is_rank_active(ptrs.active_rank_mask, target_rank))
+                        continue;
                     int* target_stats = ptrs.eplb_gathered_stats[target_rank];
                     for (int expert_id = lane_id; expert_id < eplb_stats_num_experts; expert_id += warpSize)
                     {
@@ -543,9 +562,13 @@ __global__ void moeA2ADispatchKernel(int32_t const* token_selected_experts, // [
 #else
             asm volatile("fence.acq_rel.sys;");
 #endif
+            // Signal completion to all active peers; skip dead ranks (their symmetric memory
+            // is unreachable).
 #pragma unroll 1 // No unroll as one iter is typically enough
             for (int target_rank = lane_id; target_rank < ep_size; target_rank += warpSize)
             {
+                if (!is_rank_active(ptrs.active_rank_mask, target_rank))
+                    continue;
                 uint32_t* flag_addr = &ptrs.completion_flags[target_rank][rank_id];
                 asm volatile("st.relaxed.sys.u32 [%0], %1;" ::"l"(flag_addr), "r"(expected_value));
 
@@ -555,9 +578,13 @@ __global__ void moeA2ADispatchKernel(int32_t const* token_selected_experts, // [
 #endif
             }
 
+            // Wait for all active peers to signal; skip dead ranks (otherwise we would
+            // spin forever — this is the bug the rank-mask is here to prevent).
 #pragma unroll 1 // No unroll
             for (int peer_rank = lane_id; peer_rank < ep_size; peer_rank += warpSize)
             {
+                if (!is_rank_active(ptrs.active_rank_mask, peer_rank))
+                    continue;
                 bool flag_set = false;
                 auto s = clock64();
                 do
@@ -605,6 +632,10 @@ void moe_a2a_dispatch_launch(MoeA2ADispatchParams const& params)
     TLLM_CHECK(params.ep_size > 0 && params.ep_size <= kMaxRanks);
     TLLM_CHECK(params.local_num_tokens >= 0);
     TLLM_CHECK(params.num_payloads > 0 && params.num_payloads <= kMaxPayloads);
+    // The local rank must always be marked active in its own view of the mask;
+    // otherwise the kernel itself would be running on a "dead" rank.
+    TLLM_CHECK_WITH_INFO((params.active_rank_mask[params.ep_rank >> 6] >> (params.ep_rank & 63)) & 1ULL,
+        "active_rank_mask must mark the local ep_rank (%d) as active", params.ep_rank);
 
     // Prepare kernel pointers struct
     DispatchKernelPointers kernel_ptrs = {};
@@ -642,6 +673,12 @@ void moe_a2a_dispatch_launch(MoeA2ADispatchParams const& params)
     kernel_ptrs.topk_send_indices = params.topk_send_indices;
     kernel_ptrs.eplb_local_stats = params.eplb_local_stats;
 
+    // Copy active-rank bitmask into the kernel pointers struct
+    for (int w = 0; w < kRankMaskWords; ++w)
+    {
+        kernel_ptrs.active_rank_mask[w] = params.active_rank_mask[w];
+    }
+
     int const kBlockSize = tensorrt_llm::common::getEnvMoeA2ADispatchBlockSize();
 
     // One block per token: grid_size == local_num_tokens. If 0, launch a single block to
@@ -1153,9 +1190,13 @@ __global__ void moeA2ACombineKernel(
 
         if (blockIdx.x == 0)
         {
+            // Signal readiness to all active peers; skip dead ranks (their symmetric memory
+            // is unreachable).
 #pragma unroll 1 // No unroll
             for (int peer_rank = lane_id; peer_rank < ep_size; peer_rank += warpSize)
             {
+                if (!is_rank_active(ptrs.active_rank_mask, peer_rank))
+                    continue;
                 uint32_t* flag_addr = &ptrs.completion_flags[peer_rank][rank_id];
                 asm volatile("st.relaxed.sys.u32 [%0], %1;" ::"l"(flag_addr), "r"(expected_value));
 #if ENABLE_DEBUG_PRINT
@@ -1165,9 +1206,13 @@ __global__ void moeA2ACombineKernel(
             }
         }
 
+        // Wait for all active peers to signal; skip dead ranks (otherwise we would spin
+        // forever — this is the bug the rank-mask is here to prevent).
 #pragma unroll 1 // No unroll
         for (int peer_rank = lane_id; peer_rank < ep_size; peer_rank += warpSize)
         {
+            if (!is_rank_active(ptrs.active_rank_mask, peer_rank))
+                continue;
             bool flag_set = false;
             auto s = clock64();
             do
@@ -1273,6 +1318,10 @@ void moe_a2a_combine_launch(MoeA2ACombineParams const& params)
     TLLM_CHECK(params.ep_size > 0 && params.ep_size <= kMaxRanks);
     TLLM_CHECK(params.local_num_tokens >= 0);
     TLLM_CHECK(params.elements_per_token > 0);
+    // The local rank must always be marked active in its own view of the mask;
+    // otherwise the kernel itself would be running on a "dead" rank.
+    TLLM_CHECK_WITH_INFO((params.active_rank_mask[params.ep_rank >> 6] >> (params.ep_rank & 63)) & 1ULL,
+        "active_rank_mask must mark the local ep_rank (%d) as active", params.ep_rank);
 
     // Configure kernel launch (one block per token).
     int const kBlockSize = tensorrt_llm::common::getEnvMoeA2ACombineBlockSize();
@@ -1306,6 +1355,12 @@ void moe_a2a_combine_launch(MoeA2ACombineParams const& params)
     kernel_ptrs.topk_target_ranks = params.topk_target_ranks;
     kernel_ptrs.topk_send_indices = params.topk_send_indices;
 
+    // Copy active-rank bitmask into the kernel pointers struct
+    for (int w = 0; w < kRankMaskWords; ++w)
+    {
+        kernel_ptrs.active_rank_mask[w] = params.active_rank_mask[w];
+    }
+
     // stride_per_token: byte distance between tokens in the recv buffer.
     //   FP8 external payload: EPT × 1            (compact FP8 layout)
     //   FP8 in-place / non-FP8: EPT × sizeof(PayloadT)  (payload-dtype stride)
diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h
@@ -26,9 +26,12 @@ namespace kernels::moe_comm
 {
 
 // Configuration constants
-static constexpr int kMaxTopK = 22;    // Maximum top-k experts per token
-static constexpr int kMaxPayloads = 4; // Maximum number of different payload types
-static constexpr int kMaxRanks = 64;   // Maximum supported EP size
+static constexpr int kMaxTopK = 22;      // Maximum top-k experts per token
+static constexpr int kMaxPayloads = 4;   // Maximum number of different payload types
+static constexpr int kMaxRanks = 128;    // Maximum supported EP size (covers NVL72 with headroom)
+static constexpr int kRankMaskWords = 2; // uint64 words to hold the active-rank bitmask
+                                         // (kRankMaskWords * 64 must be >= kMaxRanks)
+static_assert(kRankMaskWords * 64 >= kMaxRanks, "active_rank_mask too small for kMaxRanks");
 
 // Describes a single payload type to be communicated
 struct PayloadDescriptor
@@ -65,6 +68,12 @@ struct DispatchKernelPointers
     // Optional: Statistics for EPLB
     int const* eplb_local_stats;         // [eplb_stats_num_experts]
     int* eplb_gathered_stats[kMaxRanks]; // [ep_size, eplb_stats_num_experts] per rank
+
+    // Active-rank bitmask: bit i set => rank i is alive and participates in this collective.
+    // Word 0 covers ranks 0..63; word 1 covers ranks 64..127. Tokens routed to a masked
+    // rank are dropped (topk_*[k] = -1); flag writes/waits to/from masked peers are skipped.
+    // The local rank's own bit must always be set; this is checked at launch time.
+    uint64_t active_rank_mask[kRankMaskWords];
 };
 
 // Combine kernel pointers - non-const output in src_data_ptrs[0], const recv buffers
@@ -82,6 +91,11 @@ struct CombineKernelPointers
     // Top-K compact routing info per local token (size: [local_num_tokens, top_k])
     int const* topk_target_ranks; // target rank per k, -1 for duplicates
     int const* topk_send_indices; // dst index per k, -1 for duplicates
+
+    // Active-rank bitmask: see DispatchKernelPointers::active_rank_mask. Combine skips flag
+    // writes/waits to/from masked peers; per-token accumulation uses topk_send_indices[k] < 0
+    // (set by dispatch) to skip dead-targeted slots, so no explicit mask check is needed there.
+    uint64_t active_rank_mask[kRankMaskWords];
 };
 
 // Dispatch phase parameters
@@ -125,6 +139,11 @@ struct MoeA2ADispatchParams
     int const* eplb_local_stats;         // [eplb_stats_num_experts]
     int* eplb_gathered_stats[kMaxRanks]; // [ep_size, eplb_stats_num_experts] per rank
 
+    // Active-rank bitmask: see DispatchKernelPointers::active_rank_mask. The launch function
+    // copies these words into the kernel pointers struct. This member has no default initializer;
+    // the torch op fills it with all-ones when no mask is given ("no masking" behavior).
+    uint64_t active_rank_mask[kRankMaskWords];
+
     // CUDA stream
     cudaStream_t stream;
 };
@@ -170,6 +189,11 @@ struct MoeA2ACombineParams
                                            // rank has signaled the target rank
     void const* recv_buffers[kMaxRanks];   // Per-rank receive buffers (only for single payload)
 
+    // Active-rank bitmask: see DispatchKernelPointers::active_rank_mask. The launch function
+    // copies these words into the kernel pointers struct. This member has no default initializer;
+    // the torch op fills it with all-ones when no mask is given ("no masking" behavior).
+    uint64_t active_rank_mask[kRankMaskWords];
+
     // CUDA stream
     cudaStream_t stream;
 };
diff --git a/cpp/tensorrt_llm/thop/moeAlltoAllOp.cpp b/cpp/tensorrt_llm/thop/moeAlltoAllOp.cpp
@@ -42,6 +42,40 @@ inline size_t alignOffset(size_t offset, size_t alignment)
     return (offset + alignment - 1) & ~(alignment - 1);
 }
 
+// Resolve an optional rank-mask tensor into the fixed-width uint64 array `out`.
+// No mask (nullopt / undefined tensor): every word is set to all-ones ("all ranks active").
+// Otherwise the tensor must be a contiguous 1D CPU uint64 tensor of kRankMaskWords elements,
+// epRank must be a valid bit index into the mask, and epRank's own bit must be set.
+// Violations throw via TORCH_CHECK so the error surfaces at the op boundary, not at kernel launch.
+inline void resolveActiveRankMask(torch::optional<torch::Tensor> const& maskTensor, int64_t epRank,
+    uint64_t (&out)[tensorrt_llm::kernels::moe_comm::kRankMaskWords])
+{
+    using tensorrt_llm::kernels::moe_comm::kRankMaskWords;
+    if (!maskTensor.has_value() || !maskTensor.value().defined())
+    {
+        for (int w = 0; w < kRankMaskWords; ++w)
+        {
+            out[w] = ~uint64_t{0};
+        }
+        return;
+    }
+    torch::Tensor const& t = maskTensor.value();
+    TORCH_CHECK(t.is_cpu(), "active_rank_mask must be a CPU tensor");
+    TORCH_CHECK(t.scalar_type() == torch::kUInt64, "active_rank_mask must have dtype uint64");
+    TORCH_CHECK(t.dim() == 1, "active_rank_mask must be a 1D tensor");
+    TORCH_CHECK(t.numel() == kRankMaskWords, "active_rank_mask must have exactly ", kRankMaskWords, " uint64 elements");
+    TORCH_CHECK(t.is_contiguous(), "active_rank_mask must be contiguous");
+    auto const* src = static_cast<uint64_t const*>(t.const_data_ptr());
+    for (int w = 0; w < kRankMaskWords; ++w)
+    {
+        out[w] = src[w];
+    }
+    // Range-check epRank before indexing `out`; then require the local rank's own bit to be set.
+    TORCH_CHECK(epRank >= 0 && epRank < kRankMaskWords * 64, "ep_rank out of range: ", epRank);
+    TORCH_CHECK((out[epRank >> 6] >> (epRank & 63)) & 1ULL, "active_rank_mask must mark the local ep_rank (", epRank,
+        ") as active");
+}
+
 // Calculate auxiliary data offsets
 MoeA2ADataOffsets calculateOffsets(int epSize, int maxNumTokens, int eplbStatsNumExperts)
 {
@@ -181,7 +215,8 @@ torch::Tensor moeA2AInitializeOp(torch::Tensor const& workspace, int64_t epRank,
 std::tuple<std::vector<torch::Tensor>, int64_t, torch::Tensor> moeA2ADispatchOp(
     torch::Tensor const& tokenSelectedExperts, std::vector<torch::Tensor> const& inputPayloads,
     torch::Tensor const& workspace, torch::Tensor const& metainfo, int64_t runtimeMaxTokensPerRank, int64_t epRank,
-    int64_t epSize, int64_t topK, int64_t numExperts, torch::optional<torch::Tensor> eplbLocalStats)
+    int64_t epSize, int64_t topK, int64_t numExperts, torch::optional<torch::Tensor> eplbLocalStats,
+    torch::optional<torch::Tensor> activeRankMask)
 {
     using tensorrt_llm::kernels::moe_comm::PayloadDescriptor;
     using tensorrt_llm::kernels::moe_comm::MoeA2ADispatchParams;
@@ -360,6 +395,10 @@ std::tuple<std::vector<torch::Tensor>, int64_t, torch::Tensor> moeA2ADispatchOp(
         params.eplb_local_stats = nullptr;
     }
 
+    // Resolve the optional active-rank mask. Default (no mask) = all bits set, which
+    // exactly reproduces the pre-fault-tolerance kernel behavior.
+    resolveActiveRankMask(activeRankMask, epRank, params.active_rank_mask);
+
     params.stream = at::cuda::getCurrentCUDAStream();
 
     // Prepare for dispatch (zero counters/indices and increment flag_val)
@@ -413,7 +452,8 @@ std::tuple<std::vector<torch::Tensor>, int64_t, torch::Tensor> moeA2ADispatchOp(
 // In both cases, the combine kernel reads from the workspace at 'combinePayloadOffset'.
 torch::Tensor moeA2ACombineOp(torch::Tensor const& payload, int64_t localNumTokens, torch::Tensor const& workspace,
     torch::Tensor const& metainfo, int64_t runtimeMaxTokensPerRank, int64_t epRank, int64_t epSize, int64_t topK,
-    int64_t combinePayloadOffset, bool payloadInWorkspace, bool useLowPrecision = false)
+    int64_t combinePayloadOffset, bool payloadInWorkspace, bool useLowPrecision = false,
+    torch::optional<torch::Tensor> activeRankMask = torch::nullopt)
 {
     using tensorrt_llm::kernels::moe_comm::MoeA2ACombineParams;
     using tensorrt_llm::kernels::moe_comm::moe_a2a_combine_launch;
@@ -520,6 +560,9 @@ torch::Tensor moeA2ACombineOp(torch::Tensor const& payload, int64_t localNumToke
         params.recv_buffers[target_rank] = target_workspace_ptr + combinePayloadOffset;
     }
 
+    // Resolve the optional active-rank mask. Default (no mask) = all bits set.
+    resolveActiveRankMask(activeRankMask, epRank, params.active_rank_mask);
+
     params.stream = at::cuda::getCurrentCUDAStream();
 
     moe_a2a_prepare_combine_launch(params);
@@ -613,12 +656,14 @@ TORCH_LIBRARY_FRAGMENT(trtllm, module)
         "moe_a2a_dispatch(Tensor token_selected_experts, Tensor[] input_payloads, "
         "Tensor(a!->*) workspace, Tensor metainfo, int runtime_max_tokens_per_rank, "
         "int ep_rank, int ep_size, int top_k, int num_experts, "
-        "Tensor? eplb_local_stats=None) -> (Tensor(a!)[], int, Tensor(a!))");
+        "Tensor? eplb_local_stats=None, "
+        "Tensor? active_rank_mask=None) -> (Tensor(a!)[], int, Tensor(a!))");
     module.def(
         "moe_a2a_combine(Tensor(a) payload, int local_num_tokens,"
         "Tensor(a!) workspace, Tensor metainfo, int runtime_max_tokens_per_rank, "
         "int ep_rank, int ep_size, int top_k, int combine_payload_offset, "
-        "bool payload_in_workspace, bool use_low_precision=False) -> Tensor");
+        "bool payload_in_workspace, bool use_low_precision=False, "
+        "Tensor? active_rank_mask=None) -> Tensor");
     module.def(
         "moe_a2a_initialize(Tensor(a!) workspace, int ep_rank, int ep_size, int max_num_tokens_per_rank, "
         "int? eplb_stats_num_experts=None) -> Tensor");
diff --git a/tensorrt_llm/_torch/custom_ops/cpp_custom_ops.py b/tensorrt_llm/_torch/custom_ops/cpp_custom_ops.py
@@ -448,6 +448,7 @@ def _(
         top_k: int,
         num_experts: int,
         eplb_local_stats: Optional[torch.Tensor] = None,
+        active_rank_mask: Optional[torch.Tensor] = None,
     ) -> Tuple[List[torch.Tensor], int, torch.Tensor]:
         recv_tensors: List[torch.Tensor] = []
         for payload in input_payloads:
@@ -478,6 +479,7 @@ def _(
         combine_payload_offset: int,
         payload_in_workspace: bool,
         use_low_precision: bool = False,
+        active_rank_mask: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         return payload.new_empty((local_num_tokens, payload.shape[2]))
 
diff --git a/tensorrt_llm/_torch/modules/fused_moe/communication/nvlink_one_sided.py b/tensorrt_llm/_torch/modules/fused_moe/communication/nvlink_one_sided.py
@@ -51,7 +51,7 @@ class NVLinkOneSided(Communication):
     """
 
     # Constants from C++ (must match moeAlltoAllKernels.h)
-    MAX_RANKS = 64
+    MAX_RANKS = 128
     MAX_TOP_K = 8
     MAX_PAYLOADS = 8
 
diff --git a/tests/unittest/_torch/multi_gpu/test_moe_a2a_rank_mask.py b/tests/unittest/_torch/multi_gpu/test_moe_a2a_rank_mask.py