Address active rank mask review comments

chienchunhung · chienchunhung · commit e77620453e99 · 2026-06-22T10:57:08.000-07:00
Signed-off-by: Chien-Chun Hung <2679986+chienchunhung@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu
@@ -424,7 +424,7 @@ __global__ void moeA2ADispatchKernel(int32_t const* token_selected_experts, // [
         int* smem_topk_target_ranks = smem;
         int* smem_topk_send_indices = smem + TOP_K;
 
-        uint64_t already_copied = 0;
+        uint64_t already_copied[kRankMaskWords] = {};
         // Precompute the ceil/floor partition parameters once per thread, outside the
         // per-token TOP_K loop. The fast path (remainder == 0) then collapses to a single
         // integer divide per call, matching the pre-PR uniform-partition cost exactly.
@@ -444,8 +444,11 @@ __global__ void moeA2ADispatchKernel(int32_t const* token_selected_experts, // [
             // checks via topk_send_indices[k] < 0. A token whose only target is dead is dropped
             // from this collective; higher-layer logic (EPLB redistribution) is responsible
             // for re-routing such tokens on subsequent iterations.
+            int const mask_word = target_rank >> 6;
+            uint64_t const mask_bit = 1ULL << (target_rank & 63);
+            bool const target_already_copied = already_copied[mask_word] & mask_bit;
             bool const target_dead = !is_rank_active(ptrs.active_rank_mask, target_rank);
-            if ((already_copied & (1ULL << target_rank)) || target_dead)
+            if (target_already_copied || target_dead)
             {
                 if (thread_idx == 0)
                 {
@@ -470,7 +473,7 @@ __global__ void moeA2ADispatchKernel(int32_t const* token_selected_experts, // [
                 smem_topk_target_ranks[k] = target_rank;
                 smem_topk_send_indices[k] = dst_token_idx;
             }
-            already_copied |= 1ULL << target_rank;
+            already_copied[mask_word] |= mask_bit;
         }
         // Sync before dispatching data
         ThreadingPolicy::sync();
@@ -630,6 +633,7 @@ void moe_a2a_dispatch_launch(MoeA2ADispatchParams const& params)
     // Validate parameters
     TLLM_CHECK(params.top_k > 0 && params.top_k <= kMaxTopK);
     TLLM_CHECK(params.ep_size > 0 && params.ep_size <= kMaxRanks);
+    TLLM_CHECK(params.ep_rank >= 0 && params.ep_rank < params.ep_size);
     TLLM_CHECK(params.local_num_tokens >= 0);
     TLLM_CHECK(params.num_payloads > 0 && params.num_payloads <= kMaxPayloads);
     // The local rank must always be marked active in its own view of the mask;
@@ -1316,6 +1320,7 @@ void moe_a2a_combine_launch(MoeA2ACombineParams const& params)
     // Validate parameters
     TLLM_CHECK(params.top_k > 0 && params.top_k <= kMaxTopK);
     TLLM_CHECK(params.ep_size > 0 && params.ep_size <= kMaxRanks);
+    TLLM_CHECK(params.ep_rank >= 0 && params.ep_rank < params.ep_size);
     TLLM_CHECK(params.local_num_tokens >= 0);
     TLLM_CHECK(params.elements_per_token > 0);
     // The local rank must always be marked active in its own view of the mask;
diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h
@@ -142,7 +142,7 @@ struct MoeA2ADispatchParams
     // Active-rank bitmask: see DispatchKernelPointers::active_rank_mask. The launch function
     // copies these words into the kernel pointers struct. Defaults to all-ones for
     // backwards-compatible "no masking" behavior.
-    uint64_t active_rank_mask[kRankMaskWords];
+    uint64_t active_rank_mask[kRankMaskWords] = {~uint64_t{0}, ~uint64_t{0}};
 
     // CUDA stream
     cudaStream_t stream;
@@ -192,7 +192,7 @@ struct MoeA2ACombineParams
     // Active-rank bitmask: see DispatchKernelPointers::active_rank_mask. The launch function
     // copies these words into the kernel pointers struct. Defaults to all-ones for
     // backwards-compatible "no masking" behavior.
-    uint64_t active_rank_mask[kRankMaskWords];
+    uint64_t active_rank_mask[kRankMaskWords] = {~uint64_t{0}, ~uint64_t{0}};
 
     // CUDA stream
     cudaStream_t stream;
diff --git a/cpp/tensorrt_llm/thop/moeAlltoAllOp.cpp b/cpp/tensorrt_llm/thop/moeAlltoAllOp.cpp
@@ -52,6 +52,9 @@ inline void resolveActiveRankMask(torch::optional<torch::Tensor> const& maskTens
     uint64_t (&out)[tensorrt_llm::kernels::moe_comm::kRankMaskWords])
 {
     using tensorrt_llm::kernels::moe_comm::kRankMaskWords;
+    using tensorrt_llm::kernels::moe_comm::kMaxRanks;
+    TORCH_CHECK(
+        epRank >= 0 && epRank < kMaxRanks, "epRank must be in the range [0, ", kMaxRanks, ") for active_rank_mask");
     if (!maskTensor.has_value() || !maskTensor.value().defined())
     {
         for (int w = 0; w < kRankMaskWords; ++w)
@@ -151,11 +154,14 @@ MoeA2ADataOffsets calculateOffsets(int epSize, int maxNumTokens, int eplbStatsNu
 torch::Tensor moeA2AInitializeOp(torch::Tensor const& workspace, int64_t epRank, int64_t epSize, int64_t maxNumTokens,
     torch::optional<int64_t> eplbStatsNumExperts)
 {
+    using tensorrt_llm::kernels::moe_comm::kMaxRanks;
+
     // Validate inputs
     CHECK_TH_CUDA(workspace);
     CHECK_TYPE(workspace, torch::kUInt8);
     TORCH_CHECK(workspace.dim() == 2, "workspace must be a 2D tensor of shape [epSize, sizePerRank]");
     TORCH_CHECK(workspace.size(0) == epSize, "workspace first dimension must equal epSize");
+    TORCH_CHECK(epSize > 0 && epSize <= kMaxRanks, "epSize must be in the range (0, ", kMaxRanks, "]");
     TORCH_CHECK(epRank >= 0 && epRank < epSize, "epRank must be in the range [0, epSize)");
 
     // Initialize workspace to zero
@@ -223,6 +229,7 @@ std::tuple<std::vector<torch::Tensor>, int64_t, torch::Tensor> moeA2ADispatchOp(
     using tensorrt_llm::kernels::moe_comm::moe_a2a_dispatch_launch;
     using tensorrt_llm::kernels::moe_comm::kMaxTopK;
     using tensorrt_llm::kernels::moe_comm::kMaxPayloads;
+    using tensorrt_llm::kernels::moe_comm::kMaxRanks;
 
     // Validate inputs
     CHECK_INPUT(tokenSelectedExperts, torch::kInt32);
@@ -238,6 +245,7 @@ std::tuple<std::vector<torch::Tensor>, int64_t, torch::Tensor> moeA2ADispatchOp(
 
     int64_t localNumTokens = tokenSelectedExperts.size(0);
     TORCH_CHECK(runtimeMaxTokensPerRank > 0, "runtimeMaxTokensPerRank must be positive");
+    TORCH_CHECK(epSize > 0 && epSize <= kMaxRanks, "epSize must be in the range (0, ", kMaxRanks, "]");
     TORCH_CHECK(epRank >= 0 && epRank < epSize, "epRank must be in the range [0, epSize)");
     TORCH_CHECK(topK > 0 && topK <= kMaxTopK, "topK must be in the range (0, kMaxTopK]");
     TORCH_CHECK(!inputPayloads.empty(), "inputPayloads must not be empty");
@@ -458,6 +466,7 @@ torch::Tensor moeA2ACombineOp(torch::Tensor const& payload, int64_t localNumToke
     using tensorrt_llm::kernels::moe_comm::MoeA2ACombineParams;
     using tensorrt_llm::kernels::moe_comm::moe_a2a_combine_launch;
     using tensorrt_llm::kernels::moe_comm::kMaxTopK;
+    using tensorrt_llm::kernels::moe_comm::kMaxRanks;
 
     // Validate inputs
     CHECK_TH_CUDA(payload);
@@ -471,6 +480,7 @@ torch::Tensor moeA2ACombineOp(torch::Tensor const& payload, int64_t localNumToke
     TORCH_CHECK(reinterpret_cast<uintptr_t>(payload.data_ptr()) % 16 == 0, "payload must be 16-byte aligned");
     int64_t elementsPerToken = payload.size(2);
     TORCH_CHECK(elementsPerToken > 0, "elementsPerToken must be positive");
+    TORCH_CHECK(epSize > 0 && epSize <= kMaxRanks, "epSize must be in the range (0, ", kMaxRanks, "]");
     TORCH_CHECK(epRank >= 0 && epRank < epSize, "epRank must be in the range [0, epSize)");
     TORCH_CHECK(topK > 0 && topK <= kMaxTopK, "topK must be in the range (0, kMaxTopK]");
 
diff --git a/tests/unittest/_torch/multi_gpu/test_moe_a2a_rank_mask.py b/tests/unittest/_torch/multi_gpu/test_moe_a2a_rank_mask.py
@@ -32,9 +32,9 @@
 
 import pickle
 import sys
-import traceback
 
 import cloudpickle
+import pynvml
 import pytest
 import torch
 from mpi4py import MPI
@@ -63,6 +63,16 @@ def setup_test():
     tllm.logger.set_level("error")
 
 
+def _skip_if_mnnvl_unsupported() -> None:
+    try:
+        MnnvlMemory.initialize()
+        supports_mnnvl = MnnvlMemory.supports_mnnvl()
+    except (RuntimeError, pynvml.NVMLError) as exc:
+        pytest.skip(f"MNNVL not supported on this system: {exc}")
+    if not supports_mnnvl:
+        pytest.skip("MNNVL not supported on this system")
+
+
 def _ep_mask_words(ep_size: int, dead_ranks: set[int]) -> torch.Tensor:
     """Build the uint64[EP_MASK_NUM_WORDS] CPU tensor expected by the C++ op."""
     mask_int = ((1 << ep_size) - 1) & ~sum(1 << r for r in dead_ranks)
@@ -162,41 +172,35 @@ def _worker_all_active_matches_no_mask(
 ):
     rank = tllm.mpi_rank()
     torch.cuda.set_device(rank)
-    try:
-        mapping = Mapping(rank=rank, tp_size=ep_size, moe_ep_size=ep_size, world_size=ep_size)
-        moe_a2a = MoeAlltoAll(
-            mapping=mapping,
-            max_num_tokens=local_num_tokens,
-            top_k=top_k,
-            num_slots=num_experts,
-            workspace_size_per_rank=workspace_size_per_rank,
-        )
+    mapping = Mapping(rank=rank, tp_size=ep_size, moe_ep_size=ep_size, world_size=ep_size)
+    moe_a2a = MoeAlltoAll(
+        mapping=mapping,
+        max_num_tokens=local_num_tokens,
+        top_k=top_k,
+        num_slots=num_experts,
+        workspace_size_per_rank=workspace_size_per_rank,
+    )
 
-        # Same RNG seed across both runs => identical inputs.
-        torch.manual_seed(0xA2A + rank)
-        token_selected_experts = _generate_token_selected_experts(
-            local_num_tokens, num_experts, top_k
-        )
-        payload = _make_payload(local_num_tokens, hidden_size, rank)
+    # Same RNG seed across both runs => identical inputs.
+    torch.manual_seed(0xA2A + rank)
+    token_selected_experts = _generate_token_selected_experts(local_num_tokens, num_experts, top_k)
+    payload = _make_payload(local_num_tokens, hidden_size, rank)
 
-        out_no_mask, topk_no_mask = _run_dispatch_combine(
-            moe_a2a, token_selected_experts, payload, local_num_tokens, active_rank_mask=None
-        )
-        out_all_active, topk_all_active = _run_dispatch_combine(
-            moe_a2a,
-            token_selected_experts,
-            payload,
-            local_num_tokens,
-            active_rank_mask=_ep_mask_words(ep_size, dead_ranks=set()),
-        )
+    out_no_mask, topk_no_mask = _run_dispatch_combine(
+        moe_a2a, token_selected_experts, payload, local_num_tokens, active_rank_mask=None
+    )
+    out_all_active, topk_all_active = _run_dispatch_combine(
+        moe_a2a,
+        token_selected_experts,
+        payload,
+        local_num_tokens,
+        active_rank_mask=_ep_mask_words(ep_size, dead_ranks=set()),
+    )
 
-        return (
-            torch.equal(out_no_mask, out_all_active),
-            torch.equal(topk_no_mask, topk_all_active),
-        )
-    except Exception:
-        traceback.print_exc()
-        raise
+    return (
+        torch.equal(out_no_mask, out_all_active),
+        torch.equal(topk_no_mask, topk_all_active),
+    )
 
 
 # ---------------------------------------------------------------------------
@@ -215,53 +219,47 @@ def _worker_one_rank_masked(
 ):
     rank = tllm.mpi_rank()
     torch.cuda.set_device(rank)
-    try:
-        mapping = Mapping(rank=rank, tp_size=ep_size, moe_ep_size=ep_size, world_size=ep_size)
-        # Every rank participates in workspace init (it has MPI barriers internally).
-        moe_a2a = MoeAlltoAll(
-            mapping=mapping,
-            max_num_tokens=local_num_tokens,
-            top_k=top_k,
-            num_slots=num_experts,
-            workspace_size_per_rank=workspace_size_per_rank,
-        )
+    mapping = Mapping(rank=rank, tp_size=ep_size, moe_ep_size=ep_size, world_size=ep_size)
+    # Every rank participates in workspace init (it has MPI barriers internally).
+    moe_a2a = MoeAlltoAll(
+        mapping=mapping,
+        max_num_tokens=local_num_tokens,
+        top_k=top_k,
+        num_slots=num_experts,
+        workspace_size_per_rank=workspace_size_per_rank,
+    )
 
-        if rank == dead_rank:
-            # Simulate a dead rank: do not call dispatch/combine. Wait at a final
-            # barrier so the surviving ranks have someone to synchronize with at
-            # the end of the test. (The kernel itself never observes us because
-            # the surviving ranks pass a mask with our bit cleared.)
-            MPI.COMM_WORLD.barrier()
-            return ("dead", None, None, None)
-
-        torch.manual_seed(0xA2A + rank)
-        token_selected_experts = _generate_token_selected_experts(
-            local_num_tokens, num_experts, top_k
-        )
-        payload = _make_payload(local_num_tokens, hidden_size, rank)
+    if rank == dead_rank:
+        # Simulate a dead rank: do not call dispatch/combine. Wait at a final
+        # barrier so the surviving ranks have someone to synchronize with at
+        # the end of the test. (The kernel itself never observes us because
+        # the surviving ranks pass a mask with our bit cleared.)
+        MPI.COMM_WORLD.barrier()
+        return ("dead", None, None, None)
 
-        # Build mask with dead_rank's bit cleared.
-        mask = _ep_mask_words(ep_size, dead_ranks={dead_rank})
+    torch.manual_seed(0xA2A + rank)
+    token_selected_experts = _generate_token_selected_experts(local_num_tokens, num_experts, top_k)
+    payload = _make_payload(local_num_tokens, hidden_size, rank)
 
-        # Compute the per-token target ranks the way the kernel does so we can
-        # cross-check the workspace afterwards.
-        num_experts_per_rank = num_experts // ep_size
-        expected_target_ranks = (token_selected_experts // num_experts_per_rank).cpu()
+    # Build mask with dead_rank's bit cleared.
+    mask = _ep_mask_words(ep_size, dead_ranks={dead_rank})
 
-        combined, topk_target_ranks = _run_dispatch_combine(
-            moe_a2a, token_selected_experts, payload, local_num_tokens, active_rank_mask=mask
-        )
+    # Compute the per-token target ranks the way the kernel does so we can
+    # cross-check the workspace afterwards.
+    num_experts_per_rank = num_experts // ep_size
+    expected_target_ranks = (token_selected_experts // num_experts_per_rank).cpu()
 
-        MPI.COMM_WORLD.barrier()
-        return (
-            "alive",
-            combined,
-            topk_target_ranks,
-            expected_target_ranks,
-        )
-    except Exception:
-        traceback.print_exc()
-        raise
+    combined, topk_target_ranks = _run_dispatch_combine(
+        moe_a2a, token_selected_experts, payload, local_num_tokens, active_rank_mask=mask
+    )
+
+    MPI.COMM_WORLD.barrier()
+    return (
+        "alive",
+        combined,
+        topk_target_ranks,
+        expected_target_ranks,
+    )
 
 
 # ---------------------------------------------------------------------------
@@ -280,11 +278,7 @@ def _worker_one_rank_masked(
 )
 def test_all_active_mask_matches_no_mask(mpi_pool_executor, local_num_tokens, top_k):
     """An all-ones active_rank_mask must produce identical output to omitting it."""
-    try:
-        MnnvlMemory.initialize()
-        assert MnnvlMemory.supports_mnnvl()
-    except Exception:
-        pytest.skip("MNNVL not supported on this system")
+    _skip_if_mnnvl_unsupported()
 
     ep_size = mpi_pool_executor.num_workers
     if ep_size > torch.cuda.device_count():
@@ -300,7 +294,7 @@ def test_all_active_mask_matches_no_mask(mpi_pool_executor, local_num_tokens, to
     results = list(
         mpi_pool_executor.map(
             _worker_all_active_matches_no_mask,
-            *zip(*[args] * ep_size),
+            *zip(*[args] * ep_size, strict=True),
         )
     )
 
@@ -329,11 +323,7 @@ def test_one_rank_masked_completes(mpi_pool_executor, dead_rank, local_num_token
       * Slots whose expert mapped to a surviving rank are unchanged from what
         the contiguous-partition routing rule predicts.
     """
-    try:
-        MnnvlMemory.initialize()
-        assert MnnvlMemory.supports_mnnvl()
-    except Exception:
-        pytest.skip("MNNVL not supported on this system")
+    _skip_if_mnnvl_unsupported()
 
     ep_size = mpi_pool_executor.num_workers
     if ep_size > torch.cuda.device_count():
@@ -358,7 +348,7 @@ def test_one_rank_masked_completes(mpi_pool_executor, dead_rank, local_num_token
     results = list(
         mpi_pool_executor.map(
             _worker_one_rank_masked,
-            *zip(*[args] * ep_size),
+            *zip(*[args] * ep_size, strict=True),
         )
     )