NVIDIA
diff --git a/‎cpp/tensorrt_llm/nanobind/runtime/moeBindings.cpp‎
Lines changed: 4 additions & 1 deletion b/‎cpp/tensorrt_llm/nanobind/runtime/moeBindings.cpp‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎cpp/tensorrt_llm/runtime/moeLoadBalancer/moeLoadBalancer.cpp‎
Lines changed: 171 additions & 10 deletions b/‎cpp/tensorrt_llm/runtime/moeLoadBalancer/moeLoadBalancer.cpp‎
Lines changed: 171 additions & 10 deletions
diff --git a/‎cpp/tensorrt_llm/runtime/moeLoadBalancer/moeLoadBalancer.h‎
Lines changed: 16 additions & 3 deletions b/‎cpp/tensorrt_llm/runtime/moeLoadBalancer/moeLoadBalancer.h‎
Lines changed: 16 additions & 3 deletions
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -117,6 +117,9 @@ void initMoeBindings(nb::module_& m)
             nb::call_guard<nb::gil_scoped_release>())
         .def("end_iter", &tr::MoeLoadBalancer::endIter, nb::arg("iter_id"), "End the iteration with the given ID",
             nb::call_guard<nb::gil_scoped_release>())
+        .def("reconfigure_mask_only", &tr::MoeLoadBalancer::reconfigureMaskOnly, nb::arg("dead_ranks"),
+            "Reconfigure EPLB routing metadata so slots on dead EP ranks are unreachable",
+            nb::call_guard<nb::gil_scoped_release>())
         .def("shutdown", &tr::MoeLoadBalancer::shutdown, "Shutdown the load balancer and clean up resources",
             nb::call_guard<nb::gil_scoped_release>());
 
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -54,11 +54,39 @@ struct ReplicaInfo
     }
 };
 
+namespace
+{
+
+bool isRankMasked(std::vector<uint8_t> const* deadRankMask, int rank) // True only if a mask is supplied, rank indexes into it, and that entry is non-zero.
+{ // A null mask or a rank outside [0, mask size) is reported as not masked.
+    return deadRankMask != nullptr && rank >= 0 && rank < static_cast<int>(deadRankMask->size())
+        && ((*deadRankMask)[rank] != 0);
+}
+
+int getActiveSlotCount( // Number of slots that live on ranks isRankMasked() does not report as masked.
+    tensorrt_llm::kernels::MoeLoadBalanceMetaInfo const& metaInfo, std::vector<uint8_t> const* deadRankMask)
+{ // With a null mask every rank is counted, so the result is epSize * slotCountPerRank.
+    int activeRankCount = 0;
+    for (int rank = 0; rank < metaInfo.epSize; ++rank)
+    {
+        if (!isRankMasked(deadRankMask, rank))
+        {
+            ++activeRankCount;
+        }
+    }
+    return activeRankCount * metaInfo.slotCountPerRank; // each counted rank contributes slotCountPerRank slots
+}
+
+} // namespace
+
 void doReplication(tensorrt_llm::kernels::MoeLoadBalanceMetaInfo metaInfo, float* const expertLoadFactor,
-    MoePlacementCpuInfo* cpuPlacement)
+    MoePlacementCpuInfo* cpuPlacement, std::vector<uint8_t> const* deadRankMask)
 {
     cpuPlacement->expertReplicaCount.resize(metaInfo.expertCount);
-    int totalSlotCount = metaInfo.epSize * metaInfo.slotCountPerRank;
+    int totalSlotCount = getActiveSlotCount(metaInfo, deadRankMask);
+    TLLM_CHECK_WITH_INFO(totalSlotCount >= metaInfo.expertCount,
+        "Mask-only EPLB reconfigure would leave fewer active slots (%d) than experts (%d)", totalSlotCount,
+        metaInfo.expertCount);
     // --- Edge Case 1: No replication needed ---
     if (totalSlotCount == metaInfo.expertCount)
     {
@@ -122,13 +150,13 @@ void doReplication(tensorrt_llm::kernels::MoeLoadBalanceMetaInfo metaInfo, float
 }
 
 void doPlacement(tensorrt_llm::kernels::MoeLoadBalanceMetaInfo metaInfo, float* const expertLoadFactor,
-    MoePlacementCpuInfo* cpuPlacement)
+    MoePlacementCpuInfo* cpuPlacement, std::vector<uint8_t> const* deadRankMask)
 {
     // This function only update these two vectors
     auto& rankExpertIds = cpuPlacement->rankExpertIds;
     auto& replicaCount = cpuPlacement->expertReplicaCount;
 
-    int totalSlotCount = metaInfo.epSize * metaInfo.slotCountPerRank;
+    int totalSlotCount = getActiveSlotCount(metaInfo, deadRankMask);
     // 1. Create all replica information
     std::vector<ReplicaInfo> allReplicas;
     allReplicas.reserve(totalSlotCount);
@@ -152,6 +180,10 @@ void doPlacement(tensorrt_llm::kernels::MoeLoadBalanceMetaInfo metaInfo, float*
     // 3. Maintain Rank state and initialize Priority Queue
     std::vector<double> currentRankLoad(metaInfo.epSize, 0.0);
     std::vector<int> currentRankSlots(metaInfo.epSize, 0); // Tracks the count of assigned slots per rank
+    for (int rank = 0; rank < metaInfo.epSize; ++rank)
+    {
+        std::fill_n(rankExpertIds[rank].begin(), metaInfo.slotCountPerRank, -1);
+    }
 
     // Define a min-priority queue storing pairs of {load, rank_id}
     using RankLoadPair = std::pair<double, int>;
@@ -160,8 +192,12 @@ void doPlacement(tensorrt_llm::kernels::MoeLoadBalanceMetaInfo metaInfo, float*
     // Initialize the priority queue with all ranks having 0 load
     for (int rank = 0; rank < metaInfo.epSize; ++rank)
     {
-        pq.push({0.0, rank});
+        if (!isRankMasked(deadRankMask, rank))
+        {
+            pq.push({0.0, rank});
+        }
     }
+    TLLM_CHECK_WITH_INFO(!pq.empty(), "Mask-only EPLB reconfigure requires at least one active rank");
 
     // 4. Optimized Greedy assignment using Priority Queue, writing directly to rankExpertIds
     for (auto const& replica : allReplicas)
@@ -252,19 +288,36 @@ void prepareGpuPlacementInfo(tensorrt_llm::kernels::MoeLoadBalanceMetaInfo metaI
     // globalSlotIds[i][j] is the list of global slot ids for expert i's j-th replica
     // different experts have different number of replicas, so globalSlotIds is a vector of vectors
     // the sum of sizes of all vectors in globalSlotIds is equal to the total number of slots
+    int const totalSlotCount = metaInfo.epSize * metaInfo.slotCountPerRank;
+    int const invalidSlotId = totalSlotCount;
+    std::fill_n(cpuPlacement->placementInfoForGPU.globalSlotIds, totalSlotCount, invalidSlotId);
+
     std::vector<std::vector<int>> globalSlotIds(metaInfo.expertCount);
+    int assignedSlotCount = 0;
     for (int rank = 0; rank < metaInfo.epSize; ++rank)
     {
         for (int slotId = 0; slotId < metaInfo.slotCountPerRank; ++slotId)
         {
             int expertId = cpuPlacement->rankExpertIds[rank][slotId];
+            if (expertId < 0)
+            {
+                continue;
+            }
+            TLLM_CHECK_WITH_INFO(expertId < metaInfo.expertCount, "expertId=%d should be in range [0, %d)", expertId,
+                metaInfo.expertCount);
             int replicaId = globalSlotIds[expertId].size();
+            TLLM_CHECK_WITH_INFO(replicaId < cpuPlacement->placementInfoForGPU.expertReplicaCount[expertId],
+                "Expert %d has more placed replicas than its replica count (%d)", expertId,
+                cpuPlacement->placementInfoForGPU.expertReplicaCount[expertId]);
             int globalSlotId = rank * metaInfo.slotCountPerRank + slotId;
             globalSlotIds[expertId].push_back(globalSlotId);
             int offset = cpuPlacement->placementInfoForGPU.expertReplicaStartOffset[expertId] + replicaId;
             cpuPlacement->placementInfoForGPU.globalSlotIds[offset] = globalSlotId;
+            ++assignedSlotCount;
         }
     }
+    TLLM_CHECK_WITH_INFO(assignedSlotCount == startOffset,
+        "Placed slot count (%d) must match active replica count (%d)", assignedSlotCount, startOffset);
     // printCpuPlacementInfo(metaInfo, cpuPlacement);
 }
 
@@ -515,6 +568,61 @@ void SingleLayerMoeLoadBalancer::waitLastUpdateDone()
     }
 }
 
+std::vector<int> SingleLayerMoeLoadBalancer::computeMaskOnlyReplicaCounts( // Per-expert count of replicas in oldRankExpertIds that sit on ranks not flagged in deadRankMask.
+    std::vector<uint8_t> const& deadRankMask) const
+{ // Fails a TLLM_CHECK if the mask size differs from epSize, a stored expert id is invalid, or any expert ends with zero replicas.
+    TLLM_CHECK_WITH_INFO(static_cast<int>(deadRankMask.size()) == mMetaInfo.epSize,
+        "deadRankMask size (%ld) must match epSize (%d)", static_cast<long>(deadRankMask.size()), mMetaInfo.epSize);
+
+    std::vector<int> expertReplicaCount(mMetaInfo.expertCount, 0);
+    for (int rank = 0; rank < mMetaInfo.epSize; ++rank)
+    {
+        bool const rankDead = deadRankMask[rank] != 0; // a non-zero entry marks the rank as dead
+        for (int localSlotId = 0; localSlotId < mMetaInfo.slotCountPerRank; ++localSlotId)
+        {
+            int expertId = mCpuPlacementInfo.oldRankExpertIds[rank][localSlotId];
+            TLLM_CHECK_WITH_INFO(expertId == -1 || (expertId >= 0 && expertId < mMetaInfo.expertCount), // checked for slots on dead ranks too; -1 is accepted
+                "expertId=%d should be -1 or in range [0, %d)", expertId, mMetaInfo.expertCount);
+            if (!rankDead && expertId >= 0) // count only slots on live ranks that hold an expert
+            {
+                ++expertReplicaCount[expertId];
+            }
+        }
+    }
+
+    for (int expertId = 0; expertId < mMetaInfo.expertCount; ++expertId)
+    {
+        TLLM_CHECK_WITH_INFO(expertReplicaCount[expertId] > 0, // every expert must keep at least one replica on a live rank
+            "Mask-only EPLB reconfigure would leave expert %d with no surviving replica", expertId);
+    }
+    return expertReplicaCount;
+}
+
+void SingleLayerMoeLoadBalancer::validateMaskOnly(std::vector<uint8_t> const& deadRankMask) const // Runs the checks in computeMaskOnlyReplicaCounts for this layer; const, so no members are assigned.
+{
+    (void) computeMaskOnlyReplicaCounts(deadRankMask); // counts are discarded; only the TLLM_CHECKs inside matter here
+}
+
+void SingleLayerMoeLoadBalancer::reconfigureMaskOnly(std::vector<uint8_t> const& deadRankMask) // Rebuilds rankExpertIds from oldRankExpertIds with every slot on a dead rank set to -1, then copies the placement to the GPU.
+{
+    mCpuPlacementInfo.expertReplicaCount = computeMaskOnlyReplicaCounts(deadRankMask); // also performs the mask-size, expert-id and surviving-replica checks
+
+    for (int rank = 0; rank < mMetaInfo.epSize; ++rank)
+    {
+        bool const rankDead = deadRankMask[rank] != 0;
+        for (int localSlotId = 0; localSlotId < mMetaInfo.slotCountPerRank; ++localSlotId)
+        {
+            int expertId = mCpuPlacementInfo.oldRankExpertIds[rank][localSlotId]; // NOTE(review): oldRankExpertIds is read but not written in this function - confirm it is refreshed elsewhere if later updates diff against it
+            mCpuPlacementInfo.rankExpertIds[rank][localSlotId] = rankDead ? -1 : expertId; // live ranks keep the expert recorded in oldRankExpertIds
+        }
+    }
+
+    prepareGpuPlacementInfo(mMetaInfo, &mCpuPlacementInfo);
+    copyPlacementInfoToGpu();
+    TLLM_CUDA_CHECK(cudaEventRecord(mUpdateWeightsDoneEvent, mMoeLoadBalancer->mStream)); // record on the owning balancer's stream
+    TLLM_CUDA_CHECK(cudaEventSynchronize(mUpdateWeightsDoneEvent)); // block the host until that event has completed
+}
+
 cudaStream_t SingleLayerMoeLoadBalancer::getStream() const
 {
     return mMoeLoadBalancer->mStream;
@@ -561,8 +669,9 @@ void SingleLayerMoeLoadBalancer::copyPlacementInfoToGpuByCpu()
 
 void SingleLayerMoeLoadBalancer::updateWeightsRoutine()
 {
-    doReplication(mMetaInfo, mStatisticInfo.expertLoadFactor, &mCpuPlacementInfo);
-    doPlacement(mMetaInfo, mStatisticInfo.expertLoadFactor, &mCpuPlacementInfo);
+    auto const deadRankMask = mMoeLoadBalancer->getDeadRankMaskSnapshot();
+    doReplication(mMetaInfo, mStatisticInfo.expertLoadFactor, &mCpuPlacementInfo, &deadRankMask);
+    doPlacement(mMetaInfo, mStatisticInfo.expertLoadFactor, &mCpuPlacementInfo, &deadRankMask);
     prepareGpuPlacementInfo(mMetaInfo, &mCpuPlacementInfo);
     mWeightUpdater->updateWeights(&mCpuPlacementInfo);
     copyPlacementInfoToGpu();
@@ -575,8 +684,9 @@ void SingleLayerMoeLoadBalancer::updateWeightsRoutine()
 
 void SingleLayerMoeLoadBalancer::updateWeightsRoutineByCpu()
 {
-    doReplication(mMetaInfo, mStatisticInfo.expertLoadFactor, &mCpuPlacementInfo);
-    doPlacement(mMetaInfo, mStatisticInfo.expertLoadFactor, &mCpuPlacementInfo);
+    auto const deadRankMask = mMoeLoadBalancer->getDeadRankMaskSnapshot();
+    doReplication(mMetaInfo, mStatisticInfo.expertLoadFactor, &mCpuPlacementInfo, &deadRankMask);
+    doPlacement(mMetaInfo, mStatisticInfo.expertLoadFactor, &mCpuPlacementInfo, &deadRankMask);
     prepareGpuPlacementInfo(mMetaInfo, &mCpuPlacementInfo);
     mLastUpdateTaskId = mMoeLoadBalancer->addCopyTask(
         [this](int rank, int size) { mWeightUpdater->updateWeights(&mCpuPlacementInfo, rank, size); });
@@ -832,6 +942,7 @@ void HostMemoryMoeWeightUpdater::updateWeights(
 MoeLoadBalancer::MoeLoadBalancer(int epRank, int epSize, int layerUpdatesPerIter)
     : mEpRank{epRank}
     , mEpSize{epSize}
+    , mDeadRankMask(epSize, 0)
     , mLayerUpdatesPerIter{layerUpdatesPerIter}
 {
     TLLM_CUDA_CHECK(cudaGetDevice(&mCudaDeviceId));
@@ -947,9 +1058,11 @@ void MoeLoadBalancer::startIter(int64_t iterId, bool enableStatistic, bool enabl
 {
     std::unique_lock<std::mutex> lock(mWorkerThreadMutex);
     TLLM_CHECK_WITH_INFO(mModelFinalized == true, "Model is not finalized, cannot start iteration.");
+    TLLM_CHECK_WITH_INFO(!mIterActive, "Cannot start iteration while another iteration is active.");
     TLLM_CHECK_WITH_INFO(mIterId + 1 == iterId, "Expected iterId=%ld, but got %ld", mIterId + 1, iterId);
 
     mIterId = iterId;
+    mIterActive = true;
     // disable update for warm up iters.
     bool isWarmUpIter = mIterId <= mWarmUpUntilIter;
     bool fixedUpdateWeightsEnabled = enableUpdateWeights && !isWarmUpIter;
@@ -961,7 +1074,10 @@ void MoeLoadBalancer::startIter(int64_t iterId, bool enableStatistic, bool enabl
 
 void MoeLoadBalancer::endIter(int64_t iterId)
 {
+    std::unique_lock<std::mutex> lock(mWorkerThreadMutex); // same mutex that startIter and reconfigureMaskOnly lock before touching mIterActive
+    TLLM_CHECK_WITH_INFO(mIterActive, "No active iteration to end."); // rejects endIter without a preceding startIter
     TLLM_CHECK_WITH_INFO(mIterId == iterId, "endIter expected iterId=%ld, but got %ld", mIterId, iterId);
+    mIterActive = false; // iteration closed; startIter and reconfigureMaskOnly both require this flag to be false
 }
 
 void MoeLoadBalancer::shutdown()
@@ -979,6 +1095,45 @@ void MoeLoadBalancer::shutdown()
     }
 }
 
+void MoeLoadBalancer::reconfigureMaskOnly(std::vector<int> const& deadRanks) // Marks the given EP ranks dead (added to those already recorded) and reconfigures every layer to avoid them.
+{
+    std::unique_lock<std::mutex> workerLock(mWorkerThreadMutex); // function-scope lock, never released early: held for the whole call
+    TLLM_CHECK_WITH_INFO(mModelFinalized == true, "Model is not finalized, cannot reconfigure mask-only placement.");
+    TLLM_CHECK_WITH_INFO(!mIterActive, "Cannot reconfigure mask-only placement while an iteration is active.");
+    TLLM_CHECK_WITH_INFO(mIterInfoQueue.empty(), "Cannot reconfigure mask-only placement while iterations are queued.");
+
+    std::vector<uint8_t> candidateDeadRankMask; // built as a separate copy so mDeadRankMask stays untouched if a check below fails
+    {
+        std::lock_guard<std::mutex> maskLock(mDeadRankMaskMutex);
+        candidateDeadRankMask = mDeadRankMask; // start from the current mask: dead ranks accumulate across calls
+        for (int deadRank : deadRanks)
+        {
+            TLLM_CHECK_WITH_INFO(
+                deadRank >= 0 && deadRank < mEpSize, "deadRank=%d should be in range [0, %d)", deadRank, mEpSize);
+            candidateDeadRankMask[deadRank] = 1;
+        }
+        TLLM_CHECK_WITH_INFO( // the rank running this code must not be in the dead set
+            candidateDeadRankMask[mEpRank] == 0, "Local epRank (%d) cannot be masked by a survivor", mEpRank);
+    }
+
+    for (auto& layer : mLayers)
+    {
+        layer->waitLastUpdateDone(); // NOTE(review): called with mWorkerThreadMutex held - confirm the worker never needs that mutex to finish an in-flight update
+    }
+    for (auto& layer : mLayers)
+    {
+        layer->validateMaskOnly(candidateDeadRankMask); // check every layer before any layer or mDeadRankMask is modified
+    }
+    {
+        std::lock_guard<std::mutex> maskLock(mDeadRankMaskMutex);
+        mDeadRankMask = candidateDeadRankMask; // commit the new mask only after all layers validated
+    }
+    for (auto& layer : mLayers)
+    {
+        layer->reconfigureMaskOnly(candidateDeadRankMask); // apply the mask to each layer's placement
+    }
+}
+
 void MoeLoadBalancer::workerThread()
 {
     TLLM_CUDA_CHECK(cudaSetDevice(mCudaDeviceId));
@@ -1064,6 +1219,12 @@ void MoeLoadBalancer::waitCopyTaskDone(int64_t taskId)
     }
 }
 
+std::vector<uint8_t> MoeLoadBalancer::getDeadRankMaskSnapshot() // Returns a copy of mDeadRankMask made while holding mDeadRankMaskMutex.
+{
+    std::lock_guard<std::mutex> lock(mDeadRankMaskMutex);
+    return mDeadRankMask; // returned by value, so the caller works on its own copy
+}
+
 MultiThreadWorker::MultiThreadWorker(int numThreads, int cudaDeviceId)
     : mNumThreads(numThreads)
     , mCudaDeviceId(cudaDeviceId)
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -191,6 +191,9 @@ class SingleLayerMoeLoadBalancer
     void waitCpuStage();
     void maybeStartUpdateWeights();
     void waitLastUpdateDone();
+    void validateMaskOnly(std::vector<uint8_t> const& deadRankMask) const;
+    void reconfigureMaskOnly(std::vector<uint8_t> const& deadRankMask);
+    std::vector<int> computeMaskOnlyReplicaCounts(std::vector<uint8_t> const& deadRankMask) const;
 
     MoeLoadBalancer* mMoeLoadBalancer = nullptr;
 
@@ -283,6 +286,11 @@ class MoeLoadBalancer
     // should bind to python
     void shutdown();
 
+    // should bind to python
+    // This API only validates placement safety, i.e. every expert keeps at least one surviving replica.
+    // The caller must separately gate degraded-mode capacity/HBM headroom before invoking it.
+    void reconfigureMaskOnly(std::vector<int> const& deadRanks);
+
     // Test interface to use GPU to do memcpy test functionality
     void setUseGpuMemcpy(bool useGpuMemcpy = false)
     {
@@ -312,6 +320,7 @@ class MoeLoadBalancer
     void addUpdateTask(std::function<void()> task);
     int64_t addCopyTask(std::function<void(int, int)> task);
     void waitCopyTaskDone(int64_t taskId);
+    std::vector<uint8_t> getDeadRankMaskSnapshot();
 
     std::vector<std::shared_ptr<SingleLayerMoeLoadBalancer>> mLayers;
 
@@ -327,6 +336,7 @@ class MoeLoadBalancer
     std::queue<IterInfo> mIterInfoQueue;
 
     bool mModelFinalized = false;
+    bool mIterActive = false;
 
     int mEpRank = 0;
     int mEpSize = 1;
@@ -339,6 +349,9 @@ class MoeLoadBalancer
 
     std::unique_ptr<MultiThreadWorker> mMultiThreadWorker;
 
+    std::mutex mDeadRankMaskMutex;
+    std::vector<uint8_t> mDeadRankMask;
+
     // update plan member and function
     int mLayerUpdatesPerIter = 1;
     std::deque<std::set<int>> mUpdateLayerQueue;
@@ -349,9 +362,9 @@ class MoeLoadBalancer
 
 // functions exposed for testing
 void doReplication(tensorrt_llm::kernels::MoeLoadBalanceMetaInfo metaInfo, float* const expertLoadFactor,
-    MoePlacementCpuInfo* cpuPlacement);
+    MoePlacementCpuInfo* cpuPlacement, std::vector<uint8_t> const* deadRankMask = nullptr);
 
 void doPlacement(tensorrt_llm::kernels::MoeLoadBalanceMetaInfo metaInfo, float* const expertLoadFactor,
-    MoePlacementCpuInfo* cpuPlacement);
+    MoePlacementCpuInfo* cpuPlacement, std::vector<uint8_t> const* deadRankMask = nullptr);
 
 } // namespace tensorrt_llm::runtime