Skip to content

Commit 5f391a6

Browse files
committed
fix: address EPLB mask review feedback
Signed-off-by: Chien-Chun Hung <2679986+chienchunhung@users.noreply.github.com>
1 parent a3ac34c commit 5f391a6

3 files changed

Lines changed: 118 additions & 9 deletions

File tree

cpp/tensorrt_llm/runtime/moeLoadBalancer/moeLoadBalancer.cpp

Lines changed: 14 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -59,13 +59,16 @@ namespace
5959

6060
bool isRankMasked(std::vector<uint8_t> const* deadRankMask, int rank)
6161
{
62-
return deadRankMask != nullptr && rank >= 0 && rank < static_cast<int>(deadRankMask->size())
63-
&& ((*deadRankMask)[rank] != 0);
62+
return deadRankMask != nullptr && ((*deadRankMask)[rank] != 0);
6463
}
6564

6665
int getActiveSlotCount(
6766
tensorrt_llm::kernels::MoeLoadBalanceMetaInfo const& metaInfo, std::vector<uint8_t> const* deadRankMask)
6867
{
68+
TLLM_CHECK_WITH_INFO(deadRankMask == nullptr || static_cast<int>(deadRankMask->size()) == metaInfo.epSize,
69+
"deadRankMask size (%ld) must match epSize (%d)",
70+
deadRankMask == nullptr ? 0L : static_cast<long>(deadRankMask->size()), metaInfo.epSize);
71+
6972
int activeRankCount = 0;
7073
for (int rank = 0; rank < metaInfo.epSize; ++rank)
7174
{
@@ -163,7 +166,8 @@ void doPlacement(tensorrt_llm::kernels::MoeLoadBalanceMetaInfo metaInfo, float*
163166

164167
for (int expertId = 0; expertId < metaInfo.expertCount; ++expertId)
165168
{
166-
assert(replicaCount[expertId] > 0); // Ensure replica count is positive
169+
TLLM_CHECK_WITH_INFO(replicaCount[expertId] > 0, "Replica count (%d) for expert %d must be positive",
170+
replicaCount[expertId], expertId);
167171
double slotSize = expertLoadFactor[expertId] / static_cast<double>(replicaCount[expertId]);
168172
for (int replicaId = 0; replicaId < replicaCount[expertId]; ++replicaId)
169173
{
@@ -172,7 +176,9 @@ void doPlacement(tensorrt_llm::kernels::MoeLoadBalanceMetaInfo metaInfo, float*
172176
}
173177
}
174178

175-
assert(static_cast<int>(allReplicas.size()) == totalSlotCount);
179+
TLLM_CHECK_WITH_INFO(static_cast<int>(allReplicas.size()) == totalSlotCount,
180+
"Replica count sum (%ld) must match active slot count (%d)", static_cast<long>(allReplicas.size()),
181+
totalSlotCount);
176182

177183
// 2. Sort replicas by slotSize descending
178184
std::sort(allReplicas.begin(), allReplicas.end());
@@ -1124,14 +1130,14 @@ void MoeLoadBalancer::reconfigureMaskOnly(std::vector<int> const& deadRanks)
11241130
{
11251131
layer->validateMaskOnly(candidateDeadRankMask);
11261132
}
1127-
{
1128-
std::lock_guard<std::mutex> maskLock(mDeadRankMaskMutex);
1129-
mDeadRankMask = candidateDeadRankMask;
1130-
}
11311133
for (auto& layer : mLayers)
11321134
{
11331135
layer->reconfigureMaskOnly(candidateDeadRankMask);
11341136
}
1137+
{
1138+
std::lock_guard<std::mutex> maskLock(mDeadRankMaskMutex);
1139+
mDeadRankMask = candidateDeadRankMask;
1140+
}
11351141
}
11361142

11371143
void MoeLoadBalancer::workerThread()

cpp/tests/unit_tests/runtime/moeLoadBalancerTest.cpp

Lines changed: 103 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -290,6 +290,109 @@ INSTANTIATE_TEST_SUITE_P(PlacementTests, MoePlacementTest,
290290
return name;
291291
});
292292

293+
TEST(MoeLoadBalancerMaskOnlyTest, DynamicPlacementHonorsDeadRankMask)
294+
{
295+
constexpr int kExpertCount = 4;
296+
constexpr int kTopK = 2;
297+
constexpr int kEpRank = 0;
298+
constexpr int kEpSize = 4;
299+
constexpr int kSlotCountPerRank = 2;
300+
constexpr int kDeadRank = 2;
301+
constexpr int kActiveSlotCount = (kEpSize - 1) * kSlotCountPerRank;
302+
303+
tensorrt_llm::kernels::MoeLoadBalanceMetaInfo metaInfo{kExpertCount, kTopK, kEpRank, kEpSize, kSlotCountPerRank};
304+
std::vector<float> expertLoadFactor{8.0F, 4.0F, 2.0F, 1.0F};
305+
std::vector<uint8_t> deadRankMask{0, 0, 1, 0};
306+
307+
MoePlacementCpuInfo cpuPlacement;
308+
doReplication(metaInfo, expertLoadFactor.data(), &cpuPlacement, &deadRankMask);
309+
310+
int replicaSum = 0;
311+
for (int replicaCount : cpuPlacement.expertReplicaCount)
312+
{
313+
replicaSum += replicaCount;
314+
}
315+
EXPECT_EQ(replicaSum, kActiveSlotCount);
316+
317+
cpuPlacement.rankExpertIds.resize(kEpSize);
318+
for (int rank = 0; rank < kEpSize; ++rank)
319+
{
320+
cpuPlacement.rankExpertIds[rank].resize(kSlotCountPerRank, 99);
321+
}
322+
323+
doPlacement(metaInfo, expertLoadFactor.data(), &cpuPlacement, &deadRankMask);
324+
325+
std::vector<int> placedReplicas(kExpertCount, 0);
326+
int assignedSlotCount = 0;
327+
for (int rank = 0; rank < kEpSize; ++rank)
328+
{
329+
for (int slot = 0; slot < kSlotCountPerRank; ++slot)
330+
{
331+
int const expertId = cpuPlacement.rankExpertIds[rank][slot];
332+
if (rank == kDeadRank)
333+
{
334+
EXPECT_EQ(expertId, -1);
335+
continue;
336+
}
337+
338+
EXPECT_GE(expertId, 0);
339+
EXPECT_LT(expertId, kExpertCount);
340+
if (expertId >= 0 && expertId < kExpertCount)
341+
{
342+
++placedReplicas[expertId];
343+
++assignedSlotCount;
344+
}
345+
}
346+
}
347+
348+
EXPECT_EQ(assignedSlotCount, kActiveSlotCount);
349+
for (int expertId = 0; expertId < kExpertCount; ++expertId)
350+
{
351+
EXPECT_EQ(placedReplicas[expertId], cpuPlacement.expertReplicaCount[expertId]);
352+
}
353+
}
354+
355+
TEST(MoeLoadBalancerMaskOnlyTest, DynamicPlacementRejectsMismatchedDeadRankMask)
356+
{
357+
constexpr int kExpertCount = 4;
358+
constexpr int kTopK = 2;
359+
constexpr int kEpRank = 0;
360+
constexpr int kEpSize = 4;
361+
constexpr int kSlotCountPerRank = 2;
362+
363+
tensorrt_llm::kernels::MoeLoadBalanceMetaInfo metaInfo{kExpertCount, kTopK, kEpRank, kEpSize, kSlotCountPerRank};
364+
std::vector<float> expertLoadFactor{1.0F, 1.0F, 1.0F, 1.0F};
365+
std::vector<uint8_t> deadRankMask{0, 1};
366+
367+
MoePlacementCpuInfo cpuPlacement;
368+
EXPECT_THROW(doReplication(metaInfo, expertLoadFactor.data(), &cpuPlacement, &deadRankMask),
369+
tensorrt_llm::common::TllmException);
370+
}
371+
372+
TEST(MoeLoadBalancerMaskOnlyTest, DynamicPlacementRejectsReplicaCountMismatch)
373+
{
374+
constexpr int kExpertCount = 4;
375+
constexpr int kTopK = 2;
376+
constexpr int kEpRank = 0;
377+
constexpr int kEpSize = 4;
378+
constexpr int kSlotCountPerRank = 2;
379+
380+
tensorrt_llm::kernels::MoeLoadBalanceMetaInfo metaInfo{kExpertCount, kTopK, kEpRank, kEpSize, kSlotCountPerRank};
381+
std::vector<float> expertLoadFactor{1.0F, 1.0F, 1.0F, 1.0F};
382+
std::vector<uint8_t> deadRankMask{0, 0, 1, 0};
383+
384+
MoePlacementCpuInfo cpuPlacement;
385+
cpuPlacement.expertReplicaCount = {1, 1, 1, 1};
386+
cpuPlacement.rankExpertIds.resize(kEpSize);
387+
for (int rank = 0; rank < kEpSize; ++rank)
388+
{
389+
cpuPlacement.rankExpertIds[rank].resize(kSlotCountPerRank, -1);
390+
}
391+
392+
EXPECT_THROW(doPlacement(metaInfo, expertLoadFactor.data(), &cpuPlacement, &deadRankMask),
393+
tensorrt_llm::common::TllmException);
394+
}
395+
293396
TEST(MoeLoadBalancerMaskOnlyTest, ReconfigureMaskOnlyRemovesDeadRankSlots)
294397
{
295398
setenv("TLLM_HOST_ACCESSIBLE_ALLOW_MANAGED_FALLBACK", "1", 1);

tensorrt_llm/_torch/modules/fused_moe/moe_load_balancer.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -989,7 +989,7 @@ def set_iter_info(self, enable_statistic: Optional[bool],
989989
if enable_update_weights is not None:
990990
self.enable_update_weights = enable_update_weights
991991

992-
def reconfigure_mask_only(self, dead_ranks: List[int]):
992+
def reconfigure_mask_only(self, dead_ranks: list[int]) -> None:
993993
"""
994994
Reconfigure EPLB routing so slots on dead EP ranks are unreachable.
995995

0 commit comments

Comments
 (0)