2424
2525import torch
2626
27+ from tensorrt_llm ._utils import local_mpi_size
2728from tensorrt_llm .models .modeling_utils import QuantAlgo
2829
2930from .backend import MoeBackendType , get_backend_class
3334_FUSED_COMM_BACKENDS = frozenset ({"MEGAMOE_DEEPGEMM" })
3435
3536
def _is_deepep_feasible(num_ranks: int) -> bool:
    """Return True if DeepEP supports the given EP rank count on this node topology.

    Two layouts are accepted:

    * Intranode: ``num_ranks`` is one of {2, 4, 8} and equals ``local_mpi_size()``.
    * Internode: exactly 8 ranks per node, with 2/4/8/16 RDMA nodes, i.e.
      ``num_ranks`` is an exact multiple of the local MPI size and the
      quotient is one of {2, 4, 8, 16}.

    Modelled on the feasibility check referenced from
    fused_moe_wide_ep.py::select_alltoall_method_type. This version also
    rejects rank counts that do not fill a whole number of nodes: floor
    division alone would treat e.g. 20 ranks on 8-rank nodes as "2 nodes".

    Args:
        num_ranks: Expert-parallel rank count to validate.

    Returns:
        True when ``num_ranks`` matches one of the layouts above, else False.
    """
    intranode_ranks = {2, 4, 8}
    required_local_size = 8
    internode_rdma_nodes = {2, 4, 8, 16}

    mpi_size = local_mpi_size()

    # Intranode: every EP rank lives on this node.
    if num_ranks == mpi_size and num_ranks in intranode_ranks:
        return True

    # Internode layouts need fully populated 8-rank nodes.
    if mpi_size != required_local_size:
        return False

    # mpi_size is 8 here, so the division is safe. A non-zero remainder
    # means the last node would be only partially populated.
    num_nodes, leftover = divmod(num_ranks, mpi_size)
    if leftover != 0:
        return False
    return num_nodes in internode_rdma_nodes
53+
54+
3655def _check_backend_can_implement (
3756 backend_str : str ,
3857 quant_algo : Optional [QuantAlgo ],
@@ -151,6 +170,61 @@ def is_candidate_valid(
151170 "use TEP/DEP only with other backends"
152171 )
153172
173+ # MegaMoEDeepGemm is EP-only (asserts moe_tp_size == 1 in __init__); DTP/TTP are invalid.
174+ if config .backend .upper () == "MEGAMOE_DEEPGEMM" and moe_tp > 1 :
175+ return False , (
176+ f"MEGAMOE_DEEPGEMM does not support MoE-TP (moe_tp_size={ moe_tp } ); "
177+ "use DEP/TEP modes only"
178+ )
179+
180+ # DENSEGEMM DTP: FC2 kernel requires (intermediate_size / moe_tp_size) % 256 == 0.
181+ # DENSEGEMM __init__ only checks the full intermediate_size, so a model like
182+ # DeepSeek V3 (intermediate_size=2048, 2048%256=0) passes __init__ but fails
183+ # at runtime with moe_tp_size=16 (2048/16=128, 128%256!=0).
184+ if config .backend .upper () == "DENSEGEMM" and moe_ep == 1 and moe_tp > 1 :
185+ if model .intermediate_size % moe_tp != 0 :
186+ return False , (
187+ f"DENSEGEMM DTP: intermediate_size={ model .intermediate_size } "
188+ f"not divisible by moe_tp_size={ moe_tp } "
189+ )
190+ per_tp_k = model .intermediate_size // moe_tp
191+ _DENSEGEMM_MMA_TILE_K = 256
192+ if per_tp_k % _DENSEGEMM_MMA_TILE_K != 0 :
193+ return False , (
194+ f"DENSEGEMM DTP moe_tp_size={ moe_tp } : intermediate_size/tp={ per_tp_k } "
195+ f"not aligned to FC2 MMA tile-K={ _DENSEGEMM_MMA_TILE_K } "
196+ )
197+
198+ # NVFP4 on CuteDSL / TRTLLM-Gen requires the per-partition intermediate size
199+ # (intermediate_size / moe_tp_size) to be a multiple of the NVFP4 weight
200+ # alignment (128). Unlike CUTLASS (which pads intermediate_size_per_partition
201+ # up to 128), these backends use the unpadded logical size when laying out the
202+ # block-scale tensor and fail during weight load: CUTEDSL raises a reshape
203+ # RuntimeError (e.g. "shape '[-1, 192, 448]' is invalid for input of size
204+ # 114688" — 192 padded to 256) and TRTLLM-Gen hits `assert intermediate_size %
205+ # weight_alignment == 0`. Prune the unsupported combo with a clear reason
206+ # instead of letting it crash mid-sweep. Example: DeepSeek-V4-Pro
207+ # (intermediate_size=3072) at moe_tp_size=32 -> 3072/32=96, 96%128!=0.
208+ if (
209+ config .backend .upper () in ("CUTEDSL" , "TRTLLM" )
210+ and model .quant_algo_enum == QuantAlgo .NVFP4
211+ and moe_tp > 1
212+ ):
213+ _NVFP4_WEIGHT_ALIGNMENT = 128
214+ if model .intermediate_size % moe_tp != 0 :
215+ return False , (
216+ f"{ config .backend .upper ()} NVFP4: intermediate_size="
217+ f"{ model .intermediate_size } not divisible by moe_tp_size={ moe_tp } "
218+ )
219+ per_tp_k = model .intermediate_size // moe_tp
220+ if per_tp_k % _NVFP4_WEIGHT_ALIGNMENT != 0 :
221+ return False , (
222+ f"{ config .backend .upper ()} NVFP4 moe_tp_size={ moe_tp } : "
223+ f"intermediate_size/tp={ per_tp_k } not aligned to NVFP4 weight "
224+ f"alignment={ _NVFP4_WEIGHT_ALIGNMENT } (CUTLASS pads to 128, "
225+ f"CUTEDSL/TRTLLM do not)"
226+ )
227+
154228 # Forced communication on non-DP / MoE-TP paths.
155229 forced = config .comm_method .upper ()
156230 if forced not in ("AUTO" , "NONE" ):
@@ -160,6 +234,12 @@ def is_candidate_valid(
160234 return False , f"comm_method={ forced } requires moe_tp_size=1 (got { moe_tp } )"
161235 if world_size == 1 :
162236 return False , f"comm_method={ forced } has no effect at world_size=1"
237+ if forced == "DEEPEP" and not _is_deepep_feasible (moe_ep ):
238+ return False , (
239+ f"comm_method={ forced } : moe_ep_size={ moe_ep } not supported by DeepEP topology "
240+ f"(local_mpi_size={ local_mpi_size ()} ; supported: intranode {{2,4,8}}, "
241+ f"internode 8-ranks/node x {{2,4,8,16}} nodes)"
242+ )
163243
164244 return True , None
165245
0 commit comments