Skip to content

Commit 0c686b2

Browse files
committed
prune
1 parent bb95d9f commit 0c686b2

15 files changed

Lines changed: 142 additions & 606 deletions

File tree

cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu

Lines changed: 0 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -4124,22 +4124,6 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enab
41244124
TLLM_CHECK_WITH_INFO(gemm1_config_, "MOE GEMM1 Config is not set");
41254125
TLLM_CHECK_WITH_INFO(gemm2_config_, "MOE GEMM2 Config is not set");
41264126

4127-
// DEBUG(moe-lora): dump the compile-time precision flags of the runner that
4128-
// actually ran, so we can see whether NVFP4 reached this runMoe with FP4
4129-
// activations (act_fp4) or via some bf16/fp8 path. Capped to avoid flooding.
4130-
{
4131-
static int _krn_diag = 0;
4132-
if (use_lora && _krn_diag++ < 64)
4133-
{
4134-
printf("[moe-lora-diag][kernel] use_lora=%d act_fp4=%d weight_fp4=%d use_fp4=%d use_fp8=%d "
4135-
"use_w4afp8=%d use_wfp4afp8=%d use_wfp4a16=%d use_block_scaling=%d "
4136-
"use_deepseek_fp8_block_scale=%d\n",
4137-
(int) use_lora, (int) act_fp4, (int) weight_fp4, (int) use_fp4, (int) use_fp8, (int) use_w4afp8,
4138-
(int) use_wfp4afp8, (int) use_wfp4a16, (int) use_block_scaling, (int) use_deepseek_fp8_block_scale);
4139-
fflush(stdout);
4140-
}
4141-
}
4142-
41434127
TLLM_CHECK_WITH_INFO(!use_lora || !act_fp4, "MOE does not support LoRA with FP4 model");
41444128

41454129
if (int_scales_required)

cpp/tensorrt_llm/thop/moeOp.cpp

Lines changed: 15 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -611,40 +611,21 @@ class FusedMoeRunner : public torch::CustomClassHolder
611611
TORCH_CHECK(!enable_alltoall,
612612
"MoE LoRA is not supported with alltoall: the per-token adapter pointer arrays do not survive "
613613
"cross-rank token reshuffling.");
614-
// These checks constrain the dtype of the base MoE inputs (the
615-
// activations and the routed-expert weights), not the LoRA adapters,
616-
// which are always fp16/bf16. FP8 base weights are supported in both
617-
// the per-tensor (qdq) and block-scale forms: the LoRA GEMM runs on
618-
// fp16/bf16 activations (loraFC1/loraFC2 in moe_kernels.cu), either
619-
// dequantizing the per-tensor FP8 activations or reading the
620-
// block-scale path's bf16 activations directly. NVFP4 base weights
621-
// are presented as packed FP4 (c10::Long) with FP4 activations, so
622-
// the dtype checks below are skipped for NVFP4 and the runtime
623-
// FP4-activation handling is enforced in the kernel (moe_kernels.cu).
624-
// Integer quant has no LoRA path.
625-
bool const is_nvfp4 = isNvfp4Quant();
626-
// DEBUG(moe-lora): report the dtypes/quant flags the op actually
627-
// received so we can see what precision NVFP4 surfaces as here.
628-
{
629-
static int _thop_diag = 0;
630-
if (_thop_diag++ < 64)
631-
{
632-
printf("[moe-lora-diag][thop] act_dtype=%s weight_dtype=%s output_dtype=%s isNvfp4=%d isFp8=%d "
633-
"deepseekBlockScale=%d useW4Group=%d useMxfp8Act=%d\n",
634-
torch::toString(mActivationDtype), torch::toString(mWeightDtype),
635-
torch::toString(mOutputDtype), (int) is_nvfp4, (int) isFp8Quant(),
636-
(int) mUseDeepSeekFP8BlockScaling, (int) mUseW4GroupScaling, (int) mUseMxfp8ActScaling);
637-
fflush(stdout);
638-
}
639-
}
640-
TORCH_CHECK(is_nvfp4 || mActivationDtype == c10::ScalarType::Half
641-
|| mActivationDtype == c10::ScalarType::BFloat16
642-
|| mActivationDtype == c10::ScalarType::Float8_e4m3fn,
643-
"MoE LoRA only supports fp16, bf16, per-tensor FP8, or NVFP4 activation dtypes.");
644-
TORCH_CHECK(is_nvfp4 || mWeightDtype == c10::ScalarType::Half
645-
|| mWeightDtype == c10::ScalarType::BFloat16
646-
|| mWeightDtype == c10::ScalarType::Float8_e4m3fn,
647-
"MoE LoRA supports unquantized fp16/bf16, FP8, or NVFP4 base expert weights only "
614+
// Constrain the dtype of the base MoE inputs (activations and
615+
// routed-expert weights), not the LoRA adapters, which are always
616+
// fp16/bf16. Per-tensor FP8 (qdq) is supported because the LoRA GEMM
617+
// runs on fp16/bf16 activations after the kernel dequantizes them.
618+
// isFp8Quant() matches only per-tensor FP8, so FP8 block-scale,
619+
// NVFP4, MXFP8, and integer quant (none of which have a LoRA path)
620+
// stay rejected.
621+
bool const is_per_tensor_fp8 = isFp8Quant();
622+
TORCH_CHECK(mActivationDtype == c10::ScalarType::Half || mActivationDtype == c10::ScalarType::BFloat16
623+
|| is_per_tensor_fp8,
624+
"MoE LoRA only supports fp16, bf16, or per-tensor FP8 (qdq) base weights. FP8 block-scale, NVFP4, "
625+
"MXFP8, and integer quant are not supported.");
626+
TORCH_CHECK(
627+
mWeightDtype == c10::ScalarType::Half || mWeightDtype == c10::ScalarType::BFloat16 || is_per_tensor_fp8,
628+
"MoE LoRA supports unquantized fp16/bf16 or per-tensor FP8 (qdq) base expert weights only "
648629
"(LoRA adapters are always fp16/bf16).");
649630
// CUDA-graph capture is only safe on the device LoRA path. The
650631
// legacy host path performs a host-side cudaEventSynchronize and
@@ -1322,15 +1303,6 @@ class FusedMoeRunner : public torch::CustomClassHolder
13221303
TORCH_CHECK(mOutputDtype != c10::ScalarType::Float8_e4m3fn,
13231304
"MoE LoRA with FP8 base activations requires an fp16/bf16 output (LoRA compute) dtype.");
13241305
return loraTypeFromActDtype(mOutputDtype);
1325-
#endif
1326-
#ifdef ENABLE_FP4
1327-
// NVFP4 base activations are presented as packed FP4 (uint8/Byte) with
1328-
// separate per-block scales; the LoRA GEMM runs in the fp16/bf16 output
1329-
// (backbone) dtype after the kernel dequantizes the FP4 activations.
1330-
case c10::ScalarType::Byte:
1331-
TORCH_CHECK(mOutputDtype != c10::ScalarType::Float8_e4m3fn,
1332-
"MoE LoRA with NVFP4 base activations requires an fp16/bf16 output (LoRA compute) dtype.");
1333-
return loraTypeFromActDtype(mOutputDtype);
13341306
#endif
13351307
default: C10_THROW_ERROR_FORMATTED(Error, "MoE LoRA only supports fp16/bf16/fp32 activation dtype.");
13361308
}

tensorrt_llm/_torch/models/modeling_deepseekv3.py

Lines changed: 3 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -1102,8 +1102,7 @@ def _get_shared_experts_quant_config(model_config,
11021102
return quant_config
11031103

11041104
def compute_routed_output(self, hidden_states, hidden_states_fp4,
1105-
all_rank_num_tokens, do_finalize,
1106-
lora_params=None):
1105+
all_rank_num_tokens, do_finalize):
11071106
# max-throughput
11081107
use_dp_padding = False
11091108
# Add DP padding on SM120 for context comm performance
@@ -1124,7 +1123,6 @@ def compute_routed_output(self, hidden_states, hidden_states_fp4,
11241123
output_dtype=hidden_states.dtype,
11251124
all_rank_num_tokens=all_rank_num_tokens,
11261125
use_dp_padding=use_dp_padding,
1127-
lora_params=lora_params,
11281126
**({
11291127
"alltoall_result_do_sum": False
11301128
} if isinstance(self.experts, WideEPMoE) else {}),
@@ -1139,24 +1137,10 @@ def forward(
11391137
all_rank_num_tokens: Optional[list[int]] = None,
11401138
final_all_reduce_params: Optional[AllReduceParams] = None,
11411139
do_finalize: Optional[bool] = True,
1142-
lora_params: Optional[dict] = None,
11431140
) -> torch.Tensor:
11441141
if not do_finalize:
11451142
assert not self.use_dp
11461143

1147-
# DEBUG(moe-lora): confirm routed-expert LoRA params reach the DeepSeek
1148-
# MoE module. Report the per-layer module ids present in lora_params
1149-
# (routed-expert ids: moe_h_to_4h=13, moe_4h_to_h=14, moe_gate=15).
1150-
_layer_idx = getattr(self.experts, "layer_idx", None)
1151-
_layer_module_ids = (sorted(lora_params.get(_layer_idx, {}).keys()) if
1152-
(lora_params and _layer_idx is not None) else None)
1153-
print(
1154-
f"[deepseek-moe] layer={_layer_idx} "
1155-
f"experts_type={type(self.experts).__name__} "
1156-
f"lora_params_present={lora_params is not None} "
1157-
f"layer_module_ids={_layer_module_ids}",
1158-
flush=True)
1159-
11601144
def _compute_shared_output():
11611145
shared_input = (hidden_states_fp4 if
11621146
(hidden_states_fp4 is not None
@@ -1171,8 +1155,7 @@ def _compute_routed_output():
11711155
routed_output = self.compute_routed_output(hidden_states,
11721156
hidden_states_fp4,
11731157
all_rank_num_tokens,
1174-
do_finalize,
1175-
lora_params=lora_params)
1158+
do_finalize)
11761159
return routed_output
11771160

11781161
# NOTE: define compiled helpers at module scope to avoid defining decorators inside compiled frames
@@ -1424,15 +1407,12 @@ def forward(
14241407
attn_metadata: AttentionMetadata,
14251408
residual: torch.Tensor,
14261409
spec_metadata: Optional[SpecMetadata] = None,
1427-
lora_params: Optional[dict] = None,
14281410
**kwargs,
14291411
) -> Tuple[torch.Tensor, torch.Tensor]:
14301412
if residual is None:
14311413
residual = hidden_states
14321414
hidden_states = self.input_layernorm(hidden_states)
1433-
# Self Attention. DeepSeek attention is MLA, whose forward does not
1434-
# accept lora_params, so it is intentionally not threaded here; routed-
1435-
# expert MoE LoRA is applied in the MoE path below.
1415+
# Self Attention
14361416
hidden_states = self.self_attn(
14371417
position_ids=position_ids,
14381418
hidden_states=hidden_states,
@@ -1453,7 +1433,6 @@ def forward(
14531433
attn_metadata=attn_metadata,
14541434
residual=residual,
14551435
spec_metadata=spec_metadata,
1456-
lora_params=lora_params,
14571436
)
14581437
else:
14591438
if spec_metadata is not None and spec_metadata.is_layer_capture(
@@ -1472,7 +1451,6 @@ def forward_MoE(
14721451
attn_metadata: AttentionMetadata,
14731452
residual: torch.Tensor,
14741453
spec_metadata: Optional[SpecMetadata] = None,
1475-
lora_params: Optional[dict] = None,
14761454
) -> Tuple[torch.Tensor, torch.Tensor]:
14771455

14781456
def _run_MoE(hidden_states, hidden_states_fp4, do_finalize):
@@ -1484,7 +1462,6 @@ def _run_MoE(hidden_states, hidden_states_fp4, do_finalize):
14841462
enable_allreduce=not (self.fusion_config.POST_MOE_FUSION
14851463
or self.mapping.tp_size == 1)),
14861464
do_finalize=do_finalize,
1487-
lora_params=lora_params,
14881465
)
14891466

14901467
if self.fusion_config.PRE_MOE_FUSION:
@@ -1803,7 +1780,6 @@ def forward(
18031780
position_ids: Optional[torch.IntTensor] = None,
18041781
inputs_embeds: Optional[torch.FloatTensor] = None,
18051782
spec_metadata: Optional[SpecMetadata] = None,
1806-
lora_params: Optional[dict] = None,
18071783
**kwargs,
18081784
) -> torch.Tensor:
18091785
if (input_ids is None) ^ (inputs_embeds is not None):
@@ -1825,7 +1801,6 @@ def forward(
18251801
attn_metadata=attn_metadata,
18261802
residual=residual,
18271803
spec_metadata=spec_metadata,
1828-
lora_params=lora_params,
18291804
)
18301805

18311806
hidden_states = maybe_allgather_for_helix_cp(hidden_states,

tensorrt_llm/_torch/models/modeling_mixtral.py

Lines changed: 0 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -63,22 +63,6 @@ def forward(
6363
attn_metadata: AttentionMetadata,
6464
lora_params: Optional[dict] = None,
6565
) -> torch.Tensor:
66-
# DEBUG(moe-lora): confirm routed-expert LoRA params reach the Mixtral
67-
# MoE module. Without this plumbing the experts run without LoRA and the
68-
# adapter output is identical to the base model. `self.experts` may be a
69-
# ConfigurableMoE wrapper whose CUTLASS impl (with _moe_lora_active)
70-
# lives on `.backend`, so resolve through it before giving up.
71-
_impl = self.experts
72-
if not hasattr(_impl, "_moe_lora_active"):
73-
_impl = getattr(self.experts, "backend", None)
74-
_active = _impl._moe_lora_active(lora_params) if hasattr(
75-
_impl, "_moe_lora_active") else None
76-
print(
77-
f"[mixtral-moe] layer={getattr(self.experts, 'layer_idx', None)} "
78-
f"experts_type={type(self.experts).__name__} "
79-
f"lora_params_present={lora_params is not None} "
80-
f"moe_lora_active={_active}",
81-
flush=True)
8266
all_rank_num_tokens = attn_metadata.all_rank_num_tokens
8367
router_logits = self.gate(hidden_states)
8468
final_hidden_states = self.experts(

tensorrt_llm/_torch/models/modeling_qwen3_moe.py

Lines changed: 0 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -134,7 +134,6 @@ def forward(
134134
attn_metadata: AttentionMetadata,
135135
all_reduce_params: Optional[AllReduceParams] = None,
136136
do_finalize: Optional[bool] = True,
137-
lora_params: Optional[dict] = None,
138137
) -> torch.Tensor:
139138
assert hidden_states.shape[-1] == self.hidden_dim
140139
orig_shape = hidden_states.shape
@@ -145,30 +144,13 @@ def forward(
145144
if not do_finalize:
146145
assert not self.enable_attention_dp
147146

148-
# DEBUG(moe-lora): confirm routed-expert LoRA params reach the Qwen3 MoE
149-
# module. Report the actual per-layer module ids present in lora_params
150-
# (ground truth) rather than _moe_lora_active, which keys on the backend
151-
# instance's layer_idx and can be misleading through the ConfigurableMoE
152-
# wrapper. The routed-expert ids are moe_h_to_4h=13, moe_4h_to_h=14,
153-
# moe_gate=15.
154-
_layer_idx = getattr(self.experts, "layer_idx", None)
155-
_layer_module_ids = (sorted(lora_params.get(_layer_idx, {}).keys()) if
156-
(lora_params and _layer_idx is not None) else None)
157-
print(
158-
f"[qwen3-moe] layer={_layer_idx} "
159-
f"experts_type={type(self.experts).__name__} "
160-
f"lora_params_present={lora_params is not None} "
161-
f"layer_module_ids={_layer_module_ids}",
162-
flush=True)
163-
164147
router_logits = self.gate(hidden_states)
165148
final_hidden_states = self.experts(
166149
hidden_states,
167150
router_logits,
168151
all_rank_num_tokens=all_rank_num_tokens,
169152
use_dp_padding=use_dp_padding,
170153
do_finalize=do_finalize,
171-
lora_params=lora_params,
172154
)
173155

174156
if not do_finalize:
@@ -243,7 +225,6 @@ def forward(
243225
spec_metadata: Optional[SpecMetadata] = None,
244226
mrope_config: Optional[Dict[str, torch.Tensor]] = None,
245227
deepstack_embeds: Optional[List[torch.Tensor]] = None,
246-
lora_params: Optional[dict] = None,
247228
**kwargs,
248229
) -> torch.Tensor:
249230
if residual is None:
@@ -260,7 +241,6 @@ def forward(
260241
all_reduce_params=AllReduceParams(
261242
enable_allreduce=not self.disable_attn_allreduce),
262243
mrope_config=mrope_config,
263-
lora_params=lora_params,
264244
**kwargs,
265245
)
266246

@@ -292,7 +272,6 @@ def forward(
292272
enable_allreduce=not (self.fusion_config.POST_MOE_FUSION
293273
or self.mapping.tp_size == 1)),
294274
do_finalize=do_finalize,
295-
lora_params=lora_params,
296275
)
297276

298277
if deepstack_embeds is not None and self.layer_idx in range(

tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py

Lines changed: 0 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1024,20 +1024,6 @@ def run_moe(
10241024
if lora_kwargs is None:
10251025
lora_kwargs = {}
10261026

1027-
# DEBUG(moe-lora): report what the CUTLASS MoE op actually receives for
1028-
# this layer -- which quant method, the input tensor type/dtype (this is
1029-
# the activation precision reaching the op), and whether a routed-expert
1030-
# LoRA delta is being fused. Lets us see how NVFP4 surfaces at runtime.
1031-
if self._moe_lora_active(lora_params):
1032-
_x_dtype = getattr(x, "dtype", None)
1033-
print(
1034-
f"[moe-lora-diag][py] layer={self.layer_idx} "
1035-
f"quant_method={type(self.quant_method).__name__} "
1036-
f"x_type={type(x).__name__} x_dtype={_x_dtype} "
1037-
f"x_sf_present={x_sf is not None} has_nvfp4={self.has_nvfp4} "
1038-
f"weight_dtype={weight_dtype} lora_active={bool(lora_kwargs)}",
1039-
flush=True)
1040-
10411027
result = torch.ops.trtllm.fused_moe(
10421028
x,
10431029
token_selected_experts,

0 commit comments

Comments
 (0)