@@ -1102,8 +1102,7 @@ def _get_shared_experts_quant_config(model_config,
11021102 return quant_config
11031103
11041104 def compute_routed_output (self , hidden_states , hidden_states_fp4 ,
1105- all_rank_num_tokens , do_finalize ,
1106- lora_params = None ):
1105+ all_rank_num_tokens , do_finalize ):
11071106 # max-throughput
11081107 use_dp_padding = False
11091108 # Add DP padding on SM120 for context comm performance
@@ -1124,7 +1123,6 @@ def compute_routed_output(self, hidden_states, hidden_states_fp4,
11241123 output_dtype = hidden_states .dtype ,
11251124 all_rank_num_tokens = all_rank_num_tokens ,
11261125 use_dp_padding = use_dp_padding ,
1127- lora_params = lora_params ,
11281126 ** ({
11291127 "alltoall_result_do_sum" : False
11301128 } if isinstance (self .experts , WideEPMoE ) else {}),
@@ -1139,24 +1137,10 @@ def forward(
11391137 all_rank_num_tokens : Optional [list [int ]] = None ,
11401138 final_all_reduce_params : Optional [AllReduceParams ] = None ,
11411139 do_finalize : Optional [bool ] = True ,
1142- lora_params : Optional [dict ] = None ,
11431140 ) -> torch .Tensor :
11441141 if not do_finalize :
11451142 assert not self .use_dp
11461143
1147- # DEBUG(moe-lora): confirm routed-expert LoRA params reach the DeepSeek
1148- # MoE module. Report the per-layer module ids present in lora_params
1149- # (routed-expert ids: moe_h_to_4h=13, moe_4h_to_h=14, moe_gate=15).
1150- _layer_idx = getattr (self .experts , "layer_idx" , None )
1151- _layer_module_ids = (sorted (lora_params .get (_layer_idx , {}).keys ()) if
1152- (lora_params and _layer_idx is not None ) else None )
1153- print (
1154- f"[deepseek-moe] layer={ _layer_idx } "
1155- f"experts_type={ type (self .experts ).__name__ } "
1156- f"lora_params_present={ lora_params is not None } "
1157- f"layer_module_ids={ _layer_module_ids } " ,
1158- flush = True )
1159-
11601144 def _compute_shared_output ():
11611145 shared_input = (hidden_states_fp4 if
11621146 (hidden_states_fp4 is not None
@@ -1171,8 +1155,7 @@ def _compute_routed_output():
11711155 routed_output = self .compute_routed_output (hidden_states ,
11721156 hidden_states_fp4 ,
11731157 all_rank_num_tokens ,
1174- do_finalize ,
1175- lora_params = lora_params )
1158+ do_finalize )
11761159 return routed_output
11771160
11781161 # NOTE: define compiled helpers at module scope to avoid defining decorators inside compiled frames
@@ -1424,15 +1407,12 @@ def forward(
14241407 attn_metadata : AttentionMetadata ,
14251408 residual : torch .Tensor ,
14261409 spec_metadata : Optional [SpecMetadata ] = None ,
1427- lora_params : Optional [dict ] = None ,
14281410 ** kwargs ,
14291411 ) -> Tuple [torch .Tensor , torch .Tensor ]:
14301412 if residual is None :
14311413 residual = hidden_states
14321414 hidden_states = self .input_layernorm (hidden_states )
1433- # Self Attention. DeepSeek attention is MLA, whose forward does not
1434- # accept lora_params, so it is intentionally not threaded here; routed-
1435- # expert MoE LoRA is applied in the MoE path below.
1415+ # Self Attention
14361416 hidden_states = self .self_attn (
14371417 position_ids = position_ids ,
14381418 hidden_states = hidden_states ,
@@ -1453,7 +1433,6 @@ def forward(
14531433 attn_metadata = attn_metadata ,
14541434 residual = residual ,
14551435 spec_metadata = spec_metadata ,
1456- lora_params = lora_params ,
14571436 )
14581437 else :
14591438 if spec_metadata is not None and spec_metadata .is_layer_capture (
@@ -1472,7 +1451,6 @@ def forward_MoE(
14721451 attn_metadata : AttentionMetadata ,
14731452 residual : torch .Tensor ,
14741453 spec_metadata : Optional [SpecMetadata ] = None ,
1475- lora_params : Optional [dict ] = None ,
14761454 ) -> Tuple [torch .Tensor , torch .Tensor ]:
14771455
14781456 def _run_MoE (hidden_states , hidden_states_fp4 , do_finalize ):
@@ -1484,7 +1462,6 @@ def _run_MoE(hidden_states, hidden_states_fp4, do_finalize):
14841462 enable_allreduce = not (self .fusion_config .POST_MOE_FUSION
14851463 or self .mapping .tp_size == 1 )),
14861464 do_finalize = do_finalize ,
1487- lora_params = lora_params ,
14881465 )
14891466
14901467 if self .fusion_config .PRE_MOE_FUSION :
@@ -1803,7 +1780,6 @@ def forward(
18031780 position_ids : Optional [torch .IntTensor ] = None ,
18041781 inputs_embeds : Optional [torch .FloatTensor ] = None ,
18051782 spec_metadata : Optional [SpecMetadata ] = None ,
1806- lora_params : Optional [dict ] = None ,
18071783 ** kwargs ,
18081784 ) -> torch .Tensor :
18091785 if (input_ids is None ) ^ (inputs_embeds is not None ):
@@ -1825,7 +1801,6 @@ def forward(
18251801 attn_metadata = attn_metadata ,
18261802 residual = residual ,
18271803 spec_metadata = spec_metadata ,
1828- lora_params = lora_params ,
18291804 )
18301805
18311806 hidden_states = maybe_allgather_for_helix_cp (hidden_states ,
0 commit comments