@@ -323,7 +323,7 @@ def __init__(self,
323323
324324 # After loading both gate_up_proj and down_proj, we need to set the scales needed by the special kernels and by
325325 # the trtllm-gen gemm+swiglu kernel.
326- def cache_derived_state (self ):
326+ def cache_derived_state (self ) -> None :
327327 if self .gate_up_proj .has_fp8_qdq :
328328 # For the special gemm+swiglu kernel, we need to set the inverse of the output scale, which is the inverse
329329 # of down_proj's combined input scale.
@@ -332,7 +332,7 @@ def cache_derived_state(self):
332332 # combined input scale times inv_output_scale.
333333 self .gate_up_proj .trtllm_gen_global_scale = self .gate_up_proj .combined_scale * self .gate_up_proj .inv_output_scale
334334
335- def post_load_weights (self ):
335+ def post_load_weights (self ) -> None :
336336 self .cache_derived_state ()
337337
338338 def forward (
@@ -584,7 +584,7 @@ def __init__(
584584 dtype = model_config .pretrained_config .torch_dtype ,
585585 quant_config = None )
586586
587- def cache_derived_state (self ):
587+ def cache_derived_state (self ) -> None :
588588 # Set min-latency quant scales for routed experts if we plan to use min-latency MoE kernels.
589589 # This is because the routed experts' input scale is after the score multiplication, so we must use the
590590 # pre-score scaling input scale, which happens to be shared expert's input scale.
@@ -600,7 +600,7 @@ def cache_derived_state(self):
600600 fc1_input_dequant = pre_score_scaling_input_scale ,
601601 )
602602
603- def post_load_weights (self ):
603+ def post_load_weights (self ) -> None :
604604 self .cache_derived_state ()
605605
606606 def compute_routed_output (
0 commit comments