@@ -385,13 +385,8 @@ def _invoke_fused_moe_quantized_function(
385385 # First layer of expert
386386 A = self .w13_input_quantizer (A ) # noqa: N806
387387 if self .w13_weight_quantizer .is_enabled : # pragma: no cover
388- original_weight , self .w13_weight = (
389- self .w13_weight ,
390- self .w13_weight_quantizer (self .w13_weight ),
391- )
392- # In case the weight quantizer isn't folded yet in vllm_serve_fakequant, pass the
393- # quantized weight to the kernel.
394- B = self .w13_weight # noqa: N806
388+ original_weight = self .w13_weight
389+ B = self .w13_weight_quantizer (original_weight ) # noqa: N806
395390 try :
396391 original_kernel (A , B , C , * args , ** kwargs )
397392 finally :
@@ -403,13 +398,8 @@ def _invoke_fused_moe_quantized_function(
403398 elif B is self .w2_weight :
404399 A = self .w2_input_quantizer (A ) # noqa: N806
405400 if self .w2_weight_quantizer .is_enabled : # pragma: no cover
406- original_weight , self .w2_weight = (
407- self .w2_weight ,
408- self .w2_weight_quantizer (self .w2_weight ),
409- )
410- # In case the weight quantizer isn't folded yet in vllm_serve_fakequant, pass the
411- # quantized weight to the kernel.
412- B = self .w2_weight # noqa: N806
401+ original_weight = self .w2_weight
402+ B = self .w2_weight_quantizer (original_weight ) # noqa: N806
413403 try :
414404 original_kernel (A , B , C , * args , ** kwargs )
415405 finally :
0 commit comments