Skip to content

Commit dd948e5

Browse files
committed
bug fix
Signed-off-by: Kinjal Patel <kinjalpravin@nvidia.com>
1 parent 065cfca commit dd948e5

1 file changed

Lines changed: 4 additions & 14 deletions

File tree

  • modelopt/torch/quantization/plugins

modelopt/torch/quantization/plugins/vllm.py

Lines changed: 4 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -385,13 +385,8 @@ def _invoke_fused_moe_quantized_function(
385385
# First layer of expert
386386
A = self.w13_input_quantizer(A) # noqa: N806
387387
if self.w13_weight_quantizer.is_enabled: # pragma: no cover
388-
original_weight, self.w13_weight = (
389-
self.w13_weight,
390-
self.w13_weight_quantizer(self.w13_weight),
391-
)
392-
# In case the weight quantizer isn't folded yet in vllm_serve_fakequant, pass the
393-
# quantized weight to the kernel.
394-
B = self.w13_weight # noqa: N806
388+
original_weight = self.w13_weight
389+
B = self.w13_weight_quantizer(original_weight) # noqa: N806
395390
try:
396391
original_kernel(A, B, C, *args, **kwargs)
397392
finally:
@@ -403,13 +398,8 @@ def _invoke_fused_moe_quantized_function(
403398
elif B is self.w2_weight:
404399
A = self.w2_input_quantizer(A) # noqa: N806
405400
if self.w2_weight_quantizer.is_enabled: # pragma: no cover
406-
original_weight, self.w2_weight = (
407-
self.w2_weight,
408-
self.w2_weight_quantizer(self.w2_weight),
409-
)
410-
# In case the weight quantizer isn't folded yet in vllm_serve_fakequant, pass the
411-
# quantized weight to the kernel.
412-
B = self.w2_weight # noqa: N806
401+
original_weight = self.w2_weight
402+
B = self.w2_weight_quantizer(original_weight) # noqa: N806
413403
try:
414404
original_kernel(A, B, C, *args, **kwargs)
415405
finally:

0 commit comments

Comments (0)