Skip to content

Commit a99f27f

Browse files
committed
minor
Signed-off-by: Kinjal Patel <kinjalpravin@nvidia.com>
1 parent b6b6d1e commit a99f27f

1 file changed

Lines changed: 6 additions & 0 deletions

File tree

  • modelopt/torch/quantization/plugins

modelopt/torch/quantization/plugins/vllm.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -365,6 +365,9 @@ def invoke_fused_moe_quantized(
     self.w13_weight,
     self.w13_weight_quantizer(self.w13_weight),
 )
+# In case the weight quantizer isn't folded yet in vllm_serve_fakequant, pass the
+# quantized weight to the kernel.
+B = self.w13_weight  # noqa: N806
 vllm_fused_moe_package._invoke_fused_moe_kernel(A, B, C, *args, **kwargs)
 self.w13_weight = original_weight
 else:
@@ -378,6 +381,9 @@ def invoke_fused_moe_quantized(
     self.w2_weight,
     self.w2_weight_quantizer(self.w2_weight),
 )
+# In case the weight quantizer isn't folded yet in vllm_serve_fakequant, pass the
+# quantized weight to the kernel.
+B = self.w2_weight  # noqa: N806
 vllm_fused_moe_package._invoke_fused_moe_kernel(A, B, C, *args, **kwargs)
 self.w2_weight = original_weight
 else:

0 commit comments

Comments
 (0)