Skip to content

Commit dd948e5

Browse files
committed
bug fix
Signed-off-by: Kinjal Patel <kinjalpravin@nvidia.com>
1 parent 065cfca commit dd948e5

1 file changed

Lines changed: 4 additions & 14 deletions

File tree

  • modelopt/torch/quantization/plugins

modelopt/torch/quantization/plugins/vllm.py

Lines changed: 4 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -385,13 +385,8 @@ def _invoke_fused_moe_quantized_function(
385385
# First layer of expert
386386
A = self.w13_input_quantizer(A) # noqa: N806
387387
if self.w13_weight_quantizer.is_enabled: # pragma: no cover
388-
original_weight, self.w13_weight = (
389-
self.w13_weight,
390-
self.w13_weight_quantizer(self.w13_weight),
391-
)
392-
# In case the weight quantizer isn't folded yet in vllm_serve_fakequant, pass the
393-
# quantized weight to the kernel.
394-
B = self.w13_weight # noqa: N806
388+
original_weight = self.w13_weight
389+
B = self.w13_weight_quantizer(original_weight) # noqa: N806
395390
try:
396391
original_kernel(A, B, C, *args, **kwargs)
397392
finally:
@@ -403,13 +398,8 @@ def _invoke_fused_moe_quantized_function(
403398
elif B is self.w2_weight:
404399
A = self.w2_input_quantizer(A) # noqa: N806
405400
if self.w2_weight_quantizer.is_enabled: # pragma: no cover
406-
original_weight, self.w2_weight = (
407-
self.w2_weight,
408-
self.w2_weight_quantizer(self.w2_weight),
409-
)
410-
# In case the weight quantizer isn't folded yet in vllm_serve_fakequant, pass the
411-
# quantized weight to the kernel.
412-
B = self.w2_weight # noqa: N806
401+
original_weight = self.w2_weight
402+
B = self.w2_weight_quantizer(original_weight) # noqa: N806
413403
try:
414404
original_kernel(A, B, C, *args, **kwargs)
415405
finally:

0 commit comments

Comments (0)