Skip to content

Commit a99f27f

Browse files
committed
minor
Signed-off-by: Kinjal Patel <kinjalpravin@nvidia.com>
1 parent b6b6d1e commit a99f27f

1 file changed

Lines changed: 6 additions & 0 deletions

File tree

  • modelopt/torch/quantization/plugins

modelopt/torch/quantization/plugins/vllm.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -365,6 +365,9 @@ def invoke_fused_moe_quantized(
     self.w13_weight,
     self.w13_weight_quantizer(self.w13_weight),
 )
+# In case the weight quantizer isn't folded yet in vllm_serve_fakequant, pass the
+# quantized weight to the kernel.
+B = self.w13_weight  # noqa: N806
 vllm_fused_moe_package._invoke_fused_moe_kernel(A, B, C, *args, **kwargs)
 self.w13_weight = original_weight
 else:
@@ -378,6 +381,9 @@ def invoke_fused_moe_quantized(
     self.w2_weight,
     self.w2_weight_quantizer(self.w2_weight),
 )
+# In case the weight quantizer isn't folded yet in vllm_serve_fakequant, pass the
+# quantized weight to the kernel.
+B = self.w2_weight  # noqa: N806
 vllm_fused_moe_package._invoke_fused_moe_kernel(A, B, C, *args, **kwargs)
 self.w2_weight = original_weight
 else:

0 commit comments

Comments
 (0)