Skip to content

Commit 571f26b

Browse files
committed
addressed comment
Signed-off-by: Kinjal Patel <kinjalpravin@nvidia.com>
1 parent 6441197 commit 571f26b

File tree

1 file changed

+8
-4
lines changed
  • modelopt/torch/quantization/plugins

1 file changed

+8
-4
lines changed

modelopt/torch/quantization/plugins/vllm.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -392,8 +392,10 @@ def _invoke_fused_moe_quantized_function(
             # In case the weight quantizer isn't folded yet in vllm_serve_fakequant, pass the
             # quantized weight to the kernel.
             B = self.w13_weight  # noqa: N806
-            original_kernel(A, B, C, *args, **kwargs)
-            self.w13_weight = original_weight
+            try:
+                original_kernel(A, B, C, *args, **kwargs)
+            finally:
+                self.w13_weight = original_weight
         else:
             original_kernel(A, B, C, *args, **kwargs)
         if self.w13_output_quantizer.is_enabled:
@@ -408,8 +410,10 @@ def _invoke_fused_moe_quantized_function(
             # In case the weight quantizer isn't folded yet in vllm_serve_fakequant, pass the
             # quantized weight to the kernel.
             B = self.w2_weight  # noqa: N806
-            original_kernel(A, B, C, *args, **kwargs)
-            self.w2_weight = original_weight
+            try:
+                original_kernel(A, B, C, *args, **kwargs)
+            finally:
+                self.w2_weight = original_weight
         else:
             original_kernel(A, B, C, *args, **kwargs)
         if self.w2_output_quantizer.is_enabled:

0 commit comments

Comments
 (0)