Skip to content

Commit 065cfca

Browse files
committed
fixed bug
Signed-off-by: Kinjal Patel <kinjalpravin@nvidia.com>
1 parent 2fef374 commit 065cfca

2 files changed

Lines changed: 39 additions & 19 deletions

File tree

modelopt/torch/export/plugins/vllm_fakequant_megatron.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -135,9 +135,9 @@ def _get_quantized_state(
135135
# string then it usually ends with "." which needs to be removed.
136136
self.exclude_modules.append(prefix.removesuffix("."))
137137
block_size = 0
138-
139-
if hasattr(module, "weight") and module.weight is not None:
140-
weight = module.weight.to(dtype)
138+
name_to_value = self._get_weight_bias(module, dtype, name_to_value)
139+
if "weight" in name_to_value:
140+
weight = name_to_value["weight"]
141141
# Fold the weight_quantizer into the weight by applying fake-quantization
142142
# (quantize then dequantize). The weight_quantizer amax is not exported;
143143
# the vLLM fakequant reload path disables the weight quantizer when absent.
@@ -171,9 +171,6 @@ def _get_quantized_state(
171171
else:
172172
return name_to_value, qformat, block_size
173173

174-
if hasattr(module, "bias") and module.bias is not None:
175-
name_to_value["bias"] = module.bias.to(dtype).cpu()
176-
177174
# Only save input/output quantizer state; weight_quantizer amax is not exported
178175
# since it has been folded into the weight above.
179176
for name, param in get_quantizer_state_dict(module).items():

modelopt/torch/export/unified_export_megatron.py

Lines changed: 36 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -743,6 +743,38 @@ def _custom_mapping_to_lambda(mapping):
743743

744744
return all_rules
745745

746+
def _get_weight_bias(
747+
self,
748+
module: torch.nn.Module,
749+
dtype: torch.dtype = torch.float16,
750+
name_to_value: dict[str, torch.Tensor] = {},
751+
) -> dict[str, torch.Tensor]:
752+
"""Get the weight and bias of the module.
753+
754+
Args:
755+
module: The target module to get the weight and bias.
756+
dtype: The data type of the weight and bias.
757+
name_to_value: The dictionary to store the weight and bias.
758+
759+
Returns:
760+
The dictionary containing the weight and bias.
761+
"""
762+
if hasattr(module, "weight") and module.weight is not None and module.weight.numel() > 0:
763+
weight = module.weight.to(dtype).cpu()
764+
name_to_value["weight"] = weight
765+
766+
if hasattr(module, "bias") and module.bias is not None and module.bias.numel() > 0:
767+
name_to_value["bias"] = module.bias.to(dtype).cpu()
768+
769+
if (
770+
hasattr(module, "expert_bias")
771+
and module.expert_bias is not None
772+
and module.expert_bias.numel() > 0
773+
):
774+
name_to_value["expert_bias"] = module.expert_bias.to(dtype).cpu()
775+
776+
return name_to_value
777+
746778
def _get_quantized_state(
747779
self,
748780
module: torch.nn.Module,
@@ -767,21 +799,12 @@ def _get_quantized_state(
767799
self.exclude_modules.append(prefix.removesuffix("."))
768800
block_size = get_weight_block_size(module)
769801

770-
if hasattr(module, "weight") and module.weight is not None and module.weight.numel() > 0:
771-
weight = module.weight.to(dtype).cpu()
772-
name_to_value["weight"] = weight
773-
else:
774-
return name_to_value, qformat, block_size
775-
776-
if hasattr(module, "bias") and module.bias is not None and module.bias.numel() > 0:
777-
name_to_value["bias"] = module.bias.to(dtype).cpu()
802+
name_to_value = self._get_weight_bias(module, dtype, name_to_value)
778803

779-
if (
780-
hasattr(module, "expert_bias")
781-
and module.expert_bias is not None
782-
and module.expert_bias.numel() > 0
804+
if not (
805+
hasattr(module, "weight") and module.weight is not None and module.weight.numel() > 0
783806
):
784-
name_to_value["expert_bias"] = module.expert_bias.to(dtype).cpu()
807+
return name_to_value, qformat, block_size
785808

786809
if qformat == QUANTIZATION_NONE:
787810
return name_to_value, qformat, block_size

0 commit comments

Comments (0)