Skip to content

Commit 2c50141

Browse files
committed
minor
Signed-off-by: Kinjal Patel <kinjalpravin@nvidia.com>
1 parent 482f492 commit 2c50141

2 files changed

Lines changed: 12 additions & 6 deletions

File tree

modelopt/torch/export/plugins/vllm_fakequant_megatron.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,9 @@ def _get_quantized_state(
137137
block_size = 0
138138
name_to_value = self._get_weight_bias(module, dtype, name_to_value)
139139
if "weight" in name_to_value:
140-
weight = name_to_value["weight"]
140+
# Use the original device (avoid the CPU round-trip introduced by _get_weight_bias;
141+
# fake-quantization runs on CUDA and the result is moved to CPU below).
142+
weight = module.weight.to(dtype)
141143
# Fold the weight_quantizer into the weight by applying fake-quantization
142144
# (quantize then dequantize). The weight_quantizer amax is not exported;
143145
# the vLLM fakequant reload path disables the weight quantizer when absent.

modelopt/torch/export/unified_export_megatron.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -747,18 +747,24 @@ def _get_weight_bias(
747747
self,
748748
module: torch.nn.Module,
749749
dtype: torch.dtype = torch.float16,
750-
name_to_value: dict[str, torch.Tensor] = {},
750+
name_to_value: dict[str, torch.Tensor] | None = None,
751751
) -> dict[str, torch.Tensor]:
752752
"""Get the weight and bias of the module.
753753
754754
Args:
755755
module: The target module to get the weight and bias.
756756
dtype: The data type of the weight and bias.
757-
name_to_value: The dictionary to store the weight and bias.
757+
name_to_value: The dictionary to store the weight and bias. A new dict is created
758+
if not provided.
758759
759760
Returns:
760761
The dictionary containing the weight and bias.
761762
"""
763+
if name_to_value is None:
764+
name_to_value = {}
765+
# numel() > 0 intentionally excludes zero-element weight tensors (e.g. MoE routing
766+
# layers whose weight is a placeholder) so callers can use "weight" in name_to_value
767+
# as a reliable guard without re-inspecting module.weight.
762768
if hasattr(module, "weight") and module.weight is not None and module.weight.numel() > 0:
763769
weight = module.weight.to(dtype).cpu()
764770
name_to_value["weight"] = weight
@@ -801,9 +807,7 @@ def _get_quantized_state(
801807

802808
name_to_value = self._get_weight_bias(module, dtype, name_to_value)
803809

804-
if not (
805-
hasattr(module, "weight") and module.weight is not None and module.weight.numel() > 0
806-
):
810+
if "weight" not in name_to_value:
807811
return name_to_value, qformat, block_size
808812

809813
if qformat == QUANTIZATION_NONE:

0 commit comments

Comments (0)