Commit 8151232

Merge branch 'main' into jingyux/diffusion-skip-softmax
2 parents 1f8f0d3 + fcb09bf commit 8151232

2 files changed: 32 additions & 18 deletions

modelopt/torch/export/layer_utils.py (12 additions & 0 deletions)
@@ -1184,6 +1184,18 @@ def sync_moe_gate_up_amax(model: nn.Module) -> int:
             up_amax = getattr(up_wq, "amax", None)
             if gate_amax is None or up_amax is None:
                 break
+            # Meta tensors have no storage (e.g. CPU-offloaded experts that
+            # were never activated during calibration). Skip — there is no
+            # real amax data to sync.
+            if gate_amax.is_meta or up_amax.is_meta:
+                warn(
+                    f"Skipping gate/up amax sync for expert with meta tensors "
+                    f"(gate_amax.is_meta={gate_amax.is_meta}, "
+                    f"up_amax.is_meta={up_amax.is_meta}). "
+                    f"This typically means the expert was CPU-offloaded and "
+                    f"not activated during calibration."
+                )
+                break
             if not torch.equal(gate_amax, up_amax):
                 shared_amax = torch.max(gate_amax, up_amax)
                 gate_wq.amax = shared_amax
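
For context on what the new guard catches, a minimal, self-contained sketch of the failure mode follows. This is an illustration only, not the modelopt code path; the two amax tensors below are hypothetical stand-ins for the gate/up weight-quantizer buffers.

import torch
from warnings import warn

# A meta tensor carries shape and dtype metadata but no storage, so there is
# no real calibration statistic to read or compare.
gate_amax = torch.empty(1, device="meta")  # looks like a never-calibrated, offloaded expert
up_amax = torch.tensor([2.5])

if gate_amax.is_meta or up_amax.is_meta:
    # Mirrors the new guard: comparing or reducing a meta tensor has no
    # meaningful result, so the sync is skipped with a warning.
    warn("Skipping gate/up amax sync: meta tensor has no real amax data.")
else:
    # Normal path from the surrounding code: gate and up share the larger
    # amax so the fused gate/up projection exports one quantization scale.
    shared_amax = torch.max(gate_amax, up_amax)
    print("shared amax:", shared_amax.item())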

modelopt/torch/export/unified_export_megatron.py (20 additions & 18 deletions)
@@ -419,16 +419,25 @@ def _get_state_dict(self):
         if hasattr(model, "output_layer") and not model.share_embeddings_and_output_weights:
             self.rules["output_layer"](model.output_layer)
 
+    def _get_fused_norm_weight(self, module):
+        """Return ``module.layer_norm_weight`` when TE fuses the norm into a linear layer.
+
+        Returns ``None`` when the ``"fused_norm"`` rule is absent or the module has no
+        ``layer_norm_weight`` attribute (or its value is ``None``).
+        """
+        if "fused_norm" not in self.rules:
+            return None
+        return getattr(module, "layer_norm_weight", None)
+
     def _get_transformer_layer_state_dict(self, layer, layer_id):
         if not isinstance(layer.input_layernorm, IdentityOp):
             self.rules["input_layernorm"](layer.input_layernorm, layer_id)
         elif (
-            hasattr(layer.self_attention, "linear_qkv")
-            and hasattr(layer.self_attention.linear_qkv, "layer_norm_weight")
-            and layer.self_attention.linear_qkv.layer_norm_weight is not None
-            and "fused_norm" in self.rules
-        ):
-            self.rules["fused_norm"](layer.self_attention.linear_qkv.layer_norm_weight, layer_id)
+            norm_weight := self._get_fused_norm_weight(
+                getattr(layer.self_attention, "linear_qkv", None)
+            )
+        ) is not None:
+            self.rules["fused_norm"](norm_weight, layer_id)
 
         if not isinstance(layer.self_attention, IdentityOp):
             if "MLASelfAttention" in str(type(layer.self_attention)):
@@ -470,12 +479,10 @@ def _get_transformer_layer_state_dict(self, layer, layer_id):
         elif (
             not isinstance(layer.mlp, IdentityOp)
             and "MoE" not in str(type(layer.mlp))
-            and hasattr(layer.mlp, "linear_fc1")
-            and hasattr(layer.mlp.linear_fc1, "layer_norm_weight")
-            and layer.mlp.linear_fc1.layer_norm_weight is not None
-            and "fused_norm" in self.rules
+            and (norm_weight := self._get_fused_norm_weight(getattr(layer.mlp, "linear_fc1", None)))
+            is not None
         ):
-            self.rules["fused_norm"](layer.mlp.linear_fc1.layer_norm_weight, layer_id)
+            self.rules["fused_norm"](norm_weight, layer_id)
 
         if not isinstance(layer.mlp, IdentityOp):
             if "MoE" in str(type(layer.mlp)):
@@ -555,14 +562,9 @@ def _get_mtp_state_dict(self) -> dict[str, torch.Tensor]:
     def _get_mamba_layer_state_dict(self, layer, layer_id):
         if not isinstance(layer.norm, IdentityOp):
             self.rules["norm"](layer.norm, layer_id)
-        elif (
-            isinstance(layer.norm, IdentityOp)
-            and hasattr(layer.mixer.in_proj, "layer_norm_weight")
-            and layer.mixer.in_proj.layer_norm_weight is not None
-            and "fused_norm" in self.rules
-        ):
+        elif (norm_weight := self._get_fused_norm_weight(layer.mixer.in_proj)) is not None:
             # TE spec: norm is fused into in_proj (QuantTELayerNormColumnParallelLinear).
-            self.rules["fused_norm"](layer.mixer.in_proj.layer_norm_weight, layer_id)
+            self.rules["fused_norm"](norm_weight, layer_id)
 
         self.rules["mixer_norm"](layer.mixer.norm, layer_id)
         self.rules["A_log"](layer.mixer.A_log, layer_id)
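
The recurring change in this file is a single refactor: each four-clause hasattr chain collapses into the new _get_fused_norm_weight helper plus an assignment expression (the walrus operator). A minimal standalone sketch of the pattern follows, using illustrative stand-in names (rules, get_fused_norm_weight, layer) rather than the real exporter class:

from types import SimpleNamespace

rules = {"fused_norm": lambda weight: print(f"export fused norm: {weight}")}

def get_fused_norm_weight(module):
    """Return module.layer_norm_weight, or None if the rule or attribute is missing."""
    if "fused_norm" not in rules:
        return None
    # getattr with a default collapses hasattr + attribute access + None check,
    # and also tolerates module itself being None.
    return getattr(module, "layer_norm_weight", None)

layer = SimpleNamespace(linear_qkv=SimpleNamespace(layer_norm_weight="gamma"))

# The walrus operator binds the lookup result and tests it in one expression,
# so each call site shrinks from a four-clause condition to a single check.
if (norm_weight := get_fused_norm_weight(getattr(layer, "linear_qkv", None))) is not None:
    rules["fused_norm"](norm_weight)

Passing getattr(layer, "linear_qkv", None) instead of guarding with hasattr also covers the case where the submodule itself is absent, which is why the Mamba call site can pass layer.mixer.in_proj directly: that attribute always exists on the layers it handles.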
