fix(prune): support HybridModel in mcore_minitron + Qwen3 fused-TE import

kevalmorabia97 · kevalmorabia97 · commit df7ab6388b30 · 2026-05-15T11:53:47.000-07:00
Pruning Megatron-LM HybridModel-based models (Nemotron-H et al.) under the modern Megatron-LM layout (HybridModel as the parent of MambaModel) silently produced an unloadable checkpoint:

- `_DynamicMCoreLanguageModel` only registered `GPTModel` and `MambaModel` in `SUPPORTED_MODELS`; `HybridModel` instances fell through the dynamic-space converter.
- `MCoreMinitronConfig.default_rules` likewise had no entry for HybridModel, so `convert_to_dynamic` ran `mod.freeze()` on the top-level model. That collapsed `hidden_size` and `num_layers` to a single choice each, so `_prune` skipped them — yielding a saved checkpoint with pruned per-layer dims but unpruned hidden/depth.

For GPT-family models (Qwen3) under `--export-default-te-spec`, the fused `TELayerNormColumnParallelLinear.layer_norm_weight` was never loaded from HF: the importer's fused-norm path was keyed on a single `fused_norm` rule (only Nemotron-H provided it, mapping a single HF norm tensor per layer). Standard transformer layers need separate attention vs MLP norm sources.

Changes:

- `nas/plugins/megatron.py`: register `HybridModel` in `SUPPORTED_MODELS` under a new `HAS_HYBRID` flag; have `_DynamicTEQKVLayerNormColumnParallelLinear` track `in_features` so TE's forward-time `inp_shape[-1] == in_features` assertion holds when hidden_size is pruned.
- `prune/plugins/mcore_minitron.py`: add HybridModel entry to `MCoreMinitronConfig.default_rules`, gated on `HAS_HYBRID`.
- `export/plugins/megatron_importer.py`: prefer per-context keys `fused_input_layernorm` / `fused_pre_mlp_layernorm`, fall back to legacy `fused_norm` for Nemotron-H back-compat.
- `export/plugins/mcore_qwen.py`: add the two fused-norm rules for Qwen3, mapping to `model.layers.{}.input_layernorm.weight` and `model.layers.{}.post_attention_layernorm.weight`.
- `utils/plugins/megatron_generate.py`: `.contiguous()` on the logits slice before `broadcast_from_last_pipeline_stage`, which asserts contiguity when SP pads seq_length up to a multiple of TP.
- `utils/plugins/megatron_mmlu.py`: accept a `mmlu_dataset` kwarg so callers can point at a local copy.

Consumer: Megatron-LM PR NVIDIA/Megatron-LM#4807

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
diff --git a/modelopt/torch/export/plugins/mcore_qwen.py b/modelopt/torch/export/plugins/mcore_qwen.py
@@ -35,12 +35,17 @@
     "output_layer": NameRemapping("lm_head.", COL_TP),
     # Attention
     "input_layernorm": NameRemapping("model.layers.{}.input_layernorm.", REPLICATE),
+    # Fused TE spec (TELayerNormColumnParallelLinear): the LayerNorm weight lives on
+    # linear_qkv.layer_norm_weight, loaded directly from the HF norm tensor (no `.weight` suffix
+    # appended since the value is a Parameter, not a sub-module).
+    "fused_input_layernorm": NameRemapping("model.layers.{}.input_layernorm.weight"),
     "linear_qkv": QKVMerging("model.layers.{}.self_attn.", COL_TP),
     "linear_proj": NameRemapping("model.layers.{}.self_attn.o_proj.", ROW_TP),
     "q_layernorm": NameRemapping("model.layers.{}.self_attn.q_norm.", REPLICATE),
     "k_layernorm": NameRemapping("model.layers.{}.self_attn.k_norm.", REPLICATE),
     # MLP
     "pre_mlp_layernorm": NameRemapping("model.layers.{}.post_attention_layernorm.", REPLICATE),
+    "fused_pre_mlp_layernorm": NameRemapping("model.layers.{}.post_attention_layernorm.weight"),
     "linear_fc1": GatedMLPMerging("model.layers.{}.mlp.", COL_TP),
     "linear_fc2": NameRemapping("model.layers.{}.mlp.down_proj.", ROW_TP),
     # MoE
diff --git a/modelopt/torch/export/plugins/megatron_importer.py b/modelopt/torch/export/plugins/megatron_importer.py
@@ -599,16 +599,24 @@ def _import_transformer_layer(self, layer, layer_id, layer_pbar, is_mtp: bool =
                     )
 
             # TE spec: input_layernorm is fused into linear_qkv (TELayerNormColumnParallelLinear).
-            # Load the fused layer_norm_weight from the HF norm path.
+            # Load the fused layer_norm_weight from the HF norm path. Prefer the explicit
+            # per-norm key (needed for standard GPT models where attention and MLP fused norms
+            # map to different HF tensors); fall back to `fused_norm` for Nemotron-H style
+            # (one norm per layer, shared across attention/mlp/mamba slots).
             if (
                 isinstance(layer.input_layernorm, IdentityOp)
                 and hasattr(attention, "linear_qkv")
                 and hasattr(attention.linear_qkv, "layer_norm_weight")
-                and "fused_norm" in self.rules
             ):
-                self.rules["fused_norm"](
-                    attention.linear_qkv.layer_norm_weight, layer_id, is_mtp=is_mtp
+                fused_key = (
+                    "fused_input_layernorm"
+                    if "fused_input_layernorm" in self.rules
+                    else "fused_norm"
                 )
+                if fused_key in self.rules:
+                    self.rules[fused_key](
+                        attention.linear_qkv.layer_norm_weight, layer_id, is_mtp=is_mtp
+                    )
 
         if not isinstance(layer.pre_mlp_layernorm, IdentityOp):
             self.rules["pre_mlp_layernorm"](layer.pre_mlp_layernorm, layer_id, is_mtp=is_mtp)
@@ -707,16 +715,20 @@ def _import_transformer_layer(self, layer, layer_id, layer_pbar, is_mtp: bool =
                 self.rules["linear_fc2"](layer.mlp.linear_fc2, layer_id, is_mtp=is_mtp)
 
                 # TE spec: pre_mlp_layernorm is fused into linear_fc1
-                # (TELayerNormColumnParallelLinear).
-                # Load the fused layer_norm_weight from the HF norm path.
-                if (
-                    isinstance(layer.pre_mlp_layernorm, IdentityOp)
-                    and hasattr(layer.mlp.linear_fc1, "layer_norm_weight")
-                    and "fused_norm" in self.rules
+                # (TELayerNormColumnParallelLinear). See input_layernorm path above for the
+                # rule-key fallback rationale.
+                if isinstance(layer.pre_mlp_layernorm, IdentityOp) and hasattr(
+                    layer.mlp.linear_fc1, "layer_norm_weight"
                 ):
-                    self.rules["fused_norm"](
-                        layer.mlp.linear_fc1.layer_norm_weight, layer_id, is_mtp=is_mtp
+                    fused_key = (
+                        "fused_pre_mlp_layernorm"
+                        if "fused_pre_mlp_layernorm" in self.rules
+                        else "fused_norm"
                     )
+                    if fused_key in self.rules:
+                        self.rules[fused_key](
+                            layer.mlp.linear_fc1.layer_norm_weight, layer_id, is_mtp=is_mtp
+                        )
 
     def _import_state_dict(self):
         model = self.model
diff --git a/modelopt/torch/nas/plugins/megatron.py b/modelopt/torch/nas/plugins/megatron.py
@@ -79,6 +79,18 @@
 except ImportError:
     HAS_MAMBA = False
 
+# Newer Megatron-LM makes HybridModel the parent of MambaModel and instantiates Nemotron-H
+# et al. as plain HybridModel. Register it as well so the dynamic-space converter can build
+# a configurable search space on hybrid models.
+try:
+    from megatron.core.models.hybrid.hybrid_model import HybridModel
+
+    SUPPORTED_MODELS[HybridModel] = "megatron.core.models.hybrid.HybridModel"
+
+    HAS_HYBRID = True
+except ImportError:
+    HAS_HYBRID = False
+
 __all__ = ["get_te_mamba_stack_spec"]
 
 
@@ -394,6 +406,9 @@ def _setup(self, *, num_attention_heads: NumAttentionHeadsHp, hidden_size: Trace
             lambda mod, val: (num_attention_heads.active + 2 * mod.config.num_query_groups)
             * mod.config.kv_channels,
         )
+        # in_features must track input_size so TE's forward-time inp_shape[-1] == in_features
+        # assertion holds when hidden_size is pruned.
+        self._register_dynamic_attribute("in_features", lambda mod, val: mod.input_size)
         self._register_dynamic_attribute("weight", self._get_weight)
         # TE stores a zero-length tensor (not None) when bias=False; only register if non-empty
         if hasattr(self, "bias") and self.bias is not None and self.bias.numel() > 0:
diff --git a/modelopt/torch/prune/plugins/mcore_minitron.py b/modelopt/torch/prune/plugins/mcore_minitron.py
@@ -56,6 +56,7 @@
 
 from modelopt.torch.nas.conversion import NASModeRegistry
 from modelopt.torch.nas.plugins.megatron import (
+    HAS_HYBRID,
     HAS_MAMBA,
     SUPPORTED_MODELS,
     _DynamicMambaLayer,
@@ -756,6 +757,19 @@ def _compute_candidate_metrics(self, ss_config: dict, max_num_layers: int) -> di
                 if HAS_MAMBA
                 else {}
             ),
+            **(
+                {
+                    "megatron.core.models.hybrid.HybridModel": {
+                        "hidden_size_divisor": 256,
+                        "ffn_hidden_size_divisor": 512,
+                        "mamba_head_dim_divisor": 8,
+                        "num_moe_experts_divisor": 8,
+                        "num_layers_divisor": 2,
+                    },
+                }
+                if HAS_HYBRID
+                else {}
+            ),
         },
         doc='Configuration for the ``"mcore_minitron"`` mode.',
     ),
diff --git a/modelopt/torch/utils/plugins/megatron_generate.py b/modelopt/torch/utils/plugins/megatron_generate.py
@@ -150,7 +150,9 @@ def megatron_prefill(
         )
         send_to_next_pipeline_rank(output.to(dtype=pp_dtype))
 
-    logits = output[:, :seq_length, :].detach() if pp_last else None
+    # .contiguous() is required because the slice is a view with the padded stride; the broadcast
+    # below asserts contiguity when SP pads seq_length up to a multiple of TP.
+    logits = output[:, :seq_length, :].detach().contiguous() if pp_last else None
 
     if model.config.bf16:
         logits_dtype = torch.bfloat16
diff --git a/modelopt/torch/utils/plugins/megatron_mmlu.py b/modelopt/torch/utils/plugins/megatron_mmlu.py
@@ -60,6 +60,7 @@ def megatron_mmlu(
     few_shots: int = 0,
     fraction: float = 0.05,
     batch_size: int = 1,
+    mmlu_dataset: str = "cais/mmlu",
 ) -> float:
     """Evaluate the model on MMLU using log-likelihood scoring over batched prefill passes.
 
@@ -73,6 +74,8 @@ def megatron_mmlu(
         few_shots: The number of few-shot examples to use.
         fraction: The fraction of the test set to evaluate on.
         batch_size: Number of examples to process in one forward pass.
+        mmlu_dataset: HF dataset name or local MMLU dataset path passed to `datasets.load_dataset`.
+            Defaults to ``cais/mmlu``.
     """
     print_rank_0(
         f"\nMMLU ({fraction * 100}%, {few_shots}-shot, Batch Size: {batch_size}) evaluation started...\n"
@@ -104,8 +107,8 @@ def _generate_prompt(test_example, dev_examples, few_shots=0):
 
     # Load all subjects in two dataset calls instead of 2x num_subjects calls.
     # The "all" config includes a "subject" field for per-subject reporting.
-    test_dataset = load_dataset("cais/mmlu", "all", split="test")
-    dev_dataset = load_dataset("cais/mmlu", "all", split="dev") if few_shots > 0 else None
+    test_dataset = load_dataset(mmlu_dataset, "all", split="test")
+    dev_dataset = load_dataset(mmlu_dataset, "all", split="dev") if few_shots > 0 else None
 
     # Group dev examples by subject for few-shot prompt construction.
     dev_by_subject: dict = {}

Original file line number	Diff line number	Diff line change
`@@ -150,7 +150,9 @@ def megatron_prefill(`
`150`	`150`	`)`
`151`	`151`	`send_to_next_pipeline_rank(output.to(dtype=pp_dtype))`
`152`	`152`
`153`		`- logits = output[:, :seq_length, :].detach() if pp_last else None`
	`153`	`+ # .contiguous() is required because the slice is a view with the padded stride; the broadcast`
	`154`	`+ # below asserts contiguity when SP pads seq_length up to a multiple of TP.`
	`155`	`+ logits = output[:, :seq_length, :].detach().contiguous() if pp_last else None`
`154`	`156`
`155`	`157`	`if model.config.bf16:`
`156`	`158`	`logits_dtype = torch.bfloat16`