Support EP mcore import for TE Spec and Fix mamba moe config (#1342)

jenchen13 · web-flow · commit 8eec6d4459f0 · 2026-04-28T06:39:41.000-07:00
### What does this PR do? Type of change: Bug fix - Enable EP (expert parallelism) import from HF to MCore when using the TE Spec - Fix a bug in the Mamba MoE config, which did not skip attention layers properly in MCore (MCore uses different names for attention layers than HF) - Add a getter for the Quant Config (used in the MLM modelopt examples to read quant config fields) ### Usage ```bash # In Megatron-LM/examples/post_training/modelopt MLM_EXTRA_ARGS="--export-default-te-spec --trust-remote-code --moe-router-dtype fp32" EP=4 HF_MODEL_CKPT=</path/to/hf> MLM_MODEL_SAVE=<save/path> ./convert.sh nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 ``` ### Testing  ### Before your PR is "*Ready for review*" Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md) and your commits are signed (`git commit -s -S`). Make sure you read and follow the [Security Best Practices](https://github.com/NVIDIA/Model-Optimizer/blob/main/SECURITY.md#security-coding-practices-for-contributors) (e.g. avoiding hardcoded `trust_remote_code=True`, `torch.load(..., weights_only=False)`, `pickle`, etc.). - Is this change backward compatible?: ✅ / ❌ / N/A  - If you copied code from any other sources or added a new PIP dependency, did you follow guidance in `CONTRIBUTING.md`: ✅ / ❌ / N/A  - Did you write any new necessary tests?: ✅ / ❌ / N/A  - Did you update [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?: ✅ / ❌ / N/A  ### Additional Information   ## Summary by CodeRabbit * **Bug Fixes** * Corrected expert-slice assignment so each expert-parallel rank loads the proper expert slice. * Improved detection of pipeline-parallel layer indices in submodule names. * **Improvements** * Relaxed constraints between local and global expert counts for grouped-local-expert imports. * Added typed helpers for managing quantization configuration entries and expanded quantizer disable patterns.
* Exporter now accepts an additional hybrid model type when available.  --------- Signed-off-by: Jennifer Chen <jennifchen@nvidia.com>
diff --git a/modelopt/torch/export/plugins/megatron_importer.py b/modelopt/torch/export/plugins/megatron_importer.py
@@ -39,6 +39,7 @@
 has_mcore = False
 with import_plugin("megatron"):
     from megatron.core.parallel_state import (
+        get_expert_model_parallel_rank,
         get_expert_tensor_parallel_world_size,
         get_tensor_model_parallel_world_size,
     )
@@ -294,9 +295,13 @@ def _grouped_mlp_merging(
         assert module.num_gemms == num_local_experts, (
             "num_gemms must be equal to num_local_experts in TEGroupedMLP"
         )
-        for expert_id in range(init_expert_id, init_expert_id + num_local_experts):
-            tensor = self._get_safetensor(prefix.format(expert_id) + ".weight")
-            state_dict[f"weight{expert_id}"] = tensor
+        # init_expert_id is the global index of this rank's first local expert.
+        # TEGroupedMLP stores weights as weight0..weight{num_local-1} locally, so we
+        # map global expert_id -> local slot (expert_id - init_expert_id).
+        for local_id in range(num_local_experts):
+            global_expert_id = init_expert_id + local_id
+            tensor = self._get_safetensor(prefix.format(global_expert_id) + ".weight")
+            state_dict[f"weight{local_id}"] = tensor
             # TODO handle weight_scale
 
         module.load_state_dict(state_dict)
@@ -653,10 +658,13 @@ def _import_transformer_layer(self, layer, layer_id, layer_pbar, is_mtp: bool =
                         layer_pbar.set_description("Importing MoE grouped local experts")
                         num_local_experts = experts.num_local_experts
                         num_global_experts = experts.config.num_moe_experts
-                        assert num_local_experts == num_global_experts, (
-                            "num_local_experts must be equal to num_global_experts during MoE import"
+                        assert num_global_experts % num_local_experts == 0, (
+                            "num_global_experts must be divisible by num_local_experts "
+                            "during MoE import"
                         )
-                        init_index = 0
+                        # Each EP rank owns a contiguous slice of global experts:
+                        # [ep_rank * num_local_experts, (ep_rank + 1) * num_local_experts).
+                        init_index = get_expert_model_parallel_rank() * num_local_experts
 
                         self.rules["experts.linear_fc1"](
                             experts.linear_fc1,
diff --git a/modelopt/torch/export/unified_export_megatron.py b/modelopt/torch/export/unified_export_megatron.py
@@ -72,6 +72,11 @@
 with import_plugin("megatron"):
     from megatron.core.models.gpt import GPTModel
     from megatron.core.models.mamba import MambaModel
+
+    try:
+        from megatron.core.models.hybrid.hybrid_model import HybridModel
+    except ImportError:
+        HybridModel = MambaModel
     from megatron.core.models.multimodal.llava_model import LLaVAModel
     from megatron.core.parallel_state import (
         get_pipeline_model_parallel_rank,
@@ -121,7 +126,7 @@ def __init__(
         moe_router_dtype: str | None = None,
     ):
         """Create a GPTModel exporter instance."""
-        if not isinstance(model, (GPTModel, MambaModel, LLaVAModel)):
+        if not isinstance(model, (GPTModel, MambaModel, HybridModel, LLaVAModel)):
             raise ValueError("Input to GPTModelExport must be a megatron.core.models.GPTModel!")
 
         self._state_dict = OrderedDict()
diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py
@@ -236,10 +236,18 @@ def find_quant_cfg_entry_by_path(
 _mamba_moe_disabled_quantizer_cfg: list[QuantizerCfgEntry] = [
     {"quantizer_name": "*fc1_latent_proj*", "enable": False},  # Skip Latent MOE
     {"quantizer_name": "*fc2_latent_proj*", "enable": False},  # Skip Latent MOE
-    {"quantizer_name": "*q_proj*", "enable": False},  # Skip QKV Linear
-    {"quantizer_name": "*k_proj*", "enable": False},  # Skip QKV Linear
-    {"quantizer_name": "*v_proj*", "enable": False},  # Skip QKV Linear
-    {"quantizer_name": "*o_proj*", "enable": False},  # Skip QKV Output Projection
+    {"quantizer_name": "*q_proj*", "enable": False},  # Skip QKV Linear (HF naming)
+    {"quantizer_name": "*k_proj*", "enable": False},  # Skip QKV Linear (HF naming)
+    {"quantizer_name": "*v_proj*", "enable": False},  # Skip QKV Linear (HF naming)
+    {"quantizer_name": "*o_proj*", "enable": False},  # Skip QKV Output Projection (HF naming)
+    {
+        "quantizer_name": "*self_attention.linear_qkv*",
+        "enable": False,
+    },  # Skip QKV Linear (Mcore naming)
+    {
+        "quantizer_name": "*self_attention.linear_proj*",
+        "enable": False,
+    },  # Skip QKV Output Projection (Mcore naming)
 ]
 
 INT8_DEFAULT_CFG = {