Skip to content

Commit 8eec6d4

Browse files
authored
Support EP mcore import for TE Spec and Fix mamba moe config (#1342)
### What does this PR do?

Type of change: Bug fix

- Enable EP (expert parallelism) import from HF to MCore when using the TE spec.
- Fix a bug in the Mamba MoE config, which did not skip attention layers properly in MCore (MCore uses different names for attention layers than HF).
- Add a getter for the quant config (used in the MLM modelopt examples to get quant cfg fields).

### Usage

```bash
# In Megatron-LM/examples/post_training/modelopt
MLM_EXTRA_ARGS="--export-default-te-spec --trust-remote-code --moe-router-dtype fp32" EP=4 HF_MODEL_CKPT=</path/to/hf> MLM_MODEL_SAVE=<save/path> ./convert.sh nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
```

### Testing

<!-- Mention how have you tested your change if applicable. -->

### Before your PR is "*Ready for review*"

Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md) and your commits are signed (`git commit -s -S`).

Make sure you read and follow the [Security Best Practices](https://github.com/NVIDIA/Model-Optimizer/blob/main/SECURITY.md#security-coding-practices-for-contributors) (e.g. avoiding hardcoded `trust_remote_code=True`, `torch.load(..., weights_only=False)`, `pickle`, etc.).

- Is this change backward compatible?: ✅ / ❌ / N/A <!--- If ❌, explain why. -->
- If you copied code from any other sources or added a new PIP dependency, did you follow guidance in `CONTRIBUTING.md`: ✅ / ❌ / N/A <!--- Mandatory -->
- Did you write any new necessary tests?: ✅ / ❌ / N/A <!--- Mandatory for new features or examples. -->
- Did you update [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?: ✅ / ❌ / N/A <!--- Only for new features, API changes, critical bug fixes or backward incompatible changes. -->

### Additional Information

<!-- E.g. related issue. -->

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

* **Bug Fixes**
  * Corrected expert-slice assignment so each expert-parallel rank loads the proper expert slice.
  * Improved detection of pipeline-parallel layer indices in submodule names.
* **Improvements**
  * Relaxed constraints between local and global expert counts for grouped-local-expert imports.
  * Added typed helpers for managing quantization configuration entries and expanded quantizer disable patterns.
  * Exporter now accepts an additional hybrid model type when available.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: Jennifer Chen <jennifchen@nvidia.com>
1 parent 6d33078 commit 8eec6d4

3 files changed

Lines changed: 32 additions & 11 deletions

File tree

modelopt/torch/export/plugins/megatron_importer.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
has_mcore = False
4040
with import_plugin("megatron"):
4141
from megatron.core.parallel_state import (
42+
get_expert_model_parallel_rank,
4243
get_expert_tensor_parallel_world_size,
4344
get_tensor_model_parallel_world_size,
4445
)
@@ -294,9 +295,13 @@ def _grouped_mlp_merging(
294295
assert module.num_gemms == num_local_experts, (
295296
"num_gemms must be equal to num_local_experts in TEGroupedMLP"
296297
)
297-
for expert_id in range(init_expert_id, init_expert_id + num_local_experts):
298-
tensor = self._get_safetensor(prefix.format(expert_id) + ".weight")
299-
state_dict[f"weight{expert_id}"] = tensor
298+
# init_expert_id is the global index of this rank's first local expert.
299+
# TEGroupedMLP stores weights as weight0..weight{num_local-1} locally, so we
300+
# map global expert_id -> local slot (expert_id - init_expert_id).
301+
for local_id in range(num_local_experts):
302+
global_expert_id = init_expert_id + local_id
303+
tensor = self._get_safetensor(prefix.format(global_expert_id) + ".weight")
304+
state_dict[f"weight{local_id}"] = tensor
300305
# TODO handle weight_scale
301306

302307
module.load_state_dict(state_dict)
@@ -653,10 +658,13 @@ def _import_transformer_layer(self, layer, layer_id, layer_pbar, is_mtp: bool =
653658
layer_pbar.set_description("Importing MoE grouped local experts")
654659
num_local_experts = experts.num_local_experts
655660
num_global_experts = experts.config.num_moe_experts
656-
assert num_local_experts == num_global_experts, (
657-
"num_local_experts must be equal to num_global_experts during MoE import"
661+
assert num_global_experts % num_local_experts == 0, (
662+
"num_global_experts must be divisible by num_local_experts "
663+
"during MoE import"
658664
)
659-
init_index = 0
665+
# Each EP rank owns a contiguous slice of global experts:
666+
# [ep_rank * num_local_experts, (ep_rank + 1) * num_local_experts).
667+
init_index = get_expert_model_parallel_rank() * num_local_experts
660668

661669
self.rules["experts.linear_fc1"](
662670
experts.linear_fc1,

modelopt/torch/export/unified_export_megatron.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,11 @@
7272
with import_plugin("megatron"):
7373
from megatron.core.models.gpt import GPTModel
7474
from megatron.core.models.mamba import MambaModel
75+
76+
try:
77+
from megatron.core.models.hybrid.hybrid_model import HybridModel
78+
except ImportError:
79+
HybridModel = MambaModel
7580
from megatron.core.models.multimodal.llava_model import LLaVAModel
7681
from megatron.core.parallel_state import (
7782
get_pipeline_model_parallel_rank,
@@ -121,7 +126,7 @@ def __init__(
121126
moe_router_dtype: str | None = None,
122127
):
123128
"""Create a GPTModel exporter instance."""
124-
if not isinstance(model, (GPTModel, MambaModel, LLaVAModel)):
129+
if not isinstance(model, (GPTModel, MambaModel, HybridModel, LLaVAModel)):
125130
raise ValueError("Input to GPTModelExport must be a megatron.core.models.GPTModel!")
126131

127132
self._state_dict = OrderedDict()

modelopt/torch/quantization/config.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -236,10 +236,18 @@ def find_quant_cfg_entry_by_path(
236236
_mamba_moe_disabled_quantizer_cfg: list[QuantizerCfgEntry] = [
237237
{"quantizer_name": "*fc1_latent_proj*", "enable": False}, # Skip Latent MOE
238238
{"quantizer_name": "*fc2_latent_proj*", "enable": False}, # Skip Latent MOE
239-
{"quantizer_name": "*q_proj*", "enable": False}, # Skip QKV Linear
240-
{"quantizer_name": "*k_proj*", "enable": False}, # Skip QKV Linear
241-
{"quantizer_name": "*v_proj*", "enable": False}, # Skip QKV Linear
242-
{"quantizer_name": "*o_proj*", "enable": False}, # Skip QKV Output Projection
239+
{"quantizer_name": "*q_proj*", "enable": False}, # Skip QKV Linear (HF naming)
240+
{"quantizer_name": "*k_proj*", "enable": False}, # Skip QKV Linear (HF naming)
241+
{"quantizer_name": "*v_proj*", "enable": False}, # Skip QKV Linear (HF naming)
242+
{"quantizer_name": "*o_proj*", "enable": False}, # Skip QKV Output Projection (HF naming)
243+
{
244+
"quantizer_name": "*self_attention.linear_qkv*",
245+
"enable": False,
246+
}, # Skip QKV Linear (Mcore naming)
247+
{
248+
"quantizer_name": "*self_attention.linear_proj*",
249+
"enable": False,
250+
}, # Skip QKV Output Projection (Mcore naming)
243251
]
244252

245253
INT8_DEFAULT_CFG = {

0 commit comments

Comments
 (0)