Skip to content

Commit cd3a9e0

Browse files
Move non-pack fixes to separate PR
Split #1501 so this PR keeps only the pack=True calibration dataloader change. The HybridModel pruning, fused-TE-spec import/export, and related fixes are now in #1518 (also targeting main).

This commit reverts to main the files that belong to #1518:
- modelopt/torch/export/plugins/{mcore_deepseek,mcore_gptoss,mcore_llama,mcore_qwen,megatron_importer,unified_export_megatron}.py
- modelopt/torch/nas/plugins/megatron.py
- modelopt/torch/prune/plugins/mcore_minitron.py
- modelopt/torch/utils/logging.py
- modelopt/torch/utils/plugins/{megatron_generate,megatron_mmlu}.py
- tools/launcher/examples/Qwen/Qwen3-8B/megatron_lm_ptq.yaml

CHANGELOG: drop the Bug Fixes entry and the 0.44 date correction (those go with #1518). Keep the pack=True New Features entry.

History is preserved — earlier commits with the dropped changes remain on this branch's log; this commit just rolls the working state back to "pack only".

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
1 parent 08ec542 commit cd3a9e0

13 files changed

Lines changed: 70 additions & 195 deletions

File tree

CHANGELOG.rst

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,7 @@ Changelog
2727
- Add NVFP4 W4A16 weight-only quantization (``w4a16_nvfp4``): FP4 weights with group_size=16, BF16 activations, no calibration forward pass required. Use ``mtq.W4A16_NVFP4_CFG`` or ``--qformat w4a16_nvfp4`` in ``hf_ptq.py``. vLLM deployment support is in progress.
2828
- Add ``pack: bool`` option to ``modelopt.torch.utils.dataset_utils.get_dataset_dataloader``. When ``True``, raw samples from each source are concatenated into a per-source token stream (separated by ``tokenizer.eos_token_id``) and sliced into uniform ``max_sample_length`` chunks, preserving the requested per-source ratio in ``num_samples``. Eliminates padding-token noise from calibration and keeps long-document context intact. Default ``False`` for backward compatibility; recommended for pruning and amax-based PTQ.
2929

30-
**Bug Fixes**
31-
32-
- Fix Megatron-Core HF importer to load fused ``TELayerNormColumnParallelLinear.layer_norm_weight`` from HF for GPT-family models (Qwen3 etc.) under ``--export-default-te-spec``. Importer now prefers per-context keys ``fused_input_layernorm`` / ``fused_pre_mlp_layernorm`` (fallback ``fused_norm`` for Nemotron-H backward compatibility); ``mcore_qwen.py`` provides the new rules. Without this fix, post-prune MMLU sat at chance.
33-
34-
0.44 (2026-05-14)
30+
0.44 (2026-05-18)
3531
^^^^^^^^^^^^^^^^^
3632

3733
**New Features**

modelopt/torch/export/plugins/mcore_deepseek.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,6 @@
4343
"linear_kv_up_proj": NameRemapping("model.layers.{}.self_attn.kv_b_proj."),
4444
"linear_proj": NameRemapping("model.layers.{}.self_attn.o_proj."),
4545
"pre_mlp_layernorm": NameRemapping("model.layers.{}.post_attention_layernorm."),
46-
# Fused TE spec (mirrors the import side). MLA has no linear_qkv so
47-
# fused_input_layernorm is inert today; fused_pre_mlp_layernorm reaches dense layers.
48-
"fused_input_layernorm": NameRemapping("model.layers.{}.input_layernorm.weight"),
49-
"fused_pre_mlp_layernorm": NameRemapping("model.layers.{}.post_attention_layernorm.weight"),
5046
# MLP for dense layers
5147
"linear_fc1": GatedMLPSlicing("model.layers.{}.mlp."),
5248
"linear_fc2": NameRemapping("model.layers.{}.mlp.down_proj."),
@@ -92,11 +88,6 @@
9288
"output_layer": NameRemapping("lm_head.", COL_TP),
9389
# Per-layer
9490
"input_layernorm": NameRemapping("model.layers.{}.input_layernorm.", REPLICATE),
95-
# Fused TE spec (TELayerNormColumnParallelLinear) — see mcore_qwen.py for rationale.
96-
# MLA has no linear_qkv so fused_input_layernorm is inert for DeepSeek today; included
97-
# for parity in case a future spec fuses the layernorm into a Q/KV projection.
98-
"fused_input_layernorm": NameRemapping("model.layers.{}.input_layernorm.weight"),
99-
"fused_pre_mlp_layernorm": NameRemapping("model.layers.{}.post_attention_layernorm.weight"),
10091
"linear_q_proj": NameRemapping("model.layers.{}.self_attn.q_proj.", COL_TP),
10192
"linear_q_down_proj": NameRemapping("model.layers.{}.self_attn.q_a_proj.", REPLICATE),
10293
"linear_q_layernorm": NameRemapping("model.layers.{}.self_attn.q_a_layernorm.", REPLICATE),

modelopt/torch/export/plugins/mcore_gptoss.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,6 @@
3131
gptoss_causal_lm_export: dict[str, CustomModuleMapping | bool] = {
3232
"word_embeddings": NameRemapping("model.embed_tokens."),
3333
"input_layernorm": NameRemapping("model.layers.{}.input_layernorm."),
34-
# MoE-only on MLP side, so fused_pre_mlp_layernorm path is unreachable.
35-
"fused_input_layernorm": NameRemapping("model.layers.{}.input_layernorm.weight"),
3634
"linear_qkv": QKVSlicing("model.layers.{}.self_attn."),
3735
"linear_proj": NameRemapping("model.layers.{}.self_attn.o_proj."),
3836
"softmax_offset": NameRemapping("model.layers.{}.self_attn.sinks"),
@@ -54,10 +52,6 @@
5452
gptoss_causal_lm_import: dict[str, CustomModuleMapping | bool] = {
5553
"word_embeddings": NameRemapping("model.embed_tokens.", COL_TP),
5654
"input_layernorm": NameRemapping("model.layers.{}.input_layernorm.", REPLICATE),
57-
# Fused TE spec (TELayerNormColumnParallelLinear) — see mcore_qwen.py for rationale.
58-
# gpt-oss is MoE-only on the MLP side (no layer.mlp.linear_fc1), so the importer's
59-
# fused_pre_mlp_layernorm path is unreachable; only fused_input_layernorm is wired.
60-
"fused_input_layernorm": NameRemapping("model.layers.{}.input_layernorm.weight"),
6155
"linear_qkv": QKVMerging("model.layers.{}.self_attn.", COL_TP),
6256
"linear_proj": NameRemapping("model.layers.{}.self_attn.o_proj.", ROW_TP),
6357
"softmax_offset": NameRemapping("model.layers.{}.self_attn.sinks", COL_TP),

modelopt/torch/export/plugins/mcore_llama.py

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -37,13 +37,11 @@
3737
llama_causal_lm_export: dict[str, CustomModuleMapping] = {
3838
"word_embeddings": NameRemapping("model.embed_tokens."),
3939
"input_layernorm": NameRemapping("model.layers.{}.input_layernorm."),
40-
"fused_input_layernorm": NameRemapping("model.layers.{}.input_layernorm.weight"),
4140
"linear_qkv": QKVSlicing("model.layers.{}.self_attn."),
4241
"linear_proj": NameRemapping("model.layers.{}.self_attn.o_proj."),
4342
# KV cache quant export
4443
"core_attention": SelfAttentionScaling("model.layers.{}.self_attn."),
4544
"pre_mlp_layernorm": NameRemapping("model.layers.{}.post_attention_layernorm."),
46-
"fused_pre_mlp_layernorm": NameRemapping("model.layers.{}.post_attention_layernorm.weight"),
4745
"linear_fc1": GatedMLPSlicing("model.layers.{}.mlp."),
4846
"linear_fc2": NameRemapping("model.layers.{}.mlp.down_proj."),
4947
"final_layernorm": NameRemapping("model.norm."),
@@ -53,8 +51,6 @@
5351
llama4_causal_lm_export: dict[str, CustomModuleMapping | bool] = {
5452
"word_embeddings": NameRemapping("language_model.model.embed_tokens."),
5553
"input_layernorm": NameRemapping("language_model.model.layers.{}.input_layernorm."),
56-
# MoE-only on MLP side, so fused_pre_mlp_layernorm path is unreachable.
57-
"fused_input_layernorm": NameRemapping("language_model.model.layers.{}.input_layernorm.weight"),
5854
# self_attn
5955
"linear_qkv": QKVSlicing("language_model.model.layers.{}.self_attn."),
6056
"linear_proj": NameRemapping("language_model.model.layers.{}.self_attn.o_proj."),
@@ -154,12 +150,9 @@
154150
llama_causal_lm_import: dict[str, CustomModuleMapping] = {
155151
"word_embeddings": NameRemapping("model.embed_tokens.", COL_TP),
156152
"input_layernorm": NameRemapping("model.layers.{}.input_layernorm.", REPLICATE),
157-
# Fused TE spec (TELayerNormColumnParallelLinear) — see mcore_qwen.py for rationale.
158-
"fused_input_layernorm": NameRemapping("model.layers.{}.input_layernorm.weight"),
159153
"linear_qkv": QKVMerging("model.layers.{}.self_attn.", COL_TP),
160154
"linear_proj": NameRemapping("model.layers.{}.self_attn.o_proj.", ROW_TP),
161155
"pre_mlp_layernorm": NameRemapping("model.layers.{}.post_attention_layernorm.", REPLICATE),
162-
"fused_pre_mlp_layernorm": NameRemapping("model.layers.{}.post_attention_layernorm.weight"),
163156
"linear_fc1": GatedMLPMerging("model.layers.{}.mlp.", COL_TP),
164157
"linear_fc2": NameRemapping("model.layers.{}.mlp.down_proj.", ROW_TP),
165158
"final_layernorm": NameRemapping("model.norm.", REPLICATE),
@@ -169,10 +162,6 @@
169162
llama4_causal_lm_import: dict[str, CustomModuleMapping | bool] = {
170163
"word_embeddings": NameRemapping("language_model.model.embed_tokens.", COL_TP),
171164
"input_layernorm": NameRemapping("language_model.model.layers.{}.input_layernorm.", REPLICATE),
172-
# Fused TE spec (TELayerNormColumnParallelLinear) — see mcore_qwen.py for rationale.
173-
# Llama4 is MoE-only on the MLP side (no layer.mlp.linear_fc1), so the importer's
174-
# fused_pre_mlp_layernorm path is unreachable; only fused_input_layernorm is wired.
175-
"fused_input_layernorm": NameRemapping("language_model.model.layers.{}.input_layernorm.weight"),
176165
"linear_qkv": QKVMerging("language_model.model.layers.{}.self_attn.", COL_TP),
177166
"linear_proj": NameRemapping("language_model.model.layers.{}.self_attn.o_proj.", ROW_TP),
178167
"pre_mlp_layernorm": NameRemapping(

modelopt/torch/export/plugins/mcore_qwen.py

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -35,17 +35,12 @@
3535
"output_layer": NameRemapping("lm_head.", COL_TP),
3636
# Attention
3737
"input_layernorm": NameRemapping("model.layers.{}.input_layernorm.", REPLICATE),
38-
# Fused TE spec (TELayerNormColumnParallelLinear): the LayerNorm weight lives on
39-
# linear_qkv.layer_norm_weight, loaded directly from the HF norm tensor (no `.weight` suffix
40-
# appended since the value is a Parameter, not a sub-module).
41-
"fused_input_layernorm": NameRemapping("model.layers.{}.input_layernorm.weight"),
4238
"linear_qkv": QKVMerging("model.layers.{}.self_attn.", COL_TP),
4339
"linear_proj": NameRemapping("model.layers.{}.self_attn.o_proj.", ROW_TP),
4440
"q_layernorm": NameRemapping("model.layers.{}.self_attn.q_norm.", REPLICATE),
4541
"k_layernorm": NameRemapping("model.layers.{}.self_attn.k_norm.", REPLICATE),
4642
# MLP
4743
"pre_mlp_layernorm": NameRemapping("model.layers.{}.post_attention_layernorm.", REPLICATE),
48-
"fused_pre_mlp_layernorm": NameRemapping("model.layers.{}.post_attention_layernorm.weight"),
4944
"linear_fc1": GatedMLPMerging("model.layers.{}.mlp.", COL_TP),
5045
"linear_fc2": NameRemapping("model.layers.{}.mlp.down_proj.", ROW_TP),
5146
# MoE
@@ -61,14 +56,12 @@
6156
"output_layer": NameRemapping("lm_head."),
6257
# Attention
6358
"input_layernorm": NameRemapping("model.layers.{}.input_layernorm."),
64-
"fused_input_layernorm": NameRemapping("model.layers.{}.input_layernorm.weight"),
6559
"linear_qkv": QKVSlicing("model.layers.{}.self_attn."),
6660
"linear_proj": NameRemapping("model.layers.{}.self_attn.o_proj."),
6761
"q_layernorm": NameRemapping("model.layers.{}.self_attn.q_norm."),
6862
"k_layernorm": NameRemapping("model.layers.{}.self_attn.k_norm."),
6963
# MLP
7064
"pre_mlp_layernorm": NameRemapping("model.layers.{}.post_attention_layernorm."),
71-
"fused_pre_mlp_layernorm": NameRemapping("model.layers.{}.post_attention_layernorm.weight"),
7265
"linear_fc1": GatedMLPSlicing("model.layers.{}.mlp."),
7366
"linear_fc2": NameRemapping("model.layers.{}.mlp.down_proj."),
7467
# MoE
@@ -83,12 +76,10 @@
8376
"output_layer": NameRemapping("lm_head.", COL_TP),
8477
# Attention
8578
"input_layernorm": NameRemapping("model.layers.{}.input_layernorm.", REPLICATE),
86-
"fused_input_layernorm": NameRemapping("model.layers.{}.input_layernorm.weight"),
8779
"linear_qkv": QKVMerging("model.layers.{}.self_attn.", COL_TP),
8880
"linear_proj": NameRemapping("model.layers.{}.self_attn.o_proj.", ROW_TP),
8981
# MLP
9082
"pre_mlp_layernorm": NameRemapping("model.layers.{}.post_attention_layernorm.", REPLICATE),
91-
"fused_pre_mlp_layernorm": NameRemapping("model.layers.{}.post_attention_layernorm.weight"),
9283
"linear_fc1": GatedMLPMerging("model.layers.{}.mlp.", COL_TP),
9384
"linear_fc2": NameRemapping("model.layers.{}.mlp.down_proj.", ROW_TP),
9485
}
@@ -99,12 +90,10 @@
9990
"output_layer": NameRemapping("lm_head."),
10091
# Attention
10192
"input_layernorm": NameRemapping("model.layers.{}.input_layernorm."),
102-
"fused_input_layernorm": NameRemapping("model.layers.{}.input_layernorm.weight"),
10393
"linear_qkv": QKVSlicing("model.layers.{}.self_attn."),
10494
"linear_proj": NameRemapping("model.layers.{}.self_attn.o_proj."),
10595
# MLP
10696
"pre_mlp_layernorm": NameRemapping("model.layers.{}.post_attention_layernorm."),
107-
"fused_pre_mlp_layernorm": NameRemapping("model.layers.{}.post_attention_layernorm.weight"),
10897
"linear_fc1": GatedMLPSlicing("model.layers.{}.mlp."),
10998
"linear_fc2": NameRemapping("model.layers.{}.mlp.down_proj."),
11099
}

modelopt/torch/export/plugins/megatron_importer.py

Lines changed: 13 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -238,9 +238,8 @@ def _gated_mlp_merging(
238238
else:
239239
prefix = prefix.replace("model", "mtp")
240240

241-
module_state_dict = module.state_dict()
242-
weight = module_state_dict.get("weight", None)
243-
weight_scale = module_state_dict.get("weight_quantizer._scale", None)
241+
weight = module.state_dict().get("weight", None)
242+
weight_scale = module.state_dict().get("weight_quantizer._scale", None)
244243

245244
state_dict = {}
246245

@@ -274,15 +273,6 @@ def _gated_mlp_merging(
274273
else:
275274
state_dict["weight"] = tensor.to(self.dtype).to(device=weight.device)
276275

277-
# Preserve the fused LayerNorm weight + TE _extra_state already on the module so
278-
# the strict load_state_dict below doesn't fail for TELayerNormColumnParallelLinear
279-
# (fused under --export-default-te-spec). The actual HF norm tensor is loaded
280-
# separately via the `fused_pre_mlp_layernorm` rule.
281-
layer_norm_weight = module_state_dict.get("layer_norm_weight", None)
282-
if layer_norm_weight is not None:
283-
state_dict["layer_norm_weight"] = layer_norm_weight
284-
state_dict["_extra_state"] = module_state_dict.get("_extra_state")
285-
286276
module.load_state_dict(state_dict)
287277

288278
def _grouped_mlp_merging(
@@ -443,10 +433,7 @@ def _qkv_merging(
443433
layer_norm_weight = module_state_dict.get("layer_norm_weight", None)
444434
if layer_norm_weight is not None:
445435
state_dict["layer_norm_weight"] = layer_norm_weight
446-
# Preserve the TE metadata struct (FP8 amax history, recipe version, etc.) —
447-
# `load_state_dict(..., strict=True)` requires the key, but blanking it could
448-
# zero out per-module FP8 bookkeeping on TE versions that populate it.
449-
state_dict["_extra_state"] = module_state_dict.get("_extra_state")
436+
state_dict["_extra_state"] = None # for TE modules require _extra_state key
450437

451438
module.load_state_dict(state_dict)
452439

@@ -612,32 +599,14 @@ def _import_transformer_layer(self, layer, layer_id, layer_pbar, is_mtp: bool =
612599
)
613600

614601
# TE spec: input_layernorm is fused into linear_qkv (TELayerNormColumnParallelLinear).
615-
# Prefer the per-context key (`fused_input_layernorm`); fall back to the legacy
616-
# single-key `fused_norm` for Nemotron-H style (one norm shared across slots).
617-
# Missing both is a plugin misconfig — raise rather than silently random-init.
602+
# Load the fused layer_norm_weight from the HF norm path.
618603
if (
619604
isinstance(layer.input_layernorm, IdentityOp)
620605
and hasattr(attention, "linear_qkv")
621606
and hasattr(attention.linear_qkv, "layer_norm_weight")
607+
and "fused_norm" in self.rules
622608
):
623-
fused_key = (
624-
"fused_input_layernorm"
625-
if "fused_input_layernorm" in self.rules
626-
else "fused_norm"
627-
)
628-
if fused_key not in self.rules:
629-
# Branch only fires when model uses fused TELayerNormColumnParallelLinear,
630-
# so missing rule is unambiguously a plugin misconfiguration; raise so it
631-
# doesn't silently ship a chance-accuracy checkpoint.
632-
raise KeyError(
633-
f"{self.arch} uses fused TELayerNormColumnParallelLinear for "
634-
"attention but neither `fused_input_layernorm` nor legacy "
635-
"`fused_norm` is in its import mapping; `linear_qkv.layer_norm_weight` "
636-
"would be left at random init. Add "
637-
'`fused_input_layernorm: NameRemapping("...input_layernorm.weight")` '
638-
f"to the {self.arch} import mapping."
639-
)
640-
self.rules[fused_key](
609+
self.rules["fused_norm"](
641610
attention.linear_qkv.layer_norm_weight, layer_id, is_mtp=is_mtp
642611
)
643612

@@ -738,27 +707,14 @@ def _import_transformer_layer(self, layer, layer_id, layer_pbar, is_mtp: bool =
738707
self.rules["linear_fc2"](layer.mlp.linear_fc2, layer_id, is_mtp=is_mtp)
739708

740709
# TE spec: pre_mlp_layernorm is fused into linear_fc1
741-
# (TELayerNormColumnParallelLinear). See input_layernorm path above for the
742-
# rule-key fallback rationale.
743-
if isinstance(layer.pre_mlp_layernorm, IdentityOp) and hasattr(
744-
layer.mlp.linear_fc1, "layer_norm_weight"
710+
# (TELayerNormColumnParallelLinear).
711+
# Load the fused layer_norm_weight from the HF norm path.
712+
if (
713+
isinstance(layer.pre_mlp_layernorm, IdentityOp)
714+
and hasattr(layer.mlp.linear_fc1, "layer_norm_weight")
715+
and "fused_norm" in self.rules
745716
):
746-
fused_key = (
747-
"fused_pre_mlp_layernorm"
748-
if "fused_pre_mlp_layernorm" in self.rules
749-
else "fused_norm"
750-
)
751-
if fused_key not in self.rules:
752-
raise KeyError(
753-
f"{self.arch} uses fused TELayerNormColumnParallelLinear for "
754-
"MLP but neither `fused_pre_mlp_layernorm` nor legacy "
755-
"`fused_norm` is in its import mapping; "
756-
"`linear_fc1.layer_norm_weight` would be left at random init. "
757-
"Add `fused_pre_mlp_layernorm: NameRemapping("
758-
'"...post_attention_layernorm.weight")` '
759-
f"to the {self.arch} import mapping."
760-
)
761-
self.rules[fused_key](
717+
self.rules["fused_norm"](
762718
layer.mlp.linear_fc1.layer_norm_weight, layer_id, is_mtp=is_mtp
763719
)
764720

0 commit comments

Comments
 (0)