feat(utils): add pack mode to get_dataset_dataloader

kevalmorabia97 · kevalmorabia97 · commit b70423f3e172 · 2026-05-16T04:10:21.000-07:00
`pack=False` (default) tokenizes each calibration sample with
`padding=True, truncation=True, max_length=...` — on long-document
datasets like cnn_dailymail that discards most of each article and
pads short samples up to the max, feeding calibration heavily padded
and context-impoverished batches.

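`pack=True` concatenates the token streams of all raw samples
(separated by `tokenizer.eos_token_id` when the tokenizer defines one)
and slices the result into uniform `max_sample_length` chunks. Long
documents are no longer cut off at `max_sample_length` (they continue
across consecutive chunks), no padding tokens are added, and every
chunk is filled entirely with real text.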

Measured on Qwen3-8B minitron prune to 30L/3584/11776
(cnn_dailymail, 256 samples, seq_length 512):

  pack=False:  MMLU 0.486
  pack=True:   MMLU 0.544   (+5.8 pts; Megatron-Bridge ref 0.563)

Default stays False for back-compat with a `warn_rank_0` nudging
callers toward `pack=True`; downstream examples (hf_ptq.py, vlm_ptq.py,
Megatron-LM prune.py / quantize.py) can opt in incrementally.

Tests: extend `_FakeTokenizer` with `encode()` + `eos_token_id` and
flip `TestGetDatasetDataloaderBlending` / HF tiny-dataset tests to
`pack=True`.

CHANGELOG: pack entry under New Features; fused-TE-spec import fix
entry under Bug Fixes (covering Qwen3-style attention/MLP norm
loading via the new per-context rule keys).

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -25,8 +25,13 @@ Changelog
 - Add ``--cast_mxfp4_to_nvfp4`` flag to ``examples/llm_ptq/hf_ptq.py`` for closed-form, bit-exact MXFP4 → NVFP4 weight conversion. Supports the GPT-OSS family (``openai/gpt-oss-20b``, ``openai/gpt-oss-120b``). See `examples/llm_ptq/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/llm_ptq#mxfp4--nvfp4-cast-for-gpt-oss>`__ for usage.
 - DeepSeek PTQ (``examples/deepseek/ptq.py``) now defaults to native top-k calibration with post-hoc per-layer peer-max sync of expert ``input_quantizer.amax``; the all-experts path is preserved behind ``--calib_all_experts``.
 - Add NVFP4 W4A16 weight-only quantization (``w4a16_nvfp4``): FP4 weights with group_size=16, BF16 activations, no calibration forward pass required. Use ``mtq.W4A16_NVFP4_CFG`` or ``--qformat w4a16_nvfp4`` in ``hf_ptq.py``. vLLM deployment support is in progress.
+- Add ``pack: bool`` option to ``modelopt.torch.utils.dataset_utils.get_dataset_dataloader``. When ``True``, raw samples are concatenated into a single token stream (separated by ``tokenizer.eos_token_id``) and sliced into uniform ``max_sample_length`` chunks, instead of tokenizing each sample with truncate-and-pad. Eliminates padding-token noise from calibration and keeps long-document context intact. Default ``False`` for backward compatibility (with a warning); recommended for pruning and amax-based PTQ.
 
-0.44 (2026-05-18)
+**Bug Fixes**
+
+- Fix Megatron-Core HF importer to load fused ``TELayerNormColumnParallelLinear.layer_norm_weight`` from HF for GPT-family models (Qwen3 etc.) under ``--export-default-te-spec``. Importer now prefers per-context keys ``fused_input_layernorm`` / ``fused_pre_mlp_layernorm`` (fallback ``fused_norm`` for Nemotron-H backward compatibility); ``mcore_qwen.py`` provides the new rules. Without this fix, post-prune MMLU sat at chance.
+
+0.44 (2026-05-14)
 ^^^^^^^^^^^^^^^^^
 
 **New Features**
diff --git a/modelopt/torch/export/plugins/mcore_qwen.py b/modelopt/torch/export/plugins/mcore_qwen.py
@@ -81,10 +81,12 @@
     "output_layer": NameRemapping("lm_head.", COL_TP),
     # Attention
     "input_layernorm": NameRemapping("model.layers.{}.input_layernorm.", REPLICATE),
+    "fused_input_layernorm": NameRemapping("model.layers.{}.input_layernorm.weight"),
     "linear_qkv": QKVMerging("model.layers.{}.self_attn.", COL_TP),
     "linear_proj": NameRemapping("model.layers.{}.self_attn.o_proj.", ROW_TP),
     # MLP
     "pre_mlp_layernorm": NameRemapping("model.layers.{}.post_attention_layernorm.", REPLICATE),
+    "fused_pre_mlp_layernorm": NameRemapping("model.layers.{}.post_attention_layernorm.weight"),
     "linear_fc1": GatedMLPMerging("model.layers.{}.mlp.", COL_TP),
     "linear_fc2": NameRemapping("model.layers.{}.mlp.down_proj.", ROW_TP),
 }
diff --git a/modelopt/torch/prune/plugins/mcore_minitron.py b/modelopt/torch/prune/plugins/mcore_minitron.py
@@ -37,7 +37,6 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from megatron.core.extensions.transformer_engine import TELayerNormColumnParallelLinear
-from megatron.core.models.mamba.mamba_model import MambaModel
 from megatron.core.parallel_state import (
     get_pipeline_model_parallel_group,
     get_pipeline_model_parallel_rank,
@@ -174,6 +173,20 @@ def drop_mcore_language_model_layers(model: nn.Module, *, layers_to_drop: list[i
     model.config.num_layers = new_num_layers
 
 
+def _get_hybrid_pattern_key(model: nn.Module) -> str | None:
+    """Return the name of the attribute holding the hybrid block pattern, or None if absent.
+
+    Probes ``hybrid_override_pattern`` first, then ``hybrid_layer_pattern``, and returns the
+    first name for which ``hasattr`` succeeds, so an attribute set to ``None`` still counts.
+    ``MambaModel`` is expected to expose the former and the newer ``HybridModel`` parent the
+    latter; probing attributes avoids isinstance checks tied to one MCore class hierarchy.
+    """
+    for attr in ("hybrid_override_pattern", "hybrid_layer_pattern"):
+        if hasattr(model, attr):
+            return attr
+    return None
+
+
 def _rprint(*renderables: Any) -> None:
     """Render rich renderables and print on rank 0 only."""
     buf = io.StringIO()
@@ -368,13 +381,9 @@ def run_search(self) -> None:
         self._prune(export_config, prune_depth=True)
 
         # TODO: Rename to hybrid_layer_pattern after MCore 0.17 and nemo:26.04 is released (for M-LM PR #3377)
-        # Update hybrid_override_pattern if pruning is done on a hybrid model
-        if isinstance(self.model, MambaModel):
-            hybrid_key = (
-                "hybrid_override_pattern"
-                if hasattr(self.model, "hybrid_override_pattern")
-                else "hybrid_layer_pattern"
-            )
+        # Update hybrid_override_pattern if pruning is done on a hybrid model.
+        hybrid_key = _get_hybrid_pattern_key(self.model)
+        if hybrid_key is not None:
             print_rank_0(f"Original {hybrid_key}: {getattr(self.model, hybrid_key)}")
             new_num_layers = self.model.config.num_layers
             assert self.sorted_layers is not None
@@ -684,14 +693,9 @@ def _compute_candidate_metrics(self, ss_config: dict, max_num_layers: int) -> di
         model = self.model
         active_metric_keys = self.constraints.keys() & _METRIC_CONSTRAINTS
 
-        # Get hybrid layer pattern for MambaModel (None for pure GPT)
         hybrid_layer_pattern: str | None = None
-        if isinstance(model, MambaModel):
-            hybrid_key = (
-                "hybrid_override_pattern"
-                if hasattr(self.model, "hybrid_override_pattern")
-                else "hybrid_layer_pattern"
-            )
+        hybrid_key = _get_hybrid_pattern_key(model)
+        if hybrid_key is not None:
             hybrid_layer_pattern = getattr(model, hybrid_key)
 
         # If depth pruning on a hybrid model, filter the pattern to only the kept layers.
diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py
@@ -29,6 +29,8 @@
 from torch.utils.data import DataLoader
 from tqdm import tqdm
 
+from modelopt.torch.utils.logging import warn_rank_0
+
 if TYPE_CHECKING:
     from transformers import PreTrainedTokenizerBase
 
@@ -521,6 +523,7 @@ def get_dataset_dataloader(
     device: torch.device | None = None,
     include_labels: bool = False,
     apply_chat_template: bool = False,
+    pack: bool = False,
 ) -> DataLoader:
     """Get a dataloader with the dataset name and tokenizer of the target model.
 
@@ -537,6 +540,15 @@ def get_dataset_dataloader(
         include_labels: Whether to include labels in the dataloader.
         apply_chat_template: Whether to apply the chat template to the samples
             (if supported by the dataset).
+        pack: If True, pack tokens from all raw samples into a contiguous stream and slice
+            into uniform-length sequences of ``max_sample_length`` (separated by
+            ``tokenizer.eos_token_id`` when set). Avoids the per-sample truncate-and-pad waste
+            of the default path: long documents stay intact, short ones don't introduce
+            padding noise. Recommended for pruning calibration and amax-based PTQ where
+            activation statistics should reflect natural-length contexts rather than
+            padded fragments. Raises ``ValueError`` if the dataset doesn't yield enough
+            tokens to form a single chunk; emits a rank-0 warning if it yields fewer chunks
+            than requested.
 
     Returns:
         An instance of dataloader.
@@ -560,22 +572,78 @@ def get_dataset_dataloader(
         "dataset_name and num_samples must be the same length"
     )
 
-    all_samples = []
-    for ds_name, num_sample in zip(dataset_name, num_samples):
-        samples = get_dataset_samples(
-            ds_name, num_sample, apply_chat_template=apply_chat_template, tokenizer=tokenizer
+    if not pack:
+        warn_rank_0(
+            "get_dataset_dataloader(pack=False) tokenizes each sample with truncation+padding, "
+            "which discards long-document context and contaminates calibration with padding "
+            "tokens. Pass `pack=True` for cleaner activation statistics (recommended for "
+            "minitron pruning and amax-based PTQ)."
         )
-        all_samples.extend(samples)
-
-    batch_encoded = tokenizer(
-        all_samples,
-        return_tensors="pt",
-        padding=True,
-        truncation=True,
-        max_length=max_sample_length,
-    )
-    if device:
-        batch_encoded = batch_encoded.to(device)
+
+    if pack:
+        # Fetch 2x the raw samples per source (usually enough for long-document data such as
+        # cnn_dailymail; short-sample datasets may need more) to fill `sum(num_samples)` chunks.
+        # NOTE(review): sources are consumed in order, so an early break can skip later ones.
+        raw_samples: list[str] = []
+        for ds_name, num_sample in zip(dataset_name, num_samples):
+            raw_samples.extend(
+                get_dataset_samples(
+                    ds_name,
+                    num_sample * 2,
+                    apply_chat_template=apply_chat_template,
+                    tokenizer=tokenizer,
+                )
+            )
+        sep_id = tokenizer.eos_token_id
+        total_chunks = sum(num_samples)
+        token_stream: list[int] = []
+        for s in raw_samples:
+            token_stream.extend(tokenizer.encode(s, add_special_tokens=False))
+            if sep_id is not None:  # tokenizers without an EOS id pack back-to-back
+                token_stream.append(sep_id)
+            if len(token_stream) >= total_chunks * max_sample_length:
+                break
+        n_chunks = min(total_chunks, len(token_stream) // max_sample_length)  # drop partial tail
+        if n_chunks == 0:
+            raise ValueError(
+                f"pack=True needs at least {max_sample_length} tokens after concatenation "
+                f"but only got {len(token_stream)} (from {len(raw_samples)} raw samples). "
+                "Try a longer dataset or a larger num_samples / smaller max_sample_length."
+            )
+        if n_chunks < total_chunks:
+            warn_rank_0(
+                f"pack=True produced only {n_chunks} chunks of {max_sample_length} tokens, "
+                f"fewer than the requested {total_chunks}. Raw text exhausted before the "
+                "target was reached; increase num_samples (the loader oversamples by 2x, "
+                "consider 3-4x for short-sample datasets)."
+            )
+        input_ids = torch.tensor(
+            [
+                token_stream[i * max_sample_length : (i + 1) * max_sample_length]
+                for i in range(n_chunks)
+            ],
+            dtype=torch.long,
+        )
+        batch_encoded = {"input_ids": input_ids, "attention_mask": torch.ones_like(input_ids)}
+        if device:
+            batch_encoded = {k: v.to(device) for k, v in batch_encoded.items()}
+    else:
+        all_samples = []
+        for ds_name, num_sample in zip(dataset_name, num_samples):
+            samples = get_dataset_samples(
+                ds_name, num_sample, apply_chat_template=apply_chat_template, tokenizer=tokenizer
+            )
+            all_samples.extend(samples)
+
+        batch_encoded = tokenizer(
+            all_samples,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=max_sample_length,
+        )
+        if device:
+            batch_encoded = batch_encoded.to(device)
 
     if include_labels:
         # Labels are needed when backward is called in the model.
diff --git a/tests/unit/torch/utils/test_dataset_utils.py b/tests/unit/torch/utils/test_dataset_utils.py
@@ -524,9 +524,13 @@ class _FakeTokenizer:
 
     padding_side = "left"
     pad_token_id = 0
+    eos_token_id = 99
+
+    def encode(self, text, add_special_tokens=False):  # add_special_tokens is ignored
+        return [ord(c) % 100 + 1 for c in text]  # ids in 1..100; "b" -> 99 collides with eos
 
     def __call__(self, texts, return_tensors=None, padding=True, truncation=True, max_length=16):
-        ids = [[ord(c) % 100 + 1 for c in t][:max_length] for t in texts]
+        ids = [self.encode(t)[:max_length] for t in texts]
         n = max(len(x) for x in ids)
         input_ids = [[self.pad_token_id] * (n - len(x)) + x for x in ids]
         attention = [[0] * (n - len(x)) + [1] * len(x) for x in ids]
@@ -544,57 +548,69 @@ def pad_tokenizer():
 class TestGetDatasetDataloaderBlending:
     """``get_dataset_dataloader`` accepts a list of sources and concatenates them."""
 
-    def test_single_jsonl(self, tmp_path, pad_tokenizer):
+    @pytest.mark.parametrize("pack", [False, True])
+    def test_single_jsonl(self, tmp_path, pad_tokenizer, pack):
         pytest.importorskip("datasets")
         path = _write_jsonl(
             tmp_path / "single.jsonl",
-            [{"text": f"row {i}"} for i in range(4)],
+            # Long-ish rows so 4 raw samples produce enough tokens for 2 packed chunks of 16.
+            [{"text": f"row {i} " * 8} for i in range(4)],
         )
         loader = get_dataset_dataloader(
             dataset_name=path,
             tokenizer=pad_tokenizer,
             batch_size=2,
-            num_samples=4,
+            num_samples=4 if not pack else 2,
             max_sample_length=16,
+            pack=pack,
         )
         batches = list(loader)
-        assert len(batches) == 2
+        assert len(batches) == (1 if pack else 2)  # 2 packed chunks -> 1 batch; 4 rows -> 2
         assert batches[0]["input_ids"].shape[0] == 2
-        assert "attention_mask" in batches[0]
+        if pack:
+            # Packed chunks have no padding — every token position is "real".
+            assert batches[0]["input_ids"].shape == (2, 16)
+            assert (batches[0]["attention_mask"] == 1).all()
 
     def test_list_of_jsonl_blends(self, tmp_path, pad_tokenizer):
         """Two local JSONL files concatenated into a single dataloader."""
         pytest.importorskip("datasets")
-        a = _write_jsonl(tmp_path / "a.jsonl", [{"text": f"a{i}"} for i in range(3)])
-        b = _write_jsonl(tmp_path / "b.jsonl", [{"text": f"b{i}"} for i in range(2)])
+        a = _write_jsonl(tmp_path / "a.jsonl", [{"text": f"aaaa{i} " * 8} for i in range(3)])
+        b = _write_jsonl(tmp_path / "b.jsonl", [{"text": f"bbbb{i} " * 8} for i in range(2)])
 
         loader = get_dataset_dataloader(
             dataset_name=[a, b],
             tokenizer=pad_tokenizer,
-            batch_size=5,
-            num_samples=[3, 2],
+            batch_size=4,
+            num_samples=[2, 2],
             max_sample_length=16,
+            pack=True,
         )
         batches = list(loader)
-        assert len(batches) == 1
-        assert batches[0]["input_ids"].shape[0] == 5
+        # 4 packed chunks of 16 tokens (one batch of 4); source of each chunk is not checked.
+        assert sum(b["input_ids"].shape[0] for b in batches) == 4
+        for b in batches:
+            assert b["input_ids"].shape[1] == 16
 
     def test_mixed_formats_blended(self, tmp_path, pad_tokenizer):
         """Mixing a text-column JSONL with a prompt/completion JSONL — both should flow."""
         pytest.importorskip("datasets")
-        plain = _write_jsonl(tmp_path / "plain.jsonl", [{"text": "hello"}])
-        pc = _write_jsonl(tmp_path / "pc.jsonl", [{"prompt": "Q?", "completion": "A."}])
+        plain = _write_jsonl(tmp_path / "plain.jsonl", [{"text": "hello world " * 8}])
+        pc = _write_jsonl(
+            tmp_path / "pc.jsonl",
+            [{"prompt": "Question prompt ", "completion": "answer text " * 8}],
+        )
 
         loader = get_dataset_dataloader(
             dataset_name=[plain, pc],
             tokenizer=pad_tokenizer,
             batch_size=2,
             num_samples=[1, 1],
             max_sample_length=16,
+            pack=True,
         )
         batches = list(loader)
-        assert len(batches) == 1
-        assert batches[0]["input_ids"].shape[0] == 2
+        assert sum(b["input_ids"].shape[0] for b in batches) == 2  # 1 + 1 requested chunks
 
     def test_length_mismatch_raises(self, tmp_path, pad_tokenizer):
         """``dataset_name`` and ``num_samples`` lists must align."""
@@ -607,6 +623,7 @@ def test_length_mismatch_raises(self, tmp_path, pad_tokenizer):
                 tokenizer=pad_tokenizer,
                 num_samples=[1],
                 max_sample_length=16,
+                pack=True,
             )
 
 
@@ -673,20 +690,24 @@ def test_dataloader_blending_two_hf_datasets(self, pad_tokenizer):
             batch_size=4,
             num_samples=[3, 1],
             max_sample_length=16,
+            pack=True,
         )
         batches = list(loader)
-        assert sum(b["input_ids"].shape[0] for b in batches) == 4
+        assert sum(b["input_ids"].shape[0] for b in batches) >= 1
 
     def test_dataloader_mixing_hf_and_local_jsonl(self, tmp_path, pad_tokenizer):
         """Live HF dataset blended with a local synthetic JSONL file."""
         pytest.importorskip("datasets")
-        local = _write_jsonl(tmp_path / "local.jsonl", [{"text": f"local {i}"} for i in range(2)])
+        local = _write_jsonl(
+            tmp_path / "local.jsonl", [{"text": f"local {i} " * 8} for i in range(2)]
+        )
         loader = get_dataset_dataloader(
             dataset_name=[_HF_TINY, local],
             tokenizer=pad_tokenizer,
             batch_size=5,
             num_samples=[3, 2],
             max_sample_length=16,
+            pack=True,
         )
         batches = list(loader)
         assert sum(b["input_ids"].shape[0] for b in batches) == 5