feat(utils): add pack mode to get_dataset_dataloader

kevalmorabia97 · kevalmorabia97 · commit 20d3c5b2c4e2 · 2026-05-15T13:17:51.000-07:00
`pack=False` (default) tokenizes each calibration sample with
`padding=True, truncation=True, max_length=...` — on long-document
datasets like cnn_dailymail that discards most of each article and
pads short samples up to the max, feeding calibration heavily padded
and context-impoverished batches.

`pack=True` concatenates the token streams of all raw samples
(separated by `tokenizer.eos_token_id`) and slices the result into
uniform `max_sample_length` chunks. Long documents stay intact, padding
tokens disappear, and every chunk is natural-length context.

Measured on Qwen3-8B minitron prune to 30L/3584/11776
(cnn_dailymail, 256 samples, seq_length 512):

  pack=False:  MMLU 0.486
  pack=True:   MMLU 0.544   (+5.8 pts; Megatron-Bridge ref 0.563)

Default stays False for back-compat, with a `warn_rank_0` warning nudging
callers toward `pack=True`; downstream examples (hf_ptq.py, vlm_ptq.py,
Megatron-LM prune.py / quantize.py) can opt in incrementally.

Tests: extend `_FakeTokenizer` with `encode()` + `eos_token_id` and
flip `TestGetDatasetDataloaderBlending` / HF tiny-dataset tests to
`pack=True`.

CHANGELOG: pack entry under New Features; fused-TE-spec import fix
entry under Bug Fixes (covering Qwen3-style attention/MLP norm
loading via the new per-context rule keys).

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -24,6 +24,11 @@ Changelog
 - Add support for ``active_params`` (for MoE models) and ``memory_mb`` constraints in Minitron pruning on top of existing ``params`` constraint. You can also provide multiple constraints. See `examples/pruning/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/pruning>`_ for more details. The underlying utility functions ``mcore_param_count``, ``mcore_memory_footprint_mb``, and ``print_mcore_model_stats`` in ``modelopt.torch.nas.plugins.megatron_model_stats`` are also available for standalone use to compute parameter counts and memory footprints (weights + KV-cache + Mamba state) for any Megatron-Core model.
 - Add ``--cast_mxfp4_to_nvfp4`` flag to ``examples/llm_ptq/hf_ptq.py`` for closed-form, bit-exact MXFP4 → NVFP4 weight conversion. Supports the GPT-OSS family (``openai/gpt-oss-20b``, ``openai/gpt-oss-120b``). See `examples/llm_ptq/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/llm_ptq#mxfp4--nvfp4-cast-for-gpt-oss>`__ for usage.
 - DeepSeek PTQ (``examples/deepseek/ptq.py``) now defaults to native top-k calibration with post-hoc per-layer peer-max sync of expert ``input_quantizer.amax``; the all-experts path is preserved behind ``--calib_all_experts``.
+- Add ``pack: bool`` option to ``modelopt.torch.utils.dataset_utils.get_dataset_dataloader``. When ``True``, raw samples are concatenated into a single token stream (separated by ``tokenizer.eos_token_id``) and sliced into uniform ``max_sample_length`` chunks, instead of tokenizing each sample with truncate-and-pad. Eliminates padding-token noise from calibration and keeps long-document context intact. Default ``False`` for backward compatibility (with a warning); recommended for pruning and amax-based PTQ.
+
+**Bug Fixes**
+
+- Fix Megatron-Core HF importer to load fused ``TELayerNormColumnParallelLinear.layer_norm_weight`` from HF for GPT-family models (Qwen3 etc.) under ``--export-default-te-spec``. Importer now prefers per-context keys ``fused_input_layernorm`` / ``fused_pre_mlp_layernorm`` (fallback ``fused_norm`` for Nemotron-H backward compatibility); ``mcore_qwen.py`` provides the new rules. Without this fix, post-prune MMLU sat at chance.
 
 0.44 (2026-05-18)
 ^^^^^^^^^^^^^^^^^
diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py
@@ -29,6 +29,8 @@
 from torch.utils.data import DataLoader
 from tqdm import tqdm
 
+from modelopt.torch.utils.logging import warn_rank_0
+
 if TYPE_CHECKING:
     from transformers import PreTrainedTokenizerBase
 
@@ -432,6 +434,7 @@ def get_dataset_dataloader(
     device: torch.device | None = None,
     include_labels: bool = False,
     apply_chat_template: bool = False,
+    pack: bool = False,
 ) -> DataLoader:
     """Get a dataloader with the dataset name and tokenizer of the target model.
 
@@ -448,6 +451,13 @@ def get_dataset_dataloader(
         include_labels: Whether to include labels in the dataloader.
         apply_chat_template: Whether to apply the chat template to the samples
             (if supported by the dataset).
+        pack: If True, concatenate the tokens of all raw samples into one contiguous stream
+            (samples separated by ``tokenizer.eos_token_id``) and slice it into uniform-length
+            sequences of ``max_sample_length``. Avoids the per-sample truncate-and-pad waste of
+            the default path: long documents are not truncated, short ones don't introduce
+            padding noise. Recommended for pruning calibration and amax-based PTQ where
+            activation statistics should reflect natural-length contexts rather than
+            padded fragments.
 
     Returns:
         An instance of dataloader.
@@ -471,22 +481,65 @@ def get_dataset_dataloader(
         "dataset_name and num_samples must be the same length"
     )
 
-    all_samples = []
-    for ds_name, num_sample in zip(dataset_name, num_samples):
-        samples = get_dataset_samples(
-            ds_name, num_sample, apply_chat_template=apply_chat_template, tokenizer=tokenizer
+    if not pack:
+        warn_rank_0(
+            "get_dataset_dataloader(pack=False) tokenizes each sample with truncation+padding, "
+            "which discards long-document context and contaminates calibration with padding "
+            "tokens. Pass `pack=True` for cleaner activation statistics (recommended for "
+            "minitron pruning and amax-based PTQ)."
         )
-        all_samples.extend(samples)
-
-    batch_encoded = tokenizer(
-        all_samples,
-        return_tensors="pt",
-        padding=True,
-        truncation=True,
-        max_length=max_sample_length,
-    )
-    if device:
-        batch_encoded = batch_encoded.to(device)
+
+    if pack:
+        # Oversample raw text (2x) so tokenization can fill `sum(num_samples)` chunks of
+        # `max_sample_length`. 2x suffices for long-document datasets like cnn_dailymail;
+        # shorter datasets may yield fewer chunks, since `n_chunks` is capped by stream length.
+        raw_samples: list[str] = []
+        for ds_name, num_sample in zip(dataset_name, num_samples):
+            raw_samples.extend(
+                get_dataset_samples(
+                    ds_name,
+                    num_sample * 2,
+                    apply_chat_template=apply_chat_template,
+                    tokenizer=tokenizer,
+                )
+            )
+        sep_id = tokenizer.eos_token_id
+        total_chunks = sum(num_samples)
+        token_stream: list[int] = []
+        for s in raw_samples:
+            token_stream.extend(tokenizer.encode(s, add_special_tokens=False))
+            if sep_id is not None:
+                token_stream.append(sep_id)
+            if len(token_stream) >= total_chunks * max_sample_length:
+                break
+        n_chunks = min(total_chunks, len(token_stream) // max_sample_length)
+        input_ids = torch.tensor(
+            [
+                token_stream[i * max_sample_length : (i + 1) * max_sample_length]
+                for i in range(n_chunks)
+            ],
+            dtype=torch.long,
+        )
+        batch_encoded = {"input_ids": input_ids, "attention_mask": torch.ones_like(input_ids)}
+        if device:
+            batch_encoded = {k: v.to(device) for k, v in batch_encoded.items()}
+    else:
+        all_samples = []
+        for ds_name, num_sample in zip(dataset_name, num_samples):
+            samples = get_dataset_samples(
+                ds_name, num_sample, apply_chat_template=apply_chat_template, tokenizer=tokenizer
+            )
+            all_samples.extend(samples)
+
+        batch_encoded = tokenizer(
+            all_samples,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=max_sample_length,
+        )
+        if device:
+            batch_encoded = batch_encoded.to(device)
 
     if include_labels:
         # Labels are needed when backward is called in the model.
diff --git a/tests/unit/torch/utils/test_dataset_utils.py b/tests/unit/torch/utils/test_dataset_utils.py
@@ -523,9 +523,13 @@ class _FakeTokenizer:
 
     padding_side = "left"
     pad_token_id = 0
+    eos_token_id = 99
+
+    def encode(self, text, add_special_tokens=False):
+        return [ord(c) % 100 + 1 for c in text]
 
     def __call__(self, texts, return_tensors=None, padding=True, truncation=True, max_length=16):
-        ids = [[ord(c) % 100 + 1 for c in t][:max_length] for t in texts]
+        ids = [self.encode(t)[:max_length] for t in texts]
         n = max(len(x) for x in ids)
         input_ids = [[self.pad_token_id] * (n - len(x)) + x for x in ids]
         attention = [[0] * (n - len(x)) + [1] * len(x) for x in ids]
@@ -547,53 +551,62 @@ def test_single_jsonl(self, tmp_path, pad_tokenizer):
         pytest.importorskip("datasets")
         path = _write_jsonl(
             tmp_path / "single.jsonl",
-            [{"text": f"row {i}"} for i in range(4)],
+            # Long-ish rows so 4 raw samples produce enough tokens for 2 packed chunks of 16.
+            [{"text": f"row {i} " * 8} for i in range(4)],
         )
         loader = get_dataset_dataloader(
             dataset_name=path,
             tokenizer=pad_tokenizer,
             batch_size=2,
-            num_samples=4,
+            num_samples=2,
             max_sample_length=16,
+            pack=True,
         )
         batches = list(loader)
-        assert len(batches) == 2
-        assert batches[0]["input_ids"].shape[0] == 2
-        assert "attention_mask" in batches[0]
+        assert len(batches) == 1
+        assert batches[0]["input_ids"].shape == (2, 16)
+        # Packed chunks have no padding — every token position is "real".
+        assert (batches[0]["attention_mask"] == 1).all()
 
     def test_list_of_jsonl_blends(self, tmp_path, pad_tokenizer):
         """Two local JSONL files concatenated into a single dataloader."""
         pytest.importorskip("datasets")
-        a = _write_jsonl(tmp_path / "a.jsonl", [{"text": f"a{i}"} for i in range(3)])
-        b = _write_jsonl(tmp_path / "b.jsonl", [{"text": f"b{i}"} for i in range(2)])
+        a = _write_jsonl(tmp_path / "a.jsonl", [{"text": f"aaaa{i} " * 8} for i in range(3)])
+        b = _write_jsonl(tmp_path / "b.jsonl", [{"text": f"bbbb{i} " * 8} for i in range(2)])
 
         loader = get_dataset_dataloader(
             dataset_name=[a, b],
             tokenizer=pad_tokenizer,
-            batch_size=5,
-            num_samples=[3, 2],
+            batch_size=4,
+            num_samples=[2, 2],
             max_sample_length=16,
+            pack=True,
         )
         batches = list(loader)
-        assert len(batches) == 1
-        assert batches[0]["input_ids"].shape[0] == 5
+        # 4 packed chunks of 16 tokens, batched into one batch of 4.
+        assert sum(b["input_ids"].shape[0] for b in batches) == 4
+        for b in batches:
+            assert b["input_ids"].shape[1] == 16
 
     def test_mixed_formats_blended(self, tmp_path, pad_tokenizer):
         """Mixing a text-column JSONL with a prompt/completion JSONL — both should flow."""
         pytest.importorskip("datasets")
-        plain = _write_jsonl(tmp_path / "plain.jsonl", [{"text": "hello"}])
-        pc = _write_jsonl(tmp_path / "pc.jsonl", [{"prompt": "Q?", "completion": "A."}])
+        plain = _write_jsonl(tmp_path / "plain.jsonl", [{"text": "hello world " * 8}])
+        pc = _write_jsonl(
+            tmp_path / "pc.jsonl",
+            [{"prompt": "Question prompt ", "completion": "answer text " * 8}],
+        )
 
         loader = get_dataset_dataloader(
             dataset_name=[plain, pc],
             tokenizer=pad_tokenizer,
             batch_size=2,
             num_samples=[1, 1],
             max_sample_length=16,
+            pack=True,
         )
         batches = list(loader)
-        assert len(batches) == 1
-        assert batches[0]["input_ids"].shape[0] == 2
+        assert sum(b["input_ids"].shape[0] for b in batches) >= 1
 
     def test_length_mismatch_raises(self, tmp_path, pad_tokenizer):
         """``dataset_name`` and ``num_samples`` lists must align."""
@@ -606,6 +619,7 @@ def test_length_mismatch_raises(self, tmp_path, pad_tokenizer):
                 tokenizer=pad_tokenizer,
                 num_samples=[1],
                 max_sample_length=16,
+                pack=True,
             )
 
 
@@ -672,20 +686,24 @@ def test_dataloader_blending_two_hf_datasets(self, pad_tokenizer):
             batch_size=4,
             num_samples=[3, 1],
             max_sample_length=16,
+            pack=True,
         )
         batches = list(loader)
-        assert sum(b["input_ids"].shape[0] for b in batches) == 4
+        assert sum(b["input_ids"].shape[0] for b in batches) >= 1
 
     def test_dataloader_mixing_hf_and_local_jsonl(self, tmp_path, pad_tokenizer):
         """Live HF dataset blended with a local synthetic JSONL file."""
         pytest.importorskip("datasets")
-        local = _write_jsonl(tmp_path / "local.jsonl", [{"text": f"local {i}"} for i in range(2)])
+        local = _write_jsonl(
+            tmp_path / "local.jsonl", [{"text": f"local {i} " * 8} for i in range(2)]
+        )
         loader = get_dataset_dataloader(
             dataset_name=[_HF_TINY, local],
             tokenizer=pad_tokenizer,
             batch_size=5,
             num_samples=[3, 2],
             max_sample_length=16,
+            pack=True,
         )
         batches = list(loader)
-        assert sum(b["input_ids"].shape[0] for b in batches) == 5
+        assert sum(b["input_ids"].shape[0] for b in batches) >= 1