feat(utils): support pack=True calibration mode for get_dataset_dataloader

kevalmorabia97 · kevalmorabia97 · commit c4c662ef109d · 2026-05-19T12:51:39.000-07:00
Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -25,6 +25,7 @@ Changelog
 - Add ``--cast_mxfp4_to_nvfp4`` flag to ``examples/llm_ptq/hf_ptq.py`` for closed-form, bit-exact MXFP4 → NVFP4 weight conversion. Supports the GPT-OSS family (``openai/gpt-oss-20b``, ``openai/gpt-oss-120b``). See `examples/llm_ptq/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/llm_ptq#mxfp4--nvfp4-cast-for-gpt-oss>`__ for usage.
 - DeepSeek PTQ (``examples/deepseek/ptq.py``) now defaults to native top-k calibration with post-hoc per-layer peer-max sync of expert ``input_quantizer.amax``; the all-experts path is preserved behind ``--calib_all_experts``.
 - Add NVFP4 W4A16 weight-only quantization (``w4a16_nvfp4``): FP4 weights with group_size=16, BF16 activations, no calibration forward pass required. Use ``mtq.W4A16_NVFP4_CFG`` or ``--qformat w4a16_nvfp4`` in ``hf_ptq.py``. vLLM deployment support is in progress.
+- Add ``pack: bool`` option to ``modelopt.torch.utils.dataset_utils.get_dataset_dataloader`` (also forwarded by ``create_forward_loop``). When ``True``, raw samples from each source are concatenated into a per-source token stream (separated by ``tokenizer.eos_token_id`` when set) and sliced into uniform ``max_sample_length`` chunks, preserving the requested per-source ratio in ``num_samples``. This eliminates padding-token noise from calibration and keeps long-document context intact. Note that with ``pack=True``, ``num_samples`` counts chunks per source rather than raw samples. Defaults to ``False`` for backward compatibility; recommended for pruning and amax-based PTQ.
 
 **Bug Fixes**
 
diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py
@@ -29,6 +29,8 @@
 from torch.utils.data import DataLoader
 from tqdm import tqdm
 
+from .logging import warn_rank_0
+
 if TYPE_CHECKING:
     from transformers import PreTrainedTokenizerBase
 
@@ -512,6 +514,103 @@ def __len__(self):
         return len(next(iter(self.encodings.values())))
 
 
+def _build_packed_input_ids(
+    dataset_name: list[str],
+    num_samples: list[int],
+    max_sample_length: int,
+    tokenizer: "PreTrainedTokenizerBase",
+    apply_chat_template: bool,
+) -> torch.Tensor:
+    """Pack raw samples into a ``(n_chunks, max_sample_length)`` int tensor.
+
+    Each source contributes ``num_sample`` chunks (or fewer if exhausted), so the requested
+    per-source ratio in ``num_samples`` is preserved instead of letting whichever source
+    appears first dominate the budget. Within a source, tokenization runs in batches of
+    ``max(8, num_sample // 4)`` samples so we stop tokenizing once the chunk budget is
+    full, instead of eagerly paying for the entire ``num_sample * 2`` oversample.
+
+    Documents are separated by ``tokenizer.eos_token_id`` when set; ``add_special_tokens=False``
+    avoids injecting a fresh BOS at every sample boundary. Note that packed chunks therefore
+    have no BOS at position 0 — fine for amax / sensitivity calibration where boundary
+    tokens are statistically dominated, less ideal for callers that need BOS-prefixed
+    sequences (use ``pack=False`` for those). When ``apply_chat_template=True``, the rendered
+    samples often already end with the chat EOS marker (e.g. ``<|im_end|>``), which can
+    tokenize to ``eos_token_id`` and produce ``<eos><eos>`` at document boundaries —
+    harmless for calibration statistics but worth noting.
+
+    Sizing note: ``num_sample`` here is the desired chunk count per source. The loader
+    internally fetches ``num_sample * 2`` raw samples. Short-document sources can still
+    under-fill — to recover the target, raise ``num_sample`` itself (doubling it doubles both
+    the target and the internal raw-sample draw). Example: a short-row source returning 1
+    chunk for ``num_sample=64`` returns roughly 2-3 chunks for ``num_sample=128``,
+    because the raw draw (and hence the token stream) doubles from 128 to 256 samples.
+    """
+    sep_id = tokenizer.eos_token_id
+    if sep_id is None:
+        warn_rank_0(
+            "pack=True: tokenizer has no eos_token_id; raw documents will be concatenated "
+            "without a separator, so calibration activations will span document boundaries. "
+            "Set tokenizer.eos_token_id (or another sentinel) for explicit separators."
+        )
+
+    per_source_chunks: list[list[int]] = []
+    actual_per_source: list[int] = []
+    for ds_name, num_sample in zip(dataset_name, num_samples):
+        # 2x oversample sized for cnn_dailymail-style long docs; short-sample datasets may
+        # still under-fill and trigger the warning below.
+        raw_samples = get_dataset_samples(
+            ds_name,
+            num_sample * 2,
+            apply_chat_template=apply_chat_template,
+            tokenizer=tokenizer,
+        )
+        needed_tokens = num_sample * max_sample_length
+        # max(8, ...) floor keeps the Rust-batched tokenizer happy for small calibrations
+        # (num_sample < 36 → batch is 8); from 36 up, `// 4` grows the batch with the
+        # request while keeping the early-exit check granular enough to actually skip
+        # tokenizing the back half of the 2x oversample on long-doc sources.
+        tokenize_batch_size = max(8, num_sample // 4)
+        stream: list[int] = []
+        for batch_start in range(0, len(raw_samples), tokenize_batch_size):
+            if len(stream) >= needed_tokens:
+                break
+            batch = raw_samples[batch_start : batch_start + tokenize_batch_size]
+            # padding/truncation=False explicit: don't trust subclass __call__ defaults.
+            encoded = tokenizer(batch, add_special_tokens=False, padding=False, truncation=False)[
+                "input_ids"
+            ]
+            for ids in encoded:
+                stream.extend(ids)
+                if sep_id is not None:
+                    stream.append(sep_id)
+                if len(stream) >= needed_tokens:
+                    break
+        available = len(stream) // max_sample_length
+        take = min(num_sample, available)
+        per_source_chunks.extend(
+            stream[i * max_sample_length : (i + 1) * max_sample_length] for i in range(take)
+        )
+        actual_per_source.append(take)
+
+    n_chunks = len(per_source_chunks)
+    total_chunks = sum(num_samples)
+    if n_chunks == 0:
+        raise ValueError(
+            f"pack=True yielded 0 chunks across {len(dataset_name)} source(s); each source "
+            f"needs at least {max_sample_length} tokens after concatenation. Try longer "
+            "samples or a smaller max_sample_length."
+        )
+    if n_chunks < total_chunks:
+        warn_rank_0(
+            f"pack=True produced {n_chunks} chunks (per-source {actual_per_source}) vs "
+            f"requested {total_chunks} (per-source {list(num_samples)}). Some sources "
+            "exhausted before reaching their target. The loader internally fetches "
+            "`num_samples * 2` raw samples per source; for very short-sample sources, "
+            "pass a 2-3x larger `num_samples` so the 2x draw covers the chunk budget."
+        )
+    return torch.tensor(per_source_chunks, dtype=torch.long)
+
+
 def get_dataset_dataloader(
     dataset_name: str | list[str] = "cnn_dailymail",
     tokenizer: "PreTrainedTokenizerBase | None" = None,
@@ -521,6 +620,7 @@ def get_dataset_dataloader(
     device: torch.device | None = None,
     include_labels: bool = False,
     apply_chat_template: bool = False,
+    pack: bool = False,
 ) -> DataLoader:
     """Get a dataloader with the dataset name and tokenizer of the target model.
 
@@ -531,12 +631,31 @@ def get_dataset_dataloader(
             an ``int`` (applied to a single source) or a list aligned with ``dataset_name``.
         tokenizer: Instance of HuggingFace tokenizer.
         batch_size: Batch size of the returned dataloader.
-        num_samples: Number of samples from the dataset.
+        num_samples: Number of samples from the dataset. Semantics depend on ``pack``:
+            with ``pack=False`` this is the number of raw samples to fetch and tokenize
+            (each becomes one row of ``max_sample_length`` after truncate-and-pad); with
+            ``pack=True`` this is the number of ``max_sample_length``-token chunks to
+            produce per source. Migrating an existing call site to ``pack=True`` may
+            therefore need a different value to hit the same total-token calibration
+            budget.
         max_sample_length: Maximum length of a sample.
         device: Target device for the returned dataloader.
         include_labels: Whether to include labels in the dataloader.
         apply_chat_template: Whether to apply the chat template to the samples
             (if supported by the dataset).
+        pack: If True, raw samples from each source are concatenated into a per-source token
+            stream (separated by ``tokenizer.eos_token_id`` when set) and sliced into
+            uniform-length chunks of ``max_sample_length``; the per-source chunks are then
+            concatenated **contiguously by source** (no cross-source interleaving), preserving
+            the requested per-source ratio in ``num_samples``. Avoids the per-sample
+            truncate-and-pad waste of the default path: long documents stay intact, short
+            ones don't introduce padding noise. Recommended for pruning calibration and
+            amax-based PTQ where activation statistics should reflect natural-length
+            contexts rather than padded fragments. ``attention_mask`` is unconditionally
+            all-ones — attention crosses document boundaries (the ``eos`` separator is a
+            token, not a mask boundary). Raises ``ValueError`` if the dataset doesn't yield
+            enough tokens to form a single chunk; emits a rank-0 warning if it yields
+            fewer chunks than requested.
 
     Returns:
         An instance of dataloader.
@@ -560,22 +679,30 @@ def get_dataset_dataloader(
         "dataset_name and num_samples must be the same length"
     )
 
-    all_samples = []
-    for ds_name, num_sample in zip(dataset_name, num_samples):
-        samples = get_dataset_samples(
-            ds_name, num_sample, apply_chat_template=apply_chat_template, tokenizer=tokenizer
+    if pack:
+        input_ids = _build_packed_input_ids(
+            dataset_name, num_samples, max_sample_length, tokenizer, apply_chat_template
         )
-        all_samples.extend(samples)
-
-    batch_encoded = tokenizer(
-        all_samples,
-        return_tensors="pt",
-        padding=True,
-        truncation=True,
-        max_length=max_sample_length,
-    )
-    if device:
-        batch_encoded = batch_encoded.to(device)
+        batch_encoded = {"input_ids": input_ids, "attention_mask": torch.ones_like(input_ids)}
+        if device:
+            batch_encoded = {k: v.to(device) for k, v in batch_encoded.items()}
+    else:
+        all_samples = []
+        for ds_name, num_sample in zip(dataset_name, num_samples):
+            samples = get_dataset_samples(
+                ds_name, num_sample, apply_chat_template=apply_chat_template, tokenizer=tokenizer
+            )
+            all_samples.extend(samples)
+
+        batch_encoded = tokenizer(
+            all_samples,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=max_sample_length,
+        )
+        if device:
+            batch_encoded = batch_encoded.to(device)
 
     if include_labels:
         # Labels are needed when backward is called in the model.
@@ -844,6 +971,7 @@ def create_forward_loop(
     include_labels: bool = False,
     dataloader: DataLoader | None = None,
     allowed_non_tensor_keys: set | None = None,
+    pack: bool = False,
 ) -> Callable:
     """Creates and returns a forward loop function configured for a specific model, dataset, and tokenizer.
 
@@ -865,6 +993,8 @@ def create_forward_loop(
         allowed_non_tensor_keys: Set of key names whose batch values may be non-tensor types.
             Useful when the dataloader yields batches with non-standard fields (e.g., nested
             model outputs).
+        pack: Forwarded to :func:`get_dataset_dataloader`. See its docstring for semantics
+            (including the ``num_samples`` chunk-vs-document distinction).
 
     Example usage for quantization:
 
@@ -902,6 +1032,7 @@ def create_forward_loop(
             max_sample_length=max_sample_length,
             device=device,
             include_labels=include_labels,
+            pack=pack,
         )
 
     return lambda model: _forward_loop(model, dataloader, allowed_non_tensor_keys)
diff --git a/tests/unit/torch/utils/test_dataset_utils.py b/tests/unit/torch/utils/test_dataset_utils.py