Create shared megatron calibration forward loop for MLM and MBridge prune/quantize

kevalmorabia97 · kevalmorabia97 · commit d174a0848cc4 · 2026-05-19T12:56:13.000-07:00
Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
diff --git a/examples/megatron_bridge/prune_minitron.py b/examples/megatron_bridge/prune_minitron.py
@@ -53,10 +53,8 @@
 import modelopt.torch.prune as mtp
 import modelopt.torch.utils.distributed as dist
 from modelopt.torch.utils import get_supported_datasets, print_rank_0, warn_rank_0
-from modelopt.torch.utils.plugins.mbridge import (
-    get_hf_mbridge_calibration_loop,
-    load_mbridge_model_from_hf,
-)
+from modelopt.torch.utils.plugins.mbridge import load_mbridge_model_from_hf
+from modelopt.torch.utils.plugins.megatron_calibration import get_megatron_calibration_forward_loop
 from modelopt.torch.utils.plugins.megatron_mmlu import megatron_mmlu
 
 
@@ -296,16 +294,12 @@ def main(args: argparse.Namespace):
         init_model_parallel=True,
         moe_grouped_gemm=False,
     )
-    forward_loop = get_hf_mbridge_calibration_loop(
-        model=model,
-        provider=provider,
-        tokenizer=tokenizer,
-        hf_model_name_or_path=args.hf_model_name_or_path,
-        trust_remote_code=args.trust_remote_code,
+    forward_loop = get_megatron_calibration_forward_loop(
+        tokenizer,
         dataset_name=args.calib_dataset_name,
         num_samples=args.calib_num_samples,
-        micro_batch_size=args.calib_mbs,
-        global_batch_size=args.calib_gbs,
+        seq_length=args.seq_length,
+        batch_size=args.calib_gbs,
     )
 
     pruning_config = {
diff --git a/examples/pruning/README.md b/examples/pruning/README.md
@@ -50,9 +50,9 @@ Please see example snippets of both modes for Minitron pruning on Megatron-Bridg
 ```python
 import torch
 import modelopt.torch.prune as mtp
-from modelopt.torch.utils.plugins.mbridge import (
-    get_hf_mbridge_calibration_loop,
-    load_mbridge_model_from_hf,
+from modelopt.torch.utils.plugins.mbridge import load_mbridge_model_from_hf
+from modelopt.torch.utils.plugins.megatron_calibration import (
+    get_megatron_calibration_forward_loop,
 )
 
 # Import the Megatron-Bridge Qwen3-8B model from Hugging Face checkpoint
@@ -67,13 +67,11 @@ bridge, provider, model, unwrapped_model, tokenizer = load_mbridge_model_from_hf
 )
 
 # Set up the forward loop to run on 1024 train samples
-forward_loop = get_hf_mbridge_calibration_loop(
-    model=model,
-    provider=provider,
-    tokenizer=tokenizer,
-    hf_model_name_or_path="Qwen/Qwen3-8B",
+forward_loop = get_megatron_calibration_forward_loop(
+    tokenizer,
     dataset_name="nemotron-post-training-dataset-v2",
     num_samples=1024,
+    seq_length=4096,
 )
 
 # Run pruning on the unwrapped model
diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py
@@ -617,7 +617,7 @@ def get_dataset_dataloader(
     batch_size: int = 1,
     num_samples: int | list[int] = 512,
     max_sample_length: int = 512,
-    device: torch.device | None = None,
+    device: torch.device | str | None = None,
     include_labels: bool = False,
     apply_chat_template: bool = False,
     pack: bool = False,
diff --git a/modelopt/torch/utils/plugins/__init__.py b/modelopt/torch/utils/plugins/__init__.py
@@ -17,6 +17,9 @@
 
 from modelopt.torch.utils import import_plugin
 
+with import_plugin("megatron_calibration"):
+    from .megatron_calibration import *
+
 with import_plugin("megatron_generate"):
     from .megatron_generate import *
 
diff --git a/modelopt/torch/utils/plugins/mbridge.py b/modelopt/torch/utils/plugins/mbridge.py
@@ -14,43 +14,23 @@
 # limitations under the License.
 """Megatron-Bridge plugins for using with Model-Optimizer."""
 
-from collections.abc import Callable
 from typing import Any
 
-import torch.nn as nn
-from datasets import DatasetDict
 from megatron.bridge import AutoBridge
-from megatron.bridge.data.builders.hf_dataset import HFDatasetConfig
-from megatron.bridge.data.loaders import setup_data_iterators
-from megatron.bridge.data.utils import get_dataset_provider
 from megatron.bridge.models.gpt_provider import GPTModelProvider
 from megatron.bridge.models.hf_pretrained.utils import is_safe_repo
 from megatron.bridge.models.mamba.mamba_provider import MambaModelProvider
-from megatron.bridge.training.config import (
-    CheckpointConfig,
-    ConfigContainer,
-    LoggerConfig,
-    OptimizerConfig,
-    SchedulerConfig,
-    TrainingConfig,
-    runtime_config_update,
-)
-from megatron.bridge.training.eval import evaluate_and_print_results
-from megatron.bridge.training.gpt_step import forward_step
-from megatron.bridge.training.state import GlobalState
-from megatron.bridge.training.tokenizers.config import TokenizerConfig
 from megatron.core.models.gpt import GPTModel
 from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
 from megatron.core.models.mamba import MambaModel
-from megatron.core.parallel_state import get_data_parallel_group
 from megatron.core.transformer.module import MegatronModule
 from megatron.core.utils import unwrap_model
 from transformers import AutoTokenizer
 
 from modelopt.torch.nas.plugins.megatron import get_te_mamba_stack_spec
-from modelopt.torch.utils import get_dataset_samples, print_rank_0, warn_rank_0
+from modelopt.torch.utils import print_rank_0
 
-__all__ = ["get_hf_mbridge_calibration_loop", "load_mbridge_model_from_hf"]
+__all__ = ["load_mbridge_model_from_hf"]
 
 
 def load_mbridge_model_from_hf(
@@ -118,134 +98,3 @@ def load_mbridge_model_from_hf(
     )
 
     return bridge, provider, model, unwrapped_model, tokenizer
-
-
-def _get_dataset_cfg(
-    dataset_name: str,
-    num_samples: int,
-    seq_length: int,
-    apply_chat_template: bool = True,
-    tokenizer: AutoTokenizer | None = None,
-) -> HFDatasetConfig:
-    """Get a dataset config for the dataset."""
-    dataset = get_dataset_samples(
-        dataset_name, num_samples, apply_chat_template=apply_chat_template, tokenizer=tokenizer
-    )
-    dataset_cfg = HFDatasetConfig(
-        dataset_name=f"{dataset_name}_{num_samples}",
-        dataset_dict=DatasetDict({"train": dataset}),
-        process_example_fn=lambda example, tokenizer: {"input": example, "output": ""},
-        seq_length=seq_length,
-        dataloader_type="batch",
-        num_workers=1,
-        do_validation=False,
-        do_test=False,
-        val_proportion=None,
-        split_val_from_train=False,
-        rewrite=True,
-    )
-
-    return dataset_cfg
-
-
-def get_hf_mbridge_calibration_loop(
-    *,
-    model: list[MegatronModule],
-    provider: GPTModelProvider | MambaModelProvider,
-    tokenizer: AutoTokenizer,
-    hf_model_name_or_path: str,
-    trust_remote_code: bool = False,
-    dataset_name: str = "nemotron-post-training-dataset-v2",
-    num_samples: int = 512,
-    micro_batch_size: int = 1,
-    global_batch_size: int = 1,
-) -> Callable[[nn.Module], None]:
-    """Get a modelopt calibration loop for a Megatron-Bridge model.
-
-    Args:
-        model: The model to calibrate.
-        provider: The provider to use for the model.
-        tokenizer: The tokenizer to use for the model.
-        hf_model_name_or_path: The name or path of the HF model.
-        trust_remote_code: Whether to trust remote code.
-        dataset_name: The name of the dataset to use for evaluation.
-        num_samples: The number of samples to use for evaluation.
-        micro_batch_size: The micro batch size to use for evaluation.
-        global_batch_size: The global batch size to use for evaluation.
-
-    Returns:
-        A function that can be used to calibrate the model with a modelopt.torch API.
-    """
-    if global_batch_size < micro_batch_size:
-        warn_rank_0(
-            f"{global_batch_size=} is smaller than {micro_batch_size=}. Setting gbs to {micro_batch_size}."
-        )
-        global_batch_size = micro_batch_size
-    num_iters = num_samples // global_batch_size
-
-    cfg = ConfigContainer(
-        model=provider,
-        train=TrainingConfig(
-            micro_batch_size=micro_batch_size,
-            global_batch_size=global_batch_size,
-            train_iters=num_iters,
-            eval_iters=num_iters,
-            skip_train=True,
-        ),
-        # TODO: Replace validation args in train with validation config in nemo:26.04
-        # validation=ValidationConfig(eval_iters=num_iters, eval_interval=1, skip_train=True),
-        dataset=_get_dataset_cfg(
-            dataset_name,
-            num_samples,
-            provider.seq_length,
-            apply_chat_template=True,
-            tokenizer=tokenizer,
-        ),
-        tokenizer=TokenizerConfig(
-            tokenizer_type="HuggingFaceTokenizer",
-            tokenizer_model=hf_model_name_or_path,
-            # NOTE: Issue with Nemotron Nano v2 tokenizer returning bool hence using use_fast=True as a WAR
-            hf_tokenizer_kwargs={
-                "trust_remote_code": trust_remote_code,
-                "use_fast": tokenizer.is_fast,
-            },
-        ),
-        # Unused
-        optimizer=OptimizerConfig(optimizer="adam", lr=1e-4, use_distributed_optimizer=False),
-        scheduler=SchedulerConfig(lr_decay_style="constant"),
-        logger=LoggerConfig(),
-        checkpoint=CheckpointConfig(),
-    )
-    runtime_config_update(cfg)
-
-    state = GlobalState()
-    state.cfg = cfg
-
-    dataset_provider = get_dataset_provider(cfg.dataset)
-
-    def _train_valid_test_datasets_provider(
-        train_val_test_num_samples: tuple, dataset_cfg: HFDatasetConfig
-    ):
-        return dataset_provider(train_val_test_num_samples, dataset_cfg, tokenizer=state.tokenizer)
-
-    train_data_iterator, _, _ = setup_data_iterators(
-        cfg=cfg,
-        train_state=state.train_state,
-        model_length=len(model),
-        train_valid_test_datasets_provider=_train_valid_test_datasets_provider,
-        dp_group=get_data_parallel_group(),
-    )
-
-    def forward_loop(m):
-        evaluate_and_print_results(
-            state,
-            prefix="iteration 1",
-            forward_step_func=forward_step,
-            data_iterator=train_data_iterator,
-            model=model,
-            config=cfg,
-            verbose=True,
-            write_to_tensorboard=False,
-        )
-
-    return forward_loop
diff --git a/modelopt/torch/utils/plugins/megatron_calibration.py b/modelopt/torch/utils/plugins/megatron_calibration.py
@@ -0,0 +1,135 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Shared calibration forward-loop builder for Megatron-Core models.
+
+Drives a logits-free prefill pass through the model over a calibration dataset,
+producing the ``forward_loop`` callable that ``mtq.quantize`` / ``mtp.prune`` /
+``mtq.calibrate`` expect. Replaces the bespoke calibration loops in
+``Megatron-LM/examples/post_training/modelopt/{quantize,prune}.py``,
+``Megatron-Bridge/examples/quantization/quantize.py``, and
+``examples/megatron_bridge/prune_minitron.py``.
+
+Picks the best primitives from each existing path:
+- ``get_dataset_dataloader`` for dataset surface (HF registry + JSONL auto-detection,
+  multi-source blending, ``pack=True`` for real-token density)
+- ``megatron_prefill(skip_return_logits=True)`` for the forward primitive (no
+  logits compute, just activation flow for hooks)
+- ``get_batch_on_this_cp_rank`` for context-parallel correctness
+"""
+
+from collections.abc import Callable
+from typing import TYPE_CHECKING
+
+import torch
+from megatron.core.utils import get_batch_on_this_cp_rank
+from tqdm import tqdm
+
+from modelopt.torch.utils import distributed as dist
+from modelopt.torch.utils.dataset_utils import get_dataset_dataloader
+
+from .megatron_generate import megatron_prefill
+
+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizerBase
+
+__all__ = ["get_megatron_calibration_forward_loop"]
+
+
+def get_megatron_calibration_forward_loop(
+    tokenizer: "PreTrainedTokenizerBase",
+    *,
+    dataset_name: str | list[str] = "cnn_dailymail",
+    num_samples: int | list[int] = 512,
+    seq_length: int = 512,
+    batch_size: int = 1,
+    pack: bool = True,
+    apply_chat_template: bool = False,
+    device: torch.device | str | None = "cuda",
+) -> Callable[[torch.nn.Module], None]:
+    """Build a Megatron-Core calibration ``forward_loop(model)`` for PTQ / pruning.
+
+    The returned callable iterates a ``get_dataset_dataloader``-produced dataloader,
+    slices each batch to the local context-parallel (CP) rank, and drives a
+    logits-free prefill pass through the model so activation hooks fire.
+
+    Args:
+        tokenizer: HuggingFace tokenizer. ``pad_token`` is set to ``eos_token`` if
+            missing so non-packing tokenization paths don't fail.
+        dataset_name: Dataset key (see :func:`get_supported_datasets`), a path to a
+            ``.jsonl`` file, or a list mixing the two. Multi-source blends are
+            supported when ``pack=True``.
+        num_samples: With ``pack=True``, the number of ``seq_length``-token chunks
+            per source; with ``pack=False``, the number of raw samples (each
+            padded/truncated). May be a list aligned with ``dataset_name``.
+        seq_length: Tokens per row. Under ``pack=True`` (default) every row is
+            exactly this length; under ``pack=False`` it's the truncation /
+            padding target. Matches Megatron-Core's ``seq_length`` convention.
+        batch_size: Calibration micro-batch size. Default ``1`` matches the
+            historical convention. Under ``pack=True`` it is safe to raise this
+            for throughput — every position is a real token (no per-sample
+            padding bias), and attention is computed independently per batch
+            row, so ``batch_size=N`` is forward-equivalent to ``batch_size=1``
+            repeated ``N`` times. Under ``pack=False``, keep ``batch_size=1``
+            to avoid pad-token activations contaminating amax / sensitivity
+            statistics (calibration hooks fire before ``attention_mask`` is applied).
+        pack: Forwarded to :func:`get_dataset_dataloader`. Default ``True`` here
+            (vs. ``False`` in the underlying loader) because every Megatron
+            calibration call site we know of benefits from packing — long
+            documents stop being truncated and padding stops contaminating
+            activation statistics.
+        apply_chat_template: Forwarded to :func:`get_dataset_dataloader`.
+        device: Forwarded to :func:`get_dataset_dataloader`.
+
+    Returns:
+        A ``forward_loop(model)`` callable to pass into ``mtq.quantize``,
+        ``mtp.prune``, or ``mtq.calibrate``.
+
+    Example::
+
+        import modelopt.torch.quantization as mtq
+        from modelopt.torch.utils.plugins.megatron_calibration import (
+            get_megatron_calibration_forward_loop,
+        )
+
+        forward_loop = get_megatron_calibration_forward_loop(
+            tokenizer,
+            dataset_name="cnn_dailymail",
+            num_samples=1024,
+            seq_length=512,
+        )
+        mtq.quantize(unwrapped_model, mtq_config, forward_loop)
+    """
+    if getattr(tokenizer, "pad_token", None) is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    dataloader = get_dataset_dataloader(
+        dataset_name=dataset_name,
+        tokenizer=tokenizer,
+        batch_size=batch_size,
+        num_samples=num_samples,
+        max_sample_length=seq_length,
+        device=device,
+        apply_chat_template=apply_chat_template,
+        pack=pack,
+    )
+
+    def _forward_loop(model: torch.nn.Module) -> None:
+        for sample in tqdm(dataloader, disable=not dist.is_master()):
+            # CP shard slicing is a no-op under CP=1 and required under CP>1.
+            sample = get_batch_on_this_cp_rank(sample)
+            megatron_prefill(model, sample["input_ids"], skip_return_logits=True)
+
+    return _forward_loop