# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
| 15 | +"""Distillation script for Megatron-Bridge. |
| 16 | +
|
| 17 | +Loads student and teacher models directly from HuggingFace checkpoints (local or remote) and saves the distilled model |
| 18 | +to <log_dir>/checkpoints in megatron torch_dist checkpoint format. |
| 19 | +
|
| 20 | +Example usage to distill a 4B student from an 8B teacher on 8 GPUs: |
| 21 | +
|
| 22 | +.. code-block:: bash |
| 23 | +
|
| 24 | + torchrun --nproc_per_node 8 distill.py \ |
| 25 | + --teacher_hf_path Qwen/Qwen3-8B \ |
| 26 | + --student_hf_path Qwen/Qwen3-4B \ |
| 27 | + --tp_size 8 \ |
| 28 | + --data_paths 1.0 /path/to/tokenized/data \ |
| 29 | + --seq_length 8192 \ |
| 30 | + --mbs 1 \ |
| 31 | + --gbs 768 \ |
| 32 | + --train_iters 15000 \ |
| 33 | + --lr 1e-4 \ |
| 34 | + --min_lr 1e-5 \ |
| 35 | + --lr_warmup_iters 50 \ |
| 36 | + --eval_interval 100 \ |
| 37 | + --eval_iters 32 \ |
| 38 | + --log_interval 10 \ |
| 39 | + --log_dir /output/qwen3_8b_to_4b_distill |
| 40 | +
|
| 41 | +Example usage to use mock data for quick testing: |
| 42 | +
|
| 43 | +.. code-block:: bash |
| 44 | +
|
| 45 | + torchrun --nproc_per_node 8 distill.py \ |
| 46 | + --teacher_hf_path Qwen/Qwen3-0.6B \ |
| 47 | + --student_hf_path Qwen/Qwen3-0.6B \ |
| 48 | + --tp_size 8 \ |
| 49 | + --use_mock_data \ |
| 50 | + --seq_length 512 \ |
| 51 | + --mbs 1 \ |
| 52 | + --gbs 8 \ |
| 53 | + --train_iters 100 \ |
| 54 | + --log_dir /tmp/test_distill |
| 55 | +
|
| 56 | +If you want to tokenize your own data for a specific tokenizer, you can use the following command: |
| 57 | +
|
| 58 | +.. code-block:: python |
| 59 | +
|
| 60 | + from modelopt.torch.utils.plugins import megatron_preprocess_data |
| 61 | +
|
| 62 | + megatron_preprocess_data( |
| 63 | + input_path="/path/to/your/data.jsonl", |
| 64 | + output_dir="/path/to/tokenized/data", |
| 65 | + tokenizer_name_or_path="Qwen/Qwen3-0.6B", |
| 66 | + json_keys=["text"], |
| 67 | + workers=32, |
| 68 | + log_interval=100000, |
| 69 | + max_sequence_length=256000, |
| 70 | + ) |
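
The tokenized output under ``output_dir`` is what ``--data_paths`` expects, as shown in the first example above.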
| 71 | +""" |
# TODO: Fix resuming distillation from an intermediate checkpoint.

import argparse
import os

import torch
from megatron.bridge import AutoBridge
from megatron.bridge.models.distillation_provider import convert_to_distillation_provider
from megatron.bridge.recipes.utils.optimizer_utils import (
    distributed_fused_adam_with_cosine_annealing,
)
from megatron.bridge.training.config import (
    CheckpointConfig,
    ConfigContainer,
    GPTDatasetConfig,
    LoggerConfig,
    MockGPTDatasetConfig,
    RNGConfig,
    TokenizerConfig,
    TrainingConfig,
)
from megatron.bridge.training.distill import distill
from megatron.bridge.training.post_training.distillation import ModelOptDistillConfig
from megatron.core.datasets.utils import get_blend_from_list
from megatron.core.distributed import DistributedDataParallelConfig

import modelopt.torch.utils.distributed as dist
from modelopt.torch.utils import print_rank_0

SEED = 1234


def get_args():
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser(description="Distillation for Megatron-Bridge.")
    # Model arguments
    parser.add_argument(
        "--student_hf_path",
        type=str,
        required=True,
        help="HuggingFace model name or path for the student (e.g. Qwen/Qwen3-0.6B)",
    )
    parser.add_argument(
        "--teacher_hf_path",
        type=str,
        required=True,
        help="HuggingFace model name or path for the teacher (e.g. Qwen/Qwen3-8B)",
    )
    # Parallelism arguments
    parser.add_argument("--tp_size", type=int, default=1, help="Tensor parallel size")
    parser.add_argument("--pp_size", type=int, default=1, help="Pipeline parallel size")
    # Dataset arguments
    parser.add_argument(
        "--data_paths",
        nargs="+",
        help="List of tokenized data paths to load from (weight1 path1 weight2 path2 ...)",
    )
    parser.add_argument(
        "--split", type=str, default="99,1,0", help="Train,Val,Test ratios to split data"
    )
    parser.add_argument(
        "--use_mock_data", action="store_true", help="Use mock data instead of --data_paths"
    )
    # Training arguments
    parser.add_argument(
        "--log_dir", type=str, required=True, help="Folder for logging and checkpoint saving"
    )
    parser.add_argument(
        "--seq_length", type=int, default=8192, help="Number of tokens per input sample"
    )
    parser.add_argument("--mbs", type=int, default=1, help="Micro batch size")
    parser.add_argument("--gbs", type=int, default=768, help="Global batch size")
    parser.add_argument(
        "--train_iters", type=int, required=True, help="Number of training iterations"
    )
    parser.add_argument("--lr", type=float, default=1e-4, help="Peak learning rate")
    parser.add_argument("--min_lr", type=float, default=1e-5, help="Minimum learning rate")
    parser.add_argument("--lr_warmup_iters", type=int, default=50, help="Number of LR warmup steps")
    parser.add_argument(
        "--eval_interval", type=int, default=100, help="Validate + checkpoint every <N> steps"
    )
    parser.add_argument(
        "--eval_iters", type=int, default=32, help="Number of batches per validation stage"
    )
    parser.add_argument("--log_interval", type=int, default=10, help="Write to log every <N> steps")
    args = parser.parse_args()

    # Sanity checks
    if not args.use_mock_data and not args.data_paths:
        raise ValueError("Must provide either --data_paths or set --use_mock_data.")

    print_rank_0("\n==================== Arguments ====================")
    for k, v in args.__dict__.items():
        print_rank_0(f"{k:<35} {v}")
    print_rank_0("===================================================\n")

    return args


def main(args: argparse.Namespace):
    checkpoint_dir = os.path.join(args.log_dir, "checkpoints")
    tensorboard_dir = os.path.join(args.log_dir, "tb_logs")

    # Build student and teacher model providers
    def _build_model_provider(hf_path):
        bridge = AutoBridge.from_hf_pretrained(hf_path)
        provider = bridge.to_megatron_provider(load_weights=True)
        provider.tensor_model_parallel_size = args.tp_size
        provider.pipeline_model_parallel_size = args.pp_size
        provider.context_parallel_size = 1
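        # Enable sequence parallelism whenever tensor parallelism is used.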
        provider.sequence_parallel = args.tp_size > 1
        provider.seq_length = args.seq_length
        provider.pipeline_dtype = torch.bfloat16
        provider.cross_entropy_fusion_impl = "te"
        return provider

    # TODO: Support megatron-ckpt as an alternative to HF checkpoints
    student_provider = _build_model_provider(args.student_hf_path)
    teacher_provider = _build_model_provider(args.teacher_hf_path)

    # Wrap into DistillationProvider
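    # ModelOptDistillConfig is left at its defaults here; adjust it to customize the knowledge-distillation setup.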
    kd_config = ModelOptDistillConfig()
    distill_provider = convert_to_distillation_provider(
        student_provider, teacher_provider, kd_config
    )

    # Build optimizer and scheduler
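    # (cosine annealing from --lr down to --min_lr after --lr_warmup_iters warmup steps)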
    optimizer_config, scheduler_config = distributed_fused_adam_with_cosine_annealing(
        lr_warmup_iters=args.lr_warmup_iters,
        max_lr=args.lr,
        min_lr=args.min_lr,
        adam_beta2=0.98,
    )

    # Build dataset config
    dataset_kwargs = {
        "seq_length": args.seq_length,
        "random_seed": SEED,
        "reset_attention_mask": False,
        "reset_position_ids": False,
        "eod_mask_loss": False,
        "num_dataset_builder_threads": 1,
        "data_sharding": True,
        "dataloader_type": "single",
        "skip_getting_attention_mask_from_dataset": True,
    }
    if args.use_mock_data:
        dataset_config = MockGPTDatasetConfig(**dataset_kwargs)
    else:
        # Convert flat CLI list (e.g. ["1.0", "/path/data"]) to Megatron blend format
        blend = get_blend_from_list(args.data_paths)
        dataset_config = GPTDatasetConfig(blend=blend, split=args.split, **dataset_kwargs)

    # Assemble ConfigContainer and run distillation
    config = ConfigContainer(
        model=distill_provider,
        train=TrainingConfig(
            train_iters=args.train_iters,
            eval_interval=args.eval_interval,
            eval_iters=args.eval_iters,
            global_batch_size=args.gbs,
            micro_batch_size=args.mbs,
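            # Run Python garbage collection manually every 100 iterations instead of relying on automatic GC.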
            manual_gc=True,
            manual_gc_interval=100,
        ),
        optimizer=optimizer_config,
        scheduler=scheduler_config,
        ddp=DistributedDataParallelConfig(
            check_for_nan_in_grad=True,
            grad_reduce_in_fp32=True,
            overlap_grad_reduce=True,
            overlap_param_gather=True,
            average_in_collective=True,
            use_distributed_optimizer=True,
        ),
        dataset=dataset_config,
        logger=LoggerConfig(
            log_interval=args.log_interval,
            tensorboard_dir=tensorboard_dir,
            log_timers_to_tensorboard=True,
        ),
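        # The data is pre-tokenized, so a NullTokenizer with the model's vocab size is sufficient.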
        tokenizer=TokenizerConfig(
            tokenizer_type="NullTokenizer", vocab_size=distill_provider.vocab_size
        ),
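        # finetune=True is expected to load model weights only (no optimizer or iteration state); see the resume TODO above.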
        checkpoint=CheckpointConfig(
            save_interval=args.eval_interval,
            save=checkpoint_dir,
            load=checkpoint_dir,
            ckpt_format="torch_dist",
            fully_parallel_save=True,
            finetune=True,
        ),
        rng=RNGConfig(seed=SEED),
        mixed_precision="bf16_mixed",
    )

    print_rank_0("\nStarting distillation...")
    distill(config)
    print_rank_0(f"\nDistillation done! Saved checkpoint to {checkpoint_dir}\n")


if __name__ == "__main__":
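    # Initialize the distributed environment and always clean it up, even if training fails.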
    dist.setup()
    args = get_args()
    try:
        main(args)
    finally:
        dist.cleanup()