diff --git a/.github/workflows/example_tests.yml b/.github/workflows/example_tests.yml index 71836a64d2..e316618852 100644 --- a/.github/workflows/example_tests.yml +++ b/.github/workflows/example_tests.yml @@ -125,7 +125,7 @@ jobs: strategy: &nemo_strategy fail-fast: false matrix: - example: [megatron_bridge, puzzletron] + example: [megatron_bridge] uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: diff --git a/examples/megatron_bridge/README.md b/examples/megatron_bridge/README.md index 8d7f9b840c..b261775415 100644 --- a/examples/megatron_bridge/README.md +++ b/examples/megatron_bridge/README.md @@ -92,7 +92,7 @@ This section shows how to distill a student model from a teacher model in the Me This can be used stand-alone or after [Pruning](#pruning) / [Post-Training Quantization](#post-training-quantization) to recover accuracy of the model by distilling from the original model (teacher). -The [distill.py](distill.py) script loads student and teacher models from HuggingFace checkpoints and saves the distilled model to `/checkpoints` in Megatron distributed checkpoint format. +The [distill.py](distill.py) script supports both standard HuggingFace checkpoints and [Puzzletron AnyModel](../puzzletron/README.md) checkpoints as student/teacher inputs. Just pass the checkpoint path via `--student_hf_path` / `--teacher_hf_path`. The distilled model is saved to `/checkpoints` in Megatron distributed checkpoint format. ### Data Preparation @@ -158,9 +158,22 @@ torchrun --nproc_per_node 8 distill.py \ To run the distillation script on a Slurm cluster for multi-node training, you just need use `python` instead of `torchrun` and set the number of nodes using `#SBATCH --nodes=` clause in your Slurm script. 
-### Convert Megatron checkpoint to Hugging Face format +### Converting to Hugging Face format (optional) -To convert the Megatron checkpoint from last iteration (or any intermediate iteration) to Hugging Face format, you need the pruned model config (`--output_hf_path` from `prune_minitron.py` script) and the distilled megatron checkpoint dir (`/checkpoints/iter_`) to run the following command: +The distilled checkpoint is saved in Megatron distributed format. If you need a HuggingFace checkpoint, there are two ways to convert it: + +**Inline** -- add `--hf_export_path` and `--student_hf_model` to the `distill.py` command to automatically convert the final checkpoint after distillation: + +```bash +torchrun --nnodes 1 --nproc_per_node 8 distill.py \ + ... \ + --hf_export_path /path/to/save/distilled_hf_ckpt \ + --student_hf_model Qwen/Qwen3-4B +``` + +`--student_hf_model` should match the base architecture of the student (used as a template for export). For non-Puzzletron (i.e. standard) models, it should be the same as `--student_hf_path`. + +**Separate conversion** -- convert any saved iteration using the Megatron-Bridge conversion script: + +```bash uv run python /opt/Megatron-Bridge/examples/conversion/convert_checkpoints.py export \ @@ -169,7 +182,11 @@ uv run python /opt/Megatron-Bridge/examples/conversion/convert_checkpoints.py ex --hf-path ``` -For more details, you can refer to the checkpoint conversion scripts in the [Megatron-Bridge README](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/examples/conversion). +For more details, see the [Megatron-Bridge conversion README](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/examples/conversion). + +### Distillation Results + +See [results/puzzletron.md](results/puzzletron.md) for MMLU results demonstrating knowledge distillation on Puzzletron-compressed student models. 
## Post-Training Quantization diff --git a/examples/megatron_bridge/distill.py b/examples/megatron_bridge/distill.py index f725fa07ac..9b22612d11 100644 --- a/examples/megatron_bridge/distill.py +++ b/examples/megatron_bridge/distill.py @@ -15,17 +15,22 @@ """Distillation script for Megatron-Bridge. Loads student and teacher models directly from HuggingFace checkpoints (local or remote) and saves the distilled model -to `/checkpoints` in megatron distributed checkpoint format. +to `/checkpoints` in megatron distributed checkpoint or HuggingFace format. See `README.md` in this directory for example usage and data preparation instructions. """ import argparse +import contextlib import os +from dataclasses import fields import torch from megatron.bridge import AutoBridge -from megatron.bridge.models.distillation_provider import convert_to_distillation_provider +from megatron.bridge.models.distillation_provider import ( + DistillationProvider, + convert_to_distillation_provider, +) from megatron.bridge.recipes.utils.optimizer_utils import ( distributed_fused_adam_with_cosine_annealing, ) @@ -43,13 +48,50 @@ from megatron.bridge.training.post_training.distillation import ModelOptDistillConfig from megatron.core.datasets.utils import get_blend_from_list from megatron.core.distributed import DistributedDataParallelConfig +from transformers import AutoConfig import modelopt.torch.utils.distributed as dist from modelopt.torch.utils import print_rank_0 +with contextlib.suppress(ImportError): + import modelopt.torch.puzzletron.plugins.mbridge # noqa: F401 + SEED = 1234 +def _patched_to_cfg_dict(self): + """Patched DistillationProvider.to_cfg_dict method for heterogeneous teacher and student models. + + TODO: Upstream this patch to Megatron-Bridge. 
+ """ + from megatron.bridge.training.utils.config_utils import _ConfigContainerBase + + result = {"_target_": f"{self._super_class.__module__}.{self._super_class.__qualname__}"} + # Use fields from the actual student provider class, not DistillationProvider. + # DistillationProvider's __dataclass_fields__ only includes TransformerConfig fields + # (set at class definition time), missing GPTModelProvider-level fields like + # vocab_size, share_embeddings_and_output_weights, etc. + excluded_fields = {"teacher", "kd_config"} + for field in fields(self._super_class): + if field.name.startswith("_") or field.name in excluded_fields: + continue + if hasattr(self, field.name): + result[field.name] = _ConfigContainerBase._convert_value_to_dict( + getattr(self, field.name) + ) + for field in fields(self): + if field.name.startswith("_") or field.name in excluded_fields: + continue + if field.name not in result: + result[field.name] = _ConfigContainerBase._convert_value_to_dict( + getattr(self, field.name) + ) + return result + + +DistillationProvider.to_cfg_dict = _patched_to_cfg_dict + + def get_args(): """Parse command-line arguments.""" parser = argparse.ArgumentParser(description="Distillation for Megatron-Bridge.") @@ -124,12 +166,33 @@ def get_args(): ) parser.add_argument("--wandb_entity", type=str, help="Wandb entity name (optional)") parser.add_argument("--wandb_exp_name", type=str, help="Wandb experiment name (optional)") + # Export arguments + parser.add_argument( + "--hf_export_path", + type=str, + default=None, + help=( + "Path where to save the HuggingFace export. " + "If provided, exports last iteration checkpoint to HF format after distillation." + ), + ) + parser.add_argument( + "--student_hf_model", + type=str, + required=False, + default=None, + help="HuggingFace model ID to use as template for export (e.g., Qwen/Qwen3-0.6B). 
" + "Should match the base architecture of the student model if --hf_export_path is provided.", + ) args = parser.parse_args() # Sanity checks if not args.use_mock_data and not args.data_paths: raise ValueError("Must provide either --data_paths or set --use_mock_data.") + if args.hf_export_path and not args.student_hf_model: + raise ValueError("Must provide --student_hf_model if --hf_export_path is provided.") + print_rank_0("\n==================== Arguments ====================") for k, v in args.__dict__.items(): print_rank_0(f"{k:<35} {v}") @@ -252,9 +315,35 @@ def _build_model_provider(hf_path): print_rank_0("\nStarting distillation...") distill(config) print_rank_0( - f"\nDistillation done! Saved checkpoint to {checkpoint_dir} in megatron distributed checkpoint format.\n" + f"\nDistillation done! Saved checkpoint to {checkpoint_dir}" + " in megatron distributed checkpoint format.\n" ) + if args.hf_export_path: + print_rank_0(f"Exporting final distilled ckpt to HF format to {args.hf_export_path}") + # Save rank before destroying process group (dist.rank() won't work after destruction) + is_rank_0 = dist.rank() == 0 + + # Destroy process group on all ranks -- export_ckpt will create its own temporary one. + # This prevents cleanup from hanging (cleanup tries to barrier, but rank 0 would be gone). 
+ dist.cleanup() + + if is_rank_0: + export_bridge = AutoBridge.from_hf_pretrained( + args.student_hf_model, trust_remote_code=args.trust_remote_code + ) + # Copy weights and remote code + export_bridge.export_ckpt( + megatron_path=f"{checkpoint_dir}/iter_{args.train_iters:07d}", + hf_path=args.hf_export_path, + show_progress=True, + strict=True, + ) + # Copy config.json from student_hf_path (handles both local paths and HF model IDs) + AutoConfig.from_pretrained( + args.student_hf_path, trust_remote_code=args.trust_remote_code + ).save_pretrained(args.hf_export_path) + if __name__ == "__main__": dist.setup() diff --git a/examples/megatron_bridge/results/puzzletron.md b/examples/megatron_bridge/results/puzzletron.md new file mode 100644 index 0000000000..89ba114f58 --- /dev/null +++ b/examples/megatron_bridge/results/puzzletron.md @@ -0,0 +1,42 @@ +# Puzzletron Distillation Results + +The following MMLU results demonstrate knowledge distillation on student models that were first compressed using [Puzzletron](../../puzzletron/README.md). The original (uncompressed) model serves as the teacher, and distillation recovers accuracy lost during compression. + +## Qwen3-8B compressed to 80% of original + +The student was created by compressing Qwen3-8B to 80% of its original size using Puzzletron. + +| Model | MMLU | Humanities | Other | Social Sci | STEM | +|-------|------|------------|-------|------------|------| +| Student (before distillation) | 0.5910 | 0.5046 | 0.6363 | 0.6831 | 0.5855 | +| Student (after distillation) | 0.6921 | 0.5906 | 0.7316 | 0.7975 | 0.7016 | +| Teacher (original Qwen3-8B) | 0.7493 | 0.6648 | 0.7856 | 0.8385 | 0.7526 | + +MMLU accuracy improved from 59.10% to 69.21% (+10.11 pp) after distillation with just 100 iterations on WikiText-103, recovering 64% of the gap to the teacher model. 
+ +## Llama-3.1-8B-Instruct compressed to 50% of original + +The student was created by compressing Llama-3.1-8B-Instruct to 50% of its original size using Puzzletron. + +| Model | MMLU | Humanities | Other | Social Sciences | STEM | +|-------|------|------------|-------|-----------------|------| +| Student (before distillation) | 0.2316 | 0.2462 | 0.2292 | 0.2250 | 0.2274 | +| Student (after distillation) | 0.2960 | 0.3146 | 0.3085 | 0.2925 | 0.2768 | +| Teacher (original Llama-3.1-8B-Instruct) | 0.6839 | 0.7231 | 0.7038 | 0.7667 | 0.5911 | + +## Llama-3.1-8B-Instruct compressed to 69% of original (regression) + +The student was created by compressing Llama-3.1-8B-Instruct to ~69% of its original size using Puzzletron. This example shows regression due to overfitting on the small WikiText-103 dataset (100 iterations). MMLU was evaluated on a subset of 100 samples per task: + +| Model | MMLU | Humanities | Other | Social Sciences | STEM | +|-------|------|------------|-------|-----------------|------| +| Student (before distillation) | 0.6626 | 0.7069 | 0.6892 | 0.7525 | 0.5574 | +| Student (after distillation) | 0.6496 | 0.6862 | 0.6677 | 0.7433 | 0.5532 | +| Teacher (original Llama-3.1-8B-Instruct) | 0.6839 | 0.7231 | 0.7038 | 0.7667 | 0.5911 | + +MMLU decreased from 66.26% to 64.96% (-1.30 pp) -- the model overfitted to WikiText-103. This highlights the importance of using larger, more diverse datasets for distillation. + +## Recommendations + +- **Use larger datasets** for production distillation (e.g., [Nemotron-Pretraining-SFT-v1](https://huggingface.co/datasets/nvidia/Nemotron-Pretraining-SFT-v1)) to avoid overfitting as shown in the regression case above. +- **Train for more iterations** to ensure proper convergence. 
diff --git a/examples/puzzletron/README.md b/examples/puzzletron/README.md index 69da4c14c7..322b082c12 100644 --- a/examples/puzzletron/README.md +++ b/examples/puzzletron/README.md @@ -299,7 +299,7 @@ vllm bench throughput --model path/to/model --input-len 2000 --output-len 100 -- To recover degradation in the quality of the compressed model, we can use knowledge distillation. This allows transferring the capabilities of the original model to the pruned one. -See [mbridge_distillation/README.md](./mbridge_distillation/README.md) for instructions on using Megatron-Bridge for knowledge distillation. +See [Megatron-Bridge distillation](../megatron_bridge/README.md#distillation) for instructions on using Megatron-Bridge for knowledge distillation. The distillation script supports both standard HuggingFace and Puzzletron AnyModel checkpoints. ## Advanced Usage diff --git a/examples/puzzletron/mbridge_distillation/README.md b/examples/puzzletron/mbridge_distillation/README.md deleted file mode 100644 index 9658e48ebc..0000000000 --- a/examples/puzzletron/mbridge_distillation/README.md +++ /dev/null @@ -1,152 +0,0 @@ -# Knowledge Distillation with Megatron-Bridge - -This guide shows how to perform knowledge distillation on Puzzletron-compressed AnyModel checkpoints using Megatron-Bridge. - -## Overview - -1. Set up the environment with Megatron-Bridge -2. Prepare tokenized dataset -3. Run knowledge distillation training directly from HuggingFace checkpoints -4. 
Review MMLU evaluation results (before/after distillation) - -## Setup - -**Clone Model-Optimizer repo:** - -The NeMo container does not include Model-Optimizer examples, so you need to clone the Model-Optimizer repo: - -```bash -export MODELOPT_DIR=${PWD}/Model-Optimizer -git clone https://github.com/NVIDIA/Model-Optimizer.git ${MODELOPT_DIR} -``` - -**Start Docker container:** - -Use the [NeMo 26.02 container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo?version=26.02): - -```bash -# Recommended to mount a workspace directory for storing datasets and distilled models -docker run --gpus all -it --rm \ - -v /path/to/your/project:/workspace \ - -v ${MODELOPT_DIR}:/opt/Model-Optimizer \ - -v ${MODELOPT_DIR}/modelopt:/opt/venv/lib/python3.12/site-packages/modelopt \ - -w /opt/Model-Optimizer \ - nvcr.io/nvidia/nemo:26.02 \ - /bin/bash -``` - -## Dataset Preparation - -This section describes how to prepare datasets for knowledge distillation. We provide examples using WikiText-103, which is a small dataset that can still produce decent results (see the Qwen3-8B example below showing +10.11 percentage point improvement). For production use, larger datasets like [Nemotron-Post-Training-Dataset-v2](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2) are recommended. - -### Download and Tokenize Dataset - -Download and tokenize the dataset in a single step. 
This downloads the dataset from HuggingFace, tokenizes it, and saves it in the Megatron format (`.bin` and `.idx` files): - -```bash -python -m modelopt.torch.utils.plugins.megatron_preprocess_data \ - --hf_dataset Salesforce/wikitext \ - --hf_name wikitext-103-v1 \ - --hf_split train \ - --output_dir path/to/hf_datasets/wikitext-103-v1 \ - --tokenizer meta-llama/Llama-3.1-8B-Instruct \ - --json_keys text \ - --workers 32 -``` - -This will create: - -- `Salesforce--wikitext_wikitext-103-v1_train_text_document.bin` - Binary tokenized data -- `Salesforce--wikitext_wikitext-103-v1_train_text_document.idx` - Index file for the binary data -- `Salesforce--wikitext_wikitext-103-v1_train_text_document/cache/` - Cache directory (created after running distillation) - -## Run Knowledge Distillation - -Run distillation directly from HuggingFace checkpoints (student and teacher) with tokenized dataset: - -```bash -torchrun --nproc_per_node=8 examples/puzzletron/mbridge_distillation/distill_hf.py \ - --student_hf_path /path/to/student/puzzletron/checkpoint \ - --student_hf_model meta-llama/Llama-3.1-8B-Instruct \ - --teacher_hf_path /path/to/teacher/huggingface/checkpoint \ - --data_paths 1.0 /path/to/hf_datasets/wikitext-103-v1/Salesforce--wikitext_wikitext-103-v1_train_text_document \ - --output_dir /path/to/distilled/checkpoint \ - --hf_export_path /path/to/exported/hf/model \ - --seq_length 4096 \ - --tp_size 8 \ - --pp_size 1 \ - --mbs 1 \ - --gbs 4 \ - --train_iters 100 \ - --lr 0.0001 \ - --min_lr 1e-05 \ - --lr_warmup_iters 10 \ - --eval_interval 10 \ - --eval_iters 10 \ - --log_interval 1 -``` - -**Notes:** - -- Add `--trust_remote_code` if student or teacher checkpoints need HuggingFace custom modeling code. -- The distilled Megatron-Bridge checkpoint will be saved to `--output_dir/checkpoints/iter_`. -- Add `--hf_export_path` to automatically export the final checkpoint to HuggingFace format after distillation. 
When exporting, you must also provide `--student_hf_model` as the HuggingFace model ID for the export template (e.g., `meta-llama/Llama-3.1-8B-Instruct`). It should match the base architecture of the student model. The exported model can be evaluated for accuracy using the evaluation tools described in the main [README.md](../README.md#evaluation). -- For production use, use larger datasets like [Nemotron-Pretraining-SFT-v1](https://huggingface.co/datasets/nvidia/Nemotron-Pretraining-SFT-v1) and train for more iterations. See the [Megatron-Bridge distillation tutorial](https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/megatron_bridge#distillation) for best practices. - -## MMLU Evaluation Results - -This section presents MMLU evaluation results for knowledge distillation experiments compressing Qwen3-8B and Llama-3.1-8B-Instruct. - -### Successful Case: Qwen3-8B (80% of original) - -Distillation results for a memory-compressed Qwen3-8B checkpoint (80% of original size): - -| Model | MMLU | Humanities | Other | Social Sci | STEM | -|-------|------|------------|-------|------------|------| -| 80% pre-distillation | 0.5910 | 0.5046 | 0.6363 | 0.6831 | 0.5855 | -| 80% post-distillation | 0.6921 | 0.5906 | 0.7316 | 0.7975 | 0.7016 | -| Original Qwen3-8B | 0.7493 | 0.6648 | 0.7856 | 0.8385 | 0.7526 | - -**Key observations:** - -- MMLU accuracy improved from 59.10% to 69.21% (+10.11 percentage points) after distillation -- Achieved with just 100 iterations on WikiText-103, demonstrating efficient knowledge transfer -- Recovery of 64% of the gap to the teacher model (from 59.10% to 69.21%, closing 64% of the gap from 59.10% to 74.93%) -- All individual category scores (Humanities, Other, Social Sciences, STEM) improved significantly - -### Successful Case: Llama-3.1-8B-Instruct (50% of original, 56,810 MiB) - -Distillation results for a pruned Llama-3.1-8B-Instruct checkpoint (50% of original size, 56,810 MiB memory constraint): - -| Model | MMLU | Humanities | 
Other | Social Sciences | STEM | -|-------|------|------------|-------|-----------------|------| -| Before distillation | 0.2316 | 0.2462 | 0.2292 | 0.2250 | 0.2274 | -| After distillation | 0.2960 | 0.3146 | 0.3085 | 0.2925 | 0.2768 | -| Original Llama-3.1-8B-Instruct | 0.6839 | 0.7231 | 0.7038 | 0.7667 | 0.5911 | - -**Key observations:** - -- MMLU accuracy (average across all categories) improved from 23.16% to 29.60% (+6.44 percentage points) -- All individual category scores (Humanities, Other, Social Sciences, STEM) improved, demonstrating effective knowledge transfer from teacher to student - -### Regression Case: Llama-3.1-8B-Instruct (69% of original, 78,000 MiB) - -Distillation results for a pruned Llama-3.1-8B-Instruct checkpoint (approximately 69% of original size, 78,000 MiB memory constraint) showing regression due to overfitting on the small WikiText-103 dataset (evaluated with limit 100): - -| Model | MMLU | Humanities | Other | Social Sciences | STEM | -|-------|------|------------|-------|-----------------|------| -| Before distillation | 0.6626 | 0.7069 | 0.6892 | 0.7525 | 0.5574 | -| After distillation | 0.6496 | 0.6862 | 0.6677 | 0.7433 | 0.5532 | -| Original Llama-3.1-8B-Instruct | 0.6839 | 0.7231 | 0.7038 | 0.7667 | 0.5911 | - -**Key observations:** - -- MMLU accuracy (average across all categories) decreased from 66.26% to 64.96% (-1.30 percentage points) after distillation -- The model overfitted to the small WikiText-103 dataset, causing performance regression -- This demonstrates the critical importance of using larger, more diverse datasets for knowledge distillation - -### Recommendations - -- **For production distillation:** Use larger production datasets like [nvidia/Nemotron-Pretraining-SFT-v1](https://huggingface.co/datasets/nvidia/Nemotron-Pretraining-SFT-v1) for better results and to avoid overfitting (see regression case above) -- **Training duration:** Train for more iterations to ensure proper convergence -- **See the 
[Megatron-Bridge distillation tutorial](https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/megatron_bridge#distillation) for best practices** diff --git a/examples/puzzletron/mbridge_distillation/distill_hf.py b/examples/puzzletron/mbridge_distillation/distill_hf.py deleted file mode 100644 index 75db004128..0000000000 --- a/examples/puzzletron/mbridge_distillation/distill_hf.py +++ /dev/null @@ -1,300 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Distillation script for Megatron-Bridge. - -Loads student and teacher models directly from HuggingFace checkpoints (local or remote) and saves the distilled model -to `/checkpoints` in megatron distributed checkpoint format. - -See `README.md` in this directory for example usage and data preparation instructions. 
-""" - -import argparse -import os -import shutil - -import torch -from megatron.bridge import AutoBridge -from megatron.bridge.models.distillation_provider import convert_to_distillation_provider -from megatron.bridge.recipes.utils.optimizer_utils import ( - distributed_fused_adam_with_cosine_annealing, -) -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - GPTDatasetConfig, - LoggerConfig, - MockGPTDatasetConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.distill import distill -from megatron.bridge.training.post_training.distillation import ModelOptDistillConfig -from megatron.core.datasets.utils import get_blend_from_list -from megatron.core.distributed import DistributedDataParallelConfig - -# Import to register heterogeneous bridges (side effect) -import modelopt.torch.puzzletron # noqa: F401 -import modelopt.torch.utils.distributed as dist -from modelopt.torch.utils import print_rank_0 - -SEED = 1234 - - -def get_args(): - """Parse command-line arguments.""" - parser = argparse.ArgumentParser(description="Distillation for Megatron-Bridge.") - # Model arguments (accepts HuggingFace input only at the moment) - parser.add_argument( - "--student_hf_path", - type=str, - required=True, - help="HuggingFace model name or path for the student (standard HF format or puzzletron any_model format)", - ) - parser.add_argument( - "--teacher_hf_path", - type=str, - required=True, - help="HuggingFace model name or path for the teacher (standard HF format or puzzletron any_model format)", - ) - parser.add_argument("--trust_remote_code", action="store_true", help="Trust remote code") - # Parallelism arguments - parser.add_argument("--tp_size", type=int, default=1, help="Tensor parallel size") - parser.add_argument("--pp_size", type=int, default=1, help="Pipeline parallel size") - # Dataset arguments - parser.add_argument( - "--data_paths", - nargs="+", - help="List of tokenized data paths to load from 
(weight1 path1 weight2 path2 ...)", - ) - parser.add_argument( - "--split", type=str, default="99,1,0", help="Train,Val,Test ratios to split data" - ) - parser.add_argument( - "--data_path_to_cache", type=str, default=None, help="Path to cache the dataset indices" - ) - parser.add_argument( - "--use_mock_data", action="store_true", help="Use mock data instead of --data_paths" - ) - # Training & Eval arguments - parser.add_argument( - "--output_dir", type=str, required=True, help="Folder for logging and checkpoint saving" - ) - parser.add_argument( - "--seq_length", - type=int, - default=4096, - help="Number of tokens per input sample. Use 8192 if your dataset has longer sequences.", - ) - parser.add_argument("--mbs", type=int, default=1, help="Micro-batch Size") - parser.add_argument("--gbs", type=int, default=768, help="Global Batch Size") - parser.add_argument( - "--train_iters", type=int, required=True, help="Number of training iterations" - ) - parser.add_argument("--lr", type=float, default=1e-4, help="Peak learning rate") - parser.add_argument("--min_lr", type=float, default=1e-5, help="Minimum learning rate") - parser.add_argument("--lr_warmup_iters", type=int, default=50, help="Number of LR warmup steps") - parser.add_argument( - "--eval_interval", type=int, default=100, help="Validate + checkpoint every steps" - ) - parser.add_argument( - "--eval_iters", type=int, default=32, help="Number of batches per validation stage" - ) - # Logging arguments - parser.add_argument("--log_interval", type=int, default=10, help="Write to log every steps") - parser.add_argument( - "--wandb_project", type=str, help="Wandb project name (required to enable Wandb logging)" - ) - parser.add_argument("--wandb_entity", type=str, help="Wandb entity name (optional)") - parser.add_argument("--wandb_exp_name", type=str, help="Wandb experiment name (optional)") - # Export arguments - parser.add_argument( - "--hf_export_path", - type=str, - default=None, - help=( - "Path where to save 
the HuggingFace export. " - "If provided, exports last iteration checkpoint to HF format after distillation." - ), - ) - parser.add_argument( - "--student_hf_model", - type=str, - required=False, - default=None, - help="HuggingFace model ID to use as template for export (e.g., Qwen/Qwen3-0.6B). " - "Should match the base architecture of the student model if --hf_export_path is provided.", - ) - args = parser.parse_args() - - # Sanity checks - if not args.use_mock_data and not args.data_paths: - raise ValueError("Must provide either --data_paths or set --use_mock_data.") - - if args.hf_export_path and not args.student_hf_model: - raise ValueError("Must provide --student_hf_model if --hf_export_path is provided.") - - print_rank_0("\n==================== Arguments ====================") - for k, v in args.__dict__.items(): - print_rank_0(f"{k:<35} {v}") - print_rank_0("===================================================\n") - - return args - - -def main(args: argparse.Namespace): - checkpoint_dir = os.path.join(args.output_dir, "checkpoints") - tensorboard_dir = os.path.join(args.output_dir, "tb_logs") - - # Build student and teacher model providers - def _build_model_provider(hf_path): - bridge = AutoBridge.from_hf_pretrained(hf_path, trust_remote_code=args.trust_remote_code) - provider = bridge.to_megatron_provider(load_weights=True) - - # Override parallelism / training settings - provider.tensor_model_parallel_size = args.tp_size - provider.pipeline_model_parallel_size = args.pp_size - provider.context_parallel_size = 1 - provider.sequence_parallel = args.tp_size > 1 - provider.seq_length = args.seq_length - provider.pipeline_dtype = torch.bfloat16 - return provider - - # TODO: Support megatron-ckpt as an alternative to HF checkpoints (e.g. 
/path/to/ckpt/iter_0000000) - # Still requires an HF model name or path to build provider correctly - student_provider = _build_model_provider(args.student_hf_path) - teacher_provider = _build_model_provider(args.teacher_hf_path) - - # Wrap into DistillationProvider - kd_config = ModelOptDistillConfig() - distill_provider = convert_to_distillation_provider( - student_provider, teacher_provider, kd_config - ) - - # Build optimizer and scheduler - optimizer_config, scheduler_config = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=args.lr_warmup_iters, - max_lr=args.lr, - min_lr=args.min_lr, - adam_beta2=0.98, - ) - - # Build dataset config - dataset_kwargs = { - "seq_length": args.seq_length, - "path_to_cache": args.data_path_to_cache, - "random_seed": SEED, - "reset_attention_mask": False, - "reset_position_ids": False, - "eod_mask_loss": False, - "num_dataset_builder_threads": 1, - "data_sharding": True, - "dataloader_type": "single", - "skip_getting_attention_mask_from_dataset": True, - } - if args.use_mock_data: - dataset_config = MockGPTDatasetConfig(**dataset_kwargs) - else: - # Convert flat CLI list (e.g. 
["1.0", "/path/data"]) to Megatron blend format - blend = get_blend_from_list(args.data_paths) - dataset_config = GPTDatasetConfig(blend=blend, split=args.split, **dataset_kwargs) - - # Assemble ConfigContainer and run distillation - config = ConfigContainer( - model=distill_provider, - train=TrainingConfig( - train_iters=args.train_iters, - eval_interval=args.eval_interval, - eval_iters=args.eval_iters, - global_batch_size=args.gbs, - micro_batch_size=args.mbs, - manual_gc=True, - manual_gc_interval=100, - ), - # TODO: Replace validation args in train with validation config in nemo:26.04 - # validation=ValidationConfig(eval_interval=args.eval_interval, eval_iters=args.eval_iters), - optimizer=optimizer_config, - scheduler=scheduler_config, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, - use_distributed_optimizer=True, - ), - dataset=dataset_config, - logger=LoggerConfig( - log_interval=args.log_interval, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - # Weights & Biases logging - wandb_project=args.wandb_project, - wandb_entity=args.wandb_entity, # optional - wandb_exp_name=args.wandb_exp_name, - ), - tokenizer=TokenizerConfig( - tokenizer_type="NullTokenizer", vocab_size=distill_provider.vocab_size - ), - checkpoint=CheckpointConfig( - save_interval=args.eval_interval, - save=checkpoint_dir, - load=checkpoint_dir, # Resume from this directory (if exists) - most_recent_k=3, # Keeps 3 most recent checkpoints (not metric-based) - ckpt_format="torch_dist", - async_save=True, - fully_parallel_save=True, - ), - rng=RNGConfig(seed=SEED), - mixed_precision="bf16_mixed", - ) - - print_rank_0("\nStarting distillation...") - distill(config) - print_rank_0(f"\nDistillation done! 
Saved checkpoint to {checkpoint_dir}\n") - - # Export to HuggingFace format if hf_export_path is provided - if args.hf_export_path: - print_rank_0(f"Exporting final distilled ckpt to HF format to {args.hf_export_path}") - # Save rank before destroying process group (dist.rank() won't work after destruction) - is_rank_0 = dist.rank() == 0 - - # Destroy process group on all ranks - export_ckpt will create its own temporary one - # This prevents cleanup from hanging (cleanup tries to barrier, but rank 0 would be gone) - dist.cleanup() - - # Only rank 0 exports - if is_rank_0: - export_bridge = AutoBridge.from_hf_pretrained( - args.student_hf_model, trust_remote_code=args.trust_remote_code - ) - export_bridge.export_ckpt( - megatron_path=f"{checkpoint_dir}/iter_{args.train_iters:07d}", - hf_path=args.hf_export_path, - show_progress=True, - strict=True, - ) - - # save config from student_model to hf_export_path - shutil.copy(f"{args.student_hf_path}/config.json", f"{args.hf_export_path}/config.json") - - -if __name__ == "__main__": - dist.setup() - args = get_args() - try: - main(args) - finally: - dist.cleanup() diff --git a/tests/_test_utils/torch/puzzletron/utils.py b/tests/_test_utils/torch/puzzletron/utils.py index 365325a938..86047cb711 100644 --- a/tests/_test_utils/torch/puzzletron/utils.py +++ b/tests/_test_utils/torch/puzzletron/utils.py @@ -70,17 +70,17 @@ def create_and_save_small_hf_model( hf_model_name: str, hybrid_override_pattern: str | None = None, ): - """ - Create and save a small HuggingFace model for testing the conversion pipeline. + """Create and save a small HuggingFace model for testing the conversion pipeline. + Uses real HuggingFace config to preserve model-specific settings (like tie_word_embeddings), but shrinks size parameters for fast testing. 
Args: - output_path: Where to save the model - tokenizer: Tokenizer to save alongside the model - hf_model_name: HuggingFace model card name (e.g., "meta-llama/Llama-3.1-8B-Instruct") - hybrid_override_pattern: For NemotronH models, the layer type pattern (e.g., "*-" for Attention+MLP, - "M-" for Mamba+MLP). Must match num_hidden_layers. None for non-NemotronH models. + output_path: Where to save the model. + tokenizer: Tokenizer to save alongside the model. + hf_model_name: HuggingFace model card name (e.g., "meta-llama/Llama-3.1-8B-Instruct"). + hybrid_override_pattern: For NemotronH models, the layer type pattern (e.g., "*-" for + Attention+MLP, "M-" for Mamba+MLP). Must match num_hidden_layers. """ # Load real HuggingFace config (preserves tie_word_embeddings, rope_scaling, etc.) config = AutoConfig.from_pretrained(hf_model_name, trust_remote_code=True) diff --git a/tests/examples/megatron_bridge/test_distill.py b/tests/examples/megatron_bridge/test_distill.py index b5a0ca86d6..9f84f50c28 100644 --- a/tests/examples/megatron_bridge/test_distill.py +++ b/tests/examples/megatron_bridge/test_distill.py @@ -18,12 +18,14 @@ from pathlib import Path from _test_utils.examples.run_command import extend_cmd_parts, run_example_command -from _test_utils.torch.transformers_models import create_tiny_qwen3_dir +from _test_utils.torch.puzzletron.utils import create_and_save_small_hf_model +from _test_utils.torch.transformers_models import create_tiny_qwen3_dir, get_tiny_tokenizer + +from modelopt.torch.puzzletron.anymodel import convert_model def test_distill_and_convert(tmp_path: Path, num_gpus): teacher_hf_path = create_tiny_qwen3_dir(tmp_path, with_tokenizer=True) - train_iters = 5 distill_output_dir = tmp_path / "distill_output" distill_cmd_parts = extend_cmd_parts( @@ -32,6 +34,7 @@ def test_distill_and_convert(tmp_path: Path, num_gpus): teacher_hf_path=teacher_hf_path, output_dir=distill_output_dir, tp_size=num_gpus, + pp_size=1, seq_length=32, mbs=1, gbs=4, @@ -63,3 
+66,68 @@ def test_distill_and_convert(tmp_path: Path, num_gpus): check=True, ) assert (distilled_hf_path / "config.json").exists() + + +def test_distill_puzzletron_anymodel(tmp_path: Path, num_gpus): + """Integration test for distill.py with Puzzletron AnyModel (heterogeneous) checkpoints. + + Creates Qwen3 models, converts the student to Puzzletron AnyModel format + (heterogeneous layer architectures), runs mbridge distillation, and exports + the distilled checkpoint to HuggingFace format via --hf_export_path. + """ + student_hf_dir, student_anymodel_dir, teacher_hf_dir = ( + _prepare_puzzletron_anymodel_student_and_teacher(tmp_path) + ) + + train_iters = 5 + output_dir = tmp_path / "distill_output" + hf_export_path = tmp_path / "distilled_anymodel_hf" + cmd_parts = extend_cmd_parts( + ["torchrun", f"--nproc_per_node={num_gpus}", "distill.py", "--use_mock_data"], + student_hf_path=student_anymodel_dir, + teacher_hf_path=teacher_hf_dir, + output_dir=output_dir, + tp_size=num_gpus, + pp_size=1, + seq_length=32, + mbs=1, + gbs=4, + train_iters=train_iters, + lr_warmup_iters=2, + eval_interval=5, + eval_iters=1, + log_interval=1, + hf_export_path=hf_export_path, + student_hf_model=student_hf_dir, + ) + run_example_command(cmd_parts, example_path="megatron_bridge") + + run_config_path = output_dir / "checkpoints" / f"iter_{train_iters:07d}" / "run_config.yaml" + assert run_config_path.exists(), f"Expected run_config.yaml at: {run_config_path}" + + assert (hf_export_path / "config.json").exists(), ( + f"Expected HF export at: {hf_export_path}/config.json" + ) + + +def _prepare_puzzletron_anymodel_student_and_teacher(tmp_path: Path) -> tuple[Path, Path, Path]: + """Create Qwen3 models and convert student to Puzzletron AnyModel format.""" + student_hf_dir = tmp_path / "student_hf" + teacher_hf_dir = tmp_path / "teacher_hf" + + tokenizer = get_tiny_tokenizer() + + create_and_save_small_hf_model( + output_path=str(student_hf_dir), tokenizer=tokenizer, 
hf_model_name="Qwen/Qwen3-0.6B" + ) + + create_and_save_small_hf_model( + output_path=str(teacher_hf_dir), tokenizer=tokenizer, hf_model_name="Qwen/Qwen3-0.6B" + ) + + student_anymodel_dir = tmp_path / "student_anymodel" + convert_model( + input_dir=str(student_hf_dir), output_dir=str(student_anymodel_dir), converter="qwen3" + ) + + return student_hf_dir, student_anymodel_dir, teacher_hf_dir diff --git a/tests/examples/puzzletron/mbridge_distillation/test_distill_hf.py b/tests/examples/puzzletron/mbridge_distillation/test_distill_hf.py deleted file mode 100644 index c8d45693a1..0000000000 --- a/tests/examples/puzzletron/mbridge_distillation/test_distill_hf.py +++ /dev/null @@ -1,142 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tests for distill_hf.py script.""" - -from pathlib import Path - -import torch -from _test_utils.examples.run_command import extend_cmd_parts, run_example_command -from _test_utils.torch.distributed.utils import get_free_port -from _test_utils.torch.puzzletron.utils import create_and_save_small_hf_model -from _test_utils.torch.transformers_models import get_tiny_tokenizer -from transformers import AutoModelForCausalLM - -import modelopt.torch.puzzletron as mtpz - - -def test_distill_hf(project_root_path: Path, tmp_path: Path): - """Integration test for distill_hf.py. 
- - Creates Qwen3 models programmatically, converts them to heterogeneous format (AnyModel), - and runs mbridge distillation. The models are created with reduced size for faster testing. - Models are converted to include block_configs. - """ - # Prepare student and teacher models - student_hf_dir, student_anymodel_dir, teacher_hf_dir, _ = _prepare_student_and_teacher_models( - project_root_path, tmp_path - ) - - output_dir = tmp_path / "distill_output" - hf_export_dir = tmp_path / "hf_export" - - # Build command-line arguments for distill_hf.py - nproc_per_node = torch.cuda.device_count() - tp_size = nproc_per_node - train_iters = 5 - - cmd_parts = [ - "torchrun", - f"--nproc_per_node={nproc_per_node}", - "--master-addr", - "127.0.0.1", - "--master-port", - str(get_free_port()), - "distill_hf.py", - "--use_mock_data", - ] - extend_cmd_parts( - cmd_parts, - student_hf_path=student_anymodel_dir, - teacher_hf_path=teacher_hf_dir, - output_dir=output_dir, - tp_size=tp_size, - pp_size=1, - seq_length=128, - split="99,1,0", - mbs=1, - gbs=4, - train_iters=train_iters, - lr=0.0001, - min_lr=1e-5, - lr_warmup_iters=2, - eval_interval=100, - eval_iters=0, - log_interval=5, - hf_export_path=hf_export_dir, - student_hf_model=student_hf_dir, - ) - - run_example_command(cmd_parts, example_path="puzzletron/mbridge_distillation") - - # Check that distillation checkpoint contains run_config.yaml - run_config_path = output_dir / "checkpoints" / f"iter_{train_iters:07d}" / "run_config.yaml" - assert run_config_path.exists(), f"Expected run_config.yaml to exist at: {run_config_path}" - - # Verify that the distilled model can be loaded in HuggingFace format - model = AutoModelForCausalLM.from_pretrained(hf_export_dir) - assert model is not None, "Failed to load distilled model with AutoModelForCausalLM" - - -def _prepare_student_and_teacher_models( - project_root_path: Path, tmp_path: Path -) -> tuple[Path, Path, Path, Path]: - """Prepare student and teacher models for distillation. 
- - Creates Qwen3 models programmatically, converts them to heterogeneous format (AnyModel), - and returns the paths to the converted checkpoints. - - """ - - # Create temporary directories for models - student_hf_dir = tmp_path / "student_hf" - teacher_hf_dir = tmp_path / "teacher_hf" - - # Create tokenizer (uses local tokenizer from test resources) - tokenizer = get_tiny_tokenizer() - - # Create student model using utility function (loads config from Hub). - # TODO: Make the student model using different ffn sizes across layers. - create_and_save_small_hf_model( - output_path=str(student_hf_dir), - tokenizer=tokenizer, - hf_model_name="Qwen/Qwen3-0.6B", - hybrid_override_pattern=None, - ) - - # Create teacher model (same as student for testing) - create_and_save_small_hf_model( - output_path=str(teacher_hf_dir), - tokenizer=tokenizer, - hf_model_name="Qwen/Qwen3-0.6B", - hybrid_override_pattern=None, - ) - - # Convert models to AnyModel format BEFORE distillation - # This is needed as converted checkpoints will be used as input for distillation later - student_anymodel_dir = tmp_path / "student_anymodel" - teacher_anymodel_dir = tmp_path / "teacher_anymodel" - - mtpz.anymodel.convert_model( - input_dir=str(student_hf_dir), output_dir=str(student_anymodel_dir), converter="qwen3" - ) - - mtpz.anymodel.convert_model( - input_dir=str(teacher_hf_dir), output_dir=str(teacher_anymodel_dir), converter="qwen3" - ) - print("Models converted to AnyModel format:") - print(f" Student AnyModel: {student_anymodel_dir}") - print(f" Teacher AnyModel: {teacher_anymodel_dir}") - - return student_hf_dir, student_anymodel_dir, teacher_hf_dir, teacher_anymodel_dir