diff --git a/.github/workflows/example_tests.yml b/.github/workflows/example_tests.yml index 71836a64d2..e316618852 100644 --- a/.github/workflows/example_tests.yml +++ b/.github/workflows/example_tests.yml @@ -125,7 +125,7 @@ jobs: strategy: &nemo_strategy fail-fast: false matrix: - example: [megatron_bridge, puzzletron] + example: [megatron_bridge] uses: ./.github/workflows/_example_tests_runner.yml secrets: inherit with: diff --git a/examples/megatron_bridge/README.md b/examples/megatron_bridge/README.md index 8d7f9b840c..b261775415 100644 --- a/examples/megatron_bridge/README.md +++ b/examples/megatron_bridge/README.md @@ -92,7 +92,7 @@ This section shows how to distill a student model from a teacher model in the Me This can be used stand-alone or after [Pruning](#pruning) / [Post-Training Quantization](#post-training-quantization) to recover accuracy of the model by distilling from the original model (teacher). -The [distill.py](distill.py) script loads student and teacher models from HuggingFace checkpoints and saves the distilled model to `/checkpoints` in Megatron distributed checkpoint format. +The [distill.py](distill.py) script supports both standard HuggingFace checkpoints and [Puzzletron AnyModel](../puzzletron/README.md) checkpoints as student/teacher inputs. Just pass the checkpoint path via `--student_hf_path` / `--teacher_hf_path`. The distilled model is saved to `/checkpoints` in Megatron distributed checkpoint format. ### Data Preparation @@ -158,9 +158,22 @@ torchrun --nproc_per_node 8 distill.py \ To run the distillation script on a Slurm cluster for multi-node training, you just need use `python` instead of `torchrun` and set the number of nodes using `#SBATCH --nodes=` clause in your Slurm script. 
-### Convert Megatron checkpoint to Hugging Face format +### Converting to Hugging Face format (optional) -To convert the Megatron checkpoint from last iteration (or any intermediate iteration) to Hugging Face format, you need the pruned model config (`--output_hf_path` from `prune_minitron.py` script) and the distilled megatron checkpoint dir (`/checkpoints/iter_`) to run the following command: +The distilled checkpoint is saved in Megatron distributed format. If you need a HuggingFace checkpoint, there are two ways to convert it: + +**Inline** -- add `--hf_export_path` and `--student_hf_model` to the `distill.py` command to automatically convert the final checkpoint after distillation: + +```bash +torchrun --nnodes 1 --nproc_per_node 8 distill.py \ + ... \ + --hf_export_path /path/to/save/distilled_hf_ckpt \ + --student_hf_model Qwen/Qwen3-4B +``` + +`--student_hf_model` should match the base architecture of the student (used as a template for export). For non-Puzzletron (i.e. standard) models, it should be the same as `--student_hf_path`. + +**Separate conversion** -- convert any saved iteration using the Megatron-Bridge conversion script: + +```bash uv run python /opt/Megatron-Bridge/examples/conversion/convert_checkpoints.py export \ @@ -169,7 +182,11 @@ uv run python /opt/Megatron-Bridge/examples/conversion/convert_checkpoints.py ex --hf-path ``` -For more details, you can refer to the checkpoint conversion scripts in the [Megatron-Bridge README](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/examples/conversion). +For more details, see the [Megatron-Bridge conversion README](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/examples/conversion). + +### Distillation Results + +See [results/puzzletron.md](results/puzzletron.md) for MMLU results demonstrating knowledge distillation on Puzzletron-compressed student models. 
## Post-Training Quantization diff --git a/examples/megatron_bridge/distill.py b/examples/megatron_bridge/distill.py index f725fa07ac..9b22612d11 100644 --- a/examples/megatron_bridge/distill.py +++ b/examples/megatron_bridge/distill.py @@ -15,17 +15,22 @@ """Distillation script for Megatron-Bridge. Loads student and teacher models directly from HuggingFace checkpoints (local or remote) and saves the distilled model -to `/checkpoints` in megatron distributed checkpoint format. +to `/checkpoints` in megatron distributed checkpoint or HuggingFace format. See `README.md` in this directory for example usage and data preparation instructions. """ import argparse +import contextlib import os +from dataclasses import fields import torch from megatron.bridge import AutoBridge -from megatron.bridge.models.distillation_provider import convert_to_distillation_provider +from megatron.bridge.models.distillation_provider import ( + DistillationProvider, + convert_to_distillation_provider, +) from megatron.bridge.recipes.utils.optimizer_utils import ( distributed_fused_adam_with_cosine_annealing, ) @@ -43,13 +48,50 @@ from megatron.bridge.training.post_training.distillation import ModelOptDistillConfig from megatron.core.datasets.utils import get_blend_from_list from megatron.core.distributed import DistributedDataParallelConfig +from transformers import AutoConfig import modelopt.torch.utils.distributed as dist from modelopt.torch.utils import print_rank_0 +with contextlib.suppress(ImportError): + import modelopt.torch.puzzletron.plugins.mbridge # noqa: F401 + SEED = 1234 +def _patched_to_cfg_dict(self): + """Patched DistillationProvider.to_cfg_dict method for heterogeneous teacher and student models. + + TODO: Upstream this patch to Megatron-Bridge. 
+ """ + from megatron.bridge.training.utils.config_utils import _ConfigContainerBase + + result = {"_target_": f"{self._super_class.__module__}.{self._super_class.__qualname__}"} + # Use fields from the actual student provider class, not DistillationProvider. + # DistillationProvider's __dataclass_fields__ only includes TransformerConfig fields + # (set at class definition time), missing GPTModelProvider-level fields like + # vocab_size, share_embeddings_and_output_weights, etc. + excluded_fields = {"teacher", "kd_config"} + for field in fields(self._super_class): + if field.name.startswith("_") or field.name in excluded_fields: + continue + if hasattr(self, field.name): + result[field.name] = _ConfigContainerBase._convert_value_to_dict( + getattr(self, field.name) + ) + for field in fields(self): + if field.name.startswith("_") or field.name in excluded_fields: + continue + if field.name not in result: + result[field.name] = _ConfigContainerBase._convert_value_to_dict( + getattr(self, field.name) + ) + return result + + +DistillationProvider.to_cfg_dict = _patched_to_cfg_dict + + def get_args(): """Parse command-line arguments.""" parser = argparse.ArgumentParser(description="Distillation for Megatron-Bridge.") @@ -124,12 +166,33 @@ def get_args(): ) parser.add_argument("--wandb_entity", type=str, help="Wandb entity name (optional)") parser.add_argument("--wandb_exp_name", type=str, help="Wandb experiment name (optional)") + # Export arguments + parser.add_argument( + "--hf_export_path", + type=str, + default=None, + help=( + "Path where to save the HuggingFace export. " + "If provided, exports last iteration checkpoint to HF format after distillation." + ), + ) + parser.add_argument( + "--student_hf_model", + type=str, + required=False, + default=None, + help="HuggingFace model ID to use as template for export (e.g., Qwen/Qwen3-0.6B). 
" + "Should match the base architecture of the student model if --hf_export_path is provided.", + ) args = parser.parse_args() # Sanity checks if not args.use_mock_data and not args.data_paths: raise ValueError("Must provide either --data_paths or set --use_mock_data.") + if args.hf_export_path and not args.student_hf_model: + raise ValueError("Must provide --student_hf_model if --hf_export_path is provided.") + print_rank_0("\n==================== Arguments ====================") for k, v in args.__dict__.items(): print_rank_0(f"{k:<35} {v}") @@ -252,9 +315,35 @@ def _build_model_provider(hf_path): print_rank_0("\nStarting distillation...") distill(config) print_rank_0( - f"\nDistillation done! Saved checkpoint to {checkpoint_dir} in megatron distributed checkpoint format.\n" + f"\nDistillation done! Saved checkpoint to {checkpoint_dir}" + " in megatron distributed checkpoint format.\n" ) + if args.hf_export_path: + print_rank_0(f"Exporting final distilled ckpt to HF format to {args.hf_export_path}") + # Save rank before destroying process group (dist.rank() won't work after destruction) + is_rank_0 = dist.rank() == 0 + + # Destroy process group on all ranks -- export_ckpt will create its own temporary one. + # This prevents cleanup from hanging (cleanup tries to barrier, but rank 0 would be gone). 
+ dist.cleanup() + + if is_rank_0: + export_bridge = AutoBridge.from_hf_pretrained( + args.student_hf_model, trust_remote_code=args.trust_remote_code + ) + # Copy weights and remote code + export_bridge.export_ckpt( + megatron_path=f"{checkpoint_dir}/iter_{args.train_iters:07d}", + hf_path=args.hf_export_path, + show_progress=True, + strict=True, + ) + # Copy config.json from student_hf_path (handles both local paths and HF model IDs) + AutoConfig.from_pretrained( + args.student_hf_path, trust_remote_code=args.trust_remote_code + ).save_pretrained(args.hf_export_path) + if __name__ == "__main__": dist.setup() diff --git a/examples/megatron_bridge/results/puzzletron.md b/examples/megatron_bridge/results/puzzletron.md new file mode 100644 index 0000000000..89ba114f58 --- /dev/null +++ b/examples/megatron_bridge/results/puzzletron.md @@ -0,0 +1,42 @@ +# Puzzletron Distillation Results + +The following MMLU results demonstrate knowledge distillation on student models that were first compressed using [Puzzletron](../../puzzletron/README.md). The original (uncompressed) model serves as the teacher, and distillation recovers accuracy lost during compression. + +## Qwen3-8B compressed to 80% of original + +The student was created by compressing Qwen3-8B to 80% of its original size using Puzzletron. + +| Model | MMLU | Humanities | Other | Social Sci | STEM | +|-------|------|------------|-------|------------|------| +| Student (before distillation) | 0.5910 | 0.5046 | 0.6363 | 0.6831 | 0.5855 | +| Student (after distillation) | 0.6921 | 0.5906 | 0.7316 | 0.7975 | 0.7016 | +| Teacher (original Qwen3-8B) | 0.7493 | 0.6648 | 0.7856 | 0.8385 | 0.7526 | + +MMLU accuracy improved from 59.10% to 69.21% (+10.11 pp) after distillation with just 100 iterations on WikiText-103, recovering 64% of the gap to the teacher model. 
+ +## Llama-3.1-8B-Instruct compressed to 50% of original + +The student was created by compressing Llama-3.1-8B-Instruct to 50% of its original size using Puzzletron. + +| Model | MMLU | Humanities | Other | Social Sciences | STEM | +|-------|------|------------|-------|-----------------|------| +| Student (before distillation) | 0.2316 | 0.2462 | 0.2292 | 0.2250 | 0.2274 | +| Student (after distillation) | 0.2960 | 0.3146 | 0.3085 | 0.2925 | 0.2768 | +| Teacher (original Llama-3.1-8B-Instruct) | 0.6839 | 0.7231 | 0.7038 | 0.7667 | 0.5911 | + +## Llama-3.1-8B-Instruct compressed to 69% of original (regression) + +The student was created by compressing Llama-3.1-8B-Instruct to ~69% of its original size using Puzzletron. This example shows regression due to overfitting on the small WikiText-103 dataset (100 iterations). MMLU was evaluated on a subset of 100 samples per task: + +| Model | MMLU | Humanities | Other | Social Sciences | STEM | +|-------|------|------------|-------|-----------------|------| +| Student (before distillation) | 0.6626 | 0.7069 | 0.6892 | 0.7525 | 0.5574 | +| Student (after distillation) | 0.6496 | 0.6862 | 0.6677 | 0.7433 | 0.5532 | +| Teacher (original Llama-3.1-8B-Instruct) | 0.6839 | 0.7231 | 0.7038 | 0.7667 | 0.5911 | + +MMLU decreased from 66.26% to 64.96% (-1.30 pp) -- the model overfitted to WikiText-103. This highlights the importance of using larger, more diverse datasets for distillation. + +## Recommendations + +- **Use larger datasets** for production distillation (e.g., [Nemotron-Pretraining-SFT-v1](https://huggingface.co/datasets/nvidia/Nemotron-Pretraining-SFT-v1)) to avoid overfitting as shown in the regression case above. +- **Train for more iterations** to ensure proper convergence. 
diff --git a/examples/puzzletron/README.md b/examples/puzzletron/README.md index 69da4c14c7..322b082c12 100644 --- a/examples/puzzletron/README.md +++ b/examples/puzzletron/README.md @@ -299,7 +299,7 @@ vllm bench throughput --model path/to/model --input-len 2000 --output-len 100 -- To recover degradation in the quality of the compressed model, we can use knowledge distillation. This allows transferring the capabilities of the original model to the pruned one. -See [mbridge_distillation/README.md](./mbridge_distillation/README.md) for instructions on using Megatron-Bridge for knowledge distillation. +See [Megatron-Bridge distillation](../megatron_bridge/README.md#distillation) for instructions on using Megatron-Bridge for knowledge distillation. The distillation script supports both standard HuggingFace and Puzzletron AnyModel checkpoints. ## Advanced Usage diff --git a/examples/puzzletron/mbridge_distillation/README.md b/examples/puzzletron/mbridge_distillation/README.md deleted file mode 100644 index 9658e48ebc..0000000000 --- a/examples/puzzletron/mbridge_distillation/README.md +++ /dev/null @@ -1,152 +0,0 @@ -# Knowledge Distillation with Megatron-Bridge - -This guide shows how to perform knowledge distillation on Puzzletron-compressed AnyModel checkpoints using Megatron-Bridge. - -## Overview - -1. Set up the environment with Megatron-Bridge -2. Prepare tokenized dataset -3. Run knowledge distillation training directly from HuggingFace checkpoints -4. 
Review MMLU evaluation results (before/after distillation) - -## Setup - -**Clone Model-Optimizer repo:** - -The NeMo container does not include Model-Optimizer examples, so you need to clone the Model-Optimizer repo: - -```bash -export MODELOPT_DIR=${PWD}/Model-Optimizer -git clone https://github.com/NVIDIA/Model-Optimizer.git ${MODELOPT_DIR} -``` - -**Start Docker container:** - -Use the [NeMo 26.02 container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo?version=26.02): - -```bash -# Recommended to mount a workspace directory for storing datasets and distilled models -docker run --gpus all -it --rm \ - -v /path/to/your/project:/workspace \ - -v ${MODELOPT_DIR}:/opt/Model-Optimizer \ - -v ${MODELOPT_DIR}/modelopt:/opt/venv/lib/python3.12/site-packages/modelopt \ - -w /opt/Model-Optimizer \ - nvcr.io/nvidia/nemo:26.02 \ - /bin/bash -``` - -## Dataset Preparation - -This section describes how to prepare datasets for knowledge distillation. We provide examples using WikiText-103, which is a small dataset that can still produce decent results (see the Qwen3-8B example below showing +10.11 percentage point improvement). For production use, larger datasets like [Nemotron-Post-Training-Dataset-v2](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2) are recommended. - -### Download and Tokenize Dataset - -Download and tokenize the dataset in a single step. 
This downloads the dataset from HuggingFace, tokenizes it, and saves it in the Megatron format (`.bin` and `.idx` files): - -```bash -python -m modelopt.torch.utils.plugins.megatron_preprocess_data \ - --hf_dataset Salesforce/wikitext \ - --hf_name wikitext-103-v1 \ - --hf_split train \ - --output_dir path/to/hf_datasets/wikitext-103-v1 \ - --tokenizer meta-llama/Llama-3.1-8B-Instruct \ - --json_keys text \ - --workers 32 -``` - -This will create: - -- `Salesforce--wikitext_wikitext-103-v1_train_text_document.bin` - Binary tokenized data -- `Salesforce--wikitext_wikitext-103-v1_train_text_document.idx` - Index file for the binary data -- `Salesforce--wikitext_wikitext-103-v1_train_text_document/cache/` - Cache directory (created after running distillation) - -## Run Knowledge Distillation - -Run distillation directly from HuggingFace checkpoints (student and teacher) with tokenized dataset: - -```bash -torchrun --nproc_per_node=8 examples/puzzletron/mbridge_distillation/distill_hf.py \ - --student_hf_path /path/to/student/puzzletron/checkpoint \ - --student_hf_model meta-llama/Llama-3.1-8B-Instruct \ - --teacher_hf_path /path/to/teacher/huggingface/checkpoint \ - --data_paths 1.0 /path/to/hf_datasets/wikitext-103-v1/Salesforce--wikitext_wikitext-103-v1_train_text_document \ - --output_dir /path/to/distilled/checkpoint \ - --hf_export_path /path/to/exported/hf/model \ - --seq_length 4096 \ - --tp_size 8 \ - --pp_size 1 \ - --mbs 1 \ - --gbs 4 \ - --train_iters 100 \ - --lr 0.0001 \ - --min_lr 1e-05 \ - --lr_warmup_iters 10 \ - --eval_interval 10 \ - --eval_iters 10 \ - --log_interval 1 -``` - -**Notes:** - -- Add `--trust_remote_code` if student or teacher checkpoints need HuggingFace custom modeling code. -- The distilled Megatron-Bridge checkpoint will be saved to `--output_dir/checkpoints/iter_`. -- Add `--hf_export_path` to automatically export the final checkpoint to HuggingFace format after distillation. 
When exporting, you must also provide `--student_hf_model` as the HuggingFace model ID for the export template (e.g., `meta-llama/Llama-3.1-8B-Instruct`). It should match the base architecture of the student model. The exported model can be evaluated for accuracy using the evaluation tools described in the main [README.md](../README.md#evaluation). -- For production use, use larger datasets like [Nemotron-Pretraining-SFT-v1](https://huggingface.co/datasets/nvidia/Nemotron-Pretraining-SFT-v1) and train for more iterations. See the [Megatron-Bridge distillation tutorial](https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/megatron_bridge#distillation) for best practices. - -## MMLU Evaluation Results - -This section presents MMLU evaluation results for knowledge distillation experiments compressing Qwen3-8B and Llama-3.1-8B-Instruct. - -### Successful Case: Qwen3-8B (80% of original) - -Distillation results for a memory-compressed Qwen3-8B checkpoint (80% of original size): - -| Model | MMLU | Humanities | Other | Social Sci | STEM | -|-------|------|------------|-------|------------|------| -| 80% pre-distillation | 0.5910 | 0.5046 | 0.6363 | 0.6831 | 0.5855 | -| 80% post-distillation | 0.6921 | 0.5906 | 0.7316 | 0.7975 | 0.7016 | -| Original Qwen3-8B | 0.7493 | 0.6648 | 0.7856 | 0.8385 | 0.7526 | - -**Key observations:** - -- MMLU accuracy improved from 59.10% to 69.21% (+10.11 percentage points) after distillation -- Achieved with just 100 iterations on WikiText-103, demonstrating efficient knowledge transfer -- Recovery of 64% of the gap to the teacher model (from 59.10% to 69.21%, closing 64% of the gap from 59.10% to 74.93%) -- All individual category scores (Humanities, Other, Social Sciences, STEM) improved significantly - -### Successful Case: Llama-3.1-8B-Instruct (50% of original, 56,810 MiB) - -Distillation results for a pruned Llama-3.1-8B-Instruct checkpoint (50% of original size, 56,810 MiB memory constraint): - -| Model | MMLU | Humanities | 
Other | Social Sciences | STEM | -|-------|------|------------|-------|-----------------|------| -| Before distillation | 0.2316 | 0.2462 | 0.2292 | 0.2250 | 0.2274 | -| After distillation | 0.2960 | 0.3146 | 0.3085 | 0.2925 | 0.2768 | -| Original Llama-3.1-8B-Instruct | 0.6839 | 0.7231 | 0.7038 | 0.7667 | 0.5911 | - -**Key observations:** - -- MMLU accuracy (average across all categories) improved from 23.16% to 29.60% (+6.44 percentage points) -- All individual category scores (Humanities, Other, Social Sciences, STEM) improved, demonstrating effective knowledge transfer from teacher to student - -### Regression Case: Llama-3.1-8B-Instruct (69% of original, 78,000 MiB) - -Distillation results for a pruned Llama-3.1-8B-Instruct checkpoint (approximately 69% of original size, 78,000 MiB memory constraint) showing regression due to overfitting on the small WikiText-103 dataset (evaluated with limit 100): - -| Model | MMLU | Humanities | Other | Social Sciences | STEM | -|-------|------|------------|-------|-----------------|------| -| Before distillation | 0.6626 | 0.7069 | 0.6892 | 0.7525 | 0.5574 | -| After distillation | 0.6496 | 0.6862 | 0.6677 | 0.7433 | 0.5532 | -| Original Llama-3.1-8B-Instruct | 0.6839 | 0.7231 | 0.7038 | 0.7667 | 0.5911 | - -**Key observations:** - -- MMLU accuracy (average across all categories) decreased from 66.26% to 64.96% (-1.30 percentage points) after distillation -- The model overfitted to the small WikiText-103 dataset, causing performance regression -- This demonstrates the critical importance of using larger, more diverse datasets for knowledge distillation - -### Recommendations - -- **For production distillation:** Use larger production datasets like [nvidia/Nemotron-Pretraining-SFT-v1](https://huggingface.co/datasets/nvidia/Nemotron-Pretraining-SFT-v1) for better results and to avoid overfitting (see regression case above) -- **Training duration:** Train for more iterations to ensure proper convergence -- **See the 
[Megatron-Bridge distillation tutorial](https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/megatron_bridge#distillation) for best practices** diff --git a/examples/puzzletron/mbridge_distillation/distill_hf.py b/examples/puzzletron/mbridge_distillation/distill_hf.py deleted file mode 100644 index 75db004128..0000000000 --- a/examples/puzzletron/mbridge_distillation/distill_hf.py +++ /dev/null @@ -1,300 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Distillation script for Megatron-Bridge. - -Loads student and teacher models directly from HuggingFace checkpoints (local or remote) and saves the distilled model -to `/checkpoints` in megatron distributed checkpoint format. - -See `README.md` in this directory for example usage and data preparation instructions. 
-""" - -import argparse -import os -import shutil - -import torch -from megatron.bridge import AutoBridge -from megatron.bridge.models.distillation_provider import convert_to_distillation_provider -from megatron.bridge.recipes.utils.optimizer_utils import ( - distributed_fused_adam_with_cosine_annealing, -) -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - GPTDatasetConfig, - LoggerConfig, - MockGPTDatasetConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, -) -from megatron.bridge.training.distill import distill -from megatron.bridge.training.post_training.distillation import ModelOptDistillConfig -from megatron.core.datasets.utils import get_blend_from_list -from megatron.core.distributed import DistributedDataParallelConfig - -# Import to register heterogeneous bridges (side effect) -import modelopt.torch.puzzletron # noqa: F401 -import modelopt.torch.utils.distributed as dist -from modelopt.torch.utils import print_rank_0 - -SEED = 1234 - - -def get_args(): - """Parse command-line arguments.""" - parser = argparse.ArgumentParser(description="Distillation for Megatron-Bridge.") - # Model arguments (accepts HuggingFace input only at the moment) - parser.add_argument( - "--student_hf_path", - type=str, - required=True, - help="HuggingFace model name or path for the student (standard HF format or puzzletron any_model format)", - ) - parser.add_argument( - "--teacher_hf_path", - type=str, - required=True, - help="HuggingFace model name or path for the teacher (standard HF format or puzzletron any_model format)", - ) - parser.add_argument("--trust_remote_code", action="store_true", help="Trust remote code") - # Parallelism arguments - parser.add_argument("--tp_size", type=int, default=1, help="Tensor parallel size") - parser.add_argument("--pp_size", type=int, default=1, help="Pipeline parallel size") - # Dataset arguments - parser.add_argument( - "--data_paths", - nargs="+", - help="List of tokenized data paths to load from 
(weight1 path1 weight2 path2 ...)", - ) - parser.add_argument( - "--split", type=str, default="99,1,0", help="Train,Val,Test ratios to split data" - ) - parser.add_argument( - "--data_path_to_cache", type=str, default=None, help="Path to cache the dataset indices" - ) - parser.add_argument( - "--use_mock_data", action="store_true", help="Use mock data instead of --data_paths" - ) - # Training & Eval arguments - parser.add_argument( - "--output_dir", type=str, required=True, help="Folder for logging and checkpoint saving" - ) - parser.add_argument( - "--seq_length", - type=int, - default=4096, - help="Number of tokens per input sample. Use 8192 if your dataset has longer sequences.", - ) - parser.add_argument("--mbs", type=int, default=1, help="Micro-batch Size") - parser.add_argument("--gbs", type=int, default=768, help="Global Batch Size") - parser.add_argument( - "--train_iters", type=int, required=True, help="Number of training iterations" - ) - parser.add_argument("--lr", type=float, default=1e-4, help="Peak learning rate") - parser.add_argument("--min_lr", type=float, default=1e-5, help="Minimum learning rate") - parser.add_argument("--lr_warmup_iters", type=int, default=50, help="Number of LR warmup steps") - parser.add_argument( - "--eval_interval", type=int, default=100, help="Validate + checkpoint every steps" - ) - parser.add_argument( - "--eval_iters", type=int, default=32, help="Number of batches per validation stage" - ) - # Logging arguments - parser.add_argument("--log_interval", type=int, default=10, help="Write to log every steps") - parser.add_argument( - "--wandb_project", type=str, help="Wandb project name (required to enable Wandb logging)" - ) - parser.add_argument("--wandb_entity", type=str, help="Wandb entity name (optional)") - parser.add_argument("--wandb_exp_name", type=str, help="Wandb experiment name (optional)") - # Export arguments - parser.add_argument( - "--hf_export_path", - type=str, - default=None, - help=( - "Path where to save 
the HuggingFace export. " - "If provided, exports last iteration checkpoint to HF format after distillation." - ), - ) - parser.add_argument( - "--student_hf_model", - type=str, - required=False, - default=None, - help="HuggingFace model ID to use as template for export (e.g., Qwen/Qwen3-0.6B). " - "Should match the base architecture of the student model if --hf_export_path is provided.", - ) - args = parser.parse_args() - - # Sanity checks - if not args.use_mock_data and not args.data_paths: - raise ValueError("Must provide either --data_paths or set --use_mock_data.") - - if args.hf_export_path and not args.student_hf_model: - raise ValueError("Must provide --student_hf_model if --hf_export_path is provided.") - - print_rank_0("\n==================== Arguments ====================") - for k, v in args.__dict__.items(): - print_rank_0(f"{k:<35} {v}") - print_rank_0("===================================================\n") - - return args - - -def main(args: argparse.Namespace): - checkpoint_dir = os.path.join(args.output_dir, "checkpoints") - tensorboard_dir = os.path.join(args.output_dir, "tb_logs") - - # Build student and teacher model providers - def _build_model_provider(hf_path): - bridge = AutoBridge.from_hf_pretrained(hf_path, trust_remote_code=args.trust_remote_code) - provider = bridge.to_megatron_provider(load_weights=True) - - # Override parallelism / training settings - provider.tensor_model_parallel_size = args.tp_size - provider.pipeline_model_parallel_size = args.pp_size - provider.context_parallel_size = 1 - provider.sequence_parallel = args.tp_size > 1 - provider.seq_length = args.seq_length - provider.pipeline_dtype = torch.bfloat16 - return provider - - # TODO: Support megatron-ckpt as an alternative to HF checkpoints (e.g. 
/path/to/ckpt/iter_0000000) - # Still requires an HF model name or path to build provider correctly - student_provider = _build_model_provider(args.student_hf_path) - teacher_provider = _build_model_provider(args.teacher_hf_path) - - # Wrap into DistillationProvider - kd_config = ModelOptDistillConfig() - distill_provider = convert_to_distillation_provider( - student_provider, teacher_provider, kd_config - ) - - # Build optimizer and scheduler - optimizer_config, scheduler_config = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=args.lr_warmup_iters, - max_lr=args.lr, - min_lr=args.min_lr, - adam_beta2=0.98, - ) - - # Build dataset config - dataset_kwargs = { - "seq_length": args.seq_length, - "path_to_cache": args.data_path_to_cache, - "random_seed": SEED, - "reset_attention_mask": False, - "reset_position_ids": False, - "eod_mask_loss": False, - "num_dataset_builder_threads": 1, - "data_sharding": True, - "dataloader_type": "single", - "skip_getting_attention_mask_from_dataset": True, - } - if args.use_mock_data: - dataset_config = MockGPTDatasetConfig(**dataset_kwargs) - else: - # Convert flat CLI list (e.g. 
["1.0", "/path/data"]) to Megatron blend format - blend = get_blend_from_list(args.data_paths) - dataset_config = GPTDatasetConfig(blend=blend, split=args.split, **dataset_kwargs) - - # Assemble ConfigContainer and run distillation - config = ConfigContainer( - model=distill_provider, - train=TrainingConfig( - train_iters=args.train_iters, - eval_interval=args.eval_interval, - eval_iters=args.eval_iters, - global_batch_size=args.gbs, - micro_batch_size=args.mbs, - manual_gc=True, - manual_gc_interval=100, - ), - # TODO: Replace validation args in train with validation config in nemo:26.04 - # validation=ValidationConfig(eval_interval=args.eval_interval, eval_iters=args.eval_iters), - optimizer=optimizer_config, - scheduler=scheduler_config, - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, - use_distributed_optimizer=True, - ), - dataset=dataset_config, - logger=LoggerConfig( - log_interval=args.log_interval, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - # Weights & Biases logging - wandb_project=args.wandb_project, - wandb_entity=args.wandb_entity, # optional - wandb_exp_name=args.wandb_exp_name, - ), - tokenizer=TokenizerConfig( - tokenizer_type="NullTokenizer", vocab_size=distill_provider.vocab_size - ), - checkpoint=CheckpointConfig( - save_interval=args.eval_interval, - save=checkpoint_dir, - load=checkpoint_dir, # Resume from this directory (if exists) - most_recent_k=3, # Keeps 3 most recent checkpoints (not metric-based) - ckpt_format="torch_dist", - async_save=True, - fully_parallel_save=True, - ), - rng=RNGConfig(seed=SEED), - mixed_precision="bf16_mixed", - ) - - print_rank_0("\nStarting distillation...") - distill(config) - print_rank_0(f"\nDistillation done! 
Saved checkpoint to {checkpoint_dir}\n") - - # Export to HuggingFace format if hf_export_path is provided - if args.hf_export_path: - print_rank_0(f"Exporting final distilled ckpt to HF format to {args.hf_export_path}") - # Save rank before destroying process group (dist.rank() won't work after destruction) - is_rank_0 = dist.rank() == 0 - - # Destroy process group on all ranks - export_ckpt will create its own temporary one - # This prevents cleanup from hanging (cleanup tries to barrier, but rank 0 would be gone) - dist.cleanup() - - # Only rank 0 exports - if is_rank_0: - export_bridge = AutoBridge.from_hf_pretrained( - args.student_hf_model, trust_remote_code=args.trust_remote_code - ) - export_bridge.export_ckpt( - megatron_path=f"{checkpoint_dir}/iter_{args.train_iters:07d}", - hf_path=args.hf_export_path, - show_progress=True, - strict=True, - ) - - # save config from student_model to hf_export_path - shutil.copy(f"{args.student_hf_path}/config.json", f"{args.hf_export_path}/config.json") - - -if __name__ == "__main__": - dist.setup() - args = get_args() - try: - main(args) - finally: - dist.cleanup() diff --git a/tests/_test_utils/torch/puzzletron/utils.py b/tests/_test_utils/torch/puzzletron/utils.py index 365325a938..86047cb711 100644 --- a/tests/_test_utils/torch/puzzletron/utils.py +++ b/tests/_test_utils/torch/puzzletron/utils.py @@ -70,17 +70,17 @@ def create_and_save_small_hf_model( hf_model_name: str, hybrid_override_pattern: str | None = None, ): - """ - Create and save a small HuggingFace model for testing the conversion pipeline. + """Create and save a small HuggingFace model for testing the conversion pipeline. + Uses real HuggingFace config to preserve model-specific settings (like tie_word_embeddings), but shrinks size parameters for fast testing. 
Args: - output_path: Where to save the model - tokenizer: Tokenizer to save alongside the model - hf_model_name: HuggingFace model card name (e.g., "meta-llama/Llama-3.1-8B-Instruct") - hybrid_override_pattern: For NemotronH models, the layer type pattern (e.g., "*-" for Attention+MLP, - "M-" for Mamba+MLP). Must match num_hidden_layers. None for non-NemotronH models. + output_path: Where to save the model. + tokenizer: Tokenizer to save alongside the model. + hf_model_name: HuggingFace model card name (e.g., "meta-llama/Llama-3.1-8B-Instruct"). + hybrid_override_pattern: For NemotronH models, the layer type pattern (e.g., "*-" for + Attention+MLP, "M-" for Mamba+MLP). Must match num_hidden_layers. """ # Load real HuggingFace config (preserves tie_word_embeddings, rope_scaling, etc.) config = AutoConfig.from_pretrained(hf_model_name, trust_remote_code=True) diff --git a/tests/examples/megatron_bridge/test_distill.py b/tests/examples/megatron_bridge/test_distill.py index b5a0ca86d6..9f84f50c28 100644 --- a/tests/examples/megatron_bridge/test_distill.py +++ b/tests/examples/megatron_bridge/test_distill.py @@ -18,12 +18,14 @@ from pathlib import Path from _test_utils.examples.run_command import extend_cmd_parts, run_example_command -from _test_utils.torch.transformers_models import create_tiny_qwen3_dir +from _test_utils.torch.puzzletron.utils import create_and_save_small_hf_model +from _test_utils.torch.transformers_models import create_tiny_qwen3_dir, get_tiny_tokenizer + +from modelopt.torch.puzzletron.anymodel import convert_model def test_distill_and_convert(tmp_path: Path, num_gpus): teacher_hf_path = create_tiny_qwen3_dir(tmp_path, with_tokenizer=True) - train_iters = 5 distill_output_dir = tmp_path / "distill_output" distill_cmd_parts = extend_cmd_parts( @@ -32,6 +34,7 @@ def test_distill_and_convert(tmp_path: Path, num_gpus): teacher_hf_path=teacher_hf_path, output_dir=distill_output_dir, tp_size=num_gpus, + pp_size=1, seq_length=32, mbs=1, gbs=4, @@ -63,3 
+66,68 @@ def test_distill_and_convert(tmp_path: Path, num_gpus): check=True, ) assert (distilled_hf_path / "config.json").exists() + + +def test_distill_puzzletron_anymodel(tmp_path: Path, num_gpus): + """Integration test for distill.py with Puzzletron AnyModel (heterogeneous) checkpoints. + + Creates Qwen3 models, converts the student to Puzzletron AnyModel format + (heterogeneous layer architectures), runs mbridge distillation, and exports + the distilled checkpoint to HuggingFace format via --hf_export_path. + """ + student_hf_dir, student_anymodel_dir, teacher_hf_dir = ( + _prepare_puzzletron_anymodel_student_and_teacher(tmp_path) + ) + + train_iters = 5 + output_dir = tmp_path / "distill_output" + hf_export_path = tmp_path / "distilled_anymodel_hf" + cmd_parts = extend_cmd_parts( + ["torchrun", f"--nproc_per_node={num_gpus}", "distill.py", "--use_mock_data"], + student_hf_path=student_anymodel_dir, + teacher_hf_path=teacher_hf_dir, + output_dir=output_dir, + tp_size=num_gpus, + pp_size=1, + seq_length=32, + mbs=1, + gbs=4, + train_iters=train_iters, + lr_warmup_iters=2, + eval_interval=5, + eval_iters=1, + log_interval=1, + hf_export_path=hf_export_path, + student_hf_model=student_hf_dir, + ) + run_example_command(cmd_parts, example_path="megatron_bridge") + + run_config_path = output_dir / "checkpoints" / f"iter_{train_iters:07d}" / "run_config.yaml" + assert run_config_path.exists(), f"Expected run_config.yaml at: {run_config_path}" + + assert (hf_export_path / "config.json").exists(), ( + f"Expected HF export at: {hf_export_path}/config.json" + ) + + +def _prepare_puzzletron_anymodel_student_and_teacher(tmp_path: Path) -> tuple[Path, Path, Path]: + """Create Qwen3 models and convert student to Puzzletron AnyModel format.""" + student_hf_dir = tmp_path / "student_hf" + teacher_hf_dir = tmp_path / "teacher_hf" + + tokenizer = get_tiny_tokenizer() + + create_and_save_small_hf_model( + output_path=str(student_hf_dir), tokenizer=tokenizer, 
hf_model_name="Qwen/Qwen3-0.6B" + ) + + create_and_save_small_hf_model( + output_path=str(teacher_hf_dir), tokenizer=tokenizer, hf_model_name="Qwen/Qwen3-0.6B" + ) + + student_anymodel_dir = tmp_path / "student_anymodel" + convert_model( + input_dir=str(student_hf_dir), output_dir=str(student_anymodel_dir), converter="qwen3" + ) + + return student_hf_dir, student_anymodel_dir, teacher_hf_dir diff --git a/tests/examples/puzzletron/mbridge_distillation/test_distill_hf.py b/tests/examples/puzzletron/mbridge_distillation/test_distill_hf.py deleted file mode 100644 index c8d45693a1..0000000000 --- a/tests/examples/puzzletron/mbridge_distillation/test_distill_hf.py +++ /dev/null @@ -1,142 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tests for distill_hf.py script.""" - -from pathlib import Path - -import torch -from _test_utils.examples.run_command import extend_cmd_parts, run_example_command -from _test_utils.torch.distributed.utils import get_free_port -from _test_utils.torch.puzzletron.utils import create_and_save_small_hf_model -from _test_utils.torch.transformers_models import get_tiny_tokenizer -from transformers import AutoModelForCausalLM - -import modelopt.torch.puzzletron as mtpz - - -def test_distill_hf(project_root_path: Path, tmp_path: Path): - """Integration test for distill_hf.py. 
- - Creates Qwen3 models programmatically, converts them to heterogeneous format (AnyModel), - and runs mbridge distillation. The models are created with reduced size for faster testing. - Models are converted to include block_configs. - """ - # Prepare student and teacher models - student_hf_dir, student_anymodel_dir, teacher_hf_dir, _ = _prepare_student_and_teacher_models( - project_root_path, tmp_path - ) - - output_dir = tmp_path / "distill_output" - hf_export_dir = tmp_path / "hf_export" - - # Build command-line arguments for distill_hf.py - nproc_per_node = torch.cuda.device_count() - tp_size = nproc_per_node - train_iters = 5 - - cmd_parts = [ - "torchrun", - f"--nproc_per_node={nproc_per_node}", - "--master-addr", - "127.0.0.1", - "--master-port", - str(get_free_port()), - "distill_hf.py", - "--use_mock_data", - ] - extend_cmd_parts( - cmd_parts, - student_hf_path=student_anymodel_dir, - teacher_hf_path=teacher_hf_dir, - output_dir=output_dir, - tp_size=tp_size, - pp_size=1, - seq_length=128, - split="99,1,0", - mbs=1, - gbs=4, - train_iters=train_iters, - lr=0.0001, - min_lr=1e-5, - lr_warmup_iters=2, - eval_interval=100, - eval_iters=0, - log_interval=5, - hf_export_path=hf_export_dir, - student_hf_model=student_hf_dir, - ) - - run_example_command(cmd_parts, example_path="puzzletron/mbridge_distillation") - - # Check that distillation checkpoint contains run_config.yaml - run_config_path = output_dir / "checkpoints" / f"iter_{train_iters:07d}" / "run_config.yaml" - assert run_config_path.exists(), f"Expected run_config.yaml to exist at: {run_config_path}" - - # Verify that the distilled model can be loaded in HuggingFace format - model = AutoModelForCausalLM.from_pretrained(hf_export_dir) - assert model is not None, "Failed to load distilled model with AutoModelForCausalLM" - - -def _prepare_student_and_teacher_models( - project_root_path: Path, tmp_path: Path -) -> tuple[Path, Path, Path, Path]: - """Prepare student and teacher models for distillation. 
- - Creates Qwen3 models programmatically, converts them to heterogeneous format (AnyModel), - and returns the paths to the converted checkpoints. - - """ - - # Create temporary directories for models - student_hf_dir = tmp_path / "student_hf" - teacher_hf_dir = tmp_path / "teacher_hf" - - # Create tokenizer (uses local tokenizer from test resources) - tokenizer = get_tiny_tokenizer() - - # Create student model using utility function (loads config from Hub). - # TODO: Make the student model using different ffn sizes across layers. - create_and_save_small_hf_model( - output_path=str(student_hf_dir), - tokenizer=tokenizer, - hf_model_name="Qwen/Qwen3-0.6B", - hybrid_override_pattern=None, - ) - - # Create teacher model (same as student for testing) - create_and_save_small_hf_model( - output_path=str(teacher_hf_dir), - tokenizer=tokenizer, - hf_model_name="Qwen/Qwen3-0.6B", - hybrid_override_pattern=None, - ) - - # Convert models to AnyModel format BEFORE distillation - # This is needed as converted checkpoints will be used as input for distillation later - student_anymodel_dir = tmp_path / "student_anymodel" - teacher_anymodel_dir = tmp_path / "teacher_anymodel" - - mtpz.anymodel.convert_model( - input_dir=str(student_hf_dir), output_dir=str(student_anymodel_dir), converter="qwen3" - ) - - mtpz.anymodel.convert_model( - input_dir=str(teacher_hf_dir), output_dir=str(teacher_anymodel_dir), converter="qwen3" - ) - print("Models converted to AnyModel format:") - print(f" Student AnyModel: {student_anymodel_dir}") - print(f" Teacher AnyModel: {teacher_anymodel_dir}") - - return student_hf_dir, student_anymodel_dir, teacher_hf_dir, teacher_anymodel_dir