Skip to content

Commit 8c2fa10

Browse files
Make tokenizer.json vocab size 128 to avoid hacky vocab handling + cleanup
Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
1 parent b98f373 commit 8c2fa10

File tree

6 files changed

+55
-56
lines changed

6 files changed

+55
-56
lines changed

.github/workflows/example_tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ jobs:
125125
strategy: &nemo_strategy
126126
fail-fast: false
127127
matrix:
128-
example: [megatron_bridge, puzzletron]
128+
example: [megatron_bridge]
129129
uses: ./.github/workflows/_example_tests_runner.yml
130130
secrets: inherit
131131
with:

examples/megatron_bridge/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ torchrun --nnodes 1 --nproc_per_node 8 distill.py \
171171
--student_hf_model Qwen/Qwen3-4B
172172
```
173173

174-
`--student_hf_model` should match the base architecture of the student (used as a template for export).
174+
`--student_hf_model` should match the base architecture of the student (used as a template for export). For non-Puzzletron (i.e. standard) models, it should be the same as `--student_hf_path`.
175175

176176
**Separate conversion** -- convert any saved iteration using the Megatron-Bridge conversion script:
177177

examples/megatron_bridge/distill.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
import argparse
2424
import contextlib
2525
import os
26-
import shutil
2726

2827
import torch
2928
from megatron.bridge import AutoBridge
@@ -45,6 +44,7 @@
4544
from megatron.bridge.training.post_training.distillation import ModelOptDistillConfig
4645
from megatron.core.datasets.utils import get_blend_from_list
4746
from megatron.core.distributed import DistributedDataParallelConfig
47+
from transformers import AutoConfig
4848

4949
with contextlib.suppress(ImportError):
5050
import modelopt.torch.puzzletron.export.mbridge # noqa: F401
@@ -301,7 +301,10 @@ def _build_model_provider(hf_path):
301301
show_progress=True,
302302
strict=True,
303303
)
304-
shutil.copy(f"{args.student_hf_path}/config.json", f"{args.hf_export_path}/config.json")
304+
# Copy config.json from student_hf_path (handles both local paths and HF model IDs)
305+
AutoConfig.from_pretrained(
306+
args.student_hf_path, trust_remote_code=args.trust_remote_code
307+
).save_pretrained(args.hf_export_path)
305308

306309

307310
if __name__ == "__main__":

tests/_test_utils/torch/puzzletron/utils.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,6 @@ def create_and_save_small_hf_model(
6969
tokenizer: PreTrainedTokenizerBase,
7070
hf_model_name: str,
7171
hybrid_override_pattern: str | None = None,
72-
vocab_size: int | None = None,
7372
):
7473
"""Create and save a small HuggingFace model for testing the conversion pipeline.
7574
@@ -82,10 +81,7 @@ def create_and_save_small_hf_model(
8281
hf_model_name: HuggingFace model card name (e.g., "meta-llama/Llama-3.1-8B-Instruct").
8382
hybrid_override_pattern: For NemotronH models, the layer type pattern (e.g., "*-" for
8483
Attention+MLP, "M-" for Mamba+MLP). Must match num_hidden_layers.
85-
vocab_size: Override vocab size. Defaults to tokenizer.vocab_size.
8684
"""
87-
if vocab_size is None:
88-
vocab_size = tokenizer.vocab_size
8985
# Load real HuggingFace config (preserves tie_word_embeddings, rope_scaling, etc.)
9086
config = AutoConfig.from_pretrained(hf_model_name, trust_remote_code=True)
9187

@@ -95,7 +91,7 @@ def create_and_save_small_hf_model(
9591

9692
# VL models have nested configs (text_config, vision_config)
9793
if hasattr(config, "text_config") and hasattr(config, "vision_config"):
98-
config.text_config.vocab_size = vocab_size
94+
config.text_config.vocab_size = tokenizer.vocab_size
9995
config.text_config.hidden_size = 256
10096
config.text_config.intermediate_size = 512
10197
config.text_config.num_hidden_layers = 2
@@ -113,7 +109,7 @@ def create_and_save_small_hf_model(
113109
config.num_hidden_layers = config.text_config.num_hidden_layers
114110
else:
115111
# Regular models have flat config
116-
config.vocab_size = vocab_size
112+
config.vocab_size = tokenizer.vocab_size
117113
config.hidden_size = 256
118114
config.intermediate_size = 512
119115
config.num_hidden_layers = max(2, dist.size())
@@ -134,7 +130,10 @@ def create_and_save_small_hf_model(
134130
config.hybrid_override_pattern = hybrid_override_pattern
135131

136132
# Ensure pad_token_id is within vocab_size (nn.Embedding requires padding_idx < num_embeddings)
137-
if getattr(config, "pad_token_id", None) is not None and config.pad_token_id >= vocab_size:
133+
if (
134+
getattr(config, "pad_token_id", None) is not None
135+
and config.pad_token_id >= tokenizer.vocab_size
136+
):
138137
config.pad_token_id = 0
139138

140139
# Ensure moe_latent_size is present: the native transformers NemotronH model (>=5.5)

tests/_test_utils/torch/tokenizer/tokenizer.json

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,33 @@
205205
"¥": 98,
206206
"¦": 99,
207207
"<|begin_of_text|>": 100,
208-
"<|eot_id|>": 101
208+
"<|eot_id|>": 101,
209+
"<|extra_0|>": 102,
210+
"<|extra_1|>": 103,
211+
"<|extra_2|>": 104,
212+
"<|extra_3|>": 105,
213+
"<|extra_4|>": 106,
214+
"<|extra_5|>": 107,
215+
"<|extra_6|>": 108,
216+
"<|extra_7|>": 109,
217+
"<|extra_8|>": 110,
218+
"<|extra_9|>": 111,
219+
"<|extra_10|>": 112,
220+
"<|extra_11|>": 113,
221+
"<|extra_12|>": 114,
222+
"<|extra_13|>": 115,
223+
"<|extra_14|>": 116,
224+
"<|extra_15|>": 117,
225+
"<|extra_16|>": 118,
226+
"<|extra_17|>": 119,
227+
"<|extra_18|>": 120,
228+
"<|extra_19|>": 121,
229+
"<|extra_20|>": 122,
230+
"<|extra_21|>": 123,
231+
"<|extra_22|>": 124,
232+
"<|extra_23|>": 125,
233+
"<|extra_24|>": 126,
234+
"<|extra_25|>": 127
209235
},
210236
"merges": []
211237
}

tests/examples/megatron_bridge/test_distill.py

Lines changed: 15 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -18,28 +18,23 @@
1818
from pathlib import Path
1919

2020
from _test_utils.examples.run_command import extend_cmd_parts, run_example_command
21-
from _test_utils.torch.distributed.utils import get_free_port
2221
from _test_utils.torch.puzzletron.utils import create_and_save_small_hf_model
23-
from _test_utils.torch.transformers_models import get_tiny_qwen3, get_tiny_tokenizer
22+
from _test_utils.torch.transformers_models import create_tiny_qwen3_dir, get_tiny_tokenizer
2423

2524
from modelopt.torch.puzzletron.anymodel import convert_model
2625

2726

2827
def test_distill_and_convert(tmp_path: Path, num_gpus):
29-
# vocab_size=128 ensures divisibility by any TP size up to 128
30-
teacher_hf_path = tmp_path / "tiny_qwen3"
31-
get_tiny_tokenizer().save_pretrained(teacher_hf_path)
32-
get_tiny_qwen3(vocab_size=128).save_pretrained(teacher_hf_path)
33-
34-
tp_size = num_gpus
28+
teacher_hf_path = create_tiny_qwen3_dir(tmp_path, with_tokenizer=True)
3529
train_iters = 5
3630
distill_output_dir = tmp_path / "distill_output"
3731
distill_cmd_parts = extend_cmd_parts(
38-
["torchrun", f"--nproc_per_node={tp_size}", "distill.py", "--use_mock_data"],
32+
["torchrun", f"--nproc_per_node={num_gpus}", "distill.py", "--use_mock_data"],
3933
student_hf_path=teacher_hf_path,
4034
teacher_hf_path=teacher_hf_path,
4135
output_dir=distill_output_dir,
42-
tp_size=tp_size,
36+
tp_size=num_gpus,
37+
pp_size=1,
4338
seq_length=32,
4439
mbs=1,
4540
gbs=4,
@@ -88,41 +83,24 @@ def test_distill_puzzletron_anymodel(tmp_path: Path, num_gpus):
8883
tmp_path
8984
)
9085

91-
output_dir = tmp_path / "distill_output"
92-
93-
tp_size = num_gpus
9486
train_iters = 5
95-
96-
cmd_parts = [
97-
"torchrun",
98-
f"--nproc_per_node={tp_size}",
99-
"--master-addr",
100-
"127.0.0.1",
101-
"--master-port",
102-
str(get_free_port()),
103-
"distill.py",
104-
"--use_mock_data",
105-
]
106-
extend_cmd_parts(
107-
cmd_parts,
87+
output_dir = tmp_path / "distill_output"
88+
cmd_parts = extend_cmd_parts(
89+
["torchrun", f"--nproc_per_node={num_gpus}", "distill.py", "--use_mock_data"],
10890
student_hf_path=student_anymodel_dir,
10991
teacher_hf_path=teacher_hf_dir,
11092
output_dir=output_dir,
111-
tp_size=tp_size,
93+
tp_size=num_gpus,
11294
pp_size=1,
113-
seq_length=128,
114-
split="99,1,0",
95+
seq_length=32,
11596
mbs=1,
11697
gbs=4,
11798
train_iters=train_iters,
118-
lr=0.0001,
119-
min_lr=1e-5,
12099
lr_warmup_iters=2,
121-
eval_interval=100,
122-
eval_iters=0,
123-
log_interval=5,
100+
eval_interval=5,
101+
eval_iters=1,
102+
log_interval=1,
124103
)
125-
126104
run_example_command(cmd_parts, example_path="megatron_bridge")
127105

128106
run_config_path = output_dir / "checkpoints" / f"iter_{train_iters:07d}" / "run_config.yaml"
@@ -135,20 +113,13 @@ def _prepare_puzzletron_anymodel_student_and_teacher(tmp_path: Path) -> tuple[Pa
135113
teacher_hf_dir = tmp_path / "teacher_hf"
136114

137115
tokenizer = get_tiny_tokenizer()
138-
vocab_size = 128 # must be divisible by TP size
139116

140117
create_and_save_small_hf_model(
141-
output_path=str(student_hf_dir),
142-
tokenizer=tokenizer,
143-
hf_model_name="Qwen/Qwen3-0.6B",
144-
vocab_size=vocab_size,
118+
output_path=str(student_hf_dir), tokenizer=tokenizer, hf_model_name="Qwen/Qwen3-0.6B"
145119
)
146120

147121
create_and_save_small_hf_model(
148-
output_path=str(teacher_hf_dir),
149-
tokenizer=tokenizer,
150-
hf_model_name="Qwen/Qwen3-0.6B",
151-
vocab_size=vocab_size,
122+
output_path=str(teacher_hf_dir), tokenizer=tokenizer, hf_model_name="Qwen/Qwen3-0.6B"
152123
)
153124

154125
student_anymodel_dir = tmp_path / "student_anymodel"

0 commit comments

Comments
 (0)