Skip to content

Commit 8c2fa10

Browse files
Make tokenizer.json vocab size 128 to avoid hacky vocab handling + cleanup
Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
1 parent b98f373 commit 8c2fa10

File tree

6 files changed

+55
-56
lines changed

6 files changed

+55
-56
lines changed

.github/workflows/example_tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ jobs:
125125
strategy: &nemo_strategy
126126
fail-fast: false
127127
matrix:
128-
example: [megatron_bridge, puzzletron]
128+
example: [megatron_bridge]
129129
uses: ./.github/workflows/_example_tests_runner.yml
130130
secrets: inherit
131131
with:

examples/megatron_bridge/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ torchrun --nnodes 1 --nproc_per_node 8 distill.py \
171171
--student_hf_model Qwen/Qwen3-4B
172172
```
173173

174-
`--student_hf_model` should match the base architecture of the student (used as a template for export).
174+
`--student_hf_model` should match the base architecture of the student (used as a template for export). For non-Puzzletron (i.e. standard) models, it should be the same as `--student_hf_path`.
175175

176176
**Separate conversion** -- convert any saved iteration using the Megatron-Bridge conversion script:
177177

examples/megatron_bridge/distill.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
import argparse
2424
import contextlib
2525
import os
26-
import shutil
2726

2827
import torch
2928
from megatron.bridge import AutoBridge
@@ -45,6 +44,7 @@
4544
from megatron.bridge.training.post_training.distillation import ModelOptDistillConfig
4645
from megatron.core.datasets.utils import get_blend_from_list
4746
from megatron.core.distributed import DistributedDataParallelConfig
47+
from transformers import AutoConfig
4848

4949
with contextlib.suppress(ImportError):
5050
import modelopt.torch.puzzletron.export.mbridge # noqa: F401
@@ -301,7 +301,10 @@ def _build_model_provider(hf_path):
301301
show_progress=True,
302302
strict=True,
303303
)
304-
shutil.copy(f"{args.student_hf_path}/config.json", f"{args.hf_export_path}/config.json")
304+
# Copy config.json from student_hf_path (handles both local paths and HF model IDs)
305+
AutoConfig.from_pretrained(
306+
args.student_hf_path, trust_remote_code=args.trust_remote_code
307+
).save_pretrained(args.hf_export_path)
305308

306309

307310
if __name__ == "__main__":

tests/_test_utils/torch/puzzletron/utils.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,6 @@ def create_and_save_small_hf_model(
6969
tokenizer: PreTrainedTokenizerBase,
7070
hf_model_name: str,
7171
hybrid_override_pattern: str | None = None,
72-
vocab_size: int | None = None,
7372
):
7473
"""Create and save a small HuggingFace model for testing the conversion pipeline.
7574
@@ -82,10 +81,7 @@ def create_and_save_small_hf_model(
8281
hf_model_name: HuggingFace model card name (e.g., "meta-llama/Llama-3.1-8B-Instruct").
8382
hybrid_override_pattern: For NemotronH models, the layer type pattern (e.g., "*-" for
8483
Attention+MLP, "M-" for Mamba+MLP). Must match num_hidden_layers.
85-
vocab_size: Override vocab size. Defaults to tokenizer.vocab_size.
8684
"""
87-
if vocab_size is None:
88-
vocab_size = tokenizer.vocab_size
8985
# Load real HuggingFace config (preserves tie_word_embeddings, rope_scaling, etc.)
9086
config = AutoConfig.from_pretrained(hf_model_name, trust_remote_code=True)
9187

@@ -95,7 +91,7 @@ def create_and_save_small_hf_model(
9591

9692
# VL models have nested configs (text_config, vision_config)
9793
if hasattr(config, "text_config") and hasattr(config, "vision_config"):
98-
config.text_config.vocab_size = vocab_size
94+
config.text_config.vocab_size = tokenizer.vocab_size
9995
config.text_config.hidden_size = 256
10096
config.text_config.intermediate_size = 512
10197
config.text_config.num_hidden_layers = 2
@@ -113,7 +109,7 @@ def create_and_save_small_hf_model(
113109
config.num_hidden_layers = config.text_config.num_hidden_layers
114110
else:
115111
# Regular models have flat config
116-
config.vocab_size = vocab_size
112+
config.vocab_size = tokenizer.vocab_size
117113
config.hidden_size = 256
118114
config.intermediate_size = 512
119115
config.num_hidden_layers = max(2, dist.size())
@@ -134,7 +130,10 @@ def create_and_save_small_hf_model(
134130
config.hybrid_override_pattern = hybrid_override_pattern
135131

136132
# Ensure pad_token_id is within vocab_size (nn.Embedding requires padding_idx < num_embeddings)
137-
if getattr(config, "pad_token_id", None) is not None and config.pad_token_id >= vocab_size:
133+
if (
134+
getattr(config, "pad_token_id", None) is not None
135+
and config.pad_token_id >= tokenizer.vocab_size
136+
):
138137
config.pad_token_id = 0
139138

140139
# Ensure moe_latent_size is present: the native transformers NemotronH model (>=5.5)

tests/_test_utils/torch/tokenizer/tokenizer.json

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,33 @@
205205
"¥": 98,
206206
"¦": 99,
207207
"<|begin_of_text|>": 100,
208-
"<|eot_id|>": 101
208+
"<|eot_id|>": 101,
209+
"<|extra_0|>": 102,
210+
"<|extra_1|>": 103,
211+
"<|extra_2|>": 104,
212+
"<|extra_3|>": 105,
213+
"<|extra_4|>": 106,
214+
"<|extra_5|>": 107,
215+
"<|extra_6|>": 108,
216+
"<|extra_7|>": 109,
217+
"<|extra_8|>": 110,
218+
"<|extra_9|>": 111,
219+
"<|extra_10|>": 112,
220+
"<|extra_11|>": 113,
221+
"<|extra_12|>": 114,
222+
"<|extra_13|>": 115,
223+
"<|extra_14|>": 116,
224+
"<|extra_15|>": 117,
225+
"<|extra_16|>": 118,
226+
"<|extra_17|>": 119,
227+
"<|extra_18|>": 120,
228+
"<|extra_19|>": 121,
229+
"<|extra_20|>": 122,
230+
"<|extra_21|>": 123,
231+
"<|extra_22|>": 124,
232+
"<|extra_23|>": 125,
233+
"<|extra_24|>": 126,
234+
"<|extra_25|>": 127
209235
},
210236
"merges": []
211237
}

tests/examples/megatron_bridge/test_distill.py

Lines changed: 15 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -18,28 +18,23 @@
1818
from pathlib import Path
1919

2020
from _test_utils.examples.run_command import extend_cmd_parts, run_example_command
21-
from _test_utils.torch.distributed.utils import get_free_port
2221
from _test_utils.torch.puzzletron.utils import create_and_save_small_hf_model
23-
from _test_utils.torch.transformers_models import get_tiny_qwen3, get_tiny_tokenizer
22+
from _test_utils.torch.transformers_models import create_tiny_qwen3_dir, get_tiny_tokenizer
2423

2524
from modelopt.torch.puzzletron.anymodel import convert_model
2625

2726

2827
def test_distill_and_convert(tmp_path: Path, num_gpus):
29-
# vocab_size=128 ensures divisibility by any TP size up to 128
30-
teacher_hf_path = tmp_path / "tiny_qwen3"
31-
get_tiny_tokenizer().save_pretrained(teacher_hf_path)
32-
get_tiny_qwen3(vocab_size=128).save_pretrained(teacher_hf_path)
33-
34-
tp_size = num_gpus
28+
teacher_hf_path = create_tiny_qwen3_dir(tmp_path, with_tokenizer=True)
3529
train_iters = 5
3630
distill_output_dir = tmp_path / "distill_output"
3731
distill_cmd_parts = extend_cmd_parts(
38-
["torchrun", f"--nproc_per_node={tp_size}", "distill.py", "--use_mock_data"],
32+
["torchrun", f"--nproc_per_node={num_gpus}", "distill.py", "--use_mock_data"],
3933
student_hf_path=teacher_hf_path,
4034
teacher_hf_path=teacher_hf_path,
4135
output_dir=distill_output_dir,
42-
tp_size=tp_size,
36+
tp_size=num_gpus,
37+
pp_size=1,
4338
seq_length=32,
4439
mbs=1,
4540
gbs=4,
@@ -88,41 +83,24 @@ def test_distill_puzzletron_anymodel(tmp_path: Path, num_gpus):
8883
tmp_path
8984
)
9085

91-
output_dir = tmp_path / "distill_output"
92-
93-
tp_size = num_gpus
9486
train_iters = 5
95-
96-
cmd_parts = [
97-
"torchrun",
98-
f"--nproc_per_node={tp_size}",
99-
"--master-addr",
100-
"127.0.0.1",
101-
"--master-port",
102-
str(get_free_port()),
103-
"distill.py",
104-
"--use_mock_data",
105-
]
106-
extend_cmd_parts(
107-
cmd_parts,
87+
output_dir = tmp_path / "distill_output"
88+
cmd_parts = extend_cmd_parts(
89+
["torchrun", f"--nproc_per_node={num_gpus}", "distill.py", "--use_mock_data"],
10890
student_hf_path=student_anymodel_dir,
10991
teacher_hf_path=teacher_hf_dir,
11092
output_dir=output_dir,
111-
tp_size=tp_size,
93+
tp_size=num_gpus,
11294
pp_size=1,
113-
seq_length=128,
114-
split="99,1,0",
95+
seq_length=32,
11596
mbs=1,
11697
gbs=4,
11798
train_iters=train_iters,
118-
lr=0.0001,
119-
min_lr=1e-5,
12099
lr_warmup_iters=2,
121-
eval_interval=100,
122-
eval_iters=0,
123-
log_interval=5,
100+
eval_interval=5,
101+
eval_iters=1,
102+
log_interval=1,
124103
)
125-
126104
run_example_command(cmd_parts, example_path="megatron_bridge")
127105

128106
run_config_path = output_dir / "checkpoints" / f"iter_{train_iters:07d}" / "run_config.yaml"
@@ -135,20 +113,13 @@ def _prepare_puzzletron_anymodel_student_and_teacher(tmp_path: Path) -> tuple[Pa
135113
teacher_hf_dir = tmp_path / "teacher_hf"
136114

137115
tokenizer = get_tiny_tokenizer()
138-
vocab_size = 128 # must be divisible by TP size
139116

140117
create_and_save_small_hf_model(
141-
output_path=str(student_hf_dir),
142-
tokenizer=tokenizer,
143-
hf_model_name="Qwen/Qwen3-0.6B",
144-
vocab_size=vocab_size,
118+
output_path=str(student_hf_dir), tokenizer=tokenizer, hf_model_name="Qwen/Qwen3-0.6B"
145119
)
146120

147121
create_and_save_small_hf_model(
148-
output_path=str(teacher_hf_dir),
149-
tokenizer=tokenizer,
150-
hf_model_name="Qwen/Qwen3-0.6B",
151-
vocab_size=vocab_size,
122+
output_path=str(teacher_hf_dir), tokenizer=tokenizer, hf_model_name="Qwen/Qwen3-0.6B"
152123
)
153124

154125
student_anymodel_dir = tmp_path / "student_anymodel"

0 commit comments

Comments
 (0)