Skip to content

Commit 6ca0891

Browse files
trvachov and claude committed
Use local tokenizer in mixtral tests to avoid HF Hub dependency
Tests were failing in CI with 'Unable to load vocabulary from file' because they relied on downloading the nvidia/Llama-3.1-8B-Instruct-FP8 tokenizer from the HuggingFace Hub. Added a session-scoped local_tokenizer_path fixture to conftest.py that creates a small WordLevel tokenizer on disk, and updated all tests to override dataset.tokenizer_name_or_path with it.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent cad830b commit 6ca0891

3 files changed

Lines changed: 83 additions & 55 deletions

File tree

bionemo-recipes/recipes/mixtral_native_te/tests/conftest.py

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,29 +14,75 @@
1414
# limitations under the License.
1515

1616
import sys
17+
import tempfile
1718
from pathlib import Path
1819
from unittest import mock
1920

2021
import pytest
2122
import torch
23+
from tokenizers import Tokenizer
24+
from tokenizers.models import WordLevel
25+
from tokenizers.pre_tokenizers import Whitespace
2226
from transformer_engine.pytorch import fp8 as te_fp8
27+
from transformers import PreTrainedTokenizerFast
2328

2429

2530
sys.path.append(Path(__file__).parent.parent.as_posix())
2631
sys.path.append(Path(__file__).parent.as_posix())
2732
from distributed_config import DistributedConfig
2833

2934

35+
def _create_local_tokenizer(directory: Path) -> str:
    """Build a tiny WordLevel tokenizer on disk so tests don't depend on HF Hub.

    Args:
        directory: Folder to save the tokenizer into; it is created
            (including parents) if it does not exist yet.

    Returns:
        ``directory`` converted to a string.
    """
    directory.mkdir(parents=True, exist_ok=True)

    unk, pad, bos, eos = "[UNK]", "[PAD]", "[BOS]", "[EOS]"
    words = ("the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog")
    # Ids follow position: the four special tokens get 0-3, the words get 4-11.
    vocab = {token: index for index, token in enumerate((unk, pad, bos, eos, *words))}

    backend = Tokenizer(WordLevel(vocab=vocab, unk_token=unk))
    backend.pre_tokenizer = Whitespace()

    wrapped = PreTrainedTokenizerFast(
        tokenizer_object=backend,
        unk_token=unk,
        pad_token=pad,
        bos_token=bos,
        eos_token=eos,
    )
    wrapped.save_pretrained(directory)
    return str(directory)
67+
68+
69+
@pytest.fixture(scope="session")
def local_tokenizer_path():
    """Yield the path of a locally created tokenizer, avoiding HF Hub downloads.

    The tokenizer is written into a ``tokenizer`` subfolder of a temporary
    directory; that directory is removed once the fixture is torn down.
    """
    scratch = tempfile.TemporaryDirectory()
    try:
        yield _create_local_tokenizer(Path(scratch.name) / "tokenizer")
    finally:
        # Mirrors leaving a ``with TemporaryDirectory()`` block.
        scratch.cleanup()
74+
75+
3076
@pytest.fixture
def recipe_path() -> Path:
    """Return the recipe root, i.e. the directory two levels above this file."""
    containing_dir = Path(__file__).parent
    return containing_dir.parent
3480

3581

3682
@pytest.fixture
37-
def tokenizer_path(recipe_path):
38-
"""Get the path to the recipe tokenizer."""
39-
return "nvidia/Llama-3.1-8B-Instruct-FP8"
83+
def tokenizer_path(local_tokenizer_path):
84+
"""Get the path to the local test tokenizer."""
85+
return local_tokenizer_path
4086

4187

4288
@pytest.fixture(autouse=True)

bionemo-recipes/recipes/mixtral_native_te/tests/test_train.py

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -55,13 +55,15 @@ def set_seed():
5555
torch.cuda.manual_seed_all(42)
5656

5757

58-
def test_sanity_convergence_fsdp2_te_bshd(tmp_path, recipe_path):
58+
def test_sanity_convergence_fsdp2_te_bshd(tmp_path, recipe_path, local_tokenizer_path):
59+
tokenizer_path = local_tokenizer_path
5960
with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
6061
sanity_config = compose(
6162
config_name="L0_sanity",
6263
overrides=[
6364
f"+wandb.dir={tmp_path}",
6465
f"checkpoint.ckpt_dir={tmp_path}",
66+
f"dataset.tokenizer_name_or_path={tokenizer_path}",
6567
"checkpoint.resume_from_checkpoint=false",
6668
"num_train_steps=40",
6769
"config_kwargs.attn_input_format=bshd",
@@ -74,13 +76,15 @@ def test_sanity_convergence_fsdp2_te_bshd(tmp_path, recipe_path):
7476
assert final_loss < 8.5, f"Final loss {final_loss} is too high, expected < 8.5"
7577

7678

77-
def test_sanity_convergence_fsdp2_te_thd(tmp_path, recipe_path):
79+
def test_sanity_convergence_fsdp2_te_thd(tmp_path, recipe_path, local_tokenizer_path):
80+
tokenizer_path = local_tokenizer_path
7881
with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
7982
sanity_config = compose(
8083
config_name="L0_sanity",
8184
overrides=[
8285
f"+wandb.dir={tmp_path}",
8386
f"checkpoint.ckpt_dir={tmp_path}",
87+
f"dataset.tokenizer_name_or_path={tokenizer_path}",
8488
"checkpoint.resume_from_checkpoint=false",
8589
"num_train_steps=40",
8690
"use_sequence_packing=true",
@@ -95,14 +99,16 @@ def test_sanity_convergence_fsdp2_te_thd(tmp_path, recipe_path):
9599
assert final_loss < 8.5, f"Final loss {final_loss} is too high, expected < 8.5"
96100

97101

98-
def test_sanity_convergence_fsdp2_te_bshd_grad_acc(tmp_path, recipe_path):
102+
def test_sanity_convergence_fsdp2_te_bshd_grad_acc(tmp_path, recipe_path, local_tokenizer_path):
99103
"""Test FSDP2 training with gradient accumulation."""
104+
tokenizer_path = local_tokenizer_path
100105
with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
101106
sanity_config = compose(
102107
config_name="L0_sanity",
103108
overrides=[
104109
f"+wandb.dir={tmp_path}",
105110
f"checkpoint.ckpt_dir={tmp_path}",
111+
f"dataset.tokenizer_name_or_path={tokenizer_path}",
106112
"checkpoint.resume_from_checkpoint=false",
107113
"num_train_steps=40",
108114
"config_kwargs.attn_input_format=bshd",
@@ -117,21 +123,16 @@ def test_sanity_convergence_fsdp2_te_bshd_grad_acc(tmp_path, recipe_path):
117123
assert final_loss < 8.5, f"Final loss {final_loss} is too high, expected < 8.5"
118124

119125

120-
def test_sanity_convergence_ddp_te(tmp_path, recipe_path):
121-
"""Test that DDP training converges on sanity-scale data.
122-
123-
This test validates:
124-
- The train_ddp.py script runs end-to-end without errors
125-
- Model, optimizer, and dataloader integrate correctly
126-
- Training converges to reasonable loss on small dataset
127-
- Uses L0_sanity config with small model and few training steps
128-
"""
126+
def test_sanity_convergence_ddp_te(tmp_path, recipe_path, local_tokenizer_path):
127+
"""Test that DDP training converges on sanity-scale data."""
128+
tokenizer_path = local_tokenizer_path
129129
with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
130130
sanity_config = compose(
131131
config_name="L0_sanity",
132132
overrides=[
133133
f"+wandb.dir={tmp_path}",
134134
f"checkpoint.ckpt_dir={tmp_path}",
135+
f"dataset.tokenizer_name_or_path={tokenizer_path}",
135136
"checkpoint.resume_from_checkpoint=false",
136137
"num_train_steps=40",
137138
"config_kwargs.attn_input_format=bshd",
@@ -144,14 +145,16 @@ def test_sanity_convergence_ddp_te(tmp_path, recipe_path):
144145
assert final_loss < 8.5, f"Final loss {final_loss} is too high, expected < 8.5"
145146

146147

147-
def test_sanity_convergence_ddp_te_grad_acc(tmp_path, recipe_path):
148+
def test_sanity_convergence_ddp_te_grad_acc(tmp_path, recipe_path, local_tokenizer_path):
148149
"""Test DDP training with gradient accumulation."""
150+
tokenizer_path = local_tokenizer_path
149151
with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
150152
sanity_config = compose(
151153
config_name="L0_sanity",
152154
overrides=[
153155
f"+wandb.dir={tmp_path}",
154156
f"checkpoint.ckpt_dir={tmp_path}",
157+
f"dataset.tokenizer_name_or_path={tokenizer_path}",
155158
"checkpoint.resume_from_checkpoint=false",
156159
"num_train_steps=40",
157160
"config_kwargs.attn_input_format=bshd",
@@ -165,20 +168,16 @@ def test_sanity_convergence_ddp_te_grad_acc(tmp_path, recipe_path):
165168
assert final_loss < 8.5, f"Final loss {final_loss} is too high, expected < 8.5"
166169

167170

168-
def test_sanity_convergence_fsdp2_hf(tmp_path, recipe_path):
169-
"""Test that FSDP2 training converges with HuggingFace (non-TE) model.
170-
171-
This test validates:
172-
- The train_fsdp2.py script runs end-to-end without errors using vanilla HF layers
173-
- FSDP2 wrapping and sharding work correctly without TransformerEngine
174-
- Training converges to reasonable loss on small dataset
175-
"""
171+
def test_sanity_convergence_fsdp2_hf(tmp_path, recipe_path, local_tokenizer_path):
172+
"""Test that FSDP2 training converges with HuggingFace (non-TE) model."""
173+
tokenizer_path = local_tokenizer_path
176174
with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
177175
sanity_config = compose(
178176
config_name="L0_sanity",
179177
overrides=[
180178
f"+wandb.dir={tmp_path}",
181179
f"checkpoint.ckpt_dir={tmp_path}",
180+
f"dataset.tokenizer_name_or_path={tokenizer_path}",
182181
"checkpoint.resume_from_checkpoint=false",
183182
"num_train_steps=40",
184183
"use_te=false",
@@ -194,14 +193,16 @@ def test_sanity_convergence_fsdp2_hf(tmp_path, recipe_path):
194193

195194
@requires_fp8
196195
@requires_datacenter_hardware
197-
def test_sanity_convergence_fsdp2_te_fp8(tmp_path, recipe_path, fp_recipe):
196+
def test_sanity_convergence_fsdp2_te_fp8(tmp_path, recipe_path, local_tokenizer_path, fp_recipe):
198197
"""Test FSDP2 training with FP8 enabled using parametrized FP8 recipes."""
198+
tokenizer_path = local_tokenizer_path
199199
with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
200200
sanity_config = compose(
201201
config_name="L0_sanity",
202202
overrides=[
203203
f"+wandb.dir={tmp_path}",
204204
f"checkpoint.ckpt_dir={tmp_path}",
205+
f"dataset.tokenizer_name_or_path={tokenizer_path}",
205206
"checkpoint.resume_from_checkpoint=false",
206207
"num_train_steps=40",
207208
"config_kwargs.attn_input_format=bshd",

bionemo-recipes/recipes/mixtral_native_te/tests/test_train_two_gpu.py

Lines changed: 11 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -44,17 +44,8 @@ def run_train_cmd(cmd, recipe_path):
4444

4545

4646
@requires_multi_gpu
47-
def test_multi_gpu_train_ddp(recipe_path):
48-
"""Test DDP training on 2 GPUs.
49-
50-
This test validates:
51-
- DDP launches successfully with 2 processes
52-
- Both GPUs are utilized
53-
- Training completes without errors
54-
- Gradient synchronization works across GPUs
55-
56-
The test runs only 4 training steps for speed.
57-
"""
47+
def test_multi_gpu_train_ddp(recipe_path, local_tokenizer_path):
48+
"""Test DDP training on 2 GPUs."""
5849
run_train_cmd(
5950
[
6051
"torchrun",
@@ -66,13 +57,14 @@ def test_multi_gpu_train_ddp(recipe_path):
6657
"L0_sanity",
6758
"num_train_steps=4",
6859
"expert_parallel_size=1",
60+
f"dataset.tokenizer_name_or_path={local_tokenizer_path}",
6961
],
7062
recipe_path,
7163
)
7264

7365

7466
@requires_multi_gpu
75-
def test_multi_gpu_train_fsdp2(recipe_path):
67+
def test_multi_gpu_train_fsdp2(recipe_path, local_tokenizer_path):
7668
run_train_cmd(
7769
[
7870
"torchrun",
@@ -83,20 +75,15 @@ def test_multi_gpu_train_fsdp2(recipe_path):
8375
"--config-name",
8476
"L0_sanity",
8577
"num_train_steps=4",
78+
f"dataset.tokenizer_name_or_path={local_tokenizer_path}",
8679
],
8780
recipe_path,
8881
)
8982

9083

9184
@requires_multi_gpu
92-
def test_multi_gpu_train_fsdp2_with_checkpointing(tmp_path, recipe_path):
93-
"""Test FSDP2 training on 2 GPUs with checkpoint saving.
94-
95-
This test validates:
96-
- FSDP2 can save checkpoints with multiple processes
97-
- Sharded checkpoints are created correctly
98-
- No race conditions in checkpoint saving
99-
"""
85+
def test_multi_gpu_train_fsdp2_with_checkpointing(tmp_path, recipe_path, local_tokenizer_path):
86+
"""Test FSDP2 training on 2 GPUs with checkpoint saving."""
10087
run_train_cmd(
10188
[
10289
"torchrun",
@@ -111,6 +98,7 @@ def test_multi_gpu_train_fsdp2_with_checkpointing(tmp_path, recipe_path):
11198
"checkpoint.save_every_n_steps=5",
11299
"dataset.use_stateful_dataloader=true",
113100
"expert_parallel_size=1",
101+
f"dataset.tokenizer_name_or_path={local_tokenizer_path}",
114102
],
115103
recipe_path,
116104
)
@@ -122,16 +110,8 @@ def test_multi_gpu_train_fsdp2_with_checkpointing(tmp_path, recipe_path):
122110

123111

124112
@requires_multi_gpu
125-
def test_multi_gpu_train_fsdp2_ep2(recipe_path):
126-
"""Test FSDP2 training with expert parallelism on 2 GPUs.
127-
128-
This test validates:
129-
- Expert parallelism (EP=2) works with FSDP2 on 2 GPUs
130-
- MoE routing and expert distribution across GPUs functions correctly
131-
- Training completes without errors
132-
133-
The test runs only 4 training steps for speed.
134-
"""
113+
def test_multi_gpu_train_fsdp2_ep2(recipe_path, local_tokenizer_path):
114+
"""Test FSDP2 training with expert parallelism (EP=2) on 2 GPUs."""
135115
run_train_cmd(
136116
[
137117
"torchrun",
@@ -143,6 +123,7 @@ def test_multi_gpu_train_fsdp2_ep2(recipe_path):
143123
"L0_sanity",
144124
"num_train_steps=4",
145125
"expert_parallel_size=2",
126+
f"dataset.tokenizer_name_or_path={local_tokenizer_path}",
146127
],
147128
recipe_path,
148129
)

0 commit comments

Comments
 (0)