Use local tokenizer in mixtral tests to avoid HF Hub dependency

trvachov · claude · trvachov · commit 6ca089108025 · 2026-04-15T20:40:19.000-04:00
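Tests were failing in CI with 'Unable to load vocabulary from file'
because they relied on downloading the nvidia/Llama-3.1-8B-Instruct-FP8
tokenizer from the Hugging Face Hub. Added a session-scoped
local_tokenizer_path fixture to conftest.py that creates a small WordLevel
tokenizer on disk (in a temporary directory), pointed the existing
tokenizer_path fixture at it, and updated all tests in test_train.py and
test_train_two_gpu.py to override dataset.tokenizer_name_or_path with it.
Also shortened several multi-paragraph test docstrings to one-line
summaries.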

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/bionemo-recipes/recipes/mixtral_native_te/tests/conftest.py b/bionemo-recipes/recipes/mixtral_native_te/tests/conftest.py
@@ -14,29 +14,75 @@
 # limitations under the License.
 
 import sys
+import tempfile
 from pathlib import Path
 from unittest import mock
 
 import pytest
 import torch
+from tokenizers import Tokenizer
+from tokenizers.models import WordLevel
+from tokenizers.pre_tokenizers import Whitespace
 from transformer_engine.pytorch import fp8 as te_fp8
+from transformers import PreTrainedTokenizerFast
 
 
 sys.path.append(Path(__file__).parent.parent.as_posix())
 sys.path.append(Path(__file__).parent.as_posix())
 from distributed_config import DistributedConfig
 
 
+def _create_local_tokenizer(directory: Path) -> str:
+    """Create a small local tokenizer so tests don't depend on HF Hub."""
+    directory.mkdir(parents=True, exist_ok=True)
+    tokenizer = Tokenizer(
+        WordLevel(
+            vocab={
+                "[UNK]": 0,
+                "[PAD]": 1,
+                "[BOS]": 2,
+                "[EOS]": 3,
+                "the": 4,
+                "quick": 5,
+                "brown": 6,
+                "fox": 7,
+                "jumps": 8,
+                "over": 9,
+                "lazy": 10,
+                "dog": 11,
+            },
+            unk_token="[UNK]",
+        )
+    )
+    tokenizer.pre_tokenizer = Whitespace()
+    fast_tokenizer = PreTrainedTokenizerFast(
+        tokenizer_object=tokenizer,
+        unk_token="[UNK]",
+        pad_token="[PAD]",
+        bos_token="[BOS]",
+        eos_token="[EOS]",
+    )
+    fast_tokenizer.save_pretrained(directory)
+    return str(directory)
+
+
+@pytest.fixture(scope="session")
+def local_tokenizer_path():
+    """Session-scoped local tokenizer that avoids HF Hub downloads."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        yield _create_local_tokenizer(Path(tmpdir) / "tokenizer")
+
+
 @pytest.fixture
 def recipe_path() -> Path:
     """Return the root directory of the recipe."""
     return Path(__file__).parent.parent
 
 
 @pytest.fixture
-def tokenizer_path(recipe_path):
-    """Get the path to the recipe tokenizer."""
-    return "nvidia/Llama-3.1-8B-Instruct-FP8"
+def tokenizer_path(local_tokenizer_path):
+    """Get the path to the local test tokenizer."""
+    return local_tokenizer_path
 
 
 @pytest.fixture(autouse=True)
diff --git a/bionemo-recipes/recipes/mixtral_native_te/tests/test_train.py b/bionemo-recipes/recipes/mixtral_native_te/tests/test_train.py
@@ -55,13 +55,15 @@ def set_seed():
         torch.cuda.manual_seed_all(42)
 
 
-def test_sanity_convergence_fsdp2_te_bshd(tmp_path, recipe_path):
+def test_sanity_convergence_fsdp2_te_bshd(tmp_path, recipe_path, local_tokenizer_path):
+    tokenizer_path = local_tokenizer_path
     with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
         sanity_config = compose(
             config_name="L0_sanity",
             overrides=[
                 f"+wandb.dir={tmp_path}",
                 f"checkpoint.ckpt_dir={tmp_path}",
+                f"dataset.tokenizer_name_or_path={tokenizer_path}",
                 "checkpoint.resume_from_checkpoint=false",
                 "num_train_steps=40",
                 "config_kwargs.attn_input_format=bshd",
@@ -74,13 +76,15 @@ def test_sanity_convergence_fsdp2_te_bshd(tmp_path, recipe_path):
     assert final_loss < 8.5, f"Final loss {final_loss} is too high, expected < 8.5"
 
 
-def test_sanity_convergence_fsdp2_te_thd(tmp_path, recipe_path):
+def test_sanity_convergence_fsdp2_te_thd(tmp_path, recipe_path, local_tokenizer_path):
+    tokenizer_path = local_tokenizer_path
     with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
         sanity_config = compose(
             config_name="L0_sanity",
             overrides=[
                 f"+wandb.dir={tmp_path}",
                 f"checkpoint.ckpt_dir={tmp_path}",
+                f"dataset.tokenizer_name_or_path={tokenizer_path}",
                 "checkpoint.resume_from_checkpoint=false",
                 "num_train_steps=40",
                 "use_sequence_packing=true",
@@ -95,14 +99,16 @@ def test_sanity_convergence_fsdp2_te_thd(tmp_path, recipe_path):
     assert final_loss < 8.5, f"Final loss {final_loss} is too high, expected < 8.5"
 
 
-def test_sanity_convergence_fsdp2_te_bshd_grad_acc(tmp_path, recipe_path):
+def test_sanity_convergence_fsdp2_te_bshd_grad_acc(tmp_path, recipe_path, local_tokenizer_path):
     """Test FSDP2 training with gradient accumulation."""
+    tokenizer_path = local_tokenizer_path
     with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
         sanity_config = compose(
             config_name="L0_sanity",
             overrides=[
                 f"+wandb.dir={tmp_path}",
                 f"checkpoint.ckpt_dir={tmp_path}",
+                f"dataset.tokenizer_name_or_path={tokenizer_path}",
                 "checkpoint.resume_from_checkpoint=false",
                 "num_train_steps=40",
                 "config_kwargs.attn_input_format=bshd",
@@ -117,21 +123,16 @@ def test_sanity_convergence_fsdp2_te_bshd_grad_acc(tmp_path, recipe_path):
     assert final_loss < 8.5, f"Final loss {final_loss} is too high, expected < 8.5"
 
 
-def test_sanity_convergence_ddp_te(tmp_path, recipe_path):
-    """Test that DDP training converges on sanity-scale data.
-
-    This test validates:
-    - The train_ddp.py script runs end-to-end without errors
-    - Model, optimizer, and dataloader integrate correctly
-    - Training converges to reasonable loss on small dataset
-    - Uses L0_sanity config with small model and few training steps
-    """
+def test_sanity_convergence_ddp_te(tmp_path, recipe_path, local_tokenizer_path):
+    """Test that DDP training converges on sanity-scale data."""
+    tokenizer_path = local_tokenizer_path
     with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
         sanity_config = compose(
             config_name="L0_sanity",
             overrides=[
                 f"+wandb.dir={tmp_path}",
                 f"checkpoint.ckpt_dir={tmp_path}",
+                f"dataset.tokenizer_name_or_path={tokenizer_path}",
                 "checkpoint.resume_from_checkpoint=false",
                 "num_train_steps=40",
                 "config_kwargs.attn_input_format=bshd",
@@ -144,14 +145,16 @@ def test_sanity_convergence_ddp_te(tmp_path, recipe_path):
     assert final_loss < 8.5, f"Final loss {final_loss} is too high, expected < 8.5"
 
 
-def test_sanity_convergence_ddp_te_grad_acc(tmp_path, recipe_path):
+def test_sanity_convergence_ddp_te_grad_acc(tmp_path, recipe_path, local_tokenizer_path):
     """Test DDP training with gradient accumulation."""
+    tokenizer_path = local_tokenizer_path
     with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
         sanity_config = compose(
             config_name="L0_sanity",
             overrides=[
                 f"+wandb.dir={tmp_path}",
                 f"checkpoint.ckpt_dir={tmp_path}",
+                f"dataset.tokenizer_name_or_path={tokenizer_path}",
                 "checkpoint.resume_from_checkpoint=false",
                 "num_train_steps=40",
                 "config_kwargs.attn_input_format=bshd",
@@ -165,20 +168,16 @@ def test_sanity_convergence_ddp_te_grad_acc(tmp_path, recipe_path):
     assert final_loss < 8.5, f"Final loss {final_loss} is too high, expected < 8.5"
 
 
-def test_sanity_convergence_fsdp2_hf(tmp_path, recipe_path):
-    """Test that FSDP2 training converges with HuggingFace (non-TE) model.
-
-    This test validates:
-    - The train_fsdp2.py script runs end-to-end without errors using vanilla HF layers
-    - FSDP2 wrapping and sharding work correctly without TransformerEngine
-    - Training converges to reasonable loss on small dataset
-    """
+def test_sanity_convergence_fsdp2_hf(tmp_path, recipe_path, local_tokenizer_path):
+    """Test that FSDP2 training converges with HuggingFace (non-TE) model."""
+    tokenizer_path = local_tokenizer_path
     with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
         sanity_config = compose(
             config_name="L0_sanity",
             overrides=[
                 f"+wandb.dir={tmp_path}",
                 f"checkpoint.ckpt_dir={tmp_path}",
+                f"dataset.tokenizer_name_or_path={tokenizer_path}",
                 "checkpoint.resume_from_checkpoint=false",
                 "num_train_steps=40",
                 "use_te=false",
@@ -194,14 +193,16 @@ def test_sanity_convergence_fsdp2_hf(tmp_path, recipe_path):
 
 @requires_fp8
 @requires_datacenter_hardware
-def test_sanity_convergence_fsdp2_te_fp8(tmp_path, recipe_path, fp_recipe):
+def test_sanity_convergence_fsdp2_te_fp8(tmp_path, recipe_path, local_tokenizer_path, fp_recipe):
     """Test FSDP2 training with FP8 enabled using parametrized FP8 recipes."""
+    tokenizer_path = local_tokenizer_path
     with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
         sanity_config = compose(
             config_name="L0_sanity",
             overrides=[
                 f"+wandb.dir={tmp_path}",
                 f"checkpoint.ckpt_dir={tmp_path}",
+                f"dataset.tokenizer_name_or_path={tokenizer_path}",
                 "checkpoint.resume_from_checkpoint=false",
                 "num_train_steps=40",
                 "config_kwargs.attn_input_format=bshd",
diff --git a/bionemo-recipes/recipes/mixtral_native_te/tests/test_train_two_gpu.py b/bionemo-recipes/recipes/mixtral_native_te/tests/test_train_two_gpu.py
@@ -44,17 +44,8 @@ def run_train_cmd(cmd, recipe_path):
 
 
 @requires_multi_gpu
-def test_multi_gpu_train_ddp(recipe_path):
-    """Test DDP training on 2 GPUs.
-
-    This test validates:
-    - DDP launches successfully with 2 processes
-    - Both GPUs are utilized
-    - Training completes without errors
-    - Gradient synchronization works across GPUs
-
-    The test runs only 4 training steps for speed.
-    """
+def test_multi_gpu_train_ddp(recipe_path, local_tokenizer_path):
+    """Test DDP training on 2 GPUs."""
     run_train_cmd(
         [
             "torchrun",
@@ -66,13 +57,14 @@ def test_multi_gpu_train_ddp(recipe_path):
             "L0_sanity",
             "num_train_steps=4",
             "expert_parallel_size=1",
+            f"dataset.tokenizer_name_or_path={local_tokenizer_path}",
         ],
         recipe_path,
     )
 
 
 @requires_multi_gpu
-def test_multi_gpu_train_fsdp2(recipe_path):
+def test_multi_gpu_train_fsdp2(recipe_path, local_tokenizer_path):
     run_train_cmd(
         [
             "torchrun",
@@ -83,20 +75,15 @@ def test_multi_gpu_train_fsdp2(recipe_path):
             "--config-name",
             "L0_sanity",
             "num_train_steps=4",
+            f"dataset.tokenizer_name_or_path={local_tokenizer_path}",
         ],
         recipe_path,
     )
 
 
 @requires_multi_gpu
-def test_multi_gpu_train_fsdp2_with_checkpointing(tmp_path, recipe_path):
-    """Test FSDP2 training on 2 GPUs with checkpoint saving.
-
-    This test validates:
-    - FSDP2 can save checkpoints with multiple processes
-    - Sharded checkpoints are created correctly
-    - No race conditions in checkpoint saving
-    """
+def test_multi_gpu_train_fsdp2_with_checkpointing(tmp_path, recipe_path, local_tokenizer_path):
+    """Test FSDP2 training on 2 GPUs with checkpoint saving."""
     run_train_cmd(
         [
             "torchrun",
@@ -111,6 +98,7 @@ def test_multi_gpu_train_fsdp2_with_checkpointing(tmp_path, recipe_path):
             "checkpoint.save_every_n_steps=5",
             "dataset.use_stateful_dataloader=true",
             "expert_parallel_size=1",
+            f"dataset.tokenizer_name_or_path={local_tokenizer_path}",
         ],
         recipe_path,
     )
@@ -122,16 +110,8 @@ def test_multi_gpu_train_fsdp2_with_checkpointing(tmp_path, recipe_path):
 
 
 @requires_multi_gpu
-def test_multi_gpu_train_fsdp2_ep2(recipe_path):
-    """Test FSDP2 training with expert parallelism on 2 GPUs.
-
-    This test validates:
-    - Expert parallelism (EP=2) works with FSDP2 on 2 GPUs
-    - MoE routing and expert distribution across GPUs functions correctly
-    - Training completes without errors
-
-    The test runs only 4 training steps for speed.
-    """
+def test_multi_gpu_train_fsdp2_ep2(recipe_path, local_tokenizer_path):
+    """Test FSDP2 training with expert parallelism (EP=2) on 2 GPUs."""
     run_train_cmd(
         [
             "torchrun",
@@ -143,6 +123,7 @@ def test_multi_gpu_train_fsdp2_ep2(recipe_path):
             "L0_sanity",
             "num_train_steps=4",
             "expert_parallel_size=2",
+            f"dataset.tokenizer_name_or_path={local_tokenizer_path}",
         ],
         recipe_path,
     )