NVIDIA-BioNeMo
diff --git a/‎.devcontainer/recipes/requirements.txt‎
Lines changed: 1 addition & 0 deletions b/‎.devcontainer/recipes/requirements.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎bionemo-recipes/models/llama3/nucleotide_fast_tokenizer/special_tokens_map.json‎
Lines changed: 28 additions & 4 deletions b/‎bionemo-recipes/models/llama3/nucleotide_fast_tokenizer/special_tokens_map.json‎
Lines changed: 28 additions & 4 deletions
diff --git a/‎bionemo-recipes/recipes/llama3_native_te/dataset.py‎
Lines changed: 22 additions & 22 deletions b/‎bionemo-recipes/recipes/llama3_native_te/dataset.py‎
Lines changed: 22 additions & 22 deletions
diff --git a/‎bionemo-recipes/recipes/llama3_native_te/example_checkpoint/README.md‎
Lines changed: 0 additions & 62 deletions b/‎bionemo-recipes/recipes/llama3_native_te/example_checkpoint/README.md‎
Lines changed: 0 additions & 62 deletions
diff --git a/‎bionemo-recipes/recipes/llama3_native_te/example_checkpoint/generation_config.json‎
Lines changed: 0 additions & 10 deletions b/‎bionemo-recipes/recipes/llama3_native_te/example_checkpoint/generation_config.json‎
Lines changed: 0 additions & 10 deletions
diff --git a/‎bionemo-recipes/recipes/llama3_native_te/hydra_config/L0_convergence.yaml‎
Lines changed: 12 additions & 3 deletions b/‎bionemo-recipes/recipes/llama3_native_te/hydra_config/L0_convergence.yaml‎
Lines changed: 12 additions & 3 deletions
diff --git a/‎bionemo-recipes/recipes/llama3_native_te/hydra_config/L0_sanity.yaml‎
Lines changed: 13 additions & 5 deletions b/‎bionemo-recipes/recipes/llama3_native_te/hydra_config/L0_sanity.yaml‎
Lines changed: 13 additions & 5 deletions
@@ -13,3 +13,4 @@ transformer_engine
 transformers
 typer
 wandb
+zstandard
@@ -1,6 +1,30 @@
 {
-  "bos_token": "<BOS>",
-  "eos_token": "<EOS>",
-  "pad_token": "<PAD>",
-  "unk_token": "<UNK>"
+  "bos_token": {
+    "content": "<BOS>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<EOS>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<PAD>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<UNK>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
 }
@@ -32,25 +32,25 @@
 
 def create_tokenized_dataset(
     distributed_config: DistributedConfig,
-    tokenizer_path: str,
+    tokenizer_name_or_path: str,
     load_dataset_kwargs: dict,
     max_seq_length: int = 8192,
     stride: int = 200,
     buffer_size: int = 500_000,
     use_lazy_tokenization: bool = True,
-    sequence_column: str = "sequence",
+    text_column: str = "text",
 ):
     """Create a tokenized dataset with windowing.
 
     Args:
         distributed_config: The distributed configuration.
-        tokenizer_path: Path to the nucleotide tokenizer directory.
+        tokenizer_name_or_path: Name or path to the nucleotide tokenizer directory.
         load_dataset_kwargs: Keyword arguments to pass to `load_dataset`.
         max_seq_length: The maximum length of sequences (window size).
         stride: The stride for windowing (overlap = stride tokens).
         buffer_size: The buffer size for shuffle.
         use_lazy_tokenization: Whether to use datasets.set_transform for tokenization.
-        sequence_column: Name of the column containing genomic sequences (default: "sequence").
+        text_column: Name of the column containing genomic sequences (default: "text").
 
     Returns:
         Tuple of (tokenized_dataset, tokenizer).
@@ -67,13 +67,13 @@ def create_tokenized_dataset(
         )
         dataset = dataset.shuffle(seed=42, buffer_size=buffer_size)
 
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
 
     def tokenize_with_windowing(examples):
         """Tokenize nucleotide sequences with windowing (one-to-many mapping)."""
         # Tokenize with windowing using return_overflowing_tokens
         result = tokenizer(
-            examples[sequence_column],
+            examples[text_column],
             max_length=max_seq_length,
             stride=stride,
             truncation=True,
@@ -91,7 +91,7 @@ def tokenize_with_windowing(examples):
         # This causes dataset.column_names to be None for streaming IterableDataset.
         #
         # For IterableDataset with None column_names (OpenGenome2):
-        #   - Must explicitly list columns to remove: [sequence_column, "record"]
+        #   - Must explicitly list columns to remove: [text_column, "record"]
         #   - IterableDataset.map() handles missing columns gracefully
         #
         # For regular Dataset (non-streaming, or streaming with consistent schema like ESM2):
@@ -100,9 +100,9 @@ def tokenize_with_windowing(examples):
         #
         # TODO: Remove this workaround once Arc Institute fixes OpenGenome2 schema consistency.
         # When all shards have the same columns, dataset.column_names will work for both cases.
-        if isinstance(dataset, datasets.IterableDataset):
+        if isinstance(dataset, datasets.IterableDataset) and dataset.column_names is None:
             # Streaming dataset whose column_names is None (inconsistent shard schema): list columns explicitly
-            columns_to_remove = [sequence_column, "record"]
+            columns_to_remove = [text_column, "record"]
         else:
             # Non-streaming dataset, or streaming dataset with known columns: use actual column names
             columns_to_remove = dataset.column_names
@@ -120,7 +120,7 @@ def tokenize_with_windowing(examples):
 
 def create_bshd_dataloader(
     distributed_config: DistributedConfig,
-    tokenizer_path: str,
+    tokenizer_name_or_path: str,
     load_dataset_kwargs: dict,
     micro_batch_size: int,
     num_workers: int = 1,
@@ -130,15 +130,15 @@ def create_bshd_dataloader(
     buffer_size: int = 500_000,
     use_lazy_tokenization: bool = True,
     use_stateful_dataloader: bool = False,
-    sequence_column: str = "sequence",
+    text_column: str = "text",
     uppercase_labels: bool = False,
     mask_degenerate_bases: bool = True,
 ):
     """Create a BSHD dataloader for genomic sequences using CLM (causal language modeling).
 
     Args:
         distributed_config: The distributed configuration.
-        tokenizer_path: Path to the nucleotide tokenizer directory.
+        tokenizer_name_or_path: Name or path to the nucleotide tokenizer directory.
         load_dataset_kwargs: Keyword arguments to pass to `load_dataset`.
         micro_batch_size: The batch size per device.
         num_workers: The number of workers to use for the dataloader.
@@ -148,7 +148,7 @@ def create_bshd_dataloader(
         buffer_size: The buffer size for shuffle.
         use_lazy_tokenization: Whether to use datasets.set_transform for tokenization.
         use_stateful_dataloader: Whether to use the StatefulDataLoader to enable checkpointing the dataloader state.
-        sequence_column: Name of the column containing genomic sequences (default: "sequence").
+        text_column: Name of the column containing genomic sequences (default: "text").
         uppercase_labels: Whether to uppercase labels (genomic masking). Default: False.
         mask_degenerate_bases: Whether to mask non-ACGT bases (genomic masking). Default: True.
 
@@ -157,13 +157,13 @@ def create_bshd_dataloader(
     """
     tokenized_dataset, tokenizer = create_tokenized_dataset(
         distributed_config=distributed_config,
-        tokenizer_path=tokenizer_path,
+        tokenizer_name_or_path=tokenizer_name_or_path,
         load_dataset_kwargs=load_dataset_kwargs,
         max_seq_length=max_seq_length,
         stride=stride,
         buffer_size=buffer_size,
         use_lazy_tokenization=use_lazy_tokenization,
-        sequence_column=sequence_column,
+        text_column=text_column,
     )
 
     if isinstance(tokenized_dataset, datasets.IterableDataset):
@@ -214,7 +214,7 @@ def create_bshd_dataloader(
 
 def create_thd_dataloader(
     distributed_config: DistributedConfig,
-    tokenizer_path: str,
+    tokenizer_name_or_path: str,
     load_dataset_kwargs: dict,
     micro_batch_size: int | None = None,
     token_micro_batch_size: int | None = None,
@@ -224,15 +224,15 @@ def create_thd_dataloader(
     buffer_size: int = 500_000,
     use_lazy_tokenization: bool = True,
     use_stateful_dataloader: bool = False,
-    sequence_column: str = "sequence",
+    text_column: str = "text",
     uppercase_labels: bool = False,
     mask_degenerate_bases: bool = True,
 ):
     """Create a dataloader that packs up to the maximum number of tokens per batch.
 
     Args:
         distributed_config: The distributed configuration.
-        tokenizer_path: Path to the nucleotide tokenizer directory.
+        tokenizer_name_or_path: Name or path to the nucleotide tokenizer directory.
         load_dataset_kwargs: Keyword arguments to pass to `load_dataset`.
         micro_batch_size: The batch size per device.
         token_micro_batch_size: The maximum number of tokens per batch. If None, the micro_batch_size * max_seq_length
@@ -244,22 +244,22 @@ def create_thd_dataloader(
         buffer_size: The buffer size for shuffle.
         use_lazy_tokenization: Whether to use datasets.set_transform for tokenization.
         use_stateful_dataloader: Whether to use the StatefulDataLoader to enable checkpointing the dataloader state.
-        sequence_column: Name of the column containing genomic sequences (default: "sequence").
+        text_column: Name of the column containing genomic sequences (default: "text").
         uppercase_labels: Whether to uppercase labels (genomic masking). Default: False.
         mask_degenerate_bases: Whether to mask degenerate bases (genomic masking). Default: True.
 
     Returns:
         A dataloader that can be used for training.
     """
-    tokenized_dataset, tokenizer = create_tokenized_dataset(
+    tokenized_dataset, _ = create_tokenized_dataset(
         distributed_config=distributed_config,
-        tokenizer_path=tokenizer_path,
+        tokenizer_name_or_path=tokenizer_name_or_path,
         load_dataset_kwargs=load_dataset_kwargs,
         max_seq_length=max_seq_length,
         stride=stride,
         buffer_size=buffer_size,
         use_lazy_tokenization=use_lazy_tokenization,
-        sequence_column=sequence_column,
+        text_column=text_column,
     )
 
     assert isinstance(tokenized_dataset, datasets.IterableDataset), "THD token packing requires a streaming dataset."
 
@@ -9,13 +9,22 @@ defaults:
   - _self_
 
 # Use tiny Llama config for fast convergence testing
-model_tag: ./example_checkpoint
+config_name_or_path: ./model_configs/meta-llama/Llama-3.2-1B
+config_kwargs: # Arguments to pass to the AutoConfig.from_pretrained method
+  trust_remote_code: true
+  vocab_size: 256 # Overrides to the default config that comes from meta-llama/Llama-3.2-1B
+  tie_word_embeddings: false
+  eos_token_id: 0
+  pad_token_id: 1
+  bos_token_id: 2
+  attn_input_format: "bshd"
 
 num_train_steps: 270_000
 
 dataset:
+  tokenizer_name_or_path: ./tokenizers/nucleotide_fast_tokenizer
   micro_batch_size: 1 # Conservative for single GPU
-  sequence_column: "text"
+  text_column: "text"
   load_dataset_kwargs:
     path: "arcinstitute/opengenome2"
     data_dir: "json/pretraining_or_both_phases"
@@ -31,7 +40,7 @@ adamw_kwargs:
 
 lr_scheduler_kwargs:
   num_warmup_steps: 2_000
-  num_training_steps: 500_000
+  num_decay_steps: 500_000
 
 checkpoint:
   ckpt_dir: null # No checkpoints
 
@@ -2,27 +2,35 @@ defaults:
   - defaults
   - _self_
 
-# Training config
-model_tag: ./example_checkpoint # Use tiny Llama config for testing (4 layers, 384 hidden, ~9.6M params)
-
-config_kwargs:
+# Use tiny Llama config for fast sanity testing
+config_name_or_path: ./model_configs/meta-llama/Llama-3.2-1B
+config_kwargs: # Arguments to pass to the AutoConfig.from_pretrained method
+  trust_remote_code: true
+  vocab_size: 256 # Overrides to the default config that comes from meta-llama/Llama-3.2-1B
+  tie_word_embeddings: false
+  eos_token_id: 0
+  pad_token_id: 1
+  bos_token_id: 2
   num_hidden_layers: 2
   hidden_size: 384
   intermediate_size: 1536
   num_attention_heads: 6
   num_key_value_heads: 6
+  attn_input_format: "bshd"
 
 num_train_steps: 250
 
 # We want this on in CI/CD to validate that the script runs successfully with torch.compile.
 use_torch_compile: true # Set to false for faster startup during local testing
 
 dataset:
+  tokenizer_name_or_path: ./tokenizers/nucleotide_fast_tokenizer
   micro_batch_size: 1 # Small batch size for limited GPU memory
+  text_column: "sequence"
   load_dataset_kwargs:
     path: "parquet"
     split: "train"
-    data_files: "test_genomic_sequences.parquet"  # Use local test file in recipe directory
+    data_files: "test_genomic_sequences.parquet" # Use local test file in recipe directory
     streaming: True
 
 # WandB config