Address review feedback and add Llama train and finetune test

yzhang123 · yzhang123 · commit 2d6c7d96074f · 2025-09-04T19:37:12.000-07:00
Signed-off-by: Yang Zhang <yangzhang@nvidia.com>
diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/data/sharded_eden_dataloader.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/data/sharded_eden_dataloader.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 
 # high performance implementation of the EdenDataModule, assuming some items are pre-computed + sharded fasta files and fasta index files.
-
+# Contributed by: BaseCamp Research https://basecamp-research.com/ https://github.com/NVIDIA/bionemo-framework/pull/1091
 import argparse
 import csv
 import os
@@ -100,7 +100,7 @@ def __init__(
         log_dir: Optional[str] = None,
         **kwargs,
     ):
-        """Initialize the ShardedEdenDataModule."""
+        """Initialize the ShardedEdenDataModule. See sub-packages/bionemo-evo2/src/bionemo/evo2/data/sharded_eden_dataloader.md for how to prepare the input data."""
         super().__init__()
         self.sequence_db_dir = sequence_db_dir
         self.train_window_db_path = train_window_db_path
diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/models/llama.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/models/llama.py
@@ -13,13 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import logging
-import math
+
 from dataclasses import dataclass
 from typing import Optional
 
-import torch
 from nemo.collections import llm
+from nemo.collections.llm.gpt.model.llama import apply_rope_scaling
 
 
 @dataclass
@@ -57,48 +56,6 @@ def configure_model(self, *args, **kwargs):
         return model
 
 
-def apply_rope_scaling(
-    inv_freq,
-    factor: int = 8,
-    low_freq_factor: int = 1,
-    high_freq_factor: int = 4,
-    old_context_len: int = 8192,
-):
-    """Apply RoPE scaling for extending context length in Llama models.
-
-    This implements the NTK-aware RoPE scaling method used in Llama 3.1 models to
-    extend context length beyond the original training length.
-
-    Args:
-        inv_freq: Original inverse frequency tensor
-        factor: Scaling factor for context length extension
-        low_freq_factor: Factor for low frequency components
-        high_freq_factor: Factor for high frequency components
-        old_context_len: Original context length
-
-    Returns:
-        torch.Tensor: Modified inverse frequency tensor for extended context
-    """
-    logging.info(
-        f"Apply rope scaling with factor={factor}, low_freq_factor={low_freq_factor}, high_freq_factor={high_freq_factor}, old_context_len={old_context_len}."
-    )
-
-    low_freq_wavelen = old_context_len / low_freq_factor
-    high_freq_wavelen = old_context_len / high_freq_factor
-
-    wavelen = 2 * math.pi / inv_freq
-    # wavelen < high_freq_wavelen: do nothing
-    # wavelen > low_freq_wavelen: divide by factor
-    inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)
-    # otherwise: interpolate between the two, using a smooth factor
-    smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
-    smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
-    is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
-    inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
-
-    return inv_freq_llama
-
-
 @dataclass
 class Eden11BConfig(EdenConfig):
     """Eden-flavoured Llama-3.1 ~14B (keeps all Eden behaviors)."""
diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py
@@ -756,9 +756,10 @@ def train(args: argparse.Namespace) -> nl.Trainer:
         "distribute_saved_activations": False if args.sequence_parallel else True,
         "cross_entropy_loss_fusion": args.cross_entropy_loss_fusion,
         "fp32_residual_connection": not args.no_fp32_residual_connection,
-        "add_bias_output": args.add_bias_output,
         **activation_checkpointing_args,
     }
+    if args.add_bias_output:
+        config_modifiers_init["add_bias_output"] = args.add_bias_output
     if args.spike_no_more_embedding_init:
         config_modifiers_init["embedding_init_method_std"] = 1.0
         # When using spike_no_more_embedding_init, we don't want to share embeddings and outputs.
@@ -803,12 +804,10 @@ def train(args: argparse.Namespace) -> nl.Trainer:
         config_modifiers_init["lowercase_loss_reweighting"] = args.mamba_lowercase_loss_weight
         if args.model_size not in MAMBA_MODEL_OPTIONS:
             raise ValueError(f"Invalid model size for Mamba: {args.model_size}")
-        add_bias_output = config_modifiers_init.pop("add_bias_output")
-        if add_bias_output:
-            raise ValueError("Bias output is not supported for Mamba models.")
         model_config = MAMBA_MODEL_OPTIONS[args.model_size](**config_modifiers_init)
         model = MambaModel(model_config, tokenizer=data_module.tokenizer)
     elif model_type == "llama":
+        config_modifiers_init.pop("to_upper")
         model_config = LLAMA_MODEL_OPTIONS[args.model_size](**config_modifiers_init)
         model = llm.LlamaModel(model_config, tokenizer=data_module.tokenizer)
 
@@ -895,7 +894,7 @@ def train(args: argparse.Namespace) -> nl.Trainer:
         f"-GBS{global_batch_size}-MBS{args.micro_batch_size}-SkipLossRenorm{args.no_renormalize_loss}"
         f"-NOAC{args.no_activation_checkpointing}-SELAC{args.selective_activation_checkpointing}"
         f"-ACRNL{model_config.recompute_num_layers}"
-        f"-PAT{model_config.hybrid_override_pattern}"
+        f"-PAT{getattr(model_config, 'hybrid_override_pattern', 'None')}"
         f"-F32R{model_config.fp32_residual_connection}"
         f"-FCE{model_config.cross_entropy_loss_fusion}"
         f"-AIC{average_in_collective}"
@@ -1071,7 +1070,7 @@ def train(args: argparse.Namespace) -> nl.Trainer:
         constant_steps=args.constant_steps,
     )
     # This is where the no weight decay condition is applied to the optimizer state.
-    opt = MegatronOptimizerModule(opt_config, sched, no_weight_decay_cond=model_config.hyena_no_weight_decay_cond_fn)
+    opt = MegatronOptimizerModule(opt_config, sched, no_weight_decay_cond=getattr(model_config, 'hyena_no_weight_decay_cond_fn', None))
     opt.connect(model)
     # Start training
     trainer.fit(model, data_module)
diff --git a/sub-packages/bionemo-evo2/tests/bionemo/evo2/data/test_sharded_eden_dataset.py b/sub-packages/bionemo-evo2/tests/bionemo/evo2/data/test_sharded_eden_dataset.py
@@ -430,7 +430,7 @@ def test_dataset_reverse_complement(sequence_db_dir, window_dbs):
     # Test with N bases
     test_seq_with_n = "ATCN"
     rc_seq_with_n = dataset.reverse_complement(test_seq_with_n)
-    assert rc_seq_with_n == "NCGAT"
+    assert rc_seq_with_n == "NGAT"
 
     # Clean up
     dataset.__del__()
@@ -480,54 +480,6 @@ def test_dataset_collate_fn(sequence_db_dir, window_dbs):
     dataset.__del__()
 
 
-def test_invalid_sequence_db_dir(window_dbs):
-    """Test error handling for invalid sequence database directory."""
-    # Mock tokenizer
-    mock_tokenizer = Mock()
-    mock_tokenizer.bos_id = 1
-    mock_tokenizer.eos_id = 2
-    mock_tokenizer._sep_id = 3
-    mock_tokenizer.pad_id = 0
-    mock_tokenizer.text_to_ids.return_value = [10, 11, 12]
-
-    # Test with non-existent directory
-    with pytest.raises(ValueError, match="No SQLite files found"):
-        ShardedEdenDataset(
-            tokenizer=mock_tokenizer,
-            sequence_db_dir="/non/existent/path",
-            window_db_path=window_dbs["train"],
-            seq_length=8192,
-            create_attention_mask=False,
-            stride=7992,
-            rc_aug=False,
-            use_control_tags=False,
-            split="train",
-        )
-
-
-def test_invalid_window_db_path(sequence_db_dir):
-    """Test error handling for invalid window database path."""
-    # Mock tokenizer
-    mock_tokenizer = Mock()
-    mock_tokenizer.bos_id = 1
-    mock_tokenizer.eos_id = 2
-    mock_tokenizer._sep_id = 3
-    mock_tokenizer.pad_id = 0
-    mock_tokenizer.text_to_ids.return_value = [10, 11, 12]
-
-    # Test with non-existent window database
-    with pytest.raises(ValueError):
-        ShardedEdenDataset(
-            tokenizer=mock_tokenizer,
-            sequence_db_dir=sequence_db_dir,
-            window_db_path="/non/existent/windows.db",
-            seq_length=8192,
-            create_attention_mask=False,
-            stride=7992,
-            rc_aug=False,
-            use_control_tags=False,
-            split="train",
-        )
 
 
 def test_window_min_length_threshold(temp_dir, train_parquet):
@@ -594,33 +546,3 @@ def test_dataset_length_and_iteration(sequence_db_dir, window_dbs):
     # Clean up
     dataset.__del__()
 
-
-def test_dataset_with_different_seq_lengths(sequence_db_dir, window_dbs):
-    """Test dataset with different sequence lengths."""
-    # Mock tokenizer
-    mock_tokenizer = Mock()
-    mock_tokenizer.bos_id = 1
-    mock_tokenizer.eos_id = 2
-    mock_tokenizer._sep_id = 3
-    mock_tokenizer.pad_id = 0
-    mock_tokenizer.text_to_ids.return_value = [10, 11, 12]
-
-    # Test with different sequence lengths
-    for seq_length in [4096, 8192, 16384]:
-        dataset = ShardedEdenDataset(
-            tokenizer=mock_tokenizer,
-            sequence_db_dir=sequence_db_dir,
-            window_db_path=window_dbs["train"],
-            seq_length=seq_length,
-            create_attention_mask=False,
-            stride=seq_length - 200,  # Adjust stride
-            rc_aug=False,
-            use_control_tags=False,
-            split="train",
-        )
-
-        # Verify sequence length
-        assert dataset.seq_length == seq_length
-
-        # Clean up
-        dataset.__del__()
diff --git a/sub-packages/bionemo-evo2/tests/bionemo/evo2/run/test_train.py b/sub-packages/bionemo-evo2/tests/bionemo/evo2/run/test_train.py
@@ -96,6 +96,29 @@ def small_training_mamba_finetune_cmd(
     return cmd
 
 
+def small_training_llama_cmd(path, max_steps, val_check, devices: int = 1, additional_args: str = ""):
+    cmd = (
+        f"train_evo2 --no-fp32-residual-connection --mock-data --result-dir {path} --devices {devices} "
+        "--model-size 8B --num-layers 2 --limit-val-batches 1 "
+        "--no-activation-checkpointing --create-tensorboard-logger --create-tflops-callback "
+        f"--max-steps {max_steps} --warmup-steps 1 --val-check-interval {val_check} --limit-val-batches 1 "
+        f"--seq-length 8 --hidden-dropout 0.1 --attention-dropout 0.1 {additional_args}"
+    )
+    return cmd
+
+
+def small_training_llama_finetune_cmd(
+    path, max_steps, val_check, prev_ckpt, devices: int = 1, additional_args: str = ""
+):
+    cmd = (
+        f"train_evo2 --no-fp32-residual-connection --mock-data --result-dir {path} --devices {devices} "
+        "--model-size 8B --num-layers 2 --limit-val-batches 1 "
+        "--no-activation-checkpointing --create-tensorboard-logger --create-tflops-callback "
+        f"--max-steps {max_steps} --warmup-steps 1 --val-check-interval {val_check} --limit-val-batches 1 "
+        f"--seq-length 16 --hidden-dropout 0.1 --attention-dropout 0.1 {additional_args} --ckpt-dir {prev_ckpt}"
+    )
+    return cmd
+
 @pytest.mark.timeout(512)  # Optional: fail if the test takes too long.
 @pytest.mark.slow
 def test_train_evo2_finetune_runs(tmp_path):
@@ -243,6 +266,80 @@ def test_train_evo2_mamba_finetune_runs(tmp_path):
     assert len(matching_subfolders_ft) == 1, "Only one checkpoint subfolder should be found."
 
 
+
+@pytest.mark.timeout(512)  # Optional: fail if the test takes too long.
+@pytest.mark.slow
+def test_train_evo2_llama_finetune_runs(tmp_path):
+    """
+    This test runs the `train_evo2` command with mock data in a temporary directory using a Llama model,
+    then fine-tunes from the resulting checkpoint. The pytest-provided temporary directory is the working directory.
+    Each command is run in a subshell, and we assert that it returns an exit code of 0.
+    """
+    num_steps = 2
+    # Note: The command assumes that `train_evo2` is in your PATH.
+    command = small_training_llama_cmd(tmp_path / "pretrain", max_steps=num_steps, val_check=num_steps)
+    stdout_pretrain: str = run_command_in_subprocess(command=command, path=str(tmp_path))
+    assert "Restoring model weights from RestoreConfig(path='" not in stdout_pretrain
+
+    log_dir = tmp_path / "pretrain" / "evo2"
+    checkpoints_dir = log_dir / "checkpoints"
+    tensorboard_dir = log_dir / "dev"
+
+    # Check if logs dir exists
+    assert log_dir.exists(), "Logs folder should exist."
+    # Check if checkpoints dir exists
+    assert checkpoints_dir.exists(), "Checkpoints folder does not exist."
+
+    expected_checkpoint_suffix = f"{num_steps}.0-last"
+    # Check if any subfolder name contains the expected suffix
+    matching_subfolders = [
+        p for p in checkpoints_dir.iterdir() if p.is_dir() and (expected_checkpoint_suffix in p.name)
+    ]
+
+    assert matching_subfolders, (
+        f"No checkpoint subfolder ending with '{expected_checkpoint_suffix}' found in {checkpoints_dir}."
+    )
+
+    # Check if directory with tensorboard logs exists
+    assert tensorboard_dir.exists(), "TensorBoard logs folder does not exist."
+    # Recursively search for TensorBoard event files
+    event_files = list(tensorboard_dir.rglob("events.out.tfevents*"))
+    assert event_files, f"No TensorBoard event files found under {tensorboard_dir}"
+
+    assert len(matching_subfolders) == 1, "Only one checkpoint subfolder should be found."
+    command_finetune = small_training_llama_finetune_cmd(
+        tmp_path / "finetune", max_steps=num_steps, val_check=num_steps, prev_ckpt=matching_subfolders[0]
+    )
+    stdout_finetune: str = run_command_in_subprocess(command=command_finetune, path=str(tmp_path))
+    assert "Restoring model weights from RestoreConfig(path='" in stdout_finetune
+
+    log_dir_ft = tmp_path / "finetune" / "evo2"
+    checkpoints_dir_ft = log_dir_ft / "checkpoints"
+    tensorboard_dir_ft = log_dir_ft / "dev"
+
+    # Check if logs dir exists
+    assert log_dir_ft.exists(), "Logs folder should exist."
+    # Check if checkpoints dir exists
+    assert checkpoints_dir_ft.exists(), "Checkpoints folder does not exist."
+
+    expected_checkpoint_suffix = f"{num_steps}.0-last"
+    matching_subfolders_ft = [
+        p for p in checkpoints_dir_ft.iterdir() if p.is_dir() and (expected_checkpoint_suffix in p.name)
+    ]
+
+    assert matching_subfolders_ft, (
+        f"No checkpoint subfolder ending with '{expected_checkpoint_suffix}' found in {checkpoints_dir_ft}."
+    )
+
+    # Check if directory with tensorboard logs exists
+    assert tensorboard_dir_ft.exists(), "TensorBoard logs folder does not exist."
+    # Recursively search for TensorBoard event files
+    event_files = list(tensorboard_dir_ft.rglob("events.out.tfevents*"))
+    assert event_files, f"No TensorBoard event files found under {tensorboard_dir_ft}"
+
+    assert len(matching_subfolders_ft) == 1, "Only one checkpoint subfolder should be found."
+
+
 @pytest.mark.timeout(256)  # Optional: fail if the test takes too long.
 @pytest.mark.slow
 def test_train_evo2_stops(tmp_path):