add llama predict test

yzhang123 · yzhang123 · commit 3baf7307b0a4 · 2025-09-10T09:10:19.000-07:00
Signed-off-by: Yang Zhang <yangzhang@nvidia.com>
diff --git a/sub-packages/bionemo-evo2/tests/bionemo/evo2/run/test_predict.py b/sub-packages/bionemo-evo2/tests/bionemo/evo2/run/test_predict.py
@@ -22,14 +22,27 @@
 import subprocess
 import sys
 
+import pytest
 import torch
 from lightning.fabric.plugins.environments.lightning import find_free_network_port
 
 from bionemo.core.data.load import load
 from bionemo.noodles.nvfaidx import NvFaidx
 from bionemo.testing.data.fasta import ALU_SEQUENCE, create_fasta_file
+from bionemo.testing.subprocess_utils import run_command_in_subprocess
 
 
+
+def small_training_llama_cmd(path, max_steps, val_check, devices: int = 1, additional_args: str = "") -> str:
+    """Build a `train_evo2` command line for a small mock-data training run.
+
+    The command fixes `--model-size 8B` with `--num-layers 2`, `--seq-length 8`, one warmup step and a
+    single validation batch; everything else is taken from the arguments.
+
+    Args:
+        path: Directory passed to `--result-dir`.
+        max_steps: Value passed to `--max-steps`.
+        val_check: Value passed to `--val-check-interval`.
+        devices: Value passed to `--devices`.
+        additional_args: Extra command-line text appended verbatim at the end of the command.
+
+    Returns:
+        The complete command as a single string.
+    """
+    # Each flag appears exactly once; `--limit-val-batches 1` is not repeated.
+    return (
+        f"train_evo2 --no-fp32-residual-connection --mock-data --result-dir {path} --devices {devices} "
+        "--model-size 8B --num-layers 2 --limit-val-batches 1 "
+        "--no-activation-checkpointing --create-tensorboard-logger --create-tflops-callback "
+        f"--max-steps {max_steps} --warmup-steps 1 --val-check-interval {val_check} "
+        f"--seq-length 8 --hidden-dropout 0.1 --attention-dropout 0.1 {additional_args}"
+    )
+
 def test_predict_evo2_runs(
     tmp_path, num_sequences: int = 5, target_sequence_lengths: list[int] = [3149, 3140, 1024, 3149, 3149]
 ):
@@ -104,3 +117,90 @@ def test_predict_evo2_runs(
         idx = seq_idx_map[seq_name]  # look up the out of order prediction index for this sequence.
         assert preds["pad_mask"][idx].sum() == expected_len
         assert preds["token_logits"][idx].shape == (max(target_sequence_lengths), 512)
+
+
+@pytest.mark.timeout(512)  # Abort the test if it runs past the timeout.
+def test_predict_evo2_llama_runs(
+    tmp_path, num_sequences: int = 5, target_sequence_lengths: list[int] = [3149, 3140, 1024, 3149, 3149]
+):
+    """End-to-end check that `predict_evo2` works on a freshly trained small Llama checkpoint.
+
+    The test proceeds in four stages, all inside pytest's temporary directory:
+
+    1. Train for two steps with the command from `small_training_llama_cmd` to obtain a checkpoint.
+    2. Locate the single "last" checkpoint folder written by that run.
+    3. Write a FASTA file and run `predict_evo2` on it in a subshell, requiring exit code 0.
+    4. Load the prediction file and the sequence-index map and verify their contents.
+    """
+    # --- Stage 1: pretraining run that produces the checkpoint -------------------------------------
+    train_steps = 2
+    pretrain_dir = tmp_path / "pretrain"
+    pretrain_stdout: str = run_command_in_subprocess(
+        command=small_training_llama_cmd(pretrain_dir, max_steps=train_steps, val_check=train_steps),
+        path=str(tmp_path),
+    )
+    # The training output must not report restoring weights from an existing checkpoint.
+    assert "Restoring model weights from RestoreConfig(path='" not in pretrain_stdout
+
+    # --- Stage 2: find the checkpoint folder ------------------------------------------------------
+    checkpoints_dir = pretrain_dir / "evo2" / "checkpoints"
+    assert checkpoints_dir.exists(), "Checkpoints folder does not exist."
+
+    last_tag = f"{train_steps}.0-last"
+    candidates = []
+    for entry in checkpoints_dir.iterdir():
+        if entry.is_dir() and last_tag in entry.name:
+            candidates.append(entry)
+    assert candidates, f"No checkpoint subfolder ending with '{last_tag}' found in {checkpoints_dir}."
+    assert len(candidates) == 1, "Only one checkpoint subfolder should be found."
+    checkpoint_path = candidates[0]
+
+    # --- Stage 3: create the input FASTA and run prediction ---------------------------------------
+    fasta_path = tmp_path / "test_llama.fasta"
+    create_fasta_file(
+        fasta_path, num_sequences, sequence_lengths=target_sequence_lengths, repeating_dna_pattern=ALU_SEQUENCE
+    )
+
+    # `predict_evo2` is expected to be available on PATH.
+    output_dir = tmp_path / "test_llama_output"
+    command = " ".join(
+        [
+            "predict_evo2",
+            f"--fasta {fasta_path}",
+            f"--ckpt-dir {checkpoint_path}",
+            f"--output-dir {output_dir}",
+            "--model-type llama",
+            "--model-size 8B",
+            "--tensor-parallel-size 1",
+            "--pipeline-model-parallel-size 1",
+            "--context-parallel-size 1",
+            "--num-layers 2",
+        ]
+    )
+
+    result = subprocess.run(
+        command,
+        shell=True,  # The command is one string, so it is handed to the shell for parsing.
+        cwd=tmp_path,  # Use the temporary directory as the working directory.
+        capture_output=True,  # Keep stdout/stderr so they can be reported on failure.
+        text=True,  # Decode the captured output as text.
+    )
+
+    # Surface the subprocess output before failing, to make the failure debuggable.
+    if result.returncode != 0:
+        for label, captured in (("STDOUT", result.stdout), ("STDERR", result.stderr)):
+            sys.stderr.write(f"{label}:\n{captured}\n")
+    assert result.returncode == 0, "predict_evo2 command with Llama model failed."
+
+    # --- Stage 4: validate the written predictions ------------------------------------------------
+    pred_files = glob.glob(str(output_dir / "predictions__rank_*.pt"))
+    assert len(pred_files) == 1, f"Expected 1 prediction file (for this test), got {len(pred_files)}"
+
+    # Mapping from sequence name to its row index in the prediction tensors.
+    seq_idx_map = json.loads((output_dir / "seq_idx_map.json").read_text())
+
+    preds = torch.load(pred_files[0])
+    assert isinstance(preds, dict)
+    required_keys = ("token_logits", "pad_mask", "seq_idx")
+    for key in required_keys:
+        assert key in preds
+    assert all(len(preds[key]) == num_sequences for key in required_keys)
+    assert len(seq_idx_map) == num_sequences
+
+    fasta = NvFaidx(fasta_path)
+    # Expected lengths are matched to sequence names by their position in sorted name order.
+    for position, seq_name in enumerate(sorted(fasta.keys())):
+        expected_len = target_sequence_lengths[position]
+        row = seq_idx_map[seq_name]  # Predictions may be stored out of order; look up this sequence's row.
+        assert preds["pad_mask"][row].sum() == expected_len
+        # Logits span the longest target length; 512 is presumably the vocabulary size — confirm.
+        assert preds["token_logits"][row].shape == (max(target_sequence_lengths), 512)
+
+