add tests for random model weights on dna (#1349)

pstjohn · web-flow · commit 0ea44614f10b · 2025-11-26T16:40:47.000Z
Signed-off-by: Peter St. John &lt;pstjohn@nvidia.com&gt;
diff --git a/bionemo-recipes/models/llama3/tests/conftest.py b/bionemo-recipes/models/llama3/tests/conftest.py
@@ -17,18 +17,13 @@
 from pathlib import Path
 
 import pytest
-from transformers import AutoModelForCausalLM
 
 
 sys.path.append(Path(__file__).parent.parent.as_posix())
 sys.path.append(Path(__file__).parent.as_posix())
 
 
-@pytest.fixture
-def te_model_checkpoint(tmp_path):
-    from convert import convert_llama_hf_to_te
-
-    model_hf = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
-    model_te = convert_llama_hf_to_te(model_hf)
-    model_te.save_pretrained(tmp_path / "te_model_checkpoint")
-    return tmp_path / "te_model_checkpoint"
+@pytest.fixture(scope="session")
+def recipe_path() -> Path:
+    """Return the root directory of the recipe."""
+    return Path(__file__).parent.parent
diff --git a/bionemo-recipes/models/llama3/tests/test_modeling_llama_te.py b/bionemo-recipes/models/llama3/tests/test_modeling_llama_te.py
@@ -20,6 +20,7 @@
 import torch
 from transformer_engine.pytorch.attention import InferenceParams
 from transformers import (
+    AutoConfig,
     AutoModelForCausalLM,
     AutoTokenizer,
     DataCollatorWithFlattening,
@@ -111,11 +112,12 @@ def test_llama_model_forward_pass_thd_inputs(input_text):
 @pytest.mark.parametrize(
     "upstream_model_name", ["meta-llama/Llama-3.2-1B-Instruct", "meta-llama/Llama-3.1-8B-Instruct"]
 )
-def test_llama_model_golden_values(input_text, upstream_model_name: str):
+@pytest.mark.parametrize("attn_input_format", ["thd", "bshd"])
+def test_llama_model_golden_values(input_text, upstream_model_name: str, attn_input_format: str):
     tokenizer = AutoTokenizer.from_pretrained(upstream_model_name)
     model_hf = AutoModelForCausalLM.from_pretrained(upstream_model_name, dtype=torch.bfloat16)
 
-    model_te = convert_llama_hf_to_te(model_hf)
+    model_te = convert_llama_hf_to_te(model_hf, attn_input_format=attn_input_format)
 
     tokenizer.pad_token = tokenizer.eos_token
     # TODO: figure out padding_side="left" with TE, make this several tests with different input types.
@@ -370,3 +372,67 @@ def test_te_llama_model_generate_with_cache_bshd_beam_search():
     generated_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
     assert "http://www.apache.org/licenses/LICENSE-2.0" in generated_text[0]
     assert "et dolore magna aliqua. Ut enim ad minim " in generated_text[1]
+
+
+@pytest.mark.parametrize("attn_input_format", ["thd", "bshd"])
+def test_loss_with_random_weights_for_input_gene_sequence(recipe_path, attn_input_format: str):
+    tokenizer = AutoTokenizer.from_pretrained(recipe_path / "nucleotide_fast_tokenizer")
+    input_text = "GCACGGTCTGCACCACCGTCTGCCCGGTCAGCGGCGTTAACCCGCGCTATCCCGGTCCGAAACAGGCCGGGCCGGACGGCGAGCGCCTTCGTCTGAAGGA"
+
+    inputs = tokenizer(input_text, return_tensors="pt")
+    inputs = {k: v.to("cuda") for k, v in inputs.items()}
+    labels = inputs["input_ids"].clone()
+
+    # This unsloth config is identical to the meta-llama/Llama-3.2-1B config, but is available in CI without having to
+    # sign the EULA. Since we don't need any weights here, we can just use this model tag instead.
+    config = AutoConfig.from_pretrained("unsloth/Llama-3.2-1B-Instruct")
+    model_hf = AutoModelForCausalLM.from_config(config)
+
+    model_hf.to("cuda")
+    with torch.no_grad():
+        outputs_hf = model_hf(**inputs, labels=labels, output_hidden_states=True)
+    loss_hf = outputs_hf.loss
+
+    del model_hf
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    config_te = NVLlamaConfig.from_pretrained("unsloth/Llama-3.2-1B-Instruct", attn_input_format=attn_input_format)
+    model_te = NVLlamaForCausalLM(config_te)
+
+    model_te.to("cuda")
+    with torch.no_grad():
+        outputs_te = model_te(**inputs, labels=labels, output_hidden_states=True)
+    loss_te = outputs_te.loss
+
+    torch.testing.assert_close(loss_te, loss_hf, atol=0.5, rtol=0.05)
+
+
+@pytest.mark.parametrize("attn_input_format", ["thd", "bshd"])
+def test_loss_with_random_weights_similar_grad_norms(recipe_path, attn_input_format: str):
+    tokenizer = AutoTokenizer.from_pretrained(recipe_path / "nucleotide_fast_tokenizer")
+    input_text = "GCACGGTCTGCACCACCGTCTGCCCGGTCAGCGGCGTTAACCCGCGCTATCCCGGTCCGAAACAGGCCGGGCCGGACGGCGAGCGCCTTCGTCTGAAGGA"
+
+    inputs = tokenizer(input_text, return_tensors="pt")
+    inputs = {k: v.to("cuda") for k, v in inputs.items()}
+    labels = inputs["input_ids"].clone()
+
+    config = AutoConfig.from_pretrained("unsloth/Llama-3.2-1B-Instruct")
+    model_hf = AutoModelForCausalLM.from_config(config)
+    model_te = convert_llama_hf_to_te(model_hf, attn_input_format=attn_input_format)
+
+    model_hf.to("cuda")
+    model_hf.train()
+    outputs_hf = model_hf(**inputs, labels=labels, output_hidden_states=True)
+    loss_hf = outputs_hf.loss
+    loss_hf.backward()
+    grad_norm_hf = torch.nn.utils.clip_grad_norm_(model_hf.parameters(), max_norm=float("inf"))
+
+    model_te.to("cuda")
+    model_te.train()
+    outputs_te = model_te(**inputs, labels=labels, output_hidden_states=True)
+    loss_te = outputs_te.loss
+    loss_te.backward()
+    grad_norm_te = torch.nn.utils.clip_grad_norm_(model_te.parameters(), max_norm=float("inf"))
+
+    torch.testing.assert_close(grad_norm_te, grad_norm_hf)
diff --git a/bionemo-recipes/models/llama3/tests/test_tokenizer.py b/bionemo-recipes/models/llama3/tests/test_tokenizer.py
@@ -28,9 +28,9 @@
 
 
 @pytest.fixture(scope="session")
-def tokenizer():
+def tokenizer(recipe_path: Path):
     """Load the ASCII nucleotide tokenizer."""
-    tokenizer_path = Path(__file__).parent.parent / "nucleotide_fast_tokenizer"
+    tokenizer_path = recipe_path / "nucleotide_fast_tokenizer"
     return AutoTokenizer.from_pretrained(str(tokenizer_path))