Skip to content

Commit 0ea4461

Browse files
authored
add tests for random model weights on dna (#1349)
Signed-off-by: Peter St. John <pstjohn@nvidia.com>
1 parent c86162b commit 0ea4461

3 files changed

Lines changed: 74 additions & 13 deletions

File tree

bionemo-recipes/models/llama3/tests/conftest.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,18 +17,13 @@
1717
from pathlib import Path
1818

1919
import pytest
20-
from transformers import AutoModelForCausalLM
2120

2221

2322
sys.path.append(Path(__file__).parent.parent.as_posix())
2423
sys.path.append(Path(__file__).parent.as_posix())
2524

2625

27-
@pytest.fixture
def te_model_checkpoint(tmp_path):
    """Save a converted Llama checkpoint under ``tmp_path`` and return its directory.

    The upstream ``meta-llama/Llama-3.1-8B-Instruct`` model is loaded, passed through
    ``convert_llama_hf_to_te``, and written with ``save_pretrained``.

    Args:
        tmp_path: Temporary directory the checkpoint folder is created in.

    Returns:
        ``tmp_path / "te_model_checkpoint"``, the directory the converted model was saved to.
    """
    # Imported inside the fixture, as in the original, rather than at module import time.
    from convert import convert_llama_hf_to_te

    checkpoint_dir = tmp_path / "te_model_checkpoint"
    hf_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
    convert_llama_hf_to_te(hf_model).save_pretrained(checkpoint_dir)
    return checkpoint_dir
26+
@pytest.fixture(scope="session")
def recipe_path() -> Path:
    """Provide the recipe root, i.e. the parent of the directory that contains this file."""
    tests_dir = Path(__file__).parent
    return tests_dir.parent

bionemo-recipes/models/llama3/tests/test_modeling_llama_te.py

Lines changed: 68 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import torch
2121
from transformer_engine.pytorch.attention import InferenceParams
2222
from transformers import (
23+
AutoConfig,
2324
AutoModelForCausalLM,
2425
AutoTokenizer,
2526
DataCollatorWithFlattening,
@@ -111,11 +112,12 @@ def test_llama_model_forward_pass_thd_inputs(input_text):
111112
@pytest.mark.parametrize(
112113
"upstream_model_name", ["meta-llama/Llama-3.2-1B-Instruct", "meta-llama/Llama-3.1-8B-Instruct"]
113114
)
114-
def test_llama_model_golden_values(input_text, upstream_model_name: str):
115+
@pytest.mark.parametrize("attn_input_format", ["thd", "bshd"])
116+
def test_llama_model_golden_values(input_text, upstream_model_name: str, attn_input_format: str):
115117
tokenizer = AutoTokenizer.from_pretrained(upstream_model_name)
116118
model_hf = AutoModelForCausalLM.from_pretrained(upstream_model_name, dtype=torch.bfloat16)
117119

118-
model_te = convert_llama_hf_to_te(model_hf)
120+
model_te = convert_llama_hf_to_te(model_hf, attn_input_format=attn_input_format)
119121

120122
tokenizer.pad_token = tokenizer.eos_token
121123
# TODO: figure out padding_side="left" with TE, make this several tests with different input types.
@@ -370,3 +372,67 @@ def test_te_llama_model_generate_with_cache_bshd_beam_search():
370372
generated_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
371373
assert "http://www.apache.org/licenses/LICENSE-2.0" in generated_text[0]
372374
assert "et dolore magna aliqua. Ut enim ad minim " in generated_text[1]
375+
376+
377+
@pytest.mark.parametrize("attn_input_format", ["thd", "bshd"])
def test_loss_with_random_weights_for_input_gene_sequence(recipe_path, attn_input_format: str):
    """Check that randomly initialized HF and TE Llama models give a similar loss on a DNA sequence.

    Each model is built from the same upstream config with its own random weights, so the losses
    are only expected to agree within the loose tolerance asserted at the end.

    Args:
        recipe_path: Recipe root directory; the nucleotide tokenizer is loaded from beneath it.
        attn_input_format: Attention input format forwarded to ``NVLlamaConfig`` ("thd" or "bshd").
    """
    # Local import so the cleanup below does not depend on the module's import block.
    import gc

    tokenizer = AutoTokenizer.from_pretrained(recipe_path / "nucleotide_fast_tokenizer")
    input_text = "GCACGGTCTGCACCACCGTCTGCCCGGTCAGCGGCGTTAACCCGCGCTATCCCGGTCCGAAACAGGCCGGGCCGGACGGCGAGCGCCTTCGTCTGAAGGA"

    inputs = tokenizer(input_text, return_tensors="pt")
    inputs = {k: v.to("cuda") for k, v in inputs.items()}
    labels = inputs["input_ids"].clone()

    # This unsloth config is identical to the meta-llama/Llama-3.2-1B config, but is available in CI without having to
    # sign the EULA. Since we don't need any weights here, we can just use this model tag instead.
    config = AutoConfig.from_pretrained("unsloth/Llama-3.2-1B-Instruct")

    # Seed the RNG so the random initialization, and therefore the loss, is reproducible across runs.
    torch.manual_seed(0)
    model_hf = AutoModelForCausalLM.from_config(config)

    model_hf.to("cuda")
    with torch.no_grad():
        outputs_hf = model_hf(**inputs, labels=labels, output_hidden_states=True)
    loss_hf = outputs_hf.loss

    # Drop the HF model and its outputs (logits, hidden states) before building the TE model; only the loss is kept.
    del model_hf, outputs_hf
    gc.collect()
    torch.cuda.empty_cache()

    config_te = NVLlamaConfig.from_pretrained("unsloth/Llama-3.2-1B-Instruct", attn_input_format=attn_input_format)

    # Re-seed so the TE initialization is reproducible as well.
    torch.manual_seed(0)
    model_te = NVLlamaForCausalLM(config_te)

    model_te.to("cuda")
    with torch.no_grad():
        outputs_te = model_te(**inputs, labels=labels, output_hidden_states=True)
    loss_te = outputs_te.loss

    # Loose tolerances: the two models do not share weights, so only a rough match is expected.
    torch.testing.assert_close(loss_te, loss_hf, atol=0.5, rtol=0.05)
409+
410+
411+
@pytest.mark.parametrize("attn_input_format", ["thd", "bshd"])
def test_loss_with_random_weights_similar_grad_norms(recipe_path, attn_input_format: str):
    """Check that an HF model and its TE conversion produce matching gradient norms.

    A randomly initialized HF model is converted with ``convert_llama_hf_to_te``; both models then
    run one forward/backward pass on the same DNA sequence and their total gradient norms are
    compared.

    Args:
        recipe_path: Recipe root directory; the nucleotide tokenizer is loaded from beneath it.
        attn_input_format: Attention input format forwarded to ``convert_llama_hf_to_te`` ("thd" or "bshd").
    """
    # Local import so the cleanup below does not depend on the module's import block.
    import gc

    tokenizer = AutoTokenizer.from_pretrained(recipe_path / "nucleotide_fast_tokenizer")
    input_text = "GCACGGTCTGCACCACCGTCTGCCCGGTCAGCGGCGTTAACCCGCGCTATCCCGGTCCGAAACAGGCCGGGCCGGACGGCGAGCGCCTTCGTCTGAAGGA"

    inputs = tokenizer(input_text, return_tensors="pt")
    inputs = {k: v.to("cuda") for k, v in inputs.items()}
    labels = inputs["input_ids"].clone()

    # This unsloth config is used in place of the gated meta-llama one; only the config is needed, not the weights.
    config = AutoConfig.from_pretrained("unsloth/Llama-3.2-1B-Instruct")

    # Seed the RNG so the random initialization, and therefore the gradient norms, is reproducible across runs.
    torch.manual_seed(0)
    model_hf = AutoModelForCausalLM.from_config(config)
    model_te = convert_llama_hf_to_te(model_hf, attn_input_format=attn_input_format)

    model_hf.to("cuda")
    model_hf.train()
    outputs_hf = model_hf(**inputs, labels=labels, output_hidden_states=True)
    loss_hf = outputs_hf.loss
    loss_hf.backward()
    # With an infinite max_norm the call is used for its return value, the total gradient norm.
    grad_norm_hf = torch.nn.utils.clip_grad_norm_(model_hf.parameters(), max_norm=float("inf"))

    # Release the HF model, its gradients and activations before running the TE model on the GPU.
    del model_hf, outputs_hf, loss_hf
    gc.collect()
    torch.cuda.empty_cache()

    model_te.to("cuda")
    model_te.train()
    outputs_te = model_te(**inputs, labels=labels, output_hidden_states=True)
    loss_te = outputs_te.loss
    loss_te.backward()
    grad_norm_te = torch.nn.utils.clip_grad_norm_(model_te.parameters(), max_norm=float("inf"))

    torch.testing.assert_close(grad_norm_te, grad_norm_hf)

bionemo-recipes/models/llama3/tests/test_tokenizer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,9 @@
2828

2929

3030
@pytest.fixture(scope="session")
def tokenizer(recipe_path: Path):
    """Load the ASCII nucleotide tokenizer stored under the recipe root."""
    tokenizer_dir = recipe_path / "nucleotide_fast_tokenizer"
    # from_pretrained is handed a plain string path, as in the original fixture.
    return AutoTokenizer.from_pretrained(str(tokenizer_dir))
3535

3636

0 commit comments

Comments
 (0)