update amplify model with some loss tests

pstjohn · pstjohn · commit f40e4b05fb0f · 2025-09-09T16:09:15.000-07:00
Signed-off-by: Peter St. John <pstjohn@nvidia.com>
diff --git a/models/amplify/.devcontainer/devcontainer.json b/models/amplify/.devcontainer/devcontainer.json
@@ -2,14 +2,11 @@
 // README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
 {
     "name": "Existing Dockerfile",
-    "build": {
-        "context": "..",
-        "dockerfile": "Dockerfile.dev"
-    },
+    "image": "svcbionemo023/bionemo-framework:amplify-model-devcontainer-082025",
     "mounts": [
         "source=${localEnv:HOME}/.cache,target=/home/ubuntu/.cache,type=bind,consistency=cached"
     ],
-    "postCreateCommand": "pip install -e .[convert,test]",
+    "postCreateCommand": "PIP_CONSTRAINT= pip install -e .",
     "remoteUser": "ubuntu",
     "runArgs": [
         "--gpus=all",
diff --git a/models/amplify/tests/conftest.py b/models/amplify/tests/conftest.py
@@ -68,6 +68,7 @@ def input_data(tokenizer):
         tokenizer=tokenizer,
         mlm_probability=0.15,
         pad_to_multiple_of=1024,
+        seed=42,
     )
 
     def tokenize_function(examples):
diff --git a/models/amplify/tests/test_amplify_model.py b/models/amplify/tests/test_amplify_model.py
@@ -23,11 +23,22 @@
 from conftest import requires_fp8
 from transformer_engine.common.recipe import DelayedScaling, Format
 
-import amplify.amplify_hf as amp_hf
 import amplify.amplify_te as amp_te
 from amplify.state_dict_convert import convert_amplify_hf_to_te
 
 
+try:
+    import xformers
+except ImportError:
+    xformers = None
+
+if xformers is not None:
+    import amplify.amplify_hf as amp_hf
+else:
+    amp_hf = None
+
+
+@pytest.mark.skipif(amp_hf is None, reason="xformers is not installed")
 def test_amplify_hf_model(config, input_data):
     model = amp_hf.AMPLIFY(config)
     model.to("cuda")
@@ -67,6 +78,7 @@ def test_te_model_has_all_te_layers(config):
         assert not isinstance(module, nn.RMSNorm), f"Vanilla RMSNorm layer found in {name}"
 
 
+@pytest.mark.skipif(amp_hf is None, reason="xformers is not installed")
 def test_models_have_identical_outputs(input_data):
     model_hf = amp_hf.AMPLIFY.from_pretrained("chandar-lab/AMPLIFY_120M")
     model_te = convert_amplify_hf_to_te(model_hf)
@@ -84,6 +96,7 @@ def test_models_have_identical_outputs(input_data):
     torch.testing.assert_close(outputs_hf.loss, outputs_te.loss, atol=1e-2, rtol=1e-3)
 
 
+@pytest.mark.skipif(amp_hf is None, reason="xformers is not installed")
 def test_converted_model_roundtrip(input_data, tmp_path):
     model_hf = amp_hf.AMPLIFY.from_pretrained("chandar-lab/AMPLIFY_120M")
     model_te = convert_amplify_hf_to_te(model_hf)
@@ -107,6 +120,7 @@ def test_converted_model_roundtrip(input_data, tmp_path):
     torch.testing.assert_close(outputs_hf.loss, outputs_te.loss, atol=1e-2, rtol=1e-3)
 
 
+@pytest.mark.skipif(amp_hf is None, reason="xformers is not installed")
 def test_convert_state_dict():
     model_hf = amp_hf.AMPLIFY.from_pretrained("chandar-lab/AMPLIFY_120M")
     model_te = convert_amplify_hf_to_te(model_hf)
@@ -168,3 +182,59 @@ def test_convert_state_dict():
     te_state_dict_keys.remove("decoder.bias")
 
     assert len(te_state_dict_keys) == 0
+
+
+@pytest.mark.skipif(amp_hf is None, reason="xformers is not installed")
+def test_hf_trained_model_loss(input_data):
+    """Pretrained HF AMPLIFY_120M should give a loss close to 2.4 on the fixture batch."""
+    model = amp_hf.AMPLIFY.from_pretrained("chandar-lab/AMPLIFY_120M")
+    model.to("cuda", dtype=torch.bfloat16)
+    input_data = {k: v.to("cuda") for k, v in input_data.items()}
+    model.eval()
+    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+        output = model(**input_data)
+
+    torch.testing.assert_close(output.loss.detach().cpu(), torch.tensor(2.4), atol=1e-1, rtol=1e-2)
+
+
+@pytest.mark.skipif(amp_hf is None, reason="xformers is not installed")
+def test_te_trained_model_loss(input_data):
+    """HF checkpoint converted to TE should give a loss close to 2.4 on the fixture batch."""
+    model_hf = amp_hf.AMPLIFY.from_pretrained("chandar-lab/AMPLIFY_120M")
+    model = convert_amplify_hf_to_te(model_hf)
+    model.to("cuda", dtype=torch.bfloat16)
+    input_data = {k: v.to("cuda") for k, v in input_data.items()}
+    model.eval()
+    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+        output = model(**input_data)
+
+    torch.testing.assert_close(output.loss.detach().cpu(), torch.tensor(2.4), atol=1e-1, rtol=1e-2)
+
+
+@pytest.mark.skipif(amp_hf is None, reason="xformers is not installed")
+def test_hf_reinitialized_model_loss(input_data):
+    """HF model built from config only (no pretrained weights loaded) should have loss below 3.5."""
+    config = amp_hf.AMPLIFYConfig.from_pretrained("chandar-lab/AMPLIFY_120M")
+    model = amp_hf.AMPLIFY(config)
+    model.to("cuda", dtype=torch.bfloat16)
+    input_data = {k: v.to("cuda") for k, v in input_data.items()}
+    model.eval()
+    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+        output = model(**input_data)
+
+    loss = output.loss.detach().cpu()
+    assert loss < 3.5, f"Loss is {loss}, expected less than 3.5"
+
+
+def test_te_reinitialized_model_loss(input_data):
+    """TE model built from config only (no pretrained weights loaded) should have loss below 3.5."""
+    config = amp_te.AMPLIFYConfig.from_pretrained("chandar-lab/AMPLIFY_120M")
+    model = amp_te.AMPLIFYForMaskedLM(config)
+    model.to("cuda", dtype=torch.bfloat16)
+    input_data = {k: v.to("cuda") for k, v in input_data.items()}
+    model.eval()
+    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+        output = model(**input_data)
+
+    loss = output.loss.detach().cpu()
+    assert loss < 3.5, f"Loss is {loss}, expected less than 3.5"
diff --git a/recipes/esm2_accelerate/train.py b/recipes/esm2_accelerate/train.py
@@ -46,13 +46,11 @@ def main(args: DictConfig):
     )
 
     config = AutoConfig.from_pretrained(args.model_tag, trust_remote_code=True)
-    config.max_seq_length = args.max_seq_length
-    config.micro_batch_size = args.trainer.per_device_train_batch_size
     model = AutoModelForMaskedLM.from_config(config, trust_remote_code=True, torch_dtype=torch.bfloat16)
 
     train_dataset, eval_dataset, data_collator = create_datasets_and_collator(
         tokenizer_name=args.model_tag,
-        max_length=config.max_seq_length,
+        max_length=args.max_seq_length,
     )
 
     training_args = TrainingArguments(**args.trainer)

Original file line number	Diff line number	Diff line change
`@@ -68,6 +68,7 @@ def input_data(tokenizer):`
`68`	`68`	`tokenizer=tokenizer,`
`69`	`69`	`mlm_probability=0.15,`
`70`	`70`	`pad_to_multiple_of=1024,`
	`71`	`+ seed=42,`
`71`	`72`	`)`
`72`	`73`
`73`	`74`	`def tokenize_function(examples):`
Original file line number	Diff line number	Diff line change
`@@ -46,13 +46,11 @@ def main(args: DictConfig):`
`46`	`46`	`)`
`47`	`47`
`48`	`48`	`config = AutoConfig.from_pretrained(args.model_tag, trust_remote_code=True)`
`49`		`- config.max_seq_length = args.max_seq_length`
`50`		`- config.micro_batch_size = args.trainer.per_device_train_batch_size`
`51`	`49`	`model = AutoModelForMaskedLM.from_config(config, trust_remote_code=True, torch_dtype=torch.bfloat16)`
`52`	`50`
`53`	`51`	`train_dataset, eval_dataset, data_collator = create_datasets_and_collator(`
`54`	`52`	`tokenizer_name=args.model_tag,`
`55`		`- max_length=config.max_seq_length,`
	`53`	`+ max_length=args.max_seq_length,`
`56`	`54`	`)`
`57`	`55`
`58`	`56`	`training_args = TrainingArguments(**args.trainer)`