Update amplify model, add loss tests (#1135)

pstjohn · web-flow · commit cd74c2bdce22 · 2025-09-15T19:57:33.000Z
Not sure we support changing configs like this, but this would fail
without handling the non-swiglu variant

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **Bug Fixes**
- Fixes feed‑forward sizing for non‑SwiGLU activations and prevents
initialization/runtime errors, improving stability across activation
options.
- Note: forward method signatures were simplified (removed variable
kwargs), which changes the public forward API.

- **Tests**
- Adds deterministic masking seed and four loss/regression tests
validating model behavior for pretrained and reinitialized variants.

- **Chores**
- Updates development container setup and install/run configuration for
the project.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: Peter St. John <pstjohn@nvidia.com>
diff --git a/models/amplify/tests/test_amplify_model.py b/models/amplify/tests/test_amplify_model.py
@@ -168,3 +168,52 @@ def test_convert_state_dict():
     te_state_dict_keys.remove("decoder.bias")
 
     assert len(te_state_dict_keys) == 0
+
+
+def test_hf_trained_model_loss(input_data):
+    """Pretrained `amp_hf.AMPLIFY` (AMPLIFY_120M) must give a loss near 2.4 on `input_data`."""
+    model = amp_hf.AMPLIFY.from_pretrained("chandar-lab/AMPLIFY_120M")
+    model.to("cuda", dtype=torch.bfloat16)
+    model.eval()
+    batch = {name: tensor.to("cuda") for name, tensor in input_data.items()}
+    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+        loss = model(**batch).loss.detach().cpu()
+    torch.testing.assert_close(loss, torch.tensor(2.4), atol=1e-1, rtol=1e-2)
+
+
+def test_te_trained_model_loss(input_data):
+    """AMPLIFY_120M converted with `convert_amplify_hf_to_te` must give a loss near 2.4."""
+    hf_model = amp_hf.AMPLIFY.from_pretrained("chandar-lab/AMPLIFY_120M")
+    model = convert_amplify_hf_to_te(hf_model)
+    model.to("cuda", dtype=torch.bfloat16)
+    model.eval()
+    batch = {name: tensor.to("cuda") for name, tensor in input_data.items()}
+    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+        loss = model(**batch).loss.detach().cpu()
+    torch.testing.assert_close(loss, torch.tensor(2.4), atol=1e-1, rtol=1e-2)
+
+
+def test_hf_reinitialized_model_loss(input_data):
+    """An `amp_hf.AMPLIFY` built from the AMPLIFY_120M config alone must give a loss below 3.5."""
+    config = amp_hf.AMPLIFYConfig.from_pretrained("chandar-lab/AMPLIFY_120M")
+    model = amp_hf.AMPLIFY(config)
+    model.to("cuda", dtype=torch.bfloat16)
+    model.eval()
+    batch = {name: tensor.to("cuda") for name, tensor in input_data.items()}
+    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+        loss = model(**batch).loss.detach().cpu()
+
+    assert loss < 3.5, f"Loss is {loss}, expected less than 3.5"
+
+
+def test_te_reinitialized_model_loss(input_data):
+    """An `amp_te.AMPLIFYForMaskedLM` built from the config alone must give a loss below 3.5."""
+    config = amp_te.AMPLIFYConfig.from_pretrained("chandar-lab/AMPLIFY_120M")
+    model = amp_te.AMPLIFYForMaskedLM(config)
+    model.to("cuda", dtype=torch.bfloat16)
+    model.eval()
+    batch = {name: tensor.to("cuda") for name, tensor in input_data.items()}
+    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+        loss = model(**batch).loss.detach().cpu()
+
+    assert loss < 3.5, f"Loss is {loss}, expected less than 3.5"