Remove kwargs in amplify forward pass (#1141)

pstjohn · web-flow · commit 3e56b8a03d77 · 2025-09-10T09:22:49.000-06:00
Having `**kwargs` in `model.forward` causes an odd complication with
accelerate: the loss is summed rather than averaged across parallel
processes.

This also includes some other fixes to the AMPLIFY model, since we'll need
to push a new version to the HF Hub.

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

* New Features
  * Data collator now supports a seed option for deterministic masking.
* Refactor
  * Standardized dtype handling to a single dtype setting across
    embeddings, norms, and layers.
  * Ensured intermediate size is always defined when activation is not
    swiglu.
  * Simplified model forward APIs by removing unused keyword passthroughs.
* Tests
  * Added loss verification tests for pretrained and reinitialized models
    across implementations.
* Chores
  * Updated development container to use a prebuilt image, increased
    shared memory, and simplified dependency installation.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: Peter St. John <pstjohn@nvidia.com>
diff --git a/models/amplify/.devcontainer/devcontainer.json b/models/amplify/.devcontainer/devcontainer.json
@@ -2,14 +2,11 @@
 // README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
 {
     "name": "Existing Dockerfile",
-    "build": {
-        "context": "..",
-        "dockerfile": "Dockerfile.dev"
-    },
+    "image": "svcbionemo023/bionemo-framework:amplify-model-devcontainer-082025",
     "mounts": [
         "source=${localEnv:HOME}/.cache,target=/home/ubuntu/.cache,type=bind,consistency=cached"
     ],
-    "postCreateCommand": "pip install -e .[convert,test]",
+    "postCreateCommand": "PIP_CONSTRAINT= pip install -e .",
     "remoteUser": "ubuntu",
     "runArgs": [
         "--gpus=all",
diff --git a/models/amplify/Dockerfile b/models/amplify/Dockerfile
@@ -4,8 +4,7 @@ FROM nvcr.io/nvidia/pytorch:25.01-py3
 RUN MAX_JOBS=4 pip --disable-pip-version-check --no-cache-dir install -v git+https://github.com/facebookresearch/xformers.git@v0.0.29.post1#egg=xformers
 RUN PIP_CONSTRAINT= NVTE_FRAMEWORK=pytorch MAX_JOBS=4 pip --disable-pip-version-check --no-cache-dir install -v git+https://github.com/nvidia/TransformerEngine.git@v2.4
 
-COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
 WORKDIR /workspace/bionemo
 COPY . .
-RUN --mount=type=cache,target=/root/.cache/uv \
+RUN --mount=type=cache,target=/root/.cache/pip \
     PIP_CONSTRAINT= pip install -e .
diff --git a/models/amplify/export.py b/models/amplify/export.py
@@ -36,7 +36,7 @@
     # Smoke test that the model can be loaded.
     model_te = AutoModelForMaskedLM.from_pretrained(
         f"./checkpoint_export/{tag}",
-        torch_dtype=torch.bfloat16,
+        dtype=torch.bfloat16,
         trust_remote_code=True,
     )
     del model_te
diff --git a/models/amplify/src/amplify/amplify_te.py b/models/amplify/src/amplify/amplify_te.py
@@ -147,17 +147,15 @@ def __init__(self, config: AMPLIFYConfig, **kwargs):
             config.padded_vocab_size,
             config.hidden_size,
             padding_idx=config.pad_token_id,
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
         )
 
         if config.layer_norm_after_embedding:
             self.layer_norm_1 = (
-                transformer_engine.pytorch.RMSNorm(
-                    config.hidden_size, config.norm_eps, params_dtype=config.torch_dtype
-                )
+                transformer_engine.pytorch.RMSNorm(config.hidden_size, config.norm_eps, params_dtype=config.dtype)
                 if config.rms_norm
                 else transformer_engine.pytorch.LayerNorm(
-                    config.hidden_size, config.norm_eps, params_dtype=config.torch_dtype
+                    config.hidden_size, config.norm_eps, params_dtype=config.dtype
                 )
             )
 
@@ -169,6 +167,9 @@ def __init__(self, config: AMPLIFYConfig, **kwargs):
             intermediate_size = int(2 * config.intermediate_size / 3)
             intermediate_size = multiple_of * ((intermediate_size + multiple_of - 1) // multiple_of)
 
+        else:
+            intermediate_size = config.intermediate_size
+
         self.transformer_encoder = nn.ModuleList()
         for layer_num in range(config.num_hidden_layers):
             self.transformer_encoder.append(
@@ -194,7 +195,7 @@ def __init__(self, config: AMPLIFYConfig, **kwargs):
                     window_size=(-1, -1),
                     rotary_pos_interleaved=True,
                     seq_length=config.max_length,
-                    params_dtype=config.torch_dtype,
+                    params_dtype=config.dtype,
                 )
             )
 
@@ -212,7 +213,6 @@ def forward(
         output_hidden_states=False,
         output_attentions=False,
         labels=None,
-        **kwargs,
     ) -> BaseModelOutput:
         """Forward pass of the AMPLIFY model.
 
@@ -222,7 +222,6 @@ def forward(
             output_hidden_states (bool): Whether to output the hidden states.
             output_attentions (bool): Whether to output the attention weights.
             labels (torch.Tensor): The labels.
-            **kwargs: Additional arguments.
 
         Returns:
             BaseModelOutput: The output of the model.
@@ -277,7 +276,7 @@ def __init__(self, config: AMPLIFYConfig, **kwargs):
                 config.hidden_size,
                 config.padded_vocab_size,
                 config.norm_eps,
-                params_dtype=config.torch_dtype,
+                params_dtype=config.dtype,
                 normalization="RMSNorm" if config.rms_norm else "LayerNorm",
                 init_method=lambda x: torch.nn.init.uniform_(
                     x, -self.config.decoder_init_range, self.config.decoder_init_range
@@ -286,7 +285,7 @@ def __init__(self, config: AMPLIFYConfig, **kwargs):
 
         else:
             self.decoder = transformer_engine.pytorch.Linear(
-                config.hidden_size, config.vocab_size, params_dtype=config.torch_dtype
+                config.hidden_size, config.vocab_size, params_dtype=config.dtype
             )
 
     def forward(
@@ -296,7 +295,6 @@ def forward(
         output_hidden_states=False,
         output_attentions=False,
         labels=None,
-        **kwargs,
     ) -> MaskedLMOutput:
         """Forward pass of the AMPLIFYForMaskedLM model.
 
@@ -306,7 +304,6 @@ def forward(
             output_hidden_states (bool): Whether to output the hidden states.
             output_attentions (bool): Whether to output the attention weights.
             labels (torch.Tensor): The labels.
-            **kwargs: Additional arguments.
 
         Returns:
             MaskedLMOutput: The output of the model.
@@ -317,7 +314,6 @@ def forward(
             output_hidden_states,
             output_attentions,
             labels,
-            **kwargs,
         )
 
         # Classification head with layer norm
diff --git a/models/amplify/src/amplify/state_dict_convert.py b/models/amplify/src/amplify/state_dict_convert.py
@@ -46,7 +46,7 @@ def convert_amplify_hf_to_te(model_hf: nn.Module, **config_kwargs) -> nn.Module:
     """
     te_config = AMPLIFYConfig(**model_hf.config.to_dict(), **config_kwargs)
     with init_empty_weights():
-        model_te = AMPLIFYForMaskedLM(te_config, torch_dtype=te_config.torch_dtype)
+        model_te = AMPLIFYForMaskedLM(te_config, dtype=te_config.dtype)
 
     output_model = io.apply_transforms(
         model_hf,
diff --git a/models/amplify/tests/conftest.py b/models/amplify/tests/conftest.py
@@ -36,7 +36,7 @@ def tokenizer():
 @pytest.fixture
 def config():
     config = AutoConfig.from_pretrained("chandar-lab/AMPLIFY_120M", trust_remote_code=True)
-    config.torch_dtype = torch.bfloat16
+    config.dtype = torch.bfloat16
     return config
 
 
@@ -68,6 +68,7 @@ def input_data(tokenizer):
         tokenizer=tokenizer,
         mlm_probability=0.15,
         pad_to_multiple_of=1024,
+        seed=42,
     )
 
     def tokenize_function(examples):
diff --git a/models/amplify/tests/test_amplify_model.py b/models/amplify/tests/test_amplify_model.py
@@ -168,3 +168,52 @@ def test_convert_state_dict():
     te_state_dict_keys.remove("decoder.bias")
 
     assert len(te_state_dict_keys) == 0
+
+
+def test_hf_trained_model_loss(input_data):
+    model = amp_hf.AMPLIFY.from_pretrained("chandar-lab/AMPLIFY_120M")
+    model.to("cuda", dtype=torch.bfloat16)
+    input_data = {k: v.to("cuda") for k, v in input_data.items()}
+    model.eval()
+    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+        output = model(**input_data)
+
+    torch.testing.assert_close(output.loss.detach().cpu(), torch.tensor(2.4), atol=1e-1, rtol=1e-2)
+
+
+def test_te_trained_model_loss(input_data):
+    model_hf = amp_hf.AMPLIFY.from_pretrained("chandar-lab/AMPLIFY_120M")
+    model = convert_amplify_hf_to_te(model_hf)
+    model.to("cuda", dtype=torch.bfloat16)
+    input_data = {k: v.to("cuda") for k, v in input_data.items()}
+    model.eval()
+    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+        output = model(**input_data)
+
+    torch.testing.assert_close(output.loss.detach().cpu(), torch.tensor(2.4), atol=1e-1, rtol=1e-2)
+
+
+def test_hf_reinitialized_model_loss(input_data):
+    config = amp_hf.AMPLIFYConfig.from_pretrained("chandar-lab/AMPLIFY_120M")
+    model = amp_hf.AMPLIFY(config)
+    model.to("cuda", dtype=torch.bfloat16)
+    input_data = {k: v.to("cuda") for k, v in input_data.items()}
+    model.eval()
+    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+        output = model(**input_data)
+
+    loss = output.loss.detach().cpu()
+    assert loss < 3.5, f"Loss is {loss}, expected less than 3.5"
+
+
+def test_te_reinitialized_model_loss(input_data):
+    config = amp_te.AMPLIFYConfig.from_pretrained("chandar-lab/AMPLIFY_120M")
+    model = amp_te.AMPLIFYForMaskedLM(config)
+    model.to("cuda", dtype=torch.bfloat16)
+    input_data = {k: v.to("cuda") for k, v in input_data.items()}
+    model.eval()
+    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+        output = model(**input_data)
+
+    loss = output.loss.detach().cpu()
+    assert loss < 3.5, f"Loss is {loss}, expected less than 3.5"
diff --git a/models/amplify/tests/test_encoder_block.py b/models/amplify/tests/test_encoder_block.py
@@ -57,7 +57,7 @@ def data(self) -> torch.Tensor:
 @pytest.fixture
 def config():
     config = AutoConfig.from_pretrained("chandar-lab/AMPLIFY_120M", trust_remote_code=True)
-    config.torch_dtype = torch.bfloat16
+    config.dtype = torch.bfloat16
     return config
 
 
@@ -169,7 +169,7 @@ def test_encoder_block_forward(inputs, config):
         window_size=(-1, -1),
         rotary_pos_interleaved=True,
         seq_length=config.max_length,
-        params_dtype=config.torch_dtype,
+        params_dtype=config.dtype,
     ).to("cuda", dtype=torch.bfloat16)
 
     state_dict_mapping = {
diff --git a/models/esm2/Dockerfile b/models/esm2/Dockerfile
@@ -1,6 +1,5 @@
 FROM nvcr.io/nvidia/pytorch:25.06-py3
-COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
 WORKDIR /workspace/bionemo
 COPY . .
-RUN --mount=type=cache,target=/root/.cache/uv \
+RUN --mount=type=cache,target=/root/.cache/pip \
     PIP_CONSTRAINT= pip install -e .

Original file line number	Diff line number	Diff line change
`@@ -36,7 +36,7 @@`
`36`	`36`	`# Smoke test that the model can be loaded.`
`37`	`37`	`model_te = AutoModelForMaskedLM.from_pretrained(`
`38`	`38`	`f"./checkpoint_export/{tag}",`
`39`		`- torch_dtype=torch.bfloat16,`
	`39`	`+ dtype=torch.bfloat16,`
`40`	`40`	`trust_remote_code=True,`
`41`	`41`	`)`
`42`	`42`	`del model_te`