remove kwargs from amplify

pstjohn · pstjohn · commit b51d21e98780 · 2025-09-10T07:26:25.000-07:00
Signed-off-by: Peter St. John <pstjohn@nvidia.com>
diff --git a/models/amplify/src/amplify/amplify_te.py b/models/amplify/src/amplify/amplify_te.py
@@ -212,7 +212,6 @@ def forward(
         output_hidden_states=False,
         output_attentions=False,
         labels=None,
-        **kwargs,
     ) -> BaseModelOutput:
         """Forward pass of the AMPLIFY model.
 
@@ -222,7 +221,6 @@ def forward(
             output_hidden_states (bool): Whether to output the hidden states.
             output_attentions (bool): Whether to output the attention weights.
             labels (torch.Tensor): The labels.
-            **kwargs: Additional arguments.
 
         Returns:
             BaseModelOutput: The output of the model.
@@ -296,7 +294,6 @@ def forward(
         output_hidden_states=False,
         output_attentions=False,
         labels=None,
-        **kwargs,
     ) -> MaskedLMOutput:
         """Forward pass of the AMPLIFYForMaskedLM model.
 
@@ -306,7 +303,6 @@ def forward(
             output_hidden_states (bool): Whether to output the hidden states.
             output_attentions (bool): Whether to output the attention weights.
             labels (torch.Tensor): The labels.
-            **kwargs: Additional arguments.
 
         Returns:
             MaskedLMOutput: The output of the model.
@@ -317,7 +313,6 @@ def forward(
             output_hidden_states,
             output_attentions,
             labels,
-            **kwargs,
         )
 
         # Classification head with layer norm
diff --git a/recipes/esm2_accelerate/train.py b/recipes/esm2_accelerate/train.py
@@ -46,7 +46,7 @@ def main(args: DictConfig):
     )
 
     config = AutoConfig.from_pretrained(args.model_tag, trust_remote_code=True)
-    model = AutoModelForMaskedLM.from_config(config, trust_remote_code=True, torch_dtype=torch.bfloat16)
+    model = AutoModelForMaskedLM.from_config(config, trust_remote_code=True, dtype=torch.bfloat16)
 
     train_dataset, eval_dataset, data_collator = create_datasets_and_collator(
         tokenizer_name=args.model_tag,
@@ -72,6 +72,8 @@ def main(args: DictConfig):
             logger.info("Resuming from checkpoint: %s", last_checkpoint)
         else:
             logger.info("No checkpoint found, starting from scratch")
+        if state.is_main_process:
+            breakpoint()
         train_result = trainer.train(resume_from_checkpoint=last_checkpoint)
         logger.info("Training complete. Metrics: %s", train_result.metrics)
         trainer.save_metrics("train", train_result.metrics)