Switch deprecated torch_dtype parameter to dtype

pstjohn · pstjohn · commit b6600a5ef1f7 · 2025-09-08T14:58:42.000-07:00
Signed-off-by: Peter St. John <pstjohn@nvidia.com>
diff --git a/models/README.md b/models/README.md
@@ -111,7 +111,7 @@ def convert_hf_to_te(model_hf: nn.Module, **config_kwargs) -> nn.Module:
     """Convert HuggingFace model to TransformerEngine format."""
     te_config = MyModelTEConfig(**model_hf.config.to_dict(), **config_kwargs)
     with init_empty_weights():
-        model_te = MyModelTE(te_config, torch_dtype=te_config.torch_dtype)
+        model_te = MyModelTE(te_config, dtype=te_config.dtype)
 
     output_model = io.apply_transforms(model_hf, model_te, ...)
     return output_model
diff --git a/models/amplify/export.py b/models/amplify/export.py
@@ -36,7 +36,7 @@
     # Smoke test that the model can be loaded.
     model_te = AutoModelForMaskedLM.from_pretrained(
         f"./checkpoint_export/{tag}",
-        torch_dtype=torch.bfloat16,
+        dtype=torch.bfloat16,
         trust_remote_code=True,
     )
     del model_te
diff --git a/models/amplify/src/amplify/amplify_te.py b/models/amplify/src/amplify/amplify_te.py
@@ -147,17 +147,15 @@ def __init__(self, config: AMPLIFYConfig, **kwargs):
             config.padded_vocab_size,
             config.hidden_size,
             padding_idx=config.pad_token_id,
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
         )
 
         if config.layer_norm_after_embedding:
             self.layer_norm_1 = (
-                transformer_engine.pytorch.RMSNorm(
-                    config.hidden_size, config.norm_eps, params_dtype=config.torch_dtype
-                )
+                transformer_engine.pytorch.RMSNorm(config.hidden_size, config.norm_eps, params_dtype=config.dtype)
                 if config.rms_norm
                 else transformer_engine.pytorch.LayerNorm(
-                    config.hidden_size, config.norm_eps, params_dtype=config.torch_dtype
+                    config.hidden_size, config.norm_eps, params_dtype=config.dtype
                 )
             )
 
@@ -194,7 +192,7 @@ def __init__(self, config: AMPLIFYConfig, **kwargs):
                     window_size=(-1, -1),
                     rotary_pos_interleaved=True,
                     seq_length=config.max_length,
-                    params_dtype=config.torch_dtype,
+                    params_dtype=config.dtype,
                 )
             )
 
@@ -277,7 +275,7 @@ def __init__(self, config: AMPLIFYConfig, **kwargs):
                 config.hidden_size,
                 config.padded_vocab_size,
                 config.norm_eps,
-                params_dtype=config.torch_dtype,
+                params_dtype=config.dtype,
                 normalization="RMSNorm" if config.rms_norm else "LayerNorm",
                 init_method=lambda x: torch.nn.init.uniform_(
                     x, -self.config.decoder_init_range, self.config.decoder_init_range
@@ -286,7 +284,7 @@ def __init__(self, config: AMPLIFYConfig, **kwargs):
 
         else:
             self.decoder = transformer_engine.pytorch.Linear(
-                config.hidden_size, config.vocab_size, params_dtype=config.torch_dtype
+                config.hidden_size, config.vocab_size, params_dtype=config.dtype
             )
 
     def forward(
diff --git a/models/amplify/src/amplify/state_dict_convert.py b/models/amplify/src/amplify/state_dict_convert.py
@@ -46,7 +46,7 @@ def convert_amplify_hf_to_te(model_hf: nn.Module, **config_kwargs) -> nn.Module:
     """
     te_config = AMPLIFYConfig(**model_hf.config.to_dict(), **config_kwargs)
     with init_empty_weights():
-        model_te = AMPLIFYForMaskedLM(te_config, torch_dtype=te_config.torch_dtype)
+        model_te = AMPLIFYForMaskedLM(te_config, dtype=te_config.dtype)
 
     output_model = io.apply_transforms(
         model_hf,
diff --git a/models/amplify/tests/conftest.py b/models/amplify/tests/conftest.py
@@ -36,7 +36,7 @@ def tokenizer():
 @pytest.fixture
 def config():
     config = AutoConfig.from_pretrained("chandar-lab/AMPLIFY_120M", trust_remote_code=True)
-    config.torch_dtype = torch.bfloat16
+    config.dtype = torch.bfloat16
     return config
 
 
diff --git a/models/amplify/tests/test_encoder_block.py b/models/amplify/tests/test_encoder_block.py
@@ -57,7 +57,7 @@ def data(self) -> torch.Tensor:
 @pytest.fixture
 def config():
     config = AutoConfig.from_pretrained("chandar-lab/AMPLIFY_120M", trust_remote_code=True)
-    config.torch_dtype = torch.bfloat16
+    config.dtype = torch.bfloat16
     return config
 
 
@@ -169,7 +169,7 @@ def test_encoder_block_forward(inputs, config):
         window_size=(-1, -1),
         rotary_pos_interleaved=True,
         seq_length=config.max_length,
-        params_dtype=config.torch_dtype,
+        params_dtype=config.dtype,
     ).to("cuda", dtype=torch.bfloat16)
 
     state_dict_mapping = {
diff --git a/models/esm2/src/esm/export.py b/models/esm2/src/esm/export.py
@@ -64,7 +64,7 @@ def export_hf_checkpoint(tag: str, export_path: Path):
     # Smoke test that the model can be loaded.
     model_te = AutoModelForMaskedLM.from_pretrained(
         export_path / tag,
-        torch_dtype=torch.bfloat16,
+        dtype=torch.bfloat16,
         trust_remote_code=True,
     )
     del model_te
diff --git a/models/esm2/src/esm/modeling_esm_te.py b/models/esm2/src/esm/modeling_esm_te.py
@@ -129,7 +129,7 @@ def __init__(self, config: NVEsmConfig):
                     micro_batch_size=config.micro_batch_size,
                     num_gqa_groups=config.num_attention_heads,
                     fuse_qkv_params=config.fuse_qkv_params,
-                    params_dtype=config.torch_dtype,
+                    params_dtype=config.dtype,
                     window_size=(-1, -1),
                 )
                 for i in range(config.num_hidden_layers)
diff --git a/models/esm2/tests/test_distributed_strategies.py b/models/esm2/tests/test_distributed_strategies.py
@@ -145,14 +145,14 @@ def run_forward_backward(use_te: bool, strategy: Strategy, input_data: dict, dis
         if use_te:
             model = AutoModelForMaskedLM.from_pretrained(
                 "nvidia/esm2_t6_8M_UR50D",
-                torch_dtype=torch.bfloat16,
+                dtype=torch.bfloat16,
                 trust_remote_code=True,
             )
             transformer_layers = model.esm.encoder.layers
         else:
             model = AutoModelForMaskedLM.from_pretrained(
                 "facebook/esm2_t6_8M_UR50D",
-                torch_dtype=torch.bfloat16,
+                dtype=torch.bfloat16,
             )
             transformer_layers = model.esm.encoder.layer
             del model.esm.contact_head  # Unused in backwards pass.
diff --git a/recipes/amplify_accelerate_te_fp8/train.py b/recipes/amplify_accelerate_te_fp8/train.py
@@ -41,7 +41,7 @@ def main(args: DictConfig):
     model = AutoModelForMaskedLM.from_config(
         config,
         trust_remote_code=True,
-        torch_dtype=torch.bfloat16,
+        dtype=torch.bfloat16,
     )
 
     train_dataset, eval_dataset, data_collator = create_datasets_and_collator(
diff --git a/recipes/esm2_accelerate/train.py b/recipes/esm2_accelerate/train.py
@@ -38,7 +38,7 @@ def main(args: DictConfig):
     config = AutoConfig.from_pretrained(args.model_tag, trust_remote_code=True)
     config.max_seq_length = args.max_seq_length
     config.micro_batch_size = args.trainer.per_device_train_batch_size
-    model = AutoModelForMaskedLM.from_config(config, trust_remote_code=True, torch_dtype=torch.bfloat16)
+    model = AutoModelForMaskedLM.from_config(config, trust_remote_code=True, dtype=torch.bfloat16)
 
     train_dataset, eval_dataset, data_collator = create_datasets_and_collator(
         tokenizer_name=args.model_tag,
diff --git a/recipes/esm2_native_te_nvfsdp_thd/modeling_esm_te.py b/recipes/esm2_native_te_nvfsdp_thd/modeling_esm_te.py
@@ -163,7 +163,7 @@ def __init__(self, config: NVEsmConfig):
                     micro_batch_size=config.micro_batch_size,
                     num_gqa_groups=config.num_attention_heads,
                     fuse_qkv_params=config.fuse_qkv_params,
-                    params_dtype=config.torch_dtype,
+                    params_dtype=config.dtype,
                     window_size=(-1, -1),
                 )
                 for i in range(config.num_hidden_layers)
diff --git a/recipes/esm2_native_te_nvfsdp_thd/train.py b/recipes/esm2_native_te_nvfsdp_thd/train.py
@@ -112,7 +112,7 @@ def main(args: DictConfig):
     config = NVEsmConfig(
         **AutoConfig.from_pretrained(
             f"facebook/{args.model_name}",
-            torch_dtype=torch.bfloat16,
+            dtype=torch.bfloat16,
         ).to_dict(),
         micro_batch_size=args.micro_batch_size,
         max_seq_length=args.max_seq_length,
diff --git a/recipes/geneformer_native_te_nvfsdp_fp8/README.md b/recipes/geneformer_native_te_nvfsdp_fp8/README.md
@@ -153,7 +153,7 @@ import torch
 # Load the trained model
 model_path = "/workspace/bionemo/checkpoints/your_run/final_model"
 model = BertForMaskedLM.from_pretrained(
-    model_path, torch_dtype=torch.bfloat16, trust_remote_code=True
+    model_path, dtype=torch.bfloat16, trust_remote_code=True
 )
 
 # Example 1: Model inference
diff --git a/recipes/geneformer_native_te_nvfsdp_fp8/modeling_bert_te.py b/recipes/geneformer_native_te_nvfsdp_fp8/modeling_bert_te.py
@@ -75,7 +75,7 @@ class TEBertConfig(BertConfig):
     """Configuration class for the TE BERT model.
 
     This class is a subclass of BertConfig, and it adds the following attributes:
-    - torch_dtype: The dtype of the model parameters.
+    - dtype: The dtype of the model parameters.
     - use_te_layers: Whether to use the TE layers.
     """
 
@@ -87,7 +87,7 @@ def __init__(self, **kwargs):
         """
         super().__init__(**kwargs)
         # TODO(@jomitchell): Fix this in JIRA BIONEMO-2406
-        self.torch_dtype = kwargs.get("torch_dtype", torch.bfloat16)
+        self.dtype = kwargs.get("dtype", torch.bfloat16)
         self.use_te_layers = kwargs.get("use_te_layers", False)
 
 
@@ -117,7 +117,7 @@ def __init__(self, config, layer_number=None):
             micro_batch_size=config.micro_batch_size,
             num_gqa_groups=config.num_attention_heads,
             fuse_qkv_params=False,
-            params_dtype=config.torch_dtype,
+            params_dtype=config.dtype,
             window_size=(-1, -1),
         )
         self.is_decoder = config.is_decoder
diff --git a/recipes/geneformer_native_te_nvfsdp_fp8/test_distributed_checkpointing.py b/recipes/geneformer_native_te_nvfsdp_fp8/test_distributed_checkpointing.py
@@ -483,7 +483,7 @@ def test_safetensors_save_load_roundtrip_nvfsdp():
             # Load the model using our custom BertForMaskedLM class
             loaded_transformers_model = BertForMaskedLM.from_pretrained(
                 final_model_dir,  # Use the directory created by save_pretrained
-                torch_dtype=torch.bfloat16,
+                dtype=torch.bfloat16,
                 trust_remote_code=True,
             )
 
@@ -641,7 +641,7 @@ def test_distributed_safetensors_multiprocess_nvfsdp():
             from modeling_bert_te import BertForMaskedLM
 
             loaded_model = BertForMaskedLM.from_pretrained(
-                final_model_dir, torch_dtype=torch.bfloat16, trust_remote_code=True
+                final_model_dir, dtype=torch.bfloat16, trust_remote_code=True
             )
 
             # Basic validation
@@ -740,7 +740,7 @@ def test_safetensors_multiprocess_roundtrip_nvfsdp():
 
             # Load the model using our custom BertForMaskedLM class
             loaded_model = BertForMaskedLM.from_pretrained(
-                final_model_dir, torch_dtype=torch.bfloat16, trust_remote_code=True
+                final_model_dir, dtype=torch.bfloat16, trust_remote_code=True
             )
 
             print("✅ Successfully loaded multiprocess model using BertForMaskedLM.from_pretrained()")
@@ -878,11 +878,11 @@ def test_safetensors_unsharded_weights_consistency():
 
             # Load both models
             single_model = BertForMaskedLM.from_pretrained(
-                single_model_dir, torch_dtype=torch.bfloat16, trust_remote_code=True
+                single_model_dir, dtype=torch.bfloat16, trust_remote_code=True
             )
 
             multi_model = BertForMaskedLM.from_pretrained(
-                multi_model_dir, torch_dtype=torch.bfloat16, trust_remote_code=True
+                multi_model_dir, dtype=torch.bfloat16, trust_remote_code=True
             )
 
             # Get state dicts
@@ -1038,7 +1038,7 @@ def test_distributed_safetensors_multiprocess_ddp():
             from modeling_bert_te import BertForMaskedLM
 
             loaded_model = BertForMaskedLM.from_pretrained(
-                final_model_dir, torch_dtype=torch.bfloat16, trust_remote_code=True
+                final_model_dir, dtype=torch.bfloat16, trust_remote_code=True
             )
 
             # Basic validation
@@ -1137,7 +1137,7 @@ def test_safetensors_multiprocess_roundtrip_ddp():
 
             # Load the model using our custom BertForMaskedLM class
             loaded_model = BertForMaskedLM.from_pretrained(
-                final_model_dir, torch_dtype=torch.bfloat16, trust_remote_code=True
+                final_model_dir, dtype=torch.bfloat16, trust_remote_code=True
             )
 
             print("✅ Successfully loaded multiprocess DDP model using BertForMaskedLM.from_pretrained()")
@@ -1275,11 +1275,11 @@ def test_safetensors_unsharded_weights_consistency_ddp():
 
             # Load both models
             single_model = BertForMaskedLM.from_pretrained(
-                single_model_dir, torch_dtype=torch.bfloat16, trust_remote_code=True
+                single_model_dir, dtype=torch.bfloat16, trust_remote_code=True
             )
 
             multi_model = BertForMaskedLM.from_pretrained(
-                multi_model_dir, torch_dtype=torch.bfloat16, trust_remote_code=True
+                multi_model_dir, dtype=torch.bfloat16, trust_remote_code=True
             )
 
             # Get state dicts
diff --git a/recipes/geneformer_native_te_nvfsdp_fp8/train.py b/recipes/geneformer_native_te_nvfsdp_fp8/train.py
@@ -128,7 +128,7 @@ def main(cfg: DictConfig) -> None:
             },
         )
 
-    bert_model_config = TEBertConfig(**cfg.model, torch_dtype=torch.bfloat16)
+    bert_model_config = TEBertConfig(**cfg.model, dtype=torch.bfloat16)
     # Note. One may notice here that we are using BertConfig from transformers.models.bert.configuration_bert. instead of one from modeling_bert_te.py
     # This is because, the BertConfig will simply pass through any additional argument to the model.
     model = BertForMaskedLM(bert_model_config)
@@ -146,7 +146,7 @@ def main(cfg: DictConfig) -> None:
     # Here we cast the model layers to the specified dtype. in our TEBertConfig we specify the dtype for the
     # TE layers, and here we simply cast the all the other layers to the same dtype.
     # TODO(@jomitchell): BIONEMO-2406: Remove this after verifying FP8 works.
-    model = model.to(device=device, dtype=bert_model_config.torch_dtype)  # type: ignore
+    model = model.to(device=device, dtype=bert_model_config.dtype)  # type: ignore
 
     if cfg.training.use_nvfsdp:
         model, optimizer = fully_shard(
@@ -165,7 +165,7 @@ def main(cfg: DictConfig) -> None:
     else:
         # Use standard PyTorch DDP (no nvFSDP config)
         # TODO(@jomitchell): BIONEMO-2406: Keep this until this ticket is done.
-        # model = model.to(device=device, dtype=bert_model_config.torch_dtype)  # type: ignore
+        # model = model.to(device=device, dtype=bert_model_config.dtype)  # type: ignore
         model = torch.nn.parallel.DistributedDataParallel(
             model,
             device_ids=[dist_config.local_rank],

Original file line number	Diff line number	Diff line change
`@@ -36,7 +36,7 @@`
`36`	`36`	`# Smoke test that the model can be loaded.`
`37`	`37`	`model_te = AutoModelForMaskedLM.from_pretrained(`
`38`	`38`	`f"./checkpoint_export/{tag}",`
`39`		`- torch_dtype=torch.bfloat16,`
	`39`	`+ dtype=torch.bfloat16,`
`40`	`40`	`trust_remote_code=True,`
`41`	`41`	`)`
`42`	`42`	`del model_te`
Original file line number	Diff line number	Diff line change
`@@ -64,7 +64,7 @@ def export_hf_checkpoint(tag: str, export_path: Path):`
`64`	`64`	`# Smoke test that the model can be loaded.`
`65`	`65`	`model_te = AutoModelForMaskedLM.from_pretrained(`
`66`	`66`	`export_path / tag,`
`67`		`- torch_dtype=torch.bfloat16,`
	`67`	`+ dtype=torch.bfloat16,`
`68`	`68`	`trust_remote_code=True,`
`69`	`69`	`)`
`70`	`70`	`del model_te`
Original file line number	Diff line number	Diff line change
`@@ -129,7 +129,7 @@ def __init__(self, config: NVEsmConfig):`
`129`	`129`	`micro_batch_size=config.micro_batch_size,`
`130`	`130`	`num_gqa_groups=config.num_attention_heads,`
`131`	`131`	`fuse_qkv_params=config.fuse_qkv_params,`
`132`		`- params_dtype=config.torch_dtype,`
	`132`	`+ params_dtype=config.dtype,`
`133`	`133`	`window_size=(-1, -1),`
`134`	`134`	`)`
`135`	`135`	`for i in range(config.num_hidden_layers)`
Original file line number	Diff line number	Diff line change
`@@ -41,7 +41,7 @@ def main(args: DictConfig):`
`41`	`41`	`model = AutoModelForMaskedLM.from_config(`
`42`	`42`	`config,`
`43`	`43`	`trust_remote_code=True,`
`44`		`- torch_dtype=torch.bfloat16,`
	`44`	`+ dtype=torch.bfloat16,`
`45`	`45`	`)`
`46`	`46`
`47`	`47`	`train_dataset, eval_dataset, data_collator = create_datasets_and_collator(`