Revert dtype calls to the transformers 4.55.0 API (dtype -> torch_dtype)

pstjohn · pstjohn · commit cae320a3797e · 2025-09-16T13:10:46.000-07:00
Signed-off-by: Peter St. John <pstjohn@nvidia.com>
diff --git a/models/esm2/src/esm/modeling_esm_te.py b/models/esm2/src/esm/modeling_esm_te.py
@@ -128,7 +128,7 @@ def __init__(self, config: NVEsmConfig):
                     micro_batch_size=config.micro_batch_size,
                     num_gqa_groups=config.num_attention_heads,
                     fuse_qkv_params=config.fuse_qkv_params,
-                    params_dtype=config.dtype,
+                    params_dtype=config.torch_dtype,
                     window_size=(-1, -1),
                 )
                 for i in range(config.num_hidden_layers)
diff --git a/models/esm2/tests/test_thd_inputs.py b/models/esm2/tests/test_thd_inputs.py
@@ -30,7 +30,9 @@ def te_model_checkpoint(tmp_path):
 
 
 def test_thd_from_collator_output(te_model_checkpoint, input_data_thd):
-    model_thd = NVEsmForMaskedLM.from_pretrained(te_model_checkpoint, attn_input_format="thd", dtype=torch.bfloat16)
+    model_thd = NVEsmForMaskedLM.from_pretrained(
+        te_model_checkpoint, attn_input_format="thd", torch_dtype=torch.bfloat16
+    )
     model_thd.to("cuda")
     input_data_thd = {k: v.to("cuda") if isinstance(v, torch.Tensor) else v for k, v in input_data_thd.items()}
     with torch.no_grad(), torch.amp.autocast("cuda", dtype=torch.bfloat16):
@@ -76,8 +78,10 @@ def test_thd_values_match(te_model_checkpoint, tokenizer):
     input_data_bhsd = bhsd_collator(sequences)
     input_data_thd = thd_collator(sequences)
 
-    model_bshd = NVEsmForMaskedLM.from_pretrained(te_model_checkpoint, dtype=torch.bfloat16)
-    model_thd = NVEsmForMaskedLM.from_pretrained(te_model_checkpoint, attn_input_format="thd", dtype=torch.bfloat16)
+    model_bshd = NVEsmForMaskedLM.from_pretrained(te_model_checkpoint, torch_dtype=torch.bfloat16)
+    model_thd = NVEsmForMaskedLM.from_pretrained(
+        te_model_checkpoint, attn_input_format="thd", torch_dtype=torch.bfloat16
+    )
     model_bshd.to("cuda")
     model_thd.to("cuda")
 

Original file line number	Diff line number	Diff line change
`@@ -128,7 +128,7 @@ def __init__(self, config: NVEsmConfig):`
`128`	`128`	`micro_batch_size=config.micro_batch_size,`
`129`	`129`	`num_gqa_groups=config.num_attention_heads,`
`130`	`130`	`fuse_qkv_params=config.fuse_qkv_params,`
`131`		`- params_dtype=config.dtype,`
	`131`	`+ params_dtype=config.torch_dtype,`
`132`	`132`	`window_size=(-1, -1),`
`133`	`133`	`)`
`134`	`134`	`for i in range(config.num_hidden_layers)`