@@ -161,7 +161,9 @@ def eager_attention_forward(
         causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
         attn_weights = attn_weights + causal_mask

-    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    # Note we do not upcast the attention weights to float32 here, as it introduces
+    # noise in the attention weights and is not necessary when using BF16
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=query.dtype)
     attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
     attn_output = torch.matmul(attn_weights, value_states)
     attn_output = attn_output.transpose(1, 2).contiguous()
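For context, a minimal standalone sketch (not part of the diff, with hypothetical random inputs) contrasting the removed float32-upcast softmax path with the new same-dtype path; both return BF16 weights, differing only by rounding inside the softmax:

# Sketch only: compares the old and new softmax paths from the hunk above.
import torch
import torch.nn.functional as F

# Hypothetical BF16 attention scores of shape (batch, heads, q_len, k_len).
attn_weights = torch.randn(2, 4, 8, 8, dtype=torch.bfloat16)

# Old path: upcast to float32 for the softmax, then cast back to the query dtype.
old = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(attn_weights.dtype)

# New path: keep the softmax in the query dtype (BF16 here), no round-trip cast.
new = F.softmax(attn_weights, dim=-1, dtype=attn_weights.dtype)

print(old.dtype, new.dtype, (old - new).abs().max())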
@@ -479,14 +481,16 @@ def forward(
     )


-class GPT2ForSequenceClassification(GenericForSequenceClassification, GPT2PreTrainedModel): ...
+class GPT2ForSequenceClassification(GenericForSequenceClassification, GPT2PreTrainedModel):
+    ...


 class GPT2ForQuestionAnswering(GenericForQuestionAnswering, GPT2PreTrainedModel):
     base_model_prefix = "transformer"  # For BC, where `transformer` was used instead of `model`


-class GPT2ForTokenClassification(GenericForTokenClassification, GPT2PreTrainedModel): ...
+class GPT2ForTokenClassification(GenericForTokenClassification, GPT2PreTrainedModel):
+    ...


 __all__ = [