
Commit df3ff5a

Merge branch 'main' into cb-tp2
2 parents eee5471 + 2d6815e commit df3ff5a

21 files changed

Lines changed: 20 additions & 40 deletions

docs/source/en/attention_interface.md

Lines changed: 10 additions & 3 deletions
@@ -130,18 +130,23 @@ model = AutoModelForImageTextToText.from_pretrained(

 Customize or create new attention functions by adding them to the attention registry with [`AttentionInterface.register`]. Models use these functions through the `attn_implementation` argument.

-This example customizes the attention function to print a statement for each layer.
+> [!WARNING]
+> Register a matching attention mask function when you register a custom attention function. If the custom `attn_implementation` name is not registered in [`AttentionMaskInterface`], Transformers skips mask creation and passes `attention_mask=None` to the attention layers. Your attention function must handle causal, padding, packing, or sliding-window constraints itself, or those constraints can be silently dropped.
+
+This example customizes the attention function to print a statement for each layer. It keeps the mask in the original implementation by registering `masking_utils.sdpa_mask` as the attention mask function.

 ```python
 import torch
-from transformers import AutoModelForCausalLM, AttentionInterface
+from transformers import AutoModelForCausalLM, AttentionInterface, AttentionMaskInterface
 from transformers.integrations.sdpa_attention import sdpa_attention_forward
+from transformers.masking_utils import sdpa_mask

 def my_new_sdpa(*args, **kwargs):
     print("I just entered the attention computation")
     return sdpa_attention_forward(*args, **kwargs)

 AttentionInterface.register("my_new_sdpa", my_new_sdpa)
+AttentionMaskInterface.register("my_new_sdpa", sdpa_mask)  # must have the same name as the registered attention function

 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B", attn_implementation="my_new_sdpa")
 model(torch.ones(1, 5, dtype=int))
@@ -151,8 +156,9 @@ You can also add new arguments to the attention function. Models supporting [`At

 ```python
 import torch
-from transformers import AutoModelForCausalLM, AttentionInterface
+from transformers import AutoModelForCausalLM, AttentionInterface, AttentionMaskInterface
 from transformers.integrations.sdpa_attention import sdpa_attention_forward
+from transformers.masking_utils import sdpa_mask

 def custom_attention(
     module: torch.nn.Module,  # required arg
@@ -168,6 +174,7 @@ def custom_attention(
     return attn_output, attn_weights  # attn_weights are optional here

 AttentionInterface.register("custom", custom_attention)
+AttentionMaskInterface.register("custom", sdpa_mask)  # to leave the existing mask untouched

 model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="custom")
 model(torch.ones(1, 5, dtype=int), a_new_kwargs=..., another_new_kwargs=...)
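
Note on the warning added above: if no mask function is registered for a custom `attn_implementation`, the attention function receives `attention_mask=None` and must enforce masking on its own. The sketch below is an illustrative fallback, not part of this commit; the name `causal_only_sdpa` and the mask construction are assumptions, and it only restores causality (padding and packing are still ignored).

```python
import torch
from transformers import AttentionInterface
from transformers.integrations.sdpa_attention import sdpa_attention_forward


def causal_only_sdpa(module, query, key, value, attention_mask, **kwargs):
    # Without a matching AttentionMaskInterface entry, attention_mask arrives as None,
    # so rebuild a plain causal mask here instead of silently dropping causality.
    if attention_mask is None:
        q_len, kv_len = query.shape[-2], key.shape[-2]
        offset = kv_len - q_len  # accounts for previously cached tokens when decoding
        idx_q = torch.arange(q_len, device=query.device)[:, None]
        idx_k = torch.arange(kv_len, device=query.device)[None, :]
        visible = idx_k <= idx_q + offset  # True where the key position may be attended
        attention_mask = torch.zeros(q_len, kv_len, dtype=query.dtype, device=query.device)
        attention_mask = attention_mask.masked_fill(~visible, torch.finfo(query.dtype).min)
        attention_mask = attention_mask[None, None]  # broadcasts over batch and heads
    return sdpa_attention_forward(module, query, key, value, attention_mask, **kwargs)


AttentionInterface.register("causal_only_sdpa", causal_only_sdpa)
```

Registering `sdpa_mask` under the same name, as the documentation change above does, is the simpler and more complete option.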

src/transformers/generation/utils.py

Lines changed: 1 addition & 1 deletion
@@ -3778,7 +3778,7 @@ def _prefill(
             use_inputs_embeds = True
         if (cache := model_kwargs.get("past_key_values")) is not None:
             past_length = cache.get_seq_length()
-            # It will be sliced as input_embeds = inputs_embeds[:, -next_sequence_length:, :] in `prepare_inputs_for_generation`
+            # It will be sliced as inputs_embeds = inputs_embeds[:, -next_sequence_length:, :] in `prepare_inputs_for_generation`
             if use_inputs_embeds:
                 next_sequence_length = model_kwargs["inputs_embeds"].shape[1] - past_length
             else:

src/transformers/masking_utils.py

Lines changed: 0 additions & 8 deletions
@@ -20,7 +20,6 @@
 from .cache_utils import Cache
 from .configuration_utils import PreTrainedConfig
 from .utils import is_torch_xpu_available, logging
-from .utils.deprecation import deprecate_kwarg
 from .utils.generic import GeneralInterface, is_flash_attention_requested
 from .utils.import_utils import is_torch_flex_attn_available, is_torch_greater_or_equal, is_tracing

@@ -788,7 +787,6 @@ def find_packed_sequence_indices(position_ids: torch.Tensor) -> torch.Tensor | N
     return packed_sequence_mask


-@deprecate_kwarg("input_embeds", version="5.6.0", new_name="inputs_embeds")
 def _preprocess_mask_arguments(
     config: PreTrainedConfig,
     inputs_embeds: torch.Tensor,
@@ -893,7 +891,6 @@ def _preprocess_mask_arguments(
     return False, attention_mask, packed_sequence_mask, q_length, kv_length, q_offset, kv_offset


-@deprecate_kwarg("input_embeds", version="5.6.0", new_name="inputs_embeds")
 def create_causal_mask(
     config: PreTrainedConfig,
     inputs_embeds: torch.Tensor,
@@ -1019,7 +1016,6 @@ def create_causal_mask(
     return causal_mask


-@deprecate_kwarg("input_embeds", version="5.6.0", new_name="inputs_embeds")
 def create_bidirectional_mask(
     config: PreTrainedConfig,
     inputs_embeds: torch.Tensor,
@@ -1110,7 +1106,6 @@ def create_bidirectional_mask(
     return attention_mask


-@deprecate_kwarg("input_embeds", version="5.6.0", new_name="inputs_embeds")
 def create_sliding_window_causal_mask(
     config: PreTrainedConfig,
     inputs_embeds: torch.Tensor,
@@ -1237,7 +1232,6 @@ def create_sliding_window_causal_mask(
     return causal_mask


-@deprecate_kwarg("input_embeds", version="5.6.0", new_name="inputs_embeds")
 def create_bidirectional_sliding_window_mask(
     config: PreTrainedConfig,
     inputs_embeds: torch.Tensor,
@@ -1324,7 +1318,6 @@ def create_bidirectional_sliding_window_mask(
     return attention_mask


-@deprecate_kwarg("input_embeds", version="5.6.0", new_name="inputs_embeds")
 def create_chunked_causal_mask(
     config: PreTrainedConfig,
     inputs_embeds: torch.Tensor,
@@ -1453,7 +1446,6 @@
 }


-@deprecate_kwarg("input_embeds", version="5.6.0", new_name="inputs_embeds")
 def create_masks_for_generate(
     config: PreTrainedConfig,
     inputs_embeds: torch.Tensor,
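
For reference, the `deprecate_kwarg` decorator stripped throughout this file remapped the legacy `input_embeds` keyword to `inputs_embeds` and emitted a deprecation warning. Below is a simplified, illustrative sketch of that pattern, not the actual `transformers.utils.deprecation` implementation; with the decorator removed, callers must pass `inputs_embeds` directly, as the model updates below show.

```python
import functools
import warnings


def deprecate_kwarg(old_name, *, version, new_name):
    """Illustrative sketch: remap a deprecated keyword argument to its new name."""

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if old_name in kwargs:
                warnings.warn(
                    f"`{old_name}` is deprecated and will be removed in v{version}; "
                    f"use `{new_name}` instead.",
                    FutureWarning,
                )
                # Forward the value under the new name and drop the old one.
                kwargs.setdefault(new_name, kwargs.pop(old_name))
            return func(*args, **kwargs)

        return wrapper

    return decorator
```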

src/transformers/models/bark/modeling_bark.py

Lines changed: 0 additions & 3 deletions
@@ -39,7 +39,6 @@
     is_torch_accelerator_available,
     logging,
 )
-from ...utils.deprecation import deprecate_kwarg
 from ..auto import AutoModel
 from .configuration_bark import (
     BarkCoarseConfig,
@@ -392,7 +391,6 @@ def get_input_embeddings(self):
     def set_input_embeddings(self, new_embeddings):
         self.input_embeds_layer = new_embeddings

-    @deprecate_kwarg("input_embeds", version="5.6.0", new_name="inputs_embeds")
     @auto_docstring
     def forward(
         self,
@@ -990,7 +988,6 @@ def resize_token_embeddings(

         return model_embeds

-    @deprecate_kwarg("input_embeds", version="5.6.0", new_name="inputs_embeds")
     @auto_docstring
     def forward(
         self,

src/transformers/models/biogpt/modeling_biogpt.py

Lines changed: 1 addition & 1 deletion
@@ -375,7 +375,7 @@ def forward(

         causal_mask = create_causal_mask(
             config=self.config,
-            input_embeds=inputs_embeds,
+            inputs_embeds=inputs_embeds,
             attention_mask=attention_mask,
             past_key_values=self_attn_cache,
         )

src/transformers/models/biogpt/modular_biogpt.py

Lines changed: 1 addition & 1 deletion
@@ -207,7 +207,7 @@ def forward(

         causal_mask = create_causal_mask(
             config=self.config,
-            input_embeds=inputs_embeds,
+            inputs_embeds=inputs_embeds,
             attention_mask=attention_mask,
             past_key_values=self_attn_cache,
         )

src/transformers/models/blt/modeling_blt.py

Lines changed: 0 additions & 2 deletions
@@ -38,7 +38,6 @@
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.deprecation import deprecate_kwarg
 from ...utils.generic import maybe_autocast, merge_with_config_defaults
 from ...utils.output_capturing import OutputRecorder, capture_outputs
 from .configuration_blt import (
@@ -806,7 +805,6 @@ def __init__(self, config: BltGlobalTransformerConfig):

         self.post_init()

-    @deprecate_kwarg("input_embeds", version="5.6.0", new_name="inputs_embeds")
     def forward(
         self,
         inputs_embeds: torch.Tensor,

src/transformers/models/blt/modular_blt.py

Lines changed: 0 additions & 2 deletions
@@ -29,7 +29,6 @@
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
-from ...utils.deprecation import deprecate_kwarg
 from ...utils.generic import maybe_autocast, merge_with_config_defaults
 from ...utils.output_capturing import OutputRecorder, capture_outputs
 from ..cohere2.modeling_cohere2 import rotate_half  # noqa: F401
@@ -740,7 +739,6 @@ def __init__(self, config: BltGlobalTransformerConfig):

         self.post_init()

-    @deprecate_kwarg("input_embeds", version="5.6.0", new_name="inputs_embeds")
     def forward(
         self,
         inputs_embeds: torch.Tensor,

src/transformers/models/distilbert/modeling_distilbert.py

Lines changed: 0 additions & 2 deletions
@@ -48,7 +48,6 @@
     auto_docstring,
     logging,
 )
-from ...utils.deprecation import deprecate_kwarg
 from ...utils.generic import can_return_tuple, merge_with_config_defaults
 from ...utils.output_capturing import capture_outputs
 from .configuration_distilbert import DistilBertConfig
@@ -92,7 +91,6 @@ def __init__(self, config: PreTrainedConfig):
             "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
         )

-    @deprecate_kwarg("input_embeds", version="5.6.0", new_name="inputs_embeds")
     def forward(
         self,
         input_ids: torch.Tensor,

src/transformers/models/gemma3/modeling_gemma3.py

Lines changed: 0 additions & 2 deletions
@@ -43,7 +43,6 @@
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, torch_compilable_check
-from ...utils.deprecation import deprecate_kwarg
 from ...utils.generic import maybe_autocast, merge_with_config_defaults
 from ...utils.output_capturing import capture_outputs
 from ..auto import AutoModel
@@ -1052,7 +1051,6 @@ def prepare_inputs_for_generation(
         return model_inputs

     @staticmethod
-    @deprecate_kwarg("input_embeds", version="5.6.0", new_name="inputs_embeds")
     def create_masks_for_generate(
         config: PreTrainedConfig,
         inputs_embeds: torch.Tensor,
