Commit e17b1b8

[Fix] Fix FA2 kernels ut (huggingface#42803)
* Fixed FA2 kernels UT
* Update
* Refactor FA2 kernel map
* Update
* Update
1 parent 40dc11c commit e17b1b8

4 files changed: 42 additions & 17 deletions


src/transformers/modeling_utils.py

Lines changed: 15 additions & 9 deletions
@@ -155,6 +155,12 @@
 _is_quantized = False
 _is_ds_init_called = False
 
+# Mapping from flash attention implementations to their kernel fallback repositories
+FLASH_ATTN_KERNEL_FALLBACK = {
+    "flash_attention_2": "kernels-community/flash-attn2",
+    "flash_attention_3": "kernels-community/vllm-flash-attn3",
+}
+
 
 def is_local_dist_rank_0():
     return (
@@ -1592,7 +1598,9 @@ def _flash_attn_2_can_dispatch(self, is_init_check: bool = False) -> bool:
             return True
 
         if is_torch_xpu_available():
-            logger.info("Detect using FlashAttention2 (via kernel `kernels-community/flash-attn2`) on XPU.")
+            logger.info(
+                f"Detect using FlashAttention2 (via kernel `{FLASH_ATTN_KERNEL_FALLBACK['flash_attention_2']}`) on XPU."
+            )
             return True
 
         if importlib.util.find_spec("flash_attn") is None:
@@ -1824,14 +1832,12 @@ def _check_and_adjust_attn_implementation(
             and is_kernels_available()
             and not is_torch_npu_available()
         ):
-            if attn_implementation.endswith("2"):
-                applicable_attn_implementation = "kernels-community/flash-attn2"
-                if is_torch_xpu_available():
-                    # On XPU, kernels library is the native implementation
-                    # Disabling this flag to avoid giving wrong fallbacks on errors and warnings
-                    requested_original_flash_attn = False
-            else:
-                applicable_attn_implementation = "kernels-community/vllm-flash-attn3"
+            applicable_attn_implementation = FLASH_ATTN_KERNEL_FALLBACK[attn_implementation.removeprefix("paged|")]
+
+            if is_torch_xpu_available() and attn_implementation.removeprefix("paged|") == "flash_attention_2":
+                # On XPU, kernels library is the native implementation
+                # Disabling this flag to avoid giving wrong fallbacks on errors and warnings
+                requested_original_flash_attn = False
 
         if is_paged:
             applicable_attn_implementation = f"paged|{applicable_attn_implementation}"
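
The refactor above replaces the hard-coded if/else on the implementation name with a single lookup into FLASH_ATTN_KERNEL_FALLBACK. A minimal, standalone sketch of the resulting resolution logic follows; the helper name `resolve_kernel_fallback` is illustrative only and not part of the diff.

# Minimal sketch, assuming only the mapping introduced in this commit.
FLASH_ATTN_KERNEL_FALLBACK = {
    "flash_attention_2": "kernels-community/flash-attn2",
    "flash_attention_3": "kernels-community/vllm-flash-attn3",
}


def resolve_kernel_fallback(attn_implementation: str) -> str:
    # Strip the optional "paged|" prefix before the lookup, then restore it,
    # mirroring how _check_and_adjust_attn_implementation re-adds the prefix.
    base = attn_implementation.removeprefix("paged|")
    resolved = FLASH_ATTN_KERNEL_FALLBACK[base]
    return f"paged|{resolved}" if attn_implementation.startswith("paged|") else resolved


assert resolve_kernel_fallback("flash_attention_2") == "kernels-community/flash-attn2"
assert resolve_kernel_fallback("paged|flash_attention_3") == "paged|kernels-community/vllm-flash-attn3"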

src/transformers/testing_utils.py

Lines changed: 2 additions & 2 deletions
@@ -221,7 +221,7 @@
     import torch
     from safetensors.torch import load_file
 
-    from .modeling_utils import PreTrainedModel
+    from .modeling_utils import FLASH_ATTN_KERNEL_FALLBACK, PreTrainedModel
 
     IS_ROCM_SYSTEM = torch.version.hip is not None
     IS_CUDA_SYSTEM = torch.version.cuda is not None
@@ -620,7 +620,7 @@ def require_flash_attn(test_case):
     try:
         from kernels import get_kernel
 
-        get_kernel("kernels-community/flash-attn2")
+        get_kernel(FLASH_ATTN_KERNEL_FALLBACK["flash_attention_2"])
     except Exception as _:
         kernels_available = False
 
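For context, a hedged sketch of the guard pattern `require_flash_attn` relies on after this change: probe for the native `flash_attn` package first, then try to fetch the fallback kernel through the `kernels` library. The helper names below are illustrative, not the actual decorator from testing_utils.

# Illustrative sketch only; assumes the `kernels` package exposes get_kernel(repo_id).
import importlib.util
import unittest

FLASH_ATTN_KERNEL_FALLBACK = {"flash_attention_2": "kernels-community/flash-attn2"}


def _flash_attn_usable() -> bool:
    if importlib.util.find_spec("flash_attn") is not None:
        return True
    try:
        from kernels import get_kernel

        # Fetch the fallback kernel; any failure means flash attention cannot be used.
        get_kernel(FLASH_ATTN_KERNEL_FALLBACK["flash_attention_2"])
        return True
    except Exception:
        return False


def require_flash_attn_sketch(test_case):
    # Skip tests when neither flash_attn nor its kernels fallback is available.
    return unittest.skipUnless(_flash_attn_usable(), "test requires flash attention")(test_case)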

tests/test_modeling_common.py

Lines changed: 17 additions & 3 deletions
@@ -52,7 +52,7 @@
     unset_hf_deepspeed_config,
 )
 from transformers.modeling_layers import GradientCheckpointingLayer
-from transformers.modeling_utils import _get_tied_weight_keys
+from transformers.modeling_utils import FLASH_ATTN_KERNEL_FALLBACK, _get_tied_weight_keys
 from transformers.models.auto import get_values
 from transformers.models.auto.modeling_auto import (
     MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES,
@@ -3243,6 +3243,20 @@ def flash_attn_can_dispatch_composite_models(self, attn_implementation: str):
             self.skipTest(f"bfloat16 not supported on {torch_device} (on the specific device currently used)")
 
         dtype = torch.bfloat16
+
+        def _expected_attn_implementations(attention_implementation: str) -> set[str]:
+            # Allow kernels fallbacks for flash attention tests.
+            requested = attention_implementation
+            base = requested.removeprefix("paged|")
+            prefix = "paged|" if requested.startswith("paged|") else ""
+
+            expected = {requested}
+            if base in FLASH_ATTN_KERNEL_FALLBACK:
+                expected.add(f"{prefix}{FLASH_ATTN_KERNEL_FALLBACK[base]}")
+            return expected
+
+        expected_attn_implementations = _expected_attn_implementations(attn_implementation)
+
         for model_class in self.all_model_classes:
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
             model = model_class(config)
@@ -3275,15 +3289,15 @@ def flash_attn_can_dispatch_composite_models(self, attn_implementation: str):
             for key in model_fa.config:
                 if isinstance(getattr(model_fa.config, key), PreTrainedConfig):
                     sub_config = getattr(model_fa.config, key)
-                    self.assertTrue(sub_config._attn_implementation == attn_implementation)
+                    self.assertIn(sub_config._attn_implementation, expected_attn_implementations)
 
             has_fa = False
             for name, submodule in model_fa.named_modules():
                 class_name = submodule.__class__.__name__
                 if (
                     "Attention" in class_name
                     and getattr(submodule, "config", None)
-                    and submodule.config._attn_implementation == attn_implementation
+                    and submodule.config._attn_implementation in expected_attn_implementations
                 ):
                     has_fa = True
                     break
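
To make the intent of `_expected_attn_implementations` concrete, here is a standalone version with the sets it produces; the assertions assume the mapping added by this commit.

# Standalone sketch of the helper added in the hunk above.
FLASH_ATTN_KERNEL_FALLBACK = {
    "flash_attention_2": "kernels-community/flash-attn2",
    "flash_attention_3": "kernels-community/vllm-flash-attn3",
}


def expected_attn_implementations(requested: str) -> set[str]:
    base = requested.removeprefix("paged|")
    prefix = "paged|" if requested.startswith("paged|") else ""
    expected = {requested}
    if base in FLASH_ATTN_KERNEL_FALLBACK:
        expected.add(f"{prefix}{FLASH_ATTN_KERNEL_FALLBACK[base]}")
    return expected


assert expected_attn_implementations("flash_attention_2") == {"flash_attention_2", "kernels-community/flash-attn2"}
assert expected_attn_implementations("paged|flash_attention_3") == {
    "paged|flash_attention_3",
    "paged|kernels-community/vllm-flash-attn3",
}
assert expected_attn_implementations("sdpa") == {"sdpa"}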

tests/utils/test_modeling_utils.py

Lines changed: 8 additions & 3 deletions
@@ -129,7 +129,11 @@
     _prepare_4d_attention_mask,
     _prepare_4d_causal_attention_mask,
 )
-from transformers.modeling_utils import _find_disjoint, _find_identical
+from transformers.modeling_utils import (
+    FLASH_ATTN_KERNEL_FALLBACK,
+    _find_disjoint,
+    _find_identical,
+)
 from transformers.pytorch_utils import isin_mps_friendly
 
 # Fake pretrained models for tests
@@ -3028,7 +3032,7 @@ def test_kernels_fallback(self):
         )
 
         self.assertTrue(
-            "You do not have `flash_attn` installed, using `kernels-community/flash-attn2` from the `kernels` library instead!"
+            f"You do not have `flash_attn` installed, using `{FLASH_ATTN_KERNEL_FALLBACK['flash_attention_2']}` from the `kernels` library instead!"
            in cl.out
        )
 
@@ -3040,7 +3044,8 @@ def test_not_available_kernels(self):
 
         with self.assertRaises(ImportError) as cm:
             _ = AutoModel.from_pretrained(
-                "hf-tiny-model-private/tiny-random-MCTCTModel", attn_implementation="kernels-community/flash-attn2"
+                "hf-tiny-model-private/tiny-random-MCTCTModel",
+                attn_implementation=FLASH_ATTN_KERNEL_FALLBACK["flash_attention_2"],
             )
 
         self.assertTrue("`kernels` is either not installed or uses an incompatible version." in str(cm.exception))
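
A short sketch of the test pattern these last two hunks switch to: the expected strings are derived from the shared mapping rather than hard-coded, so the assertions track whatever repository the fallback points at. `expected_fallback_message` is an illustrative name, not something added by the diff.

# Illustrative only; assumes the mapping introduced in this commit.
FLASH_ATTN_KERNEL_FALLBACK = {"flash_attention_2": "kernels-community/flash-attn2"}


def expected_fallback_message(implementation: str = "flash_attention_2") -> str:
    # Rebuild the warning text from the shared constant instead of hard-coding the repo id.
    repo = FLASH_ATTN_KERNEL_FALLBACK[implementation]
    return f"You do not have `flash_attn` installed, using `{repo}` from the `kernels` library instead!"


assert "kernels-community/flash-attn2" in expected_fallback_message()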
