Skip to content

Commit 0cc6351

Browse files
Commit 0cc6351 ("remove flash-attn2") was committed with 1 parent, 80aa547.

5 files changed

Lines changed: 3 additions & 47 deletions

File tree

examples/cosmos/README.md

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,6 @@ cd examples/cosmos
1414
pip install -r requirements.txt
1515
```
1616

17-
> [!NOTE]
18-
> `flash-attn` is required for the default `flash_attention_2` text encoder attention implementation and must be installed separately after PyTorch:
19-
> ```bash
20-
> pip install flash-attn --no-build-isolation
21-
> ```
22-
> If your hardware does not support it, pass `--text_encoder_attn_implementation sdpa` to the training and eval scripts instead.
23-
2417
## Data preparation
2518

2619
The training script expects a dataset directory with the following layout:

examples/cosmos/eval_cosmos_predict25_lora.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -102,14 +102,6 @@ def parse_args():
102102
default=None,
103103
help="Negative prompt. Defaults to the pipeline's built-in negative prompt.",
104104
)
105-
parser.add_argument(
106-
"--text_encoder_attn_implementation",
107-
type=str,
108-
default="flash_attention_2",
109-
choices=["eager", "sdpa", "flash_attention_2"],
110-
help="The attention implementation to use for the text encoder (Qwen2.5 VL).",
111-
)
112-
113105
return parser.parse_args()
114106

115107

@@ -144,7 +136,6 @@ def check_video_safety(self, video):
144136
device_map=args.device,
145137
torch_dtype=torch.bfloat16,
146138
safety_checker=MockSafetyChecker(),
147-
text_encoder_attn_implementation=args.text_encoder_attn_implementation,
148139
)
149140

150141
if args.lora_dir is not None:

examples/cosmos/requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
--extra-index-url https://download.pytorch.org/whl/cu130
2+
torch
3+
torchvision
14
accelerate>=0.31.0
25
huggingface_hub
36
imageio

examples/cosmos/train_cosmos_predict25_lora.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -74,13 +74,6 @@ def parse_args():
7474
default=None,
7575
help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
7676
)
77-
parser.add_argument(
78-
"--text_encoder_attn_implementation",
79-
type=str,
80-
default="flash_attention_2",
81-
choices=["eager", "sdpa", "flash_attention_2"],
82-
help="The attention implementation to use for the text encoder (Qwen2.5 VL).",
83-
)
8477
parser.add_argument(
8578
"--train_data_dir",
8679
type=str,
@@ -516,7 +509,6 @@ def main():
516509
args.pretrained_model_name_or_path,
517510
revision=args.revision,
518511
torch_dtype=torch.bfloat16,
519-
text_encoder_attn_implementation=args.text_encoder_attn_implementation,
520512
safety_checker=MockSafetyChecker(),
521513
)
522514

src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
import os
1615
from typing import Callable
1716

1817
import numpy as np
@@ -245,28 +244,6 @@ def __init__(
245244
self.latents_mean = latents_mean
246245
self.latents_std = 1.0 / latents_std
247246

248-
@classmethod
249-
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
250-
text_encoder_attn_implementation = kwargs.pop("text_encoder_attn_implementation", "flash_attention_2")
251-
if "text_encoder" not in kwargs:
252-
load_kwargs = {
253-
"revision": kwargs.get("revision", None),
254-
"device_map": kwargs.get("device_map", None),
255-
"torch_dtype": kwargs.get("torch_dtype", None),
256-
"attn_implementation": text_encoder_attn_implementation,
257-
}
258-
259-
if os.path.isdir(pretrained_model_name_or_path):
260-
text_encoder_path = os.path.join(pretrained_model_name_or_path, "text_encoder")
261-
else:
262-
text_encoder_path = pretrained_model_name_or_path
263-
load_kwargs["subfolder"] = "text_encoder"
264-
kwargs["text_encoder"] = Qwen2_5_VLForConditionalGeneration.from_pretrained(
265-
text_encoder_path, **load_kwargs
266-
)
267-
268-
return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
269-
270247
def get_latent_shape_cthw(self, height: int, width: int, num_frames: int):
271248
C = self.vae.config.z_dim
272249
T = (num_frames - 1) // self.vae_scale_factor_temporal + 1

0 commit comments

Comments (0)