Default autocast_fp32 to False and enable it only during training

root · terarachang · commit 60fa32625a12 · 2026-04-28T05:23:50.000Z
diff --git a/examples/cosmos/train_cosmos_predict25_lora.py b/examples/cosmos/train_cosmos_predict25_lora.py
@@ -423,7 +423,7 @@ def __getitem__(self, index: int) -> dict | Any:
 
             # Load caption based on format
             video_path = self.video_paths[index]
-            video_basename = os.path.basename(video_path).replace(".mp4", "")
+            video_basename = os.path.splitext(os.path.basename(video_path))[0]
 
             if self.caption_format == "json":
                 caption_path = os.path.join(self.caption_dir, f"{video_basename}.json")
@@ -550,6 +550,7 @@ def main():
     vae = pipe.vae
     text_encoder = pipe.text_encoder
 
+    dit.set_autocast_fp32(True)
     dit.requires_grad_(False)
     vae.requires_grad_(False)
     text_encoder.requires_grad_(False)
diff --git a/src/diffusers/models/transformers/transformer_cosmos.py b/src/diffusers/models/transformers/transformer_cosmos.py
@@ -67,7 +67,7 @@ def forward(self, timesteps: torch.Tensor) -> torch.Tensor:
 
 
 class CosmosEmbedding(nn.Module):
-    def __init__(self, embedding_dim: int, condition_dim: int, autocast_fp32: bool = True) -> None:
+    def __init__(self, embedding_dim: int, condition_dim: int, autocast_fp32: bool = False) -> None:
         super().__init__()
 
         self.autocast_fp32 = autocast_fp32
@@ -116,7 +116,7 @@ def forward(
 
 
 class CosmosAdaLayerNormZero(nn.Module):
-    def __init__(self, in_features: int, hidden_features: int | None = None, autocast_fp32: bool = True) -> None:
+    def __init__(self, in_features: int, hidden_features: int | None = None, autocast_fp32: bool = False) -> None:
         super().__init__()
 
         self.autocast_fp32 = autocast_fp32
@@ -158,7 +158,7 @@ def forward(
 
 
 class CosmosAttnProcessor2_0:
-    def __init__(self, autocast_fp32: bool = True):
+    def __init__(self, autocast_fp32: bool = False):
         if not hasattr(torch.nn.functional, "scaled_dot_product_attention"):
             raise ImportError("CosmosAttnProcessor2_0 requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0.")
         self.autocast_fp32 = autocast_fp32
@@ -228,7 +228,7 @@ def __call__(
 
 
 class CosmosAttnProcessor2_5:
-    def __init__(self, autocast_fp32: bool = True):
+    def __init__(self, autocast_fp32: bool = False):
         if not hasattr(torch.nn.functional, "scaled_dot_product_attention"):
             raise ImportError("CosmosAttnProcessor2_5 requires PyTorch 2.0. Please upgrade PyTorch to 2.0 or newer.")
         self.autocast_fp32 = autocast_fp32
@@ -373,7 +373,7 @@ def __init__(
         img_context: bool = False,
         before_proj: bool = False,
         after_proj: bool = False,
-        autocast_fp32: bool = True,
+        autocast_fp32: bool = False,
     ) -> None:
         super().__init__()
 
@@ -622,7 +622,7 @@ class CosmosTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin,
         img_context_dim_out (`int`):
             The output dimension of the image context projection layer. If `img_context_dim_in` is not provided, then
             this parameter is ignored.
-        autocast_fp32 (`bool`, defaults to `True`):
+        autocast_fp32 (`bool`, defaults to `False`):
             Whether to cast certain computations (AdaLN, timestep embedding, RoPE, final norm and projection) to
             float32 for numerical stability. Set to `False` to disable autocasting (e.g., when the model is already
             running in float32 or when autocasting is handled externally).
@@ -656,7 +656,7 @@ def __init__(
         img_context_dim_in: int | None = None,
         img_context_num_tokens: int = 256,
         img_context_dim_out: int = 2048,
-        autocast_fp32: bool = True,
+        autocast_fp32: bool = False,
     ) -> None:
         super().__init__()
         hidden_size = num_attention_heads * attention_head_dim