make style

terarachang · terarachang · commit 95810d03071d · 2026-04-28T05:23:50.000Z
diff --git a/examples/cosmos/create_prompts_for_gr1_dataset.py b/examples/cosmos/create_prompts_for_gr1_dataset.py
@@ -18,6 +18,7 @@
 
 from tqdm import tqdm
 
+
 """example command
 python create_prompts_for_gr1_dataset.py --dataset_path datasets/benchmark_train/gr1
 """
diff --git a/examples/cosmos/eval_cosmos_predict25_lora.py b/examples/cosmos/eval_cosmos_predict25_lora.py
@@ -7,8 +7,8 @@
 
 import numpy as np
 import torch
-from tqdm import tqdm
 from torch.utils.data import DataLoader, Dataset
+from tqdm import tqdm
 
 from diffusers import Cosmos2_5_PredictBasePipeline
 from diffusers.utils import export_to_video, load_image
diff --git a/examples/cosmos/train_cosmos_predict25_lora.py b/examples/cosmos/train_cosmos_predict25_lora.py
@@ -4,8 +4,6 @@
 import math
 import os
 import random
-import shutil
-from contextlib import nullcontext
 from pathlib import Path
 from typing import Any, Optional
 
@@ -19,28 +17,22 @@
 from accelerate.logging import get_logger
 from accelerate.utils import ProjectConfiguration, set_seed
 from peft import LoraConfig
-from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict
+from peft.utils import get_peft_model_state_dict
 from torch.utils.data import DataLoader, Dataset
 from tqdm.auto import tqdm
 
 import diffusers
 from diffusers import Cosmos2_5_PredictBasePipeline
-from diffusers.optimization import get_linear_schedule_with_warmup, get_scheduler
+from diffusers.optimization import get_linear_schedule_with_warmup
 from diffusers.training_utils import cast_training_params
-from diffusers.utils.torch_utils import is_compiled_module
 from diffusers.utils import (
     convert_state_dict_to_diffusers,
-    is_wandb_available,
-    load_video,
     export_to_video,
+    load_video,
 )
 from diffusers.video_processor import VideoProcessor
 
 
-if is_wandb_available():
-    import wandb
-
-
 logger = get_logger(__name__, log_level="INFO")
 
 
@@ -287,7 +279,7 @@ def __init__(
         caption_format: str = "auto",  # "text", "json", or "auto"
         video_paths: Optional[list[str]] = None,
     ) -> None:
-        
+
         super().__init__()
         self.dataset_dir = dataset_dir
         self.num_frames = num_frames
@@ -307,7 +299,7 @@ def __init__(
         logger.info(f"{len(self.video_paths)} videos in total", main_process_only=True)
 
         self.video_size = video_size
-        self.video_processor = VideoProcessor(vae_scale_factor=8, resample='bilinear')
+        self.video_processor = VideoProcessor(vae_scale_factor=8, resample="bilinear")
         self.num_failed_loads = 0
 
     def __str__(self) -> str:
@@ -326,7 +318,7 @@ def _load_video(self, video_path: str) -> list:
 
         # randomly sample a consecutive window of frames
         max_start_idx = total_frames - self.num_frames
-        start_frame = np.random.randint(0, max_start_idx+1)
+        start_frame = np.random.randint(0, max_start_idx + 1)
         return frames[start_frame : start_frame + self.num_frames]
 
     def _setup_caption_format(self) -> None:
@@ -401,7 +393,7 @@ def _get_frames(self, video_path: str) -> torch.Tensor:
 
     def __getitem__(self, index: int) -> dict | Any:
         try:
-            data = dict()
+            data = {}
             video = self._get_frames(self.video_paths[index])  # [C, T, H, W]
 
             # Load caption based on format
@@ -463,7 +455,7 @@ def sample_train_sigma_t(batch_size, distribution, device, dtype=torch.float32,
         t = torch.sigmoid(torch.randn((batch_size,))).to(device=device, dtype=dtype)
     else:
         raise NotImplementedError(f"Time distribution {distribution} is not implemented.")
-    sigma_t = shift * t / (1 + (shift - 1) * t) # 0.0 <= sigma_t <= 1.0
+    sigma_t = shift * t / (1 + (shift - 1) * t)  # 0.0 <= sigma_t <= 1.0
     return sigma_t.view(batch_size, 1, 1, 1, 1)
 
 
@@ -516,9 +508,9 @@ def main():
         if args.output_dir is not None:
             os.makedirs(args.output_dir, exist_ok=True)
 
-        print('-'*100)
+        print("-" * 100)
         print(args)
-        print('-'*100)
+        print("-" * 100)
 
     # Initialize models
     pipe = Cosmos2_5_PredictBasePipeline.from_pretrained(
@@ -538,7 +530,7 @@ def main():
     vae.requires_grad_(False)
     text_encoder.requires_grad_(False)
 
-    target_modules_list = ['to_q', 'to_k', 'to_v', 'to_out.0', 'ff.net.0.proj', 'ff.net.2']
+    target_modules_list = ["to_q", "to_k", "to_v", "to_out.0", "ff.net.0.proj", "ff.net.2"]
     dit_lora_config = LoraConfig(
         r=args.lora_rank,
         lora_alpha=args.lora_alpha,
@@ -600,7 +592,7 @@ def save_model_hook(models, weights, output_dir):
                 transformer_lora_layers=dit_lora_state_dict,
                 safe_serialization=True,
             )
-            
+
     accelerator.register_save_state_pre_hook(save_model_hook)
 
     if accelerator.is_main_process:
@@ -634,7 +626,7 @@ def save_model_hook(models, weights, output_dir):
     padding_mask = torch.zeros(1, 1, args.height, args.width, dtype=dit_dtype, device=device)
     latent_shape = pipe.get_latent_shape_cthw(args.height, args.width, args.num_frames)
     latents_mean = pipe.latents_mean.float().to(device)
-    latents_std = pipe.latents_std.float().to(device) # 1/σ
+    latents_std = pipe.latents_std.float().to(device)  # 1/σ
     # Start training
     torch.set_grad_enabled(True)  # re-enable grad disabled by Cosmos2_5_PredictBasePipeline
     for epoch in range(first_epoch, args.num_train_epochs):
@@ -647,15 +639,15 @@ def save_model_hook(models, weights, output_dir):
                 raw_state = batch["video"].to(device=device, dtype=vae.dtype)
                 mu = vae.encode(raw_state).latent_dist.mean  # deterministic
                 clean_latent = ((mu - latents_mean) * latents_std).contiguous().float()
-                assert clean_latent.requires_grad == False
+                assert not clean_latent.requires_grad
                 torch.cuda.empty_cache()
 
                 # Encode text to text embeddings
                 prompt_embeds = pipe._get_prompt_embeds(
                     prompt=batch["caption"],
                     device=device,
                 )
-                assert prompt_embeds.requires_grad == False
+                assert not prompt_embeds.requires_grad
 
                 # CFG dropout: independently zero out text conditioning per sample
                 bsz = clean_latent.shape[0]
@@ -667,18 +659,21 @@ def save_model_hook(models, weights, output_dir):
                 weights = list(args.conditional_frames_probs.values())
                 num_conditional_frames = random.choices(frames_options, weights=weights, k=bsz)
                 cond_indicator, cond_mask = pipe.create_condition_mask(
-                    (bsz, *latent_shape), device=device, dtype=torch.float32, num_cond_latent_frames=num_conditional_frames
+                    (bsz, *latent_shape),
+                    device=device,
+                    dtype=torch.float32,
+                    num_cond_latent_frames=num_conditional_frames,
                 )
 
                 # Sample a random timestep
-                sigma_t = sample_train_sigma_t(bsz, distribution='logitnormal', device=device)
+                sigma_t = sample_train_sigma_t(bsz, distribution="logitnormal", device=device)
                 # 1. Sample noise 2. Get the target velocity 3. Get xt by interpolation between noise and clean
                 xt_B_C_T_H_W, target_velocity = get_flow_xt_and_target_v(clean_latent, sigma_t, cond_mask)
-                
+
                 # Denoise
                 if args.conditional_frame_timestep >= 0:
                     in_timestep = cond_indicator * args.conditional_frame_timestep + (1 - cond_indicator) * sigma_t
-                
+
                 pred_velocity = dit(
                     hidden_states=xt_B_C_T_H_W,
                     condition_mask=cond_mask,
@@ -717,7 +712,7 @@ def save_model_hook(models, weights, output_dir):
             if global_step >= max_train_steps:
                 break
 
-        if (epoch+1) % args.checkpointing_epochs == 0 and (epoch+1) < args.num_train_epochs:
+        if (epoch + 1) % args.checkpointing_epochs == 0 and (epoch + 1) < args.num_train_epochs:
             if accelerator.is_main_process:
                 save_path = os.path.join(args.output_dir, f"checkpoint-{epoch}")
                 accelerator.save_state(save_path)
@@ -738,7 +733,7 @@ def save_model_hook(models, weights, output_dir):
         if args.do_final_eval:
             noises = arch_invariant_rand((1, *latent_shape), dtype=torch.float32, device=device, seed=args.seed)
             inputs = train_dataloader.dataset[0]
-            
+
             pipe.transformer.eval()
             with torch.inference_mode():
                 frames = pipe(
@@ -747,14 +742,15 @@ def save_model_hook(models, weights, output_dir):
                     prompt=inputs["caption"],
                     num_frames=args.num_frames,
                     num_inference_steps=args.num_inference_steps,
-                    latents=noises, # ensure architecture invariant generation
+                    latents=noises,  # ensure architecture invariant generation
                     height=args.height,
                     width=args.width,
                 ).frames[0]
-            
+
             export_to_video(frames, os.path.join(args.output_dir, "eval_output.mp4"), fps=16)
 
     accelerator.end_training()
 
+
 if __name__ == "__main__":
     main()
diff --git a/src/diffusers/loaders/lora_pipeline.py b/src/diffusers/loaders/lora_pipeline.py
@@ -2263,16 +2263,14 @@ def _prepare_outputs(state_dict, metadata, alphas=None, return_alphas=False, ret
 
 class CosmosLoraLoaderMixin(FluxLoraLoaderMixin):
     r"""
-    Load LoRA layers into [`CosmosTransformer3DModel`],
-    Specific to [`Cosmos2_5_PredictBasePipeline`].
+    Load LoRA layers into [`CosmosTransformer3DModel`]. Specific to [`Cosmos2_5_PredictBasePipeline`].
     """
 
     _lora_loadable_modules = ["transformer"]
     transformer_name = TRANSFORMER_NAME
     text_encoder_name = TEXT_ENCODER_NAME
     _control_lora_supported_norm_keys = ["norm_q", "norm_k", "norm_added_q", "norm_added_k"]
 
-
     def load_lora_weights(
         self,
         pretrained_model_name_or_path_or_dict: str | dict[str, torch.Tensor],
@@ -2312,11 +2310,6 @@ def load_lora_weights(
         if not (has_lora_keys or has_norm_keys):
             raise ValueError("Invalid LoRA checkpoint. Make sure all LoRA param names contain `'lora'` substring.")
 
-        transformer_lora_state_dict = {
-            k: state_dict.get(k)
-            for k in list(state_dict.keys())
-            if k.startswith(f"{self.transformer_name}.") and "lora" in k
-        }
         transformer_norm_state_dict = {
             k: state_dict.pop(k)
             for k in list(state_dict.keys())
diff --git a/src/diffusers/models/transformers/transformer_cosmos.py b/src/diffusers/models/transformers/transformer_cosmos.py
@@ -194,7 +194,9 @@ def __call__(
             original_dtype = query.dtype
             with torch.amp.autocast("cuda", enabled=self.autocast_fp32, dtype=torch.float32):
                 target_dtype = torch.float32 if self.autocast_fp32 else original_dtype
-                query = apply_rotary_emb(query.to(target_dtype), image_rotary_emb, use_real=True, use_real_unbind_dim=-2)
+                query = apply_rotary_emb(
+                    query.to(target_dtype), image_rotary_emb, use_real=True, use_real_unbind_dim=-2
+                )
                 key = apply_rotary_emb(key.to(target_dtype), image_rotary_emb, use_real=True, use_real_unbind_dim=-2)
             query = query.to(original_dtype)
             key = key.to(original_dtype)
@@ -267,7 +269,9 @@ def __call__(
             original_dtype = query.dtype
             with torch.amp.autocast("cuda", enabled=self.autocast_fp32, dtype=torch.float32):
                 target_dtype = torch.float32 if self.autocast_fp32 else original_dtype
-                query = apply_rotary_emb(query.to(target_dtype), image_rotary_emb, use_real=True, use_real_unbind_dim=-2)
+                query = apply_rotary_emb(
+                    query.to(target_dtype), image_rotary_emb, use_real=True, use_real_unbind_dim=-2
+                )
                 key = apply_rotary_emb(key.to(target_dtype), image_rotary_emb, use_real=True, use_real_unbind_dim=-2)
             query = query.to(original_dtype)
             key = key.to(original_dtype)
diff --git a/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py
@@ -21,6 +21,7 @@
 
 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput
+from ...loaders import CosmosLoraLoaderMixin
 from ...models import AutoencoderKLWan, CosmosTransformer3DModel
 from ...schedulers import UniPCMultistepScheduler
 from ...utils import (
@@ -33,7 +34,6 @@
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
 from ..pipeline_utils import DiffusionPipeline
-from ...loaders import CosmosLoraLoaderMixin
 from .pipeline_output import CosmosPipelineOutput
 
 
@@ -239,11 +239,11 @@ def __init__(
 
         self.vae_scale_factor_temporal = 2 ** sum(self.vae.temperal_downsample) if getattr(self, "vae", None) else 4
         self.vae_scale_factor_spatial = 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8
-        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial, resample='bilinear')
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial, resample="bilinear")
 
         assert getattr(self.vae.config, "latents_mean", None), "VAE configuration must define `latents_mean`."
         assert getattr(self.vae.config, "latents_std", None), "VAE configuration must define `latents_std`."
-        
+
         latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, self.vae.config.z_dim, 1, 1, 1).float()
         latents_std = torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).float()
         self.latents_mean = latents_mean
@@ -259,7 +259,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
                 "torch_dtype": kwargs.get("torch_dtype", None),
                 "attn_implementation": text_encoder_attn_implementation,
             }
-            
+
             if os.path.isdir(pretrained_model_name_or_path):
                 text_encoder_path = os.path.join(pretrained_model_name_or_path, "text_encoder")
             else:
@@ -270,21 +270,21 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
             )
 
         return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
-    
+
     def get_latent_shape_cthw(self, height: int, width: int, num_frames: int):
         C = self.vae.config.z_dim
         T = (num_frames - 1) // self.vae_scale_factor_temporal + 1
         H = height // self.vae_scale_factor_spatial
         W = width // self.vae_scale_factor_spatial
         return (C, T, H, W)
-    
+
     def create_condition_mask(self, latent_shape, device, dtype, num_cond_latent_frames):
         bsz, C, T, H, W = latent_shape
         cond_indicator = torch.zeros(bsz, 1, T, 1, 1, dtype=dtype, device=device)
         if isinstance(num_cond_latent_frames, int):
             num_cond_latent_frames = [num_cond_latent_frames] * bsz
         for idx in range(bsz):
-            cond_indicator[idx, :, :num_cond_latent_frames[idx], :, :] = 1.0
+            cond_indicator[idx, :, : num_cond_latent_frames[idx], :, :] = 1.0
         cond_mask = cond_indicator.expand(-1, -1, -1, H, W)
         return cond_indicator, cond_mask
 
@@ -493,11 +493,16 @@ def prepare_latents(
 
             if isinstance(generator, list):
                 cond_latents = [
-                    retrieve_latents(self.vae.encode(video[i].unsqueeze(0)), generator=generator[i], sample_mode="argmax")
+                    retrieve_latents(
+                        self.vae.encode(video[i].unsqueeze(0)), generator=generator[i], sample_mode="argmax"
+                    )
                     for i in range(batch_size)
                 ]
             else:
-                cond_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator, sample_mode="argmax") for vid in video]
+                cond_latents = [
+                    retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator, sample_mode="argmax")
+                    for vid in video
+                ]
 
             cond_latents = torch.cat(cond_latents, dim=0).to(dtype)
 
@@ -760,8 +765,8 @@ def __call__(
                 raise ValueError(
                     f"Input video has only {total_input_frames} frames but Video2World requires at least "
                     f"{frames_to_extract} frames for conditioning."
-                ) 
-            
+                )
+
             video = video[:, :, -frames_to_extract:, :, :]
             if video.shape[2] < num_frames:
                 n_pad_frames = num_frames - video.shape[2]
@@ -807,7 +812,7 @@ def __call__(
                     continue
 
                 self._current_timestep = t.cpu().item()
-                
+
                 # NOTE: assumes sigma(t) \in [0, 1]
                 sigma_t = self.scheduler.sigmas[i].expand(batch_size).to(device=device, dtype=torch.float32)
                 if conditional_frame_timestep >= 0: