Remove the get_latent_shape_cthw method (inlining its latent-shape computation at call sites) and fix style

Ting-Yun Chang · Ting-Yun Chang · commit 6acdc1c5cfba · 2026-05-05T20:00:03.000Z
diff --git a/examples/cosmos/eval_cosmos_predict25_lora.py b/examples/cosmos/eval_cosmos_predict25_lora.py
@@ -143,7 +143,12 @@ def check_video_safety(self, video):
         pipe.fuse_lora(lora_scale=1.0)
         print(f"Loaded LoRA weights from {args.lora_dir}")
 
-    latent_shape = pipe.get_latent_shape_cthw(args.height, args.width, args.num_output_frames)
+    latent_shape = (
+        pipe.vae.config.z_dim,
+        (args.num_output_frames - 1) // pipe.vae_scale_factor_temporal + 1,
+        args.height // pipe.vae_scale_factor_spatial,
+        args.width // pipe.vae_scale_factor_spatial,
+    )
     noises = arch_invariant_rand(
         (args.batch_size, *latent_shape), dtype=torch.float32, device=args.device, seed=args.seed
     )
diff --git a/examples/cosmos/train_cosmos_predict25_lora.py b/examples/cosmos/train_cosmos_predict25_lora.py
@@ -614,7 +614,12 @@ def save_model_hook(models, weights, output_dir):
     )
 
     padding_mask = torch.zeros(1, 1, args.height, args.width, dtype=dit_dtype, device=device)
-    latent_shape = pipe.get_latent_shape_cthw(args.height, args.width, args.num_frames)
+    latent_shape = (
+        pipe.vae.config.z_dim,
+        (args.num_frames - 1) // pipe.vae_scale_factor_temporal + 1,
+        args.height // pipe.vae_scale_factor_spatial,
+        args.width // pipe.vae_scale_factor_spatial,
+    )
     latents_mean = pipe.latents_mean.float().to(device)
     latents_std = pipe.latents_std.float().to(device)  # 1/σ
     # Start training
diff --git a/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py
@@ -236,21 +236,11 @@ def __init__(
         self.vae_scale_factor_spatial = 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8
         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial, resample="bilinear")
 
-        assert getattr(self.vae.config, "latents_mean", None), "VAE configuration must define `latents_mean`."
-        assert getattr(self.vae.config, "latents_std", None), "VAE configuration must define `latents_std`."
-
         latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, self.vae.config.z_dim, 1, 1, 1).float()
         latents_std = torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).float()
         self.latents_mean = latents_mean
         self.latents_std = 1.0 / latents_std
 
-    def get_latent_shape_cthw(self, height: int, width: int, num_frames: int):
-        C = self.vae.config.z_dim
-        T = (num_frames - 1) // self.vae_scale_factor_temporal + 1
-        H = height // self.vae_scale_factor_spatial
-        W = width // self.vae_scale_factor_spatial
-        return (C, T, H, W)
-
     def create_condition_mask(self, latent_shape, device, dtype, num_cond_latent_frames):
         bsz, C, T, H, W = latent_shape
         cond_indicator = torch.zeros(bsz, 1, T, 1, 1, dtype=dtype, device=device)
@@ -438,9 +428,11 @@ def prepare_latents(
             )
 
         B = batch_size
-        C, T, H, W = self.get_latent_shape_cthw(height, width, num_frames_out)
+        C = num_channels_latents
+        T = (num_frames_out - 1) // self.vae_scale_factor_temporal + 1
+        H = height // self.vae_scale_factor_spatial
+        W = width // self.vae_scale_factor_spatial
         shape = (B, C, T, H, W)
-        assert C == num_channels_latents, f"Expected number of channels to be {num_channels_latents}, but got {C}."
 
         if num_frames_in == 0:
             if latents is None: