|
| 1 | +#!/usr/bin/env python3 |
| 2 | +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
| 3 | +# SPDX-License-Identifier: Apache-2.0 |
| 4 | + |
| 5 | +import argparse |
| 6 | +import os |
| 7 | + |
| 8 | +import torch |
| 9 | +from torch.utils.data import DataLoader, Dataset |
| 10 | +from tqdm import tqdm |
| 11 | + |
| 12 | +from diffusers import Cosmos2_5_PredictBasePipeline |
| 13 | +from diffusers.utils import export_to_video, load_image |
| 14 | + |
| 15 | + |
| 16 | +IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png"} |
| 17 | + |
| 18 | + |
class ImageDataset(Dataset):
    """Pairs each conditioning image in a directory with its text prompt.

    Expects a directory with:
        <filename>.jpg / .jpeg / .png — the conditioning image
        <filename>.txt — the prompt text

    Images lacking a matching ``.txt`` prompt file are skipped with a warning.
    Raises ``ValueError`` if the directory yields no usable pairs at all.
    """

    def __init__(self, data_dir: str):
        self.data_dir = data_dir
        self.samples = []

        for entry in sorted(os.listdir(data_dir)):
            stem, ext = os.path.splitext(entry)
            if ext.lower() not in IMAGE_EXTENSIONS:
                continue
            image_path = os.path.join(data_dir, entry)
            prompt_path = os.path.join(data_dir, f"{stem}.txt")
            if os.path.exists(prompt_path):
                self.samples.append((image_path, prompt_path, stem))
            else:
                print(f"WARNING: no prompt file found for {image_path}, skipping.")

        if not self.samples:
            raise ValueError(f"No valid image/prompt pairs found in {data_dir}")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        image_path, prompt_path, stem = self.samples[idx]
        with open(prompt_path) as handle:
            prompt = handle.read().strip()
        return {
            "image": load_image(image_path),
            "prompt": prompt,
            "stem": stem,
        }
| 58 | + |
| 59 | + |
def collate_fn(batch):
    """Collate a batch by transposing it into parallel lists.

    PIL images cannot be stacked into a tensor, so every field stays a
    plain Python list.
    """
    collated = {"images": [], "prompts": [], "stems": []}
    for item in batch:
        collated["images"].append(item["image"])
        collated["prompts"].append(item["prompt"])
        collated["stems"].append(item["stem"])
    return collated
| 67 | + |
| 68 | + |
def parse_args(argv=None):
    """Parse command-line options for the evaluation run.

    Args:
        argv: Optional list of argument strings. Defaults to ``None``, in
            which case argparse reads ``sys.argv[1:]`` as before — the new
            parameter is backward-compatible and makes the function testable
            without patching ``sys.argv``.

    Returns:
        argparse.Namespace with all evaluation settings.
    """
    parser = argparse.ArgumentParser(description="Eval Cosmos Predict 2.5 with optional LoRA weights.")

    parser.add_argument("--data_dir", type=str, required=True, help="Directory with image/prompt pairs.")
    parser.add_argument("--output_dir", type=str, required=True, help="Directory to save generated outputs.")
    parser.add_argument(
        "--model_id", type=str, default="nvidia/Cosmos-Predict2.5-2B", help="HuggingFace model repository."
    )
    parser.add_argument(
        "--revision",
        type=str,
        default="diffusers/base/post-trained",
        choices=["diffusers/base/post-trained", "diffusers/base/pre-trained"],
    )
    parser.add_argument("--lora_dir", type=str, default=None, help="Path to LoRA weights directory.")
    parser.add_argument("--num_output_frames", type=int, default=93, help="1 for image output, 93 for video output.")
    parser.add_argument("--num_steps", type=int, default=36, help="Number of inference steps.")
    parser.add_argument("--height", type=int, default=704, help="Output height in pixels (must be divisible by 16).")
    parser.add_argument("--width", type=int, default=1280, help="Output width in pixels (must be divisible by 16).")
    parser.add_argument("--seed", type=int, default=0, help="Random seed.")
    parser.add_argument("--device", type=str, default="cuda", help="Device to use.")
    parser.add_argument("--batch_size", type=int, default=1, help="Number of samples per batch.")
    parser.add_argument("--num_workers", type=int, default=4, help="DataLoader worker processes.")
    parser.add_argument(
        "--negative_prompt",
        type=str,
        default=None,
        help="Negative prompt. Defaults to the pipeline's built-in negative prompt.",
    )
    return parser.parse_args(argv)
| 99 | + |
| 100 | + |
def main():
    """Run Cosmos Predict 2.5 inference over every image/prompt pair in a directory.

    Loads the pipeline (optionally with fused LoRA weights), iterates the
    dataset, and writes one ``<stem>.mp4`` per input image to the output dir.
    """
    args = parse_args()
    os.makedirs(args.output_dir, exist_ok=True)

    dataset = ImageDataset(args.data_dir)
    dataloader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
        collate_fn=collate_fn,
    )

    print(f"Found {len(dataset)} examples.")

    class MockSafetyChecker:
        """No-op stand-in for the pipeline's safety checker (eval-only)."""

        def to(self, *args, **kwargs):
            return self

        def check_text_safety(self, *args, **kwargs):
            return True

        def check_video_safety(self, video):
            return video

    pipe = Cosmos2_5_PredictBasePipeline.from_pretrained(
        args.model_id,
        revision=args.revision,
        device_map=args.device,
        torch_dtype=torch.bfloat16,
        safety_checker=MockSafetyChecker(),
    )

    if args.lora_dir is not None:
        pipe.load_lora_weights(args.lora_dir)
        pipe.fuse_lora(lora_scale=1.0)
        print(f"Loaded LoRA weights from {args.lora_dir}")

    # FIX: --seed was parsed but never used, so runs were not reproducible.
    # Seed a generator once and pass it to every pipeline call (standard
    # diffusers `generator=` kwarg — confirm the Cosmos pipeline forwards it).
    generator = torch.Generator(device=args.device).manual_seed(args.seed)

    progress = tqdm(total=len(dataset), desc="Generating")
    try:
        for batch in dataloader:
            images = batch["images"]
            prompts = batch["prompts"]
            stems = batch["stems"]

            # The pipeline is invoked per-sample even when the DataLoader
            # batch is larger, hence frames[0] below.
            for image, prompt, stem in zip(images, prompts, stems):
                frames = pipe(
                    image=image,
                    prompt=prompt,
                    negative_prompt=args.negative_prompt,
                    num_frames=args.num_output_frames,
                    num_inference_steps=args.num_steps,
                    height=args.height,
                    width=args.width,
                    generator=generator,
                ).frames[0]  # NOTE: batch_size == 1

                out_path = os.path.join(args.output_dir, f"{stem}.mp4")
                export_to_video(frames, out_path, fps=16)

                tqdm.write(f" Saved to: {out_path}")
                progress.update(1)
    finally:
        # FIX: the progress bar was never closed; close it even on error so
        # the terminal state is restored.
        progress.close()
| 161 | + |
| 162 | + |
# Script entry point: run inference only when executed directly, not on import.
if __name__ == "__main__":
    main()
0 commit comments