add readme, requirements, and make flash attn optional

terarachang · terarachang · commit 431aecdfeb93 · 2026-04-28T05:23:50.000Z
diff --git a/examples/cosmos/README.md b/examples/cosmos/README.md
@@ -0,0 +1,104 @@
+# LoRA fine-tuning for Cosmos Predict 2.5
+
+This example shows how to fine-tune [Cosmos Predict 2.5](https://huggingface.co/nvidia/Cosmos-Predict2.5-2B) using LoRA on a custom video dataset.
+
+## Requirements
+
+Install the library from source and the example-specific dependencies:
+
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install -e ".[dev]"
+cd examples/cosmos
+pip install -r requirements.txt
+```
+
+> [!NOTE]
+> `flash-attn` is required for the default `flash_attention_2` text encoder attention implementation and must be installed separately after PyTorch:
+> ```bash
+> pip install flash-attn --no-build-isolation
+> ```
+> If `flash-attn` is not installed or your hardware does not support it, pass `--text_encoder_attn_implementation sdpa` to the training and eval scripts instead.
+
+## Data preparation
+
+The training script expects a dataset directory with the following layout:
+
+```
+<dataset_dir>/
+├── videos/          # .mp4 files
+└── metas/           # one .txt prompt file per video, sharing the video's filename stem (e.g. 0.mp4 -> 0.txt)
+    ├── 0.txt
+    ├── 1.txt
+    └── ...
+```
+
+### GR1 dataset (quick start)
+
+The `download_and_preprocess_datasets.sh` script downloads the GR1-100 training set and the EVAL-175 test set, then runs the preprocessing script to create the per-video prompt files.
+
+```bash
+bash download_and_preprocess_datasets.sh
+```
+
+This produces:
+- `gr1_dataset/train/` — training videos + prompts
+- `gr1_dataset/test/`  — evaluation images + prompts
+
+## Training
+
+Launch LoRA training with `accelerate`:
+
+```bash
+export MODEL_NAME="nvidia/Cosmos-Predict2.5-2B"
+export DATA_DIR="gr1_dataset/train"
+export OUT_DIR="lora-output"
+
+accelerate launch --mixed_precision="bf16" train_cosmos_predict25_lora.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --revision diffusers/base/post-trained \
+  --train_data_dir=$DATA_DIR \
+  --output_dir=$OUT_DIR \
+  --train_batch_size=1 \
+  --num_train_epochs=500 \
+  --checkpointing_epochs=100 \
+  --seed=0 \
+  --height 432 --width 768 \
+  --allow_tf32 \
+  --gradient_checkpointing \
+  --lora_rank 32 --lora_alpha 32 \
+  --report_to=wandb
+```
+
+Or use the provided shell script:
+
+```bash
+bash train_lora.sh
+```
+
+## Evaluation
+
+Run inference with the trained LoRA adapter:
+
+```bash
+export DATA_DIR="gr1_dataset/test"
+export LORA_DIR="lora-output"
+export OUT_DIR="eval-output"
+
+python eval_cosmos_predict25_lora.py \
+  --data_dir $DATA_DIR \
+  --output_dir $OUT_DIR \
+  --lora_dir $LORA_DIR \
+  --revision diffusers/base/post-trained \
+  --height 432 --width 768 \
+  --num_output_frames 93 \
+  --num_steps 36 \
+  --seed 0
+```
+
+Or use the provided shell script:
+
+```bash
+bash eval_lora.sh
+```
diff --git a/examples/cosmos/create_prompts_for_gr1_dataset.py b/examples/cosmos/create_prompts_for_gr1_dataset.py
@@ -19,7 +19,7 @@
 from tqdm import tqdm
 
 """example command
-python -m scripts.create_prompts_for_gr1_dataset --dataset_path datasets/benchmark_train/gr1
+python create_prompts_for_gr1_dataset.py --dataset_path datasets/benchmark_train/gr1
 """
 
 
@@ -32,13 +32,13 @@ def parse_args() -> argparse.ArgumentParser:
         "--prompt_prefix", type=str, default="The robot arm is performing a task. ", help="Prefix of the prompt"
     )
     parser.add_argument(
-        "--meta_csv", type=str, default="datasets/benchmark_train/gr1/metadata.csv", help="Metadata csv file"
+        "--meta_csv", type=str, default=None, help="Metadata csv file (defaults to <dataset_path>/metadata.csv)"
     )
     return parser.parse_args()
 
 
 def main(args) -> None:
-    meta_csv = args.meta_csv
+    meta_csv = args.meta_csv or os.path.join(args.dataset_path, "metadata.csv")
     meta_lines = open(meta_csv).readlines()[1:]
     meta_txt_dir = os.path.join(args.dataset_path, "metas")
     os.makedirs(meta_txt_dir, exist_ok=True)
diff --git a/examples/cosmos/download_and_preprocess_datasets.sh b/examples/cosmos/download_and_preprocess_datasets.sh
@@ -20,3 +20,6 @@ mv datasets/benchmark_train/gr1 $train_dir
 mv dream_gen_benchmark/gr1_object $test_dir
 echo Download training data to $train_dir
 echo Download test data to $test_dir
+
+# Clean up staging directories
+rm -rf datasets/ dream_gen_benchmark/
diff --git a/examples/cosmos/eval_cosmos_predict25_lora.py b/examples/cosmos/eval_cosmos_predict25_lora.py
@@ -102,6 +102,13 @@ def parse_args():
         default=None,
         help="Negative prompt. Defaults to the pipeline's built-in negative prompt.",
     )
+    parser.add_argument(
+        "--text_encoder_attn_implementation",
+        type=str,
+        default="flash_attention_2",
+        choices=["eager", "sdpa", "flash_attention_2"],
+        help="The attention implementation to use for the text encoder (Qwen2.5 VL).",
+    )
 
     return parser.parse_args()
 
@@ -137,6 +144,7 @@ def check_video_safety(self, video):
         device_map=args.device,
         torch_dtype=torch.bfloat16,
         safety_checker=MockSafetyChecker(),
+        text_encoder_attn_implementation=args.text_encoder_attn_implementation,
     )
 
     if args.lora_dir is not None:
diff --git a/examples/cosmos/requirements.txt b/examples/cosmos/requirements.txt
@@ -0,0 +1,12 @@
+accelerate>=0.31.0
+huggingface_hub
+imageio
+imageio-ffmpeg
+transformers>=4.41.2
+peft>=0.11.1
+datasets
+numpy
+tqdm
+sentencepiece
+tensorboard
+wandb
diff --git a/examples/cosmos/train_cosmos_predict25_lora.py b/examples/cosmos/train_cosmos_predict25_lora.py
@@ -380,13 +380,7 @@ def _load_json_caption(self, json_path: Path) -> str:
         """Load caption from JSON file with prompt type selection."""
         try:
             with open(json_path, "r") as f:
-                content = f.read()
-                # Handle JSON that might not have top-level object
-                if not content.strip().startswith("{"):
-                    # Wrap in object if needed
-                    data = json.loads("{" + content + "}")
-                else:
-                    data = json.loads(content)
+                data = json.load(f)
 
             # Get the first model's captions (e.g., "qwen3_vl_30b_a3b")
             model_key = next(iter(data.keys()))