Skip to content

Commit 3049d45

Browse files
Ting-Yunterarachang
authored and committed
edit data preprocess script to avoid downloading cosmos2.5 repo
1 parent 60fa326 commit 3049d45

2 files changed

Lines changed: 63 additions & 6 deletions

File tree

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
import argparse
17+
import os
18+
19+
from tqdm import tqdm
20+
21+
"""example command
22+
python create_prompts_for_gr1_dataset.py --dataset_path datasets/benchmark_train/gr1
23+
"""
24+
25+
26+
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options for GR1 prompt creation.

    Args:
        argv: Optional list of argument strings to parse. When ``None`` (the
            default) argparse falls back to ``sys.argv[1:]``, so existing
            zero-argument callers behave as before.

    Returns:
        The parsed ``argparse.Namespace`` carrying ``dataset_path``,
        ``prompt_prefix`` and ``meta_csv``.
    """
    parser = argparse.ArgumentParser(description="Create text prompts for GR1 dataset")
    parser.add_argument(
        "--dataset_path", type=str, default="datasets/benchmark_train/gr1", help="Root path to the dataset"
    )
    parser.add_argument(
        "--prompt_prefix", type=str, default="The robot arm is performing a task. ", help="Prefix of the prompt"
    )
    parser.add_argument(
        "--meta_csv", type=str, default="datasets/benchmark_train/gr1/metadata.csv", help="Metadata csv file"
    )
    # parse_args() yields a Namespace (not the parser itself), hence the return annotation.
    return parser.parse_args(argv)
38+
39+
40+
def main(args) -> None:
    """Write one prompt text file per data row of the metadata CSV.

    The first line of ``args.meta_csv`` is skipped as a header. Every other
    non-blank line is split at its first comma into a video filename and a
    prompt; the prompt (with one pair of enclosing double quotes removed, if
    present) is prefixed with ``args.prompt_prefix`` and written to
    ``<args.dataset_path>/metas/<video basename without extension>.txt``.

    Args:
        args: Namespace providing ``meta_csv``, ``dataset_path`` and
            ``prompt_prefix``.

    Raises:
        ValueError: If a non-blank data line contains no comma.
    """
    # Use a context manager so the metadata file handle is closed promptly.
    with open(args.meta_csv, encoding="utf-8") as meta_file:
        meta_lines = meta_file.readlines()[1:]  # drop the header line

    meta_txt_dir = os.path.join(args.dataset_path, "metas")
    os.makedirs(meta_txt_dir, exist_ok=True)

    for meta_line in tqdm(meta_lines):
        # Tolerate blank lines (e.g. a trailing empty line) instead of crashing.
        if not meta_line.strip():
            continue

        video_filename, separator, prompt = meta_line.partition(",")
        if not separator:
            raise ValueError(f"Malformed metadata line (expected '<video>,<prompt>'): {meta_line!r}")

        prompt = prompt.strip("\n")
        if prompt.startswith('"') and prompt.endswith('"'):
            # Remove the quotes
            prompt = prompt[1:-1]
        prompt = args.prompt_prefix + prompt

        # Swap only the real extension; str.replace would also rewrite ".mp4"
        # appearing in the middle of a name and would leave other extensions as-is.
        video_stem = os.path.splitext(os.path.basename(video_filename))[0]
        meta_txt_filename = os.path.join(meta_txt_dir, video_stem + ".txt")
        with open(meta_txt_filename, "w", encoding="utf-8") as fp:
            fp.write(prompt)

        # NOTE(review): the original nesting of this print is not recoverable from
        # the scraped diff; kept inside the loop so it never references an
        # undefined ``prompt`` when the CSV has no data rows.
        print(f"encoding prompt: {prompt}")
58+
59+
60+
if __name__ == "__main__":
    # Script entry point: parse the CLI options and generate the prompt files.
    main(parse_args())

examples/cosmos/download_and_preprocess_datasets.sh

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,12 @@ train_dir=$dataset_dir/train
33
test_dir=$dataset_dir/test
44

55
# Download and Preprocess Training Dataset
6-
git clone https://github.com/nvidia-cosmos/cosmos-predict2.5.git
7-
cd cosmos-predict2.5
8-
96
hf download nvidia/GR1-100 --repo-type dataset --local-dir datasets/benchmark_train/hf_gr1/ && \
107
mkdir -p datasets/benchmark_train/gr1/videos && \
118
mv datasets/benchmark_train/hf_gr1/gr1/*mp4 datasets/benchmark_train/gr1/videos && \
129
mv datasets/benchmark_train/hf_gr1/metadata.csv datasets/benchmark_train/gr1/
1310

14-
python -m scripts.create_prompts_for_gr1_dataset --dataset_path datasets/benchmark_train/gr1
11+
python create_prompts_for_gr1_dataset.py --dataset_path datasets/benchmark_train/gr1
1512

1613
# Download Eval Dataset
1714
hf download nvidia/EVAL-175 --repo-type dataset --local-dir dream_gen_benchmark
@@ -23,5 +20,3 @@ mv datasets/benchmark_train/gr1 $train_dir
2320
mv dream_gen_benchmark/gr1_object $test_dir
2421
echo Download training data to $train_dir
2522
echo Download test data to $test_dir
26-
mv $dataset_dir ..
27-
cd ..

0 commit comments

Comments
 (0)