Skip to content

Commit 8029c3c

Browse files
committed
feat(cosmos3): add Cosmos3 Super Omni inference tasks
1 parent 567edba commit 8029c3c

33 files changed

Lines changed: 1427 additions & 72 deletions
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
{
2+
"infer_steps": 30,
3+
"sample_guide_scale": 1.0,
4+
"sample_shift": 10.0,
5+
"target_height": 720,
6+
"target_width": 640,
7+
"target_video_length": 17,
8+
"target_fps": 10.0,
9+
"enable_cfg": true,
10+
"action_mode": "forward_dynamics",
11+
"domain_name": "agibotworld",
12+
"view_point": "concat_view",
13+
"action_chunk_size": 16,
14+
"action_chunk_index": 0,
15+
"feature_caching": "NoCaching",
16+
"rms_norm_type": "one-pass",
17+
"attn_rms_norm_type": "one-pass",
18+
"rope_type": "triton",
19+
"self_attn_type": "flash_attn3",
20+
"causal_self_attn_type": "flash_attn3",
21+
"add_resolution_template": false,
22+
"add_duration_template": false,
23+
"use_system_prompt": false,
24+
"cosmos3_meta_init": true,
25+
"vae_cpu_offload": false,
26+
"cpu_offload": false,
27+
"offload_granularity": "block"
28+
}
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
{
2+
"infer_steps": 30,
3+
"sample_guide_scale": 1.0,
4+
"sample_shift": 10.0,
5+
"target_height": 720,
6+
"target_width": 640,
7+
"target_video_length": 17,
8+
"target_fps": 10.0,
9+
"enable_cfg": true,
10+
"action_mode": "forward_dynamics",
11+
"domain_name": "agibotworld",
12+
"view_point": "concat_view",
13+
"action_chunk_size": 16,
14+
"action_chunk_index": 0,
15+
"action_multichunk": true,
16+
"action_num_chunks": 4,
17+
"feature_caching": "NoCaching",
18+
"rms_norm_type": "one-pass",
19+
"attn_rms_norm_type": "one-pass",
20+
"rope_type": "triton",
21+
"self_attn_type": "flash_attn3",
22+
"causal_self_attn_type": "flash_attn3",
23+
"add_resolution_template": false,
24+
"add_duration_template": false,
25+
"use_system_prompt": false,
26+
"cosmos3_meta_init": true,
27+
"vae_cpu_offload": false,
28+
"cpu_offload": false,
29+
"offload_granularity": "block"
30+
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
{
2+
"infer_steps": 30,
3+
"sample_guide_scale": 1.0,
4+
"sample_shift": 10.0,
5+
"target_height": 480,
6+
"target_width": 832,
7+
"target_video_length": 61,
8+
"target_fps": 10.0,
9+
"enable_cfg": true,
10+
"action_mode": "inverse_dynamics",
11+
"domain_name": "av",
12+
"view_point": "ego_view",
13+
"action_chunk_size": 60,
14+
"feature_caching": "NoCaching",
15+
"rms_norm_type": "one-pass",
16+
"attn_rms_norm_type": "one-pass",
17+
"rope_type": "triton",
18+
"self_attn_type": "flash_attn3",
19+
"causal_self_attn_type": "flash_attn3",
20+
"add_resolution_template": false,
21+
"add_duration_template": false,
22+
"use_system_prompt": false,
23+
"cosmos3_meta_init": true,
24+
"vae_cpu_offload": false,
25+
"cpu_offload": false,
26+
"offload_granularity": "block"
27+
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
{
2+
"infer_steps": 35,
3+
"sample_guide_scale": 6.0,
4+
"sample_shift": 10.0,
5+
"target_height": 720,
6+
"target_width": 1280,
7+
"target_video_length": 189,
8+
"target_fps": 24.0,
9+
"enable_cfg": true,
10+
"enable_sound": true,
11+
"feature_caching": "NoCaching",
12+
"rms_norm_type": "one-pass",
13+
"attn_rms_norm_type": "one-pass",
14+
"rope_type": "triton",
15+
"self_attn_type": "flash_attn3",
16+
"causal_self_attn_type": "flash_attn3",
17+
"add_resolution_template": false,
18+
"add_duration_template": false,
19+
"cosmos3_meta_init": true,
20+
"vae_cpu_offload": false,
21+
"cpu_offload": false,
22+
"offload_granularity": "block"
23+
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
{
2+
"infer_steps": 35,
3+
"sample_guide_scale": 6.0,
4+
"sample_shift": 10.0,
5+
"target_height": 720,
6+
"target_width": 1280,
7+
"target_video_length": 189,
8+
"target_fps": 24.0,
9+
"enable_cfg": true,
10+
"feature_caching": "NoCaching",
11+
"rms_norm_type": "one-pass",
12+
"attn_rms_norm_type": "one-pass",
13+
"rope_type": "triton",
14+
"self_attn_type": "flash_attn3",
15+
"causal_self_attn_type": "flash_attn3",
16+
"add_resolution_template": false,
17+
"add_duration_template": false,
18+
"cosmos3_meta_init": true,
19+
"vae_cpu_offload": false,
20+
"cpu_offload": false,
21+
"offload_granularity": "block"
22+
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
{
2+
"infer_steps": 35,
3+
"sample_guide_scale": 6.0,
4+
"sample_shift": 10.0,
5+
"target_height": 720,
6+
"target_width": 1280,
7+
"target_video_length": 189,
8+
"target_fps": 24.0,
9+
"enable_cfg": true,
10+
"enable_sound": true,
11+
"feature_caching": "NoCaching",
12+
"rms_norm_type": "one-pass",
13+
"attn_rms_norm_type": "one-pass",
14+
"rope_type": "triton",
15+
"self_attn_type": "flash_attn3",
16+
"causal_self_attn_type": "flash_attn3",
17+
"add_resolution_template": false,
18+
"add_duration_template": false,
19+
"cosmos3_meta_init": true,
20+
"vae_cpu_offload": false,
21+
"cpu_offload": false,
22+
"offload_granularity": "block"
23+
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
{
2+
"infer_steps": 35,
3+
"sample_guide_scale": 6.0,
4+
"sample_shift": 10.0,
5+
"target_height": 720,
6+
"target_width": 1280,
7+
"target_video_length": 189,
8+
"target_fps": 24.0,
9+
"enable_cfg": true,
10+
"feature_caching": "NoCaching",
11+
"rms_norm_type": "one-pass",
12+
"attn_rms_norm_type": "one-pass",
13+
"rope_type": "triton",
14+
"self_attn_type": "flash_attn3",
15+
"causal_self_attn_type": "flash_attn3",
16+
"add_resolution_template": false,
17+
"add_duration_template": false,
18+
"cosmos3_meta_init": true,
19+
"vae_cpu_offload": false,
20+
"cpu_offload": false,
21+
"offload_granularity": "block"
22+
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
{
2+
"infer_steps": 35,
3+
"sample_guide_scale": 6.0,
4+
"sample_shift": 10.0,
5+
"target_height": 720,
6+
"target_width": 1280,
7+
"target_video_length": 189,
8+
"target_fps": 24.0,
9+
"enable_cfg": true,
10+
"feature_caching": "NoCaching",
11+
"rms_norm_type": "one-pass",
12+
"attn_rms_norm_type": "one-pass",
13+
"rope_type": "triton",
14+
"self_attn_type": "flash_attn3",
15+
"causal_self_attn_type": "flash_attn3",
16+
"add_resolution_template": false,
17+
"add_duration_template": false,
18+
"cosmos3_meta_init": true,
19+
"vae_cpu_offload": false,
20+
"cpu_offload": false,
21+
"offload_granularity": "block"
22+
}

lightx2v/infer.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,11 @@ def main():
199199
default=None,
200200
help="Directory path for lingbot camera/action control files (poses.npy, intrinsics.npy, optional action.npy).",
201201
)
202+
parser.add_argument("--action_mode", type=str, default=None, choices=["forward_dynamics", "inverse_dynamics", "policy"], help="Cosmos3 action mode.")
203+
parser.add_argument("--domain_name", type=str, default=None, help="Cosmos3 action embodiment domain name.")
204+
parser.add_argument("--view_point", type=str, default=None, help="Cosmos3 action viewpoint label.")
205+
parser.add_argument("--action_chunk_size", type=int, default=None, help="Cosmos3 action chunk size.")
206+
parser.add_argument("--action_chunk_index", type=int, default=None, help="Cosmos3 action chunk index when action_path contains action_chunks.")
202207
parser.add_argument(
203208
"--action_ckpt",
204209
type=str,

lightx2v/models/audio_encoders/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)