Skip to content

Commit 9fd62f9

Browse files
authored
Merge pull request #2 from SenseTime-FVG/dev/0.1.1
Release UniMLVG checkpoint, update doc and related configs.
2 parents eae7ee6 + 2d79cd1 commit 9fd62f9

7 files changed

Lines changed: 427 additions & 167 deletions

File tree

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@ Our cross-view temporal SD (CTSD) pipeline supports loading the pretrained SD 2.1
6262
| Base model | Text conditioned <br/> driving generation | Text and layout (box, map) <br/> conditioned driving generation |
6363
| :-: | :-: | :-: |
6464
| [SD 2.1](https://huggingface.co/stabilityai/stable-diffusion-2-1) | [Config](configs/ctsd/multi_datasets/ctsd_21_tirda_nwao.json), [Download](http://103.237.29.236:10030/ctsd_21_tirda_nwao_30k.pth) | [Config](configs/ctsd/multi_datasets/ctsd_21_tirda_bm_nwa.json), [Download](http://103.237.29.236:10030/ctsd_21_tirda_bm_nwa_30k.pth) |
65-
| [SD 3.0](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers) | | [UniMLVG Config](configs/ctsd/unimlvg/unimlvg_stage3_tirda_nwa.json), Released by 2025-2-1 |
66-
| [SD 3.5](https://huggingface.co/stabilityai/stable-diffusion-3.5-medium) | [Config](configs/ctsd/multi_datasets/ctsd_35_tirda_nwao.json), [Download](http://103.237.29.236:10030/ctsd_35_tirda_nwao_20k.pth) | [Config](configs/ctsd/multi_datasets/ctsd_35_tirda_bm_nwa.json), Released by 2025-2-1 |
65+
| [SD 3.0](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers) | | [UniMLVG Config](configs/ctsd/unimlvg/unimlvg_stage3_tirda_nwa.json), [Download](http://103.237.29.236:10030/ctsd_unimlvg_tirda_bm_nwa_60k.pth) |
66+
| [SD 3.5](https://huggingface.co/stabilityai/stable-diffusion-3.5-medium) | [Config](configs/ctsd/multi_datasets/ctsd_35_tirda_nwao.json), [Download](http://103.237.29.236:10030/ctsd_35_tirda_nwao_20k.pth) | [Config](configs/ctsd/multi_datasets/ctsd_35_tirda_bm_nwa.json), to be released by 2025-03-01 |
6767

6868
## Examples
6969

@@ -82,7 +82,7 @@ PYTHONPATH=src python examples/ctsd_generation_example.py -c examples/ctsd_35_6v
8282
3. Run this command to generate the video.
8383

8484
```
85-
PYTHONPATH=src python src/dwm/preview.py -c examples/ctsd_21_6views_video_generation_with_layout.json -o output/ctsd_21_6views_video_generation_with_layout
85+
PYTHONPATH=src python src/dwm/preview.py -c examples/ctsd_unimlvg_6views_video_generation.json -o output/ctsd_unimlvg_6views_video_generation
8686
```
8787

8888
## Train

configs/ctsd/unimlvg/unimlvg_stage1_tirda_o.json renamed to configs/ctsd/unimlvg/ctsd_unimlvg_stage1_tird_o.json

Lines changed: 22 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
"_class_name": "dwm.fs.czip.CombinedZipFileSystem",
1111
"fs": {
1212
"_class_name": "dwm.fs.dirfs.DirFileSystem",
13-
"path": "/cache/aoss-v2.st-sh-01.sensecoreapi-oss.cn/users/wuzehuan"
13+
"path": "/cache/aoss.cn-sh-01.sensecoreapi-oss.cn/users/wuzehuan"
1414
},
1515
"paths": [
1616
"data/opendv/opendv-youtube-10hz-720_0.zip",
@@ -23,7 +23,7 @@
2323
"_class_name": "torch.distributed.device_mesh.init_device_mesh",
2424
"device_type": "cuda",
2525
"mesh_shape": [
26-
4,
26+
2,
2727
8
2828
]
2929
}
@@ -36,7 +36,6 @@
3636
"_class_name": "dwm.pipelines.ctsd.CrossviewTemporalSD",
3737
"common_config": {
3838
"frame_prediction_style": "ctsd",
39-
"reference_frame_count": 3,
4039
"cat_condition": true,
4140
"cond_with_action": false,
4241
"condition_on_all_frames": true,
@@ -101,18 +100,16 @@
101100
"memory_efficient_batch": 12
102101
},
103102
"training_config": {
103+
"freezing_pattern": "^(transformer_blocks|time_text_embed|context_embedder|pos_embed|norm_out|proj_out)$",
104104
"text_prompt_condition_ratio": 0.8,
105-
"3dbox_condition_ratio": 0,
106-
"hdmap_condition_ratio": 0,
107105
"explicit_view_modeling_ratio": 0.8,
108106
"reference_frame_count": 3,
109107
"generation_task_ratio": 0.2,
110108
"image_generation_ratio": 0.5,
111109
"all_reference_visible_ratio": 0.8,
112110
"reference_visible_rate": 0.5,
113111
"disable_reference_frame_loss": true,
114-
"enable_grad_scaler": true,
115-
"freezing_pattern": "^(transformer_blocks|time_text_embed|context_embedder|pos_embed|norm_out|proj_out)"
112+
"enable_grad_scaler": true
116113
},
117114
"inference_config": {
118115
"guidance_scale": 3,
@@ -121,7 +118,8 @@
121118
448,
122119
252
123120
],
124-
"sequence_length_per_iteration": 6,
121+
"generate_frames_for_reference": false,
122+
"sequence_length_per_iteration": 19,
125123
"reference_frame_count": 3,
126124
"autoregression_data_exception_for_take_sequence": [
127125
"crossview_mask"
@@ -167,8 +165,8 @@
167165
"mixer_type": "AlphaBlender",
168166
"merge_factor": 2
169167
},
170-
"pretrained_model_name_or_path": "/cache/aoss-v2.st-sh-01.sensecoreapi-oss.cn/users/wuzehuan/models/stable-diffusion-3-medium-diffusers",
171-
"model_checkpoint_path": "/cache/aoss-v2.st-sh-01.sensecoreapi-oss.cn/users/wuzehuan/models/stable-diffusion-3-medium-diffusers/transformer/diffusion_pytorch_model.safetensors",
168+
"pretrained_model_name_or_path": "/cache/aoss.cn-sh-01.sensecoreapi-oss.cn/users/wuzehuan/models/stable-diffusion-3-medium-diffusers",
169+
"model_checkpoint_path": "/cache/aoss.cn-sh-01.sensecoreapi-oss.cn/users/wuzehuan/models/stable-diffusion-3-medium-diffusers/transformer/diffusion_pytorch_model.safetensors",
172170
"model_load_state_args": {
173171
"strict": false
174172
},
@@ -179,7 +177,7 @@
179177
},
180178
"fvd": {
181179
"_class_name": "dwm.metrics.fvd.FrechetVideoDistance",
182-
"inception_3d_checkpoint_path": "/mnt/storage/user/wuzehuan/Downloads/models/inception_3d/i3d_pretrained_400.pt",
180+
"inception_3d_checkpoint_path": "/mnt/afs/user/wuzehuan/Documents/DWM/externals/TATS/tats/fvd/i3d_pretrained_400.pt",
183181
"sequence_count": 16
184182
}
185183
}
@@ -192,8 +190,8 @@
192190
"_class_name": "dwm.common.get_state",
193191
"key": "opendv_fs"
194192
},
195-
"meta_path": "/cache/aoss-v2.st-sh-01.sensecoreapi-oss.cn/users/nijingcheng/datasets/OpenDV-YouTube.json",
196-
"sequence_length": 6,
193+
"meta_path": "/cache/aoss.cn-sh-01.sensecoreapi-oss.cn/users/nijingcheng/datasets/OpenDV-YouTube.json",
194+
"sequence_length": 19,
197195
"fps_stride_tuples": [
198196
[
199197
10,
@@ -207,8 +205,8 @@
207205
],
208206
"enable_fake_camera_transforms": true,
209207
"image_description_settings": {
210-
"path": "/cache/aoss-v2.st-sh-01.sensecoreapi-oss.cn/users/wuzehuan/workspaces/worldmodels/data/opendv_caption.json",
211-
"candidates_times_path": "/cache/aoss-v2.st-sh-01.sensecoreapi-oss.cn/users/wuzehuan/workspaces/worldmodels/data/opendv_candidates_times.json",
208+
"path": "/cache/aoss.cn-sh-01.sensecoreapi-oss.cn/users/wuzehuan/workspaces/worldmodels/data/opendv_caption.json",
209+
"candidates_times_path": "/cache/aoss.cn-sh-01.sensecoreapi-oss.cn/users/wuzehuan/workspaces/worldmodels/data/opendv_candidates_times.json",
212210
"seed": 5,
213211
"reorder_keys": true,
214212
"drop_rates": {
@@ -280,20 +278,20 @@
280278
"_class_name": "dwm.common.get_state",
281279
"key": "opendv_fs"
282280
},
283-
"meta_path": "/cache/aoss-v2.st-sh-01.sensecoreapi-oss.cn/users/nijingcheng/datasets/OpenDV-YouTube.json",
284-
"sequence_length": 18,
281+
"meta_path": "/cache/aoss.cn-sh-01.sensecoreapi-oss.cn/users/nijingcheng/datasets/OpenDV-YouTube.json",
282+
"sequence_length": 19,
285283
"fps_stride_tuples": [
286284
[
287285
10,
288-
180
286+
60
289287
]
290288
],
291289
"split": "Val",
292290
"mini_batch": 6,
293291
"enable_fake_camera_transforms": true,
294292
"image_description_settings": {
295-
"path": "/cache/aoss-v2.st-sh-01.sensecoreapi-oss.cn/users/wuzehuan/workspaces/worldmodels/data/opendv_caption.json",
296-
"candidates_times_path": "/cache/aoss-v2.st-sh-01.sensecoreapi-oss.cn/users/wuzehuan/workspaces/worldmodels/data/opendv_candidates_times.json"
293+
"path": "/cache/aoss.cn-sh-01.sensecoreapi-oss.cn/users/wuzehuan/workspaces/worldmodels/data/opendv_caption.json",
294+
"candidates_times_path": "/cache/aoss.cn-sh-01.sensecoreapi-oss.cn/users/wuzehuan/workspaces/worldmodels/data/opendv_candidates_times.json"
297295
},
298296
"stub_key_data_dict": {
299297
"crossview_mask": [
@@ -351,7 +349,7 @@
351349
]
352350
},
353351
"training_dataloader": {
354-
"batch_size": 1,
352+
"batch_size": 2,
355353
"num_workers": 3,
356354
"prefetch_factor": 3,
357355
"collate_fn": {
@@ -389,9 +387,9 @@
389387
"persistent_workers": true
390388
},
391389
"informations": {
392-
"fid": -1,
393-
"fvd": -1,
390+
"fid": 9.10,
391+
"fvd": 132.89,
394392
"total_batch_sizes": 32,
395-
"steps": 60000
393+
"steps": 30000
396394
}
397395
}

0 commit comments

Comments
 (0)