CVD-STORM/index.html at main · SenseTime-FVG/CVD-STORM · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
<!DOCTYPE html>
<html><head lang="en"><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">

    <meta http-equiv="x-ua-compatible" content="ie=edge">

    <title>CVD-STORM</title>

    <meta name="description" content="">
    <meta name="viewport" content="width=device-width, initial-scale=1">

    <!-- <meta property="og:image" content="https://dorverbin.github.io/refnerf/img/refnerf_titlecard.jpg">
    <meta property="og:image:type" content="image/png">
    <meta property="og:image:width" content="1200">
    <meta property="og:image:height" content="630">
    <meta property="og:type" content="website">
    <meta property="og:url" content="https://dorverbin.github.io/refnerf">
    <meta property="og:title" content="Ref-NeRF: Structured View-Dependent Appearance for Neural Radiance Fields">
    <meta property="og:description" content="Neural Radiance Fields (NeRF) is a popular view synthesis technique that represents a scene as a continuous volumetric function, parameterized by multilayer perceptrons that provide the volume density and view-dependent emitted radiance at each location. While NeRF-based techniques excel at representing fine geometric structures with smoothly varying view-dependent appearance, they often fail to accurately capture and reproduce the appearance of glossy surfaces. We address this limitation by introducing Ref-NeRF, which replaces NeRF's parameterization of view-dependent outgoing radiance with a representation of reflected radiance and structures this function using a collection of spatially-varying scene properties. We show that together with a regularizer on normal vectors, our model significantly improves the realism and accuracy of specular reflections. Furthermore, we show that our model's internal representation of outgoing radiance is interpretable and useful for scene editing.">

    <meta name="twitter:card" content="summary_large_image">
    <meta name="twitter:title" content="Ref-NeRF: Structured View-Dependent Appearance for Neural Radiance Fields">
    <meta name="twitter:description" content="Neural Radiance Fields (NeRF) is a popular view synthesis technique that represents a scene as a continuous volumetric function, parameterized by multilayer perceptrons that provide the volume density and view-dependent emitted radiance at each location. While NeRF-based techniques excel at representing fine geometric structures with smoothly varying view-dependent appearance, they often fail to accurately capture and reproduce the appearance of glossy surfaces. We address this limitation by introducing Ref-NeRF, which replaces NeRF's parameterization of view-dependent outgoing radiance with a representation of reflected radiance and structures this function using a collection of spatially-varying scene properties. We show that together with a regularizer on normal vectors, our model significantly improves the realism and accuracy of specular reflections. Furthermore, we show that our model's internal representation of outgoing radiance is interpretable and useful for scene editing.">
    <meta name="twitter:image" content="https://dorverbin.github.io/refnerf/img/refnerf_titlecard.jpg"> -->


    <!-- mirror: F0%9F%AA%9E&lt -->
    <!-- <link rel="icon" href="data:image/svg+xml,&lt;svg xmlns=%22http://www.w3.org/2000/svg%22 viewBox=%220 0 100 100%22&gt;&lt;text y=%22.9em%22 font-size=%2290%22&gt;%E2%9C%A8&lt;/text&gt;&lt;/svg&gt;"> -->
    <link rel="stylesheet" href="css/bootstrap.min.css">
    <link rel="stylesheet" href="css/font-awesome.min.css">
    <link rel="stylesheet" href="css/codemirror.min.css">
    <link rel="stylesheet" href="css/app.css">

    <script src="js/jquery.min.js"></script>
    <script src="js/bootstrap.min.js"></script>
    <script src="js/codemirror.min.js"></script>
    <script src="js/clipboard.min.js"></script>
    <script src="js/video_comparison.js"></script>
    <script src="js/app.js"></script>
</head>

<body>
    <div class="container" id="header" style="text-align: center; margin: auto;">
        <div class="row" id="title-row" style="max-width: 100%; margin: 0 auto; display: inline-block">
            <h2 class="col-md-12 text-center" id="title">
                <b>CVD-STORM</b>: Cross-View Video Diffusion with Spatial-Temporal Reconstruction Model for Autonomous Driving <br>
                <!-- <small>

                </small> -->
            </h2>
        </div>
        <div class="is-size-4 publication-authors">
              <span style="color: black;" class="author-block">Tianrui Zhang<sup>2</sup>,</span>
              <span style="color: black;" class="author-block">Yichen Liu<sup>1</sup>,</span>
              <span style="color: black;" class="author-block">Zilin Guo<sup>2</sup>,</span>
              <span style="color: black;" class="author-block">Jingcheng Ni<sup>1</sup>,</span>
              <span style="color: black;" class="author-block">Yuxin Guo<sup>1</sup>,</span>
              <span style="color: black;" class="author-block">Chenjing Ding<sup>1</sup>,</span>
              <span style="color: black;" class="author-block">Dan Xu<sup>2</sup>,</span>
              <span style="color: black;" class="author-block">Lewei Lu<sup>1</sup>,</span>
              <span style="color: black;" class="author-block">Zehuan Wu<sup>1</sup>,</span>
        </div>
        <div class="is-size-4 publication-authors">
            <span class="author-block"><sup>1</sup>SenseTime Research</span>
            <span class="author-block"><sup>2</sup>The Hong Kong University of Science and Technology</span>
        </div>
    </div>
    <script>
        document.getElementById('author-row').style.maxWidth = document.getElementById("title-row").clientWidth + 'px';
    </script>
    <div class="container" id="main">
        <div class="row">
                <div class="col-sm-6 col-sm-offset-3 text-center">
                    <ul class="nav nav-pills nav-justified">
                        <li>
                            <a href="https://arxiv.org/abs/2510.07944">
                            <img src="./img/paper_image.png" height="60px">
                                <h4><strong>Paper</strong></h4>
                            </a>
                        </li>
                        <!-- <li>
                            <a href="https://youtu.be/qrdRH9irAlk">
                            <img src="./img/youtube_icon.png" height="60px">
                                <h4><strong>Video</strong></h4>
                            </a>
                        </li>
                        <li>
                            <a href="https://storage.googleapis.com/gresearch/refraw360/ref.zip" target="_blank">
                            <image src="img/database_icon.png" height="60px">
                                <h4><strong>Shiny Dataset</strong></h4>
                            </a>
                        </li>
                        <li>
                            <a href="https://dorverbin.github.io/refnerf/data/shiny_blender_source.zip" target="_blank">
                            <image src="img/database_icon.png" height="60px">
                                <h4><strong>Shiny Dataset Source</strong></h4>
                            </a>
                        </li>
                        <li>
                            <a href="https://storage.googleapis.com/gresearch/refraw360/ref_real.zip" target="_blank">
                            <image src="img/real_database_icon.png" height="60px">
                                <h4><strong>Real Dataset</strong></h4>
                            </a>
                        </li>                             -->
                        <li>
                            <a href="https://github.com/SenseTime-FVG/OpenDWM" target="_blank">
                            <image src="img/github.png" height="60px">
                                <h4><strong>Code</strong></h4>
                            </a>
                        </li>
                    </ul>
                </div>
        </div>


        <!-- <div class="row">
            <div class="col-md-8 col-md-offset-2">
                <div class="video-compare-container" id="materialsDiv">
                    <video class="video" id="materials" loop playsinline autoPlay muted src="video/materials_circle_mipnerf_ours.mp4" onplay="resizeAndPlay(this)"></video>

                    <canvas height=0 class="videoMerge" id="materialsMerge"></canvas>
                </div>
			</div>
        </div> -->


        <div class="row">
            <div class="col-md-8 col-md-offset-2">
                <h3>
                    Abstract
                </h3>
                <p class="text-justify">
                    Generative models have been widely applied to world modeling for environment simulation and future state prediction. With advancements in autonomous driving, there is a growing demand not only for high-fidelity video generation under various controls, but also for producing diverse and meaningful information such as depth estimation. To address this, we propose CVD-STORM, a cross-view video diffusion model utilizing a spatial-temporal reconstruction Variational Autoencoder (VAE) that generates long-term, multi-view videos with 4D reconstruction capabilities under various control inputs. Our approach first fine-tunes the VAE with an auxiliary 4D reconstruction task, enhancing its ability to encode 3D structures and temporal dynamics. Subsequently, we integrate this VAE into the video diffusion process to significantly improve generation quality. Experimental results demonstrate that our model achieves substantial improvements in both FID and FVD metrics. Additionally, the jointly-trained Gaussian Splatting Decoder effectively reconstructs dynamic scenes, providing valuable geometric information for comprehensive scene understanding.
                </p>
            </div>
        </div>

        <image src="img/arch.png" class="img-responsive" alt="overview" width="60%" style="max-height: 450px;margin:auto;">

        <div class="row">
            <div class="col-md-8 col-md-offset-2">
                <h3>
                    Video
                </h3>
                <div class="text-center">
                    <!-- <div style="position:relative;padding-top:56.25%;">
                        <iframe src="https://youtube.com/embed/qrdRH9irAlk" allowfullscreen style="position:absolute;top:0;left:0;width:100%;height:100%;"></iframe>
                    </div> -->
                    <video id="demo" width="100%" playsinline controls muted>
                        <source src="video/demo.mp4" type="video/mp4" />
                    </video>
                </div>
            </div>
        </div>

        <div class="row">
            <div class="col-md-8 col-md-offset-2">
                <h3>
                    Depth Estimation
                </h3>
                <div class="text-justify">
                    We train a Gaussian Splatting decoder for 4D scene reconstruction in the first stage.
                    During inference, it can decode the Gaussian Splatting representation from the latent code and render the depth maps of the generative images.
                    We provide the visualization of the depth maps of the generative images below.
                    <br><br>

                </div>
                <div class="text-center">
                    <!-- <video id="refdir" width="40%" playsinline autoplay loop muted>
                        <source src="video/reflection_animation.mp4" type="video/mp4" />
                    </video> -->
                    <img src="./img/demo_depth.png" width="100%">
                </div>
            </div>
        </div>

        <div class="row">
            <div class="col-md-8 col-md-offset-2">
                <h3>
                    Video Generation
                </h3>
                <div class="text-justify">
                    Our method can generate diverse and meaningful videos under various control inputs.
                </div>
                <br>
                <div class="text-center">
                    <img src="./img/demo_generation.png" width="100%">
                </div>
                <br>
            </div>
        </div>

        <div class="row">
            <div class="col-md-8 col-md-offset-2">
                <h3>
                    Video Prediction
                </h3>
                <div class="text-justify">
                    Our method can predict the future video frames of the input video.
                </div>
                <div class="text-center">
                    <img src="./img/demo_prediction.png" width="100%">
                </div>
			</div>
        </div>

        <div class="row">
            <div class="col-md-8 col-md-offset-2">
                <h3>
                    Citation
                </h3>
                <div class="form-group col-md-10 col-md-offset-1">
                    <textarea id="bibtex" class="form-control" readonly>
@article{zhang2025cvd,
  title={CVD-STORM: Cross-View Video Diffusion with Spatial-Temporal Reconstruction Model for Autonomous Driving},
  author={Zhang, Tianrui and Liu, Yichen and Guo, Zilin and Guo, Yuxin and Ni, Jingcheng and Ding, Chenjing and Xu, Dan and Lu, Lewei and Wu, Zehuan},
  journal={arXiv preprint arXiv:2510.07944},
  year={2025}
}</textarea>
                </div>
            </div>
        </div>

        <div class="row">
            <div class="col-md-8 col-md-offset-2">
                <h3>
                    Acknowledgements
                </h3>
                <p class="text-justify">
                The website template was borrowed from <a href="http://mgharbi.com/">Michaël Gharbi</a> and <a href="https://dorverbin.github.io/refnerf/">Ref-NeRF</a>
                </p>
            </div>
        </div>
    </div>


</body></html>