Skip to content

Commit f7dd7ec

Browse files
authored
[None][feat] VisualGen: async mp4 encode + fixed noise latent via env vars (#15229)
1 parent eddaa3a commit f7dd7ec

2 files changed

Lines changed: 26 additions & 2 deletions

File tree

tensorrt_llm/_torch/visual_gen/models/wan/pipeline_wan.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
import time
23
from typing import List, Optional, Union
34

@@ -108,6 +109,18 @@ def __init__(self, pipeline_config):
108109
"Use cache_backend='none' or 'cache_dit' (not 'teacache')."
109110
)
110111

112+
# Fixed latent for reproducible benchmarking (e.g. MLPerf).
113+
# Set TRTLLM_VIDEO_FIXED_LATENT_PATH to a .pt file containing a pre-sampled
114+
# noise tensor; it will be used in place of freshly sampled random latents for
115+
# all T2V requests. Loaded once at server startup, reused across requests.
116+
self._fixed_latent: Optional[torch.Tensor] = None
117+
_fixed_latent_path = os.environ.get("TRTLLM_VIDEO_FIXED_LATENT_PATH")
118+
if _fixed_latent_path:
119+
self._fixed_latent = torch.load(_fixed_latent_path, weights_only=True)
120+
logger.warning(
121+
f"Loaded fixed latent from {_fixed_latent_path}, shape={self._fixed_latent.shape}"
122+
)
123+
111124
super().__init__(pipeline_config)
112125

113126
def _compute_wan_timestep_embedding(self, module, timestep=None, **kwargs):
@@ -486,6 +499,8 @@ def forward(
486499
latents, i2v_condition, i2v_first_frame_mask = self._prepare_latents_wan22_5B_i2v(
487500
batch_size, image, height, width, num_frames, generator
488501
)
502+
elif self._fixed_latent is not None:
503+
latents = self._fixed_latent.to(device=self.device, dtype=self.dtype)
489504
else:
490505
latents = self._prepare_latents(batch_size, height, width, num_frames, generator)
491506
logger.debug(f"Latents shape: {latents.shape}")

tensorrt_llm/serve/openai_video_routes.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -160,11 +160,20 @@ async def openai_video_generation_sync(self, raw_request: Request) -> Response:
160160
resolved_fmt = resolved_encoder_fmt
161161
batch_size = output.video.shape[0] if output.video.dim() == 5 else 1
162162
paths_in = [self.media_storage_path / f"{video_id}_{i}" for i in range(batch_size)]
163-
saved_paths = output.save(
164-
paths_in,
163+
_save_kwargs = dict(
165164
format=resolved_fmt,
166165
frame_rate=output.frame_rate or request.frame_rate or params.frame_rate,
167166
)
167+
if os.environ.get("TRTLLM_VIDEO_ASYNC_ENCODE", "1") != "0":
168+
# Offload the blocking ffmpeg encode to a thread-pool executor so
169+
# the event loop can start the next request's diffusion while this
170+
# video encodes. Only overlaps when >=2 requests are in flight per
171+
# server (i.e. client num_workers > server count).
172+
saved_paths = await asyncio.get_running_loop().run_in_executor(
173+
None, lambda: output.save(paths_in, **_save_kwargs)
174+
)
175+
else:
176+
saved_paths = output.save(paths_in, **_save_kwargs)
168177
latency = time.perf_counter() - sync_video_start # seconds
169178
metrics = output.metrics
170179
generation = metrics.generation if metrics is not None else 0.0

0 commit comments

Comments
 (0)