[None][feat] VisualGen: async mp4 encode + fixed noise latent via env vars (#15229)

wu6u3tw · web-flow · commit f7dd7ec542e6 · 2026-06-22T11:37:45.000-07:00
diff --git a/tensorrt_llm/_torch/visual_gen/models/wan/pipeline_wan.py b/tensorrt_llm/_torch/visual_gen/models/wan/pipeline_wan.py
@@ -1,3 +1,4 @@
+import os
 import time
 from typing import List, Optional, Union
 
@@ -108,6 +109,18 @@ def __init__(self, pipeline_config):
                 "Use cache_backend='none' or 'cache_dit' (not 'teacache')."
             )
 
+        # Fixed latent for reproducible benchmarking (e.g. MLPerf).
+        # Set TRTLLM_VIDEO_FIXED_LATENT_PATH to a .pt file containing a pre-sampled
+        # noise tensor; it replaces freshly sampled random latents for all T2V requests
+        # (only cast to the pipeline device/dtype). Loaded once here, reused per request.
+        self._fixed_latent: Optional[torch.Tensor] = None
+        _fixed_latent_path = os.environ.get("TRTLLM_VIDEO_FIXED_LATENT_PATH")
+        if _fixed_latent_path:
+            self._fixed_latent = torch.load(_fixed_latent_path, weights_only=True)
+            logger.warning(
+                f"Loaded fixed latent from {_fixed_latent_path}, shape={self._fixed_latent.shape}"
+            )
+
         super().__init__(pipeline_config)
 
     def _compute_wan_timestep_embedding(self, module, timestep=None, **kwargs):
@@ -486,6 +499,8 @@ def forward(
             latents, i2v_condition, i2v_first_frame_mask = self._prepare_latents_wan22_5B_i2v(
                 batch_size, image, height, width, num_frames, generator
             )
+        elif self._fixed_latent is not None:
+            latents = self._fixed_latent.to(device=self.device, dtype=self.dtype)
         else:
             latents = self._prepare_latents(batch_size, height, width, num_frames, generator)
         logger.debug(f"Latents shape: {latents.shape}")
diff --git a/tensorrt_llm/serve/openai_video_routes.py b/tensorrt_llm/serve/openai_video_routes.py
@@ -160,11 +160,20 @@ async def openai_video_generation_sync(self, raw_request: Request) -> Response:
             resolved_fmt = resolved_encoder_fmt
             batch_size = output.video.shape[0] if output.video.dim() == 5 else 1
             paths_in = [self.media_storage_path / f"{video_id}_{i}" for i in range(batch_size)]
-            saved_paths = output.save(
-                paths_in,
+            _save_kwargs = dict(
                 format=resolved_fmt,
                 frame_rate=output.frame_rate or request.frame_rate or params.frame_rate,
             )
+            if os.environ.get("TRTLLM_VIDEO_ASYNC_ENCODE", "1") != "0":
+                # Offload the blocking ffmpeg encode to a thread-pool executor so
+                # the event loop can start the next request's diffusion while this
+                # video encodes. Overlap only occurs when >=2 requests are in flight
+                # per server (i.e. client num_workers > server count).
+                saved_paths = await asyncio.get_running_loop().run_in_executor(
+                    None, lambda: output.save(paths_in, **_save_kwargs)
+                )
+            else:
+                saved_paths = output.save(paths_in, **_save_kwargs)
             latency = time.perf_counter() - sync_video_start  # seconds
             metrics = output.metrics
             generation = metrics.generation if metrics is not None else 0.0