[None][feat] Side-stream for MM encoder (NVIDIA#14322)

2ez4bz · GitLab CI Bot · commit 65b74570e268 · 2026-06-24T03:01:08.000Z
* Why?

Multimodal context requests currently run their encoder only after they
are scheduled. That potentially keeps the next request's image encoding
on the critical path even when the executor already has independent GPU
work from the current iteration to overlap it with.

* What?

Add an opt-in cross-iteration prefetch path gated by
`TLLM_MM_SIDE_STREAM_MAX_AHEAD`. The executor picks pending multimodal
context requests that are not in flight, moves their inputs to CUDA and
runs the encoder on an auxiliary stream.

This leverages the recently added `MultimodalEncoderMixin`.

Signed-off-by: William Zhang <133824995+2ez4bz@users.noreply.github.com>
Signed-off-by: GitLab CI Bot <gitlab-ci@nvidia.com>
diff --git a/tensorrt_llm/_torch/models/modeling_mistral.py b/tensorrt_llm/_torch/models/modeling_mistral.py
@@ -612,7 +612,7 @@ def __init__(
         # NOTE: attn_backend: Pixtral head size not always divisible by 128
         vision_model_config = self._get_sub_model_config(model_config_cp,
                                                          "vision_config",
-                                                         attn_backend="VANILLA",
+                                                         attn_backend="TRTLLM",
                                                          quant_config=None)
 
         self._vision_tower = modeling_pixtral.PixtralVisionModel(
@@ -705,7 +705,6 @@ def infer_max_seq_len(self) -> int:
     def encode_multimodal_inputs(
         self,
         multimodal_params: Sequence[MultimodalParams],
-        **encoder_kwargs: Any,
     ) -> MultimodalEncoderOutput:
         mm_embeds = self._vision_forward(list(multimodal_params))
         return MultimodalEncoderOutput(embeddings=mm_embeds[0])
diff --git a/tensorrt_llm/_torch/models/modeling_multimodal_mixin.py b/tensorrt_llm/_torch/models/modeling_multimodal_mixin.py
diff --git a/tensorrt_llm/_torch/models/modeling_multimodal_utils.py b/tensorrt_llm/_torch/models/modeling_multimodal_utils.py
@@ -219,6 +219,13 @@ def get_multimodal_embeddings(
     if not multimodal_params:
         return []
 
+    # Wait before touching tensors produced on the MM side stream. Do not
+    # clear the event here; repeated stream-side waits are cheap, and leaving
+    # the event field untouched avoids races if a caller accidentally reuses it.
+    for param in multimodal_params:
+        if param.encoder_event is not None:
+            torch.cuda.current_stream().wait_event(param.encoder_event)
+
     # Step 1: Find uncached multimodal params that need encoder processing
     uncached_multimodal_params = _get_uncached_multimodal_params(
         multimodal_params)
diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py
@@ -686,6 +686,13 @@ def __init__(
                 and encoder_output_len is None):
             encoder_output_len = len(encoder_input_tokens)
             kwargs["encoder_output_len"] = encoder_output_len
+
+        # Cross-iter MM encoder prefetch event: stamped by the side-stream
+        # producer in `modeling_multimodal_mixin._dispatch_cross_iter_prefetch`
+        # and consumed (then cleared) in `model_engine._prepare_inputs` when
+        # the request is next scheduled.
+        self.py_mm_encoder_event: Optional[torch.cuda.Event] = None
+
         if llm_request is not None:
             super().__init__(llm_request)
         else:
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -3037,6 +3037,17 @@ def append_cross_attention_state(request: LlmRequest,
                 multimodal_data=request.py_multimodal_data,
                 multimodal_runtime=py_multimodal_runtime,
                 input_ids_start_offset=context_start_idx)
+            # Transfer any cross-iter MM encoder prefetch event stamped on the request onto the
+            # freshly-built MultimodalParams. The downstream consume site reads it from the wrapper,
+            # not from the request.
+            # NOTE: the prefetch producer always writes the cached embedding into
+            # `py_multimodal_data` before stamping the event, so whenever the event is present,
+            # `has_content()` below is `True` and the wrapper reaches the consume site that waits on
+            # it.
+            mm_encoder_event = request.py_mm_encoder_event
+            if mm_encoder_event is not None:
+                multimodal_params.encoder_event = mm_encoder_event
+                request.py_mm_encoder_event = None
             if multimodal_params.has_content():
                 # TODO: Visit later to decide the appropriate position of sending multimodal data & selectively sending multimodal data
                 multimodal_params.to_device("multimodal_data",
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -47,6 +47,8 @@
 from ..distributed.communicator import ReduceOp
 from ..expert_statistic import ExpertStatistic
 from ..models.modeling_llama import Llama4ForConditionalGeneration
+from ..models.modeling_multimodal_mixin import \
+    maybe_prefetch_mm_encoder_for_next_iter
 from ..models.modeling_utils import DecoderModelForCausalLM
 from ..modules.decoder_layer import DecoderLayer
 from ..speculative.drafter import Drafter
@@ -3025,6 +3027,8 @@ def _executor_loop(self):
                             self.dwdp_manager.prefetch_first_layers()
                         batch_outputs = self._forward_step(scheduled_batch)
 
+                    self._maybe_prefetch_next_iter_mm_encoders(scheduled_batch)
+
                     guided_decoder_failed_requests = None
                     if self.guided_decoder is not None:
                         guided_decoder_failed_requests = self.guided_decoder.execute(
@@ -3150,7 +3154,7 @@ def _handle_control_request(self):
     def _sync_and_process_resource_governor_queue(self):
         """Synchronize and process resource governor requests across all ranks.
 
-        Only called when ``_resource_governor_enabled`` is ``True``.
+        Only called when ``_resource_governor_enabled`` is `True`.
         Uses a two-phase broadcast: first broadcast the count (a single int),
         then broadcast the actual requests only when count > 0.  This avoids
         serializing and deserializing an empty Python list on every iteration.
@@ -3441,6 +3445,8 @@ def _executor_loop_overlap(self):
                             scheduled_batch, previous_tensors_device,
                             num_accepted_tokens_device)
 
+                    self._maybe_prefetch_next_iter_mm_encoders(scheduled_batch)
+
                 if self.previous_batch is not None and should_process_previous_batch:
                     self._update_requests(self.previous_batch.sample_state)
 
@@ -4729,6 +4735,61 @@ def _check_disagg_gen_cache_transfer_status(self, atLeastNum: int = 0):
                     req.state = LlmRequestState.DISAGG_TRANS_ERROR
         self._check_cache_transfer_errors("generation requests")
 
+    def _maybe_prefetch_next_iter_mm_encoders(
+            self, scheduled_batch: ScheduledRequests) -> None:
+        """Best-effort hook for cross-iter MM encoder prefetch.
+
+        Called immediately after `_forward_step`, so the side-stream encoder
+        work can overlap current-iteration sampling in the non-overlap loop and
+        previous-batch `_update_requests` in the overlap loop. The env gate
+        (`TLLM_MM_SIDE_STREAM_MAX_AHEAD`) and the model-type check are not done
+        here; they are presumably enforced by the helper (confirm there).
+
+        Collects every `active_requests` entry in `CONTEXT_INIT` state plus the
+        ids of the just-scheduled batch (and, in overlap mode, the previous
+        batch), then asks `maybe_prefetch_mm_encoder_for_next_iter` to dispatch
+        at most one (`max_prefetch=1`) request that is not in flight.
+        That helper runs the encoder on a side CUDA stream and stashes
+        results back into `request.py_multimodal_data`. The next iteration's
+        `_prepare_inputs` then picks up the cached embedding, and
+        `get_multimodal_embeddings` waits on the recorded CUDA event.
+
+        Shared between `_executor_loop` (non-overlap) and
+        `_executor_loop_overlap`. `self.previous_batch` is always None in
+        non-overlap mode, so the second union term is a no-op there.
+        """
+        model = getattr(self.model_engine, "model", None)
+        if model is None:
+            return
+        in_flight = {r.py_request_id for r in scheduled_batch.all_requests()}
+        if self.previous_batch is not None:
+            in_flight |= {
+                r.py_request_id
+                for r in self.previous_batch.scheduled_requests.all_requests()
+            }
+        pending = [
+            r for r in self.active_requests
+            if r.state == LlmRequestState.CONTEXT_INIT
+        ]
+        if not pending:
+            return
+        try:
+            maybe_prefetch_mm_encoder_for_next_iter(
+                model=model,
+                pending_requests=pending,
+                in_flight_request_ids=in_flight,
+                max_prefetch=1,
+            )
+        except Exception:
+            # Speculative prefetch is best-effort and must never crash the
+            # executor loop. On failure, `py_mm_encoder_event` is not stamped,
+            # so the next iteration's `_prepare_inputs` falls back to the
+            # standard in-iter encode path (which re-runs `to_device` and the
+            # encoder unconditionally when no cached embedding is present).
+            logger.warning(
+                f"Cross-iter MM encoder prefetch failed; falling back to "
+                f"in-iter encode.\n{traceback.format_exc()}")
+
     def _forward_step(
             self,
             scheduled_requests: ScheduledRequests,
diff --git a/tensorrt_llm/inputs/multimodal.py b/tensorrt_llm/inputs/multimodal.py
@@ -503,6 +503,13 @@ class MultimodalParams:
     multimodal_data: Optional[Dict[str, Any]] = field(default_factory=dict)
     multimodal_runtime: Optional[MultimodalRuntimeData] = None
     input_ids_start_offset: int = 0
+    # CUDA event recorded on a side stream by the MM encoder prefetch path.
+    # When set, the consume site in `get_multimodal_embeddings` issues a
+    # `wait_event` on the current stream before reading cached embeddings.
+    # Always `None` unless `TLLM_MM_SIDE_STREAM_MAX_AHEAD` is positive and a prefetch ran.
+    encoder_event: Optional[torch.cuda.Event] = field(default=None,
+                                                      repr=False,
+                                                      compare=False)
 
     def __post_init__(self):
         """Ensure default values are properly set."""
diff --git a/tests/unittest/_torch/multimodal/test_mm_encoder_cross_iter_prefetch.py b/tests/unittest/_torch/multimodal/test_mm_encoder_cross_iter_prefetch.py
diff --git a/tests/unittest/_torch/multimodal/test_multimodal_mixin.py b/tests/unittest/_torch/multimodal/test_multimodal_mixin.py