[None][refactor] Add explicit Qwen VL LLM compile hook

yechank-nvidia · 2ez4bz · commit b03659475e61 · 2026-06-12T10:26:57.000-07:00
Signed-off-by: yechank <161688079+yechank-nvidia@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/models/modeling_qwen2vl.py b/tensorrt_llm/_torch/models/modeling_qwen2vl.py
@@ -1373,6 +1373,14 @@ def vocab_size_padded(self) -> int:
     def infer_max_seq_len(self) -> int:
         return self.llm.infer_max_seq_len()
 
+    def apply_llm_torch_compile(self, *, backend: Any, fullgraph: bool) -> None:
+        """Compile only the LLM decoder; the vision encoder stays eager."""
+        # TODO: Move this hook to MultimodalModelMixin once multimodal models
+        # consistently expose an LLM compile contract.
+        self.llm.model = torch.compile(self.llm.model,
+                                       backend=backend,
+                                       fullgraph=fullgraph)
+
     @nvtx_range("Qwen2.5-VL prepare_mrope_config")
     def prepare_mrope_config(
             self,
diff --git a/tensorrt_llm/_torch/models/modeling_qwen3vl.py b/tensorrt_llm/_torch/models/modeling_qwen3vl.py
@@ -1121,6 +1121,12 @@ def vocab_size_padded(self) -> int:
     def infer_max_seq_len(self) -> int:
         return self.llm.infer_max_seq_len()
 
+    def apply_llm_torch_compile(self, *, backend: Any, fullgraph: bool) -> None:
+        """Compile only the LLM decoder; the vision encoder stays eager."""
+        # TODO: Move this hook to MultimodalModelMixin once multimodal models
+        # consistently expose an LLM compile contract.
+        self.llm.model = torch.compile(self.llm.model, backend=backend, fullgraph=fullgraph)
+
     def init_mrope_embedding(self, model_config: ModelConfig[PretrainedConfig]):
         config = model_config.pretrained_config.text_config
         pos_embd_params = PositionalEmbeddingParams(
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -458,24 +458,20 @@ def __init__(
                     capture_num_tokens=self._piecewise_cuda_graph_num_tokens,
                     max_num_streams=torch_compile_max_num_streams,
                     mapping=self.mapping)
+                apply_llm_torch_compile = getattr(self.model,
+                                                  "apply_llm_torch_compile",
+                                                  None)
                 if isinstance(self.model, DecoderModelForCausalLM):
                     self.model.model = torch.compile(
                         self.model.model,
                         backend=self._torch_compile_backend,
                         fullgraph=torch_compile_fullgraph)
-                elif hasattr(self.model, "llm") and isinstance(
-                        getattr(self.model.llm, "model", None),
-                        torch.nn.Module):
-                    # Multi-modal wrapper (e.g. Qwen2/3-VL): compile only the
-                    # text decoder. Tracing the outer wrapper pulls the
-                    # vision-tower output path + `fuse_input_embeds` into
-                    # the same graph, which lets the vision hidden_dim
-                    # propagate into the LM o_proj fake-tensor trace and
-                    # blows up the piecewise CUDA graph warmup.
-                    self.model.llm.model = torch.compile(
-                        self.model.llm.model,
-                        backend=self._torch_compile_backend,
-                        fullgraph=torch_compile_fullgraph)
+                elif callable(apply_llm_torch_compile):
+                    # TODO: Move this contract to MultimodalModelMixin once
+                    # multimodal models consistently expose their LLM compile
+                    # scope through the mixin.
+                    apply_llm_torch_compile(backend=self._torch_compile_backend,
+                                            fullgraph=torch_compile_fullgraph)
                 else:
                     self.model = torch.compile(
                         self.model,