[None][fix] Support Qwen VL image embedding inputs

yechank-nvidia · 2ez4bz · commit 1dea0dd2466b · 2026-06-12T10:26:57.000-07:00
Signed-off-by: yechank <161688079+yechank-nvidia@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/models/modeling_qwen2vl.py b/tensorrt_llm/_torch/models/modeling_qwen2vl.py
@@ -476,6 +476,83 @@ def get_mrope_config(
             'cpu').to(torch.int32).clone()
         return mrope_config
 
+    @staticmethod
+    def _infer_image_grid_thw(num_tokens: int,
+                              spatial_merge_size: int) -> List[int]:
+        """Guess a ``[t, h, w]`` grid for ``num_tokens`` image tokens.
+
+        Uses the most square-like factor pair scaled by ``spatial_merge_size``
+        and ``t`` = 1; raises ``ValueError`` if ``num_tokens`` is not positive.
+        """
+        if num_tokens <= 0:
+            raise ValueError(
+                f"Image embedding must contain at least one token, got {num_tokens}"
+            )
+        candidates = range(int(num_tokens**0.5), 0, -1)  # 1 always divides
+        rows = next(h for h in candidates if num_tokens % h == 0)
+        cols = num_tokens // rows
+        return [1, rows * spatial_merge_size, cols * spatial_merge_size]
+
+    def _attach_multimodal_embeddings_impl(
+        self,
+        inputs: TextPrompt,
+        multimodal_embedding: Dict[str, List[torch.Tensor]],
+        sampling_params: SamplingParams,
+    ) -> Tuple[List[int], Optional[ExtraProcessedInputs]]:
+        """Build prompt token ids and multimodal data from image embeddings.
+
+        ``multimodal_embedding`` must be ``{"image": x}`` where ``x`` is one
+        rank-2 tensor or a non-empty list of them; a failed check raises
+        ``ValueError``. ``NotImplementedError`` is raised when ``self`` has no
+        callable ``get_prompt_token_ids``. ``sampling_params`` is not used here.
+        Returns the prompt token ids and ``{"multimodal_data": ...}`` holding
+        the embeddings and the mrope config.
+        """
+        if not isinstance(multimodal_embedding, dict):
+            raise ValueError("multimodal_embedding must be a dictionary")
+        if set(multimodal_embedding) != {"image"}:
+            raise ValueError(
+                "Only image modality is supported for external multimodal embedding"
+            )
+
+        image_embeddings = multimodal_embedding["image"]
+        if isinstance(image_embeddings, torch.Tensor):
+            image_embeddings = [image_embeddings]
+        if not image_embeddings:
+            raise ValueError("At least one image embedding is required")
+        for index, image_embedding in enumerate(image_embeddings):
+            if image_embedding.dim() != 2:
+                raise ValueError(
+                    f"Image embedding {index} must be rank 2, got shape {tuple(image_embedding.shape)}"
+                )
+
+        get_prompt_token_ids = getattr(self, "get_prompt_token_ids", None)
+        if not callable(get_prompt_token_ids):
+            raise NotImplementedError(
+                f"{type(self).__name__} does not support external multimodal embeddings"
+            )
+        # Only the embedding shapes are handed over, not the tensors themselves.
+        handles = [{"tensor_size": tuple(e.shape)} for e in image_embeddings]
+        prompt_token_ids, _, _ = get_prompt_token_ids(inputs, handles)
+
+        # torch.tensor() copies its input, so no extra clone() is needed here.
+        mrope_ids = torch.tensor(prompt_token_ids,
+                                 dtype=torch.long).unsqueeze(0)
+        # Rewrite tllm_multimodal_token_id entries to config.image_token_id.
+        mm_token_mask = mrope_ids == self.tllm_multimodal_token_id
+        mrope_ids[mm_token_mask] = self.config.image_token_id
+        # Grid sizes are inferred from each embedding's row count alone.
+        merge_size = self.config.vision_config.spatial_merge_size
+        image_grid_thw = torch.tensor([
+            self._infer_image_grid_thw(e.shape[0], merge_size)
+            for e in image_embeddings
+        ], dtype=torch.long)
+        mrope_config = self.get_mrope_config(
+            mrope_ids, image_grid_thw, None, torch.ones_like(mrope_ids), None)
+        multimodal_data = {"multimodal_embedding": image_embeddings,
+                           "mrope_config": mrope_config}
+        return prompt_token_ids, {"multimodal_data": multimodal_data}
+
     @nvtx_range("Qwen2VLInputProcessorBase forward()")
     @torch.inference_mode()
     def call_with_text_prompt(
diff --git a/tests/unittest/llmapi/apps/_attach_multimodal_embeddings_patch/__init__.py b/tests/unittest/llmapi/apps/_attach_multimodal_embeddings_patch/__init__.py
diff --git a/tests/unittest/llmapi/apps/_test_openai_chat_multimodal.py b/tests/unittest/llmapi/apps/_test_openai_chat_multimodal.py
@@ -1,6 +1,5 @@
 import io
 import os
-import sys
 import tempfile
 from base64 import b64encode
 from pathlib import Path
@@ -147,35 +146,14 @@ def temp_extra_encoder_options_file() -> str:
     return "/dummy/path"
 
 
-@pytest.fixture(scope="module")
-def server_patched(model_name: str, temp_extra_llm_api_options_file: str):
-    # Custom module implements missing 'attach_multimodal_embeddings' to intercept
-    # embeddings.
-    model_path = get_model_path(model_name)
-    args = [
-        "--extra_llm_api_options",
-        temp_extra_llm_api_options_file,
-        "--max_batch_size",
-        "64",
-        "--max_num_tokens",
-        "16384",
-        "--custom_module_dirs",
-        str(
-            Path(sys.modules[test_single_chat_session_image_embeds.__module__].
-                 __file__).parent / "_attach_multimodal_embeddings_patch"),
-    ]
-    with RemoteOpenAIServer(model_path, args) as remote_server:
-        yield remote_server
-
-
 @pytest.mark.needs_l40s
 @pytest.mark.asyncio(loop_scope="module")
 def test_single_chat_session_image_embeds(
-    server_patched: RemoteOpenAIServer,
+    server: RemoteOpenAIServer,
     model_name: str,
     mm_encoder_server: RemoteMMEncoderServer,
 ):
-    client = server_patched.get_client()
+    client = server.get_client()
     messages, mm_embed_handle = _test_multimodal_content_mm_encoder(
         mm_encoder_server.get_client(), model_name)
 
@@ -201,30 +179,15 @@ def test_single_chat_session_image_embeds(
         "data": b64encode(mm_embed_bytes).decode("ascii")
     }
 
-    # test single completion
-    #
-    # FIXME: Remove try-except and use 'server' instead of 'server_patched',
-    #        once Qwen2VLInputProcessorBase implements attach_multimodal_embeddings.
-    try:
-        chat_completion_embeds = client.chat.completions.create(
-            model=model_name,
-            messages=messages,
-            max_completion_tokens=max_completion_tokens,
-            temperature=0.0,
-            logprobs=False)
-
-        assert chat_completion_embeds.choices[
-            0].message == chat_completion_image.choices[0].message
-    except openai.BadRequestError as e:
-        assert isinstance(e.body, dict)
-        with open(Path(e.body["message"]), "rb") as f:
-            intercepted_embeddings = torch.load(f, weights_only=True)
-        assert list(intercepted_embeddings.keys()) == ["image"]
-        assert len(intercepted_embeddings["image"]) == 1
-        torch.testing.assert_close(intercepted_embeddings["image"][0],
-                                   mm_embed.cpu())
-        pytest.xfail(
-            reason="Model does not implement 'attach_multimodal_embeddings'")
+    chat_completion_embeds = client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_completion_tokens=max_completion_tokens,
+        temperature=0.0,
+        logprobs=False)
+
+    assert chat_completion_embeds.choices[
+        0].message == chat_completion_image.choices[0].message
 
 
 @pytest.mark.asyncio(loop_scope="module")