Skip to content

Commit 2a9e003

Browse files
yechank-nvidia2ez4bz
authored and committed
[None][test] Cover Qwen VL image embedding attach
Signed-off-by: yechank <161688079+yechank-nvidia@users.noreply.github.com>
1 parent 1dea0dd commit 2a9e003

2 files changed

Lines changed: 116 additions & 1 deletion

File tree

tests/unittest/_torch/multimodal/test_external_embedding.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
1+
from types import SimpleNamespace
12
from unittest.mock import Mock, patch
23

34
import pytest
5+
import torch
46

57
from tensorrt_llm._torch.models.modeling_llava_next import \
68
LlavaNextInputProcessor
9+
from tensorrt_llm._torch.models.modeling_qwen2vl import \
10+
Qwen2_5VLInputProcessorBase
11+
from tensorrt_llm._torch.models.modeling_qwen3vl import \
12+
Qwen3VLInputProcessorBase
713
from tensorrt_llm.inputs.data import TextPrompt
814
from tensorrt_llm.sampling_params import SamplingParams
915

@@ -54,6 +60,45 @@ def setattr_multiple(obj, attr_dict):
5460
setattr(obj, attr_path, value)
5561

5662

63+
def _make_qwen_vl_config(hidden_size: int, deepstack_visual_indexes=None):
    """Build a minimal stand-in for a Qwen VL model config.

    The result is a plain ``SimpleNamespace`` tree carrying only the fields
    set below, so no real checkpoint or config class is required.

    Args:
        hidden_size: Value stored as ``text_config.hidden_size``.
        deepstack_visual_indexes: Optional sequence stored (as a fresh list)
            under ``vision_config.deepstack_visual_indexes``. ``None`` or an
            empty sequence yields ``[]``.

    Returns:
        A ``SimpleNamespace`` with ``torch_dtype``, ``text_config``,
        ``vision_config`` and the image/video/vision-start/vision-end token
        ids.
    """
    text_config = SimpleNamespace(
        hidden_size=hidden_size,
        vocab_size=100,
        dtype=torch.float32,
    )
    vision_config = SimpleNamespace(
        spatial_merge_size=2,
        temporal_patch_size=2,
        tokens_per_second=2,
        # Copy so the config never aliases the caller's list object; a
        # mutation through the config must not leak back to the caller.
        deepstack_visual_indexes=list(deepstack_visual_indexes or []),
    )
    return SimpleNamespace(
        torch_dtype=torch.float32,
        text_config=text_config,
        vision_config=vision_config,
        image_token_id=12,
        video_token_id=14,
        vision_start_token_id=11,
        vision_end_token_id=13,
    )
82+
83+
84+
def _make_tokenizer(input_ids):
    """Return a ``Mock`` tokenizer whose call yields fixed ``input_ids``.

    Calling the returned mock (with any arguments) gives an object whose
    ``input_ids`` attribute is a one-element list holding
    ``torch.tensor(input_ids)``.

    Args:
        input_ids: Token ids to wrap in a tensor.

    Returns:
        The configured ``Mock``; call assertions such as
        ``assert_called_once_with`` remain available on it.
    """
    tokenizer = Mock()
    tokenizer.return_value.input_ids = [torch.tensor(input_ids)]
    return tokenizer
88+
89+
90+
def _make_qwen_vl_processor(processor_cls, config, tokenizer):
    """Instantiate ``processor_cls`` with ``AutoProcessor`` patched out.

    ``AutoProcessor`` in ``modeling_qwen2vl`` is replaced for the duration of
    the constructor call, and its ``from_pretrained`` returns a bare ``Mock``,
    so ``"dummy_path"`` is handed to the mock rather than to the real loader.

    Args:
        processor_cls: Input-processor class to construct.
        config: Model config object passed through as ``config``.
        tokenizer: Tokenizer object passed through as ``tokenizer``.

    Returns:
        The constructed ``processor_cls`` instance.
    """
    # NOTE(review): only the modeling_qwen2vl reference is patched, also when
    # processor_cls is the Qwen3 class -- presumably that class resolves
    # AutoProcessor through modeling_qwen2vl; confirm against the model code.
    with patch("tensorrt_llm._torch.models.modeling_qwen2vl.AutoProcessor"
               ) as mock_auto_processor:
        mock_auto_processor.from_pretrained.return_value = Mock()
        return processor_cls(
            model_path="dummy_path",
            config=config,
            tokenizer=tokenizer,
            trust_remote_code=True,
        )
100+
101+
57102
@pytest.fixture(params=["LlavaNextInputProcessor"])
58103
def processor_setup(request):
59104
"""Fixture to set up different input processors based on the parameter."""
@@ -192,5 +237,74 @@ def test_attach_multimodal_embeddings_multiple_images(
192237
text_prompt["prompt"], return_tensors="pt")
193238

194239

240+
@pytest.mark.parametrize(
    "processor_cls,deepstack_visual_indexes",
    [
        pytest.param(Qwen2_5VLInputProcessorBase, [], id="qwen2.5-vl"),
        pytest.param(Qwen3VLInputProcessorBase, [0], id="qwen3-vl"),
    ],
)
def test_qwen_vl_attach_multimodal_embeddings_builds_mrope_config(
    processor_cls,
    deepstack_visual_indexes,
):
    """Check ``attach_multimodal_embeddings`` on the Qwen VL processors.

    With a single pre-computed image embedding of four rows, the test expects:
      * the one image token in the prompt to be replaced by four
        ``tllm_multimodal_token_id`` placeholders,
      * the tokenizer to be called exactly once with the raw prompt,
      * the embedding to be returned as a one-element list under
        ``multimodal_data["multimodal_embedding"]``,
      * ``multimodal_data["mrope_config"]`` to carry the expected position
        ids and position deltas.
    """
    hidden_size = 8
    config = _make_qwen_vl_config(hidden_size, deepstack_visual_indexes)
    # One hidden_size-wide slice, plus one more per deepstack index.
    # NOTE(review): presumably deepstack features are concatenated along the
    # feature dimension -- confirm against the processor implementation.
    expected_embedding_width = hidden_size * (1 + len(deepstack_visual_indexes))
    tokenizer = _make_tokenizer([
        7,
        config.vision_start_token_id,
        config.image_token_id,
        config.vision_end_token_id,
        8,
    ])
    processor = _make_qwen_vl_processor(processor_cls, config, tokenizer)

    num_image_tokens = 4
    image_embedding = torch.arange(
        num_image_tokens * expected_embedding_width,
        dtype=torch.float32,
    ).reshape(num_image_tokens, expected_embedding_width)

    prompt_token_ids, extra_processed_inputs = processor.attach_multimodal_embeddings(
        {"prompt": "Describe this image."},
        {"image": [image_embedding]},
        SamplingParams(),
    )

    placeholder_id = processor.tllm_multimodal_token_id
    assert prompt_token_ids == [
        7,
        config.vision_start_token_id,
        placeholder_id,
        placeholder_id,
        placeholder_id,
        placeholder_id,
        config.vision_end_token_id,
        8,
    ]
    tokenizer.assert_called_once_with("Describe this image.",
                                      return_tensors="pt")

    multimodal_data = extra_processed_inputs["multimodal_data"]
    # Compare tensors explicitly: ``list == list`` on tensors only succeeds
    # through the identity shortcut and otherwise raises an ambiguous-boolean
    # RuntimeError instead of producing a readable assertion failure.
    attached_embeddings = multimodal_data["multimodal_embedding"]
    assert isinstance(attached_embeddings, list)
    assert len(attached_embeddings) == 1
    torch.testing.assert_close(attached_embeddings[0], image_embedding)

    mrope_config = multimodal_data["mrope_config"]
    # Three rows of position ids over the eight output tokens; only the four
    # placeholder positions differ between rows.
    # NOTE(review): rows presumably map to temporal/height/width -- confirm.
    expected_position_ids = torch.tensor(
        [
            [[0, 1, 2, 2, 2, 2, 4, 5]],
            [[0, 1, 2, 2, 3, 3, 4, 5]],
            [[0, 1, 2, 3, 2, 3, 4, 5]],
        ],
        dtype=torch.long,
    )
    torch.testing.assert_close(mrope_config["mrope_position_ids"],
                               expected_position_ids)
    torch.testing.assert_close(
        mrope_config["mrope_position_deltas"],
        torch.tensor([[-2]], dtype=torch.int32),
    )
307+
308+
195309
if __name__ == "__main__":
196310
pytest.main([__file__])

tests/unittest/_torch/multimodal/test_mm_encoder_standalone.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -953,7 +953,8 @@ def test_pd_disagg_with_image_input(
953953
pytest.param(_LLAVA_DIR, True, False, id="llava_7b-encoder_embeddings"),
954954
# Encoder embeddings routed back through default_multimodal_input_loader.
955955
pytest.param(_LLAVA_DIR, True, True, id="llava_7b-loader_embeddings"),
956-
# Qwen models don't implement attach_multimodal_embeddings, so only the raw path is exercised.
956+
# Keep Qwen rows on the raw path here; Qwen image-embedding attach is
957+
# covered by dedicated input-processor and OpenAI image_embeds tests.
957958
pytest.param(_QWEN_2_5_VL_DIR, False, False,
958959
id="qwen2.5_3b-raw_inputs"),
959960
pytest.param(_QWEN_3_VL_DIR, False, False, id="qwen3_2b-raw_inputs"),

0 commit comments

Comments
 (0)