|
| 1 | +from types import SimpleNamespace |
1 | 2 | from unittest.mock import Mock, patch |
2 | 3 |
|
3 | 4 | import pytest |
| 5 | +import torch |
4 | 6 |
|
5 | 7 | from tensorrt_llm._torch.models.modeling_llava_next import \ |
6 | 8 | LlavaNextInputProcessor |
| 9 | +from tensorrt_llm._torch.models.modeling_qwen2vl import \ |
| 10 | + Qwen2_5VLInputProcessorBase |
| 11 | +from tensorrt_llm._torch.models.modeling_qwen3vl import \ |
| 12 | + Qwen3VLInputProcessorBase |
7 | 13 | from tensorrt_llm.inputs.data import TextPrompt |
8 | 14 | from tensorrt_llm.sampling_params import SamplingParams |
9 | 15 |
|
@@ -54,6 +60,45 @@ def setattr_multiple(obj, attr_dict): |
54 | 60 | setattr(obj, attr_path, value) |
55 | 61 |
|
56 | 62 |
|
| 63 | +def _make_qwen_vl_config(hidden_size: int, deepstack_visual_indexes=None): |
| 64 | + text_config = SimpleNamespace(hidden_size=hidden_size, |
| 65 | + vocab_size=100, |
| 66 | + dtype=torch.float32) |
| 67 | + vision_config = SimpleNamespace( |
| 68 | + spatial_merge_size=2, |
| 69 | + temporal_patch_size=2, |
| 70 | + tokens_per_second=2, |
| 71 | + deepstack_visual_indexes=deepstack_visual_indexes or [], |
| 72 | + ) |
| 73 | + return SimpleNamespace( |
| 74 | + torch_dtype=torch.float32, |
| 75 | + text_config=text_config, |
| 76 | + vision_config=vision_config, |
| 77 | + image_token_id=12, |
| 78 | + video_token_id=14, |
| 79 | + vision_start_token_id=11, |
| 80 | + vision_end_token_id=13, |
| 81 | + ) |
| 82 | + |
| 83 | + |
| 84 | +def _make_tokenizer(input_ids): |
| 85 | + tokenizer = Mock() |
| 86 | + tokenizer.return_value.input_ids = [torch.tensor(input_ids)] |
| 87 | + return tokenizer |
| 88 | + |
| 89 | + |
def _make_qwen_vl_processor(processor_cls, config, tokenizer):
    """Instantiate ``processor_cls`` with ``AutoProcessor`` patched out.

    ``AutoProcessor`` in ``tensorrt_llm._torch.models.modeling_qwen2vl`` is
    replaced by a mock for the duration of the constructor call, and its
    ``from_pretrained`` returns a plain ``Mock``.

    Args:
        processor_cls: Input-processor class to construct.
        config: Model config passed through as ``config``.
        tokenizer: Tokenizer passed through as ``tokenizer``.

    Returns:
        The constructed ``processor_cls`` instance.
    """
    # NOTE(review): only the qwen2vl module's AutoProcessor is patched;
    # presumably the Qwen3 processor resolves it through that module too.
    # Confirm against modeling_qwen3vl.
    auto_processor_target = (
        "tensorrt_llm._torch.models.modeling_qwen2vl.AutoProcessor")
    init_kwargs = {
        "model_path": "dummy_path",
        "config": config,
        "tokenizer": tokenizer,
        "trust_remote_code": True,
    }
    with patch(auto_processor_target) as mock_auto_processor:
        mock_auto_processor.from_pretrained.return_value = Mock()
        processor = processor_cls(**init_kwargs)
    return processor
| 100 | + |
| 101 | + |
57 | 102 | @pytest.fixture(params=["LlavaNextInputProcessor"]) |
58 | 103 | def processor_setup(request): |
59 | 104 | """Fixture to set up different input processors based on the parameter.""" |
@@ -192,5 +237,74 @@ def test_attach_multimodal_embeddings_multiple_images( |
192 | 237 | text_prompt["prompt"], return_tensors="pt") |
193 | 238 |
|
194 | 239 |
|
@pytest.mark.parametrize(
    "processor_cls,deepstack_visual_indexes",
    [
        pytest.param(Qwen2_5VLInputProcessorBase, [], id="qwen2.5-vl"),
        pytest.param(Qwen3VLInputProcessorBase, [0], id="qwen3-vl"),
    ],
)
def test_qwen_vl_attach_multimodal_embeddings_builds_mrope_config(
    processor_cls,
    deepstack_visual_indexes,
):
    """Check ``attach_multimodal_embeddings`` for the Qwen-VL processors.

    For each parametrized processor class the test feeds one precomputed
    image embedding with four rows and verifies that:

    * the single image token in the tokenized prompt is expanded to four
      multimodal placeholder ids,
    * the embedding is returned under ``multimodal_data``,
    * ``mrope_config`` carries the expected position ids and deltas.
    """
    hidden_size = 8
    config = _make_qwen_vl_config(hidden_size, deepstack_visual_indexes)
    # Embedding rows are widened by one extra hidden_size chunk per
    # deepstack index.
    expected_embedding_width = hidden_size * (1 + len(deepstack_visual_indexes))
    tokenizer = _make_tokenizer([
        7,
        config.vision_start_token_id,
        config.image_token_id,
        config.vision_end_token_id,
        8,
    ])
    processor = _make_qwen_vl_processor(processor_cls, config, tokenizer)

    num_image_tokens = 4
    image_embedding = torch.arange(
        num_image_tokens * expected_embedding_width,
        dtype=torch.float32,
    ).reshape(num_image_tokens, expected_embedding_width)

    prompt_token_ids, extra_processed_inputs = processor.attach_multimodal_embeddings(
        {"prompt": "Describe this image."},
        {"image": [image_embedding]},
        SamplingParams(),
    )

    # One image token in the prompt becomes one placeholder per embedding row.
    placeholder_id = processor.tllm_multimodal_token_id
    assert prompt_token_ids == [
        7,
        config.vision_start_token_id,
        placeholder_id,
        placeholder_id,
        placeholder_id,
        placeholder_id,
        config.vision_end_token_id,
        8,
    ]
    tokenizer.assert_called_once_with("Describe this image.",
                                      return_tensors="pt")

    multimodal_data = extra_processed_inputs["multimodal_data"]
    # Comparing lists of tensors with ``==`` only succeeds through Python's
    # identity shortcut; for a non-identical tensor it raises "Boolean value
    # of Tensor ... is ambiguous" instead of failing cleanly. Compare the
    # container and the tensor contents explicitly.
    embeddings = multimodal_data["multimodal_embedding"]
    assert isinstance(embeddings, list)
    assert len(embeddings) == 1
    torch.testing.assert_close(embeddings[0],
                               image_embedding,
                               rtol=0,
                               atol=0)

    mrope_config = multimodal_data["mrope_config"]
    # Expected shape is (3, 1, 8): three position-id rows over the eight
    # prompt tokens. Presumably the rows are the temporal/height/width
    # M-RoPE axes; confirm against the processor implementation.
    expected_position_ids = torch.tensor(
        [
            [[0, 1, 2, 2, 2, 2, 4, 5]],
            [[0, 1, 2, 2, 3, 3, 4, 5]],
            [[0, 1, 2, 3, 2, 3, 4, 5]],
        ],
        dtype=torch.long,
    )
    torch.testing.assert_close(mrope_config["mrope_position_ids"],
                               expected_position_ids)
    torch.testing.assert_close(
        mrope_config["mrope_position_deltas"],
        torch.tensor([[-2]], dtype=torch.int32),
    )
| 307 | + |
| 308 | + |
if __name__ == "__main__":
    # Allow running this test module directly as a script.
    pytest.main(args=[__file__])
0 commit comments