[EXPERIMENT][WIP][OpenVINO] Add support for Qwen3-VL-Embedding#1723
[EXPERIMENT][WIP][OpenVINO] Add support for Qwen3-VL-Embedding #1723 — mlukasze wants to merge 5 commits into
Conversation
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
|
The docs for this PR live here. All of your documentation changes will be reflected on that endpoint. The docs are available until 30 days after the last update. |
There was a problem hiding this comment.
Pull request overview
Adds OpenVINO feature-extraction enablement for Qwen3-VL (including Qwen3-VL-Embedding) by wiring task registration/export configs and providing an OpenVINO runtime wrapper for loading and running the exported IR, plus integration/exporter CLI tests and a small docs update.
Changes:
- Register/extend OpenVINO exporter configs to support `qwen3_vl` feature-extraction and export the language backbone for embeddings.
- Add a dedicated OpenVINO runtime class `_OVQwen3VLForFeatureExtraction` and route `OVModelForFeatureExtraction` loading/exporting for `model_type == "qwen3_vl"`.
- Add targeted integration and CLI exporter tests; update docs to mention Qwen3-VL-Embedding.
Reviewed changes
Copilot reviewed 6 out of 6 changed files in this pull request and generated 3 comments.
Show a summary per file
| File | Description |
|---|---|
| tests/openvino/test_modeling.py | Adds an integration test comparing OpenVINO vs Transformers for Qwen3-VL feature extraction (text-only). |
| tests/openvino/test_exporters_cli.py | Adds a CLI export test for qwen3_vl feature-extraction and verifies last_hidden_state output. |
| optimum/intel/openvino/modeling.py | Routes Qwen3-VL feature-extraction load/export through a dedicated visual-language OpenVINO implementation and disables cache. |
| optimum/intel/openvino/modeling_visual_language.py | Introduces _OVQwen3VLForFeatureExtraction implementation returning BaseModelOutput(last_hidden_state=...). |
| optimum/exporters/openvino/model_configs.py | Registers qwen3_vl_text feature-extraction export config and adds helper path for VLM text feature configs; enables qwen3_vl feature-extraction task. |
| docs/source/openvino/models.mdx | Updates supported architecture list to explicitly include Qwen3-VL-Embedding. |
💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.
| ): | ||
| internal_export_config = get_vlm_internal_text_feature_config(model_type, model_config, int_dtype, float_dtype) | ||
| export_config = LMInputEmbedsConfigHelper( | ||
| internal_export_config, | ||
| patcher_cls=model_patcher, | ||
| dummy_input_generator=dummy_input_generator, | ||
| inputs_update=inputs_update, | ||
| ) | ||
| export_config._normalized_config = internal_export_config._normalized_config | ||
| return export_config |
| def forward( | ||
| self, | ||
| input_ids, | ||
| pixel_values=None, | ||
| past_key_values=None, | ||
| inputs_embeds=None, | ||
| image_sizes=None, | ||
| attention_mask=None, | ||
| position_ids=None, | ||
| image_bound=None, | ||
| tgt_sizes=None, | ||
| pixel_values_videos=None, | ||
| image_grid_thw=None, | ||
| video_grid_thw=None, | ||
| rope_deltas=None, | ||
| images=None, | ||
| second_per_grid_ts=None, | ||
| token_type_ids=None, | ||
| pixel_attention_mask=None, | ||
| input_image_embeds: Optional[torch.FloatTensor] = None, | ||
| image_pixel_values: Optional[torch.FloatTensor] = None, | ||
| image_attention_mask=None, | ||
| audio_input_features: Optional[torch.FloatTensor] = None, | ||
| input_audio_embeds: Optional[torch.FloatTensor] = None, | ||
| audio_embed_sizes=None, | ||
| audio_attention_mask=None, | ||
| input_mode=None, | ||
| **kwargs, | ||
| ): | ||
| if pixel_values is None: | ||
| pixel_values = images if images is not None else image_pixel_values | ||
| inputs_embeds, attention_mask, position_ids, *extra_outputs = self.get_multimodal_embeddings( | ||
| input_ids, | ||
| pixel_values, | ||
| inputs_embeds=inputs_embeds, | ||
| image_sizes=image_sizes, | ||
| attention_mask=attention_mask, | ||
| position_ids=position_ids, | ||
| past_key_values=past_key_values, | ||
| image_bound=image_bound, | ||
| tgt_sizes=tgt_sizes, | ||
| pixel_values_videos=pixel_values_videos, | ||
| image_grid_thw=image_grid_thw, | ||
| video_grid_thw=video_grid_thw, | ||
| rope_deltas=rope_deltas, | ||
| second_per_grid_ts=second_per_grid_ts, | ||
| pixel_attention_mask=pixel_attention_mask, | ||
| input_image_embeds=input_image_embeds, | ||
| image_attention_mask=image_attention_mask, | ||
| input_audio_embeds=input_audio_embeds if input_audio_embeds is not None else audio_input_features, | ||
| audio_embed_sizes=audio_embed_sizes, | ||
| audio_attention_mask=audio_attention_mask, | ||
| input_mode=input_mode, | ||
| **kwargs, | ||
| ) | ||
|
|
||
| additional_inputs = {} | ||
| if extra_outputs: | ||
| additional_inputs["visual_pos_masks"] = extra_outputs[0] | ||
| additional_inputs["deepstack_visual_embeds"] = extra_outputs[1] | ||
|
|
||
| self.language_model.compile() | ||
| np_inputs = isinstance(inputs_embeds, np.ndarray) | ||
| inputs = self.language_model.prepare_inputs( | ||
| input_ids=input_ids, | ||
| attention_mask=attention_mask, | ||
| position_ids=position_ids, | ||
| past_key_values=None, | ||
| inputs_embeds=inputs_embeds, | ||
| token_type_ids=token_type_ids, | ||
| **additional_inputs, | ||
| ) | ||
| self.language_model.request.start_async(inputs, share_inputs=True) | ||
| self.language_model.request.wait() | ||
| outputs = self.language_model.request.get_tensor("last_hidden_state").data | ||
| last_hidden_state = outputs if np_inputs else torch.from_numpy(outputs).clone().to(self.device) | ||
| return BaseModelOutput(last_hidden_state=last_hidden_state) | ||
|
|
| self.assertIn("last_hidden_state", ov_outputs) | ||
| self.assertTrue( | ||
| torch.allclose(torch.Tensor(ov_outputs.last_hidden_state), transformers_outputs.last_hidden_state, atol=1e-4) | ||
| ) | ||
|
|
| else: | ||
| return super()._from_pretrained(model_id, config, *args, **kwargs) | ||
| if config.model_type == "qwen3_vl": | ||
| from .modeling_visual_language import _OVQwen3VLForFeatureExtraction |
There was a problem hiding this comment.
Let's move this import to the top of the file to stay consistent with what is done for SAM.
| @classmethod | ||
| def _export(cls, model_id: str, config: PretrainedConfig, *args, **kwargs): | ||
| if config.model_type == "qwen3_vl": | ||
| from .modeling_visual_language import _OVQwen3VLForFeatureExtraction |
| return common_inputs | ||
|
|
||
|
|
||
| @register_in_tasks_manager("qwen3_vl_text", *["feature-extraction"], library_name="transformers") |
There was a problem hiding this comment.
feature-extraction-with-past?
|
|
||
|
|
||
| @unittest.skipIf(is_transformers_version("<", "4.57.0"), reason="Qwen3-VL requires transformers >= 4.57.0") | ||
| class OVQwen3VLFeatureExtractionIntegrationTest(unittest.TestCase): |
There was a problem hiding this comment.
Please develop a generic test suite for the feature-extraction task.
|
|
||
|
|
||
| @unittest.skipIf(is_transformers_version("<", "4.57.0"), reason="Qwen3-VL requires transformers >= 4.57.0") | ||
| class OVQwen3VLFeatureExtractionIntegrationTest(unittest.TestCase): |
There was a problem hiding this comment.
Please extend the existing test, instead of creating a new one:
optimum-intel/tests/openvino/test_modeling.py
Line 1032 in fb79914
| self.assertFalse("last_hidden_state" in model.output_names) | ||
|
|
||
| @unittest.skipIf(is_transformers_version("<", "4.57.0"), reason="Qwen3-VL requires transformers >= 4.57.0") | ||
| def test_exporters_cli_qwen3_vl_feature_extraction(self): |
There was a problem hiding this comment.
Please extend the existing export test; do not create a separate test:
What does this PR do?
Add OpenVINO feature-extraction support for Qwen/Qwen3-VL-Embedding-2B by registering the qwen3_vl task, exporting the Qwen3-VL language backbone without generation cache, and loading the exported IR through OVModelForFeatureExtraction. The patch also adds focused export and modeling tests plus a docs update for Qwen3-VL-Embedding support.
Installation instructions
Exporting cmd-line
Inference script
Before submitting