diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx index b46742639e..72255b55a3 100644 --- a/docs/source/openvino/models.mdx +++ b/docs/source/openvino/models.mdx @@ -136,7 +136,7 @@ Here is the list of the supported architectures : - Qwen2.5-VL - Qwen3 - Qwen3MoE -- Qwen3-VL +- Qwen3-VL (including Qwen3-VL-Embedding) - Qwen3.5 - Qwen3.5-MoE - Qwen3.6 diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index fddd840b7d..2a1f51660b 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -368,7 +368,6 @@ def export_pytorch( if input_shapes is None: input_shapes = {} # will use the defaults from DEFAULT_DUMMY_SHAPES - # Check that inputs match, and order them properly dummy_inputs = config.generate_dummy_inputs(framework="pt", **input_shapes) device = torch.device(device) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 41b91ae65b..e91db9b181 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -5746,7 +5746,9 @@ class SiglipOpenVINOConfig(SiglipOnnxConfig): "transformer", *["feature-extraction", "sentence-similarity"], library_name="sentence_transformers" ) class SentenceTransformersTransformerOpenVINOConfig(SentenceTransformersTransformerOnnxConfig): - pass + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + config.vocab_size = config.get_text_config().vocab_size @register_in_tasks_manager("rembert", *COMMON_TEXT_TASKS) diff --git a/optimum/intel/openvino/modeling_sentence_transformers.py b/optimum/intel/openvino/modeling_sentence_transformers.py index 74ebd7330c..ea4e4bfda6 100644 --- a/optimum/intel/openvino/modeling_sentence_transformers.py +++ b/optimum/intel/openvino/modeling_sentence_transformers.py @@ -1,14 +1,20 @@ +import json +import os from pathlib import Path from types import MethodType -from typing import Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np import torch +from huggingface_hub import hf_hub_download from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE +from huggingface_hub.utils import EntryNotFoundError from sentence_transformers import SentenceTransformer -from transformers import AutoTokenizer, PretrainedConfig +from transformers import AutoProcessor, AutoTokenizer, PretrainedConfig from transformers.file_utils import add_start_docstrings +from optimum.intel.utils.import_utils import is_sentence_transformers_version + from .configuration import OVQuantizationConfigBase from .modeling import MODEL_START_DOCSTRING, OVModel @@ -27,7 +33,17 @@ def __init__(self, model=None, config=None, tokenizer=None, **kwargs): super().__init__(model, config, **kwargs) self.encode = MethodType(SentenceTransformer.encode, self) - self._text_length = MethodType(SentenceTransformer._text_length, self) + if is_sentence_transformers_version(">=", "5.4.0"): + self.supports = MethodType(SentenceTransformer.supports, self) + self._input_length = SentenceTransformer._input_length + self._resolve_prompt = MethodType(SentenceTransformer._resolve_prompt, self) + self.is_singular_input = MethodType(SentenceTransformer.is_singular_input, self) + self.modalities = ["text", "image", "video", "message"] + self.default_prompt_name = kwargs.get("default_prompt_name", None) + self.prompts = kwargs.get("prompts", {}) or {} + self.processor = kwargs.get("processor", None) + else: + self._text_length = MethodType(SentenceTransformer._text_length, self) self.default_prompt_name = None self.truncate_dim = None self.tokenizer = tokenizer @@ -36,6 +52,9 @@ def _save_pretrained(self, save_directory: Union[str, Path]): super()._save_pretrained(save_directory) self.tokenizer.save_pretrained(save_directory) + def _can_flatten_inputs(self): + return False + def forward(self, inputs: Dict[str, torch.Tensor]): self.compile() input_ids = inputs.get("input_ids") @@ -94,6 +113,50 @@ def _from_pretrained( tokenizer = AutoTokenizer.from_pretrained(model_id, **tokenizer_args) + if is_sentence_transformers_version(">=", "5.4.0"): + processor = None + try: + processor = AutoProcessor.from_pretrained(model_id, **tokenizer_args) + except (OSError, ValueError, KeyError, EnvironmentError): + processor = None + + # Load sentence-transformers prompts/default_prompt_name from config_sentence_transformers.json, + # so that SentenceTransformer.encode's prompt resolution behaves the same as the reference model. + st_prompts: Dict[str, str] = {} + st_default_prompt_name: Optional[str] = None + st_config_path: Optional[str] = None + try: + if os.path.isdir(model_id): + candidate = ( + os.path.join(model_id, subfolder, "config_sentence_transformers.json") + if subfolder + else os.path.join(model_id, "config_sentence_transformers.json") + ) + if os.path.isfile(candidate): + st_config_path = candidate + else: + st_config_path = hf_hub_download( + repo_id=str(model_id), + filename="config_sentence_transformers.json", + subfolder=subfolder or None, + revision=revision, + cache_dir=cache_dir, + token=token, + local_files_only=local_files_only, + force_download=force_download, + ) + except (EntryNotFoundError, OSError, ValueError): + st_config_path = None + + if st_config_path is not None: + try: + with open(st_config_path, "r", encoding="utf-8") as f: + st_cfg = json.load(f) + st_prompts = st_cfg.get("prompts", {}) or {} + st_default_prompt_name = st_cfg.get("default_prompt_name", None) + except (OSError, json.JSONDecodeError): + pass + model = super()._from_pretrained( model_id=model_id, config=config, @@ -111,6 +174,11 @@ def _from_pretrained( **kwargs, ) + if is_sentence_transformers_version(">=", "5.4.0"): + model.prompts = st_prompts + model.default_prompt_name = st_default_prompt_name + model.processor = processor + return model def tokenize( @@ -160,3 +228,119 @@ def _preprocess_quantization_config( quantization_config = quantization_config.clone() quantization_config.tokenizer = model_name_or_path return quantization_config + + def preprocess( + self, + inputs, + prompt, + **kwargs, + ): + """ + Preprocesses the inputs for the model. + + Mirrors :meth:`sentence_transformers.base.modules.transformer.Transformer.preprocess` + for the text/message modalities so that tokenization matches the reference + SentenceTransformer model when a chat template is used (e.g. Qwen3-VL-Embedding). + """ + from sentence_transformers.base.modality import format_modality, infer_batch_modality + + if not inputs: + return {} + + # Infer modality (used both for validation and to decide preprocessing path). + modality = None + try: + modality = infer_batch_modality(inputs, supported_modalities=self.modalities) + except (ValueError, TypeError): + pass + + if modality is not None and not self.supports(modality): + supported = ", ".join(format_modality(m) for m in self.modalities) + message = ( + f"Modality '{format_modality(modality)}' is not supported by this {type(self).__name__} model. " + f"Supported modalities: {supported}" + ) + if isinstance(modality, tuple) and all(part in self.modalities for part in modality): + message += ( + f"\nThis model supports {' and '.join(modality)} individually, " + "but not in the same input. Please process each modality separately." + ) + raise ValueError(message) + + # If the model has a chat template, route inputs through apply_chat_template so the output + # matches the reference SentenceTransformer (which uses the processor when available). + tokenizer = self.tokenizer + processor = getattr(self, "processor", None) + chat_template_owner = None + if processor is not None and getattr(processor, "chat_template", None) is not None: + chat_template_owner = processor + elif tokenizer is not None and getattr(tokenizer, "chat_template", None) is not None: + chat_template_owner = tokenizer + + if chat_template_owner is not None and "message" in self.modalities: + messages_batch = self._build_messages_batch(inputs, modality, prompt) + preprocessed = chat_template_owner.apply_chat_template( + messages_batch, + tokenize=True, + return_dict=True, + add_generation_prompt=True, + padding=True, + truncation="longest_first", + return_tensors="pt", + ) + preprocessed = dict(preprocessed) + preprocessed["modality"] = "message" + else: + # Fallback: plain tokenization (e.g. for text-only models without a chat template). + if prompt and modality == "text": + inputs = [ + (prompt + inp[0],) + tuple(inp[1:]) if isinstance(inp, tuple) else prompt + inp for inp in inputs + ] + preprocessed = self.tokenize(inputs, **kwargs) + preprocessed["modality"] = modality + + print("inputs_ids {}".format(preprocessed["input_ids"])) + return preprocessed + + @staticmethod + def _build_messages_batch( + inputs: List[Any], + modality: Any, + prompt: Optional[str], + ) -> List[List[Dict[str, Any]]]: + """Convert SentenceTransformer-style inputs into a list of chat-template message lists. + + Each text input becomes a ``user`` message with structured content; if ``prompt`` is + provided it is prepended as a ``system`` message (matching + ``InputFormatter.prepend_prompt_to_messages``). + """ + + def _content_for_item(item: Any) -> List[Dict[str, Any]]: + if isinstance(item, str): + return [{"type": "text", "text": item}] + if isinstance(item, dict): + content: List[Dict[str, Any]] = [] + for key, value in item.items(): + if key == "text": + content.append({"type": "text", "text": value}) + elif key in ("image", "image_url"): + content.append({"type": "image", "image": value}) + elif key == "video": + content.append({"type": "video", "video": value}) + else: + content.append({"type": key, key: value}) + return content + # Tuples/lists (e.g. text pairs) - flatten into separate text parts. + if isinstance(item, (tuple, list)): + return [{"type": "text", "text": str(v)} for v in item] + return [{"type": "text", "text": str(item)}] + + messages_batch: List[List[Dict[str, Any]]] = [] + for inp in inputs: + user_message = {"role": "user", "content": _content_for_item(inp)} + sample_messages: List[Dict[str, Any]] = [] + if prompt: + sample_messages.append({"role": "system", "content": [{"type": "text", "text": prompt}]}) + sample_messages.append(user_message) + messages_batch.append(sample_messages) + return messages_batch