huggingface · mlukasze · May 6, 2026 · May 27, 2026 · May 27, 2026 · May 28, 2026
diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx
@@ -136,7 +136,7 @@ Here is the list of the supported architectures :
 - Qwen2.5-VL
 - Qwen3
 - Qwen3MoE
-- Qwen3-VL
+- Qwen3-VL (including Qwen3-VL-Embedding)
 - Qwen3.5
 - Qwen3.5-MoE
 - Qwen3.6

diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
@@ -368,7 +368,6 @@ def export_pytorch(
 
         if input_shapes is None:
             input_shapes = {}  # will use the defaults from DEFAULT_DUMMY_SHAPES
-
         # Check that inputs match, and order them properly
         dummy_inputs = config.generate_dummy_inputs(framework="pt", **input_shapes)
         device = torch.device(device)

diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
@@ -5746,7 +5746,9 @@ class SiglipOpenVINOConfig(SiglipOnnxConfig):
     "transformer", *["feature-extraction", "sentence-similarity"], library_name="sentence_transformers"
 )
 class SentenceTransformersTransformerOpenVINOConfig(SentenceTransformersTransformerOnnxConfig):
-    pass
+    def __init__(self, config, **kwargs):  # config: presumably a transformers PretrainedConfig - TODO confirm
+        super().__init__(config, **kwargs)
+        config.vocab_size = config.get_text_config().vocab_size  # NOTE: mutates the passed-in config in place
 
 
 @register_in_tasks_manager("rembert", *COMMON_TEXT_TASKS)

diff --git a/optimum/intel/openvino/modeling_sentence_transformers.py b/optimum/intel/openvino/modeling_sentence_transformers.py
@@ -1,14 +1,20 @@
+import json
+import os
 from pathlib import Path
 from types import MethodType
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import torch
+from huggingface_hub import hf_hub_download
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
+from huggingface_hub.utils import EntryNotFoundError
 from sentence_transformers import SentenceTransformer
-from transformers import AutoTokenizer, PretrainedConfig
+from transformers import AutoProcessor, AutoTokenizer, PretrainedConfig
 from transformers.file_utils import add_start_docstrings
 
+from optimum.intel.utils.import_utils import is_sentence_transformers_version
+
 from .configuration import OVQuantizationConfigBase
 from .modeling import MODEL_START_DOCSTRING, OVModel
 
@@ -27,7 +33,17 @@ def __init__(self, model=None, config=None, tokenizer=None, **kwargs):
         super().__init__(model, config, **kwargs)
 
         self.encode = MethodType(SentenceTransformer.encode, self)
-        self._text_length = MethodType(SentenceTransformer._text_length, self)
+        if is_sentence_transformers_version(">=", "5.4.0"):
+            self.supports = MethodType(SentenceTransformer.supports, self)
+            self._input_length = SentenceTransformer._input_length
+            self._resolve_prompt = MethodType(SentenceTransformer._resolve_prompt, self)
+            self.is_singular_input = MethodType(SentenceTransformer.is_singular_input, self)
+            self.modalities = ["text", "image", "video", "message"]
+            self.default_prompt_name = kwargs.get("default_prompt_name", None)
+            self.prompts = kwargs.get("prompts", {}) or {}
+            self.processor = kwargs.get("processor", None)
+        else:
+            self._text_length = MethodType(SentenceTransformer._text_length, self)
         self.default_prompt_name = None
         self.truncate_dim = None
         self.tokenizer = tokenizer
@@ -36,6 +52,9 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
         super()._save_pretrained(save_directory)
         self.tokenizer.save_pretrained(save_directory)
 
+    def _can_flatten_inputs(self) -> bool:
+        return False  # always False; presumably queried by SentenceTransformer.encode - TODO confirm
+
     def forward(self, inputs: Dict[str, torch.Tensor]):
         self.compile()
         input_ids = inputs.get("input_ids")
@@ -94,6 +113,50 @@ def _from_pretrained(
 
         tokenizer = AutoTokenizer.from_pretrained(model_id, **tokenizer_args)
 
+        if is_sentence_transformers_version(">=", "5.4.0"):
+            processor = None
+            try:
+                processor = AutoProcessor.from_pretrained(model_id, **tokenizer_args)
+            except (OSError, ValueError, KeyError, EnvironmentError):
+                processor = None
+
+            # Load sentence-transformers prompts/default_prompt_name from config_sentence_transformers.json,
+            # so that SentenceTransformer.encode's prompt resolution behaves the same as the reference model.
+            st_prompts: Dict[str, str] = {}
+            st_default_prompt_name: Optional[str] = None
+            st_config_path: Optional[str] = None
+            try:
+                if os.path.isdir(model_id):
+                    candidate = (
+                        os.path.join(model_id, subfolder, "config_sentence_transformers.json")
+                        if subfolder
+                        else os.path.join(model_id, "config_sentence_transformers.json")
+                    )
+                    if os.path.isfile(candidate):
+                        st_config_path = candidate
+                else:
+                    st_config_path = hf_hub_download(
+                        repo_id=str(model_id),
+                        filename="config_sentence_transformers.json",
+                        subfolder=subfolder or None,
+                        revision=revision,
+                        cache_dir=cache_dir,
+                        token=token,
+                        local_files_only=local_files_only,
+                        force_download=force_download,
+                    )
+            except (EntryNotFoundError, OSError, ValueError):
+                st_config_path = None
+
+            if st_config_path is not None:
+                try:
+                    with open(st_config_path, "r", encoding="utf-8") as f:
+                        st_cfg = json.load(f)
+                    st_prompts = st_cfg.get("prompts", {}) or {}
+                    st_default_prompt_name = st_cfg.get("default_prompt_name", None)
+                except (OSError, json.JSONDecodeError):
+                    pass
+
         model = super()._from_pretrained(
             model_id=model_id,
             config=config,
@@ -111,6 +174,11 @@ def _from_pretrained(
             **kwargs,
         )
 
+        if is_sentence_transformers_version(">=", "5.4.0"):
+            model.prompts = st_prompts
+            model.default_prompt_name = st_default_prompt_name
+            model.processor = processor
+
         return model
 
     def tokenize(
@@ -160,3 +228,119 @@ def _preprocess_quantization_config(
             quantization_config = quantization_config.clone()
             quantization_config.tokenizer = model_name_or_path
         return quantization_config
+
+    def preprocess(self, inputs, prompt, **kwargs):
+        """
+        Preprocesses the inputs for the model.
+
+        Mirrors :meth:`sentence_transformers.base.modules.transformer.Transformer.preprocess`
+        for the text/message modalities so that tokenization matches the reference
+        SentenceTransformer model when a chat template is used (e.g. Qwen3-VL-Embedding).
+
+        :param inputs: batch of inputs to encode; an empty batch returns an empty dict.
+        :param prompt: optional prompt (system message with a chat template, prefixed to text inputs otherwise).
+        :param kwargs: forwarded to ``self.tokenize`` on the plain-tokenization path only.
+        :returns: dict of model inputs with an extra ``"modality"`` entry.
+        :raises ValueError: if the inferred modality is not supported by this model.
+        """
+        from sentence_transformers.base.modality import format_modality, infer_batch_modality
+
+        if not inputs:
+            return {}
+
+        # Infer modality (used both for validation and to decide preprocessing path).
+        # Best effort: if inference fails, modality stays None and the support check below is skipped.
+        try:
+            modality = infer_batch_modality(inputs, supported_modalities=self.modalities)
+        except (ValueError, TypeError):
+            modality = None
+
+        if modality is not None and not self.supports(modality):
+            supported = ", ".join(format_modality(m) for m in self.modalities)
+            message = (
+                f"Modality '{format_modality(modality)}' is not supported by this {type(self).__name__} model. "
+                f"Supported modalities: {supported}"
+            )
+            if isinstance(modality, tuple) and all(part in self.modalities for part in modality):
+                message += (
+                    f"\nThis model supports {' and '.join(modality)} individually, "
+                    "but not in the same input. Please process each modality separately."
+                )
+            raise ValueError(message)
+
+        # If the model has a chat template, route inputs through apply_chat_template so the output
+        # matches the reference SentenceTransformer (which uses the processor when available).
+        tokenizer = self.tokenizer
+        processor = getattr(self, "processor", None)
+        chat_template_owner = None
+        if processor is not None and getattr(processor, "chat_template", None) is not None:
+            chat_template_owner = processor
+        elif tokenizer is not None and getattr(tokenizer, "chat_template", None) is not None:
+            chat_template_owner = tokenizer
+
+        if chat_template_owner is not None and "message" in self.modalities:
+            messages_batch = self._build_messages_batch(inputs, modality, prompt)
+            preprocessed = chat_template_owner.apply_chat_template(
+                messages_batch,
+                tokenize=True,
+                return_dict=True,
+                add_generation_prompt=True,
+                padding=True,
+                truncation="longest_first",
+                return_tensors="pt",
+            )
+            preprocessed = dict(preprocessed)
+            preprocessed["modality"] = "message"
+        else:
+            # Fallback: plain tokenization (e.g. for text-only models without a chat template).
+            if prompt and modality == "text":
+                inputs = [
+                    (prompt + inp[0],) + tuple(inp[1:]) if isinstance(inp, tuple) else prompt + inp for inp in inputs
+                ]
+            preprocessed = self.tokenize(inputs, **kwargs)
+            preprocessed["modality"] = modality
+
+        return preprocessed
+
+    @staticmethod
+    def _build_messages_batch(
+        inputs: List[Any],
+        modality: Any,
+        prompt: Optional[str],
+    ) -> List[List[Dict[str, Any]]]:
+        """Turn SentenceTransformer-style inputs into one chat-template message list per input.
+
+        Every input yields a ``user`` message with structured content; a non-empty ``prompt``
+        is placed before it as a ``system`` message (matching
+        ``InputFormatter.prepend_prompt_to_messages``). ``modality`` is accepted but unused.
+        """
+
+        # Dict keys whose content type differs from the key itself.
+        type_aliases = {"image_url": "image"}
+
+        def _text_part(value: Any) -> Dict[str, Any]:
+            return {"type": "text", "text": value}
+
+        def _to_content(sample: Any) -> List[Dict[str, Any]]:
+            if isinstance(sample, str):
+                return [_text_part(sample)]
+            if isinstance(sample, dict):
+                parts: List[Dict[str, Any]] = []
+                for name, payload in sample.items():
+                    kind = type_aliases.get(name, name)
+                    parts.append({"type": kind, kind: payload})
+                return parts
+            # Tuples/lists (e.g. text pairs) become one text part per element;
+            # any other object is stringified as a single text part.
+            elements = sample if isinstance(sample, (tuple, list)) else (sample,)
+            return [_text_part(str(element)) for element in elements]
+
+        # One message list per input: optional system prompt first, then the user turn.
+        conversations: List[List[Dict[str, Any]]] = []
+        for sample in inputs:
+            turns: List[Dict[str, Any]] = []
+            if prompt:
+                turns.append({"role": "system", "content": [_text_part(prompt)]})
+            turns.append({"role": "user", "content": _to_content(sample)})
+            conversations.append(turns)
+        return conversations