From 53d19b95b0d0fee1d1d7259afd920a97d62b4881 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 19 Jan 2026 10:39:10 +0100 Subject: [PATCH 001/222] Transformers v5 --- .github/workflows/test_openvino.yml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 2d0958b2d6..cffeabc42d 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -38,7 +38,7 @@ jobs: "*diffusion*", "*quantization*", ] - transformers-version: ["4.45.0", "latest"] + transformers-version: ["4.45.0", "latest", "5.0.0rc3"] runs-on: ubuntu-22.04 diff --git a/setup.py b/setup.py index f7be8fd778..79a2cac349 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@main", - "transformers>=4.45,<4.58", + "transformers>=4.45,<5.1", "setuptools", ] From 5205434f5394f98072291ededa597869b1604839 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 19 Jan 2026 16:11:49 +0100 Subject: [PATCH 002/222] fix loading for llava_next_video --- tests/openvino/test_genai.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index b8b9e8d6cd..cdb4f8a555 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -247,9 +247,9 @@ def _get_model_class(self, model_arch): return AutoModelForImageTextToText elif model_arch == "llava_next_video": - from transformers import AutoModelForVision2Seq + from transformers import LlavaNextVideoForConditionalGeneration - return AutoModelForVision2Seq + return LlavaNextVideoForConditionalGeneration elif model_arch == "llava": from transformers import LlavaForConditionalGeneration From e8feb0caf0d4286fa633ba2e2907681e2e9605f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 20 Jan 2026 18:09:58 
+0100 Subject: [PATCH 003/222] Remove deprecated transformers.onnx --- optimum/intel/openvino/modeling_base.py | 6 ++---- optimum/intel/openvino/utils.py | 13 ------------- tests/openvino/test_modeling.py | 11 +++++------ 3 files changed, 7 insertions(+), 23 deletions(-) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 89fa7f5a88..6632acde68 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -851,16 +851,14 @@ def _export( **kwargs, ): """ - Export a vanilla Transformers model into an ONNX model using `transformers.onnx.export_onnx`. + Load and export a model to the OpenVINO IR. Arguments: model_id (`str` or `Path`): The directory from which to load the model. Can be either: - The model id of a pretrained model hosted inside a model repo on huggingface.co. - - The path to a directory containing the model weights. save_dir (`str` or `Path`): - The directory where the exported ONNX model should be saved, default to - `transformers.file_utils.default_cache_path`, which is the cache directory for transformers. + - The path to a directory containing the model weights. token (Optional[Union[bool, str]], defaults to `None`): The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). 
diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index 4baa280fea..bb3ec658ed 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -32,7 +32,6 @@ from openvino import Type as OVType from packaging.version import Version from transformers import AutoTokenizer, CLIPTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast -from transformers.onnx.utils import ParameterFormat, compute_serialized_parameters_size from optimum.intel.utils.import_utils import is_torch_version @@ -228,18 +227,6 @@ def maybe_convert_tokenizer_to_fast( return hf_tokenizer -def use_external_data_format(num_parameters: int) -> bool: - """ - Returns whether or not the model requires using external data format for the ONNX export - Args: - num_parameters: Number of parameter on the model - Returns: - True if model.num_parameters() * size_of(float32) >= 2Gb False otherwise - """ - - return compute_serialized_parameters_size(num_parameters, ParameterFormat.Float) >= EXTERNAL_DATA_FORMAT_SIZE_LIMIT - - def _is_timm_ov_dir(model_dir): config_file = None has_xml = False diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 279fcb9a8d..0c15a1b251 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -53,7 +53,6 @@ pipeline, set_seed, ) -from transformers.onnx.utils import get_preprocessor from transformers.testing_utils import slow from transformers.utils import http_user_agent from utils_tests import F32_CONFIG, MODEL_NAMES, OPENVINO_DEVICE, SEED, TENSOR_ALIAS_TO_TYPE, TEST_IMAGE_URL @@ -236,7 +235,7 @@ def test_load_from_hub_and_save_visual_language_model(self): # anymore due to an internal bug in transformers model_ids.append("katuni4ka/phi-4-multimodal-ov") for model_id in model_ids: - processor = get_preprocessor(model_id) + processor = AutoProcessor.from_pretrained(model_id) prompt = "What is shown in this image?" 
image = Image.open( requests.get( @@ -491,7 +490,7 @@ def test_load_from_hub_and_save_sam_model(self): self.assertEqual( loaded_model.prompt_encoder_mask_decoder.request.get_property("PERFORMANCE_HINT"), "THROUGHPUT" ) - processor = get_preprocessor(self.OV_SAM_MODEL_ID) + processor = AutoProcessor.from_pretrained(self.OV_SAM_MODEL_ID) img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png" input_points = [[[450, 600]]] raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB") @@ -1846,7 +1845,7 @@ def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] set_seed(SEED) ov_model = OVSamModel.from_pretrained(model_id, export=True, ov_config=F32_CONFIG, device=OPENVINO_DEVICE) - processor = get_preprocessor(model_id) + processor = AutoProcessor.from_pretrained(model_id) self.assertIsInstance(ov_model.vision_encoder, OVSamVisionEncoder) self.assertIsInstance(ov_model.prompt_encoder_mask_decoder, OVSamPromptEncoder) @@ -1899,7 +1898,7 @@ def test_reshape(self, model_arch): model_id = MODEL_NAMES[model_arch] set_seed(SEED) ov_model = OVSamModel.from_pretrained(model_id, export=True, ov_config=F32_CONFIG, device=OPENVINO_DEVICE) - processor = get_preprocessor(model_id) + processor = AutoProcessor.from_pretrained(model_id) self.assertTrue(ov_model.is_dynamic) input_points = [[[450, 600]]] IMAGE = Image.open( @@ -1935,7 +1934,7 @@ def test_compare_to_transformers(self, model_arch): ov_model = OVModelForZeroShotImageClassification.from_pretrained( model_id, export=True, ov_config=F32_CONFIG, device=OPENVINO_DEVICE ) - processor = get_preprocessor(model_id) + processor = AutoProcessor.from_pretrained(model_id) self.assertIsInstance(ov_model.config, PretrainedConfig) From bb54f64adee17964693a238de416ebd492a728ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 23 Jan 2026 15:01:05 +0100 Subject: [PATCH 004/222] remove deprecated transformers.onnx from tests --- 
tests/openvino/test_seq2seq.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index dbc1031a4c..238e13a1ac 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -42,7 +42,6 @@ set_seed, ) from transformers.models.auto.configuration_auto import CONFIG_MAPPING_NAMES -from transformers.onnx.utils import get_preprocessor from transformers.testing_utils import slow from transformers.utils import http_user_agent from utils_tests import F32_CONFIG, MODEL_NAMES, OPENVINO_DEVICE, SEED, TEST_IMAGE_URL, Timer @@ -336,7 +335,7 @@ def test_compare_to_transformers(self, model_arch): self._check_openvino_model_attributes(ov_model, use_cache=True, stateful=True) self._check_openvino_model_attributes(ov_model_stateless, use_cache=True, stateful=False) - processor = get_preprocessor(model_id) + processor = AutoProcessor.from_pretrained(model_id) data = self._generate_random_audio_data() pt_features = processor.feature_extractor(data, return_tensors="pt") decoder_start_token_id = transformers_model.config.decoder_start_token_id @@ -395,7 +394,7 @@ def test_pipeline(self, model_arch): set_seed(SEED) model_id = MODEL_NAMES[model_arch] model = self.OVMODEL_CLASS.from_pretrained(model_id, device=OPENVINO_DEVICE) - processor = get_preprocessor(model_id) + processor = AutoProcessor.from_pretrained(model_id) pipe = pipeline( "automatic-speech-recognition", model=model, @@ -1079,7 +1078,7 @@ def test_compare_to_transformers(self, model_arch): question = "Who am I?" 
transformers_model = self.AUTOMODEL_CLASS.from_pretrained(model_id) - preprocessor = get_preprocessor(model_id) + preprocessor = AutoProcessor.from_pretrained(model_id) inputs = preprocessor(images=self.IMAGE, text=question, padding=True, return_tensors="pt") ov_outputs = ov_model(**inputs) @@ -1100,7 +1099,7 @@ def test_compare_to_transformers(self, model_arch): def test_generate_utils(self, model_arch): model_id = MODEL_NAMES[model_arch] model = self.OVMODEL_CLASS.from_pretrained(model_id, export=True, device=OPENVINO_DEVICE) - preprocessor = get_preprocessor(model_id) + preprocessor = AutoProcessor.from_pretrained(model_id) question = "Who am I?" inputs = preprocessor(images=self.IMAGE, text=question, return_tensors="pt") @@ -1114,7 +1113,7 @@ def test_generate_utils(self, model_arch): def test_compare_with_and_without_past_key_values(self): model_id = MODEL_NAMES["pix2struct"] - preprocessor = get_preprocessor(model_id) + preprocessor = AutoProcessor.from_pretrained(model_id) question = "Who am I?" 
inputs = preprocessor(images=self.IMAGE, text=question, return_tensors="pt") model_with_pkv = self.OVMODEL_CLASS.from_pretrained( From 71aa34e773537e6a191463e4c9298720fd3ff714 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 23 Jan 2026 15:38:03 +0100 Subject: [PATCH 005/222] remove huggingface_hub deprecated --- optimum/intel/openvino/modeling_base.py | 8 +++++++- optimum/intel/utils/import_utils.py | 18 ++++++++++++++++++ optimum/intel/utils/modeling_utils.py | 11 ++++++++--- setup.py | 1 + 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index b96f375728..569422a085 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -28,12 +28,12 @@ from transformers import GenerationConfig, PretrainedConfig from transformers.file_utils import add_start_docstrings from transformers.generation import GenerationMixin -from transformers.utils import is_offline_mode from transformers.utils.hub import cached_file from optimum.exporters.base import ExportConfig from optimum.exporters.openvino.utils import _MAX_UNCOMPRESSED_SIZE from optimum.modeling_base import FROM_PRETRAINED_START_DOCSTRING, OptimizedModel +from optimum.utils.import_utils import is_huggingface_hub_version from ...exporters.openvino import export, main_export from ..utils.import_utils import is_nncf_available @@ -58,6 +58,12 @@ ) +if is_huggingface_hub_version(">=", "1.2.1"): + from huggingface_hub import is_offline_mode +else: + from transformers.utils import is_offline_mode + + core = Core() logger = logging.getLogger(__name__) diff --git a/optimum/intel/utils/import_utils.py b/optimum/intel/utils/import_utils.py index 3ad9877a82..d5e44d06d0 100644 --- a/optimum/intel/utils/import_utils.py +++ b/optimum/intel/utils/import_utils.py @@ -119,6 +119,15 @@ pass +_huggingface_hub_available = importlib.util.find_spec("huggingface_hub") is not None 
+_huggingface_hub_version = "N/A" +if _huggingface_hub_available: + try: + _huggingface_hub_version = importlib_metadata.version("huggingface_hub") + except importlib_metadata.PackageNotFoundError: + _huggingface_hub_available = False + + _safetensors_version = "N/A" _safetensors_available = importlib.util.find_spec("safetensors") is not None if _safetensors_available: @@ -486,6 +495,15 @@ def is_sentence_transformers_version(operation: str, version: str): return compare_versions(parse(_sentence_transformers_version), operation, version) +def is_huggingface_hub_version(operation: str, version: str): + """ + Compare the current huggingface_hub version to a given reference with an operation. + """ + if not _huggingface_hub_available: + return False + return compare_versions(parse(_huggingface_hub_version), operation, version) + + DIFFUSERS_IMPORT_ERROR = """ {0} requires the diffusers library but it was not found in your environment. You can install it with pip: `pip install diffusers`. Please note that you may need to restart your runtime after installation. 
diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index cab9e5efa3..83b5ccc1ac 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -23,14 +23,19 @@ from typing import Dict, List, Optional, Type, Union import torch -from huggingface_hub import HfApi, HfFolder, hf_hub_download +from huggingface_hub import HfApi, get_token, hf_hub_download from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from huggingface_hub.hf_api import file_exists from transformers import CLIPConfig, PretrainedConfig, PreTrainedModel from optimum.exporters.tasks import TasksManager -from .import_utils import is_diffusers_available, is_numa_available, is_open_clip_available, is_psutil_available +from .import_utils import ( + is_diffusers_available, + is_numa_available, + is_open_clip_available, + is_psutil_available, +) if is_diffusers_available(): @@ -115,7 +120,7 @@ def _find_files_matching_pattern( model_path = Path(model_name_or_path) if not isinstance(model_name_or_path, Path) else model_name_or_path if isinstance(use_auth_token, bool): - token = HfFolder().get_token() + token = get_token() else: token = use_auth_token diff --git a/setup.py b/setup.py index 79a2cac349..9937ad3ebf 100644 --- a/setup.py +++ b/setup.py @@ -31,6 +31,7 @@ "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@main", "transformers>=4.45,<5.1", "setuptools", + "huggingface-hub>=0.23.2,<2.0", ] TESTS_REQUIRE = [ From 0954015d7953735a0c1e5f1519bbbbd7cafeb77b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 23 Jan 2026 19:19:17 +0100 Subject: [PATCH 006/222] relative to absolute import --- optimum/intel/openvino/modeling_base.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 569422a085..8a16470fe4 100644 --- a/optimum/intel/openvino/modeling_base.py +++ 
b/optimum/intel/openvino/modeling_base.py @@ -31,14 +31,9 @@ from transformers.utils.hub import cached_file from optimum.exporters.base import ExportConfig +from optimum.exporters.openvino import export, main_export from optimum.exporters.openvino.utils import _MAX_UNCOMPRESSED_SIZE -from optimum.modeling_base import FROM_PRETRAINED_START_DOCSTRING, OptimizedModel -from optimum.utils.import_utils import is_huggingface_hub_version - -from ...exporters.openvino import export, main_export -from ..utils.import_utils import is_nncf_available -from ..utils.modeling_utils import _find_files_matching_pattern -from .configuration import ( +from optimum.intel.openvino.configuration import ( _DEFAULT_4BIT_WQ_CONFIG, OVConfig, OVQuantizationConfigBase, @@ -47,7 +42,7 @@ _quantization_config_from_dict, get_default_quantization_config, ) -from .utils import ( +from optimum.intel.openvino.utils import ( ONNX_WEIGHTS_NAME, OV_TO_PT_TYPE, OV_XML_FILE_NAME, @@ -56,6 +51,9 @@ classproperty, model_has_dynamic_inputs, ) +from optimum.intel.utils.import_utils import is_huggingface_hub_version, is_nncf_available +from optimum.intel.utils.modeling_utils import _find_files_matching_pattern +from optimum.modeling_base import FROM_PRETRAINED_START_DOCSTRING, OptimizedModel if is_huggingface_hub_version(">=", "1.2.1"): From 1ba9789bd9d8a18cd56631bbb7d85edd8ce8144f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 15:06:56 +0100 Subject: [PATCH 007/222] update workflow to v5 --- .github/workflows/test_openvino.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index cffeabc42d..f1874d3dbd 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -38,7 +38,7 @@ jobs: "*diffusion*", "*quantization*", ] - transformers-version: ["4.45.0", "latest", "5.0.0rc3"] + transformers-version: ["4.45.0", "latest", "5.0.0"] runs-on: ubuntu-22.04 From 
f1586565e90bdf05b900b9e9912089a42d0d417f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 15:43:26 +0100 Subject: [PATCH 008/222] remove redundant --- optimum/exporters/openvino/model_configs.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index ca12d455be..e9c7b52d97 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -247,10 +247,6 @@ def init_model_configs(): "transformers", "Qwen2VLForConditionalGeneration", ) - TasksManager._CUSTOM_CLASSES[("pt", "qwen2_5_vl", "image-text-to-text")] = ( - "transformers", - "AutoModelForImageTextToText", - ) TasksManager._CUSTOM_CLASSES[("pt", "llava_next_video", "image-text-to-text")] = ( "transformers", "AutoModelForVision2Seq", @@ -259,14 +255,6 @@ def init_model_configs(): "transformers", "Gemma3ForConditionalGeneration", ) - TasksManager._CUSTOM_CLASSES[("pt", "idefics3", "image-text-to-text")] = ( - "transformers", - "AutoModelForImageTextToText", - ) - TasksManager._CUSTOM_CLASSES[("pt", "smolvlm", "image-text-to-text")] = ( - "transformers", - "AutoModelForImageTextToText", - ) TasksManager._CUSTOM_CLASSES[("pt", "phi4mm", "image-text-to-text")] = ("transformers", "AutoModelForCausalLM") TasksManager._CUSTOM_CLASSES[("pt", "phi4mm", "automatic-speech-recognition")] = ( "transformers", @@ -280,10 +268,6 @@ def init_model_configs(): "transformers", "AutoModelForCausalLM", ) - TasksManager._CUSTOM_CLASSES[("pt", "llama4", "image-text-to-text")] = ( - "transformers", - "AutoModelForImageTextToText", - ) if is_diffusers_available() and "fill" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS: TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["fill"] = "FluxFillPipeline" From 93451439c75f2758031a3acb573547a5a55add26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 16:18:13 +0100 Subject: [PATCH 
009/222] update loading given transformers version --- optimum/exporters/openvino/model_configs.py | 53 +++++++++++++-------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index e9c7b52d97..67686b94bb 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -235,26 +235,7 @@ def init_model_configs(): if "open_clip" not in TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES: TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES["open_clip"] = {} - TasksManager._CUSTOM_CLASSES[("pt", "llava", "image-text-to-text")] = ( - "transformers", - "LlavaForConditionalGeneration", - ) - TasksManager._CUSTOM_CLASSES[("pt", "llava_next", "image-text-to-text")] = ( - "transformers", - "LlavaNextForConditionalGeneration", - ) - TasksManager._CUSTOM_CLASSES[("pt", "qwen2_vl", "image-text-to-text")] = ( - "transformers", - "Qwen2VLForConditionalGeneration", - ) - TasksManager._CUSTOM_CLASSES[("pt", "llava_next_video", "image-text-to-text")] = ( - "transformers", - "AutoModelForVision2Seq", - ) - TasksManager._CUSTOM_CLASSES[("pt", "gemma3", "image-text-to-text")] = ( - "transformers", - "Gemma3ForConditionalGeneration", - ) + TasksManager._CUSTOM_CLASSES[("pt", "phi4mm", "image-text-to-text")] = ("transformers", "AutoModelForCausalLM") TasksManager._CUSTOM_CLASSES[("pt", "phi4mm", "automatic-speech-recognition")] = ( "transformers", @@ -269,6 +250,38 @@ def init_model_configs(): "AutoModelForCausalLM", ) + # since transformers v4.46, model can be loaded using default AutoModelForImageTextToText + # https://github.com/huggingface/transformers/blob/v4.46.0/src/transformers/models/auto/modeling_auto.py#L776 + if is_transformers_version("<", "4.46"): + TasksManager._CUSTOM_CLASSES[("pt", "llava", "image-text-to-text")] = ( + "transformers", + "LlavaForConditionalGeneration", + ) + TasksManager._CUSTOM_CLASSES[("pt", "llava_next", 
"image-text-to-text")] = ( + "transformers", + "LlavaNextForConditionalGeneration", + ) + TasksManager._CUSTOM_CLASSES[("pt", "qwen2_vl", "image-text-to-text")] = ( + "transformers", + "Qwen2VLForConditionalGeneration", + ) + + # since transformers v4.50, model can be loaded using default AutoModelForImageTextToText + # https://github.com/huggingface/transformers/blob/v4.50.0/src/transformers/models/auto/modeling_auto.py#L835 + if is_transformers_version("<", "4.50"): + TasksManager._CUSTOM_CLASSES[("pt", "gemma3", "image-text-to-text")] = ( + "transformers", + "Gemma3ForConditionalGeneration", + ) + + # since transformers v4.52, model can be loaded using default AutoModelForImageTextToText + # https://github.com/huggingface/transformers/blob/v4.52.0/src/transformers/models/auto/modeling_auto.py#L899 + if is_transformers_version("<", "4.52"): + TasksManager._CUSTOM_CLASSES[("pt", "llava_next_video", "image-text-to-text")] = ( + "transformers", + "AutoModelForVision2Seq", + ) + if is_diffusers_available() and "fill" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS: TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["fill"] = "FluxFillPipeline" TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["fill"] = {"flux": "FluxFillPipeline"} From b290ae3c36ca4b6dd995b0601be2450a6aed63ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 16:37:48 +0100 Subject: [PATCH 010/222] remove deprecated AutoModelForVision2Seq --- optimum/intel/openvino/modeling_seq2seq.py | 15 +++++++++++++-- tests/openvino/test_seq2seq.py | 21 ++++++++++++++++----- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py index c1b2177c59..4a7bc0394d 100644 --- a/optimum/intel/openvino/modeling_seq2seq.py +++ b/optimum/intel/openvino/modeling_seq2seq.py @@ -27,7 +27,6 @@ AutoConfig, AutoModelForSeq2SeqLM, AutoModelForSpeechSeq2Seq, - AutoModelForVision2Seq, GenerationConfig, 
Pix2StructForConditionalGeneration, PretrainedConfig, @@ -56,6 +55,18 @@ ) +# AutoModelForVision2Seq is deprecated since v4.54 +# https://github.com/huggingface/transformers/blob/v4.54.0/src/transformers/models/auto/modeling_auto.py#L2151 +if is_transformers_version(">=", "4.54.0"): + from transformers import AutoModelForImageTextToText + + transformers_auto_class = AutoModelForImageTextToText +else: + from transformers import AutoModelForVision2Seq + + transformers_auto_class = AutoModelForVision2Seq + + core = Core() logger = logging.getLogger(__name__) @@ -1036,7 +1047,7 @@ def _reorder_cache( INPUTS_DOCSTRING, ) class OVModelForVision2Seq(OVModelForSeq2SeqLM): - auto_model_class = AutoModelForVision2Seq + auto_model_class = transformers_auto_class main_input_name = "pixel_values" export_feature = "image-to-text" diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 238e13a1ac..83a4b7c54f 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -33,7 +33,6 @@ AutoModelForSeq2SeqLM, AutoModelForSpeechSeq2Seq, AutoModelForTextToSpectrogram, - AutoModelForVision2Seq, AutoProcessor, AutoTokenizer, GenerationConfig, @@ -69,6 +68,18 @@ from optimum.intel.utils.import_utils import is_openvino_version, is_transformers_version +# AutoModelForVision2Seq is deprecated since v4.54 +# https://github.com/huggingface/transformers/blob/v4.54.0/src/transformers/models/auto/modeling_auto.py#L2151 +if is_transformers_version(">=", "4.54.0"): + from transformers import AutoModelForImageTextToText + + transformers_auto_class = AutoModelForImageTextToText +else: + from transformers import AutoModelForVision2Seq + + transformers_auto_class = AutoModelForVision2Seq + + os.environ["TOKENIZERS_PARALLELISM"] = "false" @@ -421,7 +432,7 @@ class OVModelForVision2SeqIntegrationTest(OVSeq2SeqTestMixin): UNSUPPORTED_ARCHITECTURES = {"got_ocr2", "pix2struct"} TASK = "image-to-text" OVMODEL_CLASS = OVModelForVision2Seq - AUTOMODEL_CLASS = 
AutoModelForVision2Seq + AUTOMODEL_CLASS = transformers_auto_class GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 @@ -580,9 +591,9 @@ def get_transformer_model_class(self, model_arch): return AutoModelForImageTextToText if model_arch == "llava_next_video": - from transformers import AutoModelForVision2Seq + from transformers import LlavaNextVideoForConditionalGeneration - return AutoModelForVision2Seq + return LlavaNextVideoForConditionalGeneration if model_arch == "llava": from transformers import LlavaForConditionalGeneration @@ -1056,7 +1067,7 @@ class OVModelForPix2StructIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES = ["pix2struct"] TASK = "image-to-text" # is it fine as well with visual-question-answering? OVMODEL_CLASS = OVModelForVision2Seq - AUTOMODEL_CLASS = AutoModelForVision2Seq + AUTOMODEL_CLASS = transformers_auto_class GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 From a4d1dc0067813762978c3252c029b140b7e53ebd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 16:44:22 +0100 Subject: [PATCH 011/222] update workflow --- .github/workflows/test_openvino.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index f1874d3dbd..1e8433087c 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -56,7 +56,7 @@ jobs: pip install --upgrade pip uv uv pip install .[openvino,diffusers,tests] - - if: ${{ matrix.transformers-version != 'latest' }} + - if: ${{ matrix.transformers-version == '4.45.0' }} name: Install specific dependencies and versions required for older transformers run: | uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator From ac953baa2715e0f4665a4d6b03303cd679e7ebd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 17:30:56 +0100 Subject: [PATCH 012/222] style --- 
optimum/intel/utils/modeling_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index 83b5ccc1ac..69de1770ce 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -29,8 +29,7 @@ from transformers import CLIPConfig, PretrainedConfig, PreTrainedModel from optimum.exporters.tasks import TasksManager - -from .import_utils import ( +from optimum.intel.utils.import_utils import ( is_diffusers_available, is_numa_available, is_open_clip_available, From 800188441707ed6c8ea1b216d742cc110911b062 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 17:42:39 +0100 Subject: [PATCH 013/222] update setup --- .github/workflows/test_openvino.yml | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 1e8433087c..81c8b4b48a 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -38,7 +38,7 @@ jobs: "*diffusion*", "*quantization*", ] - transformers-version: ["4.45.0", "latest", "5.0.0"] + transformers-version: ["4.45", "4.57.6", "latest"] runs-on: ubuntu-22.04 @@ -61,7 +61,7 @@ jobs: run: | uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator - - if: ${{ matrix.transformers-version == 'latest' && matrix.test-pattern == '*decoder*'}} + - if: ${{ matrix.transformers-version != '4.45.0' && matrix.test-pattern == '*decoder*'}} name: Install auto-gptq, autoawq run: | uv pip install auto-gptq "autoawq<0.2.8" diff --git a/setup.py b/setup.py index 9937ad3ebf..1c313dbe0c 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", - "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@main", + 
"optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers-v5", "transformers>=4.45,<5.1", "setuptools", "huggingface-hub>=0.23.2,<2.0", From 5f2a00716ee2755fe9924d491d30ce476c2d947b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 17:48:35 +0100 Subject: [PATCH 014/222] deprecated is_offline_mode --- optimum/intel/openvino/modeling_open_clip.py | 25 ++++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/optimum/intel/openvino/modeling_open_clip.py b/optimum/intel/openvino/modeling_open_clip.py index db6abf9cc3..11bc115843 100644 --- a/optimum/intel/openvino/modeling_open_clip.py +++ b/optimum/intel/openvino/modeling_open_clip.py @@ -31,16 +31,27 @@ from transformers.file_utils import add_start_docstrings from transformers.modeling_outputs import ModelOutput from transformers.models.clip.modeling_clip import CLIPOutput -from transformers.utils import is_offline_mode +from optimum.exporters.openvino import main_export from optimum.exporters.tasks import TasksManager +from optimum.intel.openvino.configuration import ( + OVConfig, + OVWeightQuantizationConfig, +) +from optimum.intel.openvino.modeling import MODEL_START_DOCSTRING, OVModel +from optimum.intel.openvino.modeling_base import OVModelHostMixin +from optimum.intel.openvino.utils import ( + TemporaryDirectory, + classproperty, +) +from optimum.intel.utils.import_utils import is_huggingface_hub_version +from optimum.intel.utils.modeling_utils import _find_files_matching_pattern, _OpenClipForZeroShotImageClassification + -from ...exporters.openvino import main_export -from ..utils.modeling_utils import _find_files_matching_pattern, _OpenClipForZeroShotImageClassification -from .configuration import OVConfig, OVWeightQuantizationConfig -from .modeling import MODEL_START_DOCSTRING, OVModel -from .modeling_base import OVModelHostMixin -from .utils import TemporaryDirectory, classproperty +if is_huggingface_hub_version(">=", "1.2.1"): + 
from huggingface_hub import is_offline_mode +else: + from transformers.utils import is_offline_mode logger = logging.getLogger(__name__) From ad477fe92395a73a67aac57349cbe25c4a82e466 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 18:15:55 +0100 Subject: [PATCH 015/222] remove incompatible neural-compressor installation --- .github/workflows/build_documentation.yml | 2 +- .github/workflows/build_pr_documentation.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index bcb51d6b58..52dae651de 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -51,7 +51,7 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder - uv pip install .[quality] nncf openvino neural-compressor[pt]>3.4 diffusers accelerate datasets + uv pip install .[quality] nncf openvino diffusers accelerate datasets - name: Make documentation shell: bash diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index 16ac720c8f..c4a34baaa6 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -38,7 +38,7 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder - uv pip install .[quality] nncf openvino neural-compressor[pt]>3.4 diffusers accelerate datasets + uv pip install .[quality] nncf openvino diffusers accelerate datasets - name: Make documentation shell: bash From 42e98b8495fc4ac8dc090cf06c5459f400faff55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 18:18:57 +0100 Subject: [PATCH 016/222] remove documentation reference --- docs/source/neural_compressor/reference.mdx | 40 --------------------- 1 file changed, 40 deletions(-) diff --git 
a/docs/source/neural_compressor/reference.mdx b/docs/source/neural_compressor/reference.mdx index b6e3d8f468..b83618b4bc 100644 --- a/docs/source/neural_compressor/reference.mdx +++ b/docs/source/neural_compressor/reference.mdx @@ -14,43 +14,3 @@ specific language governing permissions and limitations under the License. `optimum.intel.neural_compressor` is deprecated and will be removed in the next major release. - -## INCQuantizer - -[[autodoc]] neural_compressor.quantization.INCQuantizer - -## INCTrainer - -[[autodoc]] neural_compressor.trainer.INCTrainer - -## INCModel - -[[autodoc]] neural_compressor.modeling_base.INCModel - -## INCModelForSequenceClassification - -[[autodoc]] neural_compressor.modeling_base.INCModelForSequenceClassification - -## INCModelForQuestionAnswering - -[[autodoc]] neural_compressor.modeling_base.INCModelForQuestionAnswering - -## INCModelForTokenClassification - -[[autodoc]] neural_compressor.modeling_base.INCModelForTokenClassification - -## INCModelForMultipleChoice - -[[autodoc]] neural_compressor.modeling_base.INCModelForMultipleChoice - -## INCModelForMaskedLM - -[[autodoc]] neural_compressor.modeling_base.INCModelForMaskedLM - -## INCModelForCausalLM - -[[autodoc]] neural_compressor.modeling_base.INCModelForCausalLM - -## INCModelForSeq2SeqLM - -[[autodoc]] neural_compressor.modeling_base.INCModelForSeq2SeqLM From 4ee3f51ccdf946ae44644e9980f494d0893c2f71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 18:30:58 +0100 Subject: [PATCH 017/222] add install transformers step --- .github/workflows/test_openvino.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 81c8b4b48a..2fcd23dbcf 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -38,7 +38,7 @@ jobs: "*diffusion*", "*quantization*", ] - transformers-version: ["4.45", "4.57.6", "latest"] + 
transformers-version: ["4.45.0", "4.57.6", "latest"] runs-on: ubuntu-22.04 @@ -56,6 +56,11 @@ jobs: pip install --upgrade pip uv uv pip install .[openvino,diffusers,tests] + - if: ${{ matrix.transformers-version != 'latest' }} + name: Install transformers + run: | + uv pip install transformers==${{ matrix.transformers-version }} + - if: ${{ matrix.transformers-version == '4.45.0' }} name: Install specific dependencies and versions required for older transformers run: | From 8204264e1ab001d039ccdfae3a3c48418ccc23d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 27 Jan 2026 18:54:01 +0100 Subject: [PATCH 018/222] transformers v5 --- .github/workflows/test_openvino.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 2fcd23dbcf..aef4ef484b 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -38,7 +38,7 @@ jobs: "*diffusion*", "*quantization*", ] - transformers-version: ["4.45.0", "4.57.6", "latest"] + transformers-version: ["4.45.0", "5.0.0", "latest"] runs-on: ubuntu-22.04 From b319d19a5e8d33761bad5de23291a2c8c87557af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 27 Jan 2026 19:10:25 +0100 Subject: [PATCH 019/222] install diffusers from source for v5 --- .github/workflows/test_openvino.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index aef4ef484b..5bcbc0e31c 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -66,6 +66,11 @@ jobs: run: | uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator + - if: ${{ matrix.transformers-version == '5.0.0' }} + name: Install diffusers + run: | + uv pip install git+https://github.com/huggingface/diffusers + - if: ${{ 
matrix.transformers-version != '4.45.0' && matrix.test-pattern == '*decoder*'}} name: Install auto-gptq, autoawq run: | From 42300e42bbea84fde261a6cf01f81ac3789081a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 27 Jan 2026 19:21:26 +0100 Subject: [PATCH 020/222] remove deprecated CLIPFeatureExtractor --- optimum/intel/openvino/modeling_diffusion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 368265bc3e..22182ee96c 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -50,7 +50,7 @@ from huggingface_hub.utils import validate_hf_hub_args from openvino import Core from openvino._offline_transformations import compress_model_transformation -from transformers import CLIPFeatureExtractor, CLIPTokenizer +from transformers import CLIPImageProcessor, CLIPTokenizer from transformers.modeling_outputs import ModelOutput from transformers.utils import http_user_agent @@ -170,7 +170,7 @@ def __init__( tokenizer: Optional[CLIPTokenizer] = None, tokenizer_2: Optional[CLIPTokenizer] = None, tokenizer_3: Optional[CLIPTokenizer] = None, - feature_extractor: Optional[CLIPFeatureExtractor] = None, + feature_extractor: Optional[CLIPImageProcessor] = None, # stable diffusion xl specific arguments force_zeros_for_empty_prompt: bool = True, requires_aesthetics_score: bool = False, From 2a761024506fa8536a77c603233e875e25a4dbb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 28 Jan 2026 16:17:24 +0100 Subject: [PATCH 021/222] openvino 2025.3.0 --- optimum/intel/openvino/__init__.py | 8 +++++--- setup.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index 8441944800..28e39f0528 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -35,8 
+35,12 @@ warnings.simplefilter(action="ignore", category=FutureWarning) + +logger = logging.getLogger(__name__) + + if is_openvino_version("<", "2025.4.0"): - raise ImportError( + logger.warning( "Optimum-intel requires OpenVINO version 2025.4.0 or higher. " "Please upgrade OpenVINO to version 2025.4 or later. " f"The current version of OpenVINO is {_openvino_version}." @@ -51,8 +55,6 @@ ) -logger = logging.getLogger(__name__) - if is_nncf_available(): import nncf diff --git a/setup.py b/setup.py index 1c313dbe0c..b2c945b37b 100644 --- a/setup.py +++ b/setup.py @@ -67,7 +67,7 @@ EXTRAS_REQUIRE = { "nncf": ["nncf>=2.19.0"], - "openvino": ["nncf>=2.19.0", "openvino>=2025.4.0", "openvino-tokenizers>=2025.4.0"], + "openvino": ["nncf>=2.19.0", "openvino==2025.3.0", "openvino-tokenizers==2025.3.0"], "neural-compressor": ["neural-compressor[pt]>=3.4.1", "accelerate", "transformers<4.46", "datasets"], "ipex": ["intel-extension-for-pytorch>=2.8", "transformers>4.54,<4.56", "accelerate"], "diffusers": ["diffusers"], From f38703a626c64495bd67233368b2b36a5d0a78af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 2026 10:57:49 +0100 Subject: [PATCH 022/222] add ov cache classes --- optimum/exporters/openvino/model_patcher.py | 167 ++++++++++++++++---- 1 file changed, 140 insertions(+), 27 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 73b25149d9..3639ece9cf 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -18,6 +18,7 @@ import logging as log import math import types +from collections.abc import Iterable from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import torch @@ -54,6 +55,118 @@ logger = logging.getLogger(__name__) +class OVDynamicCache(DynamicCache): + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1005 + def to_legacy_cache(self) 
-> tuple[tuple[torch.Tensor, torch.Tensor]]: + """ + Converts the `Cache` instance into the its equivalent in the legacy cache format. Used for + backward compatibility. + """ + legacy_cache = () + for layer in self.layers: + legacy_cache += ((layer.keys, layer.values),) + return legacy_cache + + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1015 + @classmethod + def from_legacy_cache(cls, past_key_values: tuple[tuple[torch.Tensor, torch.Tensor]]) -> "DynamicCache": + """ + Converts a cache in the legacy cache format into an equivalent `Cache`. Used for + backward compatibility. + """ + cache = cls() + if past_key_values is None: + logger.warning_once("past_key_values should not be None in from_legacy_cache()") + if past_key_values is not None: + for layer_idx in range(len(past_key_values)): + key_states, value_states = past_key_values[layer_idx] + cache.update(key_states, value_states, layer_idx) + return cache + + +class OVEncoderDecoderCache(EncoderDecoderCache): + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1266 + def to_legacy_cache(self) -> tuple[tuple[torch.Tensor]]: + """Converts the `EncoderDecoderCache` instance into its equivalent in the legacy cache format.""" + legacy_cache = () + if len(self.cross_attention_cache) > 0: + for self_attn, cross_attn in zip( + self.self_attention_cache.to_legacy_cache(), self.cross_attention_cache.to_legacy_cache() + ): + legacy_cache += (self_attn + cross_attn,) + else: + legacy_cache = self.self_attention_cache.to_legacy_cache() + return legacy_cache + + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1279 + @classmethod + def from_legacy_cache( + cls, past_key_values: Optional[Iterable[tuple[torch.FloatTensor, ...]]] + ) -> "EncoderDecoderCache": + """Converts a cache in the legacy cache format into an equivalent `EncoderDecoderCache`.""" + cache = 
cls(DynamicCache(), DynamicCache()) + if past_key_values is None: + logger.warning_once("past_key_values should not be None in from_legacy_cache()") + else: + for layer_idx, key_value_states in enumerate(past_key_values): + key_states, value_states = key_value_states[:2] + cache.self_attention_cache.update(key_states, value_states, layer_idx) + if len(key_value_states) > 2: + key_states, value_states = key_value_states[2:] + cache.cross_attention_cache.update(key_states, value_states, layer_idx) + cache.is_updated[layer_idx] = True + return cache + + +def preprocess_past_key_values(past_key_values): + if ( + is_transformers_version(">=", "4.48") + and isinstance(past_key_values, (list, tuple)) + and isinstance(past_key_values[0], (list, tuple)) + ): + if len(past_key_values[0]) == 2: + past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) + elif len(past_key_values[0]) == 4: + past_key_values = OVEncoderDecoderCache.from_legacy_cache(past_key_values) + else: + raise ValueError( + f"past_key_values should have either 2 or 4 elements, but it has {len(past_key_values[0])} elements." 
+ ) + + return past_key_values + + +class OVModelPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: "PreTrainedModel", + model_kwargs: Optional[Dict[str, Any]] = None, + ): + super().__init__(config, model, model_kwargs) + + self.model_patched_forward = self.patched_forward + + @functools.wraps(self.model_patched_forward) + def patched_forward(*args, **kwargs): + signature = inspect.signature(self.model_patched_forward) + args, kwargs = override_arguments(args, kwargs, signature, model_kwargs=self.model_kwargs) + + if "past_key_values" in signature.parameters: + # Most models require past_key_values to be a cache instance instead of a tuple now + pkv_index = list(signature.parameters.keys()).index("past_key_values") + if pkv_index < len(args) and args[pkv_index] is not None: + args[pkv_index] = preprocess_past_key_values(args[pkv_index]) + elif kwargs.get("past_key_values") is not None: + kwargs["past_key_values"] = preprocess_past_key_values(kwargs["past_key_values"]) + + outputs = self.model_patched_forward(*args, **kwargs) + + return outputs + + self.patched_forward = patched_forward + + for idx, spec in enumerate(UNSUPPORTED_OPS_PATCHING_SPEC): if spec.name in { # onnx-exporter-specific fixes @@ -210,7 +323,7 @@ def eager_mask_without_vmap(*args, **kwargs) -> Optional[torch.Tensor]: return mask -class OVDecoderModelPatcher(ModelPatcher): +class OVDecoderModelPatcher(OVModelPatcher): def __enter__(self): super().__enter__() @@ -3069,7 +3182,7 @@ def __exit__(self, exc_type, exc_value, traceback): layer.self_attn.forward = layer.self_attn._orig_forward -class IBertModelPatcher(ModelPatcher): +class IBertModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3087,7 +3200,7 @@ def __init__( self._model(torch.ones([1, 1], dtype=torch.long)) -class InternVLChatImageEmbeddingModelPatcher(ModelPatcher): +class InternVLChatImageEmbeddingModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3230,7 
+3343,7 @@ def maira_vision_embed_forward(self, pixel_values): return self.get_image_features(pixel_values, vision_feature_layer, vision_feature_select_strategy) -class LlavaImageEmbeddingModelPatcher(ModelPatcher): +class LlavaImageEmbeddingModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3247,7 +3360,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class MairaImageEmbeddingModelPatcher(ModelPatcher): +class MairaImageEmbeddingModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3264,7 +3377,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class LlavaNextVideoImageEmbeddingModelPatcher(ModelPatcher): +class LlavaNextVideoImageEmbeddingModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3305,7 +3418,7 @@ def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor: return emb.unsqueeze(1) -class FluxTransfromerModelPatcher(ModelPatcher): +class FluxTransfromerModelPatcher(OVModelPatcher): def __enter__(self): super().__enter__() if is_diffusers_version("<", "0.31.0"): @@ -3480,7 +3593,7 @@ def _minicpmv_siglip_transformer_forward( ) -class MiniCPMVResamplerModelPatcher(ModelPatcher): +class MiniCPMVResamplerModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3497,7 +3610,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class MiniCPMVImageEmbeddingsModelPatcher(ModelPatcher): +class MiniCPMVImageEmbeddingsModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3528,7 +3641,7 @@ def __exit__(self, exc_type, exc_value, traceback): layer.self_attn.forward = layer.self_attn._orig_forward -class LlavaQwen2ImageEmbeddingsModelPatcher(ModelPatcher): +class LlavaQwen2ImageEmbeddingsModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3546,7 +3659,7 @@ def 
__exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class InputEmbeddingPatcher(ModelPatcher): +class InputEmbeddingPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3571,7 +3684,7 @@ def phi3_vision_embeddings_forward(self, pixel_values: torch.FloatTensor): return self.get_img_features(pixel_values) -class Phi3VisionImageEmbeddingsPatcher(ModelPatcher): +class Phi3VisionImageEmbeddingsPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4180,7 +4293,7 @@ def block_forward( block.attn.forward = types.MethodType(sdpa_attn_forward, block.attn) -class Qwen2VLVisionEmbMergerPatcher(ModelPatcher): +class Qwen2VLVisionEmbMergerPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4214,7 +4327,7 @@ def __exit__(self, exc_type, exc_value, traceback): block.attn.forward = block.attn._orig_forward -class Qwen2_5_VLVisionEmbMergerPatcher(ModelPatcher): +class Qwen2_5_VLVisionEmbMergerPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4345,7 +4458,7 @@ def __exit__(self, exc_type, exc_value, traceback): block_sparse_moe.output_linear.forward = block_sparse_moe.output_linear._orig_forward -class OVSeq2SeqModelPatcher(ModelPatcher): +class OVSeq2SeqModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4430,7 +4543,7 @@ def __exit__(self, exc_type, exc_value, traceback): ALL_MASK_ATTENTION_FUNCTIONS.register("eager", eager_mask) -class SanaTextEncoderModelPatcher(ModelPatcher): +class SanaTextEncoderModelPatcher(OVModelPatcher): def __enter__(self): super().__enter__() @@ -4481,7 +4594,7 @@ def __init__( super().__init__(config, model, model_kwargs) -class CommonImageEmbeddingsModelPatcher(ModelPatcher): +class CommonImageEmbeddingsModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4631,7 +4744,7 @@ def __exit__(self, exc_type, exc_value, traceback): del self._model.model._orig_update_causual_mask -class 
Idefics3ImageEmbeddingsModelPatcher(ModelPatcher): +class Idefics3ImageEmbeddingsModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -5410,7 +5523,7 @@ def speecht5_decoder_layer_forward( return outputs -class OVSpeechT5ModelPatcher(ModelPatcher): +class OVSpeechT5ModelPatcher(OVModelPatcher): def __enter__(self): if self.real_config._behavior != "vocoder": super().__enter__() @@ -5586,7 +5699,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class Phi4MMAudioForwardEmbeddingsPatcher(ModelPatcher): +class Phi4MMAudioForwardEmbeddingsPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -5610,7 +5723,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class Phi4MMAudioEncoderPatcher(ModelPatcher): +class Phi4MMAudioEncoderPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -5651,7 +5764,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class Phi4MMVisionEmbeddingsPatcher(ModelPatcher): +class Phi4MMVisionEmbeddingsPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -5960,7 +6073,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.img_processor.embeddings.forward = self._model.img_processor.embeddings._orig_forward -class Llama4ImageEmbeddingsModelPatcher(ModelPatcher): +class Llama4ImageEmbeddingsModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -6146,7 +6259,7 @@ def llama4_moe_forward(self, hidden_states): return out, router_scores -class Llama4TextModelPatcher(ModelPatcher): +class Llama4TextModelPatcher(OVModelPatcher): def __enter__(self): super().__enter__() @@ -6316,7 +6429,7 @@ def mamba_mixer_forward( # 1. Inject a MambaCache structure into the original model to simplify input and output handling related to SSM states # 2. 
Patch ConvSequenceTransform module to avoid if-else branching # 3. Vectorize the selective scan operation to ensure correct behavior during JIT tracing -class MambaPatcher(ModelPatcher): +class MambaPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -6810,7 +6923,7 @@ def segment_sum(input_tensor): # for subsequent invocation of the model's `forward` method. # 2. Patches the Zamba2MambaMixer so that the traced `forward` function works correctly # during both the prefill and decoding steps. -class Zamba2ModelPatcher(ModelPatcher): +class Zamba2ModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -7236,7 +7349,7 @@ def granite_moe_hybrid_update_causal_mask( return causal_mask -class GraniteMoeHybridModelPatcher(ModelPatcher): +class GraniteMoeHybridModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", From 2d3c734c8abf3a9907cb929accd04ef61d57a5ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 2026 13:28:58 +0100 Subject: [PATCH 023/222] openvino nightly for modeling tests --- .github/workflows/test_openvino.yml | 8 ++++---- setup.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 5bcbc0e31c..950c2f987c 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -81,15 +81,15 @@ jobs: run: | python tests/scripts/login_with_ci_token.py - - name: Test with Pytest - run: | - pytest tests/openvino/${{ matrix.test-pattern }} --durations=0 - - if: ${{ matrix.test-pattern == '*modeling*' }} name: Install Nightly OpenVINO run: | uv pip install --upgrade --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + - name: Test with Pytest + run: | + pytest tests/openvino/${{ matrix.test-pattern }} --durations=0 + - if: ${{ matrix.test-pattern == '*modeling*' }} name: Test with Pytest - Nightly OpenVINO run: 
| diff --git a/setup.py b/setup.py index b2c945b37b..1c313dbe0c 100644 --- a/setup.py +++ b/setup.py @@ -67,7 +67,7 @@ EXTRAS_REQUIRE = { "nncf": ["nncf>=2.19.0"], - "openvino": ["nncf>=2.19.0", "openvino==2025.3.0", "openvino-tokenizers==2025.3.0"], + "openvino": ["nncf>=2.19.0", "openvino>=2025.4.0", "openvino-tokenizers>=2025.4.0"], "neural-compressor": ["neural-compressor[pt]>=3.4.1", "accelerate", "transformers<4.46", "datasets"], "ipex": ["intel-extension-for-pytorch>=2.8", "transformers>4.54,<4.56", "accelerate"], "diffusers": ["diffusers"], From b6dcefd0949130e084d378e5bc6d7cc46c9e698c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 2026 14:21:35 +0100 Subject: [PATCH 024/222] openvino 2025.3 for modeling tests --- .github/workflows/test_openvino.yml | 13 +++++++++---- setup.py | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 950c2f987c..37e61ea335 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -56,6 +56,11 @@ jobs: pip install --upgrade pip uv uv pip install .[openvino,diffusers,tests] + - if: ${{ matrix.test-pattern == '*modeling*' }} + name: Install OpenVINO + run: | + uv pip install openvino==2025.3.0 openvino-tokenizers==2025.3.0 + - if: ${{ matrix.transformers-version != 'latest' }} name: Install transformers run: | @@ -81,15 +86,15 @@ jobs: run: | python tests/scripts/login_with_ci_token.py + - name: Test with Pytest + run: | + pytest tests/openvino/${{ matrix.test-pattern }} --durations=0 + - if: ${{ matrix.test-pattern == '*modeling*' }} name: Install Nightly OpenVINO run: | uv pip install --upgrade --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - - name: Test with Pytest - run: | - pytest tests/openvino/${{ matrix.test-pattern }} --durations=0 - - if: ${{ matrix.test-pattern == '*modeling*' }} name: Test with 
Pytest - Nightly OpenVINO run: | diff --git a/setup.py b/setup.py index 1c313dbe0c..c072bab7f3 100644 --- a/setup.py +++ b/setup.py @@ -67,7 +67,7 @@ EXTRAS_REQUIRE = { "nncf": ["nncf>=2.19.0"], - "openvino": ["nncf>=2.19.0", "openvino>=2025.4.0", "openvino-tokenizers>=2025.4.0"], + "openvino": ["nncf>=2.19.0", "openvino>=2025.3.0", "openvino-tokenizers>=2025.3.0"], "neural-compressor": ["neural-compressor[pt]>=3.4.1", "accelerate", "transformers<4.46", "datasets"], "ipex": ["intel-extension-for-pytorch>=2.8", "transformers>4.54,<4.56", "accelerate"], "diffusers": ["diffusers"], From ea24727535b789e228106790f2f725c50dc8309b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 2026 15:51:08 +0100 Subject: [PATCH 025/222] stop moving misplaced parameters from config to generation_config --- optimum/exporters/openvino/convert.py | 29 ++++++++++---------- optimum/intel/openvino/modeling_base.py | 32 +++++++++++----------- optimum/intel/openvino/modeling_seq2seq.py | 29 ++++++++++---------- 3 files changed, 46 insertions(+), 44 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 4b0652393d..794e38c9ed 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -688,20 +688,21 @@ def export_from_model( files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()] elif library_name != "diffusers": - # some model configs may have issues with loading without parameters initialization - try: - misplaced_generation_parameters = model.config._get_non_default_generation_parameters() - except (AttributeError, KeyError, TypeError): - misplaced_generation_parameters = {} - if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0: - logger.warning( - "Moving the following attributes in the config to the generation config: " - f"{misplaced_generation_parameters}. 
You are seeing this warning because you've set " - "generation parameters in the model config, as opposed to in the generation config.", - ) - for param_name, param_value in misplaced_generation_parameters.items(): - setattr(model.generation_config, param_name, param_value) - setattr(model.config, param_name, None) + if is_transformers_version("<", "5"): + # some model configs may have issues with loading without parameters initialization + try: + misplaced_generation_parameters = model.config._get_non_default_generation_parameters() + except (AttributeError, KeyError, TypeError): + misplaced_generation_parameters = {} + if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(model.generation_config, param_name, param_value) + setattr(model.config, param_name, None) # Saving the model config and preprocessor as this is needed sometimes. 
save_config(model.config, output) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index b3c9a11eb0..0d95cc233d 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -47,7 +47,7 @@ classproperty, model_has_dynamic_inputs, ) -from optimum.intel.utils.import_utils import is_huggingface_hub_version, is_nncf_available +from optimum.intel.utils.import_utils import is_huggingface_hub_version, is_nncf_available, is_transformers_version from optimum.intel.utils.modeling_utils import _find_files_matching_pattern from optimum.modeling_base import FROM_PRETRAINED_START_DOCSTRING, OptimizedModel @@ -265,21 +265,21 @@ def __init__( if self.can_generate(): self.generation_config = generation_config or GenerationConfig.from_model_config(config) - # some model configs may have issues with loading without parameters initialization - try: - misplaced_generation_parameters = self.config._get_non_default_generation_parameters() - except (KeyError, TypeError): - misplaced_generation_parameters = {} - if len(misplaced_generation_parameters) > 0: - logger.warning( - "Moving the following attributes in the config to the generation config: " - f"{misplaced_generation_parameters}. 
You are seeing this warning because you've set " - "generation parameters in the model config, as opposed to in the generation config.", - ) - for param_name, param_value in misplaced_generation_parameters.items(): - setattr(self.generation_config, param_name, param_value) - setattr(self.config, param_name, None) - + if is_transformers_version("<", "5"): + # some model configs may have issues with loading without parameters initialization + try: + misplaced_generation_parameters = self.config._get_non_default_generation_parameters() + except (KeyError, TypeError): + misplaced_generation_parameters = {} + if len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(self.generation_config, param_name, param_value) + setattr(self.config, param_name, None) else: self.generation_config = None diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py index c5365e8aae..cb8d6b7fa4 100644 --- a/optimum/intel/openvino/modeling_seq2seq.py +++ b/optimum/intel/openvino/modeling_seq2seq.py @@ -370,20 +370,21 @@ def __init__( generation_config = kwargs.get("generation_config", None) self.generation_config = generation_config or GenerationConfig.from_model_config(config) - # some model configs may have issues with loading without parameters initialization - try: - misplaced_generation_parameters = self.config._get_non_default_generation_parameters() - except (KeyError, TypeError): - misplaced_generation_parameters = {} - if len(misplaced_generation_parameters) > 0: - logger.warning( - "Moving the following attributes in the config to the generation config: " - f"{misplaced_generation_parameters}. 
You are seeing this warning because you've set " - "generation parameters in the model config, as opposed to in the generation config.", - ) - for param_name, param_value in misplaced_generation_parameters.items(): - setattr(self.generation_config, param_name, param_value) - setattr(self.config, param_name, None) + if is_transformers_version("<", "5"): + # some model configs may have issues with loading without parameters initialization + try: + misplaced_generation_parameters = self.config._get_non_default_generation_parameters() + except (KeyError, TypeError): + misplaced_generation_parameters = {} + if len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(self.generation_config, param_name, param_value) + setattr(self.config, param_name, None) self._openvino_config = None if quantization_config: From 07ff06b936fea14798feb0ca208449bc408b3694 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 2026 15:54:59 +0100 Subject: [PATCH 026/222] fix transformers version for doc building --- .github/workflows/build_pr_documentation.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index c4a34baaa6..01a5bbe7e9 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -38,6 +38,7 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder + uv pip install transformers==4.57.6 uv pip install .[quality] nncf openvino diffusers accelerate datasets - name: Make documentation From 1270db0612cad34664ec7b295c55e19ea0be38fb 
Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 2026 15:56:32 +0100 Subject: [PATCH 027/222] fix transformers version for doc building --- .github/workflows/build_documentation.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index 52dae651de..332563450b 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -51,6 +51,7 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder + uv pip install transformers==4.57.6 uv pip install .[quality] nncf openvino diffusers accelerate datasets - name: Make documentation From eb045ce620a72080d746dd2877b12a685c9bb79a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 2026 18:57:38 +0100 Subject: [PATCH 028/222] Use model.get_image_features --- optimum/exporters/openvino/model_configs.py | 25 ++++++++++++++------- optimum/exporters/openvino/model_patcher.py | 6 ++++- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 67686b94bb..a8b293ca84 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -317,6 +317,13 @@ def init_model_configs(): register_in_tasks_manager = TasksManager.create_register("openvino", overwrite_existing=True) +def _get_language_model(model): + if is_transformers_version("<", "5"): + return model.language_model + + return model.model.language_model + + @register_in_tasks_manager("baichuan", *["text-generation", "text-generation-with-past"], library_name="transformers") class BaichaunOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 13 @@ -1702,14 +1709,14 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) behavior = VLMConfigBehavior(behavior) if behavior 
== VLMConfigBehavior.LANGUAGE: - return model.language_model if not hasattr(model, "lm_head") else model + return _get_language_model(model) if not hasattr(model, "lm_head") else model if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return model if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: text_embedding = model.get_input_embeddings() - text_embedding.config = model.language_model.config + text_embedding.config = _get_language_model(model).config return text_embedding def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): @@ -1892,8 +1899,8 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: - text_embedding = model.language_model.get_input_embeddings() - text_embedding.config = model.language_model.config + text_embedding = _get_language_model(model).get_input_embeddings() + text_embedding.config = _get_language_model(model).config return text_embedding return super().get_model_for_behavior(model, behavior) @@ -1969,14 +1976,14 @@ def get_model_for_behavior(model, behavior: Union[str, VLMConfigBehavior]): behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.LANGUAGE: - return model.language_model + return _get_language_model(model) if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return model if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: - text_embedding = model.language_model.get_input_embeddings() - text_embedding.config = model.language_model.config + text_embedding = _get_language_model(model).get_input_embeddings() + text_embedding.config = _get_language_model(model).config return text_embedding def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): @@ -3477,7 +3484,9 @@ def get_model_for_behavior(model, behavior: Union[str, Qwen2VLConfigBehavior]): if behavior == Qwen2VLConfigBehavior.TEXT_EMBEDDINGS: 
text_embedding = ( - model.model.embed_tokens if hasattr(model.model, "embed_tokens") else model.language_model.embed_tokens + model.model.embed_tokens + if hasattr(model.model, "embed_tokens") + else _get_language_model(model).embed_tokens ) text_embedding.config = model.config return text_embedding diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 3639ece9cf..1c1cb3bb9f 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3351,7 +3351,11 @@ def __init__( model_kwargs: Dict[str, Any], ): model.__orig_forward = model.forward - model.forward = types.MethodType(llava_vision_embed_forward, model) + + if is_transformers_version("<", "5"): + model.forward = types.MethodType(llava_vision_embed_forward, model) + else: + model.forward = model.get_image_features super().__init__(config, model, model_kwargs) From f2f352dd92891a0d1eba46a9e0298d4848fb494d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 2026 18:58:22 +0100 Subject: [PATCH 029/222] Use model.get_image_features --- optimum/exporters/openvino/model_patcher.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 1c1cb3bb9f..7aec5bbe41 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3389,7 +3389,11 @@ def __init__( model_kwargs: Dict[str, Any], ): model.__orig_forward = model.forward - model.forward = types.MethodType(llava_next_video_vision_embed_forward, model) + + if is_transformers_version("<", "5"): + model.forward = types.MethodType(llava_next_video_vision_embed_forward, model) + else: + model.forward = model.get_image_features super().__init__(config, model, model_kwargs) From 1db8fb9820a23d8a3e1d19201823c39aff1b99a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 
2026 19:07:03 +0100 Subject: [PATCH 030/222] only add codegen remote code models when transformers < v5 --- tests/openvino/test_decoder.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 183f362913..33d8383876 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -48,7 +48,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "blenderbot-small", "bloom", "codegen", - "codegen2", "gpt2", "gptj", "gpt_neo", @@ -147,6 +146,9 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version("<", "4.56.0"): SUPPORTED_ARCHITECTURES += ("qwen", "chatglm", "chatglm4") + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("codegen2",) + GENERATION_LENGTH = 100 EXPECTED_NUM_SDPA = { From 0c72bc518c69ad2450217ad288b5172f2db19768 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 30 Jan 2026 16:06:32 +0100 Subject: [PATCH 031/222] Fix pipelines --- optimum/intel/pipelines/accelerator_utils.py | 30 ++++++++++++++------ 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/optimum/intel/pipelines/accelerator_utils.py b/optimum/intel/pipelines/accelerator_utils.py index 7ea4102ec7..9090b8f30f 100644 --- a/optimum/intel/pipelines/accelerator_utils.py +++ b/optimum/intel/pipelines/accelerator_utils.py @@ -18,10 +18,15 @@ import transformers.pipelines from transformers import AutoConfig +from optimum.intel.utils import ( + IPEX_IMPORT_ERROR, + OPENVINO_IMPORT_ERROR, + is_ipex_available, + is_openvino_available, + is_transformers_version, +) from optimum.utils.logging import get_logger -from ..utils import IPEX_IMPORT_ERROR, OPENVINO_IMPORT_ERROR, is_ipex_available, is_openvino_available - if TYPE_CHECKING: from transformers import PretrainedConfig @@ -154,7 +159,10 @@ def openvino_infer_framework_load_model( You can also provide None as the model to use a default one.""" ) - return "pt", 
ov_model + if is_transformers_version("<", "5"): + return "pt", ov_model + + return ov_model def get_ipex_model_class(task: str, **model_kwargs): @@ -189,27 +197,33 @@ def ipex_infer_framework_load_model( You can also provide None as the model to use a default one.""" ) - return "pt", ipex_model + if is_transformers_version("<", "5"): + return "pt", ipex_model + + return ipex_model @contextlib.contextmanager def patch_pipelines_to_load_accelerator_model(accelerator: str): - original_infer_framework_load_model = transformers.pipelines.infer_framework_load_model + target_fn = "infer_framework_load_model" if is_transformers_version("<", "5") else "load_model" + + original_infer_framework_load_model = getattr(transformers.pipelines, target_fn) if accelerator == "openvino": if not is_openvino_available(): raise ImportError(OPENVINO_IMPORT_ERROR.format("`accelerator=openvino`")) - transformers.pipelines.infer_framework_load_model = openvino_infer_framework_load_model + setattr(transformers.pipelines, target_fn, openvino_infer_framework_load_model) + elif accelerator == "ipex": if not is_ipex_available(): raise ImportError(IPEX_IMPORT_ERROR.format("`accelerator=ipex`")) - transformers.pipelines.infer_framework_load_model = ipex_infer_framework_load_model + setattr(transformers.pipelines, target_fn, ipex_infer_framework_load_model) else: raise ValueError(f"Accelerator '{accelerator}' is not supported. 
Only 'openvino' and 'ipex' are supported.") try: yield finally: - transformers.pipelines.infer_framework_load_model = original_infer_framework_load_model + setattr(transformers.pipelines, target_fn, original_infer_framework_load_model) From 08ebe2b6d1778df68cb41b2297db074615a9a87b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 30 Jan 2026 16:24:16 +0100 Subject: [PATCH 032/222] fix pipelines --- optimum/intel/pipelines/accelerator_utils.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/optimum/intel/pipelines/accelerator_utils.py b/optimum/intel/pipelines/accelerator_utils.py index 9090b8f30f..9ac24d06d7 100644 --- a/optimum/intel/pipelines/accelerator_utils.py +++ b/optimum/intel/pipelines/accelerator_utils.py @@ -13,7 +13,7 @@ # limitations under the License. import contextlib -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Dict, Optional, Tuple import transformers.pipelines from transformers import AutoConfig @@ -143,12 +143,15 @@ def get_openvino_model_class( # a modified transformers.pipelines.base.infer_framework_load_model that loads OpenVINO models def openvino_infer_framework_load_model( - model, config: Optional["PretrainedConfig"] = None, task: Optional[str] = None, **model_kwargs + model, + config: Optional["PretrainedConfig"] = None, + model_classes: Optional[Dict[str, Tuple[type]]] = None, + task: Optional[str] = None, + framework: Optional[str] = None, + **model_kwargs, ): if isinstance(model, str): - model_kwargs.pop("framework", None) model_kwargs.pop("_commit_hash", None) # not supported for OVModel - model_kwargs.pop("model_classes", None) ov_model_class = get_openvino_model_class(task, config, model, **model_kwargs) ov_model = ov_model_class.from_pretrained(model, **model_kwargs) elif isinstance(model, OVBaseModel): @@ -181,12 +184,15 @@ def get_ipex_model_class(task: str, **model_kwargs): # a modified 
transformers.pipelines.base.infer_framework_load_model that loads IPEX models def ipex_infer_framework_load_model( - model, config: Optional["PretrainedConfig"] = None, task: Optional[str] = None, **model_kwargs + model, + config: Optional["PretrainedConfig"] = None, + model_classes: Optional[Dict[str, Tuple[type]]] = None, + task: Optional[str] = None, + framework: Optional[str] = None, + **model_kwargs, ): if isinstance(model, str): - model_kwargs.pop("framework", None) model_kwargs.pop("_commit_hash", None) # not supported for IPEXModel - model_kwargs.pop("model_classes", None) ipex_model_class = get_ipex_model_class(task, **model_kwargs) ipex_model = ipex_model_class.from_pretrained(model, **model_kwargs) elif isinstance(model, IPEXModel): From 33f8c24df28e80efe49fa5beabef103d23ea89e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 30 Jan 2026 16:34:34 +0100 Subject: [PATCH 033/222] replace with OV cache --- optimum/exporters/openvino/model_patcher.py | 33 ++++++++++----------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 7aec5bbe41..fa8fe1bfb6 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -69,7 +69,7 @@ def to_legacy_cache(self) -> tuple[tuple[torch.Tensor, torch.Tensor]]: # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1015 @classmethod - def from_legacy_cache(cls, past_key_values: tuple[tuple[torch.Tensor, torch.Tensor]]) -> "DynamicCache": + def from_legacy_cache(cls, past_key_values: tuple[tuple[torch.Tensor, torch.Tensor]]) -> "OVDynamicCache": """ Converts a cache in the legacy cache format into an equivalent `Cache`. Used for backward compatibility. 
@@ -87,7 +87,7 @@ def from_legacy_cache(cls, past_key_values: tuple[tuple[torch.Tensor, torch.Tens class OVEncoderDecoderCache(EncoderDecoderCache): # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1266 def to_legacy_cache(self) -> tuple[tuple[torch.Tensor]]: - """Converts the `EncoderDecoderCache` instance into its equivalent in the legacy cache format.""" + """Converts the `OVEncoderDecoderCache` instance into its equivalent in the legacy cache format.""" legacy_cache = () if len(self.cross_attention_cache) > 0: for self_attn, cross_attn in zip( @@ -102,9 +102,9 @@ def to_legacy_cache(self) -> tuple[tuple[torch.Tensor]]: @classmethod def from_legacy_cache( cls, past_key_values: Optional[Iterable[tuple[torch.FloatTensor, ...]]] - ) -> "EncoderDecoderCache": - """Converts a cache in the legacy cache format into an equivalent `EncoderDecoderCache`.""" - cache = cls(DynamicCache(), DynamicCache()) + ) -> "OVEncoderDecoderCache": + """Converts a cache in the legacy cache format into an equivalent `OVEncoderDecoderCache`.""" + cache = cls(OVDynamicCache(), OVDynamicCache()) if past_key_values is None: logger.warning_once("past_key_values should not be None in from_legacy_cache()") else: @@ -1451,7 +1451,7 @@ def phi3_442_forward( if use_cache: use_legacy_cache = not isinstance(past_key_values, Cache) if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) past_key_values_length = past_key_values.get_usable_length(seq_length) if position_ids is None: @@ -3023,7 +3023,7 @@ def patched_forward(*args, **kwargs): legacy_pkv = args[pkv_argument_index] pkv_in_args = True if legacy_pkv is not None: - pkv = DynamicCache.from_legacy_cache(legacy_pkv) + pkv = OVDynamicCache.from_legacy_cache(legacy_pkv) return_legacy_cache = True if not pkv_in_args: kwargs["past_key_values"] = pkv @@ -4162,7 +4162,7 @@ def forward_wrap( 
input_ids=None, use_cache=True, ): - new_past_key_values = DynamicCache.from_legacy_cache(past_key_values) + new_past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) result = self.__orig_forward( input_ids=input_ids, attention_mask=attention_mask, @@ -4498,11 +4498,11 @@ def patched_forward(*args, **kwargs): pkv = args[pkv_arg_index] if pkv is not None: - if isinstance(pkv, EncoderDecoderCache): + if isinstance(pkv, OVEncoderDecoderCache): pkv = pkv.self_attention_cache.to_legacy_cache() else: pkv = [pkv_item[:2] for pkv_item in pkv] - pkv = EncoderDecoderCache.from_legacy_cache(pkv) + pkv = OVEncoderDecoderCache.from_legacy_cache(pkv) if "past_key_values" in kwargs: kwargs["past_key_values"] = pkv @@ -4512,7 +4512,7 @@ def patched_forward(*args, **kwargs): outputs = self.super_patched_forward(*args, **kwargs) # the optimum-onnx seq2seq model patcher only converts to tuple starting from 4.48 - if isinstance(outputs.get("past_key_values"), (DynamicCache, EncoderDecoderCache)): + if isinstance(outputs.get("past_key_values"), (OVDynamicCache, OVEncoderDecoderCache)): outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() # we still need to filter out cross attention in the case of non-stateful decoder @@ -4687,7 +4687,7 @@ def __init__( def forward( self, attention_mask, position_ids, past_key_values, token_type_ids, inputs_embeds, use_cache=True ): - pkv = DynamicCache.from_legacy_cache(past_key_values) + pkv = OVDynamicCache.from_legacy_cache(past_key_values) past_seen_tokens = past_key_values[0][0].shape[-2] cache_position = torch.arange( @@ -5052,7 +5052,6 @@ def _blenderbot_attn_forward_new( output_attentions: bool = False, cache_position: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - from transformers.cache_utils import EncoderDecoderCache """Input shape: Batch x Time x Channel""" @@ -5076,7 +5075,7 @@ def _blenderbot_attn_forward_new( query_states = 
query_states if past_key_value is not None: - if isinstance(past_key_value, EncoderDecoderCache): + if isinstance(past_key_value, OVEncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) if is_cross_attention: # after the first generated id, we can subsequently re-use all key/value_states from cache @@ -5601,7 +5600,7 @@ def patched_decoder_forward( if past_key_values is not None: past_key_values = [cache_item[:2] for cache_item in past_key_values] if is_transformers_version(">=", "4.56"): - past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + past_key_values = OVEncoderDecoderCache.from_legacy_cache(past_key_values) output_sequence = inputs_embeds output_cross_attentions = False @@ -5633,7 +5632,7 @@ def patched_decoder_forward( past_key_values = decoder_out.past_key_values if past_key_values is not None: - if isinstance(past_key_values, EncoderDecoderCache): + if isinstance(past_key_values, OVEncoderDecoderCache): past_key_values = past_key_values.self_attention_cache.to_legacy_cache() else: past_key_values = [cache_item[:2] for cache_item in past_key_values] @@ -5685,7 +5684,7 @@ def __init__( # Adopted from https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py#L2156-L2178 # moved audio and vision features processing outside model def lm_forward(self, inputs_embeds, attention_mask, position_ids, past_key_values, use_cache=True): - pkv = DynamicCache.from_legacy_cache(past_key_values) + pkv = OVDynamicCache.from_legacy_cache(past_key_values) outputs = self.model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, From 9809e7ede6f973ea3ef625f5baabc06365b5f0ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 30 Jan 2026 16:35:23 +0100 Subject: [PATCH 034/222] style --- optimum/exporters/openvino/model_patcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py 
b/optimum/exporters/openvino/model_patcher.py index fa8fe1bfb6..a85bd6f75c 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -5052,7 +5052,6 @@ def _blenderbot_attn_forward_new( output_attentions: bool = False, cache_position: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" # if key_value_states are provided this layer is used as a cross-attention layer From 621e2bf710b44df53fc15435760bdc17c34886d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 30 Jan 2026 17:51:21 +0100 Subject: [PATCH 035/222] Use AutoProcessor instead of AutoFeatureExtractor --- tests/openvino/test_modeling.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 4ffa7ab06b..777b276859 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -1184,19 +1184,18 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(ov_model.config, PretrainedConfig) set_seed(SEED) transformers_model = AutoModelForImageClassification.from_pretrained(model_id) - preprocessor = AutoFeatureExtractor.from_pretrained(model_id) + preprocessor = AutoProcessor.from_pretrained(model_id) url = TEST_IMAGE_URL image = Image.open(requests.get(url, stream=True).raw) inputs = preprocessor(images=image, return_tensors="pt") with torch.no_grad(): transformers_outputs = transformers_model(**inputs) - for input_type in ["pt", "np"]: - inputs = preprocessor(images=image, return_tensors=input_type) - ov_outputs = ov_model(**inputs) - self.assertIn("logits", ov_outputs) - self.assertIsInstance(ov_outputs.logits, TENSOR_ALIAS_TO_TYPE[input_type]) - # Compare tensor outputs - self.assertTrue(torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-4)) + inputs = 
preprocessor(images=image, return_tensors="pt") + ov_outputs = ov_model(**inputs) + self.assertIn("logits", ov_outputs) + self.assertIsInstance(ov_outputs.logits, torch.Tensor) + # Compare tensor outputs + self.assertTrue(torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-4)) del transformers_model del ov_model gc.collect() @@ -1209,7 +1208,7 @@ def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] model = OVModelForImageClassification.from_pretrained(model_id, device=OPENVINO_DEVICE) model.eval() - preprocessor = AutoFeatureExtractor.from_pretrained(model_id) + preprocessor = AutoProcessor.from_pretrained(model_id) pipe = pipeline("image-classification", model=model, feature_extractor=preprocessor) inputs = TEST_IMAGE_URL outputs = pipe(inputs) From 30f628592391ba40c843c5077afbbc842eb586e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 30 Jan 2026 18:02:27 +0100 Subject: [PATCH 036/222] remove afmoe from models to be tested list --- optimum/exporters/openvino/model_configs.py | 2 +- tests/openvino/test_decoder.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index a8b293ca84..594d876812 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -4646,7 +4646,7 @@ class ASTOpenVINOConfig(ASTOnnxConfig): ) class AfmoeOpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.55.0" - MAX_TRANSFORMERS_VERSION = "4.57.99" + MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = AfmoeModelPatcher diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 33d8383876..19eb7dfb99 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -21,7 +21,12 @@ patch_awq_for_inference, ) -from optimum.exporters.openvino.model_configs import BitnetOpenVINOConfig, 
DeepseekOpenVINOConfig, LFM2OpenVINOConfig +from optimum.exporters.openvino.model_configs import ( + AfmoeOpenVINOConfig, + BitnetOpenVINOConfig, + DeepseekOpenVINOConfig, + LFM2OpenVINOConfig, +) from optimum.exporters.openvino.model_patcher import patch_update_causal_mask from optimum.exporters.openvino.utils import ONNX_SUPPORTED_ARCHITECTURES from optimum.exporters.tasks import TasksManager @@ -274,11 +279,13 @@ def test_find_untested_architectures(self): if "llama4_text" in supported_architectures: supported_architectures.remove("llama4_text") - if is_transformers_version(">=", str(DeepseekOpenVINOConfig.MAX_TRANSFORMERS_VERSION)): + if is_transformers_version(">", str(DeepseekOpenVINOConfig.MAX_TRANSFORMERS_VERSION)): if "deepseek_v2" in supported_architectures: supported_architectures.remove("deepseek_v2") if "deepseek_v3" in supported_architectures: supported_architectures.remove("deepseek_v3") + if is_transformers_version(">", str(AfmoeOpenVINOConfig.MAX_TRANSFORMERS_VERSION)): + supported_architectures -= {"afmoe"} if is_transformers_version("<", str(BitnetOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): supported_architectures -= {"bitnet"} if is_transformers_version("<", str(LFM2OpenVINOConfig.MIN_TRANSFORMERS_VERSION)): From 6cd7b1c263ef119008f21382be8f0a6dd32a5a29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 30 Jan 2026 18:20:04 +0100 Subject: [PATCH 037/222] fix pipeline saving tests --- tests/openvino/test_modeling.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 777b276859..d9a61e8b44 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -708,6 +708,8 @@ def test_load_model_from_hub(self): # verify could load both pytorch and openvino model (export argument should automatically infered) ov_exported_pipe = optimum_pipeline("text-generation", model_id, revision="pt", accelerator="openvino") + ov_exported_pipe.modelcard = None 
+ ov_pipe = optimum_pipeline("text-generation", model_id, revision="ov", accelerator="openvino") self.assertIsInstance(ov_exported_pipe.model, OVBaseModel) self.assertIsInstance(ov_pipe.model, OVBaseModel) From 85a0418e0a64b6e9de3e802912d55eaca9c7a056 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 2 Feb 2026 15:46:04 +0100 Subject: [PATCH 038/222] fix seq2seq pipeline tests loading --- tests/openvino/test_modeling.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index d9a61e8b44..785c4e2782 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -727,20 +727,21 @@ def test_load_model_from_hub(self): gc.collect() def test_seq2seq_load_from_hub(self): - model_id = "echarlaix/tiny-random-t5" + model_id = MODEL_NAMES["whisper"] + task = "automatic-speech-recognition" # verify could load both pytorch and openvino model (export argument should automatically infered) - ov_exported_pipe = optimum_pipeline("text2text-generation", model_id, accelerator="openvino") - ov_pipe = optimum_pipeline("text2text-generation", model_id, revision="ov", accelerator="openvino") + ov_exported_pipe = optimum_pipeline(task, model_id, accelerator="openvino") + ov_exported_pipe.modelcard = None + ov_pipe = optimum_pipeline(task, model_id, revision="ov", accelerator="openvino") self.assertIsInstance(ov_exported_pipe.model, OVBaseModel) self.assertIsInstance(ov_pipe.model, OVBaseModel) with TemporaryDirectory() as tmpdirname: ov_exported_pipe.save_pretrained(tmpdirname) folder_contents = os.listdir(tmpdirname) - if not ov_exported_pipe.model.decoder.stateful: - self.assertTrue(OV_DECODER_WITH_PAST_NAME in folder_contents) - self.assertTrue(OV_DECODER_WITH_PAST_NAME.replace(".xml", ".bin") in folder_contents) - ov_exported_pipe = optimum_pipeline("text2text-generation", tmpdirname, accelerator="openvino") + 
self.assertTrue(ov_exported_pipe.model._ov_model_paths["encoder"] in folder_contents) + self.assertTrue(ov_exported_pipe.model._ov_model_paths["decoder"] in folder_contents) + ov_exported_pipe = optimum_pipeline(task, tmpdirname, accelerator="openvino") self.assertIsInstance(ov_exported_pipe.model, OVBaseModel) del ov_exported_pipe From 08d148014e292ca9118cb1fdbf502369f3f44d46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 2 Feb 2026 16:06:52 +0100 Subject: [PATCH 039/222] disable pipelines tests when transformers >= v5 since summarization/translation/text2text-generation pipelines are deprecated --- tests/openvino/test_seq2seq.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 83a4b7c54f..daf81cf747 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -224,6 +224,10 @@ def test_compare_to_transformers(self, model_arch): @parameterized.expand(SUPPORTED_ARCHITECTURES) @pytest.mark.run_slow @slow + @pytest.mark.skipif( + is_transformers_version(">=", "5"), + reason="requires transformers < v5 since summarization/translation/text2text-generation pipelines are deprecated", + ) def test_pipeline(self, model_arch): set_seed(SEED) model_id = MODEL_NAMES[model_arch] From 7bc714cad9b526e8634b562a349a7ecdbc54abdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 2 Feb 2026 18:52:48 +0100 Subject: [PATCH 040/222] fix MixtralModelPatcher --- optimum/exporters/openvino/model_patcher.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index a9d67eba7d..af2a2f546b 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -396,18 +396,19 @@ def _mixtral_sparse_moe_block_forward(self, hidden_states: torch.Tensor) -> torc class 
MixtralModelPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() - - for layer in self._model.model.layers: - layer.block_sparse_moe._unpatched_forward = layer.block_sparse_moe.forward - layer.block_sparse_moe.forward = types.MethodType( - _mixtral_sparse_moe_block_forward, layer.block_sparse_moe - ) + if is_transformers_version("<", "5"): + for layer in self._model.model.layers: + layer.block_sparse_moe._unpatched_forward = layer.block_sparse_moe.forward + layer.block_sparse_moe.forward = types.MethodType( + _mixtral_sparse_moe_block_forward, layer.block_sparse_moe + ) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - for layer in self._model.model.layers: - layer.block_sparse_moe.forward = layer.block_sparse_moe._unpatched_forward + if is_transformers_version("<", "5"): + for layer in self._model.model.layers: + layer.block_sparse_moe.forward = layer.block_sparse_moe._unpatched_forward class ArcticModelPatcher(MixtralModelPatcher): From 8b374c7e067325c2ff7b3f0774aff33010f7d1a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 2 Feb 2026 19:03:49 +0100 Subject: [PATCH 041/222] fix moe patching --- optimum/exporters/openvino/model_patcher.py | 81 +++++++++++++-------- 1 file changed, 51 insertions(+), 30 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index af2a2f546b..18783eb770 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -402,6 +402,8 @@ def __enter__(self): layer.block_sparse_moe.forward = types.MethodType( _mixtral_sparse_moe_block_forward, layer.block_sparse_moe ) + else: + self._model.config._experts_implementation = "batched_mm" def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -1709,16 +1711,22 @@ def _phi_moe_sparse_moe_block_forward(self, hidden_states: torch.Tensor) -> torc class 
PhiMoEModelPatcher(Phi3ModelPatcher): def __enter__(self): super().__enter__() - for layer in self._model.model.layers: - layer.block_sparse_moe._orig_forward = layer.block_sparse_moe.forward - layer.block_sparse_moe.forward = types.MethodType( - _phi_moe_sparse_moe_block_forward, layer.block_sparse_moe - ) + + if is_transformers_version("<", "5"): + for layer in self._model.model.layers: + layer.block_sparse_moe._orig_forward = layer.block_sparse_moe.forward + layer.block_sparse_moe.forward = types.MethodType( + _phi_moe_sparse_moe_block_forward, layer.block_sparse_moe + ) + else: + self._model.config._experts_implementation = "batched_mm" def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - for layer in self._model.model.layers: - layer.block_sparse_moe.forward = layer.block_sparse_moe._orig_forward + + if is_transformers_version("<", "5"): + for layer in self._model.model.layers: + layer.block_sparse_moe.forward = layer.block_sparse_moe._orig_forward def _aquila_self_attn_sdpa_forward( @@ -4443,28 +4451,35 @@ def _granite_moe_parallel_experts_forward(self, inputs, expert_size): class GraniteMoEModelPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() - for layer in self._model.model.layers: - block_sparse_moe = layer.block_sparse_moe - block_sparse_moe.router._orig_forward = block_sparse_moe.router.forward - block_sparse_moe.router.forward = types.MethodType( - _granite_moe_topk_gating_forward, block_sparse_moe.router - ) - block_sparse_moe.input_linear._orig_forward = block_sparse_moe.input_linear.forward - block_sparse_moe.input_linear.forward = types.MethodType( - _granite_moe_parallel_experts_forward, block_sparse_moe.input_linear - ) - block_sparse_moe.output_linear._orig_forward = block_sparse_moe.output_linear.forward - block_sparse_moe.output_linear.forward = types.MethodType( - _granite_moe_parallel_experts_forward, block_sparse_moe.output_linear - ) + + if 
is_transformers_version("<", "5"): + for layer in self._model.model.layers: + block_sparse_moe = layer.block_sparse_moe + block_sparse_moe.router._orig_forward = block_sparse_moe.router.forward + block_sparse_moe.router.forward = types.MethodType( + _granite_moe_topk_gating_forward, block_sparse_moe.router + ) + block_sparse_moe.input_linear._orig_forward = block_sparse_moe.input_linear.forward + block_sparse_moe.input_linear.forward = types.MethodType( + _granite_moe_parallel_experts_forward, block_sparse_moe.input_linear + ) + block_sparse_moe.output_linear._orig_forward = block_sparse_moe.output_linear.forward + block_sparse_moe.output_linear.forward = types.MethodType( + _granite_moe_parallel_experts_forward, block_sparse_moe.output_linear + ) + + else: + self._model.config._experts_implementation = "batched_mm" def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - for layer in self._model.model.layers: - block_sparse_moe = layer.block_sparse_moe - block_sparse_moe.router.forward = block_sparse_moe.router._orig_forward - block_sparse_moe.input_linear.forward = block_sparse_moe.input_linear._orig_forward - block_sparse_moe.output_linear.forward = block_sparse_moe.output_linear._orig_forward + + if is_transformers_version("<", "5"): + for layer in self._model.model.layers: + block_sparse_moe = layer.block_sparse_moe + block_sparse_moe.router.forward = block_sparse_moe.router._orig_forward + block_sparse_moe.input_linear.forward = block_sparse_moe.input_linear._orig_forward + block_sparse_moe.output_linear.forward = block_sparse_moe.output_linear._orig_forward class OVSeq2SeqModelPatcher(OVModelPatcher): @@ -5270,14 +5285,18 @@ def _qwen2moe_sparse_block_forward(self, hidden_states: torch.Tensor) -> torch.T class Qwen2MoEPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() - if is_transformers_version(">=", "4.52.0"): + + if is_transformers_version(">=", "4.52.0") and is_transformers_version("<", 
"5"): from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock modulewise_patch(self._model, Qwen2MoeSparseMoeBlock, _qwen2moe_sparse_block_forward) + if is_transformers_version(">=", "5"): + self._model.config._experts_implementation = "batched_mm" + def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - if is_transformers_version(">=", "4.52.0"): + if is_transformers_version(">=", "4.52.0") and is_transformers_version("<", "5"): from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock modulewise_unpatch(self._model, Qwen2MoeSparseMoeBlock) @@ -6626,14 +6645,16 @@ class Qwen3MoeModelPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() - if is_transformers_version(">=", "4.53"): + if is_transformers_version(">=", "4.53") and is_transformers_version("<", "5"): self.original_moe_forward = Qwen3MoeSparseMoeBlock.forward Qwen3MoeSparseMoeBlock.forward = qwen3_moe_forward_patched + if is_transformers_version(">=", "5"): + self._model.config._experts_implementation = "batched_mm" def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - if is_transformers_version(">=", "4.53"): + if is_transformers_version(">=", "4.53") and is_transformers_version("<", "5"): Qwen3MoeSparseMoeBlock.forward = self.original_moe_forward From a4cfc55f57bbe575576db4db6b0a0edbef72b452 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 3 Feb 2026 15:17:13 +0100 Subject: [PATCH 042/222] gptj fix --- optimum/exporters/openvino/model_patcher.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 18783eb770..68b306e318 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -2793,15 +2793,19 @@ def gptj_attn_forward( if output_attentions: self._attn = 
self._orig_attn + kwargs = {} + if is_transformers_version("<", "5"): + kwrags["head_mask"] = head_mask + return self._orig_forward( hidden_states, layer_past, attention_mask, position_ids, - head_mask, use_cache=use_cache, output_attentions=output_attentions, cache_position=cache_position, + **kwargs ) From 5bab4588af14819368c7b5bbe555abad18df20b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 3 Feb 2026 15:40:58 +0100 Subject: [PATCH 043/222] fix granitemoehybrid patcher --- optimum/exporters/openvino/model_patcher.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 68b306e318..08710d6601 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -2805,7 +2805,7 @@ def gptj_attn_forward( use_cache=use_cache, output_attentions=output_attentions, cache_position=cache_position, - **kwargs + **kwargs, ) @@ -7522,10 +7522,12 @@ def patch_sparse_moe(sparse_moe_layer): super().__enter__() setattr(self._model, self.orig_forward_name, self.patched_forward) - self._model.model._orig_update_causal_mask = self._model.model._update_causal_mask - self._model.model._update_causal_mask = types.MethodType( - granite_moe_hybrid_update_causal_mask, self._model.model - ) + if is_transformers_version("<", "5"): + self._model.model._orig_update_causal_mask = self._model.model._update_causal_mask + self._model.model._update_causal_mask = types.MethodType( + granite_moe_hybrid_update_causal_mask, self._model.model + ) + for idx, layer in enumerate(self._model.model.layers): if hasattr(layer, "block_sparse_moe"): patch_sparse_moe(layer.block_sparse_moe) @@ -7545,7 +7547,9 @@ def unpatch_sparse_moe(sparse_moe_layer): super().__exit__(exc_type, exc_value, traceback) setattr(self._model, self.orig_forward_name, self.model_orig_forward) - self._model.model._update_causal_mask = 
self._model.model._orig_update_causal_mask + if is_transformers_version("<", "5"): + self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask + for idx, layer in enumerate(self._model.model.layers): if hasattr(layer, "block_sparse_moe"): unpatch_sparse_moe(layer.block_sparse_moe) From daf7ec83e2fe8da562575fb7db52de930980002a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 3 Feb 2026 18:20:54 +0100 Subject: [PATCH 044/222] typo --- optimum/exporters/openvino/model_patcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 08710d6601..a496ec7d8e 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -2795,7 +2795,7 @@ def gptj_attn_forward( kwargs = {} if is_transformers_version("<", "5"): - kwrags["head_mask"] = head_mask + kwargs["head_mask"] = head_mask return self._orig_forward( hidden_states, From a45f5ab8f32c7fef54381587fb9e5368b816610e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 3 Feb 2026 19:10:59 +0100 Subject: [PATCH 045/222] add exaone max_transformers_version as incompatible with v5 --- optimum/exporters/openvino/model_configs.py | 2 +- tests/openvino/test_decoder.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 594d876812..0f8afff724 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -701,7 +701,7 @@ class BitnetOpenVINOConfig(LlamaOnnxConfig): library_name="transformers", ) class ExaoneOpenVINOConfig(LlamaOpenVINOConfig): - pass + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 19eb7dfb99..1f7ae31827 100644 --- a/tests/openvino/test_decoder.py 
+++ b/tests/openvino/test_decoder.py @@ -90,7 +90,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "qwen2_moe", "phi3", "gemma2", - "exaone", "granite", "granitemoe", ) @@ -152,7 +151,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen", "chatglm", "chatglm4") if is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ("codegen2",) + SUPPORTED_ARCHITECTURES += ("codegen2", "exaone") GENERATION_LENGTH = 100 From 342dc59c3742230d4661c351d8dba272382040cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 4 Feb 2026 14:35:09 +0100 Subject: [PATCH 046/222] add decilm max_transformers_version as incompatible with v5 --- optimum/exporters/openvino/model_configs.py | 1 + tests/openvino/test_decoder.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 0f8afff724..fb6be5eb52 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1351,6 +1351,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int @register_in_tasks_manager("deci", *["text-generation", "text-generation-with-past"], library_name="transformers") class DeciOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DeciDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = DeciDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 1f7ae31827..2d7652e3bf 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -79,7 +79,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "xverse", "internlm", "jais", - "decilm", "gemma", "olmo", "stablelm", @@ -151,7 +150,7 @@ class 
OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen", "chatglm", "chatglm4") if is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ("codegen2", "exaone") + SUPPORTED_ARCHITECTURES += ("codegen2", "exaone", "decilm") GENERATION_LENGTH = 100 From 2a28fe7211bc39190303da7cfea980007b992e89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 4 Feb 2026 16:16:59 +0100 Subject: [PATCH 047/222] fix llama4 patcher --- optimum/exporters/openvino/model_patcher.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index a496ec7d8e..8aa6b94a18 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -19,7 +19,7 @@ import math import types from collections.abc import Iterable -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union import torch import torch.nn.functional as F @@ -6289,6 +6289,15 @@ def llama4_moe_forward(self, hidden_states): return out, router_scores +# Copied from https://github.com/huggingface/transformers/blob/v4.56.0/src/transformers/masking_utils.py#L105 +# transformers.masking_utils._legacy_chunked_overlay deprecated since transformers v5 +def _legacy_chunked_overlay(chunk_size: int) -> Callable: + def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool: + return kv_idx // chunk_size == q_idx // chunk_size + + return inner_mask + + class Llama4TextModelPatcher(OVModelPatcher): def __enter__(self): super().__enter__() @@ -6305,8 +6314,8 @@ def __enter__(self): if is_transformers_version(">=", "4.56"): # openvino is not able to trace through the new chunked_overlay with left_padding self.original_chunked_overlay = transformers.masking_utils.chunked_overlay - 
transformers.masking_utils.chunked_overlay = ( - lambda chunk_size, left_padding: transformers.masking_utils._legacy_chunked_overlay(chunk_size) + transformers.masking_utils.chunked_overlay = lambda chunk_size, left_padding: _legacy_chunked_overlay( + chunk_size ) def __exit__(self, exc_type, exc_value, traceback): From b9a3cbe90f1ecd802f008f3cfc2cb75d1934fc25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 4 Feb 2026 16:54:48 +0100 Subject: [PATCH 048/222] make OV DynamicCache backward compatible --- optimum/exporters/openvino/model_patcher.py | 52 +++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 8aa6b94a18..8d1c7a93e9 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -56,6 +56,28 @@ class OVDynamicCache(DynamicCache): + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L881 + def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]: + """ + Support for backwards-compatible `past_key_values` indexing, e.g. `past_key_values[0][0].shape[2]` to get the + sequence length. + """ + if layer_idx < len(self.layers): + return self.layers[layer_idx].keys, self.layers[layer_idx].values + else: + raise KeyError( + f"Cache only has {len(self.layers)} layers, attempted to access layer with index {layer_idx}" + ) + + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L893 + def __iter__(self): + """ + Support for backwards-compatible `past_key_values` iteration, e.g. 
`for x in past_key_values:` to iterate over + keys and values + """ + for layer_idx in range(len(self)): + yield (self.layers[layer_idx].keys, self.layers[layer_idx].values) + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1005 def to_legacy_cache(self) -> tuple[tuple[torch.Tensor, torch.Tensor]]: """ @@ -85,6 +107,36 @@ def from_legacy_cache(cls, past_key_values: tuple[tuple[torch.Tensor, torch.Tens class OVEncoderDecoderCache(EncoderDecoderCache): + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1244 + def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Support for backwards-compatible `past_key_values` indexing, e.g. `past_key_values[0][0].shape[2]` to get the + sequence length. + """ + if layer_idx < len(self): + return ( + self.self_attention_cache.layers[layer_idx].keys, + self.self_attention_cache.layers[layer_idx].values, + self.cross_attention_cache.layers[layer_idx].keys, + self.cross_attention_cache.layers[layer_idx].values, + ) + else: + raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}") + + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1231 + def __iter__(self): + """ + Support for backwards-compatible `past_key_values` iteration, e.g. 
`for x in past_key_values:` to iterate over + keys and values + """ + for layer_idx in range(len(self)): + yield ( + self.self_attention_cache.layers[layer_idx].keys, + self.self_attention_cache.layers[layer_idx].values, + self.cross_attention_cache.layers[layer_idx].keys, + self.cross_attention_cache.layers[layer_idx].values, + ) + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1266 def to_legacy_cache(self) -> tuple[tuple[torch.Tensor]]: """Converts the `OVEncoderDecoderCache` instance into its equivalent in the legacy cache format.""" From 1687e3515df16c6f19cb7c76e089a4d3523f4255 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 4 Feb 2026 17:48:16 +0100 Subject: [PATCH 049/222] remove incompatible models aquila2 orion internlm2 --- optimum/exporters/openvino/model_configs.py | 3 ++- tests/openvino/test_decoder.py | 5 +---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index fb6be5eb52..5e2e643ef4 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -858,6 +858,7 @@ class Starcoder2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): @register_in_tasks_manager("internlm2", *["text-generation", "text-generation-with-past"], library_name="transformers") class InternLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig @@ -867,7 +868,7 @@ class InternLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): @register_in_tasks_manager("orion", *["text-generation", "text-generation-with-past"], library_name="transformers") class 
OrionOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 - + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 2d7652e3bf..f40e402ecb 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -66,8 +66,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "opt", "pegasus", "phi", - "internlm2", - "orion", "falcon", "falcon-40b", "persimmon", @@ -75,7 +73,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "gpt_neox_japanese", "xglm", "aquila", - "aquila2", "xverse", "internlm", "jais", @@ -150,7 +147,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen", "chatglm", "chatglm4") if is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ("codegen2", "exaone", "decilm") + SUPPORTED_ARCHITECTURES += ("codegen2", "exaone", "decilm", "internlm2", "orion", "aquila2") GENERATION_LENGTH = 100 From 961c1d3f3aa91193ff7ed09ef88c22c6c8a23514 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 4 Feb 2026 18:34:26 +0100 Subject: [PATCH 050/222] add jais max_transformers_version as incompatible with v5 --- optimum/exporters/openvino/model_configs.py | 1 + tests/openvino/test_decoder.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 5e2e643ef4..2e060684e7 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1184,6 +1184,7 @@ class DBRXOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): ) class JaisOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + 
MAX_TRANSFORMERS_VERSION = "4.57.6" NORMALIZED_CONFIG_CLASS = NormalizedTextConfig DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = DummyPastKeyValuesGenerator diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index f40e402ecb..26e8010f76 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -75,7 +75,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "aquila", "xverse", "internlm", - "jais", "gemma", "olmo", "stablelm", @@ -147,7 +146,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen", "chatglm", "chatglm4") if is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ("codegen2", "exaone", "decilm", "internlm2", "orion", "aquila2") + SUPPORTED_ARCHITECTURES += ("codegen2", "exaone", "decilm", "internlm2", "orion", "aquila2", "jais") GENERATION_LENGTH = 100 From e8e6c18a284e8cf036ca12a420e458724a42d7ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 5 Feb 2026 10:29:30 +0100 Subject: [PATCH 051/222] dbrx --- optimum/exporters/openvino/model_configs.py | 1 + tests/openvino/test_decoder.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 2e060684e7..3113fdf136 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1165,6 +1165,7 @@ class CodeGenOpenVINOConfig(CodeGenOnnxConfig): ) class DBRXOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( num_attention_heads="n_heads", hidden_size="d_model", diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 26e8010f76..5b8916c5cb 100644 --- a/tests/openvino/test_decoder.py +++ 
b/tests/openvino/test_decoder.py @@ -79,7 +79,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "olmo", "stablelm", "starcoder2", - "dbrx", "cohere", "qwen2", "qwen2_moe", @@ -146,7 +145,8 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen", "chatglm", "chatglm4") if is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ("codegen2", "exaone", "decilm", "internlm2", "orion", "aquila2", "jais") + # TODO: add dbrx back once fixed in transformers + SUPPORTED_ARCHITECTURES += ("codegen2", "exaone", "decilm", "internlm2", "orion", "aquila2", "jais", "dbrx") GENERATION_LENGTH = 100 From c6640d6364257db8da6207999ed00384a9e16358 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 5 Feb 2026 19:18:38 +0100 Subject: [PATCH 052/222] set float32 dtype --- tests/openvino/test_decoder.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 5b8916c5cb..76901f5db4 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -360,7 +360,17 @@ def test_compare_to_transformers(self, model_arch): set_seed(SEED) with mock_torch_cuda_is_available("awq" in model_arch or "gptq" in model_arch): transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) - if model_arch in ["qwen", "arctic", "chatglm4", "gpt_oss_mxfp4"]: + if model_arch in [ + "qwen", + "arctic", + "chatglm4", + "gpt_oss_mxfp4", + "llama", + "lfm2", + "gemma3_text", + "llama4", + "exaone4", + ]: transformers_model.to(torch.float32) with torch.no_grad(): @@ -808,7 +818,7 @@ def test_beam_search(self, model_arch): def test_load_with_different_dtype(self): set_seed(SEED) - model_id = MODEL_NAMES["llama"] + model_id = MODEL_NAMES["mistral"] pt_model = AutoModelForCausalLM.from_pretrained( model_id, ) From 1e0c06ffa9a3bb82009399f98dab0efd70219287 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=E2=80=9CElla?= Date: Thu, 5 Feb 2026 19:30:18 +0100 Subject: [PATCH 053/222] baichuan2 not compatible with v5 --- tests/openvino/test_decoder.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 76901f5db4..86105e8112 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -45,7 +45,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ( "bart", - "baichuan2", "baichuan2-13b", "gpt_bigcode", "bigbird_pegasus", @@ -146,7 +145,17 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version("<", "5"): # TODO: add dbrx back once fixed in transformers - SUPPORTED_ARCHITECTURES += ("codegen2", "exaone", "decilm", "internlm2", "orion", "aquila2", "jais", "dbrx") + SUPPORTED_ARCHITECTURES += ( + "codegen2", + "exaone", + "decilm", + "internlm2", + "orion", + "aquila2", + "jais", + "dbrx", + "baichuan2", + ) GENERATION_LENGTH = 100 From b4910fc0a30bbaf391cae1057dccb60f4f8d5225 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 5 Feb 2026 22:40:38 +0100 Subject: [PATCH 054/222] gpt oss set experts_implementation batched mm --- optimum/exporters/openvino/model_patcher.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 8d1c7a93e9..7490c44d55 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -7385,16 +7385,19 @@ class GptOssModelPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() - if is_transformers_version(">=", "4.55.0"): + if is_transformers_version(">=", "4.55.0") and is_transformers_version("<", "5"): from transformers.models.gpt_oss.modeling_gpt_oss import GptOssExperts self.original_gpt_oss_forward = GptOssExperts.forward GptOssExperts.forward = 
gpt_oss_forward + if is_transformers_version(">=", "5"): + self._model.config._experts_implementation = "batched_mm" + def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - if is_transformers_version(">=", "4.55.0"): + if is_transformers_version(">=", "4.55.0") and is_transformers_version("<", "5"): from transformers.models.gpt_oss.modeling_gpt_oss import GptOssExperts GptOssExperts.forward = self.original_gpt_oss_forward From e19da565ff0cb4d85921c99c606b3a67df7af259 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 5 Feb 2026 23:15:09 +0100 Subject: [PATCH 055/222] bitnet --- optimum/exporters/openvino/model_configs.py | 1 + tests/openvino/test_decoder.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 3113fdf136..e25c154f4a 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -686,6 +686,7 @@ class GptOssOpenVINOConfig(LlamaOpenVINOConfig): ) class BitnetOpenVINOConfig(LlamaOnnxConfig): MIN_TRANSFORMERS_VERSION = "4.52.1" + MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = OVDecoderModelPatcher diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 86105e8112..6efa3629fe 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -125,7 +125,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.53.0"): SUPPORTED_ARCHITECTURES += ("arcee",) - if is_transformers_version(">=", "4.52.1"): + if is_transformers_version(">=", "4.52.1") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("bitnet",) if is_transformers_version(">=", "4.54.0"): From fde5ac98af9612f51bd015efe6f26c4ca693c268 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 6 Feb 2026 12:23:38 +0100 Subject: [PATCH 056/222] qwenvl --- 
optimum/exporters/openvino/model_configs.py | 26 ++++++++++----------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index e25c154f4a..4d5a0d4f48 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -317,11 +317,11 @@ def init_model_configs(): register_in_tasks_manager = TasksManager.create_register("openvino", overwrite_existing=True) -def _get_language_model(model): +def _get_subcomponent_model(model, name): if is_transformers_version("<", "5"): - return model.language_model + return getattr(model, name) - return model.model.language_model + return getattr(model.model, name) @register_in_tasks_manager("baichuan", *["text-generation", "text-generation-with-past"], library_name="transformers") @@ -1714,14 +1714,14 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.LANGUAGE: - return _get_language_model(model) if not hasattr(model, "lm_head") else model + return _get_subcomponent_model(model, "language_model") if not hasattr(model, "lm_head") else model if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return model if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: text_embedding = model.get_input_embeddings() - text_embedding.config = _get_language_model(model).config + text_embedding.config = _get_subcomponent_model(model, "language_model").config return text_embedding def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): @@ -1904,8 +1904,8 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: - text_embedding = _get_language_model(model).get_input_embeddings() - text_embedding.config = _get_language_model(model).config + 
text_embedding = _get_subcomponent_model(model, "language_model").get_input_embeddings() + text_embedding.config = _get_subcomponent_model(model, "language_model").config return text_embedding return super().get_model_for_behavior(model, behavior) @@ -1981,14 +1981,14 @@ def get_model_for_behavior(model, behavior: Union[str, VLMConfigBehavior]): behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.LANGUAGE: - return _get_language_model(model) + return _get_subcomponent_model(model, "language_model") if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return model if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: - text_embedding = _get_language_model(model).get_input_embeddings() - text_embedding.config = _get_language_model(model).config + text_embedding = _get_subcomponent_model(model, "language_model").get_input_embeddings() + text_embedding.config = _get_subcomponent_model(model, "language_model").config return text_embedding def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): @@ -3478,12 +3478,12 @@ def get_model_for_behavior(model, behavior: Union[str, Qwen2VLConfigBehavior]): return model if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS: - vision_embeddings = model.visual.patch_embed + vision_embeddings = _get_subcomponent_model(model, "visual").patch_embed vision_embeddings.config = model.config.vision_config return vision_embeddings if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER: - vision_emb_merger = model.visual + vision_emb_merger = _get_subcomponent_model(model, "visual") vision_emb_merger.config = model.config.vision_config return vision_emb_merger @@ -3491,7 +3491,7 @@ def get_model_for_behavior(model, behavior: Union[str, Qwen2VLConfigBehavior]): text_embedding = ( model.model.embed_tokens if hasattr(model.model, "embed_tokens") - else _get_language_model(model).embed_tokens + else _get_subcomponent_model(model, "language_model").embed_tokens ) 
text_embedding.config = model.config return text_embedding From 0d3b656a2c110b2a84f440d428dff13e86461b8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 6 Feb 2026 12:43:12 +0100 Subject: [PATCH 057/222] maira2 remote code --- optimum/exporters/openvino/model_configs.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 4d5a0d4f48..b20e80ae27 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1904,10 +1904,13 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: - text_embedding = _get_subcomponent_model(model, "language_model").get_input_embeddings() - text_embedding.config = _get_subcomponent_model(model, "language_model").config + text_embedding = model.language_model.get_input_embeddings() + text_embedding.config = model.language_model.config return text_embedding + if behavior == VLMConfigBehavior.LANGUAGE: + return model.language_model + return super().get_model_for_behavior(model, behavior) From b8797e32dcb0dd49fd42a5cb403df051f4f7d6e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 6 Feb 2026 15:00:31 +0100 Subject: [PATCH 058/222] gemma3 and got_ocr2 --- optimum/exporters/openvino/model_patcher.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 7490c44d55..f780ce0cd9 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4684,7 +4684,11 @@ def __init__( model.__orig_forward = model.forward # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/got_ocr2/modeling_got_ocr2.py#L835 # Adopted from 
https://github.com/huggingface/transformers/blob/v4.49.0-Gemma-3/src/transformers/models/gemma3/modeling_gemma3.py#L1321 - if hasattr(model, "model") and hasattr(model.model, "get_image_features"): + if ( + hasattr(model, "model") + and hasattr(model.model, "get_image_features") + and is_transformers_version("<", "5") + ): model.forward = model.model.get_image_features else: model.forward = model.get_image_features From 9dfb66617c19508dade18184d84bd20a6f5d9cf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 6 Feb 2026 16:36:14 +0100 Subject: [PATCH 059/222] llava next --- optimum/exporters/openvino/model_configs.py | 15 ++++++----- optimum/exporters/openvino/model_patcher.py | 29 +++++++++++++++++++-- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index b20e80ae27..a2a54fb152 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -166,6 +166,7 @@ Llama4ImageEmbeddingsModelPatcher, Llama4TextModelPatcher, LlavaImageEmbeddingModelPatcher, + LlavaNextImageEmbeddingModelPatcher, LlavaNextVideoImageEmbeddingModelPatcher, LlavaQwen2ImageEmbeddingsModelPatcher, MairaImageEmbeddingModelPatcher, @@ -199,6 +200,7 @@ SanaTextEncoderModelPatcher, XverseModelPatcher, Zamba2ModelPatcher, + _get_subcomponent_model, ) @@ -317,13 +319,6 @@ def init_model_configs(): register_in_tasks_manager = TasksManager.create_register("openvino", overwrite_existing=True) -def _get_subcomponent_model(model, name): - if is_transformers_version("<", "5"): - return getattr(model, name) - - return getattr(model.model, name) - - @register_in_tasks_manager("baichuan", *["text-generation", "text-generation-with-past"], library_name="transformers") class BaichaunOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 13 @@ -1773,6 +1768,12 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs) 
-> Dict: class LlavaNextOpenVINOConfig(LlavaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.40.0" + def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): + model_kwargs = model_kwargs or {} + if self._behavior != VLMConfigBehavior.VISION_EMBEDDINGS: + return super().patch_model_for_export(model, model_kwargs) + return LlavaNextImageEmbeddingModelPatcher(self, model, model_kwargs) + class DummyLLavaMultiModalProjectorInputGenerator(DummyInputGenerator): SUPPORTED_INPUT_NAMES = ["image_features"] diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index f780ce0cd9..5b93fc5347 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -55,6 +55,13 @@ logger = logging.getLogger(__name__) +def _get_subcomponent_model(model, name): + if is_transformers_version("<", "5"): + return getattr(model, name) + + return getattr(model.model, name) + + class OVDynamicCache(DynamicCache): # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L881 def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]: @@ -3365,7 +3372,7 @@ def llava_vision_embed_forward(self, pixel_values): # copied from https://github.com/huggingface/transformers/blob/v4.44.2/src/transformers/models/llava/modeling_llava.py#L428-L441 # these changes does not bring any difference from original, it only packs model subcomponent inference together # that allow us avoid memory overheads and their inference results handling on code-level - image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) + image_outputs = _get_subcomponent_model(self, "vision_tower")(pixel_values, output_hidden_states=True) # this is not memory efficient at all (output_hidden_states=True) will save all the hidden stated. 
selected_image_feature = image_outputs.hidden_states[self.config.vision_feature_layer] @@ -3376,7 +3383,7 @@ def llava_vision_embed_forward(self, pixel_values): else: raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") - image_features = self.multi_modal_projector(selected_image_feature) + image_features = _get_subcomponent_model(self, "multi_modal_projector")(selected_image_feature) return image_features @@ -3429,6 +3436,24 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward +class LlavaNextImageEmbeddingModelPatcher(OVModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: "PreTrainedModel", + model_kwargs: Dict[str, Any], + ): + model.__orig_forward = model.forward + # TODO: use get_image_features instead and add image_sizes as input when exorting + # https://github.com/huggingface/transformers/blob/v4.48.0/src/transformers/models/llava_next/modeling_llava_next.py#L716 + model.forward = types.MethodType(llava_vision_embed_forward, model) + super().__init__(config, model, model_kwargs) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.forward = self._model.__orig_forward + + class MairaImageEmbeddingModelPatcher(OVModelPatcher): def __init__( self, From 3386c647fb1d5ad051c866e0b071ce77b4e9760a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 6 Feb 2026 16:51:13 +0100 Subject: [PATCH 060/222] llava next video --- optimum/exporters/openvino/model_patcher.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 5b93fc5347..b3412a0a41 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3391,7 +3391,7 @@ def llava_next_video_vision_embed_forward(self, pixel_values): # copied from 
https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/llava_next_video/modeling_llava_next_video.py#L519 # these changes does not bring any difference from original, it only packs model subcomponent inference together # that allow us avoid memory overheads and their inference results handling on code-level - image_features = self.vision_tower(pixel_values, output_hidden_states=True) + image_features = _get_subcomponent_model(self, "vision_tower")(pixel_values, output_hidden_states=True) vision_feature_layer = self.config.vision_feature_layer if isinstance(vision_feature_layer, int): selected_image_feature = image_features.hidden_states[vision_feature_layer] @@ -3444,7 +3444,7 @@ def __init__( model_kwargs: Dict[str, Any], ): model.__orig_forward = model.forward - # TODO: use get_image_features instead and add image_sizes as input when exorting + # TODO: use get_image_features instead and add image_sizes as input when exporting # https://github.com/huggingface/transformers/blob/v4.48.0/src/transformers/models/llava_next/modeling_llava_next.py#L716 model.forward = types.MethodType(llava_vision_embed_forward, model) super().__init__(config, model, model_kwargs) @@ -3479,12 +3479,9 @@ def __init__( model_kwargs: Dict[str, Any], ): model.__orig_forward = model.forward - - if is_transformers_version("<", "5"): - model.forward = types.MethodType(llava_next_video_vision_embed_forward, model) - else: - model.forward = model.get_image_features - + # TODO: use get_image_features instead and add image_sizes as input when exporting + # https://github.com/huggingface/transformers/blob/v4.48.0/src/transformers/models/llava_next_video/modeling_llava_next_video.py#L746 + model.forward = types.MethodType(llava_next_video_vision_embed_forward, model) super().__init__(config, model, model_kwargs) def __exit__(self, exc_type, exc_value, traceback): From bc4a84d163dd4b8d6166272006b9b7c6c105e804 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: 
Tue, 10 Feb 2026 15:12:12 +0100 Subject: [PATCH 061/222] use ONNXCache --- optimum/exporters/openvino/model_patcher.py | 245 ++++---------------- 1 file changed, 42 insertions(+), 203 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 1ab1a386b9..b779170d8e 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -36,6 +36,9 @@ override_arguments, sdpa_mask_without_vmap, ) +from optimum.exporters.onnx.utils import ONNXDynamicCache, ONNXEncoderDecoderCache + + from optimum.intel.utils.import_utils import is_diffusers_version, is_torch_version, is_transformers_version @@ -62,170 +65,6 @@ def _get_subcomponent_model(model, name): return getattr(model.model, name) -class OVDynamicCache(DynamicCache): - # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L881 - def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]: - """ - Support for backwards-compatible `past_key_values` indexing, e.g. `past_key_values[0][0].shape[2]` to get the - sequence length. - """ - if layer_idx < len(self.layers): - return self.layers[layer_idx].keys, self.layers[layer_idx].values - else: - raise KeyError( - f"Cache only has {len(self.layers)} layers, attempted to access layer with index {layer_idx}" - ) - - # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L893 - def __iter__(self): - """ - Support for backwards-compatible `past_key_values` iteration, e.g. 
`for x in past_key_values:` to iterate over - keys and values - """ - for layer_idx in range(len(self)): - yield (self.layers[layer_idx].keys, self.layers[layer_idx].values) - - # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1005 - def to_legacy_cache(self) -> tuple[tuple[torch.Tensor, torch.Tensor]]: - """ - Converts the `Cache` instance into the its equivalent in the legacy cache format. Used for - backward compatibility. - """ - legacy_cache = () - for layer in self.layers: - legacy_cache += ((layer.keys, layer.values),) - return legacy_cache - - # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1015 - @classmethod - def from_legacy_cache(cls, past_key_values: tuple[tuple[torch.Tensor, torch.Tensor]]) -> "OVDynamicCache": - """ - Converts a cache in the legacy cache format into an equivalent `Cache`. Used for - backward compatibility. - """ - cache = cls() - if past_key_values is None: - logger.warning_once("past_key_values should not be None in from_legacy_cache()") - if past_key_values is not None: - for layer_idx in range(len(past_key_values)): - key_states, value_states = past_key_values[layer_idx] - cache.update(key_states, value_states, layer_idx) - return cache - - -class OVEncoderDecoderCache(EncoderDecoderCache): - # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1244 - def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Support for backwards-compatible `past_key_values` indexing, e.g. `past_key_values[0][0].shape[2]` to get the - sequence length. 
- """ - if layer_idx < len(self): - return ( - self.self_attention_cache.layers[layer_idx].keys, - self.self_attention_cache.layers[layer_idx].values, - self.cross_attention_cache.layers[layer_idx].keys, - self.cross_attention_cache.layers[layer_idx].values, - ) - else: - raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}") - - # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1231 - def __iter__(self): - """ - Support for backwards-compatible `past_key_values` iteration, e.g. `for x in past_key_values:` to iterate over - keys and values - """ - for layer_idx in range(len(self)): - yield ( - self.self_attention_cache.layers[layer_idx].keys, - self.self_attention_cache.layers[layer_idx].values, - self.cross_attention_cache.layers[layer_idx].keys, - self.cross_attention_cache.layers[layer_idx].values, - ) - - # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1266 - def to_legacy_cache(self) -> tuple[tuple[torch.Tensor]]: - """Converts the `OVEncoderDecoderCache` instance into its equivalent in the legacy cache format.""" - legacy_cache = () - if len(self.cross_attention_cache) > 0: - for self_attn, cross_attn in zip( - self.self_attention_cache.to_legacy_cache(), self.cross_attention_cache.to_legacy_cache() - ): - legacy_cache += (self_attn + cross_attn,) - else: - legacy_cache = self.self_attention_cache.to_legacy_cache() - return legacy_cache - - # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1279 - @classmethod - def from_legacy_cache( - cls, past_key_values: Optional[Iterable[tuple[torch.FloatTensor, ...]]] - ) -> "OVEncoderDecoderCache": - """Converts a cache in the legacy cache format into an equivalent `OVEncoderDecoderCache`.""" - cache = cls(OVDynamicCache(), OVDynamicCache()) - if past_key_values is None: - logger.warning_once("past_key_values 
should not be None in from_legacy_cache()") - else: - for layer_idx, key_value_states in enumerate(past_key_values): - key_states, value_states = key_value_states[:2] - cache.self_attention_cache.update(key_states, value_states, layer_idx) - if len(key_value_states) > 2: - key_states, value_states = key_value_states[2:] - cache.cross_attention_cache.update(key_states, value_states, layer_idx) - cache.is_updated[layer_idx] = True - return cache - - -def preprocess_past_key_values(past_key_values): - if ( - is_transformers_version(">=", "4.48") - and isinstance(past_key_values, (list, tuple)) - and isinstance(past_key_values[0], (list, tuple)) - ): - if len(past_key_values[0]) == 2: - past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) - elif len(past_key_values[0]) == 4: - past_key_values = OVEncoderDecoderCache.from_legacy_cache(past_key_values) - else: - raise ValueError( - f"past_key_values should have either 2 or 4 elements, but it has {len(past_key_values[0])} elements." 
- ) - - return past_key_values - - -class OVModelPatcher(ModelPatcher): - def __init__( - self, - config: "OnnxConfig", - model: "PreTrainedModel", - model_kwargs: Optional[Dict[str, Any]] = None, - ): - super().__init__(config, model, model_kwargs) - - self.model_patched_forward = self.patched_forward - - @functools.wraps(self.model_patched_forward) - def patched_forward(*args, **kwargs): - signature = inspect.signature(self.model_patched_forward) - args, kwargs = override_arguments(args, kwargs, signature, model_kwargs=self.model_kwargs) - - if "past_key_values" in signature.parameters: - # Most models require past_key_values to be a cache instance instead of a tuple now - pkv_index = list(signature.parameters.keys()).index("past_key_values") - if pkv_index < len(args) and args[pkv_index] is not None: - args[pkv_index] = preprocess_past_key_values(args[pkv_index]) - elif kwargs.get("past_key_values") is not None: - kwargs["past_key_values"] = preprocess_past_key_values(kwargs["past_key_values"]) - - outputs = self.model_patched_forward(*args, **kwargs) - - return outputs - - self.patched_forward = patched_forward - - for idx, spec in enumerate(UNSUPPORTED_OPS_PATCHING_SPEC): if spec.name in { # onnx-exporter-specific fixes @@ -382,7 +221,7 @@ def eager_mask_without_vmap(*args, **kwargs) -> Optional[torch.Tensor]: return mask -class OVDecoderModelPatcher(OVModelPatcher): +class OVDecoderModelPatcher(ModelPatcher): def __enter__(self): super().__enter__() @@ -1513,7 +1352,7 @@ def phi3_442_forward( if use_cache: use_legacy_cache = not isinstance(past_key_values, Cache) if use_legacy_cache: - past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) + past_key_values = ONNXDynamicCache.from_legacy_cache(past_key_values) past_key_values_length = past_key_values.get_usable_length(seq_length) if position_ids is None: @@ -3095,7 +2934,7 @@ def patched_forward(*args, **kwargs): legacy_pkv = args[pkv_argument_index] pkv_in_args = True if legacy_pkv is not None: 
- pkv = OVDynamicCache.from_legacy_cache(legacy_pkv) + pkv = ONNXDynamicCache.from_legacy_cache(legacy_pkv) return_legacy_cache = True if not pkv_in_args: kwargs["past_key_values"] = pkv @@ -3254,7 +3093,7 @@ def __exit__(self, exc_type, exc_value, traceback): layer.self_attn.forward = layer.self_attn._orig_forward -class IBertModelPatcher(OVModelPatcher): +class IBertModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3272,7 +3111,7 @@ def __init__( self._model(torch.ones([1, 1], dtype=torch.long)) -class InternVLChatImageEmbeddingModelPatcher(OVModelPatcher): +class InternVLChatImageEmbeddingModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3415,7 +3254,7 @@ def maira_vision_embed_forward(self, pixel_values): return self.get_image_features(pixel_values, vision_feature_layer, vision_feature_select_strategy) -class LlavaImageEmbeddingModelPatcher(OVModelPatcher): +class LlavaImageEmbeddingModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3436,7 +3275,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class LlavaNextImageEmbeddingModelPatcher(OVModelPatcher): +class LlavaNextImageEmbeddingModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3454,7 +3293,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class MairaImageEmbeddingModelPatcher(OVModelPatcher): +class MairaImageEmbeddingModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3471,7 +3310,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class LlavaNextVideoImageEmbeddingModelPatcher(OVModelPatcher): +class LlavaNextVideoImageEmbeddingModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3513,7 +3352,7 @@ def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor: return emb.unsqueeze(1) -class 
FluxTransfromerModelPatcher(OVModelPatcher): +class FluxTransfromerModelPatcher(ModelPatcher): def __enter__(self): super().__enter__() if is_diffusers_version("<", "0.31.0"): @@ -3688,7 +3527,7 @@ def _minicpmv_siglip_transformer_forward( ) -class MiniCPMVResamplerModelPatcher(OVModelPatcher): +class MiniCPMVResamplerModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3705,7 +3544,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class MiniCPMVImageEmbeddingsModelPatcher(OVModelPatcher): +class MiniCPMVImageEmbeddingsModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3736,7 +3575,7 @@ def __exit__(self, exc_type, exc_value, traceback): layer.self_attn.forward = layer.self_attn._orig_forward -class LlavaQwen2ImageEmbeddingsModelPatcher(OVModelPatcher): +class LlavaQwen2ImageEmbeddingsModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3754,7 +3593,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class InputEmbeddingPatcher(OVModelPatcher): +class InputEmbeddingPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3779,7 +3618,7 @@ def phi3_vision_embeddings_forward(self, pixel_values: torch.FloatTensor): return self.get_img_features(pixel_values) -class Phi3VisionImageEmbeddingsPatcher(OVModelPatcher): +class Phi3VisionImageEmbeddingsPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4249,7 +4088,7 @@ def forward_wrap( input_ids=None, use_cache=True, ): - new_past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) + new_past_key_values = ONNXDynamicCache.from_legacy_cache(past_key_values) result = self.__orig_forward( input_ids=input_ids, attention_mask=attention_mask, @@ -4434,7 +4273,7 @@ def block_forward( block.attn.forward = types.MethodType(sdpa_attn_forward, block.attn) -class Qwen2VLVisionEmbMergerPatcher(OVModelPatcher): +class 
Qwen2VLVisionEmbMergerPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4468,7 +4307,7 @@ def __exit__(self, exc_type, exc_value, traceback): block.attn.forward = block.attn._orig_forward -class Qwen2_5_VLVisionEmbMergerPatcher(OVModelPatcher): +class Qwen2_5_VLVisionEmbMergerPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4648,7 +4487,7 @@ def __exit__(self, exc_type, exc_value, traceback): block_sparse_moe.output_linear.forward = block_sparse_moe.output_linear._orig_forward -class OVSeq2SeqModelPatcher(OVModelPatcher): +class OVSeq2SeqModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4680,11 +4519,11 @@ def patched_forward(*args, **kwargs): pkv = args[pkv_arg_index] if pkv is not None: - if isinstance(pkv, OVEncoderDecoderCache): + if isinstance(pkv, ONNXEncoderDecoderCache): pkv = pkv.self_attention_cache.to_legacy_cache() else: pkv = [pkv_item[:2] for pkv_item in pkv] - pkv = OVEncoderDecoderCache.from_legacy_cache(pkv) + pkv = ONNXEncoderDecoderCache.from_legacy_cache(pkv) if "past_key_values" in kwargs: kwargs["past_key_values"] = pkv @@ -4694,7 +4533,7 @@ def patched_forward(*args, **kwargs): outputs = self.super_patched_forward(*args, **kwargs) # the optimum-onnx seq2seq model patcher only converts to tuple starting from 4.48 - if isinstance(outputs.get("past_key_values"), (OVDynamicCache, OVEncoderDecoderCache)): + if isinstance(outputs.get("past_key_values"), (ONNXDynamicCache, ONNXEncoderDecoderCache)): outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() # we still need to filter out cross attention in the case of non-stateful decoder @@ -4733,7 +4572,7 @@ def __exit__(self, exc_type, exc_value, traceback): ALL_MASK_ATTENTION_FUNCTIONS.register("eager", eager_mask) -class SanaTextEncoderModelPatcher(OVModelPatcher): +class SanaTextEncoderModelPatcher(ModelPatcher): def __enter__(self): super().__enter__() @@ -4784,7 +4623,7 @@ def __init__( super().__init__(config, 
model, model_kwargs) -class CommonImageEmbeddingsModelPatcher(OVModelPatcher): +class CommonImageEmbeddingsModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4873,7 +4712,7 @@ def __init__( def forward( self, attention_mask, position_ids, past_key_values, token_type_ids, inputs_embeds, use_cache=True ): - pkv = OVDynamicCache.from_legacy_cache(past_key_values) + pkv = ONNXDynamicCache.from_legacy_cache(past_key_values) past_seen_tokens = past_key_values[0][0].shape[-2] cache_position = torch.arange( @@ -4938,7 +4777,7 @@ def __exit__(self, exc_type, exc_value, traceback): del self._model.model._orig_update_causual_mask -class Idefics3ImageEmbeddingsModelPatcher(OVModelPatcher): +class Idefics3ImageEmbeddingsModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -5260,7 +5099,7 @@ def _blenderbot_attn_forward_new( query_states = query_states if past_key_value is not None: - if isinstance(past_key_value, OVEncoderDecoderCache): + if isinstance(past_key_value, ONNXEncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) if is_cross_attention: # after the first generated id, we can subsequently re-use all key/value_states from cache @@ -5719,7 +5558,7 @@ def speecht5_decoder_layer_forward( return outputs -class OVSpeechT5ModelPatcher(OVModelPatcher): +class OVSpeechT5ModelPatcher(ModelPatcher): def __enter__(self): if self.real_config._behavior != "vocoder": super().__enter__() @@ -5789,7 +5628,7 @@ def patched_decoder_forward( if past_key_values is not None: past_key_values = [cache_item[:2] for cache_item in past_key_values] if is_transformers_version(">=", "4.56"): - past_key_values = OVEncoderDecoderCache.from_legacy_cache(past_key_values) + past_key_values = ONNXEncoderDecoderCache.from_legacy_cache(past_key_values) output_sequence = inputs_embeds output_cross_attentions = False @@ -5821,7 +5660,7 @@ def patched_decoder_forward( past_key_values = decoder_out.past_key_values if past_key_values is not 
None: - if isinstance(past_key_values, OVEncoderDecoderCache): + if isinstance(past_key_values, ONNXEncoderDecoderCache): past_key_values = past_key_values.self_attention_cache.to_legacy_cache() else: past_key_values = [cache_item[:2] for cache_item in past_key_values] @@ -5873,7 +5712,7 @@ def __init__( # Adopted from https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py#L2156-L2178 # moved audio and vision features processing outside model def lm_forward(self, inputs_embeds, attention_mask, position_ids, past_key_values, use_cache=True): - pkv = OVDynamicCache.from_legacy_cache(past_key_values) + pkv = ONNXDynamicCache.from_legacy_cache(past_key_values) outputs = self.model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, @@ -5895,7 +5734,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class Phi4MMAudioForwardEmbeddingsPatcher(OVModelPatcher): +class Phi4MMAudioForwardEmbeddingsPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -5919,7 +5758,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class Phi4MMAudioEncoderPatcher(OVModelPatcher): +class Phi4MMAudioEncoderPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -5960,7 +5799,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class Phi4MMVisionEmbeddingsPatcher(OVModelPatcher): +class Phi4MMVisionEmbeddingsPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -6269,7 +6108,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.img_processor.embeddings.forward = self._model.img_processor.embeddings._orig_forward -class Llama4ImageEmbeddingsModelPatcher(OVModelPatcher): +class Llama4ImageEmbeddingsModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -6464,7 +6303,7 @@ def 
inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool: return inner_mask -class Llama4TextModelPatcher(OVModelPatcher): +class Llama4TextModelPatcher(ModelPatcher): def __enter__(self): super().__enter__() @@ -6634,7 +6473,7 @@ def mamba_mixer_forward( # 1. Inject a MambaCache structure into the original model to simplify input and output handling related to SSM states # 2. Patch ConvSequenceTransform module to avoid if-else branching # 3. Vectorize the selective scan operation to ensure correct behavior during JIT tracing -class MambaPatcher(OVModelPatcher): +class MambaPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -7130,7 +6969,7 @@ def segment_sum(input_tensor): # for subsequent invocation of the model's `forward` method. # 2. Patches the Zamba2MambaMixer so that the traced `forward` function works correctly # during both the prefill and decoding steps. -class Zamba2ModelPatcher(OVModelPatcher): +class Zamba2ModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -7559,7 +7398,7 @@ def granite_moe_hybrid_update_causal_mask( return causal_mask -class GraniteMoeHybridModelPatcher(OVModelPatcher): +class GraniteMoeHybridModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", From 0e41943847818a8ab8d660bde7f73a0ec3b2ba7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 10 Feb 2026 16:13:44 +0100 Subject: [PATCH 062/222] style --- optimum/exporters/openvino/model_patcher.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index b779170d8e..c2e878f0c8 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -18,12 +18,11 @@ import logging as log import math import types -from collections.abc import Iterable from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union import torch import 
torch.nn.functional as F -from transformers.cache_utils import DynamicCache, EncoderDecoderCache +from transformers.cache_utils import DynamicCache from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, BaseModelOutputWithPooling from transformers.models.phi3.modeling_phi3 import apply_rotary_pos_emb, repeat_kv from transformers.models.speecht5.modeling_speecht5 import SpeechT5EncoderWithSpeechPrenet @@ -37,8 +36,6 @@ sdpa_mask_without_vmap, ) from optimum.exporters.onnx.utils import ONNXDynamicCache, ONNXEncoderDecoderCache - - from optimum.intel.utils.import_utils import is_diffusers_version, is_torch_version, is_transformers_version @@ -4129,7 +4126,6 @@ def lm_forward( deepstack_visual_embeds, use_cache=True, ): - from transformers.cache_utils import DynamicCache pkv = DynamicCache.from_legacy_cache(past_key_values) outputs = self.model.language_model( From bc9665d0335d6d24be3a0e94c566c53424ed4088 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 10 Feb 2026 17:35:08 +0100 Subject: [PATCH 063/222] fix seq2seq stateless export --- optimum/exporters/openvino/model_patcher.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index c2e878f0c8..c8cba66a2a 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4531,6 +4531,8 @@ def patched_forward(*args, **kwargs): # the optimum-onnx seq2seq model patcher only converts to tuple starting from 4.48 if isinstance(outputs.get("past_key_values"), (ONNXDynamicCache, ONNXEncoderDecoderCache)): outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() + elif isinstance(outputs.get("past_key_values"), (DynamicCache, EncoderDecoderCache)): + outputs.pop("past_key_values") # we still need to filter out cross attention in the case of non-stateful decoder filtered_outputs = {} From fceb15186746ce08deccb26e66ccbdc958826b65 
Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 10 Feb 2026 18:07:51 +0100 Subject: [PATCH 064/222] cache depending on transformers version --- optimum/exporters/openvino/model_patcher.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index e8d56d555e..6dd442130a 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -54,7 +54,6 @@ override_arguments, sdpa_mask_without_vmap, ) -from optimum.exporters.onnx.utils import ONNXDynamicCache, ONNXEncoderDecoderCache from optimum.intel.utils.import_utils import is_diffusers_version, is_torch_version, is_transformers_version @@ -78,6 +77,14 @@ TransformersKwargs = object +if is_transformers_version("<", "5"): + from transformers import DynamicCache as ONNXDynamicCache + from transformers import EncoderDecoderCache as ONNXEncoderDecoderCache +else: + from optimum.exporters.onnx.utils import LegacyDynamicCache as ONNXDynamicCache + from optimum.exporters.onnx.utils import LegacyEncoderDecoderCache as ONNXEncoderDecoderCache + + logger = logging.getLogger(__name__) From 5133a4a9f7f6e36b84caac249fcf8a66e20adef8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 11 Feb 2026 12:05:29 +0100 Subject: [PATCH 065/222] pix2struct patcher --- optimum/exporters/openvino/model_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 4ad4cb079b..506459987d 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -5005,7 +5005,7 @@ class GPTBigCodeOpenVINOConfig(GPTBigCodeOnnxConfig): ], ) class Pix2StructOpenVINOConfig(Pix2StructOnnxConfig): - pass + _MODEL_PATCHER = OVSeq2SeqModelPatcher @register_in_tasks_manager("bert", *COMMON_TEXT_TASKS) From 
0c4a89c4877065c6f2b721e94f7abdb6975e7815 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 11 Feb 2026 16:17:59 +0100 Subject: [PATCH 066/222] fix --- optimum/exporters/openvino/model_patcher.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 6dd442130a..3f87244111 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -89,10 +89,10 @@ def _get_subcomponent_model(model, name): - if is_transformers_version("<", "5"): - return getattr(model, name) + if is_transformers_version(">=", "5") and hasattr(model, "model"): + return getattr(model.model, name) - return getattr(model.model, name) + return getattr(model, name) for idx, spec in enumerate(UNSUPPORTED_OPS_PATCHING_SPEC): From d8a482935c9cfede8a214a2555416e5cf89a7c58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 11 Feb 2026 18:05:15 +0100 Subject: [PATCH 067/222] remove internvl_chat, minicpmv in tests --- tests/openvino/test_seq2seq.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 0ad560213d..153f57be8e 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -535,14 +535,10 @@ def test_pipeline(self, model_arch: str): class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES = [ - "internvl_chat", "llava", "llava_next", "llava_next_mistral", "llava_next_video", - "llava-qwen2", - "minicpmv", - "phi3_v", "qwen2_vl", ] SUPPORT_VIDEO = ["llava_next_video", "qwen2_vl"] @@ -554,9 +550,14 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES += ["maira2", "idefics3"] if is_transformers_version(">=", "4.49.0"): - SUPPORTED_ARCHITECTURES += ["qwen2_5_vl", "got_ocr2", "phi4mm"] + SUPPORTED_ARCHITECTURES 
+= ["qwen2_5_vl", "got_ocr2"] SUPPORT_VIDEO.append("qwen2_5_vl") - SUPPORT_AUDIO.append("phi4mm") + + if is_transformers_version("<", "4.54.0"): + # remote code models differs after transformers v4.54 + SUPPORTED_ARCHITECTURES += ["phi4mm"] + SUPPORT_AUDIO.append("phi4mm") + if is_transformers_version(">", "4.49"): SUPPORTED_ARCHITECTURES += ["gemma3", "smolvlm"] if is_transformers_version(">=", "4.51"): @@ -569,9 +570,13 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES += ["qwen3_vl"] SUPPORT_VIDEO += ["qwen3_vl"] - if is_transformers_version(">=", "4.54.0"): + if is_transformers_version("<", "4.54.0"): # remote code models differs after transformers v4.54 - SUPPORTED_ARCHITECTURES = set(SUPPORTED_ARCHITECTURES) - {"llava-qwen2", "phi3_v", "phi4mm"} + SUPPORTED_ARCHITECTURES += ["llava-qwen2", "phi3_v"] + + if is_transformers_version("<", "5"): + # remote code models incompatible after transformers v5 + SUPPORTED_ARCHITECTURES += ["internvl_chat", "minicpmv"] REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"] IMAGE = Image.open( From e477044ed4c9906091a9ec4f07e91535c86834be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 11 Feb 2026 18:12:38 +0100 Subject: [PATCH 068/222] set max transformers version for internvl_chat minicpmv --- optimum/exporters/openvino/model_configs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 506459987d..0fb8663202 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -2078,6 +2078,7 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) @register_in_tasks_manager("internvl_chat", *["image-text-to-text"], library_name="transformers") class InternVLChatOpenVINOConfig(BaseVLMOpenVINOConfig): + MAX_TRANSFORMERS_VERSION = "4.57.6" def 
__init__( self, config: "PretrainedConfig", @@ -2862,6 +2863,7 @@ class MiniCPMVConfigBehavior(str, enum.Enum): @register_in_tasks_manager("minicpmv", *["image-text-to-text"], library_name="transformers") class MiniCPMVOpenVINOConfig(BaseVLMOpenVINOConfig): + MAX_TRANSFORMERS_VERSION = "4.57.6" SUPPORTED_BEHAVIORS = [model_type.value for model_type in MiniCPMVConfigBehavior] NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig DUMMY_INPUT_GENERATOR_CLASSES = () From e0b2b46849c44d25ca4e6e0975179bd4e57e7306 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 11 Feb 2026 18:20:18 +0100 Subject: [PATCH 069/222] style --- optimum/exporters/openvino/model_configs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 0fb8663202..c3348ed285 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -2079,6 +2079,7 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) @register_in_tasks_manager("internvl_chat", *["image-text-to-text"], library_name="transformers") class InternVLChatOpenVINOConfig(BaseVLMOpenVINOConfig): MAX_TRANSFORMERS_VERSION = "4.57.6" + def __init__( self, config: "PretrainedConfig", From 50fe59046294a0aff8dab9d527c7d1027666922c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 15:47:37 +0100 Subject: [PATCH 070/222] fix textual inversion --- optimum/intel/openvino/loaders.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/optimum/intel/openvino/loaders.py b/optimum/intel/openvino/loaders.py index 214a4a7e8c..bd62e047bb 100644 --- a/optimum/intel/openvino/loaders.py +++ b/optimum/intel/openvino/loaders.py @@ -22,7 +22,7 @@ from openvino import Type from openvino import opset11 as ops from openvino.passes import Manager, Matcher, MatcherPass, WrapType -from transformers import PreTrainedTokenizer +from 
transformers import PreTrainedTokenizerBase from .utils import TEXTUAL_INVERSION_EMBEDDING_KEYS @@ -80,7 +80,7 @@ def load_textual_inversion( self, pretrained_model_name_or_path: Union[str, List[str], Dict[str, torch.Tensor], List[Dict[str, torch.Tensor]]], token: Optional[Union[str, List[str]]] = None, - tokenizer: Optional["PreTrainedTokenizer"] = None, # noqa: F821 + tokenizer: Optional["PreTrainedTokenizerBase"] = None, # noqa: F821 text_encoder: Optional["openvino.Model"] = None, # noqa: F821 **kwargs, ): @@ -88,9 +88,9 @@ def load_textual_inversion( raise ValueError( f"{self.__class__.__name__} requires `self.tokenizer` for calling `{self.load_textual_inversion.__name__}`" ) - elif not isinstance(self.tokenizer, PreTrainedTokenizer): + elif not isinstance(self.tokenizer, PreTrainedTokenizerBase): raise ValueError( - f"{self.__class__.__name__} requires `self.tokenizer` of type `PreTrainedTokenizer` for calling `{self.load_textual_inversion.__name__}`" + f"{self.__class__.__name__} requires `self.tokenizer` of type `PreTrainedTokenizerBase` for calling `{self.load_textual_inversion.__name__}`" ) if not hasattr(self, "text_encoder"): From e9ff083d929c3c132f4b1d17b05b59bc50873cf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 16:15:25 +0100 Subject: [PATCH 071/222] add back inc --- .github/workflows/build_documentation.yml | 2 +- .github/workflows/build_pr_documentation.yml | 2 +- docs/source/neural_compressor/reference.mdx | 40 ++++++++++++++++++++ 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index 15852df3eb..ce3eb464ce 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -52,7 +52,7 @@ jobs: pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder uv pip install transformers==4.57.6 - uv pip install .[quality] diffusers accelerate datasets 
+ uv pip install .[quality] neural-compressor[pt]>3.4 diffusers accelerate datasets - name: Make documentation shell: bash diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index 311f29b0dd..6b0b89f3f1 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -39,7 +39,7 @@ jobs: pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder uv pip install transformers==4.57.6 - uv pip install .[quality] diffusers accelerate datasets + uv pip install .[quality] neural-compressor[pt]>3.4 diffusers accelerate datasets - name: Make documentation shell: bash diff --git a/docs/source/neural_compressor/reference.mdx b/docs/source/neural_compressor/reference.mdx index b83618b4bc..c631aed883 100644 --- a/docs/source/neural_compressor/reference.mdx +++ b/docs/source/neural_compressor/reference.mdx @@ -14,3 +14,43 @@ specific language governing permissions and limitations under the License. `optimum.intel.neural_compressor` is deprecated and will be removed in the next major release. 
+ +## INCQuantizer + +[[autodoc]] neural_compressor.quantization.INCQuantizer + +## INCTrainer + +[[autodoc]] neural_compressor.trainer.INCTrainer + +## INCModel + +[[autodoc]] neural_compressor.modeling_base.INCModel + +## INCModelForSequenceClassification + +[[autodoc]] neural_compressor.modeling_base.INCModelForSequenceClassification + +## INCModelForQuestionAnswering + +[[autodoc]] neural_compressor.modeling_base.INCModelForQuestionAnswering + +## INCModelForTokenClassification + +[[autodoc]] neural_compressor.modeling_base.INCModelForTokenClassification + +## INCModelForMultipleChoice + +[[autodoc]] neural_compressor.modeling_base.INCModelForMultipleChoice + +## INCModelForMaskedLM + +[[autodoc]] neural_compressor.modeling_base.INCModelForMaskedLM + +## INCModelForCausalLM + +[[autodoc]] neural_compressor.modeling_base.INCModelForCausalLM + +## INCModelForSeq2SeqLM + +[[autodoc]] neural_compressor.modeling_base.INCModelForSeq2SeqLM \ No newline at end of file From 49f020f4c3d6c09b45330ff47bf3c05b36e89208 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 16:17:22 +0100 Subject: [PATCH 072/222] style --- docs/source/neural_compressor/reference.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/neural_compressor/reference.mdx b/docs/source/neural_compressor/reference.mdx index c631aed883..b6e3d8f468 100644 --- a/docs/source/neural_compressor/reference.mdx +++ b/docs/source/neural_compressor/reference.mdx @@ -53,4 +53,4 @@ specific language governing permissions and limitations under the License. 
## INCModelForSeq2SeqLM -[[autodoc]] neural_compressor.modeling_base.INCModelForSeq2SeqLM \ No newline at end of file +[[autodoc]] neural_compressor.modeling_base.INCModelForSeq2SeqLM From 31e8c4462f9146c7d66faeb013a40d80ece08f0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 16:35:35 +0100 Subject: [PATCH 073/222] skip text2text generation pipeline when >= v5 --- tests/openvino/test_modeling_basic.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_modeling_basic.py b/tests/openvino/test_modeling_basic.py index 3dac24c69a..549411f344 100644 --- a/tests/openvino/test_modeling_basic.py +++ b/tests/openvino/test_modeling_basic.py @@ -28,7 +28,7 @@ OVModelForTokenClassification, OVStableDiffusionPipeline, ) - +from optimum.intel.utils.import_utils import is_transformers_version # Make sure that common architectures are used in combination with common tasks MODEL_NAMES = { @@ -58,6 +58,9 @@ def test_pipeline(self, model_id): """ tokenizer = AutoTokenizer.from_pretrained(model_id) model_class_str = MODEL_NAMES[model_id] + if model_class_str == "OVModelForSeq2SeqLM" and is_transformers_version(">=", "5"): + self.skipTest("text2text-generation pipeline was deprecated in transformers v5") + model_class = eval(model_class_str) model = model_class.from_pretrained(model_id, device=OPENVINO_DEVICE) model.save_pretrained(f"{model_id}_ov") From 4e43429bfd283e0bb1ffe8630e440833844aa5c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 16:59:51 +0100 Subject: [PATCH 074/222] fix perceiver vision preprocessor loading --- tests/openvino/test_modeling.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 785c4e2782..f53c9fdce6 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -34,6 +34,7 @@ from sentence_transformers import SentenceTransformer from 
transformers import ( AutoFeatureExtractor, + AutoImageProcessor, AutoModel, AutoModelForAudioClassification, AutoModelForAudioFrameClassification, @@ -1187,7 +1188,7 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(ov_model.config, PretrainedConfig) set_seed(SEED) transformers_model = AutoModelForImageClassification.from_pretrained(model_id) - preprocessor = AutoProcessor.from_pretrained(model_id) + preprocessor = AutoImageProcessor.from_pretrained(model_id) url = TEST_IMAGE_URL image = Image.open(requests.get(url, stream=True).raw) inputs = preprocessor(images=image, return_tensors="pt") @@ -1211,7 +1212,7 @@ def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] model = OVModelForImageClassification.from_pretrained(model_id, device=OPENVINO_DEVICE) model.eval() - preprocessor = AutoProcessor.from_pretrained(model_id) + preprocessor = AutoImageProcessor.from_pretrained(model_id) pipe = pipeline("image-classification", model=model, feature_extractor=preprocessor) inputs = TEST_IMAGE_URL outputs = pipe(inputs) From 3565637f2ced588d5c5dfc271fdffb015bc91c38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 17:15:11 +0100 Subject: [PATCH 075/222] fix question answering pipeline --- tests/openvino/test_modeling_basic.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/openvino/test_modeling_basic.py b/tests/openvino/test_modeling_basic.py index 549411f344..5d5665beeb 100644 --- a/tests/openvino/test_modeling_basic.py +++ b/tests/openvino/test_modeling_basic.py @@ -72,9 +72,13 @@ def test_pipeline(self, model_id): elif model_class_str == "OVModelForMaskedLM": input_text[0] = f"{input_text[0]} {tokenizer.mask_token}" - if model_class_str in TASKS: - task = TASKS[model_class_str] - pipe = pipeline(task, model=model, tokenizer=tokenizer) + task = TASKS[model_class_str] + pipe = pipeline(task, model=model, tokenizer=tokenizer) + + if task == "question-answering": + # 
positional arguments deprecated for question-answering pipeline since v5 + pipe(question=input_text[0], context=input_text[1]) + else: pipe(*input_text) gc.collect() From 2d1929d9b81931e3fc43fa2040c671df1a23f93e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 17:37:05 +0100 Subject: [PATCH 076/222] only install diffusers when compatible --- .github/workflows/test_openvino.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 835204f423..6791a8962f 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -54,7 +54,7 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip uv - uv pip install .[diffusers,tests] + uv pip install .[tests] - if: ${{ matrix.test-pattern == '*modeling*' }} name: Install OpenVINO @@ -71,10 +71,10 @@ jobs: run: | uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator - - if: ${{ matrix.transformers-version == '5.0.0' }} + - if: ${{ matrix.transformers-version != '5.0.0' }} name: Install diffusers run: | - uv pip install git+https://github.com/huggingface/diffusers + uv pip install diffusers - if: ${{ matrix.transformers-version != '4.45.0' && matrix.test-pattern == '*decoder*'}} name: Install auto-gptq, autoawq From a8b08a4c51fa9f097e579141d1e37ff9edc1f4d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 17:38:11 +0100 Subject: [PATCH 077/222] fix diffusers mapping --- optimum/exporters/openvino/model_configs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index c3348ed285..42a58ee523 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -290,6 +290,8 @@ def init_model_configs(): 
TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["fill"] = "FluxFillPipeline" TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["fill"] = {"flux": "FluxFillPipeline"} TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["text-to-image"] = ("AutoPipelineForText2Image", "SanaPipeline") + if "text-to-image" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS: + TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["text-to-image"] = {} TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["text-to-image"]["sana"] = "SanaPipeline" TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["text-to-image"]["sana-sprint"] = "SanaSprintPipeline" if is_diffusers_available() and "text-to-video" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS: From 83ae84653c69c19e93be9de63e30f46e35386e94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 17:38:43 +0100 Subject: [PATCH 078/222] style --- tests/openvino/test_modeling_basic.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/openvino/test_modeling_basic.py b/tests/openvino/test_modeling_basic.py index 5d5665beeb..c2576db98b 100644 --- a/tests/openvino/test_modeling_basic.py +++ b/tests/openvino/test_modeling_basic.py @@ -17,16 +17,11 @@ from transformers import AutoTokenizer, pipeline from utils_tests import OPENVINO_DEVICE from optimum.intel import ( - OVModelForAudioClassification, OVModelForCausalLM, - OVModelForFeatureExtraction, - OVModelForImageClassification, OVModelForMaskedLM, OVModelForQuestionAnswering, OVModelForSeq2SeqLM, OVModelForSequenceClassification, - OVModelForTokenClassification, - OVStableDiffusionPipeline, ) from optimum.intel.utils.import_utils import is_transformers_version From cad085b66ec8cba022a4117b4c657bc519b20903 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 17:39:06 +0100 Subject: [PATCH 079/222] update diffusers extra --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e010f5c0ef..617e12d24a 
100644 --- a/setup.py +++ b/setup.py @@ -73,7 +73,7 @@ "openvino": ["nncf>=2.19.0", "openvino>=2025.3.0", "openvino-tokenizers>=2025.3.0"], "neural-compressor": ["neural-compressor[pt]>=3.4.1", "accelerate", "transformers<4.46", "datasets"], "ipex": ["intel-extension-for-pytorch>=2.8", "transformers>4.54,<4.56", "accelerate"], - "diffusers": ["diffusers"], + "diffusers": ["diffusers", "transformers<5"], "quality": QUALITY_REQUIRE, "tests": TESTS_REQUIRE, } From 5dbe3c894447bb8759454ff2d273fffd69de73fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 16 Feb 2026 14:18:56 +0100 Subject: [PATCH 080/222] add transformers version workflow --- .github/workflows/test_openvino.yml | 8 ++++---- setup.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 6791a8962f..b42bca1548 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -38,7 +38,7 @@ jobs: "*diffusion*", "*quantization*", ] - transformers-version: ["4.45.0", "5.0.0", "latest"] + transformers-version: ["4.45", "4.57", "latest"] runs-on: ubuntu-22.04 @@ -66,17 +66,17 @@ jobs: run: | uv pip install transformers==${{ matrix.transformers-version }} - - if: ${{ matrix.transformers-version == '4.45.0' }} + - if: ${{ matrix.transformers-version == '4.45' }} name: Install specific dependencies and versions required for older transformers run: | uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator - - if: ${{ matrix.transformers-version != '5.0.0' }} + - if: ${{ matrix.transformers-version != 'latest' }} name: Install diffusers run: | uv pip install diffusers - - if: ${{ matrix.transformers-version != '4.45.0' && matrix.test-pattern == '*decoder*'}} + - if: ${{ matrix.transformers-version != '4.45' && matrix.test-pattern == '*decoder*'}} name: Install auto-gptq, autoawq run: | uv pip 
install auto-gptq "autoawq<0.2.8" diff --git a/setup.py b/setup.py index 617e12d24a..16e2a82fed 100644 --- a/setup.py +++ b/setup.py @@ -56,7 +56,7 @@ "sentence-transformers", "open_clip_torch>=2.26.1", "peft", - "datasets[audio]>=1.4.0,<4.0.0", + "datasets>=1.4.0,<4.0.0", "tbb", "langchain-huggingface", "hf_xet", From b7ce98b6639488f65cba525bfeabff6d502841b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 16 Feb 2026 16:08:10 +0100 Subject: [PATCH 081/222] set transformers 4.57.6 for tests --- .github/workflows/test_openvino.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index b42bca1548..59bd4673b3 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -38,7 +38,7 @@ jobs: "*diffusion*", "*quantization*", ] - transformers-version: ["4.45", "4.57", "latest"] + transformers-version: ["4.45.0", "4.57.6", "latest"] runs-on: ubuntu-22.04 @@ -66,7 +66,7 @@ jobs: run: | uv pip install transformers==${{ matrix.transformers-version }} - - if: ${{ matrix.transformers-version == '4.45' }} + - if: ${{ matrix.transformers-version == '4.45.0' }} name: Install specific dependencies and versions required for older transformers run: | uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator @@ -76,7 +76,7 @@ jobs: run: | uv pip install diffusers - - if: ${{ matrix.transformers-version != '4.45' && matrix.test-pattern == '*decoder*'}} + - if: ${{ matrix.transformers-version != '4.45.0' && matrix.test-pattern == '*decoder*'}} name: Install auto-gptq, autoawq run: | uv pip install auto-gptq "autoawq<0.2.8" From d692d44785edd13a424f33843d004734a3fc564a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 16 Feb 2026 16:08:54 +0100 Subject: [PATCH 082/222] batch_encode_plus was deprecated in v5 --- 
tests/openvino/test_modeling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index f53c9fdce6..03e099f77e 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -1605,7 +1605,7 @@ def test_load_from_hub_and_save_model(self): tokenizer = AutoTokenizer.from_pretrained(self.OV_MODEL_ID_IR) all_text = ["a dog", "a cat", "a frog"] - tokens = tokenizer.batch_encode_plus( + tokens = tokenizer( all_text, return_tensors="pt", max_length=loaded_model.config.text_config.context_length, @@ -1683,7 +1683,7 @@ def test_functions(self): tokenizer = AutoTokenizer.from_pretrained(self.OV_MODEL_ID_IR) all_text = ["a dog", "a cat", "a frog"] - tokens = tokenizer.batch_encode_plus( + tokens = tokenizer( all_text, return_tensors="pt", max_length=model.config.text_config.context_length, From 93679e9b8e3afca7dc7446fd9773f0425d3990c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 16 Feb 2026 18:10:15 +0100 Subject: [PATCH 083/222] fix sam --- optimum/intel/openvino/modeling_sam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/openvino/modeling_sam.py b/optimum/intel/openvino/modeling_sam.py index 4722437b72..57b33be14e 100644 --- a/optimum/intel/openvino/modeling_sam.py +++ b/optimum/intel/openvino/modeling_sam.py @@ -403,7 +403,7 @@ def get_image_wide_positional_embeddings(self): x_embed = x_embed / size positional_embedding = self.shared_image_embedding(torch.stack([x_embed, y_embed], dim=-1)) - return positional_embedding.permute(2, 0, 1).unsqueeze(0) + return positional_embedding.permute(2, 0, 1).unsqueeze(0).detach() def get_image_features(self, pixel_values, *args, **kwargs): return torch.from_numpy(self.vision_encoder(pixel_values).image_embeddings) From b2ef4184f92d626d3f9db4263e1d6b33044b75a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 16 Feb 2026 18:26:46 +0100 Subject: [PATCH 
084/222] install librosa for tests --- .github/workflows/test_openvino.yml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 59bd4673b3..38a10c22a7 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -64,17 +64,12 @@ jobs: - if: ${{ matrix.transformers-version != 'latest' }} name: Install transformers run: | - uv pip install transformers==${{ matrix.transformers-version }} + uv pip install transformers==${{ matrix.transformers-version }} diffusers - if: ${{ matrix.transformers-version == '4.45.0' }} name: Install specific dependencies and versions required for older transformers run: | - uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator - - - if: ${{ matrix.transformers-version != 'latest' }} - name: Install diffusers - run: | - uv pip install diffusers + uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator librosa - if: ${{ matrix.transformers-version != '4.45.0' && matrix.test-pattern == '*decoder*'}} name: Install auto-gptq, autoawq From 3fb01723225341f11dc850e228855f16352d1e20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 16 Feb 2026 19:45:29 +0100 Subject: [PATCH 085/222] rename OVDynamicCache --- optimum/exporters/openvino/model_patcher.py | 42 ++++++++++----------- tests/openvino/test_modeling.py | 2 +- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 3f87244111..009d226b34 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -78,11 +78,11 @@ if is_transformers_version("<", "5"): - from transformers import DynamicCache as ONNXDynamicCache - from 
transformers import EncoderDecoderCache as ONNXEncoderDecoderCache + from transformers import DynamicCache as OVDynamicCache + from transformers import EncoderDecoderCache as OVEncoderDecoderCache else: - from optimum.exporters.onnx.utils import LegacyDynamicCache as ONNXDynamicCache - from optimum.exporters.onnx.utils import LegacyEncoderDecoderCache as ONNXEncoderDecoderCache + from optimum.exporters.onnx.utils import LegacyDynamicCache as OVDynamicCache + from optimum.exporters.onnx.utils import LegacyEncoderDecoderCache as OVEncoderDecoderCache logger = logging.getLogger(__name__) @@ -331,7 +331,7 @@ def __enter__(self): _mixtral_sparse_moe_block_forward, layer.block_sparse_moe ) else: - self._model.config._experts_implementation = "batched_mm" + self._model.set_experts_implementation("batched_mm") def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -1382,7 +1382,7 @@ def phi3_442_forward( if use_cache: use_legacy_cache = not isinstance(past_key_values, Cache) if use_legacy_cache: - past_key_values = ONNXDynamicCache.from_legacy_cache(past_key_values) + past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) past_key_values_length = past_key_values.get_usable_length(seq_length) if position_ids is None: @@ -1647,7 +1647,7 @@ def __enter__(self): _phi_moe_sparse_moe_block_forward, layer.block_sparse_moe ) else: - self._model.config._experts_implementation = "batched_mm" + self._model.set_experts_implementation("batched_mm") def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -2964,7 +2964,7 @@ def patched_forward(*args, **kwargs): legacy_pkv = args[pkv_argument_index] pkv_in_args = True if legacy_pkv is not None: - pkv = ONNXDynamicCache.from_legacy_cache(legacy_pkv) + pkv = OVDynamicCache.from_legacy_cache(legacy_pkv) return_legacy_cache = True if not pkv_in_args: kwargs["past_key_values"] = pkv @@ -4118,7 +4118,7 @@ def forward_wrap( 
input_ids=None, use_cache=True, ): - new_past_key_values = ONNXDynamicCache.from_legacy_cache(past_key_values) + new_past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) result = self.__orig_forward( input_ids=input_ids, attention_mask=attention_mask, @@ -4502,7 +4502,7 @@ def __enter__(self): ) else: - self._model.config._experts_implementation = "batched_mm" + self._model.set_experts_implementation("batched_mm") def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -4547,11 +4547,11 @@ def patched_forward(*args, **kwargs): pkv = args[pkv_arg_index] if pkv is not None: - if isinstance(pkv, ONNXEncoderDecoderCache): + if isinstance(pkv, OVEncoderDecoderCache): pkv = pkv.self_attention_cache.to_legacy_cache() else: pkv = [pkv_item[:2] for pkv_item in pkv] - pkv = ONNXEncoderDecoderCache.from_legacy_cache(pkv) + pkv = OVEncoderDecoderCache.from_legacy_cache(pkv) if "past_key_values" in kwargs: kwargs["past_key_values"] = pkv @@ -4561,7 +4561,7 @@ def patched_forward(*args, **kwargs): outputs = self.super_patched_forward(*args, **kwargs) # the optimum-onnx seq2seq model patcher only converts to tuple starting from 4.48 - if isinstance(outputs.get("past_key_values"), (ONNXDynamicCache, ONNXEncoderDecoderCache)): + if isinstance(outputs.get("past_key_values"), (OVDynamicCache, OVEncoderDecoderCache)): outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() elif isinstance(outputs.get("past_key_values"), (DynamicCache, EncoderDecoderCache)): outputs.pop("past_key_values") @@ -4742,7 +4742,7 @@ def __init__( def forward( self, attention_mask, position_ids, past_key_values, token_type_ids, inputs_embeds, use_cache=True ): - pkv = ONNXDynamicCache.from_legacy_cache(past_key_values) + pkv = OVDynamicCache.from_legacy_cache(past_key_values) past_seen_tokens = past_key_values[0][0].shape[-2] cache_position = torch.arange( @@ -5129,7 +5129,7 @@ def _blenderbot_attn_forward_new( query_states = 
query_states if past_key_value is not None: - if isinstance(past_key_value, ONNXEncoderDecoderCache): + if isinstance(past_key_value, OVEncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) if is_cross_attention: # after the first generated id, we can subsequently re-use all key/value_states from cache @@ -5331,7 +5331,7 @@ def __enter__(self): modulewise_patch(self._model, Qwen2MoeSparseMoeBlock, _qwen2moe_sparse_block_forward) if is_transformers_version(">=", "5"): - self._model.config._experts_implementation = "batched_mm" + self._model.set_experts_implementation("batched_mm") def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -5658,7 +5658,7 @@ def patched_decoder_forward( if past_key_values is not None: past_key_values = [cache_item[:2] for cache_item in past_key_values] if is_transformers_version(">=", "4.56"): - past_key_values = ONNXEncoderDecoderCache.from_legacy_cache(past_key_values) + past_key_values = OVEncoderDecoderCache.from_legacy_cache(past_key_values) output_sequence = inputs_embeds output_cross_attentions = False @@ -5690,7 +5690,7 @@ def patched_decoder_forward( past_key_values = decoder_out.past_key_values if past_key_values is not None: - if isinstance(past_key_values, ONNXEncoderDecoderCache): + if isinstance(past_key_values, OVEncoderDecoderCache): past_key_values = past_key_values.self_attention_cache.to_legacy_cache() else: past_key_values = [cache_item[:2] for cache_item in past_key_values] @@ -5742,7 +5742,7 @@ def __init__( # Adopted from https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py#L2156-L2178 # moved audio and vision features processing outside model def lm_forward(self, inputs_embeds, attention_mask, position_ids, past_key_values, use_cache=True): - pkv = ONNXDynamicCache.from_legacy_cache(past_key_values) + pkv = OVDynamicCache.from_legacy_cache(past_key_values) outputs = 
self.model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, @@ -6696,7 +6696,7 @@ def __enter__(self): self.original_moe_forward = Qwen3MoeSparseMoeBlock.forward Qwen3MoeSparseMoeBlock.forward = qwen3_moe_forward_patched if is_transformers_version(">=", "5"): - self._model.config._experts_implementation = "batched_mm" + self._model.set_experts_implementation("batched_mm") def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -7374,7 +7374,7 @@ def __enter__(self): GptOssExperts.forward = gpt_oss_forward if is_transformers_version(">=", "5"): - self._model.config._experts_implementation = "batched_mm" + self._model.set_experts_implementation("batched_mm") def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 03e099f77e..4eccde4c87 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -1723,7 +1723,7 @@ def test_functions(self): self.assertTrue(torch.allclose(model_outputs.logits_per_image, res.logits_per_image, atol=1e-2)) model.reshape(1, -1) - reshaped_tokens = tokenizer.batch_encode_plus( + reshaped_tokens = tokenizer( ["a dog"], return_tensors="pt", max_length=model.config.text_config.context_length, From 3d2286c4bc2aefc7d1c89d4c1554032440738ad7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 16 Feb 2026 20:56:13 +0100 Subject: [PATCH 086/222] qwenvl3 fix --- optimum/exporters/openvino/model_configs.py | 2 +- optimum/exporters/openvino/model_patcher.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 42a58ee523..ef54b8f78d 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -3832,7 +3832,7 @@ def __init__( @staticmethod def 
get_model_for_behavior(model, behavior: Union[str, QwenVLConfigBehavior]): if behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS: - vision_emb_pos = model.visual.pos_embed + vision_emb_pos = _get_subcomponent_model(model, "visual").pos_embed vision_emb_pos.config = model.config.vision_config return vision_emb_pos diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 009d226b34..b7084b7a34 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4159,7 +4159,7 @@ def lm_forward( deepstack_visual_embeds, use_cache=True, ): - pkv = DynamicCache.from_legacy_cache(past_key_values) + pkv = OVDynamicCache.from_legacy_cache(past_key_values) outputs = self.model.language_model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, @@ -7858,7 +7858,7 @@ def forward( inputs_embeds: torch.Tensor = self.embed_tokens(input_ids) if use_cache and past_key_values is None: - past_key_values = DynamicCache(config=self.config) + past_key_values = OVDynamicCache(config=self.config) if cache_position is None: past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 From 46fa8a70d1910cd985a8411a6a8650bcaf7f784a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 16 Feb 2026 23:06:16 +0100 Subject: [PATCH 087/222] fix qwen2vl --- optimum/exporters/openvino/model_configs.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index ef54b8f78d..cea7528529 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -3716,12 +3716,17 @@ def with_behavior( behavior = QwenVLConfigBehavior(behavior) if behavior == QwenVLConfigBehavior.TEXT_EMBEDDINGS: - return get_vlm_text_embeddings_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype) + return 
get_vlm_text_embeddings_config( + "qwen2", + self._orig_config if is_transformers_version("<", "5") else self._orig_config.text_config, + self.int_dtype, + self.float_dtype, + ) if behavior == QwenVLConfigBehavior.LANGUAGE: return get_vlm_text_generation_config( "qwen2", - self._orig_config, + self._orig_config if is_transformers_version("<", "5") else self._orig_config.text_config, self.int_dtype, self.float_dtype, model_patcher=Qwen2VLLanguageModelPatcher, From 20bb596bcd3d3c08c93e0da51e778d3be0060f1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 10:43:13 +0100 Subject: [PATCH 088/222] github workflow librosa --- .github/workflows/test_openvino.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 38a10c22a7..085619c5fa 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -54,7 +54,7 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip uv - uv pip install .[tests] + uv pip install .[tests] librosa - if: ${{ matrix.test-pattern == '*modeling*' }} name: Install OpenVINO @@ -69,7 +69,7 @@ jobs: - if: ${{ matrix.transformers-version == '4.45.0' }} name: Install specific dependencies and versions required for older transformers run: | - uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator librosa + uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator - if: ${{ matrix.transformers-version != '4.45.0' && matrix.test-pattern == '*decoder*'}} name: Install auto-gptq, autoawq From a091dadd1262971955598c460ea700fde6232f9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 10:46:04 +0100 Subject: [PATCH 089/222] Update MAX_TRANSFORMERS_VERSION for incompatible models --- 
optimum/exporters/openvino/model_configs.py | 43 ++++++++++++++++++--- tests/openvino/test_decoder.py | 3 +- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index cea7528529..a25c3e7b8e 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1080,6 +1080,8 @@ class Phi3OpenVINOConfig(PhiOnnxConfig): ) class PhiMoEOpenVINOConfig(Phi3OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.46.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = PhiMoEModelPatcher @@ -1284,6 +1286,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int @register_in_tasks_manager("aquila", *["text-generation", "text-generation-with-past"], library_name="transformers") class AquilaMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, AquilaDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = AquilaDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True) @@ -1293,6 +1296,7 @@ class AquilaMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): @register_in_tasks_manager("xverse", *["text-generation", "text-generation-with-past"], library_name="transformers") class XverseMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = DummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig @@ -1302,6 +1306,7 @@ class XverseMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): @register_in_tasks_manager("internlm", *["text-generation", "text-generation-with-past"], 
library_name="transformers") class InternLMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = DummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig @@ -1892,6 +1897,8 @@ def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[ @register_in_tasks_manager("llava", *["image-text-to-text"], library_name="transformers") class LlavaOpenVINOConfig(BaseVLMOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.37.2" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" def __init__( self, @@ -1930,6 +1937,7 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs) -> Dict: @register_in_tasks_manager("llava_next", *["image-text-to-text"], library_name="transformers") class LlavaNextOpenVINOConfig(LlavaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.40.0" + MAX_TRANSFORMERS_VERSION = "5.99" def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): model_kwargs = model_kwargs or {} @@ -1991,6 +1999,8 @@ class LlavaNextVideoConfigBehavior(str, enum.Enum): @register_in_tasks_manager("llava_next_video", *["image-text-to-text"], library_name="transformers") class LlavaNextVideoOpenVINOConfig(LlavaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.42.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" SUPPORTED_BEHAVIORS = [model_type.value for model_type in LlavaNextVideoConfigBehavior] def with_behavior( @@ -2055,6 +2065,7 @@ def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[ ) class MairaOpenVINOConfig(LlavaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.46.0" + MAX_TRANSFORMERS_VERSION = "5.99" SUPPORTS_PAST = True def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): @@ -3976,6 +3987,8 
@@ class GraniteOpenVINOConfig(LlamaOpenVINOConfig): ) class GraniteMoEOpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.45.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = GraniteMoEModelPatcher @@ -4009,7 +4022,8 @@ class T5OpenVINOConfig(T5OnnxConfig): library_name="transformers", ) class MT5OpenVINOConfig(T5OpenVINOConfig): - pass + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( @@ -4098,6 +4112,8 @@ class DeepseekOpenVINOConfig(MiniCPM3OpenVINOConfig): @register_in_tasks_manager("got_ocr2", *["image-to-text", "image-text-to-text"], library_name="transformers") class GotOCR2OpenVINOConfig(BaseVLMOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.49.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" def __init__( self, @@ -4130,6 +4146,8 @@ def __init__( @register_in_tasks_manager("gemma3", *["image-text-to-text"], library_name="transformers") class Gemma3OpenVINOConfig(BaseVLMOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.50.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" def __init__( self, @@ -4213,6 +4231,8 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int class Idefics3OpenVINOConfig(BaseVLMOpenVINOConfig): DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator, DummyVisionPositionIdsInputGenerator) MIN_TRANSFORMERS_VERSION = "4.46.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" def __init__( self, @@ -4271,6 +4291,8 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) @register_in_tasks_manager("smolvlm", *["image-text-to-text"], library_name="transformers") class SmolVLMOpenVINOConfig(Idefics3OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.50.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( @@ -4335,6 +4357,8 @@ class 
PegasusOpenVINOConfig(PegasusOnnxConfig): ) class MarianOpenVINOConfig(MarianOnnxConfig): _MODEL_PATCHER = MarianModelPatcher + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" class DummySpeechT5OpenVINOInputGenerator(DummyInputGenerator): @@ -4548,6 +4572,8 @@ class Llama4TextOpenVINOConfig(LlamaOpenVINOConfig): ) class Llama4OpenVINOConfig(GotOCR2OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.51.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): model_kwargs = model_kwargs or {} @@ -4789,6 +4815,8 @@ class Zamba2OpenVINOConfig(MambaOpenVINOConfig): DUMMY_PKV_GENERATOR_CLASS = Zamba2DummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig MIN_TRANSFORMERS_VERSION = "4.49.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = Zamba2ModelPatcher def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): @@ -5015,7 +5043,9 @@ class GPTBigCodeOpenVINOConfig(GPTBigCodeOnnxConfig): ], ) class Pix2StructOpenVINOConfig(Pix2StructOnnxConfig): - _MODEL_PATCHER = OVSeq2SeqModelPatcher + # _MODEL_PATCHER = OVSeq2SeqModelPatcher + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager("bert", *COMMON_TEXT_TASKS) @@ -5060,7 +5090,8 @@ class MobileBertOpenVINOConfig(MobileBertOnnxConfig): @register_in_tasks_manager("xlm", *COMMON_TEXT_TASKS) class XLMOpenVINOConfig(XLMOnnxConfig): - pass + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager("xlm-roberta", *COMMON_TEXT_TASKS) @@ -5085,7 +5116,8 @@ class CamembertOpenVINOConfig(CamembertOnnxConfig): @register_in_tasks_manager("flaubert", *COMMON_TEXT_TASKS) class FlaubertOpenVINOConfig(FlaubertOnnxConfig): - pass + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" 
@register_in_tasks_manager( @@ -5117,7 +5149,8 @@ class Data2VecAudioOpenVINOConfig(Data2VecAudioOnnxConfig): @register_in_tasks_manager("data2vec-text", *COMMON_TEXT_TASKS) class Data2VecTextOpenVINOConfig(Data2VecTextOnnxConfig): - pass + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager("data2vec-vision", *["feature-extraction", "image-classification"]) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 1a55242d5c..235eb8406d 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -856,7 +856,8 @@ def test_load_with_different_dtype(self): ) @parameterized.expand(EAGLE3_MODELS.items()) - @pytest.mark.skipif(is_transformers_version("<", "4.54"), reason="Eagle3 requires transformers >= 4.54") + # TODO (@echarlaix) transformers v5 support + @pytest.mark.skipif(is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"), reason="Eagle3 requires transformers >= 4.54") def test_load_and_infer_with_eagle3_model(self, model_arch, model_pair): draft_model_id, target_model_id = model_pair From 847c98d8235c931fa546057fcc815b98806eafaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 11:05:03 +0100 Subject: [PATCH 090/222] style --- tests/openvino/test_decoder.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 235eb8406d..07da27807b 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -857,7 +857,10 @@ def test_load_with_different_dtype(self): @parameterized.expand(EAGLE3_MODELS.items()) # TODO (@echarlaix) transformers v5 support - @pytest.mark.skipif(is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"), reason="Eagle3 requires transformers >= 4.54") + @pytest.mark.skipif( + is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"), + reason="Eagle3 requires 
transformers >= 4.54", + ) def test_load_and_infer_with_eagle3_model(self, model_arch, model_pair): draft_model_id, target_model_id = model_pair From 4bc2768eae5d2d18cc88aa0ecd6b2481835f7352 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 16:44:52 +0100 Subject: [PATCH 091/222] pkv fix --- optimum/exporters/openvino/model_patcher.py | 106 ++++++++++++++------ setup.py | 2 +- 2 files changed, 77 insertions(+), 31 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index b7084b7a34..a2a9d18fbc 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -24,6 +24,7 @@ import torch import torch.nn.functional as F from torch import nn +from transformers import DynamicCache, EncoderDecoderCache from transformers.cache_utils import Cache, DynamicCache, EncoderDecoderCache from transformers.configuration_utils import PretrainedConfig from transformers.generation import GenerationMixin @@ -77,14 +78,6 @@ TransformersKwargs = object -if is_transformers_version("<", "5"): - from transformers import DynamicCache as OVDynamicCache - from transformers import EncoderDecoderCache as OVEncoderDecoderCache -else: - from optimum.exporters.onnx.utils import LegacyDynamicCache as OVDynamicCache - from optimum.exporters.onnx.utils import LegacyEncoderDecoderCache as OVEncoderDecoderCache - - logger = logging.getLogger(__name__) @@ -95,6 +88,23 @@ def _get_subcomponent_model(model, name): return getattr(model, name) +def postprocess_past_key_values(past_key_values): + if isinstance(past_key_values, (EncoderDecoderCache, DynamicCache)): + if hasattr(past_key_values, "to_legacy_cache"): + past_key_values = past_key_values.to_legacy_cache() + elif isinstance(past_key_values, DynamicCache): + past_key_values = [(lay.keys, lay.values) for lay in past_key_values.layers] + elif isinstance(past_key_values, EncoderDecoderCache): + past_key_values = [ + 
(self_lay.keys, self_lay.values, cross_lay.keys, cross_lay.values) + for self_lay, cross_lay in zip( + past_key_values.self_attention_cache.layers, + past_key_values.cross_attention_cache.layers, + ) + ] + return past_key_values + + for idx, spec in enumerate(UNSUPPORTED_OPS_PATCHING_SPEC): if spec.name in { # onnx-exporter-specific fixes @@ -1382,7 +1392,11 @@ def phi3_442_forward( if use_cache: use_legacy_cache = not isinstance(past_key_values, Cache) if use_legacy_cache: - past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) + if is_transformers_version("<", "5"): + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + else: + past_key_values = DynamicCache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) if position_ids is None: @@ -1455,7 +1469,7 @@ def phi3_442_forward( next_cache = None if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + next_cache = postprocess_past_key_values(next_decoder_cache) if use_legacy_cache else next_decoder_cache if not return_dict: return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) return BaseModelOutputWithPast( @@ -2964,7 +2978,11 @@ def patched_forward(*args, **kwargs): legacy_pkv = args[pkv_argument_index] pkv_in_args = True if legacy_pkv is not None: - pkv = OVDynamicCache.from_legacy_cache(legacy_pkv) + if is_transformers_version("<", "5"): + pkv = DynamicCache.from_legacy_cache(legacy_pkv) + else: + pkv = DynamicCache(legacy_pkv) + return_legacy_cache = True if not pkv_in_args: kwargs["past_key_values"] = pkv @@ -2985,7 +3003,7 @@ def patched_forward(*args, **kwargs): outputs = self.orig_forward(*args, **kwargs) if return_legacy_cache: - outputs.past_key_values = outputs.past_key_values.to_legacy_cache() + outputs.past_key_values = postprocess_past_key_values(outputs.past_key_values) return outputs @@ -4118,7 +4136,11 @@ def forward_wrap( 
input_ids=None, use_cache=True, ): - new_past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) + if is_transformers_version("<", "5"): + new_past_key_values = DynamicCache.from_legacy_cache(past_key_values) + else: + new_past_key_values = DynamicCache(past_key_values) + result = self.__orig_forward( input_ids=input_ids, attention_mask=attention_mask, @@ -4128,7 +4150,7 @@ def forward_wrap( use_cache=use_cache, ) if past_key_values is not None: - result["past_key_values"] = result["past_key_values"].to_legacy_cache() + result["past_key_values"] = postprocess_past_key_values(result["past_key_values"]) return result model.forward = types.MethodType(forward_wrap, model) @@ -4159,7 +4181,11 @@ def lm_forward( deepstack_visual_embeds, use_cache=True, ): - pkv = OVDynamicCache.from_legacy_cache(past_key_values) + if is_transformers_version("<", "5"): + pkv = DynamicCache.from_legacy_cache(past_key_values) + else: + pkv = DynamicCache(past_key_values) + outputs = self.model.language_model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, @@ -4172,7 +4198,7 @@ def lm_forward( hidden_states = outputs[0] # Only compute necessary logits, and do not upcast them to float if we are not computing the loss logits = self.lm_head(hidden_states) - return (logits, outputs.past_key_values.to_legacy_cache()) + return (logits, postprocess_past_key_values(outputs.past_key_values)) model.__orig_forward = model.forward model.forward = types.MethodType(lm_forward, model) @@ -4547,11 +4573,18 @@ def patched_forward(*args, **kwargs): pkv = args[pkv_arg_index] if pkv is not None: - if isinstance(pkv, OVEncoderDecoderCache): - pkv = pkv.self_attention_cache.to_legacy_cache() + if isinstance(pkv, EncoderDecoderCache): + pkv = postprocess_past_key_values(pkv.self_attention_cache) else: pkv = [pkv_item[:2] for pkv_item in pkv] - pkv = OVEncoderDecoderCache.from_legacy_cache(pkv) + + if is_transformers_version("<", "5"): + pkv = EncoderDecoderCache.from_legacy_cache(pkv) + 
else: + pkv = EncoderDecoderCache( + DynamicCache([layer[:2] for layer in pkv]), + DynamicCache([layer[2:] for layer in pkv]), + ) if "past_key_values" in kwargs: kwargs["past_key_values"] = pkv @@ -4561,8 +4594,8 @@ def patched_forward(*args, **kwargs): outputs = self.super_patched_forward(*args, **kwargs) # the optimum-onnx seq2seq model patcher only converts to tuple starting from 4.48 - if isinstance(outputs.get("past_key_values"), (OVDynamicCache, OVEncoderDecoderCache)): - outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() + if isinstance(outputs.get("past_key_values"), (DynamicCache, EncoderDecoderCache)): + outputs["past_key_values"] = postprocess_past_key_values(outputs["past_key_values"]) elif isinstance(outputs.get("past_key_values"), (DynamicCache, EncoderDecoderCache)): outputs.pop("past_key_values") @@ -4742,7 +4775,10 @@ def __init__( def forward( self, attention_mask, position_ids, past_key_values, token_type_ids, inputs_embeds, use_cache=True ): - pkv = OVDynamicCache.from_legacy_cache(past_key_values) + if is_transformers_version("<", "5"): + pkv = DynamicCache.from_legacy_cache(past_key_values) + else: + pkv = DynamicCache(past_key_values) past_seen_tokens = past_key_values[0][0].shape[-2] cache_position = torch.arange( @@ -4768,7 +4804,7 @@ def forward( **forward_kwargs, ) upd_pkv = result["past_key_values"] - result["past_key_values"] = upd_pkv.to_legacy_cache() + result["past_key_values"] = postprocess_past_key_values(upd_pkv) return result if is_transformers_version("<", "4.53.0"): @@ -5129,7 +5165,7 @@ def _blenderbot_attn_forward_new( query_states = query_states if past_key_value is not None: - if isinstance(past_key_value, OVEncoderDecoderCache): + if isinstance(past_key_value, EncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) if is_cross_attention: # after the first generated id, we can subsequently re-use all key/value_states from cache @@ -5658,7 +5694,13 @@ def 
patched_decoder_forward( if past_key_values is not None: past_key_values = [cache_item[:2] for cache_item in past_key_values] if is_transformers_version(">=", "4.56"): - past_key_values = OVEncoderDecoderCache.from_legacy_cache(past_key_values) + if is_transformers_version("<", "5"): + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + else: + past_key_values = EncoderDecoderCache( + DynamicCache([layer[:2] for layer in past_key_values]), + DynamicCache([layer[2:] for layer in past_key_values]), + ) output_sequence = inputs_embeds output_cross_attentions = False @@ -5690,8 +5732,8 @@ def patched_decoder_forward( past_key_values = decoder_out.past_key_values if past_key_values is not None: - if isinstance(past_key_values, OVEncoderDecoderCache): - past_key_values = past_key_values.self_attention_cache.to_legacy_cache() + if isinstance(past_key_values, EncoderDecoderCache): + past_key_values = postprocess_past_key_values(past_key_values.self_attention_cache) else: past_key_values = [cache_item[:2] for cache_item in past_key_values] @@ -5742,7 +5784,11 @@ def __init__( # Adopted from https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py#L2156-L2178 # moved audio and vision features processing outside model def lm_forward(self, inputs_embeds, attention_mask, position_ids, past_key_values, use_cache=True): - pkv = OVDynamicCache.from_legacy_cache(past_key_values) + if is_transformers_version("<", "5"): + pkv = DynamicCache.from_legacy_cache(past_key_values) + else: + pkv = DynamicCache(past_key_values) + outputs = self.model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, @@ -5753,7 +5799,7 @@ def lm_forward(self, inputs_embeds, attention_mask, position_ids, past_key_value hidden_states = outputs[0] # Only compute necessary logits, and do not upcast them to float if we are not computing the loss logits = self.lm_head(hidden_states) - return (logits, 
outputs.past_key_values.to_legacy_cache()) + return (logits, postprocess_past_key_values(outputs.past_key_values)) model.__orig_forward = model.forward model.forward = types.MethodType(lm_forward, model) @@ -7858,7 +7904,7 @@ def forward( inputs_embeds: torch.Tensor = self.embed_tokens(input_ids) if use_cache and past_key_values is None: - past_key_values = OVDynamicCache(config=self.config) + past_key_values = DynamicCache(config=self.config) if cache_position is None: past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 diff --git a/setup.py b/setup.py index 16e2a82fed..25a5a01a97 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", - "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers-v5", + "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@xadupre/investigate", "transformers>=4.45,<5.1", "setuptools", "huggingface-hub>=0.23.2,<2.0", From 6799e939ede93ae3205753b80b9fc42ee31587f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 18:21:57 +0100 Subject: [PATCH 092/222] transformers-v5 branch --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 25a5a01a97..16e2a82fed 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", - "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@xadupre/investigate", + "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers-v5", "transformers>=4.45,<5.1", "setuptools", "huggingface-hub>=0.23.2,<2.0", From a2cd48ec1a7c87549bfe86d4db0309c3d670d8c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 18:28:41 +0100 Subject: [PATCH 093/222] use_model_defaults arg was deprecated in v5 --- tests/openvino/test_decoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py 
index 07da27807b..9bcef5f2f0 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -478,7 +478,7 @@ def test_pipeline(self, model_arch): tokenizer._convert_tokens_to_ids = lambda x: 0 additional_args = {} - if is_transformers_version(">=", "4.51"): + if is_transformers_version(">=", "4.51") and is_transformers_version("<", "5"): additional_args["use_model_defaults"] = False set_seed(SEED) @@ -784,7 +784,7 @@ def test_beam_search(self, model_arch): ov_model_stateless.config.eos_token_id = None transformers_model.config.eos_token_id = None - if is_transformers_version(">=", "4.51"): + if is_transformers_version(">=", "4.51") and is_transformers_version("<", "5"): additional_inputs["use_model_defaults"] = False for gen_config in gen_configs: From 850c1cee66fbd5fde919d6e8b2a163bd372ba2d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 18:33:18 +0100 Subject: [PATCH 094/222] style --- optimum/exporters/openvino/model_patcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index a2a9d18fbc..89d295c0e8 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -24,7 +24,6 @@ import torch import torch.nn.functional as F from torch import nn -from transformers import DynamicCache, EncoderDecoderCache from transformers.cache_utils import Cache, DynamicCache, EncoderDecoderCache from transformers.configuration_utils import PretrainedConfig from transformers.generation import GenerationMixin From af4a6059d0aa27f7fa091401bc3be89a0cc56e14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 18:42:20 +0100 Subject: [PATCH 095/222] baichuan remote code models incompatible with v5 --- optimum/exporters/openvino/model_configs.py | 1 + tests/openvino/test_decoder.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git 
a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index a25c3e7b8e..069286d2f1 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -330,6 +330,7 @@ class BaichaunOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): num_layers="num_hidden_layers", num_attention_heads="num_attention_heads", hidden_size="hidden_size" ) _MODEL_PATCHER = BaichuanModelPatcher + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 9bcef5f2f0..d079e04539 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -47,7 +47,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ( "bart", - "baichuan2-13b", "gpt_bigcode", "bigbird_pegasus", "blenderbot", @@ -157,6 +156,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "jais", "dbrx", "baichuan2", + "baichuan2-13b", ) GENERATION_LENGTH = 100 From 4da53e8c4037434d472f2c8ef11e628cfc50eb81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 19:17:47 +0100 Subject: [PATCH 096/222] remove tests for modelsf for models that needs fixes --- tests/openvino/test_decoder.py | 28 ++++++++++++++++++---------- tests/openvino/test_modeling.py | 13 ++++++++----- tests/openvino/test_seq2seq.py | 32 ++++++++++++++++++++++++-------- 3 files changed, 50 insertions(+), 23 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index d079e04539..6782574c01 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -58,7 +58,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "gpt_neo", "gpt_neox", "llama", - "marian", "mistral", "mixtral", "mpt", @@ -72,9 +71,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "biogpt", "gpt_neox_japanese", "xglm", - "aquila", - "xverse", - "internlm", 
"gemma", "olmo", "stablelm", @@ -85,12 +81,12 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "phi3", "gemma2", "granite", - "granitemoe", ) SUPPORTED_SSM_ARCHITECTURES = ("mamba", "falcon_mamba") - if is_transformers_version(">=", "4.49"): + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">=", "4.49") and is_transformers_version("<", "5"): SUPPORTED_SSM_ARCHITECTURES += ("zamba2",) if is_transformers_version(">=", "4.53.0"): @@ -102,11 +98,15 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += SUPPORTED_SSM_ARCHITECTURES if is_transformers_version(">=", "4.46.0"): - SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo", "phimoe") + SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo") if is_transformers_version("<", "4.54.0"): SUPPORTED_ARCHITECTURES += ("deepseek",) + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("phimoe",) + # gptq and awq install disabled for windows test environment if platform.system() != "Windows" and is_transformers_version("<", "4.56.0"): SUPPORTED_ARCHITECTURES += ("opt_gptq", "mixtral_awq") @@ -145,8 +145,8 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen", "chatglm", "chatglm4") if is_transformers_version("<", "5"): - # TODO: add dbrx back once fixed in transformers SUPPORTED_ARCHITECTURES += ( + # remote modeling incompatible with v5 "codegen2", "exaone", "decilm", @@ -154,11 +154,19 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "orion", "aquila2", "jais", - "dbrx", "baichuan2", "baichuan2-13b", + # remote modeling code failing with v5 + "aquila", + "xverse", + "internlm", + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + "dbrx", + # "phimoe", + "marian", + "granitemoe", + # "zamba2", ) - GENERATION_LENGTH = 100 EXPECTED_NUM_SDPA = { diff --git 
a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 4eccde4c87..8d8ab01147 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -757,14 +757,16 @@ class OVModelForSequenceClassificationIntegrationTest(unittest.TestCase): "convbert", "distilbert", "electra", - "flaubert", "ibert", "roberta", "roformer", "squeezebert", - "xlm", ) + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("flaubert", "xlm") + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] @@ -1087,13 +1089,11 @@ class OVModelForMaskedLMIntegrationTest(unittest.TestCase): "bert", "camembert", "convbert", - "data2vec-text", "deberta", "deberta-v2", "distilbert", "electra", "esm", - "flaubert", "ibert", "mobilebert", "mpnet", @@ -1102,7 +1102,6 @@ class OVModelForMaskedLMIntegrationTest(unittest.TestCase): "roberta", "roformer", "squeezebert", - "xlm", "xlm-roberta", ) @@ -1110,6 +1109,10 @@ class OVModelForMaskedLMIntegrationTest(unittest.TestCase): if is_transformers_version("<", "4.51.0"): SUPPORTED_ARCHITECTURES += ("nystromformer",) + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("data2vec-text", "flaubert", "xlm") + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 153f57be8e..d0e5f88b71 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -145,7 +145,6 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): "longt5", "m2m_100", "mbart", - "mt5", "pegasus", "t5", ) @@ -159,6 +158,10 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): # There are known issues with 
marian model on OpenVINO 2025.3.x and 2025.4.x SUPPORTED_ARCHITECTURES += ("marian",) + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("mt5",) + SUPPORT_STATEFUL = ("t5", "mt5", "longt5") if is_transformers_version(">=", "4.52.0"): SUPPORT_STATEFUL += ("bart", "blenderbot", "blenderbot-small", "m2m_100", "marian", "mbart") @@ -535,10 +538,8 @@ def test_pipeline(self, model_arch: str): class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES = [ - "llava", "llava_next", "llava_next_mistral", - "llava_next_video", "qwen2_vl", ] SUPPORT_VIDEO = ["llava_next_video", "qwen2_vl"] @@ -547,20 +548,31 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): TASK = "image-text-to-text" if is_transformers_version(">=", "4.46.0"): - SUPPORTED_ARCHITECTURES += ["maira2", "idefics3"] + SUPPORTED_ARCHITECTURES += ["maira2"] + + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ["idefics3"] if is_transformers_version(">=", "4.49.0"): - SUPPORTED_ARCHITECTURES += ["qwen2_5_vl", "got_ocr2"] + SUPPORTED_ARCHITECTURES += ["qwen2_5_vl"] SUPPORT_VIDEO.append("qwen2_5_vl") + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ["got_ocr2"] + if is_transformers_version("<", "4.54.0"): # remote code models differs after transformers v4.54 SUPPORTED_ARCHITECTURES += ["phi4mm"] SUPPORT_AUDIO.append("phi4mm") - if is_transformers_version(">", "4.49"): - SUPPORTED_ARCHITECTURES += ["gemma3", "smolvlm"] - if is_transformers_version(">=", "4.51"): + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">", "4.49") and is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ["gemma3", "smolvl"] + + # TODO: add fix for v5 and 
update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">=", "4.51") and is_transformers_version("<", "5"): # SUPPORTED_ARCHITECTURES += ["llama4", "phi4_multimodal"] SUPPORTED_ARCHITECTURES += ["llama4"] @@ -578,6 +590,10 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): # remote code models incompatible after transformers v5 SUPPORTED_ARCHITECTURES += ["internvl_chat", "minicpmv"] + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("llava", "llava_next_video") + REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"] IMAGE = Image.open( requests.get( From 5a74781777df1600644601c52e5854b3d9bfa113 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 19:45:14 +0100 Subject: [PATCH 097/222] fix decoder tests untested_architectures --- tests/openvino/test_decoder.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 6782574c01..9a6acf1cb7 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -126,6 +126,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.53.0"): SUPPORTED_ARCHITECTURES += ("arcee",) + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version(">=", "4.52.1") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("bitnet",) @@ -305,9 +306,13 @@ def test_find_untested_architectures(self): supported_architectures -= {"lfm2"} # qwen3_vl_text a part of qwen3_vl architecture and is tested in seq2seq group - if is_transformers_version(">=", str(Qwen3VLOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): + if is_transformers_version(">", str(Qwen3VLOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): supported_architectures -= {"qwen3_vl_text"} + # TODO: add 
fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">=", "5"): + supported_architectures -= {"phimoe", "granitemoe", "bitnet", "dbrx", "zamba2", "marian"} + supported_architectures -= ONNX_SUPPORTED_ARCHITECTURES untested_architectures = supported_architectures - tested_architectures From e634d777eb815f50f12366a796b85554056b059d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 19:51:46 +0100 Subject: [PATCH 098/222] fix untested architecture --- tests/openvino/test_seq2seq.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index d0e5f88b71..70e43293e0 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -161,6 +161,8 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("mt5",) + else: + UNSUPPORTED_ARCHITECTURES = {"marian", "mt5"} SUPPORT_STATEFUL = ("t5", "mt5", "longt5") if is_transformers_version(">=", "4.52.0"): @@ -593,7 +595,17 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("llava", "llava_next_video") - + else: + UNSUPPORTED_ARCHITECTURES = { + "got_ocr2", + "idefics3", + "llama4", + "llava_next_video", + "phi4_multimodal", + "gemma3", + "smolvlm", + "llava", + } REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"] IMAGE = Image.open( requests.get( From f89d0de33d40d71cfcd07b885f6a85bdbf700de3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 21:57:28 +0100 Subject: [PATCH 099/222] fix pkv patching --- optimum/exporters/openvino/model_patcher.py | 10 ++-------- 1 file changed, 2 
insertions(+), 8 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 89d295c0e8..82a25f3098 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4580,10 +4580,7 @@ def patched_forward(*args, **kwargs): if is_transformers_version("<", "5"): pkv = EncoderDecoderCache.from_legacy_cache(pkv) else: - pkv = EncoderDecoderCache( - DynamicCache([layer[:2] for layer in pkv]), - DynamicCache([layer[2:] for layer in pkv]), - ) + pkv = EncoderDecoderCache(DynamicCache(pkv), DynamicCache()) if "past_key_values" in kwargs: kwargs["past_key_values"] = pkv @@ -5696,10 +5693,7 @@ def patched_decoder_forward( if is_transformers_version("<", "5"): past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) else: - past_key_values = EncoderDecoderCache( - DynamicCache([layer[:2] for layer in past_key_values]), - DynamicCache([layer[2:] for layer in past_key_values]), - ) + past_key_values = EncoderDecoderCache(DynamicCache(past_key_values), DynamicCache()) output_sequence = inputs_embeds output_cross_attentions = False From 6070155e197b19f8553a62978003160d37bf724a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 22:27:15 +0100 Subject: [PATCH 100/222] fix test --- tests/openvino/test_seq2seq.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 70e43293e0..2737059e50 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -380,7 +380,7 @@ def test_compare_to_transformers(self, model_arch): ) generate_kwrgs = {} - if is_transformers_version(">=", "4.50"): + if is_transformers_version(">=", "4.50") and is_transformers_version("<", "5"): generate_kwrgs = {"use_model_defaults": False} gen_config = GenerationConfig( @@ -571,7 +571,7 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): # 
TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version(">", "4.49") and is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ["gemma3", "smolvl"] + SUPPORTED_ARCHITECTURES += ["gemma3", "smolvlm"] # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version(">=", "4.51") and is_transformers_version("<", "5"): From 26d5c4413cdb0d37fd99aff736013390c541ac09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 22:36:52 +0100 Subject: [PATCH 101/222] fix expcted int8 tests --- tests/openvino/test_quantization.py | 4 ++-- tests/openvino/utils_tests.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index b39ee223ae..f9bde752b3 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -269,7 +269,7 @@ class OVQuantizerTest(unittest.TestCase): "model": 33, }, { - "model": {"int8": 35}, + "model": {"int8": 35 if is_transformers_version("<", "5") else 36}, }, ), ( @@ -299,7 +299,7 @@ class OVQuantizerTest(unittest.TestCase): "model": 32, }, { - "model": {"int8": 34}, + "model": {"int8": 34 if is_transformers_version("<", "5") else 35}, }, ), ( diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 85f79801cd..06314ef394 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -231,19 +231,19 @@ _ARCHITECTURES_TO_EXPECTED_INT8 = { "afmoe": {"model": 16}, - "bert": {"model": 68}, + "bert": {"model": 68 if is_transformers_version("<", "5") else 70}, "roberta": {"model": 68}, "albert": {"model": 84}, "vit": {"model": 64}, - "blenderbot": {"model": 70}, + "blenderbot": {"model": 70 if is_transformers_version("<", "5") else 72}, "gpt2": {"model": 44}, "granitemoehybrid": {"model": 118}, "wav2vec2": {"model": 34}, "distilbert": {"model": 66}, "t5": { "encoder": 64, - "decoder": 104, 
- "decoder_with_past": 84, + "decoder": 104 if is_transformers_version("<", "5") else 106, + "decoder_with_past": 84 if is_transformers_version("<", "5") else 86, }, "stable-diffusion": { "unet": 242, From 9d84f3a4870a501fd4591a8b8473e5e1879c6217 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 10:27:45 +0100 Subject: [PATCH 102/222] tests transformers v5 --- tests/openvino/test_seq2seq.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 2737059e50..da68d6e8b9 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -862,7 +862,11 @@ def test_compare_to_transformers(self, model_arch): gc.collect() - @parameterized.expand(["llava", "llava_next", "llava_next_video", "llava_next_mistral"]) + @parameterized.expand( + ["llava", "llava_next", "llava_next_video", "llava_next_mistral"] + if is_transformers_version("<", "5") + else ["llava_next", "llava_next_mistral"] + ) def test_llava_with_new_preprocessing(self, model_arch): prompt = "\n What is shown in this image?" 
model_id = MODEL_NAMES[model_arch] From 4b5f83d4f5169513c467dd3b3a9dfdf9fc43006e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 10:29:41 +0100 Subject: [PATCH 103/222] pix2struct --- optimum/exporters/openvino/model_configs.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 069286d2f1..ce617dc3ea 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -5044,9 +5044,7 @@ class GPTBigCodeOpenVINOConfig(GPTBigCodeOnnxConfig): ], ) class Pix2StructOpenVINOConfig(Pix2StructOnnxConfig): - # _MODEL_PATCHER = OVSeq2SeqModelPatcher - # TODO (@echarlaix): add v5 support - MAX_TRANSFORMERS_VERSION = "4.57.6" + _MODEL_PATCHER = OVSeq2SeqModelPatcher @register_in_tasks_manager("bert", *COMMON_TEXT_TASKS) From 14e1b524547bc44e08294013325abbea2e63c481 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 10:42:44 +0100 Subject: [PATCH 104/222] fix num expected int8 --- tests/openvino/test_quantization.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index f9bde752b3..a249624023 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -135,8 +135,8 @@ class OVQuantizerTest(unittest.TestCase): (OVModelForSequenceClassification, "bert", 32, 35), (OVModelForCausalLM, "gpt2", 31, 22), (OVSentenceTransformer, "sentence-transformers-bert", 12, 15), - (OVModelForFeatureExtraction, "blenderbot", 33, 35), - (OVModelForMaskedLM, "roberta", 32, 34), + (OVModelForFeatureExtraction, "blenderbot", 33, 35 if is_transformers_version("<", "5") else 36), + (OVModelForMaskedLM, "roberta", 32, 34 if is_transformers_version("<", "5") else 35), (OVModelForZeroShotImageClassification, "clip", 65, 65), ) 
SUPPORTED_ARCHITECTURES_OV_MODEL_WITH_AUTO_DATASET = [ @@ -344,12 +344,12 @@ class OVQuantizerTest(unittest.TestCase): if is_transformers_version("<=", "4.45") else { "encoder": 30, - "decoder": 52, + "decoder": 52 if is_transformers_version("<", "5") else 53, }, ( {"encoder": {"int8": 32}, "decoder": {"int8": 52}, "decoder_with_past": {"int8": 42}} if is_transformers_version("<=", "4.45") - else {"encoder": {"int8": 32}, "decoder": {"int8": 52}} + else {"encoder": {"int8": 32}, "decoder": {"int8": 52 if is_transformers_version("<", "5") else 53}} ), ), ( @@ -596,7 +596,9 @@ class OVWeightCompressionTest(unittest.TestCase): (OVModelForCausalLM, "gpt2", 44, 44), ) - SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 62, 43),) + SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = ( + (OVModelForCausalLM, "opt125m", 62 if is_transformers_version("<", "5") else 64, 43), + ) SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 0, 74),) SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "gpt2", 44, 44),) From 0dbe96c293c68dc66e6fdf9a0213d312d004c943 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 10:44:15 +0100 Subject: [PATCH 105/222] use_model_defaults deprecated in v5 --- tests/openvino/test_quantization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index a249624023..fc4f9ea102 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -2497,7 +2497,7 @@ def check_model_inference(ov_model, model_id, trust_remote_code): if isinstance(ov_model, OVModelForSpeechSeq2Seq): input_features = torch.randn((1, ov_model.config.num_mel_bins, 3000), dtype=torch.float32) generate_kwrgs = {} - if is_transformers_version(">=", "4.50"): + if is_transformers_version(">=", 
"4.50") and is_transformers_version("<", "5"): generate_kwrgs = {"use_model_defaults": False} ov_model.generate(input_features, generation_config=gen_config, **generate_kwrgs) else: From af3fba3d7800adb4ab6dfd0f118cfac1c33bd962 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 11:01:50 +0100 Subject: [PATCH 106/222] rename --- optimum/exporters/openvino/model_configs.py | 20 ++++++++++---------- optimum/exporters/openvino/model_patcher.py | 12 +++++------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index ce617dc3ea..fb7acb865d 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -202,7 +202,7 @@ SanaTextEncoderModelPatcher, XverseModelPatcher, Zamba2ModelPatcher, - _get_subcomponent_model, + _get_model_attribute, ) @@ -1878,14 +1878,14 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.LANGUAGE: - return _get_subcomponent_model(model, "language_model") if not hasattr(model, "lm_head") else model + return _get_model_attribute(model, "language_model") if not hasattr(model, "lm_head") else model if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return model if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: text_embedding = model.get_input_embeddings() - text_embedding.config = _get_subcomponent_model(model, "language_model").config + text_embedding.config = _get_model_attribute(model, "language_model").config return text_embedding def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): @@ -2162,14 +2162,14 @@ def get_model_for_behavior(model, behavior: Union[str, VLMConfigBehavior]): behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.LANGUAGE: - return _get_subcomponent_model(model, "language_model") 
+ return _get_model_attribute(model, "language_model") if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return model if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: - text_embedding = _get_subcomponent_model(model, "language_model").get_input_embeddings() - text_embedding.config = _get_subcomponent_model(model, "language_model").config + text_embedding = _get_model_attribute(model, "language_model").get_input_embeddings() + text_embedding.config = _get_model_attribute(model, "language_model").config return text_embedding def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): @@ -3696,12 +3696,12 @@ def get_model_for_behavior(model, behavior: Union[str, QwenVLConfigBehavior]): return model if behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS: - vision_embeddings = _get_subcomponent_model(model, "visual").patch_embed + vision_embeddings = _get_model_attribute(model, "visual").patch_embed vision_embeddings.config = model.config.vision_config return vision_embeddings if behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_MERGER: - vision_emb_merger = _get_subcomponent_model(model, "visual") + vision_emb_merger = _get_model_attribute(model, "visual") vision_emb_merger.config = model.config.vision_config return vision_emb_merger @@ -3709,7 +3709,7 @@ def get_model_for_behavior(model, behavior: Union[str, QwenVLConfigBehavior]): text_embedding = ( model.model.embed_tokens if hasattr(model.model, "embed_tokens") - else _get_subcomponent_model(model, "language_model").embed_tokens + else _get_model_attribute(model, "language_model").embed_tokens ) text_embedding.config = model.config return text_embedding @@ -3849,7 +3849,7 @@ def __init__( @staticmethod def get_model_for_behavior(model, behavior: Union[str, QwenVLConfigBehavior]): if behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS: - vision_emb_pos = _get_subcomponent_model(model, "visual").pos_embed + vision_emb_pos = _get_model_attribute(model, 
"visual").pos_embed vision_emb_pos.config = model.config.vision_config return vision_emb_pos diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 82a25f3098..a617289c8e 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -80,13 +80,6 @@ logger = logging.getLogger(__name__) -def _get_subcomponent_model(model, name): - if is_transformers_version(">=", "5") and hasattr(model, "model"): - return getattr(model.model, name) - - return getattr(model, name) - - def postprocess_past_key_values(past_key_values): if isinstance(past_key_values, (EncoderDecoderCache, DynamicCache)): if hasattr(past_key_values, "to_legacy_cache"): @@ -104,6 +97,11 @@ def postprocess_past_key_values(past_key_values): return past_key_values +def _get_model_attribute(model, name): + target = getattr(model, "model", model) + return getattr(target, name) + + for idx, spec in enumerate(UNSUPPORTED_OPS_PATCHING_SPEC): if spec.name in { # onnx-exporter-specific fixes From 546127bdee7899e99fa505fadb8bf85b6a2a7a79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 11:04:12 +0100 Subject: [PATCH 107/222] style --- optimum/exporters/openvino/model_patcher.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index a617289c8e..0910f4de3f 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3256,7 +3256,7 @@ def llava_vision_embed_forward(self, pixel_values): # copied from https://github.com/huggingface/transformers/blob/v4.44.2/src/transformers/models/llava/modeling_llava.py#L428-L441 # these changes does not bring any difference from original, it only packs model subcomponent inference together # that allow us avoid memory overheads and their inference results handling on code-level - image_outputs = 
_get_subcomponent_model(self, "vision_tower")(pixel_values, output_hidden_states=True) + image_outputs = _get_model_attribute(self, "vision_tower")(pixel_values, output_hidden_states=True) # this is not memory efficient at all (output_hidden_states=True) will save all the hidden stated. selected_image_feature = image_outputs.hidden_states[self.config.vision_feature_layer] @@ -3267,7 +3267,7 @@ def llava_vision_embed_forward(self, pixel_values): else: raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") - image_features = _get_subcomponent_model(self, "multi_modal_projector")(selected_image_feature) + image_features = _get_model_attribute(self, "multi_modal_projector")(selected_image_feature) return image_features @@ -3275,7 +3275,7 @@ def llava_next_video_vision_embed_forward(self, pixel_values): # copied from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/llava_next_video/modeling_llava_next_video.py#L519 # these changes does not bring any difference from original, it only packs model subcomponent inference together # that allow us avoid memory overheads and their inference results handling on code-level - image_features = _get_subcomponent_model(self, "vision_tower")(pixel_values, output_hidden_states=True) + image_features = _get_model_attribute(self, "vision_tower")(pixel_values, output_hidden_states=True) vision_feature_layer = self.config.vision_feature_layer if isinstance(vision_feature_layer, int): selected_image_feature = image_features.hidden_states[vision_feature_layer] From 3eeeb4dc64d6eeeadd1b9cdac309f36838e9b36a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 11:10:09 +0100 Subject: [PATCH 108/222] install diffusers from source for v5 --- .github/workflows/test_openvino.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 
085619c5fa..48e3a7409b 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -54,7 +54,7 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip uv - uv pip install .[tests] librosa + uv pip install .[tests] librosa diffusers - if: ${{ matrix.test-pattern == '*modeling*' }} name: Install OpenVINO @@ -64,7 +64,12 @@ jobs: - if: ${{ matrix.transformers-version != 'latest' }} name: Install transformers run: | - uv pip install transformers==${{ matrix.transformers-version }} diffusers + uv pip install transformers==${{ matrix.transformers-version }} + + - if: ${{ matrix.transformers-version == 'latest' }} + name: Install diffusers + run: | + uv pip install git+https://github.com/huggingface/diffusers - if: ${{ matrix.transformers-version == '4.45.0' }} name: Install specific dependencies and versions required for older transformers From 2b61bd38e7b375f18c018e83072e2c00d258db4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 15:44:38 +0100 Subject: [PATCH 109/222] qwen2vl --- optimum/exporters/openvino/model_patcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 0910f4de3f..53b7340962 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -98,7 +98,7 @@ def postprocess_past_key_values(past_key_values): def _get_model_attribute(model, name): - target = getattr(model, "model", model) + target = getattr(model, "model", model) if is_transformers_version(">=", "5") else model return getattr(target, name) From bbe65bbff2e8628e8e694535c3ca74f0c216e65b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 15:57:19 +0100 Subject: [PATCH 110/222] remove tests for v5 --- tests/openvino/test_quantization.py | 114 ++++++++++++++++------------ 1 file changed, 64 insertions(+), 50 deletions(-) diff --git 
a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index fc4f9ea102..f66ae8834d 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -342,10 +342,7 @@ class OVQuantizerTest(unittest.TestCase): ), {"encoder": 30, "decoder": 52, "decoder_with_past": 61} if is_transformers_version("<=", "4.45") - else { - "encoder": 30, - "decoder": 52 if is_transformers_version("<", "5") else 53, - }, + else {"encoder": 30, "decoder": 52}, ( {"encoder": {"int8": 32}, "decoder": {"int8": 52}, "decoder_with_past": {"int8": 42}} if is_transformers_version("<=", "4.45") @@ -1064,9 +1061,6 @@ class OVWeightCompressionTest(unittest.TestCase): (OVStableDiffusionPipeline, "stable-diffusion", False), (OVStableDiffusionXLPipeline, "stable-diffusion-xl", False), (OVModelOpenCLIPForZeroShotImageClassification, "open-clip", False), - (OVModelForVisualCausalLM, "llava", False), - (OVModelForVisualCausalLM, "llava_next_video", False), - (OVModelForVisualCausalLM, "minicpmv", True), (OVModelForVisualCausalLM, "qwen2_vl", False), ] @@ -1082,6 +1076,15 @@ class OVWeightCompressionTest(unittest.TestCase): if is_transformers_version(">=", "4.57.0"): SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "qwen3_vl", False)) + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.extend( + [ + (OVModelForVisualCausalLM, "llava", False), + (OVModelForVisualCausalLM, "llava_next_video", False), + (OVModelForVisualCausalLM, "minicpmv", True), + ] + ) + SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = [ (OVStableDiffusionPipeline, "stable-diffusion", 72, 195), (OVStableDiffusionXLPipeline, "stable-diffusion-xl", 84, 331), @@ -1119,17 +1122,6 @@ class OVWeightCompressionTest(unittest.TestCase): "text_encoder": {}, }, ), - ( - OVModelForVisualCausalLM, - "llava", - 4, - {"bits": 4, "group_size": 8, "ratio": 0.5}, - { - "lm_model": {"int8": 22, "int4": 8}, - "text_embeddings_model": 
{"int8": 1}, - "vision_embeddings_model": {"int8": 9}, - }, - ), ( OVSamModel, "sam", @@ -1183,15 +1175,6 @@ class OVWeightCompressionTest(unittest.TestCase): }, }, ), - ( - OVModelForVisualCausalLM, - "llava", - { - "lm_model": {"patterns": [".*layers.0.self_attn.q_proj/aten::linear/MatMul"]}, - "vision_embeddings_model": {"patterns": [".*layers.0.self_attn.q_proj/aten::linear/MatMul"]}, - "text_embeddings_model": {"patterns": ["."]}, - }, - ), ( OVSamModel, "sam", @@ -1212,6 +1195,33 @@ class OVWeightCompressionTest(unittest.TestCase): ), ] + if is_transformers_version("<", "5"): + DEFAULT_COMPRESSION_CONFIGURATIONS.append( + ( + OVModelForVisualCausalLM, + "llava", + 4, + {"bits": 4, "group_size": 8, "ratio": 0.5}, + { + "lm_model": {"int8": 22, "int4": 8}, + "text_embeddings_model": {"int8": 1}, + "vision_embeddings_model": {"int8": 9}, + }, + ), + ) + + DEFAULT_IGNORED_SCOPE_CONFIGURATIONS.append( + ( + OVModelForVisualCausalLM, + "llava", + { + "lm_model": {"patterns": [".*layers.0.self_attn.q_proj/aten::linear/MatMul"]}, + "vision_embeddings_model": {"patterns": [".*layers.0.self_attn.q_proj/aten::linear/MatMul"]}, + "text_embeddings_model": {"patterns": ["."]}, + }, + ), + ) + def test_filtered_architectures(cls): expected = set() if is_transformers_version("<", "4.49"): @@ -1800,31 +1810,35 @@ class OVPipelineQuantizationTest(unittest.TestCase): {"encoder": 14, "decoder": 22}, {"encoder": {"int8": 14}, "decoder": {"int8": 22}}, ), - ( - OVModelForVisualCausalLM, - "internvl_chat", - True, - dict( - quantization_configs={ - "lm_model": dict(bits=8, weight_only=True), - "vision_embeddings_model": dict(bits=8, weight_only=False), + ] + + if is_transformers_version("<", "5"): + PIPELINE_QUANTIZATION_SCOPE.append( + ( + OVModelForVisualCausalLM, + "internvl_chat", + True, + dict( + quantization_configs={ + "lm_model": dict(bits=8, weight_only=True), + "vision_embeddings_model": dict(bits=8, weight_only=False), + }, + dataset="contextual", + num_samples=1, + 
default_config=dict(bits=8, sym=True, weight_only=True), + ), + { + "lm_model": 0, + "text_embeddings_model": 0, + "vision_embeddings_model": 15, + }, + { + "lm_model": {"int8": 30}, + "text_embeddings_model": {"int8": 1}, + "vision_embeddings_model": {"int8": 11}, }, - dataset="contextual", - num_samples=1, - default_config=dict(bits=8, sym=True, weight_only=True), ), - { - "lm_model": 0, - "text_embeddings_model": 0, - "vision_embeddings_model": 15, - }, - { - "lm_model": {"int8": 30}, - "text_embeddings_model": {"int8": 1}, - "vision_embeddings_model": {"int8": 11}, - }, - ), - ] + ) if is_transformers_version(">=", "4.49.0") and is_transformers_version("<", "4.54.0"): PIPELINE_QUANTIZATION_SCOPE.extend( From 7ba6fd1289612fa92476194bc9aa7f16316a2a52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 16:37:42 +0100 Subject: [PATCH 111/222] disable tests for transformers v5 --- tests/openvino/test_genai.py | 41 +++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index b31ca1569e..74f6bab1ec 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -45,7 +45,6 @@ class LLMPipelineTestCase(unittest.TestCase): "gpt_bigcode", "bloom", "codegen", - "codegen2", "gpt2", "gptj", "gpt_neox", @@ -53,37 +52,29 @@ class LLMPipelineTestCase(unittest.TestCase): "mistral", "mixtral", "phi", - "internlm2", - "orion", "falcon", "persimmon", "xglm", - "aquila", - "aquila2", - "internlm", - "jais", - "decilm", "gemma", "olmo", "stablelm", "starcoder2", - "dbrx", "cohere", "qwen2", "qwen2_moe", "phi3", "gemma2", - "exaone", "granite", - "granitemoe", ) if is_transformers_version(">=", "4.46.0"): - SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo", "phimoe", "opt") + SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo", "opt") if is_transformers_version("<", "4.54.0"): SUPPORTED_ARCHITECTURES += ("deepseek",) if 
is_transformers_version("<", "4.56.0"): SUPPORTED_ARCHITECTURES += ("qwen",) + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("phimoe",) if is_transformers_version(">=", "4.49"): SUPPORTED_ARCHITECTURES += ("gemma3_text",) if is_transformers_version(">=", "4.51.0"): @@ -101,6 +92,25 @@ class LLMPipelineTestCase(unittest.TestCase): if is_transformers_version("<", "4.56.0"): SUPPORTED_ARCHITECTURES += ("chatglm", "chatglm4") + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ( + # remote modeling incompatible with v5 + "codegen2", + "exaone", + "decilm", + "internlm2", + "orion", + "aquila2", + "jais", + # remote modeling code failing with v5 + "aquila", + "internlm", + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + "dbrx", + # "phimoe", + "granitemoe", + ) + REMOTE_CODE_MODELS = ( "chatglm", "minicpm", @@ -200,9 +210,7 @@ def test_compare_outputs(self, model_arch): class VLMPipelineTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES = ( - "llava", "llava_next", - "llava_next_video", # "minicpmv", # output is truncated for some reason "qwen2_vl", ) @@ -216,8 +224,11 @@ class VLMPipelineTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen2_5_vl",) if is_transformers_version("<", "4.54.0"): SUPPORTED_ARCHITECTURES += ("phi4mm",) - if is_transformers_version(">=", "4.49"): + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">=", "4.49") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("gemma3",) + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("llava", "llava_next_video") REMOTE_CODE_MODELS = ( "minicpmv", From 928fb5009f60ac2478b818bb39aa17bd72eadf93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 16:42:24 +0100 Subject: [PATCH 112/222] remove non needed --- optimum/exporters/openvino/model_patcher.py | 2 -- 1 file changed, 2 deletions(-) diff --git 
a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 53b7340962..1be2bfe437 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4590,8 +4590,6 @@ def patched_forward(*args, **kwargs): # the optimum-onnx seq2seq model patcher only converts to tuple starting from 4.48 if isinstance(outputs.get("past_key_values"), (DynamicCache, EncoderDecoderCache)): outputs["past_key_values"] = postprocess_past_key_values(outputs["past_key_values"]) - elif isinstance(outputs.get("past_key_values"), (DynamicCache, EncoderDecoderCache)): - outputs.pop("past_key_values") # we still need to filter out cross attention in the case of non-stateful decoder filtered_outputs = {} From ef320b3be74f090a861e30ac4c45cc76ffafa071 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 19:11:20 +0100 Subject: [PATCH 113/222] disable tests --- tests/openvino/test_export.py | 9 ++- tests/openvino/test_exporters_cli.py | 82 +++++++++++++++++----------- tests/openvino/utils_tests.py | 6 +- 3 files changed, 58 insertions(+), 39 deletions(-) diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 18811bd121..6cc28c8597 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -84,7 +84,6 @@ class ExportModelTest(unittest.TestCase): "stable-diffusion-xl": OVStableDiffusionXLPipeline, "stable-diffusion-xl-refiner": OVStableDiffusionXLImg2ImgPipeline, "latent-consistency": OVLatentConsistencyModelPipeline, - "llava": OVModelForVisualCausalLM, "sam": OVSamModel, "speecht5": OVModelForTextToSpeechSeq2Seq, "clip": OVModelForZeroShotImageClassification, @@ -95,7 +94,7 @@ class ExportModelTest(unittest.TestCase): "ltx-video": OVLTXPipeline, } - if is_transformers_version(">=", "4.49"): + if is_transformers_version(">=", "4.49") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES.update({"zamba2": OVModelForCausalLM}) if 
is_transformers_version(">=", "4.53.0"): @@ -118,7 +117,11 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version(">=", "4.51"): SUPPORTED_ARCHITECTURES.update({"qwen3": OVModelForFeatureExtraction}) - GENERATIVE_MODELS = ("pix2struct", "t5", "bart", "gpt2", "whisper", "llava", "speecht5") + GENERATIVE_MODELS = ("pix2struct", "t5", "bart", "gpt2", "whisper", "speecht5") + + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES.update({"llava": OVModelForVisualCausalLM}) + GENERATIVE_MODELS.append("llava") def _openvino_export( self, diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 4be27f43e5..a684c90ca8 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -121,11 +121,17 @@ class OVCLIExportTestCase(unittest.TestCase): [ ("text-generation", "lfm2"), ("text-generation-with-past", "lfm2"), + ] + ) + + if is_transformers_version(">=", "4.54") and is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES.extend( + [ ("text-generation-with-past", "qwen3_eagle3"), ] ) - if is_transformers_version(">=", "4.49"): + if is_transformers_version(">=", "4.49") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES.extend( [ ("text-generation-with-past", "zamba2"), @@ -138,7 +144,7 @@ class OVCLIExportTestCase(unittest.TestCase): ("text-generation-with-past", "exaone4"), ] ) - if is_transformers_version(">=", "4.52.1"): + if is_transformers_version(">=", "4.52.1") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES.extend( [ ("text-generation-with-past", "bitnet"), @@ -202,14 +208,6 @@ class OVCLIExportTestCase(unittest.TestCase): "expected_chat_template": False, "simplified_chat_template": False, }, - "llava": { # transformers, chat template in processor, simplified chat template - "num_tokenizers": 2, - "task": "image-text-to-text", - "processor_chat_template": True, - "remote_code": False, - "expected_chat_template": True, - 
"simplified_chat_template": True, - }, "llava_next": { # transformers, chat template in processor overrides tokinizer chat template, simplified chat template "num_tokenizers": 2, "task": "image-text-to-text", @@ -256,6 +254,20 @@ class OVCLIExportTestCase(unittest.TestCase): } ) + if is_transformers_version("<", "5"): + TOKENIZER_CHAT_TEMPLATE_TESTS_MODELS.update( + { + "llava": { # transformers, chat template in processor, simplified chat template + "num_tokenizers": 2, + "task": "image-text-to-text", + "processor_chat_template": True, + "remote_code": False, + "expected_chat_template": True, + "simplified_chat_template": True, + }, + } + ) + SUPPORTED_SD_HYBRID_ARCHITECTURES = [ ("flux", 7, 56), ("latent-consistency", 50, 135), @@ -407,7 +419,7 @@ class OVCLIExportTestCase(unittest.TestCase): "model": 33, }, { - "model": {"int8": 35}, + "model": {"int8": 35 if is_transformers_version("<", "5") else 36}, }, ), ( @@ -431,7 +443,7 @@ class OVCLIExportTestCase(unittest.TestCase): "model": 32, }, { - "model": {"int8": 34}, + "model": {"int8": 34 if is_transformers_version("<", "5") else 35}, }, ), ( @@ -472,7 +484,7 @@ class OVCLIExportTestCase(unittest.TestCase): ( {"encoder": {"int8": 32}, "decoder": {"int8": 52}, "decoder_with_past": {"int8": 42}} if is_transformers_version("<=", "4.45") - else {"encoder": {"int8": 32}, "decoder": {"int8": 52}} + else {"encoder": {"int8": 32}, "decoder": {"int8": 52 if is_transformers_version("<", "5") else 53}} ), ), ( @@ -489,48 +501,52 @@ class OVCLIExportTestCase(unittest.TestCase): "prompt_encoder_mask_decoder": {"int8": 49}, }, ), - ( - "image-text-to-text", - "internvl_chat", - "f8e4m3", - "--dataset contextual --num-samples 1 --trust-remote-code", - { - "lm_model": 15, - "text_embeddings_model": 0, - "vision_embeddings_model": 17, - }, - { - "lm_model": {"f8e4m3": 15}, - "text_embeddings_model": {"int8": 1}, - "vision_embeddings_model": {"f8e4m3": 11}, - }, - ), ] + if is_transformers_version("<", "5"): + 
SUPPORTED_QUANTIZATION_ARCHITECTURES.append( + ( + "image-text-to-text", + "internvl_chat", + "f8e4m3", + "--dataset contextual --num-samples 1 --trust-remote-code", + { + "lm_model": 15, + "text_embeddings_model": 0, + "vision_embeddings_model": 17, + }, + { + "lm_model": {"f8e4m3": 15}, + "text_embeddings_model": {"int8": 1}, + "vision_embeddings_model": {"f8e4m3": 11}, + }, + ), + ) + TRANSFORMERS_4BIT_CONFIGURATIONS = [ ( "text-generation-with-past", "opt125m", "int4 --sym --group-size 128", - {"model": {"int8": 4, "int4": 72}}, + {"model": {"int8": 4 if is_transformers_version("<", "5") else 6, "int4": 72}}, ), ( "text-generation-with-past", "opt125m", "int4 --group-size 64", - {"model": {"int8": 4, "int4": 144}}, + {"model": {"int8": 4 if is_transformers_version("<", "5") else 6, "int4": 144}}, ), ( "text-generation-with-past", "opt125m", "mxfp4", - {"model": {"int8": 4, "f4e2m1": 72, "f8e8m0": 72}}, + {"model": {"int8": 4 if is_transformers_version("<", "5") else 6, "f4e2m1": 72, "f8e8m0": 72}}, ), ( "text-generation-with-past", "opt125m", "nf4", - {"model": {"int8": 4, "nf4": 72}}, + {"model": {"int8": 4 if is_transformers_version("<", "5") else 6, "nf4": 72}}, ), ( "text-generation-with-past", diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 06314ef394..c6737bff1e 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -354,8 +354,8 @@ "vocoder": 80, }, "clip": {"model": 130}, - "mamba": {"model": 322}, - "falcon_mamba": {"model": 162}, + "mamba": {"model": 322 if is_transformers_version("<", "5") else 324}, + "falcon_mamba": {"model": 162 if is_transformers_version("<", "5") else 164}, "minicpmo": { "lm_model": 16, "text_embeddings_model": 1, @@ -364,7 +364,7 @@ }, "zamba2": {"model": 44}, "exaone4": {"model": 16}, - "lfm2": {"model": 52}, + "lfm2": {"model": 52 if is_transformers_version("<", "5") else 54}, "qwen3_eagle3": {"model": 20}, } From 8beb8d8bfaed7d778adfc152212d3b4912613745 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 19:35:01 +0100 Subject: [PATCH 114/222] fix --- tests/openvino/test_export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 6cc28c8597..eae3727de6 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -121,7 +121,7 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES.update({"llava": OVModelForVisualCausalLM}) - GENERATIVE_MODELS.append("llava") + GENERATIVE_MODELS += ("llava",) def _openvino_export( self, From e4eba9296ec619c219029f005d5fcc8913eb6871 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 10:28:15 +0100 Subject: [PATCH 115/222] add stable diffusion 3 tests when diffusers compatible with v5 --- tests/openvino/test_diffusion.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index 8efc69f8ec..e4f558efb7 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -80,7 +80,6 @@ class OVPipelineForText2ImageTest(unittest.TestCase): "stable-diffusion", "stable-diffusion-xl", "latent-consistency", - "stable-diffusion-3", "flux", "sana", ] @@ -93,6 +92,10 @@ class OVPipelineForText2ImageTest(unittest.TestCase): if is_diffusers_version(">=", "0.33.0"): SUPPORTED_ARCHITECTURES.extend(["sana-sprint"]) + + if is_transformers_version("<", "5") or is_diffusers_version(">=", "0.37"): + SUPPORTED_ARCHITECTURES.append("stable-diffusion-3") + CALLBACK_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] OVMODEL_CLASS = OVPipelineForText2Image @@ -499,9 +502,11 @@ class OVPipelineForImage2ImageTest(unittest.TestCase): "stable-diffusion", "stable-diffusion-xl", "latent-consistency", - "stable-diffusion-3", "flux", ] + if 
is_transformers_version("<", "5") or is_diffusers_version(">=", "0.37"): + SUPPORTED_ARCHITECTURES.append("stable-diffusion-3") + AUTOMODEL_CLASS = AutoPipelineForImage2Image OVMODEL_CLASS = OVPipelineForImage2Image TASK = "image-to-image" @@ -754,7 +759,11 @@ def test_textual_inversion(self): class OVPipelineForInpaintingTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "stable-diffusion-3", "flux", "flux-fill"] + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "flux", "flux-fill"] + + if is_transformers_version("<", "5") or is_diffusers_version(">=", "0.37"): + SUPPORTED_ARCHITECTURES.append("stable-diffusion-3") + AUTOMODEL_CLASS = AutoPipelineForInpainting OVMODEL_CLASS = OVPipelineForInpainting TASK = "inpainting" From dc2823d35bef2fe24d15022c624fe210a589ac8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 14:58:17 +0100 Subject: [PATCH 116/222] use xlm-roberta with max_position_embeddings 514 --- tests/openvino/utils_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index c6737bff1e..2cdbdcf8b7 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -211,7 +211,7 @@ "wav2vec2-conformer": "optimum-intel-internal-testing/tiny-random-wav2vec2-conformer", "whisper": "optimum-intel-internal-testing/tiny-random-whisper", "xlm": "optimum-intel-internal-testing/tiny-random-xlm", - "xlm-roberta": "optimum-intel-internal-testing/tiny-xlm-roberta", + "xlm-roberta": "optimum-intel-internal-testing/tiny-random-xlm-roberta", "xglm": "optimum-intel-internal-testing/tiny-random-XGLMForCausalLM", "xverse": "optimum-intel-internal-testing/tiny-random-xverse", "glm4": "optimum-intel-internal-testing/tiny-random-glm4", From 5967be3cf2a42122d05546c8b04f449970dcef3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 15:00:31 +0100 Subject: 
[PATCH 117/222] add missing import --- tests/openvino/test_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index e4f558efb7..bc58c91796 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -38,7 +38,7 @@ OVPipelineForText2Video, ) from optimum.intel.openvino.utils import TemporaryDirectory -from optimum.intel.utils.import_utils import is_diffusers_version +from optimum.intel.utils.import_utils import is_diffusers_version, is_transformers_version from optimum.utils.testing_utils import require_diffusers From 699b0b797679c3242a861521f53b8394e98ca8aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 15:51:28 +0100 Subject: [PATCH 118/222] granitemoe fix --- optimum/exporters/openvino/model_configs.py | 2 - optimum/exporters/openvino/model_patcher.py | 43 +++++++++------------ tests/openvino/test_decoder.py | 2 +- 3 files changed, 20 insertions(+), 27 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index fb7acb865d..fe846efcf4 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -3988,8 +3988,6 @@ class GraniteOpenVINOConfig(LlamaOpenVINOConfig): ) class GraniteMoEOpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.45.0" - # TODO (@echarlaix): add v5 support - MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = GraniteMoEModelPatcher diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 1be2bfe437..4bd9024bc8 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4508,34 +4508,29 @@ class GraniteMoEModelPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() - if is_transformers_version("<", "5"): - for layer in self._model.model.layers: - 
block_sparse_moe = layer.block_sparse_moe - block_sparse_moe.router._orig_forward = block_sparse_moe.router.forward - block_sparse_moe.router.forward = types.MethodType( - _granite_moe_topk_gating_forward, block_sparse_moe.router - ) - block_sparse_moe.input_linear._orig_forward = block_sparse_moe.input_linear.forward - block_sparse_moe.input_linear.forward = types.MethodType( - _granite_moe_parallel_experts_forward, block_sparse_moe.input_linear - ) - block_sparse_moe.output_linear._orig_forward = block_sparse_moe.output_linear.forward - block_sparse_moe.output_linear.forward = types.MethodType( - _granite_moe_parallel_experts_forward, block_sparse_moe.output_linear - ) - - else: - self._model.set_experts_implementation("batched_mm") + for layer in self._model.model.layers: + block_sparse_moe = layer.block_sparse_moe + block_sparse_moe.router._orig_forward = block_sparse_moe.router.forward + block_sparse_moe.router.forward = types.MethodType( + _granite_moe_topk_gating_forward, block_sparse_moe.router + ) + block_sparse_moe.input_linear._orig_forward = block_sparse_moe.input_linear.forward + block_sparse_moe.input_linear.forward = types.MethodType( + _granite_moe_parallel_experts_forward, block_sparse_moe.input_linear + ) + block_sparse_moe.output_linear._orig_forward = block_sparse_moe.output_linear.forward + block_sparse_moe.output_linear.forward = types.MethodType( + _granite_moe_parallel_experts_forward, block_sparse_moe.output_linear + ) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - if is_transformers_version("<", "5"): - for layer in self._model.model.layers: - block_sparse_moe = layer.block_sparse_moe - block_sparse_moe.router.forward = block_sparse_moe.router._orig_forward - block_sparse_moe.input_linear.forward = block_sparse_moe.input_linear._orig_forward - block_sparse_moe.output_linear.forward = block_sparse_moe.output_linear._orig_forward + for layer in self._model.model.layers: + 
block_sparse_moe = layer.block_sparse_moe + block_sparse_moe.router.forward = block_sparse_moe.router._orig_forward + block_sparse_moe.input_linear.forward = block_sparse_moe.input_linear._orig_forward + block_sparse_moe.output_linear.forward = block_sparse_moe.output_linear._orig_forward class OVSeq2SeqModelPatcher(ModelPatcher): diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 9a6acf1cb7..92b87ddfe3 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -81,6 +81,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "phi3", "gemma2", "granite", + "granitemoe", ) SUPPORTED_SSM_ARCHITECTURES = ("mamba", "falcon_mamba") @@ -165,7 +166,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "dbrx", # "phimoe", "marian", - "granitemoe", # "zamba2", ) GENERATION_LENGTH = 100 From 389f818868ecdbff53249bbb0067e769f304ebe2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 15:59:54 +0100 Subject: [PATCH 119/222] filtered test --- tests/openvino/test_exporters_cli.py | 2 ++ tests/openvino/test_quantization.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index a684c90ca8..5f45f00031 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -834,6 +834,8 @@ def test_filtered_architectures(cls): expected = {"qwen3_vl"} else: expected = {"llava-qwen2", "phi3_v", "phi4mm", "minicpmo"} + if is_transformers_version(">=", "5"): + expected.update({"llama4", "llava_next_video", "minicpmv", "internvl_chat"}) all_model_type = {config[1] for config in cls.TRANSFORMERS_4BIT_CONFIGURATIONS} filtered_model_type = {config[1] for config in cls.SUPPORTED_4BIT_CONFIGURATIONS} diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index f66ae8834d..dd69f926f5 100644 --- a/tests/openvino/test_quantization.py +++ 
b/tests/openvino/test_quantization.py @@ -1234,6 +1234,8 @@ def test_filtered_architectures(cls): expected.add("qwen3_vl") if is_transformers_version(">=", "4.54"): expected.update({"llava-qwen2", "phi3_v", "minicpmo"}) + if is_transformers_version(">=", "5"): + expected.update({"llama4", "llava_next_video", "minicpmv", "internvl_chat"}) all_model_type = {config[1] for config in cls.TRANSFORMERS_4BIT_CONFIGURATIONS} filtered_model_type = {config[1] for config in cls.LOAD_IN_4_BITS_SCOPE} From ffe2d27e445e05e4eef70e07aed8a27038db0d1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 16:46:52 +0100 Subject: [PATCH 120/222] add back granitemoe model support --- tests/openvino/test_decoder.py | 2 +- tests/openvino/test_genai.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 92b87ddfe3..fac01b5960 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -311,7 +311,7 @@ def test_find_untested_architectures(self): # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version(">=", "5"): - supported_architectures -= {"phimoe", "granitemoe", "bitnet", "dbrx", "zamba2", "marian"} + supported_architectures -= {"phimoe", "bitnet", "dbrx", "zamba2", "marian"} supported_architectures -= ONNX_SUPPORTED_ARCHITECTURES untested_architectures = supported_architectures - tested_architectures diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index 74f6bab1ec..388c3ce127 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -65,6 +65,7 @@ class LLMPipelineTestCase(unittest.TestCase): "phi3", "gemma2", "granite", + "granitemoe", ) if is_transformers_version(">=", "4.46.0"): @@ -108,7 +109,6 @@ class LLMPipelineTestCase(unittest.TestCase): # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly "dbrx", # "phimoe", - "granitemoe", ) 
REMOTE_CODE_MODELS = ( From c649bdf8325b9d132051b57bdaf2b8effc7a0568 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 17:59:49 +0100 Subject: [PATCH 121/222] update setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 16e2a82fed..3a1995891d 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", - "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers-v5", + "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@xadupre/transformers5", "transformers>=4.45,<5.1", "setuptools", "huggingface-hub>=0.23.2,<2.0", From 4c74aebf280bd1e68625f8b20620651cdcbb5210 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 18:26:29 +0100 Subject: [PATCH 122/222] update workflows --- .github/workflows/test_offline.yaml | 2 +- .github/workflows/test_openvino_nightly.yml | 7 ++++++- .github/workflows/test_openvino_slow.yml | 7 ++++++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_offline.yaml b/.github/workflows/test_offline.yaml index c75ba43bef..5b6b019e83 100644 --- a/.github/workflows/test_offline.yaml +++ b/.github/workflows/test_offline.yaml @@ -34,7 +34,7 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip uv - uv pip install .[diffusers,tests] + uv pip install .[tests] diffusers - name: Test run: | diff --git a/.github/workflows/test_openvino_nightly.yml b/.github/workflows/test_openvino_nightly.yml index 90df6a2af3..ace0246329 100644 --- a/.github/workflows/test_openvino_nightly.yml +++ b/.github/workflows/test_openvino_nightly.yml @@ -97,7 +97,12 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip uv - uv pip install .[diffusers,tests] + uv pip install .[tests] librosa diffusers + + - if: ${{ matrix.transformers-version == 'latest' }} + name: Install diffusers + run: | + uv pip install 
git+https://github.com/huggingface/diffusers - if: ${{ matrix.openvino-version == 'openvino-nightly' }} name: Install OpenVINO Nightly diff --git a/.github/workflows/test_openvino_slow.yml b/.github/workflows/test_openvino_slow.yml index 4b271d898b..3868e44141 100644 --- a/.github/workflows/test_openvino_slow.yml +++ b/.github/workflows/test_openvino_slow.yml @@ -59,7 +59,12 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip uv - uv pip install .[tests,diffusers] transformers[testing] + uv pip install .[tests] transformers[testing] diffusers + + - if: ${{ matrix.transformers-version == 'latest' }} + name: Install diffusers + run: | + uv pip install git+https://github.com/huggingface/diffusers - if: ${{ matrix.transformers-version != 'latest' && matrix.transformers-version != 'main' }} name: Install specific dependencies and versions required for older transformers From c7184e114939e777886cb1f2acb4b3abcca6f148 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 18:38:37 +0100 Subject: [PATCH 123/222] update setup --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 3a1995891d..267d0b83f2 100644 --- a/setup.py +++ b/setup.py @@ -28,8 +28,8 @@ INSTALL_REQUIRE = [ "torch>=2.1", - "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@xadupre/transformers5", - "transformers>=4.45,<5.1", + "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers5", + "transformers>=4.45,<5.3", "setuptools", "huggingface-hub>=0.23.2,<2.0", "nncf>=2.19.0", From 7b0806e3f8571558a055a457ac3958589edecc2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 18:39:54 +0100 Subject: [PATCH 124/222] fix --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 267d0b83f2..e99736e5a4 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", - 
"optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers5", + "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers-v5", "transformers>=4.45,<5.3", "setuptools", "huggingface-hub>=0.23.2,<2.0", From 50e30b789ffc182fac3ba943cdcafdbf1a27c11b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 18:52:22 +0100 Subject: [PATCH 125/222] update setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e99736e5a4..16e2a82fed 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers-v5", - "transformers>=4.45,<5.3", + "transformers>=4.45,<5.1", "setuptools", "huggingface-hub>=0.23.2,<2.0", "nncf>=2.19.0", From e7878e1de4f04e61eafc97657db69b37c9e79f30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 19:00:41 +0100 Subject: [PATCH 126/222] remove diffusers --- .github/workflows/test_offline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_offline.yaml b/.github/workflows/test_offline.yaml index 5b6b019e83..7c4458a306 100644 --- a/.github/workflows/test_offline.yaml +++ b/.github/workflows/test_offline.yaml @@ -34,7 +34,7 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip uv - uv pip install .[tests] diffusers + uv pip install .[tests] - name: Test run: | From 467dcad06b77db153fc8419fdfb6981c1005640a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 19:06:51 +0100 Subject: [PATCH 127/222] fix offline workflow --- .github/workflows/test_offline.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_offline.yaml b/.github/workflows/test_offline.yaml index 7c4458a306..d079c6c8b7 100644 --- a/.github/workflows/test_offline.yaml +++ b/.github/workflows/test_offline.yaml @@ -38,10 
+38,10 @@ jobs: - name: Test run: | - HF_HOME=/tmp/ huggingface-cli download hf-internal-testing/tiny-random-gpt2 + HF_HOME=/tmp/ hf download hf-internal-testing/tiny-random-gpt2 HF_HOME=/tmp/ HF_HUB_OFFLINE=1 optimum-cli export openvino --model hf-internal-testing/tiny-random-gpt2 gpt2_openvino --task text-generation - huggingface-cli download hf-internal-testing/tiny-random-gpt2 + hf download hf-internal-testing/tiny-random-gpt2 HF_HUB_OFFLINE=1 optimum-cli export openvino --model hf-internal-testing/tiny-random-gpt2 gpt2_openvino --task text-generation pytest tests/openvino/test_modeling.py -k "test_load_from_hub" -s -vvvvv From c14f2e53737afa21d7cb20fa9ea40c9e32139f80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 19:42:33 +0100 Subject: [PATCH 128/222] exclude openclip from offline tests --- .github/workflows/test_offline.yaml | 4 ++-- tests/openvino/test_modeling.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_offline.yaml b/.github/workflows/test_offline.yaml index d079c6c8b7..48f07b9396 100644 --- a/.github/workflows/test_offline.yaml +++ b/.github/workflows/test_offline.yaml @@ -44,5 +44,5 @@ jobs: hf download hf-internal-testing/tiny-random-gpt2 HF_HUB_OFFLINE=1 optimum-cli export openvino --model hf-internal-testing/tiny-random-gpt2 gpt2_openvino --task text-generation - pytest tests/openvino/test_modeling.py -k "test_load_from_hub" -s -vvvvv - HF_HUB_OFFLINE=1 pytest tests/openvino/test_modeling.py -k "test_load_from_hub" -s -vvvvv + pytest tests/openvino/test_modeling.py -k "test_load_from_hub and not openclip" -s -vvvvv + HF_HUB_OFFLINE=1 pytest tests/openvino/test_modeling.py -k "test_load_from_hub and not openclip" -s -vvvvv diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 8d8ab01147..db369a478c 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -1601,7 +1601,7 @@ def _get_sample_image(self): image = 
Image.open(requests.get(url, stream=True).raw) return image - def test_load_from_hub_and_save_model(self): + def test_load_from_hub_and_save_model_openclip(self): loaded_model = OVModelOpenCLIPForZeroShotImageClassification.from_pretrained( self.OV_MODEL_ID_IR, device=OPENVINO_DEVICE ) From 69c16bfd00cb485ef7e72a013b720575bf84c28d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 09:29:22 +0100 Subject: [PATCH 129/222] workflow slow --- .github/workflows/test_openvino_slow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_openvino_slow.yml b/.github/workflows/test_openvino_slow.yml index 3868e44141..580253a36a 100644 --- a/.github/workflows/test_openvino_slow.yml +++ b/.github/workflows/test_openvino_slow.yml @@ -59,7 +59,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip uv - uv pip install .[tests] transformers[testing] diffusers + uv pip install .[tests] librosa diffusers - if: ${{ matrix.transformers-version == 'latest' }} name: Install diffusers From 3f8dfb4be84364485b37aaa366b7472afde47286 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 11:15:10 +0100 Subject: [PATCH 130/222] fix question answering pipeline --- tests/openvino/test_modeling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index db369a478c..0c5011a908 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -896,12 +896,12 @@ def test_pipeline(self, model_arch): pipe = pipeline("question-answering", model=model, tokenizer=tokenizer) question = "What's my name?" context = "My Name is Arthur and I live in Lyon." 
- outputs = pipe(question, context) + outputs = pipe(question=question, context=context) self.assertEqual(pipe.device, model.device) self.assertGreaterEqual(outputs["score"], 0.0) self.assertIsInstance(outputs["answer"], str) ov_pipe = optimum_pipeline("question-answering", model_id, accelerator="openvino") - ov_outputs = ov_pipe(question, context) + ov_outputs = ov_pipe(question=question, context=context) self.assertEqual(outputs["score"], ov_outputs["score"]) del model del ov_pipe From 975da724b9c9cd25e768d4d4f928d534113f85fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 11:24:43 +0100 Subject: [PATCH 131/222] encode_plus deprecated --- tests/openvino/test_quantization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index dd69f926f5..ed1577d1cd 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -2038,7 +2038,7 @@ def preprocess_function(examples, tokenizer): # Test that inference on quantized model works model = OVModelForQuestionAnswering.from_pretrained(tmp_dir, device=OPENVINO_DEVICE) - tokens = tokenizer.encode_plus( + tokens = tokenizer( "This is a sample question", "This is a sample context", add_special_tokens=True, return_tensors="pt" ) model(**tokens, return_dict=True) From 31989eb0b262634373bfa2799ed634b96c0b3fd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 11:54:31 +0100 Subject: [PATCH 132/222] automatic-speech pipeline for whisper incompatible with v5 --- tests/openvino/test_seq2seq.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index da68d6e8b9..d8c10f39dd 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -410,6 +410,7 @@ def test_compare_to_transformers(self, model_arch): @parameterized.expand(SUPPORTED_ARCHITECTURES) @pytest.mark.run_slow 
@slow + @pytest.mark.skipif(is_transformers_version("==", "5.0"), reason="Issue with transformers v5.0 coming from num_frames") def test_pipeline(self, model_arch): set_seed(SEED) model_id = MODEL_NAMES[model_arch] From 7adb81012983043c4e12632693848c87a1f92746 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 11:54:57 +0100 Subject: [PATCH 133/222] style --- tests/openvino/test_seq2seq.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index d8c10f39dd..0fc3821c9b 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -410,7 +410,9 @@ def test_compare_to_transformers(self, model_arch): @parameterized.expand(SUPPORTED_ARCHITECTURES) @pytest.mark.run_slow @slow - @pytest.mark.skipif(is_transformers_version("==", "5.0"), reason="Issue with transformers v5.0 coming from num_frames") + @pytest.mark.skipif( + is_transformers_version("==", "5.0"), reason="Issue with transformers v5.0 coming from num_frames" + ) def test_pipeline(self, model_arch): set_seed(SEED) model_id = MODEL_NAMES[model_arch] From 6a93224b1ca747e32f8ce68cdc14be725a60bb9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 14:21:22 +0100 Subject: [PATCH 134/222] image-to-text pipeline deprecated --- tests/openvino/test_seq2seq.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 0fc3821c9b..bbc3d9260d 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -515,6 +515,10 @@ def test_compare_to_transformers(self, model_arch: str): @parameterized.expand(SUPPORTED_ARCHITECTURES) @pytest.mark.run_slow @slow + @pytest.mark.skipif( + is_transformers_version(">=", "5"), + reason="requires transformers < v5 since image-to-text pipelines is deprecated", + ) def test_pipeline(self, model_arch: str): set_seed(SEED) model_id = 
MODEL_NAMES[model_arch] From c8e9488fad965f496b0bc4dac3aae36b554fb82a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 14:30:49 +0100 Subject: [PATCH 135/222] update MAX_TRANSFORMERS_VERSION for gemma3 exaone4 and llama4 --- optimum/exporters/openvino/model_configs.py | 6 ++++++ tests/openvino/test_decoder.py | 11 ++++++++--- tests/openvino/test_export.py | 2 +- tests/openvino/test_exporters_cli.py | 2 +- tests/openvino/test_genai.py | 2 +- 5 files changed, 17 insertions(+), 6 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index fe846efcf4..53610803da 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -874,6 +874,8 @@ class ExaoneOpenVINOConfig(LlamaOpenVINOConfig): ) class Exaone4OpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.54.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( @@ -1474,6 +1476,8 @@ class Gemma2OpenVINOConfig(GemmaOpenVINOConfig): ) class Gemma3TextOpenVINOConfig(Gemma2OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.50.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" class DeciDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): @@ -4561,6 +4565,8 @@ def with_behavior( ) class Llama4TextOpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.51.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, GemmaDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = GemmaDummyPastKeyValuesGenerator _MODEL_PATCHER = Llama4TextModelPatcher diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index fac01b5960..2e6a938c81 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -115,11 +115,16 @@ class 
OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">", "4.47"): SUPPORTED_ARCHITECTURES += ("olmo2",) - if is_transformers_version(">", "4.49"): + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">", "4.49") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("gemma3_text",) if is_transformers_version(">=", "4.51.0"): - SUPPORTED_ARCHITECTURES += ("llama4", "qwen3", "qwen3_moe") + SUPPORTED_ARCHITECTURES += ("qwen3", "qwen3_moe") + + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">=", "4.51.0") and is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("llama4",) if is_transformers_version(">=", "4.51.3"): SUPPORTED_ARCHITECTURES += ("glm4",) @@ -131,7 +136,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.52.1") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("bitnet",) - if is_transformers_version(">=", "4.54.0"): + if is_transformers_version(">=", "4.54.0") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("exaone4",) if is_transformers_version("<", "4.54.0"): diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index eae3727de6..b73de1aaf9 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -100,7 +100,7 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version(">=", "4.53.0"): SUPPORTED_ARCHITECTURES.update({"granitemoehybrid": OVModelForCausalLM}) - if is_transformers_version(">=", "4.54"): + if is_transformers_version(">=", "4.54") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES.update({"exaone4": OVModelForCausalLM, "lfm2": OVModelForCausalLM}) if is_transformers_version(">=", "4.55.0") and is_transformers_version("<", "4.58.0"): diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 
5f45f00031..326f42d9bd 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -138,7 +138,7 @@ class OVCLIExportTestCase(unittest.TestCase): ] ) - if is_transformers_version(">=", "4.54"): + if is_transformers_version(">=", "4.54") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES.extend( [ ("text-generation-with-past", "exaone4"), diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index 388c3ce127..906216c567 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -84,7 +84,7 @@ class LLMPipelineTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("glm4",) if is_transformers_version(">=", "4.53.0"): SUPPORTED_ARCHITECTURES += ("arcee",) - if is_transformers_version(">=", "4.54.0"): + if is_transformers_version(">=", "4.54.0") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("exaone4",) if is_transformers_version(">=", "4.55.0"): SUPPORTED_ARCHITECTURES += ("gpt_oss",) From 28e2e24ff38d99ee269631652fe913fa27063552 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 15:28:36 +0100 Subject: [PATCH 136/222] remove from test when not supported --- tests/openvino/test_genai.py | 2 +- tests/openvino/test_quantization.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index 906216c567..0ddc6db210 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -76,7 +76,7 @@ class LLMPipelineTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen",) if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("phimoe",) - if is_transformers_version(">=", "4.49"): + if is_transformers_version(">=", "4.49") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("gemma3_text",) if is_transformers_version(">=", "4.51.0"): SUPPORTED_ARCHITECTURES += ("qwen3", "qwen3_moe") diff --git 
a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index ed1577d1cd..9c60468dd3 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -1070,7 +1070,7 @@ class OVWeightCompressionTest(unittest.TestCase): if is_transformers_version("<", "4.52.0"): SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "minicpmo", True)) - if is_transformers_version(">=", "4.54.0"): + if is_transformers_version(">=", "4.54.0") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForCausalLM, "exaone4", True)) if is_transformers_version(">=", "4.57.0"): From f061f2ce4d7643e5bcc43ca30dab48438821f628 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 17:41:16 +0100 Subject: [PATCH 137/222] decoder tests --- tests/openvino/test_decoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 2e6a938c81..01e4481c8d 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -316,7 +316,7 @@ def test_find_untested_architectures(self): # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version(">=", "5"): - supported_architectures -= {"phimoe", "bitnet", "dbrx", "zamba2", "marian"} + supported_architectures -= {"phimoe", "bitnet", "dbrx", "zamba2", "marian", "llama4", "gemma3_text", "exaone4"} supported_architectures -= ONNX_SUPPORTED_ARCHITECTURES untested_architectures = supported_architectures - tested_architectures @@ -420,7 +420,7 @@ def test_compare_to_transformers(self, model_arch): if model_arch in ["qwen"]: return - tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) + tokens = tokenizer(["Today is a nice day and", "This is me"], return_tensors="pt", padding=True) ov_model.generation_config.eos_token_id = None 
transformers_model.generation_config.eos_token_id = None ov_model.config.eos_token_id = None From 8820fb3964e245c41351cb2cb866dfe8da228897 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 17:43:05 +0100 Subject: [PATCH 138/222] test filtered architectures update with exaone4 --- tests/openvino/test_quantization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 9c60468dd3..b5c01b90d9 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -1235,7 +1235,7 @@ def test_filtered_architectures(cls): if is_transformers_version(">=", "4.54"): expected.update({"llava-qwen2", "phi3_v", "minicpmo"}) if is_transformers_version(">=", "5"): - expected.update({"llama4", "llava_next_video", "minicpmv", "internvl_chat"}) + expected.update({"llama4", "llava_next_video", "minicpmv", "internvl_chat", "exaone4"}) all_model_type = {config[1] for config in cls.TRANSFORMERS_4BIT_CONFIGURATIONS} filtered_model_type = {config[1] for config in cls.LOAD_IN_4_BITS_SCOPE} From 290b7b328cf64a9dfd9d2881996dbb1114d76369 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 18:04:06 +0100 Subject: [PATCH 139/222] change gptoss model --- tests/openvino/test_exporters_cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 326f42d9bd..7f396e3a85 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -1225,13 +1225,13 @@ def test_exporters_cli_full_quantization( {"model": 65}, ), ( - "gpt_oss_mxfp4", + "gpt_oss", "openai/gpt-oss-20b", AutoModelForCausalLM, OVModelForCausalLM, "--task text-generation-with-past --weight-format int4", _DEFAULT_4BIT_WQ_CONFIGS, - {"model": {"int8": 22, "int4": 4}}, + {"model": {"int8": 40, "int4": 0}}, {"model": 0}, ), ( From 
64223a8d6d331816d507a353aeb248189cfc8bf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 23 Feb 2026 15:49:57 +0100 Subject: [PATCH 140/222] style --- tests/openvino/test_decoder.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 01e4481c8d..e111b0ec06 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -316,7 +316,16 @@ def test_find_untested_architectures(self): # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version(">=", "5"): - supported_architectures -= {"phimoe", "bitnet", "dbrx", "zamba2", "marian", "llama4", "gemma3_text", "exaone4"} + supported_architectures -= { + "phimoe", + "bitnet", + "dbrx", + "zamba2", + "marian", + "llama4", + "gemma3_text", + "exaone4", + } supported_architectures -= ONNX_SUPPORTED_ARCHITECTURES untested_architectures = supported_architectures - tested_architectures From f40bcb35016907a36f2b509cd2cfeb4dbe669c18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 23 Feb 2026 16:15:10 +0100 Subject: [PATCH 141/222] set num beam to 5 --- tests/openvino/test_decoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index e111b0ec06..75a3a49f36 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -429,7 +429,7 @@ def test_compare_to_transformers(self, model_arch): if model_arch in ["qwen"]: return - tokens = tokenizer(["Today is a nice day and", "This is me"], return_tensors="pt", padding=True) + tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) ov_model.generation_config.eos_token_id = None transformers_model.generation_config.eos_token_id = None ov_model.config.eos_token_id = None @@ -440,7 +440,7 @@ def test_compare_to_transformers(self, model_arch): # 
LFM2 fails with beam search, issue link: https://github.com/huggingface/transformers/issues/42257 # CVS-177964 GraniteMoeHybrid fails due to lack support of Beam search for hybrid models in OpenVINO # For this support, we expect changes in IRs to have connected beam_idx with Mamba/Linear attention states - num_beams=1 if model_arch in ["chatglm4", "lfm2", "granitemoehybrid"] else 2, + num_beams=1 if model_arch in ["chatglm4", "lfm2", "granitemoehybrid"] else 5, do_sample=False, ) From 86767c7a5297026d0d88797dfa8c925d92f6998b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Feb 2026 17:30:27 +0100 Subject: [PATCH 142/222] add llava support for v5 --- optimum/exporters/openvino/model_configs.py | 10 --------- optimum/exporters/openvino/model_patcher.py | 23 --------------------- tests/openvino/test_decoder.py | 5 +++-- tests/openvino/test_export.py | 7 ++----- tests/openvino/test_exporters_cli.py | 22 +++++++------------- tests/openvino/test_seq2seq.py | 6 +++--- 6 files changed, 16 insertions(+), 57 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 53610803da..09ab8d72a0 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -166,7 +166,6 @@ Llama4ImageEmbeddingsModelPatcher, Llama4TextModelPatcher, LlavaImageEmbeddingModelPatcher, - LlavaNextImageEmbeddingModelPatcher, LlavaNextVideoImageEmbeddingModelPatcher, LlavaQwen2ImageEmbeddingsModelPatcher, MairaImageEmbeddingModelPatcher, @@ -1902,8 +1901,6 @@ def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[ @register_in_tasks_manager("llava", *["image-text-to-text"], library_name="transformers") class LlavaOpenVINOConfig(BaseVLMOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.37.2" - # TODO (@echarlaix): add v5 support - MAX_TRANSFORMERS_VERSION = "4.57.6" def __init__( self, @@ -1942,13 +1939,6 @@ def generate_dummy_inputs(self, framework: str = 
"pt", **kwargs) -> Dict: @register_in_tasks_manager("llava_next", *["image-text-to-text"], library_name="transformers") class LlavaNextOpenVINOConfig(LlavaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.40.0" - MAX_TRANSFORMERS_VERSION = "5.99" - - def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): - model_kwargs = model_kwargs or {} - if self._behavior != VLMConfigBehavior.VISION_EMBEDDINGS: - return super().patch_model_for_export(model, model_kwargs) - return LlavaNextImageEmbeddingModelPatcher(self, model, model_kwargs) class DummyLLavaMultiModalProjectorInputGenerator(DummyInputGenerator): diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 4bd9024bc8..9624401569 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3307,29 +3307,6 @@ def __init__( model_kwargs: Dict[str, Any], ): model.__orig_forward = model.forward - - if is_transformers_version("<", "5"): - model.forward = types.MethodType(llava_vision_embed_forward, model) - else: - model.forward = model.get_image_features - - super().__init__(config, model, model_kwargs) - - def __exit__(self, exc_type, exc_value, traceback): - super().__exit__(exc_type, exc_value, traceback) - self._model.forward = self._model.__orig_forward - - -class LlavaNextImageEmbeddingModelPatcher(ModelPatcher): - def __init__( - self, - config: "OnnxConfig", - model: "PreTrainedModel", - model_kwargs: Dict[str, Any], - ): - model.__orig_forward = model.forward - # TODO: use get_image_features instead and add image_sizes as input when exporting - # https://github.com/huggingface/transformers/blob/v4.48.0/src/transformers/models/llava_next/modeling_llava_next.py#L716 model.forward = types.MethodType(llava_vision_embed_forward, model) super().__init__(config, model, model_kwargs) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 
75a3a49f36..a06ee5fa12 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -429,7 +429,8 @@ def test_compare_to_transformers(self, model_arch): if model_arch in ["qwen"]: return - tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) + inputs = "Today is a nice day and" if model_arch == "decilm" else "The quick brown fox jumps over the" + tokens = tokenizer([inputs, "This is me"], return_tensors="pt", padding=True) ov_model.generation_config.eos_token_id = None transformers_model.generation_config.eos_token_id = None ov_model.config.eos_token_id = None @@ -440,7 +441,7 @@ def test_compare_to_transformers(self, model_arch): # LFM2 fails with beam search, issue link: https://github.com/huggingface/transformers/issues/42257 # CVS-177964 GraniteMoeHybrid fails due to lack support of Beam search for hybrid models in OpenVINO # For this support, we expect changes in IRs to have connected beam_idx with Mamba/Linear attention states - num_beams=1 if model_arch in ["chatglm4", "lfm2", "granitemoehybrid"] else 5, + num_beams=1 if model_arch in ["chatglm4", "lfm2", "granitemoehybrid"] else 2, do_sample=False, ) diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index b73de1aaf9..ca16598103 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -84,6 +84,7 @@ class ExportModelTest(unittest.TestCase): "stable-diffusion-xl": OVStableDiffusionXLPipeline, "stable-diffusion-xl-refiner": OVStableDiffusionXLImg2ImgPipeline, "latent-consistency": OVLatentConsistencyModelPipeline, + "llava": OVModelForVisualCausalLM, "sam": OVSamModel, "speecht5": OVModelForTextToSpeechSeq2Seq, "clip": OVModelForZeroShotImageClassification, @@ -117,11 +118,7 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version(">=", "4.51"): SUPPORTED_ARCHITECTURES.update({"qwen3": OVModelForFeatureExtraction}) - GENERATIVE_MODELS = ("pix2struct", "t5", "bart", 
"gpt2", "whisper", "speecht5") - - if is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES.update({"llava": OVModelForVisualCausalLM}) - GENERATIVE_MODELS += ("llava",) + GENERATIVE_MODELS = ("pix2struct", "t5", "bart", "gpt2", "whisper", "llava", "speecht5") def _openvino_export( self, diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 7f396e3a85..96c8cd64f6 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -208,6 +208,14 @@ class OVCLIExportTestCase(unittest.TestCase): "expected_chat_template": False, "simplified_chat_template": False, }, + "llava": { # transformers, chat template in processor, simplified chat template + "num_tokenizers": 2, + "task": "image-text-to-text", + "processor_chat_template": True, + "remote_code": False, + "expected_chat_template": True, + "simplified_chat_template": True, + }, "llava_next": { # transformers, chat template in processor overrides tokinizer chat template, simplified chat template "num_tokenizers": 2, "task": "image-text-to-text", @@ -254,20 +262,6 @@ class OVCLIExportTestCase(unittest.TestCase): } ) - if is_transformers_version("<", "5"): - TOKENIZER_CHAT_TEMPLATE_TESTS_MODELS.update( - { - "llava": { # transformers, chat template in processor, simplified chat template - "num_tokenizers": 2, - "task": "image-text-to-text", - "processor_chat_template": True, - "remote_code": False, - "expected_chat_template": True, - "simplified_chat_template": True, - }, - } - ) - SUPPORTED_SD_HYBRID_ARCHITECTURES = [ ("flux", 7, 56), ("latent-consistency", 50, 135), diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index bbc3d9260d..4e2df41407 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -547,6 +547,7 @@ def test_pipeline(self, model_arch: str): class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES = [ + "llava", "llava_next", 
"llava_next_mistral", "qwen2_vl", @@ -601,7 +602,7 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ("llava", "llava_next_video") + SUPPORTED_ARCHITECTURES += ("llava_next_video",) else: UNSUPPORTED_ARCHITECTURES = { "got_ocr2", @@ -611,7 +612,6 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): "phi4_multimodal", "gemma3", "smolvlm", - "llava", } REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"] IMAGE = Image.open( @@ -872,7 +872,7 @@ def test_compare_to_transformers(self, model_arch): @parameterized.expand( ["llava", "llava_next", "llava_next_video", "llava_next_mistral"] if is_transformers_version("<", "5") - else ["llava_next", "llava_next_mistral"] + else ["llava", "llava_next", "llava_next_mistral"] ) def test_llava_with_new_preprocessing(self, model_arch): prompt = "\n What is shown in this image?" 
From c523617123a1f45d6664335490446b05a82f576d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Feb 2026 17:33:15 +0100 Subject: [PATCH 143/222] maira --- optimum/exporters/openvino/model_configs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 09ab8d72a0..a52cb0ca87 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -2060,7 +2060,6 @@ def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[ ) class MairaOpenVINOConfig(LlavaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.46.0" - MAX_TRANSFORMERS_VERSION = "5.99" SUPPORTS_PAST = True def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): From d49a895bda3bf9b94752ed04bee1821735e2d8f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Feb 2026 17:50:37 +0100 Subject: [PATCH 144/222] extend tests disabled for marian for openvino v2026 --- tests/openvino/test_genai.py | 4 ++-- tests/openvino/test_seq2seq.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index 0ddc6db210..5d6b3c4b72 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -466,8 +466,8 @@ class LLMPipelineWithEagle3TestCase(unittest.TestCase): @parameterized.expand(EAGLE3_MODELS.items()) def test_compare_outputs(self, model_arch, model_pair): - if is_transformers_version("<", "4.54"): - self.skipTest("Eagle3 requires transformers >= 4.54") + if is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"): + self.skipTest("Eagle3 requires transformers >= 4.54 and transformers < 5") if is_openvino_version("<", "2026.0"): self.skipTest("Eagle3 requires openvino-genai >= 2026.0") diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 
4e2df41407..e34a256060 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -154,7 +154,7 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 - if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2025.5.0")): + if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026")) and is_transformers_version("<", "5"): # There are known issues with marian model on OpenVINO 2025.3.x and 2025.4.x SUPPORTED_ARCHITECTURES += ("marian",) From 6f608fd27e7c25c8a5cb438804c272eb26b61fed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Feb 2026 17:51:02 +0100 Subject: [PATCH 145/222] style --- tests/openvino/test_seq2seq.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index e34a256060..c15c0ca269 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -154,7 +154,9 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 - if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026")) and is_transformers_version("<", "5"): + if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026")) and is_transformers_version( + "<", "5" + ): # There are known issues with marian model on OpenVINO 2025.3.x and 2025.4.x SUPPORTED_ARCHITECTURES += ("marian",) From 02a2ccd8688c520ca039d126b1dbe413c51dc82d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Feb 2026 17:53:12 +0100 Subject: [PATCH 146/222] style --- tests/openvino/test_genai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index 5d6b3c4b72..f3c1bed1e9 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -466,7 +466,7 @@ class 
LLMPipelineWithEagle3TestCase(unittest.TestCase): @parameterized.expand(EAGLE3_MODELS.items()) def test_compare_outputs(self, model_arch, model_pair): - if is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"): + if is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"): self.skipTest("Eagle3 requires transformers >= 4.54 and transformers < 5") if is_openvino_version("<", "2026.0"): self.skipTest("Eagle3 requires openvino-genai >= 2026.0") From 710c5bc8679c23e71061385babc8f28994f2c67c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Feb 2026 18:12:08 +0100 Subject: [PATCH 147/222] include openvino 2026 --- tests/openvino/test_seq2seq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index c15c0ca269..af047f0313 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -154,7 +154,7 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 - if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026")) and is_transformers_version( + if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026.1")) and is_transformers_version( "<", "5" ): # There are known issues with marian model on OpenVINO 2025.3.x and 2025.4.x From 64dc198c8c80d997e80ddb6f5a57d589aba733ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Feb 2026 18:44:57 +0100 Subject: [PATCH 148/222] add gemma3 text --- optimum/exporters/openvino/model_configs.py | 2 -- tests/openvino/test_decoder.py | 4 +--- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index a52cb0ca87..8af57604fa 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1475,8 +1475,6 @@ class 
Gemma2OpenVINOConfig(GemmaOpenVINOConfig): ) class Gemma3TextOpenVINOConfig(Gemma2OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.50.0" - # TODO (@echarlaix): add v5 support - MAX_TRANSFORMERS_VERSION = "4.57.6" class DeciDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index a06ee5fa12..de3d3df121 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -115,8 +115,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">", "4.47"): SUPPORTED_ARCHITECTURES += ("olmo2",) - # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly - if is_transformers_version(">", "4.49") and is_transformers_version("<", "5"): + if is_transformers_version(">", "4.49"): SUPPORTED_ARCHITECTURES += ("gemma3_text",) if is_transformers_version(">=", "4.51.0"): @@ -323,7 +322,6 @@ def test_find_untested_architectures(self): "zamba2", "marian", "llama4", - "gemma3_text", "exaone4", } From d3bdb292d52a46fa9f29c721b36a69e56b9ebc02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Feb 2026 18:53:01 +0100 Subject: [PATCH 149/222] llava tests --- tests/openvino/test_quantization.py | 49 +++++++++++++---------------- tests/openvino/test_seq2seq.py | 8 +++-- 2 files changed, 26 insertions(+), 31 deletions(-) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index b5c01b90d9..753b1e387a 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -1061,6 +1061,7 @@ class OVWeightCompressionTest(unittest.TestCase): (OVStableDiffusionPipeline, "stable-diffusion", False), (OVStableDiffusionXLPipeline, "stable-diffusion-xl", False), (OVModelOpenCLIPForZeroShotImageClassification, "open-clip", False), + (OVModelForVisualCausalLM, "llava", False), (OVModelForVisualCausalLM, "qwen2_vl", False), ] @@ -1079,7 +1080,6 @@ class 
OVWeightCompressionTest(unittest.TestCase): if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.extend( [ - (OVModelForVisualCausalLM, "llava", False), (OVModelForVisualCausalLM, "llava_next_video", False), (OVModelForVisualCausalLM, "minicpmv", True), ] @@ -1122,6 +1122,17 @@ class OVWeightCompressionTest(unittest.TestCase): "text_encoder": {}, }, ), + ( + OVModelForVisualCausalLM, + "llava", + 4, + {"bits": 4, "group_size": 8, "ratio": 0.5}, + { + "lm_model": {"int8": 22, "int4": 8}, + "text_embeddings_model": {"int8": 1}, + "vision_embeddings_model": {"int8": 9}, + }, + ), ( OVSamModel, "sam", @@ -1175,6 +1186,15 @@ class OVWeightCompressionTest(unittest.TestCase): }, }, ), + ( + OVModelForVisualCausalLM, + "llava", + { + "lm_model": {"patterns": [".*layers.0.self_attn.q_proj/aten::linear/MatMul"]}, + "vision_embeddings_model": {"patterns": [".*layers.0.self_attn.q_proj/aten::linear/MatMul"]}, + "text_embeddings_model": {"patterns": ["."]}, + }, + ), ( OVSamModel, "sam", @@ -1195,33 +1215,6 @@ class OVWeightCompressionTest(unittest.TestCase): ), ] - if is_transformers_version("<", "5"): - DEFAULT_COMPRESSION_CONFIGURATIONS.append( - ( - OVModelForVisualCausalLM, - "llava", - 4, - {"bits": 4, "group_size": 8, "ratio": 0.5}, - { - "lm_model": {"int8": 22, "int4": 8}, - "text_embeddings_model": {"int8": 1}, - "vision_embeddings_model": {"int8": 9}, - }, - ), - ) - - DEFAULT_IGNORED_SCOPE_CONFIGURATIONS.append( - ( - OVModelForVisualCausalLM, - "llava", - { - "lm_model": {"patterns": [".*layers.0.self_attn.q_proj/aten::linear/MatMul"]}, - "vision_embeddings_model": {"patterns": [".*layers.0.self_attn.q_proj/aten::linear/MatMul"]}, - "text_embeddings_model": {"patterns": ["."]}, - }, - ), - ) - def test_filtered_architectures(cls): expected = set() if is_transformers_version("<", "4.49"): diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index af047f0313..9e2246582f 100644 --- a/tests/openvino/test_seq2seq.py 
+++ b/tests/openvino/test_seq2seq.py @@ -153,18 +153,20 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): TASK = "text2text-generation" GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 - - if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026.1")) and is_transformers_version( + UNSUPPORTED_ARCHITECTURES = set() + if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026.1")) or is_transformers_version( "<", "5" ): # There are known issues with marian model on OpenVINO 2025.3.x and 2025.4.x SUPPORTED_ARCHITECTURES += ("marian",) + else: + UNSUPPORTED_ARCHITECTURES.add("marian") # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("mt5",) else: - UNSUPPORTED_ARCHITECTURES = {"marian", "mt5"} + UNSUPPORTED_ARCHITECTURES.add("mt5") SUPPORT_STATEFUL = ("t5", "mt5", "longt5") if is_transformers_version(">=", "4.52.0"): From ea761a75bd0657e0514d3a025f57676d956056ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 26 Feb 2026 14:32:57 +0100 Subject: [PATCH 150/222] exclude marian for transformers v5 or higher --- tests/openvino/test_seq2seq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 9e2246582f..26b5b7d391 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -154,7 +154,7 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 UNSUPPORTED_ARCHITECTURES = set() - if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026.1")) or is_transformers_version( + if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026.1")) and is_transformers_version( "<", "5" ): # There are known issues with marian model on OpenVINO 2025.3.x and 2025.4.x From a33065ee1c4745f1461764d6321fc6abb1bbe5d5 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 26 Feb 2026 18:26:50 +0100 Subject: [PATCH 151/222] fix gemma3 --- optimum/exporters/openvino/model_configs.py | 2 -- optimum/exporters/openvino/model_patcher.py | 36 +++++++++++++-------- tests/openvino/test_decoder.py | 2 +- tests/openvino/test_genai.py | 5 ++- tests/openvino/test_seq2seq.py | 13 ++++---- 5 files changed, 32 insertions(+), 26 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index ec330c59e8..d5ce89bd46 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -4136,8 +4136,6 @@ def __init__( @register_in_tasks_manager("gemma3", *["image-text-to-text"], library_name="transformers") class Gemma3OpenVINOConfig(BaseVLMOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.50.0" - # TODO (@echarlaix): add v5 support - MAX_TRANSFORMERS_VERSION = "4.57.6" def __init__( self, diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 4ce8d17ded..860ce212e9 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4657,22 +4657,30 @@ def __init__( model: "PreTrainedModel", model_kwargs: Dict[str, Any], ): - model.__orig_forward = model.forward - # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/got_ocr2/modeling_got_ocr2.py#L835 - # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0-Gemma-3/src/transformers/models/gemma3/modeling_gemma3.py#L1321 - if ( - hasattr(model, "model") - and hasattr(model.model, "get_image_features") - and is_transformers_version("<", "5") - ): - model.forward = model.model.get_image_features - else: - model.forward = model.get_image_features super().__init__(config, model, model_kwargs) - def __exit__(self, exc_type, exc_value, traceback): - super().__exit__(exc_type, exc_value, traceback) - 
self._model.forward = self._model.__orig_forward + @functools.wraps(self.orig_forward) + def patched_forward(*args, **kwargs): + # Adapted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/got_ocr2/modeling_got_ocr2.py#L835 + # Adapted from https://github.com/huggingface/transformers/blob/v4.49.0-Gemma-3/src/transformers/models/gemma3/modeling_gemma3.py#L1321 + if ( + hasattr(self._model, "model") + and hasattr(self._model.model, "get_image_features") + and is_transformers_version("<", "5") + ): + get_image_features = self._model.model.get_image_features + else: + get_image_features = self._model.get_image_features + + outputs = get_image_features(*args, **kwargs) + + if is_transformers_version(">=", "5") and hasattr(outputs, "pooler_output"): + outputs = outputs.pooler_output + + output_names = list(config.outputs.keys()) + return {output_names[0]: outputs} + + self.patched_forward = patched_forward # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0-Gemma-3/src/transformers/models/gemma3/modeling_gemma3.py#L1147 diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index d3cfba3ba3..8f0a8f12c2 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -115,7 +115,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">", "4.47"): SUPPORTED_ARCHITECTURES += ("olmo2",) - if is_transformers_version(">", "4.49"): + if is_transformers_version(">=", "4.50"): SUPPORTED_ARCHITECTURES += ("gemma3_text",) if is_transformers_version(">=", "4.51.0"): diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index f3c1bed1e9..5375cf9b67 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -76,7 +76,7 @@ class LLMPipelineTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen",) if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("phimoe",) - if is_transformers_version(">=", 
"4.49") and is_transformers_version("<", "5"): + if is_transformers_version(">=", "4.50"): SUPPORTED_ARCHITECTURES += ("gemma3_text",) if is_transformers_version(">=", "4.51.0"): SUPPORTED_ARCHITECTURES += ("qwen3", "qwen3_moe") @@ -224,8 +224,7 @@ class VLMPipelineTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen2_5_vl",) if is_transformers_version("<", "4.54.0"): SUPPORTED_ARCHITECTURES += ("phi4mm",) - # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly - if is_transformers_version(">=", "4.49") and is_transformers_version("<", "5"): + if is_transformers_version(">=", "4.50"): SUPPORTED_ARCHITECTURES += ("gemma3",) if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("llava", "llava_next_video") diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 2b005849fa..e7c59476b6 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -581,9 +581,11 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES += ["phi4mm"] SUPPORT_AUDIO.append("phi4mm") - # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly - if is_transformers_version(">", "4.49") and is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ["gemma3", "smolvlm"] + if is_transformers_version(">=", "4.50"): + SUPPORTED_ARCHITECTURES += ["gemma3"] + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ["smolvlm"] # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version(">=", "4.51") and is_transformers_version("<", "5"): @@ -614,7 +616,6 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): "llama4", "llava_next_video", "phi4_multimodal", - "gemma3", "smolvlm", } REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"] @@ -783,9 +784,9 @@ def 
test_compare_to_transformers(self, model_arch): set_seed(SEED) additional_inputs = {} - # gemma3 does not support dynamic cache, it is unfair to compare dynamic cache result vs hybrid cache, + # gemma3 does not support dynamic cache until v4.53, we cannot compare dynamic cache result vs hybrid cache, # align cache representation in torch model - if model_arch == "gemma3": + if model_arch == "gemma3" and is_transformers_version("<", "4.53.0"): patch_update_causal_mask( transformers_model if is_transformers_version("<", "4.52.0") else transformers_model.language_model, "4.43.0", From bf51329a5519e3c964ed5119043a3619594666a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 3 Mar 2026 18:36:48 +0100 Subject: [PATCH 152/222] add comment --- optimum/exporters/openvino/model_patcher.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 92fe0b4063..634f015872 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4674,6 +4674,7 @@ def patched_forward(*args, **kwargs): outputs = get_image_features(*args, **kwargs) + # we should be able to specify pooler_output as output_name, not supported here as pooler_output key does not exist if is_transformers_version(">=", "5") and hasattr(outputs, "pooler_output"): outputs = outputs.pooler_output From 4a2786218d053164cfc01412505dca5c1174820a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 4 Mar 2026 15:35:24 +0100 Subject: [PATCH 153/222] replace gpt_oss_mxfp4 test to gpt_oss for v5 --- tests/openvino/test_exporters_cli.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index e8766c737d..9690496089 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -1233,13 +1233,14 @@ def 
test_exporters_cli_full_quantization( {"model": 65}, ), ( - "gpt_oss", + # mxfp4 fixing saving broken since v5, fixed in https://github.com/huggingface/transformers/pull/43148, test can be added back for v5.3 + "gpt_oss_mxfp4" if is_transformers_version("<", "5") else "gpt_oss", "openai/gpt-oss-20b", AutoModelForCausalLM, OVModelForCausalLM, "--task text-generation-with-past --weight-format int4", _DEFAULT_4BIT_WQ_CONFIGS, - {"model": {"int8": 40, "int4": 0}}, + {"model": {"int8": 22, "int4": 4} if is_transformers_version("<", "5") else {"int8": 40, "int4": 0}}, {"model": 0}, ), ( From 4a8644d937989fa88031b2c46992a5e48f4b8ac2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 4 Mar 2026 15:51:24 +0100 Subject: [PATCH 154/222] include Qwen3VLOpenVINOConfig min version --- tests/openvino/test_decoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index dc4073c063..e5267b5224 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -318,7 +318,7 @@ def test_find_untested_architectures(self): supported_architectures -= {"lfm2"} # qwen3_vl_text a part of qwen3_vl architecture and is tested in seq2seq group - if is_transformers_version(">", str(Qwen3VLOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): + if is_transformers_version(">=", str(Qwen3VLOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): supported_architectures -= {"qwen3_vl_text"} # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly From 988147517ceb2216275ee0c720221281fd18151a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 4 Mar 2026 16:40:14 +0100 Subject: [PATCH 155/222] add phi4_multimodal for transformers < v5 --- tests/openvino/test_seq2seq.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index e7c59476b6..9ceab2d227 100644 --- a/tests/openvino/test_seq2seq.py 
+++ b/tests/openvino/test_seq2seq.py @@ -558,6 +558,7 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): ] SUPPORT_VIDEO = ["llava_next_video", "qwen2_vl"] SUPPORT_AUDIO = [] + UNSUPPORTED_ARCHITECTURES = {"phi4_multimodal"} OVMODEL_CLASS = OVModelForVisualCausalLM TASK = "image-text-to-text" @@ -610,14 +611,7 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("llava_next_video",) else: - UNSUPPORTED_ARCHITECTURES = { - "got_ocr2", - "idefics3", - "llama4", - "llava_next_video", - "phi4_multimodal", - "smolvlm", - } + UNSUPPORTED_ARCHITECTURES.update({"got_ocr2", "idefics3", "llama4", "llava_next_video", "smolvlm"}) REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"] IMAGE = Image.open( requests.get( From 2d764ef9cf916ea168282b8acc000d018da55177 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 6 Mar 2026 11:29:44 +0100 Subject: [PATCH 156/222] set dtype for beam_search tests for gemma3 text model --- tests/openvino/test_decoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index e5267b5224..0ddb251b22 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -783,11 +783,11 @@ def test_beam_search(self, model_arch): set_seed(SEED) with mock_torch_cuda_is_available("awq" in model_arch or "gptq" in model_arch): transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) - if model_arch == "arctic" or "mxfp4" in model_arch: + if model_arch in ["arctic", "gemma3_text"] or "mxfp4" in model_arch: transformers_model.to(torch.float32) additional_inputs = {} # gemma2 does not support dynamic cache, it is unfair to compare dynamic cache result vs hybrid cache, align cache representation in torch model - if model_arch in ["gemma2", "gemma3_text"]: + if 
model_arch in ["gemma2", "gemma3_text"] and is_transformers_version("<", "4.53.0"): patch_update_causal_mask(transformers_model, "4.43.0") transformers_model._supports_cache_class = True transformers_model.generation_config.cache_implementation = None From f901a66f2e42360405402a765c80342e9fefc513 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 9 Mar 2026 15:57:24 +0100 Subject: [PATCH 157/222] diffusers latest release now compatible with transformers v5 --- .github/workflows/build_documentation.yml | 1 - .github/workflows/build_pr_documentation.yml | 1 - .github/workflows/test_openvino.yml | 7 +------ .github/workflows/test_openvino_nightly.yml | 5 ----- .github/workflows/test_openvino_slow.yml | 5 ----- setup.py | 2 +- 6 files changed, 2 insertions(+), 19 deletions(-) diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index ce3eb464ce..896c5f8b43 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -51,7 +51,6 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder - uv pip install transformers==4.57.6 uv pip install .[quality] neural-compressor[pt]>3.4 diffusers accelerate datasets - name: Make documentation diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index 6b0b89f3f1..ac3291acfd 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -38,7 +38,6 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder - uv pip install transformers==4.57.6 uv pip install .[quality] neural-compressor[pt]>3.4 diffusers accelerate datasets - name: Make documentation diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 48e3a7409b..ba60fc597a 100644 --- a/.github/workflows/test_openvino.yml +++ 
b/.github/workflows/test_openvino.yml @@ -56,7 +56,7 @@ jobs: pip install --upgrade pip uv uv pip install .[tests] librosa diffusers - - if: ${{ matrix.test-pattern == '*modeling*' }} + - if: ${{ matrix.test-pattern == '*modeling*' || matrix.test-pattern == '*quantization*' }} name: Install OpenVINO run: | uv pip install openvino==2025.3.0 openvino-tokenizers==2025.3.0 @@ -66,11 +66,6 @@ jobs: run: | uv pip install transformers==${{ matrix.transformers-version }} - - if: ${{ matrix.transformers-version == 'latest' }} - name: Install diffusers - run: | - uv pip install git+https://github.com/huggingface/diffusers - - if: ${{ matrix.transformers-version == '4.45.0' }} name: Install specific dependencies and versions required for older transformers run: | diff --git a/.github/workflows/test_openvino_nightly.yml b/.github/workflows/test_openvino_nightly.yml index ace0246329..886d22c2b3 100644 --- a/.github/workflows/test_openvino_nightly.yml +++ b/.github/workflows/test_openvino_nightly.yml @@ -99,11 +99,6 @@ jobs: pip install --upgrade pip uv uv pip install .[tests] librosa diffusers - - if: ${{ matrix.transformers-version == 'latest' }} - name: Install diffusers - run: | - uv pip install git+https://github.com/huggingface/diffusers - - if: ${{ matrix.openvino-version == 'openvino-nightly' }} name: Install OpenVINO Nightly run: | diff --git a/.github/workflows/test_openvino_slow.yml b/.github/workflows/test_openvino_slow.yml index 580253a36a..8a6460ca1b 100644 --- a/.github/workflows/test_openvino_slow.yml +++ b/.github/workflows/test_openvino_slow.yml @@ -61,11 +61,6 @@ jobs: python -m pip install --upgrade pip uv uv pip install .[tests] librosa diffusers - - if: ${{ matrix.transformers-version == 'latest' }} - name: Install diffusers - run: | - uv pip install git+https://github.com/huggingface/diffusers - - if: ${{ matrix.transformers-version != 'latest' && matrix.transformers-version != 'main' }} name: Install specific dependencies and versions required for older 
transformers run: | diff --git a/setup.py b/setup.py index 16e2a82fed..ca26a42a1f 100644 --- a/setup.py +++ b/setup.py @@ -73,7 +73,7 @@ "openvino": ["nncf>=2.19.0", "openvino>=2025.3.0", "openvino-tokenizers>=2025.3.0"], "neural-compressor": ["neural-compressor[pt]>=3.4.1", "accelerate", "transformers<4.46", "datasets"], "ipex": ["intel-extension-for-pytorch>=2.8", "transformers>4.54,<4.56", "accelerate"], - "diffusers": ["diffusers", "transformers<5"], + "diffusers": ["diffusers"], "quality": QUALITY_REQUIRE, "tests": TESTS_REQUIRE, } From 7879da8bfea6fc26f8e179c74a76fb7e51b10c9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 9 Mar 2026 16:47:20 +0100 Subject: [PATCH 158/222] set qwen3_next max transformers version --- optimum/exporters/openvino/model_configs.py | 1 + tests/openvino/test_decoder.py | 3 ++- tests/openvino/test_export.py | 5 ++++- tests/openvino/test_exporters_cli.py | 6 ++++++ 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index b8c2eefcf1..a386842b7d 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -5435,6 +5435,7 @@ class Qwen3NextOpenVINOConfig(Qwen3OpenVINOConfig): DUMMY_PKV_GENERATOR_CLASS = Qwen3NextDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig MIN_TRANSFORMERS_VERSION = "4.57.0" + MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = Qwen3NextModelPatcher def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 2fdacf5ce5..bedb5ee8e9 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -96,7 +96,8 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.54.0"): SUPPORTED_SSM_ARCHITECTURES += ("lfm2",) - if is_transformers_version(">=", 
"4.57.0"): + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">=", "4.57.0") and is_transformers_version("<", "5"): SUPPORTED_SSM_ARCHITECTURES += ("qwen3_next",) SUPPORTED_ARCHITECTURES += SUPPORTED_SSM_ARCHITECTURES diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 0c51a6f8da..e9c7696c2d 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -111,7 +111,10 @@ class ExportModelTest(unittest.TestCase): SUPPORTED_ARCHITECTURES.update({"afmoe": OVModelForCausalLM}) if is_transformers_version(">=", "4.57.0"): - SUPPORTED_ARCHITECTURES.update({"hunyuan_v1_dense": OVModelForCausalLM, "qwen3_next": OVModelForCausalLM}) + SUPPORTED_ARCHITECTURES.update({"hunyuan_v1_dense": OVModelForCausalLM}) + + if is_transformers_version(">=", "4.57.0") and is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES.update({"qwen3_next": OVModelForCausalLM}) EXPECTED_DIFFUSERS_SCALE_FACTORS = { "stable-diffusion-xl": {"vae_encoder": "128.0", "vae_decoder": "128.0"}, diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index b26569ed00..940ab5b3ac 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -169,6 +169,12 @@ class OVCLIExportTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES.extend( [ ("text-generation-with-past", "hunyuan_v1_dense"), + ] + ) + + if is_transformers_version(">=", "4.57.0") and is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES.extend( + [ ("text-generation-with-past", "qwen3_next"), ] ) From d5f22440f58f3a6231fa603bd37cb727f5074b60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 9 Mar 2026 16:51:55 +0100 Subject: [PATCH 159/222] Fix doc building --- .github/workflows/build_documentation.yml | 1 + .github/workflows/build_pr_documentation.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/build_documentation.yml 
b/.github/workflows/build_documentation.yml index 896c5f8b43..ce3eb464ce 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -51,6 +51,7 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder + uv pip install transformers==4.57.6 uv pip install .[quality] neural-compressor[pt]>3.4 diffusers accelerate datasets - name: Make documentation diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index ac3291acfd..6b0b89f3f1 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -38,6 +38,7 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder + uv pip install transformers==4.57.6 uv pip install .[quality] neural-compressor[pt]>3.4 diffusers accelerate datasets - name: Make documentation From 43ed6175824e06d0ae0226ae4276e3d22a95c364 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 9 Mar 2026 17:35:18 +0100 Subject: [PATCH 160/222] add qwen3_next to list of untested architectures --- tests/openvino/test_decoder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index bedb5ee8e9..e259c38e68 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -336,6 +336,7 @@ def test_find_untested_architectures(self): "marian", "llama4", "exaone4", + "qwen3_next", } supported_architectures -= ONNX_SUPPORTED_ARCHITECTURES From 87cc3f93a7712f60211a14431da170f8b1909314 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 10 Mar 2026 17:29:14 +0100 Subject: [PATCH 161/222] comment for zamba2 --- optimum/exporters/openvino/model_configs.py | 2 +- tests/openvino/test_decoder.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py 
b/optimum/exporters/openvino/model_configs.py index a386842b7d..62464e81f9 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -4821,8 +4821,8 @@ class Zamba2OpenVINOConfig(MambaOpenVINOConfig): DUMMY_PKV_GENERATOR_CLASS = Zamba2DummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig MIN_TRANSFORMERS_VERSION = "4.49.0" - # TODO (@echarlaix): add v5 support MAX_TRANSFORMERS_VERSION = "4.57.6" + # MIN_TRANSFORMERS_VERSION = "5.2.0" _MODEL_PATCHER = Zamba2ModelPatcher def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index e259c38e68..00bb85bd30 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -86,7 +86,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_SSM_ARCHITECTURES = ("mamba", "falcon_mamba") - # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version(">=", "4.49") and is_transformers_version("<", "5"): SUPPORTED_SSM_ARCHITECTURES += ("zamba2",) @@ -180,7 +179,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "dbrx", # "phimoe", "marian", - # "zamba2", ) GENERATION_LENGTH = 100 From 96d47b0172bb6720d43aef9b15d372eae3be0f4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 11 Mar 2026 17:36:47 +0100 Subject: [PATCH 162/222] Fix eagle3 compatibility with v5 --- optimum/exporters/openvino/model_patcher.py | 2 +- tests/openvino/test_decoder.py | 6 +----- tests/openvino/test_genai.py | 4 ++-- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index f21399c836..a0c8b4601a 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -7804,7 +7804,7 @@ def forward( hidden_states=hidden_states, 
attention_mask=attention_mask, position_ids=position_ids, - past_key_value=past_key_values, + **{"past_key_values" if is_transformers_version(">=", "5.0") else "past_key_value": past_key_values}, output_attentions=output_attentions, position_embeddings=position_embeddings, use_cache=use_cache, diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 00bb85bd30..3067f1c5c4 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -894,11 +894,7 @@ def test_load_with_different_dtype(self): ) @parameterized.expand(EAGLE3_MODELS.items()) - # TODO (@echarlaix) transformers v5 support - @pytest.mark.skipif( - is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"), - reason="Eagle3 requires transformers >= 4.54", - ) + @pytest.mark.skipif(is_transformers_version("<", "4.54"), reason="Eagle3 requires transformers >= 4.54") def test_load_and_infer_with_eagle3_model(self, model_arch, model_pair): draft_model_id, target_model_id = model_pair diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index 584d798e88..9d217e7373 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -467,8 +467,8 @@ class LLMPipelineWithEagle3TestCase(unittest.TestCase): @parameterized.expand(EAGLE3_MODELS.items()) def test_compare_outputs(self, model_arch, model_pair): - if is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"): - self.skipTest("Eagle3 requires transformers >= 4.54 and transformers < 5") + if is_transformers_version("<", "4.54"): + self.skipTest("Eagle3 requires transformers >= 4.54") if is_openvino_version("<", "2026.0"): self.skipTest("Eagle3 requires openvino-genai >= 2026.0") From db805612f9bac4179c27960f4c6f6d9eeb0b4ef1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 11 Mar 2026 18:59:25 +0100 Subject: [PATCH 163/222] set dtype in tests when loading sd3 model --- tests/openvino/test_diffusion.py | 24 
++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index bc58c91796..08c5180a48 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -157,7 +157,12 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, model_type=model_arch) ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], device=OPENVINO_DEVICE) auto_cls = self.AUTOMODEL_CLASS if "sana" not in model_arch else DiffusionPipeline - diffusers_pipeline = auto_cls.from_pretrained(MODEL_NAMES[model_arch]) + model_kwargs = ( + {"torch_dtype": torch.float32} + if is_transformers_version(">=", "5") and model_arch == "stable-diffusion-3" + else {} + ) + diffusers_pipeline = auto_cls.from_pretrained(MODEL_NAMES[model_arch], **model_kwargs) for output_type in ["latent", "np", "pt"]: inputs["output_type"] = output_type @@ -632,7 +637,12 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): height, width, batch_size = 128, 128, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, model_type=model_arch) - diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + model_kwargs = ( + {"torch_dtype": torch.float32} + if is_transformers_version(">=", "5") and model_arch == "stable-diffusion-3" + else {} + ) + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], **model_kwargs) ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], device=OPENVINO_DEVICE) for output_type in ["latent", "np", "pt"]: @@ -898,12 +908,18 @@ def test_shape(self, model_arch: str): @require_diffusers def test_compare_to_diffusers_pipeline(self, model_arch: str): ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], device=OPENVINO_DEVICE) + model_kwargs = ( + 
{"torch_dtype": torch.float32} + if is_transformers_version(">=", "5") and model_arch == "stable-diffusion-3" + else {} + ) + if model_arch != "flux-fill": - diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], **model_kwargs) else: from diffusers import FluxFillPipeline - diffusers_pipeline = FluxFillPipeline.from_pretrained(MODEL_NAMES[model_arch]) + diffusers_pipeline = FluxFillPipeline.from_pretrained(MODEL_NAMES[model_arch], **model_kwargs) height, width, batch_size = 64, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, model_arch=model_arch) From 3e5a2b23f2a0d658b81acb40b9559e20c6f7d3c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 11 Mar 2026 19:13:08 +0100 Subject: [PATCH 164/222] trigger tests for transformers v5.3 --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index ca26a42a1f..3ca73ac3d9 100644 --- a/setup.py +++ b/setup.py @@ -28,8 +28,8 @@ INSTALL_REQUIRE = [ "torch>=2.1", - "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers-v5", - "transformers>=4.45,<5.1", + "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@xadupre/transformers5", + "transformers>=4.45,<5.4", "setuptools", "huggingface-hub>=0.23.2,<2.0", "nncf>=2.19.0", From 10add8c7df53753ab42ae8e224cb76dc35ef5eb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 12 Mar 2026 09:32:48 +0100 Subject: [PATCH 165/222] update setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3ca73ac3d9..fe70f63757 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@xadupre/transformers5", - "transformers>=4.45,<5.4", + "transformers>=4.45,<5.1", "setuptools", 
"huggingface-hub>=0.23.2,<2.0", "nncf>=2.19.0", From 501b5233a25e0753591cf4557f8edc91a075cf0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 12 Mar 2026 09:43:33 +0100 Subject: [PATCH 166/222] update setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index fe70f63757..baccbf1d68 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@xadupre/transformers5", - "transformers>=4.45,<5.1", + "transformers>=4.45,<5.3", "setuptools", "huggingface-hub>=0.23.2,<2.0", "nncf>=2.19.0", From 15548fcb6986259ddaaa1af4ced90701a6ba1acc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 12 Mar 2026 10:01:37 +0100 Subject: [PATCH 167/222] update setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index baccbf1d68..fe70f63757 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@xadupre/transformers5", - "transformers>=4.45,<5.3", + "transformers>=4.45,<5.1", "setuptools", "huggingface-hub>=0.23.2,<2.0", "nncf>=2.19.0", From 79078787955d097221068bfb94399b4a8b6850d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Mar 2026 19:09:31 +0100 Subject: [PATCH 168/222] fix bf16 model export --- optimum/exporters/openvino/__main__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index eb763b45d4..5e59f0cb19 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -496,6 +496,9 @@ def bitnet_load_hook(self, state_dict, prefix, *args, **kwargs): **loading_kwargs, ) + if getattr(model, "dtype", None) in [torch.float16, torch.bfloat16]: + patch_16bit = True + needs_pad_token_id = task == "text-classification" 
and getattr(model.config, "pad_token_id", None) is None if needs_pad_token_id: From c026dd99262725b2000457e0c77f682ff2082c4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Mar 2026 10:03:59 +0100 Subject: [PATCH 169/222] question answering pipeline deprecated in v5.3 --- tests/openvino/test_modeling.py | 8 ++++++++ tests/openvino/test_modeling_basic.py | 6 +++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 0c5011a908..53223e692a 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -887,6 +887,10 @@ def test_compare_to_transformers(self, model_arch): @parameterized.expand(SUPPORTED_ARCHITECTURES) @pytest.mark.run_slow @slow + @pytest.mark.skipif( + is_transformers_version(">=", "5.3"), + reason="requires transformers < v5.3 since question-answering pipeline is deprecated in v5.3", + ) def test_pipeline(self, model_arch): set_seed(SEED) model_id = MODEL_NAMES[model_arch] @@ -909,6 +913,10 @@ def test_pipeline(self, model_arch): @pytest.mark.run_slow @slow + @pytest.mark.skipif( + is_transformers_version(">=", "5.3"), + reason="requires transformers < v5.3 since question-answering pipeline is deprecated in v5.3", + ) def test_metric(self): model_id = "distilbert-base-cased-distilled-squad" set_seed(SEED) diff --git a/tests/openvino/test_modeling_basic.py b/tests/openvino/test_modeling_basic.py index c2576db98b..eb72175032 100644 --- a/tests/openvino/test_modeling_basic.py +++ b/tests/openvino/test_modeling_basic.py @@ -30,12 +30,16 @@ "hf-internal-testing/tiny-random-bert": "OVModelForMaskedLM", "hf-internal-testing/tiny-random-distilbert": "OVModelForSequenceClassification", "hf-internal-testing/tiny-random-mbart": "OVModelForSeq2SeqLM", - "hf-internal-testing/tiny-random-roberta": "OVModelForQuestionAnswering", "hf-internal-testing/tiny-random-gpt2": "OVModelForCausalLM", "hf-internal-testing/tiny-random-t5": 
"OVModelForSeq2SeqLM", "hf-internal-testing/tiny-random-bart": "OVModelForSeq2SeqLM", } +# question-answering pipeline is deprecated in transformers v5.3 +if is_transformers_version("<", "5.3"): + MODEL_NAMES["hf-internal-testing/tiny-random-roberta"] = "OVModelForQuestionAnswering" + + TASKS = { "OVModelForMaskedLM": "fill-mask", "OVModelForSequenceClassification": "text-classification", From 61d85b371415e16d5a3757a49219ef604ee5e337 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Mar 2026 10:04:16 +0100 Subject: [PATCH 170/222] fix mamba expected int8 --- tests/openvino/utils_tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index cc084565fe..1117604b7b 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -358,8 +358,8 @@ "vocoder": 80, }, "clip": {"model": 130}, - "mamba": {"model": 322 if is_transformers_version("<", "5") else 324}, - "falcon_mamba": {"model": 162 if is_transformers_version("<", "5") else 164}, + "mamba": {"model": 324 if is_transformers_version("==", "5.0") else 322}, + "falcon_mamba": {"model": 164 if is_transformers_version("==", "5.0") else 162}, "minicpmo": { "lm_model": 16, "text_embeddings_model": 1, From 55c0d469b9c853e5aa2a285f7f9f0f37dd0b5c80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Mar 2026 17:39:46 +0100 Subject: [PATCH 171/222] Fix _DEFAULT_IGNORED_SCOPE_CONFIGS for __make_16bit_traceable patched models --- optimum/intel/openvino/configuration.py | 1 + tests/openvino/test_quantization.py | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 2002e268ac..2d8608fadb 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -546,6 +546,7 @@ class OVQuantizationMethod(str, Enum):
"__module.layers.27.mlp.up_proj/aten::linear/MatMul", "__module.layers.27.mlp.gate_proj/aten::linear/MatMul", ], + "validate": False, }, }, "microsoft/speecht5_tts": { diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index ec9d7b84f7..ff90b208e2 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -1175,8 +1175,12 @@ class OVWeightCompressionTest(unittest.TestCase): "llama", { "model": { - "names": ["__module.layers.1.self_attn.v_proj/aten::linear/MatMul"], - "patterns": ["__module.layers.\\d.self_attn.o_proj/aten::linear/MatMul"], + "names": [ + f"__module.layers.1.self_attn.v_proj/{'aten' if is_transformers_version('<', '5') else 'ov_ext'}::linear/MatMul" + ], + "patterns": [ + f"__module.layers.\\d.self_attn.o_proj/{'aten' if is_transformers_version('<', '5') else 'ov_ext'}::linear/MatMul" + ], } }, ), From 2f38fd868b67d63add87e7f083325cd3c82968e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Mar 2026 18:52:35 +0100 Subject: [PATCH 172/222] add test to ensure dtype --- tests/openvino/test_modeling.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 53223e692a..7a3110b182 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -702,6 +702,14 @@ def test_load_from_hub_onnx_model_and_save(self): del model gc.collect() + def test_export_dtype(self): + model_id = "optimum-intel-internal-testing/tiny-random-GemmaForCausalLM" + for dtype in [torch.float32, torch.bfloat16, torch.float16]: + with TemporaryDirectory() as tmpdirname: + model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype) + self.assertEqual(model.dtype, dtype) + model.save_pretrained(tmpdirname) + ov_model = OVModelForCausalLM.from_pretrained(tmpdirname, export=True) class PipelineTest(unittest.TestCase): def test_load_model_from_hub(self): From 
c925a79cf3f04bf1ae78d24bc1ec2ea64aefa94e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Mar 2026 18:52:55 +0100 Subject: [PATCH 173/222] style --- tests/openvino/test_modeling.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 7a3110b182..e8f68d62dd 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -711,6 +711,7 @@ def test_export_dtype(self): model.save_pretrained(tmpdirname) ov_model = OVModelForCausalLM.from_pretrained(tmpdirname, export=True) + class PipelineTest(unittest.TestCase): def test_load_model_from_hub(self): model_id = "echarlaix/tiny-random-PhiForCausalLM" From bf1f377c540120ee33641ac02536d41e681bd6bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Mar 2026 15:28:13 +0100 Subject: [PATCH 174/222] check openvino model expected dtype in test_export_dtype --- optimum/intel/openvino/utils.py | 1 + tests/openvino/test_modeling.py | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index be6ac41d31..9549da9773 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -95,6 +95,7 @@ "f16": torch.float16, "f32": torch.float32, "f64": torch.float64, + "bf16": torch.bfloat16, } if is_torch_version(">=", "2.4.0"): diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index e8f68d62dd..372cd28943 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -90,8 +90,10 @@ OV_LANGUAGE_MODEL_NAME, OV_PROMPT_ENCODER_MASK_DECODER_MODEL_NAME, OV_TEXT_EMBEDDINGS_MODEL_NAME, + OV_TO_PT_TYPE, OV_VISION_EMBEDDINGS_MODEL_NAME, OV_VISION_ENCODER_MODEL_NAME, + STR_TO_OV_TYPE, TemporaryDirectory, ) from optimum.intel.pipelines import pipeline as optimum_pipeline @@ -704,12 +706,18 @@ def test_load_from_hub_onnx_model_and_save(self): def 
test_export_dtype(self): model_id = "optimum-intel-internal-testing/tiny-random-GemmaForCausalLM" - for dtype in [torch.float32, torch.bfloat16, torch.float16]: - with TemporaryDirectory() as tmpdirname: - model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype) - self.assertEqual(model.dtype, dtype) - model.save_pretrained(tmpdirname) - ov_model = OVModelForCausalLM.from_pretrained(tmpdirname, export=True) + for dtype in ["f32", "f16", "bf16"]: + torch_dtype = OV_TO_PT_TYPE[dtype] + ov_dtype = STR_TO_OV_TYPE[dtype] + with TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch_dtype) + self.assertEqual(model.dtype, torch_dtype) + model.save_pretrained(tmp_dir) + del model + ov_model = OVModelForCausalLM.from_pretrained(tmp_dir, export=True) + dtypes = {op.get_element_type() for op in ov_model.model.get_ops() if op.get_type_name() == "Constant"} + self.assertIn(ov_dtype, dtypes, f"Expected {ov_dtype}, found {dtypes}") + del ov_model class PipelineTest(unittest.TestCase): From 5033df204cdcee729dec7ff8556e579422784d84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Mar 2026 19:47:17 +0100 Subject: [PATCH 175/222] fix qwen3vl vision embeddings pos --- optimum/exporters/openvino/model_configs.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 62464e81f9..cc1cac2714 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -3913,11 +3913,10 @@ def patch_model_for_export(self, model: Union["PreTrainedModel"], model_kwargs: model_kwargs = model_kwargs or {} if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_MERGER: return Qwen3VLVisionEmbMergerPatcher(self, model, model_kwargs) - if ( - self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS - or self._behavior == 
QwenVLConfigBehavior.VISION_EMBEDDINGS_POS - ): + if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS: return ModelPatcher(self, model, model_kwargs=model_kwargs) + if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS: + return InputEmbeddingPatcher(self, model, model_kwargs) return super().patch_model_for_export(model, model_kwargs) @property From 28e98ca493c3fbc62a2324f115869366779893e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Mar 2026 19:57:03 +0100 Subject: [PATCH 176/222] exclude vision_embeddings_pos behavior for qwen2_vl models --- optimum/exporters/openvino/convert.py | 4 ---- optimum/exporters/openvino/model_configs.py | 4 +++- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index d0efa2259f..fddd840b7d 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -687,10 +687,6 @@ def export_from_model( ) logging.disable(logging.NOTSET) - # Remove empty model and export_configs pairs, they can be empty when a config class is shared between model versions. - # Example: Qwen2VL and Qwen3VL share config class, but "vision_embeddings_pos" is used in Qwen3VL only. 
- models_and_export_configs = {k: v for k, v in models_and_export_configs.items() if v != (None, None)} - if library_name == "open_clip": if hasattr(model.config, "save_pretrained"): model.config.save_pretrained(output) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index cc1cac2714..5427db1aa0 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -3662,7 +3662,9 @@ class QwenVLConfigBehavior(str, enum.Enum): @register_in_tasks_manager("qwen2_vl", *["image-text-to-text"], library_name="transformers") class Qwen2VLOpenVINOConfig(BaseVLMOpenVINOConfig): - SUPPORTED_BEHAVIORS = [model_type.value for model_type in QwenVLConfigBehavior] + SUPPORTED_BEHAVIORS = [ + model_type.value for model_type in QwenVLConfigBehavior if model_type.value != "vision_embeddings_pos" + ] NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen2VLVisionEmbedInputGenerator,) MIN_TRANSFORMERS_VERSION = "4.45.0" From 910cc75144f686272d723c2e8f65fc6bbfcb45b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Mar 2026 19:57:48 +0100 Subject: [PATCH 177/222] rename InputEmbedOpenvVINOConfig to InputEmbedOpenVINOConfig --- optimum/exporters/openvino/model_configs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 5427db1aa0..e793e7798b 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1738,7 +1738,7 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs): return dummy_inputs -class InputEmbedOpenvVINOConfig(TextDecoderOnnxConfig): +class InputEmbedOpenVINOConfig(TextDecoderOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig _MODEL_PATCHER = InputEmbeddingPatcher @@ -1781,8 +1781,8 @@ def 
get_vlm_internal_text_generation_config(model_type, model_config, int_dtype, def get_vlm_text_embeddings_config(model_type, model_config, int_dtype, float_dtype): internal_export_config = get_vlm_internal_text_generation_config(model_type, model_config, int_dtype, float_dtype) - InputEmbedOpenvVINOConfig.NORMALIZED_CONFIG_CLASS = internal_export_config.NORMALIZED_CONFIG_CLASS - export_config = InputEmbedOpenvVINOConfig( + InputEmbedOpenVINOConfig.NORMALIZED_CONFIG_CLASS = internal_export_config.NORMALIZED_CONFIG_CLASS + export_config = InputEmbedOpenVINOConfig( model_config, task="feature-extraction", int_dtype=int_dtype, From 5d0637716424d7c99ee265c4d38cb49753a98891 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Mar 2026 18:11:24 +0100 Subject: [PATCH 178/222] fix lfm2 attention mask for mamba layers --- optimum/exporters/openvino/model_patcher.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index a0c8b4601a..44263cb672 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -7194,11 +7194,15 @@ def lfm2_short_conv_forward_patched( cache_position=None, attention_mask=None, ): - from transformers.models.lfm2.modeling_lfm2 import apply_mask_to_padding_states - seqlen = x.shape[1] - x = apply_mask_to_padding_states(x, attention_mask) + # only apply apply_mask_to_padding_states during the prefill phase + # https://github.com/huggingface/transformers/blob/v5.0.0/src/transformers/models/lfm2/modeling_lfm2.py#L427 + # in transformers < v5 attention_mask was never applied for conv layers, until https://github.com/huggingface/transformers/pull/41790/ + dtype = x.dtype + is_decoding = torch.tensor(seqlen == 1, dtype=dtype) + x = (x * (attention_mask[:, :seqlen, None] * (1 - is_decoding) + is_decoding)).to(dtype) + BCx = self.in_proj(x).transpose(-1, -2) B, C, x = BCx.chunk(3, 
dim=-2) From b07adfbdbe59bd4f276bbbdd4a501adf48cde7e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 26 Mar 2026 16:42:10 +0100 Subject: [PATCH 179/222] add comment --- optimum/exporters/openvino/model_patcher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 44263cb672..ef3d4a6b7d 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -7198,7 +7198,8 @@ def lfm2_short_conv_forward_patched( # only apply apply_mask_to_padding_states during the prefill phase # https://github.com/huggingface/transformers/blob/v5.0.0/src/transformers/models/lfm2/modeling_lfm2.py#L427 - # in transformers < v5 attention_mask was never applied for conv layers, until https://github.com/huggingface/transformers/pull/41790/ + # in transformers < v5 attention_mask was never applied in Lfm2ShortConv https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/models/lfm2/modeling_lfm2.py#L485 + # until a fix was added in https://github.com/huggingface/transformers/pull/41790/ dtype = x.dtype is_decoding = torch.tensor(seqlen == 1, dtype=dtype) x = (x * (attention_mask[:, :seqlen, None] * (1 - is_decoding) + is_decoding)).to(dtype) From 2f8a5ed48d0b428a16cf6e29727bce5b18821a5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 26 Mar 2026 17:08:18 +0100 Subject: [PATCH 180/222] only apply mask for transformers >= v5 --- optimum/exporters/openvino/model_patcher.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index ef3d4a6b7d..3d334a7093 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -7200,9 +7200,10 @@ def lfm2_short_conv_forward_patched( # 
https://github.com/huggingface/transformers/blob/v5.0.0/src/transformers/models/lfm2/modeling_lfm2.py#L427 # in transformers < v5 attention_mask was never applied in Lfm2ShortConv https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/models/lfm2/modeling_lfm2.py#L485 # until a fix was added in https://github.com/huggingface/transformers/pull/41790/ - dtype = x.dtype - is_decoding = torch.tensor(seqlen == 1, dtype=dtype) - x = (x * (attention_mask[:, :seqlen, None] * (1 - is_decoding) + is_decoding)).to(dtype) + if is_transformers_version(">=", "5"): + dtype = x.dtype + is_decoding = torch.tensor(seqlen == 1, dtype=dtype) + x = (x * (attention_mask[:, :seqlen, None] * (1 - is_decoding) + is_decoding)).to(dtype) BCx = self.in_proj(x).transpose(-1, -2) B, C, x = BCx.chunk(3, dim=-2) From 36371899e5c8ebc5581b2157d37059f16ea9e4a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 30 Mar 2026 18:10:39 +0200 Subject: [PATCH 181/222] Add fix for granitemoe export --- optimum/exporters/openvino/model_patcher.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 3d334a7093..4cf552e982 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -7603,7 +7603,7 @@ def patch_sparse_moe(sparse_moe_layer): ) for idx, layer in enumerate(self._model.model.layers): - if hasattr(layer, "block_sparse_moe"): + if getattr(layer, "block_sparse_moe", None) is not None: patch_sparse_moe(layer.block_sparse_moe) if self.real_config._config.layers_block_type[idx] == "mamba": mamba_layer = layer.mamba @@ -7625,7 +7625,7 @@ def unpatch_sparse_moe(sparse_moe_layer): self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask for idx, layer in enumerate(self._model.model.layers): - if hasattr(layer, "block_sparse_moe"): + if getattr(layer, "block_sparse_moe", None) is not 
None: unpatch_sparse_moe(layer.block_sparse_moe) if self.real_config._config.layers_block_type[idx] == "mamba": mamba_layer = layer.mamba From dbabc02305a44b59aeaf64efee3cd11b8948e543 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 31 Mar 2026 18:23:07 +0200 Subject: [PATCH 182/222] set afmoe MAX_TRANSFORMERS_VERSION to v5.0 --- optimum/exporters/openvino/model_configs.py | 2 +- tests/openvino/test_decoder.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index e793e7798b..0ec3bf1bce 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -5021,7 +5021,7 @@ class ASTOpenVINOConfig(ASTOnnxConfig): ) class AfmoeOpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.55.0" - MAX_TRANSFORMERS_VERSION = "4.57.6" + MAX_TRANSFORMERS_VERSION = "5.0.0" _MODEL_PATCHER = AfmoeModelPatcher diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 3067f1c5c4..7c6d456dcc 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -150,7 +150,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.55.0"): SUPPORTED_ARCHITECTURES += ("gpt_oss", "gpt_oss_mxfp4") - if is_transformers_version(">=", "4.55.0") and is_transformers_version("<", "4.58.0"): + if is_transformers_version(">=", "4.55.0") and is_transformers_version("<", "5.1.0"): SUPPORTED_ARCHITECTURES += ("afmoe",) if is_transformers_version(">=", "4.57.0"): From dfbf3fc594a043048c97488b7d1a8bb65f1a00ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 31 Mar 2026 18:42:59 +0200 Subject: [PATCH 183/222] update afmoe test --- tests/openvino/utils_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 1117604b7b..01fd8c5870 100644 --- 
a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -34,7 +34,7 @@ OPENVINO_DEVICE = os.getenv("OPENVINO_TEST_DEVICE", "CPU") MODEL_NAMES = { - "afmoe": "optimum-intel-internal-testing/tiny-random-trinity", + "afmoe": "onnx-internal-testing/tiny-random-AfmoeForCausalLM", "albert": "optimum-intel-internal-testing/tiny-random-albert", "aquila": "optimum-intel-internal-testing/tiny-random-aquilachat", "aquila2": "optimum-intel-internal-testing/tiny-random-aquila2", From fca276b4ec8c30f3bdd26b215fc42b98e0ddd98e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 1 Apr 2026 15:39:24 +0200 Subject: [PATCH 184/222] remove afmoe MAX_TRANSFORMERS_VERSION as included in transformers v5 --- optimum/exporters/openvino/model_configs.py | 1 - tests/openvino/utils_tests.py | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 0ec3bf1bce..dbc4a313f6 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -5021,7 +5021,6 @@ class ASTOpenVINOConfig(ASTOnnxConfig): ) class AfmoeOpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.55.0" - MAX_TRANSFORMERS_VERSION = "5.0.0" _MODEL_PATCHER = AfmoeModelPatcher diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 01fd8c5870..c723623c39 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -34,7 +34,7 @@ OPENVINO_DEVICE = os.getenv("OPENVINO_TEST_DEVICE", "CPU") MODEL_NAMES = { - "afmoe": "onnx-internal-testing/tiny-random-AfmoeForCausalLM", + "afmoe": "optimum-intel-internal-testing/tiny-random-trinity", "albert": "optimum-intel-internal-testing/tiny-random-albert", "aquila": "optimum-intel-internal-testing/tiny-random-aquilachat", "aquila2": "optimum-intel-internal-testing/tiny-random-aquila2", @@ -377,7 +377,6 @@ TEST_IMAGE_URL = 
"http://images.cocodataset.org/val2017/000000039769.jpg" REMOTE_CODE_MODELS = ( - "afmoe", "chatglm", "minicpm", "baichuan2", @@ -401,6 +400,8 @@ "qwen3_eagle3", ) +if is_transformers_version("<", "5"): + REMOTE_CODE_MODELS += ("afmoe",) def get_num_quantized_nodes(model): num_fake_nodes = 0 From c1dc781fc0bf5c26eb5746fd221a48b2c6ca9543 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 1 Apr 2026 15:39:45 +0200 Subject: [PATCH 185/222] style --- tests/openvino/utils_tests.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index c723623c39..13ba840bb8 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -403,6 +403,7 @@ if is_transformers_version("<", "5"): REMOTE_CODE_MODELS += ("afmoe",) + def get_num_quantized_nodes(model): num_fake_nodes = 0 types_map = { From 4cb14e9db30e79e6f5e29d6b4fe4a1fd0fd2c3e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 1 Apr 2026 16:17:35 +0200 Subject: [PATCH 186/222] fix test_find_untested_architectures --- tests/openvino/test_decoder.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 7c6d456dcc..f4032db201 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -23,7 +23,6 @@ ) from optimum.exporters.openvino.model_configs import ( - AfmoeOpenVINOConfig, BitnetOpenVINOConfig, DeepseekOpenVINOConfig, LFM2OpenVINOConfig, @@ -148,10 +147,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("minicpm", "minicpm3", "arctic") if is_transformers_version(">=", "4.55.0"): - SUPPORTED_ARCHITECTURES += ("gpt_oss", "gpt_oss_mxfp4") - - if is_transformers_version(">=", "4.55.0") and is_transformers_version("<", "5.1.0"): - SUPPORTED_ARCHITECTURES += ("afmoe",) + SUPPORTED_ARCHITECTURES += ("gpt_oss", "gpt_oss_mxfp4", "afmoe") if 
is_transformers_version(">=", "4.57.0"): SUPPORTED_ARCHITECTURES += ("hunyuan_v1_dense",) @@ -313,8 +309,6 @@ def test_find_untested_architectures(self): supported_architectures.remove("deepseek_v2") if "deepseek_v3" in supported_architectures: supported_architectures.remove("deepseek_v3") - if is_transformers_version(">", str(AfmoeOpenVINOConfig.MAX_TRANSFORMERS_VERSION)): - supported_architectures -= {"afmoe"} if is_transformers_version("<", str(BitnetOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): supported_architectures -= {"bitnet"} if is_transformers_version("<", str(LFM2OpenVINOConfig.MIN_TRANSFORMERS_VERSION)): From b0a782699be973ec1185c3da85c4696186ef256a Mon Sep 17 00:00:00 2001 From: "Kazantsev, Roman" Date: Thu, 2 Apr 2026 21:50:09 +0400 Subject: [PATCH 187/222] [OpenVINO] Support Gemma 4 Signed-off-by: Kazantsev, Roman --- optimum/exporters/openvino/model_configs.py | 346 ++++++++++++++ optimum/exporters/openvino/model_patcher.py | 448 +++++++++++++++++- optimum/exporters/openvino/utils.py | 1 + optimum/intel/openvino/modeling_decoder.py | 6 +- .../openvino/modeling_visual_language.py | 87 +++- 5 files changed, 884 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index dbc4a313f6..8a27ccfa0b 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -148,6 +148,9 @@ FluxTransfromerModelPatcher, Gemma2ModelPatcher, Gemma3LMModelPatcher, + Gemma4ImageEmbeddingsModelPatcher, + Gemma4LMModelPatcher, + Gemma4PerLayerInputsGetterModelPatcher, GptJModelPatcher, GptNeoModelPatcher, GptNeoxModelPatcher, @@ -277,6 +280,10 @@ def init_model_configs(): "transformers", "Gemma3ForConditionalGeneration", ) + TasksManager._CUSTOM_CLASSES[("pt", "gemma4", "image-text-to-text")] = ( + "transformers", + "Gemma4ForConditionalGeneration", + ) # since transformers v4.52, model can be loaded using default AutoModelForImageTextToText # 
https://github.com/huggingface/transformers/blob/v4.52.0/src/transformers/models/auto/modeling_auto.py#L899 @@ -1493,6 +1500,102 @@ class Gemma3TextOpenVINOConfig(Gemma2OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.50.0" +class Gemma4DummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): + def __init__( + self, + task: str, + normalized_config: NormalizedTextConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + random_batch_size_range: Optional[Tuple[int, int]] = None, + random_sequence_length_range: Optional[Tuple[int, int]] = None, + **kwargs, + ): + super().__init__( + task=task, + normalized_config=normalized_config, + batch_size=batch_size, + sequence_length=sequence_length, + random_batch_size_range=random_batch_size_range, + random_sequence_length_range=random_sequence_length_range, + ) + self.num_key_value_heads = normalized_config.num_key_value_heads + self.head_dim = normalized_config.head_dim + self.global_head_dim = getattr(normalized_config.config, "global_head_dim", self.head_dim) + self.layer_types = normalized_config.config.layer_types + self.num_kv_shared_layers = normalized_config.config.num_kv_shared_layers + self.sliding_window = normalized_config.config.sliding_window + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + # some layers do not produce their own KV-cache, they use the shared KV-cache + if self.num_kv_shared_layers > 0: + layer_types = self.layer_types[: -self.num_kv_shared_layers] + else: + layer_types = self.layer_types + past_kv_values = [] + for layer_type in layer_types: + if layer_type == "sliding_attention": + shape = ( + self.batch_size, + self.num_key_value_heads, + self.sliding_window, + self.head_dim, + ) + else: + shape = ( + self.batch_size, + self.num_key_value_heads, + self.sequence_length, + self.global_head_dim, + ) + past_kv_value = ( + self.random_float_tensor(shape, 
framework=framework, dtype=float_dtype), + self.random_float_tensor(shape, framework=framework, dtype=float_dtype), + ) + past_kv_values.append(past_kv_value) + + return past_kv_values + + +@register_in_tasks_manager( + "gemma4_text", + *[ + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + "text-classification", + ], + library_name="transformers", +) +class Gemma4TextOpenVINOConfig(Gemma3TextOpenVINOConfig): + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, Gemma4DummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = Gemma4DummyPastKeyValuesGenerator + MIN_TRANSFORMERS_VERSION = "4.50.0" + + def add_past_key_values(self, inputs_or_outputs: dict[str, dict[int, str]], direction: str): + if direction not in ["inputs", "outputs"]: + raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given') + + if direction == "inputs": + decoder_sequence_name = "past_sequence_length" + name = "past_key_values" + else: + decoder_sequence_name = "past_sequence_length + sequence_length" + name = "present" + + num_kv_shared_layers = self._normalized_config.config.num_kv_shared_layers + if num_kv_shared_layers > 0: + layer_types = self._normalized_config.config.layer_types[:-num_kv_shared_layers] + else: + layer_types = self._normalized_config.config.layer_types + + for i, layer_type in enumerate(layer_types): + inputs_or_outputs[f"{name}.{i}.key"] = {0: "batch_size", 2: decoder_sequence_name} + inputs_or_outputs[f"{name}.{i}.value"] = {0: "batch_size", 2: decoder_sequence_name} + + class DeciDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): def __init__( self, @@ -1735,6 +1838,16 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs): dummy_inputs["token_type_ids"] = self.orig_export_config.DUMMY_INPUT_GENERATOR_CLASSES[ 0 ].random_int_tensor(token_type_ids_shape, min_value=0, max_value=2) + if 
"per_layer_inputs" in self.inputs: + per_layer_inputs_shape = ( + input_ids.shape[0], + input_ids.shape[1], + self._normalized_config.config.num_hidden_layers, + self._normalized_config.config.hidden_size_per_layer_input, + ) + dummy_inputs["per_layer_inputs"] = self.orig_export_config.DUMMY_INPUT_GENERATOR_CLASSES[ + 0 + ].random_float_tensor(per_layer_inputs_shape) return dummy_inputs @@ -4203,6 +4316,239 @@ def with_behavior( return super().with_behavior(behavior) +class Gemma4ConfigBehavior(str, enum.Enum): + VISION_EMBEDDINGS = "vision_embeddings" + TEXT_EMBEDDINGS = "text_embeddings" + LANGUAGE = "language" + TEXT_EMBEDDINGS_PER_LAYER = "text_embeddings_per_layer" + + +class DummyGemma4VisionInputGenerator(DummyVisionInputGenerator): + SUPPORTED_INPUT_NAMES = ("pixel_values", "image_position_ids") + + def __init__(self, task, normalized_config, batch_size=DEFAULT_DUMMY_SHAPES["batch_size"], **kwargs): + super().__init__(task, normalized_config, batch_size, **kwargs) + self.patch_size = getattr(normalized_config, "patch_size", 16) + self.pooling_kernel_size = getattr(normalized_config, "pooling_kernel_size", 3) + # Gemma4 processor always pads pixel_values to max_soft_tokens * pooling_kernel_size^2 patches. + # The vision model's pooling uses shape-dependent Python operations that get baked in during tracing, + # so the dummy input must match the actual inference shapes. 
+ max_soft_tokens = getattr(normalized_config, "image_seq_length", None) + if max_soft_tokens is None: + max_soft_tokens = getattr(normalized_config, "max_soft_tokens", 280) + self.num_patches = max_soft_tokens * self.pooling_kernel_size**2 + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "pixel_values": + # Gemma4 expects pre-patchified pixel_values: [batch, num_patches, 3 * patch_size^2] + return self.random_float_tensor( + shape=[self.batch_size, self.num_patches, 3 * self.patch_size**2], + framework=framework, + dtype=float_dtype, + ) + if input_name == "image_position_ids": + import torch + import math + + # Create position ids as a grid. The patch count = h_patches * w_patches + # where both are divisible by pooling_kernel_size for correct pooling. + k = self.pooling_kernel_size + total_pooled = self.num_patches // (k * k) + # Find roughly square grid for pooled side + pooled_side = int(math.sqrt(total_pooled)) + if pooled_side * pooled_side < total_pooled: + pooled_h = pooled_side + pooled_w = total_pooled // pooled_h + else: + pooled_h = pooled_w = pooled_side + h_patches = pooled_h * k + w_patches = pooled_w * k + pos_ids = torch.stack( + torch.meshgrid(torch.arange(h_patches), torch.arange(w_patches), indexing="ij"), dim=-1 + ).reshape(1, -1, 2) + # Pad to num_patches with -1 (padding position) + if pos_ids.shape[1] < self.num_patches: + pad = torch.full((1, self.num_patches - pos_ids.shape[1], 2), -1, dtype=pos_ids.dtype) + pos_ids = torch.cat([pos_ids, pad], dim=1) + return pos_ids.expand(self.batch_size, -1, -1).clone() + return super().generate(input_name, framework, int_dtype, float_dtype) + + +@register_in_tasks_manager("gemma4", *["image-text-to-text"], library_name="transformers") +class Gemma4OpenVINOConfig(Gemma3OpenVINOConfig): + SUPPORTED_BEHAVIORS = [model_type.value for model_type in Gemma4ConfigBehavior] + DUMMY_INPUT_GENERATOR_CLASSES = 
(DummyVisionInputGenerator, DummyTextInputGenerator) + + def __init__( + self, + config: "PretrainedConfig", + task: str = "feature-extraction", + int_dtype: str = "int64", + float_dtype: str = "fp32", + behavior: Gemma4ConfigBehavior = Gemma4ConfigBehavior.VISION_EMBEDDINGS, + preprocessors: Optional[List[Any]] = None, + ): + super().__init__( + config=config, + task=task, + int_dtype=int_dtype, + float_dtype=float_dtype, + preprocessors=preprocessors, + behavior=behavior, + ) + self._behavior = behavior + if self._behavior == Gemma4ConfigBehavior.VISION_EMBEDDINGS: + self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyGemma4VisionInputGenerator,) + # Attach image_seq_length from preprocessor to normalized config so + # the dummy input generator can compute the correct number of patches. + # The vision model's pooling uses shape-dependent Python operations baked in + # during tracing, so the dummy input must match actual inference shapes. + image_seq_length = None + if preprocessors is not None: + for p in preprocessors: + if hasattr(p, "image_processor") and hasattr(p.image_processor, "image_seq_length"): + image_seq_length = p.image_processor.image_seq_length + break + if hasattr(p, "image_processor") and hasattr(p.image_processor, "max_soft_tokens"): + image_seq_length = p.image_processor.max_soft_tokens + break + if image_seq_length is None: + for p in preprocessors: + if hasattr(p, "max_soft_tokens"): + image_seq_length = p.max_soft_tokens + break + if hasattr(p, "image_seq_length"): + image_seq_length = p.image_seq_length + break + if image_seq_length is not None: + self._normalized_config.image_seq_length = image_seq_length + elif self._behavior in ( + Gemma4ConfigBehavior.TEXT_EMBEDDINGS, + Gemma4ConfigBehavior.TEXT_EMBEDDINGS_PER_LAYER, + ): + self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator,) + self._config = config.text_config + self._normalized_config = NormalizedTextConfig(self._config) + + def with_behavior(self, behavior: Union[str, 
Gemma4ConfigBehavior]): + if isinstance(behavior, str) and not isinstance(behavior, Gemma4ConfigBehavior): + behavior = Gemma4ConfigBehavior(behavior) + + if behavior == Gemma4ConfigBehavior.LANGUAGE: + model_type = "gemma4_text" + return get_vlm_text_generation_config( + model_type, + self._orig_config.text_config, + self.int_dtype, + self.float_dtype, + model_patcher=Gemma4LMModelPatcher, + inputs_update={"per_layer_inputs": {0: "batch_size", 1: "sequence_length", 2: "num_hidden_layers"}}, + ) + if behavior == Gemma4ConfigBehavior.TEXT_EMBEDDINGS_PER_LAYER: + config = self.__class__( + self._orig_config, + task=self.task, + int_dtype=self.int_dtype, + float_dtype=self.float_dtype, + behavior=behavior, + preprocessors=self._preprocessors, + ) + return config + return super().with_behavior(behavior) + + def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]): + if behavior == Gemma4ConfigBehavior.TEXT_EMBEDDINGS_PER_LAYER: + import torch + + class PerLayerInputsModule(torch.nn.Module): + def __init__(self, language_model, vocab_size_per_layer_input: int, config): + super().__init__() + self.language_model = language_model + self.vocab_size_per_layer_input = vocab_size_per_layer_input + self.config = config + + def forward(self, input_ids: torch.Tensor): + # Replace multimodal token IDs with pad_token_id to match + # HF Gemma4Model.forward which uses llm_input_ids where + # image/video/audio positions are set to pad_token_id + pad_token_id = self.config.text_config.pad_token_id + per_layer_inputs_tokens = input_ids.clone() + for token_id_attr in ("image_token_id", "video_token_id", "audio_token_id"): + token_id = getattr(self.config, token_id_attr, None) + if token_id is not None: + per_layer_inputs_tokens = torch.where( + per_layer_inputs_tokens == token_id, + torch.full_like(per_layer_inputs_tokens, pad_token_id), + per_layer_inputs_tokens, + ) + per_layer_inputs_mask = torch.logical_and( + per_layer_inputs_tokens >= 0, + 
per_layer_inputs_tokens < self.vocab_size_per_layer_input, + ) + per_layer_inputs_tokens = torch.where( + per_layer_inputs_mask, + per_layer_inputs_tokens, + torch.zeros_like(per_layer_inputs_tokens), + ) + per_layer_inputs = self.language_model.get_per_layer_inputs( + per_layer_inputs_tokens, None + ) + return per_layer_inputs + + model = PerLayerInputsModule( + model.model.language_model, model.config.text_config.vocab_size_per_layer_input, model.config + ) + return model + if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: + return model + if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: + import torch + + class TextEmbeddingsModule(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model + + def forward(self, input_ids: torch.Tensor): + inputs_embeds = self.model.get_input_embeddings()(input_ids) + return inputs_embeds + + text_embedding = TextEmbeddingsModule(model) + text_embedding.config = model.model.language_model.config + return text_embedding + + return super().get_model_for_behavior(model, behavior) + + def patch_model_for_export(self, model, model_kwargs=None): + model_kwargs = model_kwargs or {} + if self._behavior == Gemma4ConfigBehavior.TEXT_EMBEDDINGS_PER_LAYER: + return ModelPatcher(self, model, model_kwargs) + if self._behavior == VLMConfigBehavior.VISION_EMBEDDINGS: + return Gemma4ImageEmbeddingsModelPatcher(self, model, model_kwargs) + return super().patch_model_for_export(model, model_kwargs) + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + if self._behavior == Gemma4ConfigBehavior.LANGUAGE: + return super().inputs + if self._behavior == Gemma4ConfigBehavior.TEXT_EMBEDDINGS_PER_LAYER: + return { + "input_ids": {0: "batch_size", 1: "sequence_length"}, + } + if self._behavior == Gemma4ConfigBehavior.VISION_EMBEDDINGS: + return { + "pixel_values": {0: "batch_size", 1: "num_patches"}, + "image_position_ids": {0: "batch_size", 1: "num_patches"}, + } + return super().inputs + + @property + def 
outputs(self) -> Dict[str, Dict[int, str]]: + if self._behavior == Gemma4ConfigBehavior.TEXT_EMBEDDINGS_PER_LAYER: + return {"text_embeds_per_layer": {}} + return super().outputs + + class DummyVisionPositionIdsInputGenerator(DummyVisionInputGenerator): SUPPORTED_INPUT_NAMES = ("patch_attention_mask", "patch_position_ids") diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 4cf552e982..ae5f0c179d 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -52,7 +52,7 @@ ModelPatcher, gpt_oss_forward, override_arguments, - sdpa_mask_without_vmap, + sdpa_mask_without_vmap as _orig_sdpa_mask_without_vmap, ) from optimum.intel.utils.import_utils import is_diffusers_version, is_torch_version, is_transformers_version @@ -82,6 +82,28 @@ logger = logging.getLogger(__name__) +# Compatibility wrapper for sdpa_mask_without_vmap from optimum. +# The installed optimum version expects (batch_size, cache_position: Tensor, kv_length, ...), +# but transformers >= 5.5 passes (batch_size, q_length: int, kv_length: int, q_offset: int, ...). +def sdpa_mask_without_vmap(batch_size, q_length=None, kv_length=None, q_offset=0, kv_offset=0, **kwargs): + import inspect + + sig = inspect.signature(_orig_sdpa_mask_without_vmap) + if "cache_position" in sig.parameters: + # Old optimum signature: (batch_size, cache_position, kv_length, kv_offset, ...) 
+ cache_position = torch.arange(q_length, dtype=torch.long) + q_offset + kwargs.pop("q_offset", None) + kwargs.pop("allow_is_bidirectional_skip", None) + kwargs.pop("allow_torch_fix", None) + kwargs.pop("use_vmap", None) + kwargs.pop("device", None) + return _orig_sdpa_mask_without_vmap(batch_size, cache_position, kv_length, kv_offset=kv_offset, **kwargs) + else: + return _orig_sdpa_mask_without_vmap( + batch_size, q_length=q_length, kv_length=kv_length, q_offset=q_offset, kv_offset=kv_offset, **kwargs + ) + + def postprocess_past_key_values(past_key_values): if isinstance(past_key_values, (EncoderDecoderCache, DynamicCache)): if hasattr(past_key_values, "to_legacy_cache"): @@ -4818,6 +4840,349 @@ def __exit__(self, exc_type, exc_value, traceback): del self._model.model._orig_update_causual_mask +def _gemma4_project_per_layer_inputs( + self, + inputs_embeds: torch.Tensor, + per_layer_inputs: Optional[torch.Tensor] = None, +) -> torch.Tensor: + per_layer_projection = self.per_layer_model_projection(inputs_embeds) * self.per_layer_model_projection_scale + per_layer_projection = per_layer_projection.reshape( + *inputs_embeds.shape[:-1], + self.config.num_hidden_layers, + self.hidden_size_per_layer_input, + ) + per_layer_projection = self.per_layer_projection_norm(per_layer_projection) + + if per_layer_inputs is None: + return per_layer_projection + + if per_layer_projection.shape != per_layer_inputs.shape: + per_layer_inputs = per_layer_inputs[..., : self.config.num_hidden_layers, :] + + return (per_layer_projection + per_layer_inputs) * self.per_layer_input_scale + + +def gemma4_language_model_forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + input_features: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + input_features_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = 
None, + past_key_values: Optional[Cache] = None, + mm_token_type_ids: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + per_layer_inputs=None, + **lm_kwargs, +): + from transformers.models.gemma4.modeling_gemma4 import Gemma4ModelOutputWithPast + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + # Merge text and images + if pixel_values is not None: + image_features = self.get_image_features(pixel_values) + if hasattr(image_features, "pooler_output"): + image_features = image_features.pooler_output + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + _, special_image_mask, _, _ = self.model.get_placeholder_mask(mm_token_type_ids, input_ids, inputs_embeds) + special_image_mask_expanded = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask_expanded, image_features) + + outputs = self.model.language_model( + input_ids=None, + per_layer_inputs=per_layer_inputs, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + **lm_kwargs, + ) + + return Gemma4ModelOutputWithPast( + last_hidden_state=outputs.last_hidden_state, + past_key_values=outputs.past_key_values if use_cache else 
None, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + ) + + +def gemma4_lm_forward( + self, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + per_layer_inputs=None, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + input_features: Optional[torch.FloatTensor] = None, + input_features_mask: Optional[torch.Tensor] = None, + mm_token_type_ids: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **lm_kwargs, +): + from transformers.models.gemma4.modeling_gemma4 import Gemma4CausalLMOutputWithPast + from optimum.exporters.onnx.model_patcher import preprocess_past_key_values + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = False + + if past_key_values is not None: + use_cache = True + past_key_values = preprocess_past_key_values(past_key_values) + + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + pixel_values_videos=pixel_values_videos, + input_features=input_features, + attention_mask=attention_mask, + input_features_mask=input_features_mask, + position_ids=position_ids, + past_key_values=past_key_values, + mm_token_type_ids=mm_token_type_ids, + cache_position=cache_position, + inputs_embeds=inputs_embeds, + labels=labels, + 
use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + per_layer_inputs=per_layer_inputs, + **lm_kwargs, + ) + + hidden_states = outputs.last_hidden_state + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + tmp_logits = self.lm_head(hidden_states[:, slice_indices, :]) + if (final_logit_softcapping := self.config.get_text_config().final_logit_softcapping) is not None: + tmp_logits = tmp_logits / final_logit_softcapping + tmp_logits = torch.tanh(tmp_logits) + tmp_logits = tmp_logits * final_logit_softcapping + + outputs_dict = { + "logits": tmp_logits, + } + + if use_cache: + key_values = outputs.past_key_values + present_key_values = postprocess_past_key_values(key_values) + outputs_dict["past_key_values"] = present_key_values + return tuple([value if not isinstance(value, list) else tuple(value) for value in outputs_dict.values()]) + + +def gemma4_eager_attention_forward_patched( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + dropout: float = 0.0, + scaling: Optional[float] = None, + softcap: Optional[float] = None, + **kwargs, +) -> tuple: + if scaling is None: + scaling = module.head_dim**-0.5 + + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + + if softcap is not None: + attn_weights = attn_weights / softcap + attn_weights = torch.tanh(attn_weights) + attn_weights = attn_weights * softcap + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + eps = 0.0000001 + + attn_weights = nn.functional.softmax(attn_weights + eps, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, 
p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + return attn_output, attn_weights + + +def gemma4_text_attention_forward( + self, + hidden_states: torch.Tensor, + position_embeddings: torch.Tensor, + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, +) -> tuple: + from transformers.models.gemma4.modeling_gemma4 import apply_rotary_pos_emb as apply_rotary_pos_emb_gemma4 + + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + cos, sin = position_embeddings + + query_states = self.q_proj(hidden_states).view(hidden_shape) + query_states = self.q_norm(query_states) + query_states = apply_rotary_pos_emb_gemma4(query_states, cos, sin, unsqueeze_dim=2) + query_states = query_states.transpose(1, 2) + + if self.is_kv_shared_layer and past_key_values is not None: + key_states, value_states = past_key_values.shared_layers[self.kv_shared_layer_index] + key_states = key_states.to(query_states.device) + value_states = value_states.to(query_states.device) + else: + key_states = self.k_proj(hidden_states).view(hidden_shape) + value_states = self.v_proj(hidden_states).view(hidden_shape) if self.v_proj is not None else key_states + + key_states = self.k_norm(key_states) + key_states = apply_rotary_pos_emb_gemma4(key_states, cos, sin, unsqueeze_dim=2) + key_states = key_states.transpose(1, 2) + + value_states = self.v_norm(value_states) + value_states = value_states.transpose(1, 2) + + if past_key_values is not None: + cache_kwargs = { + "sin": sin, + "cos": cos, + "cache_position": cache_position, + "sliding_window": self.sliding_window, + } + if not self.is_kv_shared_layer: + key_states, value_states = past_key_values.update( + key_states, value_states, self.layer_idx, cache_kwargs + ) + if self.store_full_length_kv: + if not 
hasattr(past_key_values, "shared_layers"): + past_key_values.shared_layers = {} + past_key_values.shared_layers[self.layer_idx] = key_states, value_states + + attention_interface = gemma4_eager_attention_forward_patched + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=self.attention_dropout if self.training else 0.0, + scaling=1.0, + sliding_window=self.sliding_window, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +def _gemma4_moe_block_forward(self, hidden_states, top_k_index, top_k_weights): + # hidden_states: [B*S, hidden_dim] + # top_k_index: [B*S, K], top_k_weights: [B*S, K] + num_tokens = hidden_states.shape[0] + dtype = hidden_states.dtype + + # Compute all expert outputs via batched matmul + # expanded: [E, B*S, hidden_dim] + expanded_hidden = hidden_states.unsqueeze(0).expand(self.num_experts, -1, -1) + + # gate_up_proj: [E, 2*inter, hidden] -> transpose to [E, hidden, 2*inter] + gate_up = torch.bmm(expanded_hidden, self.gate_up_proj.to(dtype).transpose(1, 2)) + gate, up = gate_up.chunk(2, dim=-1) + intermediate = self.act_fn(gate) * up + + # down_proj: [E, hidden, inter] -> transpose to [E, inter, hidden] + expert_outputs = torch.bmm(intermediate, self.down_proj.to(dtype).transpose(1, 2)) + # expert_outputs: [E, B*S, hidden_dim] + + # Apply per-expert scale: [E] -> [E, 1, 1] + expert_outputs = expert_outputs * self.per_expert_scale.to(dtype).unsqueeze(-1).unsqueeze(-1) + + # Build full routing weight matrix [B*S, E] from sparse top-k + full_weights = torch.zeros(num_tokens, self.num_experts, dtype=dtype, device=hidden_states.device) + full_weights.scatter_add_(1, top_k_index, top_k_weights.to(dtype)) + + # Weighted sum over experts: [B*S, 1, E] @ [B*S, E, hidden_dim] -> [B*S, hidden_dim] + expert_outputs = expert_outputs.permute(1, 0, 2) # [B*S, E, 
hidden_dim] + final_hidden_states = torch.bmm(full_weights.unsqueeze(1), expert_outputs).squeeze(1) + + return final_hidden_states + + +class Gemma4LMModelPatcher(Gemma3LMModelPatcher): + def __init__(self, config, model, model_kwargs): + super().__init__(config, model, model_kwargs) + + self.patched_forward = gemma4_lm_forward + self.model_orig_forward = self.orig_forward + self.orig_forward = gemma4_lm_forward + + self.model_orig_language_model_forward = self._model.model.forward + + def __enter__(self): + super().__enter__() + + setattr(self._model, self.orig_forward_name, types.MethodType(gemma4_lm_forward, self._model)) + setattr(self._model.model, "forward", types.MethodType(gemma4_language_model_forward, self._model)) + + self._model.model.language_model._orig_project_per_layer_inputs = ( + self._model.model.language_model.project_per_layer_inputs + ) + self._model.model.language_model.project_per_layer_inputs = types.MethodType( + _gemma4_project_per_layer_inputs, self._model.model.language_model + ) + + for decoder_layer in self._model.model.language_model.layers: + decoder_layer.self_attn.orig_forward = decoder_layer.self_attn.forward + decoder_layer.self_attn.forward = types.MethodType(gemma4_text_attention_forward, decoder_layer.self_attn) + if hasattr(decoder_layer, "moe"): + decoder_layer.moe._orig_forward = decoder_layer.moe.forward + decoder_layer.moe.forward = types.MethodType(_gemma4_moe_block_forward, decoder_layer.moe) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.model.language_model.project_per_layer_inputs = ( + self._model.model.language_model._orig_project_per_layer_inputs + ) + + for decoder_layer in self._model.model.language_model.layers: + decoder_layer.self_attn.forward = decoder_layer.self_attn.orig_forward + if hasattr(decoder_layer, "moe") and hasattr(decoder_layer.moe, "_orig_forward"): + decoder_layer.moe.forward = decoder_layer.moe._orig_forward + + 
setattr(self._model, self.orig_forward_name, self.model_orig_forward) + setattr(self._model.model, "forward", self.model_orig_language_model_forward) + + class Idefics3ImageEmbeddingsModelPatcher(ModelPatcher): def __init__( self, @@ -6528,7 +6893,10 @@ def __init__( model: "PreTrainedModel", model_kwargs: Optional[Dict[str, Any]] = None, ): - from transformers.models.mamba.modeling_mamba import MambaCache + try: + from transformers.models.mamba.modeling_mamba import MambaCache + except ImportError: + MambaCache = object super().__init__(config, model, model_kwargs) @@ -8423,3 +8791,79 @@ def __exit__(self, exc_type, exc_value, traceback): sparse_moe_block = decoder_layer.mlp decoder_layer.mlp.forward = decoder_layer.mlp._orig_forward del sparse_moe_block.down_projs, sparse_moe_block.gate_projs, sparse_moe_block.up_projs + + +class Gemma4PerLayerInputsGetterModelPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel"], + model_kwargs: Dict[str, Any] = None, + ): + model.__orig_forward = model.forward + + def per_layer_inputs_forward(self, input_ids: torch.Tensor) -> torch.Tensor: + per_layer_inputs_mask = torch.logical_and(input_ids >= 0, input_ids < self.vocab_size_per_layer_input) + per_layer_inputs_tokens = torch.where(per_layer_inputs_mask, input_ids, torch.zeros_like(input_ids)) + per_layer_inputs = self.language_model.get_per_layer_inputs(per_layer_inputs_tokens, None) + return per_layer_inputs + + model.forward = types.MethodType(per_layer_inputs_forward, model) + super().__init__(config, model, model_kwargs) + + def __enter__(self): + super().__enter__() + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.forward = self._model.__orig_forward + + +class Gemma4ImageEmbeddingsModelPatcher(CommonImageEmbeddingsModelPatcher): + def __init__(self, config, model, model_kwargs): + super().__init__(config, model, model_kwargs) + # Get the vision 
encoder - it's at model.model.vision_tower.encoder + vision_model = model.model.vision_tower if is_transformers_version(">=", "5") else model.vision_tower + self._vision_encoder = vision_model.encoder + + # Patch the vision encoder forward to bypass create_bidirectional_mask, + # which is not compatible with torch.jit.trace due to dynamic masking logic. + # Instead, we construct a simple 4D bidirectional attention mask from the + # 2D padding mask to properly mask out padding patches. + orig_encoder_forward = self._vision_encoder.forward + + def patched_encoder_forward(inputs_embeds, attention_mask=None, pixel_position_ids=None, **kwargs): + hidden_states = inputs_embeds + position_embeddings = self._vision_encoder.rotary_emb(hidden_states, pixel_position_ids) + + # Build a 4D bidirectional attention mask from the 2D boolean mask. + # attention_mask is [batch, seq_len] with True=valid, False=padding. + # Decoder layers expect a 4D mask [batch, 1, seq_len, seq_len] where + # 0 = attend and large negative = masked. 
+ attn_mask_4d = None + if attention_mask is not None: + min_dtype = torch.finfo(hidden_states.dtype).min + # [batch, 1, 1, seq_len] key mask + key_mask = attention_mask[:, None, None, :].to(hidden_states.dtype) + # Convert: 1.0 for valid tokens, min_dtype for padding + attn_mask_4d = (1.0 - key_mask) * min_dtype + + for decoder_layer in self._vision_encoder.layers[: self._vision_encoder.config.num_hidden_layers]: + hidden_states = decoder_layer( + hidden_states, + attention_mask=attn_mask_4d, + position_embeddings=position_embeddings, + position_ids=pixel_position_ids, + **kwargs, + ) + + from transformers.modeling_outputs import BaseModelOutputWithPast + + return BaseModelOutputWithPast(last_hidden_state=hidden_states) + + self._orig_encoder_forward = orig_encoder_forward + self._vision_encoder.forward = patched_encoder_forward + + def __exit__(self, exc_type, exc_value, traceback): + self._vision_encoder.forward = self._orig_encoder_forward + super().__exit__(exc_type, exc_value, traceback) diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index af2f1edaba..61134d1c58 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -297,6 +297,7 @@ def get_submodels(model): "qwen3_vl", "got_ocr2", "gemma3", + "gemma4", "idefics3", "smolvlm", "phi4mm", diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index ccf177df9d..9ac0d15612 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -31,9 +31,13 @@ from transformers.generation.stopping_criteria import StoppingCriteriaList from transformers.generation.utils import GenerateOutput, GenerationMode from transformers.modeling_outputs import CausalLMOutputWithPast, ModelOutput -from transformers.models.mamba.modeling_mamba import MambaCache from transformers.utils.hub import PushToHubMixin +try: + from transformers.models.mamba.modeling_mamba import 
MambaCache +except ImportError: + MambaCache = object + from optimum.utils.normalized_config import NormalizedConfigManager from ...exporters.openvino import ensure_stateful_is_available, main_export, patch_stateful diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index beb7b974eb..ac021b23fe 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -218,6 +218,12 @@ def prepare_inputs( inputs["beam_idx"] = ( self.next_beam_idx if self.next_beam_idx is not None else np.arange(batch_size, dtype=int) ) + + if "per_layer_inputs" in self.input_names: + per_layer_inputs = kwargs.pop("per_layer_inputs", None) + assert per_layer_inputs is not None, "Expected 'per_layer_inputs', but it was not passed" + inputs["per_layer_inputs"] = torch.Tensor(per_layer_inputs) + return inputs def forward( @@ -347,6 +353,7 @@ def forward(self, audio_feature, audio_mask): MODEL_PARTS_CLS_MAPPING = { "resampler": OVResampler, "language_model": OVModelWithEmbedForCausalLM, + "text_embeddings_per_layer": OVVisionProjection, "vision_embeddings": OVVisionEmbedding, "vision_projection": OVVisionProjection, "vision_resampler": OVVisionResampler, @@ -785,8 +792,11 @@ def forward( additional_kwargs["visual_pos_masks"] = extra_outputs[0] additional_kwargs["deepstack_visual_embeds"] = extra_outputs[1] + if self.config.model_type in ("gemma4",) and extra_outputs: + additional_kwargs["per_layer_inputs"] = extra_outputs[0] + return self.language_model.forward( - input_ids=None, + input_ids=input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, position_ids=position_ids, @@ -3937,6 +3947,80 @@ def _update_model_kwargs_for_generation( return model_kwargs +class _OVGemma4ForCausalLM(_OVGemma3ForCausalLM): + additional_parts = ["text_embeddings_per_layer"] + + def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs): + if input_ids is not None 
and input_ids.shape[1] == 1: + return None + return self.vision_embeddings(pixel_values, **kwargs).last_hidden_state + + def get_multimodal_embeddings( + self, input_ids, pixel_values=None, attention_mask=None, position_ids=None, **kwargs + ): + embeds_from_args = kwargs.pop("inputs_embeds", None) + inputs_embeds = ( + embeds_from_args if embeds_from_args is not None else self.get_text_embeddings(input_ids, **kwargs) + ) + per_layer_inputs = self.text_embeddings_per_layer(input_ids) + if pixel_values is not None: + vision_embeds = self.get_vision_embeddings(pixel_values, input_ids=input_ids, **kwargs) + + if vision_embeds is not None: + inputs_embeds, attention_mask, position_ids = self.merge_vision_text_embeddings( + vision_embeds, + inputs_embeds, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + **kwargs, + ) + return inputs_embeds, attention_mask, position_ids, per_layer_inputs + + def merge_vision_text_embeddings( + self, vision_embeds, inputs_embeds, input_ids=None, attention_mask=None, position_ids=None, **kwargs + ): + image_features = torch.from_numpy(vision_embeds) if isinstance(vision_embeds, np.ndarray) else vision_embeds + inputs_embeds = torch.from_numpy(inputs_embeds) if isinstance(inputs_embeds, np.ndarray) else inputs_embeds + if input_ids is None: + special_image_mask = inputs_embeds == torch.from_numpy( + self.get_text_embeddings(torch.tensor([[self.config.image_token_id]], dtype=torch.long))[0] + ) + else: + special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1) + special_image_mask = special_image_mask.expand_as(inputs_embeds) + + image_features = image_features.to(inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + return inputs_embeds, attention_mask, position_ids + + def prepare_inputs_for_generation( + self, input_ids, mm_token_type_ids=None, image_position_ids=None, **kwargs + ): + model_inputs = 
super().prepare_inputs_for_generation(input_ids, **kwargs) + model_inputs["mm_token_type_ids"] = mm_token_type_ids + model_inputs["image_position_ids"] = image_position_ids + return model_inputs + + def _update_model_kwargs_for_generation( + self, + outputs, + model_kwargs, + is_encoder_decoder=False, + num_new_tokens=1, + ): + model_kwargs = super()._update_model_kwargs_for_generation( + outputs=outputs, + model_kwargs=model_kwargs, + is_encoder_decoder=is_encoder_decoder, + num_new_tokens=num_new_tokens, + ) + model_kwargs.pop("mm_token_type_ids", None) + model_kwargs.pop("image_position_ids", None) + return model_kwargs + + class _OVGotOCR2ForCausalLM(OVModelForVisualCausalLM): def get_vision_embeddings(self, pixel_values, input_ids, **kwargs): if input_ids is not None and input_ids.shape[1] == 1 and kwargs.get("past_key_values") is not None: @@ -4817,6 +4901,7 @@ def preprocess_inputs( "qwen2_5_vl_text": _OVQwen2_5_VLForCausalLM, "got_ocr2": _OVGotOCR2ForCausalLM, "gemma3": _OVGemma3ForCausalLM, + "gemma4": _OVGemma4ForCausalLM, "idefics3": _OVIdefics3ForCausalLM, "smolvlm": _OVSmolVLForCasualLM, "phi4mm": _OVPhi4MMForCausalLM, From 2c57fee968cb4a463efac1200cd89fcbcd9d3929 Mon Sep 17 00:00:00 2001 From: "Kazantsev, Roman" Date: Thu, 2 Apr 2026 22:10:17 +0400 Subject: [PATCH 188/222] Fix formatting Signed-off-by: Kazantsev, Roman --- optimum/exporters/openvino/model_configs.py | 8 +++----- optimum/exporters/openvino/model_patcher.py | 7 +++---- optimum/intel/openvino/configuration.py | 6 ++---- optimum/intel/openvino/modeling_decoder.py | 1 + optimum/intel/openvino/modeling_visual_language.py | 4 +--- 5 files changed, 10 insertions(+), 16 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 8a27ccfa0b..7126a8ad35 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -150,7 +150,6 @@ Gemma3LMModelPatcher, Gemma4ImageEmbeddingsModelPatcher, 
Gemma4LMModelPatcher, - Gemma4PerLayerInputsGetterModelPatcher, GptJModelPatcher, GptNeoModelPatcher, GptNeoxModelPatcher, @@ -4347,9 +4346,10 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int dtype=float_dtype, ) if input_name == "image_position_ids": - import torch import math + import torch + # Create position ids as a grid. The patch count = h_patches * w_patches # where both are divisible by pooling_kernel_size for correct pooling. k = self.pooling_kernel_size @@ -4490,9 +4490,7 @@ def forward(self, input_ids: torch.Tensor): per_layer_inputs_tokens, torch.zeros_like(per_layer_inputs_tokens), ) - per_layer_inputs = self.language_model.get_per_layer_inputs( - per_layer_inputs_tokens, None - ) + per_layer_inputs = self.language_model.get_per_layer_inputs(per_layer_inputs_tokens, None) return per_layer_inputs model = PerLayerInputsModule( diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index ae5f0c179d..6a841db50e 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -52,6 +52,8 @@ ModelPatcher, gpt_oss_forward, override_arguments, +) +from optimum.exporters.onnx.model_patcher import ( sdpa_mask_without_vmap as _orig_sdpa_mask_without_vmap, ) from optimum.intel.utils.import_utils import is_diffusers_version, is_torch_version, is_transformers_version @@ -4946,7 +4948,6 @@ def gemma4_lm_forward( logits_to_keep: Union[int, torch.Tensor] = 0, **lm_kwargs, ): - from transformers.models.gemma4.modeling_gemma4 import Gemma4CausalLMOutputWithPast from optimum.exporters.onnx.model_patcher import preprocess_past_key_values output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -5078,9 +5079,7 @@ def gemma4_text_attention_forward( "sliding_window": self.sliding_window, } if not self.is_kv_shared_layer: - key_states, value_states = past_key_values.update( - key_states, 
value_states, self.layer_idx, cache_kwargs - ) + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) if self.store_full_length_kv: if not hasattr(past_key_values, "shared_layers"): past_key_values.shared_layers = {} diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index f1a1044ebf..e621efc396 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -763,13 +763,11 @@ def __init__( self._dataset_kwargs[key] = int(value) except ValueError: raise ValueError( - f"Invalid value '{value}' for seq_len in dataset spec '{dataset}'. " - f"Expected an integer." + f"Invalid value '{value}' for seq_len in dataset spec '{dataset}'. Expected an integer." ) else: raise ValueError( - f"Unsupported dataset option '{key}' in dataset spec '{dataset}'. " - f"Only 'seq_len' is supported." + f"Unsupported dataset option '{key}' in dataset spec '{dataset}'. Only 'seq_len' is supported." 
) else: # No options or list-of-str dataset diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 9ac0d15612..8691f6b887 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -33,6 +33,7 @@ from transformers.modeling_outputs import CausalLMOutputWithPast, ModelOutput from transformers.utils.hub import PushToHubMixin + try: from transformers.models.mamba.modeling_mamba import MambaCache except ImportError: diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index ac021b23fe..c09243a03b 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -3995,9 +3995,7 @@ def merge_vision_text_embeddings( return inputs_embeds, attention_mask, position_ids - def prepare_inputs_for_generation( - self, input_ids, mm_token_type_ids=None, image_position_ids=None, **kwargs - ): + def prepare_inputs_for_generation(self, input_ids, mm_token_type_ids=None, image_position_ids=None, **kwargs): model_inputs = super().prepare_inputs_for_generation(input_ids, **kwargs) model_inputs["mm_token_type_ids"] = mm_token_type_ids model_inputs["image_position_ids"] = image_position_ids From 5b0b29b4cdcac3ed1dc26803b2504170e8358fb0 Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Thu, 16 Apr 2026 14:08:54 +0400 Subject: [PATCH 189/222] Apply suggestion from @rkazants --- optimum/exporters/openvino/model_patcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index c3ec9e1151..fbb9c3c9a9 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -134,7 +134,7 @@ def sdpa_mask_without_vmap(batch_size, q_length=None, kv_length=None, q_offset=0 return _orig_sdpa_mask_without_vmap(batch_size, cache_position, 
kv_length, kv_offset=kv_offset, **kwargs) else: return _orig_sdpa_mask_without_vmap( - batch_size, q_length=q_length, kv_length=kv_length, q_offset=q_offset, kv_offset=kv_offset, **kwargs + batch_size, q_length=q_length, kv_length=kv_length, q_offset=q_offset, kv_offset=kv_offset, **kwargs) for idx, spec in enumerate(UNSUPPORTED_OPS_PATCHING_SPEC): if spec.name in { # onnx-exporter-specific fixes From 282724651da3e3079d43819e7ae9f2773034378b Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 16 Apr 2026 12:44:29 +0200 Subject: [PATCH 190/222] Remove wrong changes. --- .github/workflows/test_offline.yaml | 2 +- optimum/exporters/openvino/model_configs.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/test_offline.yaml b/.github/workflows/test_offline.yaml index 48f07b9396..830d77e1c3 100644 --- a/.github/workflows/test_offline.yaml +++ b/.github/workflows/test_offline.yaml @@ -34,7 +34,7 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip uv - uv pip install .[tests] + uv pip install .[diffusers,tests] - name: Test run: | diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 642d992052..abbcd40bd1 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -876,8 +876,6 @@ class ExaoneOpenVINOConfig(LlamaOpenVINOConfig): ) class Exaone4OpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.54.0" - # TODO (@echarlaix): add v5 support - MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( From bff96195d18a96d7b082ede1b517e4d3c633cc3e Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 16 Apr 2026 12:50:28 +0200 Subject: [PATCH 191/222] Remove wrong changes. 
--- optimum/exporters/openvino/model_configs.py | 4 ++-- tests/openvino/test_decoder.py | 6 +----- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index abbcd40bd1..24779f2a2a 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -876,6 +876,8 @@ class ExaoneOpenVINOConfig(LlamaOpenVINOConfig): ) class Exaone4OpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.54.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( @@ -1098,8 +1100,6 @@ class Phi3OpenVINOConfig(PhiOnnxConfig): ) class PhiMoEOpenVINOConfig(Phi3OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.46.0" - # TODO (@echarlaix): add v5 support - MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = PhiMoEModelPatcher diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 91696d8a46..4a9f3b23fb 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -104,15 +104,11 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("cohere2",) if is_transformers_version(">=", "4.46.0"): - SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo") + SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo", "phimoe") if is_transformers_version("<", "4.54.0"): SUPPORTED_ARCHITECTURES += ("deepseek",) - # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly - if is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ("phimoe",) - # gptq and awq install disabled for windows test environment if platform.system() != "Windows" and is_transformers_version("<", "4.56.0"): SUPPORTED_ARCHITECTURES += ("opt_gptq", "mixtral_awq") From 387dc8cef86befc527b810a8b1c1b7cf6d1b036c Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 16 Apr 2026 13:05:39 +0200 Subject: [PATCH 192/222] Remove wrong changes. 
--- optimum/exporters/openvino/model_patcher.py | 9 --------- optimum/intel/openvino/configuration.py | 6 ++++-- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index fbb9c3c9a9..0500fe1a0d 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -1768,15 +1768,6 @@ def __enter__(self): "long_mscale", None ) - if is_transformers_version("<", "5"): - for layer in self._model.model.layers: - layer.block_sparse_moe._orig_forward = layer.block_sparse_moe.forward - layer.block_sparse_moe.forward = types.MethodType( - _phi_moe_sparse_moe_block_forward, layer.block_sparse_moe - ) - else: - self._model.set_experts_implementation("batched_mm") - def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index e621efc396..f1a1044ebf 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -763,11 +763,13 @@ def __init__( self._dataset_kwargs[key] = int(value) except ValueError: raise ValueError( - f"Invalid value '{value}' for seq_len in dataset spec '{dataset}'. Expected an integer." + f"Invalid value '{value}' for seq_len in dataset spec '{dataset}'. " + f"Expected an integer." ) else: raise ValueError( - f"Unsupported dataset option '{key}' in dataset spec '{dataset}'. Only 'seq_len' is supported." + f"Unsupported dataset option '{key}' in dataset spec '{dataset}'. " + f"Only 'seq_len' is supported." ) else: # No options or list-of-str dataset From 14d057a30863811c0913373a2d10fd0c66076ef3 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 16 Apr 2026 13:34:56 +0200 Subject: [PATCH 193/222] Seq2seq test. 
--- tests/openvino/test_seq2seq.py | 17 ++++++++++++++++- tests/openvino/utils_tests.py | 1 + 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 9b246fb871..6ca18cd232 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -607,6 +607,9 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): # remote code models incompatible after transformers v5 SUPPORTED_ARCHITECTURES += ["internvl_chat", "minicpmv"] + if is_transformers_version(">=", "5.5"): + SUPPORTED_ARCHITECTURES += ["gemma4"] + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("llava_next_video",) @@ -770,8 +773,13 @@ def compare_outputs(inputs, ov_model, transformers_model, generation_config): set_seed(SEED) with torch.no_grad(): transformers_outputs = transformers_model(**transformers_inputs) + # Gemma4 performs poorly with random weights. + # The full model "google/gemma-4-E2B-it" passes this test with 4e-2 eps, but + # after saving it with random weights the converted model generates logits with max difference around 5. + # On the tiny model the error is about 0.1. + eps = 0.2 if model_arch == "gemma4" else 4e-3 self.assertTrue( - torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=4e-3), + torch.allclose(ov_outputs.logits, transformers_outputs.logits.to(torch.float32), atol=eps), f"Max abs diff {(torch.abs(ov_outputs.logits - transformers_outputs.logits).max())}", ) @@ -783,6 +791,13 @@ def compare_outputs(inputs, ov_model, transformers_model, generation_config): ov_model.generation_config.do_sample = False # minicpmo diverges after 20 tokens tokens_to_generate = 20 if model_arch == "minicpmo" else 30 + + # Gemma4 performs much poorly with random weights. 
+ # The full model "google/gemma-4-E2B-it" passes this test, while the same architecture + # saved with random weights generates tokens that do not match transformers. + if model_arch == "gemma4": + tokens_to_generate = 1 + gen_config = GenerationConfig( max_new_tokens=tokens_to_generate, min_new_tokens=tokens_to_generate, diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 13ba840bb8..36bb4ead6c 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -87,6 +87,7 @@ "got_ocr2": "optimum-intel-internal-testing/tiny-random-got-ocr2-hf", "gemma3_text": "optimum-intel-internal-testing/tiny-random-gemma3-text", "gemma3": "optimum-intel-internal-testing/tiny-random-gemma3", + "gemma4": "optimum-intel-internal-testing/tiny-random-gemma4", "falcon": "optimum-intel-internal-testing/really-tiny-falcon-testing", "falcon-40b": "optimum-intel-internal-testing/tiny-random-falcon-40b", "falcon_mamba": "optimum-intel-internal-testing/tiny-falcon-mamba", From a117774550341b1e446c954d7dd0e94babfb62be Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 16 Apr 2026 14:05:43 +0200 Subject: [PATCH 194/222] Quantization tests. 
--- optimum/exporters/openvino/model_patcher.py | 5 ++++- tests/openvino/test_quantization.py | 3 +++ tests/openvino/utils_tests.py | 6 ++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 0500fe1a0d..ee85a36aef 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -134,7 +134,10 @@ def sdpa_mask_without_vmap(batch_size, q_length=None, kv_length=None, q_offset=0 return _orig_sdpa_mask_without_vmap(batch_size, cache_position, kv_length, kv_offset=kv_offset, **kwargs) else: return _orig_sdpa_mask_without_vmap( - batch_size, q_length=q_length, kv_length=kv_length, q_offset=q_offset, kv_offset=kv_offset, **kwargs) + batch_size, q_length=q_length, kv_length=kv_length, q_offset=q_offset, kv_offset=kv_offset, **kwargs + ) + + for idx, spec in enumerate(UNSUPPORTED_OPS_PATCHING_SPEC): if spec.name in { # onnx-exporter-specific fixes diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 63c13de956..f0b5f4bcd1 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -1089,6 +1089,9 @@ class OVWeightCompressionTest(unittest.TestCase): ] ) + if is_transformers_version(">=", "5.5.0"): + SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "gemma4", True)) + SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = [ (OVStableDiffusionPipeline, "stable-diffusion", 72, 195), (OVStableDiffusionXLPipeline, "stable-diffusion-xl", 84, 331), diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 36bb4ead6c..1f063de6b8 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -373,6 +373,12 @@ "hunyuan_v1_dense": {"model": 32}, "qwen3_eagle3": {"model": 20}, "qwen3_next": {"model": 100}, + "gemma4": { + "lm_model": 58, + "text_embeddings_model": 1, + "vision_embeddings_model": 
10, + "text_embeddings_per_layer_model": 1, + }, } TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg" From ebb1755e0be3d6231959222da89f41158e566935 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 16 Apr 2026 14:06:33 +0200 Subject: [PATCH 195/222] Docs update. --- docs/source/openvino/models.mdx | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx index e967e9d22e..ce659dfae5 100644 --- a/docs/source/openvino/models.mdx +++ b/docs/source/openvino/models.mdx @@ -73,6 +73,7 @@ Here is the list of the supported architectures : - Gemma - Gemma 2 - Gemma 3 +- Gemma 4 - GOT-OCR 2.0 - Granite - Granite 4.0 From bb43c2445f262cac952812936808063b75d741d0 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 16 Apr 2026 15:31:22 +0200 Subject: [PATCH 196/222] Removed redundant import. --- optimum/exporters/openvino/model_patcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index ee85a36aef..781b842738 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -63,7 +63,6 @@ from optimum.exporters.onnx.model_patcher import ( sdpa_mask_without_vmap as _orig_sdpa_mask_without_vmap, ) -from optimum.intel.utils.import_utils import is_diffusers_version, is_torch_version, is_transformers_version from ._ov_ops import convert_recurrent_attention_cell From 1fccd408ad0f29e9f4ed7f80d3414c0bc8f8bda0 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 17 Apr 2026 10:17:35 +0200 Subject: [PATCH 197/222] Fix Gemma 4 26B-A4B (MoE) export and inference --- optimum/exporters/openvino/model_configs.py | 15 +++++++++++++- optimum/exporters/openvino/model_patcher.py | 22 +++++++++------------ optimum/intel/openvino/configuration.py | 22 +++++++++++++++++++++ 3 files changed, 45 insertions(+), 14 deletions(-) diff --git 
a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 24779f2a2a..fa341c9e5a 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1518,6 +1518,10 @@ def __init__( self.layer_types = normalized_config.config.layer_types self.num_kv_shared_layers = normalized_config.config.num_kv_shared_layers self.sliding_window = normalized_config.config.sliding_window + # Full-attention layers use fewer KV heads than sliding-attention layers (e.g. 2 vs 8 for 26B-A4B) + self.num_global_key_value_heads = getattr( + normalized_config.config, "num_global_key_value_heads", None + ) or self.num_key_value_heads def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): # some layers do not produce their own KV-cache, they use the shared KV-cache @@ -1537,7 +1541,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int else: shape = ( self.batch_size, - self.num_key_value_heads, + self.num_global_key_value_heads, self.sequence_length, self.global_head_dim, ) @@ -4462,6 +4466,15 @@ def __init__(self, language_model, vocab_size_per_layer_input: int, config): self.config = config def forward(self, input_ids: torch.Tensor): + # 26B-A4B has hidden_size_per_layer_input=0 (PLE disabled) + if self.language_model.config.hidden_size_per_layer_input <= 0: + return torch.zeros( + input_ids.shape[0], + input_ids.shape[1], + self.language_model.config.num_hidden_layers, + 0, + dtype=torch.float32, + ) # Replace multimodal token IDs with pad_token_id to match # HF Gemma4Model.forward which uses llm_input_ids where # image/video/audio positions are set to pad_token_id diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 781b842738..3ff3c2866a 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -5192,27 +5192,23 
@@ def _gemma4_moe_block_forward(self, hidden_states, top_k_index, top_k_weights): # hidden_states: [B*S, hidden_dim] # top_k_index: [B*S, K], top_k_weights: [B*S, K] num_tokens = hidden_states.shape[0] - dtype = hidden_states.dtype # Compute all expert outputs via batched matmul # expanded: [E, B*S, hidden_dim] expanded_hidden = hidden_states.unsqueeze(0).expand(self.num_experts, -1, -1) # gate_up_proj: [E, 2*inter, hidden] -> transpose to [E, hidden, 2*inter] - gate_up = torch.bmm(expanded_hidden, self.gate_up_proj.to(dtype).transpose(1, 2)) + gate_up = torch.bmm(expanded_hidden, self.gate_up_proj.transpose(1, 2)) gate, up = gate_up.chunk(2, dim=-1) intermediate = self.act_fn(gate) * up # down_proj: [E, hidden, inter] -> transpose to [E, inter, hidden] - expert_outputs = torch.bmm(intermediate, self.down_proj.to(dtype).transpose(1, 2)) + expert_outputs = torch.bmm(intermediate, self.down_proj.transpose(1, 2)) # expert_outputs: [E, B*S, hidden_dim] - # Apply per-expert scale: [E] -> [E, 1, 1] - expert_outputs = expert_outputs * self.per_expert_scale.to(dtype).unsqueeze(-1).unsqueeze(-1) - # Build full routing weight matrix [B*S, E] from sparse top-k - full_weights = torch.zeros(num_tokens, self.num_experts, dtype=dtype, device=hidden_states.device) - full_weights.scatter_add_(1, top_k_index, top_k_weights.to(dtype)) + full_weights = torch.zeros(num_tokens, self.num_experts, dtype=hidden_states.dtype, device=hidden_states.device) + full_weights.scatter_add_(1, top_k_index, top_k_weights) # Weighted sum over experts: [B*S, 1, E] @ [B*S, E, hidden_dim] -> [B*S, hidden_dim] expert_outputs = expert_outputs.permute(1, 0, 2) # [B*S, E, hidden_dim] @@ -5247,9 +5243,9 @@ def __enter__(self): for decoder_layer in self._model.model.language_model.layers: decoder_layer.self_attn.orig_forward = decoder_layer.self_attn.forward decoder_layer.self_attn.forward = types.MethodType(gemma4_text_attention_forward, decoder_layer.self_attn) - if hasattr(decoder_layer, "moe"): - 
decoder_layer.moe._orig_forward = decoder_layer.moe.forward - decoder_layer.moe.forward = types.MethodType(_gemma4_moe_block_forward, decoder_layer.moe) + if hasattr(decoder_layer, "experts"): + decoder_layer.experts._orig_forward = decoder_layer.experts.forward + decoder_layer.experts.forward = types.MethodType(_gemma4_moe_block_forward, decoder_layer.experts) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -5259,8 +5255,8 @@ def __exit__(self, exc_type, exc_value, traceback): for decoder_layer in self._model.model.language_model.layers: decoder_layer.self_attn.forward = decoder_layer.self_attn.orig_forward - if hasattr(decoder_layer, "moe") and hasattr(decoder_layer.moe, "_orig_forward"): - decoder_layer.moe.forward = decoder_layer.moe._orig_forward + if hasattr(decoder_layer, "experts") and hasattr(decoder_layer.experts, "_orig_forward"): + decoder_layer.experts.forward = decoder_layer.experts._orig_forward setattr(self._model, self.orig_forward_name, self.model_orig_forward) setattr(self._model.model, "forward", self.model_orig_language_model_forward) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index f1a1044ebf..1531d6e4aa 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -436,6 +436,18 @@ class OVQuantizationMethod(str, Enum): "dataset": "contextual", "scale_estimation": True, }, + "google/gemma-4-26B-A4B-it": { + "bits": 4, + "sym": False, + "group_size": 64, + "group_size_fallback": "adjust", + }, + "google/gemma-4-26B-A4B": { + "bits": 4, + "sym": False, + "group_size": 64, + "group_size_fallback": "adjust", + }, } _DEFAULT_8BIT_WQ_CONFIGS = { @@ -567,6 +579,16 @@ class OVQuantizationMethod(str, Enum): ], }, }, + "google/gemma-4-26B-A4B-it": { + "lm_model": { + "patterns": [".*router.*"], + }, + }, + "google/gemma-4-26B-A4B": { + "lm_model": { + "patterns": [".*router.*"], + }, + }, } From 
46752941a182e5868681ec4b0bdabc575f3f5461 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 22 Apr 2026 13:18:17 +0200 Subject: [PATCH 198/222] Fix accuracy on tiny model. --- optimum/exporters/openvino/model_patcher.py | 29 ++++++++++++++++++--- tests/openvino/test_seq2seq.py | 15 +++-------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 3ff3c2866a..f23c80aabf 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -5110,9 +5110,8 @@ def gemma4_eager_attention_forward_patched( if attention_mask is not None: causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] attn_weights = attn_weights + causal_mask - eps = 0.0000001 - attn_weights = nn.functional.softmax(attn_weights + eps, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) attn_output = torch.matmul(attn_weights, value_states) attn_output = attn_output.transpose(1, 2).contiguous() @@ -5178,7 +5177,7 @@ def gemma4_text_attention_forward( value_states, attention_mask, dropout=self.attention_dropout if self.training else 0.0, - scaling=1.0, + scaling=self.scaling, sliding_window=self.sliding_window, **kwargs, ) @@ -8891,9 +8890,18 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward +# OpenVINO has a bug due to which Clamp(-inf, inf) doesn't work correctly: CVS-185473. +# When min == -inf and max == inf, Clamp is equivalent to an identity operation and +# can be removed from the model, which serves as a workaround for the issue. 
+def patched_gemma4_clippable_linear_forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.linear(hidden_states) + return hidden_states + class Gemma4ImageEmbeddingsModelPatcher(CommonImageEmbeddingsModelPatcher): def __init__(self, config, model, model_kwargs): super().__init__(config, model, model_kwargs) + from transformers.models.gemma4.modeling_gemma4 import Gemma4ClippableLinear + # Get the vision encoder - it's at model.model.vision_tower.encoder vision_model = model.model.vision_tower if is_transformers_version(">=", "5") else model.vision_tower self._vision_encoder = vision_model.encoder @@ -8936,6 +8944,21 @@ def patched_encoder_forward(inputs_embeds, attention_mask=None, pixel_position_i self._orig_encoder_forward = orig_encoder_forward self._vision_encoder.forward = patched_encoder_forward + for layer in self._vision_encoder.layers: + for module in layer.modules(): + if isinstance(module, Gemma4ClippableLinear) and module.use_clipped_linears: + if module.input_min == -float("inf") and module.input_max == float("inf") and module.output_min == -float("inf") and module.output_max == float("inf"): + module.orig_forward = module.forward + module.forward = types.MethodType(patched_gemma4_clippable_linear_forward, module) + def __exit__(self, exc_type, exc_value, traceback): + from transformers.models.gemma4.modeling_gemma4 import Gemma4ClippableLinear self._vision_encoder.forward = self._orig_encoder_forward super().__exit__(exc_type, exc_value, traceback) + + for layer in self._vision_encoder.layers: + for module in layer.modules(): + if isinstance(module, Gemma4ClippableLinear) and module.use_clipped_linears: + if module.input_min == -float("inf") and module.input_max == float("inf") and module.output_min == -float( + "inf") and module.output_max == float("inf"): + module.forward = module.orig_forward diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 6ca18cd232..08ae063518 100644 --- 
a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -686,6 +686,8 @@ def test_find_untested_architectures(self): @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): + if model_arch != "gemma4": + return def compare_outputs(inputs, ov_model, transformers_model, generation_config): transformers_inputs = copy.deepcopy(inputs) ov_outputs = ov_model.generate(**inputs, generation_config=generation_config) @@ -773,13 +775,8 @@ def compare_outputs(inputs, ov_model, transformers_model, generation_config): set_seed(SEED) with torch.no_grad(): transformers_outputs = transformers_model(**transformers_inputs) - # Gemma4 performs poorly with random weights. - # The full model "google/gemma-4-E2B-it" passes this test with 4e-2 eps, but - # after saving it with random weights the converted model generates logits with max difference around 5. - # On the tiny model the error is about 0.1. - eps = 0.2 if model_arch == "gemma4" else 4e-3 self.assertTrue( - torch.allclose(ov_outputs.logits, transformers_outputs.logits.to(torch.float32), atol=eps), + torch.allclose(ov_outputs.logits, transformers_outputs.logits.to(torch.float32), atol=4e-3), f"Max abs diff {(torch.abs(ov_outputs.logits - transformers_outputs.logits).max())}", ) @@ -792,12 +789,6 @@ def compare_outputs(inputs, ov_model, transformers_model, generation_config): # minicpmo diverges after 20 tokens tokens_to_generate = 20 if model_arch == "minicpmo" else 30 - # Gemma4 performs much poorly with random weights. - # The full model "google/gemma-4-E2B-it" passes this test, while the same architecture - # saved with random weights generates tokens that do not match transformers. 
- if model_arch == "gemma4": - tokens_to_generate = 1 - gen_config = GenerationConfig( max_new_tokens=tokens_to_generate, min_new_tokens=tokens_to_generate, From 19c605cd900a781ce93efbeccbcf260c6266f494 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 22 Apr 2026 13:20:27 +0200 Subject: [PATCH 199/222] Code style. --- optimum/exporters/openvino/model_configs.py | 6 +++--- optimum/exporters/openvino/model_patcher.py | 17 ++++++++++++++--- tests/openvino/test_seq2seq.py | 2 -- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index fa341c9e5a..2396ab4767 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1519,9 +1519,9 @@ def __init__( self.num_kv_shared_layers = normalized_config.config.num_kv_shared_layers self.sliding_window = normalized_config.config.sliding_window # Full-attention layers use fewer KV heads than sliding-attention layers (e.g. 
2 vs 8 for 26B-A4B) - self.num_global_key_value_heads = getattr( - normalized_config.config, "num_global_key_value_heads", None - ) or self.num_key_value_heads + self.num_global_key_value_heads = ( + getattr(normalized_config.config, "num_global_key_value_heads", None) or self.num_key_value_heads + ) def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): # some layers do not produce their own KV-cache, they use the shared KV-cache diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index f23c80aabf..79bb35e79c 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -8897,6 +8897,7 @@ def patched_gemma4_clippable_linear_forward(self, hidden_states: torch.Tensor) - hidden_states = self.linear(hidden_states) return hidden_states + class Gemma4ImageEmbeddingsModelPatcher(CommonImageEmbeddingsModelPatcher): def __init__(self, config, model, model_kwargs): super().__init__(config, model, model_kwargs) @@ -8947,18 +8948,28 @@ def patched_encoder_forward(inputs_embeds, attention_mask=None, pixel_position_i for layer in self._vision_encoder.layers: for module in layer.modules(): if isinstance(module, Gemma4ClippableLinear) and module.use_clipped_linears: - if module.input_min == -float("inf") and module.input_max == float("inf") and module.output_min == -float("inf") and module.output_max == float("inf"): + if ( + module.input_min == -float("inf") + and module.input_max == float("inf") + and module.output_min == -float("inf") + and module.output_max == float("inf") + ): module.orig_forward = module.forward module.forward = types.MethodType(patched_gemma4_clippable_linear_forward, module) def __exit__(self, exc_type, exc_value, traceback): from transformers.models.gemma4.modeling_gemma4 import Gemma4ClippableLinear + self._vision_encoder.forward = self._orig_encoder_forward super().__exit__(exc_type, exc_value, 
traceback) for layer in self._vision_encoder.layers: for module in layer.modules(): if isinstance(module, Gemma4ClippableLinear) and module.use_clipped_linears: - if module.input_min == -float("inf") and module.input_max == float("inf") and module.output_min == -float( - "inf") and module.output_max == float("inf"): + if ( + module.input_min == -float("inf") + and module.input_max == float("inf") + and module.output_min == -float("inf") + and module.output_max == float("inf") + ): module.forward = module.orig_forward diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 08ae063518..14cfb3f7f5 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -686,8 +686,6 @@ def test_find_untested_architectures(self): @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): - if model_arch != "gemma4": - return def compare_outputs(inputs, ov_model, transformers_model, generation_config): transformers_inputs = copy.deepcopy(inputs) ov_outputs = ov_model.generate(**inputs, generation_config=generation_config) From 7bd6e48e68eeb59f06024b749556d6a80645c4cf Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 22 Apr 2026 14:22:17 +0200 Subject: [PATCH 200/222] Test export. 
--- optimum/exporters/openvino/model_patcher.py | 7 ++----- tests/openvino/test_export.py | 3 +++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 79bb35e79c..55d9ccdeab 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -52,6 +52,7 @@ ModelPatcher, gpt_oss_forward, override_arguments, + sdpa_mask_without_vmap as _orig_sdpa_mask_without_vmap ) from optimum.intel.utils.import_utils import ( is_diffusers_version, @@ -60,10 +61,6 @@ is_transformers_version, ) -from optimum.exporters.onnx.model_patcher import ( - sdpa_mask_without_vmap as _orig_sdpa_mask_without_vmap, -) - from ._ov_ops import convert_recurrent_attention_cell @@ -122,7 +119,7 @@ def sdpa_mask_without_vmap(batch_size, q_length=None, kv_length=None, q_offset=0 import inspect sig = inspect.signature(_orig_sdpa_mask_without_vmap) - if "cache_position" in sig.parameters: + if is_transformers_version(">=", "5.5") and "cache_position" in sig.parameters and q_length is not None: # Old optimum signature: (batch_size, cache_position, kv_length, kv_offset, ...) 
cache_position = torch.arange(q_length, dtype=torch.long) + q_offset kwargs.pop("q_offset", None) diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index e9c7696c2d..f2828be1d4 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -110,6 +110,9 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version(">=", "4.55.0") and is_transformers_version("<", "4.58.0"): SUPPORTED_ARCHITECTURES.update({"afmoe": OVModelForCausalLM}) + if is_transformers_version(">=", "4.55.0"): + SUPPORTED_ARCHITECTURES.update({"gemma4": OVModelForVisualCausalLM}) + if is_transformers_version(">=", "4.57.0"): SUPPORTED_ARCHITECTURES.update({"hunyuan_v1_dense": OVModelForCausalLM}) From 18fbdc4ac33d6185590e25c68ed493e9e2d84701 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 22 Apr 2026 16:16:36 +0200 Subject: [PATCH 201/222] Minor fix. --- tests/openvino/test_export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index f2828be1d4..acd03802c9 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -110,7 +110,7 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version(">=", "4.55.0") and is_transformers_version("<", "4.58.0"): SUPPORTED_ARCHITECTURES.update({"afmoe": OVModelForCausalLM}) - if is_transformers_version(">=", "4.55.0"): + if is_transformers_version(">=", "5.5.0"): SUPPORTED_ARCHITECTURES.update({"gemma4": OVModelForVisualCausalLM}) if is_transformers_version(">=", "4.57.0"): From ac8f8d890a297ff1ece904c2a684351ae20246fd Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 23 Apr 2026 14:33:33 +0200 Subject: [PATCH 202/222] Fix use_bidirectional_attention. 
--- optimum/exporters/openvino/model_configs.py | 7 +- optimum/exporters/openvino/model_patcher.py | 85 ++++++++++++++++++- .../openvino/modeling_visual_language.py | 31 ++++++- 3 files changed, 116 insertions(+), 7 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 2396ab4767..a9f56f292b 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -4434,13 +4434,18 @@ def with_behavior(self, behavior: Union[str, Gemma4ConfigBehavior]): if behavior == Gemma4ConfigBehavior.LANGUAGE: model_type = "gemma4_text" + inputs_update = { + "per_layer_inputs": {0: "batch_size", 1: "sequence_length", 2: "num_hidden_layers"}, + } + if getattr(self._orig_config.get_text_config(), "use_bidirectional_attention", None) == "vision": + inputs_update["token_type_ids"] = {0: "batch_size", 1: "sequence_length"} return get_vlm_text_generation_config( model_type, self._orig_config.text_config, self.int_dtype, self.float_dtype, model_patcher=Gemma4LMModelPatcher, - inputs_update={"per_layer_inputs": {0: "batch_size", 1: "sequence_length", 2: "num_hidden_layers"}}, + inputs_update=inputs_update, ) if behavior == Gemma4ConfigBehavior.TEXT_EMBEDDINGS_PER_LAYER: config = self.__class__( diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 55d9ccdeab..727885912b 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4945,6 +4945,75 @@ def _gemma4_project_per_layer_inputs( return (per_layer_projection + per_layer_inputs) * self.per_layer_input_scale +def _create_gemma4_bidirectional_mask_dict(attention_mask_2d, mm_token_type_ids, inputs_embeds, sliding_window): + """ + Creates a dict of causal masks with bidirectional attention for vision tokens + on sliding_attention layers, matching the behavior of transformers' + create_causal_mask_mapping when 
use_bidirectional_attention == "vision". + + Args: + attention_mask_2d: [batch, total_len] 2D attention mask (1=attend, 0=pad) + mm_token_type_ids: [batch, total_len] token type ids (0=text, 1=image, 2=video/audio) + inputs_embeds: [batch, seq_len, hidden_size] + sliding_window: int, sliding window size + + Returns: + dict with "full_attention" and "sliding_attention" 4D masks + """ + dtype = inputs_embeds.dtype + device = inputs_embeds.device + min_dtype = torch.finfo(dtype).min + + batch_size = inputs_embeds.shape[0] + seq_len = inputs_embeds.shape[1] + target_len = attention_mask_2d.shape[-1] + past_len = target_len - seq_len + + # Standard causal mask [seq_len, target_len] + causal_mask = torch.full((seq_len, target_len), min_dtype, dtype=dtype, device=device) + if seq_len != 1: + causal_mask = torch.triu(causal_mask, diagonal=past_len + 1) + + # Apply padding from attention_mask_2d + padding_mask = (1.0 - attention_mask_2d[:, None, None, :].to(dtype=dtype, device=device)) * min_dtype + full_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + padding_mask + + # Sliding window causal mask + sliding_mask = full_mask.clone() + if sliding_window is not None: + row_pos = torch.arange(seq_len, device=device).unsqueeze(1) + past_len + col_pos = torch.arange(target_len, device=device).unsqueeze(0) + beyond_window = (row_pos - col_pos) >= sliding_window + sliding_mask = sliding_mask.masked_fill(beyond_window[None, None, :, :], min_dtype) + + # Apply bidirectional masking for vision tokens (only on sliding_attention mask) + # mm_token_type_ids: [batch, total_len] - 0=text, 1=image, 2=video/audio + is_vision = (mm_token_type_ids == 1) | (mm_token_type_ids == 2) + + # Group contiguous vision tokens (trace-friendly, no in-place ops) + # Shift is_vision right by 1 position, padding with False on the left + is_prev_vision = torch.nn.functional.pad(is_vision[:, :-1].to(dtype=torch.int32), (1, 0), value=0).bool() + new_vision_starts = is_vision & 
~is_prev_vision + vision_group_ids = torch.cumsum(new_vision_starts.to(dtype=torch.int32), dim=1) - 1 + vision_group_ids = torch.where(is_vision, vision_group_ids, torch.tensor(-1, dtype=torch.int32, device=device)) + + # Query group IDs correspond to positions [past_len : past_len + seq_len] + query_groups = vision_group_ids[:, past_len : past_len + seq_len] # [batch, seq_len] + key_groups = vision_group_ids # [batch, total_len] + + # same_group[b, q, k] = True iff query and key are in the same non-text vision group + same_group = (query_groups.unsqueeze(2) == key_groups.unsqueeze(1)) & (key_groups.unsqueeze(1) >= 0) + same_group = same_group.unsqueeze(1) # [batch, 1, seq_len, total_len] + + # Undo masking for same-group vision tokens in sliding mask + sliding_mask = sliding_mask.masked_fill(same_group, 0.0) + + return { + "full_attention": full_mask, + "sliding_attention": sliding_mask, + } + + def gemma4_language_model_forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -4985,6 +5054,18 @@ def gemma4_language_model_forward( special_image_mask_expanded = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds) inputs_embeds = inputs_embeds.masked_scatter(special_image_mask_expanded, image_features) + # Create bidirectional causal mask mapping when use_bidirectional_attention == "vision" + use_bidirectional = ( + getattr(self.config.get_text_config(), "use_bidirectional_attention", None) == "vision" + ) + if use_bidirectional and mm_token_type_ids is not None: + attention_mask = _create_gemma4_bidirectional_mask_dict( + attention_mask, + mm_token_type_ids, + inputs_embeds, + self.model.language_model.config.sliding_window, + ) + outputs = self.model.language_model( input_ids=None, per_layer_inputs=per_layer_inputs, @@ -5015,12 +5096,12 @@ def gemma4_lm_forward( past_key_values: Optional[Cache] = None, inputs_embeds: Optional[torch.FloatTensor] = None, per_layer_inputs=None, + token_type_ids: Optional[torch.LongTensor] = None, input_ids: 
Optional[torch.LongTensor] = None, pixel_values: Optional[torch.FloatTensor] = None, pixel_values_videos: Optional[torch.FloatTensor] = None, input_features: Optional[torch.FloatTensor] = None, input_features_mask: Optional[torch.Tensor] = None, - mm_token_type_ids: Optional[torch.LongTensor] = None, cache_position: Optional[torch.LongTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, @@ -5050,7 +5131,7 @@ def gemma4_lm_forward( input_features_mask=input_features_mask, position_ids=position_ids, past_key_values=past_key_values, - mm_token_type_ids=mm_token_type_ids, + mm_token_type_ids=token_type_ids, cache_position=cache_position, inputs_embeds=inputs_embeds, labels=labels, diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index c09243a03b..c83010684b 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -211,7 +211,11 @@ def prepare_inputs( if "token_type_ids" in self.input_names: if token_type_ids is None: - token_type_ids = np.zeros(inputs_embeds.shape[:2], dtype=int) + # Use attention_mask shape to match total sequence length (including past tokens) + if attention_mask is not None: + token_type_ids = np.zeros(attention_mask.shape, dtype=int) + else: + token_type_ids = np.zeros(inputs_embeds.shape[:2], dtype=int) inputs["token_type_ids"] = token_type_ids if "beam_idx" in self.input_names: @@ -3995,12 +3999,31 @@ def merge_vision_text_embeddings( return inputs_embeds, attention_mask, position_ids - def prepare_inputs_for_generation(self, input_ids, mm_token_type_ids=None, image_position_ids=None, **kwargs): - model_inputs = super().prepare_inputs_for_generation(input_ids, **kwargs) - model_inputs["mm_token_type_ids"] = mm_token_type_ids + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, inputs_embeds=None, pixel_values=None, + image_sizes=None, 
attention_mask=None, mm_token_type_ids=None, image_position_ids=None, **kwargs + ): + model_inputs = super().prepare_inputs_for_generation( + input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, + pixel_values=pixel_values, image_sizes=image_sizes, attention_mask=attention_mask, **kwargs + ) + # Map mm_token_type_ids to token_type_ids for the OV language model input + model_inputs["token_type_ids"] = mm_token_type_ids model_inputs["image_position_ids"] = image_position_ids return model_inputs + def forward(self, input_ids, pixel_values=None, token_type_ids=None, **kwargs): + # Map mm_token_type_ids (from Gemma4 processor) to token_type_ids (OV language model input) + mm_token_type_ids = kwargs.pop("mm_token_type_ids", None) + if token_type_ids is None and mm_token_type_ids is not None: + token_type_ids = mm_token_type_ids + return super().forward( + input_ids=input_ids, + pixel_values=pixel_values, + token_type_ids=token_type_ids, + **kwargs, + ) + def _update_model_kwargs_for_generation( self, outputs, From ea694279d092bdffe1e510132989fd56112718e5 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 23 Apr 2026 15:05:45 +0200 Subject: [PATCH 203/222] MoE tests. 
--- optimum/exporters/openvino/model_patcher.py | 6 ++---- .../openvino/modeling_visual_language.py | 21 +++++++++++++++---- tests/openvino/test_export.py | 1 + tests/openvino/test_quantization.py | 1 + tests/openvino/test_seq2seq.py | 2 +- tests/openvino/utils_tests.py | 9 +++++++- 6 files changed, 30 insertions(+), 10 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 727885912b..4e61cb3d69 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -52,7 +52,7 @@ ModelPatcher, gpt_oss_forward, override_arguments, - sdpa_mask_without_vmap as _orig_sdpa_mask_without_vmap + sdpa_mask_without_vmap as _orig_sdpa_mask_without_vmap, ) from optimum.intel.utils.import_utils import ( is_diffusers_version, @@ -5055,9 +5055,7 @@ def gemma4_language_model_forward( inputs_embeds = inputs_embeds.masked_scatter(special_image_mask_expanded, image_features) # Create bidirectional causal mask mapping when use_bidirectional_attention == "vision" - use_bidirectional = ( - getattr(self.config.get_text_config(), "use_bidirectional_attention", None) == "vision" - ) + use_bidirectional = getattr(self.config.get_text_config(), "use_bidirectional_attention", None) == "vision" if use_bidirectional and mm_token_type_ids is not None: attention_mask = _create_gemma4_bidirectional_mask_dict( attention_mask, diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index c83010684b..89965be093 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -4000,12 +4000,25 @@ def merge_vision_text_embeddings( return inputs_embeds, attention_mask, position_ids def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, inputs_embeds=None, pixel_values=None, - image_sizes=None, attention_mask=None, mm_token_type_ids=None, 
image_position_ids=None, **kwargs + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + image_sizes=None, + attention_mask=None, + mm_token_type_ids=None, + image_position_ids=None, + **kwargs, ): model_inputs = super().prepare_inputs_for_generation( - input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, - pixel_values=pixel_values, image_sizes=image_sizes, attention_mask=attention_mask, **kwargs + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + pixel_values=pixel_values, + image_sizes=image_sizes, + attention_mask=attention_mask, + **kwargs, ) # Map mm_token_type_ids to token_type_ids for the OV language model input model_inputs["token_type_ids"] = mm_token_type_ids diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index acd03802c9..6ce46f2484 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -112,6 +112,7 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version(">=", "5.5.0"): SUPPORTED_ARCHITECTURES.update({"gemma4": OVModelForVisualCausalLM}) + SUPPORTED_ARCHITECTURES.update({"gemma4_moe": OVModelForVisualCausalLM}) if is_transformers_version(">=", "4.57.0"): SUPPORTED_ARCHITECTURES.update({"hunyuan_v1_dense": OVModelForCausalLM}) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index f0b5f4bcd1..c8404f01c3 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -1091,6 +1091,7 @@ class OVWeightCompressionTest(unittest.TestCase): if is_transformers_version(">=", "5.5.0"): SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "gemma4", True)) + SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "gemma4_moe", True)) SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = [ (OVStableDiffusionPipeline, "stable-diffusion", 72, 195), diff --git a/tests/openvino/test_seq2seq.py 
b/tests/openvino/test_seq2seq.py index 14cfb3f7f5..a8e9e7f6a5 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -608,7 +608,7 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES += ["internvl_chat", "minicpmv"] if is_transformers_version(">=", "5.5"): - SUPPORTED_ARCHITECTURES += ["gemma4"] + SUPPORTED_ARCHITECTURES += ["gemma4", "gemma4_moe"] # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version("<", "5"): diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 1f063de6b8..236deda76c 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -88,6 +88,7 @@ "gemma3_text": "optimum-intel-internal-testing/tiny-random-gemma3-text", "gemma3": "optimum-intel-internal-testing/tiny-random-gemma3", "gemma4": "optimum-intel-internal-testing/tiny-random-gemma4", + "gemma4_moe": "optimum-intel-internal-testing/tiny-random-gemma4-moe", "falcon": "optimum-intel-internal-testing/really-tiny-falcon-testing", "falcon-40b": "optimum-intel-internal-testing/tiny-random-falcon-40b", "falcon_mamba": "optimum-intel-internal-testing/tiny-falcon-mamba", @@ -374,11 +375,17 @@ "qwen3_eagle3": {"model": 20}, "qwen3_next": {"model": 100}, "gemma4": { - "lm_model": 58, + "lm_model": 54, "text_embeddings_model": 1, "vision_embeddings_model": 10, "text_embeddings_per_layer_model": 1, }, + "gemma4_moe": { + "lm_model": 44, + "text_embeddings_model": 1, + "vision_embeddings_model": 10, + "text_embeddings_per_layer_model": 0, + }, } TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg" From a97bf7d819da198d3ebdd3caedc46a323d3d1e8b Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 23 Apr 2026 15:09:45 +0200 Subject: [PATCH 204/222] Code style. 
--- optimum/exporters/openvino/model_patcher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 4e61cb3d69..792191100d 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -52,6 +52,8 @@ ModelPatcher, gpt_oss_forward, override_arguments, +) +from optimum.exporters.onnx.model_patcher import ( sdpa_mask_without_vmap as _orig_sdpa_mask_without_vmap, ) from optimum.intel.utils.import_utils import ( @@ -61,7 +63,6 @@ is_transformers_version, ) - from ._ov_ops import convert_recurrent_attention_cell From 32712e3f5196705c3e5c227d5d3e885b3a6706c2 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 28 Apr 2026 13:50:21 +0200 Subject: [PATCH 205/222] Workflow to test gemma4 on transformers==5.5 --- .../test_openvino_transformers_5_5.yml | 65 +++++++++++++++++++ pyproject.toml | 1 + 2 files changed, 66 insertions(+) create mode 100644 .github/workflows/test_openvino_transformers_5_5.yml diff --git a/.github/workflows/test_openvino_transformers_5_5.yml b/.github/workflows/test_openvino_transformers_5_5.yml new file mode 100644 index 0000000000..205fa00b9b --- /dev/null +++ b/.github/workflows/test_openvino_transformers_5_5.yml @@ -0,0 +1,65 @@ +name: OpenVINO - Test Gemma4 + +on: + workflow_dispatch: + push: + branches: + - main + - v*-release + pull_request: + branches: + - main + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read + +env: + UV_TORCH_BACKEND: cpu + UV_SYSTEM_PYTHON: true + TRANSFORMERS_IS_CI: true + HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + +jobs: + build: + strategy: + fail-fast: false + matrix: + test-pattern: + [ + "*export*", + "*seq2seq*", + "*quantization*", + ] + + runs-on: ubuntu-22.04 + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python + 
uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install dependencies + run: | + pip install --upgrade pip uv + uv pip install .[tests] librosa diffusers + + - name: Install transformers 5.5 + run: | + uv pip install transformers==5.5 + + - name: Login with fork PRs CI token + if: ${{ env.HF_TOKEN == '' }} + run: | + python tests/scripts/login_with_ci_token.py + + - name: Test Gemma4 with Pytest + run: | + pytest tests/openvino/${{ matrix.test-pattern }} -m gemma4 --durations=0 diff --git a/pyproject.toml b/pyproject.toml index bc066641fd..a201ccf730 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,4 +36,5 @@ known-first-party = ["optimum"] [tool.pytest.ini_options] markers = [ "run_slow", + "gemma4: tests for gemma4 and gemma4_moe architectures (require transformers>=5.5)", ] \ No newline at end of file From e1a1325dd27f5f915f073039302b8775e797c88e Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 28 Apr 2026 13:58:45 +0200 Subject: [PATCH 206/222] Workflow to test gemma4 on transformers==5.5 --- tests/openvino/conftest.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 tests/openvino/conftest.py diff --git a/tests/openvino/conftest.py b/tests/openvino/conftest.py new file mode 100644 index 0000000000..94fec0215a --- /dev/null +++ b/tests/openvino/conftest.py @@ -0,0 +1,10 @@ +import pytest + + +def pytest_collection_modifyitems(items): + """Dynamically add the 'gemma4' marker to every parameterized test whose + name contains 'gemma4' (this also covers 'gemma4_moe').""" + gemma4_marker = pytest.mark.gemma4 + for item in items: + if "gemma4" in item.name: + item.add_marker(gemma4_marker) From 28fb7f03f67389668f02faa91ff48240c9c9acce Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 28 Apr 2026 14:12:47 +0200 Subject: [PATCH 207/222] Workflow to test gemma4 on transformers==5.5 --- .github/workflows/test_openvino_transformers_5_5.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/.github/workflows/test_openvino_transformers_5_5.yml b/.github/workflows/test_openvino_transformers_5_5.yml index 205fa00b9b..10dee0ec9f 100644 --- a/.github/workflows/test_openvino_transformers_5_5.yml +++ b/.github/workflows/test_openvino_transformers_5_5.yml @@ -51,9 +51,9 @@ jobs: pip install --upgrade pip uv uv pip install .[tests] librosa diffusers - - name: Install transformers 5.5 + - name: Install transformers 5.5.0 run: | - uv pip install transformers==5.5 + uv pip install transformers==5.5.0 - name: Login with fork PRs CI token if: ${{ env.HF_TOKEN == '' }} From bdde55bcd9c035843588a3a5dfa17d235f352594 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 28 Apr 2026 16:43:11 +0200 Subject: [PATCH 208/222] Use latest openvino nightly for Gemma4. --- .github/workflows/test_openvino_transformers_5_5.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test_openvino_transformers_5_5.yml b/.github/workflows/test_openvino_transformers_5_5.yml index 10dee0ec9f..4aff0e5f43 100644 --- a/.github/workflows/test_openvino_transformers_5_5.yml +++ b/.github/workflows/test_openvino_transformers_5_5.yml @@ -51,9 +51,10 @@ jobs: pip install --upgrade pip uv uv pip install .[tests] librosa diffusers - - name: Install transformers 5.5.0 + - name: Install transformers 5.5.0 and latest openvino nightly run: | uv pip install transformers==5.5.0 + uv pip install --pre -U openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - name: Login with fork PRs CI token if: ${{ env.HF_TOKEN == '' }} From 601e6643d3fff37b84d7367f8c8ae3670fd0f699 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 28 Apr 2026 17:34:08 +0200 Subject: [PATCH 209/222] Fix compression test. 
--- tests/openvino/conftest.py | 5 +++-- tests/openvino/test_quantization.py | 10 ++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/tests/openvino/conftest.py b/tests/openvino/conftest.py index 94fec0215a..bf929db0c5 100644 --- a/tests/openvino/conftest.py +++ b/tests/openvino/conftest.py @@ -1,10 +1,11 @@ import pytest -def pytest_collection_modifyitems(items): +@pytest.hookimpl(tryfirst=True) +def pytest_collection_modifyitems(config, items): """Dynamically add the 'gemma4' marker to every parameterized test whose name contains 'gemma4' (this also covers 'gemma4_moe').""" gemma4_marker = pytest.mark.gemma4 for item in items: - if "gemma4" in item.name: + if "gemma4" in item.nodeid: item.add_marker(gemma4_marker) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index c8404f01c3..cdda7f00f0 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -1328,7 +1328,10 @@ def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_name, e self.assertEqual(OVWeightQuantizationConfig().to_dict(), loaded_config.quantization_config.to_dict()) self.assertFalse(model.model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) - @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION) + @parameterized.expand( + SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION, + name_func=lambda testcase_func, param_num, params: f"{testcase_func.__name__}_{parameterized.to_safe_name(params.args[1])}", + ) def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type, trust_remote_code): model = model_cls.from_pretrained( MODEL_NAMES[model_type], @@ -1550,7 +1553,10 @@ def test_ovmodel_stateful_load_with_compressed_weights(self, model_cls, model_ty expected_int8 = {k: {"int8": v} for k, v in expected_int8.items()} check_compression_state_per_model(self, model.ov_models, expected_int8) - @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION) + 
@parameterized.expand( + SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION, + name_func=lambda testcase_func, param_num, params: f"{testcase_func.__name__}_{parameterized.to_safe_name(params.args[1])}", + ) def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type, trust_remote_code): model = model_cls.from_pretrained( MODEL_NAMES[model_type], export=True, load_in_8bit=False, trust_remote_code=trust_remote_code From 9cc279d84644c0938593b59f77e3eb10faef260f Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 29 Apr 2026 11:01:56 +0200 Subject: [PATCH 210/222] Workflow corrections. --- ..._5_5.yml => test_openvino_preview_models.yml} | 16 ++++++++-------- tests/openvino/test_seq2seq.py | 2 ++ 2 files changed, 10 insertions(+), 8 deletions(-) rename .github/workflows/{test_openvino_transformers_5_5.yml => test_openvino_preview_models.yml} (93%) diff --git a/.github/workflows/test_openvino_transformers_5_5.yml b/.github/workflows/test_openvino_preview_models.yml similarity index 93% rename from .github/workflows/test_openvino_transformers_5_5.yml rename to .github/workflows/test_openvino_preview_models.yml index 4aff0e5f43..ba60048e0b 100644 --- a/.github/workflows/test_openvino_transformers_5_5.yml +++ b/.github/workflows/test_openvino_preview_models.yml @@ -46,21 +46,21 @@ jobs: with: python-version: "3.10" + - name: Login with fork PRs CI token + if: ${{ env.HF_TOKEN == '' }} + run: | + python tests/scripts/login_with_ci_token.py + - name: Install dependencies run: | pip install --upgrade pip uv uv pip install .[tests] librosa diffusers - - name: Install transformers 5.5.0 and latest openvino nightly + - name: Install latest openvino nightly run: | - uv pip install transformers==5.5.0 uv pip install --pre -U openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - - name: Login with fork PRs CI token - if: ${{ env.HF_TOKEN == '' }} - run: | - python 
tests/scripts/login_with_ci_token.py - - - name: Test Gemma4 with Pytest + - name: Preview Models Support run: | + uv pip install transformers==5.5.0 pytest tests/openvino/${{ matrix.test-pattern }} -m gemma4 --durations=0 diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 29b18f450f..23b3e73199 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -689,6 +689,8 @@ def test_find_untested_architectures(self): @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): + if model_arch == "gemma4": + return if model_arch in ("llama4", "minicpmv", "minicpmo") and is_openvino_version(">=", "2026.1.0"): self.skipTest("CVS-185350: OpenVINO 2026.1.0 inference results mismatch") From df8b17ff32fe7004fe807bb84d6db9c54631cb5d Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 29 Apr 2026 11:03:28 +0200 Subject: [PATCH 211/222] Wrong change. --- tests/openvino/test_seq2seq.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 23b3e73199..29b18f450f 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -689,8 +689,6 @@ def test_find_untested_architectures(self): @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): - if model_arch == "gemma4": - return if model_arch in ("llama4", "minicpmv", "minicpmo") and is_openvino_version(">=", "2026.1.0"): self.skipTest("CVS-185350: OpenVINO 2026.1.0 inference results mismatch") From 6a30dcb7c073e7b06b25b06fd5d2e72b51cbf6b2 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 29 Apr 2026 11:05:52 +0200 Subject: [PATCH 212/222] Minor correction. 
--- .github/workflows/test_openvino_preview_models.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test_openvino_preview_models.yml b/.github/workflows/test_openvino_preview_models.yml index ba60048e0b..7a3fd5f1f4 100644 --- a/.github/workflows/test_openvino_preview_models.yml +++ b/.github/workflows/test_openvino_preview_models.yml @@ -46,16 +46,16 @@ jobs: with: python-version: "3.10" - - name: Login with fork PRs CI token - if: ${{ env.HF_TOKEN == '' }} - run: | - python tests/scripts/login_with_ci_token.py - - name: Install dependencies run: | pip install --upgrade pip uv uv pip install .[tests] librosa diffusers + - name: Login with fork PRs CI token + if: ${{ env.HF_TOKEN == '' }} + run: | + python tests/scripts/login_with_ci_token.py + - name: Install latest openvino nightly run: | uv pip install --pre -U openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly From 018dbe6b9d388cdcd00d88dbc30972760b10e861 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 29 Apr 2026 12:27:21 +0200 Subject: [PATCH 213/222] Refactor token_type_ids passing. 
--- optimum/exporters/openvino/model_patcher.py | 5 +++-- optimum/intel/openvino/modeling_visual_language.py | 6 +----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 52587bb177..d1bb749da9 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4972,12 +4972,13 @@ def _create_gemma4_bidirectional_mask_dict(attention_mask_2d, mm_token_type_ids, # Standard causal mask [seq_len, target_len] causal_mask = torch.full((seq_len, target_len), min_dtype, dtype=dtype, device=device) - if seq_len != 1: - causal_mask = torch.triu(causal_mask, diagonal=past_len + 1) + causal_mask = torch.triu(causal_mask, diagonal=past_len + 1) # Apply padding from attention_mask_2d padding_mask = (1.0 - attention_mask_2d[:, None, None, :].to(dtype=dtype, device=device)) * min_dtype full_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + padding_mask + mm_token_type_ids = torch.nn.functional.pad(mm_token_type_ids, + (0, target_len - mm_token_type_ids.shape[-1]), value=0) # Sliding window causal mask sliding_mask = full_mask.clone() diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 89965be093..94d15c9aee 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -211,11 +211,7 @@ def prepare_inputs( if "token_type_ids" in self.input_names: if token_type_ids is None: - # Use attention_mask shape to match total sequence length (including past tokens) - if attention_mask is not None: - token_type_ids = np.zeros(attention_mask.shape, dtype=int) - else: - token_type_ids = np.zeros(inputs_embeds.shape[:2], dtype=int) + token_type_ids = np.zeros(inputs_embeds.shape[:2], dtype=int) inputs["token_type_ids"] = token_type_ids if "beam_idx" in self.input_names: From 
9f281b412c43d6e4ce39e135c3c0b584c191b329 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 29 Apr 2026 15:38:30 +0200 Subject: [PATCH 214/222] Update .github/workflows/test_openvino_preview_models.yml Co-authored-by: Roman Kazantsev --- .github/workflows/test_openvino_preview_models.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_openvino_preview_models.yml b/.github/workflows/test_openvino_preview_models.yml index 7a3fd5f1f4..56702b46a2 100644 --- a/.github/workflows/test_openvino_preview_models.yml +++ b/.github/workflows/test_openvino_preview_models.yml @@ -60,7 +60,7 @@ jobs: run: | uv pip install --pre -U openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - - name: Preview Models Support + - name: Gemma 4 Validation run: | uv pip install transformers==5.5.0 pytest tests/openvino/${{ matrix.test-pattern }} -m gemma4 --durations=0 From 21b6ce8d3f0f51ac9966dfcf51afb3cfa38171f0 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 29 Apr 2026 15:38:42 +0200 Subject: [PATCH 215/222] Update .github/workflows/test_openvino_preview_models.yml Co-authored-by: Roman Kazantsev --- .github/workflows/test_openvino_preview_models.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_openvino_preview_models.yml b/.github/workflows/test_openvino_preview_models.yml index 56702b46a2..0e162e1f42 100644 --- a/.github/workflows/test_openvino_preview_models.yml +++ b/.github/workflows/test_openvino_preview_models.yml @@ -1,4 +1,4 @@ -name: OpenVINO - Test Gemma4 +name: Preview Models Support Validation on: workflow_dispatch: From 0c2f4111d33eb2845e69a00328716ac68fdb486c Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 29 Apr 2026 16:35:00 +0200 Subject: [PATCH 216/222] Minor corrections. 
--- optimum/intel/openvino/modeling_visual_language.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 94d15c9aee..42a4866c38 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -211,7 +211,7 @@ def prepare_inputs( if "token_type_ids" in self.input_names: if token_type_ids is None: - token_type_ids = np.zeros(inputs_embeds.shape[:2], dtype=int) + token_type_ids = np.zeros(inputs_embeds.shape[:2], dtype=int) inputs["token_type_ids"] = token_type_ids if "beam_idx" in self.input_names: @@ -796,7 +796,7 @@ def forward( additional_kwargs["per_layer_inputs"] = extra_outputs[0] return self.language_model.forward( - input_ids=input_ids, + input_ids=None, inputs_embeds=inputs_embeds, attention_mask=attention_mask, position_ids=position_ids, From 87f4becd7a47af5454eabb6628dadc4e9f08a24c Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 29 Apr 2026 16:36:21 +0200 Subject: [PATCH 217/222] Code style. 
--- optimum/exporters/openvino/model_patcher.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index d1bb749da9..16b72526f6 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4977,8 +4977,9 @@ def _create_gemma4_bidirectional_mask_dict(attention_mask_2d, mm_token_type_ids, # Apply padding from attention_mask_2d padding_mask = (1.0 - attention_mask_2d[:, None, None, :].to(dtype=dtype, device=device)) * min_dtype full_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + padding_mask - mm_token_type_ids = torch.nn.functional.pad(mm_token_type_ids, - (0, target_len - mm_token_type_ids.shape[-1]), value=0) + mm_token_type_ids = torch.nn.functional.pad( + mm_token_type_ids, (0, target_len - mm_token_type_ids.shape[-1]), value=0 + ) # Sliding window causal mask sliding_mask = full_mask.clone() From a27437815040776deabf0b2cc7af989ab873032f Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 30 Apr 2026 10:33:49 +0200 Subject: [PATCH 218/222] Reuse LFM2 MoE patching. 
--- optimum/exporters/openvino/model_patcher.py | 31 +-------------------- 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 16b72526f6..3e66401bec 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -5266,35 +5266,6 @@ def gemma4_text_attention_forward( return attn_output, attn_weights -def _gemma4_moe_block_forward(self, hidden_states, top_k_index, top_k_weights): - # hidden_states: [B*S, hidden_dim] - # top_k_index: [B*S, K], top_k_weights: [B*S, K] - num_tokens = hidden_states.shape[0] - - # Compute all expert outputs via batched matmul - # expanded: [E, B*S, hidden_dim] - expanded_hidden = hidden_states.unsqueeze(0).expand(self.num_experts, -1, -1) - - # gate_up_proj: [E, 2*inter, hidden] -> transpose to [E, hidden, 2*inter] - gate_up = torch.bmm(expanded_hidden, self.gate_up_proj.transpose(1, 2)) - gate, up = gate_up.chunk(2, dim=-1) - intermediate = self.act_fn(gate) * up - - # down_proj: [E, hidden, inter] -> transpose to [E, inter, hidden] - expert_outputs = torch.bmm(intermediate, self.down_proj.transpose(1, 2)) - # expert_outputs: [E, B*S, hidden_dim] - - # Build full routing weight matrix [B*S, E] from sparse top-k - full_weights = torch.zeros(num_tokens, self.num_experts, dtype=hidden_states.dtype, device=hidden_states.device) - full_weights.scatter_add_(1, top_k_index, top_k_weights) - - # Weighted sum over experts: [B*S, 1, E] @ [B*S, E, hidden_dim] -> [B*S, hidden_dim] - expert_outputs = expert_outputs.permute(1, 0, 2) # [B*S, E, hidden_dim] - final_hidden_states = torch.bmm(full_weights.unsqueeze(1), expert_outputs).squeeze(1) - - return final_hidden_states - - class Gemma4LMModelPatcher(Gemma3LMModelPatcher): def __init__(self, config, model, model_kwargs): super().__init__(config, model, model_kwargs) @@ -5323,7 +5294,7 @@ def __enter__(self): decoder_layer.self_attn.forward = 
types.MethodType(gemma4_text_attention_forward, decoder_layer.self_attn) if hasattr(decoder_layer, "experts"): decoder_layer.experts._orig_forward = decoder_layer.experts.forward - decoder_layer.experts.forward = types.MethodType(_gemma4_moe_block_forward, decoder_layer.experts) + decoder_layer.experts.forward = types.MethodType(lfm2_moe_experts_forward, decoder_layer.experts) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) From ff99d6e13774841bdd17ac0d4c8bd2d181cf7c27 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 30 Apr 2026 10:11:02 +0200 Subject: [PATCH 219/222] Update optimum/exporters/openvino/model_configs.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/exporters/openvino/model_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index eaef60326a..c14cfc2946 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1589,7 +1589,7 @@ class Gemma4TextOpenVINOConfig(Gemma3TextOpenVINOConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, Gemma4DummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = Gemma4DummyPastKeyValuesGenerator - MIN_TRANSFORMERS_VERSION = "4.50.0" + MIN_TRANSFORMERS_VERSION = "5.5" def add_past_key_values(self, inputs_or_outputs: dict[str, dict[int, str]], direction: str): if direction not in ["inputs", "outputs"]: From 0f8508cb9dd95f5190045bab46e8d29dbfdda10c Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 30 Apr 2026 13:48:38 +0200 Subject: [PATCH 220/222] Applied comments, removed not needed code. 
--- optimum/exporters/openvino/model_patcher.py | 74 +++++-------------- .../openvino/modeling_visual_language.py | 3 +- 2 files changed, 21 insertions(+), 56 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 3e66401bec..d8237f954a 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4923,44 +4923,12 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.model._update_causal_mask = self._model.model._orig_update_causual_mask del self._model.model._orig_update_causual_mask - -def _gemma4_project_per_layer_inputs( - self, - inputs_embeds: torch.Tensor, - per_layer_inputs: Optional[torch.Tensor] = None, -) -> torch.Tensor: - per_layer_projection = self.per_layer_model_projection(inputs_embeds) * self.per_layer_model_projection_scale - per_layer_projection = per_layer_projection.reshape( - *inputs_embeds.shape[:-1], - self.config.num_hidden_layers, - self.hidden_size_per_layer_input, - ) - per_layer_projection = self.per_layer_projection_norm(per_layer_projection) - - if per_layer_inputs is None: - return per_layer_projection - - if per_layer_projection.shape != per_layer_inputs.shape: - per_layer_inputs = per_layer_inputs[..., : self.config.num_hidden_layers, :] - - return (per_layer_projection + per_layer_inputs) * self.per_layer_input_scale - - +# Creates a dict of causal masks with bidirectional attention for vision tokens +# on sliding_attention layers, matching the behavior of transformers +# create_causal_mask_mapping when use_bidirectional_attention == "vision". +# Needs to be patched to pass proper 'sliding_mask' for prefill stage. 
+# Original code: https://github.com/huggingface/transformers/blob/v5.5.0/src/transformers/models/gemma4/modeling_gemma4.py#L1986 def _create_gemma4_bidirectional_mask_dict(attention_mask_2d, mm_token_type_ids, inputs_embeds, sliding_window): - """ - Creates a dict of causal masks with bidirectional attention for vision tokens - on sliding_attention layers, matching the behavior of transformers' - create_causal_mask_mapping when use_bidirectional_attention == "vision". - - Args: - attention_mask_2d: [batch, total_len] 2D attention mask (1=attend, 0=pad) - mm_token_type_ids: [batch, total_len] token type ids (0=text, 1=image, 2=video/audio) - inputs_embeds: [batch, seq_len, hidden_size] - sliding_window: int, sliding window size - - Returns: - dict with "full_attention" and "sliding_attention" 4D masks - """ dtype = inputs_embeds.dtype device = inputs_embeds.device min_dtype = torch.finfo(dtype).min @@ -4983,11 +4951,10 @@ def _create_gemma4_bidirectional_mask_dict(attention_mask_2d, mm_token_type_ids, # Sliding window causal mask sliding_mask = full_mask.clone() - if sliding_window is not None: - row_pos = torch.arange(seq_len, device=device).unsqueeze(1) + past_len - col_pos = torch.arange(target_len, device=device).unsqueeze(0) - beyond_window = (row_pos - col_pos) >= sliding_window - sliding_mask = sliding_mask.masked_fill(beyond_window[None, None, :, :], min_dtype) + row_pos = torch.arange(seq_len, device=device).unsqueeze(1) + past_len + col_pos = torch.arange(target_len, device=device).unsqueeze(0) + beyond_window = (row_pos - col_pos) >= sliding_window + sliding_mask = sliding_mask.masked_fill(beyond_window[None, None, :, :], min_dtype) # Apply bidirectional masking for vision tokens (only on sliding_attention mask) # mm_token_type_ids: [batch, total_len] - 0=text, 1=image, 2=video/audio @@ -5016,7 +4983,10 @@ def _create_gemma4_bidirectional_mask_dict(attention_mask_2d, mm_token_type_ids, "sliding_attention": sliding_mask, } - +# Forward method of the 
language model of Gemma4, needs to be patched to pass 'per_layer_inputs', +# as original code fails to create per_layer_inputs without the providing of input_ids, +# while OV language model expects only inputs_embeds without input_ids. +# Original code: https://github.com/huggingface/transformers/blob/v5.5.0/src/transformers/models/gemma4/modeling_gemma4.py#L2152 def gemma4_language_model_forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -5089,7 +5059,8 @@ def gemma4_language_model_forward( image_hidden_states=image_features if pixel_values is not None else None, ) - +# Gemma4 model forward, needs to be patched to pass 'per_layer_inputs', +# Original code: https://github.com/huggingface/transformers/blob/v5.5.0/src/transformers/models/gemma4/modeling_gemma4.py#L2396 def gemma4_lm_forward( self, attention_mask: Optional[torch.Tensor] = None, @@ -5163,6 +5134,8 @@ def gemma4_lm_forward( return tuple([value if not isinstance(value, list) else tuple(value) for value in outputs_dict.values()]) +# Needs to be patched to reshape 'attention_mask' to match attention weights +# Original code: https://github.com/huggingface/transformers/blob/v5.5.0/src/transformers/models/gemma4/modeling_gemma4.py#L768 def gemma4_eager_attention_forward_patched( module: nn.Module, query: torch.Tensor, @@ -5197,6 +5170,8 @@ def gemma4_eager_attention_forward_patched( return attn_output, attn_weights +# Needs to be patched to run methods 'gemma4_eager_attention_forward_patched' instead of original one +# Original code: https://github.com/huggingface/transformers/blob/v5.5.0/src/transformers/models/gemma4/modeling_gemma4.py#L1179 def gemma4_text_attention_forward( self, hidden_states: torch.Tensor, @@ -5281,14 +5256,6 @@ def __enter__(self): setattr(self._model, self.orig_forward_name, types.MethodType(gemma4_lm_forward, self._model)) setattr(self._model.model, "forward", types.MethodType(gemma4_language_model_forward, self._model)) - - 
self._model.model.language_model._orig_project_per_layer_inputs = ( - self._model.model.language_model.project_per_layer_inputs - ) - self._model.model.language_model.project_per_layer_inputs = types.MethodType( - _gemma4_project_per_layer_inputs, self._model.model.language_model - ) - for decoder_layer in self._model.model.language_model.layers: decoder_layer.self_attn.orig_forward = decoder_layer.self_attn.forward decoder_layer.self_attn.forward = types.MethodType(gemma4_text_attention_forward, decoder_layer.self_attn) @@ -5298,9 +5265,6 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - self._model.model.language_model.project_per_layer_inputs = ( - self._model.model.language_model._orig_project_per_layer_inputs - ) for decoder_layer in self._model.model.language_model.layers: decoder_layer.self_attn.forward = decoder_layer.self_attn.orig_forward diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 42a4866c38..b376a58735 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -221,7 +221,8 @@ def prepare_inputs( if "per_layer_inputs" in self.input_names: per_layer_inputs = kwargs.pop("per_layer_inputs", None) - assert per_layer_inputs is not None, "Expected 'per_layer_inputs', but it was not passed" + if per_layer_inputs is None: + raise ValueError("Expected 'per_layer_inputs', but it was not passed") inputs["per_layer_inputs"] = torch.Tensor(per_layer_inputs) return inputs From 942112a13fff8798d9d40c28b962ccb4aa003986 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 30 Apr 2026 14:11:08 +0200 Subject: [PATCH 221/222] Code style. 
--- optimum/exporters/openvino/model_patcher.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index d8237f954a..59e40f0e8a 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4923,6 +4923,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.model._update_causal_mask = self._model.model._orig_update_causual_mask del self._model.model._orig_update_causual_mask + # Creates a dict of causal masks with bidirectional attention for vision tokens # on sliding_attention layers, matching the behavior of transformers # create_causal_mask_mapping when use_bidirectional_attention == "vision". @@ -4983,6 +4984,7 @@ def _create_gemma4_bidirectional_mask_dict(attention_mask_2d, mm_token_type_ids, "sliding_attention": sliding_mask, } + # Forward method of the language model of Gemma4, needs to be patched to pass 'per_layer_inputs', # as original code fails to create per_layer_inputs without the providing of input_ids, # while OV language model expects only inputs_embeds without input_ids. @@ -5059,6 +5061,7 @@ def gemma4_language_model_forward( image_hidden_states=image_features if pixel_values is not None else None, ) + # Gemma4 model forward, needs to be patched to pass 'per_layer_inputs', # Original code: https://github.com/huggingface/transformers/blob/v5.5.0/src/transformers/models/gemma4/modeling_gemma4.py#L2396 def gemma4_lm_forward( From eac389347523177511abe37908090d9e5c12e714 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 30 Apr 2026 17:36:45 +0200 Subject: [PATCH 222/222] Applied comments. 
--- optimum/exporters/openvino/model_configs.py | 6 ++---- optimum/intel/openvino/modeling_decoder.py | 7 ++++--- tests/openvino/test_seq2seq.py | 2 +- tests/openvino/utils_tests.py | 2 +- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index c14cfc2946..36a83adfa7 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -14,9 +14,11 @@ import enum import logging +import math from copy import deepcopy from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +import torch from transformers import AutoConfig, PretrainedConfig, PreTrainedModel from optimum.exporters.onnx.config import OnnxConfig, TextDecoderOnnxConfig, TextDecoderWithPositionIdsOnnxConfig @@ -4370,10 +4372,6 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int dtype=float_dtype, ) if input_name == "image_position_ids": - import math - - import torch - # Create position ids as a grid. The patch count = h_patches * w_patches # where both are divisible by pooling_kernel_size for correct pooling. 
k = self.pooling_kernel_size diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 7bab52fc1a..a74582b2c1 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -33,10 +33,12 @@ from transformers.modeling_outputs import CausalLMOutputWithPast, ModelOutput from transformers.utils.hub import PushToHubMixin +from ..utils.import_utils import compare_versions, is_transformers_version -try: + +if is_transformers_version("<", "5.5"): from transformers.models.mamba.modeling_mamba import MambaCache -except ImportError: +else: MambaCache = object from optimum.utils.normalized_config import NormalizedConfigManager @@ -44,7 +46,6 @@ from ...exporters.openvino import ensure_stateful_is_available, main_export, patch_stateful from ...exporters.openvino.stateful import model_has_state from ...exporters.openvino.utils import SSM_MODELS -from ..utils.import_utils import compare_versions from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS from .configuration import ( OVConfig, diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 29b18f450f..37efa86b2d 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -787,7 +787,7 @@ def compare_outputs(inputs, ov_model, transformers_model, generation_config): with torch.no_grad(): transformers_outputs = transformers_model(**transformers_inputs) self.assertTrue( - torch.allclose(ov_outputs.logits, transformers_outputs.logits.to(torch.float32), atol=4e-3), + torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=4e-3), f"Max abs diff {(torch.abs(ov_outputs.logits - transformers_outputs.logits).max())}", ) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 392128dd29..3ee3a2035b 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -383,7 +383,7 @@ "text_embeddings_per_layer_model": 1, }, "gemma4_moe": { - 
"lm_model": 44, + "lm_model": 48, "text_embeddings_model": 1, "vision_embeddings_model": 10, "text_embeddings_per_layer_model": 0,