From 53d19b95b0d0fee1d1d7259afd920a97d62b4881 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 19 Jan 2026 10:39:10 +0100 Subject: [PATCH 001/222] Transformers v5 --- .github/workflows/test_openvino.yml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 2d0958b2d6..cffeabc42d 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -38,7 +38,7 @@ jobs: "*diffusion*", "*quantization*", ] - transformers-version: ["4.45.0", "latest"] + transformers-version: ["4.45.0", "latest", "5.0.0rc3"] runs-on: ubuntu-22.04 diff --git a/setup.py b/setup.py index f7be8fd778..79a2cac349 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@main", - "transformers>=4.45,<4.58", + "transformers>=4.45,<5.1", "setuptools", ] From 5205434f5394f98072291ededa597869b1604839 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 19 Jan 2026 16:11:49 +0100 Subject: [PATCH 002/222] fix loading for llava_next_video --- tests/openvino/test_genai.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index b8b9e8d6cd..cdb4f8a555 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -247,9 +247,9 @@ def _get_model_class(self, model_arch): return AutoModelForImageTextToText elif model_arch == "llava_next_video": - from transformers import AutoModelForVision2Seq + from transformers import LlavaNextVideoForConditionalGeneration - return AutoModelForVision2Seq + return LlavaNextVideoForConditionalGeneration elif model_arch == "llava": from transformers import LlavaForConditionalGeneration From e8feb0caf0d4286fa633ba2e2907681e2e9605f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 20 Jan 2026 18:09:58 
+0100 Subject: [PATCH 003/222] Remove deprecated transformers.onnx --- optimum/intel/openvino/modeling_base.py | 6 ++---- optimum/intel/openvino/utils.py | 13 ------------- tests/openvino/test_modeling.py | 11 +++++------ 3 files changed, 7 insertions(+), 23 deletions(-) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 89fa7f5a88..6632acde68 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -851,16 +851,14 @@ def _export( **kwargs, ): """ - Export a vanilla Transformers model into an ONNX model using `transformers.onnx.export_onnx`. + Load and export a model to the OpenVINO IR. Arguments: model_id (`str` or `Path`): The directory from which to load the model. Can be either: - The model id of a pretrained model hosted inside a model repo on huggingface.co. - - The path to a directory containing the model weights. save_dir (`str` or `Path`): - The directory where the exported ONNX model should be saved, default to - `transformers.file_utils.default_cache_path`, which is the cache directory for transformers. + - The path to a directory containing the model weights. token (Optional[Union[bool, str]], defaults to `None`): The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). 
diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index 4baa280fea..bb3ec658ed 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -32,7 +32,6 @@ from openvino import Type as OVType from packaging.version import Version from transformers import AutoTokenizer, CLIPTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast -from transformers.onnx.utils import ParameterFormat, compute_serialized_parameters_size from optimum.intel.utils.import_utils import is_torch_version @@ -228,18 +227,6 @@ def maybe_convert_tokenizer_to_fast( return hf_tokenizer -def use_external_data_format(num_parameters: int) -> bool: - """ - Returns whether or not the model requires using external data format for the ONNX export - Args: - num_parameters: Number of parameter on the model - Returns: - True if model.num_parameters() * size_of(float32) >= 2Gb False otherwise - """ - - return compute_serialized_parameters_size(num_parameters, ParameterFormat.Float) >= EXTERNAL_DATA_FORMAT_SIZE_LIMIT - - def _is_timm_ov_dir(model_dir): config_file = None has_xml = False diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 279fcb9a8d..0c15a1b251 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -53,7 +53,6 @@ pipeline, set_seed, ) -from transformers.onnx.utils import get_preprocessor from transformers.testing_utils import slow from transformers.utils import http_user_agent from utils_tests import F32_CONFIG, MODEL_NAMES, OPENVINO_DEVICE, SEED, TENSOR_ALIAS_TO_TYPE, TEST_IMAGE_URL @@ -236,7 +235,7 @@ def test_load_from_hub_and_save_visual_language_model(self): # anymore due to an internal bug in transformers model_ids.append("katuni4ka/phi-4-multimodal-ov") for model_id in model_ids: - processor = get_preprocessor(model_id) + processor = AutoProcessor.from_pretrained(model_id) prompt = "What is shown in this image?" 
image = Image.open( requests.get( @@ -491,7 +490,7 @@ def test_load_from_hub_and_save_sam_model(self): self.assertEqual( loaded_model.prompt_encoder_mask_decoder.request.get_property("PERFORMANCE_HINT"), "THROUGHPUT" ) - processor = get_preprocessor(self.OV_SAM_MODEL_ID) + processor = AutoProcessor.from_pretrained(self.OV_SAM_MODEL_ID) img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png" input_points = [[[450, 600]]] raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB") @@ -1846,7 +1845,7 @@ def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] set_seed(SEED) ov_model = OVSamModel.from_pretrained(model_id, export=True, ov_config=F32_CONFIG, device=OPENVINO_DEVICE) - processor = get_preprocessor(model_id) + processor = AutoProcessor.from_pretrained(model_id) self.assertIsInstance(ov_model.vision_encoder, OVSamVisionEncoder) self.assertIsInstance(ov_model.prompt_encoder_mask_decoder, OVSamPromptEncoder) @@ -1899,7 +1898,7 @@ def test_reshape(self, model_arch): model_id = MODEL_NAMES[model_arch] set_seed(SEED) ov_model = OVSamModel.from_pretrained(model_id, export=True, ov_config=F32_CONFIG, device=OPENVINO_DEVICE) - processor = get_preprocessor(model_id) + processor = AutoProcessor.from_pretrained(model_id) self.assertTrue(ov_model.is_dynamic) input_points = [[[450, 600]]] IMAGE = Image.open( @@ -1935,7 +1934,7 @@ def test_compare_to_transformers(self, model_arch): ov_model = OVModelForZeroShotImageClassification.from_pretrained( model_id, export=True, ov_config=F32_CONFIG, device=OPENVINO_DEVICE ) - processor = get_preprocessor(model_id) + processor = AutoProcessor.from_pretrained(model_id) self.assertIsInstance(ov_model.config, PretrainedConfig) From bb54f64adee17964693a238de416ebd492a728ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 23 Jan 2026 15:01:05 +0100 Subject: [PATCH 004/222] remove deprecated transformers.onnx from tests --- 
tests/openvino/test_seq2seq.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index dbc1031a4c..238e13a1ac 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -42,7 +42,6 @@ set_seed, ) from transformers.models.auto.configuration_auto import CONFIG_MAPPING_NAMES -from transformers.onnx.utils import get_preprocessor from transformers.testing_utils import slow from transformers.utils import http_user_agent from utils_tests import F32_CONFIG, MODEL_NAMES, OPENVINO_DEVICE, SEED, TEST_IMAGE_URL, Timer @@ -336,7 +335,7 @@ def test_compare_to_transformers(self, model_arch): self._check_openvino_model_attributes(ov_model, use_cache=True, stateful=True) self._check_openvino_model_attributes(ov_model_stateless, use_cache=True, stateful=False) - processor = get_preprocessor(model_id) + processor = AutoProcessor.from_pretrained(model_id) data = self._generate_random_audio_data() pt_features = processor.feature_extractor(data, return_tensors="pt") decoder_start_token_id = transformers_model.config.decoder_start_token_id @@ -395,7 +394,7 @@ def test_pipeline(self, model_arch): set_seed(SEED) model_id = MODEL_NAMES[model_arch] model = self.OVMODEL_CLASS.from_pretrained(model_id, device=OPENVINO_DEVICE) - processor = get_preprocessor(model_id) + processor = AutoProcessor.from_pretrained(model_id) pipe = pipeline( "automatic-speech-recognition", model=model, @@ -1079,7 +1078,7 @@ def test_compare_to_transformers(self, model_arch): question = "Who am I?" 
transformers_model = self.AUTOMODEL_CLASS.from_pretrained(model_id) - preprocessor = get_preprocessor(model_id) + preprocessor = AutoProcessor.from_pretrained(model_id) inputs = preprocessor(images=self.IMAGE, text=question, padding=True, return_tensors="pt") ov_outputs = ov_model(**inputs) @@ -1100,7 +1099,7 @@ def test_compare_to_transformers(self, model_arch): def test_generate_utils(self, model_arch): model_id = MODEL_NAMES[model_arch] model = self.OVMODEL_CLASS.from_pretrained(model_id, export=True, device=OPENVINO_DEVICE) - preprocessor = get_preprocessor(model_id) + preprocessor = AutoProcessor.from_pretrained(model_id) question = "Who am I?" inputs = preprocessor(images=self.IMAGE, text=question, return_tensors="pt") @@ -1114,7 +1113,7 @@ def test_generate_utils(self, model_arch): def test_compare_with_and_without_past_key_values(self): model_id = MODEL_NAMES["pix2struct"] - preprocessor = get_preprocessor(model_id) + preprocessor = AutoProcessor.from_pretrained(model_id) question = "Who am I?" 
inputs = preprocessor(images=self.IMAGE, text=question, return_tensors="pt") model_with_pkv = self.OVMODEL_CLASS.from_pretrained( From 71aa34e773537e6a191463e4c9298720fd3ff714 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 23 Jan 2026 15:38:03 +0100 Subject: [PATCH 005/222] remove huggingface_hub deprecated --- optimum/intel/openvino/modeling_base.py | 8 +++++++- optimum/intel/utils/import_utils.py | 18 ++++++++++++++++++ optimum/intel/utils/modeling_utils.py | 11 ++++++++--- setup.py | 1 + 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index b96f375728..569422a085 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -28,12 +28,12 @@ from transformers import GenerationConfig, PretrainedConfig from transformers.file_utils import add_start_docstrings from transformers.generation import GenerationMixin -from transformers.utils import is_offline_mode from transformers.utils.hub import cached_file from optimum.exporters.base import ExportConfig from optimum.exporters.openvino.utils import _MAX_UNCOMPRESSED_SIZE from optimum.modeling_base import FROM_PRETRAINED_START_DOCSTRING, OptimizedModel +from optimum.utils.import_utils import is_huggingface_hub_version from ...exporters.openvino import export, main_export from ..utils.import_utils import is_nncf_available @@ -58,6 +58,12 @@ ) +if is_huggingface_hub_version(">=", "1.2.1"): + from huggingface_hub import is_offline_mode +else: + from transformers.utils import is_offline_mode + + core = Core() logger = logging.getLogger(__name__) diff --git a/optimum/intel/utils/import_utils.py b/optimum/intel/utils/import_utils.py index 3ad9877a82..d5e44d06d0 100644 --- a/optimum/intel/utils/import_utils.py +++ b/optimum/intel/utils/import_utils.py @@ -119,6 +119,15 @@ pass +_huggingface_hub_available = importlib.util.find_spec("huggingface_hub") is not None 
+_huggingface_hub_version = "N/A" +if _huggingface_hub_available: + try: + _huggingface_hub_version = importlib_metadata.version("huggingface_hub") + except importlib_metadata.PackageNotFoundError: + _huggingface_hub_available = False + + _safetensors_version = "N/A" _safetensors_available = importlib.util.find_spec("safetensors") is not None if _safetensors_available: @@ -486,6 +495,15 @@ def is_sentence_transformers_version(operation: str, version: str): return compare_versions(parse(_sentence_transformers_version), operation, version) +def is_huggingface_hub_version(operation: str, version: str): + """ + Compare the current huggingface_hub version to a given reference with an operation. + """ + if not _huggingface_hub_available: + return False + return compare_versions(parse(_huggingface_hub_version), operation, version) + + DIFFUSERS_IMPORT_ERROR = """ {0} requires the diffusers library but it was not found in your environment. You can install it with pip: `pip install diffusers`. Please note that you may need to restart your runtime after installation. 
diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index cab9e5efa3..83b5ccc1ac 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -23,14 +23,19 @@ from typing import Dict, List, Optional, Type, Union import torch -from huggingface_hub import HfApi, HfFolder, hf_hub_download +from huggingface_hub import HfApi, get_token, hf_hub_download from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from huggingface_hub.hf_api import file_exists from transformers import CLIPConfig, PretrainedConfig, PreTrainedModel from optimum.exporters.tasks import TasksManager -from .import_utils import is_diffusers_available, is_numa_available, is_open_clip_available, is_psutil_available +from .import_utils import ( + is_diffusers_available, + is_numa_available, + is_open_clip_available, + is_psutil_available, +) if is_diffusers_available(): @@ -115,7 +120,7 @@ def _find_files_matching_pattern( model_path = Path(model_name_or_path) if not isinstance(model_name_or_path, Path) else model_name_or_path if isinstance(use_auth_token, bool): - token = HfFolder().get_token() + token = get_token() else: token = use_auth_token diff --git a/setup.py b/setup.py index 79a2cac349..9937ad3ebf 100644 --- a/setup.py +++ b/setup.py @@ -31,6 +31,7 @@ "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@main", "transformers>=4.45,<5.1", "setuptools", + "huggingface-hub>=0.23.2,<2.0", ] TESTS_REQUIRE = [ From 0954015d7953735a0c1e5f1519bbbbd7cafeb77b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 23 Jan 2026 19:19:17 +0100 Subject: [PATCH 006/222] relative to absolute import --- optimum/intel/openvino/modeling_base.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 569422a085..8a16470fe4 100644 --- a/optimum/intel/openvino/modeling_base.py +++ 
b/optimum/intel/openvino/modeling_base.py @@ -31,14 +31,9 @@ from transformers.utils.hub import cached_file from optimum.exporters.base import ExportConfig +from optimum.exporters.openvino import export, main_export from optimum.exporters.openvino.utils import _MAX_UNCOMPRESSED_SIZE -from optimum.modeling_base import FROM_PRETRAINED_START_DOCSTRING, OptimizedModel -from optimum.utils.import_utils import is_huggingface_hub_version - -from ...exporters.openvino import export, main_export -from ..utils.import_utils import is_nncf_available -from ..utils.modeling_utils import _find_files_matching_pattern -from .configuration import ( +from optimum.intel.openvino.configuration import ( _DEFAULT_4BIT_WQ_CONFIG, OVConfig, OVQuantizationConfigBase, @@ -47,7 +42,7 @@ _quantization_config_from_dict, get_default_quantization_config, ) -from .utils import ( +from optimum.intel.openvino.utils import ( ONNX_WEIGHTS_NAME, OV_TO_PT_TYPE, OV_XML_FILE_NAME, @@ -56,6 +51,9 @@ classproperty, model_has_dynamic_inputs, ) +from optimum.intel.utils.import_utils import is_huggingface_hub_version, is_nncf_available +from optimum.intel.utils.modeling_utils import _find_files_matching_pattern +from optimum.modeling_base import FROM_PRETRAINED_START_DOCSTRING, OptimizedModel if is_huggingface_hub_version(">=", "1.2.1"): From 1ba9789bd9d8a18cd56631bbb7d85edd8ce8144f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 15:06:56 +0100 Subject: [PATCH 007/222] update workflow to v5 --- .github/workflows/test_openvino.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index cffeabc42d..f1874d3dbd 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -38,7 +38,7 @@ jobs: "*diffusion*", "*quantization*", ] - transformers-version: ["4.45.0", "latest", "5.0.0rc3"] + transformers-version: ["4.45.0", "latest", "5.0.0"] runs-on: ubuntu-22.04 From 
f1586565e90bdf05b900b9e9912089a42d0d417f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 15:43:26 +0100 Subject: [PATCH 008/222] remove redundant --- optimum/exporters/openvino/model_configs.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index ca12d455be..e9c7b52d97 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -247,10 +247,6 @@ def init_model_configs(): "transformers", "Qwen2VLForConditionalGeneration", ) - TasksManager._CUSTOM_CLASSES[("pt", "qwen2_5_vl", "image-text-to-text")] = ( - "transformers", - "AutoModelForImageTextToText", - ) TasksManager._CUSTOM_CLASSES[("pt", "llava_next_video", "image-text-to-text")] = ( "transformers", "AutoModelForVision2Seq", @@ -259,14 +255,6 @@ def init_model_configs(): "transformers", "Gemma3ForConditionalGeneration", ) - TasksManager._CUSTOM_CLASSES[("pt", "idefics3", "image-text-to-text")] = ( - "transformers", - "AutoModelForImageTextToText", - ) - TasksManager._CUSTOM_CLASSES[("pt", "smolvlm", "image-text-to-text")] = ( - "transformers", - "AutoModelForImageTextToText", - ) TasksManager._CUSTOM_CLASSES[("pt", "phi4mm", "image-text-to-text")] = ("transformers", "AutoModelForCausalLM") TasksManager._CUSTOM_CLASSES[("pt", "phi4mm", "automatic-speech-recognition")] = ( "transformers", @@ -280,10 +268,6 @@ def init_model_configs(): "transformers", "AutoModelForCausalLM", ) - TasksManager._CUSTOM_CLASSES[("pt", "llama4", "image-text-to-text")] = ( - "transformers", - "AutoModelForImageTextToText", - ) if is_diffusers_available() and "fill" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS: TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["fill"] = "FluxFillPipeline" From 93451439c75f2758031a3acb573547a5a55add26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 16:18:13 +0100 Subject: [PATCH 
009/222] update loading given transformers version --- optimum/exporters/openvino/model_configs.py | 53 +++++++++++++-------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index e9c7b52d97..67686b94bb 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -235,26 +235,7 @@ def init_model_configs(): if "open_clip" not in TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES: TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES["open_clip"] = {} - TasksManager._CUSTOM_CLASSES[("pt", "llava", "image-text-to-text")] = ( - "transformers", - "LlavaForConditionalGeneration", - ) - TasksManager._CUSTOM_CLASSES[("pt", "llava_next", "image-text-to-text")] = ( - "transformers", - "LlavaNextForConditionalGeneration", - ) - TasksManager._CUSTOM_CLASSES[("pt", "qwen2_vl", "image-text-to-text")] = ( - "transformers", - "Qwen2VLForConditionalGeneration", - ) - TasksManager._CUSTOM_CLASSES[("pt", "llava_next_video", "image-text-to-text")] = ( - "transformers", - "AutoModelForVision2Seq", - ) - TasksManager._CUSTOM_CLASSES[("pt", "gemma3", "image-text-to-text")] = ( - "transformers", - "Gemma3ForConditionalGeneration", - ) + TasksManager._CUSTOM_CLASSES[("pt", "phi4mm", "image-text-to-text")] = ("transformers", "AutoModelForCausalLM") TasksManager._CUSTOM_CLASSES[("pt", "phi4mm", "automatic-speech-recognition")] = ( "transformers", @@ -269,6 +250,38 @@ def init_model_configs(): "AutoModelForCausalLM", ) + # since transformers v4.46, model can be loaded using default AutoModelForImageTextToText + # https://github.com/huggingface/transformers/blob/v4.46.0/src/transformers/models/auto/modeling_auto.py#L776 + if is_transformers_version("<", "4.46"): + TasksManager._CUSTOM_CLASSES[("pt", "llava", "image-text-to-text")] = ( + "transformers", + "LlavaForConditionalGeneration", + ) + TasksManager._CUSTOM_CLASSES[("pt", "llava_next", 
"image-text-to-text")] = ( + "transformers", + "LlavaNextForConditionalGeneration", + ) + TasksManager._CUSTOM_CLASSES[("pt", "qwen2_vl", "image-text-to-text")] = ( + "transformers", + "Qwen2VLForConditionalGeneration", + ) + + # since transformers v4.50, model can be loaded using default AutoModelForImageTextToText + # https://github.com/huggingface/transformers/blob/v4.50.0/src/transformers/models/auto/modeling_auto.py#L835 + if is_transformers_version("<", "4.50"): + TasksManager._CUSTOM_CLASSES[("pt", "gemma3", "image-text-to-text")] = ( + "transformers", + "Gemma3ForConditionalGeneration", + ) + + # since transformers v4.52, model can be loaded using default AutoModelForImageTextToText + # https://github.com/huggingface/transformers/blob/v4.52.0/src/transformers/models/auto/modeling_auto.py#L899 + if is_transformers_version("<", "4.52"): + TasksManager._CUSTOM_CLASSES[("pt", "llava_next_video", "image-text-to-text")] = ( + "transformers", + "AutoModelForVision2Seq", + ) + if is_diffusers_available() and "fill" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS: TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["fill"] = "FluxFillPipeline" TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["fill"] = {"flux": "FluxFillPipeline"} From b290ae3c36ca4b6dd995b0601be2450a6aed63ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 16:37:48 +0100 Subject: [PATCH 010/222] remove deprecated AutoModelForVision2Seq --- optimum/intel/openvino/modeling_seq2seq.py | 15 +++++++++++++-- tests/openvino/test_seq2seq.py | 21 ++++++++++++++++----- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py index c1b2177c59..4a7bc0394d 100644 --- a/optimum/intel/openvino/modeling_seq2seq.py +++ b/optimum/intel/openvino/modeling_seq2seq.py @@ -27,7 +27,6 @@ AutoConfig, AutoModelForSeq2SeqLM, AutoModelForSpeechSeq2Seq, - AutoModelForVision2Seq, GenerationConfig, 
Pix2StructForConditionalGeneration, PretrainedConfig, @@ -56,6 +55,18 @@ ) +# AutoModelForVision2Seq is deprecated since v4.54 +# https://github.com/huggingface/transformers/blob/v4.54.0/src/transformers/models/auto/modeling_auto.py#L2151 +if is_transformers_version(">=", "4.54.0"): + from transformers import AutoModelForImageTextToText + + transformers_auto_class = AutoModelForImageTextToText +else: + from transformers import AutoModelForVision2Seq + + transformers_auto_class = AutoModelForVision2Seq + + core = Core() logger = logging.getLogger(__name__) @@ -1036,7 +1047,7 @@ def _reorder_cache( INPUTS_DOCSTRING, ) class OVModelForVision2Seq(OVModelForSeq2SeqLM): - auto_model_class = AutoModelForVision2Seq + auto_model_class = transformers_auto_class main_input_name = "pixel_values" export_feature = "image-to-text" diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 238e13a1ac..83a4b7c54f 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -33,7 +33,6 @@ AutoModelForSeq2SeqLM, AutoModelForSpeechSeq2Seq, AutoModelForTextToSpectrogram, - AutoModelForVision2Seq, AutoProcessor, AutoTokenizer, GenerationConfig, @@ -69,6 +68,18 @@ from optimum.intel.utils.import_utils import is_openvino_version, is_transformers_version +# AutoModelForVision2Seq is deprecated since v4.54 +# https://github.com/huggingface/transformers/blob/v4.54.0/src/transformers/models/auto/modeling_auto.py#L2151 +if is_transformers_version(">=", "4.54.0"): + from transformers import AutoModelForImageTextToText + + transformers_auto_class = AutoModelForImageTextToText +else: + from transformers import AutoModelForVision2Seq + + transformers_auto_class = AutoModelForVision2Seq + + os.environ["TOKENIZERS_PARALLELISM"] = "false" @@ -421,7 +432,7 @@ class OVModelForVision2SeqIntegrationTest(OVSeq2SeqTestMixin): UNSUPPORTED_ARCHITECTURES = {"got_ocr2", "pix2struct"} TASK = "image-to-text" OVMODEL_CLASS = OVModelForVision2Seq - AUTOMODEL_CLASS = 
AutoModelForVision2Seq + AUTOMODEL_CLASS = transformers_auto_class GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 @@ -580,9 +591,9 @@ def get_transformer_model_class(self, model_arch): return AutoModelForImageTextToText if model_arch == "llava_next_video": - from transformers import AutoModelForVision2Seq + from transformers import LlavaNextVideoForConditionalGeneration - return AutoModelForVision2Seq + return LlavaNextVideoForConditionalGeneration if model_arch == "llava": from transformers import LlavaForConditionalGeneration @@ -1056,7 +1067,7 @@ class OVModelForPix2StructIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES = ["pix2struct"] TASK = "image-to-text" # is it fine as well with visual-question-answering? OVMODEL_CLASS = OVModelForVision2Seq - AUTOMODEL_CLASS = AutoModelForVision2Seq + AUTOMODEL_CLASS = transformers_auto_class GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 From a4d1dc0067813762978c3252c029b140b7e53ebd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 16:44:22 +0100 Subject: [PATCH 011/222] update workflow --- .github/workflows/test_openvino.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index f1874d3dbd..1e8433087c 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -56,7 +56,7 @@ jobs: pip install --upgrade pip uv uv pip install .[openvino,diffusers,tests] - - if: ${{ matrix.transformers-version != 'latest' }} + - if: ${{ matrix.transformers-version == '4.45.0' }} name: Install specific dependencies and versions required for older transformers run: | uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator From ac953baa2715e0f4665a4d6b03303cd679e7ebd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 17:30:56 +0100 Subject: [PATCH 012/222] style --- 
optimum/intel/utils/modeling_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index 83b5ccc1ac..69de1770ce 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -29,8 +29,7 @@ from transformers import CLIPConfig, PretrainedConfig, PreTrainedModel from optimum.exporters.tasks import TasksManager - -from .import_utils import ( +from optimum.intel.utils.import_utils import ( is_diffusers_available, is_numa_available, is_open_clip_available, From 800188441707ed6c8ea1b216d742cc110911b062 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 17:42:39 +0100 Subject: [PATCH 013/222] update setup --- .github/workflows/test_openvino.yml | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 1e8433087c..81c8b4b48a 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -38,7 +38,7 @@ jobs: "*diffusion*", "*quantization*", ] - transformers-version: ["4.45.0", "latest", "5.0.0"] + transformers-version: ["4.45", "4.57.6", "latest"] runs-on: ubuntu-22.04 @@ -61,7 +61,7 @@ jobs: run: | uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator - - if: ${{ matrix.transformers-version == 'latest' && matrix.test-pattern == '*decoder*'}} + - if: ${{ matrix.transformers-version != '4.45.0' && matrix.test-pattern == '*decoder*'}} name: Install auto-gptq, autoawq run: | uv pip install auto-gptq "autoawq<0.2.8" diff --git a/setup.py b/setup.py index 9937ad3ebf..1c313dbe0c 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", - "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@main", + 
"optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers-v5", "transformers>=4.45,<5.1", "setuptools", "huggingface-hub>=0.23.2,<2.0", From 5f2a00716ee2755fe9924d491d30ce476c2d947b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 17:48:35 +0100 Subject: [PATCH 014/222] deprecated is_offline_mode --- optimum/intel/openvino/modeling_open_clip.py | 25 ++++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/optimum/intel/openvino/modeling_open_clip.py b/optimum/intel/openvino/modeling_open_clip.py index db6abf9cc3..11bc115843 100644 --- a/optimum/intel/openvino/modeling_open_clip.py +++ b/optimum/intel/openvino/modeling_open_clip.py @@ -31,16 +31,27 @@ from transformers.file_utils import add_start_docstrings from transformers.modeling_outputs import ModelOutput from transformers.models.clip.modeling_clip import CLIPOutput -from transformers.utils import is_offline_mode +from optimum.exporters.openvino import main_export from optimum.exporters.tasks import TasksManager +from optimum.intel.openvino.configuration import ( + OVConfig, + OVWeightQuantizationConfig, +) +from optimum.intel.openvino.modeling import MODEL_START_DOCSTRING, OVModel +from optimum.intel.openvino.modeling_base import OVModelHostMixin +from optimum.intel.openvino.utils import ( + TemporaryDirectory, + classproperty, +) +from optimum.intel.utils.import_utils import is_huggingface_hub_version +from optimum.intel.utils.modeling_utils import _find_files_matching_pattern, _OpenClipForZeroShotImageClassification + -from ...exporters.openvino import main_export -from ..utils.modeling_utils import _find_files_matching_pattern, _OpenClipForZeroShotImageClassification -from .configuration import OVConfig, OVWeightQuantizationConfig -from .modeling import MODEL_START_DOCSTRING, OVModel -from .modeling_base import OVModelHostMixin -from .utils import TemporaryDirectory, classproperty +if is_huggingface_hub_version(">=", "1.2.1"): + 
from huggingface_hub import is_offline_mode +else: + from transformers.utils import is_offline_mode logger = logging.getLogger(__name__) From ad477fe92395a73a67aac57349cbe25c4a82e466 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 18:15:55 +0100 Subject: [PATCH 015/222] remove incompatible neural-compressor installation --- .github/workflows/build_documentation.yml | 2 +- .github/workflows/build_pr_documentation.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index bcb51d6b58..52dae651de 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -51,7 +51,7 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder - uv pip install .[quality] nncf openvino neural-compressor[pt]>3.4 diffusers accelerate datasets + uv pip install .[quality] nncf openvino diffusers accelerate datasets - name: Make documentation shell: bash diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index 16ac720c8f..c4a34baaa6 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -38,7 +38,7 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder - uv pip install .[quality] nncf openvino neural-compressor[pt]>3.4 diffusers accelerate datasets + uv pip install .[quality] nncf openvino diffusers accelerate datasets - name: Make documentation shell: bash From 42e98b8495fc4ac8dc090cf06c5459f400faff55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 18:18:57 +0100 Subject: [PATCH 016/222] remove documentation reference --- docs/source/neural_compressor/reference.mdx | 40 --------------------- 1 file changed, 40 deletions(-) diff --git 
a/docs/source/neural_compressor/reference.mdx b/docs/source/neural_compressor/reference.mdx index b6e3d8f468..b83618b4bc 100644 --- a/docs/source/neural_compressor/reference.mdx +++ b/docs/source/neural_compressor/reference.mdx @@ -14,43 +14,3 @@ specific language governing permissions and limitations under the License. `optimum.intel.neural_compressor` is deprecated and will be removed in the next major release. - -## INCQuantizer - -[[autodoc]] neural_compressor.quantization.INCQuantizer - -## INCTrainer - -[[autodoc]] neural_compressor.trainer.INCTrainer - -## INCModel - -[[autodoc]] neural_compressor.modeling_base.INCModel - -## INCModelForSequenceClassification - -[[autodoc]] neural_compressor.modeling_base.INCModelForSequenceClassification - -## INCModelForQuestionAnswering - -[[autodoc]] neural_compressor.modeling_base.INCModelForQuestionAnswering - -## INCModelForTokenClassification - -[[autodoc]] neural_compressor.modeling_base.INCModelForTokenClassification - -## INCModelForMultipleChoice - -[[autodoc]] neural_compressor.modeling_base.INCModelForMultipleChoice - -## INCModelForMaskedLM - -[[autodoc]] neural_compressor.modeling_base.INCModelForMaskedLM - -## INCModelForCausalLM - -[[autodoc]] neural_compressor.modeling_base.INCModelForCausalLM - -## INCModelForSeq2SeqLM - -[[autodoc]] neural_compressor.modeling_base.INCModelForSeq2SeqLM From 4ee3f51ccdf946ae44644e9980f494d0893c2f71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 26 Jan 2026 18:30:58 +0100 Subject: [PATCH 017/222] add install transformers step --- .github/workflows/test_openvino.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 81c8b4b48a..2fcd23dbcf 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -38,7 +38,7 @@ jobs: "*diffusion*", "*quantization*", ] - transformers-version: ["4.45", "4.57.6", "latest"] + 
transformers-version: ["4.45.0", "4.57.6", "latest"] runs-on: ubuntu-22.04 @@ -56,6 +56,11 @@ jobs: pip install --upgrade pip uv uv pip install .[openvino,diffusers,tests] + - if: ${{ matrix.transformers-version != 'latest' }} + name: Install transformers + run: | + uv pip install transformers==${{ matrix.transformers-version }} + - if: ${{ matrix.transformers-version == '4.45.0' }} name: Install specific dependencies and versions required for older transformers run: | From 8204264e1ab001d039ccdfae3a3c48418ccc23d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 27 Jan 2026 18:54:01 +0100 Subject: [PATCH 018/222] transformers v5 --- .github/workflows/test_openvino.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 2fcd23dbcf..aef4ef484b 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -38,7 +38,7 @@ jobs: "*diffusion*", "*quantization*", ] - transformers-version: ["4.45.0", "4.57.6", "latest"] + transformers-version: ["4.45.0", "5.0.0", "latest"] runs-on: ubuntu-22.04 From b319d19a5e8d33761bad5de23291a2c8c87557af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 27 Jan 2026 19:10:25 +0100 Subject: [PATCH 019/222] install diffusers from source for v5 --- .github/workflows/test_openvino.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index aef4ef484b..5bcbc0e31c 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -66,6 +66,11 @@ jobs: run: | uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator + - if: ${{ matrix.transformers-version == '5.0.0' }} + name: Install diffusers + run: | + uv pip install git+https://github.com/huggingface/diffusers + - if: ${{ 
matrix.transformers-version != '4.45.0' && matrix.test-pattern == '*decoder*'}} name: Install auto-gptq, autoawq run: | From 42300e42bbea84fde261a6cf01f81ac3789081a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 27 Jan 2026 19:21:26 +0100 Subject: [PATCH 020/222] remove deprecated CLIPFeatureExtractor --- optimum/intel/openvino/modeling_diffusion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 368265bc3e..22182ee96c 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -50,7 +50,7 @@ from huggingface_hub.utils import validate_hf_hub_args from openvino import Core from openvino._offline_transformations import compress_model_transformation -from transformers import CLIPFeatureExtractor, CLIPTokenizer +from transformers import CLIPImageProcessor, CLIPTokenizer from transformers.modeling_outputs import ModelOutput from transformers.utils import http_user_agent @@ -170,7 +170,7 @@ def __init__( tokenizer: Optional[CLIPTokenizer] = None, tokenizer_2: Optional[CLIPTokenizer] = None, tokenizer_3: Optional[CLIPTokenizer] = None, - feature_extractor: Optional[CLIPFeatureExtractor] = None, + feature_extractor: Optional[CLIPImageProcessor] = None, # stable diffusion xl specific arguments force_zeros_for_empty_prompt: bool = True, requires_aesthetics_score: bool = False, From 2a761024506fa8536a77c603233e875e25a4dbb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 28 Jan 2026 16:17:24 +0100 Subject: [PATCH 021/222] openvino 2025.3.0 --- optimum/intel/openvino/__init__.py | 8 +++++--- setup.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index 8441944800..28e39f0528 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -35,8 
+35,12 @@ warnings.simplefilter(action="ignore", category=FutureWarning) + +logger = logging.getLogger(__name__) + + if is_openvino_version("<", "2025.4.0"): - raise ImportError( + logger.warning( "Optimum-intel requires OpenVINO version 2025.4.0 or higher. " "Please upgrade OpenVINO to version 2025.4 or later. " f"The current version of OpenVINO is {_openvino_version}." @@ -51,8 +55,6 @@ ) -logger = logging.getLogger(__name__) - if is_nncf_available(): import nncf diff --git a/setup.py b/setup.py index 1c313dbe0c..b2c945b37b 100644 --- a/setup.py +++ b/setup.py @@ -67,7 +67,7 @@ EXTRAS_REQUIRE = { "nncf": ["nncf>=2.19.0"], - "openvino": ["nncf>=2.19.0", "openvino>=2025.4.0", "openvino-tokenizers>=2025.4.0"], + "openvino": ["nncf>=2.19.0", "openvino==2025.3.0", "openvino-tokenizers==2025.3.0"], "neural-compressor": ["neural-compressor[pt]>=3.4.1", "accelerate", "transformers<4.46", "datasets"], "ipex": ["intel-extension-for-pytorch>=2.8", "transformers>4.54,<4.56", "accelerate"], "diffusers": ["diffusers"], From f38703a626c64495bd67233368b2b36a5d0a78af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 2026 10:57:49 +0100 Subject: [PATCH 022/222] add ov cache classes --- optimum/exporters/openvino/model_patcher.py | 167 ++++++++++++++++---- 1 file changed, 140 insertions(+), 27 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 73b25149d9..3639ece9cf 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -18,6 +18,7 @@ import logging as log import math import types +from collections.abc import Iterable from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import torch @@ -54,6 +55,118 @@ logger = logging.getLogger(__name__) +class OVDynamicCache(DynamicCache): + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1005 + def to_legacy_cache(self) 
-> tuple[tuple[torch.Tensor, torch.Tensor]]: + """ + Converts the `Cache` instance into the its equivalent in the legacy cache format. Used for + backward compatibility. + """ + legacy_cache = () + for layer in self.layers: + legacy_cache += ((layer.keys, layer.values),) + return legacy_cache + + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1015 + @classmethod + def from_legacy_cache(cls, past_key_values: tuple[tuple[torch.Tensor, torch.Tensor]]) -> "DynamicCache": + """ + Converts a cache in the legacy cache format into an equivalent `Cache`. Used for + backward compatibility. + """ + cache = cls() + if past_key_values is None: + logger.warning_once("past_key_values should not be None in from_legacy_cache()") + if past_key_values is not None: + for layer_idx in range(len(past_key_values)): + key_states, value_states = past_key_values[layer_idx] + cache.update(key_states, value_states, layer_idx) + return cache + + +class OVEncoderDecoderCache(EncoderDecoderCache): + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1266 + def to_legacy_cache(self) -> tuple[tuple[torch.Tensor]]: + """Converts the `EncoderDecoderCache` instance into its equivalent in the legacy cache format.""" + legacy_cache = () + if len(self.cross_attention_cache) > 0: + for self_attn, cross_attn in zip( + self.self_attention_cache.to_legacy_cache(), self.cross_attention_cache.to_legacy_cache() + ): + legacy_cache += (self_attn + cross_attn,) + else: + legacy_cache = self.self_attention_cache.to_legacy_cache() + return legacy_cache + + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1279 + @classmethod + def from_legacy_cache( + cls, past_key_values: Optional[Iterable[tuple[torch.FloatTensor, ...]]] + ) -> "EncoderDecoderCache": + """Converts a cache in the legacy cache format into an equivalent `EncoderDecoderCache`.""" + cache = 
cls(DynamicCache(), DynamicCache()) + if past_key_values is None: + logger.warning_once("past_key_values should not be None in from_legacy_cache()") + else: + for layer_idx, key_value_states in enumerate(past_key_values): + key_states, value_states = key_value_states[:2] + cache.self_attention_cache.update(key_states, value_states, layer_idx) + if len(key_value_states) > 2: + key_states, value_states = key_value_states[2:] + cache.cross_attention_cache.update(key_states, value_states, layer_idx) + cache.is_updated[layer_idx] = True + return cache + + +def preprocess_past_key_values(past_key_values): + if ( + is_transformers_version(">=", "4.48") + and isinstance(past_key_values, (list, tuple)) + and isinstance(past_key_values[0], (list, tuple)) + ): + if len(past_key_values[0]) == 2: + past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) + elif len(past_key_values[0]) == 4: + past_key_values = OVEncoderDecoderCache.from_legacy_cache(past_key_values) + else: + raise ValueError( + f"past_key_values should have either 2 or 4 elements, but it has {len(past_key_values[0])} elements." 
+ ) + + return past_key_values + + +class OVModelPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: "PreTrainedModel", + model_kwargs: Optional[Dict[str, Any]] = None, + ): + super().__init__(config, model, model_kwargs) + + self.model_patched_forward = self.patched_forward + + @functools.wraps(self.model_patched_forward) + def patched_forward(*args, **kwargs): + signature = inspect.signature(self.model_patched_forward) + args, kwargs = override_arguments(args, kwargs, signature, model_kwargs=self.model_kwargs) + + if "past_key_values" in signature.parameters: + # Most models require past_key_values to be a cache instance instead of a tuple now + pkv_index = list(signature.parameters.keys()).index("past_key_values") + if pkv_index < len(args) and args[pkv_index] is not None: + args[pkv_index] = preprocess_past_key_values(args[pkv_index]) + elif kwargs.get("past_key_values") is not None: + kwargs["past_key_values"] = preprocess_past_key_values(kwargs["past_key_values"]) + + outputs = self.model_patched_forward(*args, **kwargs) + + return outputs + + self.patched_forward = patched_forward + + for idx, spec in enumerate(UNSUPPORTED_OPS_PATCHING_SPEC): if spec.name in { # onnx-exporter-specific fixes @@ -210,7 +323,7 @@ def eager_mask_without_vmap(*args, **kwargs) -> Optional[torch.Tensor]: return mask -class OVDecoderModelPatcher(ModelPatcher): +class OVDecoderModelPatcher(OVModelPatcher): def __enter__(self): super().__enter__() @@ -3069,7 +3182,7 @@ def __exit__(self, exc_type, exc_value, traceback): layer.self_attn.forward = layer.self_attn._orig_forward -class IBertModelPatcher(ModelPatcher): +class IBertModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3087,7 +3200,7 @@ def __init__( self._model(torch.ones([1, 1], dtype=torch.long)) -class InternVLChatImageEmbeddingModelPatcher(ModelPatcher): +class InternVLChatImageEmbeddingModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3230,7 
+3343,7 @@ def maira_vision_embed_forward(self, pixel_values): return self.get_image_features(pixel_values, vision_feature_layer, vision_feature_select_strategy) -class LlavaImageEmbeddingModelPatcher(ModelPatcher): +class LlavaImageEmbeddingModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3247,7 +3360,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class MairaImageEmbeddingModelPatcher(ModelPatcher): +class MairaImageEmbeddingModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3264,7 +3377,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class LlavaNextVideoImageEmbeddingModelPatcher(ModelPatcher): +class LlavaNextVideoImageEmbeddingModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3305,7 +3418,7 @@ def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor: return emb.unsqueeze(1) -class FluxTransfromerModelPatcher(ModelPatcher): +class FluxTransfromerModelPatcher(OVModelPatcher): def __enter__(self): super().__enter__() if is_diffusers_version("<", "0.31.0"): @@ -3480,7 +3593,7 @@ def _minicpmv_siglip_transformer_forward( ) -class MiniCPMVResamplerModelPatcher(ModelPatcher): +class MiniCPMVResamplerModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3497,7 +3610,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class MiniCPMVImageEmbeddingsModelPatcher(ModelPatcher): +class MiniCPMVImageEmbeddingsModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3528,7 +3641,7 @@ def __exit__(self, exc_type, exc_value, traceback): layer.self_attn.forward = layer.self_attn._orig_forward -class LlavaQwen2ImageEmbeddingsModelPatcher(ModelPatcher): +class LlavaQwen2ImageEmbeddingsModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3546,7 +3659,7 @@ def 
__exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class InputEmbeddingPatcher(ModelPatcher): +class InputEmbeddingPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3571,7 +3684,7 @@ def phi3_vision_embeddings_forward(self, pixel_values: torch.FloatTensor): return self.get_img_features(pixel_values) -class Phi3VisionImageEmbeddingsPatcher(ModelPatcher): +class Phi3VisionImageEmbeddingsPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4180,7 +4293,7 @@ def block_forward( block.attn.forward = types.MethodType(sdpa_attn_forward, block.attn) -class Qwen2VLVisionEmbMergerPatcher(ModelPatcher): +class Qwen2VLVisionEmbMergerPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4214,7 +4327,7 @@ def __exit__(self, exc_type, exc_value, traceback): block.attn.forward = block.attn._orig_forward -class Qwen2_5_VLVisionEmbMergerPatcher(ModelPatcher): +class Qwen2_5_VLVisionEmbMergerPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4345,7 +4458,7 @@ def __exit__(self, exc_type, exc_value, traceback): block_sparse_moe.output_linear.forward = block_sparse_moe.output_linear._orig_forward -class OVSeq2SeqModelPatcher(ModelPatcher): +class OVSeq2SeqModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4430,7 +4543,7 @@ def __exit__(self, exc_type, exc_value, traceback): ALL_MASK_ATTENTION_FUNCTIONS.register("eager", eager_mask) -class SanaTextEncoderModelPatcher(ModelPatcher): +class SanaTextEncoderModelPatcher(OVModelPatcher): def __enter__(self): super().__enter__() @@ -4481,7 +4594,7 @@ def __init__( super().__init__(config, model, model_kwargs) -class CommonImageEmbeddingsModelPatcher(ModelPatcher): +class CommonImageEmbeddingsModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4631,7 +4744,7 @@ def __exit__(self, exc_type, exc_value, traceback): del self._model.model._orig_update_causual_mask -class 
Idefics3ImageEmbeddingsModelPatcher(ModelPatcher): +class Idefics3ImageEmbeddingsModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -5410,7 +5523,7 @@ def speecht5_decoder_layer_forward( return outputs -class OVSpeechT5ModelPatcher(ModelPatcher): +class OVSpeechT5ModelPatcher(OVModelPatcher): def __enter__(self): if self.real_config._behavior != "vocoder": super().__enter__() @@ -5586,7 +5699,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class Phi4MMAudioForwardEmbeddingsPatcher(ModelPatcher): +class Phi4MMAudioForwardEmbeddingsPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -5610,7 +5723,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class Phi4MMAudioEncoderPatcher(ModelPatcher): +class Phi4MMAudioEncoderPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -5651,7 +5764,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class Phi4MMVisionEmbeddingsPatcher(ModelPatcher): +class Phi4MMVisionEmbeddingsPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -5960,7 +6073,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.img_processor.embeddings.forward = self._model.img_processor.embeddings._orig_forward -class Llama4ImageEmbeddingsModelPatcher(ModelPatcher): +class Llama4ImageEmbeddingsModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -6146,7 +6259,7 @@ def llama4_moe_forward(self, hidden_states): return out, router_scores -class Llama4TextModelPatcher(ModelPatcher): +class Llama4TextModelPatcher(OVModelPatcher): def __enter__(self): super().__enter__() @@ -6316,7 +6429,7 @@ def mamba_mixer_forward( # 1. Inject a MambaCache structure into the original model to simplify input and output handling related to SSM states # 2. 
Patch ConvSequenceTransform module to avoid if-else branching # 3. Vectorize the selective scan operation to ensure correct behavior during JIT tracing -class MambaPatcher(ModelPatcher): +class MambaPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -6810,7 +6923,7 @@ def segment_sum(input_tensor): # for subsequent invocation of the model's `forward` method. # 2. Patches the Zamba2MambaMixer so that the traced `forward` function works correctly # during both the prefill and decoding steps. -class Zamba2ModelPatcher(ModelPatcher): +class Zamba2ModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", @@ -7236,7 +7349,7 @@ def granite_moe_hybrid_update_causal_mask( return causal_mask -class GraniteMoeHybridModelPatcher(ModelPatcher): +class GraniteMoeHybridModelPatcher(OVModelPatcher): def __init__( self, config: "OnnxConfig", From 2d3c734c8abf3a9907cb929accd04ef61d57a5ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 2026 13:28:58 +0100 Subject: [PATCH 023/222] openvino nightly for modeling tests --- .github/workflows/test_openvino.yml | 8 ++++---- setup.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 5bcbc0e31c..950c2f987c 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -81,15 +81,15 @@ jobs: run: | python tests/scripts/login_with_ci_token.py - - name: Test with Pytest - run: | - pytest tests/openvino/${{ matrix.test-pattern }} --durations=0 - - if: ${{ matrix.test-pattern == '*modeling*' }} name: Install Nightly OpenVINO run: | uv pip install --upgrade --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + - name: Test with Pytest + run: | + pytest tests/openvino/${{ matrix.test-pattern }} --durations=0 + - if: ${{ matrix.test-pattern == '*modeling*' }} name: Test with Pytest - Nightly OpenVINO run: 
| diff --git a/setup.py b/setup.py index b2c945b37b..1c313dbe0c 100644 --- a/setup.py +++ b/setup.py @@ -67,7 +67,7 @@ EXTRAS_REQUIRE = { "nncf": ["nncf>=2.19.0"], - "openvino": ["nncf>=2.19.0", "openvino==2025.3.0", "openvino-tokenizers==2025.3.0"], + "openvino": ["nncf>=2.19.0", "openvino>=2025.4.0", "openvino-tokenizers>=2025.4.0"], "neural-compressor": ["neural-compressor[pt]>=3.4.1", "accelerate", "transformers<4.46", "datasets"], "ipex": ["intel-extension-for-pytorch>=2.8", "transformers>4.54,<4.56", "accelerate"], "diffusers": ["diffusers"], From b6dcefd0949130e084d378e5bc6d7cc46c9e698c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 2026 14:21:35 +0100 Subject: [PATCH 024/222] openvino 2025.3 for modeling tests --- .github/workflows/test_openvino.yml | 13 +++++++++---- setup.py | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 950c2f987c..37e61ea335 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -56,6 +56,11 @@ jobs: pip install --upgrade pip uv uv pip install .[openvino,diffusers,tests] + - if: ${{ matrix.test-pattern == '*modeling*' }} + name: Install OpenVINO + run: | + uv pip install openvino==2025.3.0 openvino-tokenizers==2025.3.0 + - if: ${{ matrix.transformers-version != 'latest' }} name: Install transformers run: | @@ -81,15 +86,15 @@ jobs: run: | python tests/scripts/login_with_ci_token.py + - name: Test with Pytest + run: | + pytest tests/openvino/${{ matrix.test-pattern }} --durations=0 + - if: ${{ matrix.test-pattern == '*modeling*' }} name: Install Nightly OpenVINO run: | uv pip install --upgrade --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - - name: Test with Pytest - run: | - pytest tests/openvino/${{ matrix.test-pattern }} --durations=0 - - if: ${{ matrix.test-pattern == '*modeling*' }} name: Test with 
Pytest - Nightly OpenVINO run: | diff --git a/setup.py b/setup.py index 1c313dbe0c..c072bab7f3 100644 --- a/setup.py +++ b/setup.py @@ -67,7 +67,7 @@ EXTRAS_REQUIRE = { "nncf": ["nncf>=2.19.0"], - "openvino": ["nncf>=2.19.0", "openvino>=2025.4.0", "openvino-tokenizers>=2025.4.0"], + "openvino": ["nncf>=2.19.0", "openvino>=2025.3.0", "openvino-tokenizers>=2025.3.0"], "neural-compressor": ["neural-compressor[pt]>=3.4.1", "accelerate", "transformers<4.46", "datasets"], "ipex": ["intel-extension-for-pytorch>=2.8", "transformers>4.54,<4.56", "accelerate"], "diffusers": ["diffusers"], From ea24727535b789e228106790f2f725c50dc8309b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 2026 15:51:08 +0100 Subject: [PATCH 025/222] stop moving misplaced parameters from config to generation_config --- optimum/exporters/openvino/convert.py | 29 ++++++++++---------- optimum/intel/openvino/modeling_base.py | 32 +++++++++++----------- optimum/intel/openvino/modeling_seq2seq.py | 29 ++++++++++---------- 3 files changed, 46 insertions(+), 44 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 4b0652393d..794e38c9ed 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -688,20 +688,21 @@ def export_from_model( files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()] elif library_name != "diffusers": - # some model configs may have issues with loading without parameters initialization - try: - misplaced_generation_parameters = model.config._get_non_default_generation_parameters() - except (AttributeError, KeyError, TypeError): - misplaced_generation_parameters = {} - if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0: - logger.warning( - "Moving the following attributes in the config to the generation config: " - f"{misplaced_generation_parameters}. 
You are seeing this warning because you've set " - "generation parameters in the model config, as opposed to in the generation config.", - ) - for param_name, param_value in misplaced_generation_parameters.items(): - setattr(model.generation_config, param_name, param_value) - setattr(model.config, param_name, None) + if is_transformers_version("<", "5"): + # some model configs may have issues with loading without parameters initialization + try: + misplaced_generation_parameters = model.config._get_non_default_generation_parameters() + except (AttributeError, KeyError, TypeError): + misplaced_generation_parameters = {} + if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(model.generation_config, param_name, param_value) + setattr(model.config, param_name, None) # Saving the model config and preprocessor as this is needed sometimes. 
save_config(model.config, output) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index b3c9a11eb0..0d95cc233d 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -47,7 +47,7 @@ classproperty, model_has_dynamic_inputs, ) -from optimum.intel.utils.import_utils import is_huggingface_hub_version, is_nncf_available +from optimum.intel.utils.import_utils import is_huggingface_hub_version, is_nncf_available, is_transformers_version from optimum.intel.utils.modeling_utils import _find_files_matching_pattern from optimum.modeling_base import FROM_PRETRAINED_START_DOCSTRING, OptimizedModel @@ -265,21 +265,21 @@ def __init__( if self.can_generate(): self.generation_config = generation_config or GenerationConfig.from_model_config(config) - # some model configs may have issues with loading without parameters initialization - try: - misplaced_generation_parameters = self.config._get_non_default_generation_parameters() - except (KeyError, TypeError): - misplaced_generation_parameters = {} - if len(misplaced_generation_parameters) > 0: - logger.warning( - "Moving the following attributes in the config to the generation config: " - f"{misplaced_generation_parameters}. 
You are seeing this warning because you've set " - "generation parameters in the model config, as opposed to in the generation config.", - ) - for param_name, param_value in misplaced_generation_parameters.items(): - setattr(self.generation_config, param_name, param_value) - setattr(self.config, param_name, None) - + if is_transformers_version("<", "5"): + # some model configs may have issues with loading without parameters initialization + try: + misplaced_generation_parameters = self.config._get_non_default_generation_parameters() + except (KeyError, TypeError): + misplaced_generation_parameters = {} + if len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(self.generation_config, param_name, param_value) + setattr(self.config, param_name, None) else: self.generation_config = None diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py index c5365e8aae..cb8d6b7fa4 100644 --- a/optimum/intel/openvino/modeling_seq2seq.py +++ b/optimum/intel/openvino/modeling_seq2seq.py @@ -370,20 +370,21 @@ def __init__( generation_config = kwargs.get("generation_config", None) self.generation_config = generation_config or GenerationConfig.from_model_config(config) - # some model configs may have issues with loading without parameters initialization - try: - misplaced_generation_parameters = self.config._get_non_default_generation_parameters() - except (KeyError, TypeError): - misplaced_generation_parameters = {} - if len(misplaced_generation_parameters) > 0: - logger.warning( - "Moving the following attributes in the config to the generation config: " - f"{misplaced_generation_parameters}. 
You are seeing this warning because you've set " - "generation parameters in the model config, as opposed to in the generation config.", - ) - for param_name, param_value in misplaced_generation_parameters.items(): - setattr(self.generation_config, param_name, param_value) - setattr(self.config, param_name, None) + if is_transformers_version("<", "5"): + # some model configs may have issues with loading without parameters initialization + try: + misplaced_generation_parameters = self.config._get_non_default_generation_parameters() + except (KeyError, TypeError): + misplaced_generation_parameters = {} + if len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(self.generation_config, param_name, param_value) + setattr(self.config, param_name, None) self._openvino_config = None if quantization_config: From 07ff06b936fea14798feb0ca208449bc408b3694 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 2026 15:54:59 +0100 Subject: [PATCH 026/222] fix transformers version for doc building --- .github/workflows/build_pr_documentation.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index c4a34baaa6..01a5bbe7e9 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -38,6 +38,7 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder + uv pip install transformers==4.57.6 uv pip install .[quality] nncf openvino diffusers accelerate datasets - name: Make documentation From 1270db0612cad34664ec7b295c55e19ea0be38fb 
Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 2026 15:56:32 +0100 Subject: [PATCH 027/222] fix transformers version for doc building --- .github/workflows/build_documentation.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index 52dae651de..332563450b 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -51,6 +51,7 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder + uv pip install transformers==4.57.6 uv pip install .[quality] nncf openvino diffusers accelerate datasets - name: Make documentation From eb045ce620a72080d746dd2877b12a685c9bb79a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 2026 18:57:38 +0100 Subject: [PATCH 028/222] Use model.get_image_features --- optimum/exporters/openvino/model_configs.py | 25 ++++++++++++++------- optimum/exporters/openvino/model_patcher.py | 6 ++++- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 67686b94bb..a8b293ca84 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -317,6 +317,13 @@ def init_model_configs(): register_in_tasks_manager = TasksManager.create_register("openvino", overwrite_existing=True) +def _get_language_model(model): + if is_transformers_version("<", "5"): + return model.language_model + + return model.model.language_model + + @register_in_tasks_manager("baichuan", *["text-generation", "text-generation-with-past"], library_name="transformers") class BaichaunOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 13 @@ -1702,14 +1709,14 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) behavior = VLMConfigBehavior(behavior) if behavior 
== VLMConfigBehavior.LANGUAGE: - return model.language_model if not hasattr(model, "lm_head") else model + return _get_language_model(model) if not hasattr(model, "lm_head") else model if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return model if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: text_embedding = model.get_input_embeddings() - text_embedding.config = model.language_model.config + text_embedding.config = _get_language_model(model).config return text_embedding def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): @@ -1892,8 +1899,8 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: - text_embedding = model.language_model.get_input_embeddings() - text_embedding.config = model.language_model.config + text_embedding = _get_language_model(model).get_input_embeddings() + text_embedding.config = _get_language_model(model).config return text_embedding return super().get_model_for_behavior(model, behavior) @@ -1969,14 +1976,14 @@ def get_model_for_behavior(model, behavior: Union[str, VLMConfigBehavior]): behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.LANGUAGE: - return model.language_model + return _get_language_model(model) if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return model if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: - text_embedding = model.language_model.get_input_embeddings() - text_embedding.config = model.language_model.config + text_embedding = _get_language_model(model).get_input_embeddings() + text_embedding.config = _get_language_model(model).config return text_embedding def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): @@ -3477,7 +3484,9 @@ def get_model_for_behavior(model, behavior: Union[str, Qwen2VLConfigBehavior]): if behavior == Qwen2VLConfigBehavior.TEXT_EMBEDDINGS: 
text_embedding = ( - model.model.embed_tokens if hasattr(model.model, "embed_tokens") else model.language_model.embed_tokens + model.model.embed_tokens + if hasattr(model.model, "embed_tokens") + else _get_language_model(model).embed_tokens ) text_embedding.config = model.config return text_embedding diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 3639ece9cf..1c1cb3bb9f 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3351,7 +3351,11 @@ def __init__( model_kwargs: Dict[str, Any], ): model.__orig_forward = model.forward - model.forward = types.MethodType(llava_vision_embed_forward, model) + + if is_transformers_version("<", "5"): + model.forward = types.MethodType(llava_vision_embed_forward, model) + else: + model.forward = model.get_image_features super().__init__(config, model, model_kwargs) From f2f352dd92891a0d1eba46a9e0298d4848fb494d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 2026 18:58:22 +0100 Subject: [PATCH 029/222] Use model.get_image_features --- optimum/exporters/openvino/model_patcher.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 1c1cb3bb9f..7aec5bbe41 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3389,7 +3389,11 @@ def __init__( model_kwargs: Dict[str, Any], ): model.__orig_forward = model.forward - model.forward = types.MethodType(llava_next_video_vision_embed_forward, model) + + if is_transformers_version("<", "5"): + model.forward = types.MethodType(llava_next_video_vision_embed_forward, model) + else: + model.forward = model.get_image_features super().__init__(config, model, model_kwargs) From 1db8fb9820a23d8a3e1d19201823c39aff1b99a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 29 Jan 
2026 19:07:03 +0100 Subject: [PATCH 030/222] only add codegen remote code models when transformers < v5 --- tests/openvino/test_decoder.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 183f362913..33d8383876 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -48,7 +48,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "blenderbot-small", "bloom", "codegen", - "codegen2", "gpt2", "gptj", "gpt_neo", @@ -147,6 +146,9 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version("<", "4.56.0"): SUPPORTED_ARCHITECTURES += ("qwen", "chatglm", "chatglm4") + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("codegen2",) + GENERATION_LENGTH = 100 EXPECTED_NUM_SDPA = { From 0c72bc518c69ad2450217ad288b5172f2db19768 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 30 Jan 2026 16:06:32 +0100 Subject: [PATCH 031/222] Fix pipelines --- optimum/intel/pipelines/accelerator_utils.py | 30 ++++++++++++++------ 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/optimum/intel/pipelines/accelerator_utils.py b/optimum/intel/pipelines/accelerator_utils.py index 7ea4102ec7..9090b8f30f 100644 --- a/optimum/intel/pipelines/accelerator_utils.py +++ b/optimum/intel/pipelines/accelerator_utils.py @@ -18,10 +18,15 @@ import transformers.pipelines from transformers import AutoConfig +from optimum.intel.utils import ( + IPEX_IMPORT_ERROR, + OPENVINO_IMPORT_ERROR, + is_ipex_available, + is_openvino_available, + is_transformers_version, +) from optimum.utils.logging import get_logger -from ..utils import IPEX_IMPORT_ERROR, OPENVINO_IMPORT_ERROR, is_ipex_available, is_openvino_available - if TYPE_CHECKING: from transformers import PretrainedConfig @@ -154,7 +159,10 @@ def openvino_infer_framework_load_model( You can also provide None as the model to use a default one.""" ) - return "pt", 
ov_model + if is_transformers_version("<", "5"): + return "pt", ov_model + + return ov_model def get_ipex_model_class(task: str, **model_kwargs): @@ -189,27 +197,33 @@ def ipex_infer_framework_load_model( You can also provide None as the model to use a default one.""" ) - return "pt", ipex_model + if is_transformers_version("<", "5"): + return "pt", ipex_model + + return ipex_model @contextlib.contextmanager def patch_pipelines_to_load_accelerator_model(accelerator: str): - original_infer_framework_load_model = transformers.pipelines.infer_framework_load_model + target_fn = "infer_framework_load_model" if is_transformers_version("<", "5") else "load_model" + + original_infer_framework_load_model = getattr(transformers.pipelines, target_fn) if accelerator == "openvino": if not is_openvino_available(): raise ImportError(OPENVINO_IMPORT_ERROR.format("`accelerator=openvino`")) - transformers.pipelines.infer_framework_load_model = openvino_infer_framework_load_model + setattr(transformers.pipelines, target_fn, openvino_infer_framework_load_model) + elif accelerator == "ipex": if not is_ipex_available(): raise ImportError(IPEX_IMPORT_ERROR.format("`accelerator=ipex`")) - transformers.pipelines.infer_framework_load_model = ipex_infer_framework_load_model + setattr(transformers.pipelines, target_fn, ipex_infer_framework_load_model) else: raise ValueError(f"Accelerator '{accelerator}' is not supported. 
Only 'openvino' and 'ipex' are supported.") try: yield finally: - transformers.pipelines.infer_framework_load_model = original_infer_framework_load_model + setattr(transformers.pipelines, target_fn, original_infer_framework_load_model) From 08ebe2b6d1778df68cb41b2297db074615a9a87b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 30 Jan 2026 16:24:16 +0100 Subject: [PATCH 032/222] fix pipelines --- optimum/intel/pipelines/accelerator_utils.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/optimum/intel/pipelines/accelerator_utils.py b/optimum/intel/pipelines/accelerator_utils.py index 9090b8f30f..9ac24d06d7 100644 --- a/optimum/intel/pipelines/accelerator_utils.py +++ b/optimum/intel/pipelines/accelerator_utils.py @@ -13,7 +13,7 @@ # limitations under the License. import contextlib -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Dict, Optional, Tuple import transformers.pipelines from transformers import AutoConfig @@ -143,12 +143,15 @@ def get_openvino_model_class( # a modified transformers.pipelines.base.infer_framework_load_model that loads OpenVINO models def openvino_infer_framework_load_model( - model, config: Optional["PretrainedConfig"] = None, task: Optional[str] = None, **model_kwargs + model, + config: Optional["PretrainedConfig"] = None, + model_classes: Optional[Dict[str, Tuple[type]]] = None, + task: Optional[str] = None, + framework: Optional[str] = None, + **model_kwargs, ): if isinstance(model, str): - model_kwargs.pop("framework", None) model_kwargs.pop("_commit_hash", None) # not supported for OVModel - model_kwargs.pop("model_classes", None) ov_model_class = get_openvino_model_class(task, config, model, **model_kwargs) ov_model = ov_model_class.from_pretrained(model, **model_kwargs) elif isinstance(model, OVBaseModel): @@ -181,12 +184,15 @@ def get_ipex_model_class(task: str, **model_kwargs): # a modified 
transformers.pipelines.base.infer_framework_load_model that loads IPEX models def ipex_infer_framework_load_model( - model, config: Optional["PretrainedConfig"] = None, task: Optional[str] = None, **model_kwargs + model, + config: Optional["PretrainedConfig"] = None, + model_classes: Optional[Dict[str, Tuple[type]]] = None, + task: Optional[str] = None, + framework: Optional[str] = None, + **model_kwargs, ): if isinstance(model, str): - model_kwargs.pop("framework", None) model_kwargs.pop("_commit_hash", None) # not supported for IPEXModel - model_kwargs.pop("model_classes", None) ipex_model_class = get_ipex_model_class(task, **model_kwargs) ipex_model = ipex_model_class.from_pretrained(model, **model_kwargs) elif isinstance(model, IPEXModel): From 33f8c24df28e80efe49fa5beabef103d23ea89e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 30 Jan 2026 16:34:34 +0100 Subject: [PATCH 033/222] replace with OV cache --- optimum/exporters/openvino/model_patcher.py | 33 ++++++++++----------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 7aec5bbe41..fa8fe1bfb6 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -69,7 +69,7 @@ def to_legacy_cache(self) -> tuple[tuple[torch.Tensor, torch.Tensor]]: # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1015 @classmethod - def from_legacy_cache(cls, past_key_values: tuple[tuple[torch.Tensor, torch.Tensor]]) -> "DynamicCache": + def from_legacy_cache(cls, past_key_values: tuple[tuple[torch.Tensor, torch.Tensor]]) -> "OVDynamicCache": """ Converts a cache in the legacy cache format into an equivalent `Cache`. Used for backward compatibility. 
@@ -87,7 +87,7 @@ def from_legacy_cache(cls, past_key_values: tuple[tuple[torch.Tensor, torch.Tens class OVEncoderDecoderCache(EncoderDecoderCache): # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1266 def to_legacy_cache(self) -> tuple[tuple[torch.Tensor]]: - """Converts the `EncoderDecoderCache` instance into its equivalent in the legacy cache format.""" + """Converts the `OVEncoderDecoderCache` instance into its equivalent in the legacy cache format.""" legacy_cache = () if len(self.cross_attention_cache) > 0: for self_attn, cross_attn in zip( @@ -102,9 +102,9 @@ def to_legacy_cache(self) -> tuple[tuple[torch.Tensor]]: @classmethod def from_legacy_cache( cls, past_key_values: Optional[Iterable[tuple[torch.FloatTensor, ...]]] - ) -> "EncoderDecoderCache": - """Converts a cache in the legacy cache format into an equivalent `EncoderDecoderCache`.""" - cache = cls(DynamicCache(), DynamicCache()) + ) -> "OVEncoderDecoderCache": + """Converts a cache in the legacy cache format into an equivalent `OVEncoderDecoderCache`.""" + cache = cls(OVDynamicCache(), OVDynamicCache()) if past_key_values is None: logger.warning_once("past_key_values should not be None in from_legacy_cache()") else: @@ -1451,7 +1451,7 @@ def phi3_442_forward( if use_cache: use_legacy_cache = not isinstance(past_key_values, Cache) if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) past_key_values_length = past_key_values.get_usable_length(seq_length) if position_ids is None: @@ -3023,7 +3023,7 @@ def patched_forward(*args, **kwargs): legacy_pkv = args[pkv_argument_index] pkv_in_args = True if legacy_pkv is not None: - pkv = DynamicCache.from_legacy_cache(legacy_pkv) + pkv = OVDynamicCache.from_legacy_cache(legacy_pkv) return_legacy_cache = True if not pkv_in_args: kwargs["past_key_values"] = pkv @@ -4162,7 +4162,7 @@ def forward_wrap( 
input_ids=None, use_cache=True, ): - new_past_key_values = DynamicCache.from_legacy_cache(past_key_values) + new_past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) result = self.__orig_forward( input_ids=input_ids, attention_mask=attention_mask, @@ -4498,11 +4498,11 @@ def patched_forward(*args, **kwargs): pkv = args[pkv_arg_index] if pkv is not None: - if isinstance(pkv, EncoderDecoderCache): + if isinstance(pkv, OVEncoderDecoderCache): pkv = pkv.self_attention_cache.to_legacy_cache() else: pkv = [pkv_item[:2] for pkv_item in pkv] - pkv = EncoderDecoderCache.from_legacy_cache(pkv) + pkv = OVEncoderDecoderCache.from_legacy_cache(pkv) if "past_key_values" in kwargs: kwargs["past_key_values"] = pkv @@ -4512,7 +4512,7 @@ def patched_forward(*args, **kwargs): outputs = self.super_patched_forward(*args, **kwargs) # the optimum-onnx seq2seq model patcher only converts to tuple starting from 4.48 - if isinstance(outputs.get("past_key_values"), (DynamicCache, EncoderDecoderCache)): + if isinstance(outputs.get("past_key_values"), (OVDynamicCache, OVEncoderDecoderCache)): outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() # we still need to filter out cross attention in the case of non-stateful decoder @@ -4687,7 +4687,7 @@ def __init__( def forward( self, attention_mask, position_ids, past_key_values, token_type_ids, inputs_embeds, use_cache=True ): - pkv = DynamicCache.from_legacy_cache(past_key_values) + pkv = OVDynamicCache.from_legacy_cache(past_key_values) past_seen_tokens = past_key_values[0][0].shape[-2] cache_position = torch.arange( @@ -5052,7 +5052,6 @@ def _blenderbot_attn_forward_new( output_attentions: bool = False, cache_position: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - from transformers.cache_utils import EncoderDecoderCache """Input shape: Batch x Time x Channel""" @@ -5076,7 +5075,7 @@ def _blenderbot_attn_forward_new( query_states = 
query_states if past_key_value is not None: - if isinstance(past_key_value, EncoderDecoderCache): + if isinstance(past_key_value, OVEncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) if is_cross_attention: # after the first generated id, we can subsequently re-use all key/value_states from cache @@ -5601,7 +5600,7 @@ def patched_decoder_forward( if past_key_values is not None: past_key_values = [cache_item[:2] for cache_item in past_key_values] if is_transformers_version(">=", "4.56"): - past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + past_key_values = OVEncoderDecoderCache.from_legacy_cache(past_key_values) output_sequence = inputs_embeds output_cross_attentions = False @@ -5633,7 +5632,7 @@ def patched_decoder_forward( past_key_values = decoder_out.past_key_values if past_key_values is not None: - if isinstance(past_key_values, EncoderDecoderCache): + if isinstance(past_key_values, OVEncoderDecoderCache): past_key_values = past_key_values.self_attention_cache.to_legacy_cache() else: past_key_values = [cache_item[:2] for cache_item in past_key_values] @@ -5685,7 +5684,7 @@ def __init__( # Adopted from https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py#L2156-L2178 # moved audio and vision features processing outside model def lm_forward(self, inputs_embeds, attention_mask, position_ids, past_key_values, use_cache=True): - pkv = DynamicCache.from_legacy_cache(past_key_values) + pkv = OVDynamicCache.from_legacy_cache(past_key_values) outputs = self.model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, From 9809e7ede6f973ea3ef625f5baabc06365b5f0ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 30 Jan 2026 16:35:23 +0100 Subject: [PATCH 034/222] style --- optimum/exporters/openvino/model_patcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py 
b/optimum/exporters/openvino/model_patcher.py index fa8fe1bfb6..a85bd6f75c 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -5052,7 +5052,6 @@ def _blenderbot_attn_forward_new( output_attentions: bool = False, cache_position: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" # if key_value_states are provided this layer is used as a cross-attention layer From 621e2bf710b44df53fc15435760bdc17c34886d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 30 Jan 2026 17:51:21 +0100 Subject: [PATCH 035/222] Use AutoProcessor instead of AutoFeatureExtractor --- tests/openvino/test_modeling.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 4ffa7ab06b..777b276859 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -1184,19 +1184,18 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(ov_model.config, PretrainedConfig) set_seed(SEED) transformers_model = AutoModelForImageClassification.from_pretrained(model_id) - preprocessor = AutoFeatureExtractor.from_pretrained(model_id) + preprocessor = AutoProcessor.from_pretrained(model_id) url = TEST_IMAGE_URL image = Image.open(requests.get(url, stream=True).raw) inputs = preprocessor(images=image, return_tensors="pt") with torch.no_grad(): transformers_outputs = transformers_model(**inputs) - for input_type in ["pt", "np"]: - inputs = preprocessor(images=image, return_tensors=input_type) - ov_outputs = ov_model(**inputs) - self.assertIn("logits", ov_outputs) - self.assertIsInstance(ov_outputs.logits, TENSOR_ALIAS_TO_TYPE[input_type]) - # Compare tensor outputs - self.assertTrue(torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-4)) + inputs = 
preprocessor(images=image, return_tensors="pt") + ov_outputs = ov_model(**inputs) + self.assertIn("logits", ov_outputs) + self.assertIsInstance(ov_outputs.logits, torch.Tensor) + # Compare tensor outputs + self.assertTrue(torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-4)) del transformers_model del ov_model gc.collect() @@ -1209,7 +1208,7 @@ def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] model = OVModelForImageClassification.from_pretrained(model_id, device=OPENVINO_DEVICE) model.eval() - preprocessor = AutoFeatureExtractor.from_pretrained(model_id) + preprocessor = AutoProcessor.from_pretrained(model_id) pipe = pipeline("image-classification", model=model, feature_extractor=preprocessor) inputs = TEST_IMAGE_URL outputs = pipe(inputs) From 30f628592391ba40c843c5077afbbc842eb586e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 30 Jan 2026 18:02:27 +0100 Subject: [PATCH 036/222] remove afmoe from models to be tested list --- optimum/exporters/openvino/model_configs.py | 2 +- tests/openvino/test_decoder.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index a8b293ca84..594d876812 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -4646,7 +4646,7 @@ class ASTOpenVINOConfig(ASTOnnxConfig): ) class AfmoeOpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.55.0" - MAX_TRANSFORMERS_VERSION = "4.57.99" + MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = AfmoeModelPatcher diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 33d8383876..19eb7dfb99 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -21,7 +21,12 @@ patch_awq_for_inference, ) -from optimum.exporters.openvino.model_configs import BitnetOpenVINOConfig, 
DeepseekOpenVINOConfig, LFM2OpenVINOConfig +from optimum.exporters.openvino.model_configs import ( + AfmoeOpenVINOConfig, + BitnetOpenVINOConfig, + DeepseekOpenVINOConfig, + LFM2OpenVINOConfig, +) from optimum.exporters.openvino.model_patcher import patch_update_causal_mask from optimum.exporters.openvino.utils import ONNX_SUPPORTED_ARCHITECTURES from optimum.exporters.tasks import TasksManager @@ -274,11 +279,13 @@ def test_find_untested_architectures(self): if "llama4_text" in supported_architectures: supported_architectures.remove("llama4_text") - if is_transformers_version(">=", str(DeepseekOpenVINOConfig.MAX_TRANSFORMERS_VERSION)): + if is_transformers_version(">", str(DeepseekOpenVINOConfig.MAX_TRANSFORMERS_VERSION)): if "deepseek_v2" in supported_architectures: supported_architectures.remove("deepseek_v2") if "deepseek_v3" in supported_architectures: supported_architectures.remove("deepseek_v3") + if is_transformers_version(">", str(AfmoeOpenVINOConfig.MAX_TRANSFORMERS_VERSION)): + supported_architectures -= {"afmoe"} if is_transformers_version("<", str(BitnetOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): supported_architectures -= {"bitnet"} if is_transformers_version("<", str(LFM2OpenVINOConfig.MIN_TRANSFORMERS_VERSION)): From 6cd7b1c263ef119008f21382be8f0a6dd32a5a29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 30 Jan 2026 18:20:04 +0100 Subject: [PATCH 037/222] fix pipeline saving tests --- tests/openvino/test_modeling.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 777b276859..d9a61e8b44 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -708,6 +708,8 @@ def test_load_model_from_hub(self): # verify could load both pytorch and openvino model (export argument should automatically infered) ov_exported_pipe = optimum_pipeline("text-generation", model_id, revision="pt", accelerator="openvino") + ov_exported_pipe.modelcard = None 
+ ov_pipe = optimum_pipeline("text-generation", model_id, revision="ov", accelerator="openvino") self.assertIsInstance(ov_exported_pipe.model, OVBaseModel) self.assertIsInstance(ov_pipe.model, OVBaseModel) From 85a0418e0a64b6e9de3e802912d55eaca9c7a056 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 2 Feb 2026 15:46:04 +0100 Subject: [PATCH 038/222] fix seq2seq pipeline tests loading --- tests/openvino/test_modeling.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index d9a61e8b44..785c4e2782 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -727,20 +727,21 @@ def test_load_model_from_hub(self): gc.collect() def test_seq2seq_load_from_hub(self): - model_id = "echarlaix/tiny-random-t5" + model_id = MODEL_NAMES["whisper"] + task = "automatic-speech-recognition" # verify could load both pytorch and openvino model (export argument should automatically infered) - ov_exported_pipe = optimum_pipeline("text2text-generation", model_id, accelerator="openvino") - ov_pipe = optimum_pipeline("text2text-generation", model_id, revision="ov", accelerator="openvino") + ov_exported_pipe = optimum_pipeline(task, model_id, accelerator="openvino") + ov_exported_pipe.modelcard = None + ov_pipe = optimum_pipeline(task, model_id, revision="ov", accelerator="openvino") self.assertIsInstance(ov_exported_pipe.model, OVBaseModel) self.assertIsInstance(ov_pipe.model, OVBaseModel) with TemporaryDirectory() as tmpdirname: ov_exported_pipe.save_pretrained(tmpdirname) folder_contents = os.listdir(tmpdirname) - if not ov_exported_pipe.model.decoder.stateful: - self.assertTrue(OV_DECODER_WITH_PAST_NAME in folder_contents) - self.assertTrue(OV_DECODER_WITH_PAST_NAME.replace(".xml", ".bin") in folder_contents) - ov_exported_pipe = optimum_pipeline("text2text-generation", tmpdirname, accelerator="openvino") + 
self.assertTrue(ov_exported_pipe.model._ov_model_paths["encoder"] in folder_contents) + self.assertTrue(ov_exported_pipe.model._ov_model_paths["decoder"] in folder_contents) + ov_exported_pipe = optimum_pipeline(task, tmpdirname, accelerator="openvino") self.assertIsInstance(ov_exported_pipe.model, OVBaseModel) del ov_exported_pipe From 08d148014e292ca9118cb1fdbf502369f3f44d46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 2 Feb 2026 16:06:52 +0100 Subject: [PATCH 039/222] disable pipelines tests when transformers >= v5 since summarization/translation/text2text-generation pipelines are deprecated --- tests/openvino/test_seq2seq.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 83a4b7c54f..daf81cf747 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -224,6 +224,10 @@ def test_compare_to_transformers(self, model_arch): @parameterized.expand(SUPPORTED_ARCHITECTURES) @pytest.mark.run_slow @slow + @pytest.mark.skipif( + is_transformers_version(">=", "5"), + reason="requires transformers < v5 since summarization/translation/text2text-generation pipelines are deprecated", + ) def test_pipeline(self, model_arch): set_seed(SEED) model_id = MODEL_NAMES[model_arch] From 7bc714cad9b526e8634b562a349a7ecdbc54abdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 2 Feb 2026 18:52:48 +0100 Subject: [PATCH 040/222] fix MixtralModelPatcher --- optimum/exporters/openvino/model_patcher.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index a9d67eba7d..af2a2f546b 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -396,18 +396,19 @@ def _mixtral_sparse_moe_block_forward(self, hidden_states: torch.Tensor) -> torc class 
MixtralModelPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() - - for layer in self._model.model.layers: - layer.block_sparse_moe._unpatched_forward = layer.block_sparse_moe.forward - layer.block_sparse_moe.forward = types.MethodType( - _mixtral_sparse_moe_block_forward, layer.block_sparse_moe - ) + if is_transformers_version("<", "5"): + for layer in self._model.model.layers: + layer.block_sparse_moe._unpatched_forward = layer.block_sparse_moe.forward + layer.block_sparse_moe.forward = types.MethodType( + _mixtral_sparse_moe_block_forward, layer.block_sparse_moe + ) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - for layer in self._model.model.layers: - layer.block_sparse_moe.forward = layer.block_sparse_moe._unpatched_forward + if is_transformers_version("<", "5"): + for layer in self._model.model.layers: + layer.block_sparse_moe.forward = layer.block_sparse_moe._unpatched_forward class ArcticModelPatcher(MixtralModelPatcher): From 8b374c7e067325c2ff7b3f0774aff33010f7d1a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 2 Feb 2026 19:03:49 +0100 Subject: [PATCH 041/222] fix moe patching --- optimum/exporters/openvino/model_patcher.py | 81 +++++++++++++-------- 1 file changed, 51 insertions(+), 30 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index af2a2f546b..18783eb770 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -402,6 +402,8 @@ def __enter__(self): layer.block_sparse_moe.forward = types.MethodType( _mixtral_sparse_moe_block_forward, layer.block_sparse_moe ) + else: + self._model.config._experts_implementation = "batched_mm" def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -1709,16 +1711,22 @@ def _phi_moe_sparse_moe_block_forward(self, hidden_states: torch.Tensor) -> torc class 
PhiMoEModelPatcher(Phi3ModelPatcher): def __enter__(self): super().__enter__() - for layer in self._model.model.layers: - layer.block_sparse_moe._orig_forward = layer.block_sparse_moe.forward - layer.block_sparse_moe.forward = types.MethodType( - _phi_moe_sparse_moe_block_forward, layer.block_sparse_moe - ) + + if is_transformers_version("<", "5"): + for layer in self._model.model.layers: + layer.block_sparse_moe._orig_forward = layer.block_sparse_moe.forward + layer.block_sparse_moe.forward = types.MethodType( + _phi_moe_sparse_moe_block_forward, layer.block_sparse_moe + ) + else: + self._model.config._experts_implementation = "batched_mm" def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - for layer in self._model.model.layers: - layer.block_sparse_moe.forward = layer.block_sparse_moe._orig_forward + + if is_transformers_version("<", "5"): + for layer in self._model.model.layers: + layer.block_sparse_moe.forward = layer.block_sparse_moe._orig_forward def _aquila_self_attn_sdpa_forward( @@ -4443,28 +4451,35 @@ def _granite_moe_parallel_experts_forward(self, inputs, expert_size): class GraniteMoEModelPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() - for layer in self._model.model.layers: - block_sparse_moe = layer.block_sparse_moe - block_sparse_moe.router._orig_forward = block_sparse_moe.router.forward - block_sparse_moe.router.forward = types.MethodType( - _granite_moe_topk_gating_forward, block_sparse_moe.router - ) - block_sparse_moe.input_linear._orig_forward = block_sparse_moe.input_linear.forward - block_sparse_moe.input_linear.forward = types.MethodType( - _granite_moe_parallel_experts_forward, block_sparse_moe.input_linear - ) - block_sparse_moe.output_linear._orig_forward = block_sparse_moe.output_linear.forward - block_sparse_moe.output_linear.forward = types.MethodType( - _granite_moe_parallel_experts_forward, block_sparse_moe.output_linear - ) + + if 
is_transformers_version("<", "5"): + for layer in self._model.model.layers: + block_sparse_moe = layer.block_sparse_moe + block_sparse_moe.router._orig_forward = block_sparse_moe.router.forward + block_sparse_moe.router.forward = types.MethodType( + _granite_moe_topk_gating_forward, block_sparse_moe.router + ) + block_sparse_moe.input_linear._orig_forward = block_sparse_moe.input_linear.forward + block_sparse_moe.input_linear.forward = types.MethodType( + _granite_moe_parallel_experts_forward, block_sparse_moe.input_linear + ) + block_sparse_moe.output_linear._orig_forward = block_sparse_moe.output_linear.forward + block_sparse_moe.output_linear.forward = types.MethodType( + _granite_moe_parallel_experts_forward, block_sparse_moe.output_linear + ) + + else: + self._model.config._experts_implementation = "batched_mm" def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - for layer in self._model.model.layers: - block_sparse_moe = layer.block_sparse_moe - block_sparse_moe.router.forward = block_sparse_moe.router._orig_forward - block_sparse_moe.input_linear.forward = block_sparse_moe.input_linear._orig_forward - block_sparse_moe.output_linear.forward = block_sparse_moe.output_linear._orig_forward + + if is_transformers_version("<", "5"): + for layer in self._model.model.layers: + block_sparse_moe = layer.block_sparse_moe + block_sparse_moe.router.forward = block_sparse_moe.router._orig_forward + block_sparse_moe.input_linear.forward = block_sparse_moe.input_linear._orig_forward + block_sparse_moe.output_linear.forward = block_sparse_moe.output_linear._orig_forward class OVSeq2SeqModelPatcher(OVModelPatcher): @@ -5270,14 +5285,18 @@ def _qwen2moe_sparse_block_forward(self, hidden_states: torch.Tensor) -> torch.T class Qwen2MoEPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() - if is_transformers_version(">=", "4.52.0"): + + if is_transformers_version(">=", "4.52.0") and is_transformers_version("<", 
"5"): from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock modulewise_patch(self._model, Qwen2MoeSparseMoeBlock, _qwen2moe_sparse_block_forward) + if is_transformers_version(">=", "5"): + self._model.config._experts_implementation = "batched_mm" + def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - if is_transformers_version(">=", "4.52.0"): + if is_transformers_version(">=", "4.52.0") and is_transformers_version("<", "5"): from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock modulewise_unpatch(self._model, Qwen2MoeSparseMoeBlock) @@ -6626,14 +6645,16 @@ class Qwen3MoeModelPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() - if is_transformers_version(">=", "4.53"): + if is_transformers_version(">=", "4.53") and is_transformers_version("<", "5"): self.original_moe_forward = Qwen3MoeSparseMoeBlock.forward Qwen3MoeSparseMoeBlock.forward = qwen3_moe_forward_patched + if is_transformers_version(">=", "5"): + self._model.config._experts_implementation = "batched_mm" def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - if is_transformers_version(">=", "4.53"): + if is_transformers_version(">=", "4.53") and is_transformers_version("<", "5"): Qwen3MoeSparseMoeBlock.forward = self.original_moe_forward From a4cfc55f57bbe575576db4db6b0a0edbef72b452 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 3 Feb 2026 15:17:13 +0100 Subject: [PATCH 042/222] gptj fix --- optimum/exporters/openvino/model_patcher.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 18783eb770..68b306e318 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -2793,15 +2793,19 @@ def gptj_attn_forward( if output_attentions: self._attn = 
self._orig_attn + kwargs = {} + if is_transformers_version("<", "5"): + kwrags["head_mask"] = head_mask + return self._orig_forward( hidden_states, layer_past, attention_mask, position_ids, - head_mask, use_cache=use_cache, output_attentions=output_attentions, cache_position=cache_position, + **kwargs ) From 5bab4588af14819368c7b5bbe555abad18df20b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 3 Feb 2026 15:40:58 +0100 Subject: [PATCH 043/222] fix granitemoehybrid patcher --- optimum/exporters/openvino/model_patcher.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 68b306e318..08710d6601 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -2805,7 +2805,7 @@ def gptj_attn_forward( use_cache=use_cache, output_attentions=output_attentions, cache_position=cache_position, - **kwargs + **kwargs, ) @@ -7522,10 +7522,12 @@ def patch_sparse_moe(sparse_moe_layer): super().__enter__() setattr(self._model, self.orig_forward_name, self.patched_forward) - self._model.model._orig_update_causal_mask = self._model.model._update_causal_mask - self._model.model._update_causal_mask = types.MethodType( - granite_moe_hybrid_update_causal_mask, self._model.model - ) + if is_transformers_version("<", "5"): + self._model.model._orig_update_causal_mask = self._model.model._update_causal_mask + self._model.model._update_causal_mask = types.MethodType( + granite_moe_hybrid_update_causal_mask, self._model.model + ) + for idx, layer in enumerate(self._model.model.layers): if hasattr(layer, "block_sparse_moe"): patch_sparse_moe(layer.block_sparse_moe) @@ -7545,7 +7547,9 @@ def unpatch_sparse_moe(sparse_moe_layer): super().__exit__(exc_type, exc_value, traceback) setattr(self._model, self.orig_forward_name, self.model_orig_forward) - self._model.model._update_causal_mask = 
self._model.model._orig_update_causal_mask + if is_transformers_version("<", "5"): + self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask + for idx, layer in enumerate(self._model.model.layers): if hasattr(layer, "block_sparse_moe"): unpatch_sparse_moe(layer.block_sparse_moe) From daf7ec83e2fe8da562575fb7db52de930980002a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 3 Feb 2026 18:20:54 +0100 Subject: [PATCH 044/222] typo --- optimum/exporters/openvino/model_patcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 08710d6601..a496ec7d8e 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -2795,7 +2795,7 @@ def gptj_attn_forward( kwargs = {} if is_transformers_version("<", "5"): - kwrags["head_mask"] = head_mask + kwargs["head_mask"] = head_mask return self._orig_forward( hidden_states, From a45f5ab8f32c7fef54381587fb9e5368b816610e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 3 Feb 2026 19:10:59 +0100 Subject: [PATCH 045/222] add exaone max_transformers_version as incompatible with v5 --- optimum/exporters/openvino/model_configs.py | 2 +- tests/openvino/test_decoder.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 594d876812..0f8afff724 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -701,7 +701,7 @@ class BitnetOpenVINOConfig(LlamaOnnxConfig): library_name="transformers", ) class ExaoneOpenVINOConfig(LlamaOpenVINOConfig): - pass + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 19eb7dfb99..1f7ae31827 100644 --- a/tests/openvino/test_decoder.py 
+++ b/tests/openvino/test_decoder.py @@ -90,7 +90,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "qwen2_moe", "phi3", "gemma2", - "exaone", "granite", "granitemoe", ) @@ -152,7 +151,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen", "chatglm", "chatglm4") if is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ("codegen2",) + SUPPORTED_ARCHITECTURES += ("codegen2", "exaone") GENERATION_LENGTH = 100 From 342dc59c3742230d4661c351d8dba272382040cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 4 Feb 2026 14:35:09 +0100 Subject: [PATCH 046/222] add decilm max_transformers_version as incompatible with v5 --- optimum/exporters/openvino/model_configs.py | 1 + tests/openvino/test_decoder.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 0f8afff724..fb6be5eb52 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1351,6 +1351,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int @register_in_tasks_manager("deci", *["text-generation", "text-generation-with-past"], library_name="transformers") class DeciOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DeciDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = DeciDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 1f7ae31827..2d7652e3bf 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -79,7 +79,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "xverse", "internlm", "jais", - "decilm", "gemma", "olmo", "stablelm", @@ -151,7 +150,7 @@ class 
OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen", "chatglm", "chatglm4") if is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ("codegen2", "exaone") + SUPPORTED_ARCHITECTURES += ("codegen2", "exaone", "decilm") GENERATION_LENGTH = 100 From 2a28fe7211bc39190303da7cfea980007b992e89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 4 Feb 2026 16:16:59 +0100 Subject: [PATCH 047/222] fix llama4 patcher --- optimum/exporters/openvino/model_patcher.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index a496ec7d8e..8aa6b94a18 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -19,7 +19,7 @@ import math import types from collections.abc import Iterable -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union import torch import torch.nn.functional as F @@ -6289,6 +6289,15 @@ def llama4_moe_forward(self, hidden_states): return out, router_scores +# Copied from https://github.com/huggingface/transformers/blob/v4.56.0/src/transformers/masking_utils.py#L105 +# transformers.masking_utils._legacy_chunked_overlay deprecated since transformers v5 +def _legacy_chunked_overlay(chunk_size: int) -> Callable: + def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool: + return kv_idx // chunk_size == q_idx // chunk_size + + return inner_mask + + class Llama4TextModelPatcher(OVModelPatcher): def __enter__(self): super().__enter__() @@ -6305,8 +6314,8 @@ def __enter__(self): if is_transformers_version(">=", "4.56"): # openvino is not able to trace through the new chunked_overlay with left_padding self.original_chunked_overlay = transformers.masking_utils.chunked_overlay - 
transformers.masking_utils.chunked_overlay = ( - lambda chunk_size, left_padding: transformers.masking_utils._legacy_chunked_overlay(chunk_size) + transformers.masking_utils.chunked_overlay = lambda chunk_size, left_padding: _legacy_chunked_overlay( + chunk_size ) def __exit__(self, exc_type, exc_value, traceback): From b9a3cbe90f1ecd802f008f3cfc2cb75d1934fc25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 4 Feb 2026 16:54:48 +0100 Subject: [PATCH 048/222] make OV DynamicCache backward compatible --- optimum/exporters/openvino/model_patcher.py | 52 +++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 8aa6b94a18..8d1c7a93e9 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -56,6 +56,28 @@ class OVDynamicCache(DynamicCache): + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L881 + def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]: + """ + Support for backwards-compatible `past_key_values` indexing, e.g. `past_key_values[0][0].shape[2]` to get the + sequence length. + """ + if layer_idx < len(self.layers): + return self.layers[layer_idx].keys, self.layers[layer_idx].values + else: + raise KeyError( + f"Cache only has {len(self.layers)} layers, attempted to access layer with index {layer_idx}" + ) + + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L893 + def __iter__(self): + """ + Support for backwards-compatible `past_key_values` iteration, e.g. 
`for x in past_key_values:` to iterate over + keys and values + """ + for layer_idx in range(len(self)): + yield (self.layers[layer_idx].keys, self.layers[layer_idx].values) + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1005 def to_legacy_cache(self) -> tuple[tuple[torch.Tensor, torch.Tensor]]: """ @@ -85,6 +107,36 @@ def from_legacy_cache(cls, past_key_values: tuple[tuple[torch.Tensor, torch.Tens class OVEncoderDecoderCache(EncoderDecoderCache): + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1244 + def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Support for backwards-compatible `past_key_values` indexing, e.g. `past_key_values[0][0].shape[2]` to get the + sequence length. + """ + if layer_idx < len(self): + return ( + self.self_attention_cache.layers[layer_idx].keys, + self.self_attention_cache.layers[layer_idx].values, + self.cross_attention_cache.layers[layer_idx].keys, + self.cross_attention_cache.layers[layer_idx].values, + ) + else: + raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}") + + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1231 + def __iter__(self): + """ + Support for backwards-compatible `past_key_values` iteration, e.g. 
`for x in past_key_values:` to iterate over + keys and values + """ + for layer_idx in range(len(self)): + yield ( + self.self_attention_cache.layers[layer_idx].keys, + self.self_attention_cache.layers[layer_idx].values, + self.cross_attention_cache.layers[layer_idx].keys, + self.cross_attention_cache.layers[layer_idx].values, + ) + # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1266 def to_legacy_cache(self) -> tuple[tuple[torch.Tensor]]: """Converts the `OVEncoderDecoderCache` instance into its equivalent in the legacy cache format.""" From 1687e3515df16c6f19cb7c76e089a4d3523f4255 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 4 Feb 2026 17:48:16 +0100 Subject: [PATCH 049/222] remove incompatible models aquila2 orion internlm2 --- optimum/exporters/openvino/model_configs.py | 3 ++- tests/openvino/test_decoder.py | 5 +---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index fb6be5eb52..5e2e643ef4 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -858,6 +858,7 @@ class Starcoder2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): @register_in_tasks_manager("internlm2", *["text-generation", "text-generation-with-past"], library_name="transformers") class InternLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig @@ -867,7 +868,7 @@ class InternLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): @register_in_tasks_manager("orion", *["text-generation", "text-generation-with-past"], library_name="transformers") class 
OrionOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 - + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 2d7652e3bf..f40e402ecb 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -66,8 +66,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "opt", "pegasus", "phi", - "internlm2", - "orion", "falcon", "falcon-40b", "persimmon", @@ -75,7 +73,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "gpt_neox_japanese", "xglm", "aquila", - "aquila2", "xverse", "internlm", "jais", @@ -150,7 +147,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen", "chatglm", "chatglm4") if is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ("codegen2", "exaone", "decilm") + SUPPORTED_ARCHITECTURES += ("codegen2", "exaone", "decilm", "internlm2", "orion", "aquila2") GENERATION_LENGTH = 100 From 961c1d3f3aa91193ff7ed09ef88c22c6c8a23514 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 4 Feb 2026 18:34:26 +0100 Subject: [PATCH 050/222] add jais max_transformers_version as incompatible with v5 --- optimum/exporters/openvino/model_configs.py | 1 + tests/openvino/test_decoder.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 5e2e643ef4..2e060684e7 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1184,6 +1184,7 @@ class DBRXOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): ) class JaisOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + 
MAX_TRANSFORMERS_VERSION = "4.57.6" NORMALIZED_CONFIG_CLASS = NormalizedTextConfig DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = DummyPastKeyValuesGenerator diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index f40e402ecb..26e8010f76 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -75,7 +75,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "aquila", "xverse", "internlm", - "jais", "gemma", "olmo", "stablelm", @@ -147,7 +146,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen", "chatglm", "chatglm4") if is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ("codegen2", "exaone", "decilm", "internlm2", "orion", "aquila2") + SUPPORTED_ARCHITECTURES += ("codegen2", "exaone", "decilm", "internlm2", "orion", "aquila2", "jais") GENERATION_LENGTH = 100 From e8e6c18a284e8cf036ca12a420e458724a42d7ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 5 Feb 2026 10:29:30 +0100 Subject: [PATCH 051/222] dbrx --- optimum/exporters/openvino/model_configs.py | 1 + tests/openvino/test_decoder.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 2e060684e7..3113fdf136 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1165,6 +1165,7 @@ class CodeGenOpenVINOConfig(CodeGenOnnxConfig): ) class DBRXOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( num_attention_heads="n_heads", hidden_size="d_model", diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 26e8010f76..5b8916c5cb 100644 --- a/tests/openvino/test_decoder.py +++ 
b/tests/openvino/test_decoder.py @@ -79,7 +79,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "olmo", "stablelm", "starcoder2", - "dbrx", "cohere", "qwen2", "qwen2_moe", @@ -146,7 +145,8 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen", "chatglm", "chatglm4") if is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ("codegen2", "exaone", "decilm", "internlm2", "orion", "aquila2", "jais") + # TODO: add dbrx back once fixed in transformers + SUPPORTED_ARCHITECTURES += ("codegen2", "exaone", "decilm", "internlm2", "orion", "aquila2", "jais", "dbrx") GENERATION_LENGTH = 100 From c6640d6364257db8da6207999ed00384a9e16358 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 5 Feb 2026 19:18:38 +0100 Subject: [PATCH 052/222] set float32 dtype --- tests/openvino/test_decoder.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 5b8916c5cb..76901f5db4 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -360,7 +360,17 @@ def test_compare_to_transformers(self, model_arch): set_seed(SEED) with mock_torch_cuda_is_available("awq" in model_arch or "gptq" in model_arch): transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) - if model_arch in ["qwen", "arctic", "chatglm4", "gpt_oss_mxfp4"]: + if model_arch in [ + "qwen", + "arctic", + "chatglm4", + "gpt_oss_mxfp4", + "llama", + "lfm2", + "gemma3_text", + "llama4", + "exaone4", + ]: transformers_model.to(torch.float32) with torch.no_grad(): @@ -808,7 +818,7 @@ def test_beam_search(self, model_arch): def test_load_with_different_dtype(self): set_seed(SEED) - model_id = MODEL_NAMES["llama"] + model_id = MODEL_NAMES["mistral"] pt_model = AutoModelForCausalLM.from_pretrained( model_id, ) From 1e0c06ffa9a3bb82009399f98dab0efd70219287 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=E2=80=9CElla?= Date: Thu, 5 Feb 2026 19:30:18 +0100 Subject: [PATCH 053/222] baichuan2 not compatible with v5 --- tests/openvino/test_decoder.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 76901f5db4..86105e8112 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -45,7 +45,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ( "bart", - "baichuan2", "baichuan2-13b", "gpt_bigcode", "bigbird_pegasus", @@ -146,7 +145,17 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version("<", "5"): # TODO: add dbrx back once fixed in transformers - SUPPORTED_ARCHITECTURES += ("codegen2", "exaone", "decilm", "internlm2", "orion", "aquila2", "jais", "dbrx") + SUPPORTED_ARCHITECTURES += ( + "codegen2", + "exaone", + "decilm", + "internlm2", + "orion", + "aquila2", + "jais", + "dbrx", + "baichuan2", + ) GENERATION_LENGTH = 100 From b4910fc0a30bbaf391cae1057dccb60f4f8d5225 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 5 Feb 2026 22:40:38 +0100 Subject: [PATCH 054/222] gpt oss set experts_implementation batched mm --- optimum/exporters/openvino/model_patcher.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 8d1c7a93e9..7490c44d55 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -7385,16 +7385,19 @@ class GptOssModelPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() - if is_transformers_version(">=", "4.55.0"): + if is_transformers_version(">=", "4.55.0") and is_transformers_version("<", "5"): from transformers.models.gpt_oss.modeling_gpt_oss import GptOssExperts self.original_gpt_oss_forward = GptOssExperts.forward GptOssExperts.forward = 
gpt_oss_forward + if is_transformers_version(">=", "5"): + self._model.config._experts_implementation = "batched_mm" + def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - if is_transformers_version(">=", "4.55.0"): + if is_transformers_version(">=", "4.55.0") and is_transformers_version("<", "5"): from transformers.models.gpt_oss.modeling_gpt_oss import GptOssExperts GptOssExperts.forward = self.original_gpt_oss_forward From e19da565ff0cb4d85921c99c606b3a67df7af259 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 5 Feb 2026 23:15:09 +0100 Subject: [PATCH 055/222] bitnet --- optimum/exporters/openvino/model_configs.py | 1 + tests/openvino/test_decoder.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 3113fdf136..e25c154f4a 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -686,6 +686,7 @@ class GptOssOpenVINOConfig(LlamaOpenVINOConfig): ) class BitnetOpenVINOConfig(LlamaOnnxConfig): MIN_TRANSFORMERS_VERSION = "4.52.1" + MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = OVDecoderModelPatcher diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 86105e8112..6efa3629fe 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -125,7 +125,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.53.0"): SUPPORTED_ARCHITECTURES += ("arcee",) - if is_transformers_version(">=", "4.52.1"): + if is_transformers_version(">=", "4.52.1") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("bitnet",) if is_transformers_version(">=", "4.54.0"): From fde5ac98af9612f51bd015efe6f26c4ca693c268 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 6 Feb 2026 12:23:38 +0100 Subject: [PATCH 056/222] qwenvl --- 
optimum/exporters/openvino/model_configs.py | 26 ++++++++++----------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index e25c154f4a..4d5a0d4f48 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -317,11 +317,11 @@ def init_model_configs(): register_in_tasks_manager = TasksManager.create_register("openvino", overwrite_existing=True) -def _get_language_model(model): +def _get_subcomponent_model(model, name): if is_transformers_version("<", "5"): - return model.language_model + return getattr(model, name) - return model.model.language_model + return getattr(model.model, name) @register_in_tasks_manager("baichuan", *["text-generation", "text-generation-with-past"], library_name="transformers") @@ -1714,14 +1714,14 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.LANGUAGE: - return _get_language_model(model) if not hasattr(model, "lm_head") else model + return _get_subcomponent_model(model, "language_model") if not hasattr(model, "lm_head") else model if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return model if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: text_embedding = model.get_input_embeddings() - text_embedding.config = _get_language_model(model).config + text_embedding.config = _get_subcomponent_model(model, "language_model").config return text_embedding def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): @@ -1904,8 +1904,8 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: - text_embedding = _get_language_model(model).get_input_embeddings() - text_embedding.config = _get_language_model(model).config + 
text_embedding = _get_subcomponent_model(model, "language_model").get_input_embeddings() + text_embedding.config = _get_subcomponent_model(model, "language_model").config return text_embedding return super().get_model_for_behavior(model, behavior) @@ -1981,14 +1981,14 @@ def get_model_for_behavior(model, behavior: Union[str, VLMConfigBehavior]): behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.LANGUAGE: - return _get_language_model(model) + return _get_subcomponent_model(model, "language_model") if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return model if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: - text_embedding = _get_language_model(model).get_input_embeddings() - text_embedding.config = _get_language_model(model).config + text_embedding = _get_subcomponent_model(model, "language_model").get_input_embeddings() + text_embedding.config = _get_subcomponent_model(model, "language_model").config return text_embedding def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): @@ -3478,12 +3478,12 @@ def get_model_for_behavior(model, behavior: Union[str, Qwen2VLConfigBehavior]): return model if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS: - vision_embeddings = model.visual.patch_embed + vision_embeddings = _get_subcomponent_model(model, "visual").patch_embed vision_embeddings.config = model.config.vision_config return vision_embeddings if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER: - vision_emb_merger = model.visual + vision_emb_merger = _get_subcomponent_model(model, "visual") vision_emb_merger.config = model.config.vision_config return vision_emb_merger @@ -3491,7 +3491,7 @@ def get_model_for_behavior(model, behavior: Union[str, Qwen2VLConfigBehavior]): text_embedding = ( model.model.embed_tokens if hasattr(model.model, "embed_tokens") - else _get_language_model(model).embed_tokens + else _get_subcomponent_model(model, "language_model").embed_tokens ) 
text_embedding.config = model.config return text_embedding From 0d3b656a2c110b2a84f440d428dff13e86461b8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 6 Feb 2026 12:43:12 +0100 Subject: [PATCH 057/222] maira2 remote code --- optimum/exporters/openvino/model_configs.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 4d5a0d4f48..b20e80ae27 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1904,10 +1904,13 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: - text_embedding = _get_subcomponent_model(model, "language_model").get_input_embeddings() - text_embedding.config = _get_subcomponent_model(model, "language_model").config + text_embedding = model.language_model.get_input_embeddings() + text_embedding.config = model.language_model.config return text_embedding + if behavior == VLMConfigBehavior.LANGUAGE: + return model.language_model + return super().get_model_for_behavior(model, behavior) From b8797e32dcb0dd49fd42a5cb403df051f4f7d6e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 6 Feb 2026 15:00:31 +0100 Subject: [PATCH 058/222] gemma3 and got_ocr2 --- optimum/exporters/openvino/model_patcher.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 7490c44d55..f780ce0cd9 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4684,7 +4684,11 @@ def __init__( model.__orig_forward = model.forward # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/got_ocr2/modeling_got_ocr2.py#L835 # Adopted from 
https://github.com/huggingface/transformers/blob/v4.49.0-Gemma-3/src/transformers/models/gemma3/modeling_gemma3.py#L1321 - if hasattr(model, "model") and hasattr(model.model, "get_image_features"): + if ( + hasattr(model, "model") + and hasattr(model.model, "get_image_features") + and is_transformers_version("<", "5") + ): model.forward = model.model.get_image_features else: model.forward = model.get_image_features From 9dfb66617c19508dade18184d84bd20a6f5d9cf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 6 Feb 2026 16:36:14 +0100 Subject: [PATCH 059/222] llava next --- optimum/exporters/openvino/model_configs.py | 15 ++++++----- optimum/exporters/openvino/model_patcher.py | 29 +++++++++++++++++++-- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index b20e80ae27..a2a54fb152 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -166,6 +166,7 @@ Llama4ImageEmbeddingsModelPatcher, Llama4TextModelPatcher, LlavaImageEmbeddingModelPatcher, + LlavaNextImageEmbeddingModelPatcher, LlavaNextVideoImageEmbeddingModelPatcher, LlavaQwen2ImageEmbeddingsModelPatcher, MairaImageEmbeddingModelPatcher, @@ -199,6 +200,7 @@ SanaTextEncoderModelPatcher, XverseModelPatcher, Zamba2ModelPatcher, + _get_subcomponent_model, ) @@ -317,13 +319,6 @@ def init_model_configs(): register_in_tasks_manager = TasksManager.create_register("openvino", overwrite_existing=True) -def _get_subcomponent_model(model, name): - if is_transformers_version("<", "5"): - return getattr(model, name) - - return getattr(model.model, name) - - @register_in_tasks_manager("baichuan", *["text-generation", "text-generation-with-past"], library_name="transformers") class BaichaunOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 13 @@ -1773,6 +1768,12 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs) 
-> Dict: class LlavaNextOpenVINOConfig(LlavaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.40.0" + def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): + model_kwargs = model_kwargs or {} + if self._behavior != VLMConfigBehavior.VISION_EMBEDDINGS: + return super().patch_model_for_export(model, model_kwargs) + return LlavaNextImageEmbeddingModelPatcher(self, model, model_kwargs) + class DummyLLavaMultiModalProjectorInputGenerator(DummyInputGenerator): SUPPORTED_INPUT_NAMES = ["image_features"] diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index f780ce0cd9..5b93fc5347 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -55,6 +55,13 @@ logger = logging.getLogger(__name__) +def _get_subcomponent_model(model, name): + if is_transformers_version("<", "5"): + return getattr(model, name) + + return getattr(model.model, name) + + class OVDynamicCache(DynamicCache): # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L881 def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]: @@ -3365,7 +3372,7 @@ def llava_vision_embed_forward(self, pixel_values): # copied from https://github.com/huggingface/transformers/blob/v4.44.2/src/transformers/models/llava/modeling_llava.py#L428-L441 # these changes does not bring any difference from original, it only packs model subcomponent inference together # that allow us avoid memory overheads and their inference results handling on code-level - image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) + image_outputs = _get_subcomponent_model(self, "vision_tower")(pixel_values, output_hidden_states=True) # this is not memory efficient at all (output_hidden_states=True) will save all the hidden stated. 
selected_image_feature = image_outputs.hidden_states[self.config.vision_feature_layer] @@ -3376,7 +3383,7 @@ def llava_vision_embed_forward(self, pixel_values): else: raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") - image_features = self.multi_modal_projector(selected_image_feature) + image_features = _get_subcomponent_model(self, "multi_modal_projector")(selected_image_feature) return image_features @@ -3429,6 +3436,24 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward +class LlavaNextImageEmbeddingModelPatcher(OVModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: "PreTrainedModel", + model_kwargs: Dict[str, Any], + ): + model.__orig_forward = model.forward + # TODO: use get_image_features instead and add image_sizes as input when exorting + # https://github.com/huggingface/transformers/blob/v4.48.0/src/transformers/models/llava_next/modeling_llava_next.py#L716 + model.forward = types.MethodType(llava_vision_embed_forward, model) + super().__init__(config, model, model_kwargs) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.forward = self._model.__orig_forward + + class MairaImageEmbeddingModelPatcher(OVModelPatcher): def __init__( self, From 3386c647fb1d5ad051c866e0b071ce77b4e9760a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 6 Feb 2026 16:51:13 +0100 Subject: [PATCH 060/222] llava next video --- optimum/exporters/openvino/model_patcher.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 5b93fc5347..b3412a0a41 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3391,7 +3391,7 @@ def llava_next_video_vision_embed_forward(self, pixel_values): # copied from 
https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/llava_next_video/modeling_llava_next_video.py#L519 # these changes does not bring any difference from original, it only packs model subcomponent inference together # that allow us avoid memory overheads and their inference results handling on code-level - image_features = self.vision_tower(pixel_values, output_hidden_states=True) + image_features = _get_subcomponent_model(self, "vision_tower")(pixel_values, output_hidden_states=True) vision_feature_layer = self.config.vision_feature_layer if isinstance(vision_feature_layer, int): selected_image_feature = image_features.hidden_states[vision_feature_layer] @@ -3444,7 +3444,7 @@ def __init__( model_kwargs: Dict[str, Any], ): model.__orig_forward = model.forward - # TODO: use get_image_features instead and add image_sizes as input when exorting + # TODO: use get_image_features instead and add image_sizes as input when exporting # https://github.com/huggingface/transformers/blob/v4.48.0/src/transformers/models/llava_next/modeling_llava_next.py#L716 model.forward = types.MethodType(llava_vision_embed_forward, model) super().__init__(config, model, model_kwargs) @@ -3479,12 +3479,9 @@ def __init__( model_kwargs: Dict[str, Any], ): model.__orig_forward = model.forward - - if is_transformers_version("<", "5"): - model.forward = types.MethodType(llava_next_video_vision_embed_forward, model) - else: - model.forward = model.get_image_features - + # TODO: use get_image_features instead and add image_sizes as input when exporting + # https://github.com/huggingface/transformers/blob/v4.48.0/src/transformers/models/llava_next_video/modeling_llava_next_video.py#L746 + model.forward = types.MethodType(llava_next_video_vision_embed_forward, model) super().__init__(config, model, model_kwargs) def __exit__(self, exc_type, exc_value, traceback): From bc4a84d163dd4b8d6166272006b9b7c6c105e804 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: 
Tue, 10 Feb 2026 15:12:12 +0100 Subject: [PATCH 061/222] use ONNXCache --- optimum/exporters/openvino/model_patcher.py | 245 ++++---------------- 1 file changed, 42 insertions(+), 203 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 1ab1a386b9..b779170d8e 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -36,6 +36,9 @@ override_arguments, sdpa_mask_without_vmap, ) +from optimum.exporters.onnx.utils import ONNXDynamicCache, ONNXEncoderDecoderCache + + from optimum.intel.utils.import_utils import is_diffusers_version, is_torch_version, is_transformers_version @@ -62,170 +65,6 @@ def _get_subcomponent_model(model, name): return getattr(model.model, name) -class OVDynamicCache(DynamicCache): - # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L881 - def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]: - """ - Support for backwards-compatible `past_key_values` indexing, e.g. `past_key_values[0][0].shape[2]` to get the - sequence length. - """ - if layer_idx < len(self.layers): - return self.layers[layer_idx].keys, self.layers[layer_idx].values - else: - raise KeyError( - f"Cache only has {len(self.layers)} layers, attempted to access layer with index {layer_idx}" - ) - - # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L893 - def __iter__(self): - """ - Support for backwards-compatible `past_key_values` iteration, e.g. 
`for x in past_key_values:` to iterate over - keys and values - """ - for layer_idx in range(len(self)): - yield (self.layers[layer_idx].keys, self.layers[layer_idx].values) - - # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1005 - def to_legacy_cache(self) -> tuple[tuple[torch.Tensor, torch.Tensor]]: - """ - Converts the `Cache` instance into the its equivalent in the legacy cache format. Used for - backward compatibility. - """ - legacy_cache = () - for layer in self.layers: - legacy_cache += ((layer.keys, layer.values),) - return legacy_cache - - # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1015 - @classmethod - def from_legacy_cache(cls, past_key_values: tuple[tuple[torch.Tensor, torch.Tensor]]) -> "OVDynamicCache": - """ - Converts a cache in the legacy cache format into an equivalent `Cache`. Used for - backward compatibility. - """ - cache = cls() - if past_key_values is None: - logger.warning_once("past_key_values should not be None in from_legacy_cache()") - if past_key_values is not None: - for layer_idx in range(len(past_key_values)): - key_states, value_states = past_key_values[layer_idx] - cache.update(key_states, value_states, layer_idx) - return cache - - -class OVEncoderDecoderCache(EncoderDecoderCache): - # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1244 - def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Support for backwards-compatible `past_key_values` indexing, e.g. `past_key_values[0][0].shape[2]` to get the - sequence length. 
- """ - if layer_idx < len(self): - return ( - self.self_attention_cache.layers[layer_idx].keys, - self.self_attention_cache.layers[layer_idx].values, - self.cross_attention_cache.layers[layer_idx].keys, - self.cross_attention_cache.layers[layer_idx].values, - ) - else: - raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}") - - # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1231 - def __iter__(self): - """ - Support for backwards-compatible `past_key_values` iteration, e.g. `for x in past_key_values:` to iterate over - keys and values - """ - for layer_idx in range(len(self)): - yield ( - self.self_attention_cache.layers[layer_idx].keys, - self.self_attention_cache.layers[layer_idx].values, - self.cross_attention_cache.layers[layer_idx].keys, - self.cross_attention_cache.layers[layer_idx].values, - ) - - # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1266 - def to_legacy_cache(self) -> tuple[tuple[torch.Tensor]]: - """Converts the `OVEncoderDecoderCache` instance into its equivalent in the legacy cache format.""" - legacy_cache = () - if len(self.cross_attention_cache) > 0: - for self_attn, cross_attn in zip( - self.self_attention_cache.to_legacy_cache(), self.cross_attention_cache.to_legacy_cache() - ): - legacy_cache += (self_attn + cross_attn,) - else: - legacy_cache = self.self_attention_cache.to_legacy_cache() - return legacy_cache - - # copied from https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/cache_utils.py#L1279 - @classmethod - def from_legacy_cache( - cls, past_key_values: Optional[Iterable[tuple[torch.FloatTensor, ...]]] - ) -> "OVEncoderDecoderCache": - """Converts a cache in the legacy cache format into an equivalent `OVEncoderDecoderCache`.""" - cache = cls(OVDynamicCache(), OVDynamicCache()) - if past_key_values is None: - logger.warning_once("past_key_values 
should not be None in from_legacy_cache()") - else: - for layer_idx, key_value_states in enumerate(past_key_values): - key_states, value_states = key_value_states[:2] - cache.self_attention_cache.update(key_states, value_states, layer_idx) - if len(key_value_states) > 2: - key_states, value_states = key_value_states[2:] - cache.cross_attention_cache.update(key_states, value_states, layer_idx) - cache.is_updated[layer_idx] = True - return cache - - -def preprocess_past_key_values(past_key_values): - if ( - is_transformers_version(">=", "4.48") - and isinstance(past_key_values, (list, tuple)) - and isinstance(past_key_values[0], (list, tuple)) - ): - if len(past_key_values[0]) == 2: - past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) - elif len(past_key_values[0]) == 4: - past_key_values = OVEncoderDecoderCache.from_legacy_cache(past_key_values) - else: - raise ValueError( - f"past_key_values should have either 2 or 4 elements, but it has {len(past_key_values[0])} elements." 
- ) - - return past_key_values - - -class OVModelPatcher(ModelPatcher): - def __init__( - self, - config: "OnnxConfig", - model: "PreTrainedModel", - model_kwargs: Optional[Dict[str, Any]] = None, - ): - super().__init__(config, model, model_kwargs) - - self.model_patched_forward = self.patched_forward - - @functools.wraps(self.model_patched_forward) - def patched_forward(*args, **kwargs): - signature = inspect.signature(self.model_patched_forward) - args, kwargs = override_arguments(args, kwargs, signature, model_kwargs=self.model_kwargs) - - if "past_key_values" in signature.parameters: - # Most models require past_key_values to be a cache instance instead of a tuple now - pkv_index = list(signature.parameters.keys()).index("past_key_values") - if pkv_index < len(args) and args[pkv_index] is not None: - args[pkv_index] = preprocess_past_key_values(args[pkv_index]) - elif kwargs.get("past_key_values") is not None: - kwargs["past_key_values"] = preprocess_past_key_values(kwargs["past_key_values"]) - - outputs = self.model_patched_forward(*args, **kwargs) - - return outputs - - self.patched_forward = patched_forward - - for idx, spec in enumerate(UNSUPPORTED_OPS_PATCHING_SPEC): if spec.name in { # onnx-exporter-specific fixes @@ -382,7 +221,7 @@ def eager_mask_without_vmap(*args, **kwargs) -> Optional[torch.Tensor]: return mask -class OVDecoderModelPatcher(OVModelPatcher): +class OVDecoderModelPatcher(ModelPatcher): def __enter__(self): super().__enter__() @@ -1513,7 +1352,7 @@ def phi3_442_forward( if use_cache: use_legacy_cache = not isinstance(past_key_values, Cache) if use_legacy_cache: - past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) + past_key_values = ONNXDynamicCache.from_legacy_cache(past_key_values) past_key_values_length = past_key_values.get_usable_length(seq_length) if position_ids is None: @@ -3095,7 +2934,7 @@ def patched_forward(*args, **kwargs): legacy_pkv = args[pkv_argument_index] pkv_in_args = True if legacy_pkv is not None: 
- pkv = OVDynamicCache.from_legacy_cache(legacy_pkv) + pkv = ONNXDynamicCache.from_legacy_cache(legacy_pkv) return_legacy_cache = True if not pkv_in_args: kwargs["past_key_values"] = pkv @@ -3254,7 +3093,7 @@ def __exit__(self, exc_type, exc_value, traceback): layer.self_attn.forward = layer.self_attn._orig_forward -class IBertModelPatcher(OVModelPatcher): +class IBertModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3272,7 +3111,7 @@ def __init__( self._model(torch.ones([1, 1], dtype=torch.long)) -class InternVLChatImageEmbeddingModelPatcher(OVModelPatcher): +class InternVLChatImageEmbeddingModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3415,7 +3254,7 @@ def maira_vision_embed_forward(self, pixel_values): return self.get_image_features(pixel_values, vision_feature_layer, vision_feature_select_strategy) -class LlavaImageEmbeddingModelPatcher(OVModelPatcher): +class LlavaImageEmbeddingModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3436,7 +3275,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class LlavaNextImageEmbeddingModelPatcher(OVModelPatcher): +class LlavaNextImageEmbeddingModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3454,7 +3293,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class MairaImageEmbeddingModelPatcher(OVModelPatcher): +class MairaImageEmbeddingModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3471,7 +3310,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class LlavaNextVideoImageEmbeddingModelPatcher(OVModelPatcher): +class LlavaNextVideoImageEmbeddingModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3513,7 +3352,7 @@ def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor: return emb.unsqueeze(1) -class 
FluxTransfromerModelPatcher(OVModelPatcher): +class FluxTransfromerModelPatcher(ModelPatcher): def __enter__(self): super().__enter__() if is_diffusers_version("<", "0.31.0"): @@ -3688,7 +3527,7 @@ def _minicpmv_siglip_transformer_forward( ) -class MiniCPMVResamplerModelPatcher(OVModelPatcher): +class MiniCPMVResamplerModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3705,7 +3544,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class MiniCPMVImageEmbeddingsModelPatcher(OVModelPatcher): +class MiniCPMVImageEmbeddingsModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3736,7 +3575,7 @@ def __exit__(self, exc_type, exc_value, traceback): layer.self_attn.forward = layer.self_attn._orig_forward -class LlavaQwen2ImageEmbeddingsModelPatcher(OVModelPatcher): +class LlavaQwen2ImageEmbeddingsModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3754,7 +3593,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class InputEmbeddingPatcher(OVModelPatcher): +class InputEmbeddingPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -3779,7 +3618,7 @@ def phi3_vision_embeddings_forward(self, pixel_values: torch.FloatTensor): return self.get_img_features(pixel_values) -class Phi3VisionImageEmbeddingsPatcher(OVModelPatcher): +class Phi3VisionImageEmbeddingsPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4249,7 +4088,7 @@ def forward_wrap( input_ids=None, use_cache=True, ): - new_past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) + new_past_key_values = ONNXDynamicCache.from_legacy_cache(past_key_values) result = self.__orig_forward( input_ids=input_ids, attention_mask=attention_mask, @@ -4434,7 +4273,7 @@ def block_forward( block.attn.forward = types.MethodType(sdpa_attn_forward, block.attn) -class Qwen2VLVisionEmbMergerPatcher(OVModelPatcher): +class 
Qwen2VLVisionEmbMergerPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4468,7 +4307,7 @@ def __exit__(self, exc_type, exc_value, traceback): block.attn.forward = block.attn._orig_forward -class Qwen2_5_VLVisionEmbMergerPatcher(OVModelPatcher): +class Qwen2_5_VLVisionEmbMergerPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4648,7 +4487,7 @@ def __exit__(self, exc_type, exc_value, traceback): block_sparse_moe.output_linear.forward = block_sparse_moe.output_linear._orig_forward -class OVSeq2SeqModelPatcher(OVModelPatcher): +class OVSeq2SeqModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4680,11 +4519,11 @@ def patched_forward(*args, **kwargs): pkv = args[pkv_arg_index] if pkv is not None: - if isinstance(pkv, OVEncoderDecoderCache): + if isinstance(pkv, ONNXEncoderDecoderCache): pkv = pkv.self_attention_cache.to_legacy_cache() else: pkv = [pkv_item[:2] for pkv_item in pkv] - pkv = OVEncoderDecoderCache.from_legacy_cache(pkv) + pkv = ONNXEncoderDecoderCache.from_legacy_cache(pkv) if "past_key_values" in kwargs: kwargs["past_key_values"] = pkv @@ -4694,7 +4533,7 @@ def patched_forward(*args, **kwargs): outputs = self.super_patched_forward(*args, **kwargs) # the optimum-onnx seq2seq model patcher only converts to tuple starting from 4.48 - if isinstance(outputs.get("past_key_values"), (OVDynamicCache, OVEncoderDecoderCache)): + if isinstance(outputs.get("past_key_values"), (ONNXDynamicCache, ONNXEncoderDecoderCache)): outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() # we still need to filter out cross attention in the case of non-stateful decoder @@ -4733,7 +4572,7 @@ def __exit__(self, exc_type, exc_value, traceback): ALL_MASK_ATTENTION_FUNCTIONS.register("eager", eager_mask) -class SanaTextEncoderModelPatcher(OVModelPatcher): +class SanaTextEncoderModelPatcher(ModelPatcher): def __enter__(self): super().__enter__() @@ -4784,7 +4623,7 @@ def __init__( super().__init__(config, 
model, model_kwargs) -class CommonImageEmbeddingsModelPatcher(OVModelPatcher): +class CommonImageEmbeddingsModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -4873,7 +4712,7 @@ def __init__( def forward( self, attention_mask, position_ids, past_key_values, token_type_ids, inputs_embeds, use_cache=True ): - pkv = OVDynamicCache.from_legacy_cache(past_key_values) + pkv = ONNXDynamicCache.from_legacy_cache(past_key_values) past_seen_tokens = past_key_values[0][0].shape[-2] cache_position = torch.arange( @@ -4938,7 +4777,7 @@ def __exit__(self, exc_type, exc_value, traceback): del self._model.model._orig_update_causual_mask -class Idefics3ImageEmbeddingsModelPatcher(OVModelPatcher): +class Idefics3ImageEmbeddingsModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -5260,7 +5099,7 @@ def _blenderbot_attn_forward_new( query_states = query_states if past_key_value is not None: - if isinstance(past_key_value, OVEncoderDecoderCache): + if isinstance(past_key_value, ONNXEncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) if is_cross_attention: # after the first generated id, we can subsequently re-use all key/value_states from cache @@ -5719,7 +5558,7 @@ def speecht5_decoder_layer_forward( return outputs -class OVSpeechT5ModelPatcher(OVModelPatcher): +class OVSpeechT5ModelPatcher(ModelPatcher): def __enter__(self): if self.real_config._behavior != "vocoder": super().__enter__() @@ -5789,7 +5628,7 @@ def patched_decoder_forward( if past_key_values is not None: past_key_values = [cache_item[:2] for cache_item in past_key_values] if is_transformers_version(">=", "4.56"): - past_key_values = OVEncoderDecoderCache.from_legacy_cache(past_key_values) + past_key_values = ONNXEncoderDecoderCache.from_legacy_cache(past_key_values) output_sequence = inputs_embeds output_cross_attentions = False @@ -5821,7 +5660,7 @@ def patched_decoder_forward( past_key_values = decoder_out.past_key_values if past_key_values is not 
None: - if isinstance(past_key_values, OVEncoderDecoderCache): + if isinstance(past_key_values, ONNXEncoderDecoderCache): past_key_values = past_key_values.self_attention_cache.to_legacy_cache() else: past_key_values = [cache_item[:2] for cache_item in past_key_values] @@ -5873,7 +5712,7 @@ def __init__( # Adopted from https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py#L2156-L2178 # moved audio and vision features processing outside model def lm_forward(self, inputs_embeds, attention_mask, position_ids, past_key_values, use_cache=True): - pkv = OVDynamicCache.from_legacy_cache(past_key_values) + pkv = ONNXDynamicCache.from_legacy_cache(past_key_values) outputs = self.model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, @@ -5895,7 +5734,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class Phi4MMAudioForwardEmbeddingsPatcher(OVModelPatcher): +class Phi4MMAudioForwardEmbeddingsPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -5919,7 +5758,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class Phi4MMAudioEncoderPatcher(OVModelPatcher): +class Phi4MMAudioEncoderPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -5960,7 +5799,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward -class Phi4MMVisionEmbeddingsPatcher(OVModelPatcher): +class Phi4MMVisionEmbeddingsPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -6269,7 +6108,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.img_processor.embeddings.forward = self._model.img_processor.embeddings._orig_forward -class Llama4ImageEmbeddingsModelPatcher(OVModelPatcher): +class Llama4ImageEmbeddingsModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -6464,7 +6303,7 @@ def 
inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool: return inner_mask -class Llama4TextModelPatcher(OVModelPatcher): +class Llama4TextModelPatcher(ModelPatcher): def __enter__(self): super().__enter__() @@ -6634,7 +6473,7 @@ def mamba_mixer_forward( # 1. Inject a MambaCache structure into the original model to simplify input and output handling related to SSM states # 2. Patch ConvSequenceTransform module to avoid if-else branching # 3. Vectorize the selective scan operation to ensure correct behavior during JIT tracing -class MambaPatcher(OVModelPatcher): +class MambaPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -7130,7 +6969,7 @@ def segment_sum(input_tensor): # for subsequent invocation of the model's `forward` method. # 2. Patches the Zamba2MambaMixer so that the traced `forward` function works correctly # during both the prefill and decoding steps. -class Zamba2ModelPatcher(OVModelPatcher): +class Zamba2ModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", @@ -7559,7 +7398,7 @@ def granite_moe_hybrid_update_causal_mask( return causal_mask -class GraniteMoeHybridModelPatcher(OVModelPatcher): +class GraniteMoeHybridModelPatcher(ModelPatcher): def __init__( self, config: "OnnxConfig", From 0e41943847818a8ab8d660bde7f73a0ec3b2ba7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 10 Feb 2026 16:13:44 +0100 Subject: [PATCH 062/222] style --- optimum/exporters/openvino/model_patcher.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index b779170d8e..c2e878f0c8 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -18,12 +18,11 @@ import logging as log import math import types -from collections.abc import Iterable from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union import torch import 
torch.nn.functional as F -from transformers.cache_utils import DynamicCache, EncoderDecoderCache +from transformers.cache_utils import DynamicCache from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, BaseModelOutputWithPooling from transformers.models.phi3.modeling_phi3 import apply_rotary_pos_emb, repeat_kv from transformers.models.speecht5.modeling_speecht5 import SpeechT5EncoderWithSpeechPrenet @@ -37,8 +36,6 @@ sdpa_mask_without_vmap, ) from optimum.exporters.onnx.utils import ONNXDynamicCache, ONNXEncoderDecoderCache - - from optimum.intel.utils.import_utils import is_diffusers_version, is_torch_version, is_transformers_version @@ -4129,7 +4126,6 @@ def lm_forward( deepstack_visual_embeds, use_cache=True, ): - from transformers.cache_utils import DynamicCache pkv = DynamicCache.from_legacy_cache(past_key_values) outputs = self.model.language_model( From bc9665d0335d6d24be3a0e94c566c53424ed4088 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 10 Feb 2026 17:35:08 +0100 Subject: [PATCH 063/222] fix seq2seq stateless export --- optimum/exporters/openvino/model_patcher.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index c2e878f0c8..c8cba66a2a 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4531,6 +4531,8 @@ def patched_forward(*args, **kwargs): # the optimum-onnx seq2seq model patcher only converts to tuple starting from 4.48 if isinstance(outputs.get("past_key_values"), (ONNXDynamicCache, ONNXEncoderDecoderCache)): outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() + elif isinstance(outputs.get("past_key_values"), (DynamicCache, EncoderDecoderCache)): + outputs.pop("past_key_values") # we still need to filter out cross attention in the case of non-stateful decoder filtered_outputs = {} From fceb15186746ce08deccb26e66ccbdc958826b65 
Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 10 Feb 2026 18:07:51 +0100 Subject: [PATCH 064/222] cache depending on transformers version --- optimum/exporters/openvino/model_patcher.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index e8d56d555e..6dd442130a 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -54,7 +54,6 @@ override_arguments, sdpa_mask_without_vmap, ) -from optimum.exporters.onnx.utils import ONNXDynamicCache, ONNXEncoderDecoderCache from optimum.intel.utils.import_utils import is_diffusers_version, is_torch_version, is_transformers_version @@ -78,6 +77,14 @@ TransformersKwargs = object +if is_transformers_version("<", "5"): + from transformers import DynamicCache as ONNXDynamicCache + from transformers import EncoderDecoderCache as ONNXEncoderDecoderCache +else: + from optimum.exporters.onnx.utils import LegacyDynamicCache as ONNXDynamicCache + from optimum.exporters.onnx.utils import LegacyEncoderDecoderCache as ONNXEncoderDecoderCache + + logger = logging.getLogger(__name__) From 5133a4a9f7f6e36b84caac249fcf8a66e20adef8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 11 Feb 2026 12:05:29 +0100 Subject: [PATCH 065/222] pix2struct patcher --- optimum/exporters/openvino/model_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 4ad4cb079b..506459987d 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -5005,7 +5005,7 @@ class GPTBigCodeOpenVINOConfig(GPTBigCodeOnnxConfig): ], ) class Pix2StructOpenVINOConfig(Pix2StructOnnxConfig): - pass + _MODEL_PATCHER = OVSeq2SeqModelPatcher @register_in_tasks_manager("bert", *COMMON_TEXT_TASKS) From 
0c4a89c4877065c6f2b721e94f7abdb6975e7815 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 11 Feb 2026 16:17:59 +0100 Subject: [PATCH 066/222] fix --- optimum/exporters/openvino/model_patcher.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 6dd442130a..3f87244111 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -89,10 +89,10 @@ def _get_subcomponent_model(model, name): - if is_transformers_version("<", "5"): - return getattr(model, name) + if is_transformers_version(">=", "5") and hasattr(model, "model"): + return getattr(model.model, name) - return getattr(model.model, name) + return getattr(model, name) for idx, spec in enumerate(UNSUPPORTED_OPS_PATCHING_SPEC): From d8a482935c9cfede8a214a2555416e5cf89a7c58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 11 Feb 2026 18:05:15 +0100 Subject: [PATCH 067/222] remove internvl_chat, minicpmv in tests --- tests/openvino/test_seq2seq.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 0ad560213d..153f57be8e 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -535,14 +535,10 @@ def test_pipeline(self, model_arch: str): class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES = [ - "internvl_chat", "llava", "llava_next", "llava_next_mistral", "llava_next_video", - "llava-qwen2", - "minicpmv", - "phi3_v", "qwen2_vl", ] SUPPORT_VIDEO = ["llava_next_video", "qwen2_vl"] @@ -554,9 +550,14 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES += ["maira2", "idefics3"] if is_transformers_version(">=", "4.49.0"): - SUPPORTED_ARCHITECTURES += ["qwen2_5_vl", "got_ocr2", "phi4mm"] + SUPPORTED_ARCHITECTURES 
+= ["qwen2_5_vl", "got_ocr2"] SUPPORT_VIDEO.append("qwen2_5_vl") - SUPPORT_AUDIO.append("phi4mm") + + if is_transformers_version("<", "4.54.0"): + # remote code models differs after transformers v4.54 + SUPPORTED_ARCHITECTURES += ["phi4mm"] + SUPPORT_AUDIO.append("phi4mm") + if is_transformers_version(">", "4.49"): SUPPORTED_ARCHITECTURES += ["gemma3", "smolvlm"] if is_transformers_version(">=", "4.51"): @@ -569,9 +570,13 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES += ["qwen3_vl"] SUPPORT_VIDEO += ["qwen3_vl"] - if is_transformers_version(">=", "4.54.0"): + if is_transformers_version("<", "4.54.0"): # remote code models differs after transformers v4.54 - SUPPORTED_ARCHITECTURES = set(SUPPORTED_ARCHITECTURES) - {"llava-qwen2", "phi3_v", "phi4mm"} + SUPPORTED_ARCHITECTURES += ["llava-qwen2", "phi3_v"] + + if is_transformers_version("<", "5"): + # remote code models incompatible after transformers v5 + SUPPORTED_ARCHITECTURES += ["internvl_chat", "minicpmv"] REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"] IMAGE = Image.open( From e477044ed4c9906091a9ec4f07e91535c86834be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 11 Feb 2026 18:12:38 +0100 Subject: [PATCH 068/222] set max transformers version for internvl_chat minicpmv --- optimum/exporters/openvino/model_configs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 506459987d..0fb8663202 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -2078,6 +2078,7 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) @register_in_tasks_manager("internvl_chat", *["image-text-to-text"], library_name="transformers") class InternVLChatOpenVINOConfig(BaseVLMOpenVINOConfig): + MAX_TRANSFORMERS_VERSION = "4.57.6" def 
__init__( self, config: "PretrainedConfig", @@ -2862,6 +2863,7 @@ class MiniCPMVConfigBehavior(str, enum.Enum): @register_in_tasks_manager("minicpmv", *["image-text-to-text"], library_name="transformers") class MiniCPMVOpenVINOConfig(BaseVLMOpenVINOConfig): + MAX_TRANSFORMERS_VERSION = "4.57.6" SUPPORTED_BEHAVIORS = [model_type.value for model_type in MiniCPMVConfigBehavior] NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig DUMMY_INPUT_GENERATOR_CLASSES = () From e0b2b46849c44d25ca4e6e0975179bd4e57e7306 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 11 Feb 2026 18:20:18 +0100 Subject: [PATCH 069/222] style --- optimum/exporters/openvino/model_configs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 0fb8663202..c3348ed285 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -2079,6 +2079,7 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) @register_in_tasks_manager("internvl_chat", *["image-text-to-text"], library_name="transformers") class InternVLChatOpenVINOConfig(BaseVLMOpenVINOConfig): MAX_TRANSFORMERS_VERSION = "4.57.6" + def __init__( self, config: "PretrainedConfig", From 50fe59046294a0aff8dab9d527c7d1027666922c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 15:47:37 +0100 Subject: [PATCH 070/222] fix textual inversion --- optimum/intel/openvino/loaders.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/optimum/intel/openvino/loaders.py b/optimum/intel/openvino/loaders.py index 214a4a7e8c..bd62e047bb 100644 --- a/optimum/intel/openvino/loaders.py +++ b/optimum/intel/openvino/loaders.py @@ -22,7 +22,7 @@ from openvino import Type from openvino import opset11 as ops from openvino.passes import Manager, Matcher, MatcherPass, WrapType -from transformers import PreTrainedTokenizer +from 
transformers import PreTrainedTokenizerBase from .utils import TEXTUAL_INVERSION_EMBEDDING_KEYS @@ -80,7 +80,7 @@ def load_textual_inversion( self, pretrained_model_name_or_path: Union[str, List[str], Dict[str, torch.Tensor], List[Dict[str, torch.Tensor]]], token: Optional[Union[str, List[str]]] = None, - tokenizer: Optional["PreTrainedTokenizer"] = None, # noqa: F821 + tokenizer: Optional["PreTrainedTokenizerBase"] = None, # noqa: F821 text_encoder: Optional["openvino.Model"] = None, # noqa: F821 **kwargs, ): @@ -88,9 +88,9 @@ def load_textual_inversion( raise ValueError( f"{self.__class__.__name__} requires `self.tokenizer` for calling `{self.load_textual_inversion.__name__}`" ) - elif not isinstance(self.tokenizer, PreTrainedTokenizer): + elif not isinstance(self.tokenizer, PreTrainedTokenizerBase): raise ValueError( - f"{self.__class__.__name__} requires `self.tokenizer` of type `PreTrainedTokenizer` for calling `{self.load_textual_inversion.__name__}`" + f"{self.__class__.__name__} requires `self.tokenizer` of type `PreTrainedTokenizerBase` for calling `{self.load_textual_inversion.__name__}`" ) if not hasattr(self, "text_encoder"): From e9ff083d929c3c132f4b1d17b05b59bc50873cf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 16:15:25 +0100 Subject: [PATCH 071/222] add back inc --- .github/workflows/build_documentation.yml | 2 +- .github/workflows/build_pr_documentation.yml | 2 +- docs/source/neural_compressor/reference.mdx | 40 ++++++++++++++++++++ 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index 15852df3eb..ce3eb464ce 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -52,7 +52,7 @@ jobs: pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder uv pip install transformers==4.57.6 - uv pip install .[quality] diffusers accelerate datasets 
+ uv pip install .[quality] neural-compressor[pt]>3.4 diffusers accelerate datasets - name: Make documentation shell: bash diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index 311f29b0dd..6b0b89f3f1 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -39,7 +39,7 @@ jobs: pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder uv pip install transformers==4.57.6 - uv pip install .[quality] diffusers accelerate datasets + uv pip install .[quality] neural-compressor[pt]>3.4 diffusers accelerate datasets - name: Make documentation shell: bash diff --git a/docs/source/neural_compressor/reference.mdx b/docs/source/neural_compressor/reference.mdx index b83618b4bc..c631aed883 100644 --- a/docs/source/neural_compressor/reference.mdx +++ b/docs/source/neural_compressor/reference.mdx @@ -14,3 +14,43 @@ specific language governing permissions and limitations under the License. `optimum.intel.neural_compressor` is deprecated and will be removed in the next major release. 
+ +## INCQuantizer + +[[autodoc]] neural_compressor.quantization.INCQuantizer + +## INCTrainer + +[[autodoc]] neural_compressor.trainer.INCTrainer + +## INCModel + +[[autodoc]] neural_compressor.modeling_base.INCModel + +## INCModelForSequenceClassification + +[[autodoc]] neural_compressor.modeling_base.INCModelForSequenceClassification + +## INCModelForQuestionAnswering + +[[autodoc]] neural_compressor.modeling_base.INCModelForQuestionAnswering + +## INCModelForTokenClassification + +[[autodoc]] neural_compressor.modeling_base.INCModelForTokenClassification + +## INCModelForMultipleChoice + +[[autodoc]] neural_compressor.modeling_base.INCModelForMultipleChoice + +## INCModelForMaskedLM + +[[autodoc]] neural_compressor.modeling_base.INCModelForMaskedLM + +## INCModelForCausalLM + +[[autodoc]] neural_compressor.modeling_base.INCModelForCausalLM + +## INCModelForSeq2SeqLM + +[[autodoc]] neural_compressor.modeling_base.INCModelForSeq2SeqLM \ No newline at end of file From 49f020f4c3d6c09b45330ff47bf3c05b36e89208 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 16:17:22 +0100 Subject: [PATCH 072/222] style --- docs/source/neural_compressor/reference.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/neural_compressor/reference.mdx b/docs/source/neural_compressor/reference.mdx index c631aed883..b6e3d8f468 100644 --- a/docs/source/neural_compressor/reference.mdx +++ b/docs/source/neural_compressor/reference.mdx @@ -53,4 +53,4 @@ specific language governing permissions and limitations under the License. 
## INCModelForSeq2SeqLM -[[autodoc]] neural_compressor.modeling_base.INCModelForSeq2SeqLM \ No newline at end of file +[[autodoc]] neural_compressor.modeling_base.INCModelForSeq2SeqLM From 31e8c4462f9146c7d66faeb013a40d80ece08f0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 16:35:35 +0100 Subject: [PATCH 073/222] skip text2text generation pipeline when >= v5 --- tests/openvino/test_modeling_basic.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_modeling_basic.py b/tests/openvino/test_modeling_basic.py index 3dac24c69a..549411f344 100644 --- a/tests/openvino/test_modeling_basic.py +++ b/tests/openvino/test_modeling_basic.py @@ -28,7 +28,7 @@ OVModelForTokenClassification, OVStableDiffusionPipeline, ) - +from optimum.intel.utils.import_utils import is_transformers_version # Make sure that common architectures are used in combination with common tasks MODEL_NAMES = { @@ -58,6 +58,9 @@ def test_pipeline(self, model_id): """ tokenizer = AutoTokenizer.from_pretrained(model_id) model_class_str = MODEL_NAMES[model_id] + if model_class_str == "OVModelForSeq2SeqLM" and is_transformers_version(">=", "5"): + self.skipTest("text2text-generation pipeline was deprecated in transformers v5") + model_class = eval(model_class_str) model = model_class.from_pretrained(model_id, device=OPENVINO_DEVICE) model.save_pretrained(f"{model_id}_ov") From 4e43429bfd283e0bb1ffe8630e440833844aa5c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 16:59:51 +0100 Subject: [PATCH 074/222] fix perceiver vision preprocessor loading --- tests/openvino/test_modeling.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 785c4e2782..f53c9fdce6 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -34,6 +34,7 @@ from sentence_transformers import SentenceTransformer from 
transformers import ( AutoFeatureExtractor, + AutoImageProcessor, AutoModel, AutoModelForAudioClassification, AutoModelForAudioFrameClassification, @@ -1187,7 +1188,7 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(ov_model.config, PretrainedConfig) set_seed(SEED) transformers_model = AutoModelForImageClassification.from_pretrained(model_id) - preprocessor = AutoProcessor.from_pretrained(model_id) + preprocessor = AutoImageProcessor.from_pretrained(model_id) url = TEST_IMAGE_URL image = Image.open(requests.get(url, stream=True).raw) inputs = preprocessor(images=image, return_tensors="pt") @@ -1211,7 +1212,7 @@ def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] model = OVModelForImageClassification.from_pretrained(model_id, device=OPENVINO_DEVICE) model.eval() - preprocessor = AutoProcessor.from_pretrained(model_id) + preprocessor = AutoImageProcessor.from_pretrained(model_id) pipe = pipeline("image-classification", model=model, feature_extractor=preprocessor) inputs = TEST_IMAGE_URL outputs = pipe(inputs) From 3565637f2ced588d5c5dfc271fdffb015bc91c38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 17:15:11 +0100 Subject: [PATCH 075/222] fix question answering pipeline --- tests/openvino/test_modeling_basic.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/openvino/test_modeling_basic.py b/tests/openvino/test_modeling_basic.py index 549411f344..5d5665beeb 100644 --- a/tests/openvino/test_modeling_basic.py +++ b/tests/openvino/test_modeling_basic.py @@ -72,9 +72,13 @@ def test_pipeline(self, model_id): elif model_class_str == "OVModelForMaskedLM": input_text[0] = f"{input_text[0]} {tokenizer.mask_token}" - if model_class_str in TASKS: - task = TASKS[model_class_str] - pipe = pipeline(task, model=model, tokenizer=tokenizer) + task = TASKS[model_class_str] + pipe = pipeline(task, model=model, tokenizer=tokenizer) + + if task == "question-answering": + # 
positional arguments deprecated for question-answering pipeline since v5 + pipe(question=input_text[0], context=input_text[1]) + else: pipe(*input_text) gc.collect() From 2d1929d9b81931e3fc43fa2040c671df1a23f93e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 17:37:05 +0100 Subject: [PATCH 076/222] only install diffusers when compatible --- .github/workflows/test_openvino.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 835204f423..6791a8962f 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -54,7 +54,7 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip uv - uv pip install .[diffusers,tests] + uv pip install .[tests] - if: ${{ matrix.test-pattern == '*modeling*' }} name: Install OpenVINO @@ -71,10 +71,10 @@ jobs: run: | uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator - - if: ${{ matrix.transformers-version == '5.0.0' }} + - if: ${{ matrix.transformers-version != '5.0.0' }} name: Install diffusers run: | - uv pip install git+https://github.com/huggingface/diffusers + uv pip install diffusers - if: ${{ matrix.transformers-version != '4.45.0' && matrix.test-pattern == '*decoder*'}} name: Install auto-gptq, autoawq From a8b08a4c51fa9f097e579141d1e37ff9edc1f4d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 17:38:11 +0100 Subject: [PATCH 077/222] fix diffusers mapping --- optimum/exporters/openvino/model_configs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index c3348ed285..42a58ee523 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -290,6 +290,8 @@ def init_model_configs(): 
TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["fill"] = "FluxFillPipeline" TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["fill"] = {"flux": "FluxFillPipeline"} TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["text-to-image"] = ("AutoPipelineForText2Image", "SanaPipeline") + if "text-to-image" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS: + TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["text-to-image"] = {} TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["text-to-image"]["sana"] = "SanaPipeline" TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["text-to-image"]["sana-sprint"] = "SanaSprintPipeline" if is_diffusers_available() and "text-to-video" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS: From 83ae84653c69c19e93be9de63e30f46e35386e94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 17:38:43 +0100 Subject: [PATCH 078/222] style --- tests/openvino/test_modeling_basic.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/openvino/test_modeling_basic.py b/tests/openvino/test_modeling_basic.py index 5d5665beeb..c2576db98b 100644 --- a/tests/openvino/test_modeling_basic.py +++ b/tests/openvino/test_modeling_basic.py @@ -17,16 +17,11 @@ from transformers import AutoTokenizer, pipeline from utils_tests import OPENVINO_DEVICE from optimum.intel import ( - OVModelForAudioClassification, OVModelForCausalLM, - OVModelForFeatureExtraction, - OVModelForImageClassification, OVModelForMaskedLM, OVModelForQuestionAnswering, OVModelForSeq2SeqLM, OVModelForSequenceClassification, - OVModelForTokenClassification, - OVStableDiffusionPipeline, ) from optimum.intel.utils.import_utils import is_transformers_version From cad085b66ec8cba022a4117b4c657bc519b20903 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 13 Feb 2026 17:39:06 +0100 Subject: [PATCH 079/222] update diffusers extra --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e010f5c0ef..617e12d24a 
100644 --- a/setup.py +++ b/setup.py @@ -73,7 +73,7 @@ "openvino": ["nncf>=2.19.0", "openvino>=2025.3.0", "openvino-tokenizers>=2025.3.0"], "neural-compressor": ["neural-compressor[pt]>=3.4.1", "accelerate", "transformers<4.46", "datasets"], "ipex": ["intel-extension-for-pytorch>=2.8", "transformers>4.54,<4.56", "accelerate"], - "diffusers": ["diffusers"], + "diffusers": ["diffusers", "transformers<5"], "quality": QUALITY_REQUIRE, "tests": TESTS_REQUIRE, } From 5dbe3c894447bb8759454ff2d273fffd69de73fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 16 Feb 2026 14:18:56 +0100 Subject: [PATCH 080/222] add transformers version workflow --- .github/workflows/test_openvino.yml | 8 ++++---- setup.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 6791a8962f..b42bca1548 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -38,7 +38,7 @@ jobs: "*diffusion*", "*quantization*", ] - transformers-version: ["4.45.0", "5.0.0", "latest"] + transformers-version: ["4.45", "4.57", "latest"] runs-on: ubuntu-22.04 @@ -66,17 +66,17 @@ jobs: run: | uv pip install transformers==${{ matrix.transformers-version }} - - if: ${{ matrix.transformers-version == '4.45.0' }} + - if: ${{ matrix.transformers-version == '4.45' }} name: Install specific dependencies and versions required for older transformers run: | uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator - - if: ${{ matrix.transformers-version != '5.0.0' }} + - if: ${{ matrix.transformers-version != 'latest' }} name: Install diffusers run: | uv pip install diffusers - - if: ${{ matrix.transformers-version != '4.45.0' && matrix.test-pattern == '*decoder*'}} + - if: ${{ matrix.transformers-version != '4.45' && matrix.test-pattern == '*decoder*'}} name: Install auto-gptq, autoawq run: | uv pip 
install auto-gptq "autoawq<0.2.8" diff --git a/setup.py b/setup.py index 617e12d24a..16e2a82fed 100644 --- a/setup.py +++ b/setup.py @@ -56,7 +56,7 @@ "sentence-transformers", "open_clip_torch>=2.26.1", "peft", - "datasets[audio]>=1.4.0,<4.0.0", + "datasets>=1.4.0,<4.0.0", "tbb", "langchain-huggingface", "hf_xet", From b7ce98b6639488f65cba525bfeabff6d502841b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 16 Feb 2026 16:08:10 +0100 Subject: [PATCH 081/222] set transformers 4.57.6 for tests --- .github/workflows/test_openvino.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index b42bca1548..59bd4673b3 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -38,7 +38,7 @@ jobs: "*diffusion*", "*quantization*", ] - transformers-version: ["4.45", "4.57", "latest"] + transformers-version: ["4.45.0", "4.57.6", "latest"] runs-on: ubuntu-22.04 @@ -66,7 +66,7 @@ jobs: run: | uv pip install transformers==${{ matrix.transformers-version }} - - if: ${{ matrix.transformers-version == '4.45' }} + - if: ${{ matrix.transformers-version == '4.45.0' }} name: Install specific dependencies and versions required for older transformers run: | uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator @@ -76,7 +76,7 @@ jobs: run: | uv pip install diffusers - - if: ${{ matrix.transformers-version != '4.45' && matrix.test-pattern == '*decoder*'}} + - if: ${{ matrix.transformers-version != '4.45.0' && matrix.test-pattern == '*decoder*'}} name: Install auto-gptq, autoawq run: | uv pip install auto-gptq "autoawq<0.2.8" From d692d44785edd13a424f33843d004734a3fc564a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 16 Feb 2026 16:08:54 +0100 Subject: [PATCH 082/222] batch_encode_plus was deprecated in v5 --- 
tests/openvino/test_modeling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index f53c9fdce6..03e099f77e 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -1605,7 +1605,7 @@ def test_load_from_hub_and_save_model(self): tokenizer = AutoTokenizer.from_pretrained(self.OV_MODEL_ID_IR) all_text = ["a dog", "a cat", "a frog"] - tokens = tokenizer.batch_encode_plus( + tokens = tokenizer( all_text, return_tensors="pt", max_length=loaded_model.config.text_config.context_length, @@ -1683,7 +1683,7 @@ def test_functions(self): tokenizer = AutoTokenizer.from_pretrained(self.OV_MODEL_ID_IR) all_text = ["a dog", "a cat", "a frog"] - tokens = tokenizer.batch_encode_plus( + tokens = tokenizer( all_text, return_tensors="pt", max_length=model.config.text_config.context_length, From 93679e9b8e3afca7dc7446fd9773f0425d3990c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 16 Feb 2026 18:10:15 +0100 Subject: [PATCH 083/222] fix sam --- optimum/intel/openvino/modeling_sam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/openvino/modeling_sam.py b/optimum/intel/openvino/modeling_sam.py index 4722437b72..57b33be14e 100644 --- a/optimum/intel/openvino/modeling_sam.py +++ b/optimum/intel/openvino/modeling_sam.py @@ -403,7 +403,7 @@ def get_image_wide_positional_embeddings(self): x_embed = x_embed / size positional_embedding = self.shared_image_embedding(torch.stack([x_embed, y_embed], dim=-1)) - return positional_embedding.permute(2, 0, 1).unsqueeze(0) + return positional_embedding.permute(2, 0, 1).unsqueeze(0).detach() def get_image_features(self, pixel_values, *args, **kwargs): return torch.from_numpy(self.vision_encoder(pixel_values).image_embeddings) From b2ef4184f92d626d3f9db4263e1d6b33044b75a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 16 Feb 2026 18:26:46 +0100 Subject: [PATCH 
084/222] install librosa for tests --- .github/workflows/test_openvino.yml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 59bd4673b3..38a10c22a7 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -64,17 +64,12 @@ jobs: - if: ${{ matrix.transformers-version != 'latest' }} name: Install transformers run: | - uv pip install transformers==${{ matrix.transformers-version }} + uv pip install transformers==${{ matrix.transformers-version }} diffusers - if: ${{ matrix.transformers-version == '4.45.0' }} name: Install specific dependencies and versions required for older transformers run: | - uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator - - - if: ${{ matrix.transformers-version != 'latest' }} - name: Install diffusers - run: | - uv pip install diffusers + uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator librosa - if: ${{ matrix.transformers-version != '4.45.0' && matrix.test-pattern == '*decoder*'}} name: Install auto-gptq, autoawq From 3fb01723225341f11dc850e228855f16352d1e20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 16 Feb 2026 19:45:29 +0100 Subject: [PATCH 085/222] rename OVDynamicCache --- optimum/exporters/openvino/model_patcher.py | 42 ++++++++++----------- tests/openvino/test_modeling.py | 2 +- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 3f87244111..009d226b34 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -78,11 +78,11 @@ if is_transformers_version("<", "5"): - from transformers import DynamicCache as ONNXDynamicCache - from 
transformers import EncoderDecoderCache as ONNXEncoderDecoderCache + from transformers import DynamicCache as OVDynamicCache + from transformers import EncoderDecoderCache as OVEncoderDecoderCache else: - from optimum.exporters.onnx.utils import LegacyDynamicCache as ONNXDynamicCache - from optimum.exporters.onnx.utils import LegacyEncoderDecoderCache as ONNXEncoderDecoderCache + from optimum.exporters.onnx.utils import LegacyDynamicCache as OVDynamicCache + from optimum.exporters.onnx.utils import LegacyEncoderDecoderCache as OVEncoderDecoderCache logger = logging.getLogger(__name__) @@ -331,7 +331,7 @@ def __enter__(self): _mixtral_sparse_moe_block_forward, layer.block_sparse_moe ) else: - self._model.config._experts_implementation = "batched_mm" + self._model.set_experts_implementation("batched_mm") def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -1382,7 +1382,7 @@ def phi3_442_forward( if use_cache: use_legacy_cache = not isinstance(past_key_values, Cache) if use_legacy_cache: - past_key_values = ONNXDynamicCache.from_legacy_cache(past_key_values) + past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) past_key_values_length = past_key_values.get_usable_length(seq_length) if position_ids is None: @@ -1647,7 +1647,7 @@ def __enter__(self): _phi_moe_sparse_moe_block_forward, layer.block_sparse_moe ) else: - self._model.config._experts_implementation = "batched_mm" + self._model.set_experts_implementation("batched_mm") def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -2964,7 +2964,7 @@ def patched_forward(*args, **kwargs): legacy_pkv = args[pkv_argument_index] pkv_in_args = True if legacy_pkv is not None: - pkv = ONNXDynamicCache.from_legacy_cache(legacy_pkv) + pkv = OVDynamicCache.from_legacy_cache(legacy_pkv) return_legacy_cache = True if not pkv_in_args: kwargs["past_key_values"] = pkv @@ -4118,7 +4118,7 @@ def forward_wrap( 
input_ids=None, use_cache=True, ): - new_past_key_values = ONNXDynamicCache.from_legacy_cache(past_key_values) + new_past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) result = self.__orig_forward( input_ids=input_ids, attention_mask=attention_mask, @@ -4502,7 +4502,7 @@ def __enter__(self): ) else: - self._model.config._experts_implementation = "batched_mm" + self._model.set_experts_implementation("batched_mm") def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -4547,11 +4547,11 @@ def patched_forward(*args, **kwargs): pkv = args[pkv_arg_index] if pkv is not None: - if isinstance(pkv, ONNXEncoderDecoderCache): + if isinstance(pkv, OVEncoderDecoderCache): pkv = pkv.self_attention_cache.to_legacy_cache() else: pkv = [pkv_item[:2] for pkv_item in pkv] - pkv = ONNXEncoderDecoderCache.from_legacy_cache(pkv) + pkv = OVEncoderDecoderCache.from_legacy_cache(pkv) if "past_key_values" in kwargs: kwargs["past_key_values"] = pkv @@ -4561,7 +4561,7 @@ def patched_forward(*args, **kwargs): outputs = self.super_patched_forward(*args, **kwargs) # the optimum-onnx seq2seq model patcher only converts to tuple starting from 4.48 - if isinstance(outputs.get("past_key_values"), (ONNXDynamicCache, ONNXEncoderDecoderCache)): + if isinstance(outputs.get("past_key_values"), (OVDynamicCache, OVEncoderDecoderCache)): outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() elif isinstance(outputs.get("past_key_values"), (DynamicCache, EncoderDecoderCache)): outputs.pop("past_key_values") @@ -4742,7 +4742,7 @@ def __init__( def forward( self, attention_mask, position_ids, past_key_values, token_type_ids, inputs_embeds, use_cache=True ): - pkv = ONNXDynamicCache.from_legacy_cache(past_key_values) + pkv = OVDynamicCache.from_legacy_cache(past_key_values) past_seen_tokens = past_key_values[0][0].shape[-2] cache_position = torch.arange( @@ -5129,7 +5129,7 @@ def _blenderbot_attn_forward_new( query_states = 
query_states if past_key_value is not None: - if isinstance(past_key_value, ONNXEncoderDecoderCache): + if isinstance(past_key_value, OVEncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) if is_cross_attention: # after the first generated id, we can subsequently re-use all key/value_states from cache @@ -5331,7 +5331,7 @@ def __enter__(self): modulewise_patch(self._model, Qwen2MoeSparseMoeBlock, _qwen2moe_sparse_block_forward) if is_transformers_version(">=", "5"): - self._model.config._experts_implementation = "batched_mm" + self._model.set_experts_implementation("batched_mm") def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -5658,7 +5658,7 @@ def patched_decoder_forward( if past_key_values is not None: past_key_values = [cache_item[:2] for cache_item in past_key_values] if is_transformers_version(">=", "4.56"): - past_key_values = ONNXEncoderDecoderCache.from_legacy_cache(past_key_values) + past_key_values = OVEncoderDecoderCache.from_legacy_cache(past_key_values) output_sequence = inputs_embeds output_cross_attentions = False @@ -5690,7 +5690,7 @@ def patched_decoder_forward( past_key_values = decoder_out.past_key_values if past_key_values is not None: - if isinstance(past_key_values, ONNXEncoderDecoderCache): + if isinstance(past_key_values, OVEncoderDecoderCache): past_key_values = past_key_values.self_attention_cache.to_legacy_cache() else: past_key_values = [cache_item[:2] for cache_item in past_key_values] @@ -5742,7 +5742,7 @@ def __init__( # Adopted from https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py#L2156-L2178 # moved audio and vision features processing outside model def lm_forward(self, inputs_embeds, attention_mask, position_ids, past_key_values, use_cache=True): - pkv = ONNXDynamicCache.from_legacy_cache(past_key_values) + pkv = OVDynamicCache.from_legacy_cache(past_key_values) outputs = 
self.model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, @@ -6696,7 +6696,7 @@ def __enter__(self): self.original_moe_forward = Qwen3MoeSparseMoeBlock.forward Qwen3MoeSparseMoeBlock.forward = qwen3_moe_forward_patched if is_transformers_version(">=", "5"): - self._model.config._experts_implementation = "batched_mm" + self._model.set_experts_implementation("batched_mm") def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -7374,7 +7374,7 @@ def __enter__(self): GptOssExperts.forward = gpt_oss_forward if is_transformers_version(">=", "5"): - self._model.config._experts_implementation = "batched_mm" + self._model.set_experts_implementation("batched_mm") def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 03e099f77e..4eccde4c87 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -1723,7 +1723,7 @@ def test_functions(self): self.assertTrue(torch.allclose(model_outputs.logits_per_image, res.logits_per_image, atol=1e-2)) model.reshape(1, -1) - reshaped_tokens = tokenizer.batch_encode_plus( + reshaped_tokens = tokenizer( ["a dog"], return_tensors="pt", max_length=model.config.text_config.context_length, From 3d2286c4bc2aefc7d1c89d4c1554032440738ad7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 16 Feb 2026 20:56:13 +0100 Subject: [PATCH 086/222] qwenvl3 fix --- optimum/exporters/openvino/model_configs.py | 2 +- optimum/exporters/openvino/model_patcher.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 42a58ee523..ef54b8f78d 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -3832,7 +3832,7 @@ def __init__( @staticmethod def 
get_model_for_behavior(model, behavior: Union[str, QwenVLConfigBehavior]): if behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS: - vision_emb_pos = model.visual.pos_embed + vision_emb_pos = _get_subcomponent_model(model, "visual").pos_embed vision_emb_pos.config = model.config.vision_config return vision_emb_pos diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 009d226b34..b7084b7a34 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4159,7 +4159,7 @@ def lm_forward( deepstack_visual_embeds, use_cache=True, ): - pkv = DynamicCache.from_legacy_cache(past_key_values) + pkv = OVDynamicCache.from_legacy_cache(past_key_values) outputs = self.model.language_model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, @@ -7858,7 +7858,7 @@ def forward( inputs_embeds: torch.Tensor = self.embed_tokens(input_ids) if use_cache and past_key_values is None: - past_key_values = DynamicCache(config=self.config) + past_key_values = OVDynamicCache(config=self.config) if cache_position is None: past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 From 46fa8a70d1910cd985a8411a6a8650bcaf7f784a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 16 Feb 2026 23:06:16 +0100 Subject: [PATCH 087/222] fix qwen2vl --- optimum/exporters/openvino/model_configs.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index ef54b8f78d..cea7528529 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -3716,12 +3716,17 @@ def with_behavior( behavior = QwenVLConfigBehavior(behavior) if behavior == QwenVLConfigBehavior.TEXT_EMBEDDINGS: - return get_vlm_text_embeddings_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype) + return 
get_vlm_text_embeddings_config( + "qwen2", + self._orig_config if is_transformers_version("<", "5") else self._orig_config.text_config, + self.int_dtype, + self.float_dtype, + ) if behavior == QwenVLConfigBehavior.LANGUAGE: return get_vlm_text_generation_config( "qwen2", - self._orig_config, + self._orig_config if is_transformers_version("<", "5") else self._orig_config.text_config, self.int_dtype, self.float_dtype, model_patcher=Qwen2VLLanguageModelPatcher, From 20bb596bcd3d3c08c93e0da51e778d3be0060f1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 10:43:13 +0100 Subject: [PATCH 088/222] github workflow librosa --- .github/workflows/test_openvino.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 38a10c22a7..085619c5fa 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -54,7 +54,7 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip uv - uv pip install .[tests] + uv pip install .[tests] librosa - if: ${{ matrix.test-pattern == '*modeling*' }} name: Install OpenVINO @@ -69,7 +69,7 @@ jobs: - if: ${{ matrix.transformers-version == '4.45.0' }} name: Install specific dependencies and versions required for older transformers run: | - uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator librosa + uv pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.32.* transformers_stream_generator - if: ${{ matrix.transformers-version != '4.45.0' && matrix.test-pattern == '*decoder*'}} name: Install auto-gptq, autoawq From a091dadd1262971955598c460ea700fde6232f9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 10:46:04 +0100 Subject: [PATCH 089/222] Update MAX_TRANSFORMERS_VERSION for incompatible models --- 
optimum/exporters/openvino/model_configs.py | 43 ++++++++++++++++++--- tests/openvino/test_decoder.py | 3 +- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index cea7528529..a25c3e7b8e 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1080,6 +1080,8 @@ class Phi3OpenVINOConfig(PhiOnnxConfig): ) class PhiMoEOpenVINOConfig(Phi3OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.46.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = PhiMoEModelPatcher @@ -1284,6 +1286,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int @register_in_tasks_manager("aquila", *["text-generation", "text-generation-with-past"], library_name="transformers") class AquilaMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, AquilaDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = AquilaDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True) @@ -1293,6 +1296,7 @@ class AquilaMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): @register_in_tasks_manager("xverse", *["text-generation", "text-generation-with-past"], library_name="transformers") class XverseMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = DummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig @@ -1302,6 +1306,7 @@ class XverseMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): @register_in_tasks_manager("internlm", *["text-generation", "text-generation-with-past"], 
library_name="transformers") class InternLMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = DummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig @@ -1892,6 +1897,8 @@ def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[ @register_in_tasks_manager("llava", *["image-text-to-text"], library_name="transformers") class LlavaOpenVINOConfig(BaseVLMOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.37.2" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" def __init__( self, @@ -1930,6 +1937,7 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs) -> Dict: @register_in_tasks_manager("llava_next", *["image-text-to-text"], library_name="transformers") class LlavaNextOpenVINOConfig(LlavaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.40.0" + MAX_TRANSFORMERS_VERSION = "5.99" def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): model_kwargs = model_kwargs or {} @@ -1991,6 +1999,8 @@ class LlavaNextVideoConfigBehavior(str, enum.Enum): @register_in_tasks_manager("llava_next_video", *["image-text-to-text"], library_name="transformers") class LlavaNextVideoOpenVINOConfig(LlavaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.42.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" SUPPORTED_BEHAVIORS = [model_type.value for model_type in LlavaNextVideoConfigBehavior] def with_behavior( @@ -2055,6 +2065,7 @@ def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[ ) class MairaOpenVINOConfig(LlavaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.46.0" + MAX_TRANSFORMERS_VERSION = "5.99" SUPPORTS_PAST = True def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): @@ -3976,6 +3987,8 
@@ class GraniteOpenVINOConfig(LlamaOpenVINOConfig): ) class GraniteMoEOpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.45.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = GraniteMoEModelPatcher @@ -4009,7 +4022,8 @@ class T5OpenVINOConfig(T5OnnxConfig): library_name="transformers", ) class MT5OpenVINOConfig(T5OpenVINOConfig): - pass + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( @@ -4098,6 +4112,8 @@ class DeepseekOpenVINOConfig(MiniCPM3OpenVINOConfig): @register_in_tasks_manager("got_ocr2", *["image-to-text", "image-text-to-text"], library_name="transformers") class GotOCR2OpenVINOConfig(BaseVLMOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.49.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" def __init__( self, @@ -4130,6 +4146,8 @@ def __init__( @register_in_tasks_manager("gemma3", *["image-text-to-text"], library_name="transformers") class Gemma3OpenVINOConfig(BaseVLMOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.50.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" def __init__( self, @@ -4213,6 +4231,8 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int class Idefics3OpenVINOConfig(BaseVLMOpenVINOConfig): DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator, DummyVisionPositionIdsInputGenerator) MIN_TRANSFORMERS_VERSION = "4.46.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" def __init__( self, @@ -4271,6 +4291,8 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) @register_in_tasks_manager("smolvlm", *["image-text-to-text"], library_name="transformers") class SmolVLMOpenVINOConfig(Idefics3OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.50.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( @@ -4335,6 +4357,8 @@ class 
PegasusOpenVINOConfig(PegasusOnnxConfig): ) class MarianOpenVINOConfig(MarianOnnxConfig): _MODEL_PATCHER = MarianModelPatcher + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" class DummySpeechT5OpenVINOInputGenerator(DummyInputGenerator): @@ -4548,6 +4572,8 @@ class Llama4TextOpenVINOConfig(LlamaOpenVINOConfig): ) class Llama4OpenVINOConfig(GotOCR2OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.51.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): model_kwargs = model_kwargs or {} @@ -4789,6 +4815,8 @@ class Zamba2OpenVINOConfig(MambaOpenVINOConfig): DUMMY_PKV_GENERATOR_CLASS = Zamba2DummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig MIN_TRANSFORMERS_VERSION = "4.49.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = Zamba2ModelPatcher def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): @@ -5015,7 +5043,9 @@ class GPTBigCodeOpenVINOConfig(GPTBigCodeOnnxConfig): ], ) class Pix2StructOpenVINOConfig(Pix2StructOnnxConfig): - _MODEL_PATCHER = OVSeq2SeqModelPatcher + # _MODEL_PATCHER = OVSeq2SeqModelPatcher + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager("bert", *COMMON_TEXT_TASKS) @@ -5060,7 +5090,8 @@ class MobileBertOpenVINOConfig(MobileBertOnnxConfig): @register_in_tasks_manager("xlm", *COMMON_TEXT_TASKS) class XLMOpenVINOConfig(XLMOnnxConfig): - pass + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager("xlm-roberta", *COMMON_TEXT_TASKS) @@ -5085,7 +5116,8 @@ class CamembertOpenVINOConfig(CamembertOnnxConfig): @register_in_tasks_manager("flaubert", *COMMON_TEXT_TASKS) class FlaubertOpenVINOConfig(FlaubertOnnxConfig): - pass + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" 
@register_in_tasks_manager( @@ -5117,7 +5149,8 @@ class Data2VecAudioOpenVINOConfig(Data2VecAudioOnnxConfig): @register_in_tasks_manager("data2vec-text", *COMMON_TEXT_TASKS) class Data2VecTextOpenVINOConfig(Data2VecTextOnnxConfig): - pass + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager("data2vec-vision", *["feature-extraction", "image-classification"]) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 1a55242d5c..235eb8406d 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -856,7 +856,8 @@ def test_load_with_different_dtype(self): ) @parameterized.expand(EAGLE3_MODELS.items()) - @pytest.mark.skipif(is_transformers_version("<", "4.54"), reason="Eagle3 requires transformers >= 4.54") + # TODO (@echarlaix) transformers v5 support + @pytest.mark.skipif(is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"), reason="Eagle3 requires transformers >= 4.54") def test_load_and_infer_with_eagle3_model(self, model_arch, model_pair): draft_model_id, target_model_id = model_pair From 847c98d8235c931fa546057fcc815b98806eafaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 11:05:03 +0100 Subject: [PATCH 090/222] style --- tests/openvino/test_decoder.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 235eb8406d..07da27807b 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -857,7 +857,10 @@ def test_load_with_different_dtype(self): @parameterized.expand(EAGLE3_MODELS.items()) # TODO (@echarlaix) transformers v5 support - @pytest.mark.skipif(is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"), reason="Eagle3 requires transformers >= 4.54") + @pytest.mark.skipif( + is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"), + reason="Eagle3 requires 
transformers >= 4.54", + ) def test_load_and_infer_with_eagle3_model(self, model_arch, model_pair): draft_model_id, target_model_id = model_pair From 4bc2768eae5d2d18cc88aa0ecd6b2481835f7352 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 16:44:52 +0100 Subject: [PATCH 091/222] pkv fix --- optimum/exporters/openvino/model_patcher.py | 106 ++++++++++++++------ setup.py | 2 +- 2 files changed, 77 insertions(+), 31 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index b7084b7a34..a2a9d18fbc 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -24,6 +24,7 @@ import torch import torch.nn.functional as F from torch import nn +from transformers import DynamicCache, EncoderDecoderCache from transformers.cache_utils import Cache, DynamicCache, EncoderDecoderCache from transformers.configuration_utils import PretrainedConfig from transformers.generation import GenerationMixin @@ -77,14 +78,6 @@ TransformersKwargs = object -if is_transformers_version("<", "5"): - from transformers import DynamicCache as OVDynamicCache - from transformers import EncoderDecoderCache as OVEncoderDecoderCache -else: - from optimum.exporters.onnx.utils import LegacyDynamicCache as OVDynamicCache - from optimum.exporters.onnx.utils import LegacyEncoderDecoderCache as OVEncoderDecoderCache - - logger = logging.getLogger(__name__) @@ -95,6 +88,23 @@ def _get_subcomponent_model(model, name): return getattr(model, name) +def postprocess_past_key_values(past_key_values): + if isinstance(past_key_values, (EncoderDecoderCache, DynamicCache)): + if hasattr(past_key_values, "to_legacy_cache"): + past_key_values = past_key_values.to_legacy_cache() + elif isinstance(past_key_values, DynamicCache): + past_key_values = [(lay.keys, lay.values) for lay in past_key_values.layers] + elif isinstance(past_key_values, EncoderDecoderCache): + past_key_values = [ + 
(self_lay.keys, self_lay.values, cross_lay.keys, cross_lay.values) + for self_lay, cross_lay in zip( + past_key_values.self_attention_cache.layers, + past_key_values.cross_attention_cache.layers, + ) + ] + return past_key_values + + for idx, spec in enumerate(UNSUPPORTED_OPS_PATCHING_SPEC): if spec.name in { # onnx-exporter-specific fixes @@ -1382,7 +1392,11 @@ def phi3_442_forward( if use_cache: use_legacy_cache = not isinstance(past_key_values, Cache) if use_legacy_cache: - past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) + if is_transformers_version("<", "5"): + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + else: + past_key_values = DynamicCache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) if position_ids is None: @@ -1455,7 +1469,7 @@ def phi3_442_forward( next_cache = None if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + next_cache = postprocess_past_key_values(next_decoder_cache) if use_legacy_cache else next_decoder_cache if not return_dict: return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) return BaseModelOutputWithPast( @@ -2964,7 +2978,11 @@ def patched_forward(*args, **kwargs): legacy_pkv = args[pkv_argument_index] pkv_in_args = True if legacy_pkv is not None: - pkv = OVDynamicCache.from_legacy_cache(legacy_pkv) + if is_transformers_version("<", "5"): + pkv = DynamicCache.from_legacy_cache(legacy_pkv) + else: + pkv = DynamicCache(legacy_pkv) + return_legacy_cache = True if not pkv_in_args: kwargs["past_key_values"] = pkv @@ -2985,7 +3003,7 @@ def patched_forward(*args, **kwargs): outputs = self.orig_forward(*args, **kwargs) if return_legacy_cache: - outputs.past_key_values = outputs.past_key_values.to_legacy_cache() + outputs.past_key_values = postprocess_past_key_values(outputs.past_key_values) return outputs @@ -4118,7 +4136,11 @@ def forward_wrap( 
input_ids=None, use_cache=True, ): - new_past_key_values = OVDynamicCache.from_legacy_cache(past_key_values) + if is_transformers_version("<", "5"): + new_past_key_values = DynamicCache.from_legacy_cache(past_key_values) + else: + new_past_key_values = DynamicCache(past_key_values) + result = self.__orig_forward( input_ids=input_ids, attention_mask=attention_mask, @@ -4128,7 +4150,7 @@ def forward_wrap( use_cache=use_cache, ) if past_key_values is not None: - result["past_key_values"] = result["past_key_values"].to_legacy_cache() + result["past_key_values"] = postprocess_past_key_values(result["past_key_values"]) return result model.forward = types.MethodType(forward_wrap, model) @@ -4159,7 +4181,11 @@ def lm_forward( deepstack_visual_embeds, use_cache=True, ): - pkv = OVDynamicCache.from_legacy_cache(past_key_values) + if is_transformers_version("<", "5"): + pkv = DynamicCache.from_legacy_cache(past_key_values) + else: + pkv = DynamicCache(past_key_values) + outputs = self.model.language_model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, @@ -4172,7 +4198,7 @@ def lm_forward( hidden_states = outputs[0] # Only compute necessary logits, and do not upcast them to float if we are not computing the loss logits = self.lm_head(hidden_states) - return (logits, outputs.past_key_values.to_legacy_cache()) + return (logits, postprocess_past_key_values(outputs.past_key_values)) model.__orig_forward = model.forward model.forward = types.MethodType(lm_forward, model) @@ -4547,11 +4573,18 @@ def patched_forward(*args, **kwargs): pkv = args[pkv_arg_index] if pkv is not None: - if isinstance(pkv, OVEncoderDecoderCache): - pkv = pkv.self_attention_cache.to_legacy_cache() + if isinstance(pkv, EncoderDecoderCache): + pkv = postprocess_past_key_values(pkv.self_attention_cache) else: pkv = [pkv_item[:2] for pkv_item in pkv] - pkv = OVEncoderDecoderCache.from_legacy_cache(pkv) + + if is_transformers_version("<", "5"): + pkv = EncoderDecoderCache.from_legacy_cache(pkv) + 
else: + pkv = EncoderDecoderCache( + DynamicCache([layer[:2] for layer in pkv]), + DynamicCache([layer[2:] for layer in pkv]), + ) if "past_key_values" in kwargs: kwargs["past_key_values"] = pkv @@ -4561,8 +4594,8 @@ def patched_forward(*args, **kwargs): outputs = self.super_patched_forward(*args, **kwargs) # the optimum-onnx seq2seq model patcher only converts to tuple starting from 4.48 - if isinstance(outputs.get("past_key_values"), (OVDynamicCache, OVEncoderDecoderCache)): - outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() + if isinstance(outputs.get("past_key_values"), (DynamicCache, EncoderDecoderCache)): + outputs["past_key_values"] = postprocess_past_key_values(outputs["past_key_values"]) elif isinstance(outputs.get("past_key_values"), (DynamicCache, EncoderDecoderCache)): outputs.pop("past_key_values") @@ -4742,7 +4775,10 @@ def __init__( def forward( self, attention_mask, position_ids, past_key_values, token_type_ids, inputs_embeds, use_cache=True ): - pkv = OVDynamicCache.from_legacy_cache(past_key_values) + if is_transformers_version("<", "5"): + pkv = DynamicCache.from_legacy_cache(past_key_values) + else: + pkv = DynamicCache(past_key_values) past_seen_tokens = past_key_values[0][0].shape[-2] cache_position = torch.arange( @@ -4768,7 +4804,7 @@ def forward( **forward_kwargs, ) upd_pkv = result["past_key_values"] - result["past_key_values"] = upd_pkv.to_legacy_cache() + result["past_key_values"] = postprocess_past_key_values(upd_pkv) return result if is_transformers_version("<", "4.53.0"): @@ -5129,7 +5165,7 @@ def _blenderbot_attn_forward_new( query_states = query_states if past_key_value is not None: - if isinstance(past_key_value, OVEncoderDecoderCache): + if isinstance(past_key_value, EncoderDecoderCache): is_updated = past_key_value.is_updated.get(self.layer_idx) if is_cross_attention: # after the first generated id, we can subsequently re-use all key/value_states from cache @@ -5658,7 +5694,13 @@ def 
patched_decoder_forward( if past_key_values is not None: past_key_values = [cache_item[:2] for cache_item in past_key_values] if is_transformers_version(">=", "4.56"): - past_key_values = OVEncoderDecoderCache.from_legacy_cache(past_key_values) + if is_transformers_version("<", "5"): + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + else: + past_key_values = EncoderDecoderCache( + DynamicCache([layer[:2] for layer in past_key_values]), + DynamicCache([layer[2:] for layer in past_key_values]), + ) output_sequence = inputs_embeds output_cross_attentions = False @@ -5690,8 +5732,8 @@ def patched_decoder_forward( past_key_values = decoder_out.past_key_values if past_key_values is not None: - if isinstance(past_key_values, OVEncoderDecoderCache): - past_key_values = past_key_values.self_attention_cache.to_legacy_cache() + if isinstance(past_key_values, EncoderDecoderCache): + past_key_values = postprocess_past_key_values(past_key_values.self_attention_cache) else: past_key_values = [cache_item[:2] for cache_item in past_key_values] @@ -5742,7 +5784,11 @@ def __init__( # Adopted from https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py#L2156-L2178 # moved audio and vision features processing outside model def lm_forward(self, inputs_embeds, attention_mask, position_ids, past_key_values, use_cache=True): - pkv = OVDynamicCache.from_legacy_cache(past_key_values) + if is_transformers_version("<", "5"): + pkv = DynamicCache.from_legacy_cache(past_key_values) + else: + pkv = DynamicCache(past_key_values) + outputs = self.model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, @@ -5753,7 +5799,7 @@ def lm_forward(self, inputs_embeds, attention_mask, position_ids, past_key_value hidden_states = outputs[0] # Only compute necessary logits, and do not upcast them to float if we are not computing the loss logits = self.lm_head(hidden_states) - return (logits, 
outputs.past_key_values.to_legacy_cache()) + return (logits, postprocess_past_key_values(outputs.past_key_values)) model.__orig_forward = model.forward model.forward = types.MethodType(lm_forward, model) @@ -7858,7 +7904,7 @@ def forward( inputs_embeds: torch.Tensor = self.embed_tokens(input_ids) if use_cache and past_key_values is None: - past_key_values = OVDynamicCache(config=self.config) + past_key_values = DynamicCache(config=self.config) if cache_position is None: past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 diff --git a/setup.py b/setup.py index 16e2a82fed..25a5a01a97 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", - "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers-v5", + "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@xadupre/investigate", "transformers>=4.45,<5.1", "setuptools", "huggingface-hub>=0.23.2,<2.0", From 6799e939ede93ae3205753b80b9fc42ee31587f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 18:21:57 +0100 Subject: [PATCH 092/222] transformers-v5 branch --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 25a5a01a97..16e2a82fed 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", - "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@xadupre/investigate", + "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers-v5", "transformers>=4.45,<5.1", "setuptools", "huggingface-hub>=0.23.2,<2.0", From a2cd48ec1a7c87549bfe86d4db0309c3d670d8c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 18:28:41 +0100 Subject: [PATCH 093/222] use_model_defaults arg was deprecated in v5 --- tests/openvino/test_decoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py 
index 07da27807b..9bcef5f2f0 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -478,7 +478,7 @@ def test_pipeline(self, model_arch): tokenizer._convert_tokens_to_ids = lambda x: 0 additional_args = {} - if is_transformers_version(">=", "4.51"): + if is_transformers_version(">=", "4.51") and is_transformers_version("<", "5"): additional_args["use_model_defaults"] = False set_seed(SEED) @@ -784,7 +784,7 @@ def test_beam_search(self, model_arch): ov_model_stateless.config.eos_token_id = None transformers_model.config.eos_token_id = None - if is_transformers_version(">=", "4.51"): + if is_transformers_version(">=", "4.51") and is_transformers_version("<", "5"): additional_inputs["use_model_defaults"] = False for gen_config in gen_configs: From 850c1cee66fbd5fde919d6e8b2a163bd372ba2d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 18:33:18 +0100 Subject: [PATCH 094/222] style --- optimum/exporters/openvino/model_patcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index a2a9d18fbc..89d295c0e8 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -24,7 +24,6 @@ import torch import torch.nn.functional as F from torch import nn -from transformers import DynamicCache, EncoderDecoderCache from transformers.cache_utils import Cache, DynamicCache, EncoderDecoderCache from transformers.configuration_utils import PretrainedConfig from transformers.generation import GenerationMixin From af4a6059d0aa27f7fa091401bc3be89a0cc56e14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 18:42:20 +0100 Subject: [PATCH 095/222] baichuan remote code models incompatible with v5 --- optimum/exporters/openvino/model_configs.py | 1 + tests/openvino/test_decoder.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git 
a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index a25c3e7b8e..069286d2f1 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -330,6 +330,7 @@ class BaichaunOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): num_layers="num_hidden_layers", num_attention_heads="num_attention_heads", hidden_size="hidden_size" ) _MODEL_PATCHER = BaichuanModelPatcher + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 9bcef5f2f0..d079e04539 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -47,7 +47,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ( "bart", - "baichuan2-13b", "gpt_bigcode", "bigbird_pegasus", "blenderbot", @@ -157,6 +156,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "jais", "dbrx", "baichuan2", + "baichuan2-13b", ) GENERATION_LENGTH = 100 From 4da53e8c4037434d472f2c8ef11e628cfc50eb81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 19:17:47 +0100 Subject: [PATCH 096/222] remove tests for modelsf for models that needs fixes --- tests/openvino/test_decoder.py | 28 ++++++++++++++++++---------- tests/openvino/test_modeling.py | 13 ++++++++----- tests/openvino/test_seq2seq.py | 32 ++++++++++++++++++++++++-------- 3 files changed, 50 insertions(+), 23 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index d079e04539..6782574c01 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -58,7 +58,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "gpt_neo", "gpt_neox", "llama", - "marian", "mistral", "mixtral", "mpt", @@ -72,9 +71,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "biogpt", "gpt_neox_japanese", "xglm", - "aquila", - "xverse", - "internlm", 
"gemma", "olmo", "stablelm", @@ -85,12 +81,12 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "phi3", "gemma2", "granite", - "granitemoe", ) SUPPORTED_SSM_ARCHITECTURES = ("mamba", "falcon_mamba") - if is_transformers_version(">=", "4.49"): + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">=", "4.49") and is_transformers_version("<", "5"): SUPPORTED_SSM_ARCHITECTURES += ("zamba2",) if is_transformers_version(">=", "4.53.0"): @@ -102,11 +98,15 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += SUPPORTED_SSM_ARCHITECTURES if is_transformers_version(">=", "4.46.0"): - SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo", "phimoe") + SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo") if is_transformers_version("<", "4.54.0"): SUPPORTED_ARCHITECTURES += ("deepseek",) + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("phimoe",) + # gptq and awq install disabled for windows test environment if platform.system() != "Windows" and is_transformers_version("<", "4.56.0"): SUPPORTED_ARCHITECTURES += ("opt_gptq", "mixtral_awq") @@ -145,8 +145,8 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen", "chatglm", "chatglm4") if is_transformers_version("<", "5"): - # TODO: add dbrx back once fixed in transformers SUPPORTED_ARCHITECTURES += ( + # remote modeling incompatible with v5 "codegen2", "exaone", "decilm", @@ -154,11 +154,19 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "orion", "aquila2", "jais", - "dbrx", "baichuan2", "baichuan2-13b", + # remote modeling code failing with v5 + "aquila", + "xverse", + "internlm", + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + "dbrx", + # "phimoe", + "marian", + "granitemoe", + # "zamba2", ) - GENERATION_LENGTH = 100 EXPECTED_NUM_SDPA = { diff --git 
a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 4eccde4c87..8d8ab01147 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -757,14 +757,16 @@ class OVModelForSequenceClassificationIntegrationTest(unittest.TestCase): "convbert", "distilbert", "electra", - "flaubert", "ibert", "roberta", "roformer", "squeezebert", - "xlm", ) + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("flaubert", "xlm") + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] @@ -1087,13 +1089,11 @@ class OVModelForMaskedLMIntegrationTest(unittest.TestCase): "bert", "camembert", "convbert", - "data2vec-text", "deberta", "deberta-v2", "distilbert", "electra", "esm", - "flaubert", "ibert", "mobilebert", "mpnet", @@ -1102,7 +1102,6 @@ class OVModelForMaskedLMIntegrationTest(unittest.TestCase): "roberta", "roformer", "squeezebert", - "xlm", "xlm-roberta", ) @@ -1110,6 +1109,10 @@ class OVModelForMaskedLMIntegrationTest(unittest.TestCase): if is_transformers_version("<", "4.51.0"): SUPPORTED_ARCHITECTURES += ("nystromformer",) + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("data2vec-text", "flaubert", "xlm") + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 153f57be8e..d0e5f88b71 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -145,7 +145,6 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): "longt5", "m2m_100", "mbart", - "mt5", "pegasus", "t5", ) @@ -159,6 +158,10 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): # There are known issues with 
marian model on OpenVINO 2025.3.x and 2025.4.x SUPPORTED_ARCHITECTURES += ("marian",) + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("mt5",) + SUPPORT_STATEFUL = ("t5", "mt5", "longt5") if is_transformers_version(">=", "4.52.0"): SUPPORT_STATEFUL += ("bart", "blenderbot", "blenderbot-small", "m2m_100", "marian", "mbart") @@ -535,10 +538,8 @@ def test_pipeline(self, model_arch: str): class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES = [ - "llava", "llava_next", "llava_next_mistral", - "llava_next_video", "qwen2_vl", ] SUPPORT_VIDEO = ["llava_next_video", "qwen2_vl"] @@ -547,20 +548,31 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): TASK = "image-text-to-text" if is_transformers_version(">=", "4.46.0"): - SUPPORTED_ARCHITECTURES += ["maira2", "idefics3"] + SUPPORTED_ARCHITECTURES += ["maira2"] + + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ["idefics3"] if is_transformers_version(">=", "4.49.0"): - SUPPORTED_ARCHITECTURES += ["qwen2_5_vl", "got_ocr2"] + SUPPORTED_ARCHITECTURES += ["qwen2_5_vl"] SUPPORT_VIDEO.append("qwen2_5_vl") + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ["got_ocr2"] + if is_transformers_version("<", "4.54.0"): # remote code models differs after transformers v4.54 SUPPORTED_ARCHITECTURES += ["phi4mm"] SUPPORT_AUDIO.append("phi4mm") - if is_transformers_version(">", "4.49"): - SUPPORTED_ARCHITECTURES += ["gemma3", "smolvlm"] - if is_transformers_version(">=", "4.51"): + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">", "4.49") and is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ["gemma3", "smolvl"] + + # TODO: add fix for v5 and 
update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">=", "4.51") and is_transformers_version("<", "5"): # SUPPORTED_ARCHITECTURES += ["llama4", "phi4_multimodal"] SUPPORTED_ARCHITECTURES += ["llama4"] @@ -578,6 +590,10 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): # remote code models incompatible after transformers v5 SUPPORTED_ARCHITECTURES += ["internvl_chat", "minicpmv"] + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("llava", "llava_next_video") + REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"] IMAGE = Image.open( requests.get( From 5a74781777df1600644601c52e5854b3d9bfa113 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 19:45:14 +0100 Subject: [PATCH 097/222] fix decoder tests untested_architectures --- tests/openvino/test_decoder.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 6782574c01..9a6acf1cb7 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -126,6 +126,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.53.0"): SUPPORTED_ARCHITECTURES += ("arcee",) + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version(">=", "4.52.1") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("bitnet",) @@ -305,9 +306,13 @@ def test_find_untested_architectures(self): supported_architectures -= {"lfm2"} # qwen3_vl_text a part of qwen3_vl architecture and is tested in seq2seq group - if is_transformers_version(">=", str(Qwen3VLOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): + if is_transformers_version(">", str(Qwen3VLOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): supported_architectures -= {"qwen3_vl_text"} + # TODO: add 
fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">=", "5"): + supported_architectures -= {"phimoe", "granitemoe", "bitnet", "dbrx", "zamba2", "marian"} + supported_architectures -= ONNX_SUPPORTED_ARCHITECTURES untested_architectures = supported_architectures - tested_architectures From e634d777eb815f50f12366a796b85554056b059d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 19:51:46 +0100 Subject: [PATCH 098/222] fix untested architecture --- tests/openvino/test_seq2seq.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index d0e5f88b71..70e43293e0 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -161,6 +161,8 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("mt5",) + else: + UNSUPPORTED_ARCHITECTURES = {"marian", "mt5"} SUPPORT_STATEFUL = ("t5", "mt5", "longt5") if is_transformers_version(">=", "4.52.0"): @@ -593,7 +595,17 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("llava", "llava_next_video") - + else: + UNSUPPORTED_ARCHITECTURES = { + "got_ocr2", + "idefics3", + "llama4", + "llava_next_video", + "phi4_multimodal", + "gemma3", + "smolvlm", + "llava", + } REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"] IMAGE = Image.open( requests.get( From f89d0de33d40d71cfcd07b885f6a85bdbf700de3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 21:57:28 +0100 Subject: [PATCH 099/222] fix pkv patching --- optimum/exporters/openvino/model_patcher.py | 10 ++-------- 1 file changed, 2 
insertions(+), 8 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 89d295c0e8..82a25f3098 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4580,10 +4580,7 @@ def patched_forward(*args, **kwargs): if is_transformers_version("<", "5"): pkv = EncoderDecoderCache.from_legacy_cache(pkv) else: - pkv = EncoderDecoderCache( - DynamicCache([layer[:2] for layer in pkv]), - DynamicCache([layer[2:] for layer in pkv]), - ) + pkv = EncoderDecoderCache(DynamicCache(pkv), DynamicCache()) if "past_key_values" in kwargs: kwargs["past_key_values"] = pkv @@ -5696,10 +5693,7 @@ def patched_decoder_forward( if is_transformers_version("<", "5"): past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) else: - past_key_values = EncoderDecoderCache( - DynamicCache([layer[:2] for layer in past_key_values]), - DynamicCache([layer[2:] for layer in past_key_values]), - ) + past_key_values = EncoderDecoderCache(DynamicCache(past_key_values), DynamicCache()) output_sequence = inputs_embeds output_cross_attentions = False From 6070155e197b19f8553a62978003160d37bf724a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 22:27:15 +0100 Subject: [PATCH 100/222] fix test --- tests/openvino/test_seq2seq.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 70e43293e0..2737059e50 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -380,7 +380,7 @@ def test_compare_to_transformers(self, model_arch): ) generate_kwrgs = {} - if is_transformers_version(">=", "4.50"): + if is_transformers_version(">=", "4.50") and is_transformers_version("<", "5"): generate_kwrgs = {"use_model_defaults": False} gen_config = GenerationConfig( @@ -571,7 +571,7 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): # 
TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version(">", "4.49") and is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ["gemma3", "smolvl"] + SUPPORTED_ARCHITECTURES += ["gemma3", "smolvlm"] # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version(">=", "4.51") and is_transformers_version("<", "5"): From 26d5c4413cdb0d37fd99aff736013390c541ac09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Feb 2026 22:36:52 +0100 Subject: [PATCH 101/222] fix expcted int8 tests --- tests/openvino/test_quantization.py | 4 ++-- tests/openvino/utils_tests.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index b39ee223ae..f9bde752b3 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -269,7 +269,7 @@ class OVQuantizerTest(unittest.TestCase): "model": 33, }, { - "model": {"int8": 35}, + "model": {"int8": 35 if is_transformers_version("<", "5") else 36}, }, ), ( @@ -299,7 +299,7 @@ class OVQuantizerTest(unittest.TestCase): "model": 32, }, { - "model": {"int8": 34}, + "model": {"int8": 34 if is_transformers_version("<", "5") else 35}, }, ), ( diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 85f79801cd..06314ef394 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -231,19 +231,19 @@ _ARCHITECTURES_TO_EXPECTED_INT8 = { "afmoe": {"model": 16}, - "bert": {"model": 68}, + "bert": {"model": 68 if is_transformers_version("<", "5") else 70}, "roberta": {"model": 68}, "albert": {"model": 84}, "vit": {"model": 64}, - "blenderbot": {"model": 70}, + "blenderbot": {"model": 70 if is_transformers_version("<", "5") else 72}, "gpt2": {"model": 44}, "granitemoehybrid": {"model": 118}, "wav2vec2": {"model": 34}, "distilbert": {"model": 66}, "t5": { "encoder": 64, - "decoder": 104, 
- "decoder_with_past": 84, + "decoder": 104 if is_transformers_version("<", "5") else 106, + "decoder_with_past": 84 if is_transformers_version("<", "5") else 86, }, "stable-diffusion": { "unet": 242, From 9d84f3a4870a501fd4591a8b8473e5e1879c6217 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 10:27:45 +0100 Subject: [PATCH 102/222] tests transformers v5 --- tests/openvino/test_seq2seq.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 2737059e50..da68d6e8b9 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -862,7 +862,11 @@ def test_compare_to_transformers(self, model_arch): gc.collect() - @parameterized.expand(["llava", "llava_next", "llava_next_video", "llava_next_mistral"]) + @parameterized.expand( + ["llava", "llava_next", "llava_next_video", "llava_next_mistral"] + if is_transformers_version("<", "5") + else ["llava_next", "llava_next_mistral"] + ) def test_llava_with_new_preprocessing(self, model_arch): prompt = "\n What is shown in this image?" 
model_id = MODEL_NAMES[model_arch] From 4b5f83d4f5169513c467dd3b3a9dfdf9fc43006e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 10:29:41 +0100 Subject: [PATCH 103/222] pix2struct --- optimum/exporters/openvino/model_configs.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 069286d2f1..ce617dc3ea 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -5044,9 +5044,7 @@ class GPTBigCodeOpenVINOConfig(GPTBigCodeOnnxConfig): ], ) class Pix2StructOpenVINOConfig(Pix2StructOnnxConfig): - # _MODEL_PATCHER = OVSeq2SeqModelPatcher - # TODO (@echarlaix): add v5 support - MAX_TRANSFORMERS_VERSION = "4.57.6" + _MODEL_PATCHER = OVSeq2SeqModelPatcher @register_in_tasks_manager("bert", *COMMON_TEXT_TASKS) From 14e1b524547bc44e08294013325abbea2e63c481 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 10:42:44 +0100 Subject: [PATCH 104/222] fix num expected int8 --- tests/openvino/test_quantization.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index f9bde752b3..a249624023 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -135,8 +135,8 @@ class OVQuantizerTest(unittest.TestCase): (OVModelForSequenceClassification, "bert", 32, 35), (OVModelForCausalLM, "gpt2", 31, 22), (OVSentenceTransformer, "sentence-transformers-bert", 12, 15), - (OVModelForFeatureExtraction, "blenderbot", 33, 35), - (OVModelForMaskedLM, "roberta", 32, 34), + (OVModelForFeatureExtraction, "blenderbot", 33, 35 if is_transformers_version("<", "5") else 36), + (OVModelForMaskedLM, "roberta", 32, 34 if is_transformers_version("<", "5") else 35), (OVModelForZeroShotImageClassification, "clip", 65, 65), ) 
SUPPORTED_ARCHITECTURES_OV_MODEL_WITH_AUTO_DATASET = [ @@ -344,12 +344,12 @@ class OVQuantizerTest(unittest.TestCase): if is_transformers_version("<=", "4.45") else { "encoder": 30, - "decoder": 52, + "decoder": 52 if is_transformers_version("<", "5") else 53, }, ( {"encoder": {"int8": 32}, "decoder": {"int8": 52}, "decoder_with_past": {"int8": 42}} if is_transformers_version("<=", "4.45") - else {"encoder": {"int8": 32}, "decoder": {"int8": 52}} + else {"encoder": {"int8": 32}, "decoder": {"int8": 52 if is_transformers_version("<", "5") else 53}} ), ), ( @@ -596,7 +596,9 @@ class OVWeightCompressionTest(unittest.TestCase): (OVModelForCausalLM, "gpt2", 44, 44), ) - SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 62, 43),) + SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = ( + (OVModelForCausalLM, "opt125m", 62 if is_transformers_version("<", "5") else 64, 43), + ) SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 0, 74),) SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "gpt2", 44, 44),) From 0dbe96c293c68dc66e6fdf9a0213d312d004c943 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 10:44:15 +0100 Subject: [PATCH 105/222] use_model_defaults deprecated in v5 --- tests/openvino/test_quantization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index a249624023..fc4f9ea102 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -2497,7 +2497,7 @@ def check_model_inference(ov_model, model_id, trust_remote_code): if isinstance(ov_model, OVModelForSpeechSeq2Seq): input_features = torch.randn((1, ov_model.config.num_mel_bins, 3000), dtype=torch.float32) generate_kwrgs = {} - if is_transformers_version(">=", "4.50"): + if is_transformers_version(">=", 
"4.50") and is_transformers_version("<", "5"): generate_kwrgs = {"use_model_defaults": False} ov_model.generate(input_features, generation_config=gen_config, **generate_kwrgs) else: From af3fba3d7800adb4ab6dfd0f118cfac1c33bd962 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 11:01:50 +0100 Subject: [PATCH 106/222] rename --- optimum/exporters/openvino/model_configs.py | 20 ++++++++++---------- optimum/exporters/openvino/model_patcher.py | 12 +++++------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index ce617dc3ea..fb7acb865d 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -202,7 +202,7 @@ SanaTextEncoderModelPatcher, XverseModelPatcher, Zamba2ModelPatcher, - _get_subcomponent_model, + _get_model_attribute, ) @@ -1878,14 +1878,14 @@ def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]) behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.LANGUAGE: - return _get_subcomponent_model(model, "language_model") if not hasattr(model, "lm_head") else model + return _get_model_attribute(model, "language_model") if not hasattr(model, "lm_head") else model if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return model if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: text_embedding = model.get_input_embeddings() - text_embedding.config = _get_subcomponent_model(model, "language_model").config + text_embedding.config = _get_model_attribute(model, "language_model").config return text_embedding def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): @@ -2162,14 +2162,14 @@ def get_model_for_behavior(model, behavior: Union[str, VLMConfigBehavior]): behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.LANGUAGE: - return _get_subcomponent_model(model, "language_model") 
+ return _get_model_attribute(model, "language_model") if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return model if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: - text_embedding = _get_subcomponent_model(model, "language_model").get_input_embeddings() - text_embedding.config = _get_subcomponent_model(model, "language_model").config + text_embedding = _get_model_attribute(model, "language_model").get_input_embeddings() + text_embedding.config = _get_model_attribute(model, "language_model").config return text_embedding def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): @@ -3696,12 +3696,12 @@ def get_model_for_behavior(model, behavior: Union[str, QwenVLConfigBehavior]): return model if behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS: - vision_embeddings = _get_subcomponent_model(model, "visual").patch_embed + vision_embeddings = _get_model_attribute(model, "visual").patch_embed vision_embeddings.config = model.config.vision_config return vision_embeddings if behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_MERGER: - vision_emb_merger = _get_subcomponent_model(model, "visual") + vision_emb_merger = _get_model_attribute(model, "visual") vision_emb_merger.config = model.config.vision_config return vision_emb_merger @@ -3709,7 +3709,7 @@ def get_model_for_behavior(model, behavior: Union[str, QwenVLConfigBehavior]): text_embedding = ( model.model.embed_tokens if hasattr(model.model, "embed_tokens") - else _get_subcomponent_model(model, "language_model").embed_tokens + else _get_model_attribute(model, "language_model").embed_tokens ) text_embedding.config = model.config return text_embedding @@ -3849,7 +3849,7 @@ def __init__( @staticmethod def get_model_for_behavior(model, behavior: Union[str, QwenVLConfigBehavior]): if behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS: - vision_emb_pos = _get_subcomponent_model(model, "visual").pos_embed + vision_emb_pos = _get_model_attribute(model, 
"visual").pos_embed vision_emb_pos.config = model.config.vision_config return vision_emb_pos diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 82a25f3098..a617289c8e 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -80,13 +80,6 @@ logger = logging.getLogger(__name__) -def _get_subcomponent_model(model, name): - if is_transformers_version(">=", "5") and hasattr(model, "model"): - return getattr(model.model, name) - - return getattr(model, name) - - def postprocess_past_key_values(past_key_values): if isinstance(past_key_values, (EncoderDecoderCache, DynamicCache)): if hasattr(past_key_values, "to_legacy_cache"): @@ -104,6 +97,11 @@ def postprocess_past_key_values(past_key_values): return past_key_values +def _get_model_attribute(model, name): + target = getattr(model, "model", model) + return getattr(target, name) + + for idx, spec in enumerate(UNSUPPORTED_OPS_PATCHING_SPEC): if spec.name in { # onnx-exporter-specific fixes From 546127bdee7899e99fa505fadb8bf85b6a2a7a79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 11:04:12 +0100 Subject: [PATCH 107/222] style --- optimum/exporters/openvino/model_patcher.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index a617289c8e..0910f4de3f 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3256,7 +3256,7 @@ def llava_vision_embed_forward(self, pixel_values): # copied from https://github.com/huggingface/transformers/blob/v4.44.2/src/transformers/models/llava/modeling_llava.py#L428-L441 # these changes does not bring any difference from original, it only packs model subcomponent inference together # that allow us avoid memory overheads and their inference results handling on code-level - image_outputs = 
_get_subcomponent_model(self, "vision_tower")(pixel_values, output_hidden_states=True) + image_outputs = _get_model_attribute(self, "vision_tower")(pixel_values, output_hidden_states=True) # this is not memory efficient at all (output_hidden_states=True) will save all the hidden stated. selected_image_feature = image_outputs.hidden_states[self.config.vision_feature_layer] @@ -3267,7 +3267,7 @@ def llava_vision_embed_forward(self, pixel_values): else: raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") - image_features = _get_subcomponent_model(self, "multi_modal_projector")(selected_image_feature) + image_features = _get_model_attribute(self, "multi_modal_projector")(selected_image_feature) return image_features @@ -3275,7 +3275,7 @@ def llava_next_video_vision_embed_forward(self, pixel_values): # copied from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/llava_next_video/modeling_llava_next_video.py#L519 # these changes does not bring any difference from original, it only packs model subcomponent inference together # that allow us avoid memory overheads and their inference results handling on code-level - image_features = _get_subcomponent_model(self, "vision_tower")(pixel_values, output_hidden_states=True) + image_features = _get_model_attribute(self, "vision_tower")(pixel_values, output_hidden_states=True) vision_feature_layer = self.config.vision_feature_layer if isinstance(vision_feature_layer, int): selected_image_feature = image_features.hidden_states[vision_feature_layer] From 3eeeb4dc64d6eeeadd1b9cdac309f36838e9b36a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 11:10:09 +0100 Subject: [PATCH 108/222] install diffusers from source for v5 --- .github/workflows/test_openvino.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 
085619c5fa..48e3a7409b 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -54,7 +54,7 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip uv - uv pip install .[tests] librosa + uv pip install .[tests] librosa diffusers - if: ${{ matrix.test-pattern == '*modeling*' }} name: Install OpenVINO @@ -64,7 +64,12 @@ jobs: - if: ${{ matrix.transformers-version != 'latest' }} name: Install transformers run: | - uv pip install transformers==${{ matrix.transformers-version }} diffusers + uv pip install transformers==${{ matrix.transformers-version }} + + - if: ${{ matrix.transformers-version == 'latest' }} + name: Install diffusers + run: | + uv pip install git+https://github.com/huggingface/diffusers - if: ${{ matrix.transformers-version == '4.45.0' }} name: Install specific dependencies and versions required for older transformers From 2b61bd38e7b375f18c018e83072e2c00d258db4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 15:44:38 +0100 Subject: [PATCH 109/222] qwen2vl --- optimum/exporters/openvino/model_patcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 0910f4de3f..53b7340962 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -98,7 +98,7 @@ def postprocess_past_key_values(past_key_values): def _get_model_attribute(model, name): - target = getattr(model, "model", model) + target = getattr(model, "model", model) if is_transformers_version(">=", "5") else model return getattr(target, name) From bbe65bbff2e8628e8e694535c3ca74f0c216e65b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 15:57:19 +0100 Subject: [PATCH 110/222] remove tests for v5 --- tests/openvino/test_quantization.py | 114 ++++++++++++++++------------ 1 file changed, 64 insertions(+), 50 deletions(-) diff --git 
a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index fc4f9ea102..f66ae8834d 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -342,10 +342,7 @@ class OVQuantizerTest(unittest.TestCase): ), {"encoder": 30, "decoder": 52, "decoder_with_past": 61} if is_transformers_version("<=", "4.45") - else { - "encoder": 30, - "decoder": 52 if is_transformers_version("<", "5") else 53, - }, + else {"encoder": 30, "decoder": 52}, ( {"encoder": {"int8": 32}, "decoder": {"int8": 52}, "decoder_with_past": {"int8": 42}} if is_transformers_version("<=", "4.45") @@ -1064,9 +1061,6 @@ class OVWeightCompressionTest(unittest.TestCase): (OVStableDiffusionPipeline, "stable-diffusion", False), (OVStableDiffusionXLPipeline, "stable-diffusion-xl", False), (OVModelOpenCLIPForZeroShotImageClassification, "open-clip", False), - (OVModelForVisualCausalLM, "llava", False), - (OVModelForVisualCausalLM, "llava_next_video", False), - (OVModelForVisualCausalLM, "minicpmv", True), (OVModelForVisualCausalLM, "qwen2_vl", False), ] @@ -1082,6 +1076,15 @@ class OVWeightCompressionTest(unittest.TestCase): if is_transformers_version(">=", "4.57.0"): SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "qwen3_vl", False)) + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.extend( + [ + (OVModelForVisualCausalLM, "llava", False), + (OVModelForVisualCausalLM, "llava_next_video", False), + (OVModelForVisualCausalLM, "minicpmv", True), + ] + ) + SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = [ (OVStableDiffusionPipeline, "stable-diffusion", 72, 195), (OVStableDiffusionXLPipeline, "stable-diffusion-xl", 84, 331), @@ -1119,17 +1122,6 @@ class OVWeightCompressionTest(unittest.TestCase): "text_encoder": {}, }, ), - ( - OVModelForVisualCausalLM, - "llava", - 4, - {"bits": 4, "group_size": 8, "ratio": 0.5}, - { - "lm_model": {"int8": 22, "int4": 8}, - "text_embeddings_model": 
{"int8": 1}, - "vision_embeddings_model": {"int8": 9}, - }, - ), ( OVSamModel, "sam", @@ -1183,15 +1175,6 @@ class OVWeightCompressionTest(unittest.TestCase): }, }, ), - ( - OVModelForVisualCausalLM, - "llava", - { - "lm_model": {"patterns": [".*layers.0.self_attn.q_proj/aten::linear/MatMul"]}, - "vision_embeddings_model": {"patterns": [".*layers.0.self_attn.q_proj/aten::linear/MatMul"]}, - "text_embeddings_model": {"patterns": ["."]}, - }, - ), ( OVSamModel, "sam", @@ -1212,6 +1195,33 @@ class OVWeightCompressionTest(unittest.TestCase): ), ] + if is_transformers_version("<", "5"): + DEFAULT_COMPRESSION_CONFIGURATIONS.append( + ( + OVModelForVisualCausalLM, + "llava", + 4, + {"bits": 4, "group_size": 8, "ratio": 0.5}, + { + "lm_model": {"int8": 22, "int4": 8}, + "text_embeddings_model": {"int8": 1}, + "vision_embeddings_model": {"int8": 9}, + }, + ), + ) + + DEFAULT_IGNORED_SCOPE_CONFIGURATIONS.append( + ( + OVModelForVisualCausalLM, + "llava", + { + "lm_model": {"patterns": [".*layers.0.self_attn.q_proj/aten::linear/MatMul"]}, + "vision_embeddings_model": {"patterns": [".*layers.0.self_attn.q_proj/aten::linear/MatMul"]}, + "text_embeddings_model": {"patterns": ["."]}, + }, + ), + ) + def test_filtered_architectures(cls): expected = set() if is_transformers_version("<", "4.49"): @@ -1800,31 +1810,35 @@ class OVPipelineQuantizationTest(unittest.TestCase): {"encoder": 14, "decoder": 22}, {"encoder": {"int8": 14}, "decoder": {"int8": 22}}, ), - ( - OVModelForVisualCausalLM, - "internvl_chat", - True, - dict( - quantization_configs={ - "lm_model": dict(bits=8, weight_only=True), - "vision_embeddings_model": dict(bits=8, weight_only=False), + ] + + if is_transformers_version("<", "5"): + PIPELINE_QUANTIZATION_SCOPE.append( + ( + OVModelForVisualCausalLM, + "internvl_chat", + True, + dict( + quantization_configs={ + "lm_model": dict(bits=8, weight_only=True), + "vision_embeddings_model": dict(bits=8, weight_only=False), + }, + dataset="contextual", + num_samples=1, + 
default_config=dict(bits=8, sym=True, weight_only=True), + ), + { + "lm_model": 0, + "text_embeddings_model": 0, + "vision_embeddings_model": 15, + }, + { + "lm_model": {"int8": 30}, + "text_embeddings_model": {"int8": 1}, + "vision_embeddings_model": {"int8": 11}, }, - dataset="contextual", - num_samples=1, - default_config=dict(bits=8, sym=True, weight_only=True), ), - { - "lm_model": 0, - "text_embeddings_model": 0, - "vision_embeddings_model": 15, - }, - { - "lm_model": {"int8": 30}, - "text_embeddings_model": {"int8": 1}, - "vision_embeddings_model": {"int8": 11}, - }, - ), - ] + ) if is_transformers_version(">=", "4.49.0") and is_transformers_version("<", "4.54.0"): PIPELINE_QUANTIZATION_SCOPE.extend( From 7ba6fd1289612fa92476194bc9aa7f16316a2a52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 16:37:42 +0100 Subject: [PATCH 111/222] disable tests for transformers v5 --- tests/openvino/test_genai.py | 41 +++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index b31ca1569e..74f6bab1ec 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -45,7 +45,6 @@ class LLMPipelineTestCase(unittest.TestCase): "gpt_bigcode", "bloom", "codegen", - "codegen2", "gpt2", "gptj", "gpt_neox", @@ -53,37 +52,29 @@ class LLMPipelineTestCase(unittest.TestCase): "mistral", "mixtral", "phi", - "internlm2", - "orion", "falcon", "persimmon", "xglm", - "aquila", - "aquila2", - "internlm", - "jais", - "decilm", "gemma", "olmo", "stablelm", "starcoder2", - "dbrx", "cohere", "qwen2", "qwen2_moe", "phi3", "gemma2", - "exaone", "granite", - "granitemoe", ) if is_transformers_version(">=", "4.46.0"): - SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo", "phimoe", "opt") + SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo", "opt") if is_transformers_version("<", "4.54.0"): SUPPORTED_ARCHITECTURES += ("deepseek",) if 
is_transformers_version("<", "4.56.0"): SUPPORTED_ARCHITECTURES += ("qwen",) + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("phimoe",) if is_transformers_version(">=", "4.49"): SUPPORTED_ARCHITECTURES += ("gemma3_text",) if is_transformers_version(">=", "4.51.0"): @@ -101,6 +92,25 @@ class LLMPipelineTestCase(unittest.TestCase): if is_transformers_version("<", "4.56.0"): SUPPORTED_ARCHITECTURES += ("chatglm", "chatglm4") + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ( + # remote modeling incompatible with v5 + "codegen2", + "exaone", + "decilm", + "internlm2", + "orion", + "aquila2", + "jais", + # remote modeling code failing with v5 + "aquila", + "internlm", + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + "dbrx", + # "phimoe", + "granitemoe", + ) + REMOTE_CODE_MODELS = ( "chatglm", "minicpm", @@ -200,9 +210,7 @@ def test_compare_outputs(self, model_arch): class VLMPipelineTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES = ( - "llava", "llava_next", - "llava_next_video", # "minicpmv", # output is truncated for some reason "qwen2_vl", ) @@ -216,8 +224,11 @@ class VLMPipelineTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen2_5_vl",) if is_transformers_version("<", "4.54.0"): SUPPORTED_ARCHITECTURES += ("phi4mm",) - if is_transformers_version(">=", "4.49"): + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">=", "4.49") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("gemma3",) + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("llava", "llava_next_video") REMOTE_CODE_MODELS = ( "minicpmv", From 928fb5009f60ac2478b818bb39aa17bd72eadf93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 16:42:24 +0100 Subject: [PATCH 112/222] remove non needed --- optimum/exporters/openvino/model_patcher.py | 2 -- 1 file changed, 2 deletions(-) diff --git 
a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 53b7340962..1be2bfe437 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4590,8 +4590,6 @@ def patched_forward(*args, **kwargs): # the optimum-onnx seq2seq model patcher only converts to tuple starting from 4.48 if isinstance(outputs.get("past_key_values"), (DynamicCache, EncoderDecoderCache)): outputs["past_key_values"] = postprocess_past_key_values(outputs["past_key_values"]) - elif isinstance(outputs.get("past_key_values"), (DynamicCache, EncoderDecoderCache)): - outputs.pop("past_key_values") # we still need to filter out cross attention in the case of non-stateful decoder filtered_outputs = {} From ef320b3be74f090a861e30ac4c45cc76ffafa071 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 19:11:20 +0100 Subject: [PATCH 113/222] disable tests --- tests/openvino/test_export.py | 9 ++- tests/openvino/test_exporters_cli.py | 82 +++++++++++++++++----------- tests/openvino/utils_tests.py | 6 +- 3 files changed, 58 insertions(+), 39 deletions(-) diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 18811bd121..6cc28c8597 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -84,7 +84,6 @@ class ExportModelTest(unittest.TestCase): "stable-diffusion-xl": OVStableDiffusionXLPipeline, "stable-diffusion-xl-refiner": OVStableDiffusionXLImg2ImgPipeline, "latent-consistency": OVLatentConsistencyModelPipeline, - "llava": OVModelForVisualCausalLM, "sam": OVSamModel, "speecht5": OVModelForTextToSpeechSeq2Seq, "clip": OVModelForZeroShotImageClassification, @@ -95,7 +94,7 @@ class ExportModelTest(unittest.TestCase): "ltx-video": OVLTXPipeline, } - if is_transformers_version(">=", "4.49"): + if is_transformers_version(">=", "4.49") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES.update({"zamba2": OVModelForCausalLM}) if 
is_transformers_version(">=", "4.53.0"): @@ -118,7 +117,11 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version(">=", "4.51"): SUPPORTED_ARCHITECTURES.update({"qwen3": OVModelForFeatureExtraction}) - GENERATIVE_MODELS = ("pix2struct", "t5", "bart", "gpt2", "whisper", "llava", "speecht5") + GENERATIVE_MODELS = ("pix2struct", "t5", "bart", "gpt2", "whisper", "speecht5") + + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES.update({"llava": OVModelForVisualCausalLM}) + GENERATIVE_MODELS.append("llava") def _openvino_export( self, diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 4be27f43e5..a684c90ca8 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -121,11 +121,17 @@ class OVCLIExportTestCase(unittest.TestCase): [ ("text-generation", "lfm2"), ("text-generation-with-past", "lfm2"), + ] + ) + + if is_transformers_version(">=", "4.54") and is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES.extend( + [ ("text-generation-with-past", "qwen3_eagle3"), ] ) - if is_transformers_version(">=", "4.49"): + if is_transformers_version(">=", "4.49") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES.extend( [ ("text-generation-with-past", "zamba2"), @@ -138,7 +144,7 @@ class OVCLIExportTestCase(unittest.TestCase): ("text-generation-with-past", "exaone4"), ] ) - if is_transformers_version(">=", "4.52.1"): + if is_transformers_version(">=", "4.52.1") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES.extend( [ ("text-generation-with-past", "bitnet"), @@ -202,14 +208,6 @@ class OVCLIExportTestCase(unittest.TestCase): "expected_chat_template": False, "simplified_chat_template": False, }, - "llava": { # transformers, chat template in processor, simplified chat template - "num_tokenizers": 2, - "task": "image-text-to-text", - "processor_chat_template": True, - "remote_code": False, - "expected_chat_template": True, - 
"simplified_chat_template": True, - }, "llava_next": { # transformers, chat template in processor overrides tokinizer chat template, simplified chat template "num_tokenizers": 2, "task": "image-text-to-text", @@ -256,6 +254,20 @@ class OVCLIExportTestCase(unittest.TestCase): } ) + if is_transformers_version("<", "5"): + TOKENIZER_CHAT_TEMPLATE_TESTS_MODELS.update( + { + "llava": { # transformers, chat template in processor, simplified chat template + "num_tokenizers": 2, + "task": "image-text-to-text", + "processor_chat_template": True, + "remote_code": False, + "expected_chat_template": True, + "simplified_chat_template": True, + }, + } + ) + SUPPORTED_SD_HYBRID_ARCHITECTURES = [ ("flux", 7, 56), ("latent-consistency", 50, 135), @@ -407,7 +419,7 @@ class OVCLIExportTestCase(unittest.TestCase): "model": 33, }, { - "model": {"int8": 35}, + "model": {"int8": 35 if is_transformers_version("<", "5") else 36}, }, ), ( @@ -431,7 +443,7 @@ class OVCLIExportTestCase(unittest.TestCase): "model": 32, }, { - "model": {"int8": 34}, + "model": {"int8": 34 if is_transformers_version("<", "5") else 35}, }, ), ( @@ -472,7 +484,7 @@ class OVCLIExportTestCase(unittest.TestCase): ( {"encoder": {"int8": 32}, "decoder": {"int8": 52}, "decoder_with_past": {"int8": 42}} if is_transformers_version("<=", "4.45") - else {"encoder": {"int8": 32}, "decoder": {"int8": 52}} + else {"encoder": {"int8": 32}, "decoder": {"int8": 52 if is_transformers_version("<", "5") else 53}} ), ), ( @@ -489,48 +501,52 @@ class OVCLIExportTestCase(unittest.TestCase): "prompt_encoder_mask_decoder": {"int8": 49}, }, ), - ( - "image-text-to-text", - "internvl_chat", - "f8e4m3", - "--dataset contextual --num-samples 1 --trust-remote-code", - { - "lm_model": 15, - "text_embeddings_model": 0, - "vision_embeddings_model": 17, - }, - { - "lm_model": {"f8e4m3": 15}, - "text_embeddings_model": {"int8": 1}, - "vision_embeddings_model": {"f8e4m3": 11}, - }, - ), ] + if is_transformers_version("<", "5"): + 
SUPPORTED_QUANTIZATION_ARCHITECTURES.append( + ( + "image-text-to-text", + "internvl_chat", + "f8e4m3", + "--dataset contextual --num-samples 1 --trust-remote-code", + { + "lm_model": 15, + "text_embeddings_model": 0, + "vision_embeddings_model": 17, + }, + { + "lm_model": {"f8e4m3": 15}, + "text_embeddings_model": {"int8": 1}, + "vision_embeddings_model": {"f8e4m3": 11}, + }, + ), + ) + TRANSFORMERS_4BIT_CONFIGURATIONS = [ ( "text-generation-with-past", "opt125m", "int4 --sym --group-size 128", - {"model": {"int8": 4, "int4": 72}}, + {"model": {"int8": 4 if is_transformers_version("<", "5") else 6, "int4": 72}}, ), ( "text-generation-with-past", "opt125m", "int4 --group-size 64", - {"model": {"int8": 4, "int4": 144}}, + {"model": {"int8": 4 if is_transformers_version("<", "5") else 6, "int4": 144}}, ), ( "text-generation-with-past", "opt125m", "mxfp4", - {"model": {"int8": 4, "f4e2m1": 72, "f8e8m0": 72}}, + {"model": {"int8": 4 if is_transformers_version("<", "5") else 6, "f4e2m1": 72, "f8e8m0": 72}}, ), ( "text-generation-with-past", "opt125m", "nf4", - {"model": {"int8": 4, "nf4": 72}}, + {"model": {"int8": 4 if is_transformers_version("<", "5") else 6, "nf4": 72}}, ), ( "text-generation-with-past", diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 06314ef394..c6737bff1e 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -354,8 +354,8 @@ "vocoder": 80, }, "clip": {"model": 130}, - "mamba": {"model": 322}, - "falcon_mamba": {"model": 162}, + "mamba": {"model": 322 if is_transformers_version("<", "5") else 324}, + "falcon_mamba": {"model": 162 if is_transformers_version("<", "5") else 164}, "minicpmo": { "lm_model": 16, "text_embeddings_model": 1, @@ -364,7 +364,7 @@ }, "zamba2": {"model": 44}, "exaone4": {"model": 16}, - "lfm2": {"model": 52}, + "lfm2": {"model": 52 if is_transformers_version("<", "5") else 54}, "qwen3_eagle3": {"model": 20}, } From 8beb8d8bfaed7d778adfc152212d3b4912613745 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Feb 2026 19:35:01 +0100 Subject: [PATCH 114/222] fix --- tests/openvino/test_export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 6cc28c8597..eae3727de6 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -121,7 +121,7 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES.update({"llava": OVModelForVisualCausalLM}) - GENERATIVE_MODELS.append("llava") + GENERATIVE_MODELS += ("llava",) def _openvino_export( self, From e4eba9296ec619c219029f005d5fcc8913eb6871 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 10:28:15 +0100 Subject: [PATCH 115/222] add stable diffusion 3 tests when diffusers compatible with v5 --- tests/openvino/test_diffusion.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index 8efc69f8ec..e4f558efb7 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -80,7 +80,6 @@ class OVPipelineForText2ImageTest(unittest.TestCase): "stable-diffusion", "stable-diffusion-xl", "latent-consistency", - "stable-diffusion-3", "flux", "sana", ] @@ -93,6 +92,10 @@ class OVPipelineForText2ImageTest(unittest.TestCase): if is_diffusers_version(">=", "0.33.0"): SUPPORTED_ARCHITECTURES.extend(["sana-sprint"]) + + if is_transformers_version("<", "5") or is_diffusers_version(">=", "0.37"): + SUPPORTED_ARCHITECTURES.append("stable-diffusion-3") + CALLBACK_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] OVMODEL_CLASS = OVPipelineForText2Image @@ -499,9 +502,11 @@ class OVPipelineForImage2ImageTest(unittest.TestCase): "stable-diffusion", "stable-diffusion-xl", "latent-consistency", - "stable-diffusion-3", "flux", ] + if 
is_transformers_version("<", "5") or is_diffusers_version(">=", "0.37"): + SUPPORTED_ARCHITECTURES.append("stable-diffusion-3") + AUTOMODEL_CLASS = AutoPipelineForImage2Image OVMODEL_CLASS = OVPipelineForImage2Image TASK = "image-to-image" @@ -754,7 +759,11 @@ def test_textual_inversion(self): class OVPipelineForInpaintingTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "stable-diffusion-3", "flux", "flux-fill"] + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "flux", "flux-fill"] + + if is_transformers_version("<", "5") or is_diffusers_version(">=", "0.37"): + SUPPORTED_ARCHITECTURES.append("stable-diffusion-3") + AUTOMODEL_CLASS = AutoPipelineForInpainting OVMODEL_CLASS = OVPipelineForInpainting TASK = "inpainting" From dc2823d35bef2fe24d15022c624fe210a589ac8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 14:58:17 +0100 Subject: [PATCH 116/222] use xlm-roberta with max_position_embeddings 514 --- tests/openvino/utils_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index c6737bff1e..2cdbdcf8b7 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -211,7 +211,7 @@ "wav2vec2-conformer": "optimum-intel-internal-testing/tiny-random-wav2vec2-conformer", "whisper": "optimum-intel-internal-testing/tiny-random-whisper", "xlm": "optimum-intel-internal-testing/tiny-random-xlm", - "xlm-roberta": "optimum-intel-internal-testing/tiny-xlm-roberta", + "xlm-roberta": "optimum-intel-internal-testing/tiny-random-xlm-roberta", "xglm": "optimum-intel-internal-testing/tiny-random-XGLMForCausalLM", "xverse": "optimum-intel-internal-testing/tiny-random-xverse", "glm4": "optimum-intel-internal-testing/tiny-random-glm4", From 5967be3cf2a42122d05546c8b04f449970dcef3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 15:00:31 +0100 Subject: 
[PATCH 117/222] add missing import --- tests/openvino/test_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index e4f558efb7..bc58c91796 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -38,7 +38,7 @@ OVPipelineForText2Video, ) from optimum.intel.openvino.utils import TemporaryDirectory -from optimum.intel.utils.import_utils import is_diffusers_version +from optimum.intel.utils.import_utils import is_diffusers_version, is_transformers_version from optimum.utils.testing_utils import require_diffusers From 699b0b797679c3242a861521f53b8394e98ca8aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 15:51:28 +0100 Subject: [PATCH 118/222] granitemoe fix --- optimum/exporters/openvino/model_configs.py | 2 - optimum/exporters/openvino/model_patcher.py | 43 +++++++++------------ tests/openvino/test_decoder.py | 2 +- 3 files changed, 20 insertions(+), 27 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index fb7acb865d..fe846efcf4 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -3988,8 +3988,6 @@ class GraniteOpenVINOConfig(LlamaOpenVINOConfig): ) class GraniteMoEOpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.45.0" - # TODO (@echarlaix): add v5 support - MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = GraniteMoEModelPatcher diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 1be2bfe437..4bd9024bc8 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4508,34 +4508,29 @@ class GraniteMoEModelPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() - if is_transformers_version("<", "5"): - for layer in self._model.model.layers: - 
block_sparse_moe = layer.block_sparse_moe - block_sparse_moe.router._orig_forward = block_sparse_moe.router.forward - block_sparse_moe.router.forward = types.MethodType( - _granite_moe_topk_gating_forward, block_sparse_moe.router - ) - block_sparse_moe.input_linear._orig_forward = block_sparse_moe.input_linear.forward - block_sparse_moe.input_linear.forward = types.MethodType( - _granite_moe_parallel_experts_forward, block_sparse_moe.input_linear - ) - block_sparse_moe.output_linear._orig_forward = block_sparse_moe.output_linear.forward - block_sparse_moe.output_linear.forward = types.MethodType( - _granite_moe_parallel_experts_forward, block_sparse_moe.output_linear - ) - - else: - self._model.set_experts_implementation("batched_mm") + for layer in self._model.model.layers: + block_sparse_moe = layer.block_sparse_moe + block_sparse_moe.router._orig_forward = block_sparse_moe.router.forward + block_sparse_moe.router.forward = types.MethodType( + _granite_moe_topk_gating_forward, block_sparse_moe.router + ) + block_sparse_moe.input_linear._orig_forward = block_sparse_moe.input_linear.forward + block_sparse_moe.input_linear.forward = types.MethodType( + _granite_moe_parallel_experts_forward, block_sparse_moe.input_linear + ) + block_sparse_moe.output_linear._orig_forward = block_sparse_moe.output_linear.forward + block_sparse_moe.output_linear.forward = types.MethodType( + _granite_moe_parallel_experts_forward, block_sparse_moe.output_linear + ) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - if is_transformers_version("<", "5"): - for layer in self._model.model.layers: - block_sparse_moe = layer.block_sparse_moe - block_sparse_moe.router.forward = block_sparse_moe.router._orig_forward - block_sparse_moe.input_linear.forward = block_sparse_moe.input_linear._orig_forward - block_sparse_moe.output_linear.forward = block_sparse_moe.output_linear._orig_forward + for layer in self._model.model.layers: + 
block_sparse_moe = layer.block_sparse_moe + block_sparse_moe.router.forward = block_sparse_moe.router._orig_forward + block_sparse_moe.input_linear.forward = block_sparse_moe.input_linear._orig_forward + block_sparse_moe.output_linear.forward = block_sparse_moe.output_linear._orig_forward class OVSeq2SeqModelPatcher(ModelPatcher): diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 9a6acf1cb7..92b87ddfe3 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -81,6 +81,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "phi3", "gemma2", "granite", + "granitemoe", ) SUPPORTED_SSM_ARCHITECTURES = ("mamba", "falcon_mamba") @@ -165,7 +166,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "dbrx", # "phimoe", "marian", - "granitemoe", # "zamba2", ) GENERATION_LENGTH = 100 From 389f818868ecdbff53249bbb0067e769f304ebe2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 15:59:54 +0100 Subject: [PATCH 119/222] filtered test --- tests/openvino/test_exporters_cli.py | 2 ++ tests/openvino/test_quantization.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index a684c90ca8..5f45f00031 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -834,6 +834,8 @@ def test_filtered_architectures(cls): expected = {"qwen3_vl"} else: expected = {"llava-qwen2", "phi3_v", "phi4mm", "minicpmo"} + if is_transformers_version(">=", "5"): + expected.update({"llama4", "llava_next_video", "minicpmv", "internvl_chat"}) all_model_type = {config[1] for config in cls.TRANSFORMERS_4BIT_CONFIGURATIONS} filtered_model_type = {config[1] for config in cls.SUPPORTED_4BIT_CONFIGURATIONS} diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index f66ae8834d..dd69f926f5 100644 --- a/tests/openvino/test_quantization.py +++ 
b/tests/openvino/test_quantization.py @@ -1234,6 +1234,8 @@ def test_filtered_architectures(cls): expected.add("qwen3_vl") if is_transformers_version(">=", "4.54"): expected.update({"llava-qwen2", "phi3_v", "minicpmo"}) + if is_transformers_version(">=", "5"): + expected.update({"llama4", "llava_next_video", "minicpmv", "internvl_chat"}) all_model_type = {config[1] for config in cls.TRANSFORMERS_4BIT_CONFIGURATIONS} filtered_model_type = {config[1] for config in cls.LOAD_IN_4_BITS_SCOPE} From ffe2d27e445e05e4eef70e07aed8a27038db0d1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 16:46:52 +0100 Subject: [PATCH 120/222] add back granitemoe model support --- tests/openvino/test_decoder.py | 2 +- tests/openvino/test_genai.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 92b87ddfe3..fac01b5960 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -311,7 +311,7 @@ def test_find_untested_architectures(self): # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version(">=", "5"): - supported_architectures -= {"phimoe", "granitemoe", "bitnet", "dbrx", "zamba2", "marian"} + supported_architectures -= {"phimoe", "bitnet", "dbrx", "zamba2", "marian"} supported_architectures -= ONNX_SUPPORTED_ARCHITECTURES untested_architectures = supported_architectures - tested_architectures diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index 74f6bab1ec..388c3ce127 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -65,6 +65,7 @@ class LLMPipelineTestCase(unittest.TestCase): "phi3", "gemma2", "granite", + "granitemoe", ) if is_transformers_version(">=", "4.46.0"): @@ -108,7 +109,6 @@ class LLMPipelineTestCase(unittest.TestCase): # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly "dbrx", # "phimoe", - "granitemoe", ) 
REMOTE_CODE_MODELS = ( From c649bdf8325b9d132051b57bdaf2b8effc7a0568 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 17:59:49 +0100 Subject: [PATCH 121/222] update setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 16e2a82fed..3a1995891d 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", - "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers-v5", + "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@xadupre/transformers5", "transformers>=4.45,<5.1", "setuptools", "huggingface-hub>=0.23.2,<2.0", From 4c74aebf280bd1e68625f8b20620651cdcbb5210 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 18:26:29 +0100 Subject: [PATCH 122/222] update workflows --- .github/workflows/test_offline.yaml | 2 +- .github/workflows/test_openvino_nightly.yml | 7 ++++++- .github/workflows/test_openvino_slow.yml | 7 ++++++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_offline.yaml b/.github/workflows/test_offline.yaml index c75ba43bef..5b6b019e83 100644 --- a/.github/workflows/test_offline.yaml +++ b/.github/workflows/test_offline.yaml @@ -34,7 +34,7 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip uv - uv pip install .[diffusers,tests] + uv pip install .[tests] diffusers - name: Test run: | diff --git a/.github/workflows/test_openvino_nightly.yml b/.github/workflows/test_openvino_nightly.yml index 90df6a2af3..ace0246329 100644 --- a/.github/workflows/test_openvino_nightly.yml +++ b/.github/workflows/test_openvino_nightly.yml @@ -97,7 +97,12 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip uv - uv pip install .[diffusers,tests] + uv pip install .[tests] librosa diffusers + + - if: ${{ matrix.transformers-version == 'latest' }} + name: Install diffusers + run: | + uv pip install 
git+https://github.com/huggingface/diffusers - if: ${{ matrix.openvino-version == 'openvino-nightly' }} name: Install OpenVINO Nightly diff --git a/.github/workflows/test_openvino_slow.yml b/.github/workflows/test_openvino_slow.yml index 4b271d898b..3868e44141 100644 --- a/.github/workflows/test_openvino_slow.yml +++ b/.github/workflows/test_openvino_slow.yml @@ -59,7 +59,12 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip uv - uv pip install .[tests,diffusers] transformers[testing] + uv pip install .[tests] transformers[testing] diffusers + + - if: ${{ matrix.transformers-version == 'latest' }} + name: Install diffusers + run: | + uv pip install git+https://github.com/huggingface/diffusers - if: ${{ matrix.transformers-version != 'latest' && matrix.transformers-version != 'main' }} name: Install specific dependencies and versions required for older transformers From c7184e114939e777886cb1f2acb4b3abcca6f148 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 18:38:37 +0100 Subject: [PATCH 123/222] update setup --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 3a1995891d..267d0b83f2 100644 --- a/setup.py +++ b/setup.py @@ -28,8 +28,8 @@ INSTALL_REQUIRE = [ "torch>=2.1", - "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@xadupre/transformers5", - "transformers>=4.45,<5.1", + "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers5", + "transformers>=4.45,<5.3", "setuptools", "huggingface-hub>=0.23.2,<2.0", "nncf>=2.19.0", From 7b0806e3f8571558a055a457ac3958589edecc2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 18:39:54 +0100 Subject: [PATCH 124/222] fix --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 267d0b83f2..e99736e5a4 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", - 
"optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers5", + "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers-v5", "transformers>=4.45,<5.3", "setuptools", "huggingface-hub>=0.23.2,<2.0", From 50e30b789ffc182fac3ba943cdcafdbf1a27c11b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 18:52:22 +0100 Subject: [PATCH 125/222] update setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e99736e5a4..16e2a82fed 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers-v5", - "transformers>=4.45,<5.3", + "transformers>=4.45,<5.1", "setuptools", "huggingface-hub>=0.23.2,<2.0", "nncf>=2.19.0", From e7878e1de4f04e61eafc97657db69b37c9e79f30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 19:00:41 +0100 Subject: [PATCH 126/222] remove diffusers --- .github/workflows/test_offline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_offline.yaml b/.github/workflows/test_offline.yaml index 5b6b019e83..7c4458a306 100644 --- a/.github/workflows/test_offline.yaml +++ b/.github/workflows/test_offline.yaml @@ -34,7 +34,7 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip uv - uv pip install .[tests] diffusers + uv pip install .[tests] - name: Test run: | From 467dcad06b77db153fc8419fdfb6981c1005640a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 19:06:51 +0100 Subject: [PATCH 127/222] fix offline workflow --- .github/workflows/test_offline.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_offline.yaml b/.github/workflows/test_offline.yaml index 7c4458a306..d079c6c8b7 100644 --- a/.github/workflows/test_offline.yaml +++ b/.github/workflows/test_offline.yaml @@ -38,10 
+38,10 @@ jobs: - name: Test run: | - HF_HOME=/tmp/ huggingface-cli download hf-internal-testing/tiny-random-gpt2 + HF_HOME=/tmp/ hf download hf-internal-testing/tiny-random-gpt2 HF_HOME=/tmp/ HF_HUB_OFFLINE=1 optimum-cli export openvino --model hf-internal-testing/tiny-random-gpt2 gpt2_openvino --task text-generation - huggingface-cli download hf-internal-testing/tiny-random-gpt2 + hf download hf-internal-testing/tiny-random-gpt2 HF_HUB_OFFLINE=1 optimum-cli export openvino --model hf-internal-testing/tiny-random-gpt2 gpt2_openvino --task text-generation pytest tests/openvino/test_modeling.py -k "test_load_from_hub" -s -vvvvv From c14f2e53737afa21d7cb20fa9ea40c9e32139f80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Feb 2026 19:42:33 +0100 Subject: [PATCH 128/222] exclude openclip from offline tests --- .github/workflows/test_offline.yaml | 4 ++-- tests/openvino/test_modeling.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_offline.yaml b/.github/workflows/test_offline.yaml index d079c6c8b7..48f07b9396 100644 --- a/.github/workflows/test_offline.yaml +++ b/.github/workflows/test_offline.yaml @@ -44,5 +44,5 @@ jobs: hf download hf-internal-testing/tiny-random-gpt2 HF_HUB_OFFLINE=1 optimum-cli export openvino --model hf-internal-testing/tiny-random-gpt2 gpt2_openvino --task text-generation - pytest tests/openvino/test_modeling.py -k "test_load_from_hub" -s -vvvvv - HF_HUB_OFFLINE=1 pytest tests/openvino/test_modeling.py -k "test_load_from_hub" -s -vvvvv + pytest tests/openvino/test_modeling.py -k "test_load_from_hub and not openclip" -s -vvvvv + HF_HUB_OFFLINE=1 pytest tests/openvino/test_modeling.py -k "test_load_from_hub and not openclip" -s -vvvvv diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 8d8ab01147..db369a478c 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -1601,7 +1601,7 @@ def _get_sample_image(self): image = 
Image.open(requests.get(url, stream=True).raw) return image - def test_load_from_hub_and_save_model(self): + def test_load_from_hub_and_save_model_openclip(self): loaded_model = OVModelOpenCLIPForZeroShotImageClassification.from_pretrained( self.OV_MODEL_ID_IR, device=OPENVINO_DEVICE ) From 69c16bfd00cb485ef7e72a013b720575bf84c28d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 09:29:22 +0100 Subject: [PATCH 129/222] workflow slow --- .github/workflows/test_openvino_slow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_openvino_slow.yml b/.github/workflows/test_openvino_slow.yml index 3868e44141..580253a36a 100644 --- a/.github/workflows/test_openvino_slow.yml +++ b/.github/workflows/test_openvino_slow.yml @@ -59,7 +59,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip uv - uv pip install .[tests] transformers[testing] diffusers + uv pip install .[tests] librosa diffusers - if: ${{ matrix.transformers-version == 'latest' }} name: Install diffusers From 3f8dfb4be84364485b37aaa366b7472afde47286 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 11:15:10 +0100 Subject: [PATCH 130/222] fix question answering pipeline --- tests/openvino/test_modeling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index db369a478c..0c5011a908 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -896,12 +896,12 @@ def test_pipeline(self, model_arch): pipe = pipeline("question-answering", model=model, tokenizer=tokenizer) question = "What's my name?" context = "My Name is Arthur and I live in Lyon." 
- outputs = pipe(question, context) + outputs = pipe(question=question, context=context) self.assertEqual(pipe.device, model.device) self.assertGreaterEqual(outputs["score"], 0.0) self.assertIsInstance(outputs["answer"], str) ov_pipe = optimum_pipeline("question-answering", model_id, accelerator="openvino") - ov_outputs = ov_pipe(question, context) + ov_outputs = ov_pipe(question=question, context=context) self.assertEqual(outputs["score"], ov_outputs["score"]) del model del ov_pipe From 975da724b9c9cd25e768d4d4f928d534113f85fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 11:24:43 +0100 Subject: [PATCH 131/222] encode_plus deprecated --- tests/openvino/test_quantization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index dd69f926f5..ed1577d1cd 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -2038,7 +2038,7 @@ def preprocess_function(examples, tokenizer): # Test that inference on quantized model works model = OVModelForQuestionAnswering.from_pretrained(tmp_dir, device=OPENVINO_DEVICE) - tokens = tokenizer.encode_plus( + tokens = tokenizer( "This is a sample question", "This is a sample context", add_special_tokens=True, return_tensors="pt" ) model(**tokens, return_dict=True) From 31989eb0b262634373bfa2799ed634b96c0b3fd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 11:54:31 +0100 Subject: [PATCH 132/222] automatic-speech pipeline for whisper incompatible with v5 --- tests/openvino/test_seq2seq.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index da68d6e8b9..d8c10f39dd 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -410,6 +410,7 @@ def test_compare_to_transformers(self, model_arch): @parameterized.expand(SUPPORTED_ARCHITECTURES) @pytest.mark.run_slow 
@slow + @pytest.mark.skipif(is_transformers_version("==", "5.0"), reason="Issue with transformers v5.0 coming from num_frames") def test_pipeline(self, model_arch): set_seed(SEED) model_id = MODEL_NAMES[model_arch] From 7adb81012983043c4e12632693848c87a1f92746 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 11:54:57 +0100 Subject: [PATCH 133/222] style --- tests/openvino/test_seq2seq.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index d8c10f39dd..0fc3821c9b 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -410,7 +410,9 @@ def test_compare_to_transformers(self, model_arch): @parameterized.expand(SUPPORTED_ARCHITECTURES) @pytest.mark.run_slow @slow - @pytest.mark.skipif(is_transformers_version("==", "5.0"), reason="Issue with transformers v5.0 coming from num_frames") + @pytest.mark.skipif( + is_transformers_version("==", "5.0"), reason="Issue with transformers v5.0 coming from num_frames" + ) def test_pipeline(self, model_arch): set_seed(SEED) model_id = MODEL_NAMES[model_arch] From 6a93224b1ca747e32f8ce68cdc14be725a60bb9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 14:21:22 +0100 Subject: [PATCH 134/222] image-to-text pipeline deprecated --- tests/openvino/test_seq2seq.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 0fc3821c9b..bbc3d9260d 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -515,6 +515,10 @@ def test_compare_to_transformers(self, model_arch: str): @parameterized.expand(SUPPORTED_ARCHITECTURES) @pytest.mark.run_slow @slow + @pytest.mark.skipif( + is_transformers_version(">=", "5"), + reason="requires transformers < v5 since image-to-text pipelines is deprecated", + ) def test_pipeline(self, model_arch: str): set_seed(SEED) model_id = 
MODEL_NAMES[model_arch] From c8e9488fad965f496b0bc4dac3aae36b554fb82a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 14:30:49 +0100 Subject: [PATCH 135/222] update MAX_TRANSFORMERS_VERSION for gemma3 exaone4 and llama4 --- optimum/exporters/openvino/model_configs.py | 6 ++++++ tests/openvino/test_decoder.py | 11 ++++++++--- tests/openvino/test_export.py | 2 +- tests/openvino/test_exporters_cli.py | 2 +- tests/openvino/test_genai.py | 2 +- 5 files changed, 17 insertions(+), 6 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index fe846efcf4..53610803da 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -874,6 +874,8 @@ class ExaoneOpenVINOConfig(LlamaOpenVINOConfig): ) class Exaone4OpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.54.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( @@ -1474,6 +1476,8 @@ class Gemma2OpenVINOConfig(GemmaOpenVINOConfig): ) class Gemma3TextOpenVINOConfig(Gemma2OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.50.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" class DeciDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): @@ -4561,6 +4565,8 @@ def with_behavior( ) class Llama4TextOpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.51.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, GemmaDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = GemmaDummyPastKeyValuesGenerator _MODEL_PATCHER = Llama4TextModelPatcher diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index fac01b5960..2e6a938c81 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -115,11 +115,16 @@ class 
OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">", "4.47"): SUPPORTED_ARCHITECTURES += ("olmo2",) - if is_transformers_version(">", "4.49"): + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">", "4.49") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("gemma3_text",) if is_transformers_version(">=", "4.51.0"): - SUPPORTED_ARCHITECTURES += ("llama4", "qwen3", "qwen3_moe") + SUPPORTED_ARCHITECTURES += ("qwen3", "qwen3_moe") + + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">=", "4.51.0") and is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ("llama4",) if is_transformers_version(">=", "4.51.3"): SUPPORTED_ARCHITECTURES += ("glm4",) @@ -131,7 +136,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.52.1") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("bitnet",) - if is_transformers_version(">=", "4.54.0"): + if is_transformers_version(">=", "4.54.0") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("exaone4",) if is_transformers_version("<", "4.54.0"): diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index eae3727de6..b73de1aaf9 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -100,7 +100,7 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version(">=", "4.53.0"): SUPPORTED_ARCHITECTURES.update({"granitemoehybrid": OVModelForCausalLM}) - if is_transformers_version(">=", "4.54"): + if is_transformers_version(">=", "4.54") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES.update({"exaone4": OVModelForCausalLM, "lfm2": OVModelForCausalLM}) if is_transformers_version(">=", "4.55.0") and is_transformers_version("<", "4.58.0"): diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 
5f45f00031..326f42d9bd 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -138,7 +138,7 @@ class OVCLIExportTestCase(unittest.TestCase): ] ) - if is_transformers_version(">=", "4.54"): + if is_transformers_version(">=", "4.54") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES.extend( [ ("text-generation-with-past", "exaone4"), diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index 388c3ce127..906216c567 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -84,7 +84,7 @@ class LLMPipelineTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("glm4",) if is_transformers_version(">=", "4.53.0"): SUPPORTED_ARCHITECTURES += ("arcee",) - if is_transformers_version(">=", "4.54.0"): + if is_transformers_version(">=", "4.54.0") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("exaone4",) if is_transformers_version(">=", "4.55.0"): SUPPORTED_ARCHITECTURES += ("gpt_oss",) From 28e2e24ff38d99ee269631652fe913fa27063552 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 15:28:36 +0100 Subject: [PATCH 136/222] remove from test when not supported --- tests/openvino/test_genai.py | 2 +- tests/openvino/test_quantization.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index 906216c567..0ddc6db210 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -76,7 +76,7 @@ class LLMPipelineTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen",) if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("phimoe",) - if is_transformers_version(">=", "4.49"): + if is_transformers_version(">=", "4.49") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("gemma3_text",) if is_transformers_version(">=", "4.51.0"): SUPPORTED_ARCHITECTURES += ("qwen3", "qwen3_moe") diff --git 
a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index ed1577d1cd..9c60468dd3 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -1070,7 +1070,7 @@ class OVWeightCompressionTest(unittest.TestCase): if is_transformers_version("<", "4.52.0"): SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "minicpmo", True)) - if is_transformers_version(">=", "4.54.0"): + if is_transformers_version(">=", "4.54.0") and is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForCausalLM, "exaone4", True)) if is_transformers_version(">=", "4.57.0"): From f061f2ce4d7643e5bcc43ca30dab48438821f628 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 17:41:16 +0100 Subject: [PATCH 137/222] decoder tests --- tests/openvino/test_decoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 2e6a938c81..01e4481c8d 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -316,7 +316,7 @@ def test_find_untested_architectures(self): # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version(">=", "5"): - supported_architectures -= {"phimoe", "bitnet", "dbrx", "zamba2", "marian"} + supported_architectures -= {"phimoe", "bitnet", "dbrx", "zamba2", "marian", "llama4", "gemma3_text", "exaone4"} supported_architectures -= ONNX_SUPPORTED_ARCHITECTURES untested_architectures = supported_architectures - tested_architectures @@ -420,7 +420,7 @@ def test_compare_to_transformers(self, model_arch): if model_arch in ["qwen"]: return - tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) + tokens = tokenizer(["Today is a nice day and", "This is me"], return_tensors="pt", padding=True) ov_model.generation_config.eos_token_id = None 
transformers_model.generation_config.eos_token_id = None ov_model.config.eos_token_id = None From 8820fb3964e245c41351cb2cb866dfe8da228897 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 17:43:05 +0100 Subject: [PATCH 138/222] test filtered architectures update with exaone4 --- tests/openvino/test_quantization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 9c60468dd3..b5c01b90d9 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -1235,7 +1235,7 @@ def test_filtered_architectures(cls): if is_transformers_version(">=", "4.54"): expected.update({"llava-qwen2", "phi3_v", "minicpmo"}) if is_transformers_version(">=", "5"): - expected.update({"llama4", "llava_next_video", "minicpmv", "internvl_chat"}) + expected.update({"llama4", "llava_next_video", "minicpmv", "internvl_chat", "exaone4"}) all_model_type = {config[1] for config in cls.TRANSFORMERS_4BIT_CONFIGURATIONS} filtered_model_type = {config[1] for config in cls.LOAD_IN_4_BITS_SCOPE} From 290b7b328cf64a9dfd9d2881996dbb1114d76369 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Feb 2026 18:04:06 +0100 Subject: [PATCH 139/222] change gptoss model --- tests/openvino/test_exporters_cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 326f42d9bd..7f396e3a85 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -1225,13 +1225,13 @@ def test_exporters_cli_full_quantization( {"model": 65}, ), ( - "gpt_oss_mxfp4", + "gpt_oss", "openai/gpt-oss-20b", AutoModelForCausalLM, OVModelForCausalLM, "--task text-generation-with-past --weight-format int4", _DEFAULT_4BIT_WQ_CONFIGS, - {"model": {"int8": 22, "int4": 4}}, + {"model": {"int8": 40, "int4": 0}}, {"model": 0}, ), ( From 
64223a8d6d331816d507a353aeb248189cfc8bf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 23 Feb 2026 15:49:57 +0100 Subject: [PATCH 140/222] style --- tests/openvino/test_decoder.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 01e4481c8d..e111b0ec06 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -316,7 +316,16 @@ def test_find_untested_architectures(self): # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version(">=", "5"): - supported_architectures -= {"phimoe", "bitnet", "dbrx", "zamba2", "marian", "llama4", "gemma3_text", "exaone4"} + supported_architectures -= { + "phimoe", + "bitnet", + "dbrx", + "zamba2", + "marian", + "llama4", + "gemma3_text", + "exaone4", + } supported_architectures -= ONNX_SUPPORTED_ARCHITECTURES untested_architectures = supported_architectures - tested_architectures From f40bcb35016907a36f2b509cd2cfeb4dbe669c18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 23 Feb 2026 16:15:10 +0100 Subject: [PATCH 141/222] set num beam to 5 --- tests/openvino/test_decoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index e111b0ec06..75a3a49f36 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -429,7 +429,7 @@ def test_compare_to_transformers(self, model_arch): if model_arch in ["qwen"]: return - tokens = tokenizer(["Today is a nice day and", "This is me"], return_tensors="pt", padding=True) + tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) ov_model.generation_config.eos_token_id = None transformers_model.generation_config.eos_token_id = None ov_model.config.eos_token_id = None @@ -440,7 +440,7 @@ def test_compare_to_transformers(self, model_arch): # 
LFM2 fails with beam search, issue link: https://github.com/huggingface/transformers/issues/42257 # CVS-177964 GraniteMoeHybrid fails due to lack support of Beam search for hybrid models in OpenVINO # For this support, we expect changes in IRs to have connected beam_idx with Mamba/Linear attention states - num_beams=1 if model_arch in ["chatglm4", "lfm2", "granitemoehybrid"] else 2, + num_beams=1 if model_arch in ["chatglm4", "lfm2", "granitemoehybrid"] else 5, do_sample=False, ) From 86767c7a5297026d0d88797dfa8c925d92f6998b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Feb 2026 17:30:27 +0100 Subject: [PATCH 142/222] add llava support for v5 --- optimum/exporters/openvino/model_configs.py | 10 --------- optimum/exporters/openvino/model_patcher.py | 23 --------------------- tests/openvino/test_decoder.py | 5 +++-- tests/openvino/test_export.py | 7 ++----- tests/openvino/test_exporters_cli.py | 22 +++++++------------- tests/openvino/test_seq2seq.py | 6 +++--- 6 files changed, 16 insertions(+), 57 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 53610803da..09ab8d72a0 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -166,7 +166,6 @@ Llama4ImageEmbeddingsModelPatcher, Llama4TextModelPatcher, LlavaImageEmbeddingModelPatcher, - LlavaNextImageEmbeddingModelPatcher, LlavaNextVideoImageEmbeddingModelPatcher, LlavaQwen2ImageEmbeddingsModelPatcher, MairaImageEmbeddingModelPatcher, @@ -1902,8 +1901,6 @@ def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[ @register_in_tasks_manager("llava", *["image-text-to-text"], library_name="transformers") class LlavaOpenVINOConfig(BaseVLMOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.37.2" - # TODO (@echarlaix): add v5 support - MAX_TRANSFORMERS_VERSION = "4.57.6" def __init__( self, @@ -1942,13 +1939,6 @@ def generate_dummy_inputs(self, framework: str = 
"pt", **kwargs) -> Dict: @register_in_tasks_manager("llava_next", *["image-text-to-text"], library_name="transformers") class LlavaNextOpenVINOConfig(LlavaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.40.0" - MAX_TRANSFORMERS_VERSION = "5.99" - - def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): - model_kwargs = model_kwargs or {} - if self._behavior != VLMConfigBehavior.VISION_EMBEDDINGS: - return super().patch_model_for_export(model, model_kwargs) - return LlavaNextImageEmbeddingModelPatcher(self, model, model_kwargs) class DummyLLavaMultiModalProjectorInputGenerator(DummyInputGenerator): diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 4bd9024bc8..9624401569 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3307,29 +3307,6 @@ def __init__( model_kwargs: Dict[str, Any], ): model.__orig_forward = model.forward - - if is_transformers_version("<", "5"): - model.forward = types.MethodType(llava_vision_embed_forward, model) - else: - model.forward = model.get_image_features - - super().__init__(config, model, model_kwargs) - - def __exit__(self, exc_type, exc_value, traceback): - super().__exit__(exc_type, exc_value, traceback) - self._model.forward = self._model.__orig_forward - - -class LlavaNextImageEmbeddingModelPatcher(ModelPatcher): - def __init__( - self, - config: "OnnxConfig", - model: "PreTrainedModel", - model_kwargs: Dict[str, Any], - ): - model.__orig_forward = model.forward - # TODO: use get_image_features instead and add image_sizes as input when exporting - # https://github.com/huggingface/transformers/blob/v4.48.0/src/transformers/models/llava_next/modeling_llava_next.py#L716 model.forward = types.MethodType(llava_vision_embed_forward, model) super().__init__(config, model, model_kwargs) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 
75a3a49f36..a06ee5fa12 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -429,7 +429,8 @@ def test_compare_to_transformers(self, model_arch): if model_arch in ["qwen"]: return - tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) + inputs = "Today is a nice day and" if model_arch == "decilm" else "The quick brown fox jumps over the" + tokens = tokenizer([inputs, "This is me"], return_tensors="pt", padding=True) ov_model.generation_config.eos_token_id = None transformers_model.generation_config.eos_token_id = None ov_model.config.eos_token_id = None @@ -440,7 +441,7 @@ def test_compare_to_transformers(self, model_arch): # LFM2 fails with beam search, issue link: https://github.com/huggingface/transformers/issues/42257 # CVS-177964 GraniteMoeHybrid fails due to lack support of Beam search for hybrid models in OpenVINO # For this support, we expect changes in IRs to have connected beam_idx with Mamba/Linear attention states - num_beams=1 if model_arch in ["chatglm4", "lfm2", "granitemoehybrid"] else 5, + num_beams=1 if model_arch in ["chatglm4", "lfm2", "granitemoehybrid"] else 2, do_sample=False, ) diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index b73de1aaf9..ca16598103 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -84,6 +84,7 @@ class ExportModelTest(unittest.TestCase): "stable-diffusion-xl": OVStableDiffusionXLPipeline, "stable-diffusion-xl-refiner": OVStableDiffusionXLImg2ImgPipeline, "latent-consistency": OVLatentConsistencyModelPipeline, + "llava": OVModelForVisualCausalLM, "sam": OVSamModel, "speecht5": OVModelForTextToSpeechSeq2Seq, "clip": OVModelForZeroShotImageClassification, @@ -117,11 +118,7 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version(">=", "4.51"): SUPPORTED_ARCHITECTURES.update({"qwen3": OVModelForFeatureExtraction}) - GENERATIVE_MODELS = ("pix2struct", "t5", "bart", 
"gpt2", "whisper", "speecht5") - - if is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES.update({"llava": OVModelForVisualCausalLM}) - GENERATIVE_MODELS += ("llava",) + GENERATIVE_MODELS = ("pix2struct", "t5", "bart", "gpt2", "whisper", "llava", "speecht5") def _openvino_export( self, diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 7f396e3a85..96c8cd64f6 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -208,6 +208,14 @@ class OVCLIExportTestCase(unittest.TestCase): "expected_chat_template": False, "simplified_chat_template": False, }, + "llava": { # transformers, chat template in processor, simplified chat template + "num_tokenizers": 2, + "task": "image-text-to-text", + "processor_chat_template": True, + "remote_code": False, + "expected_chat_template": True, + "simplified_chat_template": True, + }, "llava_next": { # transformers, chat template in processor overrides tokinizer chat template, simplified chat template "num_tokenizers": 2, "task": "image-text-to-text", @@ -254,20 +262,6 @@ class OVCLIExportTestCase(unittest.TestCase): } ) - if is_transformers_version("<", "5"): - TOKENIZER_CHAT_TEMPLATE_TESTS_MODELS.update( - { - "llava": { # transformers, chat template in processor, simplified chat template - "num_tokenizers": 2, - "task": "image-text-to-text", - "processor_chat_template": True, - "remote_code": False, - "expected_chat_template": True, - "simplified_chat_template": True, - }, - } - ) - SUPPORTED_SD_HYBRID_ARCHITECTURES = [ ("flux", 7, 56), ("latent-consistency", 50, 135), diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index bbc3d9260d..4e2df41407 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -547,6 +547,7 @@ def test_pipeline(self, model_arch: str): class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES = [ + "llava", "llava_next", 
"llava_next_mistral", "qwen2_vl", @@ -601,7 +602,7 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ("llava", "llava_next_video") + SUPPORTED_ARCHITECTURES += ("llava_next_video",) else: UNSUPPORTED_ARCHITECTURES = { "got_ocr2", @@ -611,7 +612,6 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): "phi4_multimodal", "gemma3", "smolvlm", - "llava", } REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"] IMAGE = Image.open( @@ -872,7 +872,7 @@ def test_compare_to_transformers(self, model_arch): @parameterized.expand( ["llava", "llava_next", "llava_next_video", "llava_next_mistral"] if is_transformers_version("<", "5") - else ["llava_next", "llava_next_mistral"] + else ["llava", "llava_next", "llava_next_mistral"] ) def test_llava_with_new_preprocessing(self, model_arch): prompt = "\n What is shown in this image?" 
From c523617123a1f45d6664335490446b05a82f576d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Feb 2026 17:33:15 +0100 Subject: [PATCH 143/222] maira --- optimum/exporters/openvino/model_configs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 09ab8d72a0..a52cb0ca87 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -2060,7 +2060,6 @@ def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[ ) class MairaOpenVINOConfig(LlavaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.46.0" - MAX_TRANSFORMERS_VERSION = "5.99" SUPPORTS_PAST = True def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None): From d49a895bda3bf9b94752ed04bee1821735e2d8f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Feb 2026 17:50:37 +0100 Subject: [PATCH 144/222] extend tests disabled for marian for openvino v2026 --- tests/openvino/test_genai.py | 4 ++-- tests/openvino/test_seq2seq.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index 0ddc6db210..5d6b3c4b72 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -466,8 +466,8 @@ class LLMPipelineWithEagle3TestCase(unittest.TestCase): @parameterized.expand(EAGLE3_MODELS.items()) def test_compare_outputs(self, model_arch, model_pair): - if is_transformers_version("<", "4.54"): - self.skipTest("Eagle3 requires transformers >= 4.54") + if is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"): + self.skipTest("Eagle3 requires transformers >= 4.54 and transformers < 5") if is_openvino_version("<", "2026.0"): self.skipTest("Eagle3 requires openvino-genai >= 2026.0") diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 
4e2df41407..e34a256060 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -154,7 +154,7 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 - if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2025.5.0")): + if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026")) and is_transformers_version("<", "5"): # There are known issues with marian model on OpenVINO 2025.3.x and 2025.4.x SUPPORTED_ARCHITECTURES += ("marian",) From 6f608fd27e7c25c8a5cb438804c272eb26b61fed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Feb 2026 17:51:02 +0100 Subject: [PATCH 145/222] style --- tests/openvino/test_seq2seq.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index e34a256060..c15c0ca269 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -154,7 +154,9 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 - if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026")) and is_transformers_version("<", "5"): + if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026")) and is_transformers_version( + "<", "5" + ): # There are known issues with marian model on OpenVINO 2025.3.x and 2025.4.x SUPPORTED_ARCHITECTURES += ("marian",) From 02a2ccd8688c520ca039d126b1dbe413c51dc82d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Feb 2026 17:53:12 +0100 Subject: [PATCH 146/222] style --- tests/openvino/test_genai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index 5d6b3c4b72..f3c1bed1e9 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -466,7 +466,7 @@ class 
LLMPipelineWithEagle3TestCase(unittest.TestCase): @parameterized.expand(EAGLE3_MODELS.items()) def test_compare_outputs(self, model_arch, model_pair): - if is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"): + if is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"): self.skipTest("Eagle3 requires transformers >= 4.54 and transformers < 5") if is_openvino_version("<", "2026.0"): self.skipTest("Eagle3 requires openvino-genai >= 2026.0") From 710c5bc8679c23e71061385babc8f28994f2c67c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Feb 2026 18:12:08 +0100 Subject: [PATCH 147/222] include openvino 2026 --- tests/openvino/test_seq2seq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index c15c0ca269..af047f0313 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -154,7 +154,7 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 - if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026")) and is_transformers_version( + if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026.1")) and is_transformers_version( "<", "5" ): # There are known issues with marian model on OpenVINO 2025.3.x and 2025.4.x From 64dc198c8c80d997e80ddb6f5a57d589aba733ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Feb 2026 18:44:57 +0100 Subject: [PATCH 148/222] add gemma3 text --- optimum/exporters/openvino/model_configs.py | 2 -- tests/openvino/test_decoder.py | 4 +--- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index a52cb0ca87..8af57604fa 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1475,8 +1475,6 @@ class 
Gemma2OpenVINOConfig(GemmaOpenVINOConfig): ) class Gemma3TextOpenVINOConfig(Gemma2OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.50.0" - # TODO (@echarlaix): add v5 support - MAX_TRANSFORMERS_VERSION = "4.57.6" class DeciDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index a06ee5fa12..de3d3df121 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -115,8 +115,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">", "4.47"): SUPPORTED_ARCHITECTURES += ("olmo2",) - # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly - if is_transformers_version(">", "4.49") and is_transformers_version("<", "5"): + if is_transformers_version(">", "4.49"): SUPPORTED_ARCHITECTURES += ("gemma3_text",) if is_transformers_version(">=", "4.51.0"): @@ -323,7 +322,6 @@ def test_find_untested_architectures(self): "zamba2", "marian", "llama4", - "gemma3_text", "exaone4", } From d3bdb292d52a46fa9f29c721b36a69e56b9ebc02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Feb 2026 18:53:01 +0100 Subject: [PATCH 149/222] llava tests --- tests/openvino/test_quantization.py | 49 +++++++++++++---------------- tests/openvino/test_seq2seq.py | 8 +++-- 2 files changed, 26 insertions(+), 31 deletions(-) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index b5c01b90d9..753b1e387a 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -1061,6 +1061,7 @@ class OVWeightCompressionTest(unittest.TestCase): (OVStableDiffusionPipeline, "stable-diffusion", False), (OVStableDiffusionXLPipeline, "stable-diffusion-xl", False), (OVModelOpenCLIPForZeroShotImageClassification, "open-clip", False), + (OVModelForVisualCausalLM, "llava", False), (OVModelForVisualCausalLM, "qwen2_vl", False), ] @@ -1079,7 +1080,6 @@ class 
OVWeightCompressionTest(unittest.TestCase): if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.extend( [ - (OVModelForVisualCausalLM, "llava", False), (OVModelForVisualCausalLM, "llava_next_video", False), (OVModelForVisualCausalLM, "minicpmv", True), ] @@ -1122,6 +1122,17 @@ class OVWeightCompressionTest(unittest.TestCase): "text_encoder": {}, }, ), + ( + OVModelForVisualCausalLM, + "llava", + 4, + {"bits": 4, "group_size": 8, "ratio": 0.5}, + { + "lm_model": {"int8": 22, "int4": 8}, + "text_embeddings_model": {"int8": 1}, + "vision_embeddings_model": {"int8": 9}, + }, + ), ( OVSamModel, "sam", @@ -1175,6 +1186,15 @@ class OVWeightCompressionTest(unittest.TestCase): }, }, ), + ( + OVModelForVisualCausalLM, + "llava", + { + "lm_model": {"patterns": [".*layers.0.self_attn.q_proj/aten::linear/MatMul"]}, + "vision_embeddings_model": {"patterns": [".*layers.0.self_attn.q_proj/aten::linear/MatMul"]}, + "text_embeddings_model": {"patterns": ["."]}, + }, + ), ( OVSamModel, "sam", @@ -1195,33 +1215,6 @@ class OVWeightCompressionTest(unittest.TestCase): ), ] - if is_transformers_version("<", "5"): - DEFAULT_COMPRESSION_CONFIGURATIONS.append( - ( - OVModelForVisualCausalLM, - "llava", - 4, - {"bits": 4, "group_size": 8, "ratio": 0.5}, - { - "lm_model": {"int8": 22, "int4": 8}, - "text_embeddings_model": {"int8": 1}, - "vision_embeddings_model": {"int8": 9}, - }, - ), - ) - - DEFAULT_IGNORED_SCOPE_CONFIGURATIONS.append( - ( - OVModelForVisualCausalLM, - "llava", - { - "lm_model": {"patterns": [".*layers.0.self_attn.q_proj/aten::linear/MatMul"]}, - "vision_embeddings_model": {"patterns": [".*layers.0.self_attn.q_proj/aten::linear/MatMul"]}, - "text_embeddings_model": {"patterns": ["."]}, - }, - ), - ) - def test_filtered_architectures(cls): expected = set() if is_transformers_version("<", "4.49"): diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index af047f0313..9e2246582f 100644 --- a/tests/openvino/test_seq2seq.py 
+++ b/tests/openvino/test_seq2seq.py @@ -153,18 +153,20 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): TASK = "text2text-generation" GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 - - if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026.1")) and is_transformers_version( + UNSUPPORTED_ARCHITECTURES = set() + if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026.1")) or is_transformers_version( "<", "5" ): # There are known issues with marian model on OpenVINO 2025.3.x and 2025.4.x SUPPORTED_ARCHITECTURES += ("marian",) + else: + UNSUPPORTED_ARCHITECTURES.add("marian") # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("mt5",) else: - UNSUPPORTED_ARCHITECTURES = {"marian", "mt5"} + UNSUPPORTED_ARCHITECTURES.add("mt5") SUPPORT_STATEFUL = ("t5", "mt5", "longt5") if is_transformers_version(">=", "4.52.0"): From ea761a75bd0657e0514d3a025f57676d956056ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 26 Feb 2026 14:32:57 +0100 Subject: [PATCH 150/222] exclude marian for transformers v5 or higher --- tests/openvino/test_seq2seq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 9e2246582f..26b5b7d391 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -154,7 +154,7 @@ class OVModelForSeq2SeqLMIntegrationTest(OVSeq2SeqTestMixin): GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 UNSUPPORTED_ARCHITECTURES = set() - if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026.1")) or is_transformers_version( + if not (is_openvino_version(">=", "2025.3.0") and is_openvino_version("<", "2026.1")) and is_transformers_version( "<", "5" ): # There are known issues with marian model on OpenVINO 2025.3.x and 2025.4.x From a33065ee1c4745f1461764d6321fc6abb1bbe5d5 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 26 Feb 2026 18:26:50 +0100 Subject: [PATCH 151/222] fix gemma3 --- optimum/exporters/openvino/model_configs.py | 2 -- optimum/exporters/openvino/model_patcher.py | 36 +++++++++++++-------- tests/openvino/test_decoder.py | 2 +- tests/openvino/test_genai.py | 5 ++- tests/openvino/test_seq2seq.py | 13 ++++---- 5 files changed, 32 insertions(+), 26 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index ec330c59e8..d5ce89bd46 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -4136,8 +4136,6 @@ def __init__( @register_in_tasks_manager("gemma3", *["image-text-to-text"], library_name="transformers") class Gemma3OpenVINOConfig(BaseVLMOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.50.0" - # TODO (@echarlaix): add v5 support - MAX_TRANSFORMERS_VERSION = "4.57.6" def __init__( self, diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 4ce8d17ded..860ce212e9 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4657,22 +4657,30 @@ def __init__( model: "PreTrainedModel", model_kwargs: Dict[str, Any], ): - model.__orig_forward = model.forward - # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/got_ocr2/modeling_got_ocr2.py#L835 - # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0-Gemma-3/src/transformers/models/gemma3/modeling_gemma3.py#L1321 - if ( - hasattr(model, "model") - and hasattr(model.model, "get_image_features") - and is_transformers_version("<", "5") - ): - model.forward = model.model.get_image_features - else: - model.forward = model.get_image_features super().__init__(config, model, model_kwargs) - def __exit__(self, exc_type, exc_value, traceback): - super().__exit__(exc_type, exc_value, traceback) - 
self._model.forward = self._model.__orig_forward + @functools.wraps(self.orig_forward) + def patched_forward(*args, **kwargs): + # Adapted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/got_ocr2/modeling_got_ocr2.py#L835 + # Adapted from https://github.com/huggingface/transformers/blob/v4.49.0-Gemma-3/src/transformers/models/gemma3/modeling_gemma3.py#L1321 + if ( + hasattr(self._model, "model") + and hasattr(self._model.model, "get_image_features") + and is_transformers_version("<", "5") + ): + get_image_features = self._model.model.get_image_features + else: + get_image_features = self._model.get_image_features + + outputs = get_image_features(*args, **kwargs) + + if is_transformers_version(">=", "5") and hasattr(outputs, "pooler_output"): + outputs = outputs.pooler_output + + output_names = list(config.outputs.keys()) + return {output_names[0]: outputs} + + self.patched_forward = patched_forward # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0-Gemma-3/src/transformers/models/gemma3/modeling_gemma3.py#L1147 diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index d3cfba3ba3..8f0a8f12c2 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -115,7 +115,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">", "4.47"): SUPPORTED_ARCHITECTURES += ("olmo2",) - if is_transformers_version(">", "4.49"): + if is_transformers_version(">=", "4.50"): SUPPORTED_ARCHITECTURES += ("gemma3_text",) if is_transformers_version(">=", "4.51.0"): diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index f3c1bed1e9..5375cf9b67 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -76,7 +76,7 @@ class LLMPipelineTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen",) if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("phimoe",) - if is_transformers_version(">=", 
"4.49") and is_transformers_version("<", "5"): + if is_transformers_version(">=", "4.50"): SUPPORTED_ARCHITECTURES += ("gemma3_text",) if is_transformers_version(">=", "4.51.0"): SUPPORTED_ARCHITECTURES += ("qwen3", "qwen3_moe") @@ -224,8 +224,7 @@ class VLMPipelineTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("qwen2_5_vl",) if is_transformers_version("<", "4.54.0"): SUPPORTED_ARCHITECTURES += ("phi4mm",) - # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly - if is_transformers_version(">=", "4.49") and is_transformers_version("<", "5"): + if is_transformers_version(">=", "4.50"): SUPPORTED_ARCHITECTURES += ("gemma3",) if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("llava", "llava_next_video") diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 2b005849fa..e7c59476b6 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -581,9 +581,11 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES += ["phi4mm"] SUPPORT_AUDIO.append("phi4mm") - # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly - if is_transformers_version(">", "4.49") and is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ["gemma3", "smolvlm"] + if is_transformers_version(">=", "4.50"): + SUPPORTED_ARCHITECTURES += ["gemma3"] + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES += ["smolvlm"] # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version(">=", "4.51") and is_transformers_version("<", "5"): @@ -614,7 +616,6 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): "llama4", "llava_next_video", "phi4_multimodal", - "gemma3", "smolvlm", } REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"] @@ -783,9 +784,9 @@ def 
test_compare_to_transformers(self, model_arch): set_seed(SEED) additional_inputs = {} - # gemma3 does not support dynamic cache, it is unfair to compare dynamic cache result vs hybrid cache, + # gemma3 does not support dynamic cache until v4.53, we cannot compare dynamic cache result vs hybrid cache, # align cache representation in torch model - if model_arch == "gemma3": + if model_arch == "gemma3" and is_transformers_version("<", "4.53.0"): patch_update_causal_mask( transformers_model if is_transformers_version("<", "4.52.0") else transformers_model.language_model, "4.43.0", From bf51329a5519e3c964ed5119043a3619594666a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 3 Mar 2026 18:36:48 +0100 Subject: [PATCH 152/222] add comment --- optimum/exporters/openvino/model_patcher.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 92fe0b4063..634f015872 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4674,6 +4674,7 @@ def patched_forward(*args, **kwargs): outputs = get_image_features(*args, **kwargs) + # we should be able to specify pooler_output as output_name, not supported here as pooler_output key does not exist if is_transformers_version(">=", "5") and hasattr(outputs, "pooler_output"): outputs = outputs.pooler_output From 4a2786218d053164cfc01412505dca5c1174820a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 4 Mar 2026 15:35:24 +0100 Subject: [PATCH 153/222] replace gpt_oss_mxfp4 test to gpt_oss for v5 --- tests/openvino/test_exporters_cli.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index e8766c737d..9690496089 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -1233,13 +1233,14 @@ def 
test_exporters_cli_full_quantization( {"model": 65}, ), ( - "gpt_oss", + # mxfp4 fixing saving broken since v5, fixed in https://github.com/huggingface/transformers/pull/43148, test can be added back for v5.3 + "gpt_oss_mxfp4" if is_transformers_version("<", "5") else "gpt_oss", "openai/gpt-oss-20b", AutoModelForCausalLM, OVModelForCausalLM, "--task text-generation-with-past --weight-format int4", _DEFAULT_4BIT_WQ_CONFIGS, - {"model": {"int8": 40, "int4": 0}}, + {"model": {"int8": 22, "int4": 4} if is_transformers_version("<", "5") else {"int8": 40, "int4": 0}}, {"model": 0}, ), ( From 4a8644d937989fa88031b2c46992a5e48f4b8ac2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 4 Mar 2026 15:51:24 +0100 Subject: [PATCH 154/222] include Qwen3VLOpenVINOConfig min version --- tests/openvino/test_decoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index dc4073c063..e5267b5224 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -318,7 +318,7 @@ def test_find_untested_architectures(self): supported_architectures -= {"lfm2"} # qwen3_vl_text a part of qwen3_vl architecture and is tested in seq2seq group - if is_transformers_version(">", str(Qwen3VLOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): + if is_transformers_version(">=", str(Qwen3VLOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): supported_architectures -= {"qwen3_vl_text"} # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly From 988147517ceb2216275ee0c720221281fd18151a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 4 Mar 2026 16:40:14 +0100 Subject: [PATCH 155/222] add phi4_multimodal for transformers < v5 --- tests/openvino/test_seq2seq.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index e7c59476b6..9ceab2d227 100644 --- a/tests/openvino/test_seq2seq.py 
+++ b/tests/openvino/test_seq2seq.py @@ -558,6 +558,7 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): ] SUPPORT_VIDEO = ["llava_next_video", "qwen2_vl"] SUPPORT_AUDIO = [] + UNSUPPORTED_ARCHITECTURES = {"phi4_multimodal"} OVMODEL_CLASS = OVModelForVisualCausalLM TASK = "image-text-to-text" @@ -610,14 +611,7 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("llava_next_video",) else: - UNSUPPORTED_ARCHITECTURES = { - "got_ocr2", - "idefics3", - "llama4", - "llava_next_video", - "phi4_multimodal", - "smolvlm", - } + UNSUPPORTED_ARCHITECTURES.update({"got_ocr2", "idefics3", "llama4", "llava_next_video", "smolvlm"}) REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"] IMAGE = Image.open( requests.get( From 2d764ef9cf916ea168282b8acc000d018da55177 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 6 Mar 2026 11:29:44 +0100 Subject: [PATCH 156/222] set dtype for beam_search tests for gemma3 text model --- tests/openvino/test_decoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index e5267b5224..0ddb251b22 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -783,11 +783,11 @@ def test_beam_search(self, model_arch): set_seed(SEED) with mock_torch_cuda_is_available("awq" in model_arch or "gptq" in model_arch): transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) - if model_arch == "arctic" or "mxfp4" in model_arch: + if model_arch in ["arctic", "gemma3_text"] or "mxfp4" in model_arch: transformers_model.to(torch.float32) additional_inputs = {} # gemma2 does not support dynamic cache, it is unfair to compare dynamic cache result vs hybrid cache, align cache representation in torch model - if model_arch in ["gemma2", "gemma3_text"]: + if 
model_arch in ["gemma2", "gemma3_text"] and is_transformers_version("<", "4.53.0"): patch_update_causal_mask(transformers_model, "4.43.0") transformers_model._supports_cache_class = True transformers_model.generation_config.cache_implementation = None From f901a66f2e42360405402a765c80342e9fefc513 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 9 Mar 2026 15:57:24 +0100 Subject: [PATCH 157/222] diffusers latest release now compatible with transformers v5 --- .github/workflows/build_documentation.yml | 1 - .github/workflows/build_pr_documentation.yml | 1 - .github/workflows/test_openvino.yml | 7 +------ .github/workflows/test_openvino_nightly.yml | 5 ----- .github/workflows/test_openvino_slow.yml | 5 ----- setup.py | 2 +- 6 files changed, 2 insertions(+), 19 deletions(-) diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index ce3eb464ce..896c5f8b43 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -51,7 +51,6 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder - uv pip install transformers==4.57.6 uv pip install .[quality] neural-compressor[pt]>3.4 diffusers accelerate datasets - name: Make documentation diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index 6b0b89f3f1..ac3291acfd 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -38,7 +38,6 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder - uv pip install transformers==4.57.6 uv pip install .[quality] neural-compressor[pt]>3.4 diffusers accelerate datasets - name: Make documentation diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 48e3a7409b..ba60fc597a 100644 --- a/.github/workflows/test_openvino.yml +++ 
b/.github/workflows/test_openvino.yml @@ -56,7 +56,7 @@ jobs: pip install --upgrade pip uv uv pip install .[tests] librosa diffusers - - if: ${{ matrix.test-pattern == '*modeling*' }} + - if: ${{ matrix.test-pattern == '*modeling*' || matrix.test-pattern == '*quantization*' }} name: Install OpenVINO run: | uv pip install openvino==2025.3.0 openvino-tokenizers==2025.3.0 @@ -66,11 +66,6 @@ jobs: run: | uv pip install transformers==${{ matrix.transformers-version }} - - if: ${{ matrix.transformers-version == 'latest' }} - name: Install diffusers - run: | - uv pip install git+https://github.com/huggingface/diffusers - - if: ${{ matrix.transformers-version == '4.45.0' }} name: Install specific dependencies and versions required for older transformers run: | diff --git a/.github/workflows/test_openvino_nightly.yml b/.github/workflows/test_openvino_nightly.yml index ace0246329..886d22c2b3 100644 --- a/.github/workflows/test_openvino_nightly.yml +++ b/.github/workflows/test_openvino_nightly.yml @@ -99,11 +99,6 @@ jobs: pip install --upgrade pip uv uv pip install .[tests] librosa diffusers - - if: ${{ matrix.transformers-version == 'latest' }} - name: Install diffusers - run: | - uv pip install git+https://github.com/huggingface/diffusers - - if: ${{ matrix.openvino-version == 'openvino-nightly' }} name: Install OpenVINO Nightly run: | diff --git a/.github/workflows/test_openvino_slow.yml b/.github/workflows/test_openvino_slow.yml index 580253a36a..8a6460ca1b 100644 --- a/.github/workflows/test_openvino_slow.yml +++ b/.github/workflows/test_openvino_slow.yml @@ -61,11 +61,6 @@ jobs: python -m pip install --upgrade pip uv uv pip install .[tests] librosa diffusers - - if: ${{ matrix.transformers-version == 'latest' }} - name: Install diffusers - run: | - uv pip install git+https://github.com/huggingface/diffusers - - if: ${{ matrix.transformers-version != 'latest' && matrix.transformers-version != 'main' }} name: Install specific dependencies and versions required for older 
transformers run: | diff --git a/setup.py b/setup.py index 16e2a82fed..ca26a42a1f 100644 --- a/setup.py +++ b/setup.py @@ -73,7 +73,7 @@ "openvino": ["nncf>=2.19.0", "openvino>=2025.3.0", "openvino-tokenizers>=2025.3.0"], "neural-compressor": ["neural-compressor[pt]>=3.4.1", "accelerate", "transformers<4.46", "datasets"], "ipex": ["intel-extension-for-pytorch>=2.8", "transformers>4.54,<4.56", "accelerate"], - "diffusers": ["diffusers", "transformers<5"], + "diffusers": ["diffusers"], "quality": QUALITY_REQUIRE, "tests": TESTS_REQUIRE, } From 7879da8bfea6fc26f8e179c74a76fb7e51b10c9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 9 Mar 2026 16:47:20 +0100 Subject: [PATCH 158/222] set qwen3_next max transformers version --- optimum/exporters/openvino/model_configs.py | 1 + tests/openvino/test_decoder.py | 3 ++- tests/openvino/test_export.py | 5 ++++- tests/openvino/test_exporters_cli.py | 6 ++++++ 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index b8c2eefcf1..a386842b7d 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -5435,6 +5435,7 @@ class Qwen3NextOpenVINOConfig(Qwen3OpenVINOConfig): DUMMY_PKV_GENERATOR_CLASS = Qwen3NextDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig MIN_TRANSFORMERS_VERSION = "4.57.0" + MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = Qwen3NextModelPatcher def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 2fdacf5ce5..bedb5ee8e9 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -96,7 +96,8 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.54.0"): SUPPORTED_SSM_ARCHITECTURES += ("lfm2",) - if is_transformers_version(">=", 
"4.57.0"): + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly + if is_transformers_version(">=", "4.57.0") and is_transformers_version("<", "5"): SUPPORTED_SSM_ARCHITECTURES += ("qwen3_next",) SUPPORTED_ARCHITECTURES += SUPPORTED_SSM_ARCHITECTURES diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 0c51a6f8da..e9c7696c2d 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -111,7 +111,10 @@ class ExportModelTest(unittest.TestCase): SUPPORTED_ARCHITECTURES.update({"afmoe": OVModelForCausalLM}) if is_transformers_version(">=", "4.57.0"): - SUPPORTED_ARCHITECTURES.update({"hunyuan_v1_dense": OVModelForCausalLM, "qwen3_next": OVModelForCausalLM}) + SUPPORTED_ARCHITECTURES.update({"hunyuan_v1_dense": OVModelForCausalLM}) + + if is_transformers_version(">=", "4.57.0") and is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES.update({"qwen3_next": OVModelForCausalLM}) EXPECTED_DIFFUSERS_SCALE_FACTORS = { "stable-diffusion-xl": {"vae_encoder": "128.0", "vae_decoder": "128.0"}, diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index b26569ed00..940ab5b3ac 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -169,6 +169,12 @@ class OVCLIExportTestCase(unittest.TestCase): SUPPORTED_ARCHITECTURES.extend( [ ("text-generation-with-past", "hunyuan_v1_dense"), + ] + ) + + if is_transformers_version(">=", "4.57.0") and is_transformers_version("<", "5"): + SUPPORTED_ARCHITECTURES.extend( + [ ("text-generation-with-past", "qwen3_next"), ] ) From d5f22440f58f3a6231fa603bd37cb727f5074b60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 9 Mar 2026 16:51:55 +0100 Subject: [PATCH 159/222] Fix doc building --- .github/workflows/build_documentation.yml | 1 + .github/workflows/build_pr_documentation.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/build_documentation.yml 
b/.github/workflows/build_documentation.yml index 896c5f8b43..ce3eb464ce 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -51,6 +51,7 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder + uv pip install transformers==4.57.6 uv pip install .[quality] neural-compressor[pt]>3.4 diffusers accelerate datasets - name: Make documentation diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index ac3291acfd..6b0b89f3f1 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -38,6 +38,7 @@ jobs: run: | pip install --upgrade pip uv uv pip install git+https://github.com/huggingface/doc-builder + uv pip install transformers==4.57.6 uv pip install .[quality] neural-compressor[pt]>3.4 diffusers accelerate datasets - name: Make documentation From 43ed6175824e06d0ae0226ae4276e3d22a95c364 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 9 Mar 2026 17:35:18 +0100 Subject: [PATCH 160/222] add qwen3_next to list of untested architectures --- tests/openvino/test_decoder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index bedb5ee8e9..e259c38e68 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -336,6 +336,7 @@ def test_find_untested_architectures(self): "marian", "llama4", "exaone4", + "qwen3_next", } supported_architectures -= ONNX_SUPPORTED_ARCHITECTURES From 87cc3f93a7712f60211a14431da170f8b1909314 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 10 Mar 2026 17:29:14 +0100 Subject: [PATCH 161/222] comment for zamba2 --- optimum/exporters/openvino/model_configs.py | 2 +- tests/openvino/test_decoder.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py 
b/optimum/exporters/openvino/model_configs.py index a386842b7d..62464e81f9 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -4821,8 +4821,8 @@ class Zamba2OpenVINOConfig(MambaOpenVINOConfig): DUMMY_PKV_GENERATOR_CLASS = Zamba2DummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig MIN_TRANSFORMERS_VERSION = "4.49.0" - # TODO (@echarlaix): add v5 support MAX_TRANSFORMERS_VERSION = "4.57.6" + # MIN_TRANSFORMERS_VERSION = "5.2.0" _MODEL_PATCHER = Zamba2ModelPatcher def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index e259c38e68..00bb85bd30 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -86,7 +86,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_SSM_ARCHITECTURES = ("mamba", "falcon_mamba") - # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version(">=", "4.49") and is_transformers_version("<", "5"): SUPPORTED_SSM_ARCHITECTURES += ("zamba2",) @@ -180,7 +179,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "dbrx", # "phimoe", "marian", - # "zamba2", ) GENERATION_LENGTH = 100 From 96d47b0172bb6720d43aef9b15d372eae3be0f4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 11 Mar 2026 17:36:47 +0100 Subject: [PATCH 162/222] Fix eagle3 compatibility with v5 --- optimum/exporters/openvino/model_patcher.py | 2 +- tests/openvino/test_decoder.py | 6 +----- tests/openvino/test_genai.py | 4 ++-- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index f21399c836..a0c8b4601a 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -7804,7 +7804,7 @@ def forward( hidden_states=hidden_states, 
attention_mask=attention_mask, position_ids=position_ids, - past_key_value=past_key_values, + **{"past_key_values" if is_transformers_version(">=", "5.0") else "past_key_value": past_key_values}, output_attentions=output_attentions, position_embeddings=position_embeddings, use_cache=use_cache, diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 00bb85bd30..3067f1c5c4 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -894,11 +894,7 @@ def test_load_with_different_dtype(self): ) @parameterized.expand(EAGLE3_MODELS.items()) - # TODO (@echarlaix) transformers v5 support - @pytest.mark.skipif( - is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"), - reason="Eagle3 requires transformers >= 4.54", - ) + @pytest.mark.skipif(is_transformers_version("<", "4.54"), reason="Eagle3 requires transformers >= 4.54") def test_load_and_infer_with_eagle3_model(self, model_arch, model_pair): draft_model_id, target_model_id = model_pair diff --git a/tests/openvino/test_genai.py b/tests/openvino/test_genai.py index 584d798e88..9d217e7373 100644 --- a/tests/openvino/test_genai.py +++ b/tests/openvino/test_genai.py @@ -467,8 +467,8 @@ class LLMPipelineWithEagle3TestCase(unittest.TestCase): @parameterized.expand(EAGLE3_MODELS.items()) def test_compare_outputs(self, model_arch, model_pair): - if is_transformers_version("<", "4.54") or is_transformers_version(">=", "5"): - self.skipTest("Eagle3 requires transformers >= 4.54 and transformers < 5") + if is_transformers_version("<", "4.54"): + self.skipTest("Eagle3 requires transformers >= 4.54") if is_openvino_version("<", "2026.0"): self.skipTest("Eagle3 requires openvino-genai >= 2026.0") From db805612f9bac4179c27960f4c6f6d9eeb0b4ef1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 11 Mar 2026 18:59:25 +0100 Subject: [PATCH 163/222] set dtype in tests when loading sd3 model --- tests/openvino/test_diffusion.py | 24 
++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index bc58c91796..08c5180a48 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -157,7 +157,12 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, model_type=model_arch) ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], device=OPENVINO_DEVICE) auto_cls = self.AUTOMODEL_CLASS if "sana" not in model_arch else DiffusionPipeline - diffusers_pipeline = auto_cls.from_pretrained(MODEL_NAMES[model_arch]) + model_kwargs = ( + {"torch_dtype": torch.float32} + if is_transformers_version(">=", "5") and model_arch == "stable-diffusion-3" + else {} + ) + diffusers_pipeline = auto_cls.from_pretrained(MODEL_NAMES[model_arch], **model_kwargs) for output_type in ["latent", "np", "pt"]: inputs["output_type"] = output_type @@ -632,7 +637,12 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): height, width, batch_size = 128, 128, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, model_type=model_arch) - diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + model_kwargs = ( + {"torch_dtype": torch.float32} + if is_transformers_version(">=", "5") and model_arch == "stable-diffusion-3" + else {} + ) + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], **model_kwargs) ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], device=OPENVINO_DEVICE) for output_type in ["latent", "np", "pt"]: @@ -898,12 +908,18 @@ def test_shape(self, model_arch: str): @require_diffusers def test_compare_to_diffusers_pipeline(self, model_arch: str): ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], device=OPENVINO_DEVICE) + model_kwargs = ( + 
{"torch_dtype": torch.float32} + if is_transformers_version(">=", "5") and model_arch == "stable-diffusion-3" + else {} + ) + if model_arch != "flux-fill": - diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], **model_kwargs) else: from diffusers import FluxFillPipeline - diffusers_pipeline = FluxFillPipeline.from_pretrained(MODEL_NAMES[model_arch]) + diffusers_pipeline = FluxFillPipeline.from_pretrained(MODEL_NAMES[model_arch], **model_kwargs) height, width, batch_size = 64, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, model_arch=model_arch) From 3e5a2b23f2a0d658b81acb40b9559e20c6f7d3c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 11 Mar 2026 19:13:08 +0100 Subject: [PATCH 164/222] trigger tests for transformers v5.3 --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index ca26a42a1f..3ca73ac3d9 100644 --- a/setup.py +++ b/setup.py @@ -28,8 +28,8 @@ INSTALL_REQUIRE = [ "torch>=2.1", - "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers-v5", - "transformers>=4.45,<5.1", + "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@xadupre/transformers5", + "transformers>=4.45,<5.4", "setuptools", "huggingface-hub>=0.23.2,<2.0", "nncf>=2.19.0", From 10add8c7df53753ab42ae8e224cb76dc35ef5eb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 12 Mar 2026 09:32:48 +0100 Subject: [PATCH 165/222] update setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3ca73ac3d9..fe70f63757 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@xadupre/transformers5", - "transformers>=4.45,<5.4", + "transformers>=4.45,<5.1", "setuptools", 
"huggingface-hub>=0.23.2,<2.0", "nncf>=2.19.0", From 501b5233a25e0753591cf4557f8edc91a075cf0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 12 Mar 2026 09:43:33 +0100 Subject: [PATCH 166/222] update setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index fe70f63757..baccbf1d68 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@xadupre/transformers5", - "transformers>=4.45,<5.1", + "transformers>=4.45,<5.3", "setuptools", "huggingface-hub>=0.23.2,<2.0", "nncf>=2.19.0", From 15548fcb6986259ddaaa1af4ced90701a6ba1acc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 12 Mar 2026 10:01:37 +0100 Subject: [PATCH 167/222] update setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index baccbf1d68..fe70f63757 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@xadupre/transformers5", - "transformers>=4.45,<5.3", + "transformers>=4.45,<5.1", "setuptools", "huggingface-hub>=0.23.2,<2.0", "nncf>=2.19.0", From 79078787955d097221068bfb94399b4a8b6850d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 17 Mar 2026 19:09:31 +0100 Subject: [PATCH 168/222] fix bf16 model export --- optimum/exporters/openvino/__main__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index eb763b45d4..5e59f0cb19 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -496,6 +496,9 @@ def bitnet_load_hook(self, state_dict, prefix, *args, **kwargs): **loading_kwargs, ) + if getattr(model, "dtype", None) in [torch.float16, torch.bfloat16]: + patch_16bit = True + needs_pad_token_id = task == "text-classification" 
and getattr(model.config, "pad_token_id", None) is None if needs_pad_token_id: From c026dd99262725b2000457e0c77f682ff2082c4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Mar 2026 10:03:59 +0100 Subject: [PATCH 169/222] question answering pipeline deprecated in v5.3 --- tests/openvino/test_modeling.py | 8 ++++++++ tests/openvino/test_modeling_basic.py | 6 +++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 0c5011a908..53223e692a 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -887,6 +887,10 @@ def test_compare_to_transformers(self, model_arch): @parameterized.expand(SUPPORTED_ARCHITECTURES) @pytest.mark.run_slow @slow + @pytest.mark.skipif( + is_transformers_version(">=", "5.3"), + reason="requires transformers < v5.3 since question-answering pipeline is deprecated in v5.3", + ) def test_pipeline(self, model_arch): set_seed(SEED) model_id = MODEL_NAMES[model_arch] @@ -909,6 +913,10 @@ def test_pipeline(self, model_arch): @pytest.mark.run_slow @slow + @pytest.mark.skipif( + is_transformers_version(">=", "5.3"), + reason="requires transformers < v5.3 since question-answering pipeline is deprecated in v5.3", + ) def test_metric(self): model_id = "distilbert-base-cased-distilled-squad" set_seed(SEED) diff --git a/tests/openvino/test_modeling_basic.py b/tests/openvino/test_modeling_basic.py index c2576db98b..eb72175032 100644 --- a/tests/openvino/test_modeling_basic.py +++ b/tests/openvino/test_modeling_basic.py @@ -30,12 +30,16 @@ "hf-internal-testing/tiny-random-bert": "OVModelForMaskedLM", "hf-internal-testing/tiny-random-distilbert": "OVModelForSequenceClassification", "hf-internal-testing/tiny-random-mbart": "OVModelForSeq2SeqLM", - "hf-internal-testing/tiny-random-roberta": "OVModelForQuestionAnswering", "hf-internal-testing/tiny-random-gpt2": "OVModelForCausalLM", "hf-internal-testing/tiny-random-t5": 
"OVModelForSeq2SeqLM", "hf-internal-testing/tiny-random-bart": "OVModelForSeq2SeqLM", } +# question-answering pipeline is deprecated in transformers v5.3 +if is_transformers_version("<", "5.3"): + MODEL_NAMES["hf-internal-testing/tiny-random-roberta"] = "OVModelForQuestionAnswering" + + TASKS = { "OVModelForMaskedLM": "fill-mask", "OVModelForSequenceClassification": "text-classification", From 61d85b371415e16d5a3757a49219ef604ee5e337 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Mar 2026 10:04:16 +0100 Subject: [PATCH 170/222] fix mamba expected int8 --- tests/openvino/utils_tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index cc084565fe..1117604b7b 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -358,8 +358,8 @@ "vocoder": 80, }, "clip": {"model": 130}, - "mamba": {"model": 322 if is_transformers_version("<", "5") else 324}, - "falcon_mamba": {"model": 162 if is_transformers_version("<", "5") else 164}, + "mamba": {"model": 324 if is_transformers_version("==", "5.0") else 322}, + "falcon_mamba": {"model": 164 if is_transformers_version("==", "5.0") else 162}, "minicpmo": { "lm_model": 16, "text_embeddings_model": 1, From 55c0d469b9c853e5aa2a285f7f9f0f37dd0b5c80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Mar 2026 17:39:46 +0100 Subject: [PATCH 171/222] Fix _DEFAULT_IGNORED_SCOPE_CONFIGS for __make_16bit_traceable patched models --- optimum/intel/openvino/configuration.py | 1 + tests/openvino/test_quantization.py | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 2002e268ac..2d8608fadb 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -546,6 +546,7 @@ class OVQuantizationMethod(str, Enum):
"__module.layers.27.mlp.up_proj/aten::linear/MatMul", "__module.layers.27.mlp.gate_proj/aten::linear/MatMul", ], + "validate": False, }, }, "microsoft/speecht5_tts": { diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index ec9d7b84f7..ff90b208e2 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -1175,8 +1175,12 @@ class OVWeightCompressionTest(unittest.TestCase): "llama", { "model": { - "names": ["__module.layers.1.self_attn.v_proj/aten::linear/MatMul"], - "patterns": ["__module.layers.\\d.self_attn.o_proj/aten::linear/MatMul"], + "names": [ + f"__module.layers.1.self_attn.v_proj/{'aten' if is_transformers_version('<', '5') else 'ov_ext'}::linear/MatMul" + ], + "patterns": [ + f"__module.layers.\\d.self_attn.o_proj/{'aten' if is_transformers_version('<', '5') else 'ov_ext'}::linear/MatMul" + ], } }, ), From 2f38fd868b67d63add87e7f083325cd3c82968e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Mar 2026 18:52:35 +0100 Subject: [PATCH 172/222] add test to ensure dtype --- tests/openvino/test_modeling.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 53223e692a..7a3110b182 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -702,6 +702,14 @@ def test_load_from_hub_onnx_model_and_save(self): del model gc.collect() + def test_export_dtype(self): + model_id = "optimum-intel-internal-testing/tiny-random-GemmaForCausalLM" + for dtype in [torch.float32, torch.bfloat16, torch.float16]: + with TemporaryDirectory() as tmpdirname: + model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype) + self.assertEqual(model.dtype, dtype) + model.save_pretrained(tmpdirname) + ov_model = OVModelForCausalLM.from_pretrained(tmpdirname, export=True) class PipelineTest(unittest.TestCase): def test_load_model_from_hub(self): From 
c925a79cf3f04bf1ae78d24bc1ec2ea64aefa94e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 18 Mar 2026 18:52:55 +0100 Subject: [PATCH 173/222] style --- tests/openvino/test_modeling.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 7a3110b182..e8f68d62dd 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -711,6 +711,7 @@ def test_export_dtype(self): model.save_pretrained(tmpdirname) ov_model = OVModelForCausalLM.from_pretrained(tmpdirname, export=True) + class PipelineTest(unittest.TestCase): def test_load_model_from_hub(self): model_id = "echarlaix/tiny-random-PhiForCausalLM" From bf1f377c540120ee33641ac02536d41e681bd6bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 19 Mar 2026 15:28:13 +0100 Subject: [PATCH 174/222] check openvino model expected dtype in test_export_dtype --- optimum/intel/openvino/utils.py | 1 + tests/openvino/test_modeling.py | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index be6ac41d31..9549da9773 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -95,6 +95,7 @@ "f16": torch.float16, "f32": torch.float32, "f64": torch.float64, + "bf16": torch.bfloat16, } if is_torch_version(">=", "2.4.0"): diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index e8f68d62dd..372cd28943 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -90,8 +90,10 @@ OV_LANGUAGE_MODEL_NAME, OV_PROMPT_ENCODER_MASK_DECODER_MODEL_NAME, OV_TEXT_EMBEDDINGS_MODEL_NAME, + OV_TO_PT_TYPE, OV_VISION_EMBEDDINGS_MODEL_NAME, OV_VISION_ENCODER_MODEL_NAME, + STR_TO_OV_TYPE, TemporaryDirectory, ) from optimum.intel.pipelines import pipeline as optimum_pipeline @@ -704,12 +706,18 @@ def test_load_from_hub_onnx_model_and_save(self): def 
test_export_dtype(self): model_id = "optimum-intel-internal-testing/tiny-random-GemmaForCausalLM" - for dtype in [torch.float32, torch.bfloat16, torch.float16]: - with TemporaryDirectory() as tmpdirname: - model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype) - self.assertEqual(model.dtype, dtype) - model.save_pretrained(tmpdirname) - ov_model = OVModelForCausalLM.from_pretrained(tmpdirname, export=True) + for dtype in ["f32", "f16", "bf16"]: + torch_dtype = OV_TO_PT_TYPE[dtype] + ov_dtype = STR_TO_OV_TYPE[dtype] + with TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch_dtype) + self.assertEqual(model.dtype, torch_dtype) + model.save_pretrained(tmp_dir) + del model + ov_model = OVModelForCausalLM.from_pretrained(tmp_dir, export=True) + dtypes = {op.get_element_type() for op in ov_model.model.get_ops() if op.get_type_name() == "Constant"} + self.assertIn(ov_dtype, dtypes, f"Expected {ov_dtype}, found {dtypes}") + del ov_model class PipelineTest(unittest.TestCase): From 5033df204cdcee729dec7ff8556e579422784d84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Mar 2026 19:47:17 +0100 Subject: [PATCH 175/222] fix qwen3vl vision embeddings pos --- optimum/exporters/openvino/model_configs.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 62464e81f9..cc1cac2714 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -3913,11 +3913,10 @@ def patch_model_for_export(self, model: Union["PreTrainedModel"], model_kwargs: model_kwargs = model_kwargs or {} if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_MERGER: return Qwen3VLVisionEmbMergerPatcher(self, model, model_kwargs) - if ( - self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS - or self._behavior == 
QwenVLConfigBehavior.VISION_EMBEDDINGS_POS - ): + if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS: return ModelPatcher(self, model, model_kwargs=model_kwargs) + if self._behavior == QwenVLConfigBehavior.VISION_EMBEDDINGS_POS: + return InputEmbeddingPatcher(self, model, model_kwargs) return super().patch_model_for_export(model, model_kwargs) @property From 28e98ca493c3fbc62a2324f115869366779893e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Mar 2026 19:57:03 +0100 Subject: [PATCH 176/222] exclude vision_embeddings_pos behavior for qwen2_vl models --- optimum/exporters/openvino/convert.py | 4 ---- optimum/exporters/openvino/model_configs.py | 4 +++- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index d0efa2259f..fddd840b7d 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -687,10 +687,6 @@ def export_from_model( ) logging.disable(logging.NOTSET) - # Remove empty model and export_configs pairs, they can be empty when a config class is shared between model versions. - # Example: Qwen2VL and Qwen3VL share config class, but "vision_embeddings_pos" is used in Qwen3VL only. 
- models_and_export_configs = {k: v for k, v in models_and_export_configs.items() if v != (None, None)} - if library_name == "open_clip": if hasattr(model.config, "save_pretrained"): model.config.save_pretrained(output) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index cc1cac2714..5427db1aa0 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -3662,7 +3662,9 @@ class QwenVLConfigBehavior(str, enum.Enum): @register_in_tasks_manager("qwen2_vl", *["image-text-to-text"], library_name="transformers") class Qwen2VLOpenVINOConfig(BaseVLMOpenVINOConfig): - SUPPORTED_BEHAVIORS = [model_type.value for model_type in QwenVLConfigBehavior] + SUPPORTED_BEHAVIORS = [ + model_type.value for model_type in QwenVLConfigBehavior if model_type.value != "vision_embeddings_pos" + ] NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen2VLVisionEmbedInputGenerator,) MIN_TRANSFORMERS_VERSION = "4.45.0" From 910cc75144f686272d723c2e8f65fc6bbfcb45b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Fri, 20 Mar 2026 19:57:48 +0100 Subject: [PATCH 177/222] rename InputEmbedOpenvVINOConfig to InputEmbedOpenVINOConfig --- optimum/exporters/openvino/model_configs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 5427db1aa0..e793e7798b 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1738,7 +1738,7 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs): return dummy_inputs -class InputEmbedOpenvVINOConfig(TextDecoderOnnxConfig): +class InputEmbedOpenVINOConfig(TextDecoderOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig _MODEL_PATCHER = InputEmbeddingPatcher @@ -1781,8 +1781,8 @@ def 
get_vlm_internal_text_generation_config(model_type, model_config, int_dtype, def get_vlm_text_embeddings_config(model_type, model_config, int_dtype, float_dtype): internal_export_config = get_vlm_internal_text_generation_config(model_type, model_config, int_dtype, float_dtype) - InputEmbedOpenvVINOConfig.NORMALIZED_CONFIG_CLASS = internal_export_config.NORMALIZED_CONFIG_CLASS - export_config = InputEmbedOpenvVINOConfig( + InputEmbedOpenVINOConfig.NORMALIZED_CONFIG_CLASS = internal_export_config.NORMALIZED_CONFIG_CLASS + export_config = InputEmbedOpenVINOConfig( model_config, task="feature-extraction", int_dtype=int_dtype, From 5d0637716424d7c99ee265c4d38cb49753a98891 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 25 Mar 2026 18:11:24 +0100 Subject: [PATCH 178/222] fix lfm2 attention mask for mamba layers --- optimum/exporters/openvino/model_patcher.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index a0c8b4601a..44263cb672 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -7194,11 +7194,15 @@ def lfm2_short_conv_forward_patched( cache_position=None, attention_mask=None, ): - from transformers.models.lfm2.modeling_lfm2 import apply_mask_to_padding_states - seqlen = x.shape[1] - x = apply_mask_to_padding_states(x, attention_mask) + # only apply apply_mask_to_padding_states during the prefill phase + # https://github.com/huggingface/transformers/blob/v5.0.0/src/transformers/models/lfm2/modeling_lfm2.py#L427 + # in transformers < v5 attention_mask was never applied for conv layers, until https://github.com/huggingface/transformers/pull/41790/ + dtype = x.dtype + is_decoding = torch.tensor(seqlen == 1, dtype=dtype) + x = (x * (attention_mask[:, :seqlen, None] * (1 - is_decoding) + is_decoding)).to(dtype) + BCx = self.in_proj(x).transpose(-1, -2) B, C, x = BCx.chunk(3, 
dim=-2) From b07adfbdbe59bd4f276bbbdd4a501adf48cde7e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 26 Mar 2026 16:42:10 +0100 Subject: [PATCH 179/222] add comment --- optimum/exporters/openvino/model_patcher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 44263cb672..ef3d4a6b7d 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -7198,7 +7198,8 @@ def lfm2_short_conv_forward_patched( # only apply apply_mask_to_padding_states during the prefill phase # https://github.com/huggingface/transformers/blob/v5.0.0/src/transformers/models/lfm2/modeling_lfm2.py#L427 - # in transformers < v5 attention_mask was never applied for conv layers, until https://github.com/huggingface/transformers/pull/41790/ + # in transformers < v5 attention_mask was never applied in Lfm2ShortConv https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/models/lfm2/modeling_lfm2.py#L485 + # until a fix was added in https://github.com/huggingface/transformers/pull/41790/ dtype = x.dtype is_decoding = torch.tensor(seqlen == 1, dtype=dtype) x = (x * (attention_mask[:, :seqlen, None] * (1 - is_decoding) + is_decoding)).to(dtype) From 2f8a5ed48d0b428a16cf6e29727bce5b18821a5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Thu, 26 Mar 2026 17:08:18 +0100 Subject: [PATCH 180/222] only apply mask for transformers >= v5 --- optimum/exporters/openvino/model_patcher.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index ef3d4a6b7d..3d334a7093 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -7200,9 +7200,10 @@ def lfm2_short_conv_forward_patched( # 
https://github.com/huggingface/transformers/blob/v5.0.0/src/transformers/models/lfm2/modeling_lfm2.py#L427 # in transformers < v5 attention_mask was never applied in Lfm2ShortConv https://github.com/huggingface/transformers/blob/v4.57.6/src/transformers/models/lfm2/modeling_lfm2.py#L485 # until a fix was added in https://github.com/huggingface/transformers/pull/41790/ - dtype = x.dtype - is_decoding = torch.tensor(seqlen == 1, dtype=dtype) - x = (x * (attention_mask[:, :seqlen, None] * (1 - is_decoding) + is_decoding)).to(dtype) + if is_transformers_version(">=", "5"): + dtype = x.dtype + is_decoding = torch.tensor(seqlen == 1, dtype=dtype) + x = (x * (attention_mask[:, :seqlen, None] * (1 - is_decoding) + is_decoding)).to(dtype) BCx = self.in_proj(x).transpose(-1, -2) B, C, x = BCx.chunk(3, dim=-2) From 36371899e5c8ebc5581b2157d37059f16ea9e4a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Mon, 30 Mar 2026 18:10:39 +0200 Subject: [PATCH 181/222] Add fix for granitemoe export --- optimum/exporters/openvino/model_patcher.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 3d334a7093..4cf552e982 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -7603,7 +7603,7 @@ def patch_sparse_moe(sparse_moe_layer): ) for idx, layer in enumerate(self._model.model.layers): - if hasattr(layer, "block_sparse_moe"): + if getattr(layer, "block_sparse_moe", None) is not None: patch_sparse_moe(layer.block_sparse_moe) if self.real_config._config.layers_block_type[idx] == "mamba": mamba_layer = layer.mamba @@ -7625,7 +7625,7 @@ def unpatch_sparse_moe(sparse_moe_layer): self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask for idx, layer in enumerate(self._model.model.layers): - if hasattr(layer, "block_sparse_moe"): + if getattr(layer, "block_sparse_moe", None) is not 
None: unpatch_sparse_moe(layer.block_sparse_moe) if self.real_config._config.layers_block_type[idx] == "mamba": mamba_layer = layer.mamba From dbabc02305a44b59aeaf64efee3cd11b8948e543 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 31 Mar 2026 18:23:07 +0200 Subject: [PATCH 182/222] set afmoe MAX_TRANSFORMERS_VERSION to v5.0 --- optimum/exporters/openvino/model_configs.py | 2 +- tests/openvino/test_decoder.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index e793e7798b..0ec3bf1bce 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -5021,7 +5021,7 @@ class ASTOpenVINOConfig(ASTOnnxConfig): ) class AfmoeOpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.55.0" - MAX_TRANSFORMERS_VERSION = "4.57.6" + MAX_TRANSFORMERS_VERSION = "5.0.0" _MODEL_PATCHER = AfmoeModelPatcher diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 3067f1c5c4..7c6d456dcc 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -150,7 +150,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.55.0"): SUPPORTED_ARCHITECTURES += ("gpt_oss", "gpt_oss_mxfp4") - if is_transformers_version(">=", "4.55.0") and is_transformers_version("<", "4.58.0"): + if is_transformers_version(">=", "4.55.0") and is_transformers_version("<", "5.1.0"): SUPPORTED_ARCHITECTURES += ("afmoe",) if is_transformers_version(">=", "4.57.0"): From dfbf3fc594a043048c97488b7d1a8bb65f1a00ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Tue, 31 Mar 2026 18:42:59 +0200 Subject: [PATCH 183/222] update afmoe test --- tests/openvino/utils_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 1117604b7b..01fd8c5870 100644 --- 
a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -34,7 +34,7 @@ OPENVINO_DEVICE = os.getenv("OPENVINO_TEST_DEVICE", "CPU") MODEL_NAMES = { - "afmoe": "optimum-intel-internal-testing/tiny-random-trinity", + "afmoe": "onnx-internal-testing/tiny-random-AfmoeForCausalLM", "albert": "optimum-intel-internal-testing/tiny-random-albert", "aquila": "optimum-intel-internal-testing/tiny-random-aquilachat", "aquila2": "optimum-intel-internal-testing/tiny-random-aquila2", From fca276b4ec8c30f3bdd26b215fc42b98e0ddd98e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 1 Apr 2026 15:39:24 +0200 Subject: [PATCH 184/222] remove afmoe MAX_TRANSFORMERS_VERSION as included in transformers v5 --- optimum/exporters/openvino/model_configs.py | 1 - tests/openvino/utils_tests.py | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 0ec3bf1bce..dbc4a313f6 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -5021,7 +5021,6 @@ class ASTOpenVINOConfig(ASTOnnxConfig): ) class AfmoeOpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.55.0" - MAX_TRANSFORMERS_VERSION = "5.0.0" _MODEL_PATCHER = AfmoeModelPatcher diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 01fd8c5870..c723623c39 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -34,7 +34,7 @@ OPENVINO_DEVICE = os.getenv("OPENVINO_TEST_DEVICE", "CPU") MODEL_NAMES = { - "afmoe": "onnx-internal-testing/tiny-random-AfmoeForCausalLM", + "afmoe": "optimum-intel-internal-testing/tiny-random-trinity", "albert": "optimum-intel-internal-testing/tiny-random-albert", "aquila": "optimum-intel-internal-testing/tiny-random-aquilachat", "aquila2": "optimum-intel-internal-testing/tiny-random-aquila2", @@ -377,7 +377,6 @@ TEST_IMAGE_URL = 
"http://images.cocodataset.org/val2017/000000039769.jpg" REMOTE_CODE_MODELS = ( - "afmoe", "chatglm", "minicpm", "baichuan2", @@ -401,6 +400,8 @@ "qwen3_eagle3", ) +if is_transformers_version("<", "5"): + REMOTE_CODE_MODELS += ("afmoe",) def get_num_quantized_nodes(model): num_fake_nodes = 0 From c1dc781fc0bf5c26eb5746fd221a48b2c6ca9543 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 1 Apr 2026 15:39:45 +0200 Subject: [PATCH 185/222] style --- tests/openvino/utils_tests.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index c723623c39..13ba840bb8 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -403,6 +403,7 @@ if is_transformers_version("<", "5"): REMOTE_CODE_MODELS += ("afmoe",) + def get_num_quantized_nodes(model): num_fake_nodes = 0 types_map = { From 4cb14e9db30e79e6f5e29d6b4fe4a1fd0fd2c3e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CElla?= Date: Wed, 1 Apr 2026 16:17:35 +0200 Subject: [PATCH 186/222] fix test_find_untested_architectures --- tests/openvino/test_decoder.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 7c6d456dcc..f4032db201 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -23,7 +23,6 @@ ) from optimum.exporters.openvino.model_configs import ( - AfmoeOpenVINOConfig, BitnetOpenVINOConfig, DeepseekOpenVINOConfig, LFM2OpenVINOConfig, @@ -148,10 +147,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("minicpm", "minicpm3", "arctic") if is_transformers_version(">=", "4.55.0"): - SUPPORTED_ARCHITECTURES += ("gpt_oss", "gpt_oss_mxfp4") - - if is_transformers_version(">=", "4.55.0") and is_transformers_version("<", "5.1.0"): - SUPPORTED_ARCHITECTURES += ("afmoe",) + SUPPORTED_ARCHITECTURES += ("gpt_oss", "gpt_oss_mxfp4", "afmoe") if 
is_transformers_version(">=", "4.57.0"): SUPPORTED_ARCHITECTURES += ("hunyuan_v1_dense",) @@ -313,8 +309,6 @@ def test_find_untested_architectures(self): supported_architectures.remove("deepseek_v2") if "deepseek_v3" in supported_architectures: supported_architectures.remove("deepseek_v3") - if is_transformers_version(">", str(AfmoeOpenVINOConfig.MAX_TRANSFORMERS_VERSION)): - supported_architectures -= {"afmoe"} if is_transformers_version("<", str(BitnetOpenVINOConfig.MIN_TRANSFORMERS_VERSION)): supported_architectures -= {"bitnet"} if is_transformers_version("<", str(LFM2OpenVINOConfig.MIN_TRANSFORMERS_VERSION)): From b0a782699be973ec1185c3da85c4696186ef256a Mon Sep 17 00:00:00 2001 From: "Kazantsev, Roman" Date: Thu, 2 Apr 2026 21:50:09 +0400 Subject: [PATCH 187/222] [OpenVINO] Support Gemma 4 Signed-off-by: Kazantsev, Roman --- optimum/exporters/openvino/model_configs.py | 346 ++++++++++++++ optimum/exporters/openvino/model_patcher.py | 448 +++++++++++++++++- optimum/exporters/openvino/utils.py | 1 + optimum/intel/openvino/modeling_decoder.py | 6 +- .../openvino/modeling_visual_language.py | 87 +++- 5 files changed, 884 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index dbc4a313f6..8a27ccfa0b 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -148,6 +148,9 @@ FluxTransfromerModelPatcher, Gemma2ModelPatcher, Gemma3LMModelPatcher, + Gemma4ImageEmbeddingsModelPatcher, + Gemma4LMModelPatcher, + Gemma4PerLayerInputsGetterModelPatcher, GptJModelPatcher, GptNeoModelPatcher, GptNeoxModelPatcher, @@ -277,6 +280,10 @@ def init_model_configs(): "transformers", "Gemma3ForConditionalGeneration", ) + TasksManager._CUSTOM_CLASSES[("pt", "gemma4", "image-text-to-text")] = ( + "transformers", + "Gemma4ForConditionalGeneration", + ) # since transformers v4.52, model can be loaded using default AutoModelForImageTextToText # 
https://github.com/huggingface/transformers/blob/v4.52.0/src/transformers/models/auto/modeling_auto.py#L899 @@ -1493,6 +1500,102 @@ class Gemma3TextOpenVINOConfig(Gemma2OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.50.0" +class Gemma4DummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): + def __init__( + self, + task: str, + normalized_config: NormalizedTextConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + random_batch_size_range: Optional[Tuple[int, int]] = None, + random_sequence_length_range: Optional[Tuple[int, int]] = None, + **kwargs, + ): + super().__init__( + task=task, + normalized_config=normalized_config, + batch_size=batch_size, + sequence_length=sequence_length, + random_batch_size_range=random_batch_size_range, + random_sequence_length_range=random_sequence_length_range, + ) + self.num_key_value_heads = normalized_config.num_key_value_heads + self.head_dim = normalized_config.head_dim + self.global_head_dim = getattr(normalized_config.config, "global_head_dim", self.head_dim) + self.layer_types = normalized_config.config.layer_types + self.num_kv_shared_layers = normalized_config.config.num_kv_shared_layers + self.sliding_window = normalized_config.config.sliding_window + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + # some layers do not produce their own KV-cache, they use the shared KV-cache + if self.num_kv_shared_layers > 0: + layer_types = self.layer_types[: -self.num_kv_shared_layers] + else: + layer_types = self.layer_types + past_kv_values = [] + for layer_type in layer_types: + if layer_type == "sliding_attention": + shape = ( + self.batch_size, + self.num_key_value_heads, + self.sliding_window, + self.head_dim, + ) + else: + shape = ( + self.batch_size, + self.num_key_value_heads, + self.sequence_length, + self.global_head_dim, + ) + past_kv_value = ( + self.random_float_tensor(shape, 
framework=framework, dtype=float_dtype), + self.random_float_tensor(shape, framework=framework, dtype=float_dtype), + ) + past_kv_values.append(past_kv_value) + + return past_kv_values + + +@register_in_tasks_manager( + "gemma4_text", + *[ + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + "text-classification", + ], + library_name="transformers", +) +class Gemma4TextOpenVINOConfig(Gemma3TextOpenVINOConfig): + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, Gemma4DummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = Gemma4DummyPastKeyValuesGenerator + MIN_TRANSFORMERS_VERSION = "4.50.0" + + def add_past_key_values(self, inputs_or_outputs: dict[str, dict[int, str]], direction: str): + if direction not in ["inputs", "outputs"]: + raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given') + + if direction == "inputs": + decoder_sequence_name = "past_sequence_length" + name = "past_key_values" + else: + decoder_sequence_name = "past_sequence_length + sequence_length" + name = "present" + + num_kv_shared_layers = self._normalized_config.config.num_kv_shared_layers + if num_kv_shared_layers > 0: + layer_types = self._normalized_config.config.layer_types[:-num_kv_shared_layers] + else: + layer_types = self._normalized_config.config.layer_types + + for i, layer_type in enumerate(layer_types): + inputs_or_outputs[f"{name}.{i}.key"] = {0: "batch_size", 2: decoder_sequence_name} + inputs_or_outputs[f"{name}.{i}.value"] = {0: "batch_size", 2: decoder_sequence_name} + + class DeciDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): def __init__( self, @@ -1735,6 +1838,16 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs): dummy_inputs["token_type_ids"] = self.orig_export_config.DUMMY_INPUT_GENERATOR_CLASSES[ 0 ].random_int_tensor(token_type_ids_shape, min_value=0, max_value=2) + if 
"per_layer_inputs" in self.inputs: + per_layer_inputs_shape = ( + input_ids.shape[0], + input_ids.shape[1], + self._normalized_config.config.num_hidden_layers, + self._normalized_config.config.hidden_size_per_layer_input, + ) + dummy_inputs["per_layer_inputs"] = self.orig_export_config.DUMMY_INPUT_GENERATOR_CLASSES[ + 0 + ].random_float_tensor(per_layer_inputs_shape) return dummy_inputs @@ -4203,6 +4316,239 @@ def with_behavior( return super().with_behavior(behavior) +class Gemma4ConfigBehavior(str, enum.Enum): + VISION_EMBEDDINGS = "vision_embeddings" + TEXT_EMBEDDINGS = "text_embeddings" + LANGUAGE = "language" + TEXT_EMBEDDINGS_PER_LAYER = "text_embeddings_per_layer" + + +class DummyGemma4VisionInputGenerator(DummyVisionInputGenerator): + SUPPORTED_INPUT_NAMES = ("pixel_values", "image_position_ids") + + def __init__(self, task, normalized_config, batch_size=DEFAULT_DUMMY_SHAPES["batch_size"], **kwargs): + super().__init__(task, normalized_config, batch_size, **kwargs) + self.patch_size = getattr(normalized_config, "patch_size", 16) + self.pooling_kernel_size = getattr(normalized_config, "pooling_kernel_size", 3) + # Gemma4 processor always pads pixel_values to max_soft_tokens * pooling_kernel_size^2 patches. + # The vision model's pooling uses shape-dependent Python operations that get baked in during tracing, + # so the dummy input must match the actual inference shapes. 
+ max_soft_tokens = getattr(normalized_config, "image_seq_length", None) + if max_soft_tokens is None: + max_soft_tokens = getattr(normalized_config, "max_soft_tokens", 280) + self.num_patches = max_soft_tokens * self.pooling_kernel_size**2 + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "pixel_values": + # Gemma4 expects pre-patchified pixel_values: [batch, num_patches, 3 * patch_size^2] + return self.random_float_tensor( + shape=[self.batch_size, self.num_patches, 3 * self.patch_size**2], + framework=framework, + dtype=float_dtype, + ) + if input_name == "image_position_ids": + import torch + import math + + # Create position ids as a grid. The patch count = h_patches * w_patches + # where both are divisible by pooling_kernel_size for correct pooling. + k = self.pooling_kernel_size + total_pooled = self.num_patches // (k * k) + # Find roughly square grid for pooled side + pooled_side = int(math.sqrt(total_pooled)) + if pooled_side * pooled_side < total_pooled: + pooled_h = pooled_side + pooled_w = total_pooled // pooled_h + else: + pooled_h = pooled_w = pooled_side + h_patches = pooled_h * k + w_patches = pooled_w * k + pos_ids = torch.stack( + torch.meshgrid(torch.arange(h_patches), torch.arange(w_patches), indexing="ij"), dim=-1 + ).reshape(1, -1, 2) + # Pad to num_patches with -1 (padding position) + if pos_ids.shape[1] < self.num_patches: + pad = torch.full((1, self.num_patches - pos_ids.shape[1], 2), -1, dtype=pos_ids.dtype) + pos_ids = torch.cat([pos_ids, pad], dim=1) + return pos_ids.expand(self.batch_size, -1, -1).clone() + return super().generate(input_name, framework, int_dtype, float_dtype) + + +@register_in_tasks_manager("gemma4", *["image-text-to-text"], library_name="transformers") +class Gemma4OpenVINOConfig(Gemma3OpenVINOConfig): + SUPPORTED_BEHAVIORS = [model_type.value for model_type in Gemma4ConfigBehavior] + DUMMY_INPUT_GENERATOR_CLASSES = 
(DummyVisionInputGenerator, DummyTextInputGenerator) + + def __init__( + self, + config: "PretrainedConfig", + task: str = "feature-extraction", + int_dtype: str = "int64", + float_dtype: str = "fp32", + behavior: Gemma4ConfigBehavior = Gemma4ConfigBehavior.VISION_EMBEDDINGS, + preprocessors: Optional[List[Any]] = None, + ): + super().__init__( + config=config, + task=task, + int_dtype=int_dtype, + float_dtype=float_dtype, + preprocessors=preprocessors, + behavior=behavior, + ) + self._behavior = behavior + if self._behavior == Gemma4ConfigBehavior.VISION_EMBEDDINGS: + self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyGemma4VisionInputGenerator,) + # Attach image_seq_length from preprocessor to normalized config so + # the dummy input generator can compute the correct number of patches. + # The vision model's pooling uses shape-dependent Python operations baked in + # during tracing, so the dummy input must match actual inference shapes. + image_seq_length = None + if preprocessors is not None: + for p in preprocessors: + if hasattr(p, "image_processor") and hasattr(p.image_processor, "image_seq_length"): + image_seq_length = p.image_processor.image_seq_length + break + if hasattr(p, "image_processor") and hasattr(p.image_processor, "max_soft_tokens"): + image_seq_length = p.image_processor.max_soft_tokens + break + if image_seq_length is None: + for p in preprocessors: + if hasattr(p, "max_soft_tokens"): + image_seq_length = p.max_soft_tokens + break + if hasattr(p, "image_seq_length"): + image_seq_length = p.image_seq_length + break + if image_seq_length is not None: + self._normalized_config.image_seq_length = image_seq_length + elif self._behavior in ( + Gemma4ConfigBehavior.TEXT_EMBEDDINGS, + Gemma4ConfigBehavior.TEXT_EMBEDDINGS_PER_LAYER, + ): + self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator,) + self._config = config.text_config + self._normalized_config = NormalizedTextConfig(self._config) + + def with_behavior(self, behavior: Union[str, 
Gemma4ConfigBehavior]): + if isinstance(behavior, str) and not isinstance(behavior, Gemma4ConfigBehavior): + behavior = Gemma4ConfigBehavior(behavior) + + if behavior == Gemma4ConfigBehavior.LANGUAGE: + model_type = "gemma4_text" + return get_vlm_text_generation_config( + model_type, + self._orig_config.text_config, + self.int_dtype, + self.float_dtype, + model_patcher=Gemma4LMModelPatcher, + inputs_update={"per_layer_inputs": {0: "batch_size", 1: "sequence_length", 2: "num_hidden_layers"}}, + ) + if behavior == Gemma4ConfigBehavior.TEXT_EMBEDDINGS_PER_LAYER: + config = self.__class__( + self._orig_config, + task=self.task, + int_dtype=self.int_dtype, + float_dtype=self.float_dtype, + behavior=behavior, + preprocessors=self._preprocessors, + ) + return config + return super().with_behavior(behavior) + + def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]): + if behavior == Gemma4ConfigBehavior.TEXT_EMBEDDINGS_PER_LAYER: + import torch + + class PerLayerInputsModule(torch.nn.Module): + def __init__(self, language_model, vocab_size_per_layer_input: int, config): + super().__init__() + self.language_model = language_model + self.vocab_size_per_layer_input = vocab_size_per_layer_input + self.config = config + + def forward(self, input_ids: torch.Tensor): + # Replace multimodal token IDs with pad_token_id to match + # HF Gemma4Model.forward which uses llm_input_ids where + # image/video/audio positions are set to pad_token_id + pad_token_id = self.config.text_config.pad_token_id + per_layer_inputs_tokens = input_ids.clone() + for token_id_attr in ("image_token_id", "video_token_id", "audio_token_id"): + token_id = getattr(self.config, token_id_attr, None) + if token_id is not None: + per_layer_inputs_tokens = torch.where( + per_layer_inputs_tokens == token_id, + torch.full_like(per_layer_inputs_tokens, pad_token_id), + per_layer_inputs_tokens, + ) + per_layer_inputs_mask = torch.logical_and( + per_layer_inputs_tokens >= 0, + 
per_layer_inputs_tokens < self.vocab_size_per_layer_input, + ) + per_layer_inputs_tokens = torch.where( + per_layer_inputs_mask, + per_layer_inputs_tokens, + torch.zeros_like(per_layer_inputs_tokens), + ) + per_layer_inputs = self.language_model.get_per_layer_inputs( + per_layer_inputs_tokens, None + ) + return per_layer_inputs + + model = PerLayerInputsModule( + model.model.language_model, model.config.text_config.vocab_size_per_layer_input, model.config + ) + return model + if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: + return model + if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: + import torch + + class TextEmbeddingsModule(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model + + def forward(self, input_ids: torch.Tensor): + inputs_embeds = self.model.get_input_embeddings()(input_ids) + return inputs_embeds + + text_embedding = TextEmbeddingsModule(model) + text_embedding.config = model.model.language_model.config + return text_embedding + + return super().get_model_for_behavior(model, behavior) + + def patch_model_for_export(self, model, model_kwargs=None): + model_kwargs = model_kwargs or {} + if self._behavior == Gemma4ConfigBehavior.TEXT_EMBEDDINGS_PER_LAYER: + return ModelPatcher(self, model, model_kwargs) + if self._behavior == VLMConfigBehavior.VISION_EMBEDDINGS: + return Gemma4ImageEmbeddingsModelPatcher(self, model, model_kwargs) + return super().patch_model_for_export(model, model_kwargs) + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + if self._behavior == Gemma4ConfigBehavior.LANGUAGE: + return super().inputs + if self._behavior == Gemma4ConfigBehavior.TEXT_EMBEDDINGS_PER_LAYER: + return { + "input_ids": {0: "batch_size", 1: "sequence_length"}, + } + if self._behavior == Gemma4ConfigBehavior.VISION_EMBEDDINGS: + return { + "pixel_values": {0: "batch_size", 1: "num_patches"}, + "image_position_ids": {0: "batch_size", 1: "num_patches"}, + } + return super().inputs + + @property + def 
outputs(self) -> Dict[str, Dict[int, str]]: + if self._behavior == Gemma4ConfigBehavior.TEXT_EMBEDDINGS_PER_LAYER: + return {"text_embeds_per_layer": {}} + return super().outputs + + class DummyVisionPositionIdsInputGenerator(DummyVisionInputGenerator): SUPPORTED_INPUT_NAMES = ("patch_attention_mask", "patch_position_ids") diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 4cf552e982..ae5f0c179d 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -52,7 +52,7 @@ ModelPatcher, gpt_oss_forward, override_arguments, - sdpa_mask_without_vmap, + sdpa_mask_without_vmap as _orig_sdpa_mask_without_vmap, ) from optimum.intel.utils.import_utils import is_diffusers_version, is_torch_version, is_transformers_version @@ -82,6 +82,28 @@ logger = logging.getLogger(__name__) +# Compatibility wrapper for sdpa_mask_without_vmap from optimum. +# The installed optimum version expects (batch_size, cache_position: Tensor, kv_length, ...), +# but transformers >= 5.5 passes (batch_size, q_length: int, kv_length: int, q_offset: int, ...). +def sdpa_mask_without_vmap(batch_size, q_length=None, kv_length=None, q_offset=0, kv_offset=0, **kwargs): + import inspect + + sig = inspect.signature(_orig_sdpa_mask_without_vmap) + if "cache_position" in sig.parameters: + # Old optimum signature: (batch_size, cache_position, kv_length, kv_offset, ...) 
+ cache_position = torch.arange(q_length, dtype=torch.long) + q_offset + kwargs.pop("q_offset", None) + kwargs.pop("allow_is_bidirectional_skip", None) + kwargs.pop("allow_torch_fix", None) + kwargs.pop("use_vmap", None) + kwargs.pop("device", None) + return _orig_sdpa_mask_without_vmap(batch_size, cache_position, kv_length, kv_offset=kv_offset, **kwargs) + else: + return _orig_sdpa_mask_without_vmap( + batch_size, q_length=q_length, kv_length=kv_length, q_offset=q_offset, kv_offset=kv_offset, **kwargs + ) + + def postprocess_past_key_values(past_key_values): if isinstance(past_key_values, (EncoderDecoderCache, DynamicCache)): if hasattr(past_key_values, "to_legacy_cache"): @@ -4818,6 +4840,349 @@ def __exit__(self, exc_type, exc_value, traceback): del self._model.model._orig_update_causual_mask +def _gemma4_project_per_layer_inputs( + self, + inputs_embeds: torch.Tensor, + per_layer_inputs: Optional[torch.Tensor] = None, +) -> torch.Tensor: + per_layer_projection = self.per_layer_model_projection(inputs_embeds) * self.per_layer_model_projection_scale + per_layer_projection = per_layer_projection.reshape( + *inputs_embeds.shape[:-1], + self.config.num_hidden_layers, + self.hidden_size_per_layer_input, + ) + per_layer_projection = self.per_layer_projection_norm(per_layer_projection) + + if per_layer_inputs is None: + return per_layer_projection + + if per_layer_projection.shape != per_layer_inputs.shape: + per_layer_inputs = per_layer_inputs[..., : self.config.num_hidden_layers, :] + + return (per_layer_projection + per_layer_inputs) * self.per_layer_input_scale + + +def gemma4_language_model_forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + input_features: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + input_features_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = 
None, + past_key_values: Optional[Cache] = None, + mm_token_type_ids: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + per_layer_inputs=None, + **lm_kwargs, +): + from transformers.models.gemma4.modeling_gemma4 import Gemma4ModelOutputWithPast + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + # Merge text and images + if pixel_values is not None: + image_features = self.get_image_features(pixel_values) + if hasattr(image_features, "pooler_output"): + image_features = image_features.pooler_output + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + _, special_image_mask, _, _ = self.model.get_placeholder_mask(mm_token_type_ids, input_ids, inputs_embeds) + special_image_mask_expanded = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask_expanded, image_features) + + outputs = self.model.language_model( + input_ids=None, + per_layer_inputs=per_layer_inputs, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + **lm_kwargs, + ) + + return Gemma4ModelOutputWithPast( + last_hidden_state=outputs.last_hidden_state, + past_key_values=outputs.past_key_values if use_cache else 
None, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + ) + + +def gemma4_lm_forward( + self, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + per_layer_inputs=None, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + input_features: Optional[torch.FloatTensor] = None, + input_features_mask: Optional[torch.Tensor] = None, + mm_token_type_ids: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **lm_kwargs, +): + from transformers.models.gemma4.modeling_gemma4 import Gemma4CausalLMOutputWithPast + from optimum.exporters.onnx.model_patcher import preprocess_past_key_values + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = False + + if past_key_values is not None: + use_cache = True + past_key_values = preprocess_past_key_values(past_key_values) + + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + pixel_values_videos=pixel_values_videos, + input_features=input_features, + attention_mask=attention_mask, + input_features_mask=input_features_mask, + position_ids=position_ids, + past_key_values=past_key_values, + mm_token_type_ids=mm_token_type_ids, + cache_position=cache_position, + inputs_embeds=inputs_embeds, + labels=labels, + 
use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + per_layer_inputs=per_layer_inputs, + **lm_kwargs, + ) + + hidden_states = outputs.last_hidden_state + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + tmp_logits = self.lm_head(hidden_states[:, slice_indices, :]) + if (final_logit_softcapping := self.config.get_text_config().final_logit_softcapping) is not None: + tmp_logits = tmp_logits / final_logit_softcapping + tmp_logits = torch.tanh(tmp_logits) + tmp_logits = tmp_logits * final_logit_softcapping + + outputs_dict = { + "logits": tmp_logits, + } + + if use_cache: + key_values = outputs.past_key_values + present_key_values = postprocess_past_key_values(key_values) + outputs_dict["past_key_values"] = present_key_values + return tuple([value if not isinstance(value, list) else tuple(value) for value in outputs_dict.values()]) + + +def gemma4_eager_attention_forward_patched( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + dropout: float = 0.0, + scaling: Optional[float] = None, + softcap: Optional[float] = None, + **kwargs, +) -> tuple: + if scaling is None: + scaling = module.head_dim**-0.5 + + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + + if softcap is not None: + attn_weights = attn_weights / softcap + attn_weights = torch.tanh(attn_weights) + attn_weights = attn_weights * softcap + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + eps = 0.0000001 + + attn_weights = nn.functional.softmax(attn_weights + eps, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, 
p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + return attn_output, attn_weights + + +def gemma4_text_attention_forward( + self, + hidden_states: torch.Tensor, + position_embeddings: torch.Tensor, + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, +) -> tuple: + from transformers.models.gemma4.modeling_gemma4 import apply_rotary_pos_emb as apply_rotary_pos_emb_gemma4 + + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + cos, sin = position_embeddings + + query_states = self.q_proj(hidden_states).view(hidden_shape) + query_states = self.q_norm(query_states) + query_states = apply_rotary_pos_emb_gemma4(query_states, cos, sin, unsqueeze_dim=2) + query_states = query_states.transpose(1, 2) + + if self.is_kv_shared_layer and past_key_values is not None: + key_states, value_states = past_key_values.shared_layers[self.kv_shared_layer_index] + key_states = key_states.to(query_states.device) + value_states = value_states.to(query_states.device) + else: + key_states = self.k_proj(hidden_states).view(hidden_shape) + value_states = self.v_proj(hidden_states).view(hidden_shape) if self.v_proj is not None else key_states + + key_states = self.k_norm(key_states) + key_states = apply_rotary_pos_emb_gemma4(key_states, cos, sin, unsqueeze_dim=2) + key_states = key_states.transpose(1, 2) + + value_states = self.v_norm(value_states) + value_states = value_states.transpose(1, 2) + + if past_key_values is not None: + cache_kwargs = { + "sin": sin, + "cos": cos, + "cache_position": cache_position, + "sliding_window": self.sliding_window, + } + if not self.is_kv_shared_layer: + key_states, value_states = past_key_values.update( + key_states, value_states, self.layer_idx, cache_kwargs + ) + if self.store_full_length_kv: + if not 
hasattr(past_key_values, "shared_layers"): + past_key_values.shared_layers = {} + past_key_values.shared_layers[self.layer_idx] = key_states, value_states + + attention_interface = gemma4_eager_attention_forward_patched + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=self.attention_dropout if self.training else 0.0, + scaling=1.0, + sliding_window=self.sliding_window, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +def _gemma4_moe_block_forward(self, hidden_states, top_k_index, top_k_weights): + # hidden_states: [B*S, hidden_dim] + # top_k_index: [B*S, K], top_k_weights: [B*S, K] + num_tokens = hidden_states.shape[0] + dtype = hidden_states.dtype + + # Compute all expert outputs via batched matmul + # expanded: [E, B*S, hidden_dim] + expanded_hidden = hidden_states.unsqueeze(0).expand(self.num_experts, -1, -1) + + # gate_up_proj: [E, 2*inter, hidden] -> transpose to [E, hidden, 2*inter] + gate_up = torch.bmm(expanded_hidden, self.gate_up_proj.to(dtype).transpose(1, 2)) + gate, up = gate_up.chunk(2, dim=-1) + intermediate = self.act_fn(gate) * up + + # down_proj: [E, hidden, inter] -> transpose to [E, inter, hidden] + expert_outputs = torch.bmm(intermediate, self.down_proj.to(dtype).transpose(1, 2)) + # expert_outputs: [E, B*S, hidden_dim] + + # Apply per-expert scale: [E] -> [E, 1, 1] + expert_outputs = expert_outputs * self.per_expert_scale.to(dtype).unsqueeze(-1).unsqueeze(-1) + + # Build full routing weight matrix [B*S, E] from sparse top-k + full_weights = torch.zeros(num_tokens, self.num_experts, dtype=dtype, device=hidden_states.device) + full_weights.scatter_add_(1, top_k_index, top_k_weights.to(dtype)) + + # Weighted sum over experts: [B*S, 1, E] @ [B*S, E, hidden_dim] -> [B*S, hidden_dim] + expert_outputs = expert_outputs.permute(1, 0, 2) # [B*S, E, 
hidden_dim] + final_hidden_states = torch.bmm(full_weights.unsqueeze(1), expert_outputs).squeeze(1) + + return final_hidden_states + + +class Gemma4LMModelPatcher(Gemma3LMModelPatcher): + def __init__(self, config, model, model_kwargs): + super().__init__(config, model, model_kwargs) + + self.patched_forward = gemma4_lm_forward + self.model_orig_forward = self.orig_forward + self.orig_forward = gemma4_lm_forward + + self.model_orig_language_model_forward = self._model.model.forward + + def __enter__(self): + super().__enter__() + + setattr(self._model, self.orig_forward_name, types.MethodType(gemma4_lm_forward, self._model)) + setattr(self._model.model, "forward", types.MethodType(gemma4_language_model_forward, self._model)) + + self._model.model.language_model._orig_project_per_layer_inputs = ( + self._model.model.language_model.project_per_layer_inputs + ) + self._model.model.language_model.project_per_layer_inputs = types.MethodType( + _gemma4_project_per_layer_inputs, self._model.model.language_model + ) + + for decoder_layer in self._model.model.language_model.layers: + decoder_layer.self_attn.orig_forward = decoder_layer.self_attn.forward + decoder_layer.self_attn.forward = types.MethodType(gemma4_text_attention_forward, decoder_layer.self_attn) + if hasattr(decoder_layer, "moe"): + decoder_layer.moe._orig_forward = decoder_layer.moe.forward + decoder_layer.moe.forward = types.MethodType(_gemma4_moe_block_forward, decoder_layer.moe) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.model.language_model.project_per_layer_inputs = ( + self._model.model.language_model._orig_project_per_layer_inputs + ) + + for decoder_layer in self._model.model.language_model.layers: + decoder_layer.self_attn.forward = decoder_layer.self_attn.orig_forward + if hasattr(decoder_layer, "moe") and hasattr(decoder_layer.moe, "_orig_forward"): + decoder_layer.moe.forward = decoder_layer.moe._orig_forward + + 
setattr(self._model, self.orig_forward_name, self.model_orig_forward) + setattr(self._model.model, "forward", self.model_orig_language_model_forward) + + class Idefics3ImageEmbeddingsModelPatcher(ModelPatcher): def __init__( self, @@ -6528,7 +6893,10 @@ def __init__( model: "PreTrainedModel", model_kwargs: Optional[Dict[str, Any]] = None, ): - from transformers.models.mamba.modeling_mamba import MambaCache + try: + from transformers.models.mamba.modeling_mamba import MambaCache + except ImportError: + MambaCache = object super().__init__(config, model, model_kwargs) @@ -8423,3 +8791,79 @@ def __exit__(self, exc_type, exc_value, traceback): sparse_moe_block = decoder_layer.mlp decoder_layer.mlp.forward = decoder_layer.mlp._orig_forward del sparse_moe_block.down_projs, sparse_moe_block.gate_projs, sparse_moe_block.up_projs + + +class Gemma4PerLayerInputsGetterModelPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel"], + model_kwargs: Dict[str, Any] = None, + ): + model.__orig_forward = model.forward + + def per_layer_inputs_forward(self, input_ids: torch.Tensor) -> torch.Tensor: + per_layer_inputs_mask = torch.logical_and(input_ids >= 0, input_ids < self.vocab_size_per_layer_input) + per_layer_inputs_tokens = torch.where(per_layer_inputs_mask, input_ids, torch.zeros_like(input_ids)) + per_layer_inputs = self.language_model.get_per_layer_inputs(per_layer_inputs_tokens, None) + return per_layer_inputs + + model.forward = types.MethodType(per_layer_inputs_forward, model) + super().__init__(config, model, model_kwargs) + + def __enter__(self): + super().__enter__() + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.forward = self._model.__orig_forward + + +class Gemma4ImageEmbeddingsModelPatcher(CommonImageEmbeddingsModelPatcher): + def __init__(self, config, model, model_kwargs): + super().__init__(config, model, model_kwargs) + # Get the vision 
encoder - it's at model.model.vision_tower.encoder + vision_model = model.model.vision_tower if is_transformers_version(">=", "5") else model.vision_tower + self._vision_encoder = vision_model.encoder + + # Patch the vision encoder forward to bypass create_bidirectional_mask, + # which is not compatible with torch.jit.trace due to dynamic masking logic. + # Instead, we construct a simple 4D bidirectional attention mask from the + # 2D padding mask to properly mask out padding patches. + orig_encoder_forward = self._vision_encoder.forward + + def patched_encoder_forward(inputs_embeds, attention_mask=None, pixel_position_ids=None, **kwargs): + hidden_states = inputs_embeds + position_embeddings = self._vision_encoder.rotary_emb(hidden_states, pixel_position_ids) + + # Build a 4D bidirectional attention mask from the 2D boolean mask. + # attention_mask is [batch, seq_len] with True=valid, False=padding. + # Decoder layers expect a 4D mask [batch, 1, seq_len, seq_len] where + # 0 = attend and large negative = masked. 
+ attn_mask_4d = None + if attention_mask is not None: + min_dtype = torch.finfo(hidden_states.dtype).min + # [batch, 1, 1, seq_len] key mask + key_mask = attention_mask[:, None, None, :].to(hidden_states.dtype) + # Convert: 1.0 for valid tokens, min_dtype for padding + attn_mask_4d = (1.0 - key_mask) * min_dtype + + for decoder_layer in self._vision_encoder.layers[: self._vision_encoder.config.num_hidden_layers]: + hidden_states = decoder_layer( + hidden_states, + attention_mask=attn_mask_4d, + position_embeddings=position_embeddings, + position_ids=pixel_position_ids, + **kwargs, + ) + + from transformers.modeling_outputs import BaseModelOutputWithPast + + return BaseModelOutputWithPast(last_hidden_state=hidden_states) + + self._orig_encoder_forward = orig_encoder_forward + self._vision_encoder.forward = patched_encoder_forward + + def __exit__(self, exc_type, exc_value, traceback): + self._vision_encoder.forward = self._orig_encoder_forward + super().__exit__(exc_type, exc_value, traceback) diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index af2f1edaba..61134d1c58 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -297,6 +297,7 @@ def get_submodels(model): "qwen3_vl", "got_ocr2", "gemma3", + "gemma4", "idefics3", "smolvlm", "phi4mm", diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index ccf177df9d..9ac0d15612 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -31,9 +31,13 @@ from transformers.generation.stopping_criteria import StoppingCriteriaList from transformers.generation.utils import GenerateOutput, GenerationMode from transformers.modeling_outputs import CausalLMOutputWithPast, ModelOutput -from transformers.models.mamba.modeling_mamba import MambaCache from transformers.utils.hub import PushToHubMixin +try: + from transformers.models.mamba.modeling_mamba import 
MambaCache +except ImportError: + MambaCache = object + from optimum.utils.normalized_config import NormalizedConfigManager from ...exporters.openvino import ensure_stateful_is_available, main_export, patch_stateful diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index beb7b974eb..ac021b23fe 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -218,6 +218,12 @@ def prepare_inputs( inputs["beam_idx"] = ( self.next_beam_idx if self.next_beam_idx is not None else np.arange(batch_size, dtype=int) ) + + if "per_layer_inputs" in self.input_names: + per_layer_inputs = kwargs.pop("per_layer_inputs", None) + assert per_layer_inputs is not None, "Expected 'per_layer_inputs', but it was not passed" + inputs["per_layer_inputs"] = torch.Tensor(per_layer_inputs) + return inputs def forward( @@ -347,6 +353,7 @@ def forward(self, audio_feature, audio_mask): MODEL_PARTS_CLS_MAPPING = { "resampler": OVResampler, "language_model": OVModelWithEmbedForCausalLM, + "text_embeddings_per_layer": OVVisionProjection, "vision_embeddings": OVVisionEmbedding, "vision_projection": OVVisionProjection, "vision_resampler": OVVisionResampler, @@ -785,8 +792,11 @@ def forward( additional_kwargs["visual_pos_masks"] = extra_outputs[0] additional_kwargs["deepstack_visual_embeds"] = extra_outputs[1] + if self.config.model_type in ("gemma4",) and extra_outputs: + additional_kwargs["per_layer_inputs"] = extra_outputs[0] + return self.language_model.forward( - input_ids=None, + input_ids=input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, position_ids=position_ids, @@ -3937,6 +3947,80 @@ def _update_model_kwargs_for_generation( return model_kwargs +class _OVGemma4ForCausalLM(_OVGemma3ForCausalLM): + additional_parts = ["text_embeddings_per_layer"] + + def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs): + if input_ids is not None 
and input_ids.shape[1] == 1: + return None + return self.vision_embeddings(pixel_values, **kwargs).last_hidden_state + + def get_multimodal_embeddings( + self, input_ids, pixel_values=None, attention_mask=None, position_ids=None, **kwargs + ): + embeds_from_args = kwargs.pop("inputs_embeds", None) + inputs_embeds = ( + embeds_from_args if embeds_from_args is not None else self.get_text_embeddings(input_ids, **kwargs) + ) + per_layer_inputs = self.text_embeddings_per_layer(input_ids) + if pixel_values is not None: + vision_embeds = self.get_vision_embeddings(pixel_values, input_ids=input_ids, **kwargs) + + if vision_embeds is not None: + inputs_embeds, attention_mask, position_ids = self.merge_vision_text_embeddings( + vision_embeds, + inputs_embeds, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + **kwargs, + ) + return inputs_embeds, attention_mask, position_ids, per_layer_inputs + + def merge_vision_text_embeddings( + self, vision_embeds, inputs_embeds, input_ids=None, attention_mask=None, position_ids=None, **kwargs + ): + image_features = torch.from_numpy(vision_embeds) if isinstance(vision_embeds, np.ndarray) else vision_embeds + inputs_embeds = torch.from_numpy(inputs_embeds) if isinstance(inputs_embeds, np.ndarray) else inputs_embeds + if input_ids is None: + special_image_mask = inputs_embeds == torch.from_numpy( + self.get_text_embeddings(torch.tensor([[self.config.image_token_id]], dtype=torch.long))[0] + ) + else: + special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1) + special_image_mask = special_image_mask.expand_as(inputs_embeds) + + image_features = image_features.to(inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + return inputs_embeds, attention_mask, position_ids + + def prepare_inputs_for_generation( + self, input_ids, mm_token_type_ids=None, image_position_ids=None, **kwargs + ): + model_inputs = 
super().prepare_inputs_for_generation(input_ids, **kwargs) + model_inputs["mm_token_type_ids"] = mm_token_type_ids + model_inputs["image_position_ids"] = image_position_ids + return model_inputs + + def _update_model_kwargs_for_generation( + self, + outputs, + model_kwargs, + is_encoder_decoder=False, + num_new_tokens=1, + ): + model_kwargs = super()._update_model_kwargs_for_generation( + outputs=outputs, + model_kwargs=model_kwargs, + is_encoder_decoder=is_encoder_decoder, + num_new_tokens=num_new_tokens, + ) + model_kwargs.pop("mm_token_type_ids", None) + model_kwargs.pop("image_position_ids", None) + return model_kwargs + + class _OVGotOCR2ForCausalLM(OVModelForVisualCausalLM): def get_vision_embeddings(self, pixel_values, input_ids, **kwargs): if input_ids is not None and input_ids.shape[1] == 1 and kwargs.get("past_key_values") is not None: @@ -4817,6 +4901,7 @@ def preprocess_inputs( "qwen2_5_vl_text": _OVQwen2_5_VLForCausalLM, "got_ocr2": _OVGotOCR2ForCausalLM, "gemma3": _OVGemma3ForCausalLM, + "gemma4": _OVGemma4ForCausalLM, "idefics3": _OVIdefics3ForCausalLM, "smolvlm": _OVSmolVLForCasualLM, "phi4mm": _OVPhi4MMForCausalLM, From 2c57fee968cb4a463efac1200cd89fcbcd9d3929 Mon Sep 17 00:00:00 2001 From: "Kazantsev, Roman" Date: Thu, 2 Apr 2026 22:10:17 +0400 Subject: [PATCH 188/222] Fix formatting Signed-off-by: Kazantsev, Roman --- optimum/exporters/openvino/model_configs.py | 8 +++----- optimum/exporters/openvino/model_patcher.py | 7 +++---- optimum/intel/openvino/configuration.py | 6 ++---- optimum/intel/openvino/modeling_decoder.py | 1 + optimum/intel/openvino/modeling_visual_language.py | 4 +--- 5 files changed, 10 insertions(+), 16 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 8a27ccfa0b..7126a8ad35 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -150,7 +150,6 @@ Gemma3LMModelPatcher, Gemma4ImageEmbeddingsModelPatcher, 
Gemma4LMModelPatcher, - Gemma4PerLayerInputsGetterModelPatcher, GptJModelPatcher, GptNeoModelPatcher, GptNeoxModelPatcher, @@ -4347,9 +4346,10 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int dtype=float_dtype, ) if input_name == "image_position_ids": - import torch import math + import torch + # Create position ids as a grid. The patch count = h_patches * w_patches # where both are divisible by pooling_kernel_size for correct pooling. k = self.pooling_kernel_size @@ -4490,9 +4490,7 @@ def forward(self, input_ids: torch.Tensor): per_layer_inputs_tokens, torch.zeros_like(per_layer_inputs_tokens), ) - per_layer_inputs = self.language_model.get_per_layer_inputs( - per_layer_inputs_tokens, None - ) + per_layer_inputs = self.language_model.get_per_layer_inputs(per_layer_inputs_tokens, None) return per_layer_inputs model = PerLayerInputsModule( diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index ae5f0c179d..6a841db50e 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -52,6 +52,8 @@ ModelPatcher, gpt_oss_forward, override_arguments, +) +from optimum.exporters.onnx.model_patcher import ( sdpa_mask_without_vmap as _orig_sdpa_mask_without_vmap, ) from optimum.intel.utils.import_utils import is_diffusers_version, is_torch_version, is_transformers_version @@ -4946,7 +4948,6 @@ def gemma4_lm_forward( logits_to_keep: Union[int, torch.Tensor] = 0, **lm_kwargs, ): - from transformers.models.gemma4.modeling_gemma4 import Gemma4CausalLMOutputWithPast from optimum.exporters.onnx.model_patcher import preprocess_past_key_values output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -5078,9 +5079,7 @@ def gemma4_text_attention_forward( "sliding_window": self.sliding_window, } if not self.is_kv_shared_layer: - key_states, value_states = past_key_values.update( - key_states, 
value_states, self.layer_idx, cache_kwargs - ) + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) if self.store_full_length_kv: if not hasattr(past_key_values, "shared_layers"): past_key_values.shared_layers = {} diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index f1a1044ebf..e621efc396 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -763,13 +763,11 @@ def __init__( self._dataset_kwargs[key] = int(value) except ValueError: raise ValueError( - f"Invalid value '{value}' for seq_len in dataset spec '{dataset}'. " - f"Expected an integer." + f"Invalid value '{value}' for seq_len in dataset spec '{dataset}'. Expected an integer." ) else: raise ValueError( - f"Unsupported dataset option '{key}' in dataset spec '{dataset}'. " - f"Only 'seq_len' is supported." + f"Unsupported dataset option '{key}' in dataset spec '{dataset}'. Only 'seq_len' is supported." 
) else: # No options or list-of-str dataset diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 9ac0d15612..8691f6b887 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -33,6 +33,7 @@ from transformers.modeling_outputs import CausalLMOutputWithPast, ModelOutput from transformers.utils.hub import PushToHubMixin + try: from transformers.models.mamba.modeling_mamba import MambaCache except ImportError: diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index ac021b23fe..c09243a03b 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -3995,9 +3995,7 @@ def merge_vision_text_embeddings( return inputs_embeds, attention_mask, position_ids - def prepare_inputs_for_generation( - self, input_ids, mm_token_type_ids=None, image_position_ids=None, **kwargs - ): + def prepare_inputs_for_generation(self, input_ids, mm_token_type_ids=None, image_position_ids=None, **kwargs): model_inputs = super().prepare_inputs_for_generation(input_ids, **kwargs) model_inputs["mm_token_type_ids"] = mm_token_type_ids model_inputs["image_position_ids"] = image_position_ids From 5b0b29b4cdcac3ed1dc26803b2504170e8358fb0 Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Thu, 16 Apr 2026 14:08:54 +0400 Subject: [PATCH 189/222] Apply suggestion from @rkazants --- optimum/exporters/openvino/model_patcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index c3ec9e1151..fbb9c3c9a9 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -134,7 +134,7 @@ def sdpa_mask_without_vmap(batch_size, q_length=None, kv_length=None, q_offset=0 return _orig_sdpa_mask_without_vmap(batch_size, cache_position, 
kv_length, kv_offset=kv_offset, **kwargs) else: return _orig_sdpa_mask_without_vmap( - batch_size, q_length=q_length, kv_length=kv_length, q_offset=q_offset, kv_offset=kv_offset, **kwargs + batch_size, q_length=q_length, kv_length=kv_length, q_offset=q_offset, kv_offset=kv_offset, **kwargs) for idx, spec in enumerate(UNSUPPORTED_OPS_PATCHING_SPEC): if spec.name in { # onnx-exporter-specific fixes From 282724651da3e3079d43819e7ae9f2773034378b Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 16 Apr 2026 12:44:29 +0200 Subject: [PATCH 190/222] Remove wrong changes. --- .github/workflows/test_offline.yaml | 2 +- optimum/exporters/openvino/model_configs.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/test_offline.yaml b/.github/workflows/test_offline.yaml index 48f07b9396..830d77e1c3 100644 --- a/.github/workflows/test_offline.yaml +++ b/.github/workflows/test_offline.yaml @@ -34,7 +34,7 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip uv - uv pip install .[tests] + uv pip install .[diffusers,tests] - name: Test run: | diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 642d992052..abbcd40bd1 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -876,8 +876,6 @@ class ExaoneOpenVINOConfig(LlamaOpenVINOConfig): ) class Exaone4OpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.54.0" - # TODO (@echarlaix): add v5 support - MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( From bff96195d18a96d7b082ede1b517e4d3c633cc3e Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 16 Apr 2026 12:50:28 +0200 Subject: [PATCH 191/222] Remove wrong changes. 
--- optimum/exporters/openvino/model_configs.py | 4 ++-- tests/openvino/test_decoder.py | 6 +----- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index abbcd40bd1..24779f2a2a 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -876,6 +876,8 @@ class ExaoneOpenVINOConfig(LlamaOpenVINOConfig): ) class Exaone4OpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.54.0" + # TODO (@echarlaix): add v5 support + MAX_TRANSFORMERS_VERSION = "4.57.6" @register_in_tasks_manager( @@ -1098,8 +1100,6 @@ class Phi3OpenVINOConfig(PhiOnnxConfig): ) class PhiMoEOpenVINOConfig(Phi3OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.46.0" - # TODO (@echarlaix): add v5 support - MAX_TRANSFORMERS_VERSION = "4.57.6" _MODEL_PATCHER = PhiMoEModelPatcher diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 91696d8a46..4a9f3b23fb 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -104,15 +104,11 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("cohere2",) if is_transformers_version(">=", "4.46.0"): - SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo") + SUPPORTED_ARCHITECTURES += ("glm", "mistral-nemo", "phimoe") if is_transformers_version("<", "4.54.0"): SUPPORTED_ARCHITECTURES += ("deepseek",) - # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly - if is_transformers_version("<", "5"): - SUPPORTED_ARCHITECTURES += ("phimoe",) - # gptq and awq install disabled for windows test environment if platform.system() != "Windows" and is_transformers_version("<", "4.56.0"): SUPPORTED_ARCHITECTURES += ("opt_gptq", "mixtral_awq") From 387dc8cef86befc527b810a8b1c1b7cf6d1b036c Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 16 Apr 2026 13:05:39 +0200 Subject: [PATCH 192/222] Remove wrong changes. 
--- optimum/exporters/openvino/model_patcher.py | 9 --------- optimum/intel/openvino/configuration.py | 6 ++++-- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index fbb9c3c9a9..0500fe1a0d 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -1768,15 +1768,6 @@ def __enter__(self): "long_mscale", None ) - if is_transformers_version("<", "5"): - for layer in self._model.model.layers: - layer.block_sparse_moe._orig_forward = layer.block_sparse_moe.forward - layer.block_sparse_moe.forward = types.MethodType( - _phi_moe_sparse_moe_block_forward, layer.block_sparse_moe - ) - else: - self._model.set_experts_implementation("batched_mm") - def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index e621efc396..f1a1044ebf 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -763,11 +763,13 @@ def __init__( self._dataset_kwargs[key] = int(value) except ValueError: raise ValueError( - f"Invalid value '{value}' for seq_len in dataset spec '{dataset}'. Expected an integer." + f"Invalid value '{value}' for seq_len in dataset spec '{dataset}'. " + f"Expected an integer." ) else: raise ValueError( - f"Unsupported dataset option '{key}' in dataset spec '{dataset}'. Only 'seq_len' is supported." + f"Unsupported dataset option '{key}' in dataset spec '{dataset}'. " + f"Only 'seq_len' is supported." ) else: # No options or list-of-str dataset From 14d057a30863811c0913373a2d10fd0c66076ef3 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 16 Apr 2026 13:34:56 +0200 Subject: [PATCH 193/222] Seq2seq test. 
--- tests/openvino/test_seq2seq.py | 17 ++++++++++++++++- tests/openvino/utils_tests.py | 1 + 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 9b246fb871..6ca18cd232 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -607,6 +607,9 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): # remote code models incompatible after transformers v5 SUPPORTED_ARCHITECTURES += ["internvl_chat", "minicpmv"] + if is_transformers_version(">=", "5.5"): + SUPPORTED_ARCHITECTURES += ["gemma4"] + # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version("<", "5"): SUPPORTED_ARCHITECTURES += ("llava_next_video",) @@ -770,8 +773,13 @@ def compare_outputs(inputs, ov_model, transformers_model, generation_config): set_seed(SEED) with torch.no_grad(): transformers_outputs = transformers_model(**transformers_inputs) + # Gemma4 performs poorly with random weights. + # The full model "google/gemma-4-E2B-it" passes this test with 4e-2 eps, but + # after saving it with random weights the converted model generates logits with max difference around 5. + # On the tiny model the error is about 0.1. + eps = 0.2 if model_arch == "gemma4" else 4e-3 self.assertTrue( - torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=4e-3), + torch.allclose(ov_outputs.logits, transformers_outputs.logits.to(torch.float32), atol=eps), f"Max abs diff {(torch.abs(ov_outputs.logits - transformers_outputs.logits).max())}", ) @@ -783,6 +791,13 @@ def compare_outputs(inputs, ov_model, transformers_model, generation_config): ov_model.generation_config.do_sample = False # minicpmo diverges after 20 tokens tokens_to_generate = 20 if model_arch == "minicpmo" else 30 + + # Gemma4 performs much poorly with random weights. 
+ # The full model "google/gemma-4-E2B-it" passes this test, while the same architecture + # saved with random weights generates tokens that do not match transformers. + if model_arch == "gemma4": + tokens_to_generate = 1 + gen_config = GenerationConfig( max_new_tokens=tokens_to_generate, min_new_tokens=tokens_to_generate, diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 13ba840bb8..36bb4ead6c 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -87,6 +87,7 @@ "got_ocr2": "optimum-intel-internal-testing/tiny-random-got-ocr2-hf", "gemma3_text": "optimum-intel-internal-testing/tiny-random-gemma3-text", "gemma3": "optimum-intel-internal-testing/tiny-random-gemma3", + "gemma4": "optimum-intel-internal-testing/tiny-random-gemma4", "falcon": "optimum-intel-internal-testing/really-tiny-falcon-testing", "falcon-40b": "optimum-intel-internal-testing/tiny-random-falcon-40b", "falcon_mamba": "optimum-intel-internal-testing/tiny-falcon-mamba", From a117774550341b1e446c954d7dd0e94babfb62be Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 16 Apr 2026 14:05:43 +0200 Subject: [PATCH 194/222] Quantization tests. 
--- optimum/exporters/openvino/model_patcher.py | 5 ++++- tests/openvino/test_quantization.py | 3 +++ tests/openvino/utils_tests.py | 6 ++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 0500fe1a0d..ee85a36aef 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -134,7 +134,10 @@ def sdpa_mask_without_vmap(batch_size, q_length=None, kv_length=None, q_offset=0 return _orig_sdpa_mask_without_vmap(batch_size, cache_position, kv_length, kv_offset=kv_offset, **kwargs) else: return _orig_sdpa_mask_without_vmap( - batch_size, q_length=q_length, kv_length=kv_length, q_offset=q_offset, kv_offset=kv_offset, **kwargs) + batch_size, q_length=q_length, kv_length=kv_length, q_offset=q_offset, kv_offset=kv_offset, **kwargs + ) + + for idx, spec in enumerate(UNSUPPORTED_OPS_PATCHING_SPEC): if spec.name in { # onnx-exporter-specific fixes diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 63c13de956..f0b5f4bcd1 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -1089,6 +1089,9 @@ class OVWeightCompressionTest(unittest.TestCase): ] ) + if is_transformers_version(">=", "5.5.0"): + SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "gemma4", True)) + SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = [ (OVStableDiffusionPipeline, "stable-diffusion", 72, 195), (OVStableDiffusionXLPipeline, "stable-diffusion-xl", 84, 331), diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 36bb4ead6c..1f063de6b8 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -373,6 +373,12 @@ "hunyuan_v1_dense": {"model": 32}, "qwen3_eagle3": {"model": 20}, "qwen3_next": {"model": 100}, + "gemma4": { + "lm_model": 58, + "text_embeddings_model": 1, + "vision_embeddings_model": 
10, + "text_embeddings_per_layer_model": 1, + }, } TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg" From ebb1755e0be3d6231959222da89f41158e566935 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 16 Apr 2026 14:06:33 +0200 Subject: [PATCH 195/222] Docs update. --- docs/source/openvino/models.mdx | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx index e967e9d22e..ce659dfae5 100644 --- a/docs/source/openvino/models.mdx +++ b/docs/source/openvino/models.mdx @@ -73,6 +73,7 @@ Here is the list of the supported architectures : - Gemma - Gemma 2 - Gemma 3 +- Gemma 4 - GOT-OCR 2.0 - Granite - Granite 4.0 From bb43c2445f262cac952812936808063b75d741d0 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 16 Apr 2026 15:31:22 +0200 Subject: [PATCH 196/222] Removed redundant import. --- optimum/exporters/openvino/model_patcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index ee85a36aef..781b842738 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -63,7 +63,6 @@ from optimum.exporters.onnx.model_patcher import ( sdpa_mask_without_vmap as _orig_sdpa_mask_without_vmap, ) -from optimum.intel.utils.import_utils import is_diffusers_version, is_torch_version, is_transformers_version from ._ov_ops import convert_recurrent_attention_cell From 1fccd408ad0f29e9f4ed7f80d3414c0bc8f8bda0 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 17 Apr 2026 10:17:35 +0200 Subject: [PATCH 197/222] Fix Gemma 4 26B-A4B (MoE) export and inference --- optimum/exporters/openvino/model_configs.py | 15 +++++++++++++- optimum/exporters/openvino/model_patcher.py | 22 +++++++++------------ optimum/intel/openvino/configuration.py | 22 +++++++++++++++++++++ 3 files changed, 45 insertions(+), 14 deletions(-) diff --git 
a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 24779f2a2a..fa341c9e5a 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1518,6 +1518,10 @@ def __init__( self.layer_types = normalized_config.config.layer_types self.num_kv_shared_layers = normalized_config.config.num_kv_shared_layers self.sliding_window = normalized_config.config.sliding_window + # Full-attention layers use fewer KV heads than sliding-attention layers (e.g. 2 vs 8 for 26B-A4B) + self.num_global_key_value_heads = getattr( + normalized_config.config, "num_global_key_value_heads", None + ) or self.num_key_value_heads def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): # some layers do not produce their own KV-cache, they use the shared KV-cache @@ -1537,7 +1541,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int else: shape = ( self.batch_size, - self.num_key_value_heads, + self.num_global_key_value_heads, self.sequence_length, self.global_head_dim, ) @@ -4462,6 +4466,15 @@ def __init__(self, language_model, vocab_size_per_layer_input: int, config): self.config = config def forward(self, input_ids: torch.Tensor): + # 26B-A4B has hidden_size_per_layer_input=0 (PLE disabled) + if self.language_model.config.hidden_size_per_layer_input <= 0: + return torch.zeros( + input_ids.shape[0], + input_ids.shape[1], + self.language_model.config.num_hidden_layers, + 0, + dtype=torch.float32, + ) # Replace multimodal token IDs with pad_token_id to match # HF Gemma4Model.forward which uses llm_input_ids where # image/video/audio positions are set to pad_token_id diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 781b842738..3ff3c2866a 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -5192,27 +5192,23 
@@ def _gemma4_moe_block_forward(self, hidden_states, top_k_index, top_k_weights): # hidden_states: [B*S, hidden_dim] # top_k_index: [B*S, K], top_k_weights: [B*S, K] num_tokens = hidden_states.shape[0] - dtype = hidden_states.dtype # Compute all expert outputs via batched matmul # expanded: [E, B*S, hidden_dim] expanded_hidden = hidden_states.unsqueeze(0).expand(self.num_experts, -1, -1) # gate_up_proj: [E, 2*inter, hidden] -> transpose to [E, hidden, 2*inter] - gate_up = torch.bmm(expanded_hidden, self.gate_up_proj.to(dtype).transpose(1, 2)) + gate_up = torch.bmm(expanded_hidden, self.gate_up_proj.transpose(1, 2)) gate, up = gate_up.chunk(2, dim=-1) intermediate = self.act_fn(gate) * up # down_proj: [E, hidden, inter] -> transpose to [E, inter, hidden] - expert_outputs = torch.bmm(intermediate, self.down_proj.to(dtype).transpose(1, 2)) + expert_outputs = torch.bmm(intermediate, self.down_proj.transpose(1, 2)) # expert_outputs: [E, B*S, hidden_dim] - # Apply per-expert scale: [E] -> [E, 1, 1] - expert_outputs = expert_outputs * self.per_expert_scale.to(dtype).unsqueeze(-1).unsqueeze(-1) - # Build full routing weight matrix [B*S, E] from sparse top-k - full_weights = torch.zeros(num_tokens, self.num_experts, dtype=dtype, device=hidden_states.device) - full_weights.scatter_add_(1, top_k_index, top_k_weights.to(dtype)) + full_weights = torch.zeros(num_tokens, self.num_experts, dtype=hidden_states.dtype, device=hidden_states.device) + full_weights.scatter_add_(1, top_k_index, top_k_weights) # Weighted sum over experts: [B*S, 1, E] @ [B*S, E, hidden_dim] -> [B*S, hidden_dim] expert_outputs = expert_outputs.permute(1, 0, 2) # [B*S, E, hidden_dim] @@ -5247,9 +5243,9 @@ def __enter__(self): for decoder_layer in self._model.model.language_model.layers: decoder_layer.self_attn.orig_forward = decoder_layer.self_attn.forward decoder_layer.self_attn.forward = types.MethodType(gemma4_text_attention_forward, decoder_layer.self_attn) - if hasattr(decoder_layer, "moe"): - 
decoder_layer.moe._orig_forward = decoder_layer.moe.forward - decoder_layer.moe.forward = types.MethodType(_gemma4_moe_block_forward, decoder_layer.moe) + if hasattr(decoder_layer, "experts"): + decoder_layer.experts._orig_forward = decoder_layer.experts.forward + decoder_layer.experts.forward = types.MethodType(_gemma4_moe_block_forward, decoder_layer.experts) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -5259,8 +5255,8 @@ def __exit__(self, exc_type, exc_value, traceback): for decoder_layer in self._model.model.language_model.layers: decoder_layer.self_attn.forward = decoder_layer.self_attn.orig_forward - if hasattr(decoder_layer, "moe") and hasattr(decoder_layer.moe, "_orig_forward"): - decoder_layer.moe.forward = decoder_layer.moe._orig_forward + if hasattr(decoder_layer, "experts") and hasattr(decoder_layer.experts, "_orig_forward"): + decoder_layer.experts.forward = decoder_layer.experts._orig_forward setattr(self._model, self.orig_forward_name, self.model_orig_forward) setattr(self._model.model, "forward", self.model_orig_language_model_forward) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index f1a1044ebf..1531d6e4aa 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -436,6 +436,18 @@ class OVQuantizationMethod(str, Enum): "dataset": "contextual", "scale_estimation": True, }, + "google/gemma-4-26B-A4B-it": { + "bits": 4, + "sym": False, + "group_size": 64, + "group_size_fallback": "adjust", + }, + "google/gemma-4-26B-A4B": { + "bits": 4, + "sym": False, + "group_size": 64, + "group_size_fallback": "adjust", + }, } _DEFAULT_8BIT_WQ_CONFIGS = { @@ -567,6 +579,16 @@ class OVQuantizationMethod(str, Enum): ], }, }, + "google/gemma-4-26B-A4B-it": { + "lm_model": { + "patterns": [".*router.*"], + }, + }, + "google/gemma-4-26B-A4B": { + "lm_model": { + "patterns": [".*router.*"], + }, + }, } From 
46752941a182e5868681ec4b0bdabc575f3f5461 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 22 Apr 2026 13:18:17 +0200 Subject: [PATCH 198/222] Fix accuracy on tiny model. --- optimum/exporters/openvino/model_patcher.py | 29 ++++++++++++++++++--- tests/openvino/test_seq2seq.py | 15 +++-------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 3ff3c2866a..f23c80aabf 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -5110,9 +5110,8 @@ def gemma4_eager_attention_forward_patched( if attention_mask is not None: causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] attn_weights = attn_weights + causal_mask - eps = 0.0000001 - attn_weights = nn.functional.softmax(attn_weights + eps, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) attn_output = torch.matmul(attn_weights, value_states) attn_output = attn_output.transpose(1, 2).contiguous() @@ -5178,7 +5177,7 @@ def gemma4_text_attention_forward( value_states, attention_mask, dropout=self.attention_dropout if self.training else 0.0, - scaling=1.0, + scaling=self.scaling, sliding_window=self.sliding_window, **kwargs, ) @@ -8891,9 +8890,18 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.forward = self._model.__orig_forward +# OpenVINO has a bug due to which Clamp(-inf, inf) doesn't work correctly: CVS-185473. +# When min == -inf and max == inf, Clamp is equivalent to an identity operation and +# can be removed from the model, which serves as a workaround for the issue. 
+def patched_gemma4_clippable_linear_forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.linear(hidden_states) + return hidden_states + class Gemma4ImageEmbeddingsModelPatcher(CommonImageEmbeddingsModelPatcher): def __init__(self, config, model, model_kwargs): super().__init__(config, model, model_kwargs) + from transformers.models.gemma4.modeling_gemma4 import Gemma4ClippableLinear + # Get the vision encoder - it's at model.model.vision_tower.encoder vision_model = model.model.vision_tower if is_transformers_version(">=", "5") else model.vision_tower self._vision_encoder = vision_model.encoder @@ -8936,6 +8944,21 @@ def patched_encoder_forward(inputs_embeds, attention_mask=None, pixel_position_i self._orig_encoder_forward = orig_encoder_forward self._vision_encoder.forward = patched_encoder_forward + for layer in self._vision_encoder.layers: + for module in layer.modules(): + if isinstance(module, Gemma4ClippableLinear) and module.use_clipped_linears: + if module.input_min == -float("inf") and module.input_max == float("inf") and module.output_min == -float("inf") and module.output_max == float("inf"): + module.orig_forward = module.forward + module.forward = types.MethodType(patched_gemma4_clippable_linear_forward, module) + def __exit__(self, exc_type, exc_value, traceback): + from transformers.models.gemma4.modeling_gemma4 import Gemma4ClippableLinear self._vision_encoder.forward = self._orig_encoder_forward super().__exit__(exc_type, exc_value, traceback) + + for layer in self._vision_encoder.layers: + for module in layer.modules(): + if isinstance(module, Gemma4ClippableLinear) and module.use_clipped_linears: + if module.input_min == -float("inf") and module.input_max == float("inf") and module.output_min == -float( + "inf") and module.output_max == float("inf"): + module.forward = module.orig_forward diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 6ca18cd232..08ae063518 100644 --- 
a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -686,6 +686,8 @@ def test_find_untested_architectures(self): @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): + if model_arch != "gemma4": + return def compare_outputs(inputs, ov_model, transformers_model, generation_config): transformers_inputs = copy.deepcopy(inputs) ov_outputs = ov_model.generate(**inputs, generation_config=generation_config) @@ -773,13 +775,8 @@ def compare_outputs(inputs, ov_model, transformers_model, generation_config): set_seed(SEED) with torch.no_grad(): transformers_outputs = transformers_model(**transformers_inputs) - # Gemma4 performs poorly with random weights. - # The full model "google/gemma-4-E2B-it" passes this test with 4e-2 eps, but - # after saving it with random weights the converted model generates logits with max difference around 5. - # On the tiny model the error is about 0.1. - eps = 0.2 if model_arch == "gemma4" else 4e-3 self.assertTrue( - torch.allclose(ov_outputs.logits, transformers_outputs.logits.to(torch.float32), atol=eps), + torch.allclose(ov_outputs.logits, transformers_outputs.logits.to(torch.float32), atol=4e-3), f"Max abs diff {(torch.abs(ov_outputs.logits - transformers_outputs.logits).max())}", ) @@ -792,12 +789,6 @@ def compare_outputs(inputs, ov_model, transformers_model, generation_config): # minicpmo diverges after 20 tokens tokens_to_generate = 20 if model_arch == "minicpmo" else 30 - # Gemma4 performs much poorly with random weights. - # The full model "google/gemma-4-E2B-it" passes this test, while the same architecture - # saved with random weights generates tokens that do not match transformers. 
- if model_arch == "gemma4": - tokens_to_generate = 1 - gen_config = GenerationConfig( max_new_tokens=tokens_to_generate, min_new_tokens=tokens_to_generate, From 19c605cd900a781ce93efbeccbcf260c6266f494 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 22 Apr 2026 13:20:27 +0200 Subject: [PATCH 199/222] Code style. --- optimum/exporters/openvino/model_configs.py | 6 +++--- optimum/exporters/openvino/model_patcher.py | 17 ++++++++++++++--- tests/openvino/test_seq2seq.py | 2 -- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index fa341c9e5a..2396ab4767 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1519,9 +1519,9 @@ def __init__( self.num_kv_shared_layers = normalized_config.config.num_kv_shared_layers self.sliding_window = normalized_config.config.sliding_window # Full-attention layers use fewer KV heads than sliding-attention layers (e.g. 
2 vs 8 for 26B-A4B) - self.num_global_key_value_heads = getattr( - normalized_config.config, "num_global_key_value_heads", None - ) or self.num_key_value_heads + self.num_global_key_value_heads = ( + getattr(normalized_config.config, "num_global_key_value_heads", None) or self.num_key_value_heads + ) def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): # some layers do not produce their own KV-cache, they use the shared KV-cache diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index f23c80aabf..79bb35e79c 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -8897,6 +8897,7 @@ def patched_gemma4_clippable_linear_forward(self, hidden_states: torch.Tensor) - hidden_states = self.linear(hidden_states) return hidden_states + class Gemma4ImageEmbeddingsModelPatcher(CommonImageEmbeddingsModelPatcher): def __init__(self, config, model, model_kwargs): super().__init__(config, model, model_kwargs) @@ -8947,18 +8948,28 @@ def patched_encoder_forward(inputs_embeds, attention_mask=None, pixel_position_i for layer in self._vision_encoder.layers: for module in layer.modules(): if isinstance(module, Gemma4ClippableLinear) and module.use_clipped_linears: - if module.input_min == -float("inf") and module.input_max == float("inf") and module.output_min == -float("inf") and module.output_max == float("inf"): + if ( + module.input_min == -float("inf") + and module.input_max == float("inf") + and module.output_min == -float("inf") + and module.output_max == float("inf") + ): module.orig_forward = module.forward module.forward = types.MethodType(patched_gemma4_clippable_linear_forward, module) def __exit__(self, exc_type, exc_value, traceback): from transformers.models.gemma4.modeling_gemma4 import Gemma4ClippableLinear + self._vision_encoder.forward = self._orig_encoder_forward super().__exit__(exc_type, exc_value, 
traceback) for layer in self._vision_encoder.layers: for module in layer.modules(): if isinstance(module, Gemma4ClippableLinear) and module.use_clipped_linears: - if module.input_min == -float("inf") and module.input_max == float("inf") and module.output_min == -float( - "inf") and module.output_max == float("inf"): + if ( + module.input_min == -float("inf") + and module.input_max == float("inf") + and module.output_min == -float("inf") + and module.output_max == float("inf") + ): module.forward = module.orig_forward diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 08ae063518..14cfb3f7f5 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -686,8 +686,6 @@ def test_find_untested_architectures(self): @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): - if model_arch != "gemma4": - return def compare_outputs(inputs, ov_model, transformers_model, generation_config): transformers_inputs = copy.deepcopy(inputs) ov_outputs = ov_model.generate(**inputs, generation_config=generation_config) From 7bd6e48e68eeb59f06024b749556d6a80645c4cf Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 22 Apr 2026 14:22:17 +0200 Subject: [PATCH 200/222] Test export. 
--- optimum/exporters/openvino/model_patcher.py | 7 ++----- tests/openvino/test_export.py | 3 +++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 79bb35e79c..55d9ccdeab 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -52,6 +52,7 @@ ModelPatcher, gpt_oss_forward, override_arguments, + sdpa_mask_without_vmap as _orig_sdpa_mask_without_vmap ) from optimum.intel.utils.import_utils import ( is_diffusers_version, @@ -60,10 +61,6 @@ is_transformers_version, ) -from optimum.exporters.onnx.model_patcher import ( - sdpa_mask_without_vmap as _orig_sdpa_mask_without_vmap, -) - from ._ov_ops import convert_recurrent_attention_cell @@ -122,7 +119,7 @@ def sdpa_mask_without_vmap(batch_size, q_length=None, kv_length=None, q_offset=0 import inspect sig = inspect.signature(_orig_sdpa_mask_without_vmap) - if "cache_position" in sig.parameters: + if is_transformers_version(">=", "5.5") and "cache_position" in sig.parameters and q_length is not None: # Old optimum signature: (batch_size, cache_position, kv_length, kv_offset, ...) 
cache_position = torch.arange(q_length, dtype=torch.long) + q_offset kwargs.pop("q_offset", None) diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index e9c7696c2d..f2828be1d4 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -110,6 +110,9 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version(">=", "4.55.0") and is_transformers_version("<", "4.58.0"): SUPPORTED_ARCHITECTURES.update({"afmoe": OVModelForCausalLM}) + if is_transformers_version(">=", "4.55.0"): + SUPPORTED_ARCHITECTURES.update({"gemma4": OVModelForVisualCausalLM}) + if is_transformers_version(">=", "4.57.0"): SUPPORTED_ARCHITECTURES.update({"hunyuan_v1_dense": OVModelForCausalLM}) From 18fbdc4ac33d6185590e25c68ed493e9e2d84701 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 22 Apr 2026 16:16:36 +0200 Subject: [PATCH 201/222] Minor fix. --- tests/openvino/test_export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index f2828be1d4..acd03802c9 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -110,7 +110,7 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version(">=", "4.55.0") and is_transformers_version("<", "4.58.0"): SUPPORTED_ARCHITECTURES.update({"afmoe": OVModelForCausalLM}) - if is_transformers_version(">=", "4.55.0"): + if is_transformers_version(">=", "5.5.0"): SUPPORTED_ARCHITECTURES.update({"gemma4": OVModelForVisualCausalLM}) if is_transformers_version(">=", "4.57.0"): From ac8f8d890a297ff1ece904c2a684351ae20246fd Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 23 Apr 2026 14:33:33 +0200 Subject: [PATCH 202/222] Fix use_bidirectional_attention. 
--- optimum/exporters/openvino/model_configs.py | 7 +- optimum/exporters/openvino/model_patcher.py | 85 ++++++++++++++++++- .../openvino/modeling_visual_language.py | 31 ++++++- 3 files changed, 116 insertions(+), 7 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 2396ab4767..a9f56f292b 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -4434,13 +4434,18 @@ def with_behavior(self, behavior: Union[str, Gemma4ConfigBehavior]): if behavior == Gemma4ConfigBehavior.LANGUAGE: model_type = "gemma4_text" + inputs_update = { + "per_layer_inputs": {0: "batch_size", 1: "sequence_length", 2: "num_hidden_layers"}, + } + if getattr(self._orig_config.get_text_config(), "use_bidirectional_attention", None) == "vision": + inputs_update["token_type_ids"] = {0: "batch_size", 1: "sequence_length"} return get_vlm_text_generation_config( model_type, self._orig_config.text_config, self.int_dtype, self.float_dtype, model_patcher=Gemma4LMModelPatcher, - inputs_update={"per_layer_inputs": {0: "batch_size", 1: "sequence_length", 2: "num_hidden_layers"}}, + inputs_update=inputs_update, ) if behavior == Gemma4ConfigBehavior.TEXT_EMBEDDINGS_PER_LAYER: config = self.__class__( diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 55d9ccdeab..727885912b 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4945,6 +4945,75 @@ def _gemma4_project_per_layer_inputs( return (per_layer_projection + per_layer_inputs) * self.per_layer_input_scale +def _create_gemma4_bidirectional_mask_dict(attention_mask_2d, mm_token_type_ids, inputs_embeds, sliding_window): + """ + Creates a dict of causal masks with bidirectional attention for vision tokens + on sliding_attention layers, matching the behavior of transformers' + create_causal_mask_mapping when 
use_bidirectional_attention == "vision". + + Args: + attention_mask_2d: [batch, total_len] 2D attention mask (1=attend, 0=pad) + mm_token_type_ids: [batch, total_len] token type ids (0=text, 1=image, 2=video/audio) + inputs_embeds: [batch, seq_len, hidden_size] + sliding_window: int, sliding window size + + Returns: + dict with "full_attention" and "sliding_attention" 4D masks + """ + dtype = inputs_embeds.dtype + device = inputs_embeds.device + min_dtype = torch.finfo(dtype).min + + batch_size = inputs_embeds.shape[0] + seq_len = inputs_embeds.shape[1] + target_len = attention_mask_2d.shape[-1] + past_len = target_len - seq_len + + # Standard causal mask [seq_len, target_len] + causal_mask = torch.full((seq_len, target_len), min_dtype, dtype=dtype, device=device) + if seq_len != 1: + causal_mask = torch.triu(causal_mask, diagonal=past_len + 1) + + # Apply padding from attention_mask_2d + padding_mask = (1.0 - attention_mask_2d[:, None, None, :].to(dtype=dtype, device=device)) * min_dtype + full_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + padding_mask + + # Sliding window causal mask + sliding_mask = full_mask.clone() + if sliding_window is not None: + row_pos = torch.arange(seq_len, device=device).unsqueeze(1) + past_len + col_pos = torch.arange(target_len, device=device).unsqueeze(0) + beyond_window = (row_pos - col_pos) >= sliding_window + sliding_mask = sliding_mask.masked_fill(beyond_window[None, None, :, :], min_dtype) + + # Apply bidirectional masking for vision tokens (only on sliding_attention mask) + # mm_token_type_ids: [batch, total_len] - 0=text, 1=image, 2=video/audio + is_vision = (mm_token_type_ids == 1) | (mm_token_type_ids == 2) + + # Group contiguous vision tokens (trace-friendly, no in-place ops) + # Shift is_vision right by 1 position, padding with False on the left + is_prev_vision = torch.nn.functional.pad(is_vision[:, :-1].to(dtype=torch.int32), (1, 0), value=0).bool() + new_vision_starts = is_vision & 
~is_prev_vision + vision_group_ids = torch.cumsum(new_vision_starts.to(dtype=torch.int32), dim=1) - 1 + vision_group_ids = torch.where(is_vision, vision_group_ids, torch.tensor(-1, dtype=torch.int32, device=device)) + + # Query group IDs correspond to positions [past_len : past_len + seq_len] + query_groups = vision_group_ids[:, past_len : past_len + seq_len] # [batch, seq_len] + key_groups = vision_group_ids # [batch, total_len] + + # same_group[b, q, k] = True iff query and key are in the same non-text vision group + same_group = (query_groups.unsqueeze(2) == key_groups.unsqueeze(1)) & (key_groups.unsqueeze(1) >= 0) + same_group = same_group.unsqueeze(1) # [batch, 1, seq_len, total_len] + + # Undo masking for same-group vision tokens in sliding mask + sliding_mask = sliding_mask.masked_fill(same_group, 0.0) + + return { + "full_attention": full_mask, + "sliding_attention": sliding_mask, + } + + def gemma4_language_model_forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -4985,6 +5054,18 @@ def gemma4_language_model_forward( special_image_mask_expanded = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds) inputs_embeds = inputs_embeds.masked_scatter(special_image_mask_expanded, image_features) + # Create bidirectional causal mask mapping when use_bidirectional_attention == "vision" + use_bidirectional = ( + getattr(self.config.get_text_config(), "use_bidirectional_attention", None) == "vision" + ) + if use_bidirectional and mm_token_type_ids is not None: + attention_mask = _create_gemma4_bidirectional_mask_dict( + attention_mask, + mm_token_type_ids, + inputs_embeds, + self.model.language_model.config.sliding_window, + ) + outputs = self.model.language_model( input_ids=None, per_layer_inputs=per_layer_inputs, @@ -5015,12 +5096,12 @@ def gemma4_lm_forward( past_key_values: Optional[Cache] = None, inputs_embeds: Optional[torch.FloatTensor] = None, per_layer_inputs=None, + token_type_ids: Optional[torch.LongTensor] = None, input_ids: 
Optional[torch.LongTensor] = None, pixel_values: Optional[torch.FloatTensor] = None, pixel_values_videos: Optional[torch.FloatTensor] = None, input_features: Optional[torch.FloatTensor] = None, input_features_mask: Optional[torch.Tensor] = None, - mm_token_type_ids: Optional[torch.LongTensor] = None, cache_position: Optional[torch.LongTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, @@ -5050,7 +5131,7 @@ def gemma4_lm_forward( input_features_mask=input_features_mask, position_ids=position_ids, past_key_values=past_key_values, - mm_token_type_ids=mm_token_type_ids, + mm_token_type_ids=token_type_ids, cache_position=cache_position, inputs_embeds=inputs_embeds, labels=labels, diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index c09243a03b..c83010684b 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -211,7 +211,11 @@ def prepare_inputs( if "token_type_ids" in self.input_names: if token_type_ids is None: - token_type_ids = np.zeros(inputs_embeds.shape[:2], dtype=int) + # Use attention_mask shape to match total sequence length (including past tokens) + if attention_mask is not None: + token_type_ids = np.zeros(attention_mask.shape, dtype=int) + else: + token_type_ids = np.zeros(inputs_embeds.shape[:2], dtype=int) inputs["token_type_ids"] = token_type_ids if "beam_idx" in self.input_names: @@ -3995,12 +3999,31 @@ def merge_vision_text_embeddings( return inputs_embeds, attention_mask, position_ids - def prepare_inputs_for_generation(self, input_ids, mm_token_type_ids=None, image_position_ids=None, **kwargs): - model_inputs = super().prepare_inputs_for_generation(input_ids, **kwargs) - model_inputs["mm_token_type_ids"] = mm_token_type_ids + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, inputs_embeds=None, pixel_values=None, + image_sizes=None, 
attention_mask=None, mm_token_type_ids=None, image_position_ids=None, **kwargs + ): + model_inputs = super().prepare_inputs_for_generation( + input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, + pixel_values=pixel_values, image_sizes=image_sizes, attention_mask=attention_mask, **kwargs + ) + # Map mm_token_type_ids to token_type_ids for the OV language model input + model_inputs["token_type_ids"] = mm_token_type_ids model_inputs["image_position_ids"] = image_position_ids return model_inputs + def forward(self, input_ids, pixel_values=None, token_type_ids=None, **kwargs): + # Map mm_token_type_ids (from Gemma4 processor) to token_type_ids (OV language model input) + mm_token_type_ids = kwargs.pop("mm_token_type_ids", None) + if token_type_ids is None and mm_token_type_ids is not None: + token_type_ids = mm_token_type_ids + return super().forward( + input_ids=input_ids, + pixel_values=pixel_values, + token_type_ids=token_type_ids, + **kwargs, + ) + def _update_model_kwargs_for_generation( self, outputs, From ea694279d092bdffe1e510132989fd56112718e5 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 23 Apr 2026 15:05:45 +0200 Subject: [PATCH 203/222] MoE tests. 
--- optimum/exporters/openvino/model_patcher.py | 6 ++---- .../openvino/modeling_visual_language.py | 21 +++++++++++++++---- tests/openvino/test_export.py | 1 + tests/openvino/test_quantization.py | 1 + tests/openvino/test_seq2seq.py | 2 +- tests/openvino/utils_tests.py | 9 +++++++- 6 files changed, 30 insertions(+), 10 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 727885912b..4e61cb3d69 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -52,7 +52,7 @@ ModelPatcher, gpt_oss_forward, override_arguments, - sdpa_mask_without_vmap as _orig_sdpa_mask_without_vmap + sdpa_mask_without_vmap as _orig_sdpa_mask_without_vmap, ) from optimum.intel.utils.import_utils import ( is_diffusers_version, @@ -5055,9 +5055,7 @@ def gemma4_language_model_forward( inputs_embeds = inputs_embeds.masked_scatter(special_image_mask_expanded, image_features) # Create bidirectional causal mask mapping when use_bidirectional_attention == "vision" - use_bidirectional = ( - getattr(self.config.get_text_config(), "use_bidirectional_attention", None) == "vision" - ) + use_bidirectional = getattr(self.config.get_text_config(), "use_bidirectional_attention", None) == "vision" if use_bidirectional and mm_token_type_ids is not None: attention_mask = _create_gemma4_bidirectional_mask_dict( attention_mask, diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index c83010684b..89965be093 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -4000,12 +4000,25 @@ def merge_vision_text_embeddings( return inputs_embeds, attention_mask, position_ids def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, inputs_embeds=None, pixel_values=None, - image_sizes=None, attention_mask=None, mm_token_type_ids=None, 
image_position_ids=None, **kwargs + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + image_sizes=None, + attention_mask=None, + mm_token_type_ids=None, + image_position_ids=None, + **kwargs, ): model_inputs = super().prepare_inputs_for_generation( - input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, - pixel_values=pixel_values, image_sizes=image_sizes, attention_mask=attention_mask, **kwargs + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + pixel_values=pixel_values, + image_sizes=image_sizes, + attention_mask=attention_mask, + **kwargs, ) # Map mm_token_type_ids to token_type_ids for the OV language model input model_inputs["token_type_ids"] = mm_token_type_ids diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index acd03802c9..6ce46f2484 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -112,6 +112,7 @@ class ExportModelTest(unittest.TestCase): if is_transformers_version(">=", "5.5.0"): SUPPORTED_ARCHITECTURES.update({"gemma4": OVModelForVisualCausalLM}) + SUPPORTED_ARCHITECTURES.update({"gemma4_moe": OVModelForVisualCausalLM}) if is_transformers_version(">=", "4.57.0"): SUPPORTED_ARCHITECTURES.update({"hunyuan_v1_dense": OVModelForCausalLM}) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index f0b5f4bcd1..c8404f01c3 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -1091,6 +1091,7 @@ class OVWeightCompressionTest(unittest.TestCase): if is_transformers_version(">=", "5.5.0"): SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "gemma4", True)) + SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "gemma4_moe", True)) SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = [ (OVStableDiffusionPipeline, "stable-diffusion", 72, 195), diff --git a/tests/openvino/test_seq2seq.py 
b/tests/openvino/test_seq2seq.py index 14cfb3f7f5..a8e9e7f6a5 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -608,7 +608,7 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin): SUPPORTED_ARCHITECTURES += ["internvl_chat", "minicpmv"] if is_transformers_version(">=", "5.5"): - SUPPORTED_ARCHITECTURES += ["gemma4"] + SUPPORTED_ARCHITECTURES += ["gemma4", "gemma4_moe"] # TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly if is_transformers_version("<", "5"): diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 1f063de6b8..236deda76c 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -88,6 +88,7 @@ "gemma3_text": "optimum-intel-internal-testing/tiny-random-gemma3-text", "gemma3": "optimum-intel-internal-testing/tiny-random-gemma3", "gemma4": "optimum-intel-internal-testing/tiny-random-gemma4", + "gemma4_moe": "optimum-intel-internal-testing/tiny-random-gemma4-moe", "falcon": "optimum-intel-internal-testing/really-tiny-falcon-testing", "falcon-40b": "optimum-intel-internal-testing/tiny-random-falcon-40b", "falcon_mamba": "optimum-intel-internal-testing/tiny-falcon-mamba", @@ -374,11 +375,17 @@ "qwen3_eagle3": {"model": 20}, "qwen3_next": {"model": 100}, "gemma4": { - "lm_model": 58, + "lm_model": 54, "text_embeddings_model": 1, "vision_embeddings_model": 10, "text_embeddings_per_layer_model": 1, }, + "gemma4_moe": { + "lm_model": 44, + "text_embeddings_model": 1, + "vision_embeddings_model": 10, + "text_embeddings_per_layer_model": 0, + }, } TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg" From a97bf7d819da198d3ebdd3caedc46a323d3d1e8b Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 23 Apr 2026 15:09:45 +0200 Subject: [PATCH 204/222] Code style. 
--- optimum/exporters/openvino/model_patcher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 4e61cb3d69..792191100d 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -52,6 +52,8 @@ ModelPatcher, gpt_oss_forward, override_arguments, +) +from optimum.exporters.onnx.model_patcher import ( sdpa_mask_without_vmap as _orig_sdpa_mask_without_vmap, ) from optimum.intel.utils.import_utils import ( @@ -61,7 +63,6 @@ is_transformers_version, ) - from ._ov_ops import convert_recurrent_attention_cell From 32712e3f5196705c3e5c227d5d3e885b3a6706c2 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 28 Apr 2026 13:50:21 +0200 Subject: [PATCH 205/222] Workflow to test gemma4 on transformers==5.5 --- .../test_openvino_transformers_5_5.yml | 65 +++++++++++++++++++ pyproject.toml | 1 + 2 files changed, 66 insertions(+) create mode 100644 .github/workflows/test_openvino_transformers_5_5.yml diff --git a/.github/workflows/test_openvino_transformers_5_5.yml b/.github/workflows/test_openvino_transformers_5_5.yml new file mode 100644 index 0000000000..205fa00b9b --- /dev/null +++ b/.github/workflows/test_openvino_transformers_5_5.yml @@ -0,0 +1,65 @@ +name: OpenVINO - Test Gemma4 + +on: + workflow_dispatch: + push: + branches: + - main + - v*-release + pull_request: + branches: + - main + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read + +env: + UV_TORCH_BACKEND: cpu + UV_SYSTEM_PYTHON: true + TRANSFORMERS_IS_CI: true + HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + +jobs: + build: + strategy: + fail-fast: false + matrix: + test-pattern: + [ + "*export*", + "*seq2seq*", + "*quantization*", + ] + + runs-on: ubuntu-22.04 + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python + 
uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install dependencies + run: | + pip install --upgrade pip uv + uv pip install .[tests] librosa diffusers + + - name: Install transformers 5.5 + run: | + uv pip install transformers==5.5 + + - name: Login with fork PRs CI token + if: ${{ env.HF_TOKEN == '' }} + run: | + python tests/scripts/login_with_ci_token.py + + - name: Test Gemma4 with Pytest + run: | + pytest tests/openvino/${{ matrix.test-pattern }} -m gemma4 --durations=0 diff --git a/pyproject.toml b/pyproject.toml index bc066641fd..a201ccf730 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,4 +36,5 @@ known-first-party = ["optimum"] [tool.pytest.ini_options] markers = [ "run_slow", + "gemma4: tests for gemma4 and gemma4_moe architectures (require transformers>=5.5)", ] \ No newline at end of file From e1a1325dd27f5f915f073039302b8775e797c88e Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 28 Apr 2026 13:58:45 +0200 Subject: [PATCH 206/222] Workflow to test gemma4 on transformers==5.5 --- tests/openvino/conftest.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 tests/openvino/conftest.py diff --git a/tests/openvino/conftest.py b/tests/openvino/conftest.py new file mode 100644 index 0000000000..94fec0215a --- /dev/null +++ b/tests/openvino/conftest.py @@ -0,0 +1,10 @@ +import pytest + + +def pytest_collection_modifyitems(items): + """Dynamically add the 'gemma4' marker to every parameterized test whose + name contains 'gemma4' (this also covers 'gemma4_moe').""" + gemma4_marker = pytest.mark.gemma4 + for item in items: + if "gemma4" in item.name: + item.add_marker(gemma4_marker) From 28fb7f03f67389668f02faa91ff48240c9c9acce Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 28 Apr 2026 14:12:47 +0200 Subject: [PATCH 207/222] Workflow to test gemma4 on transformers==5.5 --- .github/workflows/test_openvino_transformers_5_5.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/.github/workflows/test_openvino_transformers_5_5.yml b/.github/workflows/test_openvino_transformers_5_5.yml index 205fa00b9b..10dee0ec9f 100644 --- a/.github/workflows/test_openvino_transformers_5_5.yml +++ b/.github/workflows/test_openvino_transformers_5_5.yml @@ -51,9 +51,9 @@ jobs: pip install --upgrade pip uv uv pip install .[tests] librosa diffusers - - name: Install transformers 5.5 + - name: Install transformers 5.5.0 run: | - uv pip install transformers==5.5 + uv pip install transformers==5.5.0 - name: Login with fork PRs CI token if: ${{ env.HF_TOKEN == '' }} From bdde55bcd9c035843588a3a5dfa17d235f352594 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 28 Apr 2026 16:43:11 +0200 Subject: [PATCH 208/222] Use latest openvino nightly for Gemma4. --- .github/workflows/test_openvino_transformers_5_5.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test_openvino_transformers_5_5.yml b/.github/workflows/test_openvino_transformers_5_5.yml index 10dee0ec9f..4aff0e5f43 100644 --- a/.github/workflows/test_openvino_transformers_5_5.yml +++ b/.github/workflows/test_openvino_transformers_5_5.yml @@ -51,9 +51,10 @@ jobs: pip install --upgrade pip uv uv pip install .[tests] librosa diffusers - - name: Install transformers 5.5.0 + - name: Install transformers 5.5.0 and latest openvino nightly run: | uv pip install transformers==5.5.0 + uv pip install --pre -U openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - name: Login with fork PRs CI token if: ${{ env.HF_TOKEN == '' }} From 601e6643d3fff37b84d7367f8c8ae3670fd0f699 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 28 Apr 2026 17:34:08 +0200 Subject: [PATCH 209/222] Fix compression test. 
--- tests/openvino/conftest.py | 5 +++-- tests/openvino/test_quantization.py | 10 ++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/tests/openvino/conftest.py b/tests/openvino/conftest.py index 94fec0215a..bf929db0c5 100644 --- a/tests/openvino/conftest.py +++ b/tests/openvino/conftest.py @@ -1,10 +1,11 @@ import pytest -def pytest_collection_modifyitems(items): +@pytest.hookimpl(tryfirst=True) +def pytest_collection_modifyitems(config, items): """Dynamically add the 'gemma4' marker to every parameterized test whose name contains 'gemma4' (this also covers 'gemma4_moe').""" gemma4_marker = pytest.mark.gemma4 for item in items: - if "gemma4" in item.name: + if "gemma4" in item.nodeid: item.add_marker(gemma4_marker) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index c8404f01c3..cdda7f00f0 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -1328,7 +1328,10 @@ def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_name, e self.assertEqual(OVWeightQuantizationConfig().to_dict(), loaded_config.quantization_config.to_dict()) self.assertFalse(model.model.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"])) - @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION) + @parameterized.expand( + SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION, + name_func=lambda testcase_func, param_num, params: f"{testcase_func.__name__}_{parameterized.to_safe_name(params.args[1])}", + ) def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type, trust_remote_code): model = model_cls.from_pretrained( MODEL_NAMES[model_type], @@ -1550,7 +1553,10 @@ def test_ovmodel_stateful_load_with_compressed_weights(self, model_cls, model_ty expected_int8 = {k: {"int8": v} for k, v in expected_int8.items()} check_compression_state_per_model(self, model.ov_models, expected_int8) - @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION) + 
@parameterized.expand( + SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION, + name_func=lambda testcase_func, param_num, params: f"{testcase_func.__name__}_{parameterized.to_safe_name(params.args[1])}", + ) def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type, trust_remote_code): model = model_cls.from_pretrained( MODEL_NAMES[model_type], export=True, load_in_8bit=False, trust_remote_code=trust_remote_code From 9cc279d84644c0938593b59f77e3eb10faef260f Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 29 Apr 2026 11:01:56 +0200 Subject: [PATCH 210/222] Workflow corrections. --- ..._5_5.yml => test_openvino_preview_models.yml} | 16 ++++++++-------- tests/openvino/test_seq2seq.py | 2 ++ 2 files changed, 10 insertions(+), 8 deletions(-) rename .github/workflows/{test_openvino_transformers_5_5.yml => test_openvino_preview_models.yml} (93%) diff --git a/.github/workflows/test_openvino_transformers_5_5.yml b/.github/workflows/test_openvino_preview_models.yml similarity index 93% rename from .github/workflows/test_openvino_transformers_5_5.yml rename to .github/workflows/test_openvino_preview_models.yml index 4aff0e5f43..ba60048e0b 100644 --- a/.github/workflows/test_openvino_transformers_5_5.yml +++ b/.github/workflows/test_openvino_preview_models.yml @@ -46,21 +46,21 @@ jobs: with: python-version: "3.10" + - name: Login with fork PRs CI token + if: ${{ env.HF_TOKEN == '' }} + run: | + python tests/scripts/login_with_ci_token.py + - name: Install dependencies run: | pip install --upgrade pip uv uv pip install .[tests] librosa diffusers - - name: Install transformers 5.5.0 and latest openvino nightly + - name: Install latest openvino nightly run: | - uv pip install transformers==5.5.0 uv pip install --pre -U openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - - name: Login with fork PRs CI token - if: ${{ env.HF_TOKEN == '' }} - run: | - python 
tests/scripts/login_with_ci_token.py - - - name: Test Gemma4 with Pytest + - name: Preview Models Support run: | + uv pip install transformers==5.5.0 pytest tests/openvino/${{ matrix.test-pattern }} -m gemma4 --durations=0 diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 29b18f450f..23b3e73199 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -689,6 +689,8 @@ def test_find_untested_architectures(self): @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): + if model_arch == "gemma4": + return if model_arch in ("llama4", "minicpmv", "minicpmo") and is_openvino_version(">=", "2026.1.0"): self.skipTest("CVS-185350: OpenVINO 2026.1.0 inference results mismatch") From df8b17ff32fe7004fe807bb84d6db9c54631cb5d Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 29 Apr 2026 11:03:28 +0200 Subject: [PATCH 211/222] Wrong change. --- tests/openvino/test_seq2seq.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 23b3e73199..29b18f450f 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -689,8 +689,6 @@ def test_find_untested_architectures(self): @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): - if model_arch == "gemma4": - return if model_arch in ("llama4", "minicpmv", "minicpmo") and is_openvino_version(">=", "2026.1.0"): self.skipTest("CVS-185350: OpenVINO 2026.1.0 inference results mismatch") From 6a30dcb7c073e7b06b25b06fd5d2e72b51cbf6b2 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 29 Apr 2026 11:05:52 +0200 Subject: [PATCH 212/222] Minor correction. 
--- .github/workflows/test_openvino_preview_models.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test_openvino_preview_models.yml b/.github/workflows/test_openvino_preview_models.yml index ba60048e0b..7a3fd5f1f4 100644 --- a/.github/workflows/test_openvino_preview_models.yml +++ b/.github/workflows/test_openvino_preview_models.yml @@ -46,16 +46,16 @@ jobs: with: python-version: "3.10" - - name: Login with fork PRs CI token - if: ${{ env.HF_TOKEN == '' }} - run: | - python tests/scripts/login_with_ci_token.py - - name: Install dependencies run: | pip install --upgrade pip uv uv pip install .[tests] librosa diffusers + - name: Login with fork PRs CI token + if: ${{ env.HF_TOKEN == '' }} + run: | + python tests/scripts/login_with_ci_token.py + - name: Install latest openvino nightly run: | uv pip install --pre -U openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly From 018dbe6b9d388cdcd00d88dbc30972760b10e861 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 29 Apr 2026 12:27:21 +0200 Subject: [PATCH 213/222] Refactor token_type_ids passing. 
--- optimum/exporters/openvino/model_patcher.py | 5 +++-- optimum/intel/openvino/modeling_visual_language.py | 6 +----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 52587bb177..d1bb749da9 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4972,12 +4972,13 @@ def _create_gemma4_bidirectional_mask_dict(attention_mask_2d, mm_token_type_ids, # Standard causal mask [seq_len, target_len] causal_mask = torch.full((seq_len, target_len), min_dtype, dtype=dtype, device=device) - if seq_len != 1: - causal_mask = torch.triu(causal_mask, diagonal=past_len + 1) + causal_mask = torch.triu(causal_mask, diagonal=past_len + 1) # Apply padding from attention_mask_2d padding_mask = (1.0 - attention_mask_2d[:, None, None, :].to(dtype=dtype, device=device)) * min_dtype full_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + padding_mask + mm_token_type_ids = torch.nn.functional.pad(mm_token_type_ids, + (0, target_len - mm_token_type_ids.shape[-1]), value=0) # Sliding window causal mask sliding_mask = full_mask.clone() diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 89965be093..94d15c9aee 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -211,11 +211,7 @@ def prepare_inputs( if "token_type_ids" in self.input_names: if token_type_ids is None: - # Use attention_mask shape to match total sequence length (including past tokens) - if attention_mask is not None: - token_type_ids = np.zeros(attention_mask.shape, dtype=int) - else: - token_type_ids = np.zeros(inputs_embeds.shape[:2], dtype=int) + token_type_ids = np.zeros(inputs_embeds.shape[:2], dtype=int) inputs["token_type_ids"] = token_type_ids if "beam_idx" in self.input_names: From 
9f281b412c43d6e4ce39e135c3c0b584c191b329 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 29 Apr 2026 15:38:30 +0200 Subject: [PATCH 214/222] Update .github/workflows/test_openvino_preview_models.yml Co-authored-by: Roman Kazantsev --- .github/workflows/test_openvino_preview_models.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_openvino_preview_models.yml b/.github/workflows/test_openvino_preview_models.yml index 7a3fd5f1f4..56702b46a2 100644 --- a/.github/workflows/test_openvino_preview_models.yml +++ b/.github/workflows/test_openvino_preview_models.yml @@ -60,7 +60,7 @@ jobs: run: | uv pip install --pre -U openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - - name: Preview Models Support + - name: Gemma 4 Validation run: | uv pip install transformers==5.5.0 pytest tests/openvino/${{ matrix.test-pattern }} -m gemma4 --durations=0 From 21b6ce8d3f0f51ac9966dfcf51afb3cfa38171f0 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 29 Apr 2026 15:38:42 +0200 Subject: [PATCH 215/222] Update .github/workflows/test_openvino_preview_models.yml Co-authored-by: Roman Kazantsev --- .github/workflows/test_openvino_preview_models.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_openvino_preview_models.yml b/.github/workflows/test_openvino_preview_models.yml index 56702b46a2..0e162e1f42 100644 --- a/.github/workflows/test_openvino_preview_models.yml +++ b/.github/workflows/test_openvino_preview_models.yml @@ -1,4 +1,4 @@ -name: OpenVINO - Test Gemma4 +name: Preview Models Support Validation on: workflow_dispatch: From 0c2f4111d33eb2845e69a00328716ac68fdb486c Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 29 Apr 2026 16:35:00 +0200 Subject: [PATCH 216/222] Minor corrections. 
--- optimum/intel/openvino/modeling_visual_language.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 94d15c9aee..42a4866c38 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -211,7 +211,7 @@ def prepare_inputs( if "token_type_ids" in self.input_names: if token_type_ids is None: - token_type_ids = np.zeros(inputs_embeds.shape[:2], dtype=int) + token_type_ids = np.zeros(inputs_embeds.shape[:2], dtype=int) inputs["token_type_ids"] = token_type_ids if "beam_idx" in self.input_names: @@ -796,7 +796,7 @@ def forward( additional_kwargs["per_layer_inputs"] = extra_outputs[0] return self.language_model.forward( - input_ids=input_ids, + input_ids=None, inputs_embeds=inputs_embeds, attention_mask=attention_mask, position_ids=position_ids, From 87f4becd7a47af5454eabb6628dadc4e9f08a24c Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 29 Apr 2026 16:36:21 +0200 Subject: [PATCH 217/222] Code style. 
--- optimum/exporters/openvino/model_patcher.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index d1bb749da9..16b72526f6 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4977,8 +4977,9 @@ def _create_gemma4_bidirectional_mask_dict(attention_mask_2d, mm_token_type_ids, # Apply padding from attention_mask_2d padding_mask = (1.0 - attention_mask_2d[:, None, None, :].to(dtype=dtype, device=device)) * min_dtype full_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + padding_mask - mm_token_type_ids = torch.nn.functional.pad(mm_token_type_ids, - (0, target_len - mm_token_type_ids.shape[-1]), value=0) + mm_token_type_ids = torch.nn.functional.pad( + mm_token_type_ids, (0, target_len - mm_token_type_ids.shape[-1]), value=0 + ) # Sliding window causal mask sliding_mask = full_mask.clone() From a27437815040776deabf0b2cc7af989ab873032f Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 30 Apr 2026 10:33:49 +0200 Subject: [PATCH 218/222] Reuse LFM2 MoE patching. 
--- optimum/exporters/openvino/model_patcher.py | 31 +-------------------- 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 16b72526f6..3e66401bec 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -5266,35 +5266,6 @@ def gemma4_text_attention_forward( return attn_output, attn_weights -def _gemma4_moe_block_forward(self, hidden_states, top_k_index, top_k_weights): - # hidden_states: [B*S, hidden_dim] - # top_k_index: [B*S, K], top_k_weights: [B*S, K] - num_tokens = hidden_states.shape[0] - - # Compute all expert outputs via batched matmul - # expanded: [E, B*S, hidden_dim] - expanded_hidden = hidden_states.unsqueeze(0).expand(self.num_experts, -1, -1) - - # gate_up_proj: [E, 2*inter, hidden] -> transpose to [E, hidden, 2*inter] - gate_up = torch.bmm(expanded_hidden, self.gate_up_proj.transpose(1, 2)) - gate, up = gate_up.chunk(2, dim=-1) - intermediate = self.act_fn(gate) * up - - # down_proj: [E, hidden, inter] -> transpose to [E, inter, hidden] - expert_outputs = torch.bmm(intermediate, self.down_proj.transpose(1, 2)) - # expert_outputs: [E, B*S, hidden_dim] - - # Build full routing weight matrix [B*S, E] from sparse top-k - full_weights = torch.zeros(num_tokens, self.num_experts, dtype=hidden_states.dtype, device=hidden_states.device) - full_weights.scatter_add_(1, top_k_index, top_k_weights) - - # Weighted sum over experts: [B*S, 1, E] @ [B*S, E, hidden_dim] -> [B*S, hidden_dim] - expert_outputs = expert_outputs.permute(1, 0, 2) # [B*S, E, hidden_dim] - final_hidden_states = torch.bmm(full_weights.unsqueeze(1), expert_outputs).squeeze(1) - - return final_hidden_states - - class Gemma4LMModelPatcher(Gemma3LMModelPatcher): def __init__(self, config, model, model_kwargs): super().__init__(config, model, model_kwargs) @@ -5323,7 +5294,7 @@ def __enter__(self): decoder_layer.self_attn.forward = 
types.MethodType(gemma4_text_attention_forward, decoder_layer.self_attn) if hasattr(decoder_layer, "experts"): decoder_layer.experts._orig_forward = decoder_layer.experts.forward - decoder_layer.experts.forward = types.MethodType(_gemma4_moe_block_forward, decoder_layer.experts) + decoder_layer.experts.forward = types.MethodType(lfm2_moe_experts_forward, decoder_layer.experts) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) From ff99d6e13774841bdd17ac0d4c8bd2d181cf7c27 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 30 Apr 2026 10:11:02 +0200 Subject: [PATCH 219/222] Update optimum/exporters/openvino/model_configs.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/exporters/openvino/model_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index eaef60326a..c14cfc2946 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1589,7 +1589,7 @@ class Gemma4TextOpenVINOConfig(Gemma3TextOpenVINOConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, Gemma4DummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = Gemma4DummyPastKeyValuesGenerator - MIN_TRANSFORMERS_VERSION = "4.50.0" + MIN_TRANSFORMERS_VERSION = "5.5" def add_past_key_values(self, inputs_or_outputs: dict[str, dict[int, str]], direction: str): if direction not in ["inputs", "outputs"]: From 0f8508cb9dd95f5190045bab46e8d29dbfdda10c Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 30 Apr 2026 13:48:38 +0200 Subject: [PATCH 220/222] Applied comments, removed not needed code. 
--- optimum/exporters/openvino/model_patcher.py | 74 +++++-------------- .../openvino/modeling_visual_language.py | 3 +- 2 files changed, 21 insertions(+), 56 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 3e66401bec..d8237f954a 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4923,44 +4923,12 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.model._update_causal_mask = self._model.model._orig_update_causual_mask del self._model.model._orig_update_causual_mask - -def _gemma4_project_per_layer_inputs( - self, - inputs_embeds: torch.Tensor, - per_layer_inputs: Optional[torch.Tensor] = None, -) -> torch.Tensor: - per_layer_projection = self.per_layer_model_projection(inputs_embeds) * self.per_layer_model_projection_scale - per_layer_projection = per_layer_projection.reshape( - *inputs_embeds.shape[:-1], - self.config.num_hidden_layers, - self.hidden_size_per_layer_input, - ) - per_layer_projection = self.per_layer_projection_norm(per_layer_projection) - - if per_layer_inputs is None: - return per_layer_projection - - if per_layer_projection.shape != per_layer_inputs.shape: - per_layer_inputs = per_layer_inputs[..., : self.config.num_hidden_layers, :] - - return (per_layer_projection + per_layer_inputs) * self.per_layer_input_scale - - +# Creates a dict of causal masks with bidirectional attention for vision tokens +# on sliding_attention layers, matching the behavior of transformers +# create_causal_mask_mapping when use_bidirectional_attention == "vision". +# Needs to be patched to pass proper 'sliding_mask' for prefill stage. 
+# Original code: https://github.com/huggingface/transformers/blob/v5.5.0/src/transformers/models/gemma4/modeling_gemma4.py#L1986 def _create_gemma4_bidirectional_mask_dict(attention_mask_2d, mm_token_type_ids, inputs_embeds, sliding_window): - """ - Creates a dict of causal masks with bidirectional attention for vision tokens - on sliding_attention layers, matching the behavior of transformers' - create_causal_mask_mapping when use_bidirectional_attention == "vision". - - Args: - attention_mask_2d: [batch, total_len] 2D attention mask (1=attend, 0=pad) - mm_token_type_ids: [batch, total_len] token type ids (0=text, 1=image, 2=video/audio) - inputs_embeds: [batch, seq_len, hidden_size] - sliding_window: int, sliding window size - - Returns: - dict with "full_attention" and "sliding_attention" 4D masks - """ dtype = inputs_embeds.dtype device = inputs_embeds.device min_dtype = torch.finfo(dtype).min @@ -4983,11 +4951,10 @@ def _create_gemma4_bidirectional_mask_dict(attention_mask_2d, mm_token_type_ids, # Sliding window causal mask sliding_mask = full_mask.clone() - if sliding_window is not None: - row_pos = torch.arange(seq_len, device=device).unsqueeze(1) + past_len - col_pos = torch.arange(target_len, device=device).unsqueeze(0) - beyond_window = (row_pos - col_pos) >= sliding_window - sliding_mask = sliding_mask.masked_fill(beyond_window[None, None, :, :], min_dtype) + row_pos = torch.arange(seq_len, device=device).unsqueeze(1) + past_len + col_pos = torch.arange(target_len, device=device).unsqueeze(0) + beyond_window = (row_pos - col_pos) >= sliding_window + sliding_mask = sliding_mask.masked_fill(beyond_window[None, None, :, :], min_dtype) # Apply bidirectional masking for vision tokens (only on sliding_attention mask) # mm_token_type_ids: [batch, total_len] - 0=text, 1=image, 2=video/audio @@ -5016,7 +4983,10 @@ def _create_gemma4_bidirectional_mask_dict(attention_mask_2d, mm_token_type_ids, "sliding_attention": sliding_mask, } - +# Forward method of the 
language model of Gemma4, needs to be patched to pass 'per_layer_inputs', +# as original code fails to create per_layer_inputs without the providing of input_ids, +# while OV language model expects only inputs_embeds without input_ids. +# Original code: https://github.com/huggingface/transformers/blob/v5.5.0/src/transformers/models/gemma4/modeling_gemma4.py#L2152 def gemma4_language_model_forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -5089,7 +5059,8 @@ def gemma4_language_model_forward( image_hidden_states=image_features if pixel_values is not None else None, ) - +# Gemma4 model forward, needs to be patched to pass 'per_layer_inputs', +# Original code: https://github.com/huggingface/transformers/blob/v5.5.0/src/transformers/models/gemma4/modeling_gemma4.py#L2396 def gemma4_lm_forward( self, attention_mask: Optional[torch.Tensor] = None, @@ -5163,6 +5134,8 @@ def gemma4_lm_forward( return tuple([value if not isinstance(value, list) else tuple(value) for value in outputs_dict.values()]) +# Needs to be patched to reshape 'attention_mask' to match attention weights +# Original code: https://github.com/huggingface/transformers/blob/v5.5.0/src/transformers/models/gemma4/modeling_gemma4.py#L768 def gemma4_eager_attention_forward_patched( module: nn.Module, query: torch.Tensor, @@ -5197,6 +5170,8 @@ def gemma4_eager_attention_forward_patched( return attn_output, attn_weights +# Needs to be patched to run methods 'gemma4_eager_attention_forward_patched' instead of original one +# Original code: https://github.com/huggingface/transformers/blob/v5.5.0/src/transformers/models/gemma4/modeling_gemma4.py#L1179 def gemma4_text_attention_forward( self, hidden_states: torch.Tensor, @@ -5281,14 +5256,6 @@ def __enter__(self): setattr(self._model, self.orig_forward_name, types.MethodType(gemma4_lm_forward, self._model)) setattr(self._model.model, "forward", types.MethodType(gemma4_language_model_forward, self._model)) - - 
self._model.model.language_model._orig_project_per_layer_inputs = ( - self._model.model.language_model.project_per_layer_inputs - ) - self._model.model.language_model.project_per_layer_inputs = types.MethodType( - _gemma4_project_per_layer_inputs, self._model.model.language_model - ) - for decoder_layer in self._model.model.language_model.layers: decoder_layer.self_attn.orig_forward = decoder_layer.self_attn.forward decoder_layer.self_attn.forward = types.MethodType(gemma4_text_attention_forward, decoder_layer.self_attn) @@ -5298,9 +5265,6 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - self._model.model.language_model.project_per_layer_inputs = ( - self._model.model.language_model._orig_project_per_layer_inputs - ) for decoder_layer in self._model.model.language_model.layers: decoder_layer.self_attn.forward = decoder_layer.self_attn.orig_forward diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 42a4866c38..b376a58735 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -221,7 +221,8 @@ def prepare_inputs( if "per_layer_inputs" in self.input_names: per_layer_inputs = kwargs.pop("per_layer_inputs", None) - assert per_layer_inputs is not None, "Expected 'per_layer_inputs', but it was not passed" + if per_layer_inputs is None: + raise ValueError("Expected 'per_layer_inputs', but it was not passed") inputs["per_layer_inputs"] = torch.Tensor(per_layer_inputs) return inputs From 942112a13fff8798d9d40c28b962ccb4aa003986 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 30 Apr 2026 14:11:08 +0200 Subject: [PATCH 221/222] Code style. 
--- optimum/exporters/openvino/model_patcher.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index d8237f954a..59e40f0e8a 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4923,6 +4923,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._model.model._update_causal_mask = self._model.model._orig_update_causual_mask del self._model.model._orig_update_causual_mask + # Creates a dict of causal masks with bidirectional attention for vision tokens # on sliding_attention layers, matching the behavior of transformers # create_causal_mask_mapping when use_bidirectional_attention == "vision". @@ -4983,6 +4984,7 @@ def _create_gemma4_bidirectional_mask_dict(attention_mask_2d, mm_token_type_ids, "sliding_attention": sliding_mask, } + # Forward method of the language model of Gemma4, needs to be patched to pass 'per_layer_inputs', # as original code fails to create per_layer_inputs without the providing of input_ids, # while OV language model expects only inputs_embeds without input_ids. @@ -5059,6 +5061,7 @@ def gemma4_language_model_forward( image_hidden_states=image_features if pixel_values is not None else None, ) + # Gemma4 model forward, needs to be patched to pass 'per_layer_inputs', # Original code: https://github.com/huggingface/transformers/blob/v5.5.0/src/transformers/models/gemma4/modeling_gemma4.py#L2396 def gemma4_lm_forward( From eac389347523177511abe37908090d9e5c12e714 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Thu, 30 Apr 2026 17:36:45 +0200 Subject: [PATCH 222/222] Applied comments. 
--- optimum/exporters/openvino/model_configs.py | 6 ++---- optimum/intel/openvino/modeling_decoder.py | 7 ++++--- tests/openvino/test_seq2seq.py | 2 +- tests/openvino/utils_tests.py | 2 +- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index c14cfc2946..36a83adfa7 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -14,9 +14,11 @@ import enum import logging +import math from copy import deepcopy from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +import torch from transformers import AutoConfig, PretrainedConfig, PreTrainedModel from optimum.exporters.onnx.config import OnnxConfig, TextDecoderOnnxConfig, TextDecoderWithPositionIdsOnnxConfig @@ -4370,10 +4372,6 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int dtype=float_dtype, ) if input_name == "image_position_ids": - import math - - import torch - # Create position ids as a grid. The patch count = h_patches * w_patches # where both are divisible by pooling_kernel_size for correct pooling. 
k = self.pooling_kernel_size diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 7bab52fc1a..a74582b2c1 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -33,10 +33,12 @@ from transformers.modeling_outputs import CausalLMOutputWithPast, ModelOutput from transformers.utils.hub import PushToHubMixin +from ..utils.import_utils import compare_versions, is_transformers_version -try: + +if is_transformers_version("<", "5.5"): from transformers.models.mamba.modeling_mamba import MambaCache -except ImportError: +else: MambaCache = object from optimum.utils.normalized_config import NormalizedConfigManager @@ -44,7 +46,6 @@ from ...exporters.openvino import ensure_stateful_is_available, main_export, patch_stateful from ...exporters.openvino.stateful import model_has_state from ...exporters.openvino.utils import SSM_MODELS -from ..utils.import_utils import compare_versions from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS from .configuration import ( OVConfig, diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 29b18f450f..37efa86b2d 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -787,7 +787,7 @@ def compare_outputs(inputs, ov_model, transformers_model, generation_config): with torch.no_grad(): transformers_outputs = transformers_model(**transformers_inputs) self.assertTrue( - torch.allclose(ov_outputs.logits, transformers_outputs.logits.to(torch.float32), atol=4e-3), + torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=4e-3), f"Max abs diff {(torch.abs(ov_outputs.logits - transformers_outputs.logits).max())}", ) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 392128dd29..3ee3a2035b 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -383,7 +383,7 @@ "text_embeddings_per_layer_model": 1, }, "gemma4_moe": { - 
"lm_model": 44, + "lm_model": 48, "text_embeddings_model": 1, "vision_embeddings_model": 10, "text_embeddings_per_layer_model": 0,