Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/source/openvino/models.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,8 @@ Here is the list of the supported architectures :
- Qwen3
- Qwen3MoE
- Qwen3-VL
- Qwen3.5
- Qwen3.5-MoE
- Qwen3.5 (requires Transformers >= 5.2.0)
- Qwen3.5-MoE (requires Transformers >= 5.2.0)
- Qwen3.6
- Qwen3-Next
- RemBERT
Expand Down
4 changes: 2 additions & 2 deletions optimum/exporters/openvino/model_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6425,7 +6425,7 @@ class Qwen3_5TextOpenVINOConfig(Qwen3VLTextOpenVINOConfig):
DUMMY_PKV_GENERATOR_CLASS = Qwen3_5DummyPastKeyValuesGenerator
NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
MIN_TRANSFORMERS_VERSION = "5.2.0"
MAX_TRANSFORMERS_VERSION = "5.2.99"
MAX_TRANSFORMERS_VERSION = "5.8.99"
_MODEL_PATCHER = Qwen3_5ModelPatcher

def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str):
Expand Down Expand Up @@ -6498,7 +6498,7 @@ class Qwen3_5OpenVINOConfig(Qwen3VLOpenVINOConfig):
SUPPORTED_BEHAVIORS = [model_type.value for model_type in QwenVLConfigBehavior]
DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen3VLVisionEmbedInputGenerator,)
MIN_TRANSFORMERS_VERSION = "5.2.0"
MAX_TRANSFORMERS_VERSION = "5.2.99"
MAX_TRANSFORMERS_VERSION = "5.8.99"

def __init__(
self,
Expand Down
45 changes: 34 additions & 11 deletions optimum/exporters/openvino/model_patcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -8714,6 +8714,7 @@ def qwen3_next_gated_delta_net_forward(
cache_params=None,
cache_position: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
**kwargs,
):
def apply_mask_to_padding_states(hidden_states, attention_mask):
"""
Expand Down Expand Up @@ -9275,6 +9276,7 @@ def qwen3_5_gated_delta_net_forward(
cache_params=None,
cache_position: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
**kwargs,
):
def apply_mask_to_padding_states(hidden_states, attention_mask):
"""
Expand Down Expand Up @@ -9370,7 +9372,7 @@ def __init__(
model: "PreTrainedModel",
model_kwargs: Optional[Dict[str, Any]] = None,
):
from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5DynamicCache
from transformers.models.qwen3_5.modeling_qwen3_5 import DynamicCache as Qwen3_5DynamicCache

from openvino.frontend.pytorch import ConversionExtension, ModuleExtension

Expand Down Expand Up @@ -9401,10 +9403,10 @@ def __init__(self, config, conv_states, recurrent_states, key_cache, value_cache
full_attn_layer_idx = 0
linear_attn_layer_idx = 0
for i in range(len(config.layer_types)):
if self.layer_types[i] == "full_attention":
if config.layer_types[i] == "full_attention":
self.full_attn_mapping[i] = full_attn_layer_idx
full_attn_layer_idx += 1
elif self.layer_types[i] == "linear_attention":
elif config.layer_types[i] == "linear_attention":
self.linear_attn_mapping[i] = linear_attn_layer_idx
linear_attn_layer_idx += 1

Expand All @@ -9429,17 +9431,38 @@ def update(
def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
"""Returns the sequence length of the cached states. A layer index can be optionally passed."""
# take any layer that contains cache and not empty tensor
layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx
layer_idx = self.full_attn_mapping[layer_idx]
if len(self.key_cache) <= layer_idx or self.key_cache[layer_idx] is None:
full_attn_layers = list(self.full_attn_mapping.keys())
layer_idx = full_attn_layers[0] if layer_idx not in full_attn_layers else layer_idx
mapped_layer_idx = self.full_attn_mapping[layer_idx]
if len(self.key_cache) <= mapped_layer_idx or self.key_cache[mapped_layer_idx] is None:
return 0
return self.key_cache[layer_idx].shape[-2]
return self.key_cache[mapped_layer_idx].shape[-2]

@property
def has_previous_state(self):
def has_previous_state(self, layer_idx: Optional[int] = None) -> bool:
"""We have a previous state if the last linear (conv) layer was already updated."""
layer_idx = self.linear_attn_mapping[self.last_linear_layer]
return self.conv_states[layer_idx] is not None
if layer_idx is None:
# Get the last linear attention layer
linear_layers = list(self.linear_attn_mapping.keys())
if not linear_layers:
return False
layer_idx = linear_layers[-1]
mapped_layer_idx = self.linear_attn_mapping.get(layer_idx)
if mapped_layer_idx is None:
return False
return self.conv_states[mapped_layer_idx] is not None

def get_mask_sizes(self, query_length: int, layer_idx: int = 0) -> tuple[int, int]:
"""Returns (kv_length, kv_offset) using key_cache instead of the base cache layers."""
full_attn_layers = list(self.full_attn_mapping.keys())
if not full_attn_layers:
return query_length, 0
if layer_idx not in self.full_attn_mapping:
layer_idx = full_attn_layers[0]
local_idx = self.full_attn_mapping[layer_idx]
if local_idx >= len(self.key_cache) or self.key_cache[local_idx] is None:
return query_length, 0
past_len = self.key_cache[local_idx].shape[-2]
return past_len + query_length, 0

# the patch is needed to include KV-cache, Conv, and SSM states in the inputs and outputs.
def patched_forward(
Expand Down
2 changes: 1 addition & 1 deletion tests/openvino/test_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ class ExportModelTest(unittest.TestCase):
SUPPORTED_ARCHITECTURES.update({"gemma4": OVModelForVisualCausalLM})
SUPPORTED_ARCHITECTURES.update({"gemma4_moe": OVModelForVisualCausalLM})

if is_transformers_version(">=", "5.2.0") and is_transformers_version("<", "5.3.0"):
if is_transformers_version(">=", "5.2.0") and is_transformers_version("<", "5.9.0"):
SUPPORTED_ARCHITECTURES.update({"qwen3_5": OVModelForVisualCausalLM})
SUPPORTED_ARCHITECTURES.update({"qwen3_5_moe": OVModelForVisualCausalLM})

Expand Down
2 changes: 1 addition & 1 deletion tests/openvino/test_quantization.py
Original file line number Diff line number Diff line change
Expand Up @@ -1095,7 +1095,7 @@ class OVWeightCompressionTest(unittest.TestCase):
if is_transformers_version(">=", "4.49.0") and is_transformers_version("<=", "4.57.6"):
SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "videochat_flash_qwen", True))

if is_transformers_version(">=", "5.2.0") and is_transformers_version("<", "5.3.0"):
if is_transformers_version(">=", "5.2.0") and is_transformers_version("<", "5.9.0"):
SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "qwen3_5", False))
SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "qwen3_5_moe", False))

Expand Down
2 changes: 1 addition & 1 deletion tests/openvino/test_seq2seq.py
Original file line number Diff line number Diff line change
Expand Up @@ -716,7 +716,7 @@ class OVModelForVisualCausalLMIntegrationTest(OVSeq2SeqTestMixin):
if is_transformers_version(">=", "5.5"):
SUPPORTED_ARCHITECTURES += ["gemma4", "gemma4_moe"]

if is_transformers_version(">=", "5.2.0") and is_transformers_version("<", "5.3.0"):
if is_transformers_version(">=", "5.2.0") and is_transformers_version("<", "5.9.0"):
SUPPORTED_ARCHITECTURES += ["qwen3_5", "qwen3_5_moe"]

# TODO: add fix for v5 and update MAX_TRANSFORMERS_VERSION accordingly
Expand Down