Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
185 changes: 185 additions & 0 deletions optimum/exporters/openvino/model_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,16 @@ def init_model_configs():
"AutoModelForImageTextToText",
)

# Add support for Qwen3-VL-Embedding
TasksManager._CUSTOM_CLASSES[("pt", "qwen3_vl", "feature-extraction")] = (
"transformers",
"Qwen3VLForConditionalGeneration",
)
TasksManager._CUSTOM_CLASSES[("pt", "qwen3_vl", "image-text-to-text")] = (
"transformers",
"Qwen3VLForConditionalGeneration",
)

if is_diffusers_available() and "fill" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS:
TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["fill"] = "FluxFillPipeline"
TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["fill"] = {"flux": "FluxFillPipeline"}
Expand Down Expand Up @@ -450,6 +460,33 @@ def generate(
)
return super().generate(input_name, framework, int_dtype, float_dtype)

class DummyQwen3VLInputGenerator(DummyVisionInputGenerator):
SUPPORTED_INPUT_NAMES = ("pixel_values", "image_grid_thw")

def __init__(
    self,
    task: str,
    normalized_config: NormalizedVisionConfig,
    batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
    num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
    width: int = DEFAULT_DUMMY_SHAPES["width"],
    height: int = DEFAULT_DUMMY_SHAPES["height"],
    **kwargs,
):
    """Initialise the generator by delegating to the parent constructor.

    Every argument is forwarded unchanged and in the same order; this
    constructor adds no state of its own.
    """
    # Grouped only for readability; positional order matches the signature.
    shape_args = (batch_size, num_channels, width, height)
    super().__init__(task, normalized_config, *shape_args, **kwargs)

def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
if input_name == "pixel_values":
# For Qwen3-VL-Embedding, the input shape is [batch_size, 3, 2, 16, 16]
return self.random_float_tensor(
[self.batch_size, 3, 2, 16, 16], framework=framework, dtype=float_dtype
)
if input_name == "image_grid_thw":
# For Qwen3-VL-Embedding, the input shape is [num_images, 3]
return self.random_int_tensor(
[1, 3], min_value=1, max_value=16, framework=framework, dtype=int_dtype
Comment on lines +477 to +487
Copy link

Copilot AI Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DummyQwen3VLInputGenerator.generate hard-codes [batch, 3, 2, 16, 16] / [1, 3] shapes and doesn’t use normalized_config (or the provided width/height) to derive shapes. This risks producing invalid dummy inputs for other Qwen3-VL checkpoints or future config changes. Derive these dimensions from the model config (e.g., patch size / temporal patch size / image size) or at least thread through the dummy-shape kwargs so callers can override them.

Suggested change
def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
if input_name == "pixel_values":
# For Qwen3-VL-Embedding, the input shape is [batch_size, 3, 2, 16, 16]
return self.random_float_tensor(
[self.batch_size, 3, 2, 16, 16], framework=framework, dtype=float_dtype
)
if input_name == "image_grid_thw":
# For Qwen3-VL-Embedding, the input shape is [num_images, 3]
return self.random_int_tensor(
[1, 3], min_value=1, max_value=16, framework=framework, dtype=int_dtype
self.patch_size = max(
1,
int(
kwargs.get(
"patch_size",
getattr(normalized_config, "patch_size", getattr(normalized_config, "image_patch_size", 1)),
)
),
)
self.temporal_patch_size = max(
1,
int(
kwargs.get(
"temporal_patch_size",
getattr(normalized_config, "temporal_patch_size", getattr(normalized_config, "video_temporal_patch_size", 1)),
)
),
)
self.num_channels = int(
kwargs.get("num_channels", getattr(normalized_config, "num_channels", num_channels))
)
self.width = int(kwargs.get("width", getattr(normalized_config, "image_size", width)))
self.height = int(kwargs.get("height", getattr(normalized_config, "image_size", height)))
def _grid_size(self, size: int) -> int:
return max(1, (int(size) + self.patch_size - 1) // self.patch_size)
def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
grid_h = self._grid_size(self.height)
grid_w = self._grid_size(self.width)
padded_height = grid_h * self.patch_size
padded_width = grid_w * self.patch_size
if input_name == "pixel_values":
return self.random_float_tensor(
[self.batch_size, self.num_channels, self.temporal_patch_size, padded_height, padded_width],
framework=framework,
dtype=float_dtype,
)
if input_name == "image_grid_thw":
max_grid_dim = max(self.temporal_patch_size, grid_h, grid_w)
return self.random_int_tensor(
[self.batch_size, 3],
min_value=1,
max_value=max_grid_dim,
framework=framework,
dtype=int_dtype,

Copilot uses AI. Check for mistakes.
)
return super().generate(input_name, framework, int_dtype, float_dtype)

@register_in_tasks_manager(
"qwen3_vl_text",
Expand Down Expand Up @@ -1898,6 +1935,154 @@ def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[
return super().patch_model_for_export(model, model_kwargs)
return CommonImageEmbeddingsModelPatcher(self, model, model_kwargs)

@register_in_tasks_manager(
"qwen3_vl",
*[
"feature-extraction",
"image-text-to-text",
],
library_name="transformers",
)
class Qwen3VLOpenVINOConfig(BaseVLMOpenVINOConfig):
Comment on lines +1942 to +1946
Copy link

Copilot AI Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file already defines and registers another Qwen3VLOpenVINOConfig for qwen3_vl later (used for image-text-to-text). Introducing a second class with the same name and overlapping @register_in_tasks_manager("qwen3_vl", ...) decorators makes the registry behavior order-dependent and the symbol name ambiguous. Consider renaming this new config (e.g., Qwen3VLEmbeddingOpenVINOConfig) and registering it only for the feature-extraction task to avoid accidental overrides.

Suggested change
"image-text-to-text",
],
library_name="transformers",
)
class Qwen3VLOpenVINOConfig(BaseVLMOpenVINOConfig):
],
library_name="transformers",
)
class Qwen3VLEmbeddingOpenVINOConfig(BaseVLMOpenVINOConfig):

Copilot uses AI. Check for mistakes.
MIN_TRANSFORMERS_VERSION = "4.57.0"
SUPPORTS_PAST = True
DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen3VLInputGenerator,)

def __init__(
self,
config: "PretrainedConfig",
task: str = "feature-extraction",
int_dtype: str = "int64",
float_dtype: str = "fp32",
behavior: VLMConfigBehavior = VLMConfigBehavior.VISION_EMBEDDINGS,
preprocessors: Optional[List[Any]] = None,
**kwargs,
):
super().__init__(
config=config,
task=task,
int_dtype=int_dtype,
float_dtype=float_dtype,
Copy link

Copilot AI Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

behavior passed to __init__ is currently ignored because it isn’t forwarded to BaseVLMOpenVINOConfig.__init__ (which sets self._behavior). This can break with_behavior(...) / multi-part VLM export because the instance will always behave as VISION_EMBEDDINGS. Pass behavior=behavior to super().__init__(...) or set self._behavior = behavior after calling super().

Suggested change
float_dtype=float_dtype,
float_dtype=float_dtype,
behavior=behavior,

Copilot uses AI. Check for mistakes.
preprocessors=preprocessors,
)
self._orig_config = config
if self._behavior == VLMConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"):
self._config = config.vision_config
self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config)

@property
def inputs(self) -> Dict[str, Dict[int, str]]:
if not self._behavior == VLMConfigBehavior.VISION_EMBEDDINGS:
return {}
return {
"pixel_values": {0: "batch_size", 1: "channels", 2: "temporal_patch_size", 3: "patch_height", 4: "patch_width"},
"image_grid_thw": {0: "num_images", 1: "3"}
Copy link

Copilot AI Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the inputs mapping, using the literal string "3" as a dimension name for image_grid_thw is inconsistent with the rest of the exporter configs (dimension names are descriptive identifiers). Rename this axis to something semantic (e.g. grid_dims/thw) or omit it if it’s intended to be a fixed size.

Suggested change
"image_grid_thw": {0: "num_images", 1: "3"}
"image_grid_thw": {0: "num_images", 1: "thw"}

Copilot uses AI. Check for mistakes.
}

def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
    """Build the dummy tensors used when exporting the vision part.

    The returned dict holds exactly two entries:

    * ``pixel_values``: random tensor of shape ``(1, 3, 2, 16, 16)``.
    * ``image_grid_thw``: the constant tensor ``[[1, 16, 16]]``.

    Tensor dtypes follow ``self.float_dtype`` / ``self.int_dtype`` (the
    string codes this config is constructed with, e.g. ``"fp16"``) instead
    of always being float32 / int64. A missing or unrecognised code falls
    back to float32 / int64, so default configurations produce the same
    tensors as before.

    Args:
        framework: Accepted for interface compatibility only; the result
            always contains torch tensors.
        **kwargs: Accepted for interface compatibility and ignored; the
            shapes above are fixed.

    Returns:
        Dict mapping input name to a torch tensor.
    """
    import torch

    float_dtypes = {
        "fp32": torch.float32,
        "fp16": torch.float16,
        "bf16": torch.bfloat16,
    }
    int_dtypes = {
        "int64": torch.int64,
        "int32": torch.int32,
        "int8": torch.int8,
    }
    pixel_dtype = float_dtypes.get(getattr(self, "float_dtype", "fp32"), torch.float32)
    grid_dtype = int_dtypes.get(getattr(self, "int_dtype", "int64"), torch.int64)

    # Sample in float32 and cast afterwards: the cast is a no-op for fp32 and
    # avoids depending on reduced-precision random sampling support.
    pixel_values = torch.randn(1, 3, 2, 16, 16).to(pixel_dtype)
    image_grid_thw = torch.tensor([[1, 16, 16]], dtype=grid_dtype)

    return {
        "pixel_values": pixel_values,
        "image_grid_thw": image_grid_thw,
    }
Comment on lines +1983 to +1991
Copy link

Copilot AI Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

generate_dummy_inputs hard-codes Torch tensors with float32/int64 dtypes and fixed shapes, ignoring framework, float_dtype, int_dtype, and user-provided input_shapes. This can cause dtype mismatches (e.g. fp16 export) and makes input-shape overrides ineffective. Prefer using the standard dummy input generator flow (super().generate_dummy_inputs(...)) with a dedicated DummyInputGenerator that respects the dtype/shape parameters.

Suggested change
# For feature-extraction task, we need to generate inputs for get_image_features method
import torch
# Only return the inputs that the model actually accepts
# Use shape [batch_size, 3, 2, 16, 16] for pixel_values
dummy_inputs = {
"pixel_values": torch.randn(1, 3, 2, 16, 16, dtype=torch.float32),
"image_grid_thw": torch.tensor([[1, 16, 16]], dtype=torch.int64)
}
return dummy_inputs
# Reuse the standard dummy input generator flow so framework, dtype,
# and caller-provided shape overrides are all respected.
return super().generate_dummy_inputs(framework=framework, **kwargs)

Copilot uses AI. Check for mistakes.

def rename_ambiguous_inputs(self, inputs):
    """Return ``inputs`` untouched.

    No input is renamed, added or removed; the very same object that was
    passed in is handed back to the caller.
    """
    unchanged = inputs
    return unchanged

def with_behavior(
    self,
    behavior: Union[str, VLMConfigBehavior],
):
    """
    Build the export config that corresponds to ``behavior``.

    Args:
        behavior ([`VLMConfigBehavior`] or `str`):
            Requested behavior. A plain string is converted to a
            `VLMConfigBehavior` member first.

    Returns:
        For the text-embeddings and language behaviors, the result of the
        matching ``get_vlm_text_*_config`` helper called with the ``"qwen3"``
        model type and the original config's ``text_config``. For the
        vision-embeddings behavior, a new instance of this class built from
        the original config. ``None`` for any other behavior.
    """
    is_plain_string = isinstance(behavior, str) and not isinstance(behavior, VLMConfigBehavior)
    target = VLMConfigBehavior(behavior) if is_plain_string else behavior

    if target == VLMConfigBehavior.VISION_EMBEDDINGS:
        return self.__class__(
            self._orig_config,
            task=self.task,
            int_dtype=self.int_dtype,
            float_dtype=self.float_dtype,
            behavior=target,
            preprocessors=self._preprocessors,
        )

    # Both text-side behaviors share the same arguments and differ only in
    # which helper builds the config.
    if target == VLMConfigBehavior.TEXT_EMBEDDINGS:
        build_text_config = get_vlm_text_embeddings_config
    elif target == VLMConfigBehavior.LANGUAGE:
        build_text_config = get_vlm_text_generation_config
    else:
        return None

    # 'qwen3' is used as the model type instead of 'qwen3_vl_text'.
    return build_text_config("qwen3", self._orig_config.text_config, self.int_dtype, self.float_dtype)

def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]):
    """
    Pick the part of ``model`` that is exported for ``behavior``.

    Sub-modules are looked up under ``model.model`` first and directly on
    ``model`` otherwise.

    Returns:
        * language: ``model`` itself when it has an ``lm_head`` attribute,
          otherwise its language model.
        * vision embeddings: the ``visual`` sub-module.
        * text embeddings: ``model.get_input_embeddings()``, with its
          ``config`` attribute set to the language model's config.
        * any other behavior: ``None``.
    """
    if isinstance(behavior, str) and not isinstance(behavior, VLMConfigBehavior):
        behavior = VLMConfigBehavior(behavior)

    def nested_has(attr_name):
        # True when `model.model` exists and exposes `attr_name`.
        return hasattr(model, "model") and hasattr(model.model, attr_name)

    def language_model():
        if nested_has("language_model"):
            return model.model.language_model
        return model.language_model

    if behavior == VLMConfigBehavior.LANGUAGE:
        if hasattr(model, "lm_head"):
            return model
        return language_model()

    if behavior == VLMConfigBehavior.VISION_EMBEDDINGS:
        if nested_has("visual"):
            return model.model.visual
        return model.visual

    if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS:
        embeddings = model.get_input_embeddings()
        embeddings.config = language_model().config
        return embeddings

    return None

def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[Dict[str, Any]] = None):
model_kwargs = model_kwargs or {}
if self._behavior != VLMConfigBehavior.VISION_EMBEDDINGS:
return super().patch_model_for_export(model, model_kwargs)
# Create a simple patcher that doesn't rely on get_image_features or ModelPatcher
class Qwen3VLImageEmbeddingsModelPatcher:
def __init__(self, config, model, model_kwargs=None):
self.config = config
self.model = model
self.model_kwargs = model_kwargs
# Patch the forward method directly
self.orig_forward = model.forward
model.forward = self.patched_forward
def patched_forward(self, pixel_values, image_grid_thw, **kwargs):
# Get the original output
output = self.orig_forward(pixel_values, image_grid_thw, **kwargs)
# Return only the last_hidden_state to avoid type inference issues
return output.last_hidden_state
Comment on lines +2071 to +2075
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

def patched_forward(self, pixel_values, image_grid_thw, **kwargs):
output = self.orig_forward(pixel_values, image_grid_thw, **kwargs)
if isinstance(output, tuple):
return output[0]
return output.last_hidden_state if hasattr(output, 'last_hidden_state') else output

def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
# Restore the original forward method
self.model.forward = self.orig_forward
Comment on lines +2062 to +2080
Copy link

Copilot AI Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The custom patcher returned by patch_model_for_export doesn’t follow the ModelPatcher contract used by convert.py (which wraps patcher.patched_forward and assumes a dict-like output with .values(), and may reassign patcher.patched_forward). Here, model.forward is patched in __init__ and the patched forward returns a raw tensor, so future changes (or different export paths) can easily break. Prefer implementing this as a proper ModelPatcher subclass (or reuse CommonImageEmbeddingsModelPatcher) so patched_forward returns a dict keyed by config.outputs and patching is applied in __enter__/__exit__.

Suggested change
# Create a simple patcher that doesn't rely on get_image_features or ModelPatcher
class Qwen3VLImageEmbeddingsModelPatcher:
def __init__(self, config, model, model_kwargs=None):
self.config = config
self.model = model
self.model_kwargs = model_kwargs
# Patch the forward method directly
self.orig_forward = model.forward
model.forward = self.patched_forward
def patched_forward(self, pixel_values, image_grid_thw, **kwargs):
# Get the original output
output = self.orig_forward(pixel_values, image_grid_thw, **kwargs)
# Return only the last_hidden_state to avoid type inference issues
return output.last_hidden_state
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
# Restore the original forward method
self.model.forward = self.orig_forward
class Qwen3VLImageEmbeddingsModelPatcher(ModelPatcher):
def patched_forward(self, pixel_values, image_grid_thw, **kwargs):
output = self._model(pixel_values, image_grid_thw, **kwargs)
output_name = next(iter(self.config.outputs))
return {output_name: output.last_hidden_state}

Copilot uses AI. Check for mistakes.
return Qwen3VLImageEmbeddingsModelPatcher(self, model, model_kwargs)

def _create_dummy_input_generator_classes(self, **kwargs):
    """Return a one-element list holding a ``DummyQwen3VLInputGenerator``.

    The generator is built from this config's task and normalized config;
    any keyword arguments are forwarded to its constructor.
    """
    qwen3_vl_generator = DummyQwen3VLInputGenerator(self.task, self._normalized_config, **kwargs)
    return [qwen3_vl_generator]

@register_in_tasks_manager("llava", *["image-text-to-text"], library_name="transformers")
class LlavaOpenVINOConfig(BaseVLMOpenVINOConfig):
Expand Down