Skip to content

Commit f2fc0ef

Browse files
committed
Qwen 3 Omni MoE support
1 parent 190d59f commit f2fc0ef

19 files changed

Lines changed: 3538 additions & 122 deletions

optimum/exporters/openvino/__main__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -651,6 +651,8 @@ def _main_quantize(
651651
# which is returned as the inferred task. As a result, we try to load the exported model using the
652652
# OVModelForSpeechSeq2Seq class instead of the OVModelForVisualCausalLM class when the task is not specified
653653
# explicitly. Because of this, we get an error.
654+
# Qwen3-Omni-MoE registers multiple tasks (text-to-audio, ASR, image-text-to-text) but always loads
655+
# through OVModelForVisualCausalLM, so redirect unconditionally regardless of inferred or explicit task.
654656
if original_task == "auto" and library_name == "transformers":
655657
config = AutoConfig.from_pretrained(
656658
model_name_or_path,
@@ -661,7 +663,7 @@ def _main_quantize(
661663
trust_remote_code=trust_remote_code,
662664
)
663665
model_type = config.model_type
664-
if model_type in ["phi4mm", "phi4_multimodal"]:
666+
if model_type in ["phi4mm", "phi4_multimodal", "qwen3_omni_moe"]:
665667
task = "image-text-to-text"
666668

667669
# Step 1. Obtain the correct OpenVINO model class

optimum/exporters/openvino/convert.py

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,13 @@
9494
from optimum.intel.openvino.configuration import OVConfig
9595

9696

97+
_VLM_LANGUAGE_MODEL_TASKS = ("image-text-to-text", "text-to-audio", "automatic-speech-recognition")
98+
99+
100+
def _is_vlm_language_model(task: str, model_name: str) -> bool:
101+
return model_name == "language_model" and any(t in task for t in _VLM_LANGUAGE_MODEL_TASKS)
102+
103+
97104
def _set_runtime_options(
98105
models_and_export_configs: Dict[
99106
str,
@@ -109,13 +116,13 @@ def _set_runtime_options(
109116
sub_export_config.runtime_options = {}
110117
if (
111118
"text-generation" in task
112-
or ("image-text-to-text" in task and model_name == "language_model")
119+
or _is_vlm_language_model(task, model_name)
113120
or getattr(sub_export_config, "stateful", False)
114121
):
115122
sub_export_config.runtime_options["ACTIVATIONS_SCALE_FACTOR"] = "8.0"
116123
if not quantized_model and (
117124
"text-generation" in task
118-
or ("image-text-to-text" in task and model_name == "language_model")
125+
or _is_vlm_language_model(task, model_name)
119126
or getattr(sub_export_config, "stateful", False)
120127
):
121128
sub_export_config.runtime_options["KV_CACHE_PRECISION"] = "f16"
@@ -552,6 +559,35 @@ def export_models(
552559
return outputs
553560

554561

562+
def _save_auxiliary_weights(model, model_type: str, output_dir: Path):
563+
if model_type != "qwen3_omni_moe":
564+
return
565+
# CodePredictor is traced with inputs_embeds, so the per-step codec_embedding
566+
# tables aren't part of the IR and must be dumped separately for runtime lookup.
567+
import numpy as np
568+
import torch
569+
570+
talker = getattr(model, "talker", None)
571+
if talker is None:
572+
return
573+
code_predictor = getattr(talker, "code_predictor", None)
574+
if code_predictor is None:
575+
return
576+
cp_model = getattr(code_predictor, "model", None)
577+
if cp_model is None:
578+
return
579+
codec_embedding = getattr(cp_model, "codec_embedding", None)
580+
if codec_embedding is None or len(codec_embedding) == 0:
581+
return
582+
try:
583+
stacked = torch.stack([emb.weight.data for emb in codec_embedding])
584+
except (AttributeError, RuntimeError) as e:
585+
logger.warning(f"Failed to extract codec_embedding weights for qwen3_omni_moe: {e}")
586+
return
587+
np.save(output_dir / "code_predictor_codec_embedding.npy", stacked.cpu().float().numpy())
588+
logger.info(f"Saved CodePredictor codec_embedding weights ({stacked.shape}) to {output_dir}")
589+
590+
555591
def export_from_model(
556592
model: Union["PreTrainedModel", "ModelMixin", "DiffusionPipeline"],
557593
output: Union[str, Path],
@@ -784,6 +820,8 @@ def export_from_model(
784820
library_name=library_name,
785821
)
786822

823+
_save_auxiliary_weights(model, model_type, output)
824+
787825
return files_subpaths
788826

789827

@@ -811,7 +849,7 @@ def export_tokenizer(
811849

812850
if (
813851
task is not None
814-
and (task.startswith("text-generation") or task == "image-text-to-text")
852+
and (task.startswith("text-generation") or any(t in task for t in _VLM_LANGUAGE_MODEL_TASKS))
815853
and compare_versions("openvino-tokenizers", ">=", "2024.3.0.0")
816854
):
817855
logger.info(f"Set tokenizer padding side to left for `{task}` task.")

0 commit comments

Comments (0)