@@ -4258,9 +4258,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         yield from super().modify_tensors(data_torch, name, bid)


-@ModelBase.register("Qwen2_5OmniModel")
-class Qwen25OmniModel(Qwen2VLVisionModel):
-    has_vision_encoder = True
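+# Audio-tower ("audio_tower.*") conversion logic, shared by the Omni / ASR mmproj classes below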
+class Qwen25AudioModel(MmprojModel):
     has_audio_encoder = True

     def __init__(self, *args, **kwargs):
@@ -4276,12 +4274,6 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"])
         self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-5))

-    def get_vision_config(self) -> dict[str, Any] | None:
-        return self.global_config["thinker_config"].get("vision_config")
-
-    def get_audio_config(self) -> dict[str, Any] | None:
-        return self.global_config["thinker_config"].get("audio_config")
-
     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         # SinusoidsPositionEmbedding
         assert self.hparams_audio is not None
@@ -4312,7 +4304,32 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                 # this tensor is left unused in transformers code
                 # https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809
                 return
-        yield from super().modify_tensors(data_torch, name, bid)
+            yield from MmprojModel.modify_tensors(self, data_torch, name, bid)
+
+        return  # skip other tensors
+
+
+@ModelBase.register("Qwen2_5OmniModel")
+class Qwen25OmniModel(Qwen2VLVisionModel, Qwen25AudioModel):
+    has_audio_encoder = True
+    has_vision_encoder = True
+
+    def get_vision_config(self) -> dict[str, Any] | None:
+        return self.global_config["thinker_config"].get("vision_config")
+
+    def get_audio_config(self) -> dict[str, Any] | None:
+        return self.global_config["thinker_config"].get("audio_config")
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if "visual." in name:
+            yield from Qwen2VLVisionModel.modify_tensors(self, data_torch, name, bid)
+        elif "audio_tower." in name:
+            yield from Qwen25AudioModel.modify_tensors(self, data_torch, name, bid)
+        return  # skip other tensors


 @ModelBase.register("InternVisionModel")
@@ -4816,7 +4833,10 @@ def set_gguf_parameters(self):
 class Qwen3VLVisionModel(MmprojModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        assert self.hparams_vision is not None
+        if self.hparams_vision is None:
+            logger.info("No vision config found, skipping vision tensor processing")
+            return
+
         # Compute image_size if not present
         if "image_size" not in self.hparams_vision:
             # For Qwen3VL/Qwen3VLMoe, compute from num_position_embeddings
@@ -4837,7 +4857,9 @@ def __init__(self, *args, **kwargs):
48374857
48384858 def set_gguf_parameters(self):
48394859 super().set_gguf_parameters()
4840- self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3VL)
4860+ # in case mixed modalities, the arch will be handled by subclass
4861+ if not self.has_audio_encoder:
4862+ self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3VL)
48414863 self.gguf_writer.add_vision_use_gelu(True)
48424864
48434865 if self.hparams_vision is not None:
@@ -4925,11 +4947,64 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             return

         if name.startswith("visual."):
-            yield from super().modify_tensors(data_torch, name, bid)
-            return
+            yield from MmprojModel.modify_tensors(self, data_torch, name, bid)
+        return  # skip other tensors

-        # Fall back to parent class for other tensors
-        yield from super().modify_tensors(data_torch, name, bid)
+
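+# Combined vision + audio mmproj conversion for Qwen3-Omni; Qwen3-ASR below reuses it with the vision encoder disabled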
+@ModelBase.register("Qwen3OmniMoeForConditionalGeneration")
+class Qwen3OmniMmprojModel(Qwen3VLVisionModel, Qwen25AudioModel):
+    has_audio_encoder = True
+    has_vision_encoder = True
+
+    def get_vision_config(self) -> dict[str, Any] | None:
+        if self.has_vision_encoder:
+            return self.global_config["thinker_config"].get("vision_config")
+        else:
+            return None
+
+    def get_audio_config(self) -> dict[str, Any] | None:
+        if self.has_audio_encoder:
+            return self.global_config["thinker_config"].get("audio_config")
+        else:
+            return None
+
+    def set_gguf_parameters(self):
+        if self.has_vision_encoder:
+            Qwen3VLVisionModel.set_gguf_parameters(self)
+            self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.QWEN3VL)
+        if self.has_audio_encoder:
+            Qwen25AudioModel.set_gguf_parameters(self)
+            self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.QWEN3A)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if "visual." in name:
+            if not self.has_vision_encoder:
+                raise ValueError(f"Model does not have vision encoder, but found tensor {name}")
+            # need to transform vision tensor naming, so that modify_tensors() logic can be used correctly
+            name = name.replace("thinker.visual.", "model.visual.")
+            if ".merger_list." in name:
+                name = name.replace(".merger_list.", ".deepstack_merger_list.")
+                name = name.replace(".ln_q", ".norm")
+                name = name.replace(".mlp.0", ".linear_fc1")
+                name = name.replace(".mlp.2", ".linear_fc2")
+            elif ".merger." in name:
+                name = name.replace(".ln_q", ".norm")
+                name = name.replace(".mlp.0", ".linear_fc1")
+                name = name.replace(".mlp.2", ".linear_fc2")
+            yield from Qwen3VLVisionModel.modify_tensors(self, data_torch, name, bid)
+        elif "audio_tower." in name:
+            if not self.has_audio_encoder:
+                raise ValueError(f"Model does not have audio encoder, but found tensor {name}")
+            if "conv2d" in name and name.endswith(".bias"):
+                # transform conv2d bias [n_embd] --> [1, 1, n_embd]
+                data_torch = data_torch.unsqueeze(-1).unsqueeze(-1)
+            yield from Qwen25AudioModel.modify_tensors(self, data_torch, name, bid)
+
+
+@ModelBase.register("Qwen3ASRForConditionalGeneration")
+class Qwen3ASRMmprojModel(Qwen3OmniMmprojModel):
+    has_audio_encoder = True
+    has_vision_encoder = False


 @ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration", "GlmOcrForConditionalGeneration")
@@ -4992,6 +5067,8 @@ def set_gguf_parameters(self):
     def tensor_force_quant(self, name, new_name, bid, n_dims):
         if ".position_embd." in new_name:
             return gguf.GGMLQuantizationType.F32
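+        # keep the mm projector weights unquantized (F16 or F32, matching the requested output type)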
+        if ("mm.0." in new_name or "mm.1." in new_name) and new_name.endswith(".weight"):
+            return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
         return super().tensor_force_quant(name, new_name, bid, n_dims)

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
@@ -5030,9 +5107,10 @@ class Qwen3VLTextModel(Qwen3Model):

     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-
-        # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
-        vision_config = self.hparams.get("vision_config", {})
+        if "thinker_config" in self.hparams:
+            vision_config = self.hparams["thinker_config"].get("vision_config", {})
+        else:
+            vision_config = self.hparams.get("vision_config", {})
         deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
         self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
@@ -5101,6 +5179,70 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         yield from super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("Qwen3OmniMoeForConditionalGeneration")
+class Qwen3OmniMoeTextModel(Qwen3VLMoeTextModel):
+    model_arch = gguf.MODEL_ARCH.QWEN3VLMOE
+
+    def set_vocab(self):
+        super().set_vocab()
+        # correct BOS/EOS tokens
+        with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+        added_tokens = tokenizer_config.get("added_tokens_decoder", {})
+        for token_id, data in added_tokens.items():
+            if data.get("content") == "<|im_end|>":
+                self.gguf_writer.add_bos_token_id(int(token_id))
+                self.gguf_writer.add_eos_token_id(int(token_id))
+                break
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_num_deepstack_layers(0)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Skip vision and audio tensors - they go in the mmproj file
+        if "visual." in name or "audio_tower." in name \
+                or "talker." in name or "code2wav." in name:
+            return
+
+        name = name.replace("thinker.", "")
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
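+# Qwen3-ASR reuses the Qwen3-VL text architecture; encoder tensors go to the mmproj file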
+@ModelBase.register("Qwen3ASRForConditionalGeneration")
+class Qwen3ASRTextModel(Qwen3VLTextModel):
+    model_arch = gguf.MODEL_ARCH.QWEN3VL
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_num_deepstack_layers(0)
+
+    def set_vocab(self):
+        super().set_vocab()
+        # fix chat template, use the correct chatml format
+        self.gguf_writer.add_chat_template("{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}")
+        # correct BOS/EOS tokens
+        with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+        added_tokens = tokenizer_config.get("added_tokens_decoder", {})
+        for token_id, data in added_tokens.items():
+            if data.get("content") == "<|im_end|>":
+                self.gguf_writer.add_bos_token_id(int(token_id))
+                self.gguf_writer.add_eos_token_id(int(token_id))
+                break
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # qwen3-omni style naming: strip the "thinker." prefix
+        name = name.replace("thinker.", "")
+
+        # Skip vision and audio tensors - they go in the mmproj file
+        if "visual." in name or "audio_tower." in name \
+                or "talker." in name or "code2wav." in name:
+            return
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
 class _LinearAttentionVReorderBase(Qwen3NextModel):
     model_arch = gguf.MODEL_ARCH.QWEN3NEXT  # overridden by subclasses
     """reorders V heads from grouped to tiled order for ggml broadcast