@@ -1064,7 +1064,7 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:

         # Skip multimodal tensors
         if name.startswith(("mlp", "vit.", "vpm.", "siglip2.", "conformer.", "merger.", "resampler.", "sound_encoder.", "sound_projection.")) \
-                or "visual." in name or "vision." in name or " audio." in name or "talker." in name \
+                or "visual." in name or "audio." in name or "talker." in name \
                 or "vision_" in name or "audio_" in name or "sam_model" in name \
                 or "token2wav." in name or "code2wav." in name \
                 or "projector." in name or "pre_mm_projector_norm" in name \
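Note on the fix above: tensor names never contain spaces, so the old `" audio."` pattern (with a leading space) could never match and the check was dead code; dropping the space makes it behave like the neighbouring patterns. A minimal illustration, using a hypothetical tensor name:

```python
name = "audio.encoder.layers.0.conv.weight"  # hypothetical multimodal tensor name
assert " audio." not in name  # old pattern: the leading space defeats the match
assert "audio." in name       # fixed pattern matches as intended
```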
@@ -1360,9 +1360,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
             # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
             res = "qwen2"
-        if chkhsh == "1444df51289cfa8063b96f0e62b1125440111bc79a52003ea14b6eac7016fd5f":
-            # ref: https://huggingface.co/openbmb/MiniCPM-V-4_6
-            res = "qwen35"
         if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273":
             # ref: https://huggingface.co/alvarobartt/grok-2-tokenizer
             res = "grok-2"
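For context, each `chkhsh` compared in `get_vocab_base_pre` is a fingerprint of pre-tokenizer behaviour: the converter encodes a fixed probe string and hashes the resulting token IDs, so two tokenizers that split the probe identically map to the same `res`. A sketch of the idea (`probe_text` stands in for the converter's long mixed-script test string):

```python
import hashlib

def tokenizer_fingerprint(tokenizer, probe_text: str) -> str:
    # Hash the token IDs produced for a fixed probe string; any change in
    # pre-tokenization rules changes the split and therefore the digest.
    token_ids = tokenizer.encode(probe_text)
    return hashlib.sha256(str(token_ids).encode()).hexdigest()
```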
@@ -5502,101 +5499,16 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         yield from super().modify_tensors(data_torch, name, bid)


-class _Qwen35MRopeMixin:
-    # Qwen3.5 always applies interleaved MRoPE (see Qwen3_5RotaryEmbedding in transformers);
-    # the upstream default mrope_section is [11, 11, 10] and llama.cpp's QWEN35 / QWEN35MOE
-    # loaders treat qwen35.rope.dimension_sections as required, so make sure it is always
-    # written even when a particular checkpoint omits the field in `rope_parameters`.
-    _QWEN35_DEFAULT_MROPE_SECTION = [11, 11, 10, 0]
-
-    gguf_writer: gguf.GGUFWriter
-    rope_parameters: dict
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()  # ty: ignore[unresolved-attribute]
-        if "mrope_section" not in self.rope_parameters:
-            self.gguf_writer.add_rope_dimension_sections(self._QWEN35_DEFAULT_MROPE_SECTION)
-
-
 @ModelBase.register("Qwen3_5ForConditionalGeneration", "Qwen3_5ForCausalLM")
-class Qwen3_5TextModel(_Qwen35MRopeMixin, _LinearAttentionVReorderBase):
+class Qwen3_5TextModel(_LinearAttentionVReorderBase):
     model_arch = gguf.MODEL_ARCH.QWEN35


 @ModelBase.register("Qwen3_5MoeForConditionalGeneration", "Qwen3_5MoeForCausalLM")
-class Qwen3_5MoeTextModel(_Qwen35MRopeMixin, _LinearAttentionVReorderBase):
+class Qwen3_5MoeTextModel(_LinearAttentionVReorderBase):
     model_arch = gguf.MODEL_ARCH.QWEN35MOE


-# MiniCPM-V 4.6: text tower is Qwen3.5 (linear+full hybrid attention) wrapped under
-# `model.language_model.*`; vision tower is SigLIP + a window-attention ViT merger
-# + a final DownsampleMLP merger. The same HF arch is registered twice below: once as
-# the LM (text mode) and once as the mmproj (vision mode), mirroring the Qwen3-VL setup.
-
-@ModelBase.register("MiniCPMV4_6ForConditionalGeneration")
-class MiniCPMV4_6TextModel(Qwen3_5TextModel):
-    model_arch = gguf.MODEL_ARCH.QWEN35
-
-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, gen = item
-
-        if name.startswith("model.merger."):
-            return None
-        # MTP tensors are not used at inference yet; align with Qwen3Next behaviour
-        if name.startswith("mtp"):
-            return None
-
-        return super().filter_tensors(item)
-
-
-@ModelBase.register("MiniCPMV4_6ForConditionalGeneration")
-class MiniCPMV4_6VisionModel(MmprojModel):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        if self.hparams_vision is not None:
-            # In MiniCPM-V 4.6 `vision_config.image_size` (980) describes the SigLIP
-            # positional embedding bucket grid (70 x 70), while the per-slice processing
-            # resolution is the preprocessor's `scale_resolution` (typically 448).
-            # The CLIP loader in tools/mtmd/clip.cpp consumes `clip.vision.image_size`
-            # as the slice size and warmup resolution, so report `scale_resolution` there
-            # to match the upstream MiniCPMV4_6ImageProcessorPil slicing rules.
-            scale_resolution = self.preprocessor_config.get("scale_resolution")
-            if scale_resolution is not None:
-                self.hparams_vision["image_size"] = int(scale_resolution)
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        assert self.hparams_vision is not None
-
-        # projector type string is consumed by clip_projector_type_from_string() in clip.cpp
-        # (mapped to PROJECTOR_TYPE_MINICPMV4_6).
-        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MINICPMV4_6)
-
-        # ViT merger 2x2 + final merger 2x2 = 4x spatial merge per dimension; used for slice alignment
-        self.gguf_writer.add_vision_projector_scale_factor(4)
-
-        # borrow wa_layer_indexes for vit_merger insertion point
-        insert_layer_id = int(self.global_config.get(
-            "insert_layer_id", self.hparams_vision.get("insert_layer_id", 6)))
-        self.gguf_writer.add_vision_wa_layer_indexes([insert_layer_id])
-
-        # SigLIP vision body uses gelu_pytorch_tanh, which matches ggml_gelu (tanh approx).
-        self.gguf_writer.add_vision_use_gelu(True)
-        self.gguf_writer.add_vision_attention_layernorm_eps(
-            self.hparams_vision.get("layer_norm_eps", 1e-6))
-
-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, gen = item
-
-        # lm_head / MTP -> belong to the LM file
-        if name.startswith(("lm_head.", "mtp")):
-            return None
-
-        return super().filter_tensors(item)
-
-
 @ModelBase.register("GPT2LMHeadModel")
 class GPT2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.GPT2
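Background on the removed mixin: `mrope_section` partitions the rotary frequency pairs of interleaved M-RoPE among the temporal/height/width position components, so its entries are expected to sum to half the attention head size. A minimal sketch of that invariant (the head size of 64 is an assumption for illustration, not taken from the configs above):

```python
head_dim = 64                    # assumed head size, for illustration only
mrope_section = [11, 11, 10, 0]  # temporal, height, width, unused tail

# each entry claims that many rotary frequency pairs for one position component
assert sum(mrope_section) == head_dim // 2
```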
@@ -10783,7 +10695,7 @@ def prepare_tensors(self):
             raise ValueError(f"Unprocessed experts: {experts}")


-@ModelBase.register("GraniteForCausalLM", "GraniteSpeechForConditionalGeneration")
+@ModelBase.register("GraniteForCausalLM")
 class GraniteModel(LlamaModel):
     """Conversion for IBM's GraniteForCausalLM"""
     model_arch = gguf.MODEL_ARCH.GRANITE
@@ -10816,13 +10728,6 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_logit_scale(logits_scale)
             logger.info("gguf: (granite) logits_scale = %s", logits_scale)

-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, gen = item
-        if name.startswith("encoder."):
-            return None
-        return super().filter_tensors(item)
-

 @ModelBase.register("GraniteMoeForCausalLM", "GraniteMoeSharedForCausalLM")
 class GraniteMoeModel(GraniteModel):
@@ -12676,89 +12581,6 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
         return super().filter_tensors(item)


-@ModelBase.register("GraniteSpeechForConditionalGeneration")
-class GraniteSpeechMmprojModel(MmprojModel):
-    has_vision_encoder = False
-    has_audio_encoder = True
-
-    _batch_norm_tensors: list[dict[str, Tensor]] | None = None
-
-    def get_audio_config(self) -> dict[str, Any] | None:
-        return self.global_config.get("encoder_config")
-
-    def set_gguf_parameters(self):
-        assert self.hparams_audio is not None
-        a = self.hparams_audio
-        a["hidden_size"] = a["hidden_dim"]
-        a["intermediate_size"] = a["hidden_dim"] * a["feedforward_mult"]
-        a["num_attention_heads"] = a["num_heads"]
-        a["num_hidden_layers"] = a["num_layers"]
-
-        super().set_gguf_parameters()
-
-        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GRANITE_SPEECH)
-        self.gguf_writer.add_audio_num_mel_bins(a["input_dim"])
-        self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
-        self.gguf_writer.add_audio_chunk_size(a["context_size"])
-        self.gguf_writer.add_audio_conv_kernel_size(a["conv_kernel_size"])
-        self.gguf_writer.add_audio_max_pos_emb(a["max_pos_emb"])
-
-        p = self.global_config
-        self.gguf_writer.add_audio_projector_window_size(p["window_size"])
-        self.gguf_writer.add_audio_projector_downsample_rate(p["downsample_rate"])
-        self.gguf_writer.add_audio_projector_head_count(p["projector_config"]["num_attention_heads"])
-
-    def tensor_force_quant(self, name, new_name, bid, n_dims):
-        if "encoder" in name or "projector" in name:
-            if ".conv" in name and ".weight" in name:
-                return gguf.GGMLQuantizationType.F32
-        return super().tensor_force_quant(name, new_name, bid, n_dims)
-
-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, gen = item
-        if "attention_dists" in name or "num_batches_tracked" in name:
-            return None
-        return super().filter_tensors(item)
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # fold running_mean, running_var and eps into weight and bias for batch_norm
-        if "batch_norm" in name and "encoder.layers." in name:
-            if self._batch_norm_tensors is None:
-                self._batch_norm_tensors = [{} for _ in range(self.block_count)]
-            assert bid is not None
-            self._batch_norm_tensors[bid][name] = data_torch
-            if len(self._batch_norm_tensors[bid]) < 4:
-                return
-            prefix = f"encoder.layers.{bid}.conv.batch_norm"
-            weight = self._batch_norm_tensors[bid][f"{prefix}.weight"]
-            bias = self._batch_norm_tensors[bid][f"{prefix}.bias"]
-            running_mean = self._batch_norm_tensors[bid][f"{prefix}.running_mean"]
-            running_var = self._batch_norm_tensors[bid][f"{prefix}.running_var"]
-            eps = 1e-5
-            a = weight / torch.sqrt(running_var + eps)
-            b = bias - running_mean * a
-            yield from super().modify_tensors(a, f"encoder.layers.{bid}.conv.batch_norm.weight", bid)
-            yield from super().modify_tensors(b, f"encoder.layers.{bid}.conv.batch_norm.bias", bid)
-            return
-
-        if ".attn.to_kv.weight" in name:
-            k_weight, v_weight = data_torch.chunk(2, dim=0)
-            yield from super().modify_tensors(k_weight, name.replace("to_kv", "to_k"), bid)
-            yield from super().modify_tensors(v_weight, name.replace("to_kv", "to_v"), bid)
-            return
-
-        if ("up_conv" in name or "down_conv" in name) and name.endswith(".weight"):
-            if data_torch.ndim == 3 and data_torch.shape[2] == 1:
-                data_torch = data_torch.squeeze(2)
-
-        if "depth_conv" in name and name.endswith(".weight"):
-            if data_torch.ndim == 3 and data_torch.shape[1] == 1:
-                data_torch = data_torch.squeeze(1)
-
-        yield from super().modify_tensors(data_torch, name, bid)
-
-
 @ModelBase.register("Lfm25AudioTokenizer")
 class LFM25AudioTokenizer(LFM2Model):
     model_arch = gguf.MODEL_ARCH.LFM2
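The batch-norm folding in the deleted `modify_tensors` rests on the identity `bn(x) = a * x + b` with `a = weight / sqrt(running_var + eps)` and `b = bias - running_mean * a`, which lets inference-mode batch norm be stored as a plain affine pair. A self-contained check of that algebra (shapes and values are arbitrary):

```python
import torch

c = 8  # arbitrary channel count
x = torch.randn(4, c)
weight, bias = torch.randn(c), torch.randn(c)
running_mean, running_var = torch.randn(c), torch.rand(c) + 0.1
eps = 1e-5

# inference-mode batch norm, written out explicitly
bn = (x - running_mean) / torch.sqrt(running_var + eps) * weight + bias

# folded affine form produced by the converter
a = weight / torch.sqrt(running_var + eps)
b = bias - running_mean * a
assert torch.allclose(bn, x * a + b, atol=1e-5)
```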
@@ -13551,6 +13373,26 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         yield from super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("Sarashina2VisionForCausalLM")
+class Sarashina2VLTextModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.LLAMA
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        if name.startswith("llm."):
+            name = name.replace("llm.", "", 1)
+        elif name.startswith("norm.") or name.startswith("visual."):
+            return
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("Sarashina2VisionForCausalLM")
+class Sarashina2VLVisionModel(Qwen2VLVisionModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.global_config['model_type'] = "qwen2_vl"
+
+
 ###### CONVERSION LOGIC ######
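The added text class only has to strip the `llm.` wrapper prefix so the remaining names line up with a plain Llama checkpoint, while the `norm.`/`visual.` tensors belonging to the vision tower are dropped and handled by the mmproj class below it. A quick illustration of the renaming (the tensor name is hypothetical):

```python
name = "llm.model.layers.0.self_attn.q_proj.weight"  # hypothetical wrapped name
assert name.replace("llm.", "", 1) == "model.layers.0.self_attn.q_proj.weight"
```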
@@ -13807,7 +13649,7 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str:
     # Step3-VL keeps text config under text_config but uses a custom top-level architecture.
     # For text conversion we route to a dedicated text-only class.
     # TODO: refactor this later to avoid adding exception here
-    if model_type == ModelType.TEXT and arch == "StepVLForConditionalGeneration":
+    if model_type == ModelType.TEXT and arch in ("StepVLForConditionalGeneration", "Sarashina2VisionForCausalLM"):
         return arch

     # if "architectures" is found in the sub-config, use that instead
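A simplified sketch of the branch being widened here (a stand-in, not the file's exact logic): for most multimodal checkpoints a text-only conversion descends into `text_config` to find the architecture, but Sarashina2Vision, like Step3-VL, keeps its text weights under the top-level architecture and must short-circuit before that lookup:

```python
def route_text_arch(arch: str, hparams: dict) -> str:
    # simplified stand-in for get_model_architecture's TEXT-mode routing
    if arch in ("StepVLForConditionalGeneration", "Sarashina2VisionForCausalLM"):
        return arch  # these own their text weights at the top level
    sub = hparams.get("text_config", {})
    return sub.get("architectures", [arch])[0]
```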