Commit b4f4b3c

apply diff
1 parent 97f06e9 commit b4f4b3c

1 file changed: convert_hf_to_gguf.py
Lines changed: 25 additions & 183 deletions
@@ -1064,7 +1064,7 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
 
         # Skip multimodal tensors
         if name.startswith(("mlp", "vit.", "vpm.", "siglip2.", "conformer.", "merger.", "resampler.", "sound_encoder.", "sound_projection.")) \
-                or "visual." in name or "vision." in name or "audio." in name or "talker." in name \
+                or "visual." in name or "audio." in name or "talker." in name \
                 or "vision_" in name or "audio_" in name or "sam_model" in name \
                 or "token2wav." in name or "code2wav." in name \
                 or "projector." in name or "pre_mm_projector_norm" in name \
@@ -1360,9 +1360,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
             # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
             res = "qwen2"
-        if chkhsh == "1444df51289cfa8063b96f0e62b1125440111bc79a52003ea14b6eac7016fd5f":
-            # ref: https://huggingface.co/openbmb/MiniCPM-V-4_6
-            res = "qwen35"
         if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273":
             # ref: https://huggingface.co/alvarobartt/grok-2-tokenizer
             res = "grok-2"
@@ -5502,101 +5499,16 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         yield from super().modify_tensors(data_torch, name, bid)
 
 
-class _Qwen35MRopeMixin:
-    # Qwen3.5 always applies interleaved MRoPE (see Qwen3_5RotaryEmbedding in transformers);
-    # the upstream default mrope_section is [11, 11, 10] and llama.cpp's QWEN35 / QWEN35MOE
-    # loaders treat qwen35.rope.dimension_sections as required, so make sure it is always
-    # written even when a particular checkpoint omits the field in `rope_parameters`.
-    _QWEN35_DEFAULT_MROPE_SECTION = [11, 11, 10, 0]
-
-    gguf_writer: gguf.GGUFWriter
-    rope_parameters: dict
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()  # ty: ignore[unresolved-attribute]
-        if "mrope_section" not in self.rope_parameters:
-            self.gguf_writer.add_rope_dimension_sections(self._QWEN35_DEFAULT_MROPE_SECTION)
-
-
 @ModelBase.register("Qwen3_5ForConditionalGeneration", "Qwen3_5ForCausalLM")
-class Qwen3_5TextModel(_Qwen35MRopeMixin, _LinearAttentionVReorderBase):
+class Qwen3_5TextModel(_LinearAttentionVReorderBase):
     model_arch = gguf.MODEL_ARCH.QWEN35
 
 
 @ModelBase.register("Qwen3_5MoeForConditionalGeneration", "Qwen3_5MoeForCausalLM")
-class Qwen3_5MoeTextModel(_Qwen35MRopeMixin, _LinearAttentionVReorderBase):
+class Qwen3_5MoeTextModel(_LinearAttentionVReorderBase):
     model_arch = gguf.MODEL_ARCH.QWEN35MOE
 
 
-# MiniCPM-V 4.6: text tower is Qwen3.5 (linear+full hybrid attention) wrapped under
-# `model.language_model.*`; vision tower is SigLIP + a window-attention ViT merger
-# + a final DownsampleMLP merger. The same HF arch is registered twice below: once as
-# the LM (text mode) and once as the mmproj (vision mode), mirroring the Qwen3-VL setup.
-
-@ModelBase.register("MiniCPMV4_6ForConditionalGeneration")
-class MiniCPMV4_6TextModel(Qwen3_5TextModel):
-    model_arch = gguf.MODEL_ARCH.QWEN35
-
-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, gen = item
-
-        if name.startswith("model.merger."):
-            return None
-        # MTP tensors are not used at inference yet; align with Qwen3Next behaviour
-        if name.startswith("mtp"):
-            return None
-
-        return super().filter_tensors(item)
-
-
-@ModelBase.register("MiniCPMV4_6ForConditionalGeneration")
-class MiniCPMV4_6VisionModel(MmprojModel):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        if self.hparams_vision is not None:
-            # In MiniCPM-V 4.6 `vision_config.image_size` (980) describes the SigLIP
-            # positional embedding bucket grid (70 x 70), while the per-slice processing
-            # resolution is the preprocessor's `scale_resolution` (typically 448).
-            # The CLIP loader in tools/mtmd/clip.cpp consumes `clip.vision.image_size`
-            # as the slice size and warmup resolution, so report `scale_resolution` there
-            # to match the upstream MiniCPMV4_6ImageProcessorPil slicing rules.
-            scale_resolution = self.preprocessor_config.get("scale_resolution")
-            if scale_resolution is not None:
-                self.hparams_vision["image_size"] = int(scale_resolution)
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        assert self.hparams_vision is not None
-
-        # projector type string is consumed by clip_projector_type_from_string() in clip.cpp
-        # (mapped to PROJECTOR_TYPE_MINICPMV4_6).
-        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MINICPMV4_6)
-
-        # ViT merger 2x2 + final merger 2x2 = 4x spatial merge per dimension; used for slice alignment
-        self.gguf_writer.add_vision_projector_scale_factor(4)
-
-        # borrow wa_layer_indexes for vit_merger insertion point
-        insert_layer_id = int(self.global_config.get(
-            "insert_layer_id", self.hparams_vision.get("insert_layer_id", 6)))
-        self.gguf_writer.add_vision_wa_layer_indexes([insert_layer_id])
-
-        # SigLIP vision body uses gelu_pytorch_tanh, which matches ggml_gelu (tanh approx).
-        self.gguf_writer.add_vision_use_gelu(True)
-        self.gguf_writer.add_vision_attention_layernorm_eps(
-            self.hparams_vision.get("layer_norm_eps", 1e-6))
-
-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, gen = item
-
-        # lm_head / MTP -> belong to the LM file
-        if name.startswith(("lm_head.", "mtp")):
-            return None
-
-        return super().filter_tensors(item)
-
-
 @ModelBase.register("GPT2LMHeadModel")
 class GPT2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.GPT2
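
Note: with the mixin removed, the converter no longer injects the default [11, 11, 10, 0] dimension sections, so a Qwen3.5 checkpoint is expected to carry mrope_section itself. A hypothetical stand-alone sanity check, not part of the converter; the file path and the rope_scaling fallback key are illustrative assumptions.

import json

with open("config.json") as f:                      # path is illustrative
    cfg = json.load(f)

rope = cfg.get("rope_parameters") or cfg.get("rope_scaling") or {}
if "mrope_section" not in rope:
    print("warning: no mrope_section found; the QWEN35 / QWEN35MOE loaders "
          "treat qwen35.rope.dimension_sections as required")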
@@ -10783,7 +10695,7 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
-@ModelBase.register("GraniteForCausalLM", "GraniteSpeechForConditionalGeneration")
+@ModelBase.register("GraniteForCausalLM")
 class GraniteModel(LlamaModel):
     """Conversion for IBM's GraniteForCausalLM"""
     model_arch = gguf.MODEL_ARCH.GRANITE
@@ -10816,13 +10728,6 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_logit_scale(logits_scale)
             logger.info("gguf: (granite) logits_scale = %s", logits_scale)
 
-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, gen = item
-        if name.startswith("encoder."):
-            return None
-        return super().filter_tensors(item)
-
 
 @ModelBase.register("GraniteMoeForCausalLM", "GraniteMoeSharedForCausalLM")
 class GraniteMoeModel(GraniteModel):
@@ -12676,89 +12581,6 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
         return super().filter_tensors(item)
 
 
-@ModelBase.register("GraniteSpeechForConditionalGeneration")
-class GraniteSpeechMmprojModel(MmprojModel):
-    has_vision_encoder = False
-    has_audio_encoder = True
-
-    _batch_norm_tensors: list[dict[str, Tensor]] | None = None
-
-    def get_audio_config(self) -> dict[str, Any] | None:
-        return self.global_config.get("encoder_config")
-
-    def set_gguf_parameters(self):
-        assert self.hparams_audio is not None
-        a = self.hparams_audio
-        a["hidden_size"] = a["hidden_dim"]
-        a["intermediate_size"] = a["hidden_dim"] * a["feedforward_mult"]
-        a["num_attention_heads"] = a["num_heads"]
-        a["num_hidden_layers"] = a["num_layers"]
-
-        super().set_gguf_parameters()
-
-        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GRANITE_SPEECH)
-        self.gguf_writer.add_audio_num_mel_bins(a["input_dim"])
-        self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
-        self.gguf_writer.add_audio_chunk_size(a["context_size"])
-        self.gguf_writer.add_audio_conv_kernel_size(a["conv_kernel_size"])
-        self.gguf_writer.add_audio_max_pos_emb(a["max_pos_emb"])
-
-        p = self.global_config
-        self.gguf_writer.add_audio_projector_window_size(p["window_size"])
-        self.gguf_writer.add_audio_projector_downsample_rate(p["downsample_rate"])
-        self.gguf_writer.add_audio_projector_head_count(p["projector_config"]["num_attention_heads"])
-
-    def tensor_force_quant(self, name, new_name, bid, n_dims):
-        if "encoder" in name or "projector" in name:
-            if ".conv" in name and ".weight" in name:
-                return gguf.GGMLQuantizationType.F32
-        return super().tensor_force_quant(name, new_name, bid, n_dims)
-
-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, gen = item
-        if "attention_dists" in name or "num_batches_tracked" in name:
-            return None
-        return super().filter_tensors(item)
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # fold running_mean, running_var and eps into weight and bias for batch_norm
-        if "batch_norm" in name and "encoder.layers." in name:
-            if self._batch_norm_tensors is None:
-                self._batch_norm_tensors = [{} for _ in range(self.block_count)]
-            assert bid is not None
-            self._batch_norm_tensors[bid][name] = data_torch
-            if len(self._batch_norm_tensors[bid]) < 4:
-                return
-            prefix = f"encoder.layers.{bid}.conv.batch_norm"
-            weight = self._batch_norm_tensors[bid][f"{prefix}.weight"]
-            bias = self._batch_norm_tensors[bid][f"{prefix}.bias"]
-            running_mean = self._batch_norm_tensors[bid][f"{prefix}.running_mean"]
-            running_var = self._batch_norm_tensors[bid][f"{prefix}.running_var"]
-            eps = 1e-5
-            a = weight / torch.sqrt(running_var + eps)
-            b = bias - running_mean * a
-            yield from super().modify_tensors(a, f"encoder.layers.{bid}.conv.batch_norm.weight", bid)
-            yield from super().modify_tensors(b, f"encoder.layers.{bid}.conv.batch_norm.bias", bid)
-            return
-
-        if ".attn.to_kv.weight" in name:
-            k_weight, v_weight = data_torch.chunk(2, dim=0)
-            yield from super().modify_tensors(k_weight, name.replace("to_kv", "to_k"), bid)
-            yield from super().modify_tensors(v_weight, name.replace("to_kv", "to_v"), bid)
-            return
-
-        if ("up_conv" in name or "down_conv" in name) and name.endswith(".weight"):
-            if data_torch.ndim == 3 and data_torch.shape[2] == 1:
-                data_torch = data_torch.squeeze(2)
-
-        if "depth_conv" in name and name.endswith(".weight"):
-            if data_torch.ndim == 3 and data_torch.shape[1] == 1:
-                data_torch = data_torch.squeeze(1)
-
-        yield from super().modify_tensors(data_torch, name, bid)
-
-
 @ModelBase.register("Lfm25AudioTokenizer")
 class LFM25AudioTokenizer(LFM2Model):
     model_arch = gguf.MODEL_ARCH.LFM2
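
Note: the removed GraniteSpeech modify_tensors folds BatchNorm statistics into a single affine map: since BN(x) = weight * (x - mean) / sqrt(var + eps) + bias, it equals a*x + b with a = weight / sqrt(var + eps) and b = bias - mean * a. A quick self-contained check of that identity (shapes and values are arbitrary):

import torch

c = 8
x = torch.randn(4, c)
weight, bias = torch.randn(c), torch.randn(c)
running_mean, running_var = torch.randn(c), torch.rand(c) + 0.1
eps = 1e-5

bn = weight * (x - running_mean) / torch.sqrt(running_var + eps) + bias

a = weight / torch.sqrt(running_var + eps)   # folded scale
b = bias - running_mean * a                  # folded shift

assert torch.allclose(bn, a * x + b, atol=1e-6)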
@@ -13551,6 +13373,26 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         yield from super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register("Sarashina2VisionForCausalLM")
+class Sarashina2VLTextModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.LLAMA
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        if name.startswith("llm."):
+            name = name.replace("llm.", "", 1)
+        elif name.startswith("norm.") or name.startswith("visual."):
+            return
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("Sarashina2VisionForCausalLM")
+class Sarashina2VLVisionModel(Qwen2VLVisionModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.global_config['model_type'] = "qwen2_vl"
+
+
 ###### CONVERSION LOGIC ######
 
 
@@ -13807,7 +13649,7 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str:
     # Step3-VL keeps text config under text_config but uses a custom top-level architecture.
     # For text conversion we route to a dedicated text-only class.
     # TODO: refactor this later to avoid adding exception here
-    if model_type == ModelType.TEXT and arch == "StepVLForConditionalGeneration":
+    if model_type == ModelType.TEXT and arch in ("StepVLForConditionalGeneration", "Sarashina2VisionForCausalLM"):
         return arch
 
     # if "architectures" is found in the sub-config, use that instead
