@@ -2810,7 +2810,6 @@ def prepare_tensors(self):
     "LlavaForConditionalGeneration",
     "VoxtralForConditionalGeneration",
     "IQuestCoderForCausalLM",
-    "Sarashina2VisionForCausalLM",
     "LlamaModel")
 class LlamaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
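
The hunk above relies on the converter's decorator registry: each string passed to `ModelBase.register` is an HF `architectures` entry bound to one converter class per model type (note the old vision class below sets `model_type = ModelType.MMPROJ` explicitly). A minimal sketch of that pattern, with hypothetical internals, assuming only the `@ModelBase.register(...)` convention visible in this diff:

# Toy sketch of the registry implied by the decorators in this diff:
# architecture name -> converter class, bucketed by model type so the same
# HF name can have both a text and an mmproj converter. Internals are
# hypothetical, not llama.cpp's actual implementation.
from enum import Enum


class ModelType(Enum):
    TEXT = "text"
    MMPROJ = "mmproj"


class ModelBase:
    model_type = ModelType.TEXT
    _registry: dict[ModelType, dict[str, type]] = {t: {} for t in ModelType}

    @classmethod
    def register(cls, *names: str):
        def wrap(klass: type) -> type:
            for name in names:
                cls._registry[klass.model_type][name] = klass
            return klass
        return wrap

With a two-level registry like this, `"Sarashina2VisionForCausalLM"` can stay bound to an mmproj converter (second hunk) while its text-side binding is dropped from `LlamaModel`, as the hunk above does.
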
@@ -13124,56 +13123,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
 
 
 @ModelBase.register("Sarashina2VisionForCausalLM")
-class Sarashina2VLVisionModel(MmprojModel):
-    model_type = ModelType.MMPROJ
-
+class Sarashina2VLVisionModel(Qwen2VLVisionModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        assert self.hparams_vision is not None
-        self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560)
-        # rename config.json values
-        self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads")
-        self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth")
-        if "embed_dim" in self.hparams_vision:  # qwen2vl
-            self.hparams_vision["intermediate_size"] = self.hparams_vision.get("hidden_size")
-            self.hparams_vision["hidden_size"] = self.hparams_vision.get("embed_dim")
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL)
-        self.gguf_writer.add_vision_spatial_merge_size(2)
-        self.gguf_writer.add_vision_attention_layernorm_eps(self.global_config.get("rms_norm_eps", 1e-6))
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        assert self.hparams_vision is not None
-        # Two tensors will be ignored
-        # if name in ('norm.weight', 'norm.bias'):
-        #     name = "visual.post_layer" + name
-        if name.startswith("visual."):
-            # process visual tensors
-            # split QKV tensors if needed
-            if ".qkv." in name:
-                if data_torch.ndim == 2:  # weight
-                    c3, _ = data_torch.shape
-                else:  # bias
-                    c3 = data_torch.shape[0]
-                assert c3 % 3 == 0
-                c = c3 // 3
-                wq = data_torch[:c]
-                wk = data_torch[c: c * 2]
-                wv = data_torch[c * 2:]
-                yield from super().modify_tensors(wq, name.replace("qkv", "q"), bid)
-                yield from super().modify_tensors(wk, name.replace("qkv", "k"), bid)
-                yield from super().modify_tensors(wv, name.replace("qkv", "v"), bid)
-            elif 'patch_embed.proj.weight' in name:
-                # split Conv3D into Conv2Ds
-                c1, c2, kt, kh, kw = data_torch.shape
-                del c1, c2, kh, kw  # unused
-                assert kt == 2, "Current implementation only support temporal_patch_size of 2"
-                yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight", data_torch[:, :, 0, ...])
-                yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...])
-            else:
-                yield from super().modify_tensors(data_torch, name, bid)
+        self.global_config['model_type'] = "qwen2_vl"
 
 
 ###### CONVERSION LOGIC ######
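
The tensor surgery deleted in the second hunk does not disappear: the new one-line subclass reassigns `global_config['model_type']` to `"qwen2_vl"` so the inherited `Qwen2VLVisionModel` paths treat the Sarashina2 checkpoint as a Qwen2-VL-style vision tower, which presumably also covers the hparams renaming (`num_heads` -> `num_attention_heads`, `depth` -> `num_hidden_layers`, the `embed_dim` case) that the old `__init__` did by hand. For reference, the two splits the removed `modify_tensors` implemented, reproduced standalone (shapes are illustrative, not taken from the model):

# Standalone reproduction of the two splits the deleted modify_tensors
# performed; the refactor assumes Qwen2VLVisionModel now provides both.
import torch


def split_qkv(qkv: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    # fused QKV weight/bias: Q, K, V stacked along dim 0 -> equal thirds
    c3 = qkv.shape[0]
    assert c3 % 3 == 0
    c = c3 // 3
    return qkv[:c], qkv[c:c * 2], qkv[c * 2:]


def split_patch_embed(w: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    # Conv3D patch embedding (out, in, t, kh, kw) with temporal_patch_size == 2
    # -> one Conv2D weight per temporal slice
    assert w.shape[2] == 2, "only temporal_patch_size == 2 was supported"
    return w[:, :, 0, ...], w[:, :, 1, ...]


qkv_w = torch.randn(3 * 1280, 1280)        # e.g. visual.blocks.0.attn.qkv.weight
q, k, v = split_qkv(qkv_w)                 # three (1280, 1280) tensors
patch_w = torch.randn(1280, 3, 2, 14, 14)  # e.g. visual.patch_embed.proj.weight
w0, w1 = split_patch_embed(patch_w)        # two (1280, 3, 14, 14) tensors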