samuraieng/sarashina22-00 Update to follow review comments

samuraieng · samuraieng · commit ef4613b9d00b · 2026-04-23T07:24:52.000+09:00
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
@@ -2810,6 +2810,7 @@ def prepare_tensors(self):
     "LlavaForConditionalGeneration",
     "VoxtralForConditionalGeneration",
     "IQuestCoderForCausalLM",
+    "Sarashina2VisionForCausalLM",
     "LlamaModel")
 class LlamaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
@@ -2955,13 +2956,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             else:
                 return
 
-        if self.origin_hf_arch.startswith('Sarashina2VisionForCausalLM'):
-            # Remove llm. from name 
-            if name.startswith("llm."):
-                name = name[len("llm."):]
-            elif name.startswith("visual.") or name in ("norm.weight", "norm.bias"):
-                return  #Skip processing "modify_tensors"
-
         yield from super().modify_tensors(data_torch, name, bid)
 
     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
@@ -13117,6 +13111,74 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             yield from super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register("Sarashina2VisionForCausalLM")
+class Sarashina2VLTextModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.LLAMA
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
+        self.origin_hf_arch = hparams.get('architectures', [None])[0]
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        if name.startswith("llm."):
+            name = name[len("llm."):]
+            yield from super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("Sarashina2VisionForCausalLM")
+class Sarashina2VLVisionModel(MmprojModel):
+    model_type = ModelType.MMPROJ
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560)
+        # rename config.json values
+        self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads")
+        self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth")
+        if "embed_dim" in self.hparams_vision: # qwen2vl
+            self.hparams_vision["intermediate_size"] = self.hparams_vision.get("hidden_size")
+            self.hparams_vision["hidden_size"] = self.hparams_vision.get("embed_dim")
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL)
+        self.gguf_writer.add_vision_spatial_merge_size(2)
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.global_config.get("rms_norm_eps", 1e-6))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        assert self.hparams_vision is not None
+        # Two tensors will be ignored
+        #if name in ('norm.weight', 'norm.bias'):
+        #    name = "visual.post_layer" + name
+        if name.startswith("visual."):
+            # process visual tensors
+            # split QKV tensors if needed
+            if ".qkv." in name:
+                if data_torch.ndim == 2: # weight
+                    c3, _ = data_torch.shape
+                else: # bias
+                    c3 = data_torch.shape[0]
+                assert c3 % 3 == 0
+                c = c3 // 3
+                wq = data_torch[:c]
+                wk = data_torch[c: c * 2]
+                wv = data_torch[c * 2:]
+                yield from super().modify_tensors(wq, name.replace("qkv", "q"), bid)
+                yield from super().modify_tensors(wk, name.replace("qkv", "k"), bid)
+                yield from super().modify_tensors(wv, name.replace("qkv", "v"), bid)
+            elif 'patch_embed.proj.weight' in name:
+                # split Conv3D into Conv2Ds
+                c1, c2, kt, kh, kw = data_torch.shape
+                del c1, c2, kh, kw  # unused
+                assert kt == 2, "Current implementation only support temporal_patch_size of 2"
+                yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight"  , data_torch[:, :, 0, ...])
+                yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...])
+            else:
+                yield from super().modify_tensors(data_torch, name, bid)
+
+
 ###### CONVERSION LOGIC ######
 
 
@@ -13374,14 +13436,14 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
     # TODO: refactor this later to avoid adding exception here
     if model_type == ModelType.TEXT and arch == "StepVLForConditionalGeneration":
         return arch
+    if model_type == ModelType.TEXT and arch == "Sarashina2VisionForCausalLM":
+        return "Sarashina2VisionForCausalLM"
 
     # if "architectures" is found in the sub-config, use that instead
     if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
         arch = text_config["architectures"][0]
     elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None:
         arch = vision_config["architectures"][0]
-    if "Sarashina2VisionForCausalLM" in arch:
-        arch = "Qwen2VLForConditionalGeneration"
     if arch is None:
         raise ValueError("Failed to detect model architecture")
     return arch