Skip to content

Commit 5905b0a

Browse files
committed
samuraieng/sarashina22vl-00, update to follow review results
1 parent cda2785 commit 5905b0a

6 files changed

Lines changed: 4 additions & 29 deletions

File tree

convert_hf_to_gguf.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4200,7 +4200,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
42004200
yield from super().modify_tensors(data_torch, name, bid)
42014201

42024202

4203-
@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration", "Sarashina2VisionForCausalLM")
4203+
@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
42044204
class Qwen2VLVisionModel(MmprojModel):
42054205
def __init__(self, *args, **kwargs):
42064206
super().__init__(*args, **kwargs)
@@ -4236,7 +4236,7 @@ def set_gguf_parameters(self):
42364236
raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}")
42374237
self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern)
42384238
elif model_type == "sarashina2_vision":
4239-
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.SARASHINA2VL)
4239+
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL)
42404240
self.gguf_writer.add_vision_spatial_merge_size(2)
42414241
else:
42424242
raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}")
@@ -13380,7 +13380,8 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
1338013380
arch = text_config["architectures"][0]
1338113381
elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None:
1338213382
arch = vision_config["architectures"][0]
13383-
13383+
if "Sarashina2VisionForCausalLM" in arch:
13384+
arch = "Qwen2VLForConditionalGeneration"
1338413385
if arch is None:
1338513386
raise ValueError("Failed to detect model architecture")
1338613387
return arch

gguf-py/gguf/constants.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4138,7 +4138,6 @@ class VisionProjectorType:
41384138
YOUTUVL = "youtuvl"
41394139
NEMOTRON_V2_VL = "nemotron_v2_vl"
41404140
HUNYUANOCR = "hunyuanocr"
4141-
SARASHINA2VL = "sarashina2vl_merger"
41424141

41434142

41444143
# Items here are (block size, type size)

tools/mtmd/clip-impl.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -293,7 +293,6 @@ enum projector_type {
293293
PROJECTOR_TYPE_KIMIK25,
294294
PROJECTOR_TYPE_NEMOTRON_V2_VL,
295295
PROJECTOR_TYPE_HUNYUANOCR,
296-
PROJECTOR_TYPE_SARASHINA2VL_MERGER,
297296
PROJECTOR_TYPE_UNKNOWN,
298297
};
299298

@@ -339,7 +338,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
339338
{ PROJECTOR_TYPE_KIMIK25, "kimik25"},
340339
{ PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
341340
{ PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"},
342-
{ PROJECTOR_TYPE_SARASHINA2VL_MERGER, "sarashina2vl_merger"}
343341
};
344342

345343
static projector_type clip_projector_type_from_string(const std::string & str) {

tools/mtmd/clip.cpp

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -859,7 +859,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
859859
} break;
860860
case PROJECTOR_TYPE_QWEN2VL:
861861
case PROJECTOR_TYPE_QWEN25VL:
862-
case PROJECTOR_TYPE_SARASHINA2VL_MERGER:
863862
{
864863
builder = std::make_unique<clip_graph_qwen2vl>(ctx, img);
865864
} break;
@@ -1363,19 +1362,6 @@ struct clip_model_loader {
13631362
LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
13641363
}
13651364
} break;
1366-
case PROJECTOR_TYPE_SARASHINA2VL_MERGER:
1367-
{
1368-
hparams.n_merge = 2;
1369-
hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
1370-
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
1371-
get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, false);
1372-
hparams.set_limit_image_tokens(8, 4096);
1373-
hparams.set_warmup_n_tokens(46*46);
1374-
const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size;
1375-
if (hparams.image_min_pixels < warn_min_pixels) {
1376-
LOG_WRN("%s: Sarashina2VL models require sufficient image tokens\n", __func__);
1377-
}
1378-
} break;
13791365
case PROJECTOR_TYPE_STEP3VL:
13801366
{
13811367
hparams.n_merge = 4; // two stride-2 downsamplers after patching
@@ -1708,7 +1694,6 @@ struct clip_model_loader {
17081694
|| model.proj_type == PROJECTOR_TYPE_GEMMA3
17091695
|| model.proj_type == PROJECTOR_TYPE_IDEFICS3
17101696
|| model.proj_type == PROJECTOR_TYPE_MINICPMV
1711-
|| model.proj_type == PROJECTOR_TYPE_SARASHINA2VL_MERGER
17121697
) && layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd;
17131698
if (is_ffn_swapped) {
17141699
// swap up and down weights
@@ -1825,7 +1810,6 @@ struct clip_model_loader {
18251810
} break;
18261811
case PROJECTOR_TYPE_QWEN2VL:
18271812
case PROJECTOR_TYPE_QWEN25VL:
1828-
case PROJECTOR_TYPE_SARASHINA2VL_MERGER:
18291813
{
18301814
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
18311815
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
@@ -2814,7 +2798,6 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
28142798
case PROJECTOR_TYPE_PADDLEOCR:
28152799
case PROJECTOR_TYPE_HUNYUANOCR:
28162800
case PROJECTOR_TYPE_YOUTUVL:
2817-
case PROJECTOR_TYPE_SARASHINA2VL_MERGER:
28182801
return (img->nx / params.patch_size) / 2;
28192802
case PROJECTOR_TYPE_STEP3VL:
28202803
return img->nx / (params.patch_size * params.n_merge);
@@ -2834,7 +2817,6 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
28342817
case PROJECTOR_TYPE_GLM4V:
28352818
case PROJECTOR_TYPE_PADDLEOCR:
28362819
case PROJECTOR_TYPE_YOUTUVL:
2837-
case PROJECTOR_TYPE_SARASHINA2VL_MERGER:
28382820
return (img->ny / params.patch_size) / 2;
28392821
case PROJECTOR_TYPE_STEP3VL:
28402822
return img->ny / (params.patch_size * params.n_merge);
@@ -2902,7 +2884,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
29022884
case PROJECTOR_TYPE_QWEN3VL:
29032885
case PROJECTOR_TYPE_GLM4V:
29042886
case PROJECTOR_TYPE_YOUTUVL:
2905-
case PROJECTOR_TYPE_SARASHINA2VL_MERGER:
29062887
{
29072888
// dynamic size (2 conv, so double patch size)
29082889
int x_patch = img->nx / (params.patch_size * 2);
@@ -3298,7 +3279,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
32983279
} break;
32993280
case PROJECTOR_TYPE_QWEN25VL:
33003281
case PROJECTOR_TYPE_YOUTUVL:
3301-
case PROJECTOR_TYPE_SARASHINA2VL_MERGER:
33023282
{
33033283
// pw * ph = number of tokens output by the ViT after applying the patch merger
33043284
// ipw * ipw = number of vision tokens processed inside the ViT
@@ -3676,7 +3656,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
36763656
case PROJECTOR_TYPE_QWEN25VL:
36773657
case PROJECTOR_TYPE_JANUS_PRO:
36783658
case PROJECTOR_TYPE_YOUTUVL:
3679-
case PROJECTOR_TYPE_SARASHINA2VL_MERGER:
36803659
return ctx->model.mm_1_b->ne[0];
36813660
case PROJECTOR_TYPE_QWEN3VL:
36823661
// main path + deepstack paths

tools/mtmd/mtmd.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,6 @@ struct mtmd_context {
303303
case PROJECTOR_TYPE_QWEN2VL:
304304
case PROJECTOR_TYPE_QWEN25VL:
305305
case PROJECTOR_TYPE_QWEN3VL:
306-
case PROJECTOR_TYPE_SARASHINA2VL_MERGER:
307306
{
308307
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
309308
img_beg = "<|vision_start|>";

tools/mtmd/tests.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,6 @@ add_test_vision "ggml-org/DeepSeek-OCR-GGUF:Q8_0" -p "Free OCR." --chat-template
9292
add_test_vision "ggml-org/dots.ocr-GGUF:Q8_0" -p "OCR"
9393
add_test_vision "ggml-org/HunyuanOCR-GGUF:Q8_0" -p "OCR"
9494
add_test_vision "ggml-org/gemma-4-E2B-it-GGUF:Q8_0" --jinja
95-
add_test_vision "samuraieng/sarashina2.2-vision-3b-gguf:Q4_K_M"
9695

9796
add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"
9897
add_test_audio "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"

0 commit comments

Comments
 (0)