@@ -859,7 +859,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
859859 } break ;
860860 case PROJECTOR_TYPE_QWEN2VL :
861861 case PROJECTOR_TYPE_QWEN25VL :
862- case PROJECTOR_TYPE_SARASHINA2VL_MERGER :
863862 {
864863 builder = std::make_unique<clip_graph_qwen2vl>(ctx, img);
865864 } break ;
@@ -1363,19 +1362,6 @@ struct clip_model_loader {
13631362 LOG_WRN (" %s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n " , __func__);
13641363 }
13651364 } break ;
1366- case PROJECTOR_TYPE_SARASHINA2VL_MERGER :
1367- {
1368- hparams.n_merge = 2 ;
1369- hparams.image_resize_algo = RESIZE_ALGO_BILINEAR ;
1370- get_u32 (KEY_SPATIAL_MERGE_SIZE , hparams.n_merge , false );
1371- get_u32 (KEY_WIN_ATTN_PATTERN , hparams.n_wa_pattern , false );
1372- hparams.set_limit_image_tokens (8 , 4096 );
1373- hparams.set_warmup_n_tokens (46 *46 );
1374- const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size ;
1375- if (hparams.image_min_pixels < warn_min_pixels) {
1376- LOG_WRN (" %s: Sarashina2VL models require sufficient image tokens\n " , __func__);
1377- }
1378- } break ;
13791365 case PROJECTOR_TYPE_STEP3VL :
13801366 {
13811367 hparams.n_merge = 4 ; // two stride-2 downsamplers after patching
@@ -1708,7 +1694,6 @@ struct clip_model_loader {
17081694 || model.proj_type == PROJECTOR_TYPE_GEMMA3
17091695 || model.proj_type == PROJECTOR_TYPE_IDEFICS3
17101696 || model.proj_type == PROJECTOR_TYPE_MINICPMV
1711- || model.proj_type == PROJECTOR_TYPE_SARASHINA2VL_MERGER
17121697 ) && layer.ff_up_w && layer.ff_down_w && layer.ff_down_w ->ne [0 ] == hparams.n_embd ;
17131698 if (is_ffn_swapped) {
17141699 // swap up and down weights
@@ -1825,7 +1810,6 @@ struct clip_model_loader {
18251810 } break ;
18261811 case PROJECTOR_TYPE_QWEN2VL :
18271812 case PROJECTOR_TYPE_QWEN25VL :
1828- case PROJECTOR_TYPE_SARASHINA2VL_MERGER :
18291813 {
18301814 model.mm_0_w = get_tensor (string_format (TN_LLAVA_PROJ , 0 , " weight" ));
18311815 model.mm_0_b = get_tensor (string_format (TN_LLAVA_PROJ , 0 , " bias" ));
@@ -2814,7 +2798,6 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
28142798 case PROJECTOR_TYPE_PADDLEOCR :
28152799 case PROJECTOR_TYPE_HUNYUANOCR :
28162800 case PROJECTOR_TYPE_YOUTUVL :
2817- case PROJECTOR_TYPE_SARASHINA2VL_MERGER :
28182801 return (img->nx / params.patch_size ) / 2 ;
28192802 case PROJECTOR_TYPE_STEP3VL :
28202803 return img->nx / (params.patch_size * params.n_merge );
@@ -2834,7 +2817,6 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
28342817 case PROJECTOR_TYPE_GLM4V :
28352818 case PROJECTOR_TYPE_PADDLEOCR :
28362819 case PROJECTOR_TYPE_YOUTUVL :
2837- case PROJECTOR_TYPE_SARASHINA2VL_MERGER :
28382820 return (img->ny / params.patch_size ) / 2 ;
28392821 case PROJECTOR_TYPE_STEP3VL :
28402822 return img->ny / (params.patch_size * params.n_merge );
@@ -2902,7 +2884,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
29022884 case PROJECTOR_TYPE_QWEN3VL :
29032885 case PROJECTOR_TYPE_GLM4V :
29042886 case PROJECTOR_TYPE_YOUTUVL :
2905- case PROJECTOR_TYPE_SARASHINA2VL_MERGER :
29062887 {
29072888 // dynamic size (2 conv, so double patch size)
29082889 int x_patch = img->nx / (params.patch_size * 2 );
@@ -3298,7 +3279,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
32983279 } break ;
32993280 case PROJECTOR_TYPE_QWEN25VL :
33003281 case PROJECTOR_TYPE_YOUTUVL :
3301- case PROJECTOR_TYPE_SARASHINA2VL_MERGER :
33023282 {
33033283 // pw * ph = number of tokens output by ViT after apply patch merger
33043284 // ipw * ipw = number of vision token been processed inside ViT
@@ -3676,7 +3656,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
36763656 case PROJECTOR_TYPE_QWEN25VL :
36773657 case PROJECTOR_TYPE_JANUS_PRO :
36783658 case PROJECTOR_TYPE_YOUTUVL :
3679- case PROJECTOR_TYPE_SARASHINA2VL_MERGER :
36803659 return ctx->model .mm_1_b ->ne [0 ];
36813660 case PROJECTOR_TYPE_QWEN3VL :
36823661 // main path + deepstack paths
0 commit comments