@@ -931,6 +931,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
931931 {
932932 builder = std::make_unique<clip_graph_conformer>(ctx, img);
933933 } break ;
934+ case PROJECTOR_TYPE_GEMMA4A:
935+ {
936+ builder = std::make_unique<clip_graph_gemma4a>(ctx, img);
937+ } break ;
934938 case PROJECTOR_TYPE_GLM4V:
935939 {
936940 builder = std::make_unique<clip_graph_glm4v>(ctx, img);
@@ -1459,6 +1463,16 @@ struct clip_model_loader {
14591463 hparams.audio_window_len = 400 ;
14601464 hparams.audio_hop_len = 160 ;
14611465 } break ;
1466+ case PROJECTOR_TYPE_GEMMA4A:
1467+ {
1468+ // Gemma4 feature_extraction_gemma4.py:
1469+ // frame_length_ms=20 -> 320 samples, n_fft=512, hop=10ms -> 160
1470+ hparams.audio_chunk_len = 0 ; // no fixed-length padding
1471+ hparams.audio_sample_rate = 16000 ;
1472+ hparams.audio_n_fft = 512 ;
1473+ hparams.audio_window_len = 320 ; // 20ms frame (NOT 25ms/400)
1474+ hparams.audio_hop_len = 160 ;
1475+ } break ;
14621476 case PROJECTOR_TYPE_JANUS_PRO:
14631477 {
14641478 hparams.image_pad_color = {127 , 127 , 127 };
@@ -1561,16 +1575,21 @@ struct clip_model_loader {
15611575 }
15621576
15631577 // helper function
1578+ std::unordered_set<std::string> loaded_tensor_names;
15641579 auto get_tensor = [&](const std::string & name, bool required = true ) {
1580+ // Each tensor should only be loaded once; duplicates indicate a bug
1581+ if (loaded_tensor_names.count (name)) {
1582+ throw std::runtime_error (string_format (" %s: tensor already loaded: %s\n " , __func__, name.c_str ()));
1583+ }
15651584 ggml_tensor * cur = ggml_get_tensor (ctx_meta.get (), name.c_str ());
15661585 if (!cur && required) {
15671586 throw std::runtime_error (string_format (" %s: unable to find tensor %s\n " , __func__, name.c_str ()));
15681587 }
15691588 if (cur) {
15701589 tensors_to_load.push_back (cur);
1571- // add tensors to context
15721590 ggml_tensor * data_tensor = ggml_dup_tensor (ctx_clip.ctx_data .get (), cur);
15731591 ggml_set_name (data_tensor, cur->name );
1592+ loaded_tensor_names.insert (name);
15741593 cur = data_tensor;
15751594 }
15761595 return cur;
@@ -2186,6 +2205,76 @@ struct clip_model_loader {
21862205 model.mm_fc_w = get_tensor (string_format (TN_MM_PROJECTOR, " weight" ));
21872206 model.mm_fc_b = get_tensor (string_format (TN_MM_PROJECTOR, " bias" ));
21882207 } break ;
2208+ case PROJECTOR_TYPE_GEMMA4A:
2209+ {
2210+ for (int i = 0 ; i < 2 ; i++) {
2211+ model.sscp_conv_w [i] = get_tensor (string_format (TN_A_CONV1D, i, " weight" ));
2212+ model.sscp_conv_b [i] = get_tensor (string_format (TN_A_CONV1D, i, " bias" ), false );
2213+ model.sscp_norm_w [i] = get_tensor (string_format (TN_A_CONV1D_NORM, i, " weight" ), false );
2214+ }
2215+ model.sscp_inp_proj_w = get_tensor (string_format (TN_A_INP_PROJ, " weight" ));
2216+ model.sscp_inp_proj_b = get_tensor (string_format (TN_A_INP_PROJ, " bias" ), false );
2217+ model.audio_out_proj_w = get_tensor (string_format (TN_A_OUT_PROJ, " weight" ), false );
2218+ model.audio_out_proj_b = get_tensor (string_format (TN_A_OUT_PROJ, " bias" ), false );
2219+ // audio multimodal embedder (mm.a.* namespace, not mm.*)
2220+ model.mm_soft_emb_norm_w = get_tensor (string_format (TN_A_MM_SOFT_EMB_N, " weight" ), false );
2221+ model.mm_input_proj_w = get_tensor (string_format (TN_A_MM_INP_PROJ, " weight" ), false );
2222+
2223+ // Per-layer tensors NOT loaded by the generic loop above
2224+ for (int il = 0 ; il < hparams.n_layer ; ++il) {
2225+ auto & layer = model.layers [il];
2226+
2227+ // Gemma4 audio conformer-specific tensors
2228+ layer.ff_norm_w = get_tensor (string_format (TN_FFN_NORM, prefix, il, " weight" ));
2229+ layer.attn_pre_norm_w = get_tensor (string_format (TN_A_ATTN_PRE_NORM, prefix, il, " weight" ), false );
2230+ layer.per_dim_scale_w = get_tensor (string_format (TN_A_PER_DIM_SCALE, prefix, il, " weight" ), false );
2231+ layer.per_dim_k_scale_w = get_tensor (string_format (TN_A_PER_DIM_K_SCALE, prefix, il, " weight" ), false );
2232+ layer.attn_k_rel_w = get_tensor (string_format (TN_A_ATTN_K_REL, prefix, il, " weight" ), false );
2233+
2234+ // Convolution module
2235+ // Note: conv_norm / norm_conv are swapped in GGUF due to
2236+ // upstream tensor_mapping.py, so we load them in reverse order
2237+ layer.norm_conv_w = get_tensor (string_format (TN_CONV_NORM, prefix, il, " weight" ), false );
2238+ layer.norm_conv_b = get_tensor (string_format (TN_CONV_NORM, prefix, il, " bias" ), false );
2239+ layer.conv_pw1_w = get_tensor (string_format (TN_CONV_PW1, prefix, il, " weight" ));
2240+ layer.conv_pw1_b = get_tensor (string_format (TN_CONV_PW1, prefix, il, " bias" ), false );
2241+ layer.conv_dw_w = get_tensor (string_format (TN_CONV_DW, prefix, il, " weight" ));
2242+ layer.conv_dw_b = get_tensor (string_format (TN_CONV_DW, prefix, il, " bias" ), false );
2243+ layer.conv_norm_w = get_tensor (string_format (TN_NORM_CONV, prefix, il, " weight" ), false );
2244+ layer.conv_norm_b = get_tensor (string_format (TN_NORM_CONV, prefix, il, " bias" ), false );
2245+ layer.conv_pw2_w = get_tensor (string_format (TN_CONV_PW2, prefix, il, " weight" ));
2246+ layer.conv_pw2_b = get_tensor (string_format (TN_CONV_PW2, prefix, il, " bias" ), false );
2247+
2248+ // FFN2 (second half-step)
2249+ layer.ff_norm_1_w = get_tensor (string_format (TN_FFN_NORM_1, prefix, il, " weight" ));
2250+ layer.ff_up_1_w = get_tensor (string_format (TN_FFN_UP_1, prefix, il, " weight" ));
2251+ layer.ff_up_1_b = get_tensor (string_format (TN_FFN_UP_1, prefix, il, " bias" ), false );
2252+ layer.ff_down_1_w = get_tensor (string_format (TN_FFN_DOWN_1, prefix, il, " weight" ));
2253+ layer.ff_down_1_b = get_tensor (string_format (TN_FFN_DOWN_1, prefix, il, " bias" ), false );
2254+ layer.ff_post_norm_1_w = get_tensor (string_format (TN_A_FFN_POST_NORM_1, prefix, il, " weight" ), false );
2255+ }
2256+
2257+ // Load clamp info for ClippableLinear AFTER all tensors are loaded
2258+ for (auto * tensor : tensors_to_load) {
2259+ std::string name = tensor->name ;
2260+ if (string_ends_with (name, " .weight" )) {
2261+ std::string name_inp_max = name;
2262+ std::string name_inp_min = name;
2263+ std::string name_out_max = name;
2264+ std::string name_out_min = name;
2265+ string_replace_all (name_inp_max, " .weight" , " .input_max" );
2266+ string_replace_all (name_inp_min, " .weight" , " .input_min" );
2267+ string_replace_all (name_out_max, " .weight" , " .output_max" );
2268+ string_replace_all (name_out_min, " .weight" , " .output_min" );
2269+ model.clamp_info_map [name] = {
2270+ get_scalar (name_inp_max, FLT_MAX),
2271+ get_scalar (name_inp_min, -FLT_MAX),
2272+ get_scalar (name_out_max, FLT_MAX),
2273+ get_scalar (name_out_min, -FLT_MAX)
2274+ };
2275+ }
2276+ }
2277+ } break ;
21892278 case PROJECTOR_TYPE_LFM2A:
21902279 {
21912280 for (int i : {0 , 2 , 3 , 5 , 6 }) {
@@ -2246,7 +2335,10 @@ struct clip_model_loader {
22462335 ggml_backend_buffer_set_usage (ctx_clip.buf .get (), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
22472336 for (auto & t : tensors_to_load) {
22482337 ggml_tensor * cur = ggml_get_tensor (ctx_clip.ctx_data .get (), t->name );
2249- const size_t offset = tensor_offset[t->name ];
2338+ GGML_ASSERT (cur && " tensor not found in ctx_data" );
2339+ auto it_off = tensor_offset.find (t->name );
2340+ GGML_ASSERT (it_off != tensor_offset.end () && " no offset for tensor" );
2341+ const size_t offset = it_off->second ;
22502342 fin.seekg (offset, std::ios::beg);
22512343 if (!fin) {
22522344 throw std::runtime_error (string_format (" %s: failed to seek for tensor %s\n " , __func__, t->name ));
@@ -2266,6 +2358,7 @@ struct clip_model_loader {
22662358
22672359 LOG_DBG (" %s: loaded %zu tensors from %s\n " , __func__, tensors_to_load.size (), fname.c_str ());
22682360 }
2361+
22692362 }
22702363
22712364 struct support_info_op {
@@ -2538,8 +2631,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
25382631
25392632 // TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
25402633 // we can remove this check when we implement audio support for Gemma 3N
2541- skip_audio = ctx_vision->model .proj_type == PROJECTOR_TYPE_GEMMA3NV
2542- || ctx_vision->model .proj_type == PROJECTOR_TYPE_GEMMA4V;
2634+ skip_audio = ctx_vision->model .proj_type == PROJECTOR_TYPE_GEMMA3NV;
25432635 }
25442636
25452637 if (loader.has_audio && !skip_audio) {
@@ -2893,6 +2985,16 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
28932985 {
28942986 n_patches = ((((img->nx + 1 ) / 2 ) + 1 ) / 2 + 1 ) / 2 ;
28952987 } break ;
2988+ case PROJECTOR_TYPE_GEMMA4A:
2989+ {
2990+ // Two Conv2D stride-2: O = floor((I + 2p - k) / s) + 1, p=1, k=3, s=2
2991+ // O = floor((I - 1) / 2) + 1
2992+ int n = img->nx ;
2993+ for (int i = 0 ; i < 2 ; i++) {
2994+ n = (n - 1 ) / 2 + 1 ;
2995+ }
2996+ n_patches = n;
2997+ } break ;
28962998 default :
28972999 GGML_ABORT (" unsupported projector type" );
28983000 }
@@ -3352,6 +3454,56 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
33523454 }
33533455 set_input_i32 (" pos_w" , pos_data);
33543456 } break ;
3457+ case PROJECTOR_TYPE_GEMMA4A:
3458+ {
3459+ GGML_ASSERT (imgs.entries .size () == 1 );
3460+ const auto & img0 = imgs.entries .front ();
3461+ // Compute n_pos matching SSCP output: two stride-2 convs
3462+ int n_pos = img0->nx ;
3463+ for (int i = 0 ; i < 2 ; i++) { n_pos = (n_pos - 1 ) / 2 + 1 ; }
3464+
3465+ // Chunked local attention: blocked causal mask and RPE
3466+ const int chunk_size = 12 ;
3467+ const int max_past = 12 ;
3468+ const int context_size = chunk_size + max_past;
3469+ const int num_blocks = (n_pos + chunk_size - 1 ) / chunk_size;
3470+
3471+ // Blocked causal attention mask: [context_size, chunk_size, num_blocks]
3472+ {
3473+ std::vector<float > mask (context_size * chunk_size * num_blocks, -1e9f);
3474+ for (int b = 0 ; b < num_blocks; b++) {
3475+ for (int q = 0 ; q < chunk_size; q++) {
3476+ int gq = b * chunk_size + q;
3477+ for (int k = 0 ; k < context_size; k++) {
3478+ int gk = b * chunk_size - max_past + k;
3479+ if (gq < n_pos && gk >= 0 && gk < n_pos && gk <= gq && (gq - gk) < max_past) {
3480+ mask[k + q * context_size + b * context_size * chunk_size] = 0 .0f ;
3481+ }
3482+ }
3483+ }
3484+ }
3485+ set_input_f32 (" kq_mask" , mask);
3486+ }
3487+
3488+ // Sinusoidal RPE: 13 positions [12, 11, ..., 0]
3489+ {
3490+ const int n_embd = ctx->model .hparams .n_embd ;
3491+ const int num_timescales = n_embd / 2 ;
3492+ const float log_timescale_increment = logf (10000 .0f ) / std::max (num_timescales - 1 , 1 );
3493+ const int rpe_len = max_past + 1 ;
3494+ std::vector<float > pos_emb (n_embd * rpe_len, 0 .0f );
3495+ for (int p = 0 ; p < rpe_len; p++) {
3496+ float position = (float )(max_past - p);
3497+ for (int i = 0 ; i < num_timescales; i++) {
3498+ float inv_ts = expf (-(float )i * log_timescale_increment);
3499+ float scaled = position * inv_ts;
3500+ pos_emb[p * n_embd + i] = sinf (scaled);
3501+ pos_emb[p * n_embd + i + num_timescales] = cosf (scaled);
3502+ }
3503+ }
3504+ set_input_f32 (" pos_emb" , pos_emb);
3505+ }
3506+ } break ;
33553507 case PROJECTOR_TYPE_LFM2A:
33563508 {
33573509 GGML_ASSERT (imgs.entries .size () == 1 );
@@ -3516,6 +3668,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
35163668 return ctx->model .mm_fc_w ->ne [1 ];
35173669 case PROJECTOR_TYPE_LFM2A:
35183670 return ctx->model .position_embeddings ->ne [0 ];
3671+ case PROJECTOR_TYPE_GEMMA4A:
3672+ return ctx->model .hparams .projection_dim ;
35193673 case PROJECTOR_TYPE_GLM4V:
35203674 return ctx->model .mm_ffn_down_w ->ne [1 ];
35213675 default :
0 commit comments