@@ -922,6 +922,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
922922 {
923923 builder = std::make_unique<clip_graph_conformer>(ctx, img);
924924 } break ;
925+ case PROJECTOR_TYPE_GEMMA4A:
926+ {
927+ builder = std::make_unique<clip_graph_gemma4a>(ctx, img);
928+ } break ;
925929 case PROJECTOR_TYPE_GLM4V:
926930 {
927931 builder = std::make_unique<clip_graph_glm4v>(ctx, img);
@@ -1429,6 +1433,16 @@ struct clip_model_loader {
14291433 hparams.audio_window_len = 400 ;
14301434 hparams.audio_hop_len = 160 ;
14311435 } break ;
1436+ case PROJECTOR_TYPE_GEMMA4A:
1437+ {
1438+ // Gemma4 feature_extraction_gemma4.py:
1439+ // frame_length_ms=20 -> 320 samples, n_fft=512, hop=10ms -> 160
1440+ hparams.audio_chunk_len = 0 ; // no fixed-length padding
1441+ hparams.audio_sample_rate = 16000 ;
1442+ hparams.audio_n_fft = 512 ;
1443+ hparams.audio_window_len = 320 ; // 20ms frame (NOT 25ms/400)
1444+ hparams.audio_hop_len = 160 ;
1445+ } break ;
14321446 case PROJECTOR_TYPE_JANUS_PRO:
14331447 {
14341448 hparams.image_pad_color = {127 , 127 , 127 };
@@ -1531,16 +1545,21 @@ struct clip_model_loader {
15311545 }
15321546
15331547 // helper function
1548+ std::unordered_set<std::string> loaded_tensor_names;
15341549 auto get_tensor = [&](const std::string & name, bool required = true ) {
1550+ // Each tensor should only be loaded once; duplicates indicate a bug
1551+ if (loaded_tensor_names.count (name)) {
1552+ throw std::runtime_error (string_format (" %s: tensor already loaded: %s\n " , __func__, name.c_str ()));
1553+ }
15351554 ggml_tensor * cur = ggml_get_tensor (ctx_meta.get (), name.c_str ());
15361555 if (!cur && required) {
15371556 throw std::runtime_error (string_format (" %s: unable to find tensor %s\n " , __func__, name.c_str ()));
15381557 }
15391558 if (cur) {
15401559 tensors_to_load.push_back (cur);
1541- // add tensors to context
15421560 ggml_tensor * data_tensor = ggml_dup_tensor (ctx_clip.ctx_data .get (), cur);
15431561 ggml_set_name (data_tensor, cur->name );
1562+ loaded_tensor_names.insert (name);
15441563 cur = data_tensor;
15451564 }
15461565 return cur;
@@ -2113,6 +2132,74 @@ struct clip_model_loader {
21132132 model.mm_fc_w = get_tensor (string_format (TN_MM_PROJECTOR, " weight" ));
21142133 model.mm_fc_b = get_tensor (string_format (TN_MM_PROJECTOR, " bias" ));
21152134 } break ;
2135+ case PROJECTOR_TYPE_GEMMA4A:
2136+ {
2137+ for (int i = 0 ; i < 2 ; i++) {
2138+ model.sscp_conv_w [i] = get_tensor (string_format (TN_A_CONV1D, i, " weight" ));
2139+ model.sscp_conv_b [i] = get_tensor (string_format (TN_A_CONV1D, i, " bias" ), false );
2140+ model.sscp_norm_w [i] = get_tensor (string_format (TN_A_CONV1D_NORM, i, " weight" ), false );
2141+ }
2142+ model.sscp_inp_proj_w = get_tensor (string_format (TN_A_INP_PROJ, " weight" ));
2143+ model.sscp_inp_proj_b = get_tensor (string_format (TN_A_INP_PROJ, " bias" ), false );
2144+ model.audio_out_proj_w = get_tensor (string_format (TN_A_OUT_PROJ, " weight" ), false );
2145+ model.audio_out_proj_b = get_tensor (string_format (TN_A_OUT_PROJ, " bias" ), false );
2146+ // audio multimodal embedder (mm.a.* namespace, not mm.*)
2147+ model.mm_soft_emb_norm_w = get_tensor (string_format (TN_A_MM_SOFT_EMB_N, " weight" ), false );
2148+ model.mm_input_proj_w = get_tensor (string_format (TN_A_MM_INP_PROJ, " weight" ), false );
2149+
2150+ // Per-layer tensors NOT loaded by the generic loop above
2151+ for (int il = 0 ; il < hparams.n_layer ; ++il) {
2152+ auto & layer = model.layers [il];
2153+
2154+ // Gemma4 audio conformer-specific tensors
2155+ layer.ff_norm_w = get_tensor (string_format (TN_FFN_NORM, prefix, il, " weight" ));
2156+ layer.attn_pre_norm_w = get_tensor (string_format (TN_A_ATTN_PRE_NORM, prefix, il, " weight" ), false );
2157+ layer.per_dim_scale_w = get_tensor (string_format (TN_A_PER_DIM_SCALE, prefix, il, " weight" ), false );
2158+ layer.per_dim_k_scale_w = get_tensor (string_format (TN_A_PER_DIM_K_SCALE, prefix, il, " weight" ), false );
2159+ layer.attn_k_rel_w = get_tensor (string_format (TN_A_ATTN_K_REL, prefix, il, " weight" ), false );
2160+
2161+ // Convolution module
2162+ layer.norm_conv_w = get_tensor (string_format (TN_NORM_CONV, prefix, il, " weight" ), false );
2163+ layer.norm_conv_b = get_tensor (string_format (TN_NORM_CONV, prefix, il, " bias" ), false );
2164+ layer.conv_pw1_w = get_tensor (string_format (TN_CONV_PW1, prefix, il, " weight" ));
2165+ layer.conv_pw1_b = get_tensor (string_format (TN_CONV_PW1, prefix, il, " bias" ), false );
2166+ layer.conv_dw_w = get_tensor (string_format (TN_CONV_DW, prefix, il, " weight" ));
2167+ layer.conv_dw_b = get_tensor (string_format (TN_CONV_DW, prefix, il, " bias" ), false );
2168+ layer.conv_norm_w = get_tensor (string_format (TN_CONV_NORM, prefix, il, " weight" ), false );
2169+ layer.conv_norm_b = get_tensor (string_format (TN_CONV_NORM, prefix, il, " bias" ), false );
2170+ layer.conv_pw2_w = get_tensor (string_format (TN_CONV_PW2, prefix, il, " weight" ));
2171+ layer.conv_pw2_b = get_tensor (string_format (TN_CONV_PW2, prefix, il, " bias" ), false );
2172+
2173+ // FFN2 (second half-step)
2174+ layer.ff_norm_1_w = get_tensor (string_format (TN_FFN_NORM_1, prefix, il, " weight" ));
2175+ layer.ff_up_1_w = get_tensor (string_format (TN_FFN_UP_1, prefix, il, " weight" ));
2176+ layer.ff_up_1_b = get_tensor (string_format (TN_FFN_UP_1, prefix, il, " bias" ), false );
2177+ layer.ff_down_1_w = get_tensor (string_format (TN_FFN_DOWN_1, prefix, il, " weight" ));
2178+ layer.ff_down_1_b = get_tensor (string_format (TN_FFN_DOWN_1, prefix, il, " bias" ), false );
2179+ layer.ff_post_norm_1_w = get_tensor (string_format (TN_A_FFN_POST_NORM_1, prefix, il, " weight" ), false );
2180+ }
2181+
2182+ // Load clamp info for ClippableLinear AFTER all tensors are loaded
2183+ for (auto * tensor : tensors_to_load) {
2184+ std::string name = tensor->name ;
2185+ if (string_ends_with (name, " .weight" )) {
2186+ std::string name_inp_max = name;
2187+ std::string name_inp_min = name;
2188+ std::string name_out_max = name;
2189+ std::string name_out_min = name;
2190+ string_replace_all (name_inp_max, " .weight" , " .input_max" );
2191+ string_replace_all (name_inp_min, " .weight" , " .input_min" );
2192+ string_replace_all (name_out_max, " .weight" , " .output_max" );
2193+ string_replace_all (name_out_min, " .weight" , " .output_min" );
2194+ model.clamp_info_map [name] = {
2195+ get_scalar (name_inp_max, FLT_MAX),
2196+ get_scalar (name_inp_min, -FLT_MAX),
2197+ get_scalar (name_out_max, FLT_MAX),
2198+ get_scalar (name_out_min, -FLT_MAX)
2199+ };
2200+ }
2201+ }
2202+ } break ;
21162203 case PROJECTOR_TYPE_LFM2A:
21172204 {
21182205 for (int i : {0 , 2 , 3 , 5 , 6 }) {
@@ -2173,7 +2260,10 @@ struct clip_model_loader {
21732260 ggml_backend_buffer_set_usage (ctx_clip.buf .get (), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
21742261 for (auto & t : tensors_to_load) {
21752262 ggml_tensor * cur = ggml_get_tensor (ctx_clip.ctx_data .get (), t->name );
2176- const size_t offset = tensor_offset[t->name ];
2263+ GGML_ASSERT (cur && " tensor not found in ctx_data" );
2264+ auto it_off = tensor_offset.find (t->name );
2265+ GGML_ASSERT (it_off != tensor_offset.end () && " no offset for tensor" );
2266+ const size_t offset = it_off->second ;
21772267 fin.seekg (offset, std::ios::beg);
21782268 if (!fin) {
21792269 throw std::runtime_error (string_format (" %s: failed to seek for tensor %s\n " , __func__, t->name ));
@@ -2465,8 +2555,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
24652555
24662556 // TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
24672557 // we can remove this check when we implement audio support for Gemma 3N
2468- skip_audio = ctx_vision->model .proj_type == PROJECTOR_TYPE_GEMMA3NV
2469- || ctx_vision->model .proj_type == PROJECTOR_TYPE_GEMMA4V;
2558+ skip_audio = ctx_vision->model .proj_type == PROJECTOR_TYPE_GEMMA3NV;
24702559 }
24712560
24722561 if (loader.has_audio && !skip_audio) {
@@ -2808,6 +2897,16 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
28082897 {
28092898 n_patches = ((((img->nx + 1 ) / 2 ) + 1 ) / 2 + 1 ) / 2 ;
28102899 } break ;
2900+ case PROJECTOR_TYPE_GEMMA4A:
2901+ {
2902+ // Two Conv2D stride-2: O = floor((I + 2p - k) / s) + 1, p=1, k=3, s=2
2903+ // O = floor((I - 1) / 2) + 1
2904+ int n = img->nx ;
2905+ for (int i = 0 ; i < 2 ; i++) {
2906+ n = (n - 1 ) / 2 + 1 ;
2907+ }
2908+ n_patches = n;
2909+ } break ;
28112910 default :
28122911 GGML_ABORT (" unsupported projector type" );
28132912 }
@@ -3232,6 +3331,56 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
32323331 }
32333332 set_input_i32 (" pos_w" , pos_data);
32343333 } break ;
3334+ case PROJECTOR_TYPE_GEMMA4A:
3335+ {
3336+ GGML_ASSERT (imgs.entries .size () == 1 );
3337+ const auto & img0 = imgs.entries .front ();
3338+ // Compute n_pos matching SSCP output: two stride-2 convs
3339+ int n_pos = img0->nx ;
3340+ for (int i = 0 ; i < 2 ; i++) { n_pos = (n_pos - 1 ) / 2 + 1 ; }
3341+
3342+ // Chunked local attention: blocked causal mask and RPE
3343+ const int chunk_size = 12 ;
3344+ const int max_past = 12 ;
3345+ const int context_size = chunk_size + max_past;
3346+ const int num_blocks = (n_pos + chunk_size - 1 ) / chunk_size;
3347+
3348+ // Blocked causal attention mask: [context_size, chunk_size, num_blocks]
3349+ {
3350+ std::vector<float > mask (context_size * chunk_size * num_blocks, -INFINITY);
3351+ for (int b = 0 ; b < num_blocks; b++) {
3352+ for (int q = 0 ; q < chunk_size; q++) {
3353+ int gq = b * chunk_size + q;
3354+ for (int k = 0 ; k < context_size; k++) {
3355+ int gk = b * chunk_size - max_past + k;
3356+ if (gq < n_pos && gk >= 0 && gk < n_pos && gk <= gq) {
3357+ mask[k + q * context_size + b * context_size * chunk_size] = 0.0f;
3358+ }
3359+ }
3360+ }
3361+ }
3362+ set_input_f32 (" kq_mask" , mask);
3363+ }
3364+
3365+ // Sinusoidal RPE: 13 positions [12, 11, ..., 0]
3366+ {
3367+ const int n_embd = ctx->model .hparams .n_embd ;
3368+ const int num_timescales = n_embd / 2 ;
3369+ const float log_timescale_increment = logf (10000.0f) / std::max (num_timescales - 1 , 1 );
3370+ const int rpe_len = max_past + 1 ;
3371+ std::vector<float > pos_emb (n_embd * rpe_len, 0.0f);
3372+ for (int p = 0 ; p < rpe_len; p++) {
3373+ float position = (float )(max_past - p);
3374+ for (int i = 0 ; i < num_timescales; i++) {
3375+ float inv_ts = expf (-(float )i * log_timescale_increment);
3376+ float scaled = position * inv_ts;
3377+ pos_emb[p * n_embd + i] = sinf (scaled);
3378+ pos_emb[p * n_embd + i + num_timescales] = cosf (scaled);
3379+ }
3380+ }
3381+ set_input_f32 (" pos_emb" , pos_emb);
3382+ }
3383+ } break ;
32353384 case PROJECTOR_TYPE_LFM2A:
32363385 {
32373386 GGML_ASSERT (imgs.entries .size () == 1 );
@@ -3391,6 +3540,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
33913540 return ctx->model .mm_fc_w ->ne [1 ];
33923541 case PROJECTOR_TYPE_LFM2A:
33933542 return ctx->model .position_embeddings ->ne [0 ];
3543+ case PROJECTOR_TYPE_GEMMA4A:
3544+ return ctx->model .hparams .projection_dim ;
33943545 case PROJECTOR_TYPE_GLM4V:
33953546 return ctx->model .mm_ffn_down_w ->ne [1 ];
33963547 default :
0 commit comments