@@ -1489,6 +1489,13 @@ struct clip_model_loader {
14891489 hparams.audio_n_fft = 512 ;
14901490 hparams.audio_window_len = 400 ;
14911491 hparams.audio_hop_len = 160 ;
1492+ get_u32 (KEY_A_CHUNK_SIZE, hparams.audio_chunk_size );
1493+ get_u32 (KEY_A_CONV_KERNEL_SIZE, hparams.audio_conv_kernel_size );
1494+ get_u32 (KEY_A_MAX_POS_EMB, hparams.audio_max_pos_emb );
1495+ get_u32 (KEY_A_PROJ_WINDOW_SIZE, hparams.audio_proj_window_size );
1496+ get_u32 (KEY_A_PROJ_DOWNSAMPLE_RATE, hparams.audio_proj_downsample_rate );
1497+ get_u32 (KEY_A_PROJ_HEAD_COUNT, hparams.audio_proj_head_count );
1498+ get_f32 (KEY_A_PROJ_LAYERNORM_EPS, hparams.audio_proj_layernorm_eps );
14921499 } break ;
14931500 case PROJECTOR_TYPE_JANUS_PRO:
14941501 {
@@ -3131,7 +3138,9 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
31313138 } break ;
31323139 case PROJECTOR_TYPE_GRANITE_SPEECH:
31333140 {
3134- n_patches = ((img->nx + 14 ) / 15 ) * 3 ;
3141+ const int ws = ctx->model .hparams .audio_proj_window_size ;
3142+ const int ds = ctx->model .hparams .audio_proj_downsample_rate ;
3143+ n_patches = ((img->nx + ws - 1 ) / ws) * (ws / ds);
31353144 } break ;
31363145 default :
31373146 GGML_ABORT (" unsupported projector type" );
@@ -3666,8 +3675,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
36663675 } break ;
36673676 case PROJECTOR_TYPE_GRANITE_SPEECH:
36683677 {
3669- const int context_size = 200 ;
3670- const int max_pos_emb = 512 ;
3678+ const int context_size = ctx-> model . hparams . audio_chunk_size ;
3679+ const int max_pos_emb = ctx-> model . hparams . audio_max_pos_emb ;
36713680
36723681 std::vector<int32_t > dists (context_size * context_size);
36733682 for (int i = 0 ; i < context_size; i++) {
0 commit comments