11#include " models.h"
22
3+ #include < algorithm>
4+
35ggml_cgraph * clip_graph_granite_speech::build () {
46 const int n_frames = img.nx ();
57 const int context_size = hparams.audio_chunk_size ;
@@ -11,6 +13,10 @@ ggml_cgraph * clip_graph_granite_speech::build() {
1113 const int padded_len = num_blocks * context_size;
1214 const int remainder = n_frames % context_size;
1315
16+ // Calculate projector input dimension based on feature layers
17+ const int proj_input_dim = n_embd * (hparams.feature_layers .size () + 1 );
18+ const bool use_feature_concat = !hparams.feature_layers .empty ();
19+
1420 ggml_tensor * attn_dists = ggml_new_tensor_1d (ctx0, GGML_TYPE_I32 , context_size * context_size);
1521 ggml_set_name (attn_dists, " attn_dists" );
1622 ggml_set_input (attn_dists);
@@ -31,6 +37,15 @@ ggml_cgraph * clip_graph_granite_speech::build() {
3137 cur = ggml_add (ctx0, cur, model.inp_proj_b );
3238 cb (cur, " inp_linear" , -1 );
3339
40+ // Capture layer 0 if requested (after input_linear)
41+ ggml_tensor * concat_result = nullptr ;
42+ if (use_feature_concat) {
43+ if (std::find (hparams.feature_layers .begin (), hparams.feature_layers .end (), 0 ) != hparams.feature_layers .end ()) {
44+ concat_result = cur;
45+ cb (concat_result, " feature_layer_0" , -1 );
46+ }
47+ }
48+
3449 for (int il = 0 ; il < n_layer; il++) {
3550 const auto & layer = model.layers [il];
3651 auto * residual = cur;
@@ -168,6 +183,18 @@ ggml_cgraph * clip_graph_granite_speech::build() {
168183 NORM_TYPE_NORMAL , eps, il);
169184 cb (cur, " layer_out" , il);
170185
186+ // Capture intermediate layer (il + 1) if requested
187+ if (use_feature_concat) {
188+ if (hparams.is_feature_layer (il + 1 )) {
189+ if (concat_result == nullptr ) {
190+ concat_result = cur;
191+ } else {
192+ concat_result = ggml_concat (ctx0, concat_result, cur, 0 );
193+ }
194+ cb (concat_result, string_format (" feature_layer_%d" , il + 1 ).c_str (), il);
195+ }
196+ }
197+
171198 // CTC branch
172199 if (il + 1 == ctc_layer) {
173200 auto * mid = build_mm (model.ctc_out_w , cur);
@@ -180,6 +207,13 @@ ggml_cgraph * clip_graph_granite_speech::build() {
180207 }
181208 }
182209
210+ // Append final output to concatenated features if using feature concatenation
211+ if (use_feature_concat && concat_result != nullptr ) {
212+ concat_result = ggml_concat (ctx0, concat_result, cur, 0 );
213+ cb (concat_result, " concat_final" , -1 );
214+ cur = concat_result;
215+ }
216+
183217 cb (cur, " encoder_out" , -1 );
184218
185219 // QFormer projector
@@ -197,7 +231,7 @@ ggml_cgraph * clip_graph_granite_speech::build() {
197231 cur = ggml_pad (ctx0, cur, 0 , padded_proj - n_frames, 0 , 0 );
198232 }
199233
200- ggml_tensor * enc_windows = ggml_reshape_3d (ctx0, cur, n_embd , window_size, nblocks_proj);
234+ ggml_tensor * enc_windows = ggml_reshape_3d (ctx0, cur, proj_input_dim , window_size, nblocks_proj);
201235
202236 ggml_tensor * queries = build_norm (model.qf_proj_blocks [0 ].qf_proj_query ,
203237 model.qf_proj_blocks [0 ].qf_proj_norm_w , model.qf_proj_blocks [0 ].qf_proj_norm_b ,
0 commit comments