model: Granite Speech Plus (#24818)

gabe-l-hart · ngxson · web-flow · commit a3900a669419 · 2026-06-23T12:03:31.000+02:00
* feat: Add conversion support for Granite Speech Plus

Branch: GraniteSpeechPlus
AI-usage: full (Bob, OpenCode + Qwen3.6-35b)
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Extend granite_speech to support plus multi-layer concatenation

Branch: GraniteSpeechPlus
AI-usage: draft (Bob, OpenCode + Qwen3.6-35b)
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(conversion): Fix plural naming for feature_layers for audio

Branch: GraniteSpeechPlus
AI-usage: none
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(mtmd): Align feature_layer usage and naming everywhere

Branch: GraniteSpeechPlus
AI-usage: none
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* style: Use fstring for log

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>

---------

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
diff --git a/conversion/__init__.py b/conversion/__init__.py
@@ -96,6 +96,7 @@
     "GraniteMoeHybridForCausalLM": "granite",
     "GraniteMoeSharedForCausalLM": "granite",
     "GraniteSpeechForConditionalGeneration": "granite",
+    "GraniteSpeechPlusForConditionalGeneration": "granite",
     "Grok1ForCausalLM": "grok",
     "GrokForCausalLM": "grok",
     "GroveMoeForCausalLM": "grovemoe",
@@ -261,6 +262,7 @@
     "GlmasrModel": "ultravox",
     "Granite4VisionForConditionalGeneration": "granite",
     "GraniteSpeechForConditionalGeneration": "granite",
+    "GraniteSpeechPlusForConditionalGeneration": "granite",
     "HunYuanVLForConditionalGeneration": "hunyuan",
     "Idefics3ForConditionalGeneration": "smolvlm",
     "InternVisionModel": "internvl",
diff --git a/conversion/granite.py b/conversion/granite.py
@@ -348,6 +348,34 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         yield from super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register("GraniteSpeechPlusForConditionalGeneration")
+class GraniteSpeechPlusMmprojModel(GraniteSpeechMmprojModel):
+    """Conversion for GraniteSpeechPlus - extends GraniteSpeech with feature layer concatenation"""
+    has_vision_encoder = False
+    has_audio_encoder = True
+
+    def set_gguf_parameters(self):
+        assert self.hparams_audio is not None
+        super().set_gguf_parameters()
+
+        # Add audio feature_layers if the encoder config provides cat_hidden_layers
+        if feature_layers := self.hparams_audio.get("cat_hidden_layers"):
+            self.gguf_writer.add_audio_feature_layers(feature_layers)
+            logger.info(f"gguf: audio feature_layers = {feature_layers}")
+
+            # Validate projector dimension matches concatenated encoder output
+            hidden_dim = self.hparams_audio["hidden_dim"]
+            expected_dim = hidden_dim * (len(feature_layers) + 1)
+            projector_dim = self.global_config["projector_config"]["encoder_hidden_size"]
+
+            if projector_dim != expected_dim:
+                raise ValueError(
+                    f"Projector encoder_hidden_size ({projector_dim}) does not match "
+                    f"expected concatenated dimension ({expected_dim}). "
+                    f"Expected: hidden_dim ({hidden_dim}) * (len(feature_layers) + 1) = {expected_dim}"
+                )
+
+
 @ModelBase.register("Granite4VisionForConditionalGeneration")
 class Granite4VisionMmprojModel(MmprojModel):
     has_vision_encoder = True
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
@@ -359,6 +359,7 @@ class ClipAudio:
         CHUNK_SIZE          = "clip.audio.chunk_size"
         CONV_KERNEL_SIZE    = "clip.audio.conv_kernel_size"
         MAX_POS_EMB         = "clip.audio.max_pos_emb"
+        FEATURE_LAYERS      = "clip.audio.feature_layer" # Granite Speech Plus
 
         class Attention:
             HEAD_COUNT      = "clip.audio.attention.head_count"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
@@ -1310,6 +1310,9 @@ def add_audio_conv_kernel_size(self, value: int) -> None:
     def add_audio_max_pos_emb(self, value: int) -> None:
         self.add_uint32(Keys.ClipAudio.MAX_POS_EMB, value)
 
+    def add_audio_feature_layers(self, layers: Sequence[int]) -> None:
+        self.add_array(Keys.ClipAudio.FEATURE_LAYERS, layers)
+
     def add_audio_projector_window_size(self, value: int) -> None:
         self.add_uint32(Keys.ClipAudio.Projector.WINDOW_SIZE, value)
 
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
@@ -42,6 +42,7 @@
 #define KEY_N_HEAD              "clip.%s.attention.head_count"
 #define KEY_N_HEAD_KV           "clip.%s.attention.head_count_kv"
 #define KEY_LAYER_NORM_EPS      "clip.%s.attention.layer_norm_epsilon"
+#define KEY_FEATURE_LAYERS      "clip.%s.feature_layer"
 
 // vision-specific
 #define KEY_VISION_PROJ_TYPE        "clip.vision.projector_type" // for models with mixed modalities
@@ -54,7 +55,6 @@
 #define KEY_PATCH_SIZE              "clip.vision.patch_size"
 #define KEY_IMAGE_MEAN              "clip.vision.image_mean"
 #define KEY_IMAGE_STD               "clip.vision.image_std"
-#define KEY_FEATURE_LAYER           "clip.vision.feature_layer"
 #define KEY_PROJ_SCALE_FACTOR       "clip.vision.projector.scale_factor"
 #define KEY_PROJ_SAMPLE_QUERY_SIDE  "clip.vision.projector.query_side"
 #define KEY_PROJ_SAMPLE_WINDOW_SIDE "clip.vision.projector.window_side"
diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h
@@ -91,7 +91,7 @@ struct clip_hparams {
 
     float eps = 1e-6;
     float rope_theta = 0.0;
-    std::vector<int32_t> vision_feature_layer;
+    std::vector<int32_t> feature_layers;
     int32_t attn_window_size = 0;
     int32_t n_wa_pattern = 0;
     std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)
@@ -165,8 +165,8 @@ struct clip_hparams {
         return false;
     }
 
-    bool is_vision_feature_layer(int32_t layer) const {
-        return std::find(vision_feature_layer.begin(), vision_feature_layer.end(), layer) != vision_feature_layer.end();
+    bool is_feature_layer(int32_t layer) const {
+        return std::find(feature_layers.begin(), feature_layers.end(), layer) != feature_layers.end();
     }
 };
 
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
@@ -1264,12 +1264,10 @@ struct clip_model_loader {
                 }
             }
 
-            // Load the vision feature layer indices if they are explicitly provided;
-            // if multiple vision feature layers are present, the values will be concatenated
-            // to form the final visual features.
+            // Load the vision/audio feature layer indices if they are explicitly provided
             // NOTE: gguf conversions should standardize the values of the vision feature layer to
             // be non-negative, since we use -1 to mark values as unset here.
-            get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer, false);
+            get_arr_int(string_format(KEY_FEATURE_LAYERS, prefix), hparams.feature_layers, false);
 
             // model-specific params
             switch (model.proj_type) {
@@ -1651,6 +1649,7 @@ struct clip_model_loader {
                         get_u32(KEY_A_PROJ_WINDOW_SIZE,     hparams.audio_proj_window_size);
                         get_u32(KEY_A_PROJ_DOWNSAMPLE_RATE, hparams.audio_proj_downsample_rate);
                         get_u32(KEY_A_PROJ_HEAD_COUNT,      hparams.audio_proj_head_count);
+                        // NOTE: feature layers loaded above in common path
                     } break;
                 case PROJECTOR_TYPE_JANUS_PRO:
                     {
@@ -1663,11 +1662,11 @@ struct clip_model_loader {
                         hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
                         hparams.image_resize_pad = PAD_CEIL;
 
-                        get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer);
+                        // NOTE: feature_layers loaded in common path as optional
                         get_arr_int(KEY_PROJ_SPATIAL_OFFSETS, hparams.proj_spatial_offsets);
-                        if (hparams.vision_feature_layer.size() != hparams.proj_spatial_offsets.size()) {
-                            throw std::runtime_error(string_format("%s: vision_feature_layer.size() %d != proj_spatial_offsets.size() %d",
-                                                                   hparams.vision_feature_layer.size(), hparams.proj_spatial_offsets.size()));
+                        if (hparams.feature_layers.size() != hparams.proj_spatial_offsets.size()) {
+                            throw std::runtime_error(string_format("%s: feature_layers.size() %d != proj_spatial_offsets.size() %d",
+                                                                   hparams.feature_layers.size(), hparams.proj_spatial_offsets.size()));
                         }
 
                         get_u32(KEY_PROJ_SAMPLE_QUERY_SIDE,  hparams.downsample_query_side);
@@ -2740,7 +2739,7 @@ struct clip_model_loader {
                     model.image_newline = get_tensor(TN_IMAGE_NEWLINE);
 
                     // Load separate layerwise and spatial projector tensors
-                    const auto projector_count = hparams.vision_feature_layer.size();
+                    const auto projector_count = hparams.feature_layers.size();
                     model.qf_proj_blocks.resize(projector_count);
                     for (size_t bid = 0; bid < projector_count; ++bid) {
                         auto & b = model.qf_proj_blocks[bid];
@@ -4388,7 +4387,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, int n_threads, const clip_image_f32
 
                 // Stage 1b only uses block 0's permutations; future stages
                 // will upload all blocks.
-                for (size_t bid = 0; bid < hparams.vision_feature_layer.size(); ++bid) {
+                for (size_t bid = 0; bid < hparams.feature_layers.size(); ++bid) {
                     const std::string prefix = "g4v_blk" + std::to_string(bid) + "_";
                     upload(prefix + "win_idx",     make_win_idx(image_side, window_side));
                     upload(prefix + "qwin_idx",    make_win_idx(new_side, query_side));
diff --git a/tools/mtmd/models/granite-speech.cpp b/tools/mtmd/models/granite-speech.cpp
@@ -1,5 +1,7 @@
 #include "models.h"
 
+#include <algorithm>
+
 ggml_cgraph * clip_graph_granite_speech::build() {
     const int n_frames     = img.nx();
     const int context_size = hparams.audio_chunk_size;
@@ -11,6 +13,10 @@ ggml_cgraph * clip_graph_granite_speech::build() {
     const int padded_len   = num_blocks * context_size;
     const int remainder    = n_frames % context_size;
 
+    // Calculate projector input dimension based on feature layers
+    const int proj_input_dim = n_embd * (hparams.feature_layers.size() + 1);
+    const bool use_feature_concat = !hparams.feature_layers.empty();
+
     ggml_tensor * attn_dists = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, context_size * context_size);
     ggml_set_name(attn_dists, "attn_dists");
     ggml_set_input(attn_dists);
@@ -31,6 +37,15 @@ ggml_cgraph * clip_graph_granite_speech::build() {
     cur = ggml_add(ctx0, cur, model.inp_proj_b);
     cb(cur, "inp_linear", -1);
 
+    // Capture layer 0 if requested (after input_linear)
+    ggml_tensor * concat_result = nullptr;
+    if (use_feature_concat) {
+        if (std::find(hparams.feature_layers.begin(), hparams.feature_layers.end(), 0) != hparams.feature_layers.end()) {
+            concat_result = cur;
+            cb(concat_result, "feature_layer_0", -1);
+        }
+    }
+
     for (int il = 0; il < n_layer; il++) {
         const auto & layer = model.layers[il];
         auto * residual = cur;
@@ -168,6 +183,18 @@ ggml_cgraph * clip_graph_granite_speech::build() {
                          NORM_TYPE_NORMAL, eps, il);
         cb(cur, "layer_out", il);
 
+        // Capture intermediate layer (il + 1) if requested
+        if (use_feature_concat) {
+            if (hparams.is_feature_layer(il + 1)) {
+                if (concat_result == nullptr) {
+                    concat_result = cur;
+                } else {
+                    concat_result = ggml_concat(ctx0, concat_result, cur, 0);
+                }
+                cb(concat_result, string_format("feature_layer_%d", il + 1).c_str(), il);
+            }
+        }
+
         // CTC branch
         if (il + 1 == ctc_layer) {
             auto * mid = build_mm(model.ctc_out_w, cur);
@@ -180,6 +207,13 @@ ggml_cgraph * clip_graph_granite_speech::build() {
         }
     }
 
+    // Append final output to concatenated features if using feature concatenation
+    if (use_feature_concat && concat_result != nullptr) {
+        concat_result = ggml_concat(ctx0, concat_result, cur, 0);
+        cb(concat_result, "concat_final", -1);
+        cur = concat_result;
+    }
+
     cb(cur, "encoder_out", -1);
 
     // QFormer projector
@@ -197,7 +231,7 @@ ggml_cgraph * clip_graph_granite_speech::build() {
             cur = ggml_pad(ctx0, cur, 0, padded_proj - n_frames, 0, 0);
         }
 
-        ggml_tensor * enc_windows = ggml_reshape_3d(ctx0, cur, n_embd, window_size, nblocks_proj);
+        ggml_tensor * enc_windows = ggml_reshape_3d(ctx0, cur, proj_input_dim, window_size, nblocks_proj);
 
         ggml_tensor * queries = build_norm(model.qf_proj_blocks[0].qf_proj_query,
             model.qf_proj_blocks[0].qf_proj_norm_w, model.qf_proj_blocks[0].qf_proj_norm_b,
diff --git a/tools/mtmd/models/granite4-vision.cpp b/tools/mtmd/models/granite4-vision.cpp
@@ -304,14 +304,14 @@ ggml_cgraph * clip_graph_granite4_vision::build() {
     }
 
     // --- Stage 1b/1c: WindowQFormer blocks ---
-    const int projector_count = hparams.vision_feature_layer.size();
+    const int projector_count = hparams.feature_layers.size();
     const float qformer_eps = 1e-12f;
 
     ggml_tensor * mmproj = nullptr;
     for (int bid = 0; bid < projector_count; ++bid) {
         const auto & blk = model.qf_proj_blocks[bid];
 
-        int vlayer = hparams.vision_feature_layer[bid];
+        int vlayer = hparams.feature_layers[bid];
         GGML_ASSERT(vlayer >= 0 && vlayer < n_layer);
         ggml_tensor * h = layer_outs[vlayer];
 
diff --git a/tools/mtmd/models/llava.cpp b/tools/mtmd/models/llava.cpp
@@ -21,7 +21,7 @@ ggml_cgraph * clip_graph_llava::build() {
 
         // If we set explicit vision feature layers, only go up to the deepest one
         // NOTE: only used by granite-vision models for now
-        for (const auto & feature_layer : hparams.vision_feature_layer) {
+        for (const auto & feature_layer : hparams.feature_layers) {
             if (feature_layer > deepest_feature_layer) {
                 deepest_feature_layer = feature_layer;
             }
@@ -59,7 +59,7 @@ ggml_cgraph * clip_graph_llava::build() {
 
         // If this is an embedding feature layer, save the output.
         // NOTE: 0 index here refers to the input to the encoder.
-        if (hparams.is_vision_feature_layer(il)) {
+        if (hparams.is_feature_layer(il)) {
             embedding_stack.push_back(cur);
         }
 
@@ -134,7 +134,7 @@ ggml_cgraph * clip_graph_llava::build() {
     // process vision feature layers (used by granite)
     {
         // final layer is a vision feature layer
-        if (hparams.is_vision_feature_layer(max_feature_layer)) {
+        if (hparams.is_feature_layer(max_feature_layer)) {
             embedding_stack.push_back(inpL);
         }
 

Original file line number	Diff line number	Diff line change
`@@ -21,7 +21,7 @@ ggml_cgraph * clip_graph_llava::build() {`
`21`	`21`
`22`	`22`	`// If we set explicit vision feature layers, only go up to the deepest one`
`23`	`23`	`// NOTE: only used by granite-vision models for now`
`24`		`- for (const auto & feature_layer : hparams.vision_feature_layer) {`
	`24`	`+ for (const auto & feature_layer : hparams.feature_layers) {`
`25`	`25`	`if (feature_layer > deepest_feature_layer) {`
`26`	`26`	`deepest_feature_layer = feature_layer;`
`27`	`27`	`}`
`@@ -59,7 +59,7 @@ ggml_cgraph * clip_graph_llava::build() {`
`59`	`59`
`60`	`60`	`// If this is an embedding feature layer, save the output.`
`61`	`61`	`// NOTE: 0 index here refers to the input to the encoder.`
`62`		`- if (hparams.is_vision_feature_layer(il)) {`
	`62`	`+ if (hparams.is_feature_layer(il)) {`
`63`	`63`	`embedding_stack.push_back(cur);`
`64`	`64`	`}`
`65`	`65`
`@@ -134,7 +134,7 @@ ggml_cgraph * clip_graph_llava::build() {`
`134`	`134`	`// process vision feature layers (used by granite)`
`135`	`135`	`{`
`136`	`136`	`// final layer is a vision feature layer`
`137`		`- if (hparams.is_vision_feature_layer(max_feature_layer)) {`
	`137`	`+ if (hparams.is_feature_layer(max_feature_layer)) {`
`138`	`138`	`embedding_stack.push_back(inpL);`
`139`	`139`	`}`
`140`	`140`