Skip to content

Commit a3900a6

Browse files
gabe-l-hart and ngxson authored
model: Granite Speech Plus (#24818)
* feat: Add conversion support for Granite Speech Plus
  Branch: GraniteSpeechPlus
  AI-usage: full (Bob, OpenCode + Qwen3.6-35b)
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Extend granite_speech to support plus multi-layer concatenation
  Branch: GraniteSpeechPlus
  AI-usage: draft (Bob, OpenCode + Qwen3.6-35b)
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix(conversion): Fix plural naming for feature_layers for audio
  Branch: GraniteSpeechPlus
  AI-usage: none
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix(mtmd): Align feature_layer usage and naming everywhere
  Branch: GraniteSpeechPlus
  AI-usage: none
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* style: Use fstring for log
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
  Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
---------
Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
1 parent 7c90850 commit a3900a6

10 files changed

Lines changed: 87 additions & 20 deletions

File tree

conversion/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@
9696
"GraniteMoeHybridForCausalLM": "granite",
9797
"GraniteMoeSharedForCausalLM": "granite",
9898
"GraniteSpeechForConditionalGeneration": "granite",
99+
"GraniteSpeechPlusForConditionalGeneration": "granite",
99100
"Grok1ForCausalLM": "grok",
100101
"GrokForCausalLM": "grok",
101102
"GroveMoeForCausalLM": "grovemoe",
@@ -261,6 +262,7 @@
261262
"GlmasrModel": "ultravox",
262263
"Granite4VisionForConditionalGeneration": "granite",
263264
"GraniteSpeechForConditionalGeneration": "granite",
265+
"GraniteSpeechPlusForConditionalGeneration": "granite",
264266
"HunYuanVLForConditionalGeneration": "hunyuan",
265267
"Idefics3ForConditionalGeneration": "smolvlm",
266268
"InternVisionModel": "internvl",

conversion/granite.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,34 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
348348
yield from super().modify_tensors(data_torch, name, bid)
349349

350350

351+
@ModelBase.register("GraniteSpeechPlusForConditionalGeneration")
class GraniteSpeechPlusMmprojModel(GraniteSpeechMmprojModel):
    """mmproj converter for GraniteSpeechPlus.

    Builds on the GraniteSpeech converter and additionally exports the list of
    encoder layers whose hidden states are concatenated (read from the
    ``cat_hidden_layers`` entry of the audio hparams).
    """
    has_vision_encoder = False
    has_audio_encoder = True

    def set_gguf_parameters(self):
        """Write the base GraniteSpeech parameters, then the audio feature layers.

        Raises:
            ValueError: if the projector's ``encoder_hidden_size`` does not
                equal ``hidden_dim * (len(feature_layers) + 1)``.
        """
        assert self.hparams_audio is not None
        super().set_gguf_parameters()

        # Nothing more to export when the encoder config lists no feature
        # layers (key missing, None, or empty).
        feature_layers = self.hparams_audio.get("cat_hidden_layers")
        if not feature_layers:
            return

        self.gguf_writer.add_audio_feature_layers(feature_layers)
        logger.info(f"gguf: audio feature_layers = {feature_layers}")

        # Sanity-check the projector input width against the concatenated
        # encoder width. The extra "+ 1" presumably accounts for the encoder's
        # final output being concatenated alongside the selected layers.
        hidden_dim = self.hparams_audio["hidden_dim"]
        concat_count = len(feature_layers) + 1
        expected_dim = hidden_dim * concat_count
        projector_dim = self.global_config["projector_config"]["encoder_hidden_size"]

        if projector_dim == expected_dim:
            return

        raise ValueError(
            f"Projector encoder_hidden_size ({projector_dim}) does not match "
            f"expected concatenated dimension ({expected_dim}). "
            f"Expected: hidden_dim ({hidden_dim}) * (len(feature_layers) + 1) = {expected_dim}"
        )
377+
378+
351379
@ModelBase.register("Granite4VisionForConditionalGeneration")
352380
class Granite4VisionMmprojModel(MmprojModel):
353381
has_vision_encoder = True

gguf-py/gguf/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,7 @@ class ClipAudio:
359359
CHUNK_SIZE = "clip.audio.chunk_size"
360360
CONV_KERNEL_SIZE = "clip.audio.conv_kernel_size"
361361
MAX_POS_EMB = "clip.audio.max_pos_emb"
362+
FEATURE_LAYERS = "clip.audio.feature_layer" # Granite Speech Plus
362363

363364
class Attention:
364365
HEAD_COUNT = "clip.audio.attention.head_count"

gguf-py/gguf/gguf_writer.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1310,6 +1310,9 @@ def add_audio_conv_kernel_size(self, value: int) -> None:
13101310
def add_audio_max_pos_emb(self, value: int) -> None:
13111311
self.add_uint32(Keys.ClipAudio.MAX_POS_EMB, value)
13121312

1313+
def add_audio_feature_layers(self, layers: Sequence[int]) -> None:
    """Store the audio encoder feature layer indices as an array under
    ``Keys.ClipAudio.FEATURE_LAYERS``.

    Args:
        layers: layer indices to record.
    """
    key = Keys.ClipAudio.FEATURE_LAYERS
    self.add_array(key, layers)
1315+
13131316
def add_audio_projector_window_size(self, value: int) -> None:
13141317
self.add_uint32(Keys.ClipAudio.Projector.WINDOW_SIZE, value)
13151318

tools/mtmd/clip-impl.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
#define KEY_N_HEAD "clip.%s.attention.head_count"
4343
#define KEY_N_HEAD_KV "clip.%s.attention.head_count_kv"
4444
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
45+
#define KEY_FEATURE_LAYERS "clip.%s.feature_layer"
4546

4647
// vision-specific
4748
#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities
@@ -54,7 +55,6 @@
5455
#define KEY_PATCH_SIZE "clip.vision.patch_size"
5556
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
5657
#define KEY_IMAGE_STD "clip.vision.image_std"
57-
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
5858
#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
5959
#define KEY_PROJ_SAMPLE_QUERY_SIDE "clip.vision.projector.query_side"
6060
#define KEY_PROJ_SAMPLE_WINDOW_SIDE "clip.vision.projector.window_side"

tools/mtmd/clip-model.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ struct clip_hparams {
9191

9292
float eps = 1e-6;
9393
float rope_theta = 0.0;
94-
std::vector<int32_t> vision_feature_layer;
94+
std::vector<int32_t> feature_layers;
9595
int32_t attn_window_size = 0;
9696
int32_t n_wa_pattern = 0;
9797
std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)
@@ -165,8 +165,8 @@ struct clip_hparams {
165165
return false;
166166
}
167167

168-
bool is_vision_feature_layer(int32_t layer) const {
169-
return std::find(vision_feature_layer.begin(), vision_feature_layer.end(), layer) != vision_feature_layer.end();
168+
bool is_feature_layer(int32_t layer) const {
169+
return std::find(feature_layers.begin(), feature_layers.end(), layer) != feature_layers.end();
170170
}
171171
};
172172

tools/mtmd/clip.cpp

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1264,12 +1264,10 @@ struct clip_model_loader {
12641264
}
12651265
}
12661266

1267-
// Load the vision feature layer indices if they are explicitly provided;
1268-
// if multiple vision feature layers are present, the values will be concatenated
1269-
// to form the final visual features.
1267+
// Load the vision/audio feature layer indices if they are explicitly provided
12701268
// NOTE: gguf conversions should standardize the vision/audio feature layer indices to
12711269
// be non-negative, since we use -1 to mark values as unset here.
1272-
get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer, false);
1270+
get_arr_int(string_format(KEY_FEATURE_LAYERS, prefix), hparams.feature_layers, false);
12731271

12741272
// model-specific params
12751273
switch (model.proj_type) {
@@ -1651,6 +1649,7 @@ struct clip_model_loader {
16511649
get_u32(KEY_A_PROJ_WINDOW_SIZE, hparams.audio_proj_window_size);
16521650
get_u32(KEY_A_PROJ_DOWNSAMPLE_RATE, hparams.audio_proj_downsample_rate);
16531651
get_u32(KEY_A_PROJ_HEAD_COUNT, hparams.audio_proj_head_count);
1652+
// NOTE: feature layers loaded above in common path
16541653
} break;
16551654
case PROJECTOR_TYPE_JANUS_PRO:
16561655
{
@@ -1663,11 +1662,11 @@ struct clip_model_loader {
16631662
hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
16641663
hparams.image_resize_pad = PAD_CEIL;
16651664

1666-
get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer);
1665+
// NOTE: feature_layers loaded in common path as optional
16671666
get_arr_int(KEY_PROJ_SPATIAL_OFFSETS, hparams.proj_spatial_offsets);
1668-
if (hparams.vision_feature_layer.size() != hparams.proj_spatial_offsets.size()) {
1669-
throw std::runtime_error(string_format("%s: vision_feature_layer.size() %d != proj_spatial_offsets.size() %d",
1670-
hparams.vision_feature_layer.size(), hparams.proj_spatial_offsets.size()));
1667+
// Each feature layer needs a matching spatial offset; reject files where the
// two arrays disagree in length.
if (hparams.feature_layers.size() != hparams.proj_spatial_offsets.size()) {
    // The message starts with "%s" for the function name, so __func__ must be
    // passed as the first argument. The container sizes are printed with %zu
    // because size() is not an int.
    throw std::runtime_error(string_format("%s: feature_layers.size() %zu != proj_spatial_offsets.size() %zu",
        __func__, hparams.feature_layers.size(), hparams.proj_spatial_offsets.size()));
}
16721671

16731672
get_u32(KEY_PROJ_SAMPLE_QUERY_SIDE, hparams.downsample_query_side);
@@ -2740,7 +2739,7 @@ struct clip_model_loader {
27402739
model.image_newline = get_tensor(TN_IMAGE_NEWLINE);
27412740

27422741
// Load separate layerwise and spatial projector tensors
2743-
const auto projector_count = hparams.vision_feature_layer.size();
2742+
const auto projector_count = hparams.feature_layers.size();
27442743
model.qf_proj_blocks.resize(projector_count);
27452744
for (size_t bid = 0; bid < projector_count; ++bid) {
27462745
auto & b = model.qf_proj_blocks[bid];
@@ -4388,7 +4387,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, int n_threads, const clip_image_f32
43884387

43894388
// Stage 1b only uses block 0's permutations; future stages
43904389
// will upload all blocks.
4391-
for (size_t bid = 0; bid < hparams.vision_feature_layer.size(); ++bid) {
4390+
for (size_t bid = 0; bid < hparams.feature_layers.size(); ++bid) {
43924391
const std::string prefix = "g4v_blk" + std::to_string(bid) + "_";
43934392
upload(prefix + "win_idx", make_win_idx(image_side, window_side));
43944393
upload(prefix + "qwin_idx", make_win_idx(new_side, query_side));

tools/mtmd/models/granite-speech.cpp

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
#include "models.h"
22

3+
#include <algorithm>
4+
35
ggml_cgraph * clip_graph_granite_speech::build() {
46
const int n_frames = img.nx();
57
const int context_size = hparams.audio_chunk_size;
@@ -11,6 +13,10 @@ ggml_cgraph * clip_graph_granite_speech::build() {
1113
const int padded_len = num_blocks * context_size;
1214
const int remainder = n_frames % context_size;
1315

16+
// Calculate projector input dimension based on feature layers
17+
const int proj_input_dim = n_embd * (hparams.feature_layers.size() + 1);
18+
const bool use_feature_concat = !hparams.feature_layers.empty();
19+
1420
ggml_tensor * attn_dists = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, context_size * context_size);
1521
ggml_set_name(attn_dists, "attn_dists");
1622
ggml_set_input(attn_dists);
@@ -31,6 +37,15 @@ ggml_cgraph * clip_graph_granite_speech::build() {
3137
cur = ggml_add(ctx0, cur, model.inp_proj_b);
3238
cb(cur, "inp_linear", -1);
3339

40+
// Capture layer 0 if requested (after input_linear)
41+
ggml_tensor * concat_result = nullptr;
42+
if (use_feature_concat) {
43+
if (std::find(hparams.feature_layers.begin(), hparams.feature_layers.end(), 0) != hparams.feature_layers.end()) {
44+
concat_result = cur;
45+
cb(concat_result, "feature_layer_0", -1);
46+
}
47+
}
48+
3449
for (int il = 0; il < n_layer; il++) {
3550
const auto & layer = model.layers[il];
3651
auto * residual = cur;
@@ -168,6 +183,18 @@ ggml_cgraph * clip_graph_granite_speech::build() {
168183
NORM_TYPE_NORMAL, eps, il);
169184
cb(cur, "layer_out", il);
170185

186+
// Capture intermediate layer (il + 1) if requested
187+
if (use_feature_concat) {
188+
if (hparams.is_feature_layer(il + 1)) {
189+
if (concat_result == nullptr) {
190+
concat_result = cur;
191+
} else {
192+
concat_result = ggml_concat(ctx0, concat_result, cur, 0);
193+
}
194+
cb(concat_result, string_format("feature_layer_%d", il + 1).c_str(), il);
195+
}
196+
}
197+
171198
// CTC branch
172199
if (il + 1 == ctc_layer) {
173200
auto * mid = build_mm(model.ctc_out_w, cur);
@@ -180,6 +207,13 @@ ggml_cgraph * clip_graph_granite_speech::build() {
180207
}
181208
}
182209

210+
// Append final output to concatenated features if using feature concatenation
211+
if (use_feature_concat && concat_result != nullptr) {
212+
concat_result = ggml_concat(ctx0, concat_result, cur, 0);
213+
cb(concat_result, "concat_final", -1);
214+
cur = concat_result;
215+
}
216+
183217
cb(cur, "encoder_out", -1);
184218

185219
// QFormer projector
@@ -197,7 +231,7 @@ ggml_cgraph * clip_graph_granite_speech::build() {
197231
cur = ggml_pad(ctx0, cur, 0, padded_proj - n_frames, 0, 0);
198232
}
199233

200-
ggml_tensor * enc_windows = ggml_reshape_3d(ctx0, cur, n_embd, window_size, nblocks_proj);
234+
ggml_tensor * enc_windows = ggml_reshape_3d(ctx0, cur, proj_input_dim, window_size, nblocks_proj);
201235

202236
ggml_tensor * queries = build_norm(model.qf_proj_blocks[0].qf_proj_query,
203237
model.qf_proj_blocks[0].qf_proj_norm_w, model.qf_proj_blocks[0].qf_proj_norm_b,

tools/mtmd/models/granite4-vision.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -304,14 +304,14 @@ ggml_cgraph * clip_graph_granite4_vision::build() {
304304
}
305305

306306
// --- Stage 1b/1c: WindowQFormer blocks ---
307-
const int projector_count = hparams.vision_feature_layer.size();
307+
const int projector_count = hparams.feature_layers.size();
308308
const float qformer_eps = 1e-12f;
309309

310310
ggml_tensor * mmproj = nullptr;
311311
for (int bid = 0; bid < projector_count; ++bid) {
312312
const auto & blk = model.qf_proj_blocks[bid];
313313

314-
int vlayer = hparams.vision_feature_layer[bid];
314+
int vlayer = hparams.feature_layers[bid];
315315
GGML_ASSERT(vlayer >= 0 && vlayer < n_layer);
316316
ggml_tensor * h = layer_outs[vlayer];
317317

tools/mtmd/models/llava.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ ggml_cgraph * clip_graph_llava::build() {
2121

2222
// If we set explicit vision feature layers, only go up to the deepest one
2323
// NOTE: only used by granite-vision models for now
24-
for (const auto & feature_layer : hparams.vision_feature_layer) {
24+
for (const auto & feature_layer : hparams.feature_layers) {
2525
if (feature_layer > deepest_feature_layer) {
2626
deepest_feature_layer = feature_layer;
2727
}
@@ -59,7 +59,7 @@ ggml_cgraph * clip_graph_llava::build() {
5959

6060
// If this is an embedding feature layer, save the output.
6161
// NOTE: 0 index here refers to the input to the encoder.
62-
if (hparams.is_vision_feature_layer(il)) {
62+
if (hparams.is_feature_layer(il)) {
6363
embedding_stack.push_back(cur);
6464
}
6565

@@ -134,7 +134,7 @@ ggml_cgraph * clip_graph_llava::build() {
134134
// process vision feature layers (used by granite)
135135
{
136136
// final layer is a vision feature layer
137-
if (hparams.is_vision_feature_layer(max_feature_layer)) {
137+
if (hparams.is_feature_layer(max_feature_layer)) {
138138
embedding_stack.push_back(inpL);
139139
}
140140

0 commit comments

Comments
 (0)