Skip to content

Commit f4c14e1

Browse files
mtmd: add granite-speech support (ibm-granite/granite-4.0-1b-speech)
Conformer encoder with Shaw relative position encoding, QFormer projector, log-mel spectrogram with frame stacking. Encoder uses GLU gating, folded batch norm, and SSM depthwise conv. QFormer compresses encoder output via windowed cross-attention (window=15, queries=3) into the LLM embedding space. Audio preprocessing: reflect-padded STFT, 80-bin mel filterbank, dynamic range compression, 2x frame stacking (80->160 mel). GGUF converter handles batch norm folding at export time, fused K/V split, and Conv1d weight reshaping. Tested against HF transformers reference: token-for-token match on 30s/60s audio clips with greedy decoding.
1 parent 9e5647a commit f4c14e1

12 files changed

Lines changed: 802 additions & 7 deletions

File tree

convert_hf_to_gguf.py

Lines changed: 167 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10611,6 +10611,18 @@ def set_gguf_parameters(self):
1061110611
logger.info("gguf: (granite) logits_scale = %s", logits_scale)
1061210612

1061310613

10614+
@ModelBase.register("GraniteSpeechForConditionalGeneration", ModelType.TEXT)
class GraniteSpeechTextModel(GraniteModel):
    """Text-side conversion for GraniteSpeech.

    The language model is a plain Granite LLM wrapped in a multimodal
    container; the audio encoder and projector are exported separately by
    GraniteSpeechMmprojModel, so they are dropped here.
    """

    model_arch = gguf.MODEL_ARCH.GRANITE

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # Audio tower and projector weights belong to the mmproj file only.
        if name.startswith(("encoder.", "projector.")):
            return
        # Unwrap the multimodal prefix so Granite's own tensor mapping applies.
        name = name.removeprefix("language_model.")
        yield from super().modify_tensors(data_torch, name, bid)
1061410626
@ModelBase.register("GraniteMoeForCausalLM", "GraniteMoeSharedForCausalLM")
1061510627
class GraniteMoeModel(GraniteModel):
1061610628
"""Conversion for IBM's GraniteMoeForCausalLM"""
@@ -10912,14 +10924,14 @@ def set_vocab(self):
1091210924
vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
1091310925
self.hparams["vocab_size"] = vocab_size
1091410926

10915-
assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute]
10927+
assert max(tokenizer.vocab.values()) < vocab_size
1091610928

1091710929
tokpre = self.get_vocab_base_pre(tokenizer)
1091810930

10919-
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]
10920-
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
10931+
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
10932+
added_vocab = tokenizer.get_added_vocab()
1092110933

10922-
added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]
10934+
added_tokens_decoder = tokenizer.added_tokens_decoder
1092310935

1092410936
for i in range(vocab_size):
1092510937
if i not in reverse_vocab:
@@ -10930,7 +10942,7 @@ def set_vocab(self):
1093010942
if token in added_vocab:
1093110943
if not added_tokens_decoder[i].normalized:
1093210944
previous_token = token
10933-
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment]
10945+
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
1093410946
if previous_token != token:
1093510947
logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
1093610948

@@ -12347,6 +12359,154 @@ def modify_tensors(self, data_torch, name, bid):
1234712359
yield from super().modify_tensors(data_torch, name, bid)
1234812360

1234912361

12362+
@ModelBase.register("GraniteSpeechForConditionalGeneration", ModelType.MMPROJ)
class GraniteSpeechMmprojModel(MmprojModel):
    """Audio (mmproj) conversion for GraniteSpeech.

    Exports the Conformer audio encoder and the QFormer projector. At export
    time this class folds batch-norm running statistics into an affine
    weight/bias pair, splits the fused K/V attention projection, and reshapes
    kernel-size-1 / depthwise Conv1d weights for the C++ runtime.
    """

    has_vision_encoder = False
    has_audio_encoder = True

    # Lazily-allocated per-layer staging area that collects the four
    # batch-norm tensors (weight, bias, running_mean, running_var) so they
    # can be folded together once all of them have been seen.
    _batch_norm_tensors: list[dict[str, Tensor]] | None = None

    def get_audio_config(self) -> dict[str, Any] | None:
        # GraniteSpeech keeps the encoder hparams under "encoder_config".
        return self.global_config.get("encoder_config")

    def set_gguf_parameters(self):
        assert self.hparams_audio is not None
        audio = self.hparams_audio
        # Translate granite-speech hparam names to the generic keys that
        # MmprojModel.set_gguf_parameters() reads.
        audio["hidden_size"] = audio["hidden_dim"]
        audio["intermediate_size"] = audio["hidden_dim"] * audio["feedforward_mult"]
        audio["num_attention_heads"] = audio["num_heads"]
        audio["num_hidden_layers"] = audio["num_layers"]

        super().set_gguf_parameters()

        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GRANITE_SPEECH)
        self.gguf_writer.add_audio_num_mel_bins(audio["input_dim"])
        self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        # Keep convolution weights in F32 — the conv stack is small and
        # sensitive to quantization error.
        if "encoder" in name or "projector" in name:
            if ".conv" in name and ".weight" in name:
                return gguf.GGMLQuantizationType.F32
        return super().tensor_force_quant(name, new_name, bid, n_dims)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # LLM weights are handled by GraniteSpeechTextModel; the remaining
        # names here are buffers that are not needed at inference time.
        if name.startswith("language_model.") or "attention_dists" in name or "num_batches_tracked" in name:
            return

        # Fold running_mean/running_var/eps into weight and bias so the
        # runtime can apply batch norm as a plain affine transform.
        if "batch_norm" in name and "encoder.layers." in name:
            if self._batch_norm_tensors is None:
                self._batch_norm_tensors = [{} for _ in range(self.block_count)]
            assert bid is not None
            stash = self._batch_norm_tensors[bid]
            stash[name] = data_torch
            if len(stash) < 4:
                return  # not all four tensors of this layer have arrived yet
            prefix = f"encoder.layers.{bid}.conv.batch_norm"
            eps = 1e-5
            scale = stash[f"{prefix}.weight"] / torch.sqrt(stash[f"{prefix}.running_var"] + eps)
            shift = stash[f"{prefix}.bias"] - stash[f"{prefix}.running_mean"] * scale
            yield from super().modify_tensors(scale, f"{prefix}.weight", bid)
            yield from super().modify_tensors(shift, f"{prefix}.bias", bid)
            return

        # The encoder stores K and V as one fused projection; split it into
        # the separate tensors the tensor mapping expects.
        if ".attn.to_kv.weight" in name:
            k_weight, v_weight = data_torch.chunk(2, dim=0)
            yield from super().modify_tensors(k_weight, name.replace("to_kv", "to_k"), bid)
            yield from super().modify_tensors(v_weight, name.replace("to_kv", "to_v"), bid)
            return

        # Pointwise Conv1d (kernel size 1): drop the trailing kernel dim so
        # the weight becomes a plain matrix.
        if name.endswith(".weight") and ("up_conv" in name or "down_conv" in name):
            if data_torch.ndim == 3 and data_torch.shape[2] == 1:
                data_torch = data_torch.squeeze(2)

        # Depthwise Conv1d: drop the singleton per-group input-channel dim.
        if name.endswith(".weight") and "depth_conv" in name:
            if data_torch.ndim == 3 and data_torch.shape[1] == 1:
                data_torch = data_torch.squeeze(1)

        # QFormer projector tensors use an explicit name map.
        if name.startswith("projector."):
            gguf_name = self._map_projector_tensor(name)
            if gguf_name is not None:
                yield (gguf_name, data_torch)
            return

        # Non-layer encoder tensors with fixed gguf names.
        global_map = {
            "encoder.input_linear.weight": "a.enc_inp_linear.weight",
            "encoder.input_linear.bias": "a.enc_inp_linear.bias",
            "encoder.out.weight": "a.enc_ctc_out.weight",
            "encoder.out.bias": "a.enc_ctc_out.bias",
            "encoder.out_mid.weight": "a.enc_ctc_out_mid.weight",
            "encoder.out_mid.bias": "a.enc_ctc_out_mid.bias",
        }
        mapped = global_map.get(name)
        if mapped is not None:
            yield (mapped, data_torch)
            return

        # Shaw relative position embedding, one per encoder layer.
        if ".attn.rel_pos_emb.weight" in name:
            assert bid is not None
            yield (f"a.blk.{bid}.attn_rel_pos_emb", data_torch)
            return

        yield from super().modify_tensors(data_torch, name, bid)

    @staticmethod
    def _map_projector_tensor(name: str) -> str | None:
        """Map an HF projector tensor name to its gguf name.

        Returns None for tensors that should be dropped.
        """
        static_map = {
            "projector.query": "a.proj_query",
            "projector.qformer.layernorm.weight": "a.proj_norm.weight",
            "projector.qformer.layernorm.bias": "a.proj_norm.bias",
            "projector.linear.weight": "a.proj_linear.weight",
            "projector.linear.bias": "a.proj_linear.bias",
        }
        if name in static_map:
            return static_map[name]
        m = re.match(r"projector\.qformer\.encoder\.layer\.(\d+)\.(.*)", name)
        if not m:
            return None
        lid, rest = m.group(1), m.group(2)
        layer_map = {
            "attention.attention.query.weight": "self_attn_q.weight",
            "attention.attention.query.bias": "self_attn_q.bias",
            "attention.attention.key.weight": "self_attn_k.weight",
            "attention.attention.key.bias": "self_attn_k.bias",
            "attention.attention.value.weight": "self_attn_v.weight",
            "attention.attention.value.bias": "self_attn_v.bias",
            "attention.output.dense.weight": "self_attn_out.weight",
            "attention.output.dense.bias": "self_attn_out.bias",
            "attention.output.LayerNorm.weight": "self_attn_norm.weight",
            "attention.output.LayerNorm.bias": "self_attn_norm.bias",
            "crossattention.attention.query.weight": "cross_attn_q.weight",
            "crossattention.attention.query.bias": "cross_attn_q.bias",
            "crossattention.attention.key.weight": "cross_attn_k.weight",
            "crossattention.attention.key.bias": "cross_attn_k.bias",
            "crossattention.attention.value.weight": "cross_attn_v.weight",
            "crossattention.attention.value.bias": "cross_attn_v.bias",
            "crossattention.output.dense.weight": "cross_attn_out.weight",
            "crossattention.output.dense.bias": "cross_attn_out.bias",
            "crossattention.output.LayerNorm.weight": "cross_attn_norm.weight",
            "crossattention.output.LayerNorm.bias": "cross_attn_norm.bias",
            "intermediate_query.dense.weight": "ffn_up.weight",
            "intermediate_query.dense.bias": "ffn_up.bias",
            "output_query.dense.weight": "ffn_down.weight",
            "output_query.dense.bias": "ffn_down.bias",
            "output_query.LayerNorm.weight": "ffn_norm.weight",
            "output_query.LayerNorm.bias": "ffn_norm.bias",
        }
        suffix = layer_map.get(rest)
        if suffix is None:
            return None
        return f"a.proj_blk.{lid}.{suffix}"
1235012510
@ModelBase.register("Lfm25AudioTokenizer")
1235112511
class LFM25AudioTokenizer(LFM2Model):
1235212512
model_arch = gguf.MODEL_ARCH.LFM2
@@ -13356,6 +13516,8 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
1335613516
# TODO: refactor this later to avoid adding exception here
1335713517
if model_type == ModelType.TEXT and arch == "StepVLForConditionalGeneration":
1335813518
return arch
13519+
if model_type == ModelType.TEXT and arch == "GraniteSpeechForConditionalGeneration":
13520+
return arch
1335913521

1336013522
# if "architectures" is found in the sub-config, use that instead
1336113523
if model_type == ModelType.TEXT and text_config.get("architectures") is not None:

gguf-py/gguf/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4138,6 +4138,7 @@ class VisionProjectorType:
41384138
YOUTUVL = "youtuvl"
41394139
NEMOTRON_V2_VL = "nemotron_v2_vl"
41404140
HUNYUANOCR = "hunyuanocr"
4141+
GRANITE_SPEECH = "granite_speech" # audio
41414142

41424143

41434144
# Items here are (block size, type size)

gguf-py/gguf/tensor_mapping.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1912,20 +1912,23 @@ class TensorNameMap:
19121912
"conformer.layers.{bid}.self_attn.linear_q", # lfm2
19131913
"conformer.layers.{bid}.attention.attn.q_proj", # gemma3n
19141914
"conformer.layers.{bid}.self_attn.q_proj", # gemma4
1915+
"encoder.layers.{bid}.attn.to_q", # granite_speech
19151916
),
19161917

19171918
MODEL_TENSOR.A_ENC_ATTN_K: (
19181919
"audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
19191920
"conformer.layers.{bid}.self_attn.linear_k", # lfm2
19201921
"conformer.layers.{bid}.attention.attn.k_proj", # gemma3n
19211922
"conformer.layers.{bid}.self_attn.k_proj", # gemma4
1923+
"encoder.layers.{bid}.attn.to_k", # granite_speech (split from to_kv)
19221924
),
19231925

19241926
MODEL_TENSOR.A_ENC_ATTN_V: (
19251927
"audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
19261928
"conformer.layers.{bid}.self_attn.linear_v", # lfm2
19271929
"conformer.layers.{bid}.attention.attn.v_proj", # gemma3n
19281930
"conformer.layers.{bid}.self_attn.v_proj", # gemma4
1931+
"encoder.layers.{bid}.attn.to_v", # granite_speech (split from to_kv)
19291932
),
19301933

19311934
MODEL_TENSOR.A_ENC_ATTN_K_REL: (
@@ -1953,25 +1956,29 @@ class TensorNameMap:
19531956
"audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
19541957
"conformer.layers.{bid}.norm_self_att", # lfm2
19551958
"conformer.layers.{bid}.attention.pre_attn_norm", # gemma3n
1959+
"encoder.layers.{bid}.attn.pre_norm", # granite_speech
19561960
),
19571961

19581962
MODEL_TENSOR.A_ENC_OUTPUT: (
19591963
"audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
19601964
"conformer.layers.{bid}.self_attn.linear_out", # lfm2
19611965
"conformer.layers.{bid}.attention.post", # gemma3n
19621966
"conformer.layers.{bid}.self_attn.post", # gemma4
1967+
"encoder.layers.{bid}.attn.to_out", # granite_speech
19631968
),
19641969

19651970
MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
19661971
"audio_tower.layers.{bid}.final_layer_norm", # ultravox
19671972
"conformer.layers.{bid}.norm_out", # lfm2
19681973
"conformer.layers.{bid}.attention.post_norm", # gemma3n
1974+
"encoder.layers.{bid}.post_norm", # granite_speech
19691975
),
19701976

19711977
MODEL_TENSOR.A_ENC_FFN_NORM: (
19721978
"conformer.layers.{bid}.norm_feed_forward1", # lfm2
19731979
"conformer.layers.{bid}.ffw_layer_start.pre_layer_norm", # gemma3n
19741980
"conformer.layers.{bid}.feed_forward1.pre_layer_norm", # gemma4
1981+
"encoder.layers.{bid}.ff1.pre_norm", # granite_speech
19751982
),
19761983

19771984
MODEL_TENSOR.A_ENC_FFN_POST_NORM: (
@@ -1988,6 +1995,7 @@ class TensorNameMap:
19881995
"conformer.layers.{bid}.feed_forward1.linear1", # lfm2
19891996
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_1", # gemma3n
19901997
"conformer.layers.{bid}.feed_forward1.ffw_layer_1", # gemma4
1998+
"encoder.layers.{bid}.ff1.up_proj", # granite_speech
19911999
),
19922000

19932001
MODEL_TENSOR.A_ENC_FFN_GATE: (),
@@ -1997,24 +2005,28 @@ class TensorNameMap:
19972005
"conformer.layers.{bid}.feed_forward1.linear2", # lfm2
19982006
"conformer.layers.{bid}.ffw_layer_start.ffw_layer_2", # gemma3n
19992007
"conformer.layers.{bid}.feed_forward1.ffw_layer_2", # gemma4
2008+
"encoder.layers.{bid}.ff1.down_proj", # granite_speech
20002009
),
20012010

20022011
MODEL_TENSOR.A_ENC_FFN_UP_1: (
20032012
"conformer.layers.{bid}.feed_forward2.linear1", # lfm2
20042013
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_1", # gemma3n
20052014
"conformer.layers.{bid}.feed_forward2.ffw_layer_1", # gemma4
2015+
"encoder.layers.{bid}.ff2.up_proj", # granite_speech
20062016
),
20072017

20082018
MODEL_TENSOR.A_ENC_FFN_DOWN_1: (
20092019
"conformer.layers.{bid}.feed_forward2.linear2", # lfm2
20102020
"conformer.layers.{bid}.ffw_layer_end.ffw_layer_2", # gemma3n
20112021
"conformer.layers.{bid}.feed_forward2.ffw_layer_2", # gemma4
2022+
"encoder.layers.{bid}.ff2.down_proj", # granite_speech
20122023
),
20132024

20142025
MODEL_TENSOR.A_ENC_FFN_NORM_1: (
20152026
"conformer.layers.{bid}.norm_feed_forward2", # lfm2
20162027
"conformer.layers.{bid}.ffw_layer_end.pre_layer_norm", # gemma3n
20172028
"conformer.layers.{bid}.feed_forward2.pre_layer_norm", # gemma4
2029+
"encoder.layers.{bid}.ff2.pre_norm", # granite_speech
20182030
),
20192031

20202032
MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: (
@@ -2071,26 +2083,31 @@ class TensorNameMap:
20712083
MODEL_TENSOR.A_ENC_CONV_DW: (
20722084
"conformer.layers.{bid}.conv.depthwise_conv", # lfm2
20732085
"conformer.layers.{bid}.lconv1d.depthwise_conv1d", # gemma3n
2086+
"encoder.layers.{bid}.conv.depth_conv.conv", # granite_speech
20742087
),
20752088

20762089
MODEL_TENSOR.A_ENC_CONV_NORM: (
20772090
"conformer.layers.{bid}.conv.batch_norm", # lfm2
20782091
"conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma3n
2092+
"encoder.layers.{bid}.conv.batch_norm", # granite_speech
20792093
),
20802094

20812095
MODEL_TENSOR.A_ENC_CONV_PW1: (
20822096
"conformer.layers.{bid}.conv.pointwise_conv1", # lfm2
20832097
"conformer.layers.{bid}.lconv1d.linear_start", # gemma3n
2098+
"encoder.layers.{bid}.conv.up_conv", # granite_speech
20842099
),
20852100

20862101
MODEL_TENSOR.A_ENC_CONV_PW2: (
20872102
"conformer.layers.{bid}.conv.pointwise_conv2", # lfm2
20882103
"conformer.layers.{bid}.lconv1d.linear_end", # gemma3n
2104+
"encoder.layers.{bid}.conv.down_conv", # granite_speech
20892105
),
20902106

20912107
MODEL_TENSOR.A_ENC_NORM_CONV: (
20922108
"conformer.layers.{bid}.norm_conv", # lfm2
20932109
"conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n
2110+
"encoder.layers.{bid}.conv.norm", # granite_speech
20942111
),
20952112

20962113
MODEL_TENSOR.A_PER_DIM_K_SCALE: (

tools/mtmd/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ add_library(mtmd
2020
models/dotsocr.cpp
2121
models/gemma4a.cpp
2222
models/gemma4v.cpp
23+
models/granite-speech.cpp
2324
models/glm4v.cpp
2425
models/hunyuanocr.cpp
2526
models/internvl.cpp

tools/mtmd/clip-impl.h

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,27 @@
182182
#define TN_CONV_NORM "%s.blk.%d.conv_norm.%s"
183183
#define TN_CONV_PW1 "%s.blk.%d.conv_pw1.%s"
184184
#define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s"
185+
// granite_speech
186+
#define TN_GS_INP_LINEAR "a.enc_inp_linear.%s"
187+
#define TN_GS_CTC_OUT "a.enc_ctc_out.%s"
188+
#define TN_GS_CTC_OUT_MID "a.enc_ctc_out_mid.%s"
189+
#define TN_GS_ATTN_REL_POS "%s.blk.%d.attn_rel_pos_emb"
190+
#define TN_GS_PROJ_QUERY "a.proj_query"
191+
#define TN_GS_PROJ_NORM "a.proj_norm.%s"
192+
#define TN_GS_PROJ_LINEAR "a.proj_linear.%s"
193+
#define TN_GS_PROJ_SELF_ATTN_Q "a.proj_blk.%d.self_attn_q.%s"
194+
#define TN_GS_PROJ_SELF_ATTN_K "a.proj_blk.%d.self_attn_k.%s"
195+
#define TN_GS_PROJ_SELF_ATTN_V "a.proj_blk.%d.self_attn_v.%s"
196+
#define TN_GS_PROJ_SELF_ATTN_O "a.proj_blk.%d.self_attn_out.%s"
197+
#define TN_GS_PROJ_SELF_ATTN_N "a.proj_blk.%d.self_attn_norm.%s"
198+
#define TN_GS_PROJ_CROSS_ATTN_Q "a.proj_blk.%d.cross_attn_q.%s"
199+
#define TN_GS_PROJ_CROSS_ATTN_K "a.proj_blk.%d.cross_attn_k.%s"
200+
#define TN_GS_PROJ_CROSS_ATTN_V "a.proj_blk.%d.cross_attn_v.%s"
201+
#define TN_GS_PROJ_CROSS_ATTN_O "a.proj_blk.%d.cross_attn_out.%s"
202+
#define TN_GS_PROJ_CROSS_ATTN_N "a.proj_blk.%d.cross_attn_norm.%s"
203+
#define TN_GS_PROJ_FFN_UP "a.proj_blk.%d.ffn_up.%s"
204+
#define TN_GS_PROJ_FFN_DOWN "a.proj_blk.%d.ffn_down.%s"
205+
#define TN_GS_PROJ_FFN_NORM "a.proj_blk.%d.ffn_norm.%s"
185206

186207
// gemma4 audio conformer
187208
#define TN_A_MM_INP_PROJ "mm.a.input_projection.%s"
@@ -293,6 +314,7 @@ enum projector_type {
293314
PROJECTOR_TYPE_KIMIK25,
294315
PROJECTOR_TYPE_NEMOTRON_V2_VL,
295316
PROJECTOR_TYPE_HUNYUANOCR,
317+
PROJECTOR_TYPE_GRANITE_SPEECH,
296318
PROJECTOR_TYPE_UNKNOWN,
297319
};
298320

@@ -338,6 +360,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
338360
{ PROJECTOR_TYPE_KIMIK25, "kimik25"},
339361
{ PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
340362
{ PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"},
363+
{ PROJECTOR_TYPE_GRANITE_SPEECH, "granite_speech"},
341364
};
342365

343366
static projector_type clip_projector_type_from_string(const std::string & str) {

0 commit comments

Comments
 (0)