Skip to content

Commit c09a6d9

Browse files
mtmd: replace hardcoded magic numbers with GGUF hparams for granite speech
1 parent ea83dd8 commit c09a6d9

7 files changed

Lines changed: 69 additions & 9 deletions

File tree

convert_hf_to_gguf.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12385,6 +12385,15 @@ def set_gguf_parameters(self):
1238512385
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GRANITE_SPEECH)
1238612386
self.gguf_writer.add_audio_num_mel_bins(a["input_dim"])
1238712387
self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
12388+
self.gguf_writer.add_audio_chunk_size(a["context_size"])
12389+
self.gguf_writer.add_audio_conv_kernel_size(a["conv_kernel_size"])
12390+
self.gguf_writer.add_audio_max_pos_emb(a["max_pos_emb"])
12391+
12392+
p = self.global_config
12393+
self.gguf_writer.add_audio_projector_window_size(p["window_size"])
12394+
self.gguf_writer.add_audio_projector_downsample_rate(p["downsample_rate"])
12395+
self.gguf_writer.add_audio_projector_head_count(p["projector_config"]["num_attention_heads"])
12396+
self.gguf_writer.add_audio_projector_layernorm_eps(p["projector_config"]["layer_norm_eps"])
1238812397

1238912398
def tensor_force_quant(self, name, new_name, bid, n_dims):
1239012399
if "encoder" in name or "projector" in name:

gguf-py/gguf/constants.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,13 +338,20 @@ class ClipAudio:
338338
FEED_FORWARD_LENGTH = "clip.audio.feed_forward_length"
339339
PROJECTION_DIM = "clip.audio.projection_dim"
340340
BLOCK_COUNT = "clip.audio.block_count"
341+
CHUNK_SIZE = "clip.audio.chunk_size"
342+
CONV_KERNEL_SIZE = "clip.audio.conv_kernel_size"
343+
MAX_POS_EMB = "clip.audio.max_pos_emb"
341344

342345
class Attention:
343346
HEAD_COUNT = "clip.audio.attention.head_count"
344347
LAYERNORM_EPS = "clip.audio.attention.layer_norm_epsilon"
345348

346349
class Projector:
347350
STACK_FACTOR = "clip.audio.projector.stack_factor"
351+
WINDOW_SIZE = "clip.audio.projector.window_size"
352+
DOWNSAMPLE_RATE = "clip.audio.projector.downsample_rate"
353+
HEAD_COUNT = "clip.audio.projector.head_count"
354+
LAYERNORM_EPS = "clip.audio.projector.layer_norm_epsilon"
348355

349356
class Diffusion:
350357
SHIFT_LOGITS = "diffusion.shift_logits"

gguf-py/gguf/gguf_writer.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1257,6 +1257,27 @@ def add_audio_num_mel_bins(self, value: int) -> None:
12571257
def add_audio_stack_factor(self, value: int) -> None:
12581258
self.add_uint32(Keys.ClipAudio.Projector.STACK_FACTOR, value)
12591259

1260+
def add_audio_chunk_size(self, value: int) -> None:
1261+
self.add_uint32(Keys.ClipAudio.CHUNK_SIZE, value)
1262+
1263+
def add_audio_conv_kernel_size(self, value: int) -> None:
1264+
self.add_uint32(Keys.ClipAudio.CONV_KERNEL_SIZE, value)
1265+
1266+
def add_audio_max_pos_emb(self, value: int) -> None:
1267+
self.add_uint32(Keys.ClipAudio.MAX_POS_EMB, value)
1268+
1269+
def add_audio_projector_window_size(self, value: int) -> None:
1270+
self.add_uint32(Keys.ClipAudio.Projector.WINDOW_SIZE, value)
1271+
1272+
def add_audio_projector_downsample_rate(self, value: int) -> None:
1273+
self.add_uint32(Keys.ClipAudio.Projector.DOWNSAMPLE_RATE, value)
1274+
1275+
def add_audio_projector_head_count(self, value: int) -> None:
1276+
self.add_uint32(Keys.ClipAudio.Projector.HEAD_COUNT, value)
1277+
1278+
def add_audio_projector_layernorm_eps(self, value: float) -> None:
1279+
self.add_float32(Keys.ClipAudio.Projector.LAYERNORM_EPS, value)
1280+
12601281
def add_xielu_alpha_p(self, values: Sequence[float]):
12611282
self.add_array(Keys.xIELU.ALPHA_P, values)
12621283

tools/mtmd/clip-impl.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,13 @@
6363
#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities
6464
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
6565
#define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor"
66+
#define KEY_A_CHUNK_SIZE "clip.audio.chunk_size"
67+
#define KEY_A_CONV_KERNEL_SIZE "clip.audio.conv_kernel_size"
68+
#define KEY_A_MAX_POS_EMB "clip.audio.max_pos_emb"
69+
#define KEY_A_PROJ_WINDOW_SIZE "clip.audio.projector.window_size"
70+
#define KEY_A_PROJ_DOWNSAMPLE_RATE "clip.audio.projector.downsample_rate"
71+
#define KEY_A_PROJ_HEAD_COUNT "clip.audio.projector.head_count"
72+
#define KEY_A_PROJ_LAYERNORM_EPS "clip.audio.projector.layer_norm_epsilon"
6673

6774

6875
//

tools/mtmd/clip-model.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,13 @@ struct clip_hparams {
9292
// audio
9393
int32_t n_mel_bins = 0; // whisper preprocessor
9494
int32_t proj_stack_factor = 0; // ultravox
95+
int32_t audio_chunk_size = 0;
96+
int32_t audio_conv_kernel_size = 0;
97+
int32_t audio_max_pos_emb = 0;
98+
int32_t audio_proj_window_size = 0;
99+
int32_t audio_proj_downsample_rate = 0;
100+
int32_t audio_proj_head_count = 0;
101+
float audio_proj_layernorm_eps = 0.0f;
95102

96103
// audio-to-mel preprocessor params
97104
int32_t audio_chunk_len = -1; // in seconds

tools/mtmd/clip.cpp

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1489,6 +1489,13 @@ struct clip_model_loader {
14891489
hparams.audio_n_fft = 512;
14901490
hparams.audio_window_len = 400;
14911491
hparams.audio_hop_len = 160;
1492+
get_u32(KEY_A_CHUNK_SIZE, hparams.audio_chunk_size);
1493+
get_u32(KEY_A_CONV_KERNEL_SIZE, hparams.audio_conv_kernel_size);
1494+
get_u32(KEY_A_MAX_POS_EMB, hparams.audio_max_pos_emb);
1495+
get_u32(KEY_A_PROJ_WINDOW_SIZE, hparams.audio_proj_window_size);
1496+
get_u32(KEY_A_PROJ_DOWNSAMPLE_RATE, hparams.audio_proj_downsample_rate);
1497+
get_u32(KEY_A_PROJ_HEAD_COUNT, hparams.audio_proj_head_count);
1498+
get_f32(KEY_A_PROJ_LAYERNORM_EPS, hparams.audio_proj_layernorm_eps);
14921499
} break;
14931500
case PROJECTOR_TYPE_JANUS_PRO:
14941501
{
@@ -3131,7 +3138,9 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
31313138
} break;
31323139
case PROJECTOR_TYPE_GRANITE_SPEECH:
31333140
{
3134-
n_patches = ((img->nx + 14) / 15) * 3;
3141+
const int ws = ctx->model.hparams.audio_proj_window_size;
3142+
const int ds = ctx->model.hparams.audio_proj_downsample_rate;
3143+
n_patches = ((img->nx + ws - 1) / ws) * (ws / ds);
31353144
} break;
31363145
default:
31373146
GGML_ABORT("unsupported projector type");
@@ -3666,8 +3675,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
36663675
} break;
36673676
case PROJECTOR_TYPE_GRANITE_SPEECH:
36683677
{
3669-
const int context_size = 200;
3670-
const int max_pos_emb = 512;
3678+
const int context_size = ctx->model.hparams.audio_chunk_size;
3679+
const int max_pos_emb = ctx->model.hparams.audio_max_pos_emb;
36713680

36723681
std::vector<int32_t> dists(context_size * context_size);
36733682
for (int i = 0; i < context_size; i++) {

tools/mtmd/models/granite-speech.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22

33
ggml_cgraph * clip_graph_granite_speech::build() {
44
const int n_frames = img.nx;
5-
const int context_size = 200;
5+
const int context_size = hparams.audio_chunk_size;
66
const int ctc_layer = n_layer / 2;
7-
const int conv_kernel = 15;
7+
const int conv_kernel = hparams.audio_conv_kernel_size;
88
const int conv_pad = conv_kernel / 2;
99

1010
const int num_blocks = (n_frames + context_size - 1) / context_size;
@@ -185,12 +185,12 @@ ggml_cgraph * clip_graph_granite_speech::build() {
185185

186186
// QFormer projector
187187
{
188-
const int window_size = 15;
189-
const int num_queries = 3;
190-
const int proj_n_head = 16;
188+
const int window_size = hparams.audio_proj_window_size;
189+
const int num_queries = window_size / hparams.audio_proj_downsample_rate;
190+
const int proj_n_head = hparams.audio_proj_head_count;
191191
const int proj_d_head = n_embd / proj_n_head;
192192
const float proj_kq_scale = 1.0f / sqrtf((float)proj_d_head);
193-
const float proj_eps = 1e-12f;
193+
const float proj_eps = hparams.audio_proj_layernorm_eps;
194194
const int nblocks_proj = (n_frames + window_size - 1) / window_size;
195195
const int padded_proj = nblocks_proj * window_size;
196196

0 commit comments

Comments
 (0)