Skip to content

Commit bbf2625

Browse files
mtmd: rename gs_ prefixed tensors to generic/architecture names
1 parent: 57c8aa7 · commit: bbf2625

4 files changed

Lines changed: 92 additions & 90 deletions

File tree

tools/mtmd/clip-impl.h

Lines changed: 22 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -182,27 +182,28 @@
182182
#define TN_CONV_NORM "%s.blk.%d.conv_norm.%s"
183183
#define TN_CONV_PW1 "%s.blk.%d.conv_pw1.%s"
184184
#define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s"
185-
// granite_speech
186-
#define TN_GS_INP_LINEAR "a.enc_inp_linear.%s"
187-
#define TN_GS_CTC_OUT "a.enc_ctc_out.%s"
188-
#define TN_GS_CTC_OUT_MID "a.enc_ctc_out_mid.%s"
189-
#define TN_GS_ATTN_REL_POS "%s.blk.%d.attn_rel_pos_emb"
190-
#define TN_GS_PROJ_QUERY "a.proj_query"
191-
#define TN_GS_PROJ_NORM "a.proj_norm.%s"
192-
#define TN_GS_PROJ_LINEAR "a.proj_linear.%s"
193-
#define TN_GS_PROJ_SELF_ATTN_Q "a.proj_blk.%d.self_attn_q.%s"
194-
#define TN_GS_PROJ_SELF_ATTN_K "a.proj_blk.%d.self_attn_k.%s"
195-
#define TN_GS_PROJ_SELF_ATTN_V "a.proj_blk.%d.self_attn_v.%s"
196-
#define TN_GS_PROJ_SELF_ATTN_O "a.proj_blk.%d.self_attn_out.%s"
197-
#define TN_GS_PROJ_SELF_ATTN_N "a.proj_blk.%d.self_attn_norm.%s"
198-
#define TN_GS_PROJ_CROSS_ATTN_Q "a.proj_blk.%d.cross_attn_q.%s"
199-
#define TN_GS_PROJ_CROSS_ATTN_K "a.proj_blk.%d.cross_attn_k.%s"
200-
#define TN_GS_PROJ_CROSS_ATTN_V "a.proj_blk.%d.cross_attn_v.%s"
201-
#define TN_GS_PROJ_CROSS_ATTN_O "a.proj_blk.%d.cross_attn_out.%s"
202-
#define TN_GS_PROJ_CROSS_ATTN_N "a.proj_blk.%d.cross_attn_norm.%s"
203-
#define TN_GS_PROJ_FFN_UP "a.proj_blk.%d.ffn_up.%s"
204-
#define TN_GS_PROJ_FFN_DOWN "a.proj_blk.%d.ffn_down.%s"
205-
#define TN_GS_PROJ_FFN_NORM "a.proj_blk.%d.ffn_norm.%s"
185+
// granite_speech encoder
186+
#define TN_INP_PROJ "a.enc_inp_linear.%s"
187+
#define TN_CTC_OUT "a.enc_ctc_out.%s"
188+
#define TN_CTC_OUT_MID "a.enc_ctc_out_mid.%s"
189+
#define TN_ATTN_REL_POS_EMB "%s.blk.%d.attn_rel_pos_emb"
190+
// qformer projector
191+
#define TN_QF_PROJ_QUERY "a.proj_query"
192+
#define TN_QF_PROJ_NORM "a.proj_norm.%s"
193+
#define TN_QF_PROJ_LINEAR "a.proj_linear.%s"
194+
#define TN_QF_SELF_ATTN_Q "a.proj_blk.%d.self_attn_q.%s"
195+
#define TN_QF_SELF_ATTN_K "a.proj_blk.%d.self_attn_k.%s"
196+
#define TN_QF_SELF_ATTN_V "a.proj_blk.%d.self_attn_v.%s"
197+
#define TN_QF_SELF_ATTN_O "a.proj_blk.%d.self_attn_out.%s"
198+
#define TN_QF_SELF_ATTN_N "a.proj_blk.%d.self_attn_norm.%s"
199+
#define TN_QF_CROSS_ATTN_Q "a.proj_blk.%d.cross_attn_q.%s"
200+
#define TN_QF_CROSS_ATTN_K "a.proj_blk.%d.cross_attn_k.%s"
201+
#define TN_QF_CROSS_ATTN_V "a.proj_blk.%d.cross_attn_v.%s"
202+
#define TN_QF_CROSS_ATTN_O "a.proj_blk.%d.cross_attn_out.%s"
203+
#define TN_QF_CROSS_ATTN_N "a.proj_blk.%d.cross_attn_norm.%s"
204+
#define TN_QF_FFN_UP "a.proj_blk.%d.ffn_up.%s"
205+
#define TN_QF_FFN_DOWN "a.proj_blk.%d.ffn_down.%s"
206+
#define TN_QF_FFN_NORM "a.proj_blk.%d.ffn_norm.%s"
206207

207208
// gemma4 audio conformer
208209
#define TN_A_MM_INP_PROJ "mm.a.input_projection.%s"

tools/mtmd/clip-model.h

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,7 @@ struct mobilenetv5_block {
271271
ggml_tensor * attn_norm_w = nullptr;
272272
};
273273

274-
struct granite_speech_proj_layer {
274+
struct qformer_proj_layer {
275275
ggml_tensor * self_attn_q_w = nullptr;
276276
ggml_tensor * self_attn_q_b = nullptr;
277277
ggml_tensor * self_attn_k_w = nullptr;
@@ -519,19 +519,20 @@ struct clip_model {
519519
ggml_tensor * audio_out_proj_w = nullptr;
520520
ggml_tensor * audio_out_proj_b = nullptr;
521521

522-
// granite_speech encoder + projector
523-
ggml_tensor * gs_inp_linear_w = nullptr;
524-
ggml_tensor * gs_inp_linear_b = nullptr;
525-
ggml_tensor * gs_ctc_out_w = nullptr;
526-
ggml_tensor * gs_ctc_out_b = nullptr;
527-
ggml_tensor * gs_ctc_out_mid_w = nullptr;
528-
ggml_tensor * gs_ctc_out_mid_b = nullptr;
529-
ggml_tensor * gs_proj_query = nullptr;
530-
ggml_tensor * gs_proj_norm_w = nullptr;
531-
ggml_tensor * gs_proj_norm_b = nullptr;
532-
ggml_tensor * gs_proj_linear_w = nullptr;
533-
ggml_tensor * gs_proj_linear_b = nullptr;
534-
std::vector<granite_speech_proj_layer> gs_proj_layers;
522+
// granite_speech encoder
523+
ggml_tensor * inp_proj_w = nullptr;
524+
ggml_tensor * inp_proj_b = nullptr;
525+
ggml_tensor * ctc_out_w = nullptr;
526+
ggml_tensor * ctc_out_b = nullptr;
527+
ggml_tensor * ctc_out_mid_w = nullptr;
528+
ggml_tensor * ctc_out_mid_b = nullptr;
529+
// qformer projector
530+
ggml_tensor * qf_proj_query = nullptr;
531+
ggml_tensor * qf_proj_norm_w = nullptr;
532+
ggml_tensor * qf_proj_norm_b = nullptr;
533+
ggml_tensor * qf_proj_linear_w = nullptr;
534+
ggml_tensor * qf_proj_linear_b = nullptr;
535+
std::vector<qformer_proj_layer> qf_proj_layers;
535536

536537
bool audio_has_avgpool() const {
537538
return proj_type == PROJECTOR_TYPE_QWEN2A

tools/mtmd/clip.cpp

Lines changed: 44 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -2359,12 +2359,12 @@ struct clip_model_loader {
23592359
hparams.n_layer = n_layer_orig;
23602360
model.layers.resize(hparams.n_layer);
23612361

2362-
model.gs_inp_linear_w = get_tensor(string_format(TN_GS_INP_LINEAR, "weight"));
2363-
model.gs_inp_linear_b = get_tensor(string_format(TN_GS_INP_LINEAR, "bias"));
2364-
model.gs_ctc_out_w = get_tensor(string_format(TN_GS_CTC_OUT, "weight"));
2365-
model.gs_ctc_out_b = get_tensor(string_format(TN_GS_CTC_OUT, "bias"));
2366-
model.gs_ctc_out_mid_w = get_tensor(string_format(TN_GS_CTC_OUT_MID, "weight"));
2367-
model.gs_ctc_out_mid_b = get_tensor(string_format(TN_GS_CTC_OUT_MID, "bias"));
2362+
model.inp_proj_w = get_tensor(string_format(TN_INP_PROJ, "weight"));
2363+
model.inp_proj_b = get_tensor(string_format(TN_INP_PROJ, "bias"));
2364+
model.ctc_out_w = get_tensor(string_format(TN_CTC_OUT, "weight"));
2365+
model.ctc_out_b = get_tensor(string_format(TN_CTC_OUT, "bias"));
2366+
model.ctc_out_mid_w = get_tensor(string_format(TN_CTC_OUT_MID, "weight"));
2367+
model.ctc_out_mid_b = get_tensor(string_format(TN_CTC_OUT_MID, "bias"));
23682368

23692369
for (int il = 0; il < hparams.n_layer; ++il) {
23702370
auto & layer = model.layers[il];
@@ -2374,7 +2374,7 @@ struct clip_model_loader {
23742374
layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"));
23752375
layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "weight"));
23762376
layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "bias"));
2377-
layer.attn_rel_pos_emb = get_tensor(string_format(TN_GS_ATTN_REL_POS, prefix, il));
2377+
layer.attn_rel_pos_emb = get_tensor(string_format(TN_ATTN_REL_POS_EMB, prefix, il));
23782378

23792379
layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"));
23802380
layer.ln_1_b = get_tensor(string_format(TN_LN_1, prefix, il, "bias"));
@@ -2407,45 +2407,45 @@ struct clip_model_loader {
24072407
layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias"));
24082408
}
24092409

2410-
model.gs_proj_query = get_tensor(TN_GS_PROJ_QUERY);
2411-
model.gs_proj_norm_w = get_tensor(string_format(TN_GS_PROJ_NORM, "weight"));
2412-
model.gs_proj_norm_b = get_tensor(string_format(TN_GS_PROJ_NORM, "bias"));
2413-
model.gs_proj_linear_w = get_tensor(string_format(TN_GS_PROJ_LINEAR, "weight"));
2414-
model.gs_proj_linear_b = get_tensor(string_format(TN_GS_PROJ_LINEAR, "bias"));
2410+
model.qf_proj_query = get_tensor(TN_QF_PROJ_QUERY);
2411+
model.qf_proj_norm_w = get_tensor(string_format(TN_QF_PROJ_NORM, "weight"));
2412+
model.qf_proj_norm_b = get_tensor(string_format(TN_QF_PROJ_NORM, "bias"));
2413+
model.qf_proj_linear_w = get_tensor(string_format(TN_QF_PROJ_LINEAR, "weight"));
2414+
model.qf_proj_linear_b = get_tensor(string_format(TN_QF_PROJ_LINEAR, "bias"));
24152415

24162416
const int n_proj_layers = 2;
2417-
model.gs_proj_layers.resize(n_proj_layers);
2417+
model.qf_proj_layers.resize(n_proj_layers);
24182418
for (int il = 0; il < n_proj_layers; ++il) {
2419-
auto & pl = model.gs_proj_layers[il];
2420-
2421-
pl.self_attn_q_w = get_tensor(string_format(TN_GS_PROJ_SELF_ATTN_Q, il, "weight"));
2422-
pl.self_attn_q_b = get_tensor(string_format(TN_GS_PROJ_SELF_ATTN_Q, il, "bias"));
2423-
pl.self_attn_k_w = get_tensor(string_format(TN_GS_PROJ_SELF_ATTN_K, il, "weight"));
2424-
pl.self_attn_k_b = get_tensor(string_format(TN_GS_PROJ_SELF_ATTN_K, il, "bias"));
2425-
pl.self_attn_v_w = get_tensor(string_format(TN_GS_PROJ_SELF_ATTN_V, il, "weight"));
2426-
pl.self_attn_v_b = get_tensor(string_format(TN_GS_PROJ_SELF_ATTN_V, il, "bias"));
2427-
pl.self_attn_o_w = get_tensor(string_format(TN_GS_PROJ_SELF_ATTN_O, il, "weight"));
2428-
pl.self_attn_o_b = get_tensor(string_format(TN_GS_PROJ_SELF_ATTN_O, il, "bias"));
2429-
pl.self_attn_norm_w = get_tensor(string_format(TN_GS_PROJ_SELF_ATTN_N, il, "weight"));
2430-
pl.self_attn_norm_b = get_tensor(string_format(TN_GS_PROJ_SELF_ATTN_N, il, "bias"));
2431-
2432-
pl.cross_attn_q_w = get_tensor(string_format(TN_GS_PROJ_CROSS_ATTN_Q, il, "weight"));
2433-
pl.cross_attn_q_b = get_tensor(string_format(TN_GS_PROJ_CROSS_ATTN_Q, il, "bias"));
2434-
pl.cross_attn_k_w = get_tensor(string_format(TN_GS_PROJ_CROSS_ATTN_K, il, "weight"));
2435-
pl.cross_attn_k_b = get_tensor(string_format(TN_GS_PROJ_CROSS_ATTN_K, il, "bias"));
2436-
pl.cross_attn_v_w = get_tensor(string_format(TN_GS_PROJ_CROSS_ATTN_V, il, "weight"));
2437-
pl.cross_attn_v_b = get_tensor(string_format(TN_GS_PROJ_CROSS_ATTN_V, il, "bias"));
2438-
pl.cross_attn_o_w = get_tensor(string_format(TN_GS_PROJ_CROSS_ATTN_O, il, "weight"));
2439-
pl.cross_attn_o_b = get_tensor(string_format(TN_GS_PROJ_CROSS_ATTN_O, il, "bias"));
2440-
pl.cross_attn_norm_w = get_tensor(string_format(TN_GS_PROJ_CROSS_ATTN_N, il, "weight"));
2441-
pl.cross_attn_norm_b = get_tensor(string_format(TN_GS_PROJ_CROSS_ATTN_N, il, "bias"));
2442-
2443-
pl.ffn_up_w = get_tensor(string_format(TN_GS_PROJ_FFN_UP, il, "weight"));
2444-
pl.ffn_up_b = get_tensor(string_format(TN_GS_PROJ_FFN_UP, il, "bias"));
2445-
pl.ffn_down_w = get_tensor(string_format(TN_GS_PROJ_FFN_DOWN, il, "weight"));
2446-
pl.ffn_down_b = get_tensor(string_format(TN_GS_PROJ_FFN_DOWN, il, "bias"));
2447-
pl.ffn_norm_w = get_tensor(string_format(TN_GS_PROJ_FFN_NORM, il, "weight"));
2448-
pl.ffn_norm_b = get_tensor(string_format(TN_GS_PROJ_FFN_NORM, il, "bias"));
2419+
auto & pl = model.qf_proj_layers[il];
2420+
2421+
pl.self_attn_q_w = get_tensor(string_format(TN_QF_SELF_ATTN_Q, il, "weight"));
2422+
pl.self_attn_q_b = get_tensor(string_format(TN_QF_SELF_ATTN_Q, il, "bias"));
2423+
pl.self_attn_k_w = get_tensor(string_format(TN_QF_SELF_ATTN_K, il, "weight"));
2424+
pl.self_attn_k_b = get_tensor(string_format(TN_QF_SELF_ATTN_K, il, "bias"));
2425+
pl.self_attn_v_w = get_tensor(string_format(TN_QF_SELF_ATTN_V, il, "weight"));
2426+
pl.self_attn_v_b = get_tensor(string_format(TN_QF_SELF_ATTN_V, il, "bias"));
2427+
pl.self_attn_o_w = get_tensor(string_format(TN_QF_SELF_ATTN_O, il, "weight"));
2428+
pl.self_attn_o_b = get_tensor(string_format(TN_QF_SELF_ATTN_O, il, "bias"));
2429+
pl.self_attn_norm_w = get_tensor(string_format(TN_QF_SELF_ATTN_N, il, "weight"));
2430+
pl.self_attn_norm_b = get_tensor(string_format(TN_QF_SELF_ATTN_N, il, "bias"));
2431+
2432+
pl.cross_attn_q_w = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, il, "weight"));
2433+
pl.cross_attn_q_b = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, il, "bias"));
2434+
pl.cross_attn_k_w = get_tensor(string_format(TN_QF_CROSS_ATTN_K, il, "weight"));
2435+
pl.cross_attn_k_b = get_tensor(string_format(TN_QF_CROSS_ATTN_K, il, "bias"));
2436+
pl.cross_attn_v_w = get_tensor(string_format(TN_QF_CROSS_ATTN_V, il, "weight"));
2437+
pl.cross_attn_v_b = get_tensor(string_format(TN_QF_CROSS_ATTN_V, il, "bias"));
2438+
pl.cross_attn_o_w = get_tensor(string_format(TN_QF_CROSS_ATTN_O, il, "weight"));
2439+
pl.cross_attn_o_b = get_tensor(string_format(TN_QF_CROSS_ATTN_O, il, "bias"));
2440+
pl.cross_attn_norm_w = get_tensor(string_format(TN_QF_CROSS_ATTN_N, il, "weight"));
2441+
pl.cross_attn_norm_b = get_tensor(string_format(TN_QF_CROSS_ATTN_N, il, "bias"));
2442+
2443+
pl.ffn_up_w = get_tensor(string_format(TN_QF_FFN_UP, il, "weight"));
2444+
pl.ffn_up_b = get_tensor(string_format(TN_QF_FFN_UP, il, "bias"));
2445+
pl.ffn_down_w = get_tensor(string_format(TN_QF_FFN_DOWN, il, "weight"));
2446+
pl.ffn_down_b = get_tensor(string_format(TN_QF_FFN_DOWN, il, "bias"));
2447+
pl.ffn_norm_w = get_tensor(string_format(TN_QF_FFN_NORM, il, "weight"));
2448+
pl.ffn_norm_b = get_tensor(string_format(TN_QF_FFN_NORM, il, "bias"));
24492449
}
24502450
} break;
24512451
default:
@@ -3846,7 +3846,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
38463846
case PROJECTOR_TYPE_GEMMA4A:
38473847
return ctx->model.hparams.projection_dim;
38483848
case PROJECTOR_TYPE_GRANITE_SPEECH:
3849-
return ctx->model.gs_proj_linear_w->ne[1];
3849+
return ctx->model.qf_proj_linear_w->ne[1];
38503850
case PROJECTOR_TYPE_GLM4V:
38513851
return ctx->model.mm_ffn_down_w->ne[1];
38523852
default:

tools/mtmd/models/granite-speech.cpp

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@ ggml_cgraph * clip_graph_granite_speech::build() {
2929
auto * cur = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
3030
cb(cur, "inp_transposed", -1);
3131

32-
cur = build_mm(model.gs_inp_linear_w, cur);
33-
cur = ggml_add(ctx0, cur, model.gs_inp_linear_b);
32+
cur = build_mm(model.inp_proj_w, cur);
33+
cur = ggml_add(ctx0, cur, model.inp_proj_b);
3434
cb(cur, "inp_linear", -1);
3535

3636
for (int il = 0; il < n_layer; il++) {
@@ -171,11 +171,11 @@ ggml_cgraph * clip_graph_granite_speech::build() {
171171

172172
// CTC branch
173173
if (il + 1 == ctc_layer) {
174-
auto * mid = build_mm(model.gs_ctc_out_w, cur);
175-
mid = ggml_add(ctx0, mid, model.gs_ctc_out_b);
174+
auto * mid = build_mm(model.ctc_out_w, cur);
175+
mid = ggml_add(ctx0, mid, model.ctc_out_b);
176176
mid = ggml_soft_max(ctx0, mid);
177-
mid = build_mm(model.gs_ctc_out_mid_w, mid);
178-
mid = ggml_add(ctx0, mid, model.gs_ctc_out_mid_b);
177+
mid = build_mm(model.ctc_out_mid_w, mid);
178+
mid = ggml_add(ctx0, mid, model.ctc_out_mid_b);
179179
cur = ggml_add(ctx0, cur, mid);
180180
cb(cur, "ctc_branch", il);
181181
}
@@ -200,8 +200,8 @@ ggml_cgraph * clip_graph_granite_speech::build() {
200200

201201
ggml_tensor * enc_windows = ggml_reshape_3d(ctx0, cur, n_embd, window_size, nblocks_proj);
202202

203-
ggml_tensor * queries = build_norm(model.gs_proj_query,
204-
model.gs_proj_norm_w, model.gs_proj_norm_b,
203+
ggml_tensor * queries = build_norm(model.qf_proj_query,
204+
model.qf_proj_norm_w, model.qf_proj_norm_b,
205205
NORM_TYPE_NORMAL, proj_eps, -1);
206206
{
207207
ggml_tensor * q_3d = ggml_reshape_3d(ctx0, queries, n_embd, num_queries, 1);
@@ -210,8 +210,8 @@ ggml_cgraph * clip_graph_granite_speech::build() {
210210
queries = ggml_repeat(ctx0, q_3d, q_shape);
211211
}
212212

213-
for (int il = 0; il < (int)model.gs_proj_layers.size(); il++) {
214-
const auto & pl = model.gs_proj_layers[il];
213+
for (int il = 0; il < (int)model.qf_proj_layers.size(); il++) {
214+
const auto & pl = model.qf_proj_layers[il];
215215

216216
// self-attention
217217
{
@@ -266,7 +266,7 @@ ggml_cgraph * clip_graph_granite_speech::build() {
266266
}
267267

268268
cur = ggml_reshape_2d(ctx0, queries, n_embd, num_queries * nblocks_proj);
269-
cur = ggml_add(ctx0, build_mm(model.gs_proj_linear_w, cur), model.gs_proj_linear_b);
269+
cur = ggml_add(ctx0, build_mm(model.qf_proj_linear_w, cur), model.qf_proj_linear_b);
270270
cb(cur, "projector_out", -1);
271271
}
272272

0 commit comments

Comments
 (0)