Skip to content

Commit d8bf733

Browse files
committed
model : extend build_qkv to bert/mpt/dbrx/olmo/lfm2/nemotron-h/granite-hybrid/gemma3n-iswa/t5-dec and fix wqkv_s (it is now passed to build_lora_mm in the fused QKV path)
1 parent 2976e27 commit d8bf733

10 files changed

Lines changed: 32 additions & 175 deletions

File tree

src/llama-graph.cpp

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1059,12 +1059,16 @@ llm_graph_qkv llm_graph_context::build_qkv(
10591059

10601060
if (layer.wqkv) {
10611061
// fused QKV path
1062-
ggml_tensor * qkv = build_lora_mm(layer.wqkv, cur);
1062+
ggml_tensor * qkv = build_lora_mm(layer.wqkv, cur, layer.wqkv_s);
10631063
cb(qkv, "wqkv", il);
10641064
if (layer.bqkv) {
10651065
qkv = ggml_add(ctx0, qkv, layer.bqkv);
10661066
cb(qkv, "bqkv", il);
10671067
}
1068+
if (hparams.f_clamp_kqv > 0.0f) {
1069+
qkv = ggml_clamp(ctx0, qkv, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
1070+
cb(qkv, "wqkv_clamped", il);
1071+
}
10681072
Qcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head, n_tokens,
10691073
ggml_element_size(qkv) * n_embd_head, qkv->nb[1], 0);
10701074
Kcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens,
@@ -1081,18 +1085,30 @@ llm_graph_qkv llm_graph_context::build_qkv(
10811085
Qcur = ggml_add(ctx0, Qcur, layer.bq);
10821086
cb(Qcur, "Qcur", il);
10831087
}
1088+
if (hparams.f_clamp_kqv > 0.0f) {
1089+
Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
1090+
cb(Qcur, "Qcur_clamped", il);
1091+
}
10841092
Kcur = build_lora_mm(layer.wk, cur, layer.wk_s);
10851093
cb(Kcur, "Kcur", il);
10861094
if (layer.bk) {
10871095
Kcur = ggml_add(ctx0, Kcur, layer.bk);
10881096
cb(Kcur, "Kcur", il);
10891097
}
1098+
if (hparams.f_clamp_kqv > 0.0f) {
1099+
Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
1100+
cb(Kcur, "Kcur_clamped", il);
1101+
}
10901102
Vcur = build_lora_mm(layer.wv, cur, layer.wv_s);
10911103
cb(Vcur, "Vcur", il);
10921104
if (layer.bv) {
10931105
Vcur = ggml_add(ctx0, Vcur, layer.bv);
10941106
cb(Vcur, "Vcur", il);
10951107
}
1108+
if (hparams.f_clamp_kqv > 0.0f) {
1109+
Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
1110+
cb(Vcur, "Vcur_clamped", il);
1111+
}
10961112
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
10971113
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
10981114
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
@@ -1105,6 +1121,7 @@ llm_graph_qkv llm_graph_context::build_qkv(
11051121
return { Qcur, Kcur, Vcur };
11061122
}
11071123

1124+
11081125
ggml_tensor * llm_graph_context::build_ffn(
11091126
ggml_tensor * cur,
11101127
ggml_tensor * up,

src/models/bert.cpp

Lines changed: 2 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
44
const int64_t n_embd_head = hparams.n_embd_head_v();
5-
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
65

76
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
87

@@ -39,35 +38,8 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
3938
ggml_tensor * cur = inpL;
4039

4140
{
42-
ggml_tensor * Qcur;
43-
ggml_tensor * Kcur;
44-
ggml_tensor * Vcur;
45-
46-
// self-attention
47-
if (model.layers[il].wqkv) {
48-
cur = build_lora_mm(model.layers[il].wqkv, cur);
49-
cb(cur, "wqkv", il);
50-
51-
if (model.layers[il].bqkv) {
52-
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
53-
cb(cur, "bqkv", il);
54-
}
55-
56-
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1],
57-
0 * sizeof(float) * (n_embd));
58-
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
59-
cur->nb[1], 1 * sizeof(float) * (n_embd));
60-
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
61-
cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
62-
} else {
63-
Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
64-
Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
65-
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
66-
67-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
68-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
69-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
70-
}
41+
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
42+
n_embd_head, n_head, n_head_kv, il);
7143

7244
if (model.layers[il].attn_q_norm) {
7345
Qcur = ggml_reshape_2d(ctx0, Qcur, n_embd_head * n_head, n_tokens);

src/models/dbrx.cpp

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
44
const int64_t n_embd_head = hparams.n_embd_head_v();
5-
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
65

76
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
87
GGML_ASSERT(n_embd_head == n_rot);
@@ -30,19 +29,8 @@ llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params
3029

3130
// self-attention
3231
{
33-
ggml_tensor * Qcur = nullptr;
34-
ggml_tensor * Kcur = nullptr;
35-
ggml_tensor * Vcur = nullptr;
36-
37-
cur = build_lora_mm(model.layers[il].wqkv, cur);
38-
cb(cur, "wqkv", il);
39-
40-
cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
41-
cb(cur, "wqkv_clamped", il);
42-
43-
Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
44-
Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
45-
Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
32+
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
33+
n_embd_head, n_head, n_head_kv, il);
4634

4735
Qcur = ggml_rope_ext(
4836
ctx0, Qcur, inp_pos, nullptr,

src/models/gemma3n-iswa.cpp

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -62,19 +62,7 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
6262

6363
// self-attention
6464
if (hparams.has_kv(il)) {
65-
// compute Q and K and RoPE them
66-
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
67-
cb(Qcur, "Qcur", il);
68-
69-
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
70-
cb(Kcur, "Kcur", il);
71-
72-
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
73-
cb(Vcur, "Vcur", il);
74-
75-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
76-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
77-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
65+
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur, n_embd_head, n_head, n_head_kv, il);
7866

7967
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
8068
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);

src/models/granite-hybrid.cpp

Lines changed: 1 addition & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -73,31 +73,7 @@ ggml_tensor * llm_build_granite_hybrid::build_attention_layer(ggml_tensor *
7373
const llama_model & model,
7474
const int64_t n_embd_head,
7575
const int il) {
76-
// compute Q and K and (optionally) RoPE them
77-
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
78-
cb(Qcur, "Qcur", il);
79-
if (model.layers[il].bq) {
80-
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
81-
cb(Qcur, "Qcur", il);
82-
}
83-
84-
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
85-
cb(Kcur, "Kcur", il);
86-
if (model.layers[il].bk) {
87-
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
88-
cb(Kcur, "Kcur", il);
89-
}
90-
91-
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
92-
cb(Vcur, "Vcur", il);
93-
if (model.layers[il].bv) {
94-
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
95-
cb(Vcur, "Vcur", il);
96-
}
97-
98-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
99-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
100-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
76+
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur, n_embd_head, hparams.n_head(il), hparams.n_head_kv(il), il);
10177

10278
const bool use_rope = hparams.rope_finetuned;
10379
if (use_rope) {

src/models/lfm2.cpp

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -42,16 +42,8 @@ llm_build_lfm2<iswa>::llm_build_lfm2(const llama_model & model, const llm_graph_
4242
const auto n_embd_head = hparams.n_embd_head_v();
4343
const auto n_head_kv = hparams.n_head_kv(il);
4444

45-
auto * q = build_lora_mm(model.layers[il].wq, cur);
46-
cb(q, "model.layers.{}.self_attn.q_proj", il);
47-
auto * k = build_lora_mm(model.layers[il].wk, cur);
48-
cb(k, "model.layers.{}.self_attn.k_proj", il);
49-
auto * v = build_lora_mm(model.layers[il].wv, cur);
50-
cb(v, "model.layers.{}.self_attn.v_proj", il);
51-
52-
q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens);
53-
k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
54-
v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
45+
auto [q, k, v] = build_qkv(model.layers[il], cur,
46+
n_embd_head, n_head, n_head_kv, il);
5547

5648
// qk norm
5749
q = build_norm(q, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);

src/models/mpt.cpp

Lines changed: 2 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
llm_build_mpt::llm_build_mpt(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
66
const int64_t n_embd_head = hparams.n_embd_head_v();
7-
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
87

98
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
109

@@ -38,25 +37,8 @@ llm_build_mpt::llm_build_mpt(const llama_model & model, const llm_graph_params &
3837
{
3938
cur = attn_norm;
4039

41-
cur = build_lora_mm(model.layers[il].wqkv, cur);
42-
cb(cur, "wqkv", il);
43-
44-
if (model.layers[il].bqkv) {
45-
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
46-
cb(cur, "bqkv", il);
47-
}
48-
49-
if (hparams.f_clamp_kqv > 0.0f) {
50-
cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
51-
cb(cur, "wqkv_clamped", il);
52-
}
53-
54-
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float),
55-
cur->nb[1], 0 * sizeof(float) * (n_embd));
56-
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
57-
cur->nb[1], 1 * sizeof(float) * (n_embd));
58-
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
59-
cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
40+
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
41+
n_embd_head, n_head, n_head_kv, il);
6042

6143
// Q/K Layernorm
6244
if (model.layers[il].attn_q_norm) {

src/models/nemotron-h.cpp

Lines changed: 1 addition & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -65,35 +65,7 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
6565
const llama_model & model,
6666
int64_t n_embd_head,
6767
int il) {
68-
// compute Q and K
69-
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
70-
cb(Qcur, "Qcur", il);
71-
if (model.layers[il].bq) {
72-
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
73-
cb(Qcur, "Qcur", il);
74-
}
75-
76-
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
77-
cb(Kcur, "Kcur", il);
78-
if (model.layers[il].bk) {
79-
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
80-
cb(Kcur, "Kcur", il);
81-
}
82-
83-
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
84-
cb(Vcur, "Vcur", il);
85-
if (model.layers[il].bv) {
86-
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
87-
cb(Vcur, "Vcur", il);
88-
}
89-
90-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
91-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
92-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
93-
94-
cb(Qcur, "Qcur", il);
95-
cb(Kcur, "Kcur", il);
96-
cb(Vcur, "Vcur", il);
68+
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur, n_embd_head, hparams.n_head(il), hparams.n_head_kv(il), il);
9769

9870
const float kq_scale =
9971
hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

src/models/olmo.cpp

Lines changed: 2 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -30,27 +30,8 @@ llm_build_olmo::llm_build_olmo(const llama_model & model, const llm_graph_params
3030
// self-attention
3131
{
3232
// compute Q and K and RoPE them
33-
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
34-
cb(Qcur, "Qcur", il);
35-
if (hparams.f_clamp_kqv > 0.0f) {
36-
Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
37-
cb(Qcur, "Qcur", il);
38-
}
39-
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
40-
cb(Kcur, "Kcur", il);
41-
if (hparams.f_clamp_kqv > 0.0f) {
42-
Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
43-
cb(Kcur, "Kcur", il);
44-
}
45-
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
46-
cb(Vcur, "Vcur", il);
47-
if (hparams.f_clamp_kqv > 0.0f) {
48-
Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
49-
cb(Vcur, "Vcur", il);
50-
}
51-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
52-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
53-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
33+
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
34+
n_embd_head, n_head, n_head_kv, il);
5435

5536
Qcur = ggml_rope_ext(
5637
ctx0, Qcur, inp_pos, nullptr,

src/models/t5-dec.cpp

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -34,18 +34,7 @@ llm_build_t5_dec::llm_build_t5_dec(const llama_model & model, const llm_graph_pa
3434

3535
// self-attention
3636
{
37-
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
38-
cb(Qcur, "Qcur", il);
39-
40-
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
41-
cb(Kcur, "Kcur", il);
42-
43-
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
44-
cb(Vcur, "Vcur", il);
45-
46-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
47-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
48-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
37+
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur, n_embd_head, n_head, n_head_kv, il);
4938

5039
ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
5140
ggml_tensor * kq_b = build_pos_bias(pos_bucket_dec, attn_rel_b);

0 commit comments

Comments
 (0)