Skip to content

Commit 42eae08

Browse files
committed
model : refactor QKV into common build_qkv and create_tensor_qkv helpers
1 parent 825eb91 commit 42eae08

80 files changed

Lines changed: 322 additions & 1602 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

src/llama-graph.cpp

Lines changed: 62 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
#include "llama-graph.h"
22

33
#include "llama-impl.h"
4+
#include "llama-model.h"
45
#include "llama-batch.h"
56
#include "llama-cparams.h"
67

@@ -992,6 +993,67 @@ ggml_tensor * llm_graph_context::build_norm(
992993
return cur;
993994
}
994995

996+
997+
// Build the Q, K and V tensors for one attention layer from the input `cur`.
//
// If `layer.wqkv` is set, a single fused projection is computed and Q/K/V are
// returned as 3D views into it; otherwise `layer.wq`, `layer.wk` and `layer.wv`
// are applied separately and the results are reshaped to 3D. Bias tensors
// (`bqkv`, or `bq`/`bk`/`bv`) are added only when they are non-null.
//
// Returns { Q, K, V } with
//   Q:    [n_embd_head, n_head,    n_tokens]
//   K, V: [n_embd_head, n_head_kv, n_tokens]
// `il` is forwarded unchanged to every cb(...) call.
llm_graph_qkv llm_graph_context::build_qkv(
998+
const llama_layer & layer,
999+
ggml_tensor * cur,
1000+
int64_t n_embd_head,
1001+
int64_t n_head,
1002+
int64_t n_head_kv,
1003+
int il) const {
1004+
// number of Q elements per token; used below as the element offset of K in the fused tensor
const int64_t n_embd_q = n_embd_head * n_head;
1005+
// number of K (and V) elements per token; K's width is added to n_embd_q to locate V
const int64_t n_embd_kv = n_embd_head * n_head_kv;
1006+
1007+
// assigned in exactly one of the two branches below
ggml_tensor * Qcur, * Kcur, * Vcur;
1008+
1009+
if (layer.wqkv) {
1010+
// fused QKV path
1011+
// NOTE(review): unlike the separate path below (which passes layer.wq_s / wk_s / wv_s),
// no third argument is passed to build_lora_mm here - confirm this is intended
ggml_tensor * qkv = build_lora_mm(layer.wqkv, cur);
1012+
cb(qkv, "wqkv", il);
1013+
if (layer.bqkv) {
1014+
qkv = ggml_add(ctx0, qkv, layer.bqkv);
1015+
cb(qkv, "bqkv", il);
1016+
}
1017+
// Q, K and V are carved out of `qkv` as views (no reshape of a separate tensor):
//   - stride between heads  = ggml_element_size(qkv) * n_embd_head
//   - stride between tokens = qkv->nb[1] (taken from the fused tensor)
//   - start offsets: Q at 0, K after n_embd_q elements, V after n_embd_q + n_embd_kv elements,
//     each scaled by ggml_element_size(qkv)
Qcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head, n_tokens,
1018+
ggml_element_size(qkv) * n_embd_head, qkv->nb[1], 0);
1019+
Kcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens,
1020+
ggml_element_size(qkv) * n_embd_head, qkv->nb[1],
1021+
ggml_element_size(qkv) * n_embd_q);
1022+
Vcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens,
1023+
ggml_element_size(qkv) * n_embd_head, qkv->nb[1],
1024+
ggml_element_size(qkv) * (n_embd_q + n_embd_kv));
1025+
} else {
1026+
// separate Q/K/V path
1027+
// each projection: matmul -> cb -> optional bias add -> cb again under the same name
Qcur = build_lora_mm(layer.wq, cur, layer.wq_s);
1028+
cb(Qcur, "Qcur", il);
1029+
if (layer.bq) {
1030+
Qcur = ggml_add(ctx0, Qcur, layer.bq);
1031+
cb(Qcur, "Qcur", il);
1032+
}
1033+
Kcur = build_lora_mm(layer.wk, cur, layer.wk_s);
1034+
cb(Kcur, "Kcur", il);
1035+
if (layer.bk) {
1036+
Kcur = ggml_add(ctx0, Kcur, layer.bk);
1037+
cb(Kcur, "Kcur", il);
1038+
}
1039+
Vcur = build_lora_mm(layer.wv, cur, layer.wv_s);
1040+
cb(Vcur, "Vcur", il);
1041+
if (layer.bv) {
1042+
Vcur = ggml_add(ctx0, Vcur, layer.bv);
1043+
cb(Vcur, "Vcur", il);
1044+
}
1045+
// split the projection outputs into heads so both paths return the same 3D shapes
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
1046+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
1047+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
1048+
}
1049+
1050+
// report the final 3D tensors (views or reshapes) under the common names, for both paths
cb(Qcur, "Qcur", il);
1051+
cb(Kcur, "Kcur", il);
1052+
cb(Vcur, "Vcur", il);
1053+
1054+
return { Qcur, Kcur, Vcur };
1055+
}
1056+
9951057
ggml_tensor * llm_graph_context::build_ffn(
9961058
ggml_tensor * cur,
9971059
ggml_tensor * up,

src/llama-graph.h

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@ struct ggml_context;
1717
struct ggml_tensor;
1818

1919
struct llama_cparams;
20+
struct llama_layer;
2021

2122
struct llama_memory_context_i;
2223

@@ -697,6 +698,12 @@ using llm_graph_result_ptr = std::unique_ptr<llm_graph_result>;
697698
// used in build_rs to properly order writes and avoid unnecessary copies
698699
using llm_graph_get_rows_fn = std::function<ggml_tensor * (ggml_context *, ggml_tensor * states, ggml_tensor * ids)>;
699700

701+
// Q, K and V tensors returned together by llm_graph_context::build_qkv.
// Members are in q, k, v order, so the result can be unpacked with
// `auto [Qcur, Kcur, Vcur] = build_qkv(...)`.
struct llm_graph_qkv {
702+
ggml_tensor * q; // [n_embd_head, n_head, n_tokens]
703+
ggml_tensor * k; // [n_embd_head, n_head_kv, n_tokens]
704+
ggml_tensor * v; // [n_embd_head, n_head_kv, n_tokens]
705+
};
706+
700707
struct llm_graph_context {
701708
const llm_arch arch;
702709

@@ -783,6 +790,17 @@ struct llm_graph_context {
783790
llm_norm_type type,
784791
int il) const;
785792

793+
794+
// compute Q, K, V projections with optional bias and reshape
795+
// supports both fused wqkv and separate wq/wk/wv paths
//   layer       - source of the projection weights and optional biases
//   cur         - input tensor the projections are applied to
//   n_embd_head - size of one head (first dimension of each returned tensor)
//   n_head      - number of Q heads
//   n_head_kv   - number of K/V heads
//   il          - forwarded to the cb(...) callbacks
// returns q as [n_embd_head, n_head, n_tokens] and k, v as [n_embd_head, n_head_kv, n_tokens]
796+
llm_graph_qkv build_qkv(
797+
const llama_layer & layer,
798+
ggml_tensor * cur,
799+
int64_t n_embd_head,
800+
int64_t n_head,
801+
int64_t n_head_kv,
802+
int il) const;
803+
786804
ggml_tensor * build_ffn(
787805
ggml_tensor * cur,
788806
ggml_tensor * up,

src/llama-model.cpp

Lines changed: 88 additions & 301 deletions
Large diffs are not rendered by default.

src/models/afmoe.cpp

Lines changed: 2 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -41,22 +41,13 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
4141
{
4242
ggml_tensor * attn_inp = cur; // save input for gate computation
4343

44-
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
45-
cb(Qcur, "Qcur", il);
46-
47-
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
48-
cb(Kcur, "Kcur", il);
49-
50-
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
51-
cb(Vcur, "Vcur", il);
44+
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
45+
n_embd_head, n_head, n_head_kv, il);
5246

5347
// compute gate from input
5448
ggml_tensor * gate = build_lora_mm(model.layers[il].wqkv_gate, attn_inp);
5549
cb(gate, "attn_gate_proj", il);
5650

57-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
58-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
59-
6051
// Q/K normalization
6152
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
6253
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);

src/models/apertus.cpp

Lines changed: 2 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -32,25 +32,15 @@ llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_
3232
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
3333

3434
// compute Q and K and RoPE them
35-
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
36-
cb(Qcur, "Qcur", il);
35+
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
36+
n_embd_head, n_head, n_head_kv, il);
3737

38-
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
39-
cb(Kcur, "Kcur", il);
40-
41-
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
42-
cb(Vcur, "Vcur", il);
43-
44-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
4538
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
4639
cb(Qcur, "Qcur_normed", il);
4740

48-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
4941
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
5042
cb(Kcur, "Kcur_normed", il);
5143

52-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
53-
5444
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
5545
ext_factor, attn_factor, beta_fast, beta_slow);
5646

src/models/arcee.cpp

Lines changed: 2 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -36,30 +36,8 @@ llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_para
3636
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
3737

3838
// compute Q and K and RoPE them
39-
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
40-
cb(Qcur, "Qcur", il);
41-
if (model.layers[il].bq) {
42-
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
43-
cb(Qcur, "Qcur", il);
44-
}
45-
46-
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
47-
cb(Kcur, "Kcur", il);
48-
if (model.layers[il].bk) {
49-
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
50-
cb(Kcur, "Kcur", il);
51-
}
52-
53-
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
54-
cb(Vcur, "Vcur", il);
55-
if (model.layers[il].bv) {
56-
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
57-
cb(Vcur, "Vcur", il);
58-
}
59-
60-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
61-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
62-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
39+
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
40+
n_embd_head, n_head, n_head_kv, il);
6341

6442
Qcur = ggml_rope_ext(
6543
ctx0, Qcur, inp_pos, rope_factors,

src/models/arctic.cpp

Lines changed: 2 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -30,18 +30,8 @@ llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_pa
3030
// self-attention
3131
{
3232
// compute Q and K and RoPE them
33-
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
34-
cb(Qcur, "Qcur", il);
35-
36-
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
37-
cb(Kcur, "Kcur", il);
38-
39-
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
40-
cb(Vcur, "Vcur", il);
41-
42-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
43-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
44-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
33+
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
34+
n_embd_head, n_head, n_head_kv, il);
4535

4636
Qcur = ggml_rope_ext(
4737
ctx0, Qcur, inp_pos, nullptr,

src/models/baichuan.cpp

Lines changed: 2 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -29,18 +29,8 @@ llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_grap
2929

3030
// self-attention
3131
{
32-
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
33-
cb(Qcur, "Qcur", il);
34-
35-
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
36-
cb(Kcur, "Kcur", il);
37-
38-
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
39-
cb(Vcur, "Vcur", il);
40-
41-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
42-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
43-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
32+
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
33+
n_embd_head, n_head, n_head_kv, il);
4434

4535
switch (model.type) {
4636
case LLM_TYPE_7B:

src/models/bailingmoe.cpp

Lines changed: 2 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -28,30 +28,8 @@ llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_
2828
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
2929

3030
// compute Q and K and RoPE them
31-
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
32-
cb(Qcur, "Qcur", il);
33-
if (model.layers[il].bq) {
34-
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
35-
cb(Qcur, "Qcur", il);
36-
}
37-
38-
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
39-
cb(Kcur, "Kcur", il);
40-
if (model.layers[il].bk) {
41-
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
42-
cb(Kcur, "Kcur", il);
43-
}
44-
45-
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
46-
cb(Vcur, "Vcur", il);
47-
if (model.layers[il].bv) {
48-
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
49-
cb(Vcur, "Vcur", il);
50-
}
51-
52-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
53-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
54-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
31+
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
32+
n_embd_head_k, n_head, n_head_kv, il);
5533

5634
Qcur = ggml_rope_ext(
5735
ctx0, Qcur, inp_pos, rope_factors,

src/models/bailingmoe2.cpp

Lines changed: 2 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,6 @@
33
llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) :
44
llm_graph_context(params) {
55
const int64_t n_embd_head = hparams.n_embd_head_v();
6-
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
76

87
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
98

@@ -29,15 +28,8 @@ llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const ll
2928

3029
// self_attention
3130
{
32-
cur = build_lora_mm(model.layers[il].wqkv, cur);
33-
cb(cur, "wqkv", il);
34-
35-
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float),
36-
cur->nb[1], 0 * sizeof(float) * (n_embd));
37-
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
38-
cur->nb[1], 1 * sizeof(float) * (n_embd));
39-
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
40-
cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
31+
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
32+
n_embd_head, n_head, n_head_kv, il);
4133

4234
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
4335
cb(Qcur, "Qcur_normed", il);

0 commit comments

Comments
 (0)