Skip to content

Commit 75d759d

Browse files
committed
model : refactor QKV into common build_qkv and create_tensor_qkv helpers
1 parent fbd441c commit 75d759d

80 files changed

Lines changed: 322 additions & 1604 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

src/llama-graph.cpp

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "llama-graph.h"
22

33
#include "llama-impl.h"
4+
#include "llama-model.h"
45
#include "llama-batch.h"
56
#include "llama-cparams.h"
67

@@ -1043,6 +1044,67 @@ ggml_tensor * llm_graph_context::build_norm(
10431044
return cur;
10441045
}
10451046

1047+
1048+
// Build the per-layer Q, K and V tensors from the input `cur`.
//
// Two weight layouts are handled:
//   - fused:    layer.wqkv (+ optional layer.bqkv) produces a single tensor that
//               is sliced into Q, K and V with 3D views
//   - separate: layer.wq / layer.wk / layer.wv (+ optional layer.bq / bk / bv),
//               each result reshaped to 3D
//
// Returns { Q, K, V } where Q is [n_embd_head, n_head, n_tokens] and
// K, V are [n_embd_head, n_head_kv, n_tokens].
llm_graph_qkv llm_graph_context::build_qkv(
1049+
const llama_layer & layer,
1050+
ggml_tensor * cur,
1051+
int64_t n_embd_head,
1052+
int64_t n_head,
1053+
int64_t n_head_kv,
1054+
int il) const {
1055+
// number of Q elements per token (all query heads)
const int64_t n_embd_q = n_embd_head * n_head;
1056+
// number of K (and of V) elements per token (all key/value heads)
const int64_t n_embd_kv = n_embd_head * n_head_kv;
1057+
1058+
ggml_tensor * Qcur, * Kcur, * Vcur;
1059+
1060+
if (layer.wqkv) {
1061+
// fused QKV path
1062+
ggml_tensor * qkv = build_lora_mm(layer.wqkv, cur);
1063+
cb(qkv, "wqkv", il);
1064+
if (layer.bqkv) {
1065+
qkv = ggml_add(ctx0, qkv, layer.bqkv);
1066+
cb(qkv, "bqkv", il);
1067+
}
1068+
// Q occupies the first n_embd_q elements of every fused row; the view keeps
// the fused tensor's row stride (qkv->nb[1]) as its per-token stride
Qcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head, n_tokens,
1069+
ggml_element_size(qkv) * n_embd_head, qkv->nb[1], 0);
1070+
// K starts right after Q, i.e. at a byte offset of n_embd_q elements
Kcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens,
1071+
ggml_element_size(qkv) * n_embd_head, qkv->nb[1],
1072+
ggml_element_size(qkv) * n_embd_q);
1073+
// V starts after Q and K, i.e. at a byte offset of n_embd_q + n_embd_kv elements
Vcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens,
1074+
ggml_element_size(qkv) * n_embd_head, qkv->nb[1],
1075+
ggml_element_size(qkv) * (n_embd_q + n_embd_kv));
1076+
} else {
1077+
// separate Q/K/V path
1078+
// NOTE(review): wq_s / wk_s / wv_s are forwarded as the third build_lora_mm
// argument — presumably per-weight scale tensors; confirm against build_lora_mm
Qcur = build_lora_mm(layer.wq, cur, layer.wq_s);
1079+
cb(Qcur, "Qcur", il);
1080+
if (layer.bq) {
1081+
Qcur = ggml_add(ctx0, Qcur, layer.bq);
1082+
cb(Qcur, "Qcur", il);
1083+
}
1084+
Kcur = build_lora_mm(layer.wk, cur, layer.wk_s);
1085+
cb(Kcur, "Kcur", il);
1086+
if (layer.bk) {
1087+
Kcur = ggml_add(ctx0, Kcur, layer.bk);
1088+
cb(Kcur, "Kcur", il);
1089+
}
1090+
Vcur = build_lora_mm(layer.wv, cur, layer.wv_s);
1091+
cb(Vcur, "Vcur", il);
1092+
if (layer.bv) {
1093+
Vcur = ggml_add(ctx0, Vcur, layer.bv);
1094+
cb(Vcur, "Vcur", il);
1095+
}
1096+
// split the flat projections into heads: [n_embd_head, n_head(_kv), n_tokens]
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
1097+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
1098+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
1099+
}
1100+
1101+
// pass the final 3D tensors to cb under the same names for both paths
cb(Qcur, "Qcur", il);
1102+
cb(Kcur, "Kcur", il);
1103+
cb(Vcur, "Vcur", il);
1104+
1105+
return { Qcur, Kcur, Vcur };
1106+
}
1107+
10461108
ggml_tensor * llm_graph_context::build_ffn(
10471109
ggml_tensor * cur,
10481110
ggml_tensor * up,

src/llama-graph.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ struct ggml_context;
1717
struct ggml_tensor;
1818

1919
struct llama_cparams;
20+
struct llama_layer;
2021

2122
struct llama_memory_context_i;
2223

@@ -705,6 +706,12 @@ using llm_graph_result_ptr = std::unique_ptr<llm_graph_result>;
705706
// used in build_rs to properly order writes and avoid unnecessary copies
706707
using llm_graph_get_rows_fn = std::function<ggml_tensor * (ggml_context *, ggml_tensor * states, ggml_tensor * ids)>;
707708

709+
// Q, K and V tensors for one layer, as returned by llm_graph_context::build_qkv.
// Plain aggregate of three tensor pointers.
struct llm_graph_qkv {
710+
ggml_tensor * q; // [n_embd_head, n_head, n_tokens]
711+
ggml_tensor * k; // [n_embd_head, n_head_kv, n_tokens]
712+
ggml_tensor * v; // [n_embd_head, n_head_kv, n_tokens]
713+
};
714+
708715
struct llm_graph_context {
709716
const llm_arch arch;
710717

@@ -791,6 +798,17 @@ struct llm_graph_context {
791798
llm_norm_type type,
792799
int il) const;
793800

801+
802+
// compute Q, K, V projections with optional bias and reshape to 3D
803+
// supports both fused wqkv and separate wq/wk/wv paths
//   layer       - layer tensors; the fused path is taken when layer.wqkv is set
//   cur         - input tensor that is projected
//   n_embd_head - size of one head (first dimension of the returned tensors)
//   n_head      - number of heads in the returned Q
//   n_head_kv   - number of heads in the returned K and V
//   il          - layer index, forwarded to the cb callback
// returns Q as [n_embd_head, n_head, n_tokens] and K, V as [n_embd_head, n_head_kv, n_tokens]
804+
llm_graph_qkv build_qkv(
805+
const llama_layer & layer,
806+
ggml_tensor * cur,
807+
int64_t n_embd_head,
808+
int64_t n_head,
809+
int64_t n_head_kv,
810+
int il) const;
811+
794812
ggml_tensor * build_ffn(
795813
ggml_tensor * cur,
796814
ggml_tensor * up,

src/llama-model.cpp

Lines changed: 88 additions & 301 deletions
Large diffs are not rendered by default.

src/models/afmoe.cpp

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -41,22 +41,13 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
4141
{
4242
ggml_tensor * attn_inp = cur; // save input for gate computation
4343

44-
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
45-
cb(Qcur, "Qcur", il);
46-
47-
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
48-
cb(Kcur, "Kcur", il);
49-
50-
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
51-
cb(Vcur, "Vcur", il);
44+
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
45+
n_embd_head, n_head, n_head_kv, il);
5246

5347
// compute gate from input
5448
ggml_tensor * gate = build_lora_mm(model.layers[il].wqkv_gate, attn_inp);
5549
cb(gate, "attn_gate_proj", il);
5650

57-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
58-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
59-
6051
// Q/K normalization
6152
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
6253
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
@@ -77,8 +68,6 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
7768
cb(Kcur, "Kcur_rope", il);
7869
}
7970

80-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
81-
8271
cur = build_attn(inp_attn,
8372
NULL, NULL, // wo will be applied after gating
8473
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);

src/models/apertus.cpp

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -32,25 +32,15 @@ llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_
3232
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
3333

3434
// compute Q and K and RoPE them
35-
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
36-
cb(Qcur, "Qcur", il);
35+
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
36+
n_embd_head, n_head, n_head_kv, il);
3737

38-
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
39-
cb(Kcur, "Kcur", il);
40-
41-
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
42-
cb(Vcur, "Vcur", il);
43-
44-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
4538
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
4639
cb(Qcur, "Qcur_normed", il);
4740

48-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
4941
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
5042
cb(Kcur, "Kcur_normed", il);
5143

52-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
53-
5444
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
5545
ext_factor, attn_factor, beta_fast, beta_slow);
5646

src/models/arcee.cpp

Lines changed: 2 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -36,30 +36,8 @@ llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_para
3636
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
3737

3838
// compute Q and K and RoPE them
39-
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
40-
cb(Qcur, "Qcur", il);
41-
if (model.layers[il].bq) {
42-
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
43-
cb(Qcur, "Qcur", il);
44-
}
45-
46-
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
47-
cb(Kcur, "Kcur", il);
48-
if (model.layers[il].bk) {
49-
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
50-
cb(Kcur, "Kcur", il);
51-
}
52-
53-
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
54-
cb(Vcur, "Vcur", il);
55-
if (model.layers[il].bv) {
56-
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
57-
cb(Vcur, "Vcur", il);
58-
}
59-
60-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
61-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
62-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
39+
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
40+
n_embd_head, n_head, n_head_kv, il);
6341

6442
Qcur = ggml_rope_ext(
6543
ctx0, Qcur, inp_pos, rope_factors,

src/models/arctic.cpp

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -30,18 +30,8 @@ llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_pa
3030
// self-attention
3131
{
3232
// compute Q and K and RoPE them
33-
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
34-
cb(Qcur, "Qcur", il);
35-
36-
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
37-
cb(Kcur, "Kcur", il);
38-
39-
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
40-
cb(Vcur, "Vcur", il);
41-
42-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
43-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
44-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
33+
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
34+
n_embd_head, n_head, n_head_kv, il);
4535

4636
Qcur = ggml_rope_ext(
4737
ctx0, Qcur, inp_pos, nullptr,

src/models/baichuan.cpp

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -29,18 +29,8 @@ llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_grap
2929

3030
// self-attention
3131
{
32-
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
33-
cb(Qcur, "Qcur", il);
34-
35-
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
36-
cb(Kcur, "Kcur", il);
37-
38-
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
39-
cb(Vcur, "Vcur", il);
40-
41-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
42-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
43-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
32+
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
33+
n_embd_head, n_head, n_head_kv, il);
4434

4535
switch (model.type) {
4636
case LLM_TYPE_7B:

src/models/bailingmoe.cpp

Lines changed: 2 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -28,30 +28,8 @@ llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_
2828
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
2929

3030
// compute Q and K and RoPE them
31-
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
32-
cb(Qcur, "Qcur", il);
33-
if (model.layers[il].bq) {
34-
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
35-
cb(Qcur, "Qcur", il);
36-
}
37-
38-
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
39-
cb(Kcur, "Kcur", il);
40-
if (model.layers[il].bk) {
41-
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
42-
cb(Kcur, "Kcur", il);
43-
}
44-
45-
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
46-
cb(Vcur, "Vcur", il);
47-
if (model.layers[il].bv) {
48-
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
49-
cb(Vcur, "Vcur", il);
50-
}
51-
52-
Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
53-
Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
54-
Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
31+
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
32+
n_embd_head_k, n_head, n_head_kv, il);
5533

5634
Qcur = ggml_rope_ext(
5735
ctx0, Qcur, inp_pos, rope_factors,

src/models/bailingmoe2.cpp

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) :
44
llm_graph_context(params) {
55
const int64_t n_embd_head = hparams.n_embd_head_v();
6-
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
76

87
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
98

@@ -29,15 +28,8 @@ llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const ll
2928

3029
// self_attention
3130
{
32-
cur = build_lora_mm(model.layers[il].wqkv, cur);
33-
cb(cur, "wqkv", il);
34-
35-
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float),
36-
cur->nb[1], 0 * sizeof(float) * (n_embd));
37-
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
38-
cur->nb[1], 1 * sizeof(float) * (n_embd));
39-
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
40-
cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
31+
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
32+
n_embd_head, n_head, n_head_kv, il);
4133

4234
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
4335
cb(Qcur, "Qcur_normed", il);

0 commit comments

Comments
 (0)