|
1 | 1 | #include "llama-graph.h" |
2 | 2 |
|
3 | 3 | #include "llama-impl.h" |
| 4 | +#include "llama-model.h" |
4 | 5 | #include "llama-batch.h" |
5 | 6 | #include "llama-cparams.h" |
6 | 7 |
|
@@ -1043,6 +1044,67 @@ ggml_tensor * llm_graph_context::build_norm( |
1043 | 1044 | return cur; |
1044 | 1045 | } |
1045 | 1046 |
|
| 1047 | + |
| 1048 | +llm_graph_qkv llm_graph_context::build_qkv( |
| 1049 | + const llama_layer & layer, |
| 1050 | + ggml_tensor * cur, |
| 1051 | + int64_t n_embd_head, |
| 1052 | + int64_t n_head, |
| 1053 | + int64_t n_head_kv, |
| 1054 | + int il) const { |
| 1055 | + const int64_t n_embd_q = n_embd_head * n_head; |
| 1056 | + const int64_t n_embd_kv = n_embd_head * n_head_kv; |
| 1057 | + |
| 1058 | + ggml_tensor * Qcur, * Kcur, * Vcur; |
| 1059 | + |
| 1060 | + if (layer.wqkv) { |
| 1061 | + // fused QKV path |
| 1062 | + ggml_tensor * qkv = build_lora_mm(layer.wqkv, cur); |
| 1063 | + cb(qkv, "wqkv", il); |
| 1064 | + if (layer.bqkv) { |
| 1065 | + qkv = ggml_add(ctx0, qkv, layer.bqkv); |
| 1066 | + cb(qkv, "bqkv", il); |
| 1067 | + } |
| 1068 | + Qcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head, n_tokens, |
| 1069 | + ggml_element_size(qkv) * n_embd_head, qkv->nb[1], 0); |
| 1070 | + Kcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens, |
| 1071 | + ggml_element_size(qkv) * n_embd_head, qkv->nb[1], |
| 1072 | + ggml_element_size(qkv) * n_embd_q); |
| 1073 | + Vcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens, |
| 1074 | + ggml_element_size(qkv) * n_embd_head, qkv->nb[1], |
| 1075 | + ggml_element_size(qkv) * (n_embd_q + n_embd_kv)); |
| 1076 | + } else { |
| 1077 | + // separate Q/K/V path |
| 1078 | + Qcur = build_lora_mm(layer.wq, cur, layer.wq_s); |
| 1079 | + cb(Qcur, "Qcur", il); |
| 1080 | + if (layer.bq) { |
| 1081 | + Qcur = ggml_add(ctx0, Qcur, layer.bq); |
| 1082 | + cb(Qcur, "Qcur", il); |
| 1083 | + } |
| 1084 | + Kcur = build_lora_mm(layer.wk, cur, layer.wk_s); |
| 1085 | + cb(Kcur, "Kcur", il); |
| 1086 | + if (layer.bk) { |
| 1087 | + Kcur = ggml_add(ctx0, Kcur, layer.bk); |
| 1088 | + cb(Kcur, "Kcur", il); |
| 1089 | + } |
| 1090 | + Vcur = build_lora_mm(layer.wv, cur, layer.wv_s); |
| 1091 | + cb(Vcur, "Vcur", il); |
| 1092 | + if (layer.bv) { |
| 1093 | + Vcur = ggml_add(ctx0, Vcur, layer.bv); |
| 1094 | + cb(Vcur, "Vcur", il); |
| 1095 | + } |
| 1096 | + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); |
| 1097 | + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); |
| 1098 | + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); |
| 1099 | + } |
| 1100 | + |
| 1101 | + cb(Qcur, "Qcur", il); |
| 1102 | + cb(Kcur, "Kcur", il); |
| 1103 | + cb(Vcur, "Vcur", il); |
| 1104 | + |
| 1105 | + return { Qcur, Kcur, Vcur }; |
| 1106 | +} |
| 1107 | + |
1046 | 1108 | ggml_tensor * llm_graph_context::build_ffn( |
1047 | 1109 | ggml_tensor * cur, |
1048 | 1110 | ggml_tensor * up, |
|
0 commit comments