Skip to content

Commit 5be5965

Browse files
ggerganov authored and ruixiang63 committed
llama : enable layer input extraction
1 parent a135ec0 commit 5be5965

13 files changed

Lines changed: 73 additions & 9 deletions

src/llama-context.cpp

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,8 @@ llama_context::llama_context(
8383
cparams.cb_eval = params.cb_eval;
8484
cparams.cb_eval_user_data = params.cb_eval_user_data;
8585

86+
cparams.output_layer_inp.resize(hparams.n_layer, false);
87+
8688
cparams.ctx_type = params.ctx_type;
8789

8890
// Initialize backend samplers here so they are part of the sampling graph
@@ -1233,6 +1235,16 @@ bool llama_context::set_adapter_cvec(
12331235
return res;
12341236
}
12351237

1238+
// Enable or disable extraction of the input embeddings of a single layer.
//
// layer_id - index of the layer; must be smaller than model.hparams.n_layer (asserted)
// enable   - value stored in cparams.output_layer_inp[layer_id]
//
// The flag is stored in the context parameters and sched_need_reserve is raised.
void llama_context::set_output_layer_inp(uint32_t layer_id, bool enable) {
    // layer_id is unsigned, so it is printed with %u rather than %d
    LLAMA_LOG_DEBUG("%s: layer_id = %u, enable = %d\n", __func__, layer_id, enable);

    GGML_ASSERT(layer_id < model.hparams.n_layer);

    cparams.output_layer_inp[layer_id] = enable;

    // NOTE(review): presumably needed because the set of graph outputs changes - confirm
    sched_need_reserve = true;
}
1247+
12361248
llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
12371249
if (mctx && !mctx->apply()) {
12381250
LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__);
@@ -2009,7 +2021,6 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
20092021
has_embd = true;
20102022
}
20112023

2012-
20132024
size_t backend_float_count = 0;
20142025
size_t backend_token_count = 0;
20152026

@@ -3992,3 +4003,7 @@ void llama_opt_epoch(
39924003
// Public wrapper: forwards to the context's memory_breakdown() and returns its result.
llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx) {
    const llama_context & lctx = *ctx;
    return lctx.memory_breakdown();
}
4006+
4007+
// Public wrapper: forwards the request to llama_context::set_output_layer_inp.
// ctx is dereferenced without a null check.
void llama_set_output_layer_inp(struct llama_context * ctx, uint32_t layer_id, bool enable) {
    llama_context & lctx = *ctx;
    lctx.set_output_layer_inp(layer_id, enable);
}

src/llama-context.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,8 @@ struct llama_context {
125125
int32_t il_start,
126126
int32_t il_end);
127127

128+
void set_output_layer_inp(uint32_t layer_id, bool enable);
129+
128130
// process a single ubatch with a specific graph type
129131
// if memory_context is provided, it will be applied first to the context's memory
130132
// ret contains the status of the graph computation

src/llama-cparams.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include "llama.h"
44

55
#include <cstdint>
6+
#include <vector>
67

78
#define LLAMA_MAX_SEQ 256
89

@@ -43,6 +44,8 @@ struct llama_cparams {
4344
bool kv_unified;
4445
bool pipeline_parallel;
4546

47+
std::vector<bool> output_layer_inp;
48+
4649
enum llama_context_type ctx_type;
4750
enum llama_pooling_type pooling_type;
4851

src/llama-ext.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,3 +104,14 @@ LLAMA_API float * llama_get_embeddings_pre_norm (struct llama_context * ctx);
104104

105105
// LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
106106
LLAMA_API float * llama_get_embeddings_pre_norm_ith(struct llama_context * ctx, int32_t i);
107+
108+
//
109+
// model/context data extraction
110+
//
111+
112+
// set whether the input embeddings of the given layer should be output
113+
LLAMA_API void llama_set_output_layer_inp(struct llama_context * ctx, uint32_t layer_id, bool enable);
114+
115+
LLAMA_API ggml_tensor * llama_model_get_tok_embd(const struct llama_model * model);
116+
LLAMA_API void llama_model_set_tok_embd( struct llama_model * model, ggml_tensor * tensor);
117+

src/llama-graph.cpp

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -810,6 +810,10 @@ void llm_graph_result::reset() {
810810
t_logits = nullptr;
811811
t_embd = nullptr;
812812
t_embd_pooled = nullptr;
813+
814+
t_layer_inp.resize(LLAMA_MAX_LAYERS);
815+
std::fill(t_layer_inp.begin(), t_layer_inp.end(), nullptr);
816+
813817
t_sampled.clear();
814818
t_sampled_probs.clear();
815819
t_sampled_logits.clear();
@@ -838,7 +842,7 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
838842
}
839843
}
840844

841-
void llm_graph_result::set_outputs() {
845+
void llm_graph_result::set_outputs(const llm_graph_params & params) {
842846
if (t_logits != nullptr) {
843847
ggml_set_output(t_logits);
844848
}
@@ -851,6 +855,14 @@ void llm_graph_result::set_outputs() {
851855
if (t_h_pre_norm != nullptr) {
852856
ggml_set_output(t_h_pre_norm);
853857
}
858+
{
    // Flag the requested per-layer input tensors as graph outputs.
    // reset() fills t_layer_inp with nullptr, and a graph builder may leave entries unset,
    // so missing tensors are skipped - the same guard the other outputs in this function use.
    const auto & output_layer_inp = params.cparams.output_layer_inp;
    for (size_t il = 0; il < output_layer_inp.size() && il < t_layer_inp.size(); ++il) {
        if (!output_layer_inp[il]) {
            continue;
        }
        ggml_tensor * t = t_layer_inp[il];
        if (t != nullptr) {
            ggml_set_output(t);
        }
    }
}
854866
for (auto & [seq_id, t] : t_sampled) {
855867
if (t != nullptr) {
856868
ggml_set_output(t);

src/llama-graph.h

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -647,6 +647,8 @@ class llm_graph_result {
647647
ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
648648
ggml_tensor * get_h_pre_norm() const { return t_h_pre_norm; }
649649

650+
// returns the tensor stored as the input of layer il,
// or nullptr when il is out of range or no tensor is stored for that layer
ggml_tensor * get_layer_inp(int il) const {
    if (il < 0 || static_cast<size_t>(il) >= t_layer_inp.size()) {
        return nullptr;
    }
    return t_layer_inp[il];
}
651+
650652
ggml_cgraph * get_gf() const { return gf; }
651653
ggml_context * get_ctx() const { return ctx_compute.get(); }
652654

@@ -655,7 +657,7 @@ class llm_graph_result {
655657
void reset();
656658

657659
void set_inputs(const llama_ubatch * ubatch);
658-
void set_outputs();
660+
void set_outputs(const llm_graph_params & params);
659661

660662
// try to update the existing graph result using the new graph parameters in order to reuse it
661663
// this can only be done if we determine that the resulting graph using the new graph parameters
@@ -676,10 +678,12 @@ class llm_graph_result {
676678
ggml_tensor * t_embd_pooled = nullptr;
677679
ggml_tensor * t_h_pre_norm = nullptr; // [n_embd, n_outputs] hidden state before final output norm
678680

679-
std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
680-
std::map<llama_seq_id, ggml_tensor*> t_candidates;
681-
std::map<llama_seq_id, ggml_tensor*> t_sampled;
682-
std::map<llama_seq_id, ggml_tensor*> t_sampled_probs;
681+
std::vector<ggml_tensor *> t_layer_inp;
682+
683+
std::map<llama_seq_id, ggml_tensor *> t_sampled_logits;
684+
std::map<llama_seq_id, ggml_tensor *> t_candidates;
685+
std::map<llama_seq_id, ggml_tensor *> t_sampled;
686+
std::map<llama_seq_id, ggml_tensor *> t_sampled_probs;
683687

684688
std::vector<llm_graph_input_ptr> inputs;
685689

src/llama-hparams.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ uint32_t llama_hparams::n_rot(uint32_t il) const {
7171
}
7272

7373
uint32_t llama_hparams::n_embd_inp() const {
74-
uint32_t n_embd_inp = n_embd;
74+
uint32_t n_embd_inp = n_embd_inp_impl > 0 ? n_embd_inp_impl : n_embd;
7575

7676
if (n_deepstack_layers > 0) {
7777
n_embd_inp += n_embd * n_deepstack_layers;

src/llama-hparams.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ struct llama_hparams {
4242

4343
uint32_t n_ctx_train; // context size the model was trained on
4444
uint32_t n_embd;
45+
uint32_t n_embd_inp_impl = 0;
4546
uint32_t n_layer;
4647
int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
4748
uint32_t n_expert = 0;

src/llama-model.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2104,7 +2104,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
21042104
// TODO: move reranking logic here and generalize
21052105
llm->build_dense_out(dense_2_out_layers, dense_2_out_layers_b, dense_3_out_layers);
21062106

2107-
llm->res->set_outputs();
2107+
llm->res->set_outputs(params);
21082108

21092109
return llm->res->get_gf();
21102110
}
@@ -2549,3 +2549,11 @@ void llama_model_base::create_tensor_qkv(llama_layer & layer, int bid,
25492549
layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", bid), {n_embd_v_}, TENSOR_NOT_REQUIRED);
25502550
}
25512551
}
2552+
2553+
// Returns the pointer currently stored in the model's tok_embd member.
ggml_tensor * llama_model_get_tok_embd(const struct llama_model * model) {
    const llama_model & mdl = *model;
    return mdl.tok_embd;
}
2556+
2557+
// Overwrites the model's tok_embd member with the given tensor pointer.
// Only the pointer is assigned here.
void llama_model_set_tok_embd(struct llama_model * model, ggml_tensor * tensor) {
    llama_model & mdl = *model;
    mdl.tok_embd = tensor;
}

src/models/llama.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,8 @@ llama_model_llama::graph<embed>::graph(const llama_model & model, const llm_grap
124124
ggml_tensor * inp_out_ids = build_inp_out_ids();
125125

126126
for (int il = 0; il < n_layer; ++il) {
127+
res->t_layer_inp[il] = inpL;
128+
127129
ggml_tensor * inpSA = inpL;
128130

129131
// norm

0 commit comments

Comments
 (0)