llama : enable layer input extraction

ggerganov · ruixiang63 · commit 5be5965edb85 · 2026-05-18T14:12:29.000Z
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
@@ -83,6 +83,8 @@ llama_context::llama_context(
     cparams.cb_eval           = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
 
+    cparams.output_layer_inp.resize(hparams.n_layer, false);
+
     cparams.ctx_type          = params.ctx_type;
 
     // Initialize backend samplers here so they are part of the sampling graph
@@ -1233,6 +1235,16 @@ bool llama_context::set_adapter_cvec(
     return res;
 }
 
+void llama_context::set_output_layer_inp(uint32_t layer_id, bool enable) {
+    LLAMA_LOG_DEBUG("%s: layer_id = %u, enable = %d\n", __func__, layer_id, enable); // %u: layer_id is unsigned
+    // toggle whether the input tensor of layer `layer_id` is requested as a graph output; asserts on an invalid id
+    GGML_ASSERT(layer_id < model.hparams.n_layer);
+
+    cparams.output_layer_inp[layer_id] = enable;
+    // raise the re-reserve flag - presumably so the scheduler accounts for the changed set of graph outputs
+    sched_need_reserve = true;
+}
+
 llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
     if (mctx && !mctx->apply()) {
         LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__);
@@ -2009,7 +2021,6 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
         has_embd   = true;
     }
 
-
     size_t backend_float_count = 0;
     size_t backend_token_count = 0;
 
@@ -3992,3 +4003,7 @@ void llama_opt_epoch(
 llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx) {
     return ctx->memory_breakdown();
 }
+
+void llama_set_output_layer_inp(struct llama_context * ctx, uint32_t layer_id, bool enable) {
+    ctx->set_output_layer_inp(layer_id, enable); // plain forward to the context method; ctx is dereferenced without a null check
+}
diff --git a/src/llama-context.h b/src/llama-context.h
@@ -125,6 +125,8 @@ struct llama_context {
                 int32_t   il_start,
                 int32_t   il_end);
 
+    void set_output_layer_inp(uint32_t layer_id, bool enable);
+
     // process a single ubatch with a specific graph type
     // if memory_context is provided, it will be applied first to the context's memory
     // ret contains the status of the graph computation
diff --git a/src/llama-cparams.h b/src/llama-cparams.h
@@ -3,6 +3,7 @@
 #include "llama.h"
 
 #include <cstdint>
+#include <vector>
 
 #define LLAMA_MAX_SEQ 256
 
@@ -43,6 +44,8 @@ struct llama_cparams {
     bool kv_unified;
     bool pipeline_parallel;
 
+    std::vector<bool> output_layer_inp;
+
     enum llama_context_type ctx_type;
     enum llama_pooling_type pooling_type;
 
diff --git a/src/llama-ext.h b/src/llama-ext.h
@@ -104,3 +104,14 @@ LLAMA_API float * llama_get_embeddings_pre_norm    (struct llama_context * ctx);
 
 // LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
 LLAMA_API float * llama_get_embeddings_pre_norm_ith(struct llama_context * ctx, int32_t i);
+
+//
+// model/context data extraction
+//
+
+// set whether the input embeddings of layer `layer_id` should be exposed as a graph output
+LLAMA_API void llama_set_output_layer_inp(struct llama_context * ctx, uint32_t layer_id, bool enable);
+
+LLAMA_API ggml_tensor * llama_model_get_tok_embd(const struct llama_model * model);
+LLAMA_API void          llama_model_set_tok_embd(      struct llama_model * model, ggml_tensor * tensor);
+
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
@@ -810,6 +810,10 @@ void llm_graph_result::reset() {
     t_logits      = nullptr;
     t_embd        = nullptr;
     t_embd_pooled = nullptr;
+
+    t_layer_inp.resize(LLAMA_MAX_LAYERS);
+    std::fill(t_layer_inp.begin(), t_layer_inp.end(), nullptr);
+
     t_sampled.clear();
     t_sampled_probs.clear();
     t_sampled_logits.clear();
@@ -838,7 +842,7 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
     }
 }
 
-void llm_graph_result::set_outputs() {
+void llm_graph_result::set_outputs(const llm_graph_params & params) {
     if (t_logits != nullptr) {
         ggml_set_output(t_logits);
     }
@@ -851,6 +855,14 @@ void llm_graph_result::set_outputs() {
     if (t_h_pre_norm != nullptr) {
         ggml_set_output(t_h_pre_norm);
     }
+    { // flag the requested per-layer input tensors as graph outputs
+        const auto & output_layer_inp = params.cparams.output_layer_inp;
+        for (size_t il = 0; il < output_layer_inp.size(); ++il) {
+            if (output_layer_inp[il] && il < t_layer_inp.size() && t_layer_inp[il] != nullptr) { // entry stays nullptr when the graph did not record this layer
+                ggml_set_output(t_layer_inp[il]);
+            }
+        }
+    }
     for (auto & [seq_id, t] : t_sampled) {
         if (t != nullptr) {
             ggml_set_output(t);
diff --git a/src/llama-graph.h b/src/llama-graph.h
@@ -647,6 +647,8 @@ class llm_graph_result {
     ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
     ggml_tensor * get_h_pre_norm()  const { return t_h_pre_norm; }
 
+    ggml_tensor * get_layer_inp(int il) const { return il >= 0 && static_cast<size_t>(il) < t_layer_inp.size() ? t_layer_inp[il] : nullptr; } // nullptr when il is out of range
+
     ggml_cgraph  * get_gf()  const { return gf; }
     ggml_context * get_ctx() const { return ctx_compute.get(); }
 
@@ -655,7 +657,7 @@ class llm_graph_result {
     void reset();
 
     void set_inputs(const llama_ubatch * ubatch);
-    void set_outputs();
+    void set_outputs(const llm_graph_params & params);
 
     // try to update the existing graph result using the new graph parameters in order to reuse it
     // this can only be done if we determine that the resulting graph using the new graph parameters
@@ -676,10 +678,12 @@ class llm_graph_result {
     ggml_tensor * t_embd_pooled = nullptr;
     ggml_tensor * t_h_pre_norm  = nullptr; // [n_embd, n_outputs] hidden state before final output norm
 
-    std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
-    std::map<llama_seq_id, ggml_tensor*> t_candidates;
-    std::map<llama_seq_id, ggml_tensor*> t_sampled;
-    std::map<llama_seq_id, ggml_tensor*> t_sampled_probs;
+    std::vector<ggml_tensor *> t_layer_inp;
+
+    std::map<llama_seq_id, ggml_tensor *> t_sampled_logits;
+    std::map<llama_seq_id, ggml_tensor *> t_candidates;
+    std::map<llama_seq_id, ggml_tensor *> t_sampled;
+    std::map<llama_seq_id, ggml_tensor *> t_sampled_probs;
 
     std::vector<llm_graph_input_ptr> inputs;
 
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
@@ -71,7 +71,7 @@ uint32_t llama_hparams::n_rot(uint32_t il) const {
 }
 
 uint32_t llama_hparams::n_embd_inp() const {
-    uint32_t n_embd_inp = n_embd;
+    uint32_t n_embd_inp = n_embd_inp_impl > 0 ? n_embd_inp_impl : n_embd;
 
     if (n_deepstack_layers > 0) {
         n_embd_inp += n_embd * n_deepstack_layers;
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
@@ -42,6 +42,7 @@ struct llama_hparams {
 
     uint32_t n_ctx_train; // context size the model was trained on
     uint32_t n_embd;
+    uint32_t n_embd_inp_impl = 0;
     uint32_t n_layer;
     int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
     uint32_t n_expert = 0;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
@@ -2104,7 +2104,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
     // TODO: move reranking logic here and generalize
     llm->build_dense_out(dense_2_out_layers, dense_2_out_layers_b, dense_3_out_layers);
 
-    llm->res->set_outputs();
+    llm->res->set_outputs(params);
 
     return llm->res->get_gf();
 }
@@ -2549,3 +2549,11 @@ void llama_model_base::create_tensor_qkv(llama_layer & layer, int bid,
         layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", bid), {n_embd_v_}, TENSOR_NOT_REQUIRED);
     }
 }
+
+ggml_tensor * llama_model_get_tok_embd(const struct llama_model * model) {
+    return model->tok_embd; // hands back the stored pointer as-is (no copy); model is not null-checked
+}
+
+void llama_model_set_tok_embd(struct llama_model * model, ggml_tensor * tensor) {
+    model->tok_embd = tensor; // overwrites the stored pointer only; nothing is validated or released here
+}
diff --git a/src/models/llama.cpp b/src/models/llama.cpp
@@ -124,6 +124,8 @@ llama_model_llama::graph<embed>::graph(const llama_model & model, const llm_grap
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     for (int il = 0; il < n_layer; ++il) {
+        res->t_layer_inp[il] = inpL;
+
         ggml_tensor * inpSA = inpL;
 
         // norm
diff --git a/src/models/openai-moe.cpp b/src/models/openai-moe.cpp
@@ -75,6 +75,8 @@ llama_model_openai_moe::graph::graph(const llama_model & model, const llm_graph_
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     for (int il = 0; il < n_layer; ++il) {
+        res->t_layer_inp[il] = inpL;
+
         const float freq_base_l  = model.get_rope_freq_base (cparams, il);
         const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
 
diff --git a/src/models/qwen3.cpp b/src/models/qwen3.cpp
@@ -68,6 +68,8 @@ llama_model_qwen3::graph::graph(const llama_model & model, const llm_graph_param
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     for (int il = 0; il < n_layer; ++il) {
+        res->t_layer_inp[il] = inpL;
+
         ggml_tensor * inpSA = inpL;
 
         // norm
diff --git a/src/models/qwen3moe.cpp b/src/models/qwen3moe.cpp
@@ -78,6 +78,8 @@ llama_model_qwen3moe::graph::graph(const llama_model & model, const llm_graph_pa
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     for (int il = 0; il < n_layer; ++il) {
+        res->t_layer_inp[il] = inpL;
+
         ggml_tensor * inpSA = inpL;
 
         // norm

Original file line number	Diff line number	Diff line change
`@@ -71,7 +71,7 @@ uint32_t llama_hparams::n_rot(uint32_t il) const {`
`71`	`71`	`}`
`72`	`72`
`73`	`73`	`uint32_t llama_hparams::n_embd_inp() const {`
`74`		`- uint32_t n_embd_inp = n_embd;`
	`74`	`+ uint32_t n_embd_inp = n_embd_inp_impl > 0 ? n_embd_inp_impl : n_embd;`
`75`	`75`
`76`	`76`	`if (n_deepstack_layers > 0) {`
`77`	`77`	`n_embd_inp += n_embd * n_deepstack_layers;`
Original file line number	Diff line number	Diff line change
`@@ -2104,7 +2104,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {`
`2104`	`2104`	`// TODO: move reranking logic here and generalize`
`2105`	`2105`	`llm->build_dense_out(dense_2_out_layers, dense_2_out_layers_b, dense_3_out_layers);`
`2106`	`2106`
`2107`		`- llm->res->set_outputs();`
	`2107`	`+ llm->res->set_outputs(params);`
`2108`	`2108`
`2109`	`2109`	`return llm->res->get_gf();`
`2110`	`2110`	`}`
`@@ -2549,3 +2549,11 @@ void llama_model_base::create_tensor_qkv(llama_layer & layer, int bid,`
`2549`	`2549`	`layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", bid), {n_embd_v_}, TENSOR_NOT_REQUIRED);`
`2550`	`2550`	`}`
`2551`	`2551`	`}`
	`2552`	`+`
	`2553`	`+ggml_tensor * llama_model_get_tok_embd(const struct llama_model * model) {`
	`2554`	`+ return model->tok_embd;`
	`2555`	`+}`
	`2556`	`+`
	`2557`	`+void llama_model_set_tok_embd(struct llama_model * model, ggml_tensor * tensor) {`
	`2558`	`+ model->tok_embd = tensor;`
	`2559`	`+}`