@@ -83,6 +83,8 @@ llama_context::llama_context(
8383 cparams.cb_eval = params.cb_eval ;
8484 cparams.cb_eval_user_data = params.cb_eval_user_data ;
8585
86+ cparams.output_layer_inp .resize (hparams.n_layer , false );
87+
8688 cparams.ctx_type = params.ctx_type ;
8789
8890 // Initialize backend samplers here so they are part of the sampling graph
@@ -1233,6 +1235,16 @@ bool llama_context::set_adapter_cvec(
12331235 return res;
12341236}
12351237
1238+ void llama_context::set_output_layer_inp (uint32_t layer_id, bool enable) {
1239+ LLAMA_LOG_DEBUG (" %s: layer_id = %d, enable = %d\n " , __func__, layer_id, enable);
1240+
1241+ GGML_ASSERT (layer_id < model.hparams .n_layer );
1242+
1243+ cparams.output_layer_inp [layer_id] = enable;
1244+
1245+ sched_need_reserve = true ;
1246+ }
1247+
12361248llm_graph_result * llama_context::process_ubatch (const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
12371249 if (mctx && !mctx->apply ()) {
12381250 LLAMA_LOG_ERROR (" %s: failed to apply memory context\n " , __func__);
@@ -2009,7 +2021,6 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
20092021 has_embd = true ;
20102022 }
20112023
2012-
20132024 size_t backend_float_count = 0 ;
20142025 size_t backend_token_count = 0 ;
20152026
@@ -3992,3 +4003,7 @@ void llama_opt_epoch(
39924003llama_memory_breakdown llama_get_memory_breakdown (const struct llama_context * ctx) {
39934004 return ctx->memory_breakdown ();
39944005}
4006+
4007+ void llama_set_output_layer_inp (struct llama_context * ctx, uint32_t layer_id, bool enable) {
4008+ ctx->set_output_layer_inp (layer_id, enable);
4009+ }
0 commit comments