move n_outputs_max to server-context

am17an · am17an · commit d936634bcbf1 · 2026-06-01T21:47:47.000+08:00
diff --git a/common/common.cpp b/common/common.cpp
@@ -1562,8 +1562,8 @@ struct llama_context_params common_context_params_to_llama(const common_params &
 
     cparams.n_ctx             = params.n_ctx;
     cparams.n_seq_max         = params.n_parallel;
-    cparams.n_rs_seq          = params.speculative.need_n_rs_seq();
-    cparams.n_outputs_per_seq = std::max(params.n_outputs_per_seq, 0);
+    cparams.n_rs_seq      = params.speculative.need_n_rs_seq();
+    cparams.n_outputs_max = std::max(params.n_outputs_max, 0);
     cparams.n_batch           = params.n_batch;
     cparams.n_ubatch          = params.n_ubatch;
     cparams.n_threads         = params.cpuparams.n_threads;
diff --git a/common/common.h b/common/common.h
@@ -431,7 +431,7 @@ struct common_params {
     int32_t n_chunks              =    -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel            =     1; // number of parallel sequences to decode
     int32_t n_sequences           =     1; // number of sequences to decode
-    int32_t n_outputs_per_seq     =     0; // max outputs per sequence in a ubatch (0 = no limit)
+    int32_t n_outputs_max         =     0; // max outputs in a ubatch (0 = n_batch)
     int32_t grp_attn_n            =     1; // group-attention factor
     int32_t grp_attn_w            =   512; // group-attention width
     int32_t n_print               =    -1; // print token count every n tokens (-1 = disabled)
diff --git a/include/llama.h b/include/llama.h
@@ -339,7 +339,7 @@ extern "C" {
         uint32_t n_ubatch;          // physical maximum batch size
         uint32_t n_seq_max;         // max number of sequences (i.e. distinct states for recurrent models)
         uint32_t n_rs_seq;          // number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL]
-        uint32_t n_outputs_per_seq; // max outputs per sequence in a ubatch (0 = no limit)
+        uint32_t n_outputs_max;     // max outputs in a ubatch (0 = n_batch)
         int32_t  n_threads;         // number of threads to use for generation
         int32_t  n_threads_batch;   // number of threads to use for batch processing
 
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
@@ -30,24 +30,6 @@ static llm_graph_type ctx_type_to_graph_type(llama_context_type ctx_type) {
     throw std::runtime_error("Unsupported ctx type");
 }
 
-static uint32_t graph_n_outputs_pp(const llama_cparams & cparams, uint32_t n_tokens, uint32_t n_seqs) {
-    GGML_ASSERT(n_tokens >= 1);
-    GGML_ASSERT(n_seqs   >= 1);
-
-    const bool reserve_all_outputs =
-        cparams.embeddings ||
-        cparams.pooling_type != LLAMA_POOLING_TYPE_NONE ||
-        cparams.n_outputs_per_seq == 0;
-
-    if (reserve_all_outputs) {
-        return n_tokens;
-    }
-
-    const uint64_t n_outputs = (uint64_t) n_seqs * cparams.n_outputs_per_seq;
-
-    return std::max<uint32_t>(1, std::min<uint64_t>(n_tokens, n_outputs));
-}
-
 llama_context::llama_context(
         const llama_model & model,
               llama_context_params params) :
@@ -69,8 +51,6 @@ llama_context::llama_context(
         throw std::runtime_error("n_seq_max must be <= " + std::to_string(LLAMA_MAX_SEQ));
     }
 
-    cparams.n_outputs_per_seq = params.n_outputs_per_seq;
-
     cparams.n_rs_seq = params.n_rs_seq;
     if (cparams.n_rs_seq > 0 && !llm_arch_supports_rs_rollback(model.arch)) {
         LLAMA_LOG_DEBUG("%s: n_rs_seq=%u requested but model arch does not support recurrent partial rollback; clamping to 0\n",
@@ -202,6 +182,8 @@ llama_context::llama_context(
 
     cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
 
+    cparams.n_outputs_max = params.n_outputs_max == 0 ? cparams.n_batch : params.n_outputs_max;
+
     cparams.op_offload = params.op_offload;
     cparams.kv_unified = params.kv_unified;
 
@@ -597,7 +579,7 @@ void llama_context::sched_reserve() {
     int n_splits_tg = -1;
     int n_nodes_tg  = -1;
 
-    const uint32_t n_outputs_pp = graph_n_outputs_pp(cparams, n_tokens, n_seqs);
+    const uint32_t n_outputs_pp = std::min(n_tokens, cparams.n_outputs_max);
 
     // reserve pp (prompt processing) graph first so that buffers are only allocated once
     {
@@ -796,7 +778,7 @@ bool llama_context::memory_update(bool optimize) {
         const uint32_t n_seqs = cparams.n_seq_max;
         const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
-        const uint32_t n_outputs_pp = graph_n_outputs_pp(cparams, n_tokens, n_seqs);
+        const uint32_t n_outputs_pp = std::min(n_tokens, cparams.n_outputs_max);
 
         auto * gf = graph_reserve(n_tokens, n_seqs, n_outputs_pp, mctx.get());
         if (!gf) {
@@ -1804,6 +1786,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
 
             // needs to happen before the graph is built
             n_outputs = n_outputs_new;
+
+            GGML_ASSERT(n_outputs <= cparams.n_outputs_max);
         }
 
         ggml_status status;
@@ -3365,7 +3349,7 @@ llama_context_params llama_context_default_params() {
         /*.n_ubatch                    =*/ 512,
         /*.n_seq_max                   =*/ 1,
         /*.n_rs_seq                    =*/ 0,
-        /*.n_outputs_per_seq           =*/ 0,
+        /*.n_outputs_max               =*/ 0,
         /*.n_threads                   =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
         /*.n_threads_batch             =*/ GGML_DEFAULT_N_THREADS,
         /*.ctx_type                    =*/ LLAMA_CONTEXT_TYPE_DEFAULT,
diff --git a/src/llama-cparams.h b/src/llama-cparams.h
@@ -12,10 +12,10 @@ struct llama_cparams {
     uint32_t n_batch;
     uint32_t n_ubatch;
     uint32_t n_seq_max;
-    uint32_t n_rs_seq;          // number of recurrent-state snapshots per seq for rollback
-    uint32_t n_outputs_per_seq; // max outputs per sequence in a ubatch (0 = no limit)
-    int32_t  n_threads;         // number of threads to use for generation
-    int32_t  n_threads_batch;   // number of threads to use for batch processing
+    uint32_t n_rs_seq;        // number of recurrent-state snapshots per seq for rollback
+    uint32_t n_outputs_max;   // max outputs in a ubatch
+    int32_t  n_threads;       // number of threads to use for generation
+    int32_t  n_threads_batch; // number of threads to use for batch processing
 
     float rope_freq_base;
     float rope_freq_scale;
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
@@ -37,38 +37,48 @@ using json = nlohmann::ordered_json;
 
 constexpr int HTTP_POLLING_SECONDS = 1;
 
-static uint32_t server_n_outputs_per_seq(const common_params_speculative & speculative) {
-    uint32_t n_outputs = 1;
+static uint32_t server_n_outputs_max(const common_params & params) { // value the server stores in params_base.n_outputs_max: cap on outputs in one ubatch
+    const uint32_t n_batch  = params.n_batch;
+    const uint32_t n_ubatch = std::min(n_batch, params.n_ubatch == 0 ? n_batch : params.n_ubatch); // n_ubatch == 0 falls back to n_batch; result never exceeds n_batch
 
-    for (const auto type : speculative.types) {
+    if (params.embedding || // embedding mode, or any pooling type other than UNSPECIFIED/NONE:
+            (params.pooling_type != LLAMA_POOLING_TYPE_UNSPECIFIED && params.pooling_type != LLAMA_POOLING_TYPE_NONE)) {
+        return n_ubatch; // no reduction - allow as many outputs as a ubatch can hold
+    }
+
+    uint32_t n_outputs_per_seq = 1; // baseline of one output per sequence; the loop below can only raise it
+
+    for (const auto type : params.speculative.types) { // take the maximum over all configured speculative types
         switch (type) {
             case COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE:
             case COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3:
             case COMMON_SPECULATIVE_TYPE_DRAFT_MTP:
-                n_outputs = std::max<uint32_t>(n_outputs, 1 + std::max(0, speculative.draft.n_max));
+                n_outputs_per_seq = std::max<uint32_t>(n_outputs_per_seq, 1 + std::max(0, params.speculative.draft.n_max)); // negative n_max counts as 0
                 break;
             case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE:
-                n_outputs = std::max<uint32_t>(n_outputs, 1 + speculative.ngram_simple.size_m);
+                n_outputs_per_seq = std::max<uint32_t>(n_outputs_per_seq, 1 + params.speculative.ngram_simple.size_m);
                 break;
             case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K:
-                n_outputs = std::max<uint32_t>(n_outputs, 1 + speculative.ngram_map_k.size_m);
+                n_outputs_per_seq = std::max<uint32_t>(n_outputs_per_seq, 1 + params.speculative.ngram_map_k.size_m);
                 break;
             case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V:
-                n_outputs = std::max<uint32_t>(n_outputs, 1 + speculative.ngram_map_k4v.size_m);
+                n_outputs_per_seq = std::max<uint32_t>(n_outputs_per_seq, 1 + params.speculative.ngram_map_k4v.size_m);
                 break;
             case COMMON_SPECULATIVE_TYPE_NGRAM_MOD:
-                n_outputs = std::max<uint32_t>(n_outputs, 1 + std::max(0, speculative.ngram_mod.n_max));
+                n_outputs_per_seq = std::max<uint32_t>(n_outputs_per_seq, 1 + std::max(0, params.speculative.ngram_mod.n_max)); // negative n_max counts as 0
                 break;
             case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE:
-                n_outputs = std::max<uint32_t>(n_outputs, 1 + 8);
+                n_outputs_per_seq = std::max<uint32_t>(n_outputs_per_seq, 1 + 8); // NOTE(review): 8 is hard-coded here - presumably the ngram-cache draft length; confirm against its implementation
                 break;
             case COMMON_SPECULATIVE_TYPE_NONE:
             case COMMON_SPECULATIVE_TYPE_COUNT:
                 break;
         }
     }
 
-    return n_outputs;
+    const uint64_t n_outputs = (uint64_t) params.n_parallel * n_outputs_per_seq; // widened to 64 bits before multiplying; NOTE(review): assumes n_parallel >= 0 at this point - a negative value would convert to a huge count
+
+    return std::max<uint32_t>(1, std::min<uint64_t>(n_ubatch, n_outputs)); // capped at n_ubatch, but never less than 1
 }
 
 // state diagram: https://github.com/ggml-org/llama.cpp/pull/9283
@@ -787,7 +797,7 @@ struct server_context_impl {
         SRV_INF("loading model '%s'\n", params.model.path.c_str());
 
         params_base = params;
-        params_base.n_outputs_per_seq = server_n_outputs_per_seq(params_base.speculative);
+        params_base.n_outputs_max = server_n_outputs_max(params_base);
 
         std::string & mmproj_path = params_base.mmproj.path;
         bool has_mmproj = !mmproj_path.empty();
@@ -854,7 +864,7 @@ struct server_context_impl {
                 }
 
                 if (!has_draft) {
-                    params_dft.n_outputs_per_seq = 1;
+                    params_dft.n_outputs_max = params_base.n_parallel;
                 }
 
                 auto mparams_dft = common_model_params_to_llama(params_dft);
@@ -980,11 +990,11 @@ struct server_context_impl {
                     params_base.model.path.c_str());
 
             auto cparams_mtp = common_context_params_to_llama(params_base);
-            cparams_mtp.ctx_type          = LLAMA_CONTEXT_TYPE_MTP;
-            cparams_mtp.type_k            = params_base.speculative.draft.cache_type_k;
-            cparams_mtp.type_v            = params_base.speculative.draft.cache_type_v;
-            cparams_mtp.n_rs_seq          = 0;
-            cparams_mtp.n_outputs_per_seq = 1;
+            cparams_mtp.ctx_type      = LLAMA_CONTEXT_TYPE_MTP;
+            cparams_mtp.type_k        = params_base.speculative.draft.cache_type_k;
+            cparams_mtp.type_v        = params_base.speculative.draft.cache_type_v;
+            cparams_mtp.n_rs_seq      = 0;
+            cparams_mtp.n_outputs_max = params_base.n_parallel;
 
             ctx_dft.reset(llama_init_from_model(model_tgt, cparams_mtp));
             if (ctx_dft == nullptr) {