llama + spec: MTP support

am17an · am17an · commit 8ea0fa8974e1 · 2026-05-04T12:41:28.000+08:00
diff --git a/common/arg.cpp b/common/arg.cpp
@@ -3562,12 +3562,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]",
+        {"--spec-type"}, "[none|mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]",
         string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n",
             common_speculative_type_to_str(params.speculative.type).c_str()),
         [](common_params & params, const std::string & value) {
             if (value == "none") {
                 params.speculative.type = COMMON_SPECULATIVE_TYPE_NONE;
+            } else if (value == "mtp") {
+                params.speculative.type = COMMON_SPECULATIVE_TYPE_MTP;
             } else if (value == "ngram-cache") {
                 params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_CACHE;
             } else if (value == "ngram-simple") {
diff --git a/common/common.h b/common/common.h
@@ -159,6 +159,7 @@ enum common_speculative_type {
     COMMON_SPECULATIVE_TYPE_NONE,          // no speculative decoding
     COMMON_SPECULATIVE_TYPE_DRAFT,         // draft model
     COMMON_SPECULATIVE_TYPE_EAGLE3,        // eagle draft model
+    COMMON_SPECULATIVE_TYPE_MTP,           // multi-token prediction
     COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,  // simple self-speculative decoding
     COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,   // self-speculative decoding with n-gram keys only
     COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
diff --git a/common/speculative.cpp b/common/speculative.cpp
diff --git a/common/speculative.h b/common/speculative.h
@@ -16,7 +16,8 @@ std::string common_speculative_type_to_str(enum common_speculative_type type);
 
 common_speculative * common_speculative_init(
         common_params_speculative & params,
-        llama_context             * ctx_tgt);
+        llama_context             * ctx_tgt,
+        llama_context             * ctx_mtp = nullptr);
 
 void common_speculative_free(common_speculative * spec);
 
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
@@ -2018,7 +2018,14 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.SSM_NORM,
         MODEL_TENSOR.SSM_BETA,
         MODEL_TENSOR.SSM_ALPHA,
-        MODEL_TENSOR.SSM_OUT
+        MODEL_TENSOR.SSM_OUT,
+        # NextN/MTP tensors - preserved but unused
+        MODEL_TENSOR.NEXTN_EH_PROJ,
+        MODEL_TENSOR.NEXTN_EMBED_TOKENS,
+        MODEL_TENSOR.NEXTN_ENORM,
+        MODEL_TENSOR.NEXTN_HNORM,
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
     ],
     MODEL_ARCH.QWEN35MOE: [
         MODEL_TENSOR.TOKEN_EMBD,
@@ -2049,7 +2056,14 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.SSM_NORM,
         MODEL_TENSOR.SSM_BETA,
         MODEL_TENSOR.SSM_ALPHA,
-        MODEL_TENSOR.SSM_OUT
+        MODEL_TENSOR.SSM_OUT,
+        # NextN/MTP tensors - preserved but unused
+        MODEL_TENSOR.NEXTN_EH_PROJ,
+        MODEL_TENSOR.NEXTN_EMBED_TOKENS,
+        MODEL_TENSOR.NEXTN_ENORM,
+        MODEL_TENSOR.NEXTN_HNORM,
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
     ],
     MODEL_ARCH.PLAMO: [
         MODEL_TENSOR.TOKEN_EMBD,
diff --git a/include/llama.h b/include/llama.h
@@ -310,6 +310,9 @@ extern "C" {
         // override key-value pairs of the model meta data
         const struct llama_model_kv_override * kv_overrides;
 
+        // override arch from GGUF to load MTP as a separate ctx
+        const char * override_arch;
+
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only;      // only load the vocabulary, no weights
         bool use_mmap;        // use mmap if possible
@@ -967,6 +970,56 @@ extern "C" {
     // If true, all model tensors are activated during llama_decode() to load and cache their weights.
     LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup);
 
+    // Accessors for graph-output tensors used by speculative decoders that
+    // need intermediate hidden states (e.g. MTP / NextN). Returns nullptr if
+    // the most recent decode didn't populate the tensor. Call llama_synchronize
+    // on the source context before reading via ggml_backend_tensor_get.
+    LLAMA_API struct ggml_tensor * llama_context_get_t_h_pre_norm(struct llama_context * ctx);
+    LLAMA_API struct ggml_tensor * llama_context_get_t_mtp_out   (struct llama_context * ctx);
+
+    // Generic post-compute callback fired from inside process_ubatch after
+    // each ubatch's compute finishes. Speculative decoders register this to
+    // mirror the trunk's hidden state into a sibling context (e.g. an MTP
+    // draft head) and decode into its KV. Pass cb = nullptr to clear.
+    typedef void (*llama_post_ubatch_cb_t)(
+            struct llama_context * ctx,
+            int32_t                n_tokens,
+            const llama_token    * tokens,
+            const llama_pos      * positions,
+            struct ggml_tensor   * t_h_pre_norm,
+            void                 * user_data);
+
+    LLAMA_API void llama_set_post_ubatch_cb(
+            struct llama_context  * ctx,
+            llama_post_ubatch_cb_t  cb,
+            void                  * user_data);
+
+    // Generic post-seq_rm callback fired from inside llama_context_seq_rm
+    // after the trunk's memory seq_rm returns, whether or not it succeeded.
+    // Speculative decoders that mirror trunk KV state to a sibling context
+    // register this. Pass cb = nullptr to clear.
+    typedef void (*llama_post_seq_rm_cb_t)(
+            struct llama_context * ctx,
+            llama_seq_id           seq_id,
+            llama_pos              p0,
+            llama_pos              p1,
+            void                 * user_data);
+
+    LLAMA_API void llama_set_post_seq_rm_cb(
+            struct llama_context  * ctx,
+            llama_post_seq_rm_cb_t  cb,
+            void                  * user_data);
+
+    // Runs seq_rm on the trunk's memory, then invokes the registered post-seq_rm
+    // callback if one is set. Returns the seq_rm result (true when ctx is null).
+    // Use in place of llama_memory_seq_rm(llama_get_memory(ctx), ...) at
+    // trunk-side seq_rm sites so observers (e.g. an MTP context) stay in lockstep.
+    LLAMA_API bool llama_context_seq_rm(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1);
+
     // Set abort callback
     LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
 
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
@@ -41,6 +41,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN3VLMOE,       "qwen3vlmoe"       },
     { LLM_ARCH_QWEN35,           "qwen35"           },
     { LLM_ARCH_QWEN35MOE,        "qwen35moe"        },
+    { LLM_ARCH_QWEN35_MTP,       "qwen35_mtp"       },
     { LLM_ARCH_PHI2,             "phi2"             },
     { LLM_ARCH_PHI3,             "phi3"             },
     { LLM_ARCH_PHIMOE,           "phimoe"           },
@@ -756,14 +757,15 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_INDEXER_PROJ,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_INDEXER_ATTN_K,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_INDEXER_ATTN_Q_B,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    // NextN/MTP tensors are currently ignored (reserved for future MTP support)
-    // These tensors only exist in the last layer(s) and are treated as output tensors
-    {LLM_TENSOR_NEXTN_EH_PROJ,              {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_NEXTN_EMBED_TOKENS,         {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
-    {LLM_TENSOR_NEXTN_ENORM,                {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
-    {LLM_TENSOR_NEXTN_HNORM,                {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
-    {LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,     {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,     {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+    // NextN/MTP tensors are stored per-block (blk.%d.nextn.*) even though only the
+    // last nextn_predict_layers blocks carry them. Classify as LAYER_REPEATING so
+    // the model loader doesn't fault on the block index.
+    {LLM_TENSOR_NEXTN_EH_PROJ,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_NEXTN_EMBED_TOKENS,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_NEXTN_ENORM,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_NEXTN_HNORM,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     // Nemotron 3 Super
     {LLM_TENSOR_FFN_LATENT_DOWN,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_FFN_LATENT_UP,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
@@ -45,6 +45,7 @@ enum llm_arch {
     LLM_ARCH_QWEN3VLMOE,
     LLM_ARCH_QWEN35,
     LLM_ARCH_QWEN35MOE,
+    LLM_ARCH_QWEN35_MTP,
     LLM_ARCH_PHI2,
     LLM_ARCH_PHI3,
     LLM_ARCH_PHIMOE,
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
@@ -1242,13 +1242,24 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
         return nullptr;
     }
 
+    // Generic post-ubatch dispatch — speculative decoders register a callback
+    // here to mirror the trunk's hidden state into a sibling context.
+    if (post_ubatch_cb) {
+        post_ubatch_cb(this,
+                       (int32_t) ubatch.n_tokens,
+                       ubatch.token,
+                       ubatch.pos,
+                       res->t_h_pre_norm,
+                       post_ubatch_ud);
+    }
+
     ret = GGML_STATUS_SUCCESS;
 
     return res;
 }
 
 int llama_context::encode(const llama_batch & batch_inp) {
-    GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT
+    GGML_ASSERT(batch_inp.token || batch_inp.embd);
 
     if (batch_inp.n_tokens == 0) {
         LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
@@ -1538,7 +1549,7 @@ static bool needs_raw_logits(const llama_ubatch & ubatch, const std::map<llama_s
 }
 
 int llama_context::decode(const llama_batch & batch_inp) {
-    GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT
+    GGML_ASSERT(batch_inp.token || batch_inp.embd);
 
     if (!memory) {
         LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__);
@@ -3095,6 +3106,32 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
     ctx->set_warmup(warmup);
 }
 
+ggml_tensor * llama_context::get_t_h_pre_norm() const {
+    return gf_res_prev ? gf_res_prev->t_h_pre_norm : nullptr; // nullptr when there is no previous graph result
+}
+
+ggml_tensor * llama_context_get_t_h_pre_norm(struct llama_context * ctx) {
+    return ctx ? ctx->get_t_h_pre_norm() : nullptr; // C API wrapper; tolerates a null ctx
+}
+
+ggml_tensor * llama_context::get_t_mtp_out() const {
+    return gf_res_prev ? gf_res_prev->t_mtp_out : nullptr; // nullptr when there is no previous graph result
+}
+
+ggml_tensor * llama_context_get_t_mtp_out(struct llama_context * ctx) {
+    return ctx ? ctx->get_t_mtp_out() : nullptr; // C API wrapper; tolerates a null ctx
+}
+
+void llama_set_post_ubatch_cb(struct llama_context * ctx, llama_post_ubatch_cb_t cb, void * user_data) {
+    if (!ctx) return; // null ctx: silently ignored, nothing is registered
+    ctx->set_post_ubatch_cb(cb, user_data); // stores cb and user_data on the context
+}
+
+void llama_set_post_seq_rm_cb(struct llama_context * ctx, llama_post_seq_rm_cb_t cb, void * user_data) {
+    if (!ctx) return; // null ctx: silently ignored, nothing is registered
+    ctx->set_post_seq_rm_cb(cb, user_data); // stores cb and user_data on the context
+}
+
 void llama_synchronize(llama_context * ctx) {
     ctx->synchronize();
 }
@@ -3252,6 +3289,24 @@ bool llama_memory_seq_rm(
     return mem->seq_rm(seq_id, p0, p1);
 }
 
+bool llama_context_seq_rm(
+    struct llama_context * ctx,
+            llama_seq_id   seq_id,
+               llama_pos   p0,
+               llama_pos   p1) {
+    if (!ctx) {
+        return true; // null ctx: nothing to remove, reported as success
+    }
+    const bool ok = llama_memory_seq_rm(llama_get_memory(ctx), seq_id, p0, p1);
+
+    // Notify the registered observer (e.g. an MTP context wrapper) so sibling state
+    // stays in lockstep with the trunk's KV. Runs even when ok is false.
+    if (llama_post_seq_rm_cb_t cb = ctx->get_post_seq_rm_cb()) {
+        cb(ctx, seq_id, p0, p1, ctx->get_post_seq_rm_ud());
+    }
+    return ok;
+}
+
 void llama_memory_seq_cp(
         llama_memory_t mem,
           llama_seq_id seq_id_src,
diff --git a/src/llama-context.h b/src/llama-context.h
@@ -69,6 +69,25 @@ struct llama_context {
     float * get_embeddings_ith(int32_t i);
     float * get_embeddings_seq(llama_seq_id seq_id);
 
+    // Accessors for graph-output tensors used by speculative decoders that
+    // need intermediate hidden states. Return nullptr if the most recent
+    // decode didn't populate them.
+    ggml_tensor * get_t_h_pre_norm() const;
+    ggml_tensor * get_t_mtp_out()    const;
+
+    // Post-ubatch / post-seq_rm callbacks. See llama.h for semantics.
+    // Pass cb=nullptr to clear.
+    void set_post_ubatch_cb(llama_post_ubatch_cb_t cb, void * user_data) {
+        post_ubatch_cb = cb;
+        post_ubatch_ud = user_data;
+    }
+    void set_post_seq_rm_cb(llama_post_seq_rm_cb_t cb, void * user_data) {
+        post_seq_rm_cb = cb;
+        post_seq_rm_ud = user_data;
+    }
+    llama_post_seq_rm_cb_t get_post_seq_rm_cb() const { return post_seq_rm_cb; }
+    void *                 get_post_seq_rm_ud() const { return post_seq_rm_ud; }
+
     llama_token * get_sampled_tokens() const;
     llama_token   get_sampled_token_ith(int32_t idx);
 
@@ -253,6 +272,13 @@ struct llama_context {
 
     llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably
 
+    // Generic post-compute / post-seq_rm callbacks. Speculative decoders that
+    // need to mirror the trunk's state into a sibling context register here.
+    llama_post_ubatch_cb_t post_ubatch_cb = nullptr;
+    void *                 post_ubatch_ud = nullptr;
+    llama_post_seq_rm_cb_t post_seq_rm_cb = nullptr;
+    void *                 post_seq_rm_ud = nullptr;
+
     std::unique_ptr<llama_memory_i> memory;
 
     // decode output (2-dimensional array: [n_outputs][n_vocab])
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
@@ -3,6 +3,7 @@
 #include "llama-impl.h"
 #include "llama-model.h"
 #include "llama-batch.h"
+#include "llama-context.h"
 #include "llama-cparams.h"
 
 #include "llama-kv-cache.h"
diff --git a/src/llama-graph.h b/src/llama-graph.h
@@ -18,6 +18,7 @@ struct ggml_tensor;
 
 struct llama_cparams;
 struct llama_layer;
+struct llama_context;
 
 struct llama_memory_context_i;
 
@@ -645,6 +646,13 @@ class llm_graph_result {
     ggml_tensor * get_embd()        const { return t_embd; }
     ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
 
+    // Pre-output-norm hidden state at output positions: shape [n_embd, n_outputs].
+    // Set by models that ship an MTP/NextN head (Qwen3.5/3.6, GLM-4.5/4.6,
+    // DeepSeek V3) so a separate ctx_mtp can pull it via
+    // ggml_backend_tensor_copy_async to feed its own MTP forward.
+    // nullptr on models without an MTP head.
+    ggml_tensor * get_h_pre_norm() const { return t_h_pre_norm; }
+
     ggml_cgraph  * get_gf()  const { return gf; }
     ggml_context * get_ctx() const { return ctx_compute.get(); }
 
@@ -672,6 +680,8 @@ class llm_graph_result {
     ggml_tensor * t_logits      = nullptr;
     ggml_tensor * t_embd        = nullptr;
     ggml_tensor * t_embd_pooled = nullptr;
+    ggml_tensor * t_h_pre_norm  = nullptr; // [n_embd, n_outputs]; trunk hidden before final norm (MTP source)
+    ggml_tensor * t_mtp_out     = nullptr; // [n_embd, n_tokens]; MTP block post-FFN (chained MTP-K source)
 
     std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
     std::map<llama_seq_id, ggml_tensor*> t_candidates;
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
@@ -229,6 +229,11 @@ uint32_t llama_hparams::n_embd_head_v_mla() const {
 }
 
 bool llama_hparams::has_kv(uint32_t il) const {
+    if (kv_only_nextn) {
+        // QWEN35_MTP and friends: only the MTP block's layers have a KV cache.
+        return nextn_predict_layers > 0 && il >= (n_layer - nextn_predict_layers);
+    }
+
     if (n_layer_kv_from_start >= 0) {
         if (il < (uint32_t) n_layer_kv_from_start) {
             return true;
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
@@ -91,6 +91,11 @@ struct llama_hparams {
     uint32_t moe_every_n_layers   = 0;
     uint32_t moe_latent_size      = 0;
     uint32_t nextn_predict_layers = 0;
+    // For LLM_ARCH_QWEN35_MTP: only layers in [n_layer - nextn_predict_layers, n_layer)
+    // have a KV cache. Inverts the typical n_layer_kv_from_start semantics; lets a
+    // separate ctx_mtp share the same hparams as the trunk while only allocating KV
+    // for the MTP block.
+    bool kv_only_nextn = false;
 
     float f_norm_eps;
     float f_norm_rms_eps;
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
@@ -1312,9 +1312,16 @@ struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_conte
     return tensor;
 }
 
-void llama_model_loader::done_getting_tensors() const {
-    if (n_created != n_tensors) {
-        throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
+void llama_model_loader::done_getting_tensors(bool partial) const {
+    if (n_created > n_tensors) {
+        throw std::runtime_error(format("%s: too many tensors created; expected %d, got %d", __func__, n_tensors, n_created));
+    }
+    if (n_created < n_tensors) {
+        if (!partial) {
+            throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
+        }
+        LLAMA_LOG_INFO("%s: partial load — used %d of %d tensors in the file (rest belong to a sibling model on the same .gguf)\n",
+                __func__, n_created, n_tensors);
     }
     if (n_tensors_moved > 0) {
         LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %zu others) cannot be used with preferred buffer type %s, using %s instead\n",
diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
diff --git a/src/models/models.h b/src/models/models.h
diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp
diff --git a/src/models/qwen35_mtp.cpp b/src/models/qwen35_mtp.cpp
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp