mtp: add MTP-K chain via t_mtp_out + self-relay

am17an · am17an · commit 17d47dfded82 · 2026-04-30T13:35:10.000+08:00
Generalize the MTP draft path to support chain length K > 1, where each
chain step conditions on the previous step's MTP block output instead of
the target's pre-output-norm hidden.

Pieces:
- llama_graph_result gains t_mtp_out: the MTP block's post-FFN hidden
  (pre-LM-head). qwen35_mtp's graph builder sets it.
- llama_context::get_t_mtp_out() exposes the most recent decode's value.
- llama_mtp_relay_h_self(ctx_mtp, n_rows): on-device copy of the LAST
  n_rows of t_mtp_out into the FIRST n_rows of t_inp_h. Same machinery
  as llama_mtp_relay_h, just self-source.
- common_speculative_state_mtp::draft chains n_max calls. Step 0 relays
  from ctx_target's t_h_pre_norm (existing). Steps 1..K-1 self-relay
  from ctx_mtp's previous t_mtp_out. Each step argmaxes its logits and
  feeds the result to the next.
- accept(n_accepted) trims any rejected trailing draft positions from
  ctx_mtp's KV via seq_rm so the next draft writes K/V at the right
  slots. Tracks last_n_drafted to know how many to potentially drop.

Smoke results on Qwen3.6-q8_0-mtp.gguf, --spec-draft-n-max 2:

  fibonacci:  K=1 → 13.17 tok/s, 100% accept
              K=2 → 15.40 tok/s,  75% accept (12/16)
              K=2 wins because the prompt is highly canonical and even
              chain step 1 stays accepted most of the time.

  send_req:   K=1 → 11.44 tok/s, 83.9% accept (182/217)
              K=2 →  9.48 tok/s, 29.7% accept (148/499)
              K=2 loses on dense code: chain step 1 accept falls off a
              cliff because Qwen3.6's MTP head is trained one-step-ahead
              and feeding it its own previous output is out-of-distribution
              (the FastMTP problem; also discussed in the DeepSeek V3 paper).
              The infrastructure works correctly; the model doesn't
              benefit without retraining.

Practical guidance: keep --spec-draft-n-max 1 for code/dense workloads.
K > 1 only helps when either the head was trained for chain prediction
(FastMTP-style) or the workload is canonical enough that vanilla
self-rolling stays in-distribution.
diff --git a/common/speculative.cpp b/common/speculative.cpp
@@ -630,6 +630,10 @@ struct common_speculative_state_mtp : public common_speculative_state {
     // next draft writes at pos+1, etc. Reset by begin().
     llama_pos mtp_pos = 0;
 
+    // How many tokens the most recent draft() pushed into ctx_mtp. accept()
+    // uses this to compute how many trailing positions to roll back.
+    uint16_t last_n_drafted = 0;
+
     common_speculative_state_mtp(enum common_speculative_type type,
                                  llama_context * ctx_tgt,
                                  llama_context * ctx_mtp)
@@ -701,73 +705,85 @@ struct common_speculative_state_mtp : public common_speculative_state {
             const llama_tokens & prompt_tgt,
             llama_token id_last,
             llama_tokens & draft_tokens) override {
-        GGML_UNUSED(params);
         GGML_UNUSED(prompt_tgt);
         draft_tokens.clear();
 
-        // Stage h from the target's last decode into ctx_mtp's input buffer.
-        // For a single-token MTP step we relay the LAST row of t_h_pre_norm
-        // (the h corresponding to id_last's predecessor) into the FIRST row
-        // of ctx_mtp's t_inp_h.
-        const int32_t rc = llama_mtp_relay_h(ctx_tgt, ctx_mtp, /*n_rows=*/ 1);
-        if (rc != 0) {
-            LOG_DBG("%s: llama_mtp_relay_h rc=%d; skipping MTP draft\n", __func__, rc);
-            return;
-        }
+        // Chain length K: 1 for plain MTP, 2+ for chained MTP-K. Each step
+        // runs one MTP forward, takes the argmax as a draft token, and feeds
+        // its hidden state forward as the next step's t_inp_h.
+        const int32_t n_max = std::max(1, params.draft.n_max);
 
-        // Position: one past whatever's currently in ctx_mtp's KV cache for
-        // seq 0. M-RoPE asserts Y > X (input start > cache last), so we always
-        // advance by querying the cache rather than relying on a stale local
-        // counter. Step 7's prompt prefill will warm the cache; until then it
-        // starts at -1 (empty) and grows as drafts are accepted.
-        const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_mtp), 0);
-        const llama_pos pos     = pos_max + 1;
+        const int32_t n_vocab = (int32_t) logits_buf.size();
+        llama_token cond_tok = id_last;
+
+        for (int32_t k = 0; k < n_max; ++k) {
+            // Stage h. Step 0: from ctx_tgt's t_h_pre_norm (the h corresponding
+            // to id_last's position in the trunk). Step k>0: self-relay from
+            // ctx_mtp's previous t_mtp_out (the MTP block's post-FFN h from
+            // the previous chain step).
+            const int32_t rc_relay = (k == 0)
+                ? llama_mtp_relay_h(ctx_tgt, ctx_mtp, /*n_rows=*/ 1)
+                : llama_mtp_relay_h_self(ctx_mtp,    /*n_rows=*/ 1);
+            if (rc_relay != 0) {
+                LOG_DBG("%s: relay rc=%d at k=%d; stopping chain\n", __func__, rc_relay, k);
+                return;
+            }
 
-        // Build the single-token batch.
-        batch.n_tokens     = 1;
-        batch.token[0]     = id_last;
-        batch.pos[0]       = pos;
-        batch.n_seq_id[0]  = 1;
-        batch.seq_id[0][0] = 0;
-        batch.logits[0]    = 1;
+            // Position: one past whatever's in ctx_mtp's KV. Always queried
+            // (M-RoPE requires input start pos > cache last pos, strictly).
+            const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_mtp), 0);
+            const llama_pos pos     = pos_max + 1;
+
+            batch.n_tokens     = 1;
+            batch.token[0]     = cond_tok;
+            batch.pos[0]       = pos;
+            batch.n_seq_id[0]  = 1;
+            batch.seq_id[0][0] = 0;
+            batch.logits[0]    = 1;
+
+            const int32_t dec_rc = llama_decode(ctx_mtp, batch);
+            if (dec_rc != 0) {
+                LOG_DBG("%s: llama_decode rc=%d at k=%d; stopping chain\n", __func__, dec_rc, k);
+                return;
+            }
 
-        // Run the MTP graph: ctx_mtp consumes (h_input from the relay,
-        // e(id_last)) and produces draft logits.
-        const int32_t dec_rc = llama_decode(ctx_mtp, batch);
-        if (dec_rc != 0) {
-            LOG_DBG("%s: llama_decode on ctx_mtp rc=%d\n", __func__, dec_rc);
-            return;
-        }
+            const float * logits = llama_get_logits_ith(ctx_mtp, 0);
+            if (!logits) {
+                return;
+            }
 
-        const float * logits = llama_get_logits_ith(ctx_mtp, 0);
-        if (!logits) {
-            return;
+            std::memcpy(logits_buf.data(), logits, n_vocab * sizeof(float));
+            int   best = 0;
+            float bv   = logits_buf[0];
+            for (int i = 1; i < n_vocab; ++i) {
+                if (logits_buf[i] > bv) { bv = logits_buf[i]; best = i; }
+            }
+            draft_tokens.push_back(best);
+            cond_tok = best;
         }
 
-        // Greedy argmax draft.
-        const int32_t n_vocab = (int32_t) logits_buf.size();
-        std::memcpy(logits_buf.data(), logits, n_vocab * sizeof(float));
-        int   best = 0;
-        float bv   = logits_buf[0];
-        for (int i = 1; i < n_vocab; ++i) {
-            if (logits_buf[i] > bv) { bv = logits_buf[i]; best = i; }
-        }
-        draft_tokens.push_back(best);
+        last_n_drafted = (uint16_t) draft_tokens.size();
     }
 
     void accept(uint16_t n_accepted) override {
-        // The previous draft() pushed one token (the drafted token) into
-        // ctx_mtp at pos+1. If the verifier accepted it, leave it. If not
-        // (n_accepted=0), trim it back so the next draft picks the slot
-        // again. We always trim past `pos_max - (n_drafted - n_accepted)`,
-        // but for K=1 this simplifies to: trim if rejected.
-        if (n_accepted == 0) {
-            const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_mtp), 0);
-            if (pos_max >= 0) {
-                llama_memory_seq_rm(llama_get_memory(ctx_mtp), /*seq_id=*/ 0,
-                                    /*pos_min=*/ pos_max, /*pos_max=*/ -1);
-            }
+        // The previous draft() pushed K tokens into ctx_mtp at positions
+        // [pos_max - K + 1, pos_max]. The verifier accepted the first
+        // n_accepted of them; the remaining K - n_accepted came from
+        // chain steps that the verifier rejected. Trim those rejected
+        // positions from ctx_mtp's KV so the next draft writes K/V at the
+        // right slots.
+        const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_mtp), 0);
+        if (pos_max < 0) {
+            return;
+        }
+        const int32_t n_drafted_last = (int32_t) last_n_drafted;
+        const int32_t n_to_drop = std::max(0, n_drafted_last - (int32_t) n_accepted);
+        if (n_to_drop > 0) {
+            const llama_pos drop_from = pos_max - n_to_drop + 1;
+            llama_memory_seq_rm(llama_get_memory(ctx_mtp), /*seq_id=*/ 0,
+                                /*p0=*/ drop_from, /*p1=*/ -1);
         }
+        last_n_drafted = 0;
     }
 
     int32_t n_max(const common_params_speculative & params) const override {
diff --git a/include/llama.h b/include/llama.h
@@ -1001,6 +1001,15 @@ extern "C" {
             struct llama_context * ctx_mtp,
             int32_t                n_rows);
 
+    // Self-relay: copy the LAST n_rows of ctx_mtp's most recent t_mtp_out
+    // (the MTP block's post-FFN hidden) into the FIRST n_rows of its own
+    // t_inp_h. Used for chained MTP-K drafting (K > 1) where each chain step
+    // conditions on the previous step's MTP output rather than the target's
+    // pre-output-norm hidden.
+    LLAMA_API int32_t llama_mtp_relay_h_self(
+            struct llama_context * ctx_mtp,
+            int32_t                n_rows);
+
     // Set abort callback
     LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
 
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
@@ -3099,6 +3099,10 @@ ggml_tensor * llama_context::get_t_h_pre_norm() const {
     return gf_res_prev ? gf_res_prev->t_h_pre_norm : nullptr;
 }
 
+ggml_tensor * llama_context::get_t_mtp_out() const {
+    return gf_res_prev ? gf_res_prev->t_mtp_out : nullptr;
+}
+
 ggml_tensor * llama_context::get_t_inp_h() const {
     // gf_res_prev->t_inp_h is set by the model's graph builder (e.g.
     // llm_build_qwen35_mtp). After the first real llama_decode it lives there.
@@ -3114,52 +3118,44 @@ ggml_tensor * llama_context::get_t_inp_h() const {
     return nullptr;
 }
 
-int32_t llama_mtp_relay_h(
-        struct llama_context * ctx_target,
-        struct llama_context * ctx_mtp,
-        int32_t                n_rows) {
-    if (!ctx_target || !ctx_mtp) {
-        return -1;
-    }
-
-    ggml_tensor * src = ctx_target->get_t_h_pre_norm();
+// Common implementation: copy the LAST n_rows of `src` into the FIRST n_rows
+// of `dst`, on-device via ggml_backend_tensor_copy_async. ctx_src/ctx_dst are
+// used to look up backends per tensor and to synchronize the source.
+static int32_t llama_mtp_relay_impl(
+        struct llama_context * ctx_src,
+        struct llama_context * ctx_dst,
+        ggml_tensor          * src,
+        ggml_tensor          * dst,
+        int32_t                n_rows,
+        const char           * fn) {
     if (!src) {
-        LLAMA_LOG_ERROR("%s: ctx_target's last decode did not produce t_h_pre_norm\n", __func__);
+        LLAMA_LOG_ERROR("%s: src tensor missing\n", fn);
         return -2;
     }
-
-    ggml_tensor * dst = ctx_mtp->get_t_inp_h();
     if (!dst) {
-        LLAMA_LOG_ERROR("%s: ctx_mtp has no t_inp_h (graph not built or wrong arch)\n", __func__);
+        LLAMA_LOG_ERROR("%s: dst tensor missing (graph not built or wrong arch)\n", fn);
         return -3;
     }
-
     if (src->ne[0] != dst->ne[0]) {
         LLAMA_LOG_ERROR("%s: shape mismatch: src n_embd=%" PRId64 ", dst n_embd=%" PRId64 "\n",
-                __func__, src->ne[0], dst->ne[0]);
+                fn, src->ne[0], dst->ne[0]);
         return -4;
     }
-
     if (n_rows <= 0 || n_rows > src->ne[1] || n_rows > dst->ne[1]) {
         LLAMA_LOG_ERROR("%s: n_rows=%d out of range (src cap=%" PRId64 ", dst cap=%" PRId64 ")\n",
-                __func__, n_rows, src->ne[1], dst->ne[1]);
+                fn, n_rows, src->ne[1], dst->ne[1]);
         return -5;
     }
 
-    // Copy the LAST n_rows of src into the FIRST n_rows of dst.
-    const int32_t src_first = (int32_t) src->ne[1] - n_rows;
-    const int32_t dst_first = 0;
-
-    // Wait for ctx_target's last compute to finish before reading t_h_pre_norm.
-    ctx_target->synchronize();
+    // Wait for the source's compute to finish before reading.
+    ctx_src->synchronize();
 
-    // Build views for the row range we want to copy. ggml_view_2d does NOT
-    // propagate the parent's backend buffer to the view tensor (it sets
-    // view->buffer = NULL and only forwards view->data + offset), so we have
-    // to wire the buffer manually before passing the views to copy_async —
-    // otherwise the backend's copy path hits a null buffer and aborts inside
-    // ggml_backend_buffer_get_type.
+    // Build views for the row range. ggml_view_2d does not propagate the
+    // parent's backend buffer to the view (it sets view->buffer = NULL and
+    // only forwards view->data + offset), so wire the buffer manually before
+    // passing the views to copy_async.
     const size_t row_size   = src->nb[1];
+    const int32_t src_first = (int32_t) src->ne[1] - n_rows; // last n_rows of src
     const size_t src_offset = (size_t) src_first * row_size;
 
     ggml_context_ptr view_ctx;
@@ -3182,20 +3178,45 @@ int32_t llama_mtp_relay_h(
     src_view->buffer = src->buffer;
     dst_view->buffer = dst->buffer;
 
-    auto * sched_src = ctx_target->get_sched();
-    auto * sched_dst = ctx_mtp->get_sched();
+    auto * sched_src = ctx_src->get_sched();
+    auto * sched_dst = ctx_dst->get_sched();
     auto * backend_src = ggml_backend_sched_get_tensor_backend(sched_src, src);
     auto * backend_dst = ggml_backend_sched_get_tensor_backend(sched_dst, dst);
     if (!backend_src || !backend_dst) {
         LLAMA_LOG_ERROR("%s: backend resolve failed (src=%p dst=%p)\n",
-                __func__, (void *) backend_src, (void *) backend_dst);
+                fn, (void *) backend_src, (void *) backend_dst);
         return -8;
     }
 
     ggml_backend_tensor_copy_async(backend_src, backend_dst, src_view, dst_view);
     return 0;
 }
 
+int32_t llama_mtp_relay_h(
+        struct llama_context * ctx_target,
+        struct llama_context * ctx_mtp,
+        int32_t                n_rows) {
+    if (!ctx_target || !ctx_mtp) {
+        return -1;
+    }
+    return llama_mtp_relay_impl(ctx_target, ctx_mtp,
+            ctx_target->get_t_h_pre_norm(),
+            ctx_mtp->get_t_inp_h(),
+            n_rows, __func__);
+}
+
+int32_t llama_mtp_relay_h_self(
+        struct llama_context * ctx_mtp,
+        int32_t                n_rows) {
+    if (!ctx_mtp) {
+        return -1;
+    }
+    return llama_mtp_relay_impl(ctx_mtp, ctx_mtp,
+            ctx_mtp->get_t_mtp_out(),
+            ctx_mtp->get_t_inp_h(),
+            n_rows, __func__);
+}
+
 void llama_synchronize(llama_context * ctx) {
     ctx->synchronize();
 }
diff --git a/src/llama-context.h b/src/llama-context.h
@@ -80,6 +80,11 @@ struct llama_context {
     // writes into this tensor before llama_decode runs on ctx_mtp.
     ggml_tensor * get_t_inp_h() const;
 
+    // For LLM_ARCH_QWEN35_MTP contexts: the MTP block's post-FFN output from
+    // the most recent decode. Used by chained MTP-K drafting (K > 1) — the
+    // self-relay copies this into t_inp_h for the next chain step.
+    ggml_tensor * get_t_mtp_out() const;
+
     llama_token * get_sampled_tokens() const;
     llama_token   get_sampled_token_ith(int32_t idx);
 
diff --git a/src/llama-graph.h b/src/llama-graph.h
@@ -706,6 +706,13 @@ class llm_graph_result {
     // each llama_decode via llama_mtp_relay_h.
     ggml_tensor * t_inp_h       = nullptr; // [n_embd, n_tokens]
 
+    // For LLM_ARCH_QWEN35_MTP: the MTP block's post-FFN output, before the
+    // shared LM head. Used by chained MTP-K drafting (K > 1): the speculative
+    // wrapper relays this back into t_inp_h for the next draft step so the
+    // chain conditions on the previous step's hidden state, matching how the
+    // MTP head was trained.
+    ggml_tensor * t_mtp_out     = nullptr; // [n_embd, n_tokens]
+
     std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
     std::map<llama_seq_id, ggml_tensor*> t_candidates;
     std::map<llama_seq_id, ggml_tensor*> t_sampled;
diff --git a/src/models/qwen35_mtp.cpp b/src/models/qwen35_mtp.cpp
@@ -150,6 +150,11 @@ llm_build_qwen35_mtp::llm_build_qwen35_mtp(const llama_model & model, const llm_
     cur = ggml_add(ctx0, cur, ffn_residual);
     cb(cur, "mtp_post_ffn", il);
 
+    // Snapshot the MTP block's post-FFN hidden — this is what gets fed back
+    // as the next chain step's t_inp_h for MTP-K drafting (K > 1). Lives on
+    // the output buffer alongside t_logits.
+    res->t_mtp_out = cur;
+
     // Shared final norm + LM head. The MTP block carries its own
     // shared_head_norm; if absent (some converted variants), fall back to the
     // model's output_norm. The LM head is the model's output (or tok_embd if