mtp: copy correct row of t_h_pre_norm based on prior n_accepted

am17an · am17an · commit 183a99c831f0 · 2026-04-30T14:02:38.000+08:00
Real bug fix. Previously llama_mtp_relay_h copied the LAST row of
ctx_target's t_h_pre_norm into ctx_mtp's t_inp_h. That is only correct
when the verifier accepts ALL drafts in the previous round; on partial
acceptance, the row whose hidden produced the next id_last is row
n_accepted, not the last row.

For a verify batch [sampled, d0, ..., d_{K-1}] at positions [p..p+K]:
- bonus = verifier's sample at row n_accepted (the first rejected position,
  or the last row if all K drafts were accepted)
- next id_last lives at position p + n_accepted + 1
- MTP needs h at position p + n_accepted = ROW n_accepted of t_h_pre_norm

The bug was invisible at K=1 in canonical paths (most rounds full-
accept → row K = last row = correct) but degraded acceptance whenever
a draft was rejected. At K>=2, partial-accept dominates and MTP cascades
on wrong h, collapsing acceptance to ~30%.

Changes:
- llama_mtp_relay_h signature: int32_t n_rows → int32_t src_row.
  Copies a single row at the specified index from src into row 0 of dst.
  Caller picks the row.
- llama_mtp_relay_h_self unchanged in semantics — t_mtp_out has only the
  one row produced by the previous chain step's single-token decode.
- common_speculative_state_mtp: track last_n_accepted (set by accept(),
  consumed by next draft()'s k=0 relay). begin() resets it to -1, which
  draft() maps to row 0 before calling the relay (only the prompt's last
  position is in the trunk's outputs after prefill).

Measured on Qwen3.6-q8_0-mtp.gguf, send_req.sh (dense Python code, 400
tokens, temp=1, seed=42):

              before fix             after fix
  K=1   84% accept, 11.4 tok/s   88% accept, 12.5 tok/s
  K=2   30% accept,  9.5 tok/s   86% accept, 16.9 tok/s   (+78%)
  K=3   not viable               73% accept, 17.5 tok/s

K=2 now matches vLLM's documented sweet spot for Qwen3.6 / DeepSeek
MTP on code workloads. K=3 is a marginal win on top.

Architecture confirmation: an independent walk of vLLM's chain code
(SpecDecodeBaseProposer.propose, qwen3_5_mtp.forward) confirms vLLM's
K>1 chain is a pure self-roll on the MTP block's post-residual hidden
with hnorm reapplied each step — the same mechanism this codebase
already implements; the only delta vs vLLM was the row-selection bug
fixed here.
diff --git a/common/speculative.cpp b/common/speculative.cpp
@@ -634,6 +634,14 @@ struct common_speculative_state_mtp : public common_speculative_state {
     // uses this to compute how many trailing positions to roll back.
     uint16_t last_n_drafted = 0;
 
+    // # of drafts the verifier accepted on the most recent round. Used by the
+    // NEXT draft()'s k=0 relay: the row of ctx_tgt's t_h_pre_norm whose hidden
+    // produced the new id_last is exactly `last_n_accepted` (the bonus came
+    // from that row's logits). Using the last row instead silently corrupts
+    // MTP whenever last_n_accepted < n_drafts. -1 = first draft after begin(),
+    // where ctx_tgt's t_h_pre_norm has only the prompt's last-position row.
+    int32_t last_n_accepted = -1;
+
     common_speculative_state_mtp(enum common_speculative_type type,
                                  llama_context * ctx_tgt,
                                  llama_context * ctx_mtp)
@@ -696,6 +704,7 @@ struct common_speculative_state_mtp : public common_speculative_state {
             LOG_WRN("%s: ctx_mtp seed decode rc=%d\n", __func__, rc);
         }
         mtp_pos = 1;
+        last_n_accepted = -1; // signal "first draft of this generation"
 
         GGML_UNUSED(prompt);
     }
@@ -717,13 +726,21 @@ struct common_speculative_state_mtp : public common_speculative_state {
         llama_token cond_tok = id_last;
 
         for (int32_t k = 0; k < n_max; ++k) {
-            // Stage h. Step 0: from ctx_tgt's t_h_pre_norm (the h corresponding
-            // to id_last's position in the trunk). Step k>0: self-relay from
-            // ctx_mtp's previous t_mtp_out (the MTP block's post-FFN h from
-            // the previous chain step).
-            const int32_t rc_relay = (k == 0)
-                ? llama_mtp_relay_h(ctx_tgt, ctx_mtp, /*n_rows=*/ 1)
-                : llama_mtp_relay_h_self(ctx_mtp,    /*n_rows=*/ 1);
+            // Stage h. Step 0: from ctx_tgt's t_h_pre_norm at the row whose
+            // hidden produced id_last. After a previous verify [sampled, d0,
+            // ..., d_{K-1}] with `last_n_accepted` drafts accepted, the bonus
+            // (= new id_last) was sampled from h at row `last_n_accepted`
+            // (rows 0..K of t_h_pre_norm correspond to those K+1 positions).
+            // For the very first draft of a generation (last_n_accepted=-1)
+            // ctx_tgt only computed the prompt's last position → row 0.
+            // Step k>0: self-relay from ctx_mtp's previous t_mtp_out.
+            int32_t rc_relay;
+            if (k == 0) {
+                const int32_t src_row = (last_n_accepted < 0) ? 0 : last_n_accepted;
+                rc_relay = llama_mtp_relay_h(ctx_tgt, ctx_mtp, src_row);
+            } else {
+                rc_relay = llama_mtp_relay_h_self(ctx_mtp, /*n_rows=*/ 1);
+            }
             if (rc_relay != 0) {
                 LOG_DBG("%s: relay rc=%d at k=%d; stopping chain\n", __func__, rc_relay, k);
                 return;
@@ -774,6 +791,7 @@ struct common_speculative_state_mtp : public common_speculative_state {
         // right slots.
         const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_mtp), 0);
         if (pos_max < 0) {
+            last_n_accepted = (int32_t) n_accepted;
             return;
         }
         const int32_t n_drafted_last = (int32_t) last_n_drafted;
@@ -784,6 +802,9 @@ struct common_speculative_state_mtp : public common_speculative_state {
                                 /*p0=*/ drop_from, /*p1=*/ -1);
         }
         last_n_drafted = 0;
+        // Record so the NEXT draft()'s k=0 relay knows which row of ctx_tgt's
+        // t_h_pre_norm to copy.
+        last_n_accepted = (int32_t) n_accepted;
     }
 
     int32_t n_max(const common_params_speculative & params) const override {
diff --git a/include/llama.h b/include/llama.h
@@ -988,18 +988,24 @@ extern "C" {
     // hidden state plus a token batch to produce draft logits, with its own KV
     // cache populated by build_attn the same way any other layer's is.
     //
-    // Copies the LAST n_rows of ctx_target's t_h_pre_norm into the FIRST n_rows
-    // of ctx_mtp's t_inp_h. Typical use: n_rows=1 to feed a single-token MTP
-    // draft step with the most recently produced h row. Both backends must be
-    // able to issue a copy between each other (typical case: same device, fast
-    // on-device copy).
+    // Copies a single row at index `src_row` of ctx_target's t_h_pre_norm into
+    // row 0 of ctx_mtp's t_inp_h. Both backends must be able to issue a copy
+    // between each other (typical case: same device, fast on-device copy).
+    //
+    // The right `src_row` for MTP drafting is the row whose hidden produced the
+    // verifier sample that becomes the next draft's id_last. After a verify
+    // batch [sampled, d0, ..., d_{K-1}] with `n_accepted` drafts accepted, that
+    // is `src_row = n_accepted` (the bonus token was sampled from h at row
+    // n_accepted). Using the last row instead silently corrupts MTP whenever
+    // n_accepted < K; the bug is invisible at K=1 most of the time but tanks
+    // K>=2.
     //
     // Returns 0 on success; negative on error (e.g. ctx_target's last decode
-    // didn't produce t_h_pre_norm, n_rows out of range, shape mismatch).
+    // didn't produce t_h_pre_norm, src_row out of range, shape mismatch).
     LLAMA_API int32_t llama_mtp_relay_h(
             struct llama_context * ctx_target,
             struct llama_context * ctx_mtp,
-            int32_t                n_rows);
+            int32_t                src_row);
 
     // Self-relay: copy the LAST n_rows of ctx_mtp's most recent t_mtp_out
     // (the MTP block's post-FFN hidden) into the FIRST n_rows of its own
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
@@ -3118,15 +3118,15 @@ ggml_tensor * llama_context::get_t_inp_h() const {
     return nullptr;
 }
 
-// Common implementation: copy the LAST n_rows of `src` into the FIRST n_rows
+// Common implementation: copy a single row at `src_row` of `src` into row 0
 // of `dst`, on-device via ggml_backend_tensor_copy_async. ctx_src/ctx_dst are
 // used to look up backends per tensor and to synchronize the source.
 static int32_t llama_mtp_relay_impl(
         struct llama_context * ctx_src,
         struct llama_context * ctx_dst,
         ggml_tensor          * src,
         ggml_tensor          * dst,
-        int32_t                n_rows,
+        int32_t                src_row,
         const char           * fn) {
     if (!src) {
         LLAMA_LOG_ERROR("%s: src tensor missing\n", fn);
@@ -3141,9 +3141,9 @@ static int32_t llama_mtp_relay_impl(
                 fn, src->ne[0], dst->ne[0]);
         return -4;
     }
-    if (n_rows <= 0 || n_rows > src->ne[1] || n_rows > dst->ne[1]) {
-        LLAMA_LOG_ERROR("%s: n_rows=%d out of range (src cap=%" PRId64 ", dst cap=%" PRId64 ")\n",
-                fn, n_rows, src->ne[1], dst->ne[1]);
+    if (src_row < 0 || src_row >= src->ne[1] || dst->ne[1] < 1) {
+        LLAMA_LOG_ERROR("%s: src_row=%d out of range (src cap=%" PRId64 ", dst cap=%" PRId64 ")\n",
+                fn, src_row, src->ne[1], dst->ne[1]);
         return -5;
     }
 
@@ -3155,8 +3155,8 @@ static int32_t llama_mtp_relay_impl(
     // only forwards view->data + offset), so wire the buffer manually before
     // passing the views to copy_async.
     const size_t row_size   = src->nb[1];
-    const int32_t src_first = (int32_t) src->ne[1] - n_rows; // last n_rows of src
-    const size_t src_offset = (size_t) src_first * row_size;
+    const int32_t n_rows    = 1;
+    const size_t src_offset = (size_t) src_row * row_size;
 
     ggml_context_ptr view_ctx;
     {
@@ -3195,14 +3195,14 @@ static int32_t llama_mtp_relay_impl(
 int32_t llama_mtp_relay_h(
         struct llama_context * ctx_target,
         struct llama_context * ctx_mtp,
-        int32_t                n_rows) {
+        int32_t                src_row) {
     if (!ctx_target || !ctx_mtp) {
         return -1;
     }
     return llama_mtp_relay_impl(ctx_target, ctx_mtp,
             ctx_target->get_t_h_pre_norm(),
             ctx_mtp->get_t_inp_h(),
-            n_rows, __func__);
+            src_row, __func__);
 }
 
 int32_t llama_mtp_relay_h_self(
@@ -3211,10 +3211,18 @@ int32_t llama_mtp_relay_h_self(
     if (!ctx_mtp) {
         return -1;
     }
+    // Self-relay: t_mtp_out has shape [n_embd, n_tokens] from the previous
+    // single-token decode, so n_tokens=1 and the only row is 0.
+    GGML_UNUSED(n_rows);
+    ggml_tensor * src = ctx_mtp->get_t_mtp_out();
+    if (!src) {
+        return -2;
+    }
+    const int32_t src_row = (int32_t) src->ne[1] - 1;
     return llama_mtp_relay_impl(ctx_mtp, ctx_mtp,
-            ctx_mtp->get_t_mtp_out(),
+            src,
             ctx_mtp->get_t_inp_h(),
-            n_rows, __func__);
+            src_row, __func__);
 }
 
 void llama_synchronize(llama_context * ctx) {