Chain MTP

pwilkin · pwilkin · commit 5b4a8f1df7b8 · 2026-05-19T16:26:43.000+02:00
diff --git a/common/speculative.cpp b/common/speculative.cpp
@@ -416,6 +416,12 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
 
     int32_t n_embd = 0;
 
+    // Stacked-MTP draft chain length. Each MTP block k is trained to take the
+    // (k-1)-th block's hidden state and predict the (k+1)-th token ahead, so
+    // we can draft at most `n_mtp_layers` tokens per round before going
+    // out-of-distribution.
+    int32_t n_mtp_layers = 0;
+
     // Per-sequence cross-batch carryover: pair (h_p, x_{p+1}) at MTP pos p+1.
     // The last h-row of one process() call needs the first token of the NEXT
     // call to pair with, so it's stashed here until that next call fires.
@@ -442,7 +448,9 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
         auto * ctx_dft = this->params.ctx_dft;
         GGML_ASSERT(ctx_tgt && ctx_dft && "MTP requires ctx_tgt and ctx_dft to be set");
 
-        n_embd = llama_model_n_embd(llama_get_model(ctx_dft));
+        n_embd       = llama_model_n_embd (llama_get_model(ctx_dft));
+        n_mtp_layers = llama_model_n_nextn(llama_get_model(ctx_dft));
+        GGML_ASSERT(n_mtp_layers > 0 && "MTP draft requires the draft model to declare nextn_predict_layers > 0");
 
         LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__);
         LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd);
@@ -635,9 +643,10 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
             std::memcpy(batch.embd + n_embd*(batch.n_tokens - 1), h_row, row_bytes);
         }
 
-        // First draft step uses the first MTP block (step 0). Archs with a
-        // single MTP block ignore this; multi-block archs (Step-3.5-Flash) use
-        // it to round-robin across their N MTP layers.
+        // Stacked-MTP is a *chain*: block k consumes block (k-1)'s hidden
+        // state and predicts one token further than block (k-1). Step k uses
+        // MTP block k; we cannot exceed `n_mtp_layers` steps without going
+        // out of training distribution.
         llama_set_mtp_step(ctx_dft, 0);
 
         int ret = llama_decode(ctx_dft, batch);
@@ -648,6 +657,10 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
 
         int i = 0;
 
+        // Cap draft depth: never run more MTP chain steps than there are
+        // trained MTP blocks. `params.n_max` may be larger; we just stop.
+        const int n_chain_max = n_mtp_layers;
+
         while (n_drafting > 0) {
             int i_batch = 0;
 
@@ -704,8 +717,16 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
                 break;
             }
 
-            // Step i+1: feed the i-th sampled draft token into the (i+1)-th
-            // MTP block. Multi-block archs round-robin via mtp_step % N.
+            // We just sampled the (i+1)-th draft token (T_{i+1}). If the
+            // chain has no further block (T_{i+2} would need MTP block i+1
+            // which doesn't exist), stop — sampling, not decoding, is what
+            // matters for the drafted result.
+            if (i + 1 >= n_chain_max) {
+                break;
+            }
+
+            // Step i+1: feed the just-sampled draft token T_{i+1} into MTP block i+1.
+            // Direct indexing, no modulo — the chain is bounded by n_mtp_layers.
             llama_set_mtp_step(ctx_dft, (uint32_t)(i + 1));
 
             // evaluate the drafted tokens on the draft model
diff --git a/src/llama-cparams.h b/src/llama-cparams.h
@@ -31,9 +31,12 @@ struct llama_cparams {
     bool embeddings_pre_norm;        // also extract the hidden state before the final output norm
     bool embeddings_pre_norm_masked; // extract for only rows where batch.logits != 0
 
-    // MTP draft-step index, used by archs with num_nextn_predict_layers > 1 to
-    // round-robin across MTP blocks (matches vllm's spec_step_idx). The graph
-    // builder selects `il = n_main + (mtp_step % nextn_predict_layers)`.
+    // MTP draft-chain step index. In stacked-MTP archs (e.g. Step-3.5, with
+    // multiple trained MTP blocks), block `k` takes block (k-1)'s output
+    // hidden state plus the embedding of the token sampled from block (k-1)
+    // and predicts one token further. The chain is bounded — block k is only
+    // valid for step k, with no wrap-around — so the speculative driver caps
+    // the AR loop at `nextn_predict_layers`.
     uint32_t mtp_step;
     bool causal_attn;
     bool offload_kqv;
diff --git a/src/llama-ext.h b/src/llama-ext.h
@@ -85,6 +85,12 @@ using llama_memory_breakdown = std::map<ggml_backend_buffer_type_t, llama_memory
 LLAMA_API int32_t llama_model_n_expert (const struct llama_model * model);
 LLAMA_API int32_t llama_model_n_devices(const struct llama_model * model);
 
+// Number of NextN/MTP prediction blocks (0 if the model has none). For
+// stacked-MTP architectures this caps the maximum useful speculative draft
+// depth: each block is a distinct chain step and they cannot be reused
+// because each block expects the previous block's hidden state as input.
+LLAMA_API int32_t llama_model_n_nextn (const struct llama_model * model);
+
 LLAMA_API ggml_backend_dev_t llama_model_get_device(const struct llama_model * model, int i);
 
 LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx);
@@ -109,8 +115,10 @@ LLAMA_API float * llama_get_embeddings_pre_norm_ith(struct llama_context * ctx,
 // MTP draft-step index (round-robin selector across MTP blocks)
 //
 
-// Set the MTP draft-step index for the next llama_decode call. Used by archs
-// with num_nextn_predict_layers > 1 to round-robin across their MTP blocks
-// (matches vllm's spec_step_idx). Pass step = 0 for the first draft token,
-// step = 1 for the second, etc. The graph builder reads cparams.mtp_step.
+// Set the MTP draft-chain step index for the next llama_decode call. Stacked
+// MTP architectures consume one block per step in chain order — block 0
+// produces logits to sample T_{t+1}, block 1 produces T_{t+2}, etc.
+// `step` must be in [0, llama_model_n_nextn(model)): there is no wrap-around,
+// because block k expects block (k-1)'s output hidden state as input and is
+// out-of-distribution for any other position.
 LLAMA_API void llama_set_mtp_step(struct llama_context * ctx, uint32_t step);
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
@@ -2503,6 +2503,10 @@ int32_t llama_model_n_devices(const struct llama_model * model) {
     return (int32_t)model->devices.size();
 }
 
+int32_t llama_model_n_nextn(const struct llama_model * model) {
+    return (int32_t) model->hparams.nextn_predict_layers; // NextN/MTP block count from hparams, cast to the API's int32_t
+}
+
 ggml_backend_dev_t llama_model_get_device(const struct llama_model * model, int i) {
     if (i < 0 || i >= (int)model->devices.size()) {
         return nullptr;
diff --git a/src/models/step35.cpp b/src/models/step35.cpp
@@ -362,13 +362,15 @@ llama_model_step35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
     : llm_graph_context(params) {
     GGML_ASSERT(hparams.nextn_predict_layers > 0 && "STEP35 MTP requires nextn_predict_layers > 0");
 
-    // Round-robin across MTP blocks at draft step boundaries. Matches vllm's
-    // `current_step_idx = spec_step_idx % num_mtp_layers` (step3p5_mtp.py).
-    // The first MTP block lives at layer index `n_main`; the speculative
-    // driver bumps `cparams.mtp_step` between AR iterations.
-    const int n_main      = (int) hparams.n_layer - (int) hparams.nextn_predict_layers;
-    const int step_offset = (int) (cparams.mtp_step % hparams.nextn_predict_layers);
-    const int il          = n_main + step_offset;
+    // Stacked-MTP is a chain, NOT a round-robin: MTP block k expects the
+    // hidden state output by block k-1 (or the backbone for k=0). Block k can
+    // only predict the (k+1)-th token ahead; at any other step its input is
+    // outside the training distribution. The speculative driver caps the AR
+    // loop at `nextn_predict_layers`; we assert that here.
+    GGML_ASSERT(cparams.mtp_step < hparams.nextn_predict_layers &&
+                "STEP35 MTP: draft step exceeds number of trained MTP blocks (no wrap-around)");
+    const int n_main = (int) hparams.n_layer - (int) hparams.nextn_predict_layers;
+    const int il     = n_main + (int) cparams.mtp_step;
     const auto & layer = model.layers[il];
 
     GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj");