multistep process

pwilkin · pwilkin · commit 5fba55ecac7e · 2026-05-19T16:28:20.000+02:00
diff --git a/common/speculative.cpp b/common/speculative.cpp
@@ -549,10 +549,19 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
 
         const size_t row_bytes = (size_t) n_embd * sizeof(float);
 
+        // Stacked MTP needs each block's KV pre-populated with the chain
+        // context, not just MTP1. For multi-block archs we request logits at
+        // every batch position so the masked-mode hidden-state extraction in
+        // ctx_dft captures every row (we feed those rows into the next chain
+        // step). The extra LM-head matmuls only fire during prefill / verify,
+        // not during drafting, so the cost is bounded.
+        const bool chain_prefill = (n_mtp_layers > 1);
+        const int8_t want_logits = chain_prefill ? 1 : 0;
+
         common_batch_clear(batch);
 
         for (int k = 0; k < n_tokens; ++k) {
-            common_batch_add(batch, batch_in.token[k], batch_in.pos[k], { batch_in.seq_id[k][0] }, 0);
+            common_batch_add(batch, batch_in.token[k], batch_in.pos[k], { batch_in.seq_id[k][0] }, want_logits);
         }
 
         // shift the tgt embeddings to the right by one position
@@ -587,12 +596,97 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
             set_h(i_batch_beg[seq_id], pending_h[seq_id].data());
         }
 
+        llama_set_mtp_step(ctx_dft, 0);
         const int32_t rc = llama_decode(ctx_dft, batch);
         if (rc != 0) {
             LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (pos=%d)\n", __func__, (int) rc, (int) batch_in.pos[0]);
             return false;
         }
 
+        // For stacked MTP, run the remaining chain steps so blocks 2..N also
+        // get their KV slots filled with the verified context. Each step uses:
+        //   token = batch_in.token[i]   (the trunk token following index i-1)
+        //   pos   = batch_in.pos  [i]   (the trunk position following index i-1)
+        //   embd  = prev block's hidden at original batch index i-1
+        // Each step drops the leading position of every sequence (no prior
+        // chain hidden available there), so the deeper blocks lose a few
+        // entries at sequence starts — negligible for prompts of meaningful
+        // length, and correct: those positions never come up during drafting.
+        if (chain_prefill) {
+            // Snapshot the previous step's pre-norm hiddens (initially those of the
+            // step-0 decode above), indexed by original batch_in position so we can chain across the per-seq remap.
+            std::vector<float> prev_hiddens((size_t) n_tokens * n_embd);
+            {
+                const float * h_dft = llama_get_embeddings_pre_norm(ctx_dft);
+                if (h_dft != nullptr) {
+                    std::memcpy(prev_hiddens.data(), h_dft, (size_t) n_tokens * row_bytes);
+                } else {
+                    LOG_WRN("%s: chain prefill skipped (ctx_dft pre-norm embeddings unavailable)\n", __func__);
+                    return true;
+                }
+            }
+
+            for (int step = 1; step < n_mtp_layers; ++step) {
+                common_batch_clear(batch);
+
+                // Maps step-batch index -> original batch_in index so the
+                // next iteration can pick this step's hidden by absolute pos.
+                std::vector<int32_t> step_idx_to_in(n_tokens, -1);
+
+                for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+                    const int32_t beg = i_batch_beg[seq_id];
+                    const int32_t end = i_batch_end[seq_id];
+                    if (beg < 0 || end < 0) {
+                        continue;
+                    }
+                    if (end - beg < step) {
+                        continue; // this seq is shorter than the chain depth so far
+                    }
+
+                    for (int32_t i = beg + step; i <= end; ++i) {
+                        const int32_t prev_idx = i - 1; // MTP_{step-1}'s hidden at the source position
+                        const int32_t step_i   = batch.n_tokens;
+
+                        common_batch_add(batch, batch_in.token[i], batch_in.pos[i], { seq_id }, 1);
+                        std::memcpy(batch.embd + (size_t) step_i * n_embd,
+                                    prev_hiddens.data() + (size_t) prev_idx * n_embd,
+                                    row_bytes);
+
+                        step_idx_to_in[step_i] = i;
+                    }
+                }
+
+                if (batch.n_tokens == 0) {
+                    break;
+                }
+
+                llama_set_mtp_step(ctx_dft, (uint32_t) step);
+                const int32_t rc_step = llama_decode(ctx_dft, batch);
+                if (rc_step != 0) {
+                    LOG_WRN("%s: chain prefill step %d llama_decode failed rc=%d\n", __func__, step, rc_step);
+                    break;
+                }
+
+                // Gather this step's hidden states for the next iteration,
+                // remapped to the original batch_in indexing.
+                if (step + 1 < n_mtp_layers) {
+                    std::vector<float> next_prev((size_t) n_tokens * n_embd, 0.0f);
+                    for (int32_t step_i = 0; step_i < (int32_t) batch.n_tokens; ++step_i) {
+                        const int32_t in_i = step_idx_to_in[step_i];
+                        if (in_i < 0) {
+                            continue;
+                        }
+                        const float * h = llama_get_embeddings_pre_norm_ith(ctx_dft, step_i);
+                        std::memcpy(next_prev.data() + (size_t) in_i * n_embd, h, row_bytes);
+                    }
+                    prev_hiddens = std::move(next_prev);
+                }
+            }
+
+            // Reset so subsequent draft() starts the chain at block 0.
+            llama_set_mtp_step(ctx_dft, 0);
+        }
+
         for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
             if (i_batch_end[seq_id] < 0) {
                 continue;