am17an
diff --git a/common/speculative.cpp b/common/speculative.cpp
Lines changed: 196 additions & 30 deletions
diff --git a/common/speculative.h b/common/speculative.h
Lines changed: 13 additions & 0 deletions
diff --git a/include/llama.h b/include/llama.h
Lines changed: 10 additions & 0 deletions
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
Lines changed: 25 additions & 4 deletions
@@ -154,6 +154,13 @@ struct common_speculative_state {
 
     virtual void accept(uint16_t n_accepted) = 0;
 
+    // Optional hook: invoked by the server after each successful llama_decode
+    // on ctx_tgt. MTP uses it (only when is_prompt_prefill) to mirror the
+    // ubatch into ctx_mtp's KV.
+    virtual void on_target_decoded(const llama_batch & /*batch*/,
+                                   llama_seq_id /*slot_seq_id*/,
+                                   bool /*is_prompt_prefill*/) {}
+
     virtual int32_t n_max(const common_params_speculative & params) const = 0;
     virtual int32_t n_min(const common_params_speculative & params) const = 0;
 };
@@ -642,6 +649,15 @@ struct common_speculative_state_mtp : public common_speculative_state {
     // where ctx_tgt's t_h_pre_norm has only the prompt's last-position row.
     int32_t last_n_accepted = -1;
 
+    // No prompt-prefill accumulator: instead of harvesting trunk h rows into
+    // a host buffer and replaying them in one big MTP decode at begin(), we
+    // do an MTP ubatch decode FROM INSIDE on_target_decoded — i.e. each time
+    // ctx_target finishes a ubatch, we immediately push those rows + tokens
+    // through ctx_mtp. ctx_mtp's KV grows incrementally as the trunk's
+    // prompt prefill progresses, so by the time begin() is called the MTP
+    // KV already covers the full prompt, no matter how many ubatches it
+    // took on the trunk side.
+
     common_speculative_state_mtp(enum common_speculative_type type,
                                  llama_context * ctx_tgt,
                                  llama_context * ctx_mtp)
@@ -651,8 +667,11 @@ struct common_speculative_state_mtp : public common_speculative_state {
         const int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model_mtp));
         logits_buf.resize(n_vocab);
 
-        // Single-token batches drive the MTP draft step.
-        batch = llama_batch_init(/*n_tokens=*/ 1, /*n_embd=*/ 0, /*n_seq_max=*/ 1);
+        // Sized to a full ctx_mtp ubatch: largest case is the prompt-prefill
+        // mirror in on_target_decoded, which can run up to n_ubatch tokens
+        // per chunk; per-step drafts only use 1.
+        const int32_t n_batch_max = (int32_t) llama_n_ubatch(ctx_mtp);
+        batch = llama_batch_init(/*n_tokens=*/ n_batch_max, /*n_embd=*/ 0, /*n_seq_max=*/ 1);
 
         // Warmup decode on ctx_mtp: builds the graph for real (not just reserve)
         // and populates ctx_mtp->gf_res_prev->t_inp_h so the relay function can
@@ -683,30 +702,44 @@ struct common_speculative_state_mtp : public common_speculative_state {
     }
 
     void begin(const llama_tokens & prompt) override {
-        // Reset ctx_mtp's KV. Step 7 will replay the prompt here so MTP
-        // attention has full history before the first draft.
-        llama_memory_clear(llama_get_memory(ctx_mtp), /*data=*/ true);
-
-        // Seed a single token at position 0 so the cache has a "last position"
-        // baseline. M-RoPE's X<Y check fires if a fresh batch tries to start
-        // at the same position the cache just saw, so the first real draft
-        // will land at position 1.
-        const llama_model * model = llama_get_model(ctx_mtp);
-        const llama_token bos     = llama_vocab_bos(llama_model_get_vocab(model));
-        batch.n_tokens     = 1;
-        batch.token[0]     = bos;
-        batch.pos[0]       = 0;
-        batch.n_seq_id[0]  = 1;
-        batch.seq_id[0][0] = 0;
-        batch.logits[0]    = 0;  // we don't need logits from this seed decode
-        const int32_t rc = llama_decode(ctx_mtp, batch);
-        if (rc != 0) {
-            LOG_WRN("%s: ctx_mtp seed decode rc=%d\n", __func__, rc);
-        }
-        mtp_pos = 1;
-        last_n_accepted = -1; // signal "first draft of this generation"
+        // ctx_mtp's KV has been incrementally populated by on_target_decoded
+        // as the trunk processed each prompt-prefill ubatch. By the time
+        // begin() is called, MTP KV covers positions 0..N-1 (matching the
+        // trunk's prompt) — provided the server-side toggle and the
+        // contiguous-rows precondition held. We just need to set up the
+        // tracking state for the first draft.
+        last_n_accepted = -1;
 
-        GGML_UNUSED(prompt);
+        const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_mtp), 0);
+        const int32_t N = (int32_t) prompt.size();
+        LOG_INF("mtp begin: N=%d mtp_pos_max=%d (KV %s)\n",
+                N, (int) pos_max,
+                (pos_max + 1 == N) ? "fully prefilled" :
+                (pos_max < 0)     ? "empty (no prefill)" : "partial");
+
+        if (pos_max < 0) {
+            // No prefill happened (e.g. server toggle off for non-MTP slot,
+            // or contiguous-rows precondition failed). Seed BOS at position
+            // 0 so the first draft has a non-empty KV to attend to. RoPE
+            // will be misaligned with the trunk; for short prompts that is
+            // tolerable, and for long prompts the prefill path is expected
+            // to have populated the KV before begin() is reached.
+            const llama_model * model_mtp = llama_get_model(ctx_mtp);
+            const llama_token  bos = llama_vocab_bos(llama_model_get_vocab(model_mtp));
+            batch.n_tokens     = 1;
+            batch.token[0]     = bos;
+            batch.pos[0]       = 0;
+            batch.n_seq_id[0]  = 1;
+            batch.seq_id[0][0] = 0;
+            batch.logits[0]    = 0;
+            const int32_t rc = llama_decode(ctx_mtp, batch);
+            if (rc != 0) {
+                LOG_WRN("%s: ctx_mtp seed decode rc=%d\n", __func__, rc);
+            }
+            mtp_pos = 1;
+        } else {
+            mtp_pos = pos_max + 1;
+        }
     }
 
     void draft(
@@ -725,6 +758,10 @@ struct common_speculative_state_mtp : public common_speculative_state {
         const int32_t n_vocab = (int32_t) logits_buf.size();
         llama_token cond_tok = id_last;
 
+        const llama_pos pos_max_before = llama_memory_seq_pos_max(llama_get_memory(ctx_mtp), 0);
+        LOG_INF("mtp draft: id_last=%d n_max=%d last_n_accepted=%d mtp_pos_max=%d\n",
+                (int) id_last, (int) n_max, (int) last_n_accepted, (int) pos_max_before);
+
         for (int32_t k = 0; k < n_max; ++k) {
             // Stage h. Step 0: from ctx_tgt's t_h_pre_norm at the row whose
             // hidden produced id_last. After a previous verify [sampled, d0,
@@ -735,14 +772,16 @@ struct common_speculative_state_mtp : public common_speculative_state {
             // ctx_tgt only computed the prompt's last position → row 0.
             // Step k>0: self-relay from ctx_mtp's previous t_mtp_out.
             int32_t rc_relay;
+            int32_t src_row_used = -1;
             if (k == 0) {
-                const int32_t src_row = (last_n_accepted < 0) ? 0 : last_n_accepted;
-                rc_relay = llama_mtp_relay_h(ctx_tgt, ctx_mtp, src_row, /*n_rows=*/ 1);
+                src_row_used = (last_n_accepted < 0) ? 0 : last_n_accepted;
+                rc_relay = llama_mtp_relay_h(ctx_tgt, ctx_mtp, src_row_used, /*n_rows=*/ 1);
             } else {
                 rc_relay = llama_mtp_relay_h_self(ctx_mtp, /*n_rows=*/ 1);
             }
             if (rc_relay != 0) {
-                LOG_DBG("%s: relay rc=%d at k=%d; stopping chain\n", __func__, rc_relay, k);
+                LOG_WRN("%s: relay rc=%d at k=%d (src_row=%d); stopping chain\n",
+                        __func__, rc_relay, k, src_row_used);
                 return;
             }
 
@@ -775,6 +814,8 @@ struct common_speculative_state_mtp : public common_speculative_state {
             for (int i = 1; i < n_vocab; ++i) {
                 if (logits_buf[i] > bv) { bv = logits_buf[i]; best = i; }
             }
+            LOG_INF("mtp draft   k=%d pos=%d cond=%d -> %d (logit=%.2f)\n",
+                    (int) k, (int) pos, (int) cond_tok, best, bv);
             draft_tokens.push_back(best);
             cond_tok = best;
         }
@@ -790,12 +831,14 @@ struct common_speculative_state_mtp : public common_speculative_state {
         // positions from ctx_mtp's KV so the next draft writes K/V at the
         // right slots.
         const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_mtp), 0);
+        const int32_t n_drafted_last = (int32_t) last_n_drafted;
+        const int32_t n_to_drop = std::max(0, n_drafted_last - (int32_t) n_accepted);
+        LOG_INF("mtp accept: n_drafted=%d n_accepted=%d n_to_drop=%d mtp_pos_max=%d\n",
+                n_drafted_last, (int) n_accepted, n_to_drop, (int) pos_max);
         if (pos_max < 0) {
             last_n_accepted = (int32_t) n_accepted;
             return;
         }
-        const int32_t n_drafted_last = (int32_t) last_n_drafted;
-        const int32_t n_to_drop = std::max(0, n_drafted_last - (int32_t) n_accepted);
         if (n_to_drop > 0) {
             const llama_pos drop_from = pos_max - n_to_drop + 1;
             llama_memory_seq_rm(llama_get_memory(ctx_mtp), /*seq_id=*/ 0,
@@ -807,6 +850,116 @@ struct common_speculative_state_mtp : public common_speculative_state {
         last_n_accepted = (int32_t) n_accepted;
     }
 
+    void on_target_decoded(const llama_batch & batch, llama_seq_id slot_seq_id, bool is_prompt_prefill) override {
+        if (!is_prompt_prefill) {
+            return; // verify-batch decodes are owned by the draft path
+        }
+        // Mirror the trunk's just-finished ubatch into ctx_mtp by running one
+        // MTP forward over the same positions. ctx_target's t_h_pre_norm
+        // currently carries one row per output position of THIS ubatch (the
+        // server toggles output=true on every prompt-prefill token for MTP
+        // slots), and its data is still fresh — graph_compute_async finished
+        // before this hook fires.
+        //
+        // Conditions for staging a real prefill MTP decode:
+        //   - we're in prompt prefill, as signalled by is_prompt_prefill
+        //     (the verify decode is handled by draft() itself and was
+        //     filtered out above). The logits flags cannot tell the two
+        //     regimes apart, since both set them on every slot token; below
+        //     we additionally require that the trunk t_h_pre_norm has a
+        //     row for every token of this batch.
+        //   - the slot is single-seq (n_parallel=1 is enforced for MTP).
+        //
+        // For each token at trunk pos p in the slot, we feed (h_p, prompt[p])
+        // to the MTP block at MTP pos p. This is a "no-shift" approximation
+        // — MTP was trained on (h_p, x_{p+1}) → predict x_{p+2}, so feeding
+        // (h_p, x_p) puts slightly off-distribution K/V into MTP's KV, but
+        // the K/V values are at the right positions for attention. The
+        // alternative (proper shift) requires looking ahead to the next
+        // ubatch's first token, which we don't have here.
+        if (batch.n_tokens <= 0) {
+            return;
+        }
+        ggml_tensor * h = llama_context_get_t_h_pre_norm(ctx_tgt);
+        if (!h) {
+            return; // trunk didn't produce t_h_pre_norm this decode
+        }
+        const int64_t n_rows = h->ne[1];
+        if (n_rows < batch.n_tokens) {
+            return; // not all positions have output rows; can't safely match
+        }
+
+        // Filter tokens belonging to this slot, preserving batch order.
+        // For n_parallel=1 every token belongs to the slot; the filter is a
+        // no-op there.
+        struct entry { int batch_idx; int row_idx; };
+        std::vector<entry> mine;
+        mine.reserve(batch.n_tokens);
+        int row_idx = -1;
+        for (int i = 0; i < batch.n_tokens; ++i) {
+            const bool has_out = batch.logits && batch.logits[i];
+            if (has_out) row_idx++;
+            bool is_mine = false;
+            if (batch.n_seq_id && batch.n_seq_id[i] > 0 && batch.seq_id) {
+                for (int j = 0; j < batch.n_seq_id[i]; ++j) {
+                    if (batch.seq_id[i][j] == slot_seq_id) { is_mine = true; break; }
+                }
+            }
+            if (is_mine && has_out && row_idx >= 0 && row_idx < n_rows) {
+                mine.push_back({i, row_idx});
+            }
+        }
+        if (mine.empty()) {
+            return;
+        }
+        // Heuristic: only run prefill if the rows in t_h_pre_norm are
+        // contiguous starting at 0 (they will be when our slot's tokens are
+        // the only ones with output=true). Otherwise we'd need to gather
+        // non-contiguous rows — skip rather than risk wrong h.
+        for (size_t k = 0; k < mine.size(); ++k) {
+            if (mine[k].row_idx != (int) k) {
+                LOG_INF("mtp prefill skip: non-contiguous rows (slot=%d)\n", (int) slot_seq_id);
+                return;
+            }
+        }
+
+        const int n = (int) mine.size();
+        // Run MTP forwards in chunks of at most n_ubatch tokens — single
+        // huge MTP forwards (e.g. 1500-token prompts) exceed compute scratch
+        // and crash in ggml. The KV result is identical regardless of split,
+        // since each chunk attends to all earlier MTP KV positions.
+        const int chunk_max = (int) llama_n_ubatch(ctx_mtp);
+        for (int off = 0; off < n; off += chunk_max) {
+            const int n_chunk = std::min(chunk_max, n - off);
+
+            this->batch.n_tokens = n_chunk;
+            for (int k = 0; k < n_chunk; ++k) {
+                const int bi = mine[off + k].batch_idx;
+                this->batch.token[k]     = batch.token[bi];
+                this->batch.pos[k]       = batch.pos ? batch.pos[bi] : (off + k);
+                this->batch.n_seq_id[k]  = 1;
+                this->batch.seq_id[k][0] = 0;
+                this->batch.logits[k]    = 0;
+            }
+            const int32_t rc_relay = llama_mtp_relay_h(ctx_tgt, ctx_mtp,
+                                                       /*src_row=*/ off, /*n_rows=*/ n_chunk);
+            if (rc_relay != 0) {
+                LOG_WRN("mtp prefill: relay rc=%d (chunk_off=%d, n=%d, slot=%d)\n",
+                        rc_relay, off, n_chunk, (int) slot_seq_id);
+                return;
+            }
+            const int32_t rc = llama_decode(ctx_mtp, this->batch);
+            if (rc != 0) {
+                LOG_WRN("mtp prefill: decode rc=%d (chunk_off=%d, n=%d, slot=%d)\n",
+                        rc, off, n_chunk, (int) slot_seq_id);
+                return;
+            }
+        }
+        const llama_pos new_pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_mtp), 0);
+        LOG_INF("mtp prefill: slot=%d n=%d chunks=%d mtp_pos_max=%d\n",
+                (int) slot_seq_id, n, (n + chunk_max - 1) / chunk_max, (int) new_pos_max);
+    }
+
     int32_t n_max(const common_params_speculative & params) const override {
         return std::max(1, params.draft.n_max);
     }
@@ -1423,6 +1576,19 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted) {
     }
 }
 
+void common_speculative_on_target_decoded(
+        common_speculative * spec,
+        const llama_batch  & batch,
+        llama_seq_id         slot_seq_id,
+        bool                 is_prompt_prefill) {
+    if (!spec) {
+        return;
+    }
+    for (auto & impl : spec->impls) {
+        impl->on_target_decoded(batch, slot_seq_id, is_prompt_prefill);
+    }
+}
+
 int32_t common_speculative_n_max(const common_speculative * spec, const common_params_speculative & params) {
     if (spec == nullptr) {
         return 0;
 
@@ -37,6 +37,19 @@ llama_tokens common_speculative_draft(
 // informs the speculative decoder that n_accepted tokens were accepted by the target model
 void common_speculative_accept(common_speculative * spec, uint16_t n_accepted);
 
+// Notifies the speculative decoder that ctx_tgt just decoded a batch. MTP
+// uses this hook (only when is_prompt_prefill = true) to mirror the just-
+// decoded ubatch into ctx_mtp: the same positions and tokens are decoded
+// on ctx_mtp (in chunks of at most its n_ubatch), so MTP's KV grows
+// incrementally as the trunk's prompt prefill progresses.
+// Pass is_prompt_prefill=false for verify-batch decodes (drafting flow
+// owns those) so MTP's draft-time K/V isn't clobbered.
+void common_speculative_on_target_decoded(
+        common_speculative * spec,
+        const llama_batch  & batch,
+        llama_seq_id         slot_seq_id,
+        bool                 is_prompt_prefill);
+
 int32_t common_speculative_n_max(const common_speculative * spec, const common_params_speculative & params);
 int32_t common_speculative_n_min(const common_speculative * spec, const common_params_speculative & params);
 
 
@@ -988,6 +988,15 @@ extern "C" {
     // hidden state plus a token batch to produce draft logits, with its own KV
     // cache populated by build_attn the same way any other layer's is.
     //
+    // Returns ctx's most recent t_h_pre_norm tensor (the trunk's pre-output-
+    // norm hidden state) for an MTP-enabled trunk arch, or NULL. Used by the
+    // common speculative MTP implementation to read hidden-state rows after
+    // each ubatch during prompt prefill (the trunk's gf_res_prev only
+    // carries the last ubatch's rows, so they are consumed as produced).
+    // The returned tensor's data lives in ctx's compute or output buffer;
+    // call llama_synchronize(ctx) before reading via ggml_backend_tensor_get.
+    LLAMA_API struct ggml_tensor * llama_context_get_t_h_pre_norm(struct llama_context * ctx);
+
     // Stages a copy of n_rows of ctx_target's t_h_pre_norm starting at index
     // `src_row` into rows [0, n_rows) of ctx_mtp's t_inp_h. The copy is
     // deferred to the next llama_decode on ctx_mtp — by then the destination
@@ -1017,6 +1026,7 @@ extern "C" {
             struct llama_context * ctx_mtp,
             int32_t                n_rows);
 
+
     // Set abort callback
     LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
 
 
@@ -3100,18 +3100,39 @@ ggml_tensor * llama_context::get_t_h_pre_norm() const {
     return gf_res_prev ? gf_res_prev->t_h_pre_norm : nullptr;
 }
 
+ggml_tensor * llama_context_get_t_h_pre_norm(struct llama_context * ctx) {
+    return ctx ? ctx->get_t_h_pre_norm() : nullptr;
+}
+
 ggml_tensor * llama_context::get_t_mtp_out() const {
     return gf_res_prev ? gf_res_prev->t_mtp_out : nullptr;
 }
 
 void llama_context::set_mtp_h_source(struct llama_context * ctx_src, ggml_tensor * src,
                                      int32_t row_first, int32_t n_rows) {
-    mtp_h_staging.ctx_src   = ctx_src;
-    mtp_h_staging.src       = src;
-    mtp_h_staging.row_first = row_first;
-    mtp_h_staging.n_rows    = n_rows;
+    GGML_ASSERT(ctx_src && src && n_rows > 0);
+    GGML_ASSERT(row_first >= 0 && row_first + n_rows <= src->ne[1]);
+
+    // Wait for the source's compute to finish before reading its rows.
+    ctx_src->synchronize();
+
+    const size_t row_bytes = src->nb[1];
+    mtp_h_staging.host_buf.resize(row_bytes * (size_t) n_rows);
+    mtp_h_staging.n_rows = n_rows;
+    mtp_h_staging.n_embd = (int32_t) src->ne[0];
+
+    // Synchronous device-to-host of the requested row range.
+    ggml_backend_tensor_get(src, mtp_h_staging.host_buf.data(),
+                            (size_t) row_first * row_bytes,
+                            row_bytes * (size_t) n_rows);
+
+    LLAMA_LOG_DEBUG("mtp_relay stage: src=%s ne=[%lld,%lld] rows=[%d,%d) embd=%d bytes=%zu\n",
+                    src->name, (long long) src->ne[0], (long long) src->ne[1],
+                    row_first, row_first + n_rows, mtp_h_staging.n_embd,
+                    mtp_h_staging.host_buf.size());
 }
 
+
 ggml_tensor * llama_context::get_t_inp_h() const {
     // gf_res_prev->t_inp_h is set by the model's graph builder (e.g.
     // llm_build_qwen35_mtp). After the first real llama_decode it lives there.