mtp: stage h relay through set_input for graph-rebuild safety

am17an · am17an · commit b9be09564908 · 2026-04-30T14:51:26.000+08:00
Refactor the h relay path so the device-to-device copy from ctx_target's
t_h_pre_norm into ctx_mtp's t_inp_h happens during set_input on the next
decode, rather than immediately at the relay call. This prepares the
ground for batched MTP prompt prefill, where ctx_mtp has to switch
between n_tokens=N (prefill) and n_tokens=1 (chain step) graphs and the
old "copy now" path could not survive the rebuild — t_inp_h's tensor
identity changes when the graph rebuilds, but the relay had already
written to the prior graph's tensor.

Mechanism:
- llama_context gains an mtp_h_source_t staging slot (ctx_src + src
  tensor + row range), set by llama_mtp_relay_h{,_self} and consumed
  during set_inputs on the next decode.
- llm_graph_input_h_pre_norm now holds a llama_context* and reads the
  staged source in its set_input. The actual
  ggml_backend_tensor_copy_async call lives there (synchronizes ctx_src,
  builds row views with manually-wired buffers, sched-resolves backends
  per side, then async copies). After the copy the staging is cleared so
  a stray decode without a fresh relay call doesn't replay stale data.
- llm_graph_params carries a llama_context* so the graph builder can
  wire it onto the input class. graph_params() in llama-context.cpp
  passes `this`.
- llama_mtp_relay_h gains an n_rows parameter (1 by default for per-
  step drafting; N for an upcoming batched-prefill caller).

No behavior change at K=1/K=2 — relay still fires every draft step,
still copies the same rows. Verified send_req on Qwen3.6-q8_0-mtp:

  K=1: 88.2% accept (187/212), 12.0 tok/s   (was 88%, 12.5)
  K=2: 85.7% accept (252/294), 16.2 tok/s   (was 86%, 16.9)

Within noise — the slight tok/s dip is the extra synchronize + view
allocation per set_input call; trivially recoverable later.

Why this matters: with the relay flowing through set_input, the next
commit can do batched MTP prompt prefill (single n_tokens=N decode)
followed by the existing single-token chain steps without the t_inp_h
identity gymnastics. That fixes the long-context issue where MTP's KV
currently holds only [BOS, draft_1, ..., draft_M] and MTP attention
cannot see prompt context, plus the position drift where MTP applies
RoPE at local positions 1..M+1 while the trunk is at absolute position
N..N+M (for a 4K prompt those rotations diverge enough to wreck
attention quality).
diff --git a/common/speculative.cpp b/common/speculative.cpp
@@ -737,7 +737,7 @@ struct common_speculative_state_mtp : public common_speculative_state {
             int32_t rc_relay;
             if (k == 0) {
                 const int32_t src_row = (last_n_accepted < 0) ? 0 : last_n_accepted;
-                rc_relay = llama_mtp_relay_h(ctx_tgt, ctx_mtp, src_row);
+                rc_relay = llama_mtp_relay_h(ctx_tgt, ctx_mtp, src_row, /*n_rows=*/ 1);
             } else {
                 rc_relay = llama_mtp_relay_h_self(ctx_mtp, /*n_rows=*/ 1);
             }
diff --git a/include/llama.h b/include/llama.h
@@ -988,24 +988,25 @@ extern "C" {
     // hidden state plus a token batch to produce draft logits, with its own KV
     // cache populated by build_attn the same way any other layer's is.
     //
-    // Copies a single row at index `src_row` of ctx_target's t_h_pre_norm into
-    // row 0 of ctx_mtp's t_inp_h. Both backends must be able to issue a copy
-    // between each other (typical case: same device, fast on-device copy).
+    // Stages a copy of n_rows of ctx_target's t_h_pre_norm starting at index
+    // `src_row` into rows [0, n_rows) of ctx_mtp's t_inp_h. The copy is
+    // deferred to the next llama_decode on ctx_mtp — by then the destination
+    // graph has been built and t_inp_h has stable identity. Calling this
+    // function only records the source; the actual device-to-device copy
+    // happens during set_inputs on the next decode.
     //
-    // The right `src_row` for MTP drafting is the row whose hidden produced the
-    // verifier sample that becomes the next draft's id_last. After a verify
-    // batch [sampled, d0, ..., d_{K-1}] with `n_accepted` drafts accepted, that
-    // is `src_row = n_accepted` (the bonus token was sampled from h at row
-    // n_accepted). Using the last row instead silently corrupts MTP whenever
-    // n_accepted < K; the bug is invisible at K=1 most of the time but tanks
-    // K>=2.
+    // For per-step drafting use n_rows=1 with src_row = n_accepted (the row
+    // whose hidden produced the verifier sample that became id_last). For
+    // batched MTP prompt prefill use src_row=0 and n_rows = N (the prompt
+    // length, requiring ctx_target's prompt prefill to have logits=true on
+    // every position so t_h_pre_norm carries all rows).
     //
-    // Returns 0 on success; negative on error (e.g. ctx_target's last decode
-    // didn't produce t_h_pre_norm, src_row out of range, shape mismatch).
+    // Returns 0 on success; negative on error.
     LLAMA_API int32_t llama_mtp_relay_h(
             struct llama_context * ctx_target,
             struct llama_context * ctx_mtp,
-            int32_t                src_row);
+            int32_t                src_row,
+            int32_t                n_rows);
 
     // Self-relay: copy the LAST n_rows of ctx_mtp's most recent t_mtp_out
     // (the MTP block's post-FFN hidden) into the FIRST n_rows of its own
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
@@ -2165,6 +2165,7 @@ llm_graph_params llama_context::graph_params(
         /*.gtype       =*/ gtype,
         /*.sched       =*/ sched.get(),
         /*.backend_cpu =*/ backend_cpu,
+        /*.ctx         =*/ const_cast<llama_context *>(this),
         /*.cvec        =*/ cvec.get(),
         /*.loras       =*/ loras.get(),
         /*.mctx        =*/ mctx,
@@ -3103,6 +3104,14 @@ ggml_tensor * llama_context::get_t_mtp_out() const {
     return gf_res_prev ? gf_res_prev->t_mtp_out : nullptr;
 }
 
+void llama_context::set_mtp_h_source(struct llama_context * ctx_src, ggml_tensor * src,
+                                     int32_t row_first, int32_t n_rows) {
+    mtp_h_staging.ctx_src   = ctx_src;
+    mtp_h_staging.src       = src;
+    mtp_h_staging.row_first = row_first;
+    mtp_h_staging.n_rows    = n_rows;
+}
+
 ggml_tensor * llama_context::get_t_inp_h() const {
     // gf_res_prev->t_inp_h is set by the model's graph builder (e.g.
     // llm_build_qwen35_mtp). After the first real llama_decode it lives there.
@@ -3118,91 +3127,45 @@ ggml_tensor * llama_context::get_t_inp_h() const {
     return nullptr;
 }
 
-// Common implementation: copy a single row at `src_row` of `src` into row 0
-// of `dst`, on-device via ggml_backend_tensor_copy_async. ctx_src/ctx_dst are
-// used to look up backends per tensor and to synchronize the source.
-static int32_t llama_mtp_relay_impl(
+// Helper: validate the source tensor + row range, then stage on ctx_mtp.
+// The actual device-to-device copy is deferred to llm_graph_input_h_pre_norm::
+// set_input on the next decode — by then ctx_mtp's graph is built and t_inp_h
+// has stable identity. Doing the copy immediately would race with later graph
+// rebuilds (e.g. between an n_tokens=N prefill and an n_tokens=1 chain step).
+static int32_t llama_mtp_stage_source(
         struct llama_context * ctx_src,
-        struct llama_context * ctx_dst,
+        struct llama_context * ctx_mtp,
         ggml_tensor          * src,
-        ggml_tensor          * dst,
-        int32_t                src_row,
+        int32_t                row_first,
+        int32_t                n_rows,
         const char           * fn) {
     if (!src) {
         LLAMA_LOG_ERROR("%s: src tensor missing\n", fn);
         return -2;
     }
-    if (!dst) {
-        LLAMA_LOG_ERROR("%s: dst tensor missing (graph not built or wrong arch)\n", fn);
+    if (n_rows <= 0) {
+        LLAMA_LOG_ERROR("%s: n_rows=%d must be > 0\n", fn, n_rows);
         return -3;
     }
-    if (src->ne[0] != dst->ne[0]) {
-        LLAMA_LOG_ERROR("%s: shape mismatch: src n_embd=%" PRId64 ", dst n_embd=%" PRId64 "\n",
-                fn, src->ne[0], dst->ne[0]);
+    if (row_first < 0 || row_first + n_rows > src->ne[1]) {
+        LLAMA_LOG_ERROR("%s: row range [%d, %d) out of src cap %" PRId64 "\n",
+                fn, row_first, row_first + n_rows, src->ne[1]);
         return -4;
     }
-    if (src_row < 0 || src_row >= src->ne[1] || dst->ne[1] < 1) {
-        LLAMA_LOG_ERROR("%s: src_row=%d out of range (src cap=%" PRId64 ", dst cap=%" PRId64 ")\n",
-                fn, src_row, src->ne[1], dst->ne[1]);
-        return -5;
-    }
-
-    // Wait for the source's compute to finish before reading.
-    ctx_src->synchronize();
-
-    // Build views for the row range. ggml_view_2d does not propagate the
-    // parent's backend buffer to the view (it sets view->buffer = NULL and
-    // only forwards view->data + offset), so wire the buffer manually before
-    // passing the views to copy_async.
-    const size_t row_size   = src->nb[1];
-    const int32_t n_rows    = 1;
-    const size_t src_offset = (size_t) src_row * row_size;
-
-    ggml_context_ptr view_ctx;
-    {
-        ggml_init_params params = {
-            /*.mem_size   =*/ ggml_tensor_overhead() * 2,
-            /*.mem_buffer =*/ nullptr,
-            /*.no_alloc   =*/ true,
-        };
-        view_ctx.reset(ggml_init(params));
-        if (!view_ctx) {
-            return -7;
-        }
-    }
-
-    ggml_tensor * src_view = ggml_view_2d(view_ctx.get(), src,
-            src->ne[0], n_rows, src->nb[1], src_offset);
-    ggml_tensor * dst_view = ggml_view_2d(view_ctx.get(), dst,
-            dst->ne[0], n_rows, dst->nb[1], /*offset=*/ 0);
-    src_view->buffer = src->buffer;
-    dst_view->buffer = dst->buffer;
-
-    auto * sched_src = ctx_src->get_sched();
-    auto * sched_dst = ctx_dst->get_sched();
-    auto * backend_src = ggml_backend_sched_get_tensor_backend(sched_src, src);
-    auto * backend_dst = ggml_backend_sched_get_tensor_backend(sched_dst, dst);
-    if (!backend_src || !backend_dst) {
-        LLAMA_LOG_ERROR("%s: backend resolve failed (src=%p dst=%p)\n",
-                fn, (void *) backend_src, (void *) backend_dst);
-        return -8;
-    }
-
-    ggml_backend_tensor_copy_async(backend_src, backend_dst, src_view, dst_view);
+    ctx_mtp->set_mtp_h_source(ctx_src, src, row_first, n_rows);
     return 0;
 }
 
 int32_t llama_mtp_relay_h(
         struct llama_context * ctx_target,
         struct llama_context * ctx_mtp,
-        int32_t                src_row) {
+        int32_t                src_row,
+        int32_t                n_rows) {
     if (!ctx_target || !ctx_mtp) {
         return -1;
     }
-    return llama_mtp_relay_impl(ctx_target, ctx_mtp,
-            ctx_target->get_t_h_pre_norm(),
-            ctx_mtp->get_t_inp_h(),
-            src_row, __func__);
+    return llama_mtp_stage_source(ctx_target, ctx_mtp,
+            ctx_target->get_t_h_pre_norm(), src_row, n_rows, __func__);
 }
 
 int32_t llama_mtp_relay_h_self(
@@ -3211,18 +3174,18 @@ int32_t llama_mtp_relay_h_self(
     if (!ctx_mtp) {
         return -1;
     }
-    // Self-relay: t_mtp_out has shape [n_embd, n_tokens] from the previous
-    // single-token decode, so n_tokens=1 and the only row is 0.
     GGML_UNUSED(n_rows);
+    // Self-relay sources from the LAST row of t_mtp_out (the most recent
+    // chain step's post-FFN hidden). Single row only — t_mtp_out has
+    // shape [n_embd, n_tokens] of the prior decode and we always want row
+    // n_tokens-1 here.
     ggml_tensor * src = ctx_mtp->get_t_mtp_out();
     if (!src) {
         return -2;
     }
-    const int32_t src_row = (int32_t) src->ne[1] - 1;
-    return llama_mtp_relay_impl(ctx_mtp, ctx_mtp,
-            src,
-            ctx_mtp->get_t_inp_h(),
-            src_row, __func__);
+    const int32_t row_first = (int32_t) src->ne[1] - 1;
+    return llama_mtp_stage_source(ctx_mtp, ctx_mtp,
+            src, row_first, /*n_rows=*/ 1, __func__);
 }
 
 void llama_synchronize(llama_context * ctx) {
diff --git a/src/llama-context.h b/src/llama-context.h
@@ -85,6 +85,25 @@ struct llama_context {
     // self-relay copies this into t_inp_h for the next chain step.
     ggml_tensor * get_t_mtp_out() const;
 
+    // MTP h staging — set by llama_mtp_relay_h{,_self}, consumed during the
+    // NEXT llama_decode by llm_graph_input_h_pre_norm::set_input. Stable across
+    // graph rebuilds (lives on the context, not on a per-decode graph result),
+    // which is what lets us survive the n_tokens=N → n_tokens=1 transition
+    // between batched prompt prefill and single-token chain steps. The relay
+    // stages instead of copying immediately because the destination tensor
+    // (t_inp_h) only exists once the next decode builds its graph.
+    struct mtp_h_source_t {
+        struct llama_context * ctx_src   = nullptr; // for synchronize() + sched lookup
+        ggml_tensor          * src       = nullptr; // tensor to copy rows from
+        int32_t                row_first = 0;       // first source row
+        int32_t                n_rows    = 0;       // 0 = no staging, set_input is a no-op
+    };
+
+    void           set_mtp_h_source(struct llama_context * ctx_src, ggml_tensor * src,
+                                    int32_t row_first, int32_t n_rows);
+    mtp_h_source_t get_mtp_h_source() const { return mtp_h_staging; }
+    void           clear_mtp_h_source() { mtp_h_staging = {}; }
+
     llama_token * get_sampled_tokens() const;
     llama_token   get_sampled_token_ith(int32_t idx);
 
@@ -362,4 +381,6 @@ struct llama_context {
     mutable int32_t n_eval   = 0; // number of eval calls
 
     mutable int32_t n_reused = 0; // number of times the previous graph was reused
+
+    mtp_h_source_t mtp_h_staging;
 };
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
@@ -3,6 +3,7 @@
 #include "llama-impl.h"
 #include "llama-model.h"
 #include "llama-batch.h"
+#include "llama-context.h"
 #include "llama-cparams.h"
 
 #include "llama-kv-cache.h"
@@ -97,6 +98,63 @@ bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) {
     return res;
 }
 
+void llm_graph_input_h_pre_norm::set_input(const llama_ubatch * /*ubatch*/) {
+    // Read the staged source from the owning context. The relay function
+    // (llama_mtp_relay_h{,_self}) only RECORDS the source; the actual copy
+    // happens here, after the current graph has been built and `h` has stable
+    // identity. This is what lets us survive graph rebuilds between batched
+    // prefill (n_tokens=N) and single-token chain steps.
+    if (!ctx_mtp || !h) {
+        return;
+    }
+    auto staged = ctx_mtp->get_mtp_h_source();
+    if (!staged.src || staged.n_rows <= 0) {
+        return; // no relay staged for this decode
+    }
+
+    GGML_ASSERT(staged.src->ne[0] == h->ne[0] && "h embd dim mismatch");
+    GGML_ASSERT(staged.row_first >= 0 && staged.row_first + staged.n_rows <= staged.src->ne[1]);
+    GGML_ASSERT(staged.n_rows <= h->ne[1] && "staged n_rows exceeds h capacity");
+
+    // The source ctx may be ctx_mtp itself (self-relay during chained
+    // drafting) or a separate ctx_target. Sync it so its compute is done.
+    staged.ctx_src->synchronize();
+
+    // ggml_view_2d does not propagate the parent's backend buffer onto the
+    // view — it leaves view->buffer == NULL. Wire it manually before passing
+    // the views to copy_async / sched_get_tensor_backend.
+    const size_t row_size = staged.src->nb[1];
+    const size_t src_off  = (size_t) staged.row_first * row_size;
+
+    ggml_init_params init = {
+        /*.mem_size   =*/ ggml_tensor_overhead() * 2,
+        /*.mem_buffer =*/ nullptr,
+        /*.no_alloc   =*/ true,
+    };
+    ggml_context_ptr view_ctx;
+    view_ctx.reset(ggml_init(init));
+    GGML_ASSERT(view_ctx);
+
+    ggml_tensor * src_view = ggml_view_2d(view_ctx.get(), staged.src,
+            staged.src->ne[0], staged.n_rows, row_size, src_off);
+    ggml_tensor * dst_view = ggml_view_2d(view_ctx.get(), h,
+            h->ne[0], staged.n_rows, h->nb[1], /*offset=*/ 0);
+    src_view->buffer = staged.src->buffer;
+    dst_view->buffer = h->buffer;
+
+    auto * sched_src = staged.ctx_src->get_sched();
+    auto * sched_dst = ctx_mtp->get_sched();
+    auto * backend_src = ggml_backend_sched_get_tensor_backend(sched_src, staged.src);
+    auto * backend_dst = ggml_backend_sched_get_tensor_backend(sched_dst, h);
+    GGML_ASSERT(backend_src && backend_dst && "MTP h relay: backend resolve failed");
+
+    ggml_backend_tensor_copy_async(backend_src, backend_dst, src_view, dst_view);
+
+    // Consume the staging so a subsequent decode without a fresh relay call
+    // doesn't accidentally re-copy stale rows.
+    ctx_mtp->clear_mtp_h_source();
+}
+
 void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
     if (ubatch->pos && pos) {
         const int64_t n_tokens = ubatch->n_tokens;
diff --git a/src/llama-graph.h b/src/llama-graph.h
@@ -18,6 +18,7 @@ struct ggml_tensor;
 
 struct llama_cparams;
 struct llama_layer;
+struct llama_context;
 
 struct llama_memory_context_i;
 
@@ -120,23 +121,26 @@ class llm_graph_input_embd : public llm_graph_input_i {
     const int64_t n_embd = 0;
 };
 
-// Graph input for the trunk's pre-output-norm hidden state, passed into a
-// separate ctx_mtp from a target ctx via ggml_backend_tensor_copy_async on
-// each draft step. set_input is a no-op — the tensor data is populated
-// externally (the relay function), not from the ubatch.
+// Graph input for the trunk's pre-output-norm hidden state. Populated during
+// set_input by reading the staged source on the owning llama_context (set by
+// llama_mtp_relay_h{,_self}) and doing a device-to-device copy from those
+// rows into this->h. Going through set_input rather than an immediate copy in
+// the relay function is what lets the relay survive a graph rebuild between
+// different n_tokens (e.g. n_tokens=N prompt prefill → n_tokens=1 chain step):
+// the staging lives on the context, but the destination this->h is whatever
+// tensor the current graph just allocated.
 class llm_graph_input_h_pre_norm : public llm_graph_input_i {
 public:
-    llm_graph_input_h_pre_norm(int64_t n_embd) : n_embd(n_embd) {}
+    llm_graph_input_h_pre_norm(int64_t n_embd, llama_context * ctx_mtp)
+        : n_embd(n_embd), ctx_mtp(ctx_mtp) {}
     virtual ~llm_graph_input_h_pre_norm() = default;
 
-    void set_input(const llama_ubatch * /*ubatch*/) override {
-        // The h tensor is populated by the speculative wrapper before
-        // llama_decode via ggml_backend_tensor_copy_async. Nothing to do here.
-    }
+    void set_input(const llama_ubatch * ubatch) override;
 
     ggml_tensor * h = nullptr; // F32 [n_embd, n_batch]
 
-    const int64_t n_embd = 0;
+    const int64_t   n_embd  = 0;
+    llama_context * ctx_mtp = nullptr; // not owned; used to read staged source
 };
 
 class llm_graph_input_pos : public llm_graph_input_i {
@@ -559,6 +563,11 @@ struct llm_graph_params {
     ggml_backend_sched_t sched;
     ggml_backend_t backend_cpu;
 
+    // Owning context. Currently only consumed by llm_graph_input_h_pre_norm,
+    // which needs to read MTP h-staging state on every set_input. Not used by
+    // can_reuse / allow_reuse — same context across decodes by construction.
+    llama_context * ctx = nullptr;
+
     const llama_adapter_cvec     * cvec;
     const llama_adapter_loras    * loras;
     const llama_memory_context_i * mctx;
diff --git a/src/models/qwen35_mtp.cpp b/src/models/qwen35_mtp.cpp
@@ -46,7 +46,7 @@ llm_build_qwen35_mtp::llm_build_qwen35_mtp(const llama_model & model, const llm_
     // input buffer. Populated externally (no ubatch source) by the speculative
     // wrapper via ggml_backend_tensor_copy_async, using ctx_target's
     // t_h_pre_norm graph output as the source.
-    auto h_inp = std::make_unique<llm_graph_input_h_pre_norm>(hparams.n_embd);
+    auto h_inp = std::make_unique<llm_graph_input_h_pre_norm>(hparams.n_embd, params.ctx);
     h_inp->h  = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens);
     ggml_set_input(h_inp->h);
     ggml_set_name(h_inp->h, "mtp_h_input");

Original file line number	Diff line number	Diff line change
`@@ -737,7 +737,7 @@ struct common_speculative_state_mtp : public common_speculative_state {`
`737`	`737`	`int32_t rc_relay;`
`738`	`738`	`if (k == 0) {`
`739`	`739`	`const int32_t src_row = (last_n_accepted < 0) ? 0 : last_n_accepted;`
`740`		`- rc_relay = llama_mtp_relay_h(ctx_tgt, ctx_mtp, src_row);`
	`740`	`+ rc_relay = llama_mtp_relay_h(ctx_tgt, ctx_mtp, src_row, /*n_rows=*/ 1);`
`741`	`741`	`} else {`
`742`	`742`	`rc_relay = llama_mtp_relay_h_self(ctx_mtp, /*n_rows=*/ 1);`
`743`	`743`	`}`