diff --git a/dflash/CMakeLists.txt b/dflash/CMakeLists.txt index a4bb575f..66f27e9e 100644 --- a/dflash/CMakeLists.txt +++ b/dflash/CMakeLists.txt @@ -126,6 +126,8 @@ add_library(dflash27b STATIC src/laguna_target_graph.cpp src/laguna_daemon.cpp src/sampler.cpp + # Native MTP / NextN helpers + src/f16_convert.cu ) # FlashPrefill custom CUDA kernels need BF16 WMMA (sm_80+). On Turing (sm_75) # the drafter uses ggml's flash_attn_ext instead. Guard added after SM check. @@ -334,6 +336,29 @@ if(DFLASH27B_TESTS) endif() endif() + # Native MTP / NextN: contract test + functional smokes. The contract test + # uses synthetic tensors and runs in CI; the smokes need a real MTP GGUF. + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_mtp_graph_contract.cpp") + add_executable(test_mtp_graph_contract test/test_mtp_graph_contract.cpp) + target_include_directories(test_mtp_graph_contract PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) + target_link_libraries(test_mtp_graph_contract PRIVATE dflash27b ggml ggml-cuda) + endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_mtp_graph.cpp") + add_executable(smoke_mtp_graph test/smoke_mtp_graph.cpp) + target_include_directories(smoke_mtp_graph PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) + target_link_libraries(smoke_mtp_graph PRIVATE dflash27b ggml ggml-cuda) + endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_target_mtp_handoff.cpp") + add_executable(smoke_target_mtp_handoff test/smoke_target_mtp_handoff.cpp) + target_include_directories(smoke_target_mtp_handoff PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) + target_link_libraries(smoke_target_mtp_handoff PRIVATE dflash27b ggml ggml-cuda) + endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_mtp_integrated_decode.cpp") + add_executable(smoke_mtp_integrated_decode test/smoke_mtp_integrated_decode.cpp) + target_include_directories(smoke_mtp_integrated_decode PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) + target_link_libraries(smoke_mtp_integrated_decode PRIVATE dflash27b ggml ggml-cuda) + endif() + # internal.h includes when GGML_USE_CUDA is set; link # CUDA::cudart so the toolkit headers are on the compile line (same as # test_dflash historically had alone). @@ -350,6 +375,10 @@ if(DFLASH27B_TESTS) smoke_target_forward test_generate test_dflash + test_mtp_graph_contract + smoke_mtp_graph + smoke_target_mtp_handoff + smoke_mtp_integrated_decode ) foreach(_t IN LISTS _dflash_internal_h_cuda_tests) if(TARGET ${_t}) diff --git a/dflash/docs/MTP_2026-05-11.md b/dflash/docs/MTP_2026-05-11.md new file mode 100644 index 00000000..04b30316 --- /dev/null +++ b/dflash/docs/MTP_2026-05-11.md @@ -0,0 +1,111 @@ +# Native MTP (NextN) — runtime status, 2026-05-11 + +This document describes the native multi-token prediction (MTP / NextN) +runtime introduced into `dflash` in PR `feat(dflash): native Qwen3.6 MTP +integrated decode`. It tracks the contract, what already works, and what +remains for the next PR before MTP becomes a default-on decode mode. + +## What this PR ships + +- `dflash/src/f16_convert.cu` — small `f16/bf16 → f32` widen kernels used + by both the rollback path and the MTP token-embedding widen. 
+- `dflash/src/internal.h` — new types: + - `TargetNextN`, `TargetMtpLayer` + - `TargetMtpCache` (KV cache for the NextN tail block only) + - `QwenMtpGraphInputs`, `QwenMtpGraphOutputs` + - `expose_pre_norm_hidden` on `QwenGraphInputs` + - `pre_norm_hidden` on `QwenGraphOutputs` + - `TargetWeights::mtp_layers`, `nextn_predict_layers`, `gguf_block_count`, + `tok_embd_gpu` (no fields removed; the trunk API is preserved). +- `dflash/src/qwen35_target_graph.cpp` — four new functions: + - `create_target_mtp_cache` / `free_target_mtp_cache` / `reset_target_mtp_cache` + - `build_qwen35_mtp_graph` — RMSNorm(e) || RMSNorm(h) → `eh_proj` → + full-attention transformer block → SwiGLU FFN → shared head. + Also wires `expose_pre_norm_hidden` into `build_qwen35_graph`. +- `dflash/src/gguf_target_loader.cpp` — reads `qwen35.nextn_predict_layers`, + splits the GGUF blocks into trunk + MTP tail, loads `blk..nextn.*` + tensors into `TargetWeights::mtp_layers`, and uploads `token_embd.weight` + to the GPU when the checkpoint carries MTP (`DFLASH27B_UPLOAD_TOK_EMBD` + env var overrides). +- `dflash/test/test_mtp_graph_contract.cpp` — synthetic-tensor test that + asserts the MTP graph wires together correctly. No GPU model needed; + cheap to run in CI. +- `dflash/test/smoke_mtp_graph.cpp` — loads a real MTP GGUF, builds the + NextN graph for a single token, and validates the output is finite. +- `dflash/test/smoke_target_mtp_handoff.cpp` — loads a real MTP GGUF and + proves that the trunk pre-norm hidden tensor feeds directly into the + MTP block within the same `ggml_cgraph` (no CPU roundtrip required). +- `dflash/test/smoke_mtp_integrated_decode.cpp` — full integrated decode + loop: target greedy + MTP greedy in one graph, with per-step accept / + correct counters. This is the functional baseline the upcoming PR's + speculative loop will be built on top of. + +## GGUF compatibility + +The loader follows the tensor naming convention introduced by llama.cpp's +[MTP PR #22673](https://github.com/ggml-org/llama.cpp/pull/22673). It is +compatible with the reference Qwen3.6-MTP GGUFs published on the Hub: + +- `am17an/Qwen3.6-27B-MTP-GGUF` +- `am17an/Qwen3.6-35BA3B-MTP-GGUF` (MoE — see "MoE limitation" below) +- `havenoammo/Qwen3.6-27B-MTP-UD-GGUF` +- `havenoammo/Qwen3.6-35B-A3B-MTP-GGUF` +- `froggeric/Qwen3.6-27B-MTP-GGUF` + +The expected tail-block tensor names are: + +```text +blk..nextn.eh_proj.weight [2 * hidden, hidden] +blk..nextn.embed_tokens.weight [hidden, vocab] (optional) +blk..nextn.enorm.weight [hidden] +blk..nextn.hnorm.weight [hidden] +blk..nextn.shared_head_head.weight [hidden, vocab] (optional) +blk..nextn.shared_head_norm.weight [hidden] (optional) +``` + +When the optional shared-head tensors are absent the runtime falls back to +the trunk's `output_norm` / `output` (lm_head), matching how am17an's +GGUFs are typically packed. + +## MoE limitation + +`build_qwen35_mtp_graph` currently implements the dense-FFN path only. The +35B-A3B MTP GGUFs require the MoE `TargetLayer` fields and the routed +FFN path that howard0su is upstreaming in +[PR #120 "Qwen3.5 MoE support"](https://github.com/Luce-Org/lucebox-hub/pull/120). +A MoE-aware `build_qwen35_mtp_graph` is a one-line dispatch on top of +this PR once #120 lands. Until then, loading a MoE-MTP GGUF + invoking +the MTP graph returns a clear error rather than producing wrong output. 
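+
+The guard and the shared-head fallback described above reduce to a few lines
+inside `build_qwen35_mtp_graph`. Condensed from `qwen35_target_graph.cpp` in
+this PR (`M` is the `TargetMtpLayer`, `L` its transformer block; error string
+abbreviated here):
+
+```cpp
+// Dense FFN tensors must be present; MoE MTP bails out until #120 lands.
+const bool has_dense_ffn = L.w_gate && L.w_up && L.w_down;
+if (!has_dense_ffn) {
+    set_last_error("build_qwen35_mtp_graph missing loaded MTP tensors (MoE MTP needs #120)");
+    return {};
+}
+
+// Shared LM head: use the NextN tensors when packed, else fall back to the trunk head.
+ggml_tensor * head_norm_w = M.nextn.shared_head_norm ? M.nextn.shared_head_norm : w.out_norm;
+ggml_tensor * head_w      = M.nextn.shared_head_head ? M.nextn.shared_head_head : w.output;
+```
+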
+ +## Why MTP is opt-in, not default-on + +Measured today against `DFlash + PFlash` on the same MTP GGUF with MTP +disabled, on a single RTX 6000 Ada (sm_89), Qwen3.6-27B Q4_K_M target, +`q4_0/q4_0` KV, FA_WINDOW=0, DDTree budget=16, draft feature mirror on: + +| n_gen | Same GGUF, MTP off (tok/s) | Same GGUF, MTP chain-2 (tok/s) | Δ | +|---:|---:|---:|---:| +| 64 | 57.58 | 54.72 | **−5.0%** | +| 128 | 67.58 | 64.23 | **−5.0%** | +| 256 | 60.40 | 82.18 | **+36.1%** | + +What changes between 64 and 256 tokens is that DDTree rounds drop from +roughly 60 → 38 and average tokens committed per draft step rise from +4.27 → 6.74, so the extra MTP forward starts paying for itself. + +This is real but workload-dependent acceleration, not a universal default. +The next PR adds the speculative loop that turns this into a default-on +mode for long generations; today's PR ships only the runtime contract and +the tests that pin it. + +## Known follow-ups (next PR) + +1. Speculative decode loop wiring (`run_mtp_integrated_prompt`, + target-batched verify, fast rollback) inside `test_dflash`. +2. Daemon-side `--mtp-integrated` CLI + metrics surface (`[mtp-daemon]` + line, `last_mtp` aggregated in `prefix_cache.py`). +3. `mtp_baseline_gate.py` published as a reusable parity gate harness. +4. CPU hidden-readback elimination — the current functional smoke still + round-trips token ids through CPU between MTP steps. Removing that is + the highest-value perf fix and is queued behind CUDA-graph capture. +5. MoE MTP path after PR #120 merges. diff --git a/dflash/src/f16_convert.cu b/dflash/src/f16_convert.cu new file mode 100644 index 00000000..49bd309e --- /dev/null +++ b/dflash/src/f16_convert.cu @@ -0,0 +1,49 @@ +// Tiny half-precision → f32 conversion kernels used by the DDtree rollback +// path and the drafter's target_feat widen. We store some tensors +// (ssm_intermediate, target_feat) at 16-bit to halve their memory footprint, +// and widen on read into f32 consumers. +// +// Exposes plain C entry points so test_dflash.cpp can call them without +// pulling in a CUDA compile unit of its own. 
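+//
+// Illustrative call shape (buffer names here are hypothetical; both pointers
+// are device memory holding n_elems elements, and launches are async on `stream`):
+//
+//   dflash27b_launch_f16_to_f32 (d_feat_f16,  d_feat_f32, n_elems, stream);
+//   dflash27b_launch_bf16_to_f32(d_feat_bf16, d_feat_f32, n_elems, stream);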
+ +#include +#include +#include + +static __global__ void f16_to_f32_kernel(const __half * __restrict__ src, + float * __restrict__ dst, + size_t n_elems) { + const size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if (i < n_elems) { + dst[i] = __half2float(src[i]); + } +} + +static __global__ void bf16_to_f32_kernel(const __nv_bfloat16 * __restrict__ src, + float * __restrict__ dst, + size_t n_elems) { + const size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if (i < n_elems) { + dst[i] = __bfloat162float(src[i]); + } +} + +extern "C" void dflash27b_launch_f16_to_f32(const void * src, + void * dst, + size_t n_elems, + cudaStream_t stream) { + const int threads = 256; + const int blocks = (int)((n_elems + threads - 1) / threads); + f16_to_f32_kernel<<>>( + (const __half *)src, (float *)dst, n_elems); +} + +extern "C" void dflash27b_launch_bf16_to_f32(const void * src, + void * dst, + size_t n_elems, + cudaStream_t stream) { + const int threads = 256; + const int blocks = (int)((n_elems + threads - 1) / threads); + bf16_to_f32_kernel<<>>( + (const __nv_bfloat16 *)src, (float *)dst, n_elems); +} diff --git a/dflash/src/gguf_target_loader.cpp b/dflash/src/gguf_target_loader.cpp index f5fde060..38ccfdf9 100644 --- a/dflash/src/gguf_target_loader.cpp +++ b/dflash/src/gguf_target_loader.cpp @@ -273,7 +273,7 @@ bool load_target_gguf_partial(const std::string & path, std::string err; const uint32_t n_embd = get_u32_or(gctx, "qwen35.embedding_length", 0); const uint32_t n_ff = get_u32_or(gctx, "qwen35.feed_forward_length", 0); - const uint32_t n_layer= get_u32_or(gctx, "qwen35.block_count", 0); + const uint32_t n_block= get_u32_or(gctx, "qwen35.block_count", 0); const uint32_t n_head = get_u32_or(gctx, "qwen35.attention.head_count",0); const uint32_t n_headkv=get_u32_or(gctx, "qwen35.attention.head_count_kv",0); const uint32_t kl = get_u32_or(gctx, "qwen35.attention.key_length", 0); @@ -285,21 +285,48 @@ bool load_target_gguf_partial(const std::string & path, const uint32_t ssm_dt = get_u32_or(gctx, "qwen35.ssm.time_step_rank",0); const uint32_t ssm_grp = get_u32_or(gctx, "qwen35.ssm.group_count", 0); - if (n_embd == 0 || n_layer == 0 || n_head == 0 || n_headkv == 0 || + // Native MTP / NextN: zero on non-MTP GGUFs, 1 on the am17an Qwen3.6-MTP + // GGUFs. We treat the last `nextn_predict_layers` blocks as the MTP tail + // and the remaining `block_count - nextn` as the trunk. 
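+    // Example split: with nextn_predict_layers = 1 the last GGUF block becomes
+    // the MTP tail, e.g. blk.64.nextn.* on top of a 64-layer trunk (the layout
+    // that test_mtp_graph_contract.cpp pins with gguf_layer_index = 64).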
+ const uint32_t nextn_predict_layers = get_u32_or(gctx, "qwen35.nextn_predict_layers", 0); + + if (n_embd == 0 || n_block == 0 || n_head == 0 || n_headkv == 0 || kl == 0 || vl == 0 || n_ff == 0 || fai == 0 || ssm_conv == 0 || ssm_inner == 0 || ssm_state == 0 || ssm_dt == 0 || ssm_grp == 0) { char buf[512]; std::snprintf(buf, sizeof(buf), - "missing or zero hparams: n_embd=%u n_layer=%u n_head=%u n_head_kv=%u " + "missing or zero hparams: n_embd=%u n_block=%u n_head=%u n_head_kv=%u " "kl=%u vl=%u n_ff=%u fai=%u ssm{conv=%u inner=%u state=%u dt=%u grp=%u}", - n_embd, n_layer, n_head, n_headkv, kl, vl, n_ff, fai, + n_embd, n_block, n_head, n_headkv, kl, vl, n_ff, fai, ssm_conv, ssm_inner, ssm_state, ssm_dt, ssm_grp); set_last_error(buf); gguf_free(gctx); return false; } + if (nextn_predict_layers > n_block) { + char buf[160]; + std::snprintf(buf, sizeof(buf), + "nextn_predict_layers=%u exceeds block_count=%u", + nextn_predict_layers, n_block); + set_last_error(buf); + gguf_free(gctx); return false; + } + if (nextn_predict_layers > 1) { + char buf[160]; + std::snprintf(buf, sizeof(buf), + "nextn_predict_layers=%u not supported yet (loader supports 0 or 1)", + nextn_predict_layers); + set_last_error(buf); + gguf_free(gctx); return false; + } + const uint32_t n_layer = n_block - nextn_predict_layers; + if (n_layer == 0) { + set_last_error("no trunk layers left after subtracting nextn_predict_layers"); + gguf_free(gctx); return false; + } + // Structural invariants required by the graph builder. if (kl != vl) { set_last_error("key_length != value_length not supported"); @@ -312,8 +339,10 @@ bool load_target_gguf_partial(const std::string & path, gguf_free(gctx); return false; } if (n_layer % fai != 0) { - char buf[128]; - std::snprintf(buf, sizeof(buf), "block_count=%u not divisible by full_attention_interval=%u", n_layer, fai); + char buf[160]; + std::snprintf(buf, sizeof(buf), + "trunk layer count=%u (block_count=%u nextn=%u) not divisible by full_attention_interval=%u", + n_layer, n_block, nextn_predict_layers, fai); set_last_error(buf); gguf_free(gctx); return false; } @@ -364,13 +393,15 @@ bool load_target_gguf_partial(const std::string & path, TargetLoadPlan plan = plan_in; if (plan.layer_begin < 0) plan.layer_begin = 0; - if (plan.layer_end < 0) plan.layer_end = (int)n_layer; + // Default end covers trunk + MTP/NextN tail so blk..* + // tensors are uploaded when nextn_predict_layers > 0. 
+ if (plan.layer_end < 0) plan.layer_end = (int)n_block; if (plan.layer_begin > plan.layer_end || - plan.layer_end > (int)n_layer) { + plan.layer_end > (int)n_block) { char buf[160]; std::snprintf(buf, sizeof(buf), - "invalid target load layer range [%d,%d) for n_layer=%u", - plan.layer_begin, plan.layer_end, n_layer); + "invalid target load layer range [%d,%d) for n_block=%u", + plan.layer_begin, plan.layer_end, n_block); set_last_error(buf); gguf_free(gctx); return false; @@ -379,6 +410,8 @@ bool load_target_gguf_partial(const std::string & path, out.ctx = meta_ctx; out.backend = backend; out.n_layer = (int)n_layer; + out.gguf_block_count = (int)n_block; + out.nextn_predict_layers = (int)nextn_predict_layers; out.n_embd = (int)n_embd; out.n_ff = (int)n_ff; out.n_head = (int)n_head; @@ -392,6 +425,7 @@ bool load_target_gguf_partial(const std::string & path, out.ssm_d_state= (int)ssm_state; out.ssm_dt_rank= (int)ssm_dt; out.ssm_n_group= (int)ssm_grp; + out.mtp_layers.assign((size_t)nextn_predict_layers, TargetMtpLayer{}); // EOS token ids from GGUF tokenizer metadata (stored as UINT32 by the // GGUF spec; we use the u32 helper and cast). UINT32_MAX is the @@ -491,6 +525,77 @@ bool load_target_gguf_partial(const std::string & path, } } + // ── 2b. Wire MTP / NextN tail blocks (Qwen3.6-MTP GGUFs) ───────── + // GGUF block index for MTP layer `mi` is (n_layer + mi). Each MTP block + // ships a regular full-attention transformer (no DeltaNet) plus the + // NextN-specific projections (eh_proj, enorm, hnorm, optional shared head). + for (int mi = 0; mi < (int)nextn_predict_layers; mi++) { + const int il = (int)n_layer + mi; + char name[128]; + auto fnd = [&](const char * suffix) -> ggml_tensor * { + std::snprintf(name, sizeof(name), "blk.%d.%s", il, suffix); + return ggml_get_tensor(meta_ctx, name); + }; + TargetMtpLayer & M = out.mtp_layers[(size_t)mi]; + M.gguf_layer_index = il; + TargetLayer & L = M.block; + + L.attn_norm = fnd("attn_norm.weight"); + L.attn_post_norm = fnd("post_attention_norm.weight"); + L.w_gate = fnd("ffn_gate.weight"); + L.w_up = fnd("ffn_up.weight"); + L.w_down = fnd("ffn_down.weight"); + L.wq = fnd("attn_q.weight"); + L.wk = fnd("attn_k.weight"); + L.wv = fnd("attn_v.weight"); + L.wo = fnd("attn_output.weight"); + L.q_norm = fnd("attn_q_norm.weight"); + L.k_norm = fnd("attn_k_norm.weight"); + + M.nextn.eh_proj = fnd("nextn.eh_proj.weight"); + M.nextn.embed_tokens = fnd("nextn.embed_tokens.weight"); + M.nextn.enorm = fnd("nextn.enorm.weight"); + M.nextn.hnorm = fnd("nextn.hnorm.weight"); + M.nextn.shared_head_head = fnd("nextn.shared_head_head.weight"); + M.nextn.shared_head_norm = fnd("nextn.shared_head_norm.weight"); + + const bool has_attn = L.wq && L.wk && L.wv && L.wo && L.q_norm && L.k_norm; + if (!L.attn_norm || !L.attn_post_norm || !has_attn) { + char b[160]; + std::snprintf(b, sizeof(b), + "mtp layer %d: missing full-attention tensors", il); + set_last_error(b); + gguf_free(gctx); + return false; + } + if (!L.w_gate || !L.w_up || !L.w_down) { + char b[160]; + std::snprintf(b, sizeof(b), + "mtp layer %d: missing required FFN tensors", il); + set_last_error(b); + gguf_free(gctx); + return false; + } + if (!M.nextn.eh_proj || !M.nextn.enorm || !M.nextn.hnorm) { + char b[160]; + std::snprintf(b, sizeof(b), + "mtp layer %d: missing required nextn tensors " + "(eh_proj/enorm/hnorm)", il); + set_last_error(b); + gguf_free(gctx); + return false; + } + } + + // Plain target decode still embeds on CPU. 
Native MTP needs device-side + // token lookup to chain proposals inside one graph, so MTP-enabled + // checkpoints upload token_embd to the GPU as a regular weight. + bool upload_tok_embd = nextn_predict_layers > 0; + if (const char * s = std::getenv("DFLASH27B_UPLOAD_TOK_EMBD")) { + upload_tok_embd = std::atoi(s) != 0; + } + out.tok_embd_gpu = upload_tok_embd; + // 3. Allocate CUDA buffer only for the selected target tensors. ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend); const size_t alignment = ggml_backend_buft_get_alignment(buft); @@ -500,7 +605,11 @@ bool load_target_gguf_partial(const std::string & path, for (int64_t tid = 0; tid < n_tensors; tid++) { const char * tname = gguf_get_tensor_name(gctx, tid); ggml_tensor * t = ggml_get_tensor(meta_ctx, tname); - if (!t || !should_load_target_tensor(tname, plan.layer_begin, plan.layer_end, plan.load_output)) { + if (!t) continue; + const bool is_tok_embd = (std::strcmp(tname, "token_embd.weight") == 0); + const bool selected_by_plan = + should_load_target_tensor(tname, plan.layer_begin, plan.layer_end, plan.load_output); + if (!selected_by_plan && !(is_tok_embd && upload_tok_embd)) { continue; } alloc_total = align_up_size(alloc_total, alignment); @@ -559,10 +668,15 @@ bool load_target_gguf_partial(const std::string & path, return false; } if (std::string(tname) == "token_embd.weight") { - // Remember offset + size for the CPU embedder; don't upload to GPU. + // Remember offset + size for the CPU embedder regardless of GPU + // upload — MTP still needs the CPU mmap for tokenizer-side lookups. tok_embd_off = off; tok_embd_sz = sz; tok_embd_type = gguf_get_tensor_type(gctx, tid); + if (!upload_tok_embd) continue; + // MTP path: also stream the bytes into the GPU-resident tensor. + ggml_backend_tensor_set(t, (const uint8_t *)mm.addr + off, 0, sz); + total += sz; continue; } if (!should_load_target_tensor(tname, plan.layer_begin, plan.layer_end, plan.load_output)) { @@ -597,12 +711,16 @@ bool load_target_gguf_partial(const std::string & path, mm.release(); // don't munmap on Mmap dtor — now owned by the embedder // Stash the total for callers that want to print it - char summary[192]; + char summary[256]; std::snprintf(summary, sizeof(summary), - "target loaded: layers [%d,%d) output=%d, %zu tensors on GPU %.2f GiB, tok_embd %.0f MiB CPU-only (%s)", + "target loaded: layers [%d,%d) output=%d, %zu tensors on GPU %.2f GiB, " + "tok_embd %.0f MiB %s (%s), trunk_layers=%d nextn=%d", plan.layer_begin, plan.layer_end, (int)plan.load_output, allocs.size(), total / (1024.0 * 1024.0 * 1024.0), - tok_embd_sz / (1024.0 * 1024.0), ggml_type_name(tok_embd_type)); + tok_embd_sz / (1024.0 * 1024.0), + upload_tok_embd ? "GPU+CPU" : "CPU-only", + ggml_type_name(tok_embd_type), + out.n_layer, out.nextn_predict_layers); set_last_error(summary); return true; @@ -613,7 +731,11 @@ void free_target_weights(TargetWeights & w) { if (w.ctx) { ggml_free(w.ctx); w.ctx = nullptr; } // CpuEmbedder destructor handles the mmap automatically. w.layers.clear(); + w.mtp_layers.clear(); + w.nextn_predict_layers = 0; + w.gguf_block_count = 0; w.tok_embd = nullptr; + w.tok_embd_gpu = false; w.out_norm = nullptr; w.output = nullptr; } diff --git a/dflash/src/internal.h b/dflash/src/internal.h index b9cc88d5..945e8611 100644 --- a/dflash/src/internal.h +++ b/dflash/src/internal.h @@ -73,6 +73,34 @@ struct TargetLayer { ggml_tensor * ssm_out = nullptr; // output projection after delta-net }; +// Qwen3.5/3.6 NextN / MTP tail block tensors. 
These live in the tail +// `nextn_predict_layers` of the GGUF (one such block in Qwen3.6-MTP). +// Follow the tensor naming convention introduced by llama.cpp PR #22673: +// blk..nextn.eh_proj [2*hidden, hidden] +// blk..nextn.embed_tokens [hidden, vocab] (optional) +// blk..nextn.enorm [hidden] +// blk..nextn.hnorm [hidden] +// blk..nextn.shared_head_head [hidden, vocab] (optional, falls back to w.output) +// blk..nextn.shared_head_norm [hidden] (optional, falls back to w.out_norm) +struct TargetNextN { + ggml_tensor * eh_proj = nullptr; + ggml_tensor * embed_tokens = nullptr; + ggml_tensor * enorm = nullptr; + ggml_tensor * hnorm = nullptr; + ggml_tensor * shared_head_head = nullptr; + ggml_tensor * shared_head_norm = nullptr; +}; + +// One MTP / NextN layer in the GGUF tail. Holds the regular transformer +// block tensors (full-attention only — no DeltaNet on MTP) plus the +// NextN-specific projections above. The trunk decoder's pre-norm hidden +// state is fed into this block to produce the MTP draft logits. +struct TargetMtpLayer { + TargetLayer block; + TargetNextN nextn; + int gguf_layer_index = -1; +}; + // CPU-side embedder: keeps a mmap of the GGUF alive and knows how to // dequantize individual rows of the quantized tok_embd tensor on demand. // This matches llama.cpp's behavior of running embedding get_rows on CPU @@ -108,7 +136,11 @@ struct TargetWeights { CpuEmbedder embedder; ggml_tensor * tok_embd = nullptr; // [hidden, vocab] (metadata only; data NOT on GPU) - std::vector layers; // size = 64 + bool tok_embd_gpu = false; // true when token_embd bytes were uploaded for GPU get_rows. + // Required by MTP because the integrated decode path needs + // device-side token lookup to chain proposals within a graph. + std::vector layers; // trunk layers only, excludes any nextn/MTP tail blocks + std::vector mtp_layers; // size = nextn_predict_layers (0 for non-MTP GGUFs) ggml_tensor * out_norm = nullptr; // [hidden] ggml_tensor * output = nullptr; // [hidden, vocab] (lm_head) @@ -119,7 +151,9 @@ struct TargetWeights { int n_embd_head_v = 256; // value_length int n_head = 24; int n_head_kv = 4; - int n_layer = 64; + int gguf_block_count = 64; // raw qwen35.block_count from the GGUF + int nextn_predict_layers = 0; // qwen35.nextn_predict_layers (0 = non-MTP GGUF) + int n_layer = 64; // trunk layer count: gguf_block_count - nextn_predict_layers int n_embd = 5120; int n_ff = 17408; int ssm_d_conv = 4; @@ -413,6 +447,36 @@ bool migrate_prefill_cache(const TargetWeights & w, ggml_backend_t backend, TargetCache & cache); +// ─── Native MTP / NextN cache ───────────────────────────────────── +// +// Qwen3.5/3.6 native multi-token prediction keeps a tiny KV cache for the +// tail NextN block(s) only — the trunk decoder retains its own TargetCache +// above. Matches the "kv_only_nextn" contract used by llama.cpp PR #22673 +// and llama-crucible's MTP cache layout. 
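+//
+// Typical lifecycle (sketch; the smoke tests in this PR follow this shape):
+//   TargetMtpCache c;
+//   create_target_mtp_cache(w, max_ctx, backend, c);  // one K/V pair per mtp_layers entry
+//   // per step: build_qwen35_mtp_graph(...) with in.kv_start at the committed
+//   // position, then the caller advances c.cur_pos (see smoke_mtp_integrated_decode.cpp)
+//   reset_target_mtp_cache(c);  // zeroes the K/V tensors and rewinds cur_pos
+//   free_target_mtp_cache(c);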
+struct TargetMtpCache { + ggml_context * ctx = nullptr; + ggml_backend_buffer_t buf = nullptr; + ggml_backend_t backend = nullptr; + + int max_ctx = 0; + int cur_pos = 0; + + ggml_type kv_k_type = GGML_TYPE_Q8_0; + ggml_type kv_v_type = GGML_TYPE_Q8_0; + + std::vector attn_k; // one per TargetWeights::mtp_layers entry + std::vector attn_v; +}; + +bool create_target_mtp_cache(const TargetWeights & w, + int max_ctx, + ggml_backend_t backend, + TargetMtpCache & out); + +void free_target_mtp_cache(TargetMtpCache & c); + +void reset_target_mtp_cache(TargetMtpCache & c); + // ─── Target forward graph ───────────────────────────────────────── // Per-delta-net-layer pointers exposed by the graph for spec-decode rollback. @@ -443,6 +507,7 @@ struct QwenGraphInputs { int kv_start; // position where the new tokens begin bool capture_layers; // if true, write captured layer features into cache.target_feat bool capture_delta_intermediate = false; // if true, populate out_delta_captures + bool expose_pre_norm_hidden = false; // if true, keep final pre-norm hidden for MTP handoff int fa_window = 0; // sliding window for FA layers: 0 = full attention bool last_token_logits_only = false; // if true, only compute logits for last token (prefill optimization) ggml_tensor * parent_ids = nullptr; // [n_tokens] i32; tree mode when non-null @@ -450,6 +515,11 @@ struct QwenGraphInputs { struct QwenGraphOutputs { ggml_tensor * logits; // [vocab, n_tokens] f32 + // Final hidden state before the target output norm. Populated when + // QwenGraphInputs::expose_pre_norm_hidden is true. Used as the + // `t_h_pre_norm` handoff into the native NextN/MTP block — matches the + // convention from llama-crucible MTP. + ggml_tensor * pre_norm_hidden = nullptr; // [hidden, n_tokens] f32 // One entry per delta-net layer (48 for qwen35-27b). Only populated when // QwenGraphInputs::capture_delta_intermediate is true. Tensors are graph // views marked as ggml_set_output() so their data persists after @@ -464,6 +534,38 @@ QwenGraphOutputs build_qwen35_graph( TargetCache & cache, const QwenGraphInputs & in); +// ─── Native MTP / NextN forward graph ───────────────────────────── +// +// Single-layer NextN/MTP block. Consumes the trunk decoder's pre-norm hidden +// state (`pre_norm_hidden`) plus the embedding of the current token, runs +// the NextN concat → eh_proj → transformer-block → shared-head pipeline, +// and returns the MTP draft logits + the post-block hidden state. +// +// Today this PR implements the dense-FFN path only; MoE MTP requires the +// MoE TargetLayer fields landing first (see PR #120 "Qwen3.5 MoE support"). +struct QwenMtpGraphInputs { + ggml_tensor * token_embed; // [hidden, n_tokens] f32; embedding of the current token(s) + ggml_tensor * pre_norm_hidden; // [hidden, n_tokens] f32 from trunk output + ggml_tensor * positions; // [4 * n_tokens] i32 + ggml_tensor * attn_mask; // optional [kv_len, n_tokens_padded] f32 + int n_tokens = 1; + int kv_start = 0; + int mtp_layer_index = 0; + int fa_window = 0; +}; + +struct QwenMtpGraphOutputs { + ggml_tensor * logits = nullptr; // [vocab, n_tokens] f32 + ggml_tensor * hidden = nullptr; // [hidden, n_tokens] f32, post-MTP block +}; + +QwenMtpGraphOutputs build_qwen35_mtp_graph( + ggml_context * ctx, + ggml_cgraph * gf, + const TargetWeights & w, + TargetMtpCache & cache, + const QwenMtpGraphInputs & in); + // Build a single-layer forward graph. Mirrors build_qwen35_graph but processes // only one layer, taking `inp` as the input activation and returning the output. 
// Used by layer-segmented prefill to iterate layers as the outer loop. diff --git a/dflash/src/qwen35_target_graph.cpp b/dflash/src/qwen35_target_graph.cpp index 47b989ff..039a9cf7 100644 --- a/dflash/src/qwen35_target_graph.cpp +++ b/dflash/src/qwen35_target_graph.cpp @@ -1358,6 +1358,17 @@ QwenGraphOutputs build_qwen35_graph( inpL = cur; } + // Expose the final pre-norm hidden state so the native NextN/MTP block can + // consume it as `t_h_pre_norm`. Marked as graph output so its data persists + // after graph_compute. Caller threads it into build_qwen35_mtp_graph. + QwenGraphOutputs og = std::move(og_early); + if (in.expose_pre_norm_hidden) { + ggml_set_name(inpL, "target_pre_norm_hidden"); + ggml_set_output(inpL); + ggml_build_forward_expand(gf, inpL); + og.pre_norm_hidden = inpL; + } + // 2. Final norm ggml_tensor * out = rms_norm_mul(ctx, inpL, w.out_norm, EPS); @@ -1373,7 +1384,6 @@ QwenGraphOutputs build_qwen35_graph( ggml_build_forward_expand(gf, logits); - QwenGraphOutputs og = std::move(og_early); og.logits = logits; return og; } @@ -1396,4 +1406,196 @@ ggml_tensor * build_qwen35_layer( attn_mask, kv_start, n_tokens, capture, fa_window); } +// ─── Native MTP / NextN cache and graph ─────────────────────────────── + +bool create_target_mtp_cache(const TargetWeights & w, + int max_ctx, + ggml_backend_t backend, + TargetMtpCache & out) { + if (w.mtp_layers.empty()) { + set_last_error("create_target_mtp_cache requires TargetWeights::mtp_layers"); + return false; + } + if (max_ctx <= 0) { + set_last_error("create_target_mtp_cache requires max_ctx > 0"); + return false; + } + + out.backend = backend; + out.max_ctx = max_ctx; + out.cur_pos = 0; + + ggml_type kv_k_type = GGML_TYPE_Q8_0; + ggml_type kv_v_type = GGML_TYPE_Q8_0; + dflash::resolve_kv_types(kv_k_type, kv_v_type); + out.kv_k_type = kv_k_type; + out.kv_v_type = kv_v_type; + const int max_ctx_alloc = (kv_k_type == GGML_TYPE_TQ3_0 || kv_v_type == GGML_TYPE_TQ3_0) + ? 
((max_ctx + 255) / 256) * 256 + : max_ctx; + + const int n_mtp = (int)w.mtp_layers.size(); + out.attn_k.assign(n_mtp, nullptr); + out.attn_v.assign(n_mtp, nullptr); + + ggml_init_params ip{}; + ip.mem_size = (size_t)(2 * n_mtp + 16) * ggml_tensor_overhead(); + ip.mem_buffer = nullptr; + ip.no_alloc = true; + out.ctx = ggml_init(ip); + if (!out.ctx) { + set_last_error("mtp cache ggml_init failed"); + return false; + } + + for (int mi = 0; mi < n_mtp; mi++) { + ggml_tensor * K = ggml_new_tensor_3d(out.ctx, kv_k_type, + w.n_embd_head_k, max_ctx_alloc, w.n_head_kv); + ggml_tensor * V = ggml_new_tensor_3d(out.ctx, kv_v_type, + w.n_embd_head_k, max_ctx_alloc, w.n_head_kv); + char name[64]; + std::snprintf(name, sizeof(name), "mtp_cache_k_%d", mi); + ggml_set_name(K, name); + std::snprintf(name, sizeof(name), "mtp_cache_v_%d", mi); + ggml_set_name(V, name); + out.attn_k[mi] = K; + out.attn_v[mi] = V; + } + + out.buf = ggml_backend_alloc_ctx_tensors(out.ctx, backend); + if (!out.buf) { + set_last_error("ggml_backend_alloc_ctx_tensors failed for mtp cache"); + ggml_free(out.ctx); + out.ctx = nullptr; + out.attn_k.clear(); + out.attn_v.clear(); + return false; + } + + reset_target_mtp_cache(out); + return true; +} + +void free_target_mtp_cache(TargetMtpCache & c) { + if (c.buf) { ggml_backend_buffer_free(c.buf); c.buf = nullptr; } + if (c.ctx) { ggml_free(c.ctx); c.ctx = nullptr; } + c.attn_k.clear(); + c.attn_v.clear(); + c.max_ctx = 0; + c.cur_pos = 0; +} + +void reset_target_mtp_cache(TargetMtpCache & c) { + c.cur_pos = 0; + std::vector zeros(1 * 1024 * 1024, 0); + if (!c.ctx) return; + for (ggml_tensor * t = ggml_get_first_tensor(c.ctx); t != nullptr; + t = ggml_get_next_tensor(c.ctx, t)) { + size_t nb = ggml_nbytes(t); + size_t off = 0; + while (off < nb) { + size_t chunk = std::min(nb - off, zeros.size()); + ggml_backend_tensor_set(t, zeros.data(), off, chunk); + off += chunk; + } + } +} + +QwenMtpGraphOutputs build_qwen35_mtp_graph( + ggml_context * ctx, + ggml_cgraph * gf, + const TargetWeights & w, + TargetMtpCache & cache, + const QwenMtpGraphInputs & in) { + + if (w.mtp_layers.empty()) { + set_last_error("build_qwen35_mtp_graph requires TargetWeights::mtp_layers"); + return {}; + } + if (in.mtp_layer_index < 0 || in.mtp_layer_index >= (int)w.mtp_layers.size()) { + set_last_error("build_qwen35_mtp_graph mtp_layer_index out of range"); + return {}; + } + if (!in.token_embed || !in.pre_norm_hidden || !in.positions) { + set_last_error("build_qwen35_mtp_graph missing required input tensor"); + return {}; + } + if ((int)cache.attn_k.size() <= in.mtp_layer_index || + (int)cache.attn_v.size() <= in.mtp_layer_index || + !cache.attn_k[in.mtp_layer_index] || !cache.attn_v[in.mtp_layer_index]) { + set_last_error("build_qwen35_mtp_graph missing MTP KV cache tensors"); + return {}; + } + + const int n_tokens = std::max(1, in.n_tokens); + const TargetMtpLayer & M = w.mtp_layers[(size_t)in.mtp_layer_index]; + const TargetLayer & L = M.block; + + // Dense FFN only — MoE MTP requires the MoE TargetLayer fields landing + // first (see PR #120 "Qwen3.5 MoE support"). For non-MoE GGUFs this is + // the production path and matches the am17an reference layout. 
+ const bool has_dense_ffn = L.w_gate && L.w_up && L.w_down; + if (!M.nextn.eh_proj || !M.nextn.enorm || !M.nextn.hnorm || + !L.attn_norm || !L.attn_post_norm || + !L.wq || !L.wk || !L.wv || !L.wo || !L.q_norm || !L.k_norm || + !has_dense_ffn) { + set_last_error("build_qwen35_mtp_graph missing loaded MTP tensors " + "(MoE MTP not supported in this PR — needs #120)"); + return {}; + } + + // NextN concat path: [enorm(e); hnorm(h)] → eh_proj → transformer block. + ggml_tensor * e_norm = rms_norm_mul(ctx, in.token_embed, M.nextn.enorm, EPS); + ggml_tensor * h_norm = rms_norm_mul(ctx, in.pre_norm_hidden, M.nextn.hnorm, EPS); + + ggml_tensor * concat = ggml_concat(ctx, e_norm, h_norm, 0); + ggml_set_name(concat, "mtp_concat_embedding_hidden"); + + ggml_tensor * cur = ggml_mul_mat(ctx, M.nextn.eh_proj, concat); + ggml_set_name(cur, "mtp_eh_proj"); + + ggml_tensor * inpSA = cur; + cur = rms_norm_mul(ctx, cur, L.attn_norm, EPS); + ggml_set_name(cur, "mtp_attn_norm"); + + cur = build_full_attn_block(ctx, gf, w, L, cur, in.positions, + cache.attn_k[in.mtp_layer_index], + cache.attn_v[in.mtp_layer_index], + in.attn_mask, in.kv_start, n_tokens, + cache.kv_k_type, cache.kv_v_type, + /*kv_k_rotated=*/false, in.fa_window); + ggml_set_name(cur, "mtp_attn_out"); + + cur = ggml_add(ctx, cur, inpSA); + + ggml_tensor * ffn_residual = cur; + cur = rms_norm_mul(ctx, cur, L.attn_post_norm, EPS); + ggml_set_name(cur, "mtp_post_attn_norm"); + ggml_tensor * ffn = build_swiglu_ffn(ctx, cur, L); + if (!ffn) return {}; + cur = ggml_add(ctx, ffn, ffn_residual); + ggml_set_name(cur, "mtp_hidden"); + ggml_set_output(cur); + + // Final norm + shared LM head. Falls back to the trunk's out_norm / output + // when the NextN block doesn't ship its own shared head tensors (am17an + // GGUFs do not always include shared_head_*). + ggml_tensor * head_norm_w = M.nextn.shared_head_norm ? M.nextn.shared_head_norm : w.out_norm; + ggml_tensor * head_w = M.nextn.shared_head_head ? M.nextn.shared_head_head : w.output; + if (!head_norm_w || !head_w) { + set_last_error("build_qwen35_mtp_graph missing MTP/shared LM head tensors"); + return {}; + } + + ggml_tensor * out_h = rms_norm_mul(ctx, cur, head_norm_w, EPS); + ggml_tensor * logits = ggml_mul_mat(ctx, head_w, out_h); + ggml_set_name(logits, "mtp_logits"); + ggml_build_forward_expand(gf, logits); + + QwenMtpGraphOutputs og{}; + og.logits = logits; + og.hidden = cur; + return og; +} + } // namespace dflash27b diff --git a/dflash/test/smoke_mtp_graph.cpp b/dflash/test/smoke_mtp_graph.cpp new file mode 100644 index 00000000..b5c710ab --- /dev/null +++ b/dflash/test/smoke_mtp_graph.cpp @@ -0,0 +1,173 @@ +// Smoke test for the native Qwen35 NextN/MTP graph. +// +// Loads a GGUF with embedded nextn tensors, creates the MTP KV cache, runs a +// single-token MTP forward from caller-provided token embedding + synthetic +// target pre-norm hidden, and checks the resulting logits for NaN/Inf. 
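+// argv[1] is the path to an MTP-enabled GGUF (one whose loader reports
+// nextn_predict_layers > 0); argv[2] optionally selects the CUDA device.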
+// +// Usage: smoke_mtp_graph [cuda_gpu] + +#include "dflash27b.h" +#include "internal.h" + +#include "ggml.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "ggml-cuda.h" + +#include +#include +#include +#include +#include +#include + +using namespace dflash27b; + +int main(int argc, char ** argv) { + if (argc < 2) { + std::fprintf(stderr, "usage: %s \n", argv[0]); + return 2; + } + + int gpu = 0; + if (argc >= 3) gpu = std::atoi(argv[2]); + + ggml_backend_t backend = ggml_backend_cuda_init(gpu); + if (!backend) { + std::fprintf(stderr, "cuda init failed\n"); + return 1; + } + + TargetWeights w; + if (!load_target_gguf(argv[1], backend, w)) { + std::fprintf(stderr, "load_target_gguf: %s\n", dflash27b_last_error()); + ggml_backend_free(backend); + return 1; + } + std::printf("[target] %s\n", dflash27b_last_error()); + if (w.mtp_layers.empty()) { + std::fprintf(stderr, "model has no MTP/nextn layers\n"); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + + TargetMtpCache mtp_cache; + if (!create_target_mtp_cache(w, /*max_ctx=*/64, backend, mtp_cache)) { + std::fprintf(stderr, "create_target_mtp_cache: %s\n", dflash27b_last_error()); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + std::printf("[mtp-cache] layers=%zu max_ctx=%d kv=%s/%s\n", + mtp_cache.attn_k.size(), mtp_cache.max_ctx, + ggml_type_name(mtp_cache.kv_k_type), ggml_type_name(mtp_cache.kv_v_type)); + + ggml_init_params ip{}; + ip.mem_size = 512 * 1024 * 1024; + ip.mem_buffer = nullptr; + ip.no_alloc = true; + ggml_context * gctx = ggml_init(ip); + if (!gctx) { + std::fprintf(stderr, "ggml_init graph failed\n"); + free_target_mtp_cache(mtp_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + + const int n_tokens = 1; + ggml_tensor * token_embed = ggml_new_tensor_2d(gctx, GGML_TYPE_F32, w.n_embd, n_tokens); + ggml_tensor * hidden = ggml_new_tensor_2d(gctx, GGML_TYPE_F32, w.n_embd, n_tokens); + ggml_tensor * positions = ggml_new_tensor_1d(gctx, GGML_TYPE_I32, 4 * n_tokens); + ggml_set_name(token_embed, "mtp_token_embed"); + ggml_set_name(hidden, "target_pre_norm_hidden"); + ggml_set_name(positions, "positions"); + ggml_set_input(token_embed); + ggml_set_input(hidden); + ggml_set_input(positions); + + ggml_cgraph * gf = ggml_new_graph_custom(gctx, 2048, false); + QwenMtpGraphInputs gi{}; + gi.token_embed = token_embed; + gi.pre_norm_hidden = hidden; + gi.positions = positions; + gi.n_tokens = n_tokens; + gi.kv_start = 0; + + QwenMtpGraphOutputs go = build_qwen35_mtp_graph(gctx, gf, w, mtp_cache, gi); + if (!go.logits) { + std::fprintf(stderr, "build_qwen35_mtp_graph: %s\n", dflash27b_last_error()); + ggml_free(gctx); + free_target_mtp_cache(mtp_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + ggml_set_output(go.logits); + ggml_build_forward_expand(gf, go.logits); + std::printf("[graph] nodes=%d\n", ggml_graph_n_nodes(gf)); + + ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); + if (!ggml_gallocr_alloc_graph(alloc, gf)) { + std::fprintf(stderr, "ggml_gallocr_alloc_graph failed\n"); + ggml_free(gctx); + free_target_mtp_cache(mtp_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + + int32_t tok_ids[1] = { 1 }; + std::vector embed_buf((size_t)w.n_embd * n_tokens); + if (!w.embedder.embed(tok_ids, n_tokens, embed_buf.data())) { + std::fprintf(stderr, "cpu embedder failed\n"); + ggml_gallocr_free(alloc); + ggml_free(gctx); + 
free_target_mtp_cache(mtp_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + std::vector hidden_buf((size_t)w.n_embd * n_tokens, 0.0f); + int32_t pos4[4] = { 0, 0, 0, 0 }; + ggml_backend_tensor_set(token_embed, embed_buf.data(), 0, sizeof(float) * embed_buf.size()); + ggml_backend_tensor_set(hidden, hidden_buf.data(), 0, sizeof(float) * hidden_buf.size()); + ggml_backend_tensor_set(positions, pos4, 0, sizeof(pos4)); + + auto status = ggml_backend_graph_compute(backend, gf); + if (status != GGML_STATUS_SUCCESS) { + std::fprintf(stderr, "compute failed: %d\n", (int)status); + ggml_gallocr_free(alloc); + ggml_free(gctx); + free_target_mtp_cache(mtp_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + + const int64_t vocab = go.logits->ne[0]; + std::vector logits((size_t)vocab); + ggml_backend_tensor_get(go.logits, logits.data(), 0, sizeof(float) * logits.size()); + + int n_nan = 0, n_inf = 0; + float vmin = 1e30f, vmax = -1e30f; + for (float v : logits) { + if (std::isnan(v)) n_nan++; + else if (std::isinf(v)) n_inf++; + else { + vmin = std::min(vmin, v); + vmax = std::max(vmax, v); + } + } + std::printf("[mtp-logits] vocab=%lld nan=%d inf=%d min=%.4g max=%.4g\n", + (long long)vocab, n_nan, n_inf, vmin, vmax); + + ggml_gallocr_free(alloc); + ggml_free(gctx); + free_target_mtp_cache(mtp_cache); + free_target_weights(w); + ggml_backend_free(backend); + std::printf("OK\n"); + return (n_nan == 0 && n_inf == 0) ? 0 : 1; +} diff --git a/dflash/test/smoke_mtp_integrated_decode.cpp b/dflash/test/smoke_mtp_integrated_decode.cpp new file mode 100644 index 00000000..3f09b215 --- /dev/null +++ b/dflash/test/smoke_mtp_integrated_decode.cpp @@ -0,0 +1,225 @@ +// Minimal integrated DFlash + native MTP decode smoke. +// +// This is not the optimized multi-token speculative loop yet. It proves the +// functional contract end-to-end: +// 1. target DFlash consumes the committed token and exposes pre-norm hidden +// 2. native MTP/NextN consumes that hidden in the same graph and drafts +// 3. greedy target logits accept or correct the MTP draft token +// 4. 
the chosen token becomes the next committed token +// +// Usage: +// smoke_mtp_integrated_decode [n_gen] [seed_token_id] [cuda_gpu] + +#include "dflash27b.h" +#include "internal.h" + +#include "ggml.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "ggml-cuda.h" + +#include +#include +#include +#include +#include + +using namespace dflash27b; + +static bool run_integrated_step(const TargetWeights & w, + TargetCache & target_cache, + TargetMtpCache & mtp_cache, + ggml_backend_t backend, + int32_t token, + int kv_start, + int32_t & target_next, + int32_t & mtp_next) { + ggml_init_params ip{}; + ip.mem_size = 768 * 1024 * 1024; + ip.mem_buffer = nullptr; + ip.no_alloc = true; + ggml_context * ctx = ggml_init(ip); + if (!ctx) { + std::fprintf(stderr, "ggml_init graph failed\n"); + return false; + } + + const int n_tokens = 1; + ggml_tensor * inp_embed = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w.n_embd, n_tokens, 1); + ggml_tensor * positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4 * n_tokens); + ggml_set_name(inp_embed, "inp_embed"); + ggml_set_name(positions, "positions"); + ggml_set_input(inp_embed); + ggml_set_input(positions); + + ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false); + + QwenGraphInputs target_in{}; + target_in.inp_embed = inp_embed; + target_in.positions = positions; + target_in.n_tokens = n_tokens; + target_in.kv_start = kv_start; + target_in.expose_pre_norm_hidden = true; + + QwenGraphOutputs target_out = build_qwen35_graph(ctx, gf, w, target_cache, target_in); + if (!target_out.logits || !target_out.pre_norm_hidden) { + std::fprintf(stderr, "build_qwen35_graph failed: %s\n", dflash27b_last_error()); + ggml_free(ctx); + return false; + } + + QwenMtpGraphInputs mtp_in{}; + mtp_in.token_embed = ggml_reshape_2d(ctx, inp_embed, w.n_embd, n_tokens); + mtp_in.pre_norm_hidden = target_out.pre_norm_hidden; + mtp_in.positions = positions; + mtp_in.n_tokens = n_tokens; + mtp_in.kv_start = kv_start; + + QwenMtpGraphOutputs mtp_out = build_qwen35_mtp_graph(ctx, gf, w, mtp_cache, mtp_in); + if (!mtp_out.logits) { + std::fprintf(stderr, "build_qwen35_mtp_graph failed: %s\n", dflash27b_last_error()); + ggml_free(ctx); + return false; + } + + ggml_tensor * target_argmax = ggml_argmax(ctx, target_out.logits); + ggml_set_name(target_argmax, "target_argmax"); + ggml_set_output(target_argmax); + ggml_build_forward_expand(gf, target_argmax); + + ggml_tensor * mtp_argmax = ggml_argmax(ctx, mtp_out.logits); + ggml_set_name(mtp_argmax, "mtp_argmax"); + ggml_set_output(mtp_argmax); + ggml_build_forward_expand(gf, mtp_argmax); + + ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); + if (!ggml_gallocr_alloc_graph(alloc, gf)) { + std::fprintf(stderr, "ggml_gallocr_alloc_graph failed\n"); + ggml_gallocr_free(alloc); + ggml_free(ctx); + return false; + } + + std::vector embed_buf((size_t)w.n_embd); + if (!w.embedder.embed(&token, 1, embed_buf.data())) { + std::fprintf(stderr, "cpu embedder failed for token %d\n", (int)token); + ggml_gallocr_free(alloc); + ggml_free(ctx); + return false; + } + int32_t pos4[4] = { kv_start, kv_start, kv_start, kv_start }; + ggml_backend_tensor_set(inp_embed, embed_buf.data(), 0, sizeof(float) * embed_buf.size()); + ggml_backend_tensor_set(positions, pos4, 0, sizeof(pos4)); + + auto status = ggml_backend_graph_compute(backend, gf); + if (status != GGML_STATUS_SUCCESS) { + std::fprintf(stderr, "compute failed: %d\n", (int)status); + ggml_gallocr_free(alloc); + ggml_free(ctx); + return false; + } + + 
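+    // Each argmax tensor holds a single i32 token id; copy both back to the host.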
ggml_backend_tensor_get(target_argmax, &target_next, 0, sizeof(target_next)); + ggml_backend_tensor_get(mtp_argmax, &mtp_next, 0, sizeof(mtp_next)); + + ggml_gallocr_free(alloc); + ggml_free(ctx); + return true; +} + +int main(int argc, char ** argv) { + if (argc < 2) { + std::fprintf(stderr, "usage: %s [n_gen] [seed_token_id] [cuda_gpu]\n", argv[0]); + return 2; + } + const int n_gen = argc >= 3 ? std::max(1, std::atoi(argv[2])) : 8; + int32_t last_tok = argc >= 4 ? (int32_t)std::atoi(argv[3]) : 1; + const int gpu = argc >= 5 ? std::atoi(argv[4]) : 0; + + ggml_backend_t backend = ggml_backend_cuda_init(gpu); + if (!backend) { + std::fprintf(stderr, "cuda init failed\n"); + return 1; + } + + TargetWeights w; + if (!load_target_gguf(argv[1], backend, w)) { + std::fprintf(stderr, "load_target_gguf: %s\n", dflash27b_last_error()); + ggml_backend_free(backend); + return 1; + } + std::printf("[target] %s\n", dflash27b_last_error()); + + TargetCache target_cache; + if (!create_target_cache(w, /*max_ctx=*/std::max(64, n_gen + 8), + /*max_verify_tokens=*/0, backend, target_cache)) { + std::fprintf(stderr, "create_target_cache: %s\n", dflash27b_last_error()); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + + TargetMtpCache mtp_cache; + if (!create_target_mtp_cache(w, /*max_ctx=*/std::max(64, n_gen + 8), + backend, mtp_cache)) { + std::fprintf(stderr, "create_target_mtp_cache: %s\n", dflash27b_last_error()); + free_target_cache(target_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + + std::vector generated; + generated.reserve((size_t)n_gen); + int draft_n = 0; + int accepted = 0; + int corrected = 0; + + auto t0 = std::chrono::steady_clock::now(); + for (int pos = 0; pos < n_gen; pos++) { + int32_t target_next = -1; + int32_t mtp_next = -1; + if (!run_integrated_step(w, target_cache, mtp_cache, backend, + last_tok, pos, target_next, mtp_next)) { + free_target_mtp_cache(mtp_cache); + free_target_cache(target_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + + draft_n++; + const bool ok = (mtp_next == target_next); + if (ok) { + accepted++; + } else { + corrected++; + } + const int32_t chosen = ok ? mtp_next : target_next; + generated.push_back(chosen); + target_cache.cur_pos = pos + 1; + target_cache.last_tok = chosen; + mtp_cache.cur_pos = pos + 1; + + std::printf("[mtp-decode step=%d] input=%d mtp=%d target=%d %s chosen=%d\n", + pos, (int)last_tok, (int)mtp_next, (int)target_next, + ok ? "ACCEPT" : "CORRECT", (int)chosen); + last_tok = chosen; + } + auto t1 = std::chrono::steady_clock::now(); + const double seconds = std::chrono::duration(t1 - t0).count(); + + std::printf("[mtp-decode] generated=%d draft_n=%d accepted=%d corrected=%d acceptance=%.1f%% tok/s=%.2f\n", + n_gen, draft_n, accepted, corrected, + draft_n > 0 ? 100.0 * accepted / draft_n : 0.0, + n_gen / std::max(1e-9, seconds)); + std::printf("[mtp-decode ids]"); + for (int32_t t : generated) std::printf(" %d", (int)t); + std::printf("\nOK\n"); + + free_target_mtp_cache(mtp_cache); + free_target_cache(target_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 0; +} diff --git a/dflash/test/smoke_target_mtp_handoff.cpp b/dflash/test/smoke_target_mtp_handoff.cpp new file mode 100644 index 00000000..81ddfc33 --- /dev/null +++ b/dflash/test/smoke_target_mtp_handoff.cpp @@ -0,0 +1,199 @@ +// Smoke test for the DFlash target -> native MTP handoff. 
+// +// Builds one graph containing: +// target forward with expose_pre_norm_hidden=true +// MTP/NextN forward fed by that target_pre_norm_hidden tensor +// +// This validates the C++ tensor handoff required by the real speculative loop. +// +// Usage: smoke_target_mtp_handoff [cuda_gpu] + +#include "dflash27b.h" +#include "internal.h" + +#include "ggml.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "ggml-cuda.h" + +#include +#include +#include +#include +#include +#include + +using namespace dflash27b; + +static int check_logits(ggml_tensor * logits, const char * label) { + const int64_t vocab = logits->ne[0]; + std::vector buf((size_t)vocab); + ggml_backend_tensor_get(logits, buf.data(), 0, sizeof(float) * buf.size()); + int n_nan = 0, n_inf = 0; + float vmin = 1e30f, vmax = -1e30f; + for (float v : buf) { + if (std::isnan(v)) n_nan++; + else if (std::isinf(v)) n_inf++; + else { + vmin = std::min(vmin, v); + vmax = std::max(vmax, v); + } + } + std::printf("[%s] vocab=%lld nan=%d inf=%d min=%.4g max=%.4g\n", + label, (long long)vocab, n_nan, n_inf, vmin, vmax); + return (n_nan == 0 && n_inf == 0) ? 0 : 1; +} + +int main(int argc, char ** argv) { + if (argc < 2) { + std::fprintf(stderr, "usage: %s \n", argv[0]); + return 2; + } + + int gpu = 0; + if (argc >= 3) gpu = std::atoi(argv[2]); + + ggml_backend_t backend = ggml_backend_cuda_init(gpu); + if (!backend) { + std::fprintf(stderr, "cuda init failed\n"); + return 1; + } + + TargetWeights w; + if (!load_target_gguf(argv[1], backend, w)) { + std::fprintf(stderr, "load_target_gguf: %s\n", dflash27b_last_error()); + ggml_backend_free(backend); + return 1; + } + std::printf("[target] %s\n", dflash27b_last_error()); + + TargetCache target_cache; + if (!create_target_cache(w, /*max_ctx=*/64, /*max_verify_tokens=*/0, backend, target_cache)) { + std::fprintf(stderr, "create_target_cache: %s\n", dflash27b_last_error()); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + TargetMtpCache mtp_cache; + if (!create_target_mtp_cache(w, /*max_ctx=*/64, backend, mtp_cache)) { + std::fprintf(stderr, "create_target_mtp_cache: %s\n", dflash27b_last_error()); + free_target_cache(target_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + + ggml_init_params ip{}; + ip.mem_size = 768 * 1024 * 1024; + ip.mem_buffer = nullptr; + ip.no_alloc = true; + ggml_context * gctx = ggml_init(ip); + if (!gctx) { + std::fprintf(stderr, "ggml_init graph failed\n"); + free_target_mtp_cache(mtp_cache); + free_target_cache(target_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + + const int n_tokens = 1; + ggml_tensor * inp_embed = ggml_new_tensor_3d(gctx, GGML_TYPE_F32, w.n_embd, n_tokens, 1); + ggml_tensor * positions = ggml_new_tensor_1d(gctx, GGML_TYPE_I32, 4 * n_tokens); + ggml_set_name(inp_embed, "inp_embed"); + ggml_set_name(positions, "positions"); + ggml_set_input(inp_embed); + ggml_set_input(positions); + + ggml_cgraph * gf = ggml_new_graph_custom(gctx, 8192, false); + QwenGraphInputs target_in{}; + target_in.inp_embed = inp_embed; + target_in.positions = positions; + target_in.n_tokens = n_tokens; + target_in.kv_start = 0; + target_in.expose_pre_norm_hidden = true; + + QwenGraphOutputs target_out = build_qwen35_graph(gctx, gf, w, target_cache, target_in); + if (!target_out.logits || !target_out.pre_norm_hidden) { + std::fprintf(stderr, "build_qwen35_graph did not expose target hidden: %s\n", dflash27b_last_error()); + ggml_free(gctx); + 
free_target_mtp_cache(mtp_cache); + free_target_cache(target_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + + QwenMtpGraphInputs mtp_in{}; + mtp_in.token_embed = ggml_reshape_2d(gctx, inp_embed, w.n_embd, n_tokens); + mtp_in.pre_norm_hidden = target_out.pre_norm_hidden; + mtp_in.positions = positions; + mtp_in.n_tokens = n_tokens; + mtp_in.kv_start = 0; + + QwenMtpGraphOutputs mtp_out = build_qwen35_mtp_graph(gctx, gf, w, mtp_cache, mtp_in); + if (!mtp_out.logits || !mtp_out.hidden) { + std::fprintf(stderr, "build_qwen35_mtp_graph failed: %s\n", dflash27b_last_error()); + ggml_free(gctx); + free_target_mtp_cache(mtp_cache); + free_target_cache(target_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + ggml_set_output(target_out.logits); + ggml_set_output(mtp_out.logits); + ggml_build_forward_expand(gf, mtp_out.logits); + std::printf("[graph] nodes=%d\n", ggml_graph_n_nodes(gf)); + + ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); + if (!ggml_gallocr_alloc_graph(alloc, gf)) { + std::fprintf(stderr, "ggml_gallocr_alloc_graph failed\n"); + ggml_free(gctx); + free_target_mtp_cache(mtp_cache); + free_target_cache(target_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + + int32_t tok_ids[1] = { 1 }; + std::vector embed_buf((size_t)w.n_embd * n_tokens); + if (!w.embedder.embed(tok_ids, n_tokens, embed_buf.data())) { + std::fprintf(stderr, "cpu embedder failed\n"); + ggml_gallocr_free(alloc); + ggml_free(gctx); + free_target_mtp_cache(mtp_cache); + free_target_cache(target_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + int32_t pos4[4] = { 0, 0, 0, 0 }; + ggml_backend_tensor_set(inp_embed, embed_buf.data(), 0, sizeof(float) * embed_buf.size()); + ggml_backend_tensor_set(positions, pos4, 0, sizeof(pos4)); + + auto status = ggml_backend_graph_compute(backend, gf); + if (status != GGML_STATUS_SUCCESS) { + std::fprintf(stderr, "compute failed: %d\n", (int)status); + ggml_gallocr_free(alloc); + ggml_free(gctx); + free_target_mtp_cache(mtp_cache); + free_target_cache(target_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + + const int bad_target = check_logits(target_out.logits, "target-logits"); + const int bad_mtp = check_logits(mtp_out.logits, "mtp-logits"); + + ggml_gallocr_free(alloc); + ggml_free(gctx); + free_target_mtp_cache(mtp_cache); + free_target_cache(target_cache); + free_target_weights(w); + ggml_backend_free(backend); + std::printf("OK\n"); + return (bad_target || bad_mtp) ? 
1 : 0; +} diff --git a/dflash/test/test_mtp_graph_contract.cpp b/dflash/test/test_mtp_graph_contract.cpp new file mode 100644 index 00000000..b67b8c83 --- /dev/null +++ b/dflash/test/test_mtp_graph_contract.cpp @@ -0,0 +1,108 @@ +#include "internal.h" + +#include "ggml.h" + +#include + +using namespace dflash27b; + +static ggml_tensor * tensor_1d(ggml_context * ctx, int n0, const char * name) { + ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n0); + ggml_set_name(t, name); + return t; +} + +static ggml_tensor * tensor_2d(ggml_context * ctx, int n0, int n1, const char * name) { + ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n0, n1); + ggml_set_name(t, name); + return t; +} + +int main() { + ggml_init_params ip{}; + ip.mem_size = 16 * 1024 * 1024; + ip.mem_buffer = nullptr; + ip.no_alloc = true; + + ggml_context * ctx = ggml_init(ip); + if (!ctx) { + std::fprintf(stderr, "ggml_init failed\n"); + return 1; + } + + TargetWeights w{}; + w.n_embd = 8; + w.n_ff = 16; + w.n_head = 2; + w.n_head_kv = 1; + w.n_embd_head_k = 4; + w.n_embd_head_v = 4; + w.rope_sections[0] = 1; + w.rope_sections[1] = 1; + w.rope_sections[2] = 0; + w.rope_sections[3] = 0; + w.out_norm = tensor_1d(ctx, w.n_embd, "output_norm.weight"); + w.output = tensor_2d(ctx, w.n_embd, 32, "output.weight"); + w.mtp_layers.assign(1, TargetMtpLayer{}); + + TargetMtpLayer & M = w.mtp_layers[0]; + TargetLayer & L = M.block; + M.gguf_layer_index = 64; + M.nextn.eh_proj = tensor_2d(ctx, 2 * w.n_embd, w.n_embd, "blk.64.nextn.eh_proj.weight"); + M.nextn.enorm = tensor_1d(ctx, w.n_embd, "blk.64.nextn.enorm.weight"); + M.nextn.hnorm = tensor_1d(ctx, w.n_embd, "blk.64.nextn.hnorm.weight"); + M.nextn.shared_head_norm = tensor_1d(ctx, w.n_embd, "blk.64.nextn.shared_head_norm.weight"); + + const int q_dim = w.n_head * w.n_embd_head_k; + const int kv_dim = w.n_head_kv * w.n_embd_head_k; + L.attn_norm = tensor_1d(ctx, w.n_embd, "blk.64.attn_norm.weight"); + L.attn_post_norm = tensor_1d(ctx, w.n_embd, "blk.64.post_attention_norm.weight"); + L.wq = tensor_2d(ctx, w.n_embd, 2 * q_dim, "blk.64.attn_q.weight"); + L.wk = tensor_2d(ctx, w.n_embd, kv_dim, "blk.64.attn_k.weight"); + L.wv = tensor_2d(ctx, w.n_embd, kv_dim, "blk.64.attn_v.weight"); + L.wo = tensor_2d(ctx, q_dim, w.n_embd, "blk.64.attn_output.weight"); + L.q_norm = tensor_1d(ctx, w.n_embd_head_k, "blk.64.attn_q_norm.weight"); + L.k_norm = tensor_1d(ctx, w.n_embd_head_k, "blk.64.attn_k_norm.weight"); + L.w_gate = tensor_2d(ctx, w.n_embd, w.n_ff, "blk.64.ffn_gate.weight"); + L.w_up = tensor_2d(ctx, w.n_embd, w.n_ff, "blk.64.ffn_up.weight"); + L.w_down = tensor_2d(ctx, w.n_ff, w.n_embd, "blk.64.ffn_down.weight"); + + TargetMtpCache cache{}; + cache.max_ctx = 8; + cache.kv_k_type = GGML_TYPE_F16; + cache.kv_v_type = GGML_TYPE_F16; + cache.attn_k.push_back(ggml_new_tensor_3d(ctx, GGML_TYPE_F16, w.n_embd_head_k, cache.max_ctx, w.n_head_kv)); + cache.attn_v.push_back(ggml_new_tensor_3d(ctx, GGML_TYPE_F16, w.n_embd_head_k, cache.max_ctx, w.n_head_kv)); + + const int n_tokens = 1; + ggml_tensor * token_embed = tensor_2d(ctx, w.n_embd, n_tokens, "mtp_token_embed"); + ggml_tensor * hidden = tensor_2d(ctx, w.n_embd, n_tokens, "target_pre_norm_hidden"); + ggml_tensor * positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4 * n_tokens); + ggml_set_name(positions, "positions"); + ggml_set_input(token_embed); + ggml_set_input(hidden); + ggml_set_input(positions); + + ggml_cgraph * gf = ggml_new_graph_custom(ctx, 1024, false); + QwenMtpGraphInputs in{}; + in.token_embed = token_embed; + 
in.pre_norm_hidden = hidden; + in.positions = positions; + in.n_tokens = n_tokens; + in.kv_start = 0; + + QwenMtpGraphOutputs out = build_qwen35_mtp_graph(ctx, gf, w, cache, in); + if (!out.logits || !out.hidden) { + std::fprintf(stderr, "build_qwen35_mtp_graph failed: %s\n", dflash27b_last_error()); + ggml_free(ctx); + return 1; + } + + std::printf("[mtp-graph-contract] nodes=%d logits=[%lld,%lld] hidden=[%lld,%lld]\n", + ggml_graph_n_nodes(gf), + (long long)out.logits->ne[0], (long long)out.logits->ne[1], + (long long)out.hidden->ne[0], (long long)out.hidden->ne[1]); + + ggml_free(ctx); + return 0; +}