diff --git a/dflash/CMakeLists.txt b/dflash/CMakeLists.txt index a4bb575f..66f27e9e 100644 --- a/dflash/CMakeLists.txt +++ b/dflash/CMakeLists.txt @@ -126,6 +126,8 @@ add_library(dflash27b STATIC src/laguna_target_graph.cpp src/laguna_daemon.cpp src/sampler.cpp + # Native MTP / NextN helpers + src/f16_convert.cu ) # FlashPrefill custom CUDA kernels need BF16 WMMA (sm_80+). On Turing (sm_75) # the drafter uses ggml's flash_attn_ext instead. Guard added after SM check. @@ -334,6 +336,29 @@ if(DFLASH27B_TESTS) endif() endif() + # Native MTP / NextN: contract test + functional smokes. The contract test + # uses synthetic tensors and runs in CI; the smokes need a real MTP GGUF. + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_mtp_graph_contract.cpp") + add_executable(test_mtp_graph_contract test/test_mtp_graph_contract.cpp) + target_include_directories(test_mtp_graph_contract PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) + target_link_libraries(test_mtp_graph_contract PRIVATE dflash27b ggml ggml-cuda) + endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_mtp_graph.cpp") + add_executable(smoke_mtp_graph test/smoke_mtp_graph.cpp) + target_include_directories(smoke_mtp_graph PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) + target_link_libraries(smoke_mtp_graph PRIVATE dflash27b ggml ggml-cuda) + endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_target_mtp_handoff.cpp") + add_executable(smoke_target_mtp_handoff test/smoke_target_mtp_handoff.cpp) + target_include_directories(smoke_target_mtp_handoff PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) + target_link_libraries(smoke_target_mtp_handoff PRIVATE dflash27b ggml ggml-cuda) + endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_mtp_integrated_decode.cpp") + add_executable(smoke_mtp_integrated_decode test/smoke_mtp_integrated_decode.cpp) + target_include_directories(smoke_mtp_integrated_decode PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) + target_link_libraries(smoke_mtp_integrated_decode PRIVATE dflash27b ggml ggml-cuda) + endif() + # internal.h includes when GGML_USE_CUDA is set; link # CUDA::cudart so the toolkit headers are on the compile line (same as # test_dflash historically had alone). @@ -350,6 +375,10 @@ if(DFLASH27B_TESTS) smoke_target_forward test_generate test_dflash + test_mtp_graph_contract + smoke_mtp_graph + smoke_target_mtp_handoff + smoke_mtp_integrated_decode ) foreach(_t IN LISTS _dflash_internal_h_cuda_tests) if(TARGET ${_t}) diff --git a/dflash/docs/MTP_2026-05-11.md b/dflash/docs/MTP_2026-05-11.md new file mode 100644 index 00000000..04b30316 --- /dev/null +++ b/dflash/docs/MTP_2026-05-11.md @@ -0,0 +1,111 @@ +# Native MTP (NextN) — runtime status, 2026-05-11 + +This document describes the native multi-token prediction (MTP / NextN) +runtime introduced into `dflash` in PR `feat(dflash): native Qwen3.6 MTP +integrated decode`. It tracks the contract, what already works, and what +remains for the next PR before MTP becomes a default-on decode mode. + +## What this PR ships + +- `dflash/src/f16_convert.cu` — small `f16/bf16 → f32` widen kernels used + by both the rollback path and the MTP token-embedding widen. 
+- `dflash/src/internal.h` — new types: + - `TargetNextN`, `TargetMtpLayer` + - `TargetMtpCache` (KV cache for the NextN tail block only) + - `QwenMtpGraphInputs`, `QwenMtpGraphOutputs` + - `expose_pre_norm_hidden` on `QwenGraphInputs` + - `pre_norm_hidden` on `QwenGraphOutputs` + - `TargetWeights::mtp_layers`, `nextn_predict_layers`, `gguf_block_count`, + `tok_embd_gpu` (no fields removed; the trunk API is preserved). +- `dflash/src/qwen35_target_graph.cpp` — four new functions: + - `create_target_mtp_cache` / `free_target_mtp_cache` / `reset_target_mtp_cache` + - `build_qwen35_mtp_graph` — RMSNorm(e) || RMSNorm(h) → `eh_proj` → + full-attention transformer block → SwiGLU FFN → shared head. + Also wires `expose_pre_norm_hidden` into `build_qwen35_graph`. +- `dflash/src/gguf_target_loader.cpp` — reads `qwen35.nextn_predict_layers`, + splits the GGUF blocks into trunk + MTP tail, loads `blk..nextn.*` + tensors into `TargetWeights::mtp_layers`, and uploads `token_embd.weight` + to the GPU when the checkpoint carries MTP (`DFLASH27B_UPLOAD_TOK_EMBD` + env var overrides). +- `dflash/test/test_mtp_graph_contract.cpp` — synthetic-tensor test that + asserts the MTP graph wires together correctly. No GPU model needed; + cheap to run in CI. +- `dflash/test/smoke_mtp_graph.cpp` — loads a real MTP GGUF, builds the + NextN graph for a single token, and validates the output is finite. +- `dflash/test/smoke_target_mtp_handoff.cpp` — loads a real MTP GGUF and + proves that the trunk pre-norm hidden tensor feeds directly into the + MTP block within the same `ggml_cgraph` (no CPU roundtrip required). +- `dflash/test/smoke_mtp_integrated_decode.cpp` — full integrated decode + loop: target greedy + MTP greedy in one graph, with per-step accept / + correct counters. This is the functional baseline the upcoming PR's + speculative loop will be built on top of. + +## GGUF compatibility + +The loader follows the tensor naming convention introduced by llama.cpp's +[MTP PR #22673](https://github.com/ggml-org/llama.cpp/pull/22673). It is +compatible with the reference Qwen3.6-MTP GGUFs published on the Hub: + +- `am17an/Qwen3.6-27B-MTP-GGUF` +- `am17an/Qwen3.6-35BA3B-MTP-GGUF` (MoE — see "MoE limitation" below) +- `havenoammo/Qwen3.6-27B-MTP-UD-GGUF` +- `havenoammo/Qwen3.6-35B-A3B-MTP-GGUF` +- `froggeric/Qwen3.6-27B-MTP-GGUF` + +The expected tail-block tensor names are: + +```text +blk..nextn.eh_proj.weight [2 * hidden, hidden] +blk..nextn.embed_tokens.weight [hidden, vocab] (optional) +blk..nextn.enorm.weight [hidden] +blk..nextn.hnorm.weight [hidden] +blk..nextn.shared_head_head.weight [hidden, vocab] (optional) +blk..nextn.shared_head_norm.weight [hidden] (optional) +``` + +When the optional shared-head tensors are absent the runtime falls back to +the trunk's `output_norm` / `output` (lm_head), matching how am17an's +GGUFs are typically packed. + +## MoE limitation + +`build_qwen35_mtp_graph` currently implements the dense-FFN path only. The +35B-A3B MTP GGUFs require the MoE `TargetLayer` fields and the routed +FFN path that howard0su is upstreaming in +[PR #120 "Qwen3.5 MoE support"](https://github.com/Luce-Org/lucebox-hub/pull/120). +A MoE-aware `build_qwen35_mtp_graph` is a one-line dispatch on top of +this PR once #120 lands. Until then, loading a MoE-MTP GGUF + invoking +the MTP graph returns a clear error rather than producing wrong output. 
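+
+The guard and the shared-head fallback described above reduce to a few lines
+inside `build_qwen35_mtp_graph`. Condensed from `qwen35_target_graph.cpp` in
+this PR (`M` is the `TargetMtpLayer`, `L` its transformer block; error string
+abbreviated here):
+
+```cpp
+// Dense FFN tensors must be present; MoE MTP bails out until #120 lands.
+const bool has_dense_ffn = L.w_gate && L.w_up && L.w_down;
+if (!has_dense_ffn) {
+    set_last_error("build_qwen35_mtp_graph missing loaded MTP tensors (MoE MTP needs #120)");
+    return {};
+}
+
+// Shared LM head: use the NextN tensors when packed, else fall back to the trunk head.
+ggml_tensor * head_norm_w = M.nextn.shared_head_norm ? M.nextn.shared_head_norm : w.out_norm;
+ggml_tensor * head_w      = M.nextn.shared_head_head ? M.nextn.shared_head_head : w.output;
+```
+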
+ +## Why MTP is opt-in, not default-on + +Measured today against `DFlash + PFlash` on the same MTP GGUF with MTP +disabled, on a single RTX 6000 Ada (sm_89), Qwen3.6-27B Q4_K_M target, +`q4_0/q4_0` KV, FA_WINDOW=0, DDTree budget=16, draft feature mirror on: + +| n_gen | Same GGUF, MTP off (tok/s) | Same GGUF, MTP chain-2 (tok/s) | Δ | +|---:|---:|---:|---:| +| 64 | 57.58 | 54.72 | **−5.0%** | +| 128 | 67.58 | 64.23 | **−5.0%** | +| 256 | 60.40 | 82.18 | **+36.1%** | + +What changes between 64 and 256 tokens is that DDTree rounds drop from +roughly 60 → 38 and average tokens committed per draft step rise from +4.27 → 6.74, so the extra MTP forward starts paying for itself. + +This is real but workload-dependent acceleration, not a universal default. +The next PR adds the speculative loop that turns this into a default-on +mode for long generations; today's PR ships only the runtime contract and +the tests that pin it. + +## Known follow-ups (next PR) + +1. Speculative decode loop wiring (`run_mtp_integrated_prompt`, + target-batched verify, fast rollback) inside `test_dflash`. +2. Daemon-side `--mtp-integrated` CLI + metrics surface (`[mtp-daemon]` + line, `last_mtp` aggregated in `prefix_cache.py`). +3. `mtp_baseline_gate.py` published as a reusable parity gate harness. +4. CPU hidden-readback elimination — the current functional smoke still + round-trips token ids through CPU between MTP steps. Removing that is + the highest-value perf fix and is queued behind CUDA-graph capture. +5. MoE MTP path after PR #120 merges. diff --git a/dflash/src/f16_convert.cu b/dflash/src/f16_convert.cu new file mode 100644 index 00000000..49bd309e --- /dev/null +++ b/dflash/src/f16_convert.cu @@ -0,0 +1,49 @@ +// Tiny half-precision → f32 conversion kernels used by the DDtree rollback +// path and the drafter's target_feat widen. We store some tensors +// (ssm_intermediate, target_feat) at 16-bit to halve their memory footprint, +// and widen on read into f32 consumers. +// +// Exposes plain C entry points so test_dflash.cpp can call them without +// pulling in a CUDA compile unit of its own. 
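+//
+// Illustrative call shape (buffer names here are hypothetical; both pointers
+// are device memory holding n_elems elements, and launches are async on `stream`):
+//
+//   dflash27b_launch_f16_to_f32 (d_feat_f16,  d_feat_f32, n_elems, stream);
+//   dflash27b_launch_bf16_to_f32(d_feat_bf16, d_feat_f32, n_elems, stream);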
+ +#include +#include +#include + +static __global__ void f16_to_f32_kernel(const __half * __restrict__ src, + float * __restrict__ dst, + size_t n_elems) { + const size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if (i < n_elems) { + dst[i] = __half2float(src[i]); + } +} + +static __global__ void bf16_to_f32_kernel(const __nv_bfloat16 * __restrict__ src, + float * __restrict__ dst, + size_t n_elems) { + const size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if (i < n_elems) { + dst[i] = __bfloat162float(src[i]); + } +} + +extern "C" void dflash27b_launch_f16_to_f32(const void * src, + void * dst, + size_t n_elems, + cudaStream_t stream) { + const int threads = 256; + const int blocks = (int)((n_elems + threads - 1) / threads); + f16_to_f32_kernel<<>>( + (const __half *)src, (float *)dst, n_elems); +} + +extern "C" void dflash27b_launch_bf16_to_f32(const void * src, + void * dst, + size_t n_elems, + cudaStream_t stream) { + const int threads = 256; + const int blocks = (int)((n_elems + threads - 1) / threads); + bf16_to_f32_kernel<<>>( + (const __nv_bfloat16 *)src, (float *)dst, n_elems); +} diff --git a/dflash/src/gguf_target_loader.cpp b/dflash/src/gguf_target_loader.cpp index f5fde060..38ccfdf9 100644 --- a/dflash/src/gguf_target_loader.cpp +++ b/dflash/src/gguf_target_loader.cpp @@ -273,7 +273,7 @@ bool load_target_gguf_partial(const std::string & path, std::string err; const uint32_t n_embd = get_u32_or(gctx, "qwen35.embedding_length", 0); const uint32_t n_ff = get_u32_or(gctx, "qwen35.feed_forward_length", 0); - const uint32_t n_layer= get_u32_or(gctx, "qwen35.block_count", 0); + const uint32_t n_block= get_u32_or(gctx, "qwen35.block_count", 0); const uint32_t n_head = get_u32_or(gctx, "qwen35.attention.head_count",0); const uint32_t n_headkv=get_u32_or(gctx, "qwen35.attention.head_count_kv",0); const uint32_t kl = get_u32_or(gctx, "qwen35.attention.key_length", 0); @@ -285,21 +285,48 @@ bool load_target_gguf_partial(const std::string & path, const uint32_t ssm_dt = get_u32_or(gctx, "qwen35.ssm.time_step_rank",0); const uint32_t ssm_grp = get_u32_or(gctx, "qwen35.ssm.group_count", 0); - if (n_embd == 0 || n_layer == 0 || n_head == 0 || n_headkv == 0 || + // Native MTP / NextN: zero on non-MTP GGUFs, 1 on the am17an Qwen3.6-MTP + // GGUFs. We treat the last `nextn_predict_layers` blocks as the MTP tail + // and the remaining `block_count - nextn` as the trunk. 
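+    // Example split: with nextn_predict_layers = 1 the last GGUF block becomes
+    // the MTP tail, e.g. blk.64.nextn.* on top of a 64-layer trunk (the layout
+    // that test_mtp_graph_contract.cpp pins with gguf_layer_index = 64).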
+ const uint32_t nextn_predict_layers = get_u32_or(gctx, "qwen35.nextn_predict_layers", 0); + + if (n_embd == 0 || n_block == 0 || n_head == 0 || n_headkv == 0 || kl == 0 || vl == 0 || n_ff == 0 || fai == 0 || ssm_conv == 0 || ssm_inner == 0 || ssm_state == 0 || ssm_dt == 0 || ssm_grp == 0) { char buf[512]; std::snprintf(buf, sizeof(buf), - "missing or zero hparams: n_embd=%u n_layer=%u n_head=%u n_head_kv=%u " + "missing or zero hparams: n_embd=%u n_block=%u n_head=%u n_head_kv=%u " "kl=%u vl=%u n_ff=%u fai=%u ssm{conv=%u inner=%u state=%u dt=%u grp=%u}", - n_embd, n_layer, n_head, n_headkv, kl, vl, n_ff, fai, + n_embd, n_block, n_head, n_headkv, kl, vl, n_ff, fai, ssm_conv, ssm_inner, ssm_state, ssm_dt, ssm_grp); set_last_error(buf); gguf_free(gctx); return false; } + if (nextn_predict_layers > n_block) { + char buf[160]; + std::snprintf(buf, sizeof(buf), + "nextn_predict_layers=%u exceeds block_count=%u", + nextn_predict_layers, n_block); + set_last_error(buf); + gguf_free(gctx); return false; + } + if (nextn_predict_layers > 1) { + char buf[160]; + std::snprintf(buf, sizeof(buf), + "nextn_predict_layers=%u not supported yet (loader supports 0 or 1)", + nextn_predict_layers); + set_last_error(buf); + gguf_free(gctx); return false; + } + const uint32_t n_layer = n_block - nextn_predict_layers; + if (n_layer == 0) { + set_last_error("no trunk layers left after subtracting nextn_predict_layers"); + gguf_free(gctx); return false; + } + // Structural invariants required by the graph builder. if (kl != vl) { set_last_error("key_length != value_length not supported"); @@ -312,8 +339,10 @@ bool load_target_gguf_partial(const std::string & path, gguf_free(gctx); return false; } if (n_layer % fai != 0) { - char buf[128]; - std::snprintf(buf, sizeof(buf), "block_count=%u not divisible by full_attention_interval=%u", n_layer, fai); + char buf[160]; + std::snprintf(buf, sizeof(buf), + "trunk layer count=%u (block_count=%u nextn=%u) not divisible by full_attention_interval=%u", + n_layer, n_block, nextn_predict_layers, fai); set_last_error(buf); gguf_free(gctx); return false; } @@ -364,13 +393,15 @@ bool load_target_gguf_partial(const std::string & path, TargetLoadPlan plan = plan_in; if (plan.layer_begin < 0) plan.layer_begin = 0; - if (plan.layer_end < 0) plan.layer_end = (int)n_layer; + // Default end covers trunk + MTP/NextN tail so blk..* + // tensors are uploaded when nextn_predict_layers > 0. 
+ if (plan.layer_end < 0) plan.layer_end = (int)n_block; if (plan.layer_begin > plan.layer_end || - plan.layer_end > (int)n_layer) { + plan.layer_end > (int)n_block) { char buf[160]; std::snprintf(buf, sizeof(buf), - "invalid target load layer range [%d,%d) for n_layer=%u", - plan.layer_begin, plan.layer_end, n_layer); + "invalid target load layer range [%d,%d) for n_block=%u", + plan.layer_begin, plan.layer_end, n_block); set_last_error(buf); gguf_free(gctx); return false; @@ -379,6 +410,8 @@ bool load_target_gguf_partial(const std::string & path, out.ctx = meta_ctx; out.backend = backend; out.n_layer = (int)n_layer; + out.gguf_block_count = (int)n_block; + out.nextn_predict_layers = (int)nextn_predict_layers; out.n_embd = (int)n_embd; out.n_ff = (int)n_ff; out.n_head = (int)n_head; @@ -392,6 +425,7 @@ bool load_target_gguf_partial(const std::string & path, out.ssm_d_state= (int)ssm_state; out.ssm_dt_rank= (int)ssm_dt; out.ssm_n_group= (int)ssm_grp; + out.mtp_layers.assign((size_t)nextn_predict_layers, TargetMtpLayer{}); // EOS token ids from GGUF tokenizer metadata (stored as UINT32 by the // GGUF spec; we use the u32 helper and cast). UINT32_MAX is the @@ -491,6 +525,77 @@ bool load_target_gguf_partial(const std::string & path, } } + // ── 2b. Wire MTP / NextN tail blocks (Qwen3.6-MTP GGUFs) ───────── + // GGUF block index for MTP layer `mi` is (n_layer + mi). Each MTP block + // ships a regular full-attention transformer (no DeltaNet) plus the + // NextN-specific projections (eh_proj, enorm, hnorm, optional shared head). + for (int mi = 0; mi < (int)nextn_predict_layers; mi++) { + const int il = (int)n_layer + mi; + char name[128]; + auto fnd = [&](const char * suffix) -> ggml_tensor * { + std::snprintf(name, sizeof(name), "blk.%d.%s", il, suffix); + return ggml_get_tensor(meta_ctx, name); + }; + TargetMtpLayer & M = out.mtp_layers[(size_t)mi]; + M.gguf_layer_index = il; + TargetLayer & L = M.block; + + L.attn_norm = fnd("attn_norm.weight"); + L.attn_post_norm = fnd("post_attention_norm.weight"); + L.w_gate = fnd("ffn_gate.weight"); + L.w_up = fnd("ffn_up.weight"); + L.w_down = fnd("ffn_down.weight"); + L.wq = fnd("attn_q.weight"); + L.wk = fnd("attn_k.weight"); + L.wv = fnd("attn_v.weight"); + L.wo = fnd("attn_output.weight"); + L.q_norm = fnd("attn_q_norm.weight"); + L.k_norm = fnd("attn_k_norm.weight"); + + M.nextn.eh_proj = fnd("nextn.eh_proj.weight"); + M.nextn.embed_tokens = fnd("nextn.embed_tokens.weight"); + M.nextn.enorm = fnd("nextn.enorm.weight"); + M.nextn.hnorm = fnd("nextn.hnorm.weight"); + M.nextn.shared_head_head = fnd("nextn.shared_head_head.weight"); + M.nextn.shared_head_norm = fnd("nextn.shared_head_norm.weight"); + + const bool has_attn = L.wq && L.wk && L.wv && L.wo && L.q_norm && L.k_norm; + if (!L.attn_norm || !L.attn_post_norm || !has_attn) { + char b[160]; + std::snprintf(b, sizeof(b), + "mtp layer %d: missing full-attention tensors", il); + set_last_error(b); + gguf_free(gctx); + return false; + } + if (!L.w_gate || !L.w_up || !L.w_down) { + char b[160]; + std::snprintf(b, sizeof(b), + "mtp layer %d: missing required FFN tensors", il); + set_last_error(b); + gguf_free(gctx); + return false; + } + if (!M.nextn.eh_proj || !M.nextn.enorm || !M.nextn.hnorm) { + char b[160]; + std::snprintf(b, sizeof(b), + "mtp layer %d: missing required nextn tensors " + "(eh_proj/enorm/hnorm)", il); + set_last_error(b); + gguf_free(gctx); + return false; + } + } + + // Plain target decode still embeds on CPU. 
Native MTP needs device-side + // token lookup to chain proposals inside one graph, so MTP-enabled + // checkpoints upload token_embd to the GPU as a regular weight. + bool upload_tok_embd = nextn_predict_layers > 0; + if (const char * s = std::getenv("DFLASH27B_UPLOAD_TOK_EMBD")) { + upload_tok_embd = std::atoi(s) != 0; + } + out.tok_embd_gpu = upload_tok_embd; + // 3. Allocate CUDA buffer only for the selected target tensors. ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend); const size_t alignment = ggml_backend_buft_get_alignment(buft); @@ -500,7 +605,11 @@ bool load_target_gguf_partial(const std::string & path, for (int64_t tid = 0; tid < n_tensors; tid++) { const char * tname = gguf_get_tensor_name(gctx, tid); ggml_tensor * t = ggml_get_tensor(meta_ctx, tname); - if (!t || !should_load_target_tensor(tname, plan.layer_begin, plan.layer_end, plan.load_output)) { + if (!t) continue; + const bool is_tok_embd = (std::strcmp(tname, "token_embd.weight") == 0); + const bool selected_by_plan = + should_load_target_tensor(tname, plan.layer_begin, plan.layer_end, plan.load_output); + if (!selected_by_plan && !(is_tok_embd && upload_tok_embd)) { continue; } alloc_total = align_up_size(alloc_total, alignment); @@ -559,10 +668,15 @@ bool load_target_gguf_partial(const std::string & path, return false; } if (std::string(tname) == "token_embd.weight") { - // Remember offset + size for the CPU embedder; don't upload to GPU. + // Remember offset + size for the CPU embedder regardless of GPU + // upload — MTP still needs the CPU mmap for tokenizer-side lookups. tok_embd_off = off; tok_embd_sz = sz; tok_embd_type = gguf_get_tensor_type(gctx, tid); + if (!upload_tok_embd) continue; + // MTP path: also stream the bytes into the GPU-resident tensor. + ggml_backend_tensor_set(t, (const uint8_t *)mm.addr + off, 0, sz); + total += sz; continue; } if (!should_load_target_tensor(tname, plan.layer_begin, plan.layer_end, plan.load_output)) { @@ -597,12 +711,16 @@ bool load_target_gguf_partial(const std::string & path, mm.release(); // don't munmap on Mmap dtor — now owned by the embedder // Stash the total for callers that want to print it - char summary[192]; + char summary[256]; std::snprintf(summary, sizeof(summary), - "target loaded: layers [%d,%d) output=%d, %zu tensors on GPU %.2f GiB, tok_embd %.0f MiB CPU-only (%s)", + "target loaded: layers [%d,%d) output=%d, %zu tensors on GPU %.2f GiB, " + "tok_embd %.0f MiB %s (%s), trunk_layers=%d nextn=%d", plan.layer_begin, plan.layer_end, (int)plan.load_output, allocs.size(), total / (1024.0 * 1024.0 * 1024.0), - tok_embd_sz / (1024.0 * 1024.0), ggml_type_name(tok_embd_type)); + tok_embd_sz / (1024.0 * 1024.0), + upload_tok_embd ? "GPU+CPU" : "CPU-only", + ggml_type_name(tok_embd_type), + out.n_layer, out.nextn_predict_layers); set_last_error(summary); return true; @@ -613,7 +731,11 @@ void free_target_weights(TargetWeights & w) { if (w.ctx) { ggml_free(w.ctx); w.ctx = nullptr; } // CpuEmbedder destructor handles the mmap automatically. w.layers.clear(); + w.mtp_layers.clear(); + w.nextn_predict_layers = 0; + w.gguf_block_count = 0; w.tok_embd = nullptr; + w.tok_embd_gpu = false; w.out_norm = nullptr; w.output = nullptr; } diff --git a/dflash/src/internal.h b/dflash/src/internal.h index b9cc88d5..945e8611 100644 --- a/dflash/src/internal.h +++ b/dflash/src/internal.h @@ -73,6 +73,34 @@ struct TargetLayer { ggml_tensor * ssm_out = nullptr; // output projection after delta-net }; +// Qwen3.5/3.6 NextN / MTP tail block tensors. 
These live in the tail +// `nextn_predict_layers` of the GGUF (one such block in Qwen3.6-MTP). +// Follow the tensor naming convention introduced by llama.cpp PR #22673: +// blk..nextn.eh_proj [2*hidden, hidden] +// blk..nextn.embed_tokens [hidden, vocab] (optional) +// blk..nextn.enorm [hidden] +// blk..nextn.hnorm [hidden] +// blk..nextn.shared_head_head [hidden, vocab] (optional, falls back to w.output) +// blk..nextn.shared_head_norm [hidden] (optional, falls back to w.out_norm) +struct TargetNextN { + ggml_tensor * eh_proj = nullptr; + ggml_tensor * embed_tokens = nullptr; + ggml_tensor * enorm = nullptr; + ggml_tensor * hnorm = nullptr; + ggml_tensor * shared_head_head = nullptr; + ggml_tensor * shared_head_norm = nullptr; +}; + +// One MTP / NextN layer in the GGUF tail. Holds the regular transformer +// block tensors (full-attention only — no DeltaNet on MTP) plus the +// NextN-specific projections above. The trunk decoder's pre-norm hidden +// state is fed into this block to produce the MTP draft logits. +struct TargetMtpLayer { + TargetLayer block; + TargetNextN nextn; + int gguf_layer_index = -1; +}; + // CPU-side embedder: keeps a mmap of the GGUF alive and knows how to // dequantize individual rows of the quantized tok_embd tensor on demand. // This matches llama.cpp's behavior of running embedding get_rows on CPU @@ -108,7 +136,11 @@ struct TargetWeights { CpuEmbedder embedder; ggml_tensor * tok_embd = nullptr; // [hidden, vocab] (metadata only; data NOT on GPU) - std::vector layers; // size = 64 + bool tok_embd_gpu = false; // true when token_embd bytes were uploaded for GPU get_rows. + // Required by MTP because the integrated decode path needs + // device-side token lookup to chain proposals within a graph. + std::vector layers; // trunk layers only, excludes any nextn/MTP tail blocks + std::vector mtp_layers; // size = nextn_predict_layers (0 for non-MTP GGUFs) ggml_tensor * out_norm = nullptr; // [hidden] ggml_tensor * output = nullptr; // [hidden, vocab] (lm_head) @@ -119,7 +151,9 @@ struct TargetWeights { int n_embd_head_v = 256; // value_length int n_head = 24; int n_head_kv = 4; - int n_layer = 64; + int gguf_block_count = 64; // raw qwen35.block_count from the GGUF + int nextn_predict_layers = 0; // qwen35.nextn_predict_layers (0 = non-MTP GGUF) + int n_layer = 64; // trunk layer count: gguf_block_count - nextn_predict_layers int n_embd = 5120; int n_ff = 17408; int ssm_d_conv = 4; @@ -413,6 +447,36 @@ bool migrate_prefill_cache(const TargetWeights & w, ggml_backend_t backend, TargetCache & cache); +// ─── Native MTP / NextN cache ───────────────────────────────────── +// +// Qwen3.5/3.6 native multi-token prediction keeps a tiny KV cache for the +// tail NextN block(s) only — the trunk decoder retains its own TargetCache +// above. Matches the "kv_only_nextn" contract used by llama.cpp PR #22673 +// and llama-crucible's MTP cache layout. 
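+//
+// Typical lifecycle (sketch; the smoke tests in this PR follow this shape):
+//   TargetMtpCache c;
+//   create_target_mtp_cache(w, max_ctx, backend, c);  // one K/V pair per mtp_layers entry
+//   // per step: build_qwen35_mtp_graph(...) with in.kv_start at the committed
+//   // position, then the caller advances c.cur_pos (see smoke_mtp_integrated_decode.cpp)
+//   reset_target_mtp_cache(c);  // zeroes the K/V tensors and rewinds cur_pos
+//   free_target_mtp_cache(c);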
+struct TargetMtpCache { + ggml_context * ctx = nullptr; + ggml_backend_buffer_t buf = nullptr; + ggml_backend_t backend = nullptr; + + int max_ctx = 0; + int cur_pos = 0; + + ggml_type kv_k_type = GGML_TYPE_Q8_0; + ggml_type kv_v_type = GGML_TYPE_Q8_0; + + std::vector attn_k; // one per TargetWeights::mtp_layers entry + std::vector attn_v; +}; + +bool create_target_mtp_cache(const TargetWeights & w, + int max_ctx, + ggml_backend_t backend, + TargetMtpCache & out); + +void free_target_mtp_cache(TargetMtpCache & c); + +void reset_target_mtp_cache(TargetMtpCache & c); + // ─── Target forward graph ───────────────────────────────────────── // Per-delta-net-layer pointers exposed by the graph for spec-decode rollback. @@ -443,6 +507,7 @@ struct QwenGraphInputs { int kv_start; // position where the new tokens begin bool capture_layers; // if true, write captured layer features into cache.target_feat bool capture_delta_intermediate = false; // if true, populate out_delta_captures + bool expose_pre_norm_hidden = false; // if true, keep final pre-norm hidden for MTP handoff int fa_window = 0; // sliding window for FA layers: 0 = full attention bool last_token_logits_only = false; // if true, only compute logits for last token (prefill optimization) ggml_tensor * parent_ids = nullptr; // [n_tokens] i32; tree mode when non-null @@ -450,6 +515,11 @@ struct QwenGraphInputs { struct QwenGraphOutputs { ggml_tensor * logits; // [vocab, n_tokens] f32 + // Final hidden state before the target output norm. Populated when + // QwenGraphInputs::expose_pre_norm_hidden is true. Used as the + // `t_h_pre_norm` handoff into the native NextN/MTP block — matches the + // convention from llama-crucible MTP. + ggml_tensor * pre_norm_hidden = nullptr; // [hidden, n_tokens] f32 // One entry per delta-net layer (48 for qwen35-27b). Only populated when // QwenGraphInputs::capture_delta_intermediate is true. Tensors are graph // views marked as ggml_set_output() so their data persists after @@ -464,6 +534,38 @@ QwenGraphOutputs build_qwen35_graph( TargetCache & cache, const QwenGraphInputs & in); +// ─── Native MTP / NextN forward graph ───────────────────────────── +// +// Single-layer NextN/MTP block. Consumes the trunk decoder's pre-norm hidden +// state (`pre_norm_hidden`) plus the embedding of the current token, runs +// the NextN concat → eh_proj → transformer-block → shared-head pipeline, +// and returns the MTP draft logits + the post-block hidden state. +// +// Today this PR implements the dense-FFN path only; MoE MTP requires the +// MoE TargetLayer fields landing first (see PR #120 "Qwen3.5 MoE support"). +struct QwenMtpGraphInputs { + ggml_tensor * token_embed; // [hidden, n_tokens] f32; embedding of the current token(s) + ggml_tensor * pre_norm_hidden; // [hidden, n_tokens] f32 from trunk output + ggml_tensor * positions; // [4 * n_tokens] i32 + ggml_tensor * attn_mask; // optional [kv_len, n_tokens_padded] f32 + int n_tokens = 1; + int kv_start = 0; + int mtp_layer_index = 0; + int fa_window = 0; +}; + +struct QwenMtpGraphOutputs { + ggml_tensor * logits = nullptr; // [vocab, n_tokens] f32 + ggml_tensor * hidden = nullptr; // [hidden, n_tokens] f32, post-MTP block +}; + +QwenMtpGraphOutputs build_qwen35_mtp_graph( + ggml_context * ctx, + ggml_cgraph * gf, + const TargetWeights & w, + TargetMtpCache & cache, + const QwenMtpGraphInputs & in); + // Build a single-layer forward graph. Mirrors build_qwen35_graph but processes // only one layer, taking `inp` as the input activation and returning the output. 
// Used by layer-segmented prefill to iterate layers as the outer loop. diff --git a/dflash/src/qwen35_target_graph.cpp b/dflash/src/qwen35_target_graph.cpp index 47b989ff..039a9cf7 100644 --- a/dflash/src/qwen35_target_graph.cpp +++ b/dflash/src/qwen35_target_graph.cpp @@ -1358,6 +1358,17 @@ QwenGraphOutputs build_qwen35_graph( inpL = cur; } + // Expose the final pre-norm hidden state so the native NextN/MTP block can + // consume it as `t_h_pre_norm`. Marked as graph output so its data persists + // after graph_compute. Caller threads it into build_qwen35_mtp_graph. + QwenGraphOutputs og = std::move(og_early); + if (in.expose_pre_norm_hidden) { + ggml_set_name(inpL, "target_pre_norm_hidden"); + ggml_set_output(inpL); + ggml_build_forward_expand(gf, inpL); + og.pre_norm_hidden = inpL; + } + // 2. Final norm ggml_tensor * out = rms_norm_mul(ctx, inpL, w.out_norm, EPS); @@ -1373,7 +1384,6 @@ QwenGraphOutputs build_qwen35_graph( ggml_build_forward_expand(gf, logits); - QwenGraphOutputs og = std::move(og_early); og.logits = logits; return og; } @@ -1396,4 +1406,196 @@ ggml_tensor * build_qwen35_layer( attn_mask, kv_start, n_tokens, capture, fa_window); } +// ─── Native MTP / NextN cache and graph ─────────────────────────────── + +bool create_target_mtp_cache(const TargetWeights & w, + int max_ctx, + ggml_backend_t backend, + TargetMtpCache & out) { + if (w.mtp_layers.empty()) { + set_last_error("create_target_mtp_cache requires TargetWeights::mtp_layers"); + return false; + } + if (max_ctx <= 0) { + set_last_error("create_target_mtp_cache requires max_ctx > 0"); + return false; + } + + out.backend = backend; + out.max_ctx = max_ctx; + out.cur_pos = 0; + + ggml_type kv_k_type = GGML_TYPE_Q8_0; + ggml_type kv_v_type = GGML_TYPE_Q8_0; + dflash::resolve_kv_types(kv_k_type, kv_v_type); + out.kv_k_type = kv_k_type; + out.kv_v_type = kv_v_type; + const int max_ctx_alloc = (kv_k_type == GGML_TYPE_TQ3_0 || kv_v_type == GGML_TYPE_TQ3_0) + ? 
((max_ctx + 255) / 256) * 256 + : max_ctx; + + const int n_mtp = (int)w.mtp_layers.size(); + out.attn_k.assign(n_mtp, nullptr); + out.attn_v.assign(n_mtp, nullptr); + + ggml_init_params ip{}; + ip.mem_size = (size_t)(2 * n_mtp + 16) * ggml_tensor_overhead(); + ip.mem_buffer = nullptr; + ip.no_alloc = true; + out.ctx = ggml_init(ip); + if (!out.ctx) { + set_last_error("mtp cache ggml_init failed"); + return false; + } + + for (int mi = 0; mi < n_mtp; mi++) { + ggml_tensor * K = ggml_new_tensor_3d(out.ctx, kv_k_type, + w.n_embd_head_k, max_ctx_alloc, w.n_head_kv); + ggml_tensor * V = ggml_new_tensor_3d(out.ctx, kv_v_type, + w.n_embd_head_k, max_ctx_alloc, w.n_head_kv); + char name[64]; + std::snprintf(name, sizeof(name), "mtp_cache_k_%d", mi); + ggml_set_name(K, name); + std::snprintf(name, sizeof(name), "mtp_cache_v_%d", mi); + ggml_set_name(V, name); + out.attn_k[mi] = K; + out.attn_v[mi] = V; + } + + out.buf = ggml_backend_alloc_ctx_tensors(out.ctx, backend); + if (!out.buf) { + set_last_error("ggml_backend_alloc_ctx_tensors failed for mtp cache"); + ggml_free(out.ctx); + out.ctx = nullptr; + out.attn_k.clear(); + out.attn_v.clear(); + return false; + } + + reset_target_mtp_cache(out); + return true; +} + +void free_target_mtp_cache(TargetMtpCache & c) { + if (c.buf) { ggml_backend_buffer_free(c.buf); c.buf = nullptr; } + if (c.ctx) { ggml_free(c.ctx); c.ctx = nullptr; } + c.attn_k.clear(); + c.attn_v.clear(); + c.max_ctx = 0; + c.cur_pos = 0; +} + +void reset_target_mtp_cache(TargetMtpCache & c) { + c.cur_pos = 0; + std::vector zeros(1 * 1024 * 1024, 0); + if (!c.ctx) return; + for (ggml_tensor * t = ggml_get_first_tensor(c.ctx); t != nullptr; + t = ggml_get_next_tensor(c.ctx, t)) { + size_t nb = ggml_nbytes(t); + size_t off = 0; + while (off < nb) { + size_t chunk = std::min(nb - off, zeros.size()); + ggml_backend_tensor_set(t, zeros.data(), off, chunk); + off += chunk; + } + } +} + +QwenMtpGraphOutputs build_qwen35_mtp_graph( + ggml_context * ctx, + ggml_cgraph * gf, + const TargetWeights & w, + TargetMtpCache & cache, + const QwenMtpGraphInputs & in) { + + if (w.mtp_layers.empty()) { + set_last_error("build_qwen35_mtp_graph requires TargetWeights::mtp_layers"); + return {}; + } + if (in.mtp_layer_index < 0 || in.mtp_layer_index >= (int)w.mtp_layers.size()) { + set_last_error("build_qwen35_mtp_graph mtp_layer_index out of range"); + return {}; + } + if (!in.token_embed || !in.pre_norm_hidden || !in.positions) { + set_last_error("build_qwen35_mtp_graph missing required input tensor"); + return {}; + } + if ((int)cache.attn_k.size() <= in.mtp_layer_index || + (int)cache.attn_v.size() <= in.mtp_layer_index || + !cache.attn_k[in.mtp_layer_index] || !cache.attn_v[in.mtp_layer_index]) { + set_last_error("build_qwen35_mtp_graph missing MTP KV cache tensors"); + return {}; + } + + const int n_tokens = std::max(1, in.n_tokens); + const TargetMtpLayer & M = w.mtp_layers[(size_t)in.mtp_layer_index]; + const TargetLayer & L = M.block; + + // Dense FFN only — MoE MTP requires the MoE TargetLayer fields landing + // first (see PR #120 "Qwen3.5 MoE support"). For non-MoE GGUFs this is + // the production path and matches the am17an reference layout. 
+ const bool has_dense_ffn = L.w_gate && L.w_up && L.w_down; + if (!M.nextn.eh_proj || !M.nextn.enorm || !M.nextn.hnorm || + !L.attn_norm || !L.attn_post_norm || + !L.wq || !L.wk || !L.wv || !L.wo || !L.q_norm || !L.k_norm || + !has_dense_ffn) { + set_last_error("build_qwen35_mtp_graph missing loaded MTP tensors " + "(MoE MTP not supported in this PR — needs #120)"); + return {}; + } + + // NextN concat path: [enorm(e); hnorm(h)] → eh_proj → transformer block. + ggml_tensor * e_norm = rms_norm_mul(ctx, in.token_embed, M.nextn.enorm, EPS); + ggml_tensor * h_norm = rms_norm_mul(ctx, in.pre_norm_hidden, M.nextn.hnorm, EPS); + + ggml_tensor * concat = ggml_concat(ctx, e_norm, h_norm, 0); + ggml_set_name(concat, "mtp_concat_embedding_hidden"); + + ggml_tensor * cur = ggml_mul_mat(ctx, M.nextn.eh_proj, concat); + ggml_set_name(cur, "mtp_eh_proj"); + + ggml_tensor * inpSA = cur; + cur = rms_norm_mul(ctx, cur, L.attn_norm, EPS); + ggml_set_name(cur, "mtp_attn_norm"); + + cur = build_full_attn_block(ctx, gf, w, L, cur, in.positions, + cache.attn_k[in.mtp_layer_index], + cache.attn_v[in.mtp_layer_index], + in.attn_mask, in.kv_start, n_tokens, + cache.kv_k_type, cache.kv_v_type, + /*kv_k_rotated=*/false, in.fa_window); + ggml_set_name(cur, "mtp_attn_out"); + + cur = ggml_add(ctx, cur, inpSA); + + ggml_tensor * ffn_residual = cur; + cur = rms_norm_mul(ctx, cur, L.attn_post_norm, EPS); + ggml_set_name(cur, "mtp_post_attn_norm"); + ggml_tensor * ffn = build_swiglu_ffn(ctx, cur, L); + if (!ffn) return {}; + cur = ggml_add(ctx, ffn, ffn_residual); + ggml_set_name(cur, "mtp_hidden"); + ggml_set_output(cur); + + // Final norm + shared LM head. Falls back to the trunk's out_norm / output + // when the NextN block doesn't ship its own shared head tensors (am17an + // GGUFs do not always include shared_head_*). + ggml_tensor * head_norm_w = M.nextn.shared_head_norm ? M.nextn.shared_head_norm : w.out_norm; + ggml_tensor * head_w = M.nextn.shared_head_head ? M.nextn.shared_head_head : w.output; + if (!head_norm_w || !head_w) { + set_last_error("build_qwen35_mtp_graph missing MTP/shared LM head tensors"); + return {}; + } + + ggml_tensor * out_h = rms_norm_mul(ctx, cur, head_norm_w, EPS); + ggml_tensor * logits = ggml_mul_mat(ctx, head_w, out_h); + ggml_set_name(logits, "mtp_logits"); + ggml_build_forward_expand(gf, logits); + + QwenMtpGraphOutputs og{}; + og.logits = logits; + og.hidden = cur; + return og; +} + } // namespace dflash27b diff --git a/dflash/test/smoke_mtp_graph.cpp b/dflash/test/smoke_mtp_graph.cpp new file mode 100644 index 00000000..b5c710ab --- /dev/null +++ b/dflash/test/smoke_mtp_graph.cpp @@ -0,0 +1,173 @@ +// Smoke test for the native Qwen35 NextN/MTP graph. +// +// Loads a GGUF with embedded nextn tensors, creates the MTP KV cache, runs a +// single-token MTP forward from caller-provided token embedding + synthetic +// target pre-norm hidden, and checks the resulting logits for NaN/Inf. 
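+// argv[1] is the path to an MTP-enabled GGUF (one whose loader reports
+// nextn_predict_layers > 0); argv[2] optionally selects the CUDA device.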
+// +// Usage: smoke_mtp_graph [cuda_gpu] + +#include "dflash27b.h" +#include "internal.h" + +#include "ggml.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "ggml-cuda.h" + +#include +#include +#include +#include +#include +#include + +using namespace dflash27b; + +int main(int argc, char ** argv) { + if (argc < 2) { + std::fprintf(stderr, "usage: %s \n", argv[0]); + return 2; + } + + int gpu = 0; + if (argc >= 3) gpu = std::atoi(argv[2]); + + ggml_backend_t backend = ggml_backend_cuda_init(gpu); + if (!backend) { + std::fprintf(stderr, "cuda init failed\n"); + return 1; + } + + TargetWeights w; + if (!load_target_gguf(argv[1], backend, w)) { + std::fprintf(stderr, "load_target_gguf: %s\n", dflash27b_last_error()); + ggml_backend_free(backend); + return 1; + } + std::printf("[target] %s\n", dflash27b_last_error()); + if (w.mtp_layers.empty()) { + std::fprintf(stderr, "model has no MTP/nextn layers\n"); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + + TargetMtpCache mtp_cache; + if (!create_target_mtp_cache(w, /*max_ctx=*/64, backend, mtp_cache)) { + std::fprintf(stderr, "create_target_mtp_cache: %s\n", dflash27b_last_error()); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + std::printf("[mtp-cache] layers=%zu max_ctx=%d kv=%s/%s\n", + mtp_cache.attn_k.size(), mtp_cache.max_ctx, + ggml_type_name(mtp_cache.kv_k_type), ggml_type_name(mtp_cache.kv_v_type)); + + ggml_init_params ip{}; + ip.mem_size = 512 * 1024 * 1024; + ip.mem_buffer = nullptr; + ip.no_alloc = true; + ggml_context * gctx = ggml_init(ip); + if (!gctx) { + std::fprintf(stderr, "ggml_init graph failed\n"); + free_target_mtp_cache(mtp_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + + const int n_tokens = 1; + ggml_tensor * token_embed = ggml_new_tensor_2d(gctx, GGML_TYPE_F32, w.n_embd, n_tokens); + ggml_tensor * hidden = ggml_new_tensor_2d(gctx, GGML_TYPE_F32, w.n_embd, n_tokens); + ggml_tensor * positions = ggml_new_tensor_1d(gctx, GGML_TYPE_I32, 4 * n_tokens); + ggml_set_name(token_embed, "mtp_token_embed"); + ggml_set_name(hidden, "target_pre_norm_hidden"); + ggml_set_name(positions, "positions"); + ggml_set_input(token_embed); + ggml_set_input(hidden); + ggml_set_input(positions); + + ggml_cgraph * gf = ggml_new_graph_custom(gctx, 2048, false); + QwenMtpGraphInputs gi{}; + gi.token_embed = token_embed; + gi.pre_norm_hidden = hidden; + gi.positions = positions; + gi.n_tokens = n_tokens; + gi.kv_start = 0; + + QwenMtpGraphOutputs go = build_qwen35_mtp_graph(gctx, gf, w, mtp_cache, gi); + if (!go.logits) { + std::fprintf(stderr, "build_qwen35_mtp_graph: %s\n", dflash27b_last_error()); + ggml_free(gctx); + free_target_mtp_cache(mtp_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + ggml_set_output(go.logits); + ggml_build_forward_expand(gf, go.logits); + std::printf("[graph] nodes=%d\n", ggml_graph_n_nodes(gf)); + + ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); + if (!ggml_gallocr_alloc_graph(alloc, gf)) { + std::fprintf(stderr, "ggml_gallocr_alloc_graph failed\n"); + ggml_free(gctx); + free_target_mtp_cache(mtp_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + + int32_t tok_ids[1] = { 1 }; + std::vector embed_buf((size_t)w.n_embd * n_tokens); + if (!w.embedder.embed(tok_ids, n_tokens, embed_buf.data())) { + std::fprintf(stderr, "cpu embedder failed\n"); + ggml_gallocr_free(alloc); + ggml_free(gctx); + 
free_target_mtp_cache(mtp_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + std::vector hidden_buf((size_t)w.n_embd * n_tokens, 0.0f); + int32_t pos4[4] = { 0, 0, 0, 0 }; + ggml_backend_tensor_set(token_embed, embed_buf.data(), 0, sizeof(float) * embed_buf.size()); + ggml_backend_tensor_set(hidden, hidden_buf.data(), 0, sizeof(float) * hidden_buf.size()); + ggml_backend_tensor_set(positions, pos4, 0, sizeof(pos4)); + + auto status = ggml_backend_graph_compute(backend, gf); + if (status != GGML_STATUS_SUCCESS) { + std::fprintf(stderr, "compute failed: %d\n", (int)status); + ggml_gallocr_free(alloc); + ggml_free(gctx); + free_target_mtp_cache(mtp_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + + const int64_t vocab = go.logits->ne[0]; + std::vector logits((size_t)vocab); + ggml_backend_tensor_get(go.logits, logits.data(), 0, sizeof(float) * logits.size()); + + int n_nan = 0, n_inf = 0; + float vmin = 1e30f, vmax = -1e30f; + for (float v : logits) { + if (std::isnan(v)) n_nan++; + else if (std::isinf(v)) n_inf++; + else { + vmin = std::min(vmin, v); + vmax = std::max(vmax, v); + } + } + std::printf("[mtp-logits] vocab=%lld nan=%d inf=%d min=%.4g max=%.4g\n", + (long long)vocab, n_nan, n_inf, vmin, vmax); + + ggml_gallocr_free(alloc); + ggml_free(gctx); + free_target_mtp_cache(mtp_cache); + free_target_weights(w); + ggml_backend_free(backend); + std::printf("OK\n"); + return (n_nan == 0 && n_inf == 0) ? 0 : 1; +} diff --git a/dflash/test/smoke_mtp_integrated_decode.cpp b/dflash/test/smoke_mtp_integrated_decode.cpp new file mode 100644 index 00000000..3f09b215 --- /dev/null +++ b/dflash/test/smoke_mtp_integrated_decode.cpp @@ -0,0 +1,225 @@ +// Minimal integrated DFlash + native MTP decode smoke. +// +// This is not the optimized multi-token speculative loop yet. It proves the +// functional contract end-to-end: +// 1. target DFlash consumes the committed token and exposes pre-norm hidden +// 2. native MTP/NextN consumes that hidden in the same graph and drafts +// 3. greedy target logits accept or correct the MTP draft token +// 4. 
the chosen token becomes the next committed token +// +// Usage: +// smoke_mtp_integrated_decode [n_gen] [seed_token_id] [cuda_gpu] + +#include "dflash27b.h" +#include "internal.h" + +#include "ggml.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "ggml-cuda.h" + +#include +#include +#include +#include +#include + +using namespace dflash27b; + +static bool run_integrated_step(const TargetWeights & w, + TargetCache & target_cache, + TargetMtpCache & mtp_cache, + ggml_backend_t backend, + int32_t token, + int kv_start, + int32_t & target_next, + int32_t & mtp_next) { + ggml_init_params ip{}; + ip.mem_size = 768 * 1024 * 1024; + ip.mem_buffer = nullptr; + ip.no_alloc = true; + ggml_context * ctx = ggml_init(ip); + if (!ctx) { + std::fprintf(stderr, "ggml_init graph failed\n"); + return false; + } + + const int n_tokens = 1; + ggml_tensor * inp_embed = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w.n_embd, n_tokens, 1); + ggml_tensor * positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4 * n_tokens); + ggml_set_name(inp_embed, "inp_embed"); + ggml_set_name(positions, "positions"); + ggml_set_input(inp_embed); + ggml_set_input(positions); + + ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false); + + QwenGraphInputs target_in{}; + target_in.inp_embed = inp_embed; + target_in.positions = positions; + target_in.n_tokens = n_tokens; + target_in.kv_start = kv_start; + target_in.expose_pre_norm_hidden = true; + + QwenGraphOutputs target_out = build_qwen35_graph(ctx, gf, w, target_cache, target_in); + if (!target_out.logits || !target_out.pre_norm_hidden) { + std::fprintf(stderr, "build_qwen35_graph failed: %s\n", dflash27b_last_error()); + ggml_free(ctx); + return false; + } + + QwenMtpGraphInputs mtp_in{}; + mtp_in.token_embed = ggml_reshape_2d(ctx, inp_embed, w.n_embd, n_tokens); + mtp_in.pre_norm_hidden = target_out.pre_norm_hidden; + mtp_in.positions = positions; + mtp_in.n_tokens = n_tokens; + mtp_in.kv_start = kv_start; + + QwenMtpGraphOutputs mtp_out = build_qwen35_mtp_graph(ctx, gf, w, mtp_cache, mtp_in); + if (!mtp_out.logits) { + std::fprintf(stderr, "build_qwen35_mtp_graph failed: %s\n", dflash27b_last_error()); + ggml_free(ctx); + return false; + } + + ggml_tensor * target_argmax = ggml_argmax(ctx, target_out.logits); + ggml_set_name(target_argmax, "target_argmax"); + ggml_set_output(target_argmax); + ggml_build_forward_expand(gf, target_argmax); + + ggml_tensor * mtp_argmax = ggml_argmax(ctx, mtp_out.logits); + ggml_set_name(mtp_argmax, "mtp_argmax"); + ggml_set_output(mtp_argmax); + ggml_build_forward_expand(gf, mtp_argmax); + + ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); + if (!ggml_gallocr_alloc_graph(alloc, gf)) { + std::fprintf(stderr, "ggml_gallocr_alloc_graph failed\n"); + ggml_gallocr_free(alloc); + ggml_free(ctx); + return false; + } + + std::vector embed_buf((size_t)w.n_embd); + if (!w.embedder.embed(&token, 1, embed_buf.data())) { + std::fprintf(stderr, "cpu embedder failed for token %d\n", (int)token); + ggml_gallocr_free(alloc); + ggml_free(ctx); + return false; + } + int32_t pos4[4] = { kv_start, kv_start, kv_start, kv_start }; + ggml_backend_tensor_set(inp_embed, embed_buf.data(), 0, sizeof(float) * embed_buf.size()); + ggml_backend_tensor_set(positions, pos4, 0, sizeof(pos4)); + + auto status = ggml_backend_graph_compute(backend, gf); + if (status != GGML_STATUS_SUCCESS) { + std::fprintf(stderr, "compute failed: %d\n", (int)status); + ggml_gallocr_free(alloc); + ggml_free(ctx); + return false; + } + + 
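+    // Each argmax tensor holds a single i32 token id; copy both back to the host.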
ggml_backend_tensor_get(target_argmax, &target_next, 0, sizeof(target_next)); + ggml_backend_tensor_get(mtp_argmax, &mtp_next, 0, sizeof(mtp_next)); + + ggml_gallocr_free(alloc); + ggml_free(ctx); + return true; +} + +int main(int argc, char ** argv) { + if (argc < 2) { + std::fprintf(stderr, "usage: %s [n_gen] [seed_token_id] [cuda_gpu]\n", argv[0]); + return 2; + } + const int n_gen = argc >= 3 ? std::max(1, std::atoi(argv[2])) : 8; + int32_t last_tok = argc >= 4 ? (int32_t)std::atoi(argv[3]) : 1; + const int gpu = argc >= 5 ? std::atoi(argv[4]) : 0; + + ggml_backend_t backend = ggml_backend_cuda_init(gpu); + if (!backend) { + std::fprintf(stderr, "cuda init failed\n"); + return 1; + } + + TargetWeights w; + if (!load_target_gguf(argv[1], backend, w)) { + std::fprintf(stderr, "load_target_gguf: %s\n", dflash27b_last_error()); + ggml_backend_free(backend); + return 1; + } + std::printf("[target] %s\n", dflash27b_last_error()); + + TargetCache target_cache; + if (!create_target_cache(w, /*max_ctx=*/std::max(64, n_gen + 8), + /*max_verify_tokens=*/0, backend, target_cache)) { + std::fprintf(stderr, "create_target_cache: %s\n", dflash27b_last_error()); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + + TargetMtpCache mtp_cache; + if (!create_target_mtp_cache(w, /*max_ctx=*/std::max(64, n_gen + 8), + backend, mtp_cache)) { + std::fprintf(stderr, "create_target_mtp_cache: %s\n", dflash27b_last_error()); + free_target_cache(target_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + + std::vector generated; + generated.reserve((size_t)n_gen); + int draft_n = 0; + int accepted = 0; + int corrected = 0; + + auto t0 = std::chrono::steady_clock::now(); + for (int pos = 0; pos < n_gen; pos++) { + int32_t target_next = -1; + int32_t mtp_next = -1; + if (!run_integrated_step(w, target_cache, mtp_cache, backend, + last_tok, pos, target_next, mtp_next)) { + free_target_mtp_cache(mtp_cache); + free_target_cache(target_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + + draft_n++; + const bool ok = (mtp_next == target_next); + if (ok) { + accepted++; + } else { + corrected++; + } + const int32_t chosen = ok ? mtp_next : target_next; + generated.push_back(chosen); + target_cache.cur_pos = pos + 1; + target_cache.last_tok = chosen; + mtp_cache.cur_pos = pos + 1; + + std::printf("[mtp-decode step=%d] input=%d mtp=%d target=%d %s chosen=%d\n", + pos, (int)last_tok, (int)mtp_next, (int)target_next, + ok ? "ACCEPT" : "CORRECT", (int)chosen); + last_tok = chosen; + } + auto t1 = std::chrono::steady_clock::now(); + const double seconds = std::chrono::duration(t1 - t0).count(); + + std::printf("[mtp-decode] generated=%d draft_n=%d accepted=%d corrected=%d acceptance=%.1f%% tok/s=%.2f\n", + n_gen, draft_n, accepted, corrected, + draft_n > 0 ? 100.0 * accepted / draft_n : 0.0, + n_gen / std::max(1e-9, seconds)); + std::printf("[mtp-decode ids]"); + for (int32_t t : generated) std::printf(" %d", (int)t); + std::printf("\nOK\n"); + + free_target_mtp_cache(mtp_cache); + free_target_cache(target_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 0; +} diff --git a/dflash/test/smoke_target_mtp_handoff.cpp b/dflash/test/smoke_target_mtp_handoff.cpp new file mode 100644 index 00000000..81ddfc33 --- /dev/null +++ b/dflash/test/smoke_target_mtp_handoff.cpp @@ -0,0 +1,199 @@ +// Smoke test for the DFlash target -> native MTP handoff. 
+// +// Builds one graph containing: +// target forward with expose_pre_norm_hidden=true +// MTP/NextN forward fed by that target_pre_norm_hidden tensor +// +// This validates the C++ tensor handoff required by the real speculative loop. +// +// Usage: smoke_target_mtp_handoff [cuda_gpu] + +#include "dflash27b.h" +#include "internal.h" + +#include "ggml.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "ggml-cuda.h" + +#include +#include +#include +#include +#include +#include + +using namespace dflash27b; + +static int check_logits(ggml_tensor * logits, const char * label) { + const int64_t vocab = logits->ne[0]; + std::vector buf((size_t)vocab); + ggml_backend_tensor_get(logits, buf.data(), 0, sizeof(float) * buf.size()); + int n_nan = 0, n_inf = 0; + float vmin = 1e30f, vmax = -1e30f; + for (float v : buf) { + if (std::isnan(v)) n_nan++; + else if (std::isinf(v)) n_inf++; + else { + vmin = std::min(vmin, v); + vmax = std::max(vmax, v); + } + } + std::printf("[%s] vocab=%lld nan=%d inf=%d min=%.4g max=%.4g\n", + label, (long long)vocab, n_nan, n_inf, vmin, vmax); + return (n_nan == 0 && n_inf == 0) ? 0 : 1; +} + +int main(int argc, char ** argv) { + if (argc < 2) { + std::fprintf(stderr, "usage: %s \n", argv[0]); + return 2; + } + + int gpu = 0; + if (argc >= 3) gpu = std::atoi(argv[2]); + + ggml_backend_t backend = ggml_backend_cuda_init(gpu); + if (!backend) { + std::fprintf(stderr, "cuda init failed\n"); + return 1; + } + + TargetWeights w; + if (!load_target_gguf(argv[1], backend, w)) { + std::fprintf(stderr, "load_target_gguf: %s\n", dflash27b_last_error()); + ggml_backend_free(backend); + return 1; + } + std::printf("[target] %s\n", dflash27b_last_error()); + + TargetCache target_cache; + if (!create_target_cache(w, /*max_ctx=*/64, /*max_verify_tokens=*/0, backend, target_cache)) { + std::fprintf(stderr, "create_target_cache: %s\n", dflash27b_last_error()); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + TargetMtpCache mtp_cache; + if (!create_target_mtp_cache(w, /*max_ctx=*/64, backend, mtp_cache)) { + std::fprintf(stderr, "create_target_mtp_cache: %s\n", dflash27b_last_error()); + free_target_cache(target_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + + ggml_init_params ip{}; + ip.mem_size = 768 * 1024 * 1024; + ip.mem_buffer = nullptr; + ip.no_alloc = true; + ggml_context * gctx = ggml_init(ip); + if (!gctx) { + std::fprintf(stderr, "ggml_init graph failed\n"); + free_target_mtp_cache(mtp_cache); + free_target_cache(target_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + + const int n_tokens = 1; + ggml_tensor * inp_embed = ggml_new_tensor_3d(gctx, GGML_TYPE_F32, w.n_embd, n_tokens, 1); + ggml_tensor * positions = ggml_new_tensor_1d(gctx, GGML_TYPE_I32, 4 * n_tokens); + ggml_set_name(inp_embed, "inp_embed"); + ggml_set_name(positions, "positions"); + ggml_set_input(inp_embed); + ggml_set_input(positions); + + ggml_cgraph * gf = ggml_new_graph_custom(gctx, 8192, false); + QwenGraphInputs target_in{}; + target_in.inp_embed = inp_embed; + target_in.positions = positions; + target_in.n_tokens = n_tokens; + target_in.kv_start = 0; + target_in.expose_pre_norm_hidden = true; + + QwenGraphOutputs target_out = build_qwen35_graph(gctx, gf, w, target_cache, target_in); + if (!target_out.logits || !target_out.pre_norm_hidden) { + std::fprintf(stderr, "build_qwen35_graph did not expose target hidden: %s\n", dflash27b_last_error()); + ggml_free(gctx); + 
free_target_mtp_cache(mtp_cache); + free_target_cache(target_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + + QwenMtpGraphInputs mtp_in{}; + mtp_in.token_embed = ggml_reshape_2d(gctx, inp_embed, w.n_embd, n_tokens); + mtp_in.pre_norm_hidden = target_out.pre_norm_hidden; + mtp_in.positions = positions; + mtp_in.n_tokens = n_tokens; + mtp_in.kv_start = 0; + + QwenMtpGraphOutputs mtp_out = build_qwen35_mtp_graph(gctx, gf, w, mtp_cache, mtp_in); + if (!mtp_out.logits || !mtp_out.hidden) { + std::fprintf(stderr, "build_qwen35_mtp_graph failed: %s\n", dflash27b_last_error()); + ggml_free(gctx); + free_target_mtp_cache(mtp_cache); + free_target_cache(target_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + ggml_set_output(target_out.logits); + ggml_set_output(mtp_out.logits); + ggml_build_forward_expand(gf, mtp_out.logits); + std::printf("[graph] nodes=%d\n", ggml_graph_n_nodes(gf)); + + ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); + if (!ggml_gallocr_alloc_graph(alloc, gf)) { + std::fprintf(stderr, "ggml_gallocr_alloc_graph failed\n"); + ggml_free(gctx); + free_target_mtp_cache(mtp_cache); + free_target_cache(target_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + + int32_t tok_ids[1] = { 1 }; + std::vector embed_buf((size_t)w.n_embd * n_tokens); + if (!w.embedder.embed(tok_ids, n_tokens, embed_buf.data())) { + std::fprintf(stderr, "cpu embedder failed\n"); + ggml_gallocr_free(alloc); + ggml_free(gctx); + free_target_mtp_cache(mtp_cache); + free_target_cache(target_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + int32_t pos4[4] = { 0, 0, 0, 0 }; + ggml_backend_tensor_set(inp_embed, embed_buf.data(), 0, sizeof(float) * embed_buf.size()); + ggml_backend_tensor_set(positions, pos4, 0, sizeof(pos4)); + + auto status = ggml_backend_graph_compute(backend, gf); + if (status != GGML_STATUS_SUCCESS) { + std::fprintf(stderr, "compute failed: %d\n", (int)status); + ggml_gallocr_free(alloc); + ggml_free(gctx); + free_target_mtp_cache(mtp_cache); + free_target_cache(target_cache); + free_target_weights(w); + ggml_backend_free(backend); + return 1; + } + + const int bad_target = check_logits(target_out.logits, "target-logits"); + const int bad_mtp = check_logits(mtp_out.logits, "mtp-logits"); + + ggml_gallocr_free(alloc); + ggml_free(gctx); + free_target_mtp_cache(mtp_cache); + free_target_cache(target_cache); + free_target_weights(w); + ggml_backend_free(backend); + std::printf("OK\n"); + return (bad_target || bad_mtp) ? 
1 : 0; +} diff --git a/dflash/test/test_mtp_graph_contract.cpp b/dflash/test/test_mtp_graph_contract.cpp new file mode 100644 index 00000000..b67b8c83 --- /dev/null +++ b/dflash/test/test_mtp_graph_contract.cpp @@ -0,0 +1,108 @@ +#include "internal.h" + +#include "ggml.h" + +#include + +using namespace dflash27b; + +static ggml_tensor * tensor_1d(ggml_context * ctx, int n0, const char * name) { + ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n0); + ggml_set_name(t, name); + return t; +} + +static ggml_tensor * tensor_2d(ggml_context * ctx, int n0, int n1, const char * name) { + ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n0, n1); + ggml_set_name(t, name); + return t; +} + +int main() { + ggml_init_params ip{}; + ip.mem_size = 16 * 1024 * 1024; + ip.mem_buffer = nullptr; + ip.no_alloc = true; + + ggml_context * ctx = ggml_init(ip); + if (!ctx) { + std::fprintf(stderr, "ggml_init failed\n"); + return 1; + } + + TargetWeights w{}; + w.n_embd = 8; + w.n_ff = 16; + w.n_head = 2; + w.n_head_kv = 1; + w.n_embd_head_k = 4; + w.n_embd_head_v = 4; + w.rope_sections[0] = 1; + w.rope_sections[1] = 1; + w.rope_sections[2] = 0; + w.rope_sections[3] = 0; + w.out_norm = tensor_1d(ctx, w.n_embd, "output_norm.weight"); + w.output = tensor_2d(ctx, w.n_embd, 32, "output.weight"); + w.mtp_layers.assign(1, TargetMtpLayer{}); + + TargetMtpLayer & M = w.mtp_layers[0]; + TargetLayer & L = M.block; + M.gguf_layer_index = 64; + M.nextn.eh_proj = tensor_2d(ctx, 2 * w.n_embd, w.n_embd, "blk.64.nextn.eh_proj.weight"); + M.nextn.enorm = tensor_1d(ctx, w.n_embd, "blk.64.nextn.enorm.weight"); + M.nextn.hnorm = tensor_1d(ctx, w.n_embd, "blk.64.nextn.hnorm.weight"); + M.nextn.shared_head_norm = tensor_1d(ctx, w.n_embd, "blk.64.nextn.shared_head_norm.weight"); + + const int q_dim = w.n_head * w.n_embd_head_k; + const int kv_dim = w.n_head_kv * w.n_embd_head_k; + L.attn_norm = tensor_1d(ctx, w.n_embd, "blk.64.attn_norm.weight"); + L.attn_post_norm = tensor_1d(ctx, w.n_embd, "blk.64.post_attention_norm.weight"); + L.wq = tensor_2d(ctx, w.n_embd, 2 * q_dim, "blk.64.attn_q.weight"); + L.wk = tensor_2d(ctx, w.n_embd, kv_dim, "blk.64.attn_k.weight"); + L.wv = tensor_2d(ctx, w.n_embd, kv_dim, "blk.64.attn_v.weight"); + L.wo = tensor_2d(ctx, q_dim, w.n_embd, "blk.64.attn_output.weight"); + L.q_norm = tensor_1d(ctx, w.n_embd_head_k, "blk.64.attn_q_norm.weight"); + L.k_norm = tensor_1d(ctx, w.n_embd_head_k, "blk.64.attn_k_norm.weight"); + L.w_gate = tensor_2d(ctx, w.n_embd, w.n_ff, "blk.64.ffn_gate.weight"); + L.w_up = tensor_2d(ctx, w.n_embd, w.n_ff, "blk.64.ffn_up.weight"); + L.w_down = tensor_2d(ctx, w.n_ff, w.n_embd, "blk.64.ffn_down.weight"); + + TargetMtpCache cache{}; + cache.max_ctx = 8; + cache.kv_k_type = GGML_TYPE_F16; + cache.kv_v_type = GGML_TYPE_F16; + cache.attn_k.push_back(ggml_new_tensor_3d(ctx, GGML_TYPE_F16, w.n_embd_head_k, cache.max_ctx, w.n_head_kv)); + cache.attn_v.push_back(ggml_new_tensor_3d(ctx, GGML_TYPE_F16, w.n_embd_head_k, cache.max_ctx, w.n_head_kv)); + + const int n_tokens = 1; + ggml_tensor * token_embed = tensor_2d(ctx, w.n_embd, n_tokens, "mtp_token_embed"); + ggml_tensor * hidden = tensor_2d(ctx, w.n_embd, n_tokens, "target_pre_norm_hidden"); + ggml_tensor * positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4 * n_tokens); + ggml_set_name(positions, "positions"); + ggml_set_input(token_embed); + ggml_set_input(hidden); + ggml_set_input(positions); + + ggml_cgraph * gf = ggml_new_graph_custom(ctx, 1024, false); + QwenMtpGraphInputs in{}; + in.token_embed = token_embed; + 
in.pre_norm_hidden = hidden; + in.positions = positions; + in.n_tokens = n_tokens; + in.kv_start = 0; + + QwenMtpGraphOutputs out = build_qwen35_mtp_graph(ctx, gf, w, cache, in); + if (!out.logits || !out.hidden) { + std::fprintf(stderr, "build_qwen35_mtp_graph failed: %s\n", dflash27b_last_error()); + ggml_free(ctx); + return 1; + } + + std::printf("[mtp-graph-contract] nodes=%d logits=[%lld,%lld] hidden=[%lld,%lld]\n", + ggml_graph_n_nodes(gf), + (long long)out.logits->ne[0], (long long)out.logits->ne[1], + (long long)out.hidden->ne[0], (long long)out.hidden->ne[1]); + + ggml_free(ctx); + return 0; +}