Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions dflash/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,8 @@ add_library(dflash27b STATIC
src/laguna_target_graph.cpp
src/laguna_daemon.cpp
src/sampler.cpp
# Native MTP / NextN helpers
src/f16_convert.cu
)
# FlashPrefill custom CUDA kernels need BF16 WMMA (sm_80+). On Turing (sm_75)
# the drafter uses ggml's flash_attn_ext instead. Guard added after SM check.
Expand Down Expand Up @@ -334,6 +336,29 @@ if(DFLASH27B_TESTS)
endif()
endif()

# Native MTP / NextN: contract test + functional smokes. The contract test
# uses synthetic tensors and runs in CI; the smokes need a real MTP GGUF.
# All four MTP executables share the same include path and link set, so
# register them from one list instead of four copy-pasted stanzas. Each is
# still conditional on its source existing, preserving the original behavior.
set(_dflash_mtp_tests
    test_mtp_graph_contract
    smoke_mtp_graph
    smoke_target_mtp_handoff
    smoke_mtp_integrated_decode
)
foreach(_t IN LISTS _dflash_mtp_tests)
    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/${_t}.cpp")
        add_executable(${_t} test/${_t}.cpp)
        target_include_directories(${_t} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
        target_link_libraries(${_t} PRIVATE dflash27b ggml ggml-cuda)
    endif()
endforeach()

# internal.h includes <cuda_runtime.h> when GGML_USE_CUDA is set; link
# CUDA::cudart so the toolkit headers are on the compile line (same as
# test_dflash historically had alone).
Expand All @@ -350,6 +375,10 @@ if(DFLASH27B_TESTS)
smoke_target_forward
test_generate
test_dflash
test_mtp_graph_contract
smoke_mtp_graph
smoke_target_mtp_handoff
smoke_mtp_integrated_decode
)
foreach(_t IN LISTS _dflash_internal_h_cuda_tests)
if(TARGET ${_t})
Expand Down
111 changes: 111 additions & 0 deletions dflash/docs/MTP_2026-05-11.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Native MTP (NextN) — runtime status, 2026-05-11

This document describes the native multi-token prediction (MTP / NextN)
runtime introduced into `dflash` in PR `feat(dflash): native Qwen3.6 MTP
integrated decode`. It tracks the contract, what already works, and what
remains for the next PR before MTP becomes a default-on decode mode.

## What this PR ships

- `dflash/src/f16_convert.cu` — small `f16/bf16 → f32` widen kernels used
by both the rollback path and the MTP token-embedding widen.
- `dflash/src/internal.h` — new types:
- `TargetNextN`, `TargetMtpLayer`
- `TargetMtpCache` (KV cache for the NextN tail block only)
- `QwenMtpGraphInputs`, `QwenMtpGraphOutputs`
- `expose_pre_norm_hidden` on `QwenGraphInputs`
- `pre_norm_hidden` on `QwenGraphOutputs`
- `TargetWeights::mtp_layers`, `nextn_predict_layers`, `gguf_block_count`,
`tok_embd_gpu` (no fields removed; the trunk API is preserved).
- `dflash/src/qwen35_target_graph.cpp` — four new functions:
- `create_target_mtp_cache` / `free_target_mtp_cache` / `reset_target_mtp_cache`
- `build_qwen35_mtp_graph` — RMSNorm(e) || RMSNorm(h) → `eh_proj` →
full-attention transformer block → SwiGLU FFN → shared head.
Also wires `expose_pre_norm_hidden` into `build_qwen35_graph`.
- `dflash/src/gguf_target_loader.cpp` — reads `qwen35.nextn_predict_layers`,
splits the GGUF blocks into trunk + MTP tail, loads `blk.<i>.nextn.*`
tensors into `TargetWeights::mtp_layers`, and uploads `token_embd.weight`
to the GPU when the checkpoint carries MTP (`DFLASH27B_UPLOAD_TOK_EMBD`
env var overrides).
- `dflash/test/test_mtp_graph_contract.cpp` — synthetic-tensor test that
asserts the MTP graph wires together correctly. No GPU model needed;
cheap to run in CI.
- `dflash/test/smoke_mtp_graph.cpp` — loads a real MTP GGUF, builds the
NextN graph for a single token, and validates the output is finite.
- `dflash/test/smoke_target_mtp_handoff.cpp` — loads a real MTP GGUF and
proves that the trunk pre-norm hidden tensor feeds directly into the
MTP block within the same `ggml_cgraph` (no CPU roundtrip required).
- `dflash/test/smoke_mtp_integrated_decode.cpp` — full integrated decode
loop: target greedy + MTP greedy in one graph, with per-step accept /
correct counters. This is the functional baseline the upcoming PR's
speculative loop will be built on top of.

## GGUF compatibility

The loader follows the tensor naming convention introduced by llama.cpp's
[MTP PR #22673](https://github.com/ggml-org/llama.cpp/pull/22673). It is
compatible with the reference Qwen3.6-MTP GGUFs published on the Hub:

- `am17an/Qwen3.6-27B-MTP-GGUF`
- `am17an/Qwen3.6-35BA3B-MTP-GGUF` (MoE — see "MoE limitation" below)
- `havenoammo/Qwen3.6-27B-MTP-UD-GGUF`
- `havenoammo/Qwen3.6-35B-A3B-MTP-GGUF`
- `froggeric/Qwen3.6-27B-MTP-GGUF`

The expected tail-block tensor names are:

```text
blk.<n_trunk>.nextn.eh_proj.weight [2 * hidden, hidden]
blk.<n_trunk>.nextn.embed_tokens.weight [hidden, vocab] (optional)
blk.<n_trunk>.nextn.enorm.weight [hidden]
blk.<n_trunk>.nextn.hnorm.weight [hidden]
blk.<n_trunk>.nextn.shared_head_head.weight [hidden, vocab] (optional)
blk.<n_trunk>.nextn.shared_head_norm.weight [hidden] (optional)
```

When the optional shared-head tensors are absent the runtime falls back to
the trunk's `output_norm` / `output` (lm_head), matching how am17an's
GGUFs are typically packed.

## MoE limitation

`build_qwen35_mtp_graph` currently implements the dense-FFN path only. The
35B-A3B MTP GGUFs require the MoE `TargetLayer` fields and the routed
FFN path that howard0su is upstreaming in
[PR #120 "Qwen3.5 MoE support"](https://github.com/Luce-Org/lucebox-hub/pull/120).
A MoE-aware `build_qwen35_mtp_graph` is a one-line dispatch on top of
this PR once #120 lands. Until then, loading a MoE-MTP GGUF + invoking
the MTP graph returns a clear error rather than producing wrong output.

## Why MTP is opt-in, not default-on

Measured today against `DFlash + PFlash` on the same MTP GGUF with MTP
disabled, on a single RTX 6000 Ada (sm_89), Qwen3.6-27B Q4_K_M target,
`q4_0/q4_0` KV, FA_WINDOW=0, DDTree budget=16, draft feature mirror on:

| n_gen | Same GGUF, MTP off (tok/s) | Same GGUF, MTP chain-2 (tok/s) | Δ |
|---:|---:|---:|---:|
| 64 | 57.58 | 54.72 | **−5.0%** |
| 128 | 67.58 | 64.23 | **−5.0%** |
| 256 | 60.40 | 82.18 | **+36.1%** |

What changes between 64 and 256 tokens is that DDTree rounds drop from
roughly 60 → 38 and average tokens committed per draft step rise from
4.27 → 6.74, so the extra MTP forward starts paying for itself.

This is real but workload-dependent acceleration, not a universal default.
The next PR adds the speculative loop that turns this into a default-on
mode for long generations; today's PR ships only the runtime contract and
the tests that pin it.

## Known follow-ups (next PR)

1. Speculative decode loop wiring (`run_mtp_integrated_prompt`,
target-batched verify, fast rollback) inside `test_dflash`.
2. Daemon-side `--mtp-integrated` CLI + metrics surface (`[mtp-daemon]`
line, `last_mtp` aggregated in `prefix_cache.py`).
3. `mtp_baseline_gate.py` published as a reusable parity gate harness.
4. CPU hidden-readback elimination — the current functional smoke still
round-trips token ids through CPU between MTP steps. Removing that is
the highest-value perf fix and is queued behind CUDA-graph capture.
5. MoE MTP path after PR #120 merges.
49 changes: 49 additions & 0 deletions dflash/src/f16_convert.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
// Tiny half-precision → f32 conversion kernels used by the DDTree rollback
// path and the drafter's target_feat widen. We store some tensors
// (ssm_intermediate, target_feat) at 16-bit to halve their memory footprint,
// and widen on read into f32 consumers.
//
// Exposes plain C entry points so test_dflash.cpp can call them without
// pulling in a CUDA compile unit of its own.

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cuda_bf16.h>

// One thread per element: widen a single __half to float. Threads past the
// end of the buffer (tail of the last block) exit without touching memory.
static __global__ void f16_to_f32_kernel(const __half * __restrict__ src,
                                         float * __restrict__ dst,
                                         size_t n_elems) {
    const size_t idx = threadIdx.x + (size_t)blockDim.x * blockIdx.x;
    if (idx >= n_elems) {
        return;
    }
    dst[idx] = __half2float(src[idx]);
}

// One thread per element: widen a single bf16 value to float. Threads past
// the end of the buffer (tail of the last block) exit without touching memory.
static __global__ void bf16_to_f32_kernel(const __nv_bfloat16 * __restrict__ src,
                                          float * __restrict__ dst,
                                          size_t n_elems) {
    const size_t idx = threadIdx.x + (size_t)blockDim.x * blockIdx.x;
    if (idx >= n_elems) {
        return;
    }
    dst[idx] = __bfloat162float(src[idx]);
}

// Asynchronously widen n_elems f16 values (device memory at src) into f32
// (device memory at dst) on the given stream. src/dst must not alias (the
// kernel parameters are __restrict__). No synchronization is performed here.
//
// A zero element count is a no-op: without the guard, blocks computes to 0
// and a <<<0, threads>>> launch fails with cudaErrorInvalidConfiguration.
extern "C" void dflash27b_launch_f16_to_f32(const void * src,
                                            void * dst,
                                            size_t n_elems,
                                            cudaStream_t stream) {
    if (n_elems == 0) {
        return; // avoid an invalid zero-grid launch
    }
    const int threads = 256;
    const int blocks  = (int)((n_elems + threads - 1) / threads);
    f16_to_f32_kernel<<<blocks, threads, 0, stream>>>(
        (const __half *)src, (float *)dst, n_elems);
}

// Asynchronously widen n_elems bf16 values (device memory at src) into f32
// (device memory at dst) on the given stream. src/dst must not alias (the
// kernel parameters are __restrict__). No synchronization is performed here.
//
// A zero element count is a no-op: without the guard, blocks computes to 0
// and a <<<0, threads>>> launch fails with cudaErrorInvalidConfiguration.
extern "C" void dflash27b_launch_bf16_to_f32(const void * src,
                                             void * dst,
                                             size_t n_elems,
                                             cudaStream_t stream) {
    if (n_elems == 0) {
        return; // avoid an invalid zero-grid launch
    }
    const int threads = 256;
    const int blocks  = (int)((n_elems + threads - 1) / threads);
    bf16_to_f32_kernel<<<blocks, threads, 0, stream>>>(
        (const __nv_bfloat16 *)src, (float *)dst, n_elems);
}
Loading