refactor(mtp): GPU-resident req_to_accept_len + simplify verify-decode plumbing

sufubao · sufubao · commit 0d15236fac5d · 2026-06-16T14:58:41.000+08:00
- is_mtp_verify: drop the redundant `b_num_accepted_tokens is not None` clause
  (post grouped-revert it's implied by mtp_step>0 ∧ ¬prefill).
- Replace the per-step host round-trip for b_num_accepted_tokens with a
  GPU-resident ReqManager.req_to_accept_len: a triton scatter_mtp_accept_len
  after verify + a GDN-only gather in init_mtp_verify_extra_state. Removes the
  gen_from_list H2D rebuild, the phase-2 req.mtp_accept_len writeback, and the
  host attr (linear-att offload + resets now read/write the buffer).
- Drop the redundant `if mtp_step>0` guard inside decode_mtp/decode_overlap_mtp.
- config_objs: inline the mtp draft-layer count, dropping the _mtp_added_layer_num
  helper (get_added_mtp_kv_layer_num is kept in envs_utils, with its logic inlined).
- cpu_cache_meta: don't bump layer_num for linear-att models (the draft full-att
  slots are already in LinearAttCacheConfig.get_cpu_cache_big_page_bytes()).

Static checks pass (ast, flake8). The req_to_accept_len refactor is not yet
runtime-verified; pending a hybrid GSM8K + cudagraph-ON parity run.
diff --git a/.gitignore b/.gitignore
@@ -10,3 +10,4 @@ requirements-musa.txt
 logs/
 
 /benchmark/
+artifacts/
diff --git a/lightllm/common/basemodel/basemodel.py b/lightllm/common/basemodel/basemodel.py
@@ -26,11 +26,7 @@
 from lightllm.common.basemodel.triton_kernel.gather_token_id import gather_token, gather_token_prefill_decode_mixed
 from lightllm.utils.log_utils import init_logger
 from lightllm.utils.dist_utils import get_dp_world_size
-from lightllm.utils.envs_utils import (
-    get_env_start_args,
-    get_llm_data_type,
-    get_added_mtp_kv_layer_num,
-)
+from lightllm.utils.envs_utils import get_env_start_args, get_llm_data_type, get_added_mtp_kv_layer_num
 from lightllm.distributed.communication_op import dist_group_manager
 from lightllm.common.basemodel.batch_objs import ModelInput, ModelOutput
 from lightllm.common.triton_utils.autotuner import AutotuneLevel
@@ -381,105 +377,36 @@ def _create_padded_decode_model_input(self, model_input: ModelInput, new_batch_s
         is_mtp_grouped_decode = (not model_input.is_prefill) and self.args.mtp_step > 0
         if is_mtp_grouped_decode:
             mtp_size = self.args.mtp_step + 1
-            assert model_input.batch_size % mtp_size == 0
-            assert new_batch_size % mtp_size == 0
             assert padded_batch_size % mtp_size == 0
             padded_req_num = padded_batch_size // mtp_size
-
-            pad_mtp_index = torch.arange(
-                mtp_size,
-                dtype=new_model_input.b_mtp_index.dtype,
-                device=new_model_input.b_mtp_index.device,
-            ).repeat(padded_req_num)
-            pad_seq_len = torch.arange(
-                2,
-                mtp_size + 2,
-                dtype=new_model_input.b_seq_len.dtype,
-                device=new_model_input.b_seq_len.device,
-            ).repeat(padded_req_num)
             new_model_input.total_token_num += padded_req_num * (mtp_size * (mtp_size + 3) // 2)
             new_model_input.max_kv_seq_len = max(mtp_size + 1, model_input.max_kv_seq_len)
-            new_model_input.input_ids = torch.cat(
-                (
-                    new_model_input.input_ids,
-                    torch.ones(
-                        padded_batch_size,
-                        dtype=new_model_input.input_ids.dtype,
-                        device=new_model_input.input_ids.device,
-                    ),
-                ),
-                dim=0,
-            )
-            new_model_input.b_req_idx = torch.cat(
-                (
-                    new_model_input.b_req_idx,
-                    torch.full(
-                        (padded_batch_size,),
-                        self.req_manager.HOLD_REQUEST_ID,
-                        dtype=new_model_input.b_req_idx.dtype,
-                        device=new_model_input.b_req_idx.device,
-                    ),
-                ),
-                dim=0,
-            )
-            new_model_input.b_mtp_index = torch.cat((new_model_input.b_mtp_index, pad_mtp_index), dim=0)
+            pad_seq_len = torch.arange(
+                2, mtp_size + 2, dtype=new_model_input.b_seq_len.dtype, device=new_model_input.b_seq_len.device
+            ).repeat(padded_req_num)
             new_model_input.b_seq_len = torch.cat((new_model_input.b_seq_len, pad_seq_len), dim=0)
-            new_model_input.mem_indexes = torch.cat(
-                (
-                    new_model_input.mem_indexes,
-                    torch.full(
-                        (padded_batch_size,),
-                        self.mem_manager.HOLD_TOKEN_MEMINDEX,
-                        dtype=new_model_input.mem_indexes.dtype,
-                        device=new_model_input.mem_indexes.device,
-                    ),
-                ),
-                dim=0,
-            )
-            new_model_input.b_num_accepted_tokens = torch.cat(
-                (
-                    new_model_input.b_num_accepted_tokens,
-                    torch.ones(
-                        padded_req_num,
-                        dtype=new_model_input.b_num_accepted_tokens.dtype,
-                        device=new_model_input.b_num_accepted_tokens.device,
-                    ),
-                ),
-                dim=0,
-            )
+            # b_num_accepted_tokens 不再随 model_input 流转/补齐：它在 GDN 的 init_mtp_verify_extra_state
+            # 里按 req_first 从 req_to_accept_len gather，padding 组 req_first=HOLD（槽恒为 1）自然得 1。
         else:
             new_model_input.total_token_num += padded_batch_size * 2
             new_model_input.max_kv_seq_len = max(2, model_input.max_kv_seq_len)
-            new_model_input.input_ids = F.pad(
-                new_model_input.input_ids,
-                (0, padded_batch_size),
-                mode="constant",
-                value=1,
-            )
-            new_model_input.b_req_idx = F.pad(
-                new_model_input.b_req_idx,
-                (0, padded_batch_size),
-                mode="constant",
-                value=self.req_manager.HOLD_REQUEST_ID,
-            )
-            new_model_input.b_mtp_index = F.pad(
-                new_model_input.b_mtp_index,
-                (0, padded_batch_size),
-                mode="constant",
-                value=0,
-            )
             new_model_input.b_seq_len = F.pad(
-                new_model_input.b_seq_len,
-                (0, padded_batch_size),
-                mode="constant",
-                value=2,
-            )
-            new_model_input.mem_indexes = F.pad(
-                new_model_input.mem_indexes,
-                (0, padded_batch_size),
-                mode="constant",
-                value=self.mem_manager.HOLD_TOKEN_MEMINDEX,
+                new_model_input.b_seq_len, (0, padded_batch_size), mode="constant", value=2
             )
+
+        new_model_input.input_ids = F.pad(new_model_input.input_ids, (0, padded_batch_size), mode="constant", value=1)
+        new_model_input.b_req_idx = F.pad(
+            new_model_input.b_req_idx, (0, padded_batch_size), mode="constant", value=self.req_manager.HOLD_REQUEST_ID
+        )
+        new_model_input.b_mtp_index = F.pad(
+            new_model_input.b_mtp_index, (0, padded_batch_size), mode="constant", value=0
+        )
+        new_model_input.mem_indexes = F.pad(
+            new_model_input.mem_indexes,
+            (0, padded_batch_size),
+            mode="constant",
+            value=self.mem_manager.HOLD_TOKEN_MEMINDEX,
+        )
         new_model_input.multimodal_params = new_model_input.multimodal_params + [
             {"images": [], "audios": []} for _ in range(padded_batch_size)
         ]
@@ -698,6 +625,7 @@ def _decode(
 
     @final
     def _context_forward(self, infer_state: InferStateInfo):
+
         input_embs = self.pre_infer.context_forward(infer_state.input_ids, infer_state, self.pre_post_weight)
         if self.args.enable_dp_prefill_balance:
             assert not self.args.enable_prefill_cudagraph, "not support now"
diff --git a/lightllm/common/basemodel/cuda_graph.py b/lightllm/common/basemodel/cuda_graph.py
@@ -102,7 +102,8 @@ def _build_warmup_decode_model_input(
             real_batch_size = batch_size // mtp_size
             b_mtp_index = torch.arange(mtp_size, dtype=torch.int32, device=device).repeat(real_batch_size)
             b_seq_len = torch.arange(2, mtp_size + 2, dtype=torch.int32, device=device).repeat(real_batch_size)
-            b_num_accepted_tokens = torch.ones(real_batch_size, dtype=torch.int32, device=device)
+            # b_num_accepted_tokens 不再随 model_input 传入：GDN 的 init_mtp_verify_extra_state 会按
+            # req_first(全 HOLD，槽恒为 1) gather，warmup/capture 自然得到全 1，等价旧的 torch.ones。
             total_token_num = real_batch_size * (mtp_size * (mtp_size + 3) // 2)
         else:
             seq_len = 2
diff --git a/lightllm/common/basemodel/mtp_verify_extra_state.py b/lightllm/common/basemodel/mtp_verify_extra_state.py
@@ -3,29 +3,14 @@
 from lightllm.utils.envs_utils import get_env_start_args
 
 
-def init_mtp_verify_extra_state(self):
-    """Shared MTP-verify decode metadata, used by qwen3_5 and qwen3next infer-struct classes (#12).
-    Call AFTER super().init_some_extra_state(model). `self` is the InferStateInfo instance."""
+def init_mtp_verify_extra_state(self, model):
     self.b_att_seq_len = self.b_seq_len
     mtp_step = get_env_start_args().mtp_step
     self.b_buffer_idx = self.b_req_idx * (mtp_step + 1) + self.b_mtp_index
-    # conv buffer is now ONE widened slot per request (indexed by req_idx),
-    # dropping the *(S+1) + mtp_index addressing used by the SSM block.
     self.b_conv_buffer_idx = self.b_req_idx
-    # MTP verify batch: decode-mode, S+1 expanded, and gated on the
-    # per-real-request accept tensor that decode_mtp threads in. Gating on
-    # b_num_accepted_tokens (vs only b_mtp_index, which is set for any decode)
-    # distinguishes the main-model verify forward from draft/plain decode.
-    self.is_mtp_verify = (
-        (mtp_step > 0)
-        and (not self.is_prefill)
-        and (self.b_mtp_index is not None)
-        and (self.b_num_accepted_tokens is not None)
-    )
+    self.is_mtp_verify = (mtp_step > 0) and (not self.is_prefill) and (self.b_mtp_index is not None)
     self.b_gdn_verify_cu_seqlens = None
     self.b_ssm_index_rows = None
-    # b_num_accepted_tokens is threaded onto the infer_state from ModelInput by
-    # _create_inferstate (mirrors b_mtp_index) BEFORE this runs; nothing to do here.
     if self.is_mtp_verify:
         step = mtp_step + 1
         n_real = self.b_req_idx.shape[0] // step
@@ -36,12 +21,6 @@ def init_mtp_verify_extra_state(self):
         base = (req_first * step).view(n_real, 1)
         self.b_ssm_index_rows = base + torch.arange(step, device=base.device, dtype=base.dtype).view(1, step)
         assert self.b_ssm_index_rows.shape == (n_real, step)
-        # The spec conv kernel is per-SEQUENCE (one program per real request),
-        # indexed by conv_state_indices[idx_seq] with idx_seq in [0, n_real),
-        # aligned 1:1 with b_gdn_verify_cu_seqlens / b_num_accepted_tokens. The
-        # default b_conv_buffer_idx = b_req_idx has the expanded length n_real*step,
-        # which launches n_real*step conv programs and reads num_accepted/
-        # query_start_loc out of bounds for idx_seq >= n_real, corrupting the
-        # committed conv slot. Narrow it to one widened conv slot per request.
         self.b_conv_buffer_idx = req_first
+        self.b_num_accepted_tokens = model.req_manager.req_to_accept_len[req_first]
     return
diff --git a/lightllm/common/basemodel/triton_kernel/mtp_utils.py b/lightllm/common/basemodel/triton_kernel/mtp_utils.py
@@ -148,6 +148,51 @@ def mtp_scatter_next_token_ids(
     )
 
 
+@triton.jit
+def _fwd_kernel_scatter_accept_len(
+    req_to_accept_len,
+    b_req_mtp_start_loc,
+    b_req_idx,
+    mtp_accept_len,
+):
+    cur_index = tl.program_id(0)
+    req_start_loc = tl.load(b_req_mtp_start_loc + cur_index)
+    cur_req_idx = tl.load(b_req_idx + req_start_loc)
+    accept_len = tl.load(mtp_accept_len + cur_index)
+    tl.store(req_to_accept_len + cur_req_idx, accept_len)
+    return
+
+
+def scatter_mtp_accept_len(
+    req_to_accept_len: torch.Tensor,
+    b_req_mtp_start_loc: torch.Tensor,
+    b_req_idx: torch.Tensor,
+    mtp_accept_len: torch.Tensor,
+):
+    """
+    将本步每个真实请求(组首)的 accept 数量写入 GPU 常驻的 req_to_accept_len[req_idx]。
+    融合 `req_to_accept_len[b_req_idx[b_req_mtp_start_loc]] = mtp_accept_len` 的 gather+scatter
+    为单次 launch、无中间张量。每个 program 处理一个真实请求。
+    Args:
+        req_to_accept_len: (max_req_num + 1,)
+        b_req_mtp_start_loc: (num_reqs,)  每组首行在 batch 中的偏移
+        b_req_idx: (batch_size,)          grouped 布局的 req_idx（组首即该请求的 req_idx）
+        mtp_accept_len: (num_reqs,)
+    """
+    num_reqs = mtp_accept_len.shape[0]
+    if num_reqs == 0:
+        return
+    grid = (num_reqs,)
+    _fwd_kernel_scatter_accept_len[grid](
+        req_to_accept_len=req_to_accept_len,
+        b_req_mtp_start_loc=b_req_mtp_start_loc,
+        b_req_idx=b_req_idx,
+        mtp_accept_len=mtp_accept_len,
+        num_warps=1,
+        num_stages=1,
+    )
+
+
 def test_mtp_verify():
     req_to_next_token_ids = torch.tensor(
         [[1, 2, -2, -1, -1], [1, 2, 0, -1, -1], [1, 3, 4, 4, 5]], dtype=torch.int32, device="cuda"
diff --git a/lightllm/common/linear_att_cache_manager/config_objs.py b/lightllm/common/linear_att_cache_manager/config_objs.py
@@ -1,16 +1,21 @@
 import torch
 import dataclasses
 import triton
-from lightllm.utils.envs_utils import get_env_start_args, _mtp_added_layer_num
+from lightllm.utils.envs_utils import get_env_start_args
 from lightllm.utils.log_utils import init_logger
 from lightllm.utils.torch_dtype_utils import get_torch_dtype
 
 logger = init_logger(__name__)
 
 
 def get_mtp_draft_full_att_layer_num(args) -> int:
-    # Delegates to the single source of truth in envs_utils (#9).
-    return _mtp_added_layer_num(getattr(args, "mtp_mode", None), getattr(args, "mtp_step", 0))
+    # mtp_mode -> draft model 增加的 full-att KV 层数（与 envs_utils.get_added_mtp_kv_layer_num 同口径）。
+    mtp_mode = getattr(args, "mtp_mode", None)
+    if mtp_mode == "eagle_with_att":
+        return 1
+    if mtp_mode == "vanilla_with_att":
+        return getattr(args, "mtp_step", 0)
+    return 0
 
 
 @dataclasses.dataclass
diff --git a/lightllm/common/req_manager.py b/lightllm/common/req_manager.py
@@ -86,6 +86,15 @@ def __init__(self, max_request_num, max_sequence_length, mem_manager: MemoryMana
         self.req_sampling_params_manager = ReqSamplingParamsManager(max_request_num)
         self.max_request_num = max_request_num
         self.HOLD_REQUEST_ID = max_request_num
+        # MTP verify decode 的 per-req accept 数量：GPU 常驻、按 req_idx 索引（含 HOLD 槽）。
+        # 取代旧的 req.mtp_accept_len host 属性 —— verify 后在 GPU 上 scatter，下一步在 GDN 的
+        # init_mtp_verify_extra_state 里按 req_first gather 成 b_num_accepted_tokens，省掉每步的
+        # host 回写 + H2D 重建。HOLD 槽恒为 1，使 padding 组 gather 到 1。仅 mtp_step>0 时分配。
+        self.req_to_accept_len = (
+            torch.ones((max_request_num + 1,), dtype=torch.int32, device="cuda")
+            if get_env_start_args().mtp_step > 0
+            else None
+        )
 
     def alloc(self):
         return self.req_list.alloc()
@@ -274,7 +283,8 @@ def init_linear_att_state(self, req: "InferReq"):
         # #17: zero the FULL (mtp_step + 1)-row SSM block, not just canonical row +0, so a future
         # first-step verify reading offset>0 after fresh init never hits a never-written row (NaN).
         self.req_to_ssm_state.buffer[:, ssm_start : ssm_start + (self.mtp_step + 1), ...].fill_(0)
-        req.mtp_accept_len = 1
+        if self.req_to_accept_len is not None:
+            self.req_to_accept_len[req.req_idx] = 1
         return
 
     def get_mamba_cache(self, layer_idx_in_all: int):
@@ -298,7 +308,8 @@ def copy_big_page_buffer_to_linear_att_state(self, big_page_buffer_idx: int, req
         narrow_w = conv_state.shape[-1]  # persisted (narrow) width
         self.req_to_conv_state.buffer[:, conv_dest, ..., :narrow_w] = conv_state
         self.req_to_ssm_state.buffer[:, ssm_dest, ...] = ssm_state
-        req.mtp_accept_len = 1
+        if self.req_to_accept_len is not None:
+            self.req_to_accept_len[req.req_idx] = 1
         return
 
     def copy_small_page_buffer_to_linear_att_state(
@@ -314,5 +325,6 @@ def copy_small_page_buffer_to_linear_att_state(
         # 同时，非连续对象的拷贝，可能存在效率问题。
         self.req_to_conv_state.buffer[:, conv_dest, ..., :narrow_w] = conv_state
         self.req_to_ssm_state.buffer[:, ssm_dest, ...] = ssm_state
-        req.mtp_accept_len = 1
+        if self.req_to_accept_len is not None:
+            self.req_to_accept_len[req.req_idx] = 1
         return
diff --git a/lightllm/models/qwen3_5/infer_struct.py b/lightllm/models/qwen3_5/infer_struct.py
@@ -10,5 +10,5 @@ def init_some_extra_state(self, model):
         super().init_some_extra_state(model)
         from lightllm.common.basemodel.mtp_verify_extra_state import init_mtp_verify_extra_state
 
-        init_mtp_verify_extra_state(self)
+        init_mtp_verify_extra_state(self, model)
         return
diff --git a/lightllm/models/qwen3next/infer_struct.py b/lightllm/models/qwen3next/infer_struct.py
@@ -10,5 +10,5 @@ def init_some_extra_state(self, model):
         super().init_some_extra_state(model)
         from lightllm.common.basemodel.mtp_verify_extra_state import init_mtp_verify_extra_state
 
-        init_mtp_verify_extra_state(self)
+        init_mtp_verify_extra_state(self, model)
         return
diff --git a/lightllm/server/router/model_infer/infer_batch.py b/lightllm/server/router/model_infer/infer_batch.py
@@ -389,9 +389,11 @@ def copy_linear_att_state_to_cache_buffer(self, b_req_idx: torch.Tensor, reqs: L
 
             from lightllm.common.basemodel.triton_kernel.linear_att_copy import copy_linear_att_state_to_kv_buffer
 
-            b_num_accepted_tokens = torch.tensor(
-                [req.mtp_accept_len for req in reqs], dtype=torch.int32, requires_grad=False, device="cpu"
+            # accept 数量改由 GPU 常驻的 req_to_accept_len 按 req_idx gather（不再读 req.mtp_accept_len）。
+            req_idxs = torch.tensor(
+                [req.req_idx for req in reqs], dtype=torch.int32, requires_grad=False, device="cpu"
             ).cuda(non_blocking=True)
+            b_num_accepted_tokens = self.req_manager.req_to_accept_len[req_idxs]
 
             copy_linear_att_state_to_kv_buffer(
                 b_req_idx=b_req_idx,
@@ -417,11 +419,13 @@ def copy_linear_att_state_to_cache_buffer(self, b_req_idx: torch.Tensor, reqs: L
                         self.radix_cache.linear_att_small_page_buffers.alloc_one_state_cache()
                     )
                     if req.tail_linear_att_small_page_buffer_id is not None:
-                        assert 1 <= req.mtp_accept_len <= self.args.mtp_step + 1, (
-                            f"mtp_accept_len={req.mtp_accept_len} out of range "
+                        # 冷路径(prefill 跨小页边界)：单标量从 GPU buffer 读回做 Python 切片下标。
+                        accept_len = int(self.req_manager.req_to_accept_len[req.req_idx].item())
+                        assert 1 <= accept_len <= self.args.mtp_step + 1, (
+                            f"mtp_accept_len={accept_len} out of range "
                             f"[1, {self.args.mtp_step + 1}]; would slice past the widened conv slot"
                         )
-                        canonical_off = req.mtp_accept_len - 1
+                        canonical_off = accept_len - 1
                         conv_src_idx = req.req_idx
                         ssm_src_idx = req.req_idx * (self.args.mtp_step + 1) + canonical_off
                         narrow_w = self.req_manager.linear_config.get_persisted_conv_state_shape()[-1]
@@ -578,8 +582,6 @@ def __init__(
         else:
             self.decode_need_token_num = self._normal_decode_need_token_num
 
-        self.mtp_accept_len: int = 1
-
         if g_infer_context.is_linear_att_mixed_model:
             self.get_chuncked_input_token_len = self.get_chuncked_input_token_len_for_linear_att
             self.get_chuncked_input_token_ids = self.get_chuncked_input_token_ids_for_linear_att
diff --git a/lightllm/server/router/model_infer/mode_backend/base_backend.py b/lightllm/server/router/model_infer/mode_backend/base_backend.py
diff --git a/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py b/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py
diff --git a/lightllm/server/router/model_infer/mode_backend/dp_backend/impl.py b/lightllm/server/router/model_infer/mode_backend/dp_backend/impl.py
diff --git a/lightllm/utils/envs_utils.py b/lightllm/utils/envs_utils.py
diff --git a/lightllm/utils/kv_cache_utils.py b/lightllm/utils/kv_cache_utils.py

Original file line number	Diff line number	Diff line change
`@@ -10,3 +10,4 @@ requirements-musa.txt`
`10`	`10`	`logs/`
`11`	`11`
`12`	`12`	`/benchmark/`
	`13`	`+artifacts/`