feat(qwen3_5_mtp): scheduler MTP verify backend + accept-len transport

sufubao · sufubao · commit c7783856e863 · 2026-06-04T19:17:16.000+08:00
Drive the draft/verify loop from the scheduler:

- carry a canonical InferReq.mtp_accept_len pointer and persist the
  per-request accept_len across steps; build per-req
  b_num_accepted_tokens in decode_mtp and commit it in phase 2 so the
  next step reads a fresh count.
- extend the chunked_prefill backend / base_backend with the MTP verify
  dispatch and the partial-accept read offset.
diff --git a/lightllm/server/router/model_infer/infer_batch.py b/lightllm/server/router/model_infer/infer_batch.py
@@ -357,6 +357,11 @@ def copy_linear_att_state_to_cache_buffer(self, b_req_idx: torch.Tensor, reqs: L
         if not self.is_linear_att_mixed_model:
             return
 
+        # With dynamic prompt cache disabled, radix_cache is None: there is no big/small page buffer to write.
+        # Linear-layer state then lives only in req_manager's GPU buffer, so skip the cross-request cache copy.
+        if self.radix_cache is None:
+            return
+
         # 大页对应的 linear att 的拷贝
         big_page_token_num = self.args.linear_att_hash_page_size * self.args.linear_att_page_block_num
         big_page_buffer_ids = []
@@ -377,6 +382,10 @@ def copy_linear_att_state_to_cache_buffer(self, b_req_idx: torch.Tensor, reqs: L
 
         from lightllm.common.basemodel.triton_kernel.linear_att_copy import copy_linear_att_state_to_kv_buffer
 
+        b_num_accepted_tokens = torch.tensor(
+            [req.mtp_accept_len for req in reqs], dtype=torch.int32, requires_grad=False, device="cpu"
+        ).cuda(non_blocking=True)
+
         copy_linear_att_state_to_kv_buffer(
             b_req_idx=b_req_idx,
             big_page_buffer_ids=big_page_buffer_ids,
@@ -385,6 +394,7 @@ def copy_linear_att_state_to_cache_buffer(self, b_req_idx: torch.Tensor, reqs: L
             cpu_kv_conv_state=self.radix_cache.linear_att_big_page_buffers.conv_state_cache.buffer,
             cpu_kv_ssm_state=self.radix_cache.linear_att_big_page_buffers.ssm_state_cache.buffer,
             mtp_step=self.args.mtp_step,
+            b_num_accepted_tokens=b_num_accepted_tokens,
         )
 
         assert not self.args.disable_chunked_prefill, "chunked prefill mode must be enabled for linear att mixed model"
@@ -400,9 +410,14 @@ def copy_linear_att_state_to_cache_buffer(self, b_req_idx: torch.Tensor, reqs: L
                         self.radix_cache.linear_att_small_page_buffers.alloc_one_state_cache()
                     )
                     if req.tail_linear_att_small_page_buffer_id is not None:
-                        src_buffer_idx = req.req_idx * (self.args.mtp_step + 1)
-                        gpu_conv_state = self.req_manager.req_to_conv_state.buffer[:, src_buffer_idx, ...]
-                        gpu_ssm_state = self.req_manager.req_to_ssm_state.buffer[:, src_buffer_idx, ...]
+                        canonical_off = req.mtp_accept_len - 1
+                        conv_src_idx = req.req_idx
+                        ssm_src_idx = req.req_idx * (self.args.mtp_step + 1) + canonical_off
+                        narrow_w = self.req_manager.linear_config.get_persisted_conv_state_shape()[-1]
+                        gpu_conv_state = self.req_manager.req_to_conv_state.buffer[
+                            :, conv_src_idx, ..., canonical_off : canonical_off + narrow_w
+                        ]
+                        gpu_ssm_state = self.req_manager.req_to_ssm_state.buffer[:, ssm_src_idx, ...]
                         dst_buffer_idx = req.tail_linear_att_small_page_buffer_id
 
                         dst_conv_state, dst_ssm_state = self.radix_cache.linear_att_small_page_buffers.get_state_cache(
@@ -558,6 +573,8 @@ def __init__(
         else:
             self.decode_need_token_num = self._normal_decode_need_token_num
 
+        self.mtp_accept_len: int = 1
+
         if g_infer_context.is_linear_att_mixed_model:
             self.get_chuncked_input_token_len = self.get_chuncked_input_token_len_for_linear_att
             self.get_chuncked_input_token_ids = self.get_chuncked_input_token_ids_for_linear_att
diff --git a/lightllm/server/router/model_infer/mode_backend/base_backend.py b/lightllm/server/router/model_infer/mode_backend/base_backend.py
@@ -357,6 +357,16 @@ def init_mtp_draft_model(self, main_kvargs: dict):
             elif mtp_model_cfg["model_type"] == "glm4_moe_lite":
                 assert self.args.mtp_mode in ["vanilla_with_att", "eagle_with_att"]
                 self.draft_models.append(Glm4MoeLiteMTPModel(mtp_model_kvargs))
+            elif model_type in ("qwen3_5", "qwen3_5_text"):
+                assert self.args.mtp_mode in ["vanilla_with_att", "eagle_with_att"]
+                from lightllm.models.qwen3_5_mtp.model import Qwen3_5MTPModel
+
+                self.draft_models.append(Qwen3_5MTPModel(mtp_model_kvargs))
+            elif model_type in ("qwen3_5_moe", "qwen3_5_moe_text"):
+                assert self.args.mtp_mode in ["vanilla_with_att", "eagle_with_att"]
+                from lightllm.models.qwen3_5_moe_mtp.model import Qwen3_5MoeMTPModel
+
+                self.draft_models.append(Qwen3_5MoeMTPModel(mtp_model_kvargs))
             else:
                 raise ValueError(f"Unsupported MTP model type: {model_type}")
 
@@ -602,7 +612,6 @@ def _get_classed_reqs(
         can_alloc_token_num = g_infer_context.get_can_alloc_token_num()
 
         for req_obj in ready_reqs:
-
             if req_obj.filter_mark:
                 finished_reqs.append(req_obj)
                 continue
@@ -783,11 +792,35 @@ def _verify_mtp_v2(
         )
         return mtp_accept_len, accepted_index
 
+    def _commit_mtp_accept_len(
+        self,
+        decode_reqs: List[InferReq],
+        mtp_accept_len_cpu: torch.Tensor,
+    ):
+        # Store this verify step's per-request accept length on
+        # req.mtp_accept_len so the NEXT step uses it as the canonical pointer
+        # (design §3.1). decode_reqs and mtp_accept_len_cpu are paired by index;
+        # NOTE(review): zip() stops at the shorter one -- confirm equal lengths.
+        # Must run on every rank, not only master: each rank's kernels read it.
+        #
+        # CRITICAL ordering (overlap scheduler): the next step's decode_mtp reads
+        # req.mtp_accept_len (to build b_num_accepted_tokens) as soon as its
+        # wait_to_forward() is released by THIS step's
+        # notify_forward_and_wait_post_handle() (start of phase 3), so commit in
+        # phase 2 (pre_post_handle), before that release; otherwise the next step
+        # reads a one-step-stale count. That is invisible while accept_len == 1 and
+        # corrupts the GDN conv/ssm committed-state read offset once accept_len >= 2.
+        for req, accept_len in zip(decode_reqs, mtp_accept_len_cpu):
+            req.mtp_accept_len = int(accept_len)
+        return
+
     def _update_mtp_accept_ratio(
         self,
         decode_reqs: List[InferReq],
         mtp_accept_len_cpu: torch.Tensor,
     ):
+        # Master-only accept-ratio statistics. Unlike _commit_mtp_accept_len this
+        # only feeds metrics, so it may stay in the phase-3 post_handle region.
         if self.is_master_in_dp:
             for req, accept_len in zip(decode_reqs, mtp_accept_len_cpu):
                 req.update_mtp_accepted_token_num(accept_token_num=accept_len - 1)
@@ -809,7 +842,6 @@ def _sample_and_scatter_token(
         b_prefill_has_output_cpu: torch.Tensor = None,
         mask_func: Optional[Callable] = None,
     ):
-
         if mask_func is not None:
             assert len(run_reqs) == logits.shape[0]
             mask_func(run_reqs, logits)
diff --git a/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py b/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py