ModelTC
diff --git a/lightllm/server/router/model_infer/infer_batch.py b/lightllm/server/router/model_infer/infer_batch.py
Lines changed: 24 additions & 3 deletions
diff --git a/lightllm/server/router/model_infer/mode_backend/base_backend.py b/lightllm/server/router/model_infer/mode_backend/base_backend.py
Lines changed: 66 additions & 23 deletions
diff --git a/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py b/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py
Lines changed: 46 additions & 30 deletions
@@ -361,6 +361,11 @@ def copy_linear_att_state_to_cache_buffer(self, b_req_idx: torch.Tensor, reqs: L
         if not self.is_linear_att_mixed_model:
             return
 
+        # 当 dynamic prompt cache 被禁用时 radix_cache 为 None，没有大页/小页缓冲可写，
+        # 线性层状态仅存于 req_manager 的 GPU buffer 即可，直接跳过跨请求缓存拷贝。
+        if self.radix_cache is None:
+            return
+
         # 大页对应的 linear att 的拷贝
         big_page_token_num = self.args.linear_att_hash_page_size * self.args.linear_att_page_block_num
         big_page_buffer_ids = []
@@ -384,6 +389,10 @@ def copy_linear_att_state_to_cache_buffer(self, b_req_idx: torch.Tensor, reqs: L
 
             from lightllm.common.basemodel.triton_kernel.linear_att_copy import copy_linear_att_state_to_kv_buffer
 
+            b_num_accepted_tokens = torch.tensor(
+                [req.mtp_accept_len for req in reqs], dtype=torch.int32, requires_grad=False, device="cpu"
+            ).cuda(non_blocking=True)
+
             copy_linear_att_state_to_kv_buffer(
                 b_req_idx=b_req_idx,
                 big_page_buffer_ids=big_page_buffer_ids,
@@ -392,6 +401,7 @@ def copy_linear_att_state_to_cache_buffer(self, b_req_idx: torch.Tensor, reqs: L
                 cpu_kv_conv_state=self.radix_cache.linear_att_big_page_buffers.conv_state_cache.buffer,
                 cpu_kv_ssm_state=self.radix_cache.linear_att_big_page_buffers.ssm_state_cache.buffer,
                 mtp_step=self.args.mtp_step,
+                b_num_accepted_tokens=b_num_accepted_tokens,
             )
 
         assert not self.args.disable_chunked_prefill, "chunked prefill mode must be enabled for linear att mixed model"
@@ -407,9 +417,18 @@ def copy_linear_att_state_to_cache_buffer(self, b_req_idx: torch.Tensor, reqs: L
                         self.radix_cache.linear_att_small_page_buffers.alloc_one_state_cache()
                     )
                     if req.tail_linear_att_small_page_buffer_id is not None:
-                        src_buffer_idx = req.req_idx * (self.args.mtp_step + 1)
-                        gpu_conv_state = self.req_manager.req_to_conv_state.buffer[:, src_buffer_idx, ...]
-                        gpu_ssm_state = self.req_manager.req_to_ssm_state.buffer[:, src_buffer_idx, ...]
+                        assert 1 <= req.mtp_accept_len <= self.args.mtp_step + 1, (
+                            f"mtp_accept_len={req.mtp_accept_len} out of range "
+                            f"[1, {self.args.mtp_step + 1}]; would slice past the widened conv slot"
+                        )
+                        canonical_off = req.mtp_accept_len - 1
+                        conv_src_idx = req.req_idx
+                        ssm_src_idx = req.req_idx * (self.args.mtp_step + 1) + canonical_off
+                        narrow_w = self.req_manager.linear_config.get_persisted_conv_state_shape()[-1]
+                        gpu_conv_state = self.req_manager.req_to_conv_state.buffer[
+                            :, conv_src_idx, ..., canonical_off : canonical_off + narrow_w
+                        ]
+                        gpu_ssm_state = self.req_manager.req_to_ssm_state.buffer[:, ssm_src_idx, ...]
                         dst_buffer_idx = req.tail_linear_att_small_page_buffer_id
 
                         dst_conv_state, dst_ssm_state = self.radix_cache.linear_att_small_page_buffers.get_state_cache(
@@ -559,6 +578,8 @@ def __init__(
         else:
             self.decode_need_token_num = self._normal_decode_need_token_num
 
+        self.mtp_accept_len: int = 1
+
         if g_infer_context.is_linear_att_mixed_model:
             self.get_chuncked_input_token_len = self.get_chuncked_input_token_len_for_linear_att
             self.get_chuncked_input_token_ids = self.get_chuncked_input_token_ids_for_linear_att
 
@@ -1,4 +1,5 @@
 import os
+import copy
 import numpy as np
 import torch
 import time
@@ -41,10 +42,6 @@
 )
 from lightllm.server.core.objs.shm_objs_io_buffer import ShmObjsIOBuffer
 from lightllm.server.router.model_infer.mode_backend.overlap_events import OverlapEventManager, OverlapEventPack
-from lightllm.models.deepseek_mtp.model import Deepseek3MTPModel
-from lightllm.models.qwen3_moe_mtp.model import Qwen3MOEMTPModel
-from lightllm.models.mistral_mtp.model import MistralMTPModel
-from lightllm.models.glm4_moe_lite_mtp.model import Glm4MoeLiteMTPModel
 from lightllm.server.router.model_infer.mode_backend.generic_post_process import sample
 from lightllm.common.basemodel.triton_kernel.gather_token_id import scatter_token
 from lightllm.server.pd_io_struct import PDChunckedTransTaskRet
@@ -328,22 +325,11 @@ def init_mtp_draft_model(self, main_kvargs: dict):
                 "mtp_previous_draft_models": self.draft_models.copy(),
             }
 
-            # Select MTP model class based on model type
+            # Select MTP model class based on model type (single source of truth: #10).
+            from lightllm.server.router.model_infer.mode_backend.mtp_model_factory import create_mtp_draft_model
+
             model_type = mtp_model_cfg.get("model_type", "")
-            if model_type == "deepseek_v3":
-                assert self.args.mtp_mode in ["vanilla_with_att", "eagle_with_att"]
-                self.draft_models.append(Deepseek3MTPModel(mtp_model_kvargs))
-            elif model_type == "qwen3_moe":
-                assert self.args.mtp_mode in ["vanilla_no_att", "eagle_no_att"]
-                self.draft_models.append(Qwen3MOEMTPModel(mtp_model_kvargs))
-            elif model_type == "mistral":
-                assert self.args.mtp_mode in ["vanilla_no_att", "eagle_no_att"]
-                self.draft_models.append(MistralMTPModel(mtp_model_kvargs))
-            elif mtp_model_cfg["model_type"] == "glm4_moe_lite":
-                assert self.args.mtp_mode in ["vanilla_with_att", "eagle_with_att"]
-                self.draft_models.append(Glm4MoeLiteMTPModel(mtp_model_kvargs))
-            else:
-                raise ValueError(f"Unsupported MTP model type: {model_type}")
+            self.draft_models.append(create_mtp_draft_model(model_type, self.args.mtp_mode, mtp_model_kvargs))
 
             self.logger.info(f"loaded mtp model class {self.draft_models[i].__class__}")
         return
@@ -584,7 +570,6 @@ def _get_classed_reqs(
         can_alloc_token_num = g_infer_context.get_can_alloc_token_num()
 
         for req_obj in ready_reqs:
-
             if req_obj.filter_mark:
                 finished_reqs.append(req_obj)
                 continue
@@ -761,20 +746,79 @@ def _verify_mtp_v2(
         )
         return mtp_accept_len, accepted_index
 
+    def _build_eagle_accepted_draft_input(
+        self,
+        main_model_input: ModelInput,
+        main_model_output: ModelOutput,
+        next_token_ids: torch.Tensor,
+        mtp_accept_len: torch.Tensor,
+        b_req_mtp_start_loc: torch.Tensor,
+    ):
+        """Build a draft-model input that keeps one selected row per request.
+
+        The selected row of each request is ``b_req_mtp_start_loc + mtp_accept_len - 1``.
+        Per-row tensors of the main model input/output are gathered at those rows with
+        ``index_select`` along dim 0 and stored on a shallow copy of ``main_model_input``.
+
+        Args:
+            main_model_input: Input that was fed to the main model. It is shallow-copied,
+                so any field that is not reassigned below is still shared with it.
+            main_model_output: Main model output; only ``mtp_main_output_hiddens`` is read.
+            next_token_ids: Token ids sampled from the main model output, indexed by row.
+            mtp_accept_len: Accept length of each request.
+            b_req_mtp_start_loc: Row index at which each request starts in the batch.
+
+        Returns:
+            A tuple ``(draft_model_input, accepted_next_token_ids, accepted_req_idx)``.
+            The last two are the ``input_ids`` and ``b_req_idx`` tensors stored on the
+            returned draft input (same objects, not copies).
+        """
+        # Row of each request selected by its accept length (start + accept_len - 1).
+        accepted_row_idx = b_req_mtp_start_loc + mtp_accept_len - 1
+        accepted_row_idx_long = accepted_row_idx.long()
+
+        # Shallow copy: reassigning attributes below does not touch main_model_input.
+        draft_model_input = copy.copy(main_model_input)
+        draft_model_input.batch_size = accepted_row_idx.shape[0]
+        # NOTE(review): derived from batch_size * max_kv_seq_len rather than from the
+        # gathered b_seq_len; looks like an upper bound - confirm consumers accept that.
+        draft_model_input.total_token_num = draft_model_input.batch_size * main_model_input.max_kv_seq_len
+        draft_model_input.input_ids = next_token_ids.index_select(0, accepted_row_idx_long)
+        draft_model_input.mtp_draft_input_hiddens = main_model_output.mtp_main_output_hiddens.index_select(
+            0, accepted_row_idx_long
+        )
+        draft_model_input.b_req_idx = main_model_input.b_req_idx.index_select(0, accepted_row_idx_long)
+        draft_model_input.b_mtp_index = main_model_input.b_mtp_index.index_select(0, accepted_row_idx_long)
+        draft_model_input.b_seq_len = main_model_input.b_seq_len.index_select(0, accepted_row_idx_long)
+        # The draft input does not carry the main model's per-request accepted-token counts.
+        draft_model_input.b_num_accepted_tokens = None
+        # Optional fields are gathered only when the main input provides them.
+        if main_model_input.mem_indexes is not None:
+            draft_model_input.mem_indexes = main_model_input.mem_indexes.index_select(0, accepted_row_idx_long)
+            draft_model_input.mem_indexes_cpu = None
+        if main_model_input.b_shared_seq_len is not None:
+            draft_model_input.b_shared_seq_len = main_model_input.b_shared_seq_len.index_select(
+                0, accepted_row_idx_long
+            )
+        if main_model_input.b_mark_shared_group is not None:
+            draft_model_input.b_mark_shared_group = main_model_input.b_mark_shared_group.index_select(
+                0, accepted_row_idx_long
+            )
+
+        # multimodal_params is a Python list, so it can only be re-indexed with host-side indices.
+        if accepted_row_idx.device.type == "cpu":
+            selected_rows = accepted_row_idx.tolist()
+            draft_model_input.multimodal_params = [main_model_input.multimodal_params[i] for i in selected_rows]
+        else:
+            # NOTE(review): for a non-CPU index tensor the original per-row params are replaced
+            # by empty placeholders - presumably to avoid a device-to-host sync; confirm.
+            draft_model_input.multimodal_params = [
+                {"images": [], "audios": []} for _ in range(draft_model_input.batch_size)
+            ]
+
+        accepted_next_token_ids = draft_model_input.input_ids
+        accepted_req_idx = draft_model_input.b_req_idx
+        return draft_model_input, accepted_next_token_ids, accepted_req_idx
+
+    def _scatter_accepted_next_token_ids(self, accepted_req_idx: torch.Tensor, all_next_token_ids: torch.Tensor):
+        """Write per-request next-token ids into the shared ``req_to_next_token_ids`` table.
+
+        Row ``i`` of ``all_next_token_ids`` is copied into row ``accepted_req_idx[i]`` of the
+        table, filling columns ``[0, all_next_token_ids.shape[1])``. The values are cast to
+        the table's dtype before the copy. The table is modified in place.
+
+        Args:
+            accepted_req_idx: Destination row index for each source row.
+            all_next_token_ids: Token ids to store; its second dimension is read as the
+                number of columns to overwrite (assumes a 2-D tensor - TODO confirm).
+        """
+        req_to_next_token_ids = self.model.req_manager.req_sampling_params_manager.req_to_next_token_ids
+        width = all_next_token_ids.shape[1]
+        # The column slice is a view, so the in-place index_copy_ updates the table itself.
+        req_to_next_token_ids[:, :width].index_copy_(
+            0,
+            accepted_req_idx.long(),
+            all_next_token_ids.to(dtype=req_to_next_token_ids.dtype),
+        )
+        return
+
     def _update_mtp_accept_ratio(
         self,
         decode_reqs: List[InferReq],
         mtp_accept_len_cpu: torch.Tensor,
     ):
+        """Record, per request, how many extra tokens were accepted in this step.
+
+        Only does work when ``self.is_master_in_dp`` is true. Requests and accept lengths
+        are paired positionally with ``zip``, so extra entries on the longer side are ignored.
+
+        Args:
+            decode_reqs: Requests of the decode batch, in the same order as the lengths.
+            mtp_accept_len_cpu: Accept length of each request; ``accept_len - 1`` is
+                reported to ``InferReq.update_mtp_accepted_token_num``.
+        """
+        # Master-only accept-ratio statistics. Unlike the phase-2 mtp_accept_len commit
+        # (inlined in decode_mtp) this only feeds metrics, so it may stay in phase 3.
         if self.is_master_in_dp:
             for req, accept_len in zip(decode_reqs, mtp_accept_len_cpu):
+                # NOTE(review): iterating a tensor yields 0-dim tensors, so a tensor (not a
+                # Python int) is passed here - confirm the callee accepts that.
                 req.update_mtp_accepted_token_num(accept_token_num=accept_len - 1)
         return
 
     def _gen_argmax_token_ids(self, model_output: ModelOutput):
+        """Return the greedy token id for each row of ``model_output.logits``.
+
+        The argmax is taken over the last dimension of the logits.
+        """
         logits = model_output.logits
-        probs = torch.softmax(logits, dim=-1)
-        draft_next_token_ids_gpu = torch.argmax(probs, dim=-1)
+        # softmax is strictly monotonic, so argmax(softmax(logits)) == argmax(logits);
+        # skip the softmax to shorten the per-step MTP draft critical chain (need-to-fix #16).
+        draft_next_token_ids_gpu = torch.argmax(logits, dim=-1)
         return draft_next_token_ids_gpu
 
     def _sample_and_scatter_token(
@@ -787,7 +831,6 @@ def _sample_and_scatter_token(
         b_prefill_has_output_cpu: torch.Tensor = None,
         mask_func: Optional[Callable] = None,
     ):
-
         if mask_func is not None:
             assert len(run_reqs) == logits.shape[0]
             mask_func(run_reqs, logits)
 
@@ -1,5 +1,6 @@
 import torch
 import time
+import copy
 from typing import List, Optional, Callable, Dict, Any
 from queue import Queue
 from lightllm.server.router.model_infer.mode_backend.base_backend import ModeBackend
@@ -240,17 +241,23 @@ def decode_mtp(
         """
         model_input, run_reqs = prepare_decode_inputs(decode_reqs)
 
+        if self.mtp_step > 0:
+            accept_lens = [req.mtp_accept_len for req in decode_reqs]
+            model_input.b_num_accepted_tokens = g_pin_mem_manager.gen_from_list(
+                key="b_num_accepted_tokens",
+                data=accept_lens,
+                dtype=torch.int32,
+            )
+
         with torch.cuda.stream(g_infer_context.get_overlap_stream()):
-            b_mtp_index_cpu = model_input.b_mtp_index
             model_output = self.model.forward(model_input)
             next_token_ids, next_token_logprobs = sample(model_output.logits, run_reqs, self.eos_id)
-            # verify the next_token_ids
-            b_req_mtp_start_loc = [index for index, mtp_index in enumerate(b_mtp_index_cpu) if mtp_index == 0]
-            b_req_mtp_start_loc = g_pin_mem_manager.gen_from_list(
-                key="b_req_mtp_start_loc",
-                data=b_req_mtp_start_loc,
-                dtype=torch.int32,
-            ).cuda(non_blocking=True)
+            # verify the next_token_ids. The chunked decode batch is the contiguous
+            # (mtp_step+1)-expanded layout, so request starts are structurally
+            # arange(n_real)*(mtp_step+1). Compute on device instead of a per-step Python
+            # list-comp + pinned pack + H2D (#22).
+            n_real = model_input.batch_size // (self.mtp_step + 1)
+            b_req_mtp_start_loc = torch.arange(n_real, dtype=torch.int32, device="cuda") * (self.mtp_step + 1)
 
             mtp_accept_len, accepted_index = self._verify_mtp_v2(
                 new_next_token_ids=next_token_ids,
@@ -292,6 +299,8 @@ def decode_mtp(
         # 第二阶段
         event_pack.notify_post_handle_and_wait_pre_post_handle()
         verify_event.synchronize()
+        for req, accept_len in zip(decode_reqs, mtp_accept_len_cpu):
+            req.mtp_accept_len = int(accept_len)
         verify_ok_reqs = [run_reqs[i] for i in range(len(run_reqs)) if accepted_index_cpu[i] == 1]
         update_packs = self._pre_post_handle(verify_ok_reqs, is_chuncked_mode=False)
 
@@ -344,15 +353,19 @@ def _draft_decode_vanilla(
         mtp_accept_len: torch.Tensor,
         b_req_mtp_start_loc: torch.Tensor,
     ):
-        # share some inference info with the main model
-        draft_model_input = main_model_input
+        # share some inference info with the main model. copy.copy 后清空 b_num_accepted_tokens，
+        # 使 draft (MTP) forward 走普通 decode 布局 (bs, False)；否则会沿用主模型 decode_mtp 设置的
+        # verify 布局，命中 MTP draft 模型从未捕获的 cudagraph key (bs, True) -> KeyError
+        # （cudagraph 关闭时则会在扁平的 draft batch 上误用 S+1 分组的 verify attention）。
+        # 镜像 eagle 路径 _build_eagle_accepted_draft_input 中清空 b_num_accepted_tokens 的处理。
+        draft_model_input = copy.copy(main_model_input)
+        draft_model_input.b_num_accepted_tokens = None
         draft_model_output = main_model_output
         draft_next_token_ids = next_token_ids
         all_next_token_ids = []
         all_next_token_ids.append(next_token_ids)
         # process the draft model output
         for draft_model_idx in range(self.mtp_step):
-
             draft_model_input.input_ids = draft_next_token_ids
             draft_model_input.mtp_draft_input_hiddens = draft_model_output.mtp_main_output_hiddens
             # spec decode: MTP
@@ -379,44 +392,47 @@ def _draft_decode_eagle(
         mtp_accept_len: torch.Tensor,
         b_req_mtp_start_loc: torch.Tensor,
     ):
-        batch_size = main_model_input.batch_size
-        num_reqs = batch_size // (self.mtp_step + 1)
+        num_reqs = b_req_mtp_start_loc.shape[0]
         if g_infer_context.radix_cache is not None:
             g_infer_context.radix_cache.free_radix_cache_to_get_enough_token(num_reqs * self.mtp_step)
         eagle_mem_indexes_cpu = g_infer_context.req_manager.mem_manager.alloc(num_reqs * self.mtp_step)
         eagle_mem_indexes = eagle_mem_indexes_cpu.cuda(non_blocking=True)
 
-        # share some inference info with the main model
-        draft_model_input = main_model_input
+        (draft_model_input, draft_next_token_ids, accepted_req_idx,) = self._build_eagle_accepted_draft_input(
+            main_model_input=main_model_input,
+            main_model_output=main_model_output,
+            next_token_ids=next_token_ids,
+            mtp_accept_len=mtp_accept_len,
+            b_req_mtp_start_loc=b_req_mtp_start_loc,
+        )
         draft_model_output = main_model_output
-        draft_next_token_ids = next_token_ids
         all_next_token_ids = []
-        all_next_token_ids.append(next_token_ids)
-        # process the draft model output
-        for _step in range(self.mtp_step):
+        all_next_token_ids.append(draft_next_token_ids)
+
+        mtp_size = self.mtp_step + 1
+        main_mem_indexes = main_model_input.mem_indexes.view(num_reqs, mtp_size)
+        eagle_mem_indexes_by_req = eagle_mem_indexes.view(self.mtp_step, num_reqs).transpose(0, 1).contiguous()
+        mem_index_plan = torch.cat([main_mem_indexes, eagle_mem_indexes_by_req], dim=1)
+        accepted_offsets = mtp_accept_len.long() - 1
+        req_offsets = torch.arange(num_reqs, dtype=torch.long, device=mtp_accept_len.device)
 
+        for _step in range(self.mtp_step):
             draft_model_input.input_ids = draft_next_token_ids
-            draft_model_input.mtp_draft_input_hiddens = draft_model_output.mtp_main_output_hiddens
+            if _step > 0:
+                draft_model_input.mtp_draft_input_hiddens = draft_model_output.mtp_main_output_hiddens
+            draft_model_input.mem_indexes = mem_index_plan[req_offsets, accepted_offsets + _step]
             # spec decode: MTP
             draft_model_idx = _step % self.num_mtp_models
             draft_model_output: ModelOutput = self.draft_models[draft_model_idx].forward(draft_model_input)
             draft_next_token_ids = self._gen_argmax_token_ids(draft_model_output)
             draft_model_input.b_seq_len += 1
             draft_model_input.max_kv_seq_len += 1
-            eagle_mem_indexes_i = eagle_mem_indexes[_step * num_reqs : (_step + 1) * num_reqs]
-            draft_model_input.mem_indexes = torch.cat(
-                [draft_model_input.mem_indexes.view(-1, self.mtp_step + 1)[:, 1:], eagle_mem_indexes_i.view(-1, 1)],
-                dim=1,
-            ).view(-1)
             all_next_token_ids.append(draft_next_token_ids)
 
         all_next_token_ids = torch.stack(all_next_token_ids, dim=1)  # [batch_size, mtp_step + 1]
 
-        mtp_scatter_next_token_ids(
-            req_to_next_token_ids=self.model.req_manager.req_sampling_params_manager.req_to_next_token_ids,
-            b_req_mtp_start_loc=b_req_mtp_start_loc,
+        self._scatter_accepted_next_token_ids(
+            accepted_req_idx=accepted_req_idx,
             all_next_token_ids=all_next_token_ids,
-            b_req_idx=main_model_input.b_req_idx,
-            mtp_accept_len=mtp_accept_len,
         )
         return eagle_mem_indexes_cpu