style: align formatting with upstream/main and inline mtp accept-len commit

sufubao · sufubao · commit 16170f3b090f · 2026-06-07T15:55:49.000+08:00
- Revert local reformatting to match upstream/main exactly, minimizing PR diff
- Inline _commit_mtp_accept_len into decode_mtp; the commit still runs in phase 2, before notify_forward_and_wait_post_handle(), so the ordering is preserved
- Drop redundant inline comments
diff --git a/lightllm/common/basemodel/attention/fa3/fp.py b/lightllm/common/basemodel/attention/fa3/fp.py
@@ -1,19 +1,12 @@
 import dataclasses
 import torch
-from ..base_att import (
-    BaseAttBackend,
-    BasePrefillAttState,
-    BaseDecodeAttState,
-    AttControl,
-)
+from ..base_att import BaseAttBackend, BasePrefillAttState, BaseDecodeAttState, AttControl
 from typing import Optional, TYPE_CHECKING
 from lightllm.utils.dist_utils import get_current_device_id
 from lightllm.utils.sgl_utils import flash_attn_with_kvcache
 from lightllm.utils.envs_utils import get_env_start_args
 from lightllm.common.basemodel.triton_kernel.fa3_utils import page_table_copy
-from lightllm.common.basemodel.triton_kernel.gen_prefill_params import (
-    gen_cumsum_pad0_tensor,
-)
+from lightllm.common.basemodel.triton_kernel.gen_prefill_params import gen_cumsum_pad0_tensor
 
 
 class Fa3AttBackend(BaseAttBackend):
@@ -28,14 +21,12 @@ def get_page_table_buffer(self):
         model = self.model
         if not hasattr(self, "_shared_page_table_buffer"):
             self._shared_page_table_buffer = [
-                torch.empty(
-                    model.graph_max_batch_size * model.graph_max_len_in_batch,
-                    dtype=torch.int32,
-                ).to(get_current_device_id()),
-                torch.empty(
-                    model.graph_max_batch_size * model.graph_max_len_in_batch,
-                    dtype=torch.int32,
-                ).to(get_current_device_id()),
+                torch.empty(model.graph_max_batch_size * model.graph_max_len_in_batch, dtype=torch.int32).to(
+                    get_current_device_id()
+                ),
+                torch.empty(model.graph_max_batch_size * model.graph_max_len_in_batch, dtype=torch.int32).to(
+                    get_current_device_id()
+                ),
             ]
         return self._shared_page_table_buffer
 
@@ -84,12 +75,7 @@ def prefill_att(
         )
 
     def _nomarl_prefill_att(
-        self,
-        q: torch.Tensor,
-        k: torch.Tensor,
-        v: torch.Tensor,
-        att_control: AttControl,
-        alloc_func=torch.empty,
+        self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, att_control: AttControl, alloc_func=torch.empty
     ) -> torch.Tensor:
         self.backend: Fa3AttBackend = self.backend  # for typing
 
diff --git a/lightllm/common/basemodel/attention/fa3/fp8.py b/lightllm/common/basemodel/attention/fa3/fp8.py
@@ -44,12 +44,9 @@ def init_state(self):
             torch.arange(batch_size, device=device), self.infer_state.b_q_seq_len
         )
         # 为了减少推理计算量，在推理外部初始化k_descale和v_descale
-        self.k_descale = (
-            offline_scales[:, :head_num].view(-1, 1, head_num).expand(offline_scales.shape[0], batch_size, head_num)
-        )
-        self.v_descale = (
-            offline_scales[:, head_num:].view(-1, 1, head_num).expand(offline_scales.shape[0], batch_size, head_num)
-        )
+        self.k_descale = offline_scales[:, :head_num].view(-1, 1, head_num).expand(offline_scales.shape[0], batch_size, head_num)
+        self.v_descale = offline_scales[:, head_num:].view(-1, 1, head_num).expand(offline_scales.shape[0], batch_size, head_num)
+
 
     def prefill_att(
         self,
@@ -125,12 +122,8 @@ def init_state(self):
         head_num = mem_manager.head_num
 
         # 为了减少推理计算量，在推理外部初始化k_descale和v_descale
-        self.k_descale = (
-            offline_scales[:, :head_num].view(-1, 1, head_num).expand(offline_scales.shape[0], batch_size, head_num)
-        )
-        self.v_descale = (
-            offline_scales[:, head_num:].view(-1, 1, head_num).expand(offline_scales.shape[0], batch_size, head_num)
-        )
+        self.k_descale = offline_scales[:, :head_num].view(-1, 1, head_num).expand(offline_scales.shape[0], batch_size, head_num)
+        self.v_descale = offline_scales[:, head_num:].view(-1, 1, head_num).expand(offline_scales.shape[0], batch_size, head_num)
 
         return
 
diff --git a/lightllm/common/basemodel/attention/fa3/mla.py b/lightllm/common/basemodel/attention/fa3/mla.py
@@ -1,19 +1,12 @@
 import dataclasses
 import torch
-from ..base_att import (
-    BaseAttBackend,
-    BasePrefillAttState,
-    BaseDecodeAttState,
-    AttControl,
-)
+from ..base_att import BaseAttBackend, BasePrefillAttState, BaseDecodeAttState, AttControl
 from typing import Optional, TYPE_CHECKING, Tuple
 from lightllm.utils.dist_utils import get_current_device_id
 from lightllm.utils.sgl_utils import flash_attn_with_kvcache
 from lightllm.utils.envs_utils import get_env_start_args
 from lightllm.common.basemodel.triton_kernel.fa3_utils import page_table_copy
-from lightllm.common.basemodel.triton_kernel.gen_prefill_params import (
-    gen_cumsum_pad0_tensor,
-)
+from lightllm.common.basemodel.triton_kernel.gen_prefill_params import gen_cumsum_pad0_tensor
 from lightllm.utils.sgl_utils import flash_attn_varlen_func
 
 
@@ -29,14 +22,12 @@ def get_page_table_buffer(self):
         model = self.model
         if not hasattr(self, "_shared_page_table_buffer"):
             self._shared_page_table_buffer = [
-                torch.empty(
-                    model.graph_max_batch_size * model.graph_max_len_in_batch,
-                    dtype=torch.int32,
-                ).to(get_current_device_id()),
-                torch.empty(
-                    model.graph_max_batch_size * model.graph_max_len_in_batch,
-                    dtype=torch.int32,
-                ).to(get_current_device_id()),
+                torch.empty(model.graph_max_batch_size * model.graph_max_len_in_batch, dtype=torch.int32).to(
+                    get_current_device_id()
+                ),
+                torch.empty(model.graph_max_batch_size * model.graph_max_len_in_batch, dtype=torch.int32).to(
+                    get_current_device_id()
+                ),
             ]
         return self._shared_page_table_buffer
 
@@ -78,12 +69,7 @@ def prefill_att(
         )
 
     def _mla_prefill_att(
-        self,
-        q: torch.Tensor,
-        k: torch.Tensor,
-        v: torch.Tensor,
-        att_control: AttControl,
-        alloc_func=torch.empty,
+        self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, att_control: AttControl, alloc_func=torch.empty
     ) -> torch.Tensor:
         self.backend: MlaFa3AttBackend = self.backend  # for typing
         k_nope, k_rope = k
diff --git a/lightllm/common/basemodel/triton_kernel/linear_att_copy.py b/lightllm/common/basemodel/triton_kernel/linear_att_copy.py
@@ -46,15 +46,7 @@ def _copy_linear_att_state_to_kv_buffer(
     accept_len = tl.load(num_accepted_tokens_ptr + cur_batch).to(tl.int64)
     canonical_off = accept_len - 1
 
-    # --- conv snapshot ---
-    # conv is a single WIDENED slot keyed by req_idx (asymmetric layout, §3.4).
-    # The committed NARROW window of byte length conv_narrow_row_bytes sits at
-    # byte offset canonical_off * itemsize inside each widened row. The flattened
-    # uint8 tail lays out element [d, w] at d * gpu_conv_row_bytes + w (bytes),
-    # so the narrow window is strided per row: copy row-by-row.
     conv_src_slot = cur_req_idx
-    # gpu_conv_stride_d carries the per-element byte size (itemsize); the narrow
-    # window starts canonical_off elements into the widened row.
     conv_off_bytes = canonical_off * gpu_conv_stride_d
     gpu_conv_base = gpu_conv_ptr + cur_layer * gpu_conv_stride_l + conv_src_slot * gpu_conv_stride_s + conv_off_bytes
     cpu_conv_base = cpu_kv_conv_ptr + big_page_buffer_idx * cpu_kv_conv_stride_s + cur_layer * cpu_kv_conv_stride_l
@@ -65,9 +57,6 @@ def _copy_linear_att_state_to_kv_buffer(
             conv_data = tl.load(gpu_conv_base + d * gpu_conv_row_bytes + off, mask=mask)
             tl.store(cpu_conv_base + d * cpu_kv_conv_stride_d + off, conv_data, mask=mask)
 
-    # --- ssm snapshot ---
-    # ssm is an (S+1) BLOCK per request; the committed block slot is
-    # req_idx * (mtp_step + 1) + canonical_off.
     ssm_src_slot = (cur_req_idx * (mtp_step + 1) + canonical_off).to(tl.int64)
     for i in range(tl.cdiv(gpu_ssm_tail_dim, BLOCK)):
         gpu_start_off = i * BLOCK + tl.arange(0, BLOCK)
@@ -98,10 +87,6 @@ def copy_linear_att_state_to_kv_buffer(
     assert len(b_req_idx) == b_num_accepted_tokens.shape[0]
     BLOCK = 4096
 
-    # Conv: keep the (conv_dim, width) tail un-flattened so the committed narrow
-    # window can be read per row at the canonical offset (the window is strided
-    # in the flattened widened layout). Capture itemsize BEFORE the uint8 view to
-    # convert the element-unit canonical offset into a byte offset.
     assert gpu_conv_state.dim() >= 4, "gpu_conv_state must be [layer, s, conv_dim, widened_width]"
     assert cpu_kv_conv_state.dim() >= 4, "cpu_kv_conv_state must be [size, layer, conv_dim, width_narrow]"
     conv_itemsize = gpu_conv_state.element_size()
diff --git a/lightllm/server/router/model_infer/mode_backend/base_backend.py b/lightllm/server/router/model_infer/mode_backend/base_backend.py
@@ -794,35 +794,13 @@ def _verify_mtp_v2(
         )
         return mtp_accept_len, accepted_index
 
-    def _commit_mtp_accept_len(
-        self,
-        decode_reqs: List[InferReq],
-        mtp_accept_len_cpu: torch.Tensor,
-    ):
-        # Carry the per-req accept count into the NEXT step as the canonical
-        # pointer (design §3.1). This must run on every rank (not only master):
-        # the kernels on this rank read req.mtp_accept_len.
-        #
-        # CRITICAL ordering (overlap scheduler): the next step's decode_mtp reads
-        # req.mtp_accept_len (to build b_num_accepted_tokens) the moment its
-        # wait_to_forward() is released, which happens at THIS step's
-        # notify_forward_and_wait_post_handle() (start of phase 3). So this carry
-        # MUST be committed in phase 2 (pre_post_handle), before that release —
-        # otherwise the next step reads a one-step-stale accept count. The error
-        # is invisible while accept_len is constant (==1) and corrupts the GDN
-        # conv/ssm committed-state read-offset the instant a multi-token accept
-        # (accept_len>=2) occurs.
-        for req, accept_len in zip(decode_reqs, mtp_accept_len_cpu):
-            req.mtp_accept_len = int(accept_len)
-        return
-
     def _update_mtp_accept_ratio(
         self,
         decode_reqs: List[InferReq],
         mtp_accept_len_cpu: torch.Tensor,
     ):
-        # Master-only accept-ratio statistics. Unlike _commit_mtp_accept_len this
-        # only feeds metrics, so it may stay in the phase-3 post_handle region.
+        # Master-only accept-ratio statistics. Unlike the phase-2 mtp_accept_len commit
+        # (inlined in decode_mtp), this only feeds metrics, so it may stay in phase 3.
         if self.is_master_in_dp:
             for req, accept_len in zip(decode_reqs, mtp_accept_len_cpu):
                 req.update_mtp_accepted_token_num(accept_token_num=accept_len - 1)
diff --git a/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py b/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py
@@ -4,9 +4,7 @@
 from typing import List, Optional, Callable, Dict, Any
 from queue import Queue
 from lightllm.server.router.model_infer.mode_backend.base_backend import ModeBackend
-from lightllm.server.router.model_infer.mode_backend.overlap_events import (
-    OverlapEventPack,
-)
+from lightllm.server.router.model_infer.mode_backend.overlap_events import OverlapEventPack
 from lightllm.server.router.model_infer.infer_batch import InferReq
 from lightllm.server.router.model_infer.mode_backend.pre import (
     prepare_prefill_inputs,
@@ -43,10 +41,7 @@ def __init__(self) -> None:
         if get_env_start_args().mtp_mode:
             self.prefill = self.prefill_mtp
             self.decode = self.decode_mtp
-            self.is_mtp_eagle = get_env_start_args().mtp_mode in [
-                "eagle_with_att",
-                "eagle_no_att",
-            ]
+            self.is_mtp_eagle = get_env_start_args().mtp_mode in ["eagle_with_att", "eagle_no_att"]
             self.num_mtp_models = 1 if self.is_mtp_eagle else get_env_start_args().mtp_step
             self._draft_decode_func = self._draft_decode_eagle if self.is_mtp_eagle else self._draft_decode_vanilla
         else:
@@ -115,7 +110,7 @@ def prefill_normal(
         model_input, run_reqs = prepare_prefill_inputs(prefill_reqs, is_chuncked_mode=not self.disable_chunked_prefill)
         with torch.cuda.stream(g_infer_context.get_overlap_stream()):
             model_output = self.model.forward(model_input)
-            (_, next_token_ids_cpu, next_token_logprobs_cpu,) = self._sample_and_scatter_token(
+            _, next_token_ids_cpu, next_token_logprobs_cpu = self._sample_and_scatter_token(
                 logits=model_output.logits,
                 b_req_idx=model_input.b_req_idx,
                 b_mtp_index=model_input.b_mtp_index,
@@ -158,7 +153,7 @@ def decode_normal(
         model_input, run_reqs = prepare_decode_inputs(decode_reqs)
         with torch.cuda.stream(g_infer_context.get_overlap_stream()):
             model_output = self.model.forward(model_input)
-            (_, next_token_ids_cpu, next_token_logprobs_cpu,) = self._sample_and_scatter_token(
+            _, next_token_ids_cpu, next_token_logprobs_cpu = self._sample_and_scatter_token(
                 logits=model_output.logits,
                 b_req_idx=model_input.b_req_idx,
                 b_mtp_index=model_input.b_mtp_index,
@@ -196,7 +191,7 @@ def prefill_mtp(
         model_input, run_reqs = prepare_prefill_inputs(prefill_reqs, is_chuncked_mode=not self.disable_chunked_prefill)
         with torch.cuda.stream(g_infer_context.get_overlap_stream()):
             model_output = self.model.forward(model_input)
-            (next_token_ids, next_token_ids_cpu, next_token_logprobs_cpu,) = self._sample_and_scatter_token(
+            next_token_ids, next_token_ids_cpu, next_token_logprobs_cpu = self._sample_and_scatter_token(
                 logits=model_output.logits,
                 b_req_idx=model_input.b_req_idx,
                 b_mtp_index=model_input.b_mtp_index,
@@ -207,9 +202,7 @@ def prefill_mtp(
             )
             # mtp kv fill
             self._draft_prefill_forward(
-                model_input=model_input,
-                model_output=model_output,
-                next_token_ids=next_token_ids,
+                model_input=model_input, model_output=model_output, next_token_ids=next_token_ids
             )
             g_infer_context.copy_linear_att_state_to_cache_buffer(
                 b_req_idx=model_input.b_req_idx,
@@ -249,11 +242,6 @@ def decode_mtp(
         """
         model_input, run_reqs = prepare_decode_inputs(decode_reqs)
 
-        # Build the per-real-request accept tensor (carried InferReq.mtp_accept_len
-        # from the previous step). decode_reqs is one entry per real request,
-        # aligning 1:1 with the b_gdn_verify_cu_seqlens grouping (the same zip used
-        # by _update_mtp_accept_ratio). Threaded onto the infer_state via ModelInput
-        # (mirrors b_mtp_index); to_cuda() moves it inside forward. §3.1
         if self.mtp_step > 0:
             accept_lens = [req.mtp_accept_len for req in decode_reqs]
             model_input.b_num_accepted_tokens = g_pin_mem_manager.gen_from_list(
@@ -290,10 +278,9 @@ def decode_mtp(
             verify_event = torch.cuda.Event()
             verify_event.record()
 
-            (
-                next_token_ids_cpu,
-                next_token_logprobs_cpu,
-            ) = self._async_copy_next_token_infos_to_pin_mem(next_token_ids, next_token_logprobs)
+            next_token_ids_cpu, next_token_logprobs_cpu = self._async_copy_next_token_infos_to_pin_mem(
+                next_token_ids, next_token_logprobs
+            )
 
             # 调用具体的draft decode函数
             additional_mem_indexes_cpu = self._draft_decode_func(
@@ -315,12 +302,8 @@ def decode_mtp(
         # 第二阶段
         event_pack.notify_post_handle_and_wait_pre_post_handle()
         verify_event.synchronize()
-        # Commit the carried accept count HERE (phase 2 / pre_post_handle), not in
-        # phase 3: the next overlapped step reads req.mtp_accept_len as soon as this
-        # step calls notify_forward_and_wait_post_handle() below, so the update must
-        # land before that release to avoid feeding the kernels a stale (one-step-old)
-        # accept count. See _commit_mtp_accept_len for the full rationale.
-        self._commit_mtp_accept_len(decode_reqs=decode_reqs, mtp_accept_len_cpu=mtp_accept_len_cpu)
+        for req, accept_len in zip(decode_reqs, mtp_accept_len_cpu):
+            req.mtp_accept_len = int(accept_len)
         verify_ok_reqs = [run_reqs[i] for i in range(len(run_reqs)) if accepted_index_cpu[i] == 1]
         update_packs = self._pre_post_handle(verify_ok_reqs, is_chuncked_mode=False)
 
@@ -352,12 +335,7 @@ def decode_mtp(
         event_pack.notify_pre_post_handle()
         return
 
-    def _draft_prefill_forward(
-        self,
-        model_input: ModelInput,
-        model_output: ModelOutput,
-        next_token_ids: torch.Tensor,
-    ):
+    def _draft_prefill_forward(self, model_input: ModelInput, model_output: ModelOutput, next_token_ids: torch.Tensor):
         # spec prefill: MTP, 这个地方只是为了填充draft model的 kv， 并不会使用生成的token_id。
         draft_model_input = model_input
         draft_model_output = model_output