fix(qwen3next): persist mtp full-attn cpu cache slots

sufubao · sufubao · commit 347e52e85b54 · 2026-06-05T14:50:53.000+08:00
diff --git a/lightllm/common/basemodel/triton_kernel/linear_att_cpu_cache_copy.py b/lightllm/common/basemodel/triton_kernel/linear_att_cpu_cache_copy.py
@@ -193,11 +193,7 @@ def copy_kv_buffer_to_cpu_cache(
     cpu_kv_ssm_tail_dim = cpu_kv_ssm_state.shape[-1]
     full_att_layer_num = gpu_kv_full_att_state.shape[-2]
 
-    assert (
-        full_att_layer_num
-        == (linear_config.all_layer_num // linear_config.full_attention_interval)
-        == (linear_config.all_layer_num - linear_config.linear_layer_num)
-    )
+    assert full_att_layer_num == linear_config.get_persisted_full_att_layer_num()
     assert gpu_full_att_tail_dim == cpu_cache_full_att.shape[-1]
     assert cpu_cache_conv.shape[-1] == cpu_kv_conv_state.shape[-1]
     assert cpu_cache_ssm.shape[-1] == cpu_kv_ssm_state.shape[-1]
@@ -428,6 +424,7 @@ def copy_cpu_cache_to_kv_buffer(
     cpu_kv_ssm_tail_dim = cpu_kv_ssm_state.shape[-1]
     full_att_layer_num = gpu_full_att_kv_state.shape[-2]
 
+    assert full_att_layer_num == linear_config.get_persisted_full_att_layer_num()
     assert gpu_full_att_tail_dim == cpu_cache_full_att.shape[-1]
     assert cpu_cache_conv.shape[-1] == cpu_kv_conv_state.shape[-1]
     assert cpu_cache_ssm.shape[-1] == cpu_kv_ssm_state.shape[-1]
diff --git a/lightllm/common/kv_cache_mem_manager/operator/linear_att.py b/lightllm/common/kv_cache_mem_manager/operator/linear_att.py
@@ -24,6 +24,16 @@ def __init__(self, mem_manager):
         super().__init__(mem_manager)
         self.linear_config = LinearAttCacheConfig.load_from_args()
 
+    @staticmethod
+    def _get_persisted_full_att_layer_num(mem_manager) -> int:
+        """Return the persisted full-attn layer count, checked against ``kv_buffer.shape[0]``."""
+        layer_num = getattr(mem_manager, "persisted_full_att_layer_num", None)
+        if layer_num is None:  # attribute not set: rebuild it from the main + draft counts
+            layer_num = getattr(mem_manager, "main_full_att_layer_num", mem_manager.kv_buffer.shape[0])
+            layer_num += getattr(mem_manager, "draft_full_att_layers", 0)
+        assert 0 < layer_num <= mem_manager.kv_buffer.shape[0]
+        return int(layer_num)
+
     def load_cpu_cache_to_gpu(
         self,
         mem_indexes: torch.Tensor,
@@ -76,16 +86,14 @@ def load_cpu_cache_to_gpu(
             copy_cpu_cache_to_kv_buffer,
         )
 
-        # Persist/restore ONLY the main model's full-attn slice. The kv buffer is widened by
-        # dedicated MTP draft slots [main_full_att, main_full_att + draft) (speculative KV that
-        # must never touch the CPU/disk cache), so slice them off here.
-        main_full_att = getattr(mem_manager, "main_full_att_layer_num", mem_manager.kv_buffer.shape[0])
+        # Restore the persisted full-attn slice: main slots followed by MTP draft slots.
+        persisted_full_att = self._get_persisted_full_att_layer_num(mem_manager)
 
         copy_cpu_cache_to_kv_buffer(
             mem_indexes=mem_indexes,
             big_page_buffer_ids=big_page_buffer_ids_gpu,
             page_indexes=page_indexes,
-            gpu_full_att_kv_state=mem_manager.kv_buffer[:main_full_att],
+            gpu_full_att_kv_state=mem_manager.kv_buffer[:persisted_full_att],
             cpu_kv_conv_state=mem_manager.linear_att_big_page_buffers.conv_state_cache.buffer,
             cpu_kv_ssm_state=mem_manager.linear_att_big_page_buffers.ssm_state_cache.buffer,
             cpu_cache_tensor=cpu_cache_client.cpu_kv_cache_tensor,
@@ -174,17 +182,15 @@ def offload_gpu_kv_to_cpu_cache(
             copy_kv_buffer_to_cpu_cache,
         )
 
-        # Persist ONLY the main model's full-attn slice. The kv buffer is widened by dedicated
-        # MTP draft slots [main_full_att, main_full_att + draft) (speculative KV that must never
-        # be persisted to the CPU/disk cache), so slice them off here.
-        main_full_att = getattr(mem_manager, "main_full_att_layer_num", mem_manager.kv_buffer.shape[0])
+        # Persist the full-attn slice used for prefix reuse: main slots followed by MTP draft slots.
+        persisted_full_att = self._get_persisted_full_att_layer_num(mem_manager)
 
         copy_kv_buffer_to_cpu_cache(
             mem_indexes=mem_indexes,
             page_indexes=page_indexes,
             page_readies=page_readies,
             big_page_buffer_ids=big_page_buffer_ids_gpu,
-            gpu_kv_full_att_state=mem_manager.kv_buffer[:main_full_att],
+            gpu_kv_full_att_state=mem_manager.kv_buffer[:persisted_full_att],
             cpu_kv_conv_state=mem_manager.linear_att_big_page_buffers.conv_state_cache.buffer,
             cpu_kv_ssm_state=mem_manager.linear_att_big_page_buffers.ssm_state_cache.buffer,
             cpu_cache_tensor=cpu_cache_client.cpu_kv_cache_tensor,
diff --git a/lightllm/common/linear_att_cache_manager/config_objs.py b/lightllm/common/linear_att_cache_manager/config_objs.py
@@ -8,6 +8,15 @@
 logger = init_logger(__name__)
 
 
+def get_mtp_draft_full_att_layer_num(args) -> int:
+    """Return the MTP draft full-attention layer count implied by ``args.mtp_mode``."""
+    mode = getattr(args, "mtp_mode", None)
+    if mode == "vanilla_with_att":
+        # one draft layer per MTP step; a missing ``mtp_step`` counts as 0
+        return getattr(args, "mtp_step", 0)
+    return 1 if mode == "eagle_with_att" else 0
+
+
 @dataclasses.dataclass
 class LinearAttCacheConfig:
     tp_world_size: int
@@ -28,10 +37,19 @@ class LinearAttCacheConfig:
     ssm_state_dtype: torch.dtype
     full_attention_interval: int
     all_layer_num: int  # 包括 linear att 和 full att 的层加起来的层数
+    draft_full_att_layer_num: int = 0
 
     def get_conv_dim(self):
         return self.head_linear_k_dim * self.num_linear_k_heads * 2 + self.head_linear_v_dim * self.num_linear_v_heads
 
+    def get_main_full_att_layer_num(self):
+        """Return the main model's full-attention layer count (all layers minus linear layers)."""
+        assert self.all_layer_num - self.linear_layer_num == self.all_layer_num // self.full_attention_interval
+        return self.all_layer_num - self.linear_layer_num
+
+    def get_persisted_full_att_layer_num(self):  # main full-attn layers plus MTP draft full-attn layers
+        return self.get_main_full_att_layer_num() + self.draft_full_att_layer_num
+
     def get_persisted_conv_state_shape(self):
         # NARROW shape used for the CPU/disk persisted page and ALL byte math.
         # Persisted state is always the committed (narrow) sliding window.
@@ -71,7 +89,7 @@ def get_cpu_cache_full_att_bytes(self):
         )
         assert big_page_token_num == get_env_start_args().cpu_cache_token_page_size
         full_att_bytes = 2 * self.full_att_all_num_kv_heads * self.full_att_head_dim * self.full_att_dtype.itemsize
-        a = full_att_bytes * (self.all_layer_num - self.linear_layer_num) * big_page_token_num
+        a = full_att_bytes * self.get_persisted_full_att_layer_num() * big_page_token_num
         return a
 
     def get_cpu_cache_conv_bytes(self):
@@ -116,4 +134,5 @@ def load_from_args() -> "LinearAttCacheConfig":
             ssm_state_dtype=get_torch_dtype(args.linear_att_ssm_data_type),
             full_attention_interval=llm_config["full_attention_interval"],
             all_layer_num=n_layer,
+            draft_full_att_layer_num=get_mtp_draft_full_att_layer_num(args),
         )
diff --git a/lightllm/models/qwen3next/model.py b/lightllm/models/qwen3next/model.py
@@ -16,7 +16,10 @@
 from lightllm.common.kv_cache_mem_manager.qwen3next_mem_manager import Qwen3NextMemManager
 from lightllm.server.core.objs.start_args_type import StartArgs
 from lightllm.common.req_manager import ReqManagerForMamba
-from lightllm.common.linear_att_cache_manager.config_objs import LinearAttCacheConfig
+from lightllm.common.linear_att_cache_manager.config_objs import (
+    LinearAttCacheConfig,
+    get_mtp_draft_full_att_layer_num,
+)
 
 logger = init_logger(__name__)
 
@@ -58,6 +61,7 @@ def _init_mem_manager(self):
         assert self.config["num_attention_heads"] % self.tp_world_size_ == 0
         start_args: StartArgs = get_env_start_args()
         ssm_dtype_dict = {"bfloat16": torch.bfloat16, "float32": torch.float32}
+        draft_full_att_layers = get_mtp_draft_full_att_layer_num(start_args)
         self.linear_config = LinearAttCacheConfig(
             tp_world_size=self.tp_world_size_,
             full_att_all_num_kv_heads=self.config["num_key_value_heads"],
@@ -75,14 +79,11 @@ def _init_mem_manager(self):
             ssm_state_dtype=ssm_dtype_dict[start_args.linear_att_ssm_data_type],
             full_attention_interval=self.config["full_attention_interval"],
             all_layer_num=self.config["n_layer"],
+            draft_full_att_layer_num=draft_full_att_layers,
         )
 
-        main_full_att = self.linear_config.all_layer_num - self.linear_config.linear_layer_num
-        draft_full_att_layers = 0
-        if start_args.mtp_mode == "eagle_with_att":
-            draft_full_att_layers = 1
-        elif start_args.mtp_mode == "vanilla_with_att":
-            draft_full_att_layers = start_args.mtp_step
+        main_full_att = self.linear_config.get_main_full_att_layer_num()
+        persisted_full_att = self.linear_config.get_persisted_full_att_layer_num()
         self._main_full_att_layer_num = main_full_att
         self._draft_full_att_layers = draft_full_att_layers
 
@@ -91,12 +92,13 @@ def _init_mem_manager(self):
             dtype=self.data_type,
             num_kv_heads=self.num_kv_heads,
             head_dim=self.config["head_dim"],
-            full_att_layer_num=main_full_att + draft_full_att_layers,
+            full_att_layer_num=persisted_full_att,
             linear_config=self.linear_config,
             mem_fraction=self.mem_fraction,
         )
         self.mem_manager.main_full_att_layer_num = main_full_att
         self.mem_manager.draft_full_att_layers = draft_full_att_layers
+        self.mem_manager.persisted_full_att_layer_num = persisted_full_att
 
     def _init_req_manager(self):
         create_max_seq_len = 0
diff --git a/lightllm/utils/kv_cache_utils.py b/lightllm/utils/kv_cache_utils.py
@@ -121,6 +121,8 @@ def calcu_cpu_cache_meta() -> "CpuKVCacheMeta":
     if args.mtp_mode is not None:
         # TODO 可能会存在不同mtp模式的精度问题
         if is_linear_att_mixed_model(args.model_dir):
+            # Linear mixed models use one packed byte page; MTP draft full-attn
+            # slots are accounted in LinearAttCacheConfig.get_cpu_cache_big_page_bytes().
             pass
         else:
             cpu_cache_meta.layer_num += get_added_mtp_kv_layer_num()
diff --git a/test/acc/cpu_cache_roundtrip_test.py b/test/acc/cpu_cache_roundtrip_test.py
@@ -0,0 +1,86 @@
+"""Force the CPU KV-cache offload->restore path and check correctness.
+
+GSM8K can't exercise the CPU cache (one shared hot prefix, sub-page tails).
+This driver builds N distinct, page-aligned, long prompts that overflow the
+GPU KV budget so their KV is offloaded to CPU, then re-requests them so they
+are restored from CPU. With greedy decoding the round-2 (CPU-restored) output
+MUST be token-identical to round-1 (freshly computed). For the MTP build, also
+watch the accept-rate (mtp_avg_token_per_step), which would degrade if the draft
+full-attn slots were not persisted/restored correctly; this script does not read it.
+"""
+import argparse
+import sys
+import requests
+from concurrent.futures import ThreadPoolExecutor
+
+
+def make_prompts(n, words_per_prompt):
+    """Build ``n`` distinct prompts, each carrying ``words_per_prompt`` unique filler words."""
+
+    def build(idx):
+        # The "item<idx>-<j>" words are deterministic and unique to this prompt, so
+        # each prompt is its own radix branch and its length scales with words_per_prompt.
+        words = " ".join(f"item{idx}-{j}" for j in range(words_per_prompt))
+        intro = f"You are given list number {idx}. The list is: {words}. "
+        return intro + f"Question: briefly summarize what list number {idx} contains. Answer:"
+
+    return [build(idx) for idx in range(n)]
+
+
+def gen(url, prompt, max_tokens):
+    """POST ``prompt`` to ``url`` with greedy sampling and return the first generated text."""
+    # temperature 0.0 with top_k 1 requests greedy decoding, keeping rounds comparable.
+    sampling = {
+        "temperature": 0.0,
+        "max_new_tokens": max_tokens,
+        "stop_sequences": None,
+        "repetition_penalty": 1.0,
+        "top_p": 1.0,
+        "top_k": 1,
+    }
+    payload = {"inputs": prompt, "parameters": sampling}
+    resp = requests.post(url, json=payload, timeout=120)
+    assert resp.status_code == 200, f"{resp.status_code}: {resp.text}"
+    return resp.json()["generated_text"][0]
+
+
+def run_round(url, prompts, max_tokens, parallel):
+    """Run ``gen`` for every prompt on a thread pool and return outputs in prompt order."""
+    with ThreadPoolExecutor(max_workers=parallel) as pool:
+        # Submit everything first so up to ``parallel`` requests are in flight at once.
+        pending = [pool.submit(gen, url, text, max_tokens) for text in prompts]
+        # Results are read in submission order, so outputs line up with ``prompts``.
+        outputs = [fut.result() for fut in pending]
+    return outputs
+
+
+def main():
+    """Run both rounds against the server and exit with status 1 if any output differs."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--host", default="http://127.0.0.1")
+    ap.add_argument("--port", type=int, default=8088)
+    ap.add_argument("--num-prompts", type=int, default=24)
+    ap.add_argument("--words-per-prompt", type=int, default=400)
+    ap.add_argument("--max-tokens", type=int, default=32)
+    ap.add_argument("--parallel", type=int, default=8)
+    args = ap.parse_args()
+    url = f"{args.host}:{args.port}/generate"
+    prompts = make_prompts(args.num_prompts, args.words_per_prompt)
+
+    print(f"Round 1 (cold compute): {len(prompts)} distinct prompts", flush=True)
+    r1 = run_round(url, prompts, args.max_tokens, args.parallel)
+    print("Round 2 (CPU-restored):", flush=True)
+    r2 = run_round(url, prompts, args.max_tokens, args.parallel)
+
+    mismatches = [i for i, (first, second) in enumerate(zip(r1, r2)) if first != second]
+    print("\n=== RESULT ===")
+    print(f"prompts: {len(prompts)}  identical: {len(prompts) - len(mismatches)}  mismatches: {len(mismatches)}")
+    if mismatches:
+        for i in mismatches[:5]:  # cap the dump at the first five differing prompts
+            print(f"  [#{i}] R1={r1[i]!r}\n        R2={r2[i]!r}")
+        sys.exit(1)
+    print("PASS: round-2 (CPU-restored) output is token-identical to round-1.")
+
+
+if __name__ == "__main__":  # script entry point; importing this module does not start a run
+    main()
diff --git a/unit_tests/common/test_linear_att_mtp_cpu_cache_persistence.py b/unit_tests/common/test_linear_att_mtp_cpu_cache_persistence.py