ModelTC
diff --git a/‎lightllm/server/router/dynamic_prompt/linear_att_radix_cache.py‎
Lines changed: 13 additions & 1 deletion b/‎lightllm/server/router/dynamic_prompt/linear_att_radix_cache.py‎
Lines changed: 13 additions & 1 deletion
diff --git a/‎lightllm/server/router/model_infer/infer_batch.py‎
Lines changed: 16 additions & 0 deletions b/‎lightllm/server/router/model_infer/infer_batch.py‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎unit_tests/server/router/dynamic_prompt/test_linear_att_radix_cache_bigpage.py‎
Lines changed: 300 additions & 0 deletions b/‎unit_tests/server/router/dynamic_prompt/test_linear_att_radix_cache_bigpage.py‎
Lines changed: 300 additions & 0 deletions
@@ -163,7 +163,10 @@ def _discard_node(self, node: LinearAttPagedTreeNode):
         return
 
     def _add_node(self, node: LinearAttPagedTreeNode):
-        if node.is_leaf():
+        # root 永远不参与回收：当树为空时 root 自身也满足 is_leaf()，若加入 _evict_tree_set，
+        # 会与 _evict 中 "node is not self.root_node" 的断言相矛盾（当前仅靠 root 的 ref_counter>=1
+        # 和回收水位 guard 掩盖）。这里显式排除，使数据结构与回收逻辑的意图一致。
+        if node.is_leaf() and node is not self.root_node:
             self._evict_tree_set.add(node)
         if node.small_page_buffer_idx is not None:
             self._evict_tree_set_for_linear_att.add(node)
@@ -362,12 +365,18 @@ def match_prefix(
             ans_node_list=ans_node_list,
             update_refs=update_refs,
         )
+        # _match_prefix_helper 进入时一定对 root 自增了一次 ref_counter。命中链非空时，调用方最终会
+        # 通过 dec_node_ref_counter(ans_node) 沿父链回收（含 root），增减平衡；但下面两个 "命中为空"
+        # 的提前返回会把 None 交给调用方，调用方不会再回收，root 自增就无人抵消，导致 root.ref_counter
+        # 在每次 miss / trim 到空时持续漂移。这里显式补偿这一次 root 自增。
         if len(ans_node_list) == 0:
+            self.dec_node_ref_counter(self.root_node)
             return None, 0, None
 
         # 判定真正可以用的匹配节点。
         ans_node_list = self._trim_unusable_match_tail(ans_node_list)
         if len(ans_node_list) == 0:
+            self.dec_node_ref_counter(self.root_node)
             return None, 0, None
 
         ans_node = ans_node_list[-1]
@@ -482,6 +491,9 @@ def deref_to_first_big_page_node(self, node: LinearAttPagedTreeNode) -> Optional
             iter_node = iter_node.parent
 
         if iter_node is self.root_node:
+            # 没有可承接的 big-page 节点交给调用方释放：root 在 match 阶段同样被 +1，
+            # 这里必须补偿，否则与 match_prefix miss 路径同类的 root ref 漂移。
+            self.dec_node_ref_counter(self.root_node)
             return None
         else:
             return iter_node
 
@@ -149,6 +149,18 @@ def _full_att_free_req(self, free_token_index: List, req: "InferReq"):
             req.shared_kv_node = None
         return
 
+    def _release_pending_linear_att_big_page_ids(self, req: "InferReq"):
+        """Free big-page state buffers that *req* allocated but never handed to the radix cache.
+
+        During prefill a request allocates a big-page state buffer at each big-page
+        boundary. Leftovers only exist when the request is released without going
+        through an insert branch (small-page / big-page insert); the typical case is
+        big-page mode where the request is paused or aborted after prefill crossed a
+        big-page boundary but before it reached the end. Without this release the
+        big-page state slots leak and the "dict must be empty" assertion in
+        free_a_req_mem fires.
+        """
+        pending = req.linear_att_len_to_big_page_id
+        if not pending:
+            # Nothing was allocated (or everything was already inserted / released).
+            return
+        big_page_ids = list(pending.values())
+        self.radix_cache.linear_att_big_page_buffers.free_state_cache(big_page_ids)
+        # Empty the mapping so the ids cannot be released a second time.
+        pending.clear()
+        return
+
     def _linear_att_free_req(self, free_token_index: List, req: "InferReq"):
         assert g_infer_context.is_linear_att_mixed_model is True
         args = get_env_start_args()
@@ -164,6 +176,7 @@ def _linear_att_free_req(self, free_token_index: List, req: "InferReq"):
             assert req.linear_att_cache_len <= req.cur_kv_len
 
         if req.cur_kv_len == 0:
+            self._release_pending_linear_att_big_page_ids(req)
             return
 
         if req.linear_att_cache_len <= req.cur_kv_len and req.tail_linear_att_small_page_buffer_id is not None:
@@ -232,6 +245,9 @@ def _linear_att_free_req(self, free_token_index: List, req: "InferReq"):
                 assert req.shared_kv_node.node_prefix_total_len == req.cur_kv_len
                 self.radix_cache.dec_node_ref_counter(req.shared_kv_node)
                 req.shared_kv_node = None
+            # 该分支不会把 prefill 阶段累积的 big page id 插入 radix cache（典型为 pause/abort
+            # 在 prefill 跨过 big page 边界后、到达末尾前触发），需在此显式释放，避免泄漏。
+            self._release_pending_linear_att_big_page_ids(req)
             return
 
         assert False, f"error state: cur_kv_len: {req.cur_kv_len}"
 
@@ -0,0 +1,300 @@
+"""Big-page-regime coverage + invariant fuzz for LinearAttPagedRadixCache.
+
+Active in production only when --linear_att_page_block_num is set (e.g. the GSM8K
+launch scripts use 8). Here big_page_num is small so inserts create big-page nodes
+plus an optional small tail, mirroring _linear_att_free_req's two insert calls and
+copy_linear_att_state_to_cache_buffer's len_to_big_page_id construction.
+"""
+import uuid
+
+import numpy as np
+import pytest
+import torch
+from sortedcontainers import SortedDict
+
+from lightllm.server.router.dynamic_prompt.linear_att_radix_cache import LinearAttPagedRadixCache
+from lightllm.utils.kv_cache_utils import compute_token_list_hash
+
+# Tokens per hash page; handed to the cache as hash_page_size.
+PAGE = 4
+# Small pages per big page; handed to the cache as big_page_num.
+BIGN = 2
+# Token length of a single big-page node.
+BIG_TOKENS = PAGE * BIGN
+
+
+class FakePool:
+    """In-memory stand-in for a state-buffer pool that hands out integer slot ids.
+
+    ``free_set`` holds the ids that are currently available; ``order`` keeps the
+    same ids in first-in-first-out hand-out order.
+    """
+
+    def __init__(self, size):
+        self.size = size
+        self.order = list(range(size))
+        self.free_set = set(self.order)
+
+    def alloc_one_state_cache(self):
+        """Hand out the oldest free id, or ``None`` when the pool is exhausted."""
+        try:
+            slot = self.order.pop(0)
+        except IndexError:
+            return None
+        self.free_set.discard(slot)
+        return slot
+
+    def free_state_cache(self, free_indexes):
+        """Give every id in *free_indexes* back to the pool; a double free asserts."""
+        for slot in free_indexes:
+            assert slot is not None and slot not in self.free_set, f"double free {slot}"
+            self.order.append(slot)
+            self.free_set.add(slot)
+
+    def get_free_cache_num(self):
+        """Return how many ids can still be allocated."""
+        return len(self.order)
+
+
+class FakeAllocator:
+    """Minimal allocator stub exposing a capacity and a free-token counter."""
+
+    def __init__(self, size):
+        # Starts out completely free: the usable amount equals the capacity.
+        self.size = self.can_use_mem_size = size
+
+
+class FakeMem:
+    """Stub KV-cache memory manager: owns a FakeAllocator and carries the big-page pool."""
+
+    def __init__(self, size, big_pool):
+        self.linear_att_big_page_buffers = big_pool
+        self.allocator = FakeAllocator(size)
+
+    def free(self, mem_index):
+        """Credit ``len(mem_index)`` tokens back to the allocator's free counter."""
+        released = len(mem_index)
+        self.allocator.can_use_mem_size += released
+
+
+def build(small_size=32, big_size=64, mem=400_000):
+    """Create a cache wired to fake pools.
+
+    Returns ``(cache, small_pool, big_pool, mem_manager)``. Each call uses a
+    fresh random ``unique_name`` for the cache.
+    """
+    small_pool = FakePool(small_size)
+    big_pool = FakePool(big_size)
+    mem_manager = FakeMem(mem, big_pool)
+    cache_kwargs = dict(
+        unique_name=f"bp_{uuid.uuid4().hex[:8]}",
+        total_token_num=mem,
+        rank_in_node=0,
+        hash_page_size=PAGE,
+        big_page_num=BIGN,
+        kv_cache_mem_manager=mem_manager,
+        linear_att_small_page_buffers=small_pool,
+    )
+    return LinearAttPagedRadixCache(**cache_kwargs), small_pool, big_pool, mem_manager
+
+
+def walk(cache):
+    """Collect every node below the root (the root itself is excluded).
+
+    Traversal is depth-first with an explicit stack, so the most recently
+    pushed child is visited first.
+    """
+    pending = [*cache.root_node.children.values()]
+    visited = []
+    while pending:
+        current = pending.pop()
+        visited.append(current)
+        pending += current.children.values()
+    return visited
+
+
+def page_tokens(pid):
+    """Return the PAGE consecutive token ids that make up page *pid*."""
+    first = pid * PAGE
+    last = first + PAGE
+    return [*range(first, last)]
+
+
+def hashes_for(pids):
+    """Compute the per-page hash list for the token stream of the pages in *pids*."""
+    tokens = [tok for pid in pids for tok in page_tokens(pid)]
+    # A trailing -1 token is appended before hashing; presumably this lets the
+    # final full page be hashed -- TODO confirm against compute_token_list_hash.
+    tokens.append(-1)
+    return compute_token_list_hash(tokens, PAGE)
+
+
+def check(cache, small, big):
+    """Assert the cache's structural, accounting, evict-set and buffer-id invariants.
+
+    ``small`` and ``big`` are the FakePool instances backing the small-page and
+    big-page state buffers. The id-conservation checks assume every allocated id
+    is held by a tree node, i.e. no caller is holding an id outside the tree.
+    """
+    nodes = walk(cache)
+    # structural + accounting
+    total = 0
+    refed = 0
+    for n in nodes:
+        assert n.parent is not None
+        # prefix length accumulates along the parent chain
+        assert n.node_prefix_total_len == n.parent.node_prefix_total_len + n.node_value_len
+        assert n.ref_counter >= 0
+        assert n.node_value_len == len(n.token_mem_index_value)
+        if n.is_big_page_node():
+            # big-page node: BIGN pages, owns a big buffer and never a small one
+            assert n.page_num == BIGN and n.node_value_len == BIG_TOKENS
+            assert n.big_page_buffer_idx is not None
+            assert n.small_page_buffer_idx is None
+        else:
+            # small-page node: exactly one page; its small buffer is optional
+            assert n.page_num == 1 and n.node_value_len == PAGE
+            assert n.big_page_buffer_idx is None
+        total += n.node_value_len
+        if n.ref_counter > 0:
+            refed += n.node_value_len
+        for k, c in n.children.items():
+            # children are keyed by their own page hash and point back at n
+            assert c.page_hash == k and c.parent is n
+    # the cache's running counters must agree with a fresh walk of the tree
+    assert cache.get_tree_total_tokens_num() == total
+    assert cache.get_refed_tokens_num() == refed
+    # evict set == non-root leaves
+    leaves = {id(n) for n in nodes if n.is_leaf()}
+    assert {id(n) for n in cache._evict_tree_set} == leaves
+    assert id(cache.root_node) not in {id(n) for n in cache._evict_tree_set}
+    # buffer-evict set == small-buffer holders
+    assert {id(n) for n in cache._evict_tree_set_for_linear_att} == {
+        id(n) for n in nodes if n.small_page_buffer_idx is not None
+    }
+    # big-page id conservation: each id is held by exactly one node or is free
+    big_in_tree = [n.big_page_buffer_idx for n in nodes if n.is_big_page_node()]
+    assert len(big_in_tree) == len(set(big_in_tree)), "big-page id reused by two nodes"
+    assert set(big_in_tree).isdisjoint(big.free_set)
+    assert set(big_in_tree) | big.free_set == set(range(big.size)), "big-page id leaked"
+    # small-page id conservation: same rule for the small pool
+    small_in_tree = [n.small_page_buffer_idx for n in nodes if n.small_page_buffer_idx is not None]
+    assert len(small_in_tree) == len(set(small_in_tree))
+    assert set(small_in_tree).isdisjoint(small.free_set)
+    assert set(small_in_tree) | small.free_set == set(range(small.size)), "small-page id leaked"
+
+
+def make_insert(cache, small, big):
+    """Return an ``insert(pids, mem_base, with_small_tail)`` helper bound to *cache*.
+
+    Mirrors _linear_att_free_req: a big-page-aligned prefix plus an optional small
+    tail. ``small`` and ``big`` are the pools the state-buffer ids are drawn from.
+    """
+
+    def insert(pids, mem_base, with_small_tail):
+        """Insert the page-id path *pids*, backed by KV slots starting at *mem_base*.
+
+        One big-page id is allocated per big-page boundary along the path. Pages
+        past the last boundary are kept only when ``with_small_tail`` is true and
+        a small-page buffer could be allocated for the final page; otherwise the
+        path is trimmed back to the last boundary. Nothing is inserted when the
+        big pool runs dry or when trimming leaves no pages.
+        """
+        num_big = len(pids) // BIGN
+        aligned_pages = num_big * BIGN
+        has_partial_tail = len(pids) != aligned_pages
+
+        # len_to_big_page_id: one fresh big id per big-page boundary along the path
+        l2b = SortedDict()
+        big_ids_alloced = []
+        for j in range(1, num_big + 1):
+            bid = big.alloc_one_state_cache()
+            if bid is None:
+                # big pool exhausted: the real caller would not start this insert; roll back.
+                big.free_state_cache(big_ids_alloced)
+                return
+            big_ids_alloced.append(bid)
+            l2b[j * BIG_TOKENS] = bid
+
+        tail_buf = None
+        if has_partial_tail and with_small_tail:
+            tail_buf = small.alloc_one_state_cache()
+
+        if has_partial_tail and tail_buf is None:
+            # Contract: a path must not end in a non-aligned page without a state
+            # buffer. Trim to the last big-page boundary; dropping just one page
+            # would only reach a boundary when BIGN == 2.
+            pids = pids[:aligned_pages]
+            if not pids:
+                # aligned_pages == 0 means num_big == 0, so no big id was taken.
+                return
+
+        page_count = len(pids)
+        hashs = hashes_for(pids)
+        key = torch.tensor([t for p in pids for t in page_tokens(p)], dtype=torch.int64)
+        value = torch.arange(mem_base, mem_base + page_count * PAGE, dtype=torch.int64)
+        linear_idxs = [None] * page_count
+        if tail_buf is not None:
+            linear_idxs[-1] = tail_buf
+
+        # If the tail buffer duplicates an existing node the cache frees it; nothing to track here.
+        cache.insert(key, value, block_hashs=hashs, block_linear_idxs=linear_idxs, len_to_big_page_id=l2b)
+
+    return insert
+
+
+def test_pure_bigpage_insert_and_match():
+    """A big-page-aligned path is stored as big pages only and matches back in full."""
+    pids = [1, 2, 3, 4]
+    cache, small, big, _ = build()
+    insert = make_insert(cache, small, big)
+
+    # 4 pages -> 2 big pages, no small tail
+    insert(pids, 1000, with_small_tail=False)
+    check(cache, small, big)
+    assert cache.get_tree_total_tokens_num() == 4 * PAGE
+
+    block_hashs = hashes_for(pids)
+    key = torch.tensor([t for p in pids for t in page_tokens(p)], dtype=torch.int64)
+    node, kv, mem = cache.match_prefix(key, block_hashs=block_hashs, update_refs=True)
+    assert node is not None
+    assert node.is_big_page_node()
+    assert kv == 16
+    assert len(mem) == 16
+    expected_mem = torch.arange(1000, 1016, dtype=torch.int64)
+    assert torch.equal(mem, expected_mem)
+
+    # release the reference taken by match_prefix before re-checking invariants
+    cache.dec_node_ref_counter(node)
+    check(cache, small, big)
+
+
+def test_mixed_insert_match_trims_to_bigpage_when_tail_unusable():
+    """A match falls back to the last big-page boundary once the tail page loses its buffer."""
+    pids = [1, 2, 3, 4, 5]
+    cache, small, big, _ = build(small_size=1)
+    insert = make_insert(cache, small, big)
+
+    # 5 pages -> 2 big pages (8 tokens *2 =16) + 1 small tail page (4) = 20 tokens
+    insert(pids, 2000, with_small_tail=True)
+    check(cache, small, big)
+    assert cache.get_tree_total_tokens_num() == 20
+
+    # exhaust small pool and steal the tail buffer -> tail page unusable
+    while True:
+        if small.alloc_one_state_cache() is None:
+            break
+    cache.free_one_small_page_linear_att_buffer()
+    check(cache, small, big)
+
+    block_hashs = hashes_for(pids)
+    key = torch.tensor([t for p in pids for t in page_tokens(p)], dtype=torch.int64)
+    node, kv, mem = cache.match_prefix(key, block_hashs=block_hashs, update_refs=True)
+    # tail small page has no buffer -> trim back to the last big-page boundary (16)
+    assert node is not None
+    assert node.is_big_page_node()
+    assert kv == 16
+
+    cache.dec_node_ref_counter(node)
+    check(cache, small, big)
+
+
+@pytest.mark.parametrize("seed", list(range(10)))
+def test_bigpage_fuzz(seed):
+    """Randomly interleave insert / match / deref / buffer-steal / evict operations.
+
+    Invariants are re-checked after every step, and the root's ref_counter must
+    always equal 1 plus the number of matched nodes still held in ``live``.
+    Each seed drives its own numpy Generator, so a run is reproducible per seed.
+    """
+    rng = np.random.default_rng(seed)
+    cache, small, big, mm = build(small_size=10, big_size=48, mem=400_000)
+    ins = make_insert(cache, small, big)
+    # matched nodes whose reference has not been released yet
+    live = []
+    # one-element list so the nested helper can advance the base in place
+    mem_base = [10_000]
+
+    def do_ins():
+        # insert a random path of 1..6 pages drawn from 25 page ids
+        L = int(rng.integers(1, 7))
+        pids = [int(rng.integers(0, 25)) for _ in range(L)]
+        ins(pids, mem_base[0], with_small_tail=bool(rng.integers(0, 2)))
+        mem_base[0] += 100
+
+    def do_match():
+        # match a random path; on a hit keep the node so its ref can be dropped later
+        L = int(rng.integers(1, 7))
+        pids = [int(rng.integers(0, 25)) for _ in range(L)]
+        hashs = hashes_for(pids)
+        key = torch.tensor([t for p in pids for t in page_tokens(p)], dtype=torch.int64)
+        node, kv, mem = cache.match_prefix(key, block_hashs=hashs, update_refs=True)
+        if node is None:
+            assert kv == 0 and mem is None
+            return
+        assert kv == node.node_prefix_total_len == len(mem)
+        # a returned node is either a big-page node or a small page that still owns a buffer
+        assert node.is_big_page_node() or node.small_page_buffer_idx is not None
+        live.append(node)
+
+    def do_dec():
+        # release the reference of one randomly chosen live node
+        if live:
+            cache.dec_node_ref_counter(live.pop(int(rng.integers(0, len(live)))))
+
+    def do_steal():
+        cache.free_one_small_page_linear_att_buffer()
+
+    def do_evict():
+        # evict a random whole number of pages, at most the unreferenced token count
+        unref = cache.get_tree_total_tokens_num() - cache.get_refed_tokens_num()
+        if unref < PAGE:
+            return
+        need = int(rng.integers(1, unref // PAGE + 1)) * PAGE
+        # the callback hands any small-page buffer id it receives back to the small pool
+        cache._evict(need, lambda m, b: small.free_state_cache([b]) if b is not None else None)
+
+    # do_ins and do_match appear twice, so they are picked twice as often as the rest
+    ops = [do_ins, do_ins, do_match, do_match, do_dec, do_steal, do_evict]
+    for _ in range(400):
+        ops[int(rng.integers(0, len(ops)))]()
+        check(cache, small, big)
+        assert cache.root_node.ref_counter == 1 + len(live), "root ref drifted (big-page regime)"
+
+    # drain: drop every outstanding reference, evict everything, expect an empty tree
+    while live:
+        cache.dec_node_ref_counter(live.pop())
+    assert cache.get_refed_tokens_num() == 0
+    t = cache.get_tree_total_tokens_num()
+    if t:
+        cache._evict(t, lambda m, b: small.free_state_cache([b]) if b is not None else None)
+    assert cache.get_tree_total_tokens_num() == 0
+    check(cache, small, big)