ModelTC
diff --git a/.gitignore b/.gitignore
Lines changed: 3 additions & 1 deletion
diff --git a/test/benchmark/static_inference/model_infer.py b/test/benchmark/static_inference/model_infer.py
Lines changed: 1 addition & 1 deletion
diff --git a/test/benchmark/static_inference/model_infer_mtp.py b/test/benchmark/static_inference/model_infer_mtp.py
Lines changed: 216 additions & 66 deletions
diff --git a/test/benchmark/static_inference/test_model.py b/test/benchmark/static_inference/test_model.py
Lines changed: 19 additions & 2 deletions
diff --git a/test/cpu_cache_kernel/test_speed.py b/test/cpu_cache_kernel/test_speed.py
Lines changed: 1 addition & 1 deletion
diff --git a/unit_tests/common/basemodel/test_fp8_decode_verify_narrowed.py b/unit_tests/common/basemodel/test_fp8_decode_verify_narrowed.py
Lines changed: 59 additions & 0 deletions
@@ -7,4 +7,6 @@ dist
 .vscode
 tmp/
 requirements-musa.txt
-logs/
+logs/
+
+/benchmark/
@@ -36,7 +36,7 @@ def test_model_inference(args):
             "graph_max_len_in_batch": args.max_req_total_len,
             "graph_max_batch_size": args.graph_max_batch_size,
             "mem_fraction": args.mem_fraction,
-            "max_req_num": 2048,
+            "max_req_num": 512,
             "batch_max_tokens": 1024,
             "run_mode": "normal",
             "max_seq_length": args.max_req_total_len,
 
@@ -11,12 +11,29 @@
 from lightllm.utils.config_utils import get_config_json, get_dtype
 
 
+def parse_batch_size(value):
+    """Parse the ``--batch_size`` command-line value.
+
+    Accepts a single positive integer (``"8"``) or a comma-separated list of
+    positive integers (``"1,2,4,8"``). Whitespace around items is stripped and
+    empty items (for example a trailing comma) are dropped.
+
+    Args:
+        value: Raw option string.
+
+    Returns:
+        An ``int`` when exactly one size is given, otherwise a ``list`` of
+        ``int`` in the order written.
+
+    Raises:
+        ValueError: If no size is given, or an item is not a positive integer.
+            The raised error is also an ``argparse.ArgumentTypeError``: argparse
+            prints the message of that exception type, whereas a plain
+            ``ValueError`` raised from a ``type=`` callable is replaced by a
+            generic "invalid ... value" message.
+    """
+    # Imported locally so this function does not rely on a file-level argparse import.
+    import argparse
+
+    class BatchSizeError(argparse.ArgumentTypeError, ValueError):
+        """Still a ValueError for direct callers; keeps its message under argparse."""
+
+    parts = [part.strip() for part in value.split(",") if part.strip()]
+    if not parts:
+        raise BatchSizeError("batch_size must contain at least one integer")
+
+    batch_sizes = []
+    for part in parts:
+        try:
+            size = int(part)
+        except ValueError:
+            # Replace int()'s generic message with one that names the option and the bad item.
+            raise BatchSizeError(f"batch_size values must be positive integers, got {part!r}") from None
+        if size <= 0:
+            raise BatchSizeError("batch_size values must be positive integers")
+        batch_sizes.append(size)
+
+    # A lone value stays a plain int, matching the option's previous type=int behaviour.
+    if len(batch_sizes) == 1:
+        return batch_sizes[0]
+    return batch_sizes
+
+
 class TestModelInfer(unittest.TestCase):
     def test_model_infer(self):
         args = get_env_start_args()
         if args.data_type is None:
             args.data_type = get_dtype(args.model_dir)
-        if args.mtp_mode == "deepseekv3":
+        if args.mtp_mode is not None:
             test_model_inference_mtp(args)
         else:
             test_model_inference(args)
@@ -27,7 +44,7 @@ def test_model_infer(self):
     import torch
 
     parser = make_argument_parser()
-    parser.add_argument("--batch_size", type=int, default=None, help="batch size")
+    parser.add_argument("--batch_size", type=parse_batch_size, default=None, help="batch size, e.g. 8 or 1,2,4,8")
     parser.add_argument("--input_len", type=int, default=64, help="input sequence length")
     parser.add_argument("--output_len", type=int, default=128, help="output sequence length")
     parser.add_argument(
 
@@ -104,7 +104,7 @@
 buffer_count = triton.cdiv(SEQ_LEN, big_page_token_num) + 2  # matches Qwen3NextMemManager
 
 
-conv_shape = linear_config.get_conv_state_shape()
+conv_shape = linear_config.get_persisted_conv_state_shape()
 cpu_kv_conv_state = torch.empty(
     (buffer_count, linear_config.linear_layer_num, *conv_shape),
     dtype=linear_config.conv_state_dtype,
 
@@ -0,0 +1,59 @@
+import types
+import torch
+import pytest
+
+import lightllm.common.basemodel.attention.fa3.fp8 as fp8_mod
+from lightllm.common.basemodel.attention.fa3.fp8 import Fp8Fa3DecodeAttState
+
+
+def _make_verify_state(n_real, mtp_size, head_num=2, head_dim=8):
+    """Hand-assemble an Fp8Fa3DecodeAttState for an MTP-verify decode step.
+
+    ``init_state`` is never called: the instance comes from ``object.__new__`` and its
+    attributes are assigned directly. ``b_att_seq_len`` and ``page_table`` get ``n_real``
+    rows (the NARROW view), while ``infer_state.b_seq_len`` has ``n_real * mtp_size``
+    entries (the FULL expanded tensor) and must NOT be used as cache_seqlens.
+
+    Returns ``(state, n_real * mtp_size)``. ``head_dim`` is accepted but not read here.
+    """
+    int32 = torch.int32
+    expanded = n_real * mtp_size
+    descale_shape = (1, n_real, head_num)
+
+    verify_state = object.__new__(Fp8Fa3DecodeAttState)
+
+    # Narrow, per-real-request attention metadata.
+    verify_state.b_att_seq_len = torch.full((n_real,), 16, dtype=int32)
+    verify_state.page_table = torch.zeros((n_real, 16), dtype=int32)
+    # Query offsets step by mtp_size: 0, mtp_size, ..., n_real * mtp_size.
+    verify_state.cu_seqlens_q = torch.arange(0, (n_real + 1) * mtp_size, mtp_size, dtype=int32)
+    verify_state.cu_seqlens_k = torch.zeros((n_real + 1,), dtype=int32)
+    verify_state.decode_max_q_seq_len = mtp_size
+
+    # Full-width inference state: one entry per expanded row.
+    verify_state.infer_state = types.SimpleNamespace(
+        b_seq_len=torch.full((expanded,), 16, dtype=int32),
+        batch_size=expanded,
+    )
+
+    # k/v descale sized per real request (att_batch_size), indexed by layer
+    verify_state.k_descale = torch.ones(descale_shape)
+    verify_state.v_descale = torch.ones(descale_shape)
+    # Stub backend whose layer lookup always answers 0.
+    verify_state.backend = types.SimpleNamespace(_find_layer_index=lambda k, v, att_state: 0)
+    return verify_state, expanded
+
+
+def test_fp8_decode_uses_narrowed_cache_seqlens_and_causal(monkeypatch):
+    """_fp8_decode_att must hand flash attention the narrow per-request seqlens and causal=True."""
+    n_real, mtp_size = 3, 4
+    head_num, head_dim = 2, 8
+    state, batch = _make_verify_state(n_real, mtp_size, head_num, head_dim)
+
+    flash_kwargs = {}
+
+    def fake_flash(**kwargs):
+        # Record the keyword arguments of the call, then return zeros sized by q's first three dims.
+        flash_kwargs.update(kwargs)
+        q_shape = kwargs["q"].shape
+        return torch.zeros((q_shape[0], q_shape[1], q_shape[2]))
+
+    def fake_quant(x, use_per_token_if_dynamic=True):
+        # Pass the input through untouched with an all-ones scale, one entry per row.
+        scale = torch.ones((x.shape[0], 1))
+        return x, scale
+
+    for attr, stub in (("flash_attn_with_kvcache", fake_flash), ("scaled_fp8_quant", fake_quant)):
+        monkeypatch.setattr(fp8_mod, attr, stub)
+
+    qkv_shape = (batch, head_num, head_dim)
+    q = torch.randn(qkv_shape)
+    k = torch.randn(qkv_shape)
+    v = torch.randn(qkv_shape)
+
+    state._fp8_decode_att(q=q, k=k, v=v)
+
+    cache_seqlens = flash_kwargs["cache_seqlens"]
+    # The KV-side seqlens must be the NARROW per-real-request tensor, matching page_table rows.
+    assert cache_seqlens is state.b_att_seq_len
+    assert cache_seqlens.shape[0] == n_real
+    assert cache_seqlens.shape[0] == flash_kwargs["page_table"].shape[0]
+    # Verify decode must be causal, like the non-fp8 sibling.
+    assert flash_kwargs["causal"] is True