test(static_inference): generalize MTP static benchmark

sufubao · sufubao · commit 0d42047b6ed1 · 2026-06-07T16:10:14.000+08:00
- Dispatch to MTP bench whenever mtp_mode is set (the old check compared against the
  literal 'deepseekv3', which left the MTP path as dead code)
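- init_mtp_model: dispatch by config model_type (deepseek_v3/qwen3_moe/mistral/
  glm4_moe_lite/qwen3_5[_text]/qwen3_5_moe[_text]), handle eagle (1 instance) vs vanilla
  (mtp_step instances); build per-draft kvargs with the full att/kv/quant settings
- test_model_inference_mtp: fix the mem_faction -> mem_fraction kvargs key typo; pass
  full att/kv/quant kvargs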
- run_forward_once: adapt to new ModelInput API (mem_indexes_cpu + CPU tensors,
  max_q/kv_seq_len, b_mtp_index, b_prefill_start_loc); reuse draft instances via
  _step % num_instances; pad/truncate draft_ids to mtp_step+1
- Cap max_req_num at 512 in both model_infer.py (was 2048) and model_infer_mtp.py
  (was 2000) to avoid GDN req-state cache OOM under MTP
diff --git a/test/benchmark/static_inference/model_infer.py b/test/benchmark/static_inference/model_infer.py
@@ -36,7 +36,7 @@ def test_model_inference(args):
             "graph_max_len_in_batch": args.max_req_total_len,
             "graph_max_batch_size": args.graph_max_batch_size,
             "mem_fraction": args.mem_fraction,
-            "max_req_num": 2048,
+            "max_req_num": 512,
             "batch_max_tokens": 1024,
             "run_mode": "normal",
             "max_seq_length": args.max_req_total_len,
diff --git a/test/benchmark/static_inference/model_infer_mtp.py b/test/benchmark/static_inference/model_infer_mtp.py
@@ -9,42 +9,85 @@
 from lightllm.models import get_model
 from lightllm.common.basemodel.batch_objs import ModelInput, ModelOutput
 from lightllm.server.core.objs.start_args_type import StartArgs
-from torch.profiler import profile, record_function, ProfilerActivity
+from torch.profiler import profile, ProfilerActivity
 from lightllm.utils.log_utils import init_logger
 from lightllm.models.deepseek_mtp.model import Deepseek3MTPModel
-import torch.cuda as cuda
+from lightllm.models.qwen3_moe_mtp.model import Qwen3MOEMTPModel
+from lightllm.models.mistral_mtp.model import MistralMTPModel
+from lightllm.models.glm4_moe_lite_mtp.model import Glm4MoeLiteMTPModel
 
 logger = init_logger(__name__)
 
 
 def init_mtp_model(args: StartArgs, kvargs, main_model):
-    mtp_step = args.mtp_step
     draft_models = []
 
     os.environ["DISABLE_CHECK_MAX_LEN_INFER"] = "1"
-    mtp_model_kvargs = kvargs
-    mtp_model_kvargs.update(
-        {
-            "weight_dir": args.mtp_draft_model_dir,
+
+    if args.mtp_mode in ["vanilla_with_att", "vanilla_no_att"]:
+        num_mtp_modules = args.mtp_step
+    elif args.mtp_mode in ["eagle_with_att", "eagle_no_att"]:
+        num_mtp_modules = 1
+    else:
+        assert False, f"error mtp mode {args.mtp_mode}"
+
+    for i in range(num_mtp_modules):
+        mtp_model_cfg, _ = PretrainedConfig.get_config_dict(args.mtp_draft_model_dir[i])
+        model_type = mtp_model_cfg.get("model_type", "")
+        mtp_model_kvargs = {
+            "weight_dir": args.mtp_draft_model_dir[i],
             "max_total_token_num": main_model.mem_manager.size,
-            "disable_chunked_prefill": True,
-            "mtp_mode": args.mtp_mode,
+            "load_way": kvargs["load_way"],
+            "max_req_num": kvargs.get("max_req_num", 1000),
+            "max_seq_length": kvargs.get("max_seq_length", 1024 * 5),
+            "is_token_healing": False,
+            "return_all_prompt_logics": False,
+            "disable_chunked_prefill": args.disable_chunked_prefill,
+            "data_type": kvargs.get("data_type", "float16"),
+            "graph_max_batch_size": kvargs.get("graph_max_batch_size", 16),
+            "graph_max_len_in_batch": kvargs.get("graph_max_len_in_batch", 8196),
+            "disable_cudagraph": kvargs.get("disable_cudagraph", False),
+            "mem_fraction": kvargs["mem_fraction"],
+            "batch_max_tokens": kvargs.get("batch_max_tokens", None),
+            "quant_type": kvargs.get("quant_type", None),
+            "quant_cfg": kvargs.get("quant_cfg", None),
+            "run_mode": "normal",
+            "llm_prefill_att_backend": kvargs.get("llm_prefill_att_backend", args.llm_prefill_att_backend),
+            "llm_decode_att_backend": kvargs.get("llm_decode_att_backend", args.llm_decode_att_backend),
+            "vit_att_backend": kvargs.get("vit_att_backend", args.vit_att_backend),
+            "llm_kv_type": kvargs.get("llm_kv_type", args.llm_kv_type),
+            "llm_kv_quant_group_size": kvargs.get("llm_kv_quant_group_size", args.llm_kv_quant_group_size),
             "main_model": main_model,
+            "mtp_previous_draft_models": draft_models.copy(),
+            "mtp_mode": args.mtp_mode,
         }
-    )
-    for i in range(mtp_step):
-        mtp_model_cfg, _ = PretrainedConfig.get_config_dict(args.mtp_draft_model_dir)
-        mtp_model_kvargs.update(
-            {
-                "weight_dir": args.spec_model_dir,
-                "max_total_token_num": main_model.mem_manager.size,
-                "disable_chunked_prefill": True,
-                "mtp_mode": args.mtp_mode,
-                "main_model": main_model,
-                "mem_layer_start": main_model.config["num_hidden_layers"] + i * mtp_model_cfg["num_hidden_layers"],
-            }
-        )
-        draft_models.append(Deepseek3MTPModel(mtp_model_kvargs))
+
+        if model_type == "deepseek_v3":
+            assert args.mtp_mode in ["vanilla_with_att", "eagle_with_att"]
+            draft_models.append(Deepseek3MTPModel(mtp_model_kvargs))
+        elif model_type == "qwen3_moe":
+            assert args.mtp_mode in ["vanilla_no_att", "eagle_no_att"]
+            draft_models.append(Qwen3MOEMTPModel(mtp_model_kvargs))
+        elif model_type == "mistral":
+            assert args.mtp_mode in ["vanilla_no_att", "eagle_no_att"]
+            draft_models.append(MistralMTPModel(mtp_model_kvargs))
+        elif mtp_model_cfg["model_type"] == "glm4_moe_lite":
+            assert args.mtp_mode in ["vanilla_with_att", "eagle_with_att"]
+            draft_models.append(Glm4MoeLiteMTPModel(mtp_model_kvargs))
+        elif model_type in ("qwen3_5", "qwen3_5_text"):
+            assert args.mtp_mode in ["vanilla_with_att", "eagle_with_att"]
+            from lightllm.models.qwen3_5_mtp.model import Qwen3_5MTPModel
+
+            draft_models.append(Qwen3_5MTPModel(mtp_model_kvargs))
+        elif model_type in ("qwen3_5_moe", "qwen3_5_moe_text"):
+            assert args.mtp_mode in ["vanilla_with_att", "eagle_with_att"]
+            from lightllm.models.qwen3_5_moe_mtp.model import Qwen3_5MoeMTPModel
+
+            draft_models.append(Qwen3_5MoeMTPModel(mtp_model_kvargs))
+        else:
+            raise ValueError(f"Unsupported MTP model type: {model_type}")
+
+        logger.info(f"loaded mtp model class {draft_models[i].__class__}")
     return draft_models
 
 
@@ -68,13 +111,22 @@ def test_model_inference_mtp(args):
             "max_total_token_num": args.max_total_token_num,
             "graph_max_len_in_batch": args.max_req_total_len,
             "graph_max_batch_size": args.graph_max_batch_size,
-            "mem_faction": args.mem_fraction,
-            "max_req_num": 2000,
+            "mem_fraction": args.mem_fraction,
+            # Static bench runs explicit batch sizes (<= a few hundred). The hybrid Qwen3.5
+            # GDN req-state cache is sized max_req_num * (mtp_step + 1) at ~34 MB/slot, so the
+            # old default of 2000 alloc'd ~140 GB and OOM'd under MTP. 512 covers any realistic
+            # static batch sweep while keeping the GDN cache small.
+            "max_req_num": 512,
             "batch_max_tokens": 2048,
             "run_mode": "normal",
             "max_seq_length": args.max_req_total_len,
-            "spec_algo": args.spec_algo,
             "disable_cudagraph": args.disable_cudagraph,
+            "quant_cfg": args.quant_cfg,
+            "llm_prefill_att_backend": args.llm_prefill_att_backend,
+            "llm_decode_att_backend": args.llm_decode_att_backend,
+            "vit_att_backend": args.vit_att_backend,
+            "llm_kv_type": args.llm_kv_type,
+            "llm_kv_quant_group_size": args.llm_kv_quant_group_size,
         }
         proc = multiprocessing.Process(
             target=tppart_model_infer,
@@ -113,28 +165,36 @@ def run_forward_once(args, input_len, output_len, batch_size, main_model, draft_
 
     test_data = np.vstack([np.random.randint(0, 50256, input_len) for _ in range(batch_size)])
     test_data = test_data.reshape(-1)
-    test_data = torch.from_numpy(test_data).cuda()
+    test_data = torch.from_numpy(test_data)
 
     b_req_idx = torch.tensor(
-        [main_model.req_manager.alloc() for _ in range(batch_size)], dtype=torch.int32, device="cuda"
+        [main_model.req_manager.alloc() for _ in range(batch_size)], dtype=torch.int32, device="cpu"
     )
-    b_seq_len = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
-    b_ready_cache_len = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
+    b_seq_len = torch.zeros(batch_size, dtype=torch.int32, device="cpu")
+    b_ready_cache_len = torch.zeros(batch_size, dtype=torch.int32, device="cpu")
     for i in range(batch_size):
         b_seq_len[i] = input_len
 
     total_token_num = input_len * batch_size
-    mem_indexes = main_model.req_manager.mem_manager.alloc(test_data.shape[0]).cuda()
+    mem_indexes = main_model.req_manager.mem_manager.alloc(test_data.shape[0])
+    b_mtp_index = torch.zeros(batch_size, dtype=torch.int32)
+    b_prefill_start_loc = b_seq_len.cumsum(dim=0, dtype=torch.int32) - b_seq_len
     # Main model Prefill
     model_input = ModelInput(
         batch_size=batch_size,
         total_token_num=total_token_num,
+        max_q_seq_len=input_len,
+        max_kv_seq_len=input_len,
+        max_cache_len=0,
         input_ids=test_data,
-        mem_indexes=mem_indexes,
+        mem_indexes_cpu=mem_indexes,
         b_req_idx=b_req_idx,
+        b_mtp_index=b_mtp_index,
         b_seq_len=b_seq_len,
         is_prefill=True,
         b_ready_cache_len=b_ready_cache_len,
+        b_prefill_start_loc=b_prefill_start_loc,
+        prefix_total_token_num=0,
         multimodal_params=[{"images": [], "audios": []} for _ in range(batch_size)],
     )
 
@@ -167,8 +227,22 @@ def run_forward_once(args, input_len, output_len, batch_size, main_model, draft_
 
     torch.cuda.synchronize()
 
+    # Speculative width = args.mtp_step in BOTH modes (mirrors base_backend: self.mtp_step =
+    # args.mtp_step). The number of draft MODEL INSTANCES differs: vanilla loads mtp_step
+    # instances (each forwarded once), eagle loads ONE instance forwarded mtp_step times
+    # (chunked_prefill/impl.py: draft_models[_step % num_instances]). The verify batch always
+    # expands to (mtp_step + 1) rows per request.
+    spec_width = args.mtp_step
+    num_instances = len(draft_models)
+    # The draft prefill above produced (1 + num_instances) columns; pad/truncate to
+    # (spec_width + 1) so the decode verify batch matches the server's expand width. Only the
+    # SHAPE matters for throughput here (argmax over random inputs); token values do not.
+    while len(draft_ids) < spec_width + 1:
+        draft_ids.append(draft_ids[-1])
+    draft_ids = draft_ids[: spec_width + 1]
     decode_input_ids = np.stack(draft_ids, axis=-1).reshape(-1)
-    decode_input_ids = torch.from_numpy(decode_input_ids).cuda()
+    decode_input_ids = torch.from_numpy(decode_input_ids)
+    mtp_step = spec_width
 
     # build main decode input:
     nopad_b_seq_idx = []
@@ -177,35 +251,39 @@ def run_forward_once(args, input_len, output_len, batch_size, main_model, draft_
     nopad_max_len_in_batch = 0
 
     for i in range(batch_size):
-        nopad_b_seq_idx.append(b_req_idx[i])
+        nopad_b_seq_idx.append(b_req_idx[i].item())
         seq_len = b_seq_len[i].item()
         nopad_b_seq_len.append(seq_len + 1)
         nopad_total_token_num += seq_len + 1
-        nopad_max_len_in_batch = max(nopad_max_len_in_batch, b_seq_len[i] + 1)
+        nopad_max_len_in_batch = max(nopad_max_len_in_batch, seq_len + 1)
 
-        for step in range(len(draft_models)):
-            nopad_b_seq_idx.append(b_req_idx[i])
+        for step in range(mtp_step):
+            nopad_b_seq_idx.append(b_req_idx[i].item())
             nopad_b_seq_len.append(seq_len + step + 2)
             nopad_total_token_num += seq_len + step + 2
             nopad_max_len_in_batch = max(nopad_max_len_in_batch, seq_len + step + 2)
 
-    nopad_b_seq_idx = torch.tensor(nopad_b_seq_idx, dtype=torch.int32, device="cuda")
-    nopad_b_seq_len = torch.tensor(nopad_b_seq_len, dtype=torch.int32, device="cuda")
-    mem_indexes = main_model.req_manager.mem_manager.alloc(batch_size * (len(draft_models) + 1)).cuda()
+    nopad_b_seq_idx = torch.tensor(nopad_b_seq_idx, dtype=torch.int32, device="cpu")
+    nopad_b_seq_len = torch.tensor(nopad_b_seq_len, dtype=torch.int32, device="cpu")
+    b_mtp_index = torch.arange(mtp_step + 1, dtype=torch.int32).repeat(batch_size)
+    mem_indexes = main_model.req_manager.mem_manager.alloc(batch_size * (mtp_step + 1))
 
     model_input = ModelInput(
-        batch_size=batch_size * (len(draft_models) + 1),
+        batch_size=batch_size * (mtp_step + 1),
         total_token_num=nopad_total_token_num,
+        max_q_seq_len=1,
+        max_kv_seq_len=nopad_max_len_in_batch,
         input_ids=decode_input_ids,
-        mem_indexes=mem_indexes,
+        mem_indexes_cpu=mem_indexes,
         b_req_idx=nopad_b_seq_idx,
+        b_mtp_index=b_mtp_index,
         b_seq_len=nopad_b_seq_len,
         is_prefill=False,
-        multimodal_params=[{"images": [], "audios": []} for _ in range(batch_size * (len(draft_models) + 1))],
+        multimodal_params=[{"images": [], "audios": []} for _ in range(batch_size * (mtp_step + 1))],
     )
 
     # Main decode
-    for i in range(0, output_len, len(draft_models) + 1):
+    for i in range(0, output_len, mtp_step + 1):
         torch.cuda.synchronize()
         step_start_time = time.time()
         model_output = main_model.forward(
@@ -214,12 +292,13 @@ def run_forward_once(args, input_len, output_len, batch_size, main_model, draft_
         prob_out = torch.softmax(model_output.logits, dim=-1)
         predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
 
-        # draft decode
+        # draft decode: mtp_step forwards, reusing draft_models[_step % num_instances]
+        # (eagle: one instance reused mtp_step times; vanilla: a distinct instance per step).
         model_input.input_ids = predict_ids.reshape(-1)
         model_input.mtp_draft_input_hiddens = model_output.mtp_main_output_hiddens
 
-        for draft_model_id in range(len(draft_models)):
-            draft_model = draft_models[draft_model_id]
+        for _step in range(mtp_step):
+            draft_model = draft_models[_step % num_instances]
             model_output = draft_model.forward(
                 model_input,
             )
@@ -237,7 +316,7 @@ def run_forward_once(args, input_len, output_len, batch_size, main_model, draft_
             if get_current_rank_in_dp() == 0 and not warmup:
                 step_time = step_end_time - step_start_time
                 print(i, " step cost time:", step_time * 1000)
-                print(f"Decode throughput: {batch_size * (len(draft_models) + 1) * args.dp / step_time} tokens/s")
+                print(f"Decode throughput: {batch_size * (mtp_step + 1) * args.dp / step_time} tokens/s")
 
     main_model.mem_manager.free_all()
     main_model.req_manager.free_all()
diff --git a/test/benchmark/static_inference/test_model.py b/test/benchmark/static_inference/test_model.py
@@ -16,7 +16,7 @@ def test_model_infer(self):
         args = get_env_start_args()
         if args.data_type is None:
             args.data_type = get_dtype(args.model_dir)
-        if args.mtp_mode == "deepseekv3":
+        if args.mtp_mode is not None:
             test_model_inference_mtp(args)
         else:
             test_model_inference(args)