PaddlePaddle
diff --git a/‎benchmarks/README.md‎
Lines changed: 6 additions & 0 deletions b/‎benchmarks/README.md‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎benchmarks/backend_request_func.py‎
Lines changed: 129 additions & 1 deletion b/‎benchmarks/backend_request_func.py‎
Lines changed: 129 additions & 1 deletion
diff --git a/‎benchmarks/benchmark_serving.py‎
Lines changed: 31 additions & 12 deletions b/‎benchmarks/benchmark_serving.py‎
Lines changed: 31 additions & 12 deletions
@@ -49,6 +49,12 @@ python -m pip install -r requirements.txt
 --ip-list：支持多个ip:port，将总请求数以及总并发数均分到每个IP，按整除取余分配。例：0.0.0.0:1211,0.0.0.0:1222，默认为空
 --multi-turn：开启多轮对话，将数据集messages中的多轮对话逐轮请求，默认False不区分多轮。若需要添加tool_call，需在hyperparameter-path超参yaml中配置tools，参考yaml/request_yaml/GLM-32k-tool-call.yaml，数据集中需要指定tool_url，max_loop（非必选，默认10）为单轮调用最大次数
 ```
+多轮对话使用prompt_token_ids模式请求
+```bash
+开启--multi-turn
+--tokenizer-model：使用prompt_token_ids请求时指定，多轮对话tokenizer模型类型，可选"eb": ErnieBotTokenizer, "eb5": Ernie5Tokenizer, "eb_mm": Ernie4_5Tokenizer
+--tokenizer-path：使用prompt_token_ids请求时指定，模型tokenizer路径
+```
 
 ##### /v1/chat/completions接口压测单条数据调试
 
 
@@ -20,6 +20,7 @@
 import copy
 import io
 import json
+import logging
 import os
 import sys
 import time
@@ -56,6 +57,9 @@ class RequestFuncInput:
     response_format: Optional[dict] = None
     random_flag: bool = False
     json_data: Optional[dict] = None
+    prompt_token_ids: Optional[list] = None
+    tokenizer_model: str = None
+    tokenizer_path: str = None
 
 
 @dataclass
@@ -81,6 +85,7 @@ class RequestFuncOutput:
     error: str = ""
     metrics: dict = field(default_factory=dict)
     tool_calls: list = field(default_factory=list)
+    output_ids: list = field(default_factory=list)
 
 
 @dataclass
@@ -178,6 +183,49 @@ def metrics_summary(metrics, token_timestamps):
     return summary
 
 
+def load_tokenizer(model, actor_tokenizer_path):
+    """加载tokenizer"""
+    from ernie_tokenizer import Ernie5Tokenizer, ErnieBotTokenizer
+    from paddleformers.transformers import AutoTokenizer
+
+    from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
+
+    vocab_file_names = ["tokenizer.model", "spm.model", "ernie_token_100k.model"]
+
+    try:
+        if model == "eb":
+            for i in range(len(vocab_file_names)):
+                if os.path.exists(os.path.join(actor_tokenizer_path, vocab_file_names[i])):
+                    ErnieBotTokenizer.resource_files_names["vocab_file"] = vocab_file_names[i]
+                    break
+            tokenizer = ErnieBotTokenizer.from_pretrained(actor_tokenizer_path)
+        elif model == "eb_mm":
+            for vocab_file in vocab_file_names:
+                full_path = os.path.join(actor_tokenizer_path, vocab_file)
+                if os.path.exists(full_path):
+                    Ernie4_5Tokenizer.resource_files_names["vocab_file"] = vocab_file
+            # for i in range(len(vocab_file_names)):
+            #     if os.path.exists(os.path.join(actor_tokenizer_path, vocab_file_names[i])):
+            #         Ernie45Tokenizer.resource_files_names["vocab_file"] = vocab_file_names[i]
+            #         break
+            tokenizer = Ernie4_5Tokenizer.from_pretrained(actor_tokenizer_path)
+            # tokenizer.ignored_index = -100
+        elif model == "eb5":
+            for i in range(len(vocab_file_names)):
+                if os.path.exists(os.path.join(actor_tokenizer_path, vocab_file_names[i])):
+                    Ernie5Tokenizer.resource_files_names["vocab_file"] = vocab_file_names[i]
+                    break
+            tokenizer = Ernie5Tokenizer.from_pretrained(actor_tokenizer_path)
+        else:
+            print("tokenizer: AUTO")
+            tokenizer = AutoTokenizer.from_pretrained(actor_tokenizer_path, padding_side="left", use_fast=True)
+    except Exception as e:
+        tokenizer = None
+        logging.warning(f"Load tokenizer error: {e}")
+
+    return tokenizer
+
+
 async def async_request_eb_openai_chat_completions(
     request_func_input: RequestFuncInput,
     pbar: Optional[tqdm] = None,
@@ -221,6 +269,14 @@ async def async_request_eb_openai_chat_completions(
     if request_func_input.response_format:
         payload["response_format"] = request_func_input.response_format
 
+    # 支持传入prompt_token_ids
+    if request_func_input.prompt_token_ids:
+        # 不走messages
+        payload["messages"] = [{"role": "user", "content": [{"type": "text", "text": ""}]}]
+        payload["prompt_token_ids"] = request_func_input.prompt_token_ids
+        payload["return_token_ids"] = True
+        # print("use_token_ids:", payload)
+
     # 超参由yaml传入
     payload.update(request_func_input.hyper_parameters)
 
@@ -298,6 +354,7 @@ async def async_request_eb_openai_chat_completions(
                             content = choices[0]["delta"].get("content")
                             reason_content = choices[0]["delta"].get("reasoning_content")
                             tool_calls = choices[0]["delta"].get("tool_calls")
+                            completion_token_ids = choices[0]["delta"].get("completion_token_ids", [])
                             if tool_calls:
                                 for tc in tool_calls:
                                     idx = tc.get("index", 0)
@@ -343,6 +400,8 @@ async def async_request_eb_openai_chat_completions(
 
                             output.generated_text += content or ""
                             output.reasoning_content += reason_content or ""
+                            if completion_token_ids:
+                                output.output_ids.extend(completion_token_ids)
                             # print(f"####content:{data}")
                             output.arrival_time.append(choices[0].get("arrival_time", timestamp))
                         elif usage := data.get("usage", {}):
@@ -487,6 +546,27 @@ async def async_request_eb_openai_chat_completions_multi_turn(
     print("START", request_func_input.no, "user对话轮数:", user_count, flush=True)
     history = []
     prompt_no = 0
+    max_prompt_len = (
+        hyper.get("max_prompt_len") if hyper.get("max_prompt_len") is not None else json_data.get("max_prompt_len")
+    )
+    print("max_prompt_len:", max_prompt_len)
+    input_ids_all = []
+    # FD每轮 completion_token_ids
+    output_ids = []
+    use_token_ids = bool(request_func_input.tokenizer_model and request_func_input.tokenizer_path)
+    tokenizer = None
+
+    if use_token_ids:
+        print("token ids 拼接模式")
+        enable_tools = False
+        print("tokenizer_model:", request_func_input.tokenizer_model)
+        print("tokenizer_path:", request_func_input.tokenizer_path)
+        tokenizer = load_tokenizer(
+            request_func_input.tokenizer_model,
+            request_func_input.tokenizer_path,
+        )
+    else:
+        print("messages 明文拼接模式")
 
     # 只创建一次 session
     session_start = time.perf_counter()
@@ -508,6 +588,44 @@ async def async_request_eb_openai_chat_completions_multi_turn(
                 round_input = copy.deepcopy(request_func_input)
                 round_input.history_QA = history
                 round_input.no = f"{round_input.no}_{prompt_no}"
+                if use_token_ids:
+                    if len(input_ids_all) == 0:
+                        # 拼接token_ids模式，首轮token_ids
+                        spliced_text = tokenizer.apply_chat_template(
+                            history,
+                            tokenize=False,
+                            split_special_tokens=False,
+                            add_special_tokens=False,
+                        )
+                        # 转换为token ids
+                        tokens = tokenizer.tokenize(spliced_text)
+                        prompt_token_ids = tokenizer.convert_tokens_to_ids(tokens)
+                        input_ids_all.extend(prompt_token_ids)
+                        round_input.prompt_token_ids = input_ids_all
+                    else:
+                        prompt_length = len(input_ids_all) + len(output_ids)
+                        if max_prompt_len and prompt_length >= max_prompt_len:
+                            # 超长截断
+                            print(
+                                f"[SESSION STOP] {round_input.no} reach max_prompt_len={max_prompt_len}, stop session"
+                            )
+                            break
+                        # 拼接token_ids模式，后续轮
+                        input_ids_all.extend(output_ids)
+                        user_prompt = message["content"]
+                        # 拼接user_prompt
+                        if round_input.tokenizer_model == "eb5":
+                            # EB5模型
+                            user_prompt = (
+                                f"\n\n<|im_start|>user\n{user_prompt}<|im_end|>\n\n<|im_start|>assistant\n<think>\n"
+                            )
+                        else:
+                            # 0.3B模型,2 </s>，拼接时会被替换成100272 <|end_of_sentence|>
+                            input_ids_all[-1] = 100272
+                            user_prompt = f"User: {user_prompt}\nAssistant: "
+                        prompt_token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(user_prompt))
+                        input_ids_all.extend(prompt_token_ids)
+                        round_input.prompt_token_ids = input_ids_all
                 # 复用 session
                 s0 = time.perf_counter()
                 output = await async_request_eb_openai_chat_completions(
@@ -536,6 +654,14 @@ async def async_request_eb_openai_chat_completions_multi_turn(
                 input_tokens += output.prompt_tokens
                 output_tokens += output.output_tokens
 
+                # 更新output_ids
+                output_ids = output.output_ids
+
+                if max_prompt_len and input_tokens >= max_prompt_len:
+                    # 后验超长截断
+                    print(f"[SESSION STOP] {round_input.no} reach max_prompt_len={max_prompt_len}, stop session")
+                    break
+
                 if enable_tools:
                     # 循环调用工具
                     max_loop = json_data.get("max_loop", 10)
@@ -643,7 +769,9 @@ async def async_request_eb_openai_chat_completions_multi_turn(
                         output_tokens += output.output_tokens
                         # 若session输入长度超过max_prompt_len，则停止session
                         if max_prompt_len and input_tokens >= max_prompt_len:
-                            print(f"[SESSION STOP] {prompt_no} reach max_prompt_len={max_prompt_len}, stop session")
+                            print(
+                                f"[SESSION STOP] {round_input.no} reach max_prompt_len={max_prompt_len}, stop session"
+                            )
                             session_end = time.perf_counter()
                             metrics = SessionMetrics(
                                 session_no=request_func_input.no,
 
@@ -383,23 +383,26 @@ async def benchmark(
         response_format=response_format,
         random_flag=random_flag,
         json_data=test_json_data,
+        tokenizer_model=args.tokenizer_model,
+        tokenizer_path=args.tokenizer_path,
     )
 
-    print("test_input:", test_input)
+    if not debug:
+        print("test_input:", test_input)
 
-    test_output = await request_func(request_func_input=test_input)
+        test_output = await request_func(request_func_input=test_input)
 
-    if args.multi_turn:
-        out_list, metrics = test_output
-        test_output = out_list[0]
+        if args.multi_turn:
+            out_list, metrics = test_output
+            test_output = out_list[0]
 
-    if not test_output.success:
-        print("test_output:", test_output, flush=True)
-        raise ValueError(
-            f"Initial test run failed - Please make sure that 1. benchmark arguments are correctly specified and 2. the http_proxy and https_proxy are turned off. Error: {test_output.error}"
-        )
-    else:
-        print("Initial test run completed. Starting main benchmark run...")
+        if not test_output.success:
+            print("test_output:", test_output, flush=True)
+            raise ValueError(
+                f"Initial test run failed - Please make sure that 1. benchmark arguments are correctly specified and 2. the http_proxy and https_proxy are turned off. Error: {test_output.error}"
+            )
+        else:
+            print("Initial test run completed. Starting main benchmark run...")
 
     if lora_modules:
         # For each input request, choose a LoRA module at random.
@@ -490,6 +493,8 @@ async def limited_request_func(request_func_input, pbar):
                 response_format=response_format,
                 random_flag=random_flag,
                 json_data=json_data,
+                tokenizer_model=args.tokenizer_model,
+                tokenizer_path=args.tokenizer_path,
             )
             tasks.append(asyncio.create_task(limited_request_func(request_func_input=request_func_input, pbar=pbar)))
 
@@ -576,6 +581,8 @@ async def limited_request_func_per_ip(req_input, semaphore, pbar):
                     response_format=response_format,
                     random_flag=random_flag,
                     json_data=json_data,
+                    tokenizer_model=args.tokenizer_model,
+                    tokenizer_path=args.tokenizer_path,
                 )
 
                 tasks.append(asyncio.create_task(limited_request_func_per_ip(req_input, semaphore, pbar)))
@@ -1427,6 +1434,18 @@ def main(args: argparse.Namespace):
         action="store_true",
         help="按多轮对话方式请求",
     )
+    parser.add_argument(
+        "--tokenizer-model",
+        default="auto",
+        type=str,
+        help="使用token_ids请求时指定，多轮对话tokenizer模型类型，'eb': ErnieBotTokenizer, 'eb5': Ernie5Tokenizer, 'eb_mm': Ernie4_5Tokenizer",
+    )
+    parser.add_argument(
+        "--tokenizer-path",
+        type=str,
+        default=None,
+        help="使用token_ids请求时指定，模型tokenizer路径",
+    )
     parser.add_argument(
         "--drop-ratio",
         type=float,