Skip to content

Commit 9abc1e1

Browse files
committed
Merge branch 'develop' of https://github.com/PaddlePaddle/FastDeploy into logit_stat_dev
2 parents a8259f6 + 6f5aa88 commit 9abc1e1

9 files changed

Lines changed: 1792 additions & 193 deletions

File tree

benchmarks/README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,12 @@ python -m pip install -r requirements.txt
4949
--ip-list:支持多个ip:port,将总请求数以及总并发数均分到每个IP,按整除取余分配。例:0.0.0.0:1211,0.0.0.0:1222,默认为空
5050
--multi-turn:开启多轮对话,将数据集messages中的多轮对话逐轮请求,默认False不区分多轮。若需要添加tool_call,需在hyperparameter-path超参yaml中配置tools,参考yaml/request_yaml/GLM-32k-tool-call.yaml,数据集中需要指定tool_url,max_loop(非必选,默认10)为单轮调用最大次数
5151
```
52+
多轮对话使用prompt_token_ids模式请求
53+
```bash
54+
开启--multi-turn
55+
--tokenizer-model:使用prompt_token_ids请求时指定,多轮对话tokenizer模型类型,可选"eb": ErnieBotTokenizer, "eb5": Ernie5Tokenizer, "eb_mm": Ernie4_5Tokenizer,默认"auto"(其他取值均使用AutoTokenizer)
56+
--tokenizer-path:使用prompt_token_ids请求时指定,模型tokenizer路径,默认为空;指定后多轮对话按token ids拼接方式请求,未指定时按messages明文拼接
57+
```
5258

5359
##### /v1/chat/completions接口压测单条数据调试
5460

benchmarks/backend_request_func.py

Lines changed: 129 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import copy
2121
import io
2222
import json
23+
import logging
2324
import os
2425
import sys
2526
import time
@@ -56,6 +57,9 @@ class RequestFuncInput:
5657
response_format: Optional[dict] = None
5758
random_flag: bool = False
5859
json_data: Optional[dict] = None
60+
prompt_token_ids: Optional[list] = None
61+
tokenizer_model: str = None
62+
tokenizer_path: str = None
5963

6064

6165
@dataclass
@@ -81,6 +85,7 @@ class RequestFuncOutput:
8185
error: str = ""
8286
metrics: dict = field(default_factory=dict)
8387
tool_calls: list = field(default_factory=list)
88+
output_ids: list = field(default_factory=list)
8489

8590

8691
@dataclass
@@ -178,6 +183,49 @@ def metrics_summary(metrics, token_timestamps):
178183
return summary
179184

180185

186+
def load_tokenizer(model, actor_tokenizer_path):
    """Load the tokenizer used to build ``prompt_token_ids`` requests.

    Args:
        model: Tokenizer selector. ``"eb"`` loads ``ErnieBotTokenizer``,
            ``"eb_mm"`` loads ``Ernie4_5Tokenizer`` and ``"eb5"`` loads
            ``Ernie5Tokenizer``; any other value falls back to
            ``AutoTokenizer``.
        actor_tokenizer_path: Path handed to ``from_pretrained``. For the
            Ernie tokenizers it is also searched, as a directory, for one of
            the known vocab file names.

    Returns:
        The loaded tokenizer, or ``None`` if loading raised an exception
        (the error is logged as a warning).

    Raises:
        ImportError: If the package providing the selected tokenizer class
            is not installed.
    """
    vocab_file_names = ("tokenizer.model", "spm.model", "ernie_token_100k.model")

    # Import only the backend that was requested, so that e.g. the
    # AutoTokenizer fallback does not require the Ernie tokenizer packages.
    if model == "eb":
        from ernie_tokenizer import ErnieBotTokenizer as tokenizer_cls
    elif model == "eb_mm":
        from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer as tokenizer_cls
    elif model == "eb5":
        from ernie_tokenizer import Ernie5Tokenizer as tokenizer_cls
    else:
        from paddleformers.transformers import AutoTokenizer

        tokenizer_cls = None

    try:
        if tokenizer_cls is None:
            print("tokenizer: AUTO")
            return AutoTokenizer.from_pretrained(actor_tokenizer_path, padding_side="left", use_fast=True)

        # The first vocab file that exists wins, for every Ernie tokenizer.
        # When none exists, the class keeps its own default vocab file name.
        vocab_file = next(
            (name for name in vocab_file_names if os.path.exists(os.path.join(actor_tokenizer_path, name))),
            None,
        )
        if vocab_file is not None:
            tokenizer_cls.resource_files_names["vocab_file"] = vocab_file
        return tokenizer_cls.from_pretrained(actor_tokenizer_path)
    except Exception as e:
        # Best effort by design: the caller receives None instead of a crash.
        logging.warning("Load tokenizer error: %s", e)
        return None
227+
228+
181229
async def async_request_eb_openai_chat_completions(
182230
request_func_input: RequestFuncInput,
183231
pbar: Optional[tqdm] = None,
@@ -221,6 +269,14 @@ async def async_request_eb_openai_chat_completions(
221269
if request_func_input.response_format:
222270
payload["response_format"] = request_func_input.response_format
223271

272+
# 支持传入prompt_token_ids
273+
if request_func_input.prompt_token_ids:
274+
# 不走messages
275+
payload["messages"] = [{"role": "user", "content": [{"type": "text", "text": ""}]}]
276+
payload["prompt_token_ids"] = request_func_input.prompt_token_ids
277+
payload["return_token_ids"] = True
278+
# print("use_token_ids:", payload)
279+
224280
# 超参由yaml传入
225281
payload.update(request_func_input.hyper_parameters)
226282

@@ -298,6 +354,7 @@ async def async_request_eb_openai_chat_completions(
298354
content = choices[0]["delta"].get("content")
299355
reason_content = choices[0]["delta"].get("reasoning_content")
300356
tool_calls = choices[0]["delta"].get("tool_calls")
357+
completion_token_ids = choices[0]["delta"].get("completion_token_ids", [])
301358
if tool_calls:
302359
for tc in tool_calls:
303360
idx = tc.get("index", 0)
@@ -343,6 +400,8 @@ async def async_request_eb_openai_chat_completions(
343400

344401
output.generated_text += content or ""
345402
output.reasoning_content += reason_content or ""
403+
if completion_token_ids:
404+
output.output_ids.extend(completion_token_ids)
346405
# print(f"####content:{data}")
347406
output.arrival_time.append(choices[0].get("arrival_time", timestamp))
348407
elif usage := data.get("usage", {}):
@@ -487,6 +546,27 @@ async def async_request_eb_openai_chat_completions_multi_turn(
487546
print("START", request_func_input.no, "user对话轮数:", user_count, flush=True)
488547
history = []
489548
prompt_no = 0
549+
max_prompt_len = (
550+
hyper.get("max_prompt_len") if hyper.get("max_prompt_len") is not None else json_data.get("max_prompt_len")
551+
)
552+
print("max_prompt_len:", max_prompt_len)
553+
input_ids_all = []
554+
# FD每轮 completion_token_ids
555+
output_ids = []
556+
use_token_ids = bool(request_func_input.tokenizer_model and request_func_input.tokenizer_path)
557+
tokenizer = None
558+
559+
if use_token_ids:
560+
print("token ids 拼接模式")
561+
enable_tools = False
562+
print("tokenizer_model:", request_func_input.tokenizer_model)
563+
print("tokenizer_path:", request_func_input.tokenizer_path)
564+
tokenizer = load_tokenizer(
565+
request_func_input.tokenizer_model,
566+
request_func_input.tokenizer_path,
567+
)
568+
else:
569+
print("messages 明文拼接模式")
490570

491571
# 只创建一次 session
492572
session_start = time.perf_counter()
@@ -508,6 +588,44 @@ async def async_request_eb_openai_chat_completions_multi_turn(
508588
round_input = copy.deepcopy(request_func_input)
509589
round_input.history_QA = history
510590
round_input.no = f"{round_input.no}_{prompt_no}"
591+
if use_token_ids:
592+
if len(input_ids_all) == 0:
593+
# 拼接token_ids模式,首轮token_ids
594+
spliced_text = tokenizer.apply_chat_template(
595+
history,
596+
tokenize=False,
597+
split_special_tokens=False,
598+
add_special_tokens=False,
599+
)
600+
# 转换为token ids
601+
tokens = tokenizer.tokenize(spliced_text)
602+
prompt_token_ids = tokenizer.convert_tokens_to_ids(tokens)
603+
input_ids_all.extend(prompt_token_ids)
604+
round_input.prompt_token_ids = input_ids_all
605+
else:
606+
prompt_length = len(input_ids_all) + len(output_ids)
607+
if max_prompt_len and prompt_length >= max_prompt_len:
608+
# 超长截断
609+
print(
610+
f"[SESSION STOP] {round_input.no} reach max_prompt_len={max_prompt_len}, stop session"
611+
)
612+
break
613+
# 拼接token_ids模式,后续轮
614+
input_ids_all.extend(output_ids)
615+
user_prompt = message["content"]
616+
# 拼接user_prompt
617+
if round_input.tokenizer_model == "eb5":
618+
# EB5模型
619+
user_prompt = (
620+
f"\n\n<|im_start|>user\n{user_prompt}<|im_end|>\n\n<|im_start|>assistant\n<think>\n"
621+
)
622+
else:
623+
# 0.3B模型,2 </s>,拼接时会被替换成100272 <|end_of_sentence|>
624+
input_ids_all[-1] = 100272
625+
user_prompt = f"User: {user_prompt}\nAssistant: "
626+
prompt_token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(user_prompt))
627+
input_ids_all.extend(prompt_token_ids)
628+
round_input.prompt_token_ids = input_ids_all
511629
# 复用 session
512630
s0 = time.perf_counter()
513631
output = await async_request_eb_openai_chat_completions(
@@ -536,6 +654,14 @@ async def async_request_eb_openai_chat_completions_multi_turn(
536654
input_tokens += output.prompt_tokens
537655
output_tokens += output.output_tokens
538656

657+
# 更新output_ids
658+
output_ids = output.output_ids
659+
660+
if max_prompt_len and input_tokens >= max_prompt_len:
661+
# 后验超长截断
662+
print(f"[SESSION STOP] {round_input.no} reach max_prompt_len={max_prompt_len}, stop session")
663+
break
664+
539665
if enable_tools:
540666
# 循环调用工具
541667
max_loop = json_data.get("max_loop", 10)
@@ -643,7 +769,9 @@ async def async_request_eb_openai_chat_completions_multi_turn(
643769
output_tokens += output.output_tokens
644770
# 若session输入长度超过max_prompt_len,则停止session
645771
if max_prompt_len and input_tokens >= max_prompt_len:
646-
print(f"[SESSION STOP] {prompt_no} reach max_prompt_len={max_prompt_len}, stop session")
772+
print(
773+
f"[SESSION STOP] {round_input.no} reach max_prompt_len={max_prompt_len}, stop session"
774+
)
647775
session_end = time.perf_counter()
648776
metrics = SessionMetrics(
649777
session_no=request_func_input.no,

benchmarks/benchmark_serving.py

Lines changed: 31 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -383,23 +383,26 @@ async def benchmark(
383383
response_format=response_format,
384384
random_flag=random_flag,
385385
json_data=test_json_data,
386+
tokenizer_model=args.tokenizer_model,
387+
tokenizer_path=args.tokenizer_path,
386388
)
387389

388-
print("test_input:", test_input)
390+
if not debug:
391+
print("test_input:", test_input)
389392

390-
test_output = await request_func(request_func_input=test_input)
393+
test_output = await request_func(request_func_input=test_input)
391394

392-
if args.multi_turn:
393-
out_list, metrics = test_output
394-
test_output = out_list[0]
395+
if args.multi_turn:
396+
out_list, metrics = test_output
397+
test_output = out_list[0]
395398

396-
if not test_output.success:
397-
print("test_output:", test_output, flush=True)
398-
raise ValueError(
399-
f"Initial test run failed - Please make sure that 1. benchmark arguments are correctly specified and 2. the http_proxy and https_proxy are turned off. Error: {test_output.error}"
400-
)
401-
else:
402-
print("Initial test run completed. Starting main benchmark run...")
399+
if not test_output.success:
400+
print("test_output:", test_output, flush=True)
401+
raise ValueError(
402+
f"Initial test run failed - Please make sure that 1. benchmark arguments are correctly specified and 2. the http_proxy and https_proxy are turned off. Error: {test_output.error}"
403+
)
404+
else:
405+
print("Initial test run completed. Starting main benchmark run...")
403406

404407
if lora_modules:
405408
# For each input request, choose a LoRA module at random.
@@ -490,6 +493,8 @@ async def limited_request_func(request_func_input, pbar):
490493
response_format=response_format,
491494
random_flag=random_flag,
492495
json_data=json_data,
496+
tokenizer_model=args.tokenizer_model,
497+
tokenizer_path=args.tokenizer_path,
493498
)
494499
tasks.append(asyncio.create_task(limited_request_func(request_func_input=request_func_input, pbar=pbar)))
495500

@@ -576,6 +581,8 @@ async def limited_request_func_per_ip(req_input, semaphore, pbar):
576581
response_format=response_format,
577582
random_flag=random_flag,
578583
json_data=json_data,
584+
tokenizer_model=args.tokenizer_model,
585+
tokenizer_path=args.tokenizer_path,
579586
)
580587

581588
tasks.append(asyncio.create_task(limited_request_func_per_ip(req_input, semaphore, pbar)))
@@ -1427,6 +1434,18 @@ def main(args: argparse.Namespace):
14271434
action="store_true",
14281435
help="按多轮对话方式请求",
14291436
)
1437+
parser.add_argument(
1438+
"--tokenizer-model",
1439+
default="auto",
1440+
type=str,
1441+
help="使用token_ids请求时指定,多轮对话tokenizer模型类型,'eb': ErnieBotTokenizer, 'eb5': Ernie5Tokenizer, 'eb_mm': Ernie4_5Tokenizer",
1442+
)
1443+
parser.add_argument(
1444+
"--tokenizer-path",
1445+
type=str,
1446+
default=None,
1447+
help="使用token_ids请求时指定,模型tokenizer路径",
1448+
)
14301449
parser.add_argument(
14311450
"--drop-ratio",
14321451
type=float,

0 commit comments

Comments
 (0)