Skip to content

Commit 85b2116

Browse files
authored
Merge branch 'develop' into loader_speed
2 parents b26c788 + 1908465 commit 85b2116

28 files changed

Lines changed: 1505 additions & 120 deletions

.github/workflows/_base_test.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,6 @@ jobs:
134134
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
135135
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
136136
-e "FLASK_PORT=${FLASK_PORT}" \
137-
-e "FD_FORCE_CHUNKED_PREFILL=1" \
138137
-v "${MODEL_CACHE_DIR}:/MODELDATA" \
139138
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
140139
-v "${CACHE_DIR}/.cache:/root/.cache" \

docs/features/structured_outputs.md

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -330,3 +330,65 @@ ParsedChatCompletionMessage[Info](content='{"addr": "No.1 Century Avenue, Pudong
330330
Address: No.1 Century Avenue, Pudong New Area, Shanghai
331331
Height: 468
332332
```
333+
334+
### Offline Inference
335+
336+
Offline inference can also restrict the model's output format through pre-specified constraints. In `FastDeploy`, these constraints are specified with the `GuidedDecodingParams` class, which is passed to `SamplingParams`. `GuidedDecodingParams` supports the following constraint types, and their usage is similar to online inference:
337+
338+
```python
339+
json: Optional[Union[str, dict]] = None
340+
regex: Optional[str] = None
341+
choice: Optional[List[str]] = None
342+
grammar: Optional[str] = None
343+
json_object: Optional[bool] = None
344+
structural_tag: Optional[str] = None
345+
```
346+
347+
The following example demonstrates how to use offline inference to generate structured JSON output:
348+
349+
```python
350+
from fastdeploy import LLM, SamplingParams
351+
from fastdeploy.engine.sampling_params import GuidedDecodingParams
352+
from pydantic import BaseModel
353+
from enum import Enum
354+
355+
class BookType(str, Enum):
356+
romance = "Romance"
357+
historical = "Historical"
358+
adventure = "Adventure"
359+
mystery = "Mystery"
360+
dystopian = "Dystopian"
361+
362+
class BookDescription(BaseModel):
363+
author: str
364+
title: str
365+
genre: BookType
366+
367+
# Constrained decoding parameters
368+
guided_decoding_params = GuidedDecodingParams(json=BookDescription.model_json_schema())
369+
370+
# Sampling parameters
371+
sampling_params = SamplingParams(
372+
top_p=0.95,
373+
max_tokens=6400,
374+
guided_decoding=guided_decoding_params,
375+
)
376+
377+
# Load model
378+
llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192, guided_decoding_backend="auto")
379+
380+
outputs = llm.generate(
381+
prompts="Generate a JSON describing a literary work, including author, title and book type.",
382+
sampling_params=sampling_params,
383+
)
384+
385+
# Output results
386+
for output in outputs:
387+
print(output.outputs.text)
388+
```
389+
390+
Output:
391+
392+
```
393+
{"author": "George Orwell", "title": "1984", "genre": "Dystopian"}
394+
```

docs/zh/features/structured_outputs.md

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -330,3 +330,67 @@ ParsedChatCompletionMessage[Info](content='{"addr": "上海市浦东新区世纪
330330
地址: 上海市浦东新区世纪大道1号
331331
高度: 468
332332
```
333+
334+
### 离线推理
335+
336+
离线推理允许通过预先指定约束条件,限制模型输出格式。在 `FastDeploy` 中,支持通过 `SamplingParams` 中的 `GuidedDecodingParams` 类指定相关约束条件。`GuidedDecodingParams` 支持以下几种约束条件,使用方式可以参考在线推理:
337+
338+
```python
339+
json: Optional[Union[str, dict]] = None
340+
regex: Optional[str] = None
341+
choice: Optional[List[str]] = None
342+
grammar: Optional[str] = None
343+
json_object: Optional[bool] = None
344+
structural_tag: Optional[str] = None
345+
```
346+
347+
以下示例展示了如何使用离线推理生成一个结构化的 json :
348+
349+
```python
350+
351+
from fastdeploy import LLM, SamplingParams
352+
from fastdeploy.engine.sampling_params import GuidedDecodingParams
353+
from pydantic import BaseModel
354+
from enum import Enum
355+
356+
class BookType(str, Enum):
357+
romance = "Romance"
358+
historical = "Historical"
359+
adventure = "Adventure"
360+
mystery = "Mystery"
361+
dystopian = "Dystopian"
362+
363+
class BookDescription(BaseModel):
364+
author: str
365+
title: str
366+
genre: BookType
367+
368+
# Constrained decoding parameters
369+
guided_decoding_params = GuidedDecodingParams(json=BookDescription.model_json_schema())
370+
371+
# Sampling parameters
372+
sampling_params = SamplingParams(
373+
top_p=0.95,
374+
max_tokens=6400,
375+
guided_decoding=guided_decoding_params,
376+
)
377+
378+
# Load model
379+
llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192, guided_decoding_backend="auto")
380+
381+
outputs = llm.generate(
382+
prompts="生成一个JSON,描述一本中国的著作,要包含作者、标题和书籍类型。",
383+
sampling_params=sampling_params,
384+
)
385+
386+
# Output results
387+
for output in outputs:
388+
print(output.outputs.text)
389+
390+
```
391+
392+
输出
393+
394+
```
395+
{"author": "曹雪芹", "title": "红楼梦", "genre": "Historical"}
396+
```

docs/zh/usage/faq.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
1. 服务可以支持多大并发?
2+
- 服务部署时推荐配置环境变量:`export ENABLE_V1_KVCACHE_SCHEDULER=1`
3+
- 服务在启动时需要配置```max-num-seqs```,此参数用于表示Decode阶段的最大Batch数,如果并发超过此值,则超出的请求会排队等待处理, 常规情况下你可以将```max-num-seqs```配置为128,保持在较高的范围,实际并发由发压客户端来决定。
4+
- ```max-num-seqs```仅表示设定的上限,但实际上服务能并发处理的上限取决于KVCache的大小。在启动服务后,查看`log/worker_process.log`会看到类似```num_blocks_global: 17131```的日志,这表明当前服务的KVCache Block数量为17131,`17131 * block_size`(默认64)即为总共可缓存的Token数量,例如此处为`17131 * 64 = 1096384`。如果你的请求数据平均输入和输出Token之和为20K,那么服务实际可以处理的并发大概为`1096384 / 20K ≈ 53`。

fastdeploy/config.py

Lines changed: 13 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -127,12 +127,13 @@ def __init__(
127127
self.redundant_experts_num = 0
128128
self.seed = 0
129129
self.quantization = None
130+
self.reasoning_parser = None
130131
self.pad_token_id: int = -1
131132
self.eos_tokens_lens: int = 2
132133
self.lm_head_fp32: bool = False
133134
self.model_format = "auto"
134135
for key, value in args.items():
135-
if hasattr(self, key):
136+
if hasattr(self, key) and value != "None":
136137
setattr(self, key, value)
137138

138139
assert self.model != ""
@@ -1233,23 +1234,14 @@ def postprocess(self):
12331234

12341235
self.paddle_commit_id = paddle.version.commit
12351236

1236-
if self.cache_config.enable_chunked_prefill:
1237-
self.force_chunked_prefill = int(envs.FD_FORCE_CHUNKED_PREFILL)
1238-
if (
1239-
self.speculative_config is not None
1240-
and self.speculative_config.method in ["mtp"]
1241-
and not self.force_chunked_prefill
1242-
):
1243-
self.cache_config.enable_chunked_prefill = False
1244-
12451237
if self.max_num_batched_tokens is None:
1246-
if self.cache_config.enable_chunked_prefill:
1247-
self.max_num_batched_tokens = 2048
1238+
if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
1239+
self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM
12481240
else:
1249-
if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
1250-
self.max_num_batched_tokens = self.max_model_len
1241+
if self.cache_config.enable_chunked_prefill:
1242+
self.max_num_batched_tokens = 2048
12511243
else:
1252-
self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM
1244+
self.max_num_batched_tokens = self.max_model_len
12531245

12541246
if self.long_prefill_token_threshold == 0:
12551247
self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
@@ -1258,7 +1250,8 @@ def postprocess(self):
12581250
self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size)
12591251

12601252
if self.guided_decoding_backend == "auto":
1261-
if self.model_config.enable_mm:
1253+
if current_platform.is_xpu() or self.speculative_config.method is not None:
1254+
logger.warning("Speculative Decoding and XPU currently do not support Guided decoding, set off.")
12621255
self.guided_decoding_backend = "off"
12631256
else:
12641257
self.guided_decoding_backend = "xgrammar"
@@ -1328,12 +1321,10 @@ def check(self):
13281321
], f"Only support xgrammar、auto guided decoding backend, but got {self.guided_decoding_backend}."
13291322

13301323
if self.guided_decoding_backend != "off":
1331-
# TODO: mm support guided_decoding
1332-
assert (
1333-
self.model_config.enable_mm is False
1334-
), "Multimodal model currently do not support guided_decoding"
1335-
13361324
# TODO: speculative decoding support guided_decoding
1325+
assert (
1326+
self.speculative_config.method is None
1327+
), "speculative decoding currently do not support guided_decoding"
13371328

13381329
# TODO: xpu support guided_decoding
13391330
assert not current_platform.is_xpu(), "XPU currently do not support guided_decoding"
@@ -1344,6 +1335,7 @@ def check(self):
13441335
raise Exception(
13451336
f"import XGrammar failed, please install XGrammar use `pip install xgrammar==0.1.19`. \n\t {e}"
13461337
)
1338+
13471339
if self.scheduler_config is not None:
13481340
self.scheduler_config.check()
13491341

fastdeploy/engine/args_utils.py

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,11 @@
1515
"""
1616

1717
import json
18-
import os
1918
from dataclasses import asdict, dataclass
2019
from dataclasses import fields as dataclass_fields
2120
from typing import Any, Dict, List, Optional
2221

22+
from fastdeploy import envs
2323
from fastdeploy.config import (
2424
CacheConfig,
2525
EarlyStopConfig,
@@ -243,7 +243,7 @@ class EngineArgs:
243243
Ports for rdma communication.
244244
"""
245245

246-
enable_chunked_prefill: bool = True
246+
enable_chunked_prefill: bool = False
247247
"""
248248
Flag to enable chunked prefilling.
249249
"""
@@ -981,22 +981,36 @@ def create_engine_config(self) -> FDConfig:
981981

982982
if not model_cfg.is_unified_ckpt and hasattr(model_cfg, "tensor_parallel_size"):
983983
self.tensor_parallel_size = model_cfg.tensor_parallel_size
984+
985+
speculative_cfg = self.create_speculative_config()
986+
if not self.enable_chunked_prefill:
987+
if (
988+
current_platform.is_cuda()
989+
and self.splitwise_role == "mixed"
990+
and (speculative_cfg is None or speculative_cfg.method not in ["mtp"])
991+
):
992+
# default enable chunked prefill
993+
self.enable_chunked_prefill = True
994+
995+
self.disable_chunked_prefill = int(envs.FD_DISABLE_CHUNKED_PREFILL)
996+
if self.disable_chunked_prefill:
997+
self.enable_chunked_prefill = False
998+
984999
if self.max_num_batched_tokens is None:
985-
if self.enable_chunked_prefill:
986-
self.max_num_batched_tokens = 2048
1000+
if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
1001+
self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM
9871002
else:
988-
if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
989-
self.max_num_batched_tokens = self.max_model_len
1003+
if self.enable_chunked_prefill:
1004+
self.max_num_batched_tokens = 2048
9901005
else:
991-
self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM
1006+
self.max_num_batched_tokens = self.max_model_len
9921007

9931008
all_dict = asdict(self)
9941009
all_dict["model_cfg"] = model_cfg
9951010
cache_cfg = CacheConfig(all_dict)
9961011
load_cfg = LoadConfig(all_dict)
9971012
parallel_cfg = ParallelConfig(all_dict)
9981013
scheduler_cfg = self.create_scheduler_config()
999-
speculative_cfg = self.create_speculative_config()
10001014
graph_opt_cfg = self.create_graph_optimization_config()
10011015
graph_opt_cfg.update_use_cudagraph(self.use_cudagraph)
10021016
moba_attention_config = self.create_moba_attention_config()

fastdeploy/engine/common_engine.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -589,7 +589,7 @@ def _insert_zmq_task_to_scheduler(self):
589589
else:
590590
err, data = self.zmq_server.receive_pyobj_once(block)
591591
if err is not None:
592-
llm_logger.error("Engine stops inserting zmq task into scheduler, err:{err}")
592+
llm_logger.error(f"Engine stops inserting zmq task into scheduler, err:{err}")
593593
break
594594

595595
request, insert_task = None, []

fastdeploy/engine/engine.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,22 @@ def _get_generated_result(self):
178178

179179
# _insert_task_to_worker moved to CommonEngine
180180

181+
def _has_guided_input(self, request):
182+
"""
183+
Check if the request has any guided input.
184+
"""
185+
return any(
186+
x is not None
187+
for x in (
188+
request.guided_json,
189+
request.guided_regex,
190+
request.guided_choice,
191+
request.structural_tag,
192+
request.guided_grammar,
193+
request.guided_json_object,
194+
)
195+
)
196+
181197
def add_requests(self, task, sampling_params=None, **kwargs):
182198
"""
183199
Add a new request to the queue.
@@ -249,8 +265,15 @@ def add_requests(self, task, sampling_params=None, **kwargs):
249265
llm_logger.error(error_msg)
250266
raise EngineError(error_msg, error_code=400)
251267

252-
if self.engine.guided_decoding_checker is not None:
253-
request, err_msg = self.engine.guided_decoding_checker.schema_format(request)
268+
if self._has_guided_input(request):
269+
err_msg = None
270+
if self.guided_decoding_checker is None:
271+
err_msg = (
272+
"guided_backend is None, use --guided-decoding-backend to specify the backend at server startup."
273+
)
274+
else:
275+
request, err_msg = self.guided_decoding_checker.schema_format(request)
276+
254277
if err_msg is not None:
255278
llm_logger.error(err_msg)
256279
raise EngineError(err_msg, error_code=400)
@@ -469,6 +492,7 @@ def _start_worker_service(self):
469492
f" --guided_decoding_backend {self.cfg.guided_decoding_backend}"
470493
f" --load_strategy {self.cfg.load_config.load_strategy}"
471494
f" --early_stop_config '{self.cfg.early_stop_config.to_json_string()}'"
495+
f" --reasoning_parser {self.cfg.reasoning_parser}"
472496
f" --load_choices {self.cfg.load_config.load_choices}"
473497
f" --moba_attention_config '{self.cfg.moba_attention_config.to_json_string()}'"
474498
f" --ips {ips}"

fastdeploy/engine/request.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -263,13 +263,11 @@ def set(self, key, value):
263263
setattr(self, key, value)
264264

265265
def __repr__(self) -> str:
266-
return (
267-
f"Request(request_id={self.request_id}, "
268-
f"prompt={self.prompt!r}, "
269-
f"prompt_token_ids={self.prompt_token_ids}, "
270-
f"draft_token_ids={self.draft_token_ids}, "
271-
f"sampling_params={self.sampling_params})"
272-
)
266+
non_none_fields = []
267+
for attr, value in vars(self).items():
268+
if value is not None and not attr.startswith("_"):
269+
non_none_fields.append(f"{attr}={value!r}")
270+
return f"Request({', '.join(non_none_fields)})"
273271

274272

275273
@dataclass(slots=True)

0 commit comments

Comments
 (0)