bukejiyu
diff --git a/‎.github/workflows/_base_test.yml‎
Lines changed: 0 additions & 1 deletion b/‎.github/workflows/_base_test.yml‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎docs/features/structured_outputs.md‎
Lines changed: 62 additions & 0 deletions b/‎docs/features/structured_outputs.md‎
Lines changed: 62 additions & 0 deletions
diff --git a/‎docs/zh/features/structured_outputs.md‎
Lines changed: 64 additions & 0 deletions b/‎docs/zh/features/structured_outputs.md‎
Lines changed: 64 additions & 0 deletions
diff --git a/‎docs/zh/usage/faq.md‎
Lines changed: 4 additions & 0 deletions b/‎docs/zh/usage/faq.md‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎fastdeploy/config.py‎
Lines changed: 13 additions & 21 deletions b/‎fastdeploy/config.py‎
Lines changed: 13 additions & 21 deletions
diff --git a/‎fastdeploy/engine/args_utils.py‎
Lines changed: 22 additions & 8 deletions b/‎fastdeploy/engine/args_utils.py‎
Lines changed: 22 additions & 8 deletions
diff --git a/‎fastdeploy/engine/common_engine.py‎
Lines changed: 1 addition & 1 deletion b/‎fastdeploy/engine/common_engine.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎fastdeploy/engine/engine.py‎
Lines changed: 26 additions & 2 deletions b/‎fastdeploy/engine/engine.py‎
Lines changed: 26 additions & 2 deletions
diff --git a/‎fastdeploy/engine/request.py‎
Lines changed: 5 additions & 7 deletions b/‎fastdeploy/engine/request.py‎
Lines changed: 5 additions & 7 deletions
@@ -134,7 +134,6 @@ jobs:
           -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
           -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
           -e "FLASK_PORT=${FLASK_PORT}" \
-          -e "FD_FORCE_CHUNKED_PREFILL=1" \
           -v "${MODEL_CACHE_DIR}:/MODELDATA" \
           -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
           -v "${CACHE_DIR}/.cache:/root/.cache" \
 
@@ -330,3 +330,65 @@ ParsedChatCompletionMessage[Info](content='{"addr": "No.1 Century Avenue, Pudong
 Address: No.1 Century Avenue, Pudong New Area, Shanghai
 Height: 468
 ```
+
+### Offline Inference
+
+Offline inference allows restricting the model's output format by pre-specified constraints. In `FastDeploy`, constraints can be specified through the `GuidedDecodingParams` class in `SamplingParams`. `GuidedDecodingParams` supports the following constraint types, with usage similar to online inference:
+
+```python
+json: Optional[Union[str, dict]] = None
+regex: Optional[str] = None
+choice: Optional[List[str]] = None
+grammar: Optional[str] = None
+json_object: Optional[bool] = None
+structural_tag: Optional[str] = None
+```
+
+The following example demonstrates how to use offline inference to generate structured JSON output:
+
+```python
+from fastdeploy import LLM, SamplingParams
+from fastdeploy.engine.sampling_params import GuidedDecodingParams
+from pydantic import BaseModel
+from enum import Enum
+
+class BookType(str, Enum):
+    romance = "Romance"
+    historical = "Historical"
+    adventure = "Adventure"
+    mystery = "Mystery"
+    dystopian = "Dystopian"
+
+class BookDescription(BaseModel):
+    author: str
+    title: str
+    genre: BookType
+
+# Constrained decoding parameters
+guided_decoding_params = GuidedDecodingParams(json=BookDescription.model_json_schema())
+
+# Sampling parameters
+sampling_params = SamplingParams(
+    top_p=0.95,
+    max_tokens=6400,
+    guided_decoding=guided_decoding_params,
+)
+
+# Load model
+llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192, guided_decoding_backend="auto")
+
+outputs = llm.generate(
+    prompts="Generate a JSON describing a literary work, including author, title and book type.",
+    sampling_params=sampling_params,
+)
+
+# Output results
+for output in outputs:
+    print(output.outputs.text)
+```
+
+Output:
+
+```
+{"author": "George Orwell", "title": "1984", "genre": "Dystopian"}
+```
@@ -330,3 +330,67 @@ ParsedChatCompletionMessage[Info](content='{"addr": "上海市浦东新区世纪
 地址: 上海市浦东新区世纪大道1号
 高度: 468
 ```
+
+### 离线推理
+
+离线推理允许通过预先指定约束条件，限制模型输出格式。在 `FastDeploy` 中，支持通过 `SamplingParams` 中的 `GuidedDecodingParams` 类指定相关约束条件。`GuidedDecodingParams` 支持以下几种约束条件，使用方式可以参考在线推理：
+
+```python
+json: Optional[Union[str, dict]] = None
+regex: Optional[str] = None
+choice: Optional[List[str]] = None
+grammar: Optional[str] = None
+json_object: Optional[bool] = None
+structural_tag: Optional[str] = None
+```
+
+以下示例展示了如何使用离线推理生成一个结构化的 JSON：
+
+```python
+
+from fastdeploy import LLM, SamplingParams
+from fastdeploy.engine.sampling_params import GuidedDecodingParams
+from pydantic import BaseModel
+from enum import Enum
+
+class BookType(str, Enum):
+    romance = "Romance"
+    historical = "Historical"
+    adventure = "Adventure"
+    mystery = "Mystery"
+    dystopian = "Dystopian"
+
+class BookDescription(BaseModel):
+    author: str
+    title: str
+    genre: BookType
+
+# Constrained decoding parameters
+guided_decoding_params = GuidedDecodingParams(json=BookDescription.model_json_schema())
+
+# Sampling parameters
+sampling_params = SamplingParams(
+    top_p=0.95,
+    max_tokens=6400,
+    guided_decoding=guided_decoding_params,
+)
+
+# Load model
+llm = LLM(model="ERNIE-4.5-0.3B", tensor_parallel_size=1, max_model_len=8192, guided_decoding_backend="auto")
+
+outputs = llm.generate(
+    prompts="生成一个JSON，描述一本中国的著作，要包含作者、标题和书籍类型。",
+    sampling_params=sampling_params,
+)
+
+# Output results
+for output in outputs:
+    print(output.outputs.text)
+
+```
+
+输出：
+
+```
+{"author": "曹雪芹", "title": "红楼梦", "genre": "Historical"}
+```
@@ -0,0 +1,4 @@
+1. 服务可以支持多大并发？
+- 服务部署时推荐配置环境变量 `export ENABLE_V1_KVCACHE_SCHEDULER=1`。
+- 服务在启动时需要配置```max-num-seqs```，此参数用于表示Decode阶段的最大Batch数，如果并发超过此值，则超出的请求会排队等待处理。常规情况下你可以将```max-num-seqs```配置为128，保持在较高的范围，实际并发由发压客户端来决定。
+- ```max-num-seqs```仅表示设定的上限，但实际上服务能并发处理的上限取决于KVCache的大小。在启动服务后，查看`log/worker_process.log`会看到类似```num_blocks_global: 17131```的日志，这表明当前服务的KVCache Block数量为17131，用17131*block_size（默认64）即可得到总共可缓存的Token数量，例如此处为17131*64=1096384。如果你的请求数据平均输入和输出Token之和为20K，那么服务实际可以处理的并发大约为1096384/20K≈53。
@@ -127,12 +127,13 @@ def __init__(
         self.redundant_experts_num = 0
         self.seed = 0
         self.quantization = None
+        self.reasoning_parser = None
         self.pad_token_id: int = -1
         self.eos_tokens_lens: int = 2
         self.lm_head_fp32: bool = False
         self.model_format = "auto"
         for key, value in args.items():
-            if hasattr(self, key):
+            if hasattr(self, key) and value != "None":
                 setattr(self, key, value)
 
         assert self.model != ""
@@ -1233,23 +1234,14 @@ def postprocess(self):
 
         self.paddle_commit_id = paddle.version.commit
 
-        if self.cache_config.enable_chunked_prefill:
-            self.force_chunked_prefill = int(envs.FD_FORCE_CHUNKED_PREFILL)
-            if (
-                self.speculative_config is not None
-                and self.speculative_config.method in ["mtp"]
-                and not self.force_chunked_prefill
-            ):
-                self.cache_config.enable_chunked_prefill = False
-
         if self.max_num_batched_tokens is None:
-            if self.cache_config.enable_chunked_prefill:
-                self.max_num_batched_tokens = 2048
+            if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
+                self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
             else:
-                if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
-                    self.max_num_batched_tokens = self.max_model_len
+                if self.cache_config.enable_chunked_prefill:
+                    self.max_num_batched_tokens = 2048
                 else:
-                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+                    self.max_num_batched_tokens = self.max_model_len
 
         if self.long_prefill_token_threshold == 0:
             self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
@@ -1258,7 +1250,8 @@ def postprocess(self):
         self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size)
 
         if self.guided_decoding_backend == "auto":
-            if self.model_config.enable_mm:
+            if current_platform.is_xpu() or self.speculative_config.method is not None:
+                logger.warning("Speculative Decoding and XPU currently do not support Guided decoding, set off.")
                 self.guided_decoding_backend = "off"
             else:
                 self.guided_decoding_backend = "xgrammar"
@@ -1328,12 +1321,10 @@ def check(self):
             ], f"Only support xgrammar、auto guided decoding backend, but got {self.guided_decoding_backend}."
 
             if self.guided_decoding_backend != "off":
-                # TODO: mm support guided_decoding
-                assert (
-                    self.model_config.enable_mm is False
-                ), "Multimodal model currently do not support guided_decoding"
-
                 # TODO: speculative decoding support guided_decoding
+                assert (
+                    self.speculative_config.method is None
+                ), "speculative decoding currently do not support guided_decoding"
 
                 # TODO: xpu support guided_decoding
                 assert not current_platform.is_xpu(), "XPU currently do not support guided_decoding"
@@ -1344,6 +1335,7 @@ def check(self):
                     raise Exception(
                         f"import XGrammar failed, please install XGrammar use `pip install xgrammar==0.1.19`. \n\t {e}"
                     )
+
         if self.scheduler_config is not None:
             self.scheduler_config.check()
 
 
@@ -15,11 +15,11 @@
 """
 
 import json
-import os
 from dataclasses import asdict, dataclass
 from dataclasses import fields as dataclass_fields
 from typing import Any, Dict, List, Optional
 
+from fastdeploy import envs
 from fastdeploy.config import (
     CacheConfig,
     EarlyStopConfig,
@@ -243,7 +243,7 @@ class EngineArgs:
     Ports for rdma communication.
     """
 
-    enable_chunked_prefill: bool = True
+    enable_chunked_prefill: bool = False
     """
     Flag to enable chunked prefilling.
     """
@@ -981,22 +981,36 @@ def create_engine_config(self) -> FDConfig:
 
         if not model_cfg.is_unified_ckpt and hasattr(model_cfg, "tensor_parallel_size"):
             self.tensor_parallel_size = model_cfg.tensor_parallel_size
+
+        speculative_cfg = self.create_speculative_config()
+        if not self.enable_chunked_prefill:
+            if (
+                current_platform.is_cuda()
+                and self.splitwise_role == "mixed"
+                and (speculative_cfg is None or speculative_cfg.method not in ["mtp"])
+            ):
+                # default enable chunked prefill
+                self.enable_chunked_prefill = True
+
+            self.disable_chunked_prefill = int(envs.FD_DISABLE_CHUNKED_PREFILL)
+            if self.disable_chunked_prefill:
+                self.enable_chunked_prefill = False
+
         if self.max_num_batched_tokens is None:
-            if self.enable_chunked_prefill:
-                self.max_num_batched_tokens = 2048
+            if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
+                self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
             else:
-                if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
-                    self.max_num_batched_tokens = self.max_model_len
+                if self.enable_chunked_prefill:
+                    self.max_num_batched_tokens = 2048
                 else:
-                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+                    self.max_num_batched_tokens = self.max_model_len
 
         all_dict = asdict(self)
         all_dict["model_cfg"] = model_cfg
         cache_cfg = CacheConfig(all_dict)
         load_cfg = LoadConfig(all_dict)
         parallel_cfg = ParallelConfig(all_dict)
         scheduler_cfg = self.create_scheduler_config()
-        speculative_cfg = self.create_speculative_config()
         graph_opt_cfg = self.create_graph_optimization_config()
         graph_opt_cfg.update_use_cudagraph(self.use_cudagraph)
         moba_attention_config = self.create_moba_attention_config()
 
@@ -589,7 +589,7 @@ def _insert_zmq_task_to_scheduler(self):
                 else:
                     err, data = self.zmq_server.receive_pyobj_once(block)
                 if err is not None:
-                    llm_logger.error("Engine stops inserting zmq task into scheduler, err:{err}")
+                    llm_logger.error(f"Engine stops inserting zmq task into scheduler, err:{err}")
                     break
 
                 request, insert_task = None, []
 
@@ -178,6 +178,22 @@ def _get_generated_result(self):
 
     # _insert_task_to_worker moved to CommonEngine
 
+    def _has_guided_input(self, request):
+        """
+        Return True if any of the request's guided_* or structural_tag fields is not None.
+        """
+        return any(
+            x is not None
+            for x in (
+                request.guided_json,
+                request.guided_regex,
+                request.guided_choice,
+                request.structural_tag,
+                request.guided_grammar,
+                request.guided_json_object,
+            )
+        )
+
     def add_requests(self, task, sampling_params=None, **kwargs):
         """
         Add a new request to the queue.
@@ -249,8 +265,15 @@ def add_requests(self, task, sampling_params=None, **kwargs):
                     llm_logger.error(error_msg)
                     raise EngineError(error_msg, error_code=400)
 
-        if self.engine.guided_decoding_checker is not None:
-            request, err_msg = self.engine.guided_decoding_checker.schema_format(request)
+        if self._has_guided_input(request):
+            err_msg = None
+            if self.guided_decoding_checker is None:
+                err_msg = (
+                    "guided_backend is None, use --guided-decoding-backend to specify the backend at server startup."
+                )
+            else:
+                request, err_msg = self.guided_decoding_checker.schema_format(request)
+
             if err_msg is not None:
                 llm_logger.error(err_msg)
                 raise EngineError(err_msg, error_code=400)
@@ -469,6 +492,7 @@ def _start_worker_service(self):
             f" --guided_decoding_backend {self.cfg.guided_decoding_backend}"
             f" --load_strategy {self.cfg.load_config.load_strategy}"
             f" --early_stop_config '{self.cfg.early_stop_config.to_json_string()}'"
+            f" --reasoning_parser {self.cfg.reasoning_parser}"
             f" --load_choices {self.cfg.load_config.load_choices}"
             f" --moba_attention_config '{self.cfg.moba_attention_config.to_json_string()}'"
             f" --ips {ips}"
 
@@ -263,13 +263,11 @@ def set(self, key, value):
             setattr(self, key, value)
 
     def __repr__(self) -> str:
-        return (
-            f"Request(request_id={self.request_id}, "
-            f"prompt={self.prompt!r}, "
-            f"prompt_token_ids={self.prompt_token_ids}, "
-            f"draft_token_ids={self.draft_token_ids}, "
-            f"sampling_params={self.sampling_params})"
-        )
+        non_none_fields = []
+        for attr, value in vars(self).items():
+            if value is not None and not attr.startswith("_"):
+                non_none_fields.append(f"{attr}={value!r}")
+        return f"Request({', '.join(non_none_fields)})"
 
 
 @dataclass(slots=True)