PaddlePaddle
diff --git a/‎docs/parameters.md‎
Lines changed: 5 additions & 0 deletions b/‎docs/parameters.md‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎docs/zh/parameters.md‎
Lines changed: 5 additions & 0 deletions b/‎docs/zh/parameters.md‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎fastdeploy/config.py‎
Lines changed: 55 additions & 0 deletions b/‎fastdeploy/config.py‎
Lines changed: 55 additions & 0 deletions
diff --git a/‎fastdeploy/engine/args_utils.py‎
Lines changed: 68 additions & 0 deletions b/‎fastdeploy/engine/args_utils.py‎
Lines changed: 68 additions & 0 deletions
diff --git a/‎fastdeploy/engine/async_llm.py‎
Lines changed: 1 addition & 0 deletions b/‎fastdeploy/engine/async_llm.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎fastdeploy/engine/common_engine.py‎
Lines changed: 1 addition & 0 deletions b/‎fastdeploy/engine/common_engine.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎fastdeploy/engine/engine.py‎
Lines changed: 4 additions & 1 deletion b/‎fastdeploy/engine/engine.py‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎fastdeploy/entrypoints/engine_client.py‎
Lines changed: 6 additions & 1 deletion b/‎fastdeploy/entrypoints/engine_client.py‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎fastdeploy/input/base_processor.py‎
Lines changed: 69 additions & 5 deletions b/‎fastdeploy/input/base_processor.py‎
Lines changed: 69 additions & 5 deletions
@@ -15,6 +15,11 @@ When using FastDeploy to deploy models (including offline inference and service
 | ```engine_worker_queue_port``` | `list[int]` | FastDeploy internal engine communication port list, auto-allocated based on data_parallel_size |
 | ```cache_queue_port``` | `list[int]` | FastDeploy internal KVCache process communication port list, auto-allocated based on data_parallel_size |
 | ```max_model_len``` | `int` | Default maximum supported context length for inference, default: 2048 |
+| ```max_completion_tokens``` | `int` | Server-level maximum allowed completion token length (hard cap). Per-request max_tokens will be clamped to this value. Default: None (bounded by max_model_len - input_len) |
+| ```reasoning_max_tokens``` | `int` | Server-level maximum allowed reasoning/thinking token length (hard cap). Per-request reasoning_max_tokens will be clamped to this value. Default: None (no cap) |
+| ```response_max_tokens``` | `int` | Server-level maximum allowed response token length (hard cap). Per-request response_max_tokens will be clamped to this value. Default: None (no cap) |
+| ```min_completion_tokens``` | `int` | Server-level minimum generation length floor. Effective min_tokens = max(server_value, per-request value). Default: None (no floor) |
+| ```input_max_tokens``` | `int` | Server-level maximum input token length. Requests whose prompt is longer than this will be rejected. Default: None (no limit, bounded by max_model_len) |
 | ```tensor_parallel_size``` | `int` | Default tensor parallelism degree for model, default: 1 |
 | ```data_parallel_size``` | `int` | Default data parallelism degree for model, default: 1 |
 | ```block_size``` | `int` | KVCache management granularity (Token count), recommended default: 64 |
 
@@ -13,6 +13,11 @@
 | ```engine_worker_queue_port```     | `list[int]` | FastDeploy内部引擎进程通信端口列表，会根据data_parallel_size自动分配 |
 | ```cache_queue_port```             | `list[int]` | FastDeploy内部KVCache进程通信端口列表，会根据data_parallel_size自动分配 |
 | ```max_model_len```                | `int`       | 推理默认最大支持上下文长度，默认2048 |
+| ```max_completion_tokens```        | `int`       | 服务级最大生成token数硬上限。请求中的max_tokens会被截断至此值。默认：None（受max_model_len - input_len约束） |
+| ```reasoning_max_tokens```         | `int`       | 服务级推理/思考token数硬上限。请求中的reasoning_max_tokens会被截断至此值。默认：None（不限制） |
+| ```response_max_tokens```          | `int`       | 服务级回复token数硬上限。请求中的response_max_tokens会被截断至此值。默认：None（不限制） |
+| ```min_completion_tokens```        | `int`       | 服务级最小生成长度下限。实际min_tokens = max(服务值, 请求值)，请求不能低于此下限。默认：None（不限制） |
+| ```input_max_tokens```             | `int`       | 服务级输入token数上限。超过此值的请求将被拒绝。默认：None（不限制，受max_model_len约束） |
 | ```tensor_parallel_size```         | `int`       | 模型默认张量并行数，默认1 |
 | ```data_parallel_size```           | `int`       | 模型默认数据并行数，默认1 |
 | ```block_size```                   | `int`       | KVCache管理粒度(Token数)，推荐默认值64 |
 
@@ -1886,6 +1886,59 @@ def __str__(self):
         return self.to_json_string()
 
 
+class ServingLimitsConfig:
+    """Server-level request length limits and policies.
+
+    Every limit is ``None`` when it is not configured:
+
+    * ``max_completion_tokens``: hard cap on completion tokens.
+    * ``reasoning_max_tokens``: hard cap on reasoning/thinking tokens.
+    * ``response_max_tokens``: hard cap on response tokens.
+    * ``min_completion_tokens``: floor for the generation length.
+    * ``input_max_tokens``: maximum accepted input (prompt) length.
+    """
+
+    # The only keys ``__init__`` is allowed to copy from ``args``. Filtering on
+    # this tuple (rather than ``hasattr``) keeps unrelated keys that happen to
+    # match a method or dunder name (e.g. "validate") from overwriting them.
+    _LIMIT_FIELDS = (
+        "max_completion_tokens",
+        "reasoning_max_tokens",
+        "response_max_tokens",
+        "min_completion_tokens",
+        "input_max_tokens",
+    )
+
+    def __init__(self, args):
+        """Initialize the limits from a mapping of argument names to values.
+
+        Args:
+            args: Mapping that may contain any of the limit names above; all
+                other keys are ignored. The literal string ``"None"`` is
+                treated as "not set" and leaves the limit at ``None``.
+        """
+        for name in self._LIMIT_FIELDS:
+            setattr(self, name, None)
+
+        for key, value in args.items():
+            if key in self._LIMIT_FIELDS and value != "None":
+                setattr(self, key, value)
+
+    def validate(self, max_model_len):
+        """Validate serving limits against max_model_len at startup.
+
+        Args:
+            max_model_len: Maximum context length the limits are checked against.
+
+        Raises:
+            ValueError: If ``max_completion_tokens``, ``input_max_tokens`` or
+                ``response_max_tokens`` is set and not greater than 0; if
+                ``reasoning_max_tokens`` or ``min_completion_tokens`` is set
+                and negative; if ``min_completion_tokens`` is not less than
+                ``max_model_len``; or if ``min_completion_tokens`` exceeds
+                ``max_completion_tokens``.
+
+        Limits larger than ``max_model_len`` are not errors; they only produce
+        a warning through the module-level ``logger``.
+        """
+        # These limits must be strictly positive when configured.
+        for name in ("max_completion_tokens", "input_max_tokens", "response_max_tokens"):
+            value = getattr(self, name)
+            if value is not None and value <= 0:
+                flag = name.replace("_", "-")
+                raise ValueError(f"--{flag} ({value}) must be greater than 0.")
+
+        # Zero is accepted for these two; only negative values are rejected.
+        for name in ("reasoning_max_tokens", "min_completion_tokens"):
+            value = getattr(self, name)
+            if value is not None and value < 0:
+                flag = name.replace("_", "-")
+                raise ValueError(f"--{flag} ({value}) must be greater than or equal to 0.")
+
+        if self.min_completion_tokens is not None:
+            if self.min_completion_tokens >= max_model_len:
+                raise ValueError(
+                    f"--min-completion-tokens ({self.min_completion_tokens}) must be less than "
+                    f"--max-model-len ({max_model_len}). All requests would be rejected."
+                )
+            if self.max_completion_tokens is not None and self.min_completion_tokens > self.max_completion_tokens:
+                raise ValueError(
+                    f"--min-completion-tokens ({self.min_completion_tokens}) must not exceed "
+                    f"--max-completion-tokens ({self.max_completion_tokens})."
+                )
+
+        # Caps above max_model_len are tolerated but flagged.
+        if self.max_completion_tokens is not None and self.max_completion_tokens > max_model_len:
+            logger.warning(
+                f"--max-completion-tokens ({self.max_completion_tokens}) > "
+                f"--max-model-len ({max_model_len}), it will have no effect."
+            )
+
+        if self.input_max_tokens is not None and self.input_max_tokens > max_model_len:
+            logger.warning(
+                f"--input-max-tokens ({self.input_max_tokens}) > "
+                f"--max-model-len ({max_model_len}), it will have no effect."
+            )
+
+
 class BenchmarkMetricsConfig:
     """Configuration for in-process benchmark metrics logger.
 
@@ -1981,6 +2034,7 @@ def __init__(
         routing_replay_config: Optional[RoutingReplayConfig] = None,
         benchmark_metrics_config=None,
         deploy_modality: DeployModality = DeployModality.MIXED,
+        serving_limits_config: ServingLimitsConfig = None,  # resolved below
     ):
         self.model_config: ModelConfig = model_config  # type: ignore
         self.cache_config: CacheConfig = cache_config  # type: ignore
@@ -1999,6 +2053,7 @@ def __init__(
         self.routing_replay_config = routing_replay_config
         self.benchmark_metrics_config = benchmark_metrics_config
         self.deploy_modality: DeployModality = deploy_modality
+        self.serving_limits_config: ServingLimitsConfig = serving_limits_config or ServingLimitsConfig({})
         # Initialize cuda graph capture list
         max_capture_shape = self.scheduler_config.max_num_seqs
         if self.graph_opt_config.cudagraph_only_prefill:
 
@@ -38,6 +38,7 @@
     RouterConfig,
     RoutingReplayConfig,
     RunnerOption,
+    ServingLimitsConfig,
     SpeculativeConfig,
     StructuredOutputsConfig,
     TaskOption,
@@ -111,6 +112,33 @@ class EngineArgs:
     """
     Maximum context length supported by the model.
     """
+    max_completion_tokens: Optional[int] = None
+    """
+    Server-level maximum allowed completion token length (hard cap).
+    Per-request max_tokens will be clamped to this value. None means no server-level cap
+    (bounded by max_model_len - input_len).
+    """
+    reasoning_max_tokens: Optional[int] = None
+    """
+    Server-level maximum allowed reasoning/thinking token length (hard cap).
+    Per-request reasoning_max_tokens will be clamped to this value. None means no server-level cap.
+    """
+    response_max_tokens: Optional[int] = None
+    """
+    Server-level maximum allowed response token length (hard cap).
+    Per-request response_max_tokens will be clamped to this value. None means no server-level cap.
+    """
+    min_completion_tokens: Optional[int] = None
+    """
+    Server-level minimum generation length floor.
+    Effective min_tokens = max(server_value, per-request value). Requests cannot set min_tokens
+    below this floor. None means no server-level floor.
+    """
+    input_max_tokens: Optional[int] = None
+    """
+    Server-level maximum input token length.
+    Requests with prompt longer than this will be rejected. None means no limit (bounded by max_model_len).
+    """
     tensor_parallel_size: int = 1
     """
     Degree of tensor parallelism.
@@ -768,6 +796,43 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             default=EngineArgs.max_model_len,
             help="Maximum context length supported by the model.",
         )
+        model_group.add_argument(
+            "--max-completion-tokens",
+            type=int,
+            default=EngineArgs.max_completion_tokens,
+            help="Server-level maximum allowed completion token length (hard cap). "
+            "Per-request max_tokens will be clamped to this value. "
+            "Default: None (bounded by max_model_len - input_len).",
+        )
+        model_group.add_argument(
+            "--reasoning-max-tokens",
+            type=int,
+            default=EngineArgs.reasoning_max_tokens,
+            help="Server-level maximum allowed reasoning/thinking token length (hard cap). "
+            "Per-request reasoning_max_tokens will be clamped to this value. Default: None (no cap).",
+        )
+        model_group.add_argument(
+            "--response-max-tokens",
+            type=int,
+            default=EngineArgs.response_max_tokens,
+            help="Server-level maximum allowed response token length (hard cap). "
+            "Per-request response_max_tokens will be clamped to this value. Default: None (no cap).",
+        )
+        model_group.add_argument(
+            "--min-completion-tokens",
+            type=int,
+            default=EngineArgs.min_completion_tokens,
+            help="Server-level minimum generation length floor. "
+            "Effective min_tokens = max(server_value, per-request value). Default: None (no floor).",
+        )
+        model_group.add_argument(
+            "--input-max-tokens",
+            type=int,
+            default=EngineArgs.input_max_tokens,
+            help="Server-level maximum input token length. "
+            "Requests with prompt longer than this will be rejected. "
+            "Default: None (no limit, bounded by max_model_len).",
+        )
         model_group.add_argument(
             "--block-size",
             type=int,
@@ -1577,6 +1642,8 @@ def create_engine_config(self) -> FDConfig:
         cache_cfg = CacheConfig(all_dict)
         load_cfg = LoadConfig(all_dict)
         parallel_cfg = ParallelConfig(all_dict)
+        serving_limits_cfg = ServingLimitsConfig(all_dict)
+        serving_limits_cfg.validate(model_cfg.max_model_len)
         scheduler_cfg = self.create_scheduler_config()
         graph_opt_cfg = self.create_graph_optimization_config()
         plas_attention_config = self.create_plas_attention_config()
@@ -1613,4 +1680,5 @@ def create_engine_config(self) -> FDConfig:
             routing_replay_config=routing_replay_config,
             benchmark_metrics_config=benchmark_metrics_cfg,
             deploy_modality=DeployModality.from_str(self.deploy_modality),
+            serving_limits_config=serving_limits_cfg,
         )
@@ -299,6 +299,7 @@ def __init__(self, cfg, pid):
         )
         # Create data processor
         self.data_processor = self.input_processor.create_processor()
+        self.data_processor.set_server_defaults(cfg.serving_limits_config)
 
         # Create high-performance async connection manager
         self.connection_manager = None
 
@@ -365,6 +365,7 @@ def create_data_processor(self):
             enable_mm_runtime=self.cfg.enable_mm_runtime,
         )
         self.data_processor = self.input_processor.create_processor()
+        self.data_processor.set_server_defaults(self.cfg.serving_limits_config)
         self.mm_max_tokens_per_item = self.data_processor.get_mm_max_tokens_per_item(
             self.cfg.model_config.max_model_len
         )
 
@@ -746,7 +746,10 @@ def _format_and_add_data(self, prompts: dict):
                     prompts["prompt"] = query_list
 
         if "max_tokens" not in prompts:
-            prompts["max_tokens"] = self.cfg.model_config.max_model_len
+            if self.cfg.serving_limits_config.max_completion_tokens is not None:
+                prompts["max_tokens"] = self.cfg.serving_limits_config.max_completion_tokens
+            else:
+                prompts["max_tokens"] = self.cfg.model_config.max_model_len
 
         self.add_requests(prompts)
         return prompts["request_id"]
 
@@ -104,12 +104,14 @@ def __init__(self, pid: int | str, port: int | str, fd_config: FDConfig, workers
         )
         self.enable_logprob = self.fd_config.model_config.enable_logprob
         self.data_processor = input_processor.create_processor()
+        self.data_processor.set_server_defaults(self.fd_config.serving_limits_config)
         self.ori_vocab_size = (
             len(self.data_processor.tokenizer.sp_model)
             if hasattr(self.data_processor.tokenizer, "sp_model")
             else len(self.data_processor.tokenizer.vocab)
         )
         self.max_model_len = self.fd_config.model_config.max_model_len
+        self.max_completion_tokens = self.fd_config.serving_limits_config.max_completion_tokens
         self.enable_prefix_caching = self.fd_config.cache_config.enable_prefix_caching
         self.enable_cache_transfer = (
             self.fd_config.cache_config.swap_space or self.fd_config.cache_config.kvcache_storage_backend
@@ -297,7 +299,10 @@ async def format_and_add_data(self, request: Request | dict):
             request["request_id"] = request_id
 
         if "max_tokens" not in request:
-            request["max_tokens"] = self.max_model_len - 1
+            if self.max_completion_tokens is not None:
+                request["max_tokens"] = self.max_completion_tokens
+            else:
+                request["max_tokens"] = self.max_model_len - 1
 
         await self.add_requests(request)
         return request["prompt_token_ids"]
 
@@ -110,6 +110,27 @@ def __init__(self, model_name_or_path, tokenizer_type="auto", reasoning_parser_o
         self.tokenizer.pad_token_id = self.pad_token_id
         self._init_parsers(reasoning_parser_obj, tool_parser_obj)
 
+        # Server-level defaults (set via set_server_defaults after construction)
+        self.max_completion_tokens = None
+        self.reasoning_max_tokens = None
+        self.response_max_tokens = None
+        self.min_completion_tokens = None
+        self.input_max_tokens = None
+
+    def set_server_defaults(self, serving_limits_config):
+        """Copy the server-level serving limits onto this processor.
+
+        The five limit attributes are read from ``serving_limits_config`` and
+        stored under the same names on ``self``; ``process_request_dict``
+        reads them when it computes the per-request length limits.
+
+        Args:
+            serving_limits_config: Object exposing the five limit attributes,
+                or ``None``, in which case nothing is changed.
+        """
+        if serving_limits_config is not None:
+            for limit_name in (
+                "max_completion_tokens",
+                "reasoning_max_tokens",
+                "response_max_tokens",
+                "min_completion_tokens",
+                "input_max_tokens",
+            ):
+                setattr(self, limit_name, getattr(serving_limits_config, limit_name))
+
     # ------------------------------------------------------------------
     # Abstract interface
     # ------------------------------------------------------------------
@@ -438,20 +459,63 @@ def process_request_dict(self, request, max_model_len=None, **kwargs):
         if request.get("completion_token_ids"):
             request["prompt_token_ids"].extend(request["completion_token_ids"])
 
-        # truncate prompts that exceed the length limit
+        # Reject requests exceeding input_max_tokens
+        if self.input_max_tokens is not None and len(request["prompt_token_ids"]) > self.input_max_tokens:
+            raise ValueError(
+                f"Input token length {len(request['prompt_token_ids'])} exceeds the configured input_max_tokens limit {self.input_max_tokens}"
+            )
+
         if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len:
-            request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]
+            raise ValueError(
+                f"Input token length {len(request['prompt_token_ids'])} exceeds "
+                f"the configured max_model_len {max_model_len}"
+            )
 
         logits_processors_args = self._update_thinking_prompt_state(
             request["prompt_token_ids"], request.get("logits_processors_args") or {}
         )
         request["logits_processors_args"] = logits_processors_args
 
-        max_tokens = max_model_len - len(request["prompt_token_ids"])
+        # Compute effective length limits
+        def _min_non_none(*values):
+            return min(v for v in values if v is not None)
+
+        context_remaining = max(1, max_model_len - len(request["prompt_token_ids"]))
+
         if request.get("max_tokens") is None:
-            request["max_tokens"] = max(1, max_tokens)
+            # User didn't specify: default to min(context_remaining, server_default)
+            if self.max_completion_tokens is not None:
+                request["max_tokens"] = max(1, min(context_remaining, self.max_completion_tokens))
+            else:
+                request["max_tokens"] = context_remaining
         else:
-            request["max_tokens"] = min(max_tokens, request["max_tokens"])
+            # User specified: clamp to min(context_remaining, max_completion_tokens)
+            request["max_tokens"] = _min_non_none(context_remaining, self.max_completion_tokens, request["max_tokens"])
+
+        max_tokens = request["max_tokens"]
+        if self.reasoning_max_tokens is not None or request.get("reasoning_max_tokens") is not None:
+            request["reasoning_max_tokens"] = _min_non_none(
+                max_tokens, self.reasoning_max_tokens, request.get("reasoning_max_tokens")
+            )
+        if self.response_max_tokens is not None or request.get("response_max_tokens") is not None:
+            request["response_max_tokens"] = _min_non_none(
+                max_tokens, self.response_max_tokens, request.get("response_max_tokens")
+            )
+
+        # min_tokens: take the larger of server-level and user value, reject if > max_tokens
+        server_min = self.min_completion_tokens
+        user_min = request.get("min_tokens")
+        if server_min is None:
+            effective_min = user_min
+        elif user_min is None:
+            effective_min = server_min
+        else:
+            effective_min = max(server_min, user_min)
+        if effective_min is not None:
+            if effective_min > max_tokens:
+                raise ValueError(f"min_tokens ({effective_min}) must not exceed max_tokens ({max_tokens})")
+            request["min_tokens"] = effective_min
+
         if request.get("temperature") < _SAMPLING_EPS:
             # zero temperature means greedy decoding: set top_k=1 to force argmax
             request["temperature"] = 1
Original file line number	Diff line number	Diff line change
`@@ -299,6 +299,7 @@ def __init__(self, cfg, pid):`
`299`	`299`	`)`
`300`	`300`	`# Create data processor`
`301`	`301`	`self.data_processor = self.input_processor.create_processor()`
	`302`	`+ self.data_processor.set_server_defaults(cfg.serving_limits_config)`
`302`	`303`
`303`	`304`	`# Create high-performance async connection manager`
`304`	`305`	`self.connection_manager = None`
Original file line number	Diff line number	Diff line change
`@@ -365,6 +365,7 @@ def create_data_processor(self):`
`365`	`365`	`enable_mm_runtime=self.cfg.enable_mm_runtime,`
`366`	`366`	`)`
`367`	`367`	`self.data_processor = self.input_processor.create_processor()`
	`368`	`+ self.data_processor.set_server_defaults(self.cfg.serving_limits_config)`
`368`	`369`	`self.mm_max_tokens_per_item = self.data_processor.get_mm_max_tokens_per_item(`
`369`	`370`	`self.cfg.model_config.max_model_len`
`370`	`371`	`)`