Skip to content

Commit 1b43115

Browse files
committed
extension/llm/server: serving foundations (schemas, errors, templating, tools)
Add the OpenAI server's standalone control-plane building blocks, independent of the HTTP layer and of any model runtime: OpenAI request/response schemas (protocol.py), structured errors (errors.py), HF chat templating (chat_template.py), and tool-call parsers (tool_parsers/) for Hermes-style JSON and Qwen XML. The wire contract is documented in spec/README.md. Unit-tested under tests/ (tool parsing). The HTTP server builds on these. ghstack-source-id: 9ee61a2 ghstack-comment-id: 4617262868 Pull-Request: #19993
1 parent b9dfe05 commit 1b43115

12 files changed

Lines changed: 753 additions & 0 deletions

File tree

extension/llm/server/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
"""OpenAI-compatible server for ExecuTorch LLMs (Python implementation)."""
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
"""Render OpenAI chat messages into a single prompt string.
8+
9+
The ExecuTorch runner tokenizes a plain prompt; chat formatting is the server's
10+
job (control plane). We require the model's own Hugging Face ``chat_template``
11+
(via ``--hf-tokenizer``) for correct, tool-aware, reasoning-aware formatting.
12+
The generic ChatML fallback is opt-in only (``allow_fallback``): it is
13+
approximate and cannot reproduce model-specific controls (e.g. enable_thinking),
14+
so it must be a deliberate choice rather than a silent default.
15+
"""
16+
17+
import logging
18+
from typing import Any, Optional
19+
20+
from .protocol import ChatMessage
21+
22+
logger = logging.getLogger(__name__)
23+
24+
25+
# Fallback set of special-token strings returned by ChatTemplate.special_tokens()
# when no Hugging Face tokenizer is loaded.
_DEFAULT_SPECIAL_TOKENS = [
    "<|im_end|>",
    "<|endoftext|>",
    "<|eot_id|>",
    "<|end|>",
]
26+
27+
28+
class ChatTemplate:
    """Turn OpenAI chat messages into a single prompt string.

    When a Hugging Face tokenizer with a ``chat_template`` is loaded, rendering
    is delegated to ``apply_chat_template``. Otherwise, and only if the caller
    opted in with ``allow_fallback``, an approximate ChatML rendering is used.
    """

    # Keywords that ``render`` always passes to ``apply_chat_template`` itself.
    # Template kwargs must not repeat them, otherwise the call would fail with
    # an opaque "got multiple values for keyword argument" TypeError.
    _RESERVED_TEMPLATE_KWARGS = frozenset(
        {"tools", "add_generation_prompt", "tokenize"}
    )

    def __init__(
        self,
        hf_tokenizer_path: Optional[str] = None,
        default_template_kwargs: Optional[dict[str, Any]] = None,
        allow_fallback: bool = False,
    ):
        """Load the tokenizer (if any) and decide which renderer is active.

        Args:
            hf_tokenizer_path: Passed to ``AutoTokenizer.from_pretrained``;
                ``transformers`` is imported only when this is given.
            default_template_kwargs: Server-level defaults (e.g.
                ``{"enable_thinking": False}``); per-request
                ``chat_template_kwargs`` override these.
            allow_fallback: Permit the approximate ChatML renderer when no
                usable chat template is available.

        Raises:
            ValueError: No chat template is available and ``allow_fallback``
                is false.
        """
        # Copy so later mutation of the caller's dict cannot change the
        # server defaults behind our back.
        self._defaults = dict(default_template_kwargs or {})
        self._hf = None
        if hf_tokenizer_path:
            from transformers import AutoTokenizer

            self._hf = AutoTokenizer.from_pretrained(hf_tokenizer_path)
            if self._hf.chat_template is None:
                # A tokenizer without a template cannot render chats; drop it
                # so every other method takes the no-tokenizer path.
                self._hf = None
                if not allow_fallback:
                    raise ValueError(
                        f"HF tokenizer at {hf_tokenizer_path} has no chat_template; "
                        "pass an explicit fallback flag to use approximate ChatML."
                    )
                logger.warning(
                    "No chat_template at %s; using approximate ChatML.",
                    hf_tokenizer_path,
                )
        elif not allow_fallback:
            raise ValueError(
                "A chat template is required: pass --hf-tokenizer for the model's own "
                "template, or opt into approximate ChatML with --allow-chatml-fallback."
            )
        else:
            logger.warning(
                "No --hf-tokenizer; using approximate ChatML (no thinking control)."
            )

    def render(
        self,
        messages: list[ChatMessage],
        tools: Optional[list[dict[str, Any]]] = None,
        template_kwargs: Optional[dict[str, Any]] = None,
    ) -> str:
        """Render ``messages`` into the prompt string, ending with a generation prompt.

        Args:
            messages: Conversation to render.
            tools: Tool definitions forwarded to the HF template. The ChatML
                fallback does not use them.
            template_kwargs: Per-request template controls; merged over the
                server defaults. The ChatML fallback does not use them.

        Raises:
            TypeError: The merged template kwargs contain ``tools``,
                ``add_generation_prompt`` or ``tokenize``, which this method
                sets itself.
        """
        kwargs = {**self._defaults, **(template_kwargs or {})}
        if self._hf is None:
            return self._fallback(messages)

        clashes = self._RESERVED_TEMPLATE_KWARGS.intersection(kwargs)
        if clashes:
            raise TypeError(
                "chat_template_kwargs must not override server-controlled "
                f"arguments: {', '.join(sorted(clashes))}"
            )
        return self._hf.apply_chat_template(
            [m.model_dump(exclude_none=True) for m in messages],
            tools=tools,
            add_generation_prompt=True,
            tokenize=False,
            **kwargs,
        )

    def chat_template_str(self) -> Optional[str]:
        """Raw chat-template string (for tool-format auto-detection), if available."""
        if self._hf is None:
            return None
        return getattr(self._hf, "chat_template", None)

    def count_tokens(self, prompt: str) -> Optional[int]:
        """Token count for the rendered prompt, or None if no tokenizer is available."""
        if self._hf is None:
            return None
        # The prompt is already rendered (apply_chat_template includes the
        # control tokens), so encode without re-adding BOS/EOS — matching the
        # session/prefix-cache paths, so the count isn't inflated and
        # near-limit requests aren't falsely rejected under --max-context.
        return len(self._hf.encode(prompt, add_special_tokens=False))

    def special_tokens(self) -> list[str]:
        """Special-token strings whose appearance ends the visible content.

        From the HF tokenizer when available (model-accurate), else a default set
        covering common chat models.
        """
        if self._hf is None:
            return list(_DEFAULT_SPECIAL_TOKENS)
        toks = list(getattr(self._hf, "all_special_tokens", []) or [])
        # Keep only non-empty strings.
        return [t for t in toks if isinstance(t, str) and t]

    @staticmethod
    def _content_text(content: Any) -> str:
        """Flatten a message ``content`` value into plain text.

        Strings pass through, ``None`` becomes ``""``. For a list of content
        parts, the string ``"text"`` values of the dict parts are joined with
        newlines; parts without text (e.g. images) are skipped because the
        text-only fallback cannot represent them.
        """
        if content is None:
            return ""
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            return "\n".join(
                part["text"]
                for part in content
                if isinstance(part, dict) and isinstance(part.get("text"), str)
            )
        return str(content)

    @staticmethod
    def _fallback(messages: list[ChatMessage]) -> str:
        """Approximate ChatML rendering of ``messages``.

        Provide --hf-tokenizer for model-correct formatting (including
        reasoning controls like enable_thinking, which the fallback cannot
        reproduce).
        """
        parts = [
            f"<|im_start|>{m.role}\n{ChatTemplate._content_text(m.content)}<|im_end|>"
            for m in messages
        ]
        # Open the assistant turn so the model continues from here.
        parts.append("<|im_start|>assistant\n")
        return "\n".join(parts)
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
"""OpenAI-shaped API errors.
8+
9+
Raising these lets the server return a structured `{"error": {...}}` body with
10+
the right HTTP status instead of dropping the connection.
11+
"""
12+
13+
from typing import Optional
14+
15+
16+
class APIError(Exception):
    """Exception that carries the pieces of an OpenAI-shaped error reply.

    Attributes:
        status: Status value supplied by the raiser.
        message: Human-readable description (also the exception text).
        err_type: Value reported under ``"type"`` in the error body.
        code: Optional value reported under ``"code"``; ``None`` when absent.
    """

    def __init__(
        self, status: int, message: str, err_type: str, code: Optional[str] = None
    ):
        super().__init__(message)
        self.status, self.message = status, message
        self.err_type, self.code = err_type, code

    def body(self) -> dict:
        """Return the ``{"error": {...}}`` payload for this error."""
        detail = {
            "message": self.message,
            "type": self.err_type,
            "code": self.code,
        }
        return {"error": detail}
30+
31+
32+
class ContextLengthExceeded(APIError):
    """400 ``invalid_request_error`` raised when the prompt exceeds the context limit."""

    def __init__(self, num_tokens: int, max_context: int):
        # Build the text first so the super() call stays a flat list of fields.
        explanation = (
            f"This model's maximum context length is {max_context} tokens, "
            f"but the request has {num_tokens} prompt tokens."
        )
        super().__init__(
            status=400,
            message=explanation,
            err_type="invalid_request_error",
            code="context_length_exceeded",
        )
43+
44+
45+
class GenerationError(APIError):
    """500 ``server_error`` wrapping a generation failure; no error code is set."""

    def __init__(self, detail: str):
        text = f"Generation failed: {detail}"
        super().__init__(status=500, message=text, err_type="server_error")
Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
"""OpenAI-compatible request/response schemas for the ExecuTorch LLM server.
8+
9+
This is the Python view of the contract defined in ``extension/llm/server/spec``.
10+
Any language server must serialize to the same shapes; the conformance suite in
11+
``extension/llm/server/conformance`` validates them.
12+
"""
13+
14+
import time
15+
import uuid
16+
from typing import Any, Literal, Optional, Union
17+
18+
from pydantic import BaseModel, Field
19+
20+
21+
def _new_id(prefix: str) -> str:
    """Return ``prefix``, a hyphen, and the hex form of a fresh random UUID4."""
    suffix = uuid.uuid4().hex
    return f"{prefix}-{suffix}"
23+
24+
25+
class FunctionCall(BaseModel):
    # Function part of a tool call. Both fields are optional and default to None.
    # Function name.
    name: Optional[str] = None
    # Arguments as a single string (presumably JSON-encoded, per the OpenAI
    # convention; confirm against the tool parsers).
    arguments: Optional[str] = None
28+
29+
30+
class ToolCall(BaseModel):
    # One tool invocation; shared by request messages, responses and stream deltas.
    # Optional position of the call.
    index: Optional[int] = None
    # Optional call identifier.
    id: Optional[str] = None
    # "function" is the only accepted value.
    type: Literal["function"] = "function"
    # Required: the function name/arguments payload.
    function: FunctionCall
35+
36+
37+
class ChatMessage(BaseModel):
    # A single message of the incoming conversation.
    # Required; any string is accepted.
    role: str
    # Plain text, or a list of content-part dicts; may be omitted.
    content: Optional[Union[str, list[dict[str, Any]]]] = None
    name: Optional[str] = None
    # Tool calls attached to this message, if any.
    tool_calls: Optional[list[ToolCall]] = None
    # Optional identifier of the tool call this message relates to.
    tool_call_id: Optional[str] = None
43+
44+
45+
class StreamOptions(BaseModel):
    # Streaming options carried by ChatCompletionRequest.stream_options.
    # Off unless the client sets it.
    include_usage: bool = False
47+
48+
49+
class ChatCompletionRequest(BaseModel):
    # Request body of the chat-completions endpoint. Only `messages` is required.
    model: Optional[str] = None
    messages: list[ChatMessage]
    stream: bool = False
    stream_options: Optional[StreamOptions] = None
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    # Two spellings of the generation limit; see resolved_max_tokens().
    max_tokens: Optional[int] = None
    max_completion_tokens: Optional[int] = None
    stop: Optional[Union[str, list[str]]] = None
    n: int = 1
    seed: Optional[int] = None
    # Sampling knobs that would change the generated output. They are not
    # plumbed through, so they are declared here (instead of being dropped) to
    # let the server reject them with a clear error rather than ignore them
    # silently — see serving_chat's unsupported-parameter check.
    frequency_penalty: Optional[float] = None
    presence_penalty: Optional[float] = None
    top_k: Optional[int] = None
    logit_bias: Optional[dict[str, float]] = None
    # Output-contract fields: declared (not dropped) so the ones that cannot be
    # honored are rejected instead of producing output that violates the ask.
    response_format: Optional[dict[str, Any]] = None
    logprobs: Optional[bool] = None
    top_logprobs: Optional[int] = None
    parallel_tool_calls: Optional[bool] = None
    # Per-request chat-template controls, e.g. {"enable_thinking": false} for Qwen3.
    chat_template_kwargs: Optional[dict[str, Any]] = None
    # Accepted now so the contract is stable; parsing/enforcement land in M2/M5.
    tools: Optional[list[dict[str, Any]]] = None
    tool_choice: Optional[Union[str, dict[str, Any]]] = None
    reasoning_effort: Optional[str] = None

    def resolved_max_tokens(self) -> int:
        """Return the effective token limit, or -1 for "unset / auto".

        ``max_completion_tokens`` takes precedence over ``max_tokens``. The
        check is ``is not None`` rather than truthiness so an explicit 0 is
        returned as-is; callers validate positivity.
        """
        for limit in (self.max_completion_tokens, self.max_tokens):
            if limit is not None:
                return limit
        return -1
89+
90+
91+
class Usage(BaseModel):
    # Token accounting block. All counters default to 0; total_tokens is a
    # plain field and is not computed from the other two here.
    prompt_tokens: int = 0
    completion_tokens: int = 0
    total_tokens: int = 0
95+
96+
97+
class ResponseMessage(BaseModel):
    # Message object inside a non-streaming Choice.
    role: str = "assistant"
    # Optional text content; defaults to None.
    content: Optional[str] = None
    # Optional tool calls; defaults to None.
    tool_calls: Optional[list[ToolCall]] = None
101+
102+
103+
class Choice(BaseModel):
    # One entry of ChatCompletionResponse.choices.
    index: int = 0
    # Required: the message for this choice.
    message: ResponseMessage
    # Optional; None unless the server sets it.
    finish_reason: Optional[str] = None
107+
108+
109+
class ChatCompletionResponse(BaseModel):
    # Non-streaming chat-completion response.
    # Generated per instance as "chatcmpl-<uuid4 hex>" unless supplied.
    id: str = Field(default_factory=lambda: _new_id("chatcmpl"))
    object: Literal["chat.completion"] = "chat.completion"
    # Whole seconds since the epoch at construction time.
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[Choice]
    # Defaults to a zeroed Usage when not provided.
    usage: Usage = Field(default_factory=Usage)
116+
117+
118+
class DeltaMessage(BaseModel):
    # Delta payload of a streamed ChunkChoice. Every field is optional and
    # defaults to None.
    role: Optional[str] = None
    content: Optional[str] = None
    tool_calls: Optional[list[ToolCall]] = None
122+
123+
124+
class ChunkChoice(BaseModel):
    # One entry of ChatCompletionChunk.choices.
    index: int = 0
    # Required: the delta for this chunk.
    delta: DeltaMessage
    # Optional; None unless the server sets it.
    finish_reason: Optional[str] = None
128+
129+
130+
class ChatCompletionChunk(BaseModel):
    # One streamed chunk of a chat completion.
    # Required here (no default factory, unlike ChatCompletionResponse.id).
    id: str
    object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
    # Whole seconds since the epoch at construction time.
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[ChunkChoice]
    # None unless a usage block is attached to this chunk.
    usage: Optional[Usage] = None
137+
138+
139+
class ModelCard(BaseModel):
    # One entry of ModelList.data.
    id: str
    object: Literal["model"] = "model"
    # Whole seconds since the epoch at construction time.
    created: int = Field(default_factory=lambda: int(time.time()))
    owned_by: str = "executorch"
144+
145+
146+
class ModelList(BaseModel):
    # List wrapper around ModelCard entries; `object` is fixed to "list".
    object: Literal["list"] = "list"
    data: list[ModelCard]
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
fastapi>=0.110
2+
uvicorn[standard]>=0.27
3+
pydantic>=2.0
4+
# Optional but recommended for model-correct chat templating (--hf-tokenizer):
5+
# transformers>=4.40

0 commit comments

Comments
 (0)