Skip to content

Commit 8a1f27f

Browse files
committed
extension/llm/server: serving foundations (schemas, errors, templating, tools)
Add the OpenAI server's standalone control-plane building blocks, independent of the HTTP layer and of any model runtime: OpenAI request/response schemas (protocol.py), structured errors (errors.py), HF chat templating (chat_template.py), and tool-call parsers (tool_parsers/) for Hermes-style JSON and Qwen XML. The wire contract is documented in spec/README.md. Unit-tested under tests/ (tool parsing). The HTTP server builds on these. Part of #20001 ghstack-source-id: 4e5664f ghstack-comment-id: 4617262868 Pull-Request: #19993
1 parent 411cda5 commit 8a1f27f

12 files changed

Lines changed: 860 additions & 0 deletions

File tree

extension/llm/server/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
"""OpenAI-compatible server for ExecuTorch LLMs (Python implementation)."""
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
"""Render OpenAI chat messages into a single prompt string.
8+
9+
The ExecuTorch runner tokenizes a plain prompt; chat formatting is the server's
10+
job (control plane). We require the model's own Hugging Face ``chat_template``
11+
(via ``--hf-tokenizer``) for correct, tool-aware, reasoning-aware formatting.
12+
The generic ChatML fallback is opt-in only (``allow_fallback``): it is
13+
approximate and cannot reproduce model-specific controls (e.g. enable_thinking),
14+
so it must be a deliberate choice rather than a silent default.
15+
"""
16+
17+
import json
18+
import logging
19+
from typing import Any, Optional
20+
21+
from .protocol import ChatMessage
22+
23+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# Special-token strings returned by ``ChatTemplate.special_tokens`` when no
# Hugging Face tokenizer is loaded (the model-accurate list is unavailable then).
_DEFAULT_SPECIAL_TOKENS = ["<|im_end|>", "<|endoftext|>", "<|eot_id|>", "<|end|>"]
27+
28+
29+
def _decode_tool_call_arguments(messages: list[dict[str, Any]]) -> None:
30+
"""In-place: parse each tool call's ``function.arguments`` from a JSON string
31+
into an object.
32+
33+
OpenAI sends assistant tool-call arguments as a JSON-encoded string, but HF
34+
chat templates expect a mapping (e.g. Qwen renders ``arguments|items`` into
35+
``<parameter=…>`` tags). Without this, a multi-turn tool conversation makes
36+
the template raise "Can only get item pairs from a mapping". Left as-is if
37+
the value isn't valid JSON, so a template that wants the raw string still works.
38+
"""
39+
for m in messages:
40+
for tc in m.get("tool_calls") or []:
41+
fn = tc.get("function")
42+
if not isinstance(fn, dict):
43+
continue
44+
args = fn.get("arguments")
45+
if isinstance(args, str):
46+
try:
47+
fn["arguments"] = json.loads(args)
48+
except (ValueError, TypeError):
49+
pass
50+
51+
52+
class ChatTemplate:
    """Render chat messages into a single prompt string.

    Formatting uses the Hugging Face tokenizer's ``chat_template`` when one is
    loaded; otherwise, and only when ``allow_fallback`` was set, an approximate
    ChatML rendering is used.
    """

    # ``apply_chat_template`` arguments that ``render`` always supplies itself.
    # A template kwarg with one of these names would collide with them and
    # raise ``TypeError`` ("got multiple values for ..."), so they are dropped.
    _RESERVED_TEMPLATE_KWARGS = frozenset(
        {"conversation", "tools", "add_generation_prompt", "tokenize"}
    )

    def __init__(
        self,
        hf_tokenizer_path: Optional[str] = None,
        default_template_kwargs: Optional[dict[str, Any]] = None,
        allow_fallback: bool = False,
    ):
        """Load the tokenizer (if a path is given) and pick the rendering mode.

        Args:
            hf_tokenizer_path: Path passed to ``AutoTokenizer.from_pretrained``.
                ``transformers`` is imported only when this is set.
            default_template_kwargs: Server-level template kwargs; per-request
                kwargs passed to ``render`` override them.
            allow_fallback: Permit the approximate ChatML rendering when no
                usable chat template is available.

        Raises:
            ValueError: If no chat template is available (no tokenizer path, or
                the tokenizer has no ``chat_template``) and ``allow_fallback``
                is false.
        """
        # Server-level defaults (e.g. {"enable_thinking": False}); per-request
        # chat_template_kwargs override these.
        self._defaults = default_template_kwargs or {}
        self._hf = None
        if hf_tokenizer_path:
            from transformers import AutoTokenizer

            self._hf = AutoTokenizer.from_pretrained(hf_tokenizer_path)
            if self._hf.chat_template is None:
                # Without a template the tokenizer is discarded entirely, so
                # every other method behaves as if none had been given.
                self._hf = None
                if not allow_fallback:
                    raise ValueError(
                        f"HF tokenizer at {hf_tokenizer_path} has no chat_template; "
                        "pass an explicit fallback flag to use approximate ChatML."
                    )
                logger.warning(
                    "No chat_template at %s; using approximate ChatML.",
                    hf_tokenizer_path,
                )
        elif not allow_fallback:
            raise ValueError(
                "A chat template is required: pass --hf-tokenizer for the model's own "
                "template, or opt into approximate ChatML with --allow-chatml-fallback."
            )
        else:
            logger.warning(
                "No --hf-tokenizer; using approximate ChatML (no thinking control)."
            )

    def render(
        self,
        messages: list[ChatMessage],
        tools: Optional[list[dict[str, Any]]] = None,
        template_kwargs: Optional[dict[str, Any]] = None,
    ) -> str:
        """Render ``messages`` into the prompt string handed to the runner.

        Args:
            messages: Conversation to render.
            tools: Tool definitions forwarded to the HF chat template.
            template_kwargs: Per-request template kwargs; they override the
                server-level defaults. Names reserved by this method
                (``conversation``, ``tools``, ``add_generation_prompt``,
                ``tokenize``) are ignored with a warning instead of making the
                template call fail.

        Returns:
            The rendered prompt, ending with a generation prompt.
        """
        merged = {**self._defaults, **(template_kwargs or {})}
        if self._hf is None:
            # The ChatML fallback has no template controls to apply.
            return self._fallback(messages)

        extra, dropped = self._split_template_kwargs(merged)
        if dropped:
            logger.warning(
                "Ignoring reserved chat-template kwargs: %s", ", ".join(dropped)
            )
        dumped = [m.model_dump(exclude_none=True) for m in messages]
        _decode_tool_call_arguments(dumped)
        return self._hf.apply_chat_template(
            dumped,
            tools=tools,
            add_generation_prompt=True,
            tokenize=False,
            **extra,
        )

    @staticmethod
    def _split_template_kwargs(
        kwargs: dict[str, Any],
    ) -> tuple[dict[str, Any], list[str]]:
        """Split template kwargs into (forwardable kwargs, sorted reserved names)."""
        reserved = ChatTemplate._RESERVED_TEMPLATE_KWARGS
        kept = {key: value for key, value in kwargs.items() if key not in reserved}
        dropped = sorted(key for key in kwargs if key in reserved)
        return kept, dropped

    def chat_template_str(self) -> Optional[str]:
        """Raw chat-template string (for tool-format auto-detection), if available."""
        return (
            getattr(self._hf, "chat_template", None) if self._hf is not None else None
        )

    def count_tokens(self, prompt: str) -> Optional[int]:
        """Token count for the rendered prompt, or None if no tokenizer is available."""
        if self._hf is not None:
            # The prompt is already rendered (apply_chat_template includes the
            # control tokens), so encode without re-adding BOS/EOS — matching the
            # session/prefix-cache paths, so the count isn't inflated and
            # near-limit requests aren't falsely rejected under --max-context.
            return len(self._hf.encode(prompt, add_special_tokens=False))
        return None

    def special_tokens(self) -> list[str]:
        """Special-token strings whose appearance ends the visible content.

        From the HF tokenizer when available (model-accurate), else a default set
        covering common chat models.
        """
        if self._hf is not None:
            toks = list(getattr(self._hf, "all_special_tokens", []) or [])
            # Keep only non-empty strings.
            return [t for t in toks if isinstance(t, str) and t]
        return list(_DEFAULT_SPECIAL_TOKENS)

    @staticmethod
    def _content_text(content: Any) -> str:
        """Flatten a message ``content`` value to plain text for the fallback.

        Strings pass through unchanged and ``None`` becomes ``""``. For a list
        of content parts, the ``text`` of every ``{"type": "text"}`` part is
        joined with newlines and other parts are skipped, so the Python repr of
        the list never leaks into the prompt.
        """
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            texts = [
                part["text"]
                for part in content
                if isinstance(part, dict)
                and part.get("type") == "text"
                and isinstance(part.get("text"), str)
            ]
            return "\n".join(texts)
        return str(content or "")

    @staticmethod
    def _fallback(messages: list[ChatMessage]) -> str:
        """Approximate ChatML rendering used when no HF chat template is loaded."""
        # Approximate ChatML. Provide --hf-tokenizer for model-correct formatting
        # (including reasoning controls like enable_thinking, which the fallback
        # cannot reproduce).
        parts = [
            f"<|im_start|>{m.role}\n{ChatTemplate._content_text(m.content)}<|im_end|>"
            for m in messages
        ]
        # Open the assistant turn so generation continues from here.
        parts.append("<|im_start|>assistant\n")
        return "\n".join(parts)
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
"""OpenAI-shaped API errors.
8+
9+
Raising these lets the server return a structured `{"error": {...}}` body with
10+
the right HTTP status instead of dropping the connection.
11+
"""
12+
13+
from typing import Optional
14+
15+
16+
class APIError(Exception):
    """Exception carrying an HTTP status and an OpenAI-shaped error payload.

    Attributes are set from the constructor arguments: ``status`` (HTTP status
    code), ``message`` (also the exception's own message), ``err_type`` and
    the optional ``code``.
    """

    def __init__(
        self, status: int, message: str, err_type: str, code: Optional[str] = None
    ):
        super().__init__(message)
        self.status, self.message = status, message
        self.err_type, self.code = err_type, code

    def body(self) -> dict:
        """Return the ``{"error": {...}}`` response body for this error."""
        payload = {"message": self.message, "type": self.err_type, "code": self.code}
        return {"error": payload}
30+
31+
32+
class ContextLengthExceeded(APIError):
    """HTTP 400 ``context_length_exceeded`` error for over-long requests.

    Args:
        num_tokens: Number of prompt tokens in the request.
        max_context: The model's maximum context length, in tokens.
        completion_tokens: Requested completion length. When positive, the
            message reports prompt + completion against the limit; otherwise
            it reports the prompt alone.
    """

    def __init__(self, num_tokens: int, max_context: int, completion_tokens: int = 0):
        limit = f"This model's maximum context length is {max_context} tokens"
        if completion_tokens > 0:
            # The prompt fits, but prompt + requested completion would overrun
            # the window: reject up front rather than fail (or truncate)
            # mid-generation.
            requested = num_tokens + completion_tokens
            message = (
                f"{limit}. However, you requested {requested} tokens "
                f"({num_tokens} in the messages, {completion_tokens} in the "
                "completion). Please reduce the length of the messages or "
                "completion."
            )
        else:
            message = f"{limit}, but the request has {num_tokens} prompt tokens."
        super().__init__(
            400, message, "invalid_request_error", "context_length_exceeded"
        )
56+
57+
58+
class GenerationError(APIError):
    """HTTP 500 ``server_error`` raised when generation fails.

    Args:
        detail: Failure description, appended to ``"Generation failed: "``.
    """

    def __init__(self, detail: str):
        message = f"Generation failed: {detail}"
        super().__init__(500, message, "server_error")
Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
"""OpenAI-compatible request/response schemas for the ExecuTorch LLM server.
8+
9+
This is the Python view of the contract defined in ``extension/llm/server/spec``.
10+
Any language server must serialize to the same shapes; the conformance suite in
11+
``extension/llm/server/conformance`` validates them.
12+
"""
13+
14+
import time
15+
import uuid
16+
from typing import Any, Literal, Optional, Union
17+
18+
from pydantic import BaseModel, Field
19+
20+
21+
def _new_id(prefix: str) -> str:
22+
return f"{prefix}-{uuid.uuid4().hex}"
23+
24+
25+
class FunctionCall(BaseModel):
    """Function part of a tool call: the function name and its arguments."""

    # Both fields are optional — presumably so streamed deltas can carry only
    # one of them; confirm against the streaming code.
    name: Optional[str] = None
    # Arguments as a string (OpenAI sends them JSON-encoded).
    arguments: Optional[str] = None
28+
29+
30+
class ToolCall(BaseModel):
    """One tool call attached to a message or streamed delta."""

    # Position of the call; optional (presumably only set on streamed chunks —
    # confirm against the streaming code).
    index: Optional[int] = None
    id: Optional[str] = None
    # Only function tools are modeled.
    type: Literal["function"] = "function"
    # Required: the function name/arguments payload.
    function: FunctionCall
35+
36+
37+
class ChatMessage(BaseModel):
    """A single message of the request conversation."""

    # Any string is accepted; the role is not restricted to a fixed set here.
    role: str
    # Either plain text or a list of dict content parts; may be absent.
    content: Optional[Union[str, list[dict[str, Any]]]] = None
    name: Optional[str] = None
    # Tool calls attached to the message, if any.
    tool_calls: Optional[list[ToolCall]] = None
    # Presumably links a tool-result message to the call it answers — confirm.
    tool_call_id: Optional[str] = None
43+
44+
45+
class StreamOptions(BaseModel):
    """Options for a streaming request (the request's ``stream_options``)."""

    # Off by default; presumably requests a usage report in the stream —
    # confirm against the serving code.
    include_usage: bool = False
47+
48+
49+
class ChatCompletionRequest(BaseModel):
    """Request body of a chat-completion call.

    Only ``messages`` is required. Several fields are modeled purely so that
    the server can reject them explicitly instead of silently ignoring them
    (see the per-group comments below).
    """

    model: Optional[str] = None
    messages: list[ChatMessage]
    stream: bool = False
    stream_options: Optional[StreamOptions] = None
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    max_tokens: Optional[int] = None
    max_completion_tokens: Optional[int] = None
    stop: Optional[Union[str, list[str]]] = None
    n: int = 1
    seed: Optional[int] = None
    # Sampling knobs that would change the generated output. They are not
    # plumbed through, so they are modeled (rather than dropped) in order to be
    # rejected with a clear error — see serving_chat's unsupported-parameter
    # check.
    frequency_penalty: Optional[float] = None
    presence_penalty: Optional[float] = None
    top_k: Optional[int] = None
    logit_bias: Optional[dict[str, float]] = None
    # Output-contract fields: modeled (rather than dropped) so the ones that
    # can't be honored are rejected instead of yielding an output that violates
    # the request.
    response_format: Optional[dict[str, Any]] = None
    logprobs: Optional[bool] = None
    top_logprobs: Optional[int] = None
    parallel_tool_calls: Optional[bool] = None
    # Per-request chat-template controls, e.g. {"enable_thinking": false} for Qwen3.
    chat_template_kwargs: Optional[dict[str, Any]] = None
    # Accepted now so the contract is stable; parsing/enforcement land in M2/M5.
    tools: Optional[list[dict[str, Any]]] = None
    tool_choice: Optional[Union[str, dict[str, Any]]] = None
    reasoning_effort: Optional[str] = None

    def resolved_max_tokens(self) -> int:
        """Return the effective completion-token limit, or -1 when unset.

        ``max_completion_tokens`` takes precedence over ``max_tokens``. The
        check is ``is not None`` so that an explicit 0 is returned as-is rather
        than treated as unset; callers validate positivity.
        """
        for limit in (self.max_completion_tokens, self.max_tokens):
            if limit is not None:
                return limit
        # Neither field was supplied: -1 means "unset / auto".
        return -1
89+
90+
91+
class Usage(BaseModel):
    """Token accounting for a completion; every counter defaults to 0."""

    prompt_tokens: int = 0
    completion_tokens: int = 0
    # Stored as given; this model does not compute it from the other two.
    total_tokens: int = 0
95+
96+
97+
class ResponseMessage(BaseModel):
    """Message returned in a non-streaming choice."""

    role: str = "assistant"
    # Text content, if any.
    content: Optional[str] = None
    # Tool calls, if any.
    tool_calls: Optional[list[ToolCall]] = None
101+
102+
103+
class Choice(BaseModel):
    """One entry of ``ChatCompletionResponse.choices``."""

    index: int = 0
    message: ResponseMessage
    # Free-form string; the set of values is not constrained by this model.
    finish_reason: Optional[str] = None
107+
108+
109+
class ChatCompletionResponse(BaseModel):
    """Body of a non-streaming chat-completion response."""

    # Auto-generated as "chatcmpl-<uuid4 hex>" unless supplied.
    id: str = Field(default_factory=lambda: _new_id("chatcmpl"))
    object: Literal["chat.completion"] = "chat.completion"
    # Unix time in whole seconds, taken when the object is constructed.
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[Choice]
    # Defaults to an all-zero Usage.
    usage: Usage = Field(default_factory=Usage)
116+
117+
118+
class DeltaMessage(BaseModel):
    """Incremental message payload of a streamed chunk; every field is optional."""

    role: Optional[str] = None
    content: Optional[str] = None
    tool_calls: Optional[list[ToolCall]] = None
122+
123+
124+
class ChunkChoice(BaseModel):
    """One entry of ``ChatCompletionChunk.choices``."""

    index: int = 0
    delta: DeltaMessage
    # Free-form string; the set of values is not constrained by this model.
    finish_reason: Optional[str] = None
128+
129+
130+
class ChatCompletionChunk(BaseModel):
    """One chunk of a streaming chat-completion response."""

    # Required here (no default factory, unlike ChatCompletionResponse.id).
    id: str
    object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
    # Unix time in whole seconds, taken when the object is constructed.
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[ChunkChoice]
    # Absent by default; presumably filled only when the client asked for usage
    # via stream_options.include_usage — confirm against the serving code.
    usage: Optional[Usage] = None
137+
138+
139+
class ModelCard(BaseModel):
    """Description of one served model (an entry of ``ModelList.data``)."""

    id: str
    object: Literal["model"] = "model"
    # Unix time in whole seconds, taken when the object is constructed.
    created: int = Field(default_factory=lambda: int(time.time()))
    owned_by: str = "executorch"
144+
145+
146+
class ModelList(BaseModel):
    """List wrapper around ``ModelCard`` entries (``object`` is always "list")."""

    object: Literal["list"] = "list"
    data: list[ModelCard]
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
fastapi>=0.110
2+
uvicorn[standard]>=0.27
3+
pydantic>=2.0
4+
# Optional but recommended for model-correct chat templating (--hf-tokenizer):
5+
# transformers>=4.40

0 commit comments

Comments
 (0)