Skip to content

Commit c326f1a

Browse files
committed
extension/llm/server: serving foundations (schemas, tool parser, prefix cache)
Add the OpenAI server's standalone building blocks, independent of the HTTP layer: OpenAI request/response schemas (protocol.py), structured errors (errors.py), HF chat templating (chat_template.py), the Hermes/Qwen <tool_call> parser (tool_parsers/), and the turn-to-turn KV prefix-reuse policy over an LLMSession (prefix_cache.py). The wire contract is documented in spec/README.md. prefix_cache tracks the exact decode_one() token ids (never re-tokenizes generated text) and caps reuse at the session's resident position. Unit-tested by tests/test_prefix_cache.py. Third of four stacked commits; the HTTP server builds on these. ghstack-source-id: 90c71c4 ghstack-comment-id: 4617262868 Pull-Request: #19993
1 parent 9c9d850 commit c326f1a

12 files changed

Lines changed: 1005 additions & 0 deletions

File tree

extension/llm/server/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
"""OpenAI-compatible server for ExecuTorch LLMs (Python implementation)."""
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
"""Render OpenAI chat messages into a single prompt string.
8+
9+
The ExecuTorch runner tokenizes a plain prompt; chat formatting is the server's
10+
job (control plane). We require the model's own Hugging Face ``chat_template``
11+
(via ``--hf-tokenizer``) for correct, tool-aware, reasoning-aware formatting.
12+
The generic ChatML fallback is opt-in only (``allow_fallback``): it is
13+
approximate and cannot reproduce model-specific controls (e.g. enable_thinking),
14+
so it must be a deliberate choice rather than a silent default.
15+
"""
16+
17+
import logging
18+
from typing import Any, Optional
19+
20+
from .protocol import ChatMessage
21+
22+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# End-of-content marker strings returned by ChatTemplate.special_tokens() when
# no HF tokenizer is loaded (i.e. there is no model-accurate list to consult).
_DEFAULT_SPECIAL_TOKENS = ["<|im_end|>", "<|endoftext|>", "<|eot_id|>", "<|end|>"]
26+
27+
28+
class ChatTemplate:
    """Render OpenAI chat messages into a single prompt string.

    When a Hugging Face tokenizer with a ``chat_template`` is loaded, rendering
    is delegated to ``apply_chat_template``. Otherwise, and only when
    ``allow_fallback`` was set, an approximate ChatML rendering is used.
    """

    def __init__(
        self,
        hf_tokenizer_path: Optional[str] = None,
        default_template_kwargs: Optional[dict[str, Any]] = None,
        allow_fallback: bool = False,
    ):
        """Load the HF tokenizer (if a path is given) and validate the setup.

        Args:
            hf_tokenizer_path: Passed to ``AutoTokenizer.from_pretrained``.
                When empty/None no tokenizer is loaded.
            default_template_kwargs: Server-level template kwargs merged under
                the per-request ones in ``render``.
            allow_fallback: Permit the approximate ChatML fallback when no
                usable chat template is available.

        Raises:
            ValueError: If no chat template is available (no tokenizer path, or
                the tokenizer has no ``chat_template``) and ``allow_fallback``
                is False.
        """
        # Server-level defaults (e.g. {"enable_thinking": False}); per-request
        # chat_template_kwargs override these.
        self._defaults = default_template_kwargs or {}
        self._hf = None
        if hf_tokenizer_path:
            # Imported lazily so transformers is only needed when a tokenizer
            # path is actually supplied.
            from transformers import AutoTokenizer

            self._hf = AutoTokenizer.from_pretrained(hf_tokenizer_path)
            if self._hf.chat_template is None:
                # Without a template the tokenizer is dropped entirely, so
                # tokenizer()/count_tokens() report None in fallback mode too.
                self._hf = None
                if not allow_fallback:
                    raise ValueError(
                        f"HF tokenizer at {hf_tokenizer_path} has no chat_template; "
                        "pass --allow-chatml-fallback to use approximate ChatML."
                    )
                logger.warning(
                    "No chat_template at %s; using approximate ChatML.",
                    hf_tokenizer_path,
                )
        elif not allow_fallback:
            raise ValueError(
                "A chat template is required: pass --hf-tokenizer for the model's own "
                "template, or opt into approximate ChatML with --allow-chatml-fallback."
            )
        else:
            logger.warning(
                "No --hf-tokenizer; using approximate ChatML (no thinking control)."
            )

    def render(
        self,
        messages: list[ChatMessage],
        tools: Optional[list[dict[str, Any]]] = None,
        template_kwargs: Optional[dict[str, Any]] = None,
    ) -> str:
        """Return the prompt string for ``messages``, ending with a generation prompt.

        Args:
            messages: Chat messages to render, in order.
            tools: Tool definitions forwarded to the HF chat template.
            template_kwargs: Per-request template kwargs; these override the
                server-level defaults given at construction.

        Note:
            In fallback (ChatML) mode ``tools`` and the template kwargs are not
            used; only the messages are rendered.
        """
        # Per-request values win over server defaults on key collisions.
        kwargs = {**self._defaults, **(template_kwargs or {})}
        if self._hf is not None:
            return self._hf.apply_chat_template(
                [m.model_dump(exclude_none=True) for m in messages],
                tools=tools,
                add_generation_prompt=True,
                tokenize=False,
                **kwargs,
            )
        return self._fallback(messages)

    def chat_template_str(self) -> Optional[str]:
        """Raw chat-template string (for tool-format auto-detection), if available."""
        return (
            getattr(self._hf, "chat_template", None) if self._hf is not None else None
        )

    def tokenizer(self):
        """The underlying HF tokenizer (for token-level prefix caching), or None.

        Must match the runner's tokenizer (same model) for prefix reuse to be
        valid — i.e. the recommended --hf-tokenizer matching the exported model.
        """
        return self._hf

    def count_tokens(self, prompt: str) -> Optional[int]:
        """Token count for the rendered prompt, or None if no tokenizer is available."""
        if self._hf is not None:
            # NOTE(review): encode() is called with its defaults. If the
            # rendered prompt already carries the template's special tokens,
            # this may count an extra BOS/EOS — confirm against how the runner
            # tokenizes the same prompt.
            return len(self._hf.encode(prompt))
        return None

    def special_tokens(self) -> list[str]:
        """Special-token strings whose appearance ends the visible content.

        From the HF tokenizer when available (model-accurate), else a default set
        covering common chat models.
        """
        if self._hf is not None:
            toks = list(getattr(self._hf, "all_special_tokens", []) or [])
            # Keep only non-empty strings.
            return [t for t in toks if isinstance(t, str) and t]
        # Copy so callers cannot mutate the module-level default list.
        return list(_DEFAULT_SPECIAL_TOKENS)

    @staticmethod
    def _fallback(messages: list[ChatMessage]) -> str:
        """Render ``messages`` as approximate ChatML with an open assistant turn."""
        # Approximate ChatML. Provide --hf-tokenizer for model-correct formatting
        # (including reasoning controls like enable_thinking, which the fallback
        # cannot reproduce).
        parts = []
        for m in messages:
            # Non-string content is stringified as-is; None/empty becomes "".
            content = m.content if isinstance(m.content, str) else str(m.content or "")
            parts.append(f"<|im_start|>{m.role}\n{content}<|im_end|>")
        # Trailing open assistant header acts as the generation prompt.
        parts.append("<|im_start|>assistant\n")
        return "\n".join(parts)
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
"""OpenAI-shaped API errors.
8+
9+
Raising these lets the server return a structured `{"error": {...}}` body with
10+
the right HTTP status instead of dropping the connection.
11+
"""
12+
13+
from typing import Optional
14+
15+
16+
class APIError(Exception):
    """Base exception carrying an OpenAI-shaped error payload.

    Instances expose ``status`` (the HTTP status to respond with), ``message``,
    ``err_type`` and an optional ``code``; ``body()`` assembles them into the
    ``{"error": {...}}`` response body.
    """

    def __init__(
        self, status: int, message: str, err_type: str, code: Optional[str] = None
    ):
        """Store the error fields; ``message`` is also the exception's str()."""
        super().__init__(message)
        self.status = status
        self.message = message
        self.err_type = err_type
        self.code = code

    def body(self) -> dict:
        """Return the ``{"error": {...}}`` dict for this error.

        ``code`` is always present in the payload and is None when unset.
        """
        return {
            "error": {"message": self.message, "type": self.err_type, "code": self.code}
        }
30+
31+
32+
class ContextLengthExceeded(APIError):
    """400 error raised when a request's prompt exceeds the model's context."""

    def __init__(self, num_tokens: int, max_context: int):
        """Build the error from the prompt size and the context limit.

        Args:
            num_tokens: Number of prompt tokens in the request.
            max_context: The model's maximum context length, in tokens.
        """
        super().__init__(
            status=400,
            message=(
                f"This model's maximum context length is {max_context} tokens, "
                f"but the request has {num_tokens} prompt tokens."
            ),
            err_type="invalid_request_error",
            code="context_length_exceeded",
        )
43+
44+
45+
class GenerationError(APIError):
    """500 error wrapping a failure that occurred during generation."""

    def __init__(self, detail: str):
        """Build the error; ``detail`` is appended to "Generation failed: ".

        No ``code`` is set, so the payload's ``code`` field is None.
        """
        super().__init__(
            status=500, message=f"Generation failed: {detail}", err_type="server_error"
        )

0 commit comments

Comments
 (0)