pytorch
diff --git a/‎extension/llm/server/__init__.py‎
Lines changed: 5 additions & 0 deletions b/‎extension/llm/server/__init__.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎extension/llm/server/python/__init__.py‎
Lines changed: 7 additions & 0 deletions b/‎extension/llm/server/python/__init__.py‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎extension/llm/server/python/chat_template.py‎
Lines changed: 122 additions & 0 deletions b/‎extension/llm/server/python/chat_template.py‎
Lines changed: 122 additions & 0 deletions
diff --git a/‎extension/llm/server/python/errors.py‎
Lines changed: 49 additions & 0 deletions b/‎extension/llm/server/python/errors.py‎
Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""OpenAI-compatible server for ExecuTorch LLMs (Python implementation)."""
@@ -0,0 +1,122 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Render OpenAI chat messages into a single prompt string.
+
+The ExecuTorch runner tokenizes a plain prompt; chat formatting is the server's
+job (control plane). We require the model's own Hugging Face ``chat_template``
+(via ``--hf-tokenizer``) for correct, tool-aware, reasoning-aware formatting.
+The generic ChatML fallback is opt-in only (``allow_fallback``): it is
+approximate and cannot reproduce model-specific controls (e.g. enable_thinking),
+so it must be a deliberate choice rather than a silent default.
+"""
+
+import logging
+from typing import Any, Optional
+
+from .protocol import ChatMessage
+
+logger = logging.getLogger(__name__)
+
+
+# End-of-turn / end-of-text markers returned by ChatTemplate.special_tokens()
+# when no HF tokenizer is loaded.
+_DEFAULT_SPECIAL_TOKENS = ["<|im_end|>", "<|endoftext|>", "<|eot_id|>", "<|end|>"]
+
+
+class ChatTemplate:
+    """Render OpenAI-style chat messages into a single prompt string.
+
+    Uses the Hugging Face tokenizer's own ``chat_template`` when one is loaded;
+    otherwise, and only when ``allow_fallback`` is set, an approximate ChatML
+    rendering.
+    """
+
+    # Arguments that render() always supplies itself. Template kwargs with these
+    # names would collide with them ("got multiple values for keyword argument"),
+    # so they are dropped before the call.
+    _RESERVED_TEMPLATE_KWARGS = frozenset(
+        {"conversation", "tools", "add_generation_prompt", "tokenize"}
+    )
+
+    def __init__(
+        self,
+        hf_tokenizer_path: Optional[str] = None,
+        default_template_kwargs: Optional[dict[str, Any]] = None,
+        allow_fallback: bool = False,
+    ):
+        """Load the tokenizer (if a path is given) and decide the render mode.
+
+        Args:
+            hf_tokenizer_path: Path or id passed to
+                ``AutoTokenizer.from_pretrained``; falsy means "no tokenizer".
+            default_template_kwargs: Server-level template kwargs (e.g.
+                ``{"enable_thinking": False}``); per-request kwargs override them.
+            allow_fallback: Permit the approximate ChatML rendering when no
+                chat template is available.
+
+        Raises:
+            ValueError: No chat template is available and ``allow_fallback`` is
+                False.
+        """
+        # Copy so later mutation of the caller's dict cannot change our defaults.
+        self._defaults = dict(default_template_kwargs or {})
+        self._hf = None
+        if hf_tokenizer_path:
+            # Imported lazily so transformers is only needed when a tokenizer
+            # path is actually supplied.
+            from transformers import AutoTokenizer
+
+            self._hf = AutoTokenizer.from_pretrained(hf_tokenizer_path)
+            if self._hf.chat_template is None:
+                # A tokenizer without a template is treated as "no tokenizer".
+                self._hf = None
+                if not allow_fallback:
+                    raise ValueError(
+                        f"HF tokenizer at {hf_tokenizer_path} has no chat_template; "
+                        "pass an explicit fallback flag to use approximate ChatML."
+                    )
+                logger.warning(
+                    "No chat_template at %s; using approximate ChatML.",
+                    hf_tokenizer_path,
+                )
+        elif not allow_fallback:
+            raise ValueError(
+                "A chat template is required: pass --hf-tokenizer for the model's own "
+                "template, or opt into approximate ChatML with --allow-chatml-fallback."
+            )
+        else:
+            logger.warning(
+                "No --hf-tokenizer; using approximate ChatML (no thinking control)."
+            )
+
+    def render(
+        self,
+        messages: list[ChatMessage],
+        tools: Optional[list[dict[str, Any]]] = None,
+        template_kwargs: Optional[dict[str, Any]] = None,
+    ) -> str:
+        """Return the prompt string for ``messages``.
+
+        Args:
+            messages: Conversation to render, oldest first.
+            tools: Tool definitions forwarded to the HF chat template. The
+                ChatML fallback does not use them.
+            template_kwargs: Per-request template kwargs; they override the
+                server-level defaults. The fallback does not use them either.
+
+        Returns:
+            The rendered prompt, ending with the assistant generation prompt.
+        """
+        if self._hf is None:
+            return self._fallback(messages)
+
+        merged = {**self._defaults, **(template_kwargs or {})}
+        ignored = sorted(self._RESERVED_TEMPLATE_KWARGS.intersection(merged))
+        if ignored:
+            # These are fixed by the server; letting them through would raise
+            # TypeError for a duplicate keyword argument.
+            logger.warning(
+                "Ignoring server-controlled chat template kwargs: %s",
+                ", ".join(ignored),
+            )
+            merged = {
+                key: value
+                for key, value in merged.items()
+                if key not in self._RESERVED_TEMPLATE_KWARGS
+            }
+        return self._hf.apply_chat_template(
+            [m.model_dump(exclude_none=True) for m in messages],
+            tools=tools,
+            add_generation_prompt=True,
+            tokenize=False,
+            **merged,
+        )
+
+    def chat_template_str(self) -> Optional[str]:
+        """Raw chat-template string (for tool-format auto-detection), if available."""
+        if self._hf is None:
+            return None
+        return getattr(self._hf, "chat_template", None)
+
+    def tokenizer(self):
+        """The underlying HF tokenizer (for token-level prefix caching), or None.
+
+        Must match the runner's tokenizer (same model) for prefix reuse to be
+        valid — i.e. the recommended --hf-tokenizer matching the exported model.
+        """
+        return self._hf
+
+    def count_tokens(self, prompt: str) -> Optional[int]:
+        """Token count for the rendered prompt, or None if no tokenizer is available."""
+        if self._hf is None:
+            return None
+        # NOTE(review): encode() runs with its defaults; if those add special
+        # tokens on top of the ones already in the rendered template, the count
+        # may be slightly high — confirm against the runner's tokenization.
+        return len(self._hf.encode(prompt))
+
+    def special_tokens(self) -> list[str]:
+        """Special-token strings whose appearance ends the visible content.
+
+        From the HF tokenizer when available (model-accurate), else a default set
+        covering common chat models.
+        """
+        if self._hf is None:
+            return list(_DEFAULT_SPECIAL_TOKENS)
+        toks = getattr(self._hf, "all_special_tokens", []) or []
+        # Keep only non-empty strings.
+        return [t for t in toks if isinstance(t, str) and t]
+
+    @staticmethod
+    def _message_text(content: Any) -> str:
+        """Flatten a message's content into plain text for the ChatML fallback.
+
+        Strings pass through. For list-form content, the text of each part (a
+        plain string, a mapping with a string ``"text"`` value, or an object
+        with a string ``text`` attribute) is kept and joined with newlines;
+        other parts are skipped because the fallback cannot render them.
+        Anything else is stringified, with falsy values becoming ``""``.
+        """
+        if isinstance(content, str):
+            return content
+        if isinstance(content, (list, tuple)):
+            pieces = []
+            for part in content:
+                if isinstance(part, str):
+                    pieces.append(part)
+                    continue
+                if isinstance(part, dict):
+                    text = part.get("text")
+                else:
+                    text = getattr(part, "text", None)
+                if isinstance(text, str):
+                    pieces.append(text)
+            return "\n".join(pieces)
+        return str(content or "")
+
+    @staticmethod
+    def _fallback(messages: list[ChatMessage]) -> str:
+        """Approximate ChatML rendering used when no HF chat template is loaded."""
+        # Provide --hf-tokenizer for model-correct formatting (including
+        # reasoning controls like enable_thinking, which the fallback cannot
+        # reproduce).
+        parts = [
+            f"<|im_start|>{m.role}\n{ChatTemplate._message_text(m.content)}<|im_end|>"
+            for m in messages
+        ]
+        # Open an assistant turn so the model continues from here.
+        parts.append("<|im_start|>assistant\n")
+        return "\n".join(parts)
@@ -0,0 +1,49 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""OpenAI-shaped API errors.
+
+Raising these lets the server return a structured `{"error": {...}}` body with
+the right HTTP status instead of dropping the connection.
+"""
+
+from typing import Optional
+
+
+class APIError(Exception):
+    """Exception carrying an HTTP status and the fields of an OpenAI-style error."""
+
+    def __init__(
+        self, status: int, message: str, err_type: str, code: Optional[str] = None
+    ):
+        # The message is also the exception text, so str(exc) returns it.
+        super().__init__(message)
+        self.status, self.message = status, message
+        self.err_type, self.code = err_type, code
+
+    def body(self) -> dict:
+        """Return the ``{"error": {...}}`` payload describing this error."""
+        details = dict(message=self.message, type=self.err_type, code=self.code)
+        return {"error": details}
+
+
+class ContextLengthExceeded(APIError):
+    """400 ``invalid_request_error`` whose message reports prompt size vs. context limit."""
+
+    def __init__(self, num_tokens: int, max_context: int):
+        limit_text = f"This model's maximum context length is {max_context} tokens, "
+        usage_text = f"but the request has {num_tokens} prompt tokens."
+        super().__init__(
+            400,
+            limit_text + usage_text,
+            "invalid_request_error",
+            "context_length_exceeded",
+        )
+
+
+class GenerationError(APIError):
+    """500 ``server_error`` wrapping a generation failure detail (no error code)."""
+
+    def __init__(self, detail: str):
+        text = f"Generation failed: {detail}"
+        super().__init__(500, text, "server_error")