pytorch
diff --git a/‎extension/llm/server/README.md‎
Lines changed: 34 additions & 0 deletions b/‎extension/llm/server/README.md‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎extension/llm/server/conformance/test_openai_contract.py‎
Lines changed: 207 additions & 0 deletions b/‎extension/llm/server/conformance/test_openai_contract.py‎
Lines changed: 207 additions & 0 deletions
diff --git a/‎extension/llm/server/cpp/CMakeLists.txt‎
Lines changed: 88 additions & 0 deletions b/‎extension/llm/server/cpp/CMakeLists.txt‎
Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,34 @@
+# ExecuTorch LLM Server
+
+OpenAI-compatible serving for ExecuTorch LLMs, so any OpenAI-compatible agent
+harness (pi, opencode, ...) can use ExecuTorch as a local backend.
+
+```
+extension/llm/server/
+  spec/          # language-neutral OpenAI contract ExecuTorch targets
+  conformance/   # one test suite every language server must pass
+  python/        # Python server implementation (current)
+  cpp/           # C++ worker binary (text_llm_worker); future: no-Python single-binary server
+```
+
+Why this layout: the OpenAI contract is identical across languages, so the
+**spec** and **conformance** suite are shared, and each language gets its own
+implementation directory. The real cross-language reuse comes from the C++
+`LLMEngine`/`LLMSession` primitives underneath, packaged as a process-isolated
+**worker binary** (`text_llm_worker`) that any control plane drives over a small
+JSONL protocol — the server is a thin protocol shell that spawns and talks to
+that worker. See `python/README.md` to run it.
+
+Status: experimental, reliability-first and deliberately narrow. Implemented:
+`/health`, `/v1/models`, `/v1/chat/completions` (streaming + non-streaming),
+Hugging Face chat templates (`--hf-tokenizer`), `temperature` / `max_tokens` /
+`max_completion_tokens` / `stop`, Hermes tool calling by default
+(`<tool_call>...</tool_call>` JSON, complete calls only; model-specific launchers
+may select the Qwen XML format) with `tool_choice="none"`,
+structured API errors, and best-effort cancellation. V1 serving is single-slot
+(one worker, one session) with no prefix cache; KV prefix reuse, if it returns,
+lives inside the worker/session, not the control plane. Unsupported params (including `top_p`,
+`seed`, `n>1`, `reasoning_effort`, penalties, `logit_bias`, `response_format`,
+`logprobs`, and `tool_choice="required"`) are rejected with a structured 400
+rather than silently ignored. See `python/README.md` to run it and
+`spec/README.md` for the exact contract.
@@ -0,0 +1,207 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Language-neutral OpenAI-contract conformance tests.
+
+Runs against any base URL (ExecuTorch, llama.cpp, mlx-lm, ...) so every server
+implementation is validated against one shared spec. Point it at a running
+server:
+
+    OPENAI_BASE_URL=http://127.0.0.1:8000/v1 pytest test_openai_contract.py
+
+Skips automatically if no server is reachable.
+"""
+
+import json
+import os
+import urllib.error
+import urllib.request
+
+import pytest
+
+# Root of the API under test. The trailing slash is stripped so request paths
+# that begin with "/" can be appended directly. Override with OPENAI_BASE_URL.
+BASE_URL = os.environ.get("OPENAI_BASE_URL", "http://127.0.0.1:8000/v1").rstrip("/")
+# Value sent as the "model" field of every request. Override with OPENAI_MODEL.
+MODEL = os.environ.get("OPENAI_MODEL", "executorch")
+
+
+def _post(path: str, body: dict, stream: bool = False):
+    req = urllib.request.Request(
+        f"{BASE_URL}{path}",
+        data=json.dumps(body).encode(),
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    return urllib.request.urlopen(req, timeout=120)
+
+
+def _server_up() -> bool:
+    try:
+        urllib.request.urlopen(f"{BASE_URL}/models", timeout=5)
+        return True
+    except Exception:
+        return False
+
+
+# Module-level marker: pytest applies it to every test in this file. The
+# reachability probe runs once, when this module is imported.
+pytestmark = pytest.mark.skipif(
+    not _server_up(), reason="no OpenAI server at OPENAI_BASE_URL"
+)
+
+
+def test_models_listing():
+    with urllib.request.urlopen(f"{BASE_URL}/models", timeout=10) as r:
+        data = json.loads(r.read())
+    assert data["object"] == "list"
+    assert any("id" in m for m in data["data"])
+
+
+def test_chat_completion_nonstreaming():
+    body = {
+        "model": MODEL,
+        "messages": [{"role": "user", "content": "Say hello in one word."}],
+        "max_tokens": 16,
+        "temperature": 0.0,
+    }
+    with _post("/chat/completions", body) as r:
+        data = json.loads(r.read())
+    assert data["object"] == "chat.completion"
+    assert data["choices"][0]["message"]["role"] == "assistant"
+    assert isinstance(data["choices"][0]["message"]["content"], str)
+    assert data["choices"][0]["finish_reason"] is not None
+
+
+def test_chat_completion_streaming():
+    body = {
+        "model": MODEL,
+        "messages": [{"role": "user", "content": "Count to three."}],
+        "max_tokens": 32,
+        "stream": True,
+    }
+    saw_role = saw_content = saw_done = False
+    with _post("/chat/completions", body, stream=True) as r:
+        for raw in r:
+            line = raw.decode().strip()
+            if not line.startswith("data:"):
+                continue
+            payload = line[len("data:") :].strip()
+            if payload == "[DONE]":
+                saw_done = True
+                break
+            chunk = json.loads(payload)
+            assert chunk["object"] == "chat.completion.chunk"
+            delta = chunk["choices"][0]["delta"]
+            saw_role = saw_role or delta.get("role") == "assistant"
+            saw_content = saw_content or bool(delta.get("content"))
+    assert saw_role and saw_content and saw_done
+
+
+def test_multibyte_streaming_integrity():
+    # Byte-level BPE can split a multi-byte character across tokens; the stream
+    # must reassemble it, not abort with a UTF-8 decode error.
+    body = {
+        "model": MODEL,
+        "messages": [
+            {"role": "user", "content": "Reply with exactly: 你好世界 🌍 café"}
+        ],
+        "max_tokens": 32,
+        "temperature": 0.0,
+        "stream": True,
+    }
+    content, saw_done, saw_error = "", False, False
+    with _post("/chat/completions", body, stream=True) as r:
+        for raw in r:
+            line = raw.decode().strip()
+            if not line.startswith("data:"):
+                continue
+            payload = line[len("data:") :].strip()
+            if payload == "[DONE]":
+                saw_done = True
+                break
+            chunk = json.loads(payload)
+            if "error" in chunk:
+                saw_error = True
+            content += (
+                chunk["choices"][0]["delta"].get("content", "")
+                if chunk.get("choices")
+                else ""
+            )
+    assert saw_done and not saw_error
+    assert isinstance(content, str) and content  # reassembled, valid UTF-8
+
+
+def test_usage_chunk_in_stream():
+    body = {
+        "model": MODEL,
+        "messages": [{"role": "user", "content": "Say hi."}],
+        "max_tokens": 16,
+        "stream": True,
+        "stream_options": {"include_usage": True},
+    }
+    usage = None
+    with _post("/chat/completions", body, stream=True) as r:
+        for raw in r:
+            line = raw.decode().strip()
+            if not line.startswith("data:"):
+                continue
+            payload = line[len("data:") :].strip()
+            if payload == "[DONE]":
+                break
+            chunk = json.loads(payload)
+            if chunk.get("usage"):
+                usage = chunk["usage"]
+    assert usage is not None, "no usage chunk emitted with include_usage"
+    assert usage["prompt_tokens"] > 0 and usage["completion_tokens"] > 0
+    assert usage["total_tokens"] == usage["prompt_tokens"] + usage["completion_tokens"]
+
+
+# Function-tool definition passed in the "tools" array of the tool-calling test:
+# one function, get_weather, taking a single required string argument "city".
+WEATHER_TOOL = {
+    "type": "function",
+    "function": {
+        "name": "get_weather",
+        "description": "Get the current weather for a city.",
+        "parameters": {
+            "type": "object",
+            "properties": {"city": {"type": "string"}},
+            "required": ["city"],
+        },
+    },
+}
+
+
+def test_tool_call_response_shape():
+    body = {
+        "model": MODEL,
+        "messages": [
+            {"role": "user", "content": "What is the weather in Paris? Use the tool."}
+        ],
+        "tools": [WEATHER_TOOL],
+        "max_tokens": 128,
+        "temperature": 0.0,
+    }
+    with _post("/chat/completions", body) as r:
+        data = json.loads(r.read())
+    calls = data["choices"][0]["message"].get("tool_calls")
+    assert calls, "expected tool_calls in response"
+    tc = calls[0]
+    assert tc["type"] == "function"
+    assert tc["id"]
+    assert tc["function"]["name"] == "get_weather"
+    json.loads(tc["function"]["arguments"])  # arguments is a JSON string
+    assert data["choices"][0]["finish_reason"] == "tool_calls"
+
+
+def test_error_body_shape():
+    # Over-long prompt -> structured 400 (OpenAI error envelope), not a 500/drop.
+    body = {
+        "model": MODEL,
+        "messages": [{"role": "user", "content": "word " * 40000}],
+        "max_tokens": 8,
+    }
+    try:
+        _post("/chat/completions", body)
+        raise AssertionError("expected an HTTP error for over-long prompt")
+    except urllib.error.HTTPError as e:
+        assert 400 <= e.code < 500
+        err = json.loads(e.read())["error"]
+        assert err["message"] and err["type"]
@@ -0,0 +1,88 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Generic model-execution worker for standard .pte TextLLM models. One binary,
+# no registry/factory: it constructs TextLLMEngine/TextLLMSession directly and
+# speaks the JSONL worker protocol (worker_client.py). Model execution is C++
+# only — the Python server is HTTP/control plane.
+#
+# Build like the example runners (standalone), e.g. from this directory:
+#
+#   cmake -S . -B <executorch-cmake-out>/extension/llm/server/cpp \
+#     -DCMAKE_PREFIX_PATH=<executorch-cmake-out> -DEXECUTORCH_BUILD_XNNPACK=ON
+#   cmake --build <executorch-cmake-out>/extension/llm/server/cpp \
+#     --target text_llm_worker
+
+cmake_minimum_required(VERSION 3.24)
+project(llm_server_workers)
+
+# Require C++17 for every target defined below.
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Four levels up from extension/llm/server/cpp, i.e. the ExecuTorch source root.
+set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..)
+
+# Shared ExecuTorch CMake helpers; presumably the source of the
+# executorch_target_link_options_shared_lib() calls below (not visible here).
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+
+# Parent of the source root is added as an include directory; presumably so
+# headers resolve as <executorch/...> (TODO confirm against text_llm_worker.cpp).
+set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+# Vendored single-include nlohmann/json for the worker protocol (no new dep).
+set(_json_include
+    ${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/json/single_include
+)
+
+# gflags: point find_package at the gflags config inside the ExecuTorch build
+# tree. The relative path assumes this project's binary dir is
+# <executorch-cmake-out>/extension/llm/server/cpp.
+set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../../third-party/gflags)
+find_package(gflags REQUIRED)
+
+# executorch: make the ExecuTorch build tree searchable as a find root and
+# require the package config to be found.
+list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../../..)
+find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
+executorch_target_link_options_shared_lib(executorch)
+
+# Accumulates everything text_llm_worker links against; later sections append.
+set(link_libraries executorch gflags)
+
+# CPU ops
+list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas)
+executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
+
+# export_llm models rely on kernels outside the core CPU op library:
+# llama::custom_sdpa (from use_sdpa_with_kv_cache) and the quantized_decomposed
+# ops (from quantized exports). They are whole-archived so their static op
+# registrations survive the linker; without them the model loads but execution
+# fails with "Missing operator". Each set is linked only when its target exists.
+if(TARGET custom_ops)
+  list(APPEND link_libraries custom_ops)
+  executorch_target_link_options_shared_lib(custom_ops)
+endif()
+if(TARGET quantized_ops_lib)
+  executorch_target_link_options_shared_lib(quantized_ops_lib)
+  list(APPEND link_libraries quantized_kernels quantized_ops_lib)
+endif()
+
+# Runtime extensions; the Engine/Session classes live in extension_llm_runner.
+list(APPEND link_libraries extension_llm_runner extension_module
+     extension_data_loader extension_tensor extension_flat_tensor
+)
+
+# XNNPACK is the standard CPU backend for normal .pte TextLLM models; the
+# tokenizer library follows it in link order.
+executorch_target_link_options_shared_lib(xnnpack_backend)
+list(APPEND link_libraries xnnpack_backend tokenizers::tokenizers)
+
+# The worker executable. Nothing links against an executable, so its include
+# directories and link libraries are PRIVATE usage requirements.
+add_executable(text_llm_worker text_llm_worker.cpp)
+target_include_directories(
+  text_llm_worker PRIVATE ${_common_include_directories} ${_json_include}
+)
+target_link_libraries(text_llm_worker PRIVATE ${link_libraries})
+
+# Non-Debug builds: garbage-collect unused sections and strip symbols.
+if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+  target_link_options_gc_sections(text_llm_worker)
+  # -s is a GNU-style linker flag: Apple's ld reports it as obsolete and MSVC's
+  # link.exe does not recognise it, so only pass it to other toolchains.
+  if(NOT APPLE AND NOT MSVC)
+    target_link_options(text_llm_worker PRIVATE "LINKER:-s")
+  endif()
+endif()