pytorch
diff --git a/extension/llm/runner/__init__.py b/extension/llm/runner/__init__.py
Lines changed: 4 additions & 0 deletions
diff --git a/extension/llm/runner/_llm_runner.pyi b/extension/llm/runner/_llm_runner.pyi
Lines changed: 60 additions & 0 deletions
@@ -18,6 +18,8 @@
     from executorch.extension.llm.runner._llm_runner import (  # noqa: F401
         GenerationConfig,
         Image,
+        LLMEngine,
+        LLMSession,
         make_audio_input,
         make_image_input,
         make_raw_audio_input,
@@ -234,5 +236,7 @@ def generate_text_hf(
     "MultimodalInput",
     "MultimodalRunner",
     "TextLLMRunner",
+    "LLMEngine",
+    "LLMSession",
     "Stats",
 ]
@@ -411,6 +411,66 @@ class TextLLMRunner:
 
     def __repr__(self) -> str: ...
 
+class LLMSession:
+    """A per-conversation session created by LLMEngine: reuses the engine's
+    program/resources (weight sharing is backend-dependent — see
+    LLMEngine.serving_capacity()) but owns its own KV cache. Backend calls
+    (prefill_tokens/decode_one) are serialized across the engine's sessions by
+    an engine-owned lock."""
+
+    def prefill_tokens(self, token_ids: List[int]) -> None: ...
+    def decode_one(self, temperature: float = -1.0) -> dict:
+        """One decode step -> {"token_id": int, "text": bytes, "is_eos": bool}."""
+        ...
+
+    def seek(self, pos: int) -> None: ...
+    def position(self) -> int: ...
+    def reset(self) -> None: ...
+    def stop(self) -> None:
+        """Token-boundary cooperative stop: safe from another thread, but it
+        does not abort a decode_one() already running — it takes effect before
+        the next decode_one()."""
+        ...
+
+    def __repr__(self) -> str: ...
+
+class LLMEngine:
+    """Engine for multi-session text generation over one loaded program.
+
+    Loads the model's program once; create_session() returns an LLMSession that
+    reuses it but owns its own KV cache. Whether extra sessions avoid
+    duplicating packed weights is backend-dependent — ask serving_capacity().
+    Backend execution across all sessions of one engine is serialized by an
+    engine-owned lock (backend ops are not assumed thread-safe), so it is safe
+    to drive multiple sessions from multiple Python threads.
+    """
+
+    def __init__(
+        self,
+        model_path: str,
+        tokenizer_path: str,
+        data_path: Optional[str] = None,
+        method_name: str = "forward",
+        temperature: float = -1.0,
+    ) -> None: ...
+    def create_session(self) -> LLMSession:
+        """Create a session that reuses this engine's program/resources (weight
+        sharing is backend-dependent — see serving_capacity()), with its own KV
+        cache."""
+        ...
+
+    def serving_capacity(self) -> dict:
+        """Serving-capacity dict: max_physical_sessions_without_weight_duplication
+        (1 = single-slot, no weight duplication) and estimated_bytes_per_session
+        (0 = unknown). The server clamps physical sessions to this."""
+        ...
+
+    def metadata(self) -> dict:
+        """Model metadata from the .pte, e.g. get_max_context_len."""
+        ...
+
+    def __repr__(self) -> str: ...
+
 class MultimodalRunner:
     """Runner for multimodal language models."""