Skip to content

Commit b59d99f

Browse files
committed
extension/llm/runner: Python bindings for the Engine/Session API
Bind LLMEngine and LLMSession (PyLLMEngine/PyLLMSession): LLMEngine exposes create_session() and serving_capacity(), and the token-step methods (prefill_tokens/decode_one/seek/position/reset/stop) are exposed only on LLMSession. PyTextLLMRunner stays a legacy direct runner and no longer exposes token-step methods, so LLMSession is the single Python serving surface. Backend execution across an engine's sessions is serialized by an engine-owned lock. TokenStringCallback buffers byte-level BPE tokens so a multi-byte character split across callbacks does not break UTF-8 decoding. Stubs in _llm_runner.pyi; boundary and session tests in test_runner_pybindings.py. Second of four stacked commits; depends on the C++ core. ghstack-source-id: 86cd500 ghstack-comment-id: 4617262734 Pull-Request: #19992
1 parent 5d85624 commit b59d99f

4 files changed

Lines changed: 446 additions & 7 deletions

File tree

extension/llm/runner/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
from executorch.extension.llm.runner._llm_runner import ( # noqa: F401
1919
GenerationConfig,
2020
Image,
21+
LLMEngine,
22+
LLMSession,
2123
make_audio_input,
2224
make_image_input,
2325
make_raw_audio_input,
@@ -234,5 +236,7 @@ def generate_text_hf(
234236
"MultimodalInput",
235237
"MultimodalRunner",
236238
"TextLLMRunner",
239+
"LLMEngine",
240+
"LLMSession",
237241
"Stats",
238242
]

extension/llm/runner/_llm_runner.pyi

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,66 @@ class TextLLMRunner:
411411

412412
def __repr__(self) -> str: ...
413413

414+
class LLMSession:
415+
"""A per-conversation session created by LLMEngine: reuses the engine's
416+
program/resources (weight sharing is backend-dependent — see
417+
LLMEngine.serving_capacity()) but owns its own KV cache. Backend calls
418+
(prefill_tokens/decode_one) are serialized across the engine's sessions by
419+
an engine-owned lock."""
420+
421+
def prefill_tokens(self, token_ids: List[int]) -> None: ...
422+
def decode_one(self, temperature: float = -1.0) -> dict:
423+
"""One decode step -> {"token_id": int, "text": bytes, "is_eos": bool}."""
424+
...
425+
426+
def seek(self, pos: int) -> None: ...
427+
def position(self) -> int: ...
428+
def reset(self) -> None: ...
429+
def stop(self) -> None:
430+
"""Token-boundary cooperative stop: safe from another thread, but it
431+
does not abort a decode_one() already running — it takes effect before
432+
the next decode_one()."""
433+
...
434+
435+
def __repr__(self) -> str: ...
436+
437+
class LLMEngine:
438+
"""Engine for multi-session text generation over one loaded program.
439+
440+
Loads the model's program once; create_session() returns an LLMSession that
441+
reuses it but owns its own KV cache. Whether extra sessions avoid
442+
duplicating packed weights is backend-dependent — ask serving_capacity(). Backend execution across all sessions of one engine is
443+
serialized by an engine-owned lock (backend ops are not assumed
444+
thread-safe), so it is safe to drive multiple sessions from multiple Python
445+
threads.
446+
"""
447+
448+
def __init__(
449+
self,
450+
model_path: str,
451+
tokenizer_path: str,
452+
data_path: Optional[str] = None,
453+
method_name: str = "forward",
454+
temperature: float = -1.0,
455+
) -> None: ...
456+
def create_session(self) -> LLMSession:
457+
"""Create a session that reuses this engine's program/resources (weight
458+
sharing is backend-dependent — see serving_capacity()), with its own KV
459+
cache."""
460+
...
461+
462+
def serving_capacity(self) -> dict:
463+
"""Serving-capacity dict: max_physical_sessions_without_weight_duplication
464+
(1 = single-slot, no weight duplication) and estimated_bytes_per_session
465+
(0 = unknown). The server clamps its physical session count to max_physical_sessions_without_weight_duplication.
466+
...
467+
468+
def metadata(self) -> dict:
469+
"""Model metadata from the .pte, e.g. get_max_context_len."""
470+
...
471+
472+
def __repr__(self) -> str: ...
473+
414474
class MultimodalRunner:
415475
"""Runner for multimodal language models."""
416476

0 commit comments

Comments
 (0)