Skip to content

Commit 5d85624

Browse files
committed
extension/llm/runner: Engine/Session C++ core + token-step primitives
Add the model-agnostic LLMEngine/LLMSession interfaces (llm_session.h) with SamplingConfig, DecodeResult and LLMServingCapacity; the TextLLMRunner token-step primitives the session layer is built on (seek, prefill_tokens, position, decode_one); and TextLLMEngine/TextLLMSession over a single loaded Program. decode_one() shares generate()'s logit processors via TextTokenGenerator::apply_logit_processors so the two decode paths cannot diverge. serving_capacity() reports a conservative single physical session (physical weight sharing is backend-dependent). Covered by gtests in test_text_llm_runner.cpp. First of four stacked commits: C++ core -> Python bindings -> server foundations -> HTTP server. ghstack-source-id: 3705b82 ghstack-comment-id: 4617262593 Pull-Request: #19991
1 parent eeb0646 commit 5d85624

8 files changed

Lines changed: 819 additions & 6 deletions

File tree

extension/llm/runner/llm_runner_helper.cpp

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,18 @@
2929
namespace executorch::extension::llm {
3030

3131
using ::executorch::extension::Module;
32+
using ::executorch::extension::Program;
3233
using ::executorch::runtime::Error;
3334

35+
// Assembles the per-Module components (decoder/prefiller/token generator/io
36+
// manager/stats) into a TextLLMRunner. Shared by the path-based and the
37+
// shared-Program (TextLLMEngine session) construction paths.
38+
static std::unique_ptr<TextLLMRunner> assemble_text_llm_runner(
39+
std::unique_ptr<Module> module,
40+
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
41+
float temperature,
42+
const std::string& method_name);
43+
3444
std::unique_ptr<tokenizers::Tokenizer> load_tokenizer(
3545
const std::string& tokenizer_path,
3646
std::unique_ptr<std::vector<std::string>> special_tokens,
@@ -251,6 +261,15 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
251261
max_cached_memory_size_bytes_));
252262
}
253263

264+
return assemble_text_llm_runner(
265+
std::move(module), std::move(tokenizer), temperature, method_name);
266+
}
267+
268+
static std::unique_ptr<TextLLMRunner> assemble_text_llm_runner(
269+
std::unique_ptr<Module> module,
270+
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
271+
float temperature,
272+
const std::string& method_name) {
254273
// Get metadata from Module
255274
ET_LOG(Info, "Reading metadata from model");
256275
auto metadata_result = llm::get_llm_metadata(tokenizer.get(), module.get());
@@ -305,6 +324,172 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
305324
temperature);
306325
}
307326

327+
std::unique_ptr<TextLLMRunner> create_text_llm_runner_from_program(
328+
std::shared_ptr<Program> program,
329+
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
330+
float temperature,
331+
const std::string& method_name) {
332+
if (!tokenizer || !tokenizer->is_loaded()) {
333+
ET_LOG(Error, "Tokenizer is null or not loaded");
334+
return nullptr;
335+
}
336+
if (!program) {
337+
ET_LOG(Error, "Program is null");
338+
return nullptr;
339+
}
340+
// A Module over the already-loaded Program: it reuses that Program rather
341+
// than re-loading it, and its loaded method allocates its own planned (KV)
342+
// memory. Whether packed weights are physically shared vs. re-materialized
343+
// per method instance is backend-dependent (serving_capacity() is the
344+
// authority); on XNNPACK assume per-instance.
345+
constexpr uint32_t kMaxCachedMemoryBytes = 1024 * 1024 * 10; // 10MB
346+
auto module = std::make_unique<Module>(
347+
std::move(program),
348+
nullptr, // memory allocator
349+
std::make_unique<executorch::extension::CPUCachingAllocator>(
350+
kMaxCachedMemoryBytes));
351+
return assemble_text_llm_runner(
352+
std::move(module), std::move(tokenizer), temperature, method_name);
353+
}
354+
355+
namespace {
356+
// The TextLLM adapter: implements the model-agnostic LLMSession over a
357+
// TextLLMRunner. TextLLMRunner is an implementation detail here — the engine
358+
// and server depend only on LLMSession.
359+
class TextLLMSession : public LLMSession {
360+
public:
361+
explicit TextLLMSession(std::unique_ptr<TextLLMRunner> runner)
362+
: runner_(std::move(runner)) {}
363+
364+
Error prefill_tokens(std::vector<uint64_t> tokens) override {
365+
return runner_->prefill_tokens(std::move(tokens)).error();
366+
}
367+
::executorch::runtime::Result<DecodeResult> decode_one(
368+
const SamplingConfig& sampling) override {
369+
// Only temperature is plumbed today; top_p/top_k/seed need a per-session
370+
// sampler (a follow-up). Reject non-default values rather than silently
371+
// ignoring them, so callers can't assume constraints are applied.
372+
if (sampling.top_p != 1.0f || sampling.top_k != 0 || sampling.seed != 0) {
373+
ET_LOG(
374+
Error,
375+
"TextLLMSession: only temperature is supported; top_p/top_k/seed are "
376+
"not yet implemented");
377+
return ::executorch::runtime::Error::NotSupported;
378+
}
379+
return runner_->decode_one(sampling.temperature);
380+
}
381+
Error seek(int64_t pos) override {
382+
return runner_->seek(pos);
383+
}
384+
int64_t position() const override {
385+
return runner_->position();
386+
}
387+
Error reset() override {
388+
runner_->reset();
389+
return Error::Ok;
390+
}
391+
void stop() override {
392+
runner_->stop();
393+
}
394+
395+
private:
396+
std::unique_ptr<TextLLMRunner> runner_;
397+
};
398+
} // namespace
399+
400+
TextLLMEngine::TextLLMEngine(
401+
std::unique_ptr<Module> loader_module,
402+
std::shared_ptr<Program> program,
403+
std::string tokenizer_path,
404+
float temperature,
405+
std::string method_name,
406+
std::unordered_map<std::string, int64_t> metadata)
407+
: loader_module_(std::move(loader_module)),
408+
program_(std::move(program)),
409+
tokenizer_path_(std::move(tokenizer_path)),
410+
temperature_(temperature),
411+
method_name_(std::move(method_name)),
412+
metadata_(std::move(metadata)) {}
413+
414+
std::unique_ptr<TextLLMEngine> TextLLMEngine::create(
415+
const std::string& model_path,
416+
const std::string& tokenizer_path,
417+
std::optional<const std::string> data_path,
418+
float temperature,
419+
const std::string& method_name,
420+
Module::LoadMode load_mode) {
421+
// External .ptd weights are not yet supported for shared sessions: each
422+
// session Module built from the shared Program would also need the
423+
// data_map_loader threaded into its load_method() to resolve external
424+
// weights (see Module::load_method merged_data_map_). Fail loudly rather than
425+
// silently produce sessions that error on first generate.
426+
if (data_path.has_value()) {
427+
ET_LOG(
428+
Error,
429+
"TextLLMEngine: external data_path (.ptd) is not yet supported for "
430+
"shared sessions; use a self-contained .pte for now.");
431+
return nullptr;
432+
}
433+
// Load the program ONCE; sessions reuse it (loaded a single time, per-session
434+
// KV). Physical weight sharing across sessions is backend-dependent — see
435+
// serving_capacity().
436+
auto loader_module = std::make_unique<Module>(model_path, load_mode);
437+
if (loader_module->load() != Error::Ok) {
438+
ET_LOG(
439+
Error,
440+
"TextLLMEngine: failed to load program from %s",
441+
model_path.c_str());
442+
return nullptr;
443+
}
444+
auto program = loader_module->program();
445+
if (!program) {
446+
ET_LOG(Error, "TextLLMEngine: program is null after load");
447+
return nullptr;
448+
}
449+
// Read model-level metadata once (shared by all sessions).
450+
auto meta_tokenizer = load_tokenizer(tokenizer_path);
451+
if (!meta_tokenizer) {
452+
ET_LOG(
453+
Error,
454+
"TextLLMEngine: failed to load tokenizer from %s",
455+
tokenizer_path.c_str());
456+
return nullptr;
457+
}
458+
auto metadata_result =
459+
get_llm_metadata(meta_tokenizer.get(), loader_module.get());
460+
if (metadata_result.error() != Error::Ok) {
461+
ET_LOG(Error, "TextLLMEngine: failed to read metadata");
462+
return nullptr;
463+
}
464+
return std::unique_ptr<TextLLMEngine>(new TextLLMEngine(
465+
std::move(loader_module),
466+
std::move(program),
467+
tokenizer_path,
468+
temperature,
469+
method_name,
470+
metadata_result.get()));
471+
}
472+
473+
::executorch::runtime::Result<std::unique_ptr<LLMSession>>
474+
TextLLMEngine::create_session() {
475+
auto tokenizer = load_tokenizer(tokenizer_path_);
476+
if (!tokenizer) {
477+
ET_LOG(
478+
Error,
479+
"TextLLMEngine: failed to load tokenizer from %s",
480+
tokenizer_path_.c_str());
481+
return Error::InvalidState;
482+
}
483+
auto runner = create_text_llm_runner_from_program(
484+
program_, std::move(tokenizer), temperature_, method_name_);
485+
if (!runner) {
486+
ET_LOG(Error, "TextLLMEngine: failed to build session runner");
487+
return Error::InvalidState;
488+
}
489+
return std::unique_ptr<LLMSession>(
490+
std::make_unique<TextLLMSession>(std::move(runner)));
491+
}
492+
308493
std::unique_ptr<MultimodalRunner> create_multimodal_runner(
309494
const std::string& model_path,
310495
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,

extension/llm/runner/llm_runner_helper.h

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include <vector>
1919

2020
#include <executorch/extension/llm/runner/constants.h>
21+
#include <executorch/extension/llm/runner/llm_session.h>
2122
#include <executorch/extension/module/module.h>
2223
#include <executorch/runtime/core/result.h>
2324
#include <executorch/runtime/platform/compiler.h>
@@ -141,6 +142,93 @@ ET_EXPERIMENTAL std::unique_ptr<TextLLMRunner> create_text_llm_runner(
141142
const std::string& method_name = "forward",
142143
Module::LoadMode load_mode = Module::LoadMode::MmapUseMlockIgnoreErrors);
143144

145+
/**
146+
* @brief Creates a TextLLMRunner over an already-loaded Program.
147+
*
148+
* Unlike create_text_llm_runner(model_path, ...), this does not load the model
149+
* file again: the resulting runner's Module reuses `program` while owning its
150+
* own method state and KV cache. This is the per-session construction path for
151+
* TextLLMEngine — N sessions reuse one loaded Program but isolate their mutable
152+
* KV state. Whether they also avoid re-materializing packed weights per session
153+
* is backend-dependent (serving_capacity() is authoritative; XNNPACK repacks
154+
* per method instance, so assume per-session weights there).
155+
*
156+
* The caller must keep the DataLoader backing `program` alive for the lifetime
157+
* of every runner created from it (TextLLMEngine holds the loader Module).
158+
*
159+
* @param program Shared, already-loaded program.
160+
* @param tokenizer Initialized tokenizer instance (owned by the new runner).
161+
* @param temperature Optional temperature (deprecated; prefer
162+
* GenerationConfig).
163+
* @param method_name Name of the method to execute in the model.
164+
* @return std::unique_ptr<TextLLMRunner> on success, or nullptr on failure.
165+
*/
166+
ET_EXPERIMENTAL std::unique_ptr<TextLLMRunner>
167+
create_text_llm_runner_from_program(
168+
std::shared_ptr<Program> program,
169+
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
170+
float temperature = -1.0f,
171+
const std::string& method_name = "forward");
172+
173+
/**
174+
* @brief Engine for multi-session text generation over one loaded Program.
175+
*
176+
* Loads the model's Program (weights/constants) once; create_session() builds a
177+
* TextLLMRunner that reuses that Program but owns its own method/KV state. This
178+
* is the correctness-first foundation for serving multiple conversations.
179+
* Backend execution should be serialized by the caller until per-backend thread
180+
* safety is proven (Module::execute is not assumed thread-safe). Whether extra
181+
* sessions actually avoid duplicating packed weights is a backend property
182+
* (e.g. AOTI/CUDA share device weights) reported by serving_capacity(); on the
183+
* XNNPACK path weights are repacked per method instance and the KV cache is
184+
* baked into the .pte, so it conservatively reports a single physical session.
185+
*/
186+
class ET_EXPERIMENTAL TextLLMEngine : public LLMEngine {
187+
public:
188+
static std::unique_ptr<TextLLMEngine> create(
189+
const std::string& model_path,
190+
const std::string& tokenizer_path,
191+
std::optional<const std::string> data_path = std::nullopt,
192+
float temperature = -1.0f,
193+
const std::string& method_name = "forward",
194+
Module::LoadMode load_mode = Module::LoadMode::MmapUseMlockIgnoreErrors);
195+
196+
// Returns a TextLLMSession (LLMSession) that reuses this engine's loaded
197+
// Program (physical weight sharing is backend-dependent; see
198+
// serving_capacity).
199+
::executorch::runtime::Result<std::unique_ptr<LLMSession>> create_session()
200+
override;
201+
// Conservative v1: a self-contained .pte repacks XNNPACK weights per runtime,
202+
// so we don't claim multiple physical sessions share weights. Raise this on a
203+
// backend/artifact proven to share packed weights.
204+
LLMServingCapacity serving_capacity() const override {
205+
return LLMServingCapacity{};
206+
}
207+
const std::unordered_map<std::string, int64_t>& metadata() const override {
208+
return metadata_;
209+
}
210+
211+
TextLLMEngine(const TextLLMEngine&) = delete;
212+
TextLLMEngine& operator=(const TextLLMEngine&) = delete;
213+
214+
private:
215+
TextLLMEngine(
216+
std::unique_ptr<Module> loader_module,
217+
std::shared_ptr<Program> program,
218+
std::string tokenizer_path,
219+
float temperature,
220+
std::string method_name,
221+
std::unordered_map<std::string, int64_t> metadata);
222+
223+
// Keeps the shared Program's DataLoader alive for the lifetime of sessions.
224+
std::unique_ptr<Module> loader_module_;
225+
std::shared_ptr<Program> program_;
226+
std::string tokenizer_path_;
227+
float temperature_;
228+
std::string method_name_;
229+
std::unordered_map<std::string, int64_t> metadata_;
230+
};
231+
144232
/**
145233
* @brief Creates a MultimodalRunner instance with dependency injection
146234
*

0 commit comments

Comments
 (0)