|
15 | 15 | #include <executorch/extension/llm/runner/multimodal_runner.h> |
16 | 16 | #include <executorch/extension/llm/runner/stats.h> |
17 | 17 | #include <executorch/extension/llm/runner/text_llm_runner.h> |
| 18 | +#include <executorch/extension/llm/runner/text_llm_session.h> |
18 | 19 | #include <executorch/extension/llm/runner/text_prefiller.h> |
19 | 20 | #include <executorch/extension/llm/runner/text_token_generator.h> |
20 | 21 | #include <executorch/extension/memory_allocator/cpu_caching_malloc_allocator.h> |
|
29 | 30 | namespace executorch::extension::llm { |
30 | 31 |
|
31 | 32 | using ::executorch::extension::Module; |
| 33 | +using ::executorch::extension::Program; |
32 | 34 | using ::executorch::runtime::Error; |
33 | 35 |
|
| 36 | +// Assembles the per-Module components (decoder/prefiller/token generator/io |
| 37 | +// manager/stats) into a TextLLMRunner. Shared by the path-based and the |
| 38 | +// shared-Program (TextLLMEngine session) construction paths. |
| 39 | +static std::unique_ptr<TextLLMRunner> assemble_text_llm_runner( |
| 40 | + std::unique_ptr<Module> module, |
| 41 | + std::unique_ptr<::tokenizers::Tokenizer> tokenizer, |
| 42 | + float temperature, |
| 43 | + const std::string& method_name); |
| 44 | + |
34 | 45 | std::unique_ptr<tokenizers::Tokenizer> load_tokenizer( |
35 | 46 | const std::string& tokenizer_path, |
36 | 47 | std::unique_ptr<std::vector<std::string>> special_tokens, |
@@ -251,6 +262,15 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner( |
251 | 262 | max_cached_memory_size_bytes_)); |
252 | 263 | } |
253 | 264 |
|
| 265 | + return assemble_text_llm_runner( |
| 266 | + std::move(module), std::move(tokenizer), temperature, method_name); |
| 267 | +} |
| 268 | + |
| 269 | +static std::unique_ptr<TextLLMRunner> assemble_text_llm_runner( |
| 270 | + std::unique_ptr<Module> module, |
| 271 | + std::unique_ptr<::tokenizers::Tokenizer> tokenizer, |
| 272 | + float temperature, |
| 273 | + const std::string& method_name) { |
254 | 274 | // Get metadata from Module |
255 | 275 | ET_LOG(Info, "Reading metadata from model"); |
256 | 276 | auto metadata_result = llm::get_llm_metadata(tokenizer.get(), module.get()); |
@@ -305,6 +325,198 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner( |
305 | 325 | temperature); |
306 | 326 | } |
307 | 327 |
|
| 328 | +// Builds a TextLLMRunner over an already-loaded Program: the runner's Module |
| 329 | +// reuses `program` while owning its own method state and KV cache. File-local — |
| 330 | +// the per-session construction path for TextLLMEngine (which keeps the backing |
| 331 | +// DataLoader alive for the runners' lifetime). External callers go through |
| 332 | +// LLMEngine -> LLMSession, not a raw shared-Program runner. |
| 333 | +static std::unique_ptr<TextLLMRunner> create_text_llm_runner_from_program( |
| 334 | + std::shared_ptr<Program> program, |
| 335 | + std::unique_ptr<::tokenizers::Tokenizer> tokenizer, |
| 336 | + float temperature, |
| 337 | + const std::string& method_name) { |
| 338 | + if (!tokenizer || !tokenizer->is_loaded()) { |
| 339 | + ET_LOG(Error, "Tokenizer is null or not loaded"); |
| 340 | + return nullptr; |
| 341 | + } |
| 342 | + if (!program) { |
| 343 | + ET_LOG(Error, "Program is null"); |
| 344 | + return nullptr; |
| 345 | + } |
| 346 | + // A Module over the already-loaded Program: it reuses that Program rather |
| 347 | + // than re-loading it, and its loaded method allocates its own planned (KV) |
| 348 | + // memory. Whether packed weights are physically shared vs. re-materialized |
| 349 | + // per method instance is backend-dependent (serving_capacity() is the |
| 350 | + // authority). |
| 351 | + constexpr uint32_t kMaxCachedMemoryBytes = 1024 * 1024 * 10; // 10MB |
| 352 | + auto module = std::make_unique<Module>( |
| 353 | + std::move(program), |
| 354 | + nullptr, // memory allocator |
| 355 | + std::make_unique<executorch::extension::CPUCachingAllocator>( |
| 356 | + kMaxCachedMemoryBytes)); |
| 357 | + return assemble_text_llm_runner( |
| 358 | + std::move(module), std::move(tokenizer), temperature, method_name); |
| 359 | +} |
| 360 | + |
| 361 | +namespace detail { |
| 362 | +// The TextLLM adapter: implements the model-agnostic LLMSession over a |
| 363 | +// TextLLMRunner. TextLLMRunner's token-step methods are private; this adapter |
| 364 | +// is their only (friended) caller, so the engine and server depend solely on |
| 365 | +// LLMSession. |
| 366 | +TextLLMSession::TextLLMSession(std::unique_ptr<TextLLMRunner> runner) |
| 367 | + : runner_(std::move(runner)) {} |
| 368 | + |
| 369 | +Error TextLLMSession::prefill_tokens( |
| 370 | + std::vector<uint64_t> tokens, |
| 371 | + const SamplingConfig* initial_sampling) { |
| 372 | + // The model samples the FIRST generated token during prefill, so apply the |
| 373 | + // request's sampling here (not a stale default). Only temperature is |
| 374 | + // plumbed; reject non-default top_p/top_k/seed for parity with decode_one(). |
| 375 | + float temperature = -1.0f; |
| 376 | + if (initial_sampling != nullptr) { |
| 377 | + if (initial_sampling->top_p != 1.0f || initial_sampling->top_k != 0 || |
| 378 | + initial_sampling->seed != 0) { |
| 379 | + ET_LOG( |
| 380 | + Error, |
| 381 | + "TextLLMSession: only temperature is supported; top_p/top_k/seed " |
| 382 | + "are not yet implemented"); |
| 383 | + return ::executorch::runtime::Error::NotSupported; |
| 384 | + } |
| 385 | + temperature = initial_sampling->temperature; |
| 386 | + } |
| 387 | + return runner_->prefill_tokens(std::move(tokens), temperature).error(); |
| 388 | +} |
| 389 | + |
| 390 | +::executorch::runtime::Result<DecodeResult> TextLLMSession::decode_one( |
| 391 | + const SamplingConfig& sampling) { |
| 392 | + // Only temperature is plumbed today; top_p/top_k/seed need a per-session |
| 393 | + // sampler (a follow-up). Reject non-default values rather than silently |
| 394 | + // ignoring them, so callers can't assume constraints are applied. |
| 395 | + if (sampling.top_p != 1.0f || sampling.top_k != 0 || sampling.seed != 0) { |
| 396 | + ET_LOG( |
| 397 | + Error, |
| 398 | + "TextLLMSession: only temperature is supported; top_p/top_k/seed are " |
| 399 | + "not yet implemented"); |
| 400 | + return ::executorch::runtime::Error::NotSupported; |
| 401 | + } |
| 402 | + return runner_->decode_one(sampling.temperature); |
| 403 | +} |
| 404 | + |
| 405 | +Error TextLLMSession::seek(int64_t pos) { |
| 406 | + return runner_->seek(pos); |
| 407 | +} |
| 408 | + |
| 409 | +int64_t TextLLMSession::position() const { |
| 410 | + return runner_->position(); |
| 411 | +} |
| 412 | + |
| 413 | +Error TextLLMSession::reset() { |
| 414 | + runner_->reset(); |
| 415 | + return Error::Ok; |
| 416 | +} |
| 417 | + |
| 418 | +void TextLLMSession::stop() { |
| 419 | + runner_->stop(); |
| 420 | +} |
| 421 | + |
| 422 | +std::unique_ptr<LLMSession> make_text_llm_session( |
| 423 | + std::unique_ptr<TextLLMRunner> runner) { |
| 424 | + return std::make_unique<TextLLMSession>(std::move(runner)); |
| 425 | +} |
| 426 | +} // namespace detail |
| 427 | + |
| 428 | +TextLLMEngine::TextLLMEngine( |
| 429 | + std::unique_ptr<Module> loader_module, |
| 430 | + std::shared_ptr<Program> program, |
| 431 | + std::string tokenizer_path, |
| 432 | + float temperature, |
| 433 | + std::string method_name, |
| 434 | + std::unordered_map<std::string, int64_t> metadata) |
| 435 | + : loader_module_(std::move(loader_module)), |
| 436 | + program_(std::move(program)), |
| 437 | + tokenizer_path_(std::move(tokenizer_path)), |
| 438 | + temperature_(temperature), |
| 439 | + method_name_(std::move(method_name)), |
| 440 | + metadata_(std::move(metadata)) {} |
| 441 | + |
| 442 | +std::unique_ptr<TextLLMEngine> TextLLMEngine::create( |
| 443 | + const std::string& model_path, |
| 444 | + const std::string& tokenizer_path, |
| 445 | + std::optional<const std::string> data_path, |
| 446 | + float temperature, |
| 447 | + const std::string& method_name, |
| 448 | + Module::LoadMode load_mode) { |
| 449 | + // External .ptd weights are not yet supported for shared sessions: each |
| 450 | + // session Module built from the shared Program would also need the |
| 451 | + // data_map_loader threaded into its load_method() to resolve external |
| 452 | + // weights (see Module::load_method merged_data_map_). Fail loudly rather than |
| 453 | + // silently produce sessions that error on first generate. |
| 454 | + if (data_path.has_value()) { |
| 455 | + ET_LOG( |
| 456 | + Error, |
| 457 | + "TextLLMEngine: external data_path (.ptd) is not yet supported for " |
| 458 | + "shared sessions; use a self-contained .pte for now."); |
| 459 | + return nullptr; |
| 460 | + } |
| 461 | + // Load the program ONCE; sessions reuse it (loaded a single time, per-session |
| 462 | + // KV). Physical weight sharing across sessions is backend-dependent — see |
| 463 | + // serving_capacity(). |
| 464 | + auto loader_module = std::make_unique<Module>(model_path, load_mode); |
| 465 | + if (loader_module->load() != Error::Ok) { |
| 466 | + ET_LOG( |
| 467 | + Error, |
| 468 | + "TextLLMEngine: failed to load program from %s", |
| 469 | + model_path.c_str()); |
| 470 | + return nullptr; |
| 471 | + } |
| 472 | + auto program = loader_module->program(); |
| 473 | + if (!program) { |
| 474 | + ET_LOG(Error, "TextLLMEngine: program is null after load"); |
| 475 | + return nullptr; |
| 476 | + } |
| 477 | + // Read model-level metadata once (shared by all sessions). |
| 478 | + auto meta_tokenizer = load_tokenizer(tokenizer_path); |
| 479 | + if (!meta_tokenizer) { |
| 480 | + ET_LOG( |
| 481 | + Error, |
| 482 | + "TextLLMEngine: failed to load tokenizer from %s", |
| 483 | + tokenizer_path.c_str()); |
| 484 | + return nullptr; |
| 485 | + } |
| 486 | + auto metadata_result = |
| 487 | + get_llm_metadata(meta_tokenizer.get(), loader_module.get()); |
| 488 | + if (metadata_result.error() != Error::Ok) { |
| 489 | + ET_LOG(Error, "TextLLMEngine: failed to read metadata"); |
| 490 | + return nullptr; |
| 491 | + } |
| 492 | + return std::unique_ptr<TextLLMEngine>(new TextLLMEngine( |
| 493 | + std::move(loader_module), |
| 494 | + std::move(program), |
| 495 | + tokenizer_path, |
| 496 | + temperature, |
| 497 | + method_name, |
| 498 | + metadata_result.get())); |
| 499 | +} |
| 500 | + |
| 501 | +::executorch::runtime::Result<std::unique_ptr<LLMSession>> |
| 502 | +TextLLMEngine::create_session() { |
| 503 | + auto tokenizer = load_tokenizer(tokenizer_path_); |
| 504 | + if (!tokenizer) { |
| 505 | + ET_LOG( |
| 506 | + Error, |
| 507 | + "TextLLMEngine: failed to load tokenizer from %s", |
| 508 | + tokenizer_path_.c_str()); |
| 509 | + return Error::InvalidState; |
| 510 | + } |
| 511 | + auto runner = create_text_llm_runner_from_program( |
| 512 | + program_, std::move(tokenizer), temperature_, method_name_); |
| 513 | + if (!runner) { |
| 514 | + ET_LOG(Error, "TextLLMEngine: failed to build session runner"); |
| 515 | + return Error::InvalidState; |
| 516 | + } |
| 517 | + return detail::make_text_llm_session(std::move(runner)); |
| 518 | +} |
| 519 | + |
308 | 520 | std::unique_ptr<MultimodalRunner> create_multimodal_runner( |
309 | 521 | const std::string& model_path, |
310 | 522 | std::unique_ptr<::tokenizers::Tokenizer> tokenizer, |
|
0 commit comments