llm_runner_helper.h
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
 */

// Helper utilities for creating and configuring LLM runners

#pragma once

#include <memory>
#include <optional>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include <executorch/extension/llm/runner/constants.h>
#include <executorch/extension/module/module.h>
#include <executorch/runtime/core/result.h>
#include <executorch/runtime/platform/compiler.h>
#include <pytorch/tokenizers/tokenizer.h>

namespace executorch::extension::llm {

// Forward declarations
class TextLLMRunner;
class MultimodalRunner;

/**
* @brief Loads a tokenizer from the specified path
*
* This function creates and initializes a tokenizer from a file, with options
* to customize special tokens and regex patterns. It tries different tokenizer
* types in order: HF JSON, TikToken, SentencePiece, and BPE.
*
* @param tokenizer_path Path to the tokenizer file
* @param special_tokens Optional list of special tokens to add to the tokenizer
* @param pattern Optional regex pattern for tokenization
* @param bos_token_index Index of the beginning-of-sequence token
* @param eos_token_index Index of the end-of-sequence token
* @return std::unique_ptr<tokenizers::Tokenizer> Initialized tokenizer
* instance, or nullptr on failure
*/
ET_EXPERIMENTAL std::unique_ptr<tokenizers::Tokenizer> load_tokenizer(
const std::string& tokenizer_path,
std::unique_ptr<std::vector<std::string>> special_tokens = nullptr,
std::optional<std::string> pattern = std::nullopt,
size_t bos_token_index = 0,
size_t eos_token_index = 1);
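
// Example usage (illustrative sketch; the tokenizer path and special tokens
// below are placeholders, not values required by this API):
//
//   auto special_tokens = std::make_unique<std::vector<std::string>>(
//       std::vector<std::string>{"<|begin_of_text|>", "<|end_of_text|>"});
//   std::unique_ptr<tokenizers::Tokenizer> tokenizer =
//       load_tokenizer("/data/tokenizer.json", std::move(special_tokens));
//   if (tokenizer == nullptr) {
//     // None of the supported tokenizer types could load the file.
//   }
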
/**
* @brief Gets LLM metadata from the model and tokenizer
*
* This function extracts metadata from the model such as vocabulary size,
* context length, and other configuration parameters. It reads metadata
* methods from the model and combines them with tokenizer information.
*
* @param tokenizer Initialized tokenizer instance
* @param module The model module
* @return Result<std::unordered_map<std::string, int64_t>> Metadata key-value
* pairs on success, or Error::InvalidArgument if required metadata (e.g.,
* kMaxSeqLen) is missing from the model
*/
ET_EXPERIMENTAL ::executorch::runtime::Result<
std::unordered_map<std::string, int64_t>>
get_llm_metadata(tokenizers::Tokenizer* tokenizer, Module* module);
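
// Example usage (sketch; assumes `tokenizer` and `module` were already created
// and loaded, and that the model exposes the kMaxSeqLen entry mentioned above):
//
//   auto metadata = get_llm_metadata(tokenizer.get(), module.get());
//   if (metadata.ok()) {
//     int64_t max_seq_len = metadata.get().at(kMaxSeqLen);
//   } else {
//     // E.g. Error::InvalidArgument when required metadata is missing.
//   }
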
/**
* @brief Gets EOS token IDs from the model and tokenizer
*
* This function extracts the end-of-sequence token IDs from the model.
* It first tries to get EOS IDs from the model's metadata, falling back
* to the tokenizer's default EOS token.
*
* @param tokenizer Initialized tokenizer instance
* @param module The model module
* @return std::unordered_set<uint64_t> Set of EOS token IDs
*/
ET_EXPERIMENTAL std::unordered_set<uint64_t> get_eos_ids(
tokenizers::Tokenizer* tokenizer,
Module* module);
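
// Example usage (sketch; `token_id` is a placeholder for a token produced
// during decoding):
//
//   std::unordered_set<uint64_t> eos_ids =
//       get_eos_ids(tokenizer.get(), module.get());
//   bool reached_eos = eos_ids.count(token_id) > 0;
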
/**
* @brief Creates a TextLLMRunner instance with dependency injection
*
* This factory function creates and initializes a TextLLMRunner with all
* necessary components for text generation using the specified model and
* tokenizer.
*
* @param model_path Path to the model file
* @param tokenizer Initialized tokenizer instance
* @param data_path Optional path to additional data required by the model
* @param temperature Optional temperature parameter for controlling randomness
* (deprecated)
* @param method_name Name of the method to execute in the model
* @param load_mode Loading strategy for the model file. Defaults to
* MmapUseMlockIgnoreErrors which uses mmap to avoid loading the entire
* model into RAM and attempts to pin pages with mlock for lower inference
* latency, gracefully falling back to standard mmap if mlock is unavailable.
* @return std::unique_ptr<TextLLMRunner> Initialized TextLLMRunner instance, or
* nullptr on failure
*/
ET_EXPERIMENTAL std::unique_ptr<TextLLMRunner> create_text_llm_runner(
const std::string& model_path,
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
std::optional<const std::string> data_path,
float temperature = -1.0f,
const std::string& method_name = "forward",
Module::LoadMode load_mode = Module::LoadMode::MmapUseMlockIgnoreErrors);
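
// Example usage (sketch; the model path is a placeholder, no extra data file
// is passed, and the remaining arguments keep their defaults):
//
//   auto runner = create_text_llm_runner(
//       "/data/model.pte", std::move(tokenizer), std::nullopt);
//   if (runner == nullptr) {
//     // Runner construction failed (e.g. the model could not be loaded).
//   }
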
/**
* @brief Creates a TextLLMRunner instance with dependency injection
*
* This factory function creates and initializes a TextLLMRunner with all
* necessary components for text generation using the specified model and
* tokenizer.
*
* @param model_path Path to the model file
* @param tokenizer Initialized tokenizer instance
* @param data_files Vector of paths to additional data required by the model
* @param temperature Optional temperature parameter for controlling randomness
* (deprecated)
* @param event_tracer Optional event tracer for profiling
* @param method_name Name of the method to execute in the model
* @param load_mode Loading strategy for the model file. Defaults to
* MmapUseMlockIgnoreErrors which uses mmap to avoid loading the entire
* model into RAM and attempts to pin pages with mlock for lower inference
* latency, gracefully falling back to standard mmap if mlock is unavailable.
* @return std::unique_ptr<TextLLMRunner> Initialized TextLLMRunner instance, or
* nullptr on failure
*/
ET_EXPERIMENTAL std::unique_ptr<TextLLMRunner> create_text_llm_runner(
const std::string& model_path,
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
std::vector<std::string> data_files = {},
float temperature = -1.0f,
std::unique_ptr<::executorch::runtime::EventTracer> event_tracer = nullptr,
const std::string& method_name = "forward",
Module::LoadMode load_mode = Module::LoadMode::MmapUseMlockIgnoreErrors);
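
// Example usage (sketch; file paths are placeholders and the remaining
// arguments keep their defaults):
//
//   std::vector<std::string> data_files = {"/data/model.ptd"};
//   auto runner = create_text_llm_runner(
//       "/data/model.pte", std::move(tokenizer), data_files);
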
/**
* @brief Creates a MultimodalRunner instance with dependency injection
*
* This factory function creates and initializes a MultimodalRunner with all
* necessary components for multimodal text generation.
*
* @param model_path Path to the model file
* @param tokenizer Initialized tokenizer instance
 * @param data_path Optional path to an additional .ptd data file required by
 * the model
 * @param load_mode Loading strategy for the model file. Defaults to
 * Module::LoadMode::File.
* @return std::unique_ptr<MultimodalRunner> Initialized MultimodalRunner
* instance, or nullptr on failure
*/
ET_EXPERIMENTAL std::unique_ptr<MultimodalRunner> create_multimodal_runner(
const std::string& model_path,
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
std::optional<const std::string> data_path = std::nullopt,
Module::LoadMode load_mode = Module::LoadMode::File);
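
// Example usage (sketch; the model path is a placeholder):
//
//   auto mm_runner = create_multimodal_runner(
//       "/data/multimodal_model.pte", std::move(tokenizer));
//   if (mm_runner == nullptr) {
//     // Runner construction failed.
//   }
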
} // namespace executorch::extension::llm