text_llm_runner.h
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
// A simple llama2 runner that includes preprocessing and post-processing logic.
// The module takes in a string as input and emits a string as output.
#pragma once
#include <cstdint>
#include <functional>
#include <memory>
#include <optional>
#include <string>
#include <unordered_map>
#include <executorch/extension/llm/runner/irunner.h>
#include <executorch/extension/llm/runner/stats.h>
#include <executorch/extension/llm/runner/text_decoder_runner.h>
#include <executorch/extension/llm/runner/text_prefiller.h>
#include <executorch/extension/llm/runner/text_token_generator.h>
#include <executorch/extension/module/module.h>
#include <pytorch/tokenizers/tokenizer.h>
// Helper functions have moved to llm_runner_helper.h; this include is kept
// for backward compatibility.
#include <executorch/extension/llm/runner/llm_runner_helper.h>
namespace executorch::extension::llm {
class ET_EXPERIMENTAL TextLLMRunner : public IRunner {
public:
/**
* @brief Constructor for TextLLMRunner with dependency injection
*
* Creates a TextLLMRunner instance with all required components for text
* generation.
*
* @param metadata Key-value pairs containing model metadata (e.g.,
* vocab_size, context_length)
* @param tokenizer Tokenizer for converting between text and token IDs
* @param module The underlying model module that performs inference
* @param text_decoder_runner Component responsible for running the decoder
* part of the model
* @param text_prefiller Component for handling the prefill phase of text
* generation
* @param io_manager Component for handling I/O operations
* @param text_token_generator Component for generating tokens during the
* decode phase
* @param stats Statistics tracking object for performance monitoring
* @param temperature Temperature parameter for controlling randomness in
* generation (deprecated). Please use GenerationConfig.temperature instead.
*/
explicit TextLLMRunner(
std::unordered_map<std::string, int64_t> metadata,
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
std::unique_ptr<::executorch::extension::Module> module,
std::unique_ptr<TextDecoderRunner> text_decoder_runner,
std::unique_ptr<TextPrefiller> text_prefiller,
std::unique_ptr<IOManager> io_manager,
std::unique_ptr<TextTokenGenerator> text_token_generator,
std::unique_ptr<Stats> stats,
float temperature = -1.0f);
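// Construction note: rather than wiring every component by hand, callers
// typically build a runner via the helpers declared in llm_runner_helper.h.
// Illustrative sketch only; the helper names and signatures below are
// assumptions to verify against that header:
//
//   auto tokenizer = load_tokenizer("tokenizer.model");  // assumed helper
//   auto runner =
//       create_text_llm_runner("model.pte", std::move(tokenizer));  // assumed helper
//   if (runner) {
//     runner->load();
//   }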
/**
* @brief Checks if the model is loaded and ready for inference
*
* @return bool True if the model is loaded, false otherwise
*/
bool is_loaded() const override;
/**
* @brief Loads the model and prepares it for inference
*
* This method initializes all components and prepares the model for text
* generation.
*
* @return ::executorch::runtime::Error Success or error status
*/
::executorch::runtime::Error load() override;
/**
* @brief Generates text based on the provided prompt
*
* This method performs text generation using the loaded model. It processes
* the input prompt, runs the model through prefill and decode phases until the
* maximum number of new tokens is reached or an EOS token is generated, and
* then returns the generated text and performance stats through callbacks.
*
* @param prompt The input text to generate from
* @param config Configuration parameters for text generation (e.g.,
* max_new_tokens, temperature)
* @param token_callback Function called for each generated token with the
* decoded text
* @param stats_callback Function called with performance statistics
* @return ::executorch::runtime::Error Success or error status
*/
::executorch::runtime::Error generate(
const std::string& prompt,
const GenerationConfig& config,
std::function<void(const std::string&)> token_callback = {},
std::function<void(const Stats&)> stats_callback = {}) override;
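// Example usage (illustrative sketch; `runner` is assumed to be a loaded
// TextLLMRunner, and the GenerationConfig field shown follows the
// documentation above):
//
//   GenerationConfig config;
//   config.max_new_tokens = 128;
//   runner->generate(
//       "Tell me a short story.",
//       config,
//       [](const std::string& piece) { printf("%s", piece.c_str()); },
//       [](const Stats& stats) { /* inspect performance counters */ });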
/**
* Prefill multimodal inputs into the KV cache without generating.
* Only text inputs are processed; non-text inputs are skipped.
* The KV cache position is tracked internally in pos_.
* @param inputs A vector of MultimodalInput objects.
* @param num_bos Number of BOS tokens to prepend during text encoding.
* @param num_eos Number of EOS tokens to append during text encoding.
* @return The next token predicted after prefill, or an error.
*/
::executorch::runtime::Result<uint64_t> prefill(
const std::vector<MultimodalInput>& inputs,
int32_t num_bos = 0,
int32_t num_eos = 0) override;
/**
* Convenience overload: prefill a single text prompt.
*/
::executorch::runtime::Result<uint64_t>
prefill(const std::string& prompt, int32_t num_bos = 0, int32_t num_eos = 0);
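// Illustrative prefill-then-decode sketch (assumes `runner` is loaded, and
// `config` and the callbacks are as in the generate() example above). The
// token returned by prefill() is cached internally and consumed by a
// subsequent generate("") call, which continues decoding from the cached
// prefix:
//
//   auto next_token = runner->prefill("You are a helpful assistant. ");
//   if (next_token.ok()) {
//     runner->generate("", config, token_callback, stats_callback);
//   }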
/**
* Prefill a text prompt using GenerationConfig.
* Deprecated: prefer prefill(prompt, num_bos, num_eos).
*/
::executorch::runtime::Result<uint64_t> prefill(
const std::string& prompt,
const GenerationConfig& config);
/**
* @brief Warms up the model with a sample prompt
*
* This method runs a complete generation cycle without returning results,
* which helps initialize internal caches and optimize subsequent inferences.
*
* @param prompt The sample prompt to use for warmup
* @param max_new_tokens Maximum number of tokens to generate during warmup
* @return ::executorch::runtime::Error Success or error status
*/
::executorch::runtime::Error warmup(
const std::string& prompt,
int32_t max_new_tokens);
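// Example (sketch): run a short generation pass to warm internal caches
// before latency-sensitive requests.
//
//   runner->warmup("The capital of France is", /*max_new_tokens=*/8);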
/**
* @brief Removes prefilled tokens and resets the start position and stats.
*
* This method removes the prefilled tokens from the KV cache and resets the
* start position to 0. It also clears the stats for previous runs.
*/
void reset() override;
/**
* @brief Stops the ongoing text generation process
*
* This method signals the generator to stop producing new tokens and
* terminate the current generation process.
*/
void stop() override;
private:
// Components
std::unique_ptr<::tokenizers::Tokenizer> tokenizer_;
std::unordered_map<std::string, int64_t> metadata_;
std::unique_ptr<::executorch::extension::Module>
module_; // Manages the module's lifecycle; must outlive
// text_decoder_runner_.
std::unique_ptr<TextDecoderRunner>
text_decoder_runner_; // Manages text_decoder_runner_'s lifecycle; must
// outlive text_prefiller_ and
// text_token_generator_.
std::unique_ptr<TextPrefiller> text_prefiller_;
std::unique_ptr<IOManager> io_manager_;
std::unique_ptr<TextTokenGenerator> text_token_generator_;
// Stats
std::unique_ptr<Stats> stats_;
// Temperature. Deprecated; prefer GenerationConfig.temperature instead.
float temperature_ = -1.0f;
// Token predicted by the last prefill() call, consumed by generate("").
std::optional<uint64_t> prefill_next_token_;
// The position in KV cache of the input, starting from 0.
int64_t pos_ = 0;
};
} // namespace executorch::extension::llm