Commit d67b5d9
feat: support different input format in LLM text prefiller (#661)
1 parent f9fcb04

8 files changed: 91 additions & 49 deletions

packages/react-native-executorch/common/rnexecutorch/models/BaseModel.cpp

Lines changed: 4 additions & 3 deletions
@@ -8,13 +8,14 @@ namespace rnexecutorch::models {
 
 using namespace facebook;
 using namespace executorch::extension;
+using ::executorch::extension::module::Module;
 using ::executorch::runtime::Error;
 
 BaseModel::BaseModel(const std::string &modelSource,
-                     std::shared_ptr<react::CallInvoker> callInvoker)
+                     std::shared_ptr<react::CallInvoker> callInvoker,
+                     Module::LoadMode loadMode)
     : callInvoker(callInvoker),
-      module_(std::make_unique<Module>(
-          modelSource, Module::LoadMode::MmapUseMlockIgnoreErrors)) {
+      module_(std::make_unique<Module>(modelSource, loadMode)) {
   Error loadError = module_->load();
   if (loadError != Error::Ok) {
     throw std::runtime_error("Failed to load model: Error " +
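The new loadMode parameter (defaulted in BaseModel.h below) lets each model pick its loader strategy instead of hard-coding mmap. A minimal sketch, not part of this commit, of a hypothetical subclass opting into file-based loading; the class name VisionModel is invented, and the snippet assumes BaseModel.h from this diff, which brings Module and react into scope:

#include <rnexecutorch/models/BaseModel.h>

#include <memory>
#include <string>

namespace rnexecutorch::models {

// Hypothetical model that reads the whole .pte file up front
// (Module::LoadMode::File) instead of relying on the default
// MmapUseMlockIgnoreErrors mapping.
class VisionModel : public BaseModel {
public:
  VisionModel(const std::string &modelSource,
              std::shared_ptr<react::CallInvoker> callInvoker)
      : BaseModel(modelSource, callInvoker, Module::LoadMode::File) {}
};

} // namespace rnexecutorch::models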

packages/react-native-executorch/common/rnexecutorch/models/BaseModel.h

Lines changed: 9 additions & 4 deletions
@@ -13,12 +13,16 @@
 namespace rnexecutorch {
 namespace models {
 using namespace facebook;
+using executorch::extension::module::Module;
 using executorch::runtime::EValue;
 using executorch::runtime::Result;
+
 class BaseModel {
 public:
-  BaseModel(const std::string &modelSource,
-            std::shared_ptr<react::CallInvoker> callInvoker);
+  BaseModel(
+      const std::string &modelSource,
+      std::shared_ptr<react::CallInvoker> callInvoker,
+      Module::LoadMode loadMode = Module::LoadMode::MmapUseMlockIgnoreErrors);
   std::size_t getMemoryLowerBound() const noexcept;
   void unload() noexcept;
   std::vector<int32_t> getInputShape(std::string method_name, int32_t index);
@@ -42,12 +46,13 @@ class BaseModel {
   std::shared_ptr<react::CallInvoker> callInvoker;
   std::unique_ptr<executorch::extension::Module> module_;
 
-private:
   std::size_t memorySizeLowerBound{0};
+
+private:
   std::vector<int32_t> getTensorShape(const executorch::aten::Tensor &tensor);
 };
 } // namespace models
 
 REGISTER_CONSTRUCTOR(models::BaseModel, std::string,
                      std::shared_ptr<react::CallInvoker>);
-} // namespace rnexecutorch
+} // namespace rnexecutorch

packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp

Lines changed: 16 additions & 3 deletions
@@ -8,21 +8,34 @@
 namespace rnexecutorch::models::llm {
 using namespace facebook;
 using executorch::extension::TensorPtr;
+using executorch::extension::module::Module;
 using executorch::runtime::Error;
 
 LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource,
          std::shared_ptr<react::CallInvoker> callInvoker)
-    : runner(std::make_unique<example::Runner>(modelSource, tokenizerSource)),
-      callInvoker(callInvoker) {
-
+    : BaseModel(modelSource, callInvoker, Module::LoadMode::File),
+      runner(std::make_unique<example::Runner>(module_.get(), modelSource,
+                                               tokenizerSource)) {
   auto loadResult = runner->load();
   if (loadResult != Error::Ok) {
     throw std::runtime_error("Failed to load LLM runner, error code: " +
                              std::to_string(static_cast<int>(loadResult)));
   }
+
   memorySizeLowerBound =
       std::filesystem::file_size(std::filesystem::path(modelSource)) +
       std::filesystem::file_size(std::filesystem::path(tokenizerSource));
+
+  // Determine the input mode
+  auto tokensTensorShape = getInputShape("forward", 0);
+  auto positionsTensorShape = getInputShape("forward", 1);
+  if (tokensTensorShape.size() != 2 || positionsTensorShape.size() != 1) {
+    throw std::runtime_error("Unsupported LLM input format");
+  }
+  if (positionsTensorShape[0] != 1 &&
+      tokensTensorShape[1] == positionsTensorShape[0]) {
+    runner->set_extended_input_mode(true);
+  }
 }
 
 void LLM::generate(std::string input, std::shared_ptr<jsi::Function> callback) {
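The constructor now detects the exported model's prefill layout from the forward method's input shapes: rank-2 tokens plus rank-1 positions are required, and a positions length equal to the token count (rather than 1) selects the extended mode. A self-contained sketch of the same heuristic; the function name and the sample shapes are invented for illustration:

#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <vector>

// Mirrors the shape check in the LLM constructor above.
static bool isExtendedInputMode(const std::vector<int32_t> &tokensShape,
                                const std::vector<int32_t> &positionsShape) {
  if (tokensShape.size() != 2 || positionsShape.size() != 1) {
    throw std::runtime_error("Unsupported LLM input format");
  }
  return positionsShape[0] != 1 && tokensShape[1] == positionsShape[0];
}

int main() {
  std::cout << isExtendedInputMode({1, 128}, {128}) << '\n'; // 1: extended
  std::cout << isExtendedInputMode({1, 128}, {1}) << '\n';   // 0: standard
}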

packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h

Lines changed: 9 additions & 5 deletions
@@ -3,16 +3,16 @@
 #include <memory>
 #include <string>
 
-#include "rnexecutorch/metaprogramming/ConstructorHelpers.h"
 #include <ReactCommon/CallInvoker.h>
 #include <jsi/jsi.h>
+#include <rnexecutorch/models/BaseModel.h>
 #include <runner/runner.h>
 
 namespace rnexecutorch {
 namespace models::llm {
 using namespace facebook;
 
-class LLM {
+class LLM : public BaseModel {
 public:
   explicit LLM(const std::string &modelSource,
                const std::string &tokenizerSource,
@@ -27,12 +27,16 @@ class LLM {
   void setTimeInterval(size_t timeInterval);
 
 private:
-  size_t memorySizeLowerBound;
   std::unique_ptr<example::Runner> runner;
-  std::shared_ptr<react::CallInvoker> callInvoker;
+
+  // A typical input for parallel processing in an exported LLM model consists
+  // of two tensors of shapes [1, N] and [1], where N is the number of tokens.
+  // However, some exported models require inputs of shapes [1, N] and [N],
+  // which must be flagged before using the LLM runner.
+  bool extended_input_mode_ = false;
 };
 } // namespace models::llm
 
 REGISTER_CONSTRUCTOR(models::llm::LLM, std::string, std::string,
                      std::shared_ptr<react::CallInvoker>);
-} // namespace rnexecutorch
+} // namespace rnexecutorch
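To make the comment concrete: for a four-token prompt starting at position 0, the two layouts differ only in the positions input. A tiny illustration (token values are made up):

#include <cstdint>
#include <vector>

int main() {
  // Tokens are always a [1, N] batch; here N = 4.
  std::vector<int64_t> tokens = {101, 42, 7, 13};

  // Standard layout: a single [1] start position for the whole prompt.
  std::vector<int64_t> standard_positions = {0};

  // Extended layout: one position per token, i.e. an [N] tensor.
  std::vector<int64_t> extended_positions = {0, 1, 2, 3};
  return 0;
}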

packages/react-native-executorch/common/runner/runner.cpp

Lines changed: 19 additions & 22 deletions
@@ -47,27 +47,19 @@ static constexpr auto kUseKVCache = "use_kv_cache";
 static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
 } // namespace
 
-Runner::Runner(const std::string &model_path, const std::string &tokenizer_path,
-               const float temperature,
+Runner::Runner(Module *module, const std::string &model_path,
+               const std::string &tokenizer_path,
+               const bool extended_input_mode, const float temperature,
                std::optional<const std::string> data_path)
-    // NOTE: we observed ~2x loading performance increase on iPhone 15
-    // and a ~5% improvement on Galaxy S22 by switching to
-    // FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors.
-    : temperature_(temperature), tokenizer_path_(tokenizer_path),
-      metadata_({
-          {kEnableDynamicShape, false},
-          {kMaxSeqLen, 128},
-          {kMaxContextLen, 128},
-          {kUseKVCache, true},
-          {kUseSDPAWithKVCache, false},
-      }) {
-  if (data_path.has_value()) {
-    module_ = std::make_unique<Module>(model_path, data_path.value(),
-                                       Module::LoadMode::File);
-  } else {
-    module_ = std::make_unique<Module>(model_path, Module::LoadMode::File);
-  }
-  ET_LOG(Info, "Creating LLaMa runner: model_path=%s, tokenizer_path=%s",
+    : module_(module), temperature_(temperature),
+      tokenizer_path_(tokenizer_path), metadata_({
+          {kEnableDynamicShape, false},
+          {kMaxSeqLen, 128},
+          {kMaxContextLen, 128},
+          {kUseKVCache, true},
+          {kUseSDPAWithKVCache, false},
+      }) {
+  ET_LOG(Info, "Creating LLM runner: model_path=%s, tokenizer_path=%s",
          model_path.c_str(), tokenizer_path.c_str());
 }
 
@@ -116,7 +108,7 @@ Error Runner::load() {
     }
   }
   text_decoder_runner_ = std::make_unique<llm::TextDecoderRunner>(
-      module_.get(), metadata_.at(kUseKVCache), metadata_.at(kVocabSize),
+      module_, metadata_.at(kUseKVCache), metadata_.at(kVocabSize),
       temperature_);
   text_prefiller_ = std::make_unique<llm::TextPrefiller>(
       text_decoder_runner_.get(), metadata_.at(kUseKVCache),
@@ -206,7 +198,8 @@ Error Runner::generate(const std::string &prompt,
     wrapped_callback(prompt);
   }
   int64_t pos = 0;
-  auto prefill_res = text_prefiller_->prefill(prompt_tokens_uint64, pos);
+  auto prefill_res = text_prefiller_->prefill(prompt_tokens_uint64, pos,
+                                              extend_position_input_);
   stats_.first_token_ms = llm::time_in_ms();
   stats_.prompt_eval_end_ms = llm::time_in_ms();
   ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error());
@@ -269,6 +262,10 @@ void Runner::stop() {
   }
 }
 
+void Runner::set_extended_input_mode(bool extend_position_input) {
+  extend_position_input_ = extend_position_input;
+}
+
 void Runner::set_count_interval(size_t count_interval) {
   text_token_generator_->set_count_interval(count_interval);
 }

packages/react-native-executorch/common/runner/runner.h

Lines changed: 14 additions & 7 deletions
@@ -29,10 +29,13 @@ namespace example {
 
 class Runner : public executorch::extension::llm::IRunner {
 public:
-  explicit Runner(const std::string &model_path,
-                  const std::string &tokenizer_path,
-                  const float temperature = 0.8f,
-                  std::optional<const std::string> data_path = std::nullopt);
+  explicit Runner(
+      ::executorch::extension::Module *module,
+      const std::string &model_path, // TODO: consider removing this arg since
+                                     // it is only used for debug purposes
+      const std::string &tokenizer_path, const bool extended_input_mode = false,
+      const float temperature = 0.8f,
+      std::optional<const std::string> data_path = std::nullopt);
 
   bool is_loaded() const;
   ::executorch::runtime::Error load();
@@ -43,6 +46,7 @@ class Runner : public executorch::extension::llm::IRunner {
                       stats_callback = {},
                   bool echo = true, bool warming = false);
   ::executorch::runtime::Error warmup(const std::string &prompt);
+  void set_extended_input_mode(bool extend_position_input);
   void set_count_interval(size_t count_interval);
   void set_time_interval(size_t time_interval);
   void stop();
@@ -51,10 +55,13 @@ class Runner : public executorch::extension::llm::IRunner {
 
 private:
   float temperature_;
+  bool extend_position_input_{false};
   bool shouldStop_{false};
 
-  // model
-  std::unique_ptr<::executorch::extension::Module> module_;
+  // Main model
+  ::executorch::extension::Module *module_;
+
+  // Subcomponents
   std::string tokenizer_path_;
   std::unique_ptr<tokenizers::Tokenizer> tokenizer_;
   std::unordered_map<std::string, int64_t> metadata_;
@@ -65,4 +72,4 @@ class Runner : public executorch::extension::llm::IRunner {
       text_token_generator_;
 };
 
-} // namespace example
+} // namespace example
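Since the Runner now borrows the Module instead of owning it, the caller (in this repository, LLM via BaseModel) must keep the Module alive for the Runner's lifetime. A minimal usage sketch with placeholder paths, assuming runner.h from this diff and the standard ExecuTorch Module header:

#include <executorch/extension/module/module.h>
#include <runner/runner.h>

#include <memory>

int main() {
  using ::executorch::extension::Module;

  // Owned here; the Runner only stores a raw pointer, so `module`
  // must outlive `runner`.
  auto module = std::make_unique<Module>("/path/to/model.pte",
                                         Module::LoadMode::File);

  example::Runner runner(module.get(), "/path/to/model.pte",
                         "/path/to/tokenizer.bin");

  // Only for models exported with [1, N] tokens and [N] positions; in this
  // repo the LLM constructor derives this automatically from the exported
  // input shapes.
  runner.set_extended_input_mode(true);

  return runner.load() == ::executorch::runtime::Error::Ok ? 0 : 1;
}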

packages/react-native-executorch/common/runner/text_prefiller.cpp

Lines changed: 18 additions & 4 deletions
@@ -10,6 +10,7 @@
 // LLM.
 
 #include "text_prefiller.h"
+#include <numeric>
 
 namespace executorch {
 namespace extension {
@@ -21,8 +22,8 @@ TextPrefiller::TextPrefiller(TextDecoderRunner *text_decoder_runner,
       enable_parallel_prefill_(enable_parallel_prefill) {}
 
 ::executorch::runtime::Result<uint64_t>
-TextPrefiller::prefill(std::vector<uint64_t> &prompt_tokens,
-                       int64_t &start_pos) {
+TextPrefiller::prefill(std::vector<uint64_t> &prompt_tokens, int64_t &start_pos,
+                       bool extend_position_input) {
   ET_CHECK_MSG(!prompt_tokens.empty(), "Prompt cannot be null");
   if (!text_decoder_runner_->is_method_loaded()) {
     ET_CHECK_OK_OR_RETURN_ERROR(text_decoder_runner_->load());
@@ -38,8 +39,21 @@ TextPrefiller::prefill(std::vector<uint64_t> &prompt_tokens,
   auto tokens = from_blob(prompt_tokens.data(), {1, num_prompt_tokens},
                           executorch::aten::ScalarType::Long);
 
-  auto start_pos_tensor =
-      from_blob(&start_pos, {1}, executorch::aten::ScalarType::Long);
+  std::unique_ptr<std::vector<int64_t>> extended_start_pos = nullptr;
+  if (extend_position_input) {
+    extended_start_pos =
+        std::make_unique<std::vector<int64_t>>(num_prompt_tokens);
+
+    // Fill the starting positions with values from [start_pos, start_pos +
+    // num_prompt_tokens)
+    std::iota(extended_start_pos->begin(), extended_start_pos->end(),
+              start_pos);
+  }
+
+  auto start_pos_tensor = from_blob(
+      extend_position_input ? extended_start_pos->data() : &start_pos,
+      {extend_position_input ? num_prompt_tokens : 1},
+      executorch::aten::ScalarType::Long);
 
   auto outputs_res = text_decoder_runner_->step(tokens, start_pos_tensor);
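For the extended layout, the prefiller now materializes one position per prompt token with std::iota. A tiny worked example of the values it produces (start position and token count chosen for illustration):

#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  // Four prompt tokens starting at position 5: the standard layout would
  // pass the single scalar {5}; the extended layout passes one position
  // per token.
  int64_t start_pos = 5;
  std::vector<int64_t> extended_start_pos(4);
  std::iota(extended_start_pos.begin(), extended_start_pos.end(), start_pos);

  for (int64_t p : extended_start_pos) {
    std::cout << p << ' '; // prints: 5 6 7 8
  }
  std::cout << '\n';
}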
4559

packages/react-native-executorch/common/runner/text_prefiller.h

Lines changed: 2 additions & 1 deletion
@@ -30,7 +30,8 @@ class TextPrefiller {
    * @return The next token of the LLM Module after prefill.
    */
   ::executorch::runtime::Result<uint64_t>
-  prefill(std::vector<uint64_t> &prompt_tokens, int64_t &start_pos);
+  prefill(std::vector<uint64_t> &prompt_tokens, int64_t &start_pos,
+          bool extend_position_input = false);
 
 private:
   TextDecoderRunner *text_decoder_runner_;
