Skip to content

Commit 1413826

Browse files
fix: address PR review comments for VLM support
1 parent 31e7a00 commit 1413826

14 files changed

Lines changed: 62 additions & 62 deletions

File tree

apps/llm/app.json

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,7 @@
5555
},
5656
"entitlements": {
5757
"com.apple.developer.kernel.increased-memory-limit": true
58-
},
59-
"appleTeamId": "B357MU264T"
58+
}
6059
},
6160
"android": {
6261
"adaptiveIcon": {

packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
4545
"getInputShape"));
4646
}
4747

48-
// LLM has overloaded generate — handled explicitly in the LLM block below
48+
// LLM::generate and LLM::generateMultimodal registered explicitly below
4949
if constexpr (meta::HasGenerate<Model> &&
5050
!meta::SameAs<Model, models::llm::LLM>) {
5151
addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject<Model>,
@@ -100,11 +100,9 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
100100
}
101101

102102
if constexpr (meta::SameAs<Model, models::llm::LLM>) {
103-
addFunctions(JSI_EXPORT_FUNCTION(
104-
ModelHostObject<Model>,
105-
promiseHostFunction<static_cast<std::string (Model::*)(
106-
std::string, std::shared_ptr<jsi::Function>)>(&Model::generate)>,
107-
"generate"));
103+
addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject<Model>,
104+
promiseHostFunction<&Model::generate>,
105+
"generate"));
108106

109107
addFunctions(JSI_EXPORT_FUNCTION(
110108
ModelHostObject<Model>, synchronousHostFunction<&Model::interrupt>,
@@ -153,12 +151,10 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
153151
synchronousHostFunction<&Model::reset>,
154152
"reset"));
155153

156-
addFunctions(JSI_EXPORT_FUNCTION(
157-
ModelHostObject<Model>,
158-
promiseHostFunction<static_cast<std::string (Model::*)(
159-
std::string, std::vector<std::string>, std::string,
160-
std::shared_ptr<jsi::Function>)>(&Model::generate)>,
161-
"generateMultimodal"));
154+
addFunctions(
155+
JSI_EXPORT_FUNCTION(ModelHostObject<Model>,
156+
promiseHostFunction<&Model::generateMultimodal>,
157+
"generateMultimodal"));
162158

163159
addFunctions(JSI_EXPORT_FUNCTION(
164160
ModelHostObject<Model>,

packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include <filesystem>
55
#include <map>
66
#include <rnexecutorch/Error.h>
7+
#include <rnexecutorch/Log.h>
78
#include <rnexecutorch/threads/GlobalThreadPool.h>
89
#include <runner/encoders/vision_encoder.h>
910
#include <runner/multimodal_runner.h>
@@ -29,7 +30,7 @@ LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource,
2930
for (const auto &cap : capabilities) {
3031
if (cap == "vision") {
3132
encoders[llm::MultimodalType::Image] =
32-
std::make_unique<llm::VisionEncoder>(module_.get());
33+
std::make_unique<llm::VisionEncoder>(*module_);
3334
}
3435
}
3536
runner_ = std::make_unique<llm::MultimodalRunner>(
@@ -69,21 +70,25 @@ std::string LLM::generate(std::string input,
6970
return output;
7071
}
7172

72-
std::string LLM::generate(std::string prompt,
73-
std::vector<std::string> imagePaths,
74-
std::string imageToken,
75-
std::shared_ptr<jsi::Function> callback) {
73+
std::string LLM::generateMultimodal(std::string prompt,
74+
std::vector<std::string> imagePaths,
75+
std::string imageToken,
76+
std::shared_ptr<jsi::Function> callback) {
7677
if (!runner_ || !runner_->is_loaded()) {
7778
throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
7879
"Runner is not loaded");
7980
}
8081
if (!runner_->is_multimodal()) {
8182
throw RnExecutorchError(
8283
RnExecutorchErrorCode::InvalidUserInput,
83-
"This is a text-only model. Call generate(prompt, cb).");
84+
"This model does not support multimodal input. Use generate(prompt, "
85+
"callback) for text-only generation.");
8486
}
8587
if (imageToken.empty()) {
86-
imageToken = "<image>";
88+
throw RnExecutorchError(
89+
RnExecutorchErrorCode::InvalidUserInput,
90+
"imageToken must not be empty. Pass the model's image token (e.g. "
91+
"from tokenizer_config.json).");
8792
}
8893

8994
const size_t kImageTokenLen = imageToken.size();
@@ -109,12 +114,19 @@ std::string LLM::generate(std::string prompt,
109114
if (imageIdx >= imagePaths.size()) {
110115
throw RnExecutorchError(
111116
RnExecutorchErrorCode::InvalidUserInput,
112-
"More <image> placeholders in prompt than image paths provided");
117+
"More '" + imageToken +
118+
"' placeholders in prompt than image paths provided");
113119
}
114120
inputs.push_back(llm::make_image_input(imagePaths[imageIdx++]));
115121
searchPos = found + kImageTokenLen;
116122
}
117123

124+
if (imageIdx < imagePaths.size()) {
125+
throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
126+
"More image paths provided than '" + imageToken +
127+
"' placeholders in prompt");
128+
}
129+
118130
if (inputs.empty()) {
119131
throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
120132
"No inputs to generate from");

packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,10 @@ class LLM : public BaseModel {
2222

2323
std::string generate(std::string prompt,
2424
std::shared_ptr<jsi::Function> callback);
25-
std::string generate(std::string prompt, std::vector<std::string> imagePaths,
26-
std::string imageToken,
27-
std::shared_ptr<jsi::Function> callback);
25+
std::string generateMultimodal(std::string prompt,
26+
std::vector<std::string> imagePaths,
27+
std::string imageToken,
28+
std::shared_ptr<jsi::Function> callback);
2829

2930
void interrupt();
3031
void reset();

packages/react-native-executorch/common/rnexecutorch/threads/GlobalThreadPool.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,6 @@ class GlobalThreadPool {
3535
}
3636

3737
numThreads = std::max(numThreads.value(), 2u);
38-
log(rnexecutorch::LOG_LEVEL::Info, "Initializing global thread pool with",
39-
numThreads, "threads");
4038
instance = std::make_unique<HighPerformanceThreadPool>(numThreads.value(),
4139
config);
4240
// Disable OpenCV's internal threading to prevent it from overriding our

packages/react-native-executorch/common/runner/base_llm_runner.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ Error BaseLLMRunner::load() {
3131
if (status != tokenizers::Error::Ok) {
3232
throw rnexecutorch::RnExecutorchError(
3333
rnexecutorch::RnExecutorchErrorCode::TokenizerError,
34-
"Unexpected issue occurred while loading tokenizer");
34+
"Unexpected issue occurred while loading tokenizer (error code: " +
35+
std::to_string(static_cast<int>(status)) + ")");
3536
}
3637

3738
const auto method_names =
@@ -46,8 +47,6 @@ Error BaseLLMRunner::load() {
4647
.toScalar()
4748
.to<decltype(metadata_)::mapped_type>();
4849
}
49-
rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
50-
"[BaseLLMRunner] Metadata:", method_name, "=", value);
5150
}
5251

5352
if (config_.max_seq_len < 0)

packages/react-native-executorch/common/runner/encoders/iencoder.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ class IEncoder {
1212
public:
1313
virtual ~IEncoder() = default;
1414
virtual ::executorch::runtime::Error load() = 0;
15-
virtual bool is_loaded() const = 0;
15+
virtual bool is_loaded() const noexcept = 0;
1616

1717
virtual ::executorch::runtime::Result<::executorch::runtime::EValue>
1818
encode(const MultimodalInput &input) = 0;

packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ using ::executorch::runtime::Error;
1515
using ::executorch::runtime::EValue;
1616
using ::executorch::runtime::Result;
1717

18-
VisionEncoder::VisionEncoder(::executorch::extension::Module *module)
19-
: module_(module) {}
18+
VisionEncoder::VisionEncoder(::executorch::extension::Module &module)
19+
: module_(&module) {}
2020

2121
Error VisionEncoder::load() {
2222
if (is_loaded()) {
@@ -33,16 +33,14 @@ Error VisionEncoder::load() {
3333
"Model does not support vision: 'vision_encoder' method not found. "
3434
"Check that the .pte file matches the declared capabilities.");
3535
}
36-
rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
37-
"[VisionEncoder] Loading method:", kVisionEncoderMethod);
3836
return module_->load_method(kVisionEncoderMethod);
3937
}
4038

41-
bool VisionEncoder::is_loaded() const {
39+
bool VisionEncoder::is_loaded() const noexcept {
4240
return module_->is_method_loaded(kVisionEncoderMethod);
4341
}
4442

45-
int32_t VisionEncoder::encoderTokenCount() const {
43+
int32_t VisionEncoder::encoderTokenCount() const noexcept {
4644
if (!is_loaded()) {
4745
return 0;
4846
}
@@ -78,16 +76,17 @@ Result<VisionEncoder::ImageShape> VisionEncoder::getInputShape() const {
7876

7977
std::vector<float>
8078
VisionEncoder::preprocessImage(const std::string &path,
81-
const ImageShape &shape) const {
79+
const ImageShape &targetShape) const {
8280
cv::Mat mat = rnexecutorch::image_processing::readImage(path);
83-
cv::resize(mat, mat, cv::Size(shape.width, shape.height));
81+
cv::resize(mat, mat, cv::Size(targetShape.width, targetShape.height));
8482
cv::cvtColor(mat, mat, cv::COLOR_BGR2RGB);
8583

86-
const int32_t pixelCount = shape.height * shape.width;
87-
std::vector<float> chw(shape.channels * pixelCount);
84+
const int32_t pixelCount = targetShape.height * targetShape.width;
85+
std::vector<float> chw(targetShape.channels * pixelCount);
8886
for (int32_t i = 0; i < pixelCount; ++i) {
89-
cv::Vec3b px = mat.at<cv::Vec3b>(i / shape.width, i % shape.width);
90-
for (int32_t c = 0; c < shape.channels; ++c) {
87+
cv::Vec3b px =
88+
mat.at<cv::Vec3b>(i / targetShape.width, i % targetShape.width);
89+
for (int32_t c = 0; c < targetShape.channels; ++c) {
9190
chw[c * pixelCount + i] = static_cast<float>(px[c]);
9291
}
9392
}
@@ -122,7 +121,7 @@ Result<EValue> VisionEncoder::encode(const MultimodalInput &input) {
122121
chw.data(), sizes, ::executorch::aten::ScalarType::Float);
123122

124123
auto result = ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor));
125-
EValue embedding = result[0];
124+
auto embedding = result[0];
126125
embedding_cache_.emplace(path, embedding);
127126
return embedding;
128127
}

packages/react-native-executorch/common/runner/encoders/vision_encoder.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,13 @@ namespace executorch::extension::llm {
1212

1313
class VisionEncoder : public IEncoder {
1414
public:
15-
explicit VisionEncoder(::executorch::extension::Module *module);
15+
explicit VisionEncoder(::executorch::extension::Module &module);
1616

1717
::executorch::runtime::Error load() override;
18-
bool is_loaded() const override;
18+
bool is_loaded() const noexcept override;
1919
::executorch::runtime::Result<::executorch::runtime::EValue>
2020
encode(const MultimodalInput &input) override;
21-
int32_t encoderTokenCount() const override;
21+
int32_t encoderTokenCount() const noexcept override;
2222

2323
private:
2424
struct ImageShape {
@@ -28,7 +28,7 @@ class VisionEncoder : public IEncoder {
2828

2929
::executorch::runtime::Result<ImageShape> getInputShape() const;
3030
std::vector<float> preprocessImage(const std::string &path,
31-
const ImageShape &shape) const;
31+
const ImageShape &targetShape) const;
3232

3333
::executorch::extension::Module *module_;
3434
std::unordered_map<std::string, ::executorch::runtime::EValue>

packages/react-native-executorch/common/runner/multimodal_runner.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,6 @@ bool MultimodalRunner::is_loaded() const {
3838
}
3939

4040
Error MultimodalRunner::load_subcomponents() {
41-
rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, "[MultimodalRunner] Loading",
42-
encoders_.size(), "encoder(s)");
4341
for (auto &[type, encoder] : encoders_) {
4442
ET_CHECK_OK_OR_RETURN_ERROR(encoder->load());
4543
}

0 commit comments

Comments (0)