Commit a0b80e3

refactor: unify generate — Jinja renders prompt+<image> tokens in JS, C++ splits on placeholder
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent b398952 commit a0b80e3

5 files changed: 92 additions & 162 deletions
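The shape of the refactor: the Jinja chat template on the JS side renders the entire conversation into one string, emitting a literal <image> token wherever a message carries image content, and the C++ side only splits that string on the token and interleaves the images. A minimal sketch of the JS half, assuming a hypothetical renderChatTemplate() in place of the controller's real Jinja call; the <|im_start|> markers mirror the LFM2-VL constants this commit deletes from LLM.cpp:

type Part = { type: 'image' } | { type: 'text'; text: string };
type Message = { role: string; content: string | Part[]; mediaPath?: string };

// Stand-in for the Jinja chat template: image parts become the literal
// "<image>" placeholder, so it lands exactly where the turn's media belongs.
function renderChatTemplate(messages: Message[]): string {
  const body = (m: Message) =>
    typeof m.content === 'string'
      ? m.content
      : m.content.map((p) => (p.type === 'image' ? '<image>' : p.text)).join('');
  return (
    messages.map((m) => `<|im_start|>${m.role}\n${body(m)}<|im_end|>\n`).join('') +
    '<|im_start|>assistant\n'
  );
}

const history: Message[] = [
  {
    role: 'user',
    content: [{ type: 'image' }, { type: 'text', text: 'What is in this photo?' }],
    mediaPath: 'file:///tmp/cat.jpg', // hypothetical path
  },
];

// These two values are all that crosses the JSI boundary now:
const renderedPrompt = renderChatTemplate(history);
const imagePaths = history.filter((m) => m.mediaPath).map((m) => m.mediaPath!);
// nativeModule.generateMultimodal(renderedPrompt, imagePaths, onToken);

The upshot is that the chat markup lives in exactly one place: the C++ constants deleted below duplicated what the tokenizer's chat template already encodes.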

File tree

packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
packages/react-native-executorch/src/controllers/LLMController.ts

packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h

Lines changed: 0 additions & 30 deletions
@@ -228,36 +228,6 @@ getValue<std::vector<uint64_t>>(const jsi::Value &val, jsi::Runtime &runtime) {
   return getArrayAsVector<uint64_t>(val, runtime);
 }
 
-struct NativeMessage {
-  std::string role; // "user" | "assistant" | "system"
-  std::string content;
-  std::string mediaPath; // empty string if no media
-};
-
-template <>
-inline std::vector<NativeMessage>
-getValue<std::vector<NativeMessage>>(const jsi::Value &val,
-                                     jsi::Runtime &runtime) {
-  jsi::Array array = val.asObject(runtime).asArray(runtime);
-  size_t length = array.size(runtime);
-  std::vector<NativeMessage> result;
-  result.reserve(length);
-  for (size_t i = 0; i < length; ++i) {
-    jsi::Object obj = array.getValueAtIndex(runtime, i).asObject(runtime);
-    NativeMessage msg;
-    msg.role =
-        obj.getProperty(runtime, "role").getString(runtime).utf8(runtime);
-    msg.content =
-        obj.getProperty(runtime, "content").getString(runtime).utf8(runtime);
-    auto mediaProp = obj.getProperty(runtime, "mediaPath");
-    if (!mediaProp.isUndefined() && !mediaProp.isNull()) {
-      msg.mediaPath = mediaProp.getString(runtime).utf8(runtime);
-    }
-    result.push_back(std::move(msg));
-  }
-  return result;
-}
-
 // Template specializations for std::span<T> types
 template <>
 inline std::span<float> getValue<std::span<float>>(const jsi::Value &val,

packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h

Lines changed: 3 additions & 8 deletions
@@ -156,14 +156,9 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
     addFunctions(JSI_EXPORT_FUNCTION(
         ModelHostObject<Model>,
         promiseHostFunction<static_cast<std::string (Model::*)(
-            std::string, std::string, std::shared_ptr<jsi::Function>)>(
-            &Model::generate)>,
-        "generateWithImage"));
-
-    addFunctions(
-        JSI_EXPORT_FUNCTION(ModelHostObject<Model>,
-                            promiseHostFunction<&Model::generateMultimodal>,
-                            "generateMultimodal"));
+            std::string, std::vector<std::string>,
+            std::shared_ptr<jsi::Function>)>(&Model::generate)>,
+        "generateMultimodal"));
   }
 
   if constexpr (meta::SameAs<Model, models::text_to_image::TextToImage>) {
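From JS, this registration produces one promise-returning method per overload (the static_cast picks the multimodal overload of Model::generate). Sketched as a TypeScript interface; the interface name is illustrative, while the method names and parameter shapes come from this diff and LLMController.ts below:

interface LLMNativeModule {
  // Text-only: pre-rendered prompt string
  generate(prompt: string, onToken: (token: string) => void): Promise<string>;
  // Multimodal: rendered prompt with <image> placeholders + ordered image paths
  generateMultimodal(
    prompt: string,
    imagePaths: string[],
    onToken: (token: string) => void
  ): Promise<string>;
}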

packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp

Lines changed: 26 additions & 77 deletions
@@ -18,15 +18,6 @@ using executorch::runtime::Error;
 static constexpr int kImageSize = 512;
 static constexpr int kImageChannels = 3;
 
-// LFM2-VL chat template
-static constexpr const char *kChatPrefix = "<|startoftext|><|im_start|>user\n";
-static constexpr const char *kChatSuffix =
-    "<|im_end|>\n<|im_start|>assistant\n";
-// Separator inserted after each assistant turn in multi-turn conversations
-static constexpr const char *kAssistantTurnEnd = "<|im_end|>\n";
-// Prefix for subsequent user turns (no BOS token — only first turn has it)
-static constexpr const char *kUserTurnPrefix = "<|im_start|>user\n";
-
 static llm::Image loadImageForVLM(const std::string &imagePath) {
   cv::Mat mat = image_processing::readImage(imagePath);
   cv::resize(mat, mat, cv::Size(kImageSize, kImageSize));
@@ -106,7 +97,8 @@ std::string LLM::generate(std::string input,
   return output;
 }
 
-std::string LLM::generate(std::string imagePath, std::string prompt,
+std::string LLM::generate(std::string prompt,
+                          std::vector<std::string> imagePaths,
                           std::shared_ptr<jsi::Function> callback) {
   if (!runner_ || !runner_->is_loaded()) {
     throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
@@ -118,77 +110,34 @@ std::string LLM::generate(std::string imagePath, std::string prompt,
         "This is a text-only model. Call generate(prompt, cb).");
   }
 
-  llm::Image image = loadImageForVLM(imagePath);
-  std::vector<llm::MultimodalInput> inputs = {
-      llm::make_text_input(std::string(kChatPrefix)),
-      llm::make_image_input(std::move(image)),
-      llm::make_text_input(prompt + kChatSuffix),
-  };
-
-  std::string output;
-  auto nativeCallback = [this, &callback, &output](const std::string &token) {
-    output += token;
-    if (callback && callInvoker) {
-      callInvoker->invokeAsync([callback, token](jsi::Runtime &runtime) {
-        callback->call(runtime, jsi::String::createFromUtf8(runtime, token));
-      });
-    }
-  };
-
-  auto error =
-      runner_->generate(inputs, temperature_, topp_, -1, nativeCallback);
-  if (error != Error::Ok) {
-    throw RnExecutorchError(error, "Failed to generate multimodal response");
-  }
-
-  return output;
-}
-
-std::string LLM::generateMultimodal(
-    std::vector<rnexecutorch::jsi_conversion::NativeMessage> messages,
-    std::shared_ptr<jsi::Function> callback) {
-  if (!runner_ || !runner_->is_loaded()) {
-    throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
-                            "Runner is not loaded");
-  }
-  if (!multimodal_) {
-    throw RnExecutorchError(
-        RnExecutorchErrorCode::InvalidUserInput,
-        "This is a text-only model. Use generate(prompt, cb) instead.");
-  }
+  // Split rendered prompt on "<image>" placeholders and interleave with images.
+  static constexpr const char *kImageToken = "<image>";
+  static constexpr size_t kImageTokenLen = 7; // strlen("<image>")
 
   std::vector<llm::MultimodalInput> inputs;
-  bool isFirst = true;
-
-  for (const auto &msg : messages) {
-    if (msg.role == "system") {
-      // LFM2-VL has no dedicated system turn — skip silently, consistent
-      // with the single-turn generate(imagePath, prompt, cb) path.
-      continue;
-    }
-
-    if (msg.role == "user") {
-      if (isFirst) {
-        inputs.push_back(llm::make_text_input(std::string(kChatPrefix)));
-        isFirst = false;
-      } else {
-        inputs.push_back(llm::make_text_input(std::string(kUserTurnPrefix)));
-      }
-
-      if (!msg.mediaPath.empty()) {
-        const llm::Image &img = getOrLoadImage(msg.mediaPath);
-        inputs.push_back(llm::make_image_input(img));
+  size_t imageIdx = 0;
+  size_t searchPos = 0;
+
+  while (true) {
+    size_t found = prompt.find(kImageToken, searchPos);
+    if (found == std::string::npos) {
+      // Remaining text after last image (or entire prompt if no images)
+      if (searchPos < prompt.size()) {
+        inputs.push_back(llm::make_text_input(prompt.substr(searchPos)));
       }
-
-      if (!msg.content.empty()) {
-        inputs.push_back(llm::make_text_input(msg.content));
-      }
-
-      inputs.push_back(llm::make_text_input(std::string(kChatSuffix)));
-    } else if (msg.role == "assistant") {
-      inputs.push_back(llm::make_text_input(msg.content + kAssistantTurnEnd));
-      isFirst = false;
+      break;
+    }
+    // Text segment before this placeholder
+    if (found > searchPos) {
+      inputs.push_back(
+          llm::make_text_input(prompt.substr(searchPos, found - searchPos)));
+    }
+    // Image at this position
+    if (imageIdx < imagePaths.size()) {
+      const llm::Image &img = getOrLoadImage(imagePaths[imageIdx++]);
      inputs.push_back(llm::make_image_input(img));
     }
+    searchPos = found + kImageTokenLen;
   }
 
   if (inputs.empty()) {
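The splitting loop is easy to sanity-check in isolation. Below is a TypeScript transliteration of it, a sketch where Segment stands in for llm::MultimodalInput; it exercises the edge cases the C++ handles: leading/trailing text, adjacent placeholders, and more placeholders than supplied images:

type Segment = { kind: 'text'; text: string } | { kind: 'image'; path: string };

function splitOnImageToken(prompt: string, imagePaths: string[]): Segment[] {
  const IMAGE_TOKEN = '<image>';
  const inputs: Segment[] = [];
  let imageIdx = 0;
  let searchPos = 0;
  for (;;) {
    const found = prompt.indexOf(IMAGE_TOKEN, searchPos);
    if (found === -1) {
      // Remaining text after the last image (or the whole prompt if no images)
      if (searchPos < prompt.length) {
        inputs.push({ kind: 'text', text: prompt.slice(searchPos) });
      }
      break;
    }
    // Text segment before this placeholder
    if (found > searchPos) {
      inputs.push({ kind: 'text', text: prompt.slice(searchPos, found) });
    }
    // Placeholders beyond imagePaths.length are dropped, matching the C++
    if (imageIdx < imagePaths.length) {
      inputs.push({ kind: 'image', path: imagePaths[imageIdx++] });
    }
    searchPos = found + IMAGE_TOKEN.length;
  }
  return inputs;
}

console.log(splitOnImageToken('a<image>b<image><image>c', ['/1.png', '/2.png']));
// → text "a", image "/1.png", text "b", image "/2.png", text "c"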

packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h

Lines changed: 5 additions & 10 deletions
@@ -5,7 +5,6 @@
 
 #include <ReactCommon/CallInvoker.h>
 #include <jsi/jsi.h>
-#include <rnexecutorch/host_objects/JsiConversions.h>
 #include <rnexecutorch/models/BaseModel.h>
 #include <runner/image.h>
 #include <runner/unified_runner.h>
@@ -20,19 +19,15 @@ class LLM : public BaseModel {
       const std::string &tokenizerSource,
       std::shared_ptr<react::CallInvoker> callInvoker);
 
-  // Text-only generate (existing signature — used by LLMController)
-  std::string generate(std::string input,
+  // Text-only: pre-rendered prompt string
+  std::string generate(std::string prompt,
                        std::shared_ptr<jsi::Function> callback);
 
-  // Multimodal generate (image + text prompt)
-  std::string generate(std::string imagePath, std::string prompt,
+  // Multimodal: pre-rendered prompt string with <image> placeholders +
+  // ordered list of image paths (one per placeholder)
+  std::string generate(std::string prompt, std::vector<std::string> imagePaths,
                        std::shared_ptr<jsi::Function> callback);
 
-  // Multimodal generate — takes full message history, builds MultimodalInput[]
-  std::string generateMultimodal(
-      std::vector<rnexecutorch::jsi_conversion::NativeMessage> messages,
-      std::shared_ptr<jsi::Function> callback);
-
   void interrupt();
   void reset();
   void unload() noexcept;

packages/react-native-executorch/src/controllers/LLMController.ts

Lines changed: 58 additions & 37 deletions
@@ -211,7 +211,7 @@ export class LLMController {
     this.isGeneratingCallback(false);
   }
 
-  public async forward(input: string): Promise<string> {
+  public async forward(input: string, imagePaths?: string[]): Promise<string> {
     if (!this._isReady) {
       throw new RnExecutorchError(
         RnExecutorchErrorCode.ModuleNotLoaded,
@@ -227,7 +227,14 @@ export class LLMController {
     try {
      this.isGeneratingCallback(true);
      this.nativeModule.reset();
-      const response = await this.nativeModule.generate(input, this.onToken);
+      const response =
+        imagePaths && imagePaths.length > 0
+          ? await this.nativeModule.generateMultimodal(
+              input,
+              imagePaths,
+              this.onToken
+            )
+          : await this.nativeModule.generate(input, this.onToken);
       return this.filterSpecialTokens(response);
     } catch (e) {
       throw parseUnknownError(e);
@@ -317,42 +324,56 @@ export class LLMController {
 
     let response: string;
 
-    if (updatedHistory.some((m) => m.mediaPath)) {
-      // Any message in history has media — use multimodal path
-      const historyWithSystemPrompt = [
-        { content: this.chatConfig.systemPrompt, role: 'system' as const },
-        ...updatedHistory,
-      ];
-      try {
-        this.isGeneratingCallback(true);
-        response = await this.nativeModule.generateMultimodal(
-          historyWithSystemPrompt,
-          this.onToken
-        );
-      } catch (e) {
-        throw parseUnknownError(e);
-      } finally {
-        this.isGeneratingCallback(false);
-      }
+    const isMultimodal = updatedHistory.some((m) => m.mediaPath);
+
+    // For multimodal messages, convert mediaPath into structured content so
+    // the chat template emits <image> placeholders in the right position.
+    const historyForTemplate = isMultimodal
+      ? updatedHistory.map((m) =>
+          m.mediaPath
+            ? {
+                ...m,
+                content: [
+                  { type: 'image' },
+                  { type: 'text', text: m.content },
+                ] as any,
+              }
+            : m
+        )
+      : updatedHistory;
+
+    const countTokensCallback = (messages: Message[]) => {
+      const rendered = this.applyChatTemplate(
+        messages,
+        this.tokenizerConfig,
+        this.toolsConfig?.tools,
+        // eslint-disable-next-line camelcase
+        { tools_in_user_message: false, add_generation_prompt: true }
+      );
+      return this.nativeModule.countTextTokens(rendered);
+    };
+    const maxContextLength = this.nativeModule.getMaxContextLength();
+    const messageHistoryWithPrompt =
+      this.chatConfig.contextStrategy.buildContext(
+        this.chatConfig.systemPrompt,
+        historyForTemplate,
+        maxContextLength,
+        countTokensCallback
+      );
+
+    if (isMultimodal) {
+      const renderedPrompt = this.applyChatTemplate(
+        messageHistoryWithPrompt,
+        this.tokenizerConfig,
+        undefined,
+        // eslint-disable-next-line camelcase
+        { tools_in_user_message: false, add_generation_prompt: true }
+      );
+      const imagePaths = updatedHistory
+        .filter((m) => m.mediaPath)
+        .map((m) => m.mediaPath!);
+      response = await this.forward(renderedPrompt, imagePaths);
     } else {
-      const countTokensCallback = (messages: Message[]) => {
-        const rendered = this.applyChatTemplate(
-          messages,
-          this.tokenizerConfig,
-          this.toolsConfig?.tools,
-          // eslint-disable-next-line camelcase
-          { tools_in_user_message: false, add_generation_prompt: true }
-        );
-        return this.nativeModule.countTextTokens(rendered);
-      };
-      const maxContextLength = this.nativeModule.getMaxContextLength();
-      const messageHistoryWithPrompt =
-        this.chatConfig.contextStrategy.buildContext(
-          this.chatConfig.systemPrompt,
-          updatedHistory,
-          maxContextLength,
-          countTokensCallback
-        );
       response = await this.generate(
         messageHistoryWithPrompt,
         this.toolsConfig?.tools
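The mediaPath-to-structured-content mapping is what makes the template emit the placeholder ahead of that turn's text. A standalone sketch of just that step, assuming the simplified Message/Part shapes from the first example rather than the package's real types:

type Part = { type: 'image' } | { type: 'text'; text: string };
interface Message {
  role: string;
  content: string | Part[];
  mediaPath?: string;
}

// Messages with media get their string content rewritten as
// [{type:'image'}, {type:'text'}], so the Jinja template renders "<image>"
// immediately before that turn's text. Text-only histories pass through.
function toTemplateHistory(history: Message[]): Message[] {
  if (!history.some((m) => m.mediaPath)) return history;
  return history.map((m): Message => {
    if (!m.mediaPath || typeof m.content !== 'string') return m;
    const parts: Part[] = [{ type: 'image' }, { type: 'text', text: m.content }];
    return { ...m, content: parts };
  });
}

console.log(
  toTemplateHistory([
    { role: 'user', content: 'What breed is this?', mediaPath: 'file:///photos/dog.jpg' },
    { role: 'assistant', content: 'Looks like a corgi.' },
  ])
);

Note the ordering guarantee this relies on: imagePaths is built by filtering updatedHistory in order, so the nth <image> placeholder the template emits pairs with the nth path handed to the native splitter.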
