Commit a0b80e3

refactor: unify generate — Jinja renders prompt+<image> tokens in JS, C++ splits on placeholder
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent b398952 commit a0b80e3

5 files changed: 92 additions & 162 deletions
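The shape of the refactor: the Jinja chat template on the JS side renders the entire conversation into one string, emitting a literal <image> token wherever a message carries image content, and the C++ side only splits that string on the token and interleaves the images. A minimal sketch of the JS half, assuming a hypothetical renderChatTemplate() in place of the controller's real Jinja call; the <|im_start|> markers mirror the LFM2-VL constants this commit deletes from LLM.cpp:

type Part = { type: 'image' } | { type: 'text'; text: string };
type Message = { role: string; content: string | Part[]; mediaPath?: string };

// Stand-in for the Jinja chat template: image parts become the literal
// "<image>" placeholder, so it lands exactly where the turn's media belongs.
function renderChatTemplate(messages: Message[]): string {
  const body = (m: Message) =>
    typeof m.content === 'string'
      ? m.content
      : m.content.map((p) => (p.type === 'image' ? '<image>' : p.text)).join('');
  return (
    messages.map((m) => `<|im_start|>${m.role}\n${body(m)}<|im_end|>\n`).join('') +
    '<|im_start|>assistant\n'
  );
}

const history: Message[] = [
  {
    role: 'user',
    content: [{ type: 'image' }, { type: 'text', text: 'What is in this photo?' }],
    mediaPath: 'file:///tmp/cat.jpg', // hypothetical path
  },
];

// These two values are all that crosses the JSI boundary now:
const renderedPrompt = renderChatTemplate(history);
const imagePaths = history.filter((m) => m.mediaPath).map((m) => m.mediaPath!);
// nativeModule.generateMultimodal(renderedPrompt, imagePaths, onToken);

The upshot is that the chat markup lives in exactly one place: the C++ constants deleted below duplicated what the tokenizer's chat template already encodes.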

File tree

packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
packages/react-native-executorch/src/controllers/LLMController.ts

packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h

Lines changed: 0 additions & 30 deletions
@@ -228,36 +228,6 @@ getValue<std::vector<uint64_t>>(const jsi::Value &val, jsi::Runtime &runtime) {
   return getArrayAsVector<uint64_t>(val, runtime);
 }
 
-struct NativeMessage {
-  std::string role; // "user" | "assistant" | "system"
-  std::string content;
-  std::string mediaPath; // empty string if no media
-};
-
-template <>
-inline std::vector<NativeMessage>
-getValue<std::vector<NativeMessage>>(const jsi::Value &val,
-                                     jsi::Runtime &runtime) {
-  jsi::Array array = val.asObject(runtime).asArray(runtime);
-  size_t length = array.size(runtime);
-  std::vector<NativeMessage> result;
-  result.reserve(length);
-  for (size_t i = 0; i < length; ++i) {
-    jsi::Object obj = array.getValueAtIndex(runtime, i).asObject(runtime);
-    NativeMessage msg;
-    msg.role =
-        obj.getProperty(runtime, "role").getString(runtime).utf8(runtime);
-    msg.content =
-        obj.getProperty(runtime, "content").getString(runtime).utf8(runtime);
-    auto mediaProp = obj.getProperty(runtime, "mediaPath");
-    if (!mediaProp.isUndefined() && !mediaProp.isNull()) {
-      msg.mediaPath = mediaProp.getString(runtime).utf8(runtime);
-    }
-    result.push_back(std::move(msg));
-  }
-  return result;
-}
-
 // Template specializations for std::span<T> types
 template <>
 inline std::span<float> getValue<std::span<float>>(const jsi::Value &val,

packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h

Lines changed: 3 additions & 8 deletions
@@ -156,14 +156,9 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
     addFunctions(JSI_EXPORT_FUNCTION(
         ModelHostObject<Model>,
         promiseHostFunction<static_cast<std::string (Model::*)(
-            std::string, std::string, std::shared_ptr<jsi::Function>)>(
-            &Model::generate)>,
-        "generateWithImage"));
-
-    addFunctions(
-        JSI_EXPORT_FUNCTION(ModelHostObject<Model>,
-                            promiseHostFunction<&Model::generateMultimodal>,
-                            "generateMultimodal"));
+            std::string, std::vector<std::string>,
+            std::shared_ptr<jsi::Function>)>(&Model::generate)>,
+        "generateMultimodal"));
   }
 
   if constexpr (meta::SameAs<Model, models::text_to_image::TextToImage>) {
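From JS, this registration produces one promise-returning method per overload (the static_cast picks the multimodal overload of Model::generate). Sketched as a TypeScript interface; the interface name is illustrative, while the method names and parameter shapes come from this diff and LLMController.ts below:

interface LLMNativeModule {
  // Text-only: pre-rendered prompt string
  generate(prompt: string, onToken: (token: string) => void): Promise<string>;
  // Multimodal: rendered prompt with <image> placeholders + ordered image paths
  generateMultimodal(
    prompt: string,
    imagePaths: string[],
    onToken: (token: string) => void
  ): Promise<string>;
}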

packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp

Lines changed: 26 additions & 77 deletions
@@ -18,15 +18,6 @@ using executorch::runtime::Error;
 static constexpr int kImageSize = 512;
 static constexpr int kImageChannels = 3;
 
-// LFM2-VL chat template
-static constexpr const char *kChatPrefix = "<|startoftext|><|im_start|>user\n";
-static constexpr const char *kChatSuffix =
-    "<|im_end|>\n<|im_start|>assistant\n";
-// Separator inserted after each assistant turn in multi-turn conversations
-static constexpr const char *kAssistantTurnEnd = "<|im_end|>\n";
-// Prefix for subsequent user turns (no BOS token — only first turn has it)
-static constexpr const char *kUserTurnPrefix = "<|im_start|>user\n";
-
 static llm::Image loadImageForVLM(const std::string &imagePath) {
   cv::Mat mat = image_processing::readImage(imagePath);
   cv::resize(mat, mat, cv::Size(kImageSize, kImageSize));
@@ -106,7 +97,8 @@ std::string LLM::generate(std::string input,
   return output;
 }
 
-std::string LLM::generate(std::string imagePath, std::string prompt,
+std::string LLM::generate(std::string prompt,
+                          std::vector<std::string> imagePaths,
                           std::shared_ptr<jsi::Function> callback) {
   if (!runner_ || !runner_->is_loaded()) {
     throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
@@ -118,77 +110,34 @@ std::string LLM::generate(std::string imagePath, std::string prompt,
         "This is a text-only model. Call generate(prompt, cb).");
   }
 
-  llm::Image image = loadImageForVLM(imagePath);
-  std::vector<llm::MultimodalInput> inputs = {
-      llm::make_text_input(std::string(kChatPrefix)),
-      llm::make_image_input(std::move(image)),
-      llm::make_text_input(prompt + kChatSuffix),
-  };
-
-  std::string output;
-  auto nativeCallback = [this, &callback, &output](const std::string &token) {
-    output += token;
-    if (callback && callInvoker) {
-      callInvoker->invokeAsync([callback, token](jsi::Runtime &runtime) {
-        callback->call(runtime, jsi::String::createFromUtf8(runtime, token));
-      });
-    }
-  };
-
-  auto error =
-      runner_->generate(inputs, temperature_, topp_, -1, nativeCallback);
-  if (error != Error::Ok) {
-    throw RnExecutorchError(error, "Failed to generate multimodal response");
-  }
-
-  return output;
-}
-
-std::string LLM::generateMultimodal(
-    std::vector<rnexecutorch::jsi_conversion::NativeMessage> messages,
-    std::shared_ptr<jsi::Function> callback) {
-  if (!runner_ || !runner_->is_loaded()) {
-    throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
-                            "Runner is not loaded");
-  }
-  if (!multimodal_) {
-    throw RnExecutorchError(
-        RnExecutorchErrorCode::InvalidUserInput,
-        "This is a text-only model. Use generate(prompt, cb) instead.");
-  }
+  // Split rendered prompt on "<image>" placeholders and interleave with images.
+  static constexpr const char *kImageToken = "<image>";
+  static constexpr size_t kImageTokenLen = 7; // strlen("<image>")
 
   std::vector<llm::MultimodalInput> inputs;
-  bool isFirst = true;
-
-  for (const auto &msg : messages) {
-    if (msg.role == "system") {
-      // LFM2-VL has no dedicated system turn — skip silently, consistent
-      // with the single-turn generate(imagePath, prompt, cb) path.
-      continue;
-    }
-
-    if (msg.role == "user") {
-      if (isFirst) {
-        inputs.push_back(llm::make_text_input(std::string(kChatPrefix)));
-        isFirst = false;
-      } else {
-        inputs.push_back(llm::make_text_input(std::string(kUserTurnPrefix)));
-      }
-
-      if (!msg.mediaPath.empty()) {
-        const llm::Image &img = getOrLoadImage(msg.mediaPath);
-        inputs.push_back(llm::make_image_input(img));
+  size_t imageIdx = 0;
+  size_t searchPos = 0;
+
+  while (true) {
+    size_t found = prompt.find(kImageToken, searchPos);
+    if (found == std::string::npos) {
+      // Remaining text after last image (or entire prompt if no images)
+      if (searchPos < prompt.size()) {
+        inputs.push_back(llm::make_text_input(prompt.substr(searchPos)));
       }
-
-      if (!msg.content.empty()) {
-        inputs.push_back(llm::make_text_input(msg.content));
-      }
-
-      inputs.push_back(llm::make_text_input(std::string(kChatSuffix)));
-    } else if (msg.role == "assistant") {
-      inputs.push_back(llm::make_text_input(msg.content + kAssistantTurnEnd));
-      isFirst = false;
+      break;
+    }
+    // Text segment before this placeholder
+    if (found > searchPos) {
+      inputs.push_back(
+          llm::make_text_input(prompt.substr(searchPos, found - searchPos)));
+    }
+    // Image at this position
+    if (imageIdx < imagePaths.size()) {
+      const llm::Image &img = getOrLoadImage(imagePaths[imageIdx++]);
      inputs.push_back(llm::make_image_input(img));
     }
+    searchPos = found + kImageTokenLen;
   }
 
   if (inputs.empty()) {
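The splitting loop is easy to sanity-check in isolation. Below is a TypeScript transliteration of it, a sketch where Segment stands in for llm::MultimodalInput; it exercises the edge cases the C++ handles: leading/trailing text, adjacent placeholders, and more placeholders than supplied images:

type Segment = { kind: 'text'; text: string } | { kind: 'image'; path: string };

function splitOnImageToken(prompt: string, imagePaths: string[]): Segment[] {
  const IMAGE_TOKEN = '<image>';
  const inputs: Segment[] = [];
  let imageIdx = 0;
  let searchPos = 0;
  for (;;) {
    const found = prompt.indexOf(IMAGE_TOKEN, searchPos);
    if (found === -1) {
      // Remaining text after the last image (or the whole prompt if no images)
      if (searchPos < prompt.length) {
        inputs.push({ kind: 'text', text: prompt.slice(searchPos) });
      }
      break;
    }
    // Text segment before this placeholder
    if (found > searchPos) {
      inputs.push({ kind: 'text', text: prompt.slice(searchPos, found) });
    }
    // Placeholders beyond imagePaths.length are dropped, matching the C++
    if (imageIdx < imagePaths.length) {
      inputs.push({ kind: 'image', path: imagePaths[imageIdx++] });
    }
    searchPos = found + IMAGE_TOKEN.length;
  }
  return inputs;
}

console.log(splitOnImageToken('a<image>b<image><image>c', ['/1.png', '/2.png']));
// → text "a", image "/1.png", text "b", image "/2.png", text "c"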

packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h

Lines changed: 5 additions & 10 deletions
@@ -5,7 +5,6 @@
 
 #include <ReactCommon/CallInvoker.h>
 #include <jsi/jsi.h>
-#include <rnexecutorch/host_objects/JsiConversions.h>
 #include <rnexecutorch/models/BaseModel.h>
 #include <runner/image.h>
 #include <runner/unified_runner.h>
@@ -20,19 +19,15 @@ class LLM : public BaseModel {
       const std::string &tokenizerSource,
       std::shared_ptr<react::CallInvoker> callInvoker);
 
-  // Text-only generate (existing signature — used by LLMController)
-  std::string generate(std::string input,
+  // Text-only: pre-rendered prompt string
+  std::string generate(std::string prompt,
                        std::shared_ptr<jsi::Function> callback);
 
-  // Multimodal generate (image + text prompt)
-  std::string generate(std::string imagePath, std::string prompt,
+  // Multimodal: pre-rendered prompt string with <image> placeholders +
+  // ordered list of image paths (one per placeholder)
+  std::string generate(std::string prompt, std::vector<std::string> imagePaths,
                        std::shared_ptr<jsi::Function> callback);
 
-  // Multimodal generate — takes full message history, builds MultimodalInput[]
-  std::string generateMultimodal(
-      std::vector<rnexecutorch::jsi_conversion::NativeMessage> messages,
-      std::shared_ptr<jsi::Function> callback);
-
   void interrupt();
   void reset();
   void unload() noexcept;

packages/react-native-executorch/src/controllers/LLMController.ts

Lines changed: 58 additions & 37 deletions
@@ -211,7 +211,7 @@ export class LLMController {
     this.isGeneratingCallback(false);
   }
 
-  public async forward(input: string): Promise<string> {
+  public async forward(input: string, imagePaths?: string[]): Promise<string> {
     if (!this._isReady) {
       throw new RnExecutorchError(
         RnExecutorchErrorCode.ModuleNotLoaded,
@@ -227,7 +227,14 @@ export class LLMController {
     try {
      this.isGeneratingCallback(true);
      this.nativeModule.reset();
-      const response = await this.nativeModule.generate(input, this.onToken);
+      const response =
+        imagePaths && imagePaths.length > 0
+          ? await this.nativeModule.generateMultimodal(
+              input,
+              imagePaths,
+              this.onToken
+            )
+          : await this.nativeModule.generate(input, this.onToken);
       return this.filterSpecialTokens(response);
     } catch (e) {
       throw parseUnknownError(e);
@@ -317,42 +324,56 @@ export class LLMController {
 
     let response: string;
 
-    if (updatedHistory.some((m) => m.mediaPath)) {
-      // Any message in history has media — use multimodal path
-      const historyWithSystemPrompt = [
-        { content: this.chatConfig.systemPrompt, role: 'system' as const },
-        ...updatedHistory,
-      ];
-      try {
-        this.isGeneratingCallback(true);
-        response = await this.nativeModule.generateMultimodal(
-          historyWithSystemPrompt,
-          this.onToken
-        );
-      } catch (e) {
-        throw parseUnknownError(e);
-      } finally {
-        this.isGeneratingCallback(false);
-      }
+    const isMultimodal = updatedHistory.some((m) => m.mediaPath);
+
+    // For multimodal messages, convert mediaPath into structured content so
+    // the chat template emits <image> placeholders in the right position.
+    const historyForTemplate = isMultimodal
+      ? updatedHistory.map((m) =>
+          m.mediaPath
+            ? {
+                ...m,
+                content: [
+                  { type: 'image' },
+                  { type: 'text', text: m.content },
+                ] as any,
+              }
+            : m
+        )
+      : updatedHistory;
+
+    const countTokensCallback = (messages: Message[]) => {
+      const rendered = this.applyChatTemplate(
+        messages,
+        this.tokenizerConfig,
+        this.toolsConfig?.tools,
+        // eslint-disable-next-line camelcase
+        { tools_in_user_message: false, add_generation_prompt: true }
+      );
+      return this.nativeModule.countTextTokens(rendered);
+    };
+    const maxContextLength = this.nativeModule.getMaxContextLength();
+    const messageHistoryWithPrompt =
+      this.chatConfig.contextStrategy.buildContext(
+        this.chatConfig.systemPrompt,
+        historyForTemplate,
+        maxContextLength,
+        countTokensCallback
+      );
+
+    if (isMultimodal) {
+      const renderedPrompt = this.applyChatTemplate(
+        messageHistoryWithPrompt,
+        this.tokenizerConfig,
+        undefined,
+        // eslint-disable-next-line camelcase
+        { tools_in_user_message: false, add_generation_prompt: true }
+      );
+      const imagePaths = updatedHistory
+        .filter((m) => m.mediaPath)
+        .map((m) => m.mediaPath!);
+      response = await this.forward(renderedPrompt, imagePaths);
     } else {
-      const countTokensCallback = (messages: Message[]) => {
-        const rendered = this.applyChatTemplate(
-          messages,
-          this.tokenizerConfig,
-          this.toolsConfig?.tools,
-          // eslint-disable-next-line camelcase
-          { tools_in_user_message: false, add_generation_prompt: true }
-        );
-        return this.nativeModule.countTextTokens(rendered);
-      };
-      const maxContextLength = this.nativeModule.getMaxContextLength();
-      const messageHistoryWithPrompt =
-        this.chatConfig.contextStrategy.buildContext(
-          this.chatConfig.systemPrompt,
-          updatedHistory,
-          maxContextLength,
-          countTokensCallback
-        );
       response = await this.generate(
         messageHistoryWithPrompt,
         this.toolsConfig?.tools
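The mediaPath-to-structured-content mapping is what makes the template emit the placeholder ahead of that turn's text. A standalone sketch of just that step, assuming the simplified Message/Part shapes from the first example rather than the package's real types:

type Part = { type: 'image' } | { type: 'text'; text: string };
interface Message {
  role: string;
  content: string | Part[];
  mediaPath?: string;
}

// Messages with media get their string content rewritten as
// [{type:'image'}, {type:'text'}], so the Jinja template renders "<image>"
// immediately before that turn's text. Text-only histories pass through.
function toTemplateHistory(history: Message[]): Message[] {
  if (!history.some((m) => m.mediaPath)) return history;
  return history.map((m): Message => {
    if (!m.mediaPath || typeof m.content !== 'string') return m;
    const parts: Part[] = [{ type: 'image' }, { type: 'text', text: m.content }];
    return { ...m, content: parts };
  });
}

console.log(
  toTemplateHistory([
    { role: 'user', content: 'What breed is this?', mediaPath: 'file:///photos/dog.jpg' },
    { role: 'assistant', content: 'Looks like a corgi.' },
  ])
);

Note the ordering guarantee this relies on: imagePaths is built by filtering updatedHistory in order, so the nth <image> placeholder the template emits pairs with the nth path handed to the native splitter.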
