Skip to content

Commit 1413826

Browse files
fix: address PR review comments for VLM support
1 parent 31e7a00 commit 1413826

14 files changed

Lines changed: 62 additions & 62 deletions

File tree

apps/llm/app.json

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,7 @@
5555
},
5656
"entitlements": {
5757
"com.apple.developer.kernel.increased-memory-limit": true
58-
},
59-
"appleTeamId": "B357MU264T"
58+
}
6059
},
6160
"android": {
6261
"adaptiveIcon": {

packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
4545
"getInputShape"));
4646
}
4747

48-
// LLM has overloaded generate — handled explicitly in the LLM block below
48+
// LLM::generate and LLM::generateMultimodal registered explicitly below
4949
if constexpr (meta::HasGenerate<Model> &&
5050
!meta::SameAs<Model, models::llm::LLM>) {
5151
addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject<Model>,
@@ -100,11 +100,9 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
100100
}
101101

102102
if constexpr (meta::SameAs<Model, models::llm::LLM>) {
103-
addFunctions(JSI_EXPORT_FUNCTION(
104-
ModelHostObject<Model>,
105-
promiseHostFunction<static_cast<std::string (Model::*)(
106-
std::string, std::shared_ptr<jsi::Function>)>(&Model::generate)>,
107-
"generate"));
103+
addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject<Model>,
104+
promiseHostFunction<&Model::generate>,
105+
"generate"));
108106

109107
addFunctions(JSI_EXPORT_FUNCTION(
110108
ModelHostObject<Model>, synchronousHostFunction<&Model::interrupt>,
@@ -153,12 +151,10 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
153151
synchronousHostFunction<&Model::reset>,
154152
"reset"));
155153

156-
addFunctions(JSI_EXPORT_FUNCTION(
157-
ModelHostObject<Model>,
158-
promiseHostFunction<static_cast<std::string (Model::*)(
159-
std::string, std::vector<std::string>, std::string,
160-
std::shared_ptr<jsi::Function>)>(&Model::generate)>,
161-
"generateMultimodal"));
154+
addFunctions(
155+
JSI_EXPORT_FUNCTION(ModelHostObject<Model>,
156+
promiseHostFunction<&Model::generateMultimodal>,
157+
"generateMultimodal"));
162158

163159
addFunctions(JSI_EXPORT_FUNCTION(
164160
ModelHostObject<Model>,

packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include <filesystem>
55
#include <map>
66
#include <rnexecutorch/Error.h>
7+
#include <rnexecutorch/Log.h>
78
#include <rnexecutorch/threads/GlobalThreadPool.h>
89
#include <runner/encoders/vision_encoder.h>
910
#include <runner/multimodal_runner.h>
@@ -29,7 +30,7 @@ LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource,
2930
for (const auto &cap : capabilities) {
3031
if (cap == "vision") {
3132
encoders[llm::MultimodalType::Image] =
32-
std::make_unique<llm::VisionEncoder>(module_.get());
33+
std::make_unique<llm::VisionEncoder>(*module_);
3334
}
3435
}
3536
runner_ = std::make_unique<llm::MultimodalRunner>(
@@ -69,21 +70,25 @@ std::string LLM::generate(std::string input,
6970
return output;
7071
}
7172

72-
std::string LLM::generate(std::string prompt,
73-
std::vector<std::string> imagePaths,
74-
std::string imageToken,
75-
std::shared_ptr<jsi::Function> callback) {
73+
std::string LLM::generateMultimodal(std::string prompt,
74+
std::vector<std::string> imagePaths,
75+
std::string imageToken,
76+
std::shared_ptr<jsi::Function> callback) {
7677
if (!runner_ || !runner_->is_loaded()) {
7778
throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
7879
"Runner is not loaded");
7980
}
8081
if (!runner_->is_multimodal()) {
8182
throw RnExecutorchError(
8283
RnExecutorchErrorCode::InvalidUserInput,
83-
"This is a text-only model. Call generate(prompt, cb).");
84+
"This model does not support multimodal input. Use generate(prompt, "
85+
"callback) for text-only generation.");
8486
}
8587
if (imageToken.empty()) {
86-
imageToken = "<image>";
88+
throw RnExecutorchError(
89+
RnExecutorchErrorCode::InvalidUserInput,
90+
"imageToken must not be empty. Pass the model's image token (e.g. "
91+
"from tokenizer_config.json).");
8792
}
8893

8994
const size_t kImageTokenLen = imageToken.size();
@@ -109,12 +114,19 @@ std::string LLM::generate(std::string prompt,
109114
if (imageIdx >= imagePaths.size()) {
110115
throw RnExecutorchError(
111116
RnExecutorchErrorCode::InvalidUserInput,
112-
"More <image> placeholders in prompt than image paths provided");
117+
"More '" + imageToken +
118+
"' placeholders in prompt than image paths provided");
113119
}
114120
inputs.push_back(llm::make_image_input(imagePaths[imageIdx++]));
115121
searchPos = found + kImageTokenLen;
116122
}
117123

124+
if (imageIdx < imagePaths.size()) {
125+
throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
126+
"More image paths provided than '" + imageToken +
127+
"' placeholders in prompt");
128+
}
129+
118130
if (inputs.empty()) {
119131
throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
120132
"No inputs to generate from");

packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,10 @@ class LLM : public BaseModel {
2222

2323
std::string generate(std::string prompt,
2424
std::shared_ptr<jsi::Function> callback);
25-
std::string generate(std::string prompt, std::vector<std::string> imagePaths,
26-
std::string imageToken,
27-
std::shared_ptr<jsi::Function> callback);
25+
std::string generateMultimodal(std::string prompt,
26+
std::vector<std::string> imagePaths,
27+
std::string imageToken,
28+
std::shared_ptr<jsi::Function> callback);
2829

2930
void interrupt();
3031
void reset();

packages/react-native-executorch/common/rnexecutorch/threads/GlobalThreadPool.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,6 @@ class GlobalThreadPool {
3535
}
3636

3737
numThreads = std::max(numThreads.value(), 2u);
38-
log(rnexecutorch::LOG_LEVEL::Info, "Initializing global thread pool with",
39-
numThreads, "threads");
4038
instance = std::make_unique<HighPerformanceThreadPool>(numThreads.value(),
4139
config);
4240
// Disable OpenCV's internal threading to prevent it from overriding our

packages/react-native-executorch/common/runner/base_llm_runner.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ Error BaseLLMRunner::load() {
3131
if (status != tokenizers::Error::Ok) {
3232
throw rnexecutorch::RnExecutorchError(
3333
rnexecutorch::RnExecutorchErrorCode::TokenizerError,
34-
"Unexpected issue occurred while loading tokenizer");
34+
"Unexpected issue occurred while loading tokenizer (error code: " +
35+
std::to_string(static_cast<int>(status)) + ")");
3536
}
3637

3738
const auto method_names =
@@ -46,8 +47,6 @@ Error BaseLLMRunner::load() {
4647
.toScalar()
4748
.to<decltype(metadata_)::mapped_type>();
4849
}
49-
rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
50-
"[BaseLLMRunner] Metadata:", method_name, "=", value);
5150
}
5251

5352
if (config_.max_seq_len < 0)

packages/react-native-executorch/common/runner/encoders/iencoder.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ class IEncoder {
1212
public:
1313
virtual ~IEncoder() = default;
1414
virtual ::executorch::runtime::Error load() = 0;
15-
virtual bool is_loaded() const = 0;
15+
virtual bool is_loaded() const noexcept = 0;
1616

1717
virtual ::executorch::runtime::Result<::executorch::runtime::EValue>
1818
encode(const MultimodalInput &input) = 0;

packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ using ::executorch::runtime::Error;
1515
using ::executorch::runtime::EValue;
1616
using ::executorch::runtime::Result;
1717

18-
VisionEncoder::VisionEncoder(::executorch::extension::Module *module)
19-
: module_(module) {}
18+
VisionEncoder::VisionEncoder(::executorch::extension::Module &module)
19+
: module_(&module) {}
2020

2121
Error VisionEncoder::load() {
2222
if (is_loaded()) {
@@ -33,16 +33,14 @@ Error VisionEncoder::load() {
3333
"Model does not support vision: 'vision_encoder' method not found. "
3434
"Check that the .pte file matches the declared capabilities.");
3535
}
36-
rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
37-
"[VisionEncoder] Loading method:", kVisionEncoderMethod);
3836
return module_->load_method(kVisionEncoderMethod);
3937
}
4038

41-
bool VisionEncoder::is_loaded() const {
39+
bool VisionEncoder::is_loaded() const noexcept {
4240
return module_->is_method_loaded(kVisionEncoderMethod);
4341
}
4442

45-
int32_t VisionEncoder::encoderTokenCount() const {
43+
int32_t VisionEncoder::encoderTokenCount() const noexcept {
4644
if (!is_loaded()) {
4745
return 0;
4846
}
@@ -78,16 +76,17 @@ Result<VisionEncoder::ImageShape> VisionEncoder::getInputShape() const {
7876

7977
std::vector<float>
8078
VisionEncoder::preprocessImage(const std::string &path,
81-
const ImageShape &shape) const {
79+
const ImageShape &targetShape) const {
8280
cv::Mat mat = rnexecutorch::image_processing::readImage(path);
83-
cv::resize(mat, mat, cv::Size(shape.width, shape.height));
81+
cv::resize(mat, mat, cv::Size(targetShape.width, targetShape.height));
8482
cv::cvtColor(mat, mat, cv::COLOR_BGR2RGB);
8583

86-
const int32_t pixelCount = shape.height * shape.width;
87-
std::vector<float> chw(shape.channels * pixelCount);
84+
const int32_t pixelCount = targetShape.height * targetShape.width;
85+
std::vector<float> chw(targetShape.channels * pixelCount);
8886
for (int32_t i = 0; i < pixelCount; ++i) {
89-
cv::Vec3b px = mat.at<cv::Vec3b>(i / shape.width, i % shape.width);
90-
for (int32_t c = 0; c < shape.channels; ++c) {
87+
cv::Vec3b px =
88+
mat.at<cv::Vec3b>(i / targetShape.width, i % targetShape.width);
89+
for (int32_t c = 0; c < targetShape.channels; ++c) {
9190
chw[c * pixelCount + i] = static_cast<float>(px[c]);
9291
}
9392
}
@@ -122,7 +121,7 @@ Result<EValue> VisionEncoder::encode(const MultimodalInput &input) {
122121
chw.data(), sizes, ::executorch::aten::ScalarType::Float);
123122

124123
auto result = ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor));
125-
EValue embedding = result[0];
124+
auto embedding = result[0];
126125
embedding_cache_.emplace(path, embedding);
127126
return embedding;
128127
}

packages/react-native-executorch/common/runner/encoders/vision_encoder.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,13 @@ namespace executorch::extension::llm {
1212

1313
class VisionEncoder : public IEncoder {
1414
public:
15-
explicit VisionEncoder(::executorch::extension::Module *module);
15+
explicit VisionEncoder(::executorch::extension::Module &module);
1616

1717
::executorch::runtime::Error load() override;
18-
bool is_loaded() const override;
18+
bool is_loaded() const noexcept override;
1919
::executorch::runtime::Result<::executorch::runtime::EValue>
2020
encode(const MultimodalInput &input) override;
21-
int32_t encoderTokenCount() const override;
21+
int32_t encoderTokenCount() const noexcept override;
2222

2323
private:
2424
struct ImageShape {
@@ -28,7 +28,7 @@ class VisionEncoder : public IEncoder {
2828

2929
::executorch::runtime::Result<ImageShape> getInputShape() const;
3030
std::vector<float> preprocessImage(const std::string &path,
31-
const ImageShape &shape) const;
31+
const ImageShape &targetShape) const;
3232

3333
::executorch::extension::Module *module_;
3434
std::unordered_map<std::string, ::executorch::runtime::EValue>

packages/react-native-executorch/common/runner/multimodal_runner.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,6 @@ bool MultimodalRunner::is_loaded() const {
3838
}
3939

4040
Error MultimodalRunner::load_subcomponents() {
41-
rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, "[MultimodalRunner] Loading",
42-
encoders_.size(), "encoder(s)");
4341
for (auto &[type, encoder] : encoders_) {
4442
ET_CHECK_OK_OR_RETURN_ERROR(encoder->load());
4543
}

0 commit comments

Comments (0)