software-mansion
diff --git a/packages/react-native-executorch/common/runner/base_llm_runner.cpp b/packages/react-native-executorch/common/runner/base_llm_runner.cpp
Lines changed: 3 additions & 2 deletions
diff --git a/packages/react-native-executorch/common/runner/constants.h b/packages/react-native-executorch/common/runner/constants.h
Lines changed: 5 additions & 0 deletions
diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
Lines changed: 9 additions & 0 deletions
diff --git a/packages/react-native-executorch/common/runner/multimodal_decoder_runner.h b/packages/react-native-executorch/common/runner/multimodal_decoder_runner.h
Lines changed: 54 additions & 1 deletion
@@ -18,8 +18,8 @@ BaseLLMRunner::BaseLLMRunner(std::unique_ptr<Module> module,
       tokenizer_(std::make_unique<tokenizers::HFTokenizer>()),
       metadata_({
           {kEnableDynamicShape, false},
-          {kMaxSeqLen, 128},
-          {kMaxContextLen, 128},
+          {kMaxSeqLen, 2048},
+          {kMaxContextLen, 2048},
           {kUseKVCache, true},
       }) {}
 
@@ -69,6 +69,7 @@ Error BaseLLMRunner::load() {
       eos_ids_->emplace(static_cast<uint64_t>(eos_id.toScalar().to<int64_t>()));
     }
   }
+  eos_ids_->emplace(static_cast<uint64_t>(1));
   if (eos_ids_->empty()) {
     throw rnexecutorch::RnExecutorchError(
         rnexecutorch::RnExecutorchErrorCode::InvalidModelOutput,
 
@@ -17,6 +17,11 @@ inline constexpr auto kMaxSeqLen = "get_max_seq_len";
 inline constexpr auto kMaxContextLen = "get_max_context_len";
 inline constexpr auto kVocabSize = "get_vocab_size";
 inline constexpr auto kUseKVCache = "use_kv_cache";
+// PLE models only: token id that marks image placeholder slots in input_ids.
+// token_embedding run on this id produces the per-layer PLE signal for image
+// positions; the inputs_embeds output for those positions is discarded (the
+// vision encoder output replaces it).
+inline constexpr auto kImagePlaceholderId = "image_placeholder_id";
 
 // Multimodal method name conventions
 inline constexpr auto kVisionEncoderMethod = "vision_encoder";
 
@@ -41,6 +41,8 @@ bool VisionEncoder::is_loaded() const noexcept {
 }
 
 int32_t VisionEncoder::encoderTokenCount() const {
+  rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
+                    "VisionEncoder::encoderTokenCount");
   if (!is_loaded()) {
     return 0;
   }
@@ -102,6 +104,8 @@ VisionEncoder::preprocessImage(const std::string &path,
 }
 
 Result<EValue> VisionEncoder::encode(const MultimodalInput &input) {
+  rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
+                    "VisionEncoder::encode start");
   if (!is_loaded()) {
     return Error::InvalidState;
   }
@@ -128,9 +132,14 @@ Result<EValue> VisionEncoder::encode(const MultimodalInput &input) {
   auto image_tensor = ::executorch::extension::from_blob(
       chw.data(), sizes, ::executorch::aten::ScalarType::Float);
 
+  rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
+                    "VisionEncoder::encode start1");
   auto result = ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor));
+  rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
+                    "VisionEncoder::encode end1");
   auto embedding = result[0];
   embedding_cache_.emplace(path, embedding);
+  rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, "VisionEncoder::encode end");
   return embedding;
 }
 
 
@@ -14,21 +14,48 @@
 #include "text_decoder_runner.h"
 
 namespace executorch::extension::llm {
+// Supports two PTE contracts, selected automatically at load time from
+// `token_embedding`'s output arity:
+//
+//  * Legacy (default):
+//      token_embedding(ids) -> inputs_embeds
+//      text_decoder(inputs_embeds, input_pos)
+//
+//  * Gemma-style PLE (when token_embedding emits 2 outputs):
+//      token_embedding(ids) -> (inputs_embeds, ple_tok)
+//      text_decoder(inputs_embeds, ple_tok, input_pos)
+//    ple_tok carries Gemma4's per-layer PLE signal keyed on input_ids. It's
+//    computed once in token_embedding and threaded through every decoder call
+//    so PLE fires at every position (including multimodal placeholder slots).
 class MultimodalDecoderRunner : public TextDecoderRunner {
 public:
   explicit MultimodalDecoderRunner(Module &module, IOManager *io_manager,
                                    const GenerationConfig &config)
+      // All three arguments are forwarded unchanged to the TextDecoderRunner
+      // base; this constructor does no work of its own.
       : TextDecoderRunner(module, io_manager, config) {}
 
+  // Reports the PTE contract recorded in uses_ple_: true for the Gemma-style
+  // PLE contract (token_embedding emits two outputs), false for the legacy
+  // single-output contract. load() is what records the contract, so the value
+  // is only meaningful once load() has run.
+  bool uses_ple() const {
+    return uses_ple_;
+  }
+
   inline ::executorch::runtime::Result<::executorch::aten::Tensor>
   step(TensorPtr &tokens, int64_t start_pos) override {
     auto embed_result = module_->execute(kTokenEmbeddingMethod, tokens);
     if (!embed_result.ok()) {
       return embed_result.error();
     }
-    return decode((*embed_result)[0], start_pos);
+    // Embed the tokens once, then hand the result to the decode overload that
+    // matches the contract recorded in uses_ple_.
+    auto &token_outputs = *embed_result;
+    if (!uses_ple_) {
+      // Legacy contract: only inputs_embeds is passed on.
+      return decode(token_outputs[0], start_pos);
+    }
+    // PLE contract: both inputs_embeds and ple_tok must be present.
+    ET_CHECK_MSG(token_outputs.size() == 2,
+                 "Expected 2 outputs (inputs_embeds, ple_tok) from "
+                 "token_embedding, got %zu",
+                 token_outputs.size());
+    return decode(token_outputs[0], token_outputs[1], start_pos);
   }
 
+  // Legacy 2-input text_decoder(inputs_embeds, input_pos).
   inline ::executorch::runtime::Result<::executorch::aten::Tensor>
   decode(const ::executorch::runtime::EValue &embeddings, int64_t start_pos) {
     auto start_pos_tensor = ::executorch::extension::from_blob(
@@ -46,19 +73,45 @@ class MultimodalDecoderRunner : public TextDecoderRunner {
     return outputs[0].toTensor();
   }
 
+  // Runs kTextModelMethod under the PLE contract, i.e. with three inputs:
+  // (inputs_embeds, ple_tok, input_pos).
+  //
+  // embeddings / ple_tok are passed through as given; start_pos is wrapped in
+  // a one-element Long tensor that views the local argument.
+  // Returns the single output tensor, or the error reported by execute().
+  inline ::executorch::runtime::Result<::executorch::aten::Tensor>
+  decode(const ::executorch::runtime::EValue &embeddings,
+         const ::executorch::runtime::EValue &ple_tok, int64_t start_pos) {
+    auto input_pos = ::executorch::extension::from_blob(
+        &start_pos, {1}, ::executorch::aten::ScalarType::Long);
+    auto decoder_result =
+        module_->execute(kTextModelMethod, {embeddings, ple_tok, input_pos});
+    if (decoder_result.ok()) {
+      auto &decoder_outputs = *decoder_result;
+      ET_CHECK_MSG(decoder_outputs.size() == 1,
+                   "Expected 1 output from text_decoder, got %zu",
+                   decoder_outputs.size());
+      ET_CHECK_MSG(decoder_outputs[0].isTensor(),
+                   "text_decoder output is not a tensor");
+      return decoder_outputs[0].toTensor();
+    }
+    return decoder_result.error();
+  }
+
   inline ::executorch::runtime::Error load() override {
+    // Loads kTokenEmbeddingMethod and kTextModelMethod when they are not both
+    // loaded yet, then selects the PTE contract from token_embedding's output
+    // count (2 outputs => PLE, anything else => legacy).
+    //
+    // Contract detection deliberately runs on every call. Returning early
+    // when the methods were already loaded would leave uses_ple_ at its
+    // initial value and let step() pick the wrong text_decoder signature.
+    //
+    // Returns Error::Ok on success, otherwise the error produced by
+    // load_method() or method_meta().
-    if (is_method_loaded()) {
-      return ::executorch::runtime::Error::Ok;
-    }
-    ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTokenEmbeddingMethod));
-    ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTextModelMethod));
+    if (!is_method_loaded()) {
+      ET_CHECK_OK_OR_RETURN_ERROR(
+          module_->load_method(kTokenEmbeddingMethod));
+      ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTextModelMethod));
+    }
+
+    auto meta = module_->method_meta(kTokenEmbeddingMethod);
+    ET_CHECK_OK_OR_RETURN_ERROR(meta.error());
+    uses_ple_ = (meta->num_outputs() == 2);
     return ::executorch::runtime::Error::Ok;
   }
 
   inline bool is_method_loaded() override {
+    // True only when both kTokenEmbeddingMethod and kTextModelMethod are
+    // reported as loaded by the module; one of the two is not enough.
     return module_->is_method_loaded(kTokenEmbeddingMethod) &&
            module_->is_method_loaded(kTextModelMethod);
   }
+
+private:
+  // PTE contract flag, assigned by load() from token_embedding's output
+  // count. It starts as false (legacy two-input text_decoder). A true initial
+  // value would send step() into the PLE branch, whose ET_CHECK_MSG demands
+  // two token_embedding outputs, whenever step() runs before the contract has
+  // been detected.
+  bool uses_ple_ = false;
 };
 
 } // namespace executorch::extension::llm
Original file line number	Diff line number	Diff line change
`@@ -41,6 +41,8 @@ bool VisionEncoder::is_loaded() const noexcept {`
`41`	`41`	`}`
`42`	`42`
`43`	`43`	`int32_t VisionEncoder::encoderTokenCount() const {`
	`44`	`+ rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,`
	`45`	`+ "VisionEncoder::encoderTokenCount");`
`44`	`46`	`if (!is_loaded()) {`
`45`	`47`	`return 0;`
`46`	`48`	`}`
`@@ -102,6 +104,8 @@ VisionEncoder::preprocessImage(const std::string &path,`
`102`	`104`	`}`
`103`	`105`
`104`	`106`	`Result<EValue> VisionEncoder::encode(const MultimodalInput &input) {`
	`107`	`+ rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,`
	`108`	`+ "VisionEncoder::encode start");`
`105`	`109`	`if (!is_loaded()) {`
`106`	`110`	`return Error::InvalidState;`
`107`	`111`	`}`
`@@ -128,9 +132,14 @@ Result<EValue> VisionEncoder::encode(const MultimodalInput &input) {`
`128`	`132`	`auto image_tensor = ::executorch::extension::from_blob(`
`129`	`133`	`chw.data(), sizes, ::executorch::aten::ScalarType::Float);`
`130`	`134`
	`135`	`+ rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,`
	`136`	`+ "VisionEncoder::encode start1");`
`131`	`137`	`auto result = ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor));`
	`138`	`+ rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,`
	`139`	`+ "VisionEncoder::encode end1");`
`132`	`140`	`auto embedding = result[0];`
`133`	`141`	`embedding_cache_.emplace(path, embedding);`
	`142`	`+ rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, "VisionEncoder::encode end");`
`134`	`143`	`return embedding;`
`135`	`144`	`}`
`136`	`145`