software-mansion
diff --git a/apps/llm/app/llm/index.tsx b/apps/llm/app/llm/index.tsx
Lines changed: 1 addition & 0 deletions
diff --git a/apps/llm/app/multimodal_llm/index.tsx b/apps/llm/app/multimodal_llm/index.tsx
Lines changed: 12 additions & 9 deletions
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
Lines changed: 0 additions & 2 deletions
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
Lines changed: 0 additions & 5 deletions
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
Lines changed: 8 additions & 86 deletions
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
Lines changed: 5 additions & 8 deletions
diff --git a/packages/react-native-executorch/common/runner/base_llm_runner.cpp b/packages/react-native-executorch/common/runner/base_llm_runner.cpp
Lines changed: 8 additions & 3 deletions
diff --git a/packages/react-native-executorch/common/runner/base_llm_runner.h b/packages/react-native-executorch/common/runner/base_llm_runner.h
Lines changed: 1 addition & 0 deletions
diff --git a/packages/react-native-executorch/common/runner/constants.h b/packages/react-native-executorch/common/runner/constants.h
Lines changed: 5 additions & 1 deletion
diff --git a/packages/react-native-executorch/common/runner/encoders/audio_encoder.cpp b/packages/react-native-executorch/common/runner/encoders/audio_encoder.cpp
Lines changed: 35 additions & 27 deletions
@@ -75,6 +75,7 @@ function LLMScreen() {
     }
   };
 
+  console.log(llm.messageHistory)
   return !llm.isReady && !llm.error ? (
     <Spinner
       visible={true}
 
@@ -34,6 +34,7 @@ const SUGGESTED_PROMPTS = [
   'Describe this scene in detail',
   'What objects can you see?',
   'What text appears in this image?',
+  'Transcribe the audio?',
 ];
 import { useLLMStats } from '../../hooks/useLLMStats';
 import { StatsBar } from '../../components/StatsBar';
@@ -175,7 +176,8 @@ function MultimodalLLMScreen() {
   };
 
   const sendMessage = async () => {
-    if (!userInput.trim() || vlm.isGenerating) return;
+    if (!(imageUri || audioBuffer || userInput.trim()) || vlm.isGenerating)
+      return;
     onMessageSend();
     const text = userInput.trim();
     setUserInput('');
@@ -346,14 +348,15 @@ function MultimodalLLMScreen() {
               onChangeText={setUserInput}
             />
 
-            {userInput.trim() && !vlm.isGenerating && (
-              <TouchableOpacity
-                style={styles.sendChatTouchable}
-                onPress={sendMessage}
-              >
-                <SendIcon height={24} width={24} padding={4} margin={8} />
-              </TouchableOpacity>
-            )}
+            {(imageUri || audioBuffer || userInput.trim()) &&
+              !vlm.isGenerating && (
+                <TouchableOpacity
+                  style={styles.sendChatTouchable}
+                  onPress={sendMessage}
+                >
+                  <SendIcon height={24} width={24} padding={4} margin={8} />
+                </TouchableOpacity>
+              )}
             {vlm.isGenerating && (
               <TouchableOpacity
                 style={styles.sendChatTouchable}
 
@@ -223,8 +223,6 @@ inline std::vector<float> getValue<std::vector<float>>(const jsi::Value &val,
   return getArrayAsVector<float>(val, runtime);
 }
 
-// JS side passes an Array<Float32Array> (one clip per element). Each inner
-// element is read as a typed-array span and copied into a std::vector<float>.
 template <>
 inline std::vector<std::vector<float>>
 getValue<std::vector<std::vector<float>>>(const jsi::Value &val,
 
@@ -166,11 +166,6 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
                               promiseHostFunction<&Model::generateMultimodal>,
                               "generateMultimodal"));
 
-      addFunctions(JSI_EXPORT_FUNCTION(
-          ModelHostObject<Model>,
-          promiseHostFunction<&Model::generateMultimodalWithAudio>,
-          "generateMultimodalWithAudio"));
-
       addFunctions(JSI_EXPORT_FUNCTION(
           ModelHostObject<Model>,
           synchronousHostFunction<&Model::getVisualTokenCount>,
 
@@ -4,7 +4,6 @@
 #include <filesystem>
 #include <map>
 #include <rnexecutorch/Error.h>
-#include <rnexecutorch/Log.h>
 #include <rnexecutorch/threads/GlobalThreadPool.h>
 #include <runner/encoders/audio_encoder.h>
 #include <runner/encoders/vision_encoder.h>
@@ -22,7 +21,6 @@ LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource,
          std::vector<std::string> capabilities,
          std::shared_ptr<react::CallInvoker> callInvoker)
     : BaseModel(modelSource, callInvoker, Module::LoadMode::Mmap) {
-
   if (capabilities.empty()) {
     runner_ =
         std::make_unique<llm::TextRunner>(std::move(module_), tokenizerSource);
@@ -72,96 +70,19 @@ std::string LLM::generate(std::string input,
 
   auto config = llm::GenerationConfig{.echo = false, .warming = false};
   auto error = runner_->generate(input, config, nativeCallback, {});
+  // No-op unless built with ET_EVENT_TRACER_ENABLED. Writes etdump.bin
+  // alongside the model after the generation finishes.
+  dumpEventTracer();
   if (error != Error::Ok) {
     throw RnExecutorchError(error, "Failed to generate text");
   }
   return output;
 }
 
-std::string LLM::generateMultimodal(std::string prompt,
-                                    std::vector<std::string> imagePaths,
-                                    std::string imageToken,
-                                    std::shared_ptr<jsi::Function> callback) {
-  if (!runner_ || !runner_->is_loaded()) {
-    throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
-                            "Runner is not loaded");
-  }
-  if (!runner_->is_multimodal()) {
-    throw RnExecutorchError(
-        RnExecutorchErrorCode::InvalidUserInput,
-        "This model does not support multimodal input. Use generate(prompt, "
-        "callback) for text-only generation.");
-  }
-  if (imageToken.empty()) {
-    throw RnExecutorchError(
-        RnExecutorchErrorCode::InvalidUserInput,
-        "imageToken must not be empty. Pass the model's image token (e.g. "
-        "from tokenizer_config.json).");
-  }
-
-  const size_t kImageTokenLen = imageToken.size();
-
-  std::vector<llm::MultimodalInput> inputs;
-  size_t imageIdx = 0;
-  size_t searchPos = 0;
-
-  while (true) {
-    size_t found = prompt.find(imageToken, searchPos);
-    if (found == std::string::npos) {
-      if (searchPos < prompt.size()) {
-        inputs.push_back(llm::make_text_input(prompt.substr(searchPos)));
-      }
-      break;
-    }
-    // Text segment before this placeholder
-    if (found > searchPos) {
-      inputs.push_back(
-          llm::make_text_input(prompt.substr(searchPos, found - searchPos)));
-    }
-    // Image at this position
-    if (imageIdx >= imagePaths.size()) {
-      throw RnExecutorchError(
-          RnExecutorchErrorCode::InvalidUserInput,
-          "More '" + imageToken +
-              "' placeholders in prompt than image paths provided");
-    }
-    inputs.push_back(llm::make_image_input(imagePaths[imageIdx++]));
-    searchPos = found + kImageTokenLen;
-  }
-
-  if (imageIdx < imagePaths.size()) {
-    throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
-                            "More image paths provided than '" + imageToken +
-                                "' placeholders in prompt");
-  }
-
-  if (inputs.empty()) {
-    throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
-                            "No inputs to generate from");
-  }
-
-  std::string output;
-  auto nativeCallback = [this, callback, &output](const std::string &token) {
-    output += token;
-    if (callback && callInvoker) {
-      callInvoker->invokeAsync([callback, token](jsi::Runtime &runtime) {
-        callback->call(runtime, jsi::String::createFromUtf8(runtime, token));
-      });
-    }
-  };
-
-  auto error = runner_->generate(inputs, nativeCallback);
-  if (error != Error::Ok) {
-    throw RnExecutorchError(error, "Failed to generate multimodal response");
-  }
-
-  return output;
-}
-
-std::string LLM::generateMultimodalWithAudio(
-    std::string prompt, std::vector<std::string> imagePaths,
-    std::string imageToken, std::vector<std::vector<float>> audioWaveforms,
-    std::string audioToken, std::shared_ptr<jsi::Function> callback) {
+std::string LLM::generateMultimodal(
+    std::string prompt, std::shared_ptr<jsi::Function> callback,
+    std::vector<std::string> imagePaths, std::string imageToken,
+    std::vector<std::vector<float>> audioWaveforms, std::string audioToken) {
   if (!runner_ || !runner_->is_loaded()) {
     throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
                             "Runner is not loaded");
@@ -234,6 +155,7 @@ std::string LLM::generateMultimodalWithAudio(
       });
     }
   };
+
   auto error = runner_->generate(inputs, nativeCallback);
   if (error != Error::Ok) {
     throw RnExecutorchError(error, "Failed to generate multimodal response");
 
@@ -22,19 +22,16 @@ class LLM : public BaseModel {
 
   std::string generate(std::string prompt,
                        std::shared_ptr<jsi::Function> callback);
-  std::string generateMultimodal(std::string prompt,
-                                 std::vector<std::string> imagePaths,
-                                 std::string imageToken,
-                                 std::shared_ptr<jsi::Function> callback);
   // Audio variant: `audioWaveforms` is a parallel vector of fp32 mono 16 kHz
   // PCM buffers (decoded upstream, same contract as SpeechToText::transcribe).
   // The prompt is scanned for `imageToken` and/or `audioToken` placeholders;
   // each placeholder consumes the next entry from its respective vector in
   // order. Either set of paths/waveforms/token may be empty.
-  std::string generateMultimodalWithAudio(
-      std::string prompt, std::vector<std::string> imagePaths,
-      std::string imageToken, std::vector<std::vector<float>> audioWaveforms,
-      std::string audioToken, std::shared_ptr<jsi::Function> callback);
+  std::string generateMultimodal(
+      std::string prompt, std::shared_ptr<jsi::Function> callback,
+      std::vector<std::string> imagePaths = {}, std::string imageToken = "",
+      std::vector<std::vector<float>> audioWaveforms = {},
+      std::string audioToken = "");
 
   void interrupt();
   void reset();
 
@@ -18,8 +18,8 @@ BaseLLMRunner::BaseLLMRunner(std::unique_ptr<Module> module,
       tokenizer_(std::make_unique<tokenizers::HFTokenizer>()),
       metadata_({
           {kEnableDynamicShape, false},
-          {kMaxSeqLen, 2048},
-          {kMaxContextLen, 2048},
+          {kMaxSeqLen, 128},
+          {kMaxContextLen, 128},
           {kUseKVCache, true},
       }) {}
 
@@ -69,7 +69,7 @@ Error BaseLLMRunner::load() {
       eos_ids_->emplace(static_cast<uint64_t>(eos_id.toScalar().to<int64_t>()));
     }
   }
-  eos_ids_->emplace(static_cast<uint64_t>(1));
+
   if (eos_ids_->empty()) {
     throw rnexecutorch::RnExecutorchError(
         rnexecutorch::RnExecutorchErrorCode::InvalidModelOutput,
@@ -150,6 +150,11 @@ void BaseLLMRunner::set_repetition_penalty(float repetition_penalty) noexcept {
   config_.repetition_penalty = repetition_penalty;
 }
 
+void BaseLLMRunner::set_topk(int32_t topk) noexcept {
+  config_.topk = topk;
+  set_topk_impl(topk);
+}
+
 void BaseLLMRunner::set_count_interval(size_t count_interval) {
   config_.output_token_batch_size = count_interval;
 }
 
@@ -55,6 +55,7 @@ class BaseLLMRunner {
   void set_topp(float topp) noexcept;
   void set_min_p(float min_p) noexcept;
   void set_repetition_penalty(float repetition_penalty) noexcept;
+  void set_topk(int32_t topk) noexcept;
   void set_count_interval(size_t count_interval);
   void set_time_interval(size_t time_interval);
 
 
@@ -28,7 +28,11 @@ inline constexpr auto kVisionEncoderMethod = "vision_encoder";
 inline constexpr auto kAudioEncoderMethod = "audio_encoder";
 inline constexpr auto kTokenEmbeddingMethod = "token_embedding";
 inline constexpr auto kTextModelMethod = "text_decoder";
-inline constexpr auto kMaxPrefillLen = 1024;
+// Absolute ceiling on prefill length (in tokens) and the fallback value used
+// when a PTE doesn't bake `get_max_seq_len`. 2048 matches Gemma4 iter201's
+// PREFILL_LEN / get_max_context_len; legacy PTEs (e.g. LFM2-VL) typically
+// bake their own get_max_seq_len so this ceiling does not affect them.
+inline constexpr auto kMaxPrefillLen = 2048;
 inline constexpr auto numOfAddedBoSTokens = 0;
 inline constexpr auto numOfAddedEoSTokens = 0;
 
 
@@ -1,21 +1,4 @@
 // common/runner/encoders/audio_encoder.cpp
-//
-// Pattern mirrors models/speech_to_text/whisper/ASR.cpp::encode — the PTE has
-// the log-mel frontend baked in, so this encoder hands the raw waveform
-// straight to the `audio_encoder` method. Mel extraction, STFT, filterbank,
-// normalization all live inside the exported module.
-//
-// PTE contract (exp107f onward):
-//   inputs:
-//     waveform[1, N_padded]  fp32  (N_padded = kSamplesPerBlock * k, k>=1)
-//     num_valid_samples[]    int64 (real PCM length before zero-padding)
-//   output:
-//     embeds[1, 12*k, hidden] fp32
-// Caller right-pads the raw waveform up to the next multiple of
-// kSamplesPerBlock with silence; num_valid_samples tells MelFrontend which
-// mel frames correspond to real audio so padded-silence frames are masked
-// out and don't dilute the encoding.
-
 #include "audio_encoder.h"
 
 #include <rnexecutorch/Error.h>
@@ -24,8 +7,10 @@
 
 #include <executorch/extension/tensor/tensor.h>
 
+#include <cmath>
 #include <cstdint>
 #include <cstring>
+#include <string>
 #include <vector>
 
 namespace executorch::extension::llm {
@@ -36,9 +21,14 @@ using ::executorch::runtime::EValue;
 using ::executorch::runtime::Result;
 
 namespace {
-// Matches AUDIO_SAMPLES_PER_BLOCK in gemma_export/experiments/exp107f_*.py.
+// Matches AUDIO_SAMPLES_PER_BLOCK in gemma_export/experiments_vulkan/
+// op_bisect/iter201_mm_4method_dynaudio_prefill2048_export.py.
 // The PTE's audio_samples dim was exported as `7680 * audio_blocks`.
 constexpr int32_t kSamplesPerBlock = 7680;
+// k ∈ [kAudioBlockKMin, kAudioBlockKMax] from MODEL_INTERFACE.md §6.
+// k=62 == 29.76 s @ 16 kHz is the SDPA mask + rel-shift bake point.
+constexpr int64_t kAudioBlockKMin = 1;
+constexpr int64_t kAudioBlockKMax = 62;
 } // namespace
 
 AudioEncoder::AudioEncoder(::executorch::extension::Module &module)
@@ -84,26 +74,44 @@ Result<EValue> AudioEncoder::encode(const MultimodalInput &input) {
 
   const int64_t n_valid = static_cast<int64_t>(wav.samples.size());
   const int64_t k_blocks = (n_valid + kSamplesPerBlock - 1) / kSamplesPerBlock;
+  ET_CHECK_OR_RETURN_ERROR(
+      k_blocks >= kAudioBlockKMin && k_blocks <= kAudioBlockKMax,
+      InvalidArgument,
+      "AudioEncoder: waveform of %lld samples needs k_blocks=%lld; "
+      "audio_encoder accepts k in [%lld, %lld] (block=%d samples; max %.2f s "
+      "@ 16 kHz)",
+      static_cast<long long>(n_valid), static_cast<long long>(k_blocks),
+      static_cast<long long>(kAudioBlockKMin),
+      static_cast<long long>(kAudioBlockKMax),
+      static_cast<int>(kSamplesPerBlock),
+      static_cast<double>(kSamplesPerBlock) *
+          static_cast<double>(kAudioBlockKMax) / 16000.0);
   const int64_t n_padded = k_blocks * kSamplesPerBlock;
 
-  // Owns the padded buffer for the lifetime of this call; from_blob below
-  // borrows it without copying.
+  // Own the padded waveform and the attention_mask buffers for the lifetime
+  // of this call; from_blob below borrows without copying. Mask is bool
+  // (1 byte per element): true at the first n_valid samples (real PCM),
+  // false at the zero-padded tail. Matches the iter191+ export at
+  // iter201_mm_4method_dynaudio_prefill2048_export.py:484-486 — `forward(
+  // self, waveform[1,N] fp32, attention_mask[1,N] bool)`.
   padded_wav_.assign(static_cast<size_t>(n_padded), 0.0f);
   std::memcpy(padded_wav_.data(), wav.samples.data(),
               static_cast<size_t>(n_valid) * sizeof(float));
 
+  padded_mask_.assign(static_cast<size_t>(n_padded), uint8_t{0});
+  if (n_valid > 0) {
+    std::memset(padded_mask_.data(), 1, static_cast<size_t>(n_valid));
+  }
+
   auto wav_tensor = ::executorch::extension::from_blob(
       padded_wav_.data(), {1, static_cast<SizesType>(n_padded)},
       ::executorch::aten::ScalarType::Float);
 
-  // 0-d int64 scalar. The PTE was exported with
-  //   sample_num_valid = torch.tensor(..., dtype=torch.long)
-  // which traces to a 0-rank Long tensor.
-  num_valid_scalar_ = n_valid;
-  auto num_valid_tensor = ::executorch::extension::from_blob(
-      &num_valid_scalar_, {}, ::executorch::aten::ScalarType::Long);
+  auto mask_tensor = ::executorch::extension::from_blob(
+      padded_mask_.data(), {1, static_cast<SizesType>(n_padded)},
+      ::executorch::aten::ScalarType::Bool);
 
-  std::vector<EValue> args = {EValue(*wav_tensor), EValue(*num_valid_tensor)};
+  std::vector<EValue> args = {EValue(*wav_tensor), EValue(*mask_tensor)};
   auto exec_result = ET_UNWRAP(module_->execute(kAudioEncoderMethod, args));
   ET_CHECK_OR_RETURN_ERROR(!exec_result.empty(), InvalidState,
                            "audio_encoder returned no outputs");
Original file line number	Diff line number	Diff line change
`@@ -75,6 +75,7 @@ function LLMScreen() {`
`75`	`75`	`}`
`76`	`76`	`};`
`77`	`77`
	`78`	`+ console.log(llm.messageHistory)`
`78`	`79`	`return !llm.isReady && !llm.error ? (`
`79`	`80`	`<Spinner`
`80`	`81`	`visible={true}`
Original file line number	Diff line number	Diff line change
`@@ -223,8 +223,6 @@ inline std::vector<float> getValue<std::vector<float>>(const jsi::Value &val,`
`223`	`223`	`return getArrayAsVector<float>(val, runtime);`
`224`	`224`	`}`
`225`	`225`
`226`		`-// JS side passes an Array<Float32Array> (one clip per element). Each inner`
`227`		`-// element is read as a typed-array span and copied into a std::vector<float>.`
`228`	`226`	`template <>`
`229`	`227`	`inline std::vector<std::vector<float>>`
`230`	`228`	`getValue<std::vector<std::vector<float>>>(const jsi::Value &val,`