Skip to content

Commit e1649a5

Browse files
committed
gemma_vulkan
1 parent 75d95dc commit e1649a5

31 files changed

Lines changed: 849 additions & 329 deletions

apps/llm/app/llm/index.tsx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ function LLMScreen() {
7575
}
7676
};
7777

78+
console.log(llm.messageHistory)
7879
return !llm.isReady && !llm.error ? (
7980
<Spinner
8081
visible={true}

apps/llm/app/multimodal_llm/index.tsx

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ const SUGGESTED_PROMPTS = [
3434
'Describe this scene in detail',
3535
'What objects can you see?',
3636
'What text appears in this image?',
37+
'Transcribe the audio?',
3738
];
3839
import { useLLMStats } from '../../hooks/useLLMStats';
3940
import { StatsBar } from '../../components/StatsBar';
@@ -175,7 +176,8 @@ function MultimodalLLMScreen() {
175176
};
176177

177178
const sendMessage = async () => {
178-
if (!userInput.trim() || vlm.isGenerating) return;
179+
if (!(imageUri || audioBuffer || userInput.trim()) || vlm.isGenerating)
180+
return;
179181
onMessageSend();
180182
const text = userInput.trim();
181183
setUserInput('');
@@ -346,14 +348,15 @@ function MultimodalLLMScreen() {
346348
onChangeText={setUserInput}
347349
/>
348350

349-
{userInput.trim() && !vlm.isGenerating && (
350-
<TouchableOpacity
351-
style={styles.sendChatTouchable}
352-
onPress={sendMessage}
353-
>
354-
<SendIcon height={24} width={24} padding={4} margin={8} />
355-
</TouchableOpacity>
356-
)}
351+
{(imageUri || audioBuffer || userInput.trim()) &&
352+
!vlm.isGenerating && (
353+
<TouchableOpacity
354+
style={styles.sendChatTouchable}
355+
onPress={sendMessage}
356+
>
357+
<SendIcon height={24} width={24} padding={4} margin={8} />
358+
</TouchableOpacity>
359+
)}
357360
{vlm.isGenerating && (
358361
<TouchableOpacity
359362
style={styles.sendChatTouchable}

packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -223,8 +223,6 @@ inline std::vector<float> getValue<std::vector<float>>(const jsi::Value &val,
223223
return getArrayAsVector<float>(val, runtime);
224224
}
225225

226-
// JS side passes an Array<Float32Array> (one clip per element). Each inner
227-
// element is read as a typed-array span and copied into a std::vector<float>.
228226
template <>
229227
inline std::vector<std::vector<float>>
230228
getValue<std::vector<std::vector<float>>>(const jsi::Value &val,

packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -166,11 +166,6 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
166166
promiseHostFunction<&Model::generateMultimodal>,
167167
"generateMultimodal"));
168168

169-
addFunctions(JSI_EXPORT_FUNCTION(
170-
ModelHostObject<Model>,
171-
promiseHostFunction<&Model::generateMultimodalWithAudio>,
172-
"generateMultimodalWithAudio"));
173-
174169
addFunctions(JSI_EXPORT_FUNCTION(
175170
ModelHostObject<Model>,
176171
synchronousHostFunction<&Model::getVisualTokenCount>,

packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp

Lines changed: 8 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
#include <filesystem>
55
#include <map>
66
#include <rnexecutorch/Error.h>
7-
#include <rnexecutorch/Log.h>
87
#include <rnexecutorch/threads/GlobalThreadPool.h>
98
#include <runner/encoders/audio_encoder.h>
109
#include <runner/encoders/vision_encoder.h>
@@ -22,7 +21,6 @@ LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource,
2221
std::vector<std::string> capabilities,
2322
std::shared_ptr<react::CallInvoker> callInvoker)
2423
: BaseModel(modelSource, callInvoker, Module::LoadMode::Mmap) {
25-
2624
if (capabilities.empty()) {
2725
runner_ =
2826
std::make_unique<llm::TextRunner>(std::move(module_), tokenizerSource);
@@ -72,96 +70,19 @@ std::string LLM::generate(std::string input,
7270

7371
auto config = llm::GenerationConfig{.echo = false, .warming = false};
7472
auto error = runner_->generate(input, config, nativeCallback, {});
73+
// No-op unless built with ET_EVENT_TRACER_ENABLED. Writes etdump.bin
74+
// alongside the model after the generation finishes.
75+
dumpEventTracer();
7576
if (error != Error::Ok) {
7677
throw RnExecutorchError(error, "Failed to generate text");
7778
}
7879
return output;
7980
}
8081

81-
std::string LLM::generateMultimodal(std::string prompt,
82-
std::vector<std::string> imagePaths,
83-
std::string imageToken,
84-
std::shared_ptr<jsi::Function> callback) {
85-
if (!runner_ || !runner_->is_loaded()) {
86-
throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
87-
"Runner is not loaded");
88-
}
89-
if (!runner_->is_multimodal()) {
90-
throw RnExecutorchError(
91-
RnExecutorchErrorCode::InvalidUserInput,
92-
"This model does not support multimodal input. Use generate(prompt, "
93-
"callback) for text-only generation.");
94-
}
95-
if (imageToken.empty()) {
96-
throw RnExecutorchError(
97-
RnExecutorchErrorCode::InvalidUserInput,
98-
"imageToken must not be empty. Pass the model's image token (e.g. "
99-
"from tokenizer_config.json).");
100-
}
101-
102-
const size_t kImageTokenLen = imageToken.size();
103-
104-
std::vector<llm::MultimodalInput> inputs;
105-
size_t imageIdx = 0;
106-
size_t searchPos = 0;
107-
108-
while (true) {
109-
size_t found = prompt.find(imageToken, searchPos);
110-
if (found == std::string::npos) {
111-
if (searchPos < prompt.size()) {
112-
inputs.push_back(llm::make_text_input(prompt.substr(searchPos)));
113-
}
114-
break;
115-
}
116-
// Text segment before this placeholder
117-
if (found > searchPos) {
118-
inputs.push_back(
119-
llm::make_text_input(prompt.substr(searchPos, found - searchPos)));
120-
}
121-
// Image at this position
122-
if (imageIdx >= imagePaths.size()) {
123-
throw RnExecutorchError(
124-
RnExecutorchErrorCode::InvalidUserInput,
125-
"More '" + imageToken +
126-
"' placeholders in prompt than image paths provided");
127-
}
128-
inputs.push_back(llm::make_image_input(imagePaths[imageIdx++]));
129-
searchPos = found + kImageTokenLen;
130-
}
131-
132-
if (imageIdx < imagePaths.size()) {
133-
throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
134-
"More image paths provided than '" + imageToken +
135-
"' placeholders in prompt");
136-
}
137-
138-
if (inputs.empty()) {
139-
throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
140-
"No inputs to generate from");
141-
}
142-
143-
std::string output;
144-
auto nativeCallback = [this, callback, &output](const std::string &token) {
145-
output += token;
146-
if (callback && callInvoker) {
147-
callInvoker->invokeAsync([callback, token](jsi::Runtime &runtime) {
148-
callback->call(runtime, jsi::String::createFromUtf8(runtime, token));
149-
});
150-
}
151-
};
152-
153-
auto error = runner_->generate(inputs, nativeCallback);
154-
if (error != Error::Ok) {
155-
throw RnExecutorchError(error, "Failed to generate multimodal response");
156-
}
157-
158-
return output;
159-
}
160-
161-
std::string LLM::generateMultimodalWithAudio(
162-
std::string prompt, std::vector<std::string> imagePaths,
163-
std::string imageToken, std::vector<std::vector<float>> audioWaveforms,
164-
std::string audioToken, std::shared_ptr<jsi::Function> callback) {
82+
std::string LLM::generateMultimodal(
83+
std::string prompt, std::shared_ptr<jsi::Function> callback,
84+
std::vector<std::string> imagePaths, std::string imageToken,
85+
std::vector<std::vector<float>> audioWaveforms, std::string audioToken) {
16586
if (!runner_ || !runner_->is_loaded()) {
16687
throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
16788
"Runner is not loaded");
@@ -234,6 +155,7 @@ std::string LLM::generateMultimodalWithAudio(
234155
});
235156
}
236157
};
158+
237159
auto error = runner_->generate(inputs, nativeCallback);
238160
if (error != Error::Ok) {
239161
throw RnExecutorchError(error, "Failed to generate multimodal response");

packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,19 +22,16 @@ class LLM : public BaseModel {
2222

2323
std::string generate(std::string prompt,
2424
std::shared_ptr<jsi::Function> callback);
25-
std::string generateMultimodal(std::string prompt,
26-
std::vector<std::string> imagePaths,
27-
std::string imageToken,
28-
std::shared_ptr<jsi::Function> callback);
2925
// Audio variant: `audioWaveforms` is a parallel vector of fp32 mono 16 kHz
3026
// PCM buffers (decoded upstream, same contract as SpeechToText::transcribe).
3127
// The prompt is scanned for `imageToken` and/or `audioToken` placeholders;
3228
// each placeholder consumes the next entry from its respective vector in
3329
// order. Either set of paths/waveforms/token may be empty.
34-
std::string generateMultimodalWithAudio(
35-
std::string prompt, std::vector<std::string> imagePaths,
36-
std::string imageToken, std::vector<std::vector<float>> audioWaveforms,
37-
std::string audioToken, std::shared_ptr<jsi::Function> callback);
30+
std::string generateMultimodal(
31+
std::string prompt, std::shared_ptr<jsi::Function> callback,
32+
std::vector<std::string> imagePaths = {}, std::string imageToken = "",
33+
std::vector<std::vector<float>> audioWaveforms = {},
34+
std::string audioToken = "");
3835

3936
void interrupt();
4037
void reset();

packages/react-native-executorch/common/runner/base_llm_runner.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ BaseLLMRunner::BaseLLMRunner(std::unique_ptr<Module> module,
1818
tokenizer_(std::make_unique<tokenizers::HFTokenizer>()),
1919
metadata_({
2020
{kEnableDynamicShape, false},
21-
{kMaxSeqLen, 2048},
22-
{kMaxContextLen, 2048},
21+
{kMaxSeqLen, 128},
22+
{kMaxContextLen, 128},
2323
{kUseKVCache, true},
2424
}) {}
2525

@@ -69,7 +69,7 @@ Error BaseLLMRunner::load() {
6969
eos_ids_->emplace(static_cast<uint64_t>(eos_id.toScalar().to<int64_t>()));
7070
}
7171
}
72-
eos_ids_->emplace(static_cast<uint64_t>(1));
72+
7373
if (eos_ids_->empty()) {
7474
throw rnexecutorch::RnExecutorchError(
7575
rnexecutorch::RnExecutorchErrorCode::InvalidModelOutput,
@@ -150,6 +150,11 @@ void BaseLLMRunner::set_repetition_penalty(float repetition_penalty) noexcept {
150150
config_.repetition_penalty = repetition_penalty;
151151
}
152152

153+
void BaseLLMRunner::set_topk(int32_t topk) noexcept {
154+
config_.topk = topk;
155+
set_topk_impl(topk);
156+
}
157+
153158
void BaseLLMRunner::set_count_interval(size_t count_interval) {
154159
config_.output_token_batch_size = count_interval;
155160
}

packages/react-native-executorch/common/runner/base_llm_runner.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ class BaseLLMRunner {
5555
void set_topp(float topp) noexcept;
5656
void set_min_p(float min_p) noexcept;
5757
void set_repetition_penalty(float repetition_penalty) noexcept;
58+
void set_topk(int32_t topk) noexcept;
5859
void set_count_interval(size_t count_interval);
5960
void set_time_interval(size_t time_interval);
6061

packages/react-native-executorch/common/runner/constants.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,11 @@ inline constexpr auto kVisionEncoderMethod = "vision_encoder";
2828
inline constexpr auto kAudioEncoderMethod = "audio_encoder";
2929
inline constexpr auto kTokenEmbeddingMethod = "token_embedding";
3030
inline constexpr auto kTextModelMethod = "text_decoder";
31-
inline constexpr auto kMaxPrefillLen = 1024;
31+
// Absolute ceiling on prefill length (in tokens) and the fallback value used
32+
// when a PTE doesn't bake `get_max_seq_len`. 2048 matches Gemma4 iter201's
33+
// PREFILL_LEN / get_max_context_len; legacy PTEs (e.g. LFM2-VL) typically
34+
// bake their own get_max_seq_len so this ceiling does not affect them.
35+
inline constexpr auto kMaxPrefillLen = 2048;
3236
inline constexpr auto numOfAddedBoSTokens = 0;
3337
inline constexpr auto numOfAddedEoSTokens = 0;
3438

packages/react-native-executorch/common/runner/encoders/audio_encoder.cpp

Lines changed: 35 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,4 @@
11
// common/runner/encoders/audio_encoder.cpp
2-
//
3-
// Pattern mirrors models/speech_to_text/whisper/ASR.cpp::encode — the PTE has
4-
// the log-mel frontend baked in, so this encoder hands the raw waveform
5-
// straight to the `audio_encoder` method. Mel extraction, STFT, filterbank,
6-
// normalization all live inside the exported module.
7-
//
8-
// PTE contract (exp107f onward):
9-
// inputs:
10-
// waveform[1, N_padded] fp32 (N_padded = kSamplesPerBlock * k, k>=1)
11-
// num_valid_samples[] int64 (real PCM length before zero-padding)
12-
// output:
13-
// embeds[1, 12*k, hidden] fp32
14-
// Caller right-pads the raw waveform up to the next multiple of
15-
// kSamplesPerBlock with silence; num_valid_samples tells MelFrontend which
16-
// mel frames correspond to real audio so padded-silence frames are masked
17-
// out and don't dilute the encoding.
18-
192
#include "audio_encoder.h"
203

214
#include <rnexecutorch/Error.h>
@@ -24,8 +7,10 @@
247

258
#include <executorch/extension/tensor/tensor.h>
269

10+
#include <cmath>
2711
#include <cstdint>
2812
#include <cstring>
13+
#include <string>
2914
#include <vector>
3015

3116
namespace executorch::extension::llm {
@@ -36,9 +21,14 @@ using ::executorch::runtime::EValue;
3621
using ::executorch::runtime::Result;
3722

3823
namespace {
39-
// Matches AUDIO_SAMPLES_PER_BLOCK in gemma_export/experiments/exp107f_*.py.
24+
// Matches AUDIO_SAMPLES_PER_BLOCK in gemma_export/experiments_vulkan/
25+
// op_bisect/iter201_mm_4method_dynaudio_prefill2048_export.py.
4026
// The PTE's audio_samples dim was exported as `7680 * audio_blocks`.
4127
constexpr int32_t kSamplesPerBlock = 7680;
28+
// k ∈ [kAudioBlockKMin, kAudioBlockKMax] from MODEL_INTERFACE.md §6.
29+
// k=62 == 29.76 s @ 16 kHz is the SDPA mask + rel-shift bake point.
30+
constexpr int64_t kAudioBlockKMin = 1;
31+
constexpr int64_t kAudioBlockKMax = 62;
4232
} // namespace
4333

4434
AudioEncoder::AudioEncoder(::executorch::extension::Module &module)
@@ -84,26 +74,44 @@ Result<EValue> AudioEncoder::encode(const MultimodalInput &input) {
8474

8575
const int64_t n_valid = static_cast<int64_t>(wav.samples.size());
8676
const int64_t k_blocks = (n_valid + kSamplesPerBlock - 1) / kSamplesPerBlock;
77+
ET_CHECK_OR_RETURN_ERROR(
78+
k_blocks >= kAudioBlockKMin && k_blocks <= kAudioBlockKMax,
79+
InvalidArgument,
80+
"AudioEncoder: waveform of %lld samples needs k_blocks=%lld; "
81+
"audio_encoder accepts k in [%lld, %lld] (block=%d samples; max %.2f s "
82+
"@ 16 kHz)",
83+
static_cast<long long>(n_valid), static_cast<long long>(k_blocks),
84+
static_cast<long long>(kAudioBlockKMin),
85+
static_cast<long long>(kAudioBlockKMax),
86+
static_cast<int>(kSamplesPerBlock),
87+
static_cast<double>(kSamplesPerBlock) *
88+
static_cast<double>(kAudioBlockKMax) / 16000.0);
8789
const int64_t n_padded = k_blocks * kSamplesPerBlock;
8890

89-
// Owns the padded buffer for the lifetime of this call; from_blob below
90-
// borrows it without copying.
91+
// Own the padded waveform and the attention_mask buffers for the lifetime
92+
// of this call; from_blob below borrows without copying. Mask is bool
93+
// (1 byte per element): true at the first n_valid samples (real PCM),
94+
// false at the zero-padded tail. Matches the iter191+ export at
95+
// iter201_mm_4method_dynaudio_prefill2048_export.py:484-486 — `forward(
96+
// self, waveform[1,N] fp32, attention_mask[1,N] bool)`.
9197
padded_wav_.assign(static_cast<size_t>(n_padded), 0.0f);
9298
std::memcpy(padded_wav_.data(), wav.samples.data(),
9399
static_cast<size_t>(n_valid) * sizeof(float));
94100

101+
padded_mask_.assign(static_cast<size_t>(n_padded), uint8_t{0});
102+
if (n_valid > 0) {
103+
std::memset(padded_mask_.data(), 1, static_cast<size_t>(n_valid));
104+
}
105+
95106
auto wav_tensor = ::executorch::extension::from_blob(
96107
padded_wav_.data(), {1, static_cast<SizesType>(n_padded)},
97108
::executorch::aten::ScalarType::Float);
98109

99-
// 0-d int64 scalar. The PTE was exported with
100-
// sample_num_valid = torch.tensor(..., dtype=torch.long)
101-
// which traces to a 0-rank Long tensor.
102-
num_valid_scalar_ = n_valid;
103-
auto num_valid_tensor = ::executorch::extension::from_blob(
104-
&num_valid_scalar_, {}, ::executorch::aten::ScalarType::Long);
110+
auto mask_tensor = ::executorch::extension::from_blob(
111+
padded_mask_.data(), {1, static_cast<SizesType>(n_padded)},
112+
::executorch::aten::ScalarType::Bool);
105113

106-
std::vector<EValue> args = {EValue(*wav_tensor), EValue(*num_valid_tensor)};
114+
std::vector<EValue> args = {EValue(*wav_tensor), EValue(*mask_tensor)};
107115
auto exec_result = ET_UNWRAP(module_->execute(kAudioEncoderMethod, args));
108116
ET_CHECK_OR_RETURN_ERROR(!exec_result.empty(), InvalidState,
109117
"audio_encoder returned no outputs");

0 commit comments

Comments
 (0)