Add whisper kv-cache & fix demo app permissions

IgorSwat · IgorSwat · commit bab2ffb0ace6 · 2026-02-13T12:20:28.000+01:00
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
@@ -40,7 +40,8 @@ SpeechToText::encode(std::span<float> waveform) const {
 std::shared_ptr<OwningArrayBuffer>
 SpeechToText::decode(std::span<uint64_t> tokens,
                      std::span<float> encoderOutput) const {
-  std::vector<float> decoderOutput = this->asr->decode(tokens, encoderOutput);
+  std::vector<float> decoderOutput =
+      this->asr->decode(tokens, 0, encoderOutput);
   return std::make_shared<OwningArrayBuffer>(decoderOutput);
 }
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.cpp
@@ -1,3 +1,4 @@
+#include <numeric>
 #include <random>
 #include <sstream>
 
@@ -42,11 +43,15 @@ GenerationResult ASR::generate(std::span<float> waveform, float temperature,
   std::vector<float> encoderOutput = this->encode(waveform);
 
   std::vector<uint64_t> sequenceIds = this->getInitialSequence(options);
+  std::vector<uint64_t> cachedTokens = sequenceIds;
   const size_t initialSequenceLenght = sequenceIds.size();
   std::vector<float> scores;
 
-  while (std::cmp_less_equal(sequenceIds.size(), ASR::kMaxDecodeLength)) {
-    std::vector<float> logits = this->decode(sequenceIds, encoderOutput);
+  uint64_t startPos = 0;
+  while (std::cmp_less_equal(startPos + sequenceIds.size(),
+                             ASR::kMaxDecodeLength)) {
+    std::vector<float> logits =
+        this->decode(sequenceIds, startPos, encoderOutput);
 
     // intentionally comparing float to float
     // temperatures are predefined, so this is safe
@@ -74,16 +79,20 @@ GenerationResult ASR::generate(std::span<float> waveform, float temperature,
       nextProb = probs[nextId];
     }
 
-    sequenceIds.push_back(nextId);
+    // Advance startPos by the number of tokens we just processed
+    startPos += sequenceIds.size();
+    sequenceIds = {nextId};
+    cachedTokens.push_back(nextId);
     scores.push_back(nextProb);
 
     if (nextId == this->endOfTranscriptionToken) {
       break;
     }
   }
 
-  return {.tokens = std::vector<uint64_t>(
-              sequenceIds.cbegin() + initialSequenceLenght, sequenceIds.cend()),
+  return {.tokens = std::vector<uint64_t>(cachedTokens.cbegin() +
+                                              initialSequenceLenght,
+                                          cachedTokens.cend()),
           .scores = scores};
 }
 
@@ -318,13 +327,19 @@ std::vector<float> ASR::encode(std::span<float> waveform) const {
   return {dataPtr, dataPtr + outputNumel};
 }
 
-std::vector<float> ASR::decode(std::span<const uint64_t> tokens,
+std::vector<float> ASR::decode(std::span<uint64_t> tokens, uint64_t startPos,
                                std::span<float> encoderOutput) const {
   std::vector<int32_t> tokenShape = {1, static_cast<int32_t>(tokens.size())};
-  auto tokensLong = std::vector<int64_t>(tokens.begin(), tokens.end());
+  std::vector<int32_t> positionShape = {static_cast<int32_t>(tokens.size())};
 
   auto tokenTensor = executorch::extension::make_tensor_ptr(
-      tokenShape, tokensLong.data(), ScalarType::Long);
+      tokenShape, tokens.data(), ScalarType::Long);
+
+  // Populate the cache position vector: consecutive positions from startPos
+  std::vector<uint64_t> cachePositions(tokens.size());
+  std::iota(cachePositions.begin(), cachePositions.end(), startPos);
+  auto positionTensor = executorch::extension::make_tensor_ptr(
+      positionShape, cachePositions.data(), ScalarType::Long);
 
   const auto encoderOutputSize = static_cast<int32_t>(encoderOutput.size());
   std::vector<int32_t> encShape = {1, ASR::kNumFrames,
@@ -333,7 +348,7 @@ std::vector<float> ASR::decode(std::span<const uint64_t> tokens,
       std::move(encShape), encoderOutput.data(), ScalarType::Float);
 
   const auto decoderResult =
-      this->decoder->forward({tokenTensor, encoderTensor});
+      this->decoder->forward({tokenTensor, positionTensor, encoderTensor});
 
   if (!decoderResult.ok()) {
     throw RnExecutorchError(decoderResult.error(),
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h
@@ -17,7 +17,7 @@ class ASR {
   transcribe(std::span<float> waveform,
              const types::DecodingOptions &options) const;
   std::vector<float> encode(std::span<float> waveform) const;
-  std::vector<float> decode(std::span<const uint64_t> tokens,
+  std::vector<float> decode(std::span<uint64_t> tokens, uint64_t startPos,
                             std::span<float> encoderOutput) const;
 
 private:
diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts
@@ -418,8 +418,10 @@ export const STYLE_TRANSFER_UDNIE = {
 
 // S2T
 const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/tokenizer.json`;
-const WHISPER_TINY_EN_ENCODER = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/xnnpack/whisper_tiny_en_encoder_xnnpack.pte`;
-const WHISPER_TINY_EN_DECODER = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/xnnpack/whisper_tiny_en_decoder_xnnpack.pte`;
+// const WHISPER_TINY_EN_ENCODER = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/xnnpack/whisper_tiny_en_encoder_xnnpack.pte`;
+// const WHISPER_TINY_EN_DECODER = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/xnnpack/whisper_tiny_en_decoder_xnnpack.pte`;
+const WHISPER_TINY_EN_ENCODER = `${URL_PREFIX}-whisper-tiny.en/resolve/kv-cache/xnnpack/whisper_tiny_en_encoder_xnnpack.pte`;
+const WHISPER_TINY_EN_DECODER = `${URL_PREFIX}-whisper-tiny.en/resolve/kv-cache/xnnpack/whisper_tiny_en_decoder_xnnpack.pte`;
 
 const WHISPER_TINY_EN_ENCODER_QUANTIZED = `${URL_PREFIX}-whisper-tiny-quantized.en/${VERSION_TAG}/xnnpack/whisper_tiny_quantized_en_encoder_xnnpack.pte`;
 const WHISPER_TINY_EN_DECODER_QUANTIZED = `${URL_PREFIX}-whisper-tiny-quantized.en/${VERSION_TAG}/xnnpack/whisper_tiny_quantized_en_decoder_xnnpack.pte`;

Original file line number	Diff line number	Diff line change
`@@ -40,7 +40,8 @@ SpeechToText::encode(std::span<float> waveform) const {`
`40`	`40`	`std::shared_ptr<OwningArrayBuffer>`
`41`	`41`	`SpeechToText::decode(std::span<uint64_t> tokens,`
`42`	`42`	`std::span<float> encoderOutput) const {`
`43`		`- std::vector<float> decoderOutput = this->asr->decode(tokens, encoderOutput);`
	`43`	`+ std::vector<float> decoderOutput =`
	`44`	`+ this->asr->decode(tokens, 0, encoderOutput);`
`44`	`45`	`return std::make_shared<OwningArrayBuffer>(decoderOutput);`
`45`	`46`	`}`
`46`	`47`