Skip to content

Commit 081aea0

Browse files
committed
Apply review suggestions
1 parent 9baccc5 commit 081aea0

File tree

10 files changed

+124
-77
lines changed

10 files changed

+124
-77
lines changed

packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,21 @@ SpeechToText::SpeechToText(const std::string &modelName,
2727
}
2828
}
2929

30+
SpeechToText::SpeechToText(SpeechToText &&other) noexcept
31+
: callInvoker_(std::move(other.callInvoker_)),
32+
transcriber_(std::move(other.transcriber_)),
33+
streamer_(std::move(other.streamer_)),
34+
isStreaming_(other.isStreaming_.load()),
35+
readyToProcess_(other.readyToProcess_.load()) {}
36+
3037
void SpeechToText::unload() noexcept { transcriber_->unload(); }
3138

3239
std::shared_ptr<OwningArrayBuffer>
3340
SpeechToText::encode(std::span<float> waveform) const {
3441
executorch::aten::Tensor encoderOutputTensor = transcriber_->encode(waveform);
3542

3643
return std::make_shared<OwningArrayBuffer>(
37-
encoderOutputTensor.const_data_ptr(),
44+
encoderOutputTensor.const_data_ptr<float>(),
3845
sizeof(float) * encoderOutputTensor.numel());
3946
}
4047

@@ -45,7 +52,7 @@ SpeechToText::decode(std::span<uint64_t> tokens,
4552
transcriber_->decode(tokens, encoderOutput);
4653

4754
return std::make_shared<OwningArrayBuffer>(
48-
decoderOutputTensor.const_data_ptr(),
55+
decoderOutputTensor.const_data_ptr<float>(),
4956
sizeof(float) * decoderOutputTensor.numel());
5057
}
5158

@@ -137,12 +144,12 @@ void SpeechToText::stream(std::shared_ptr<jsi::Function> callback,
137144
if (readyToProcess_ && streamer_->isReady()) {
138145
ProcessResult res = streamer_->process(options);
139146

140-
TranscriptionResult cRes =
147+
TranscriptionResult committedRes =
141148
wordsToResult(res.committed, languageOption, verbose);
142-
TranscriptionResult ncRes =
149+
TranscriptionResult nonCommittedRes =
143150
wordsToResult(res.nonCommitted, languageOption, verbose);
144151

145-
nativeCallback(cRes, ncRes, false);
152+
nativeCallback(committedRes, nonCommittedRes, false);
146153
readyToProcess_ = false;
147154
}
148155

@@ -151,7 +158,7 @@ void SpeechToText::stream(std::shared_ptr<jsi::Function> callback,
151158
// running transcriptions too rapidly (before the audio buffer is filled
152159
// with significant amount of new data) can cause streamer to commit wrong
153160
// phrases.
154-
std::this_thread::sleep_for(std::chrono::milliseconds(75));
161+
std::this_thread::sleep_for(std::chrono::milliseconds(100));
155162
}
156163

157164
std::vector<Word> finalWords = streamer_->finish();

packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#pragma once
22

3+
#include <atomic>
34
#include <span>
45
#include <string>
56
#include <vector>
@@ -14,10 +15,12 @@ namespace models::speech_to_text {
1415

1516
class SpeechToText {
1617
public:
17-
explicit SpeechToText(const std::string &modelName,
18-
const std::string &modelSource,
19-
const std::string &tokenizerSource,
20-
std::shared_ptr<react::CallInvoker> callInvoker);
18+
SpeechToText(const std::string &modelName, const std::string &modelSource,
19+
const std::string &tokenizerSource,
20+
std::shared_ptr<react::CallInvoker> callInvoker);
21+
22+
// Required because of std::atomic usage
23+
SpeechToText(SpeechToText &&other) noexcept;
2124

2225
void unload() noexcept;
2326
[[nodiscard(
@@ -53,8 +56,8 @@ class SpeechToText {
5356

5457
// Online ASR-like module (streaming only)
5558
std::unique_ptr<schema::OnlineASR> streamer_ = nullptr;
56-
bool isStreaming_ = false;
57-
bool readyToProcess_ = false;
59+
std::atomic<bool> isStreaming_ = false;
60+
std::atomic<bool> readyToProcess_ = false;
5861
};
5962

6063
} // namespace models::speech_to_text

packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ struct Word {
99
float start;
1010
float end;
1111

12-
std::string punctations =
13-
""; // Trailing punctations which appear after the main content
12+
std::string
13+
punctations; // Trailing punctuation which appears after the main content
1414
};
1515

1616
} // namespace rnexecutorch::models::speech_to_text

packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -161,9 +161,9 @@ class ASR : public models::BaseModel, public schema::ASR {
161161
std::unique_ptr<TokenizerModule> tokenizer_;
162162

163163
// Tokenization helper definitions
164-
const Token startOfTranscriptionToken_;
165-
const Token endOfTranscriptionToken_;
166-
const Token timestampBeginToken_;
164+
Token startOfTranscriptionToken_;
165+
Token endOfTranscriptionToken_;
166+
Token timestampBeginToken_;
167167
};
168168

169169
} // namespace rnexecutorch::models::speech_to_text::whisper

packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.cpp

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
#include "HypothesisBuffer.h"
22
#include "Params.h"
33
#include "Utils.h"
4+
5+
#include <algorithm>
46
#include <cmath>
57

68
namespace rnexecutorch::models::speech_to_text::whisper::stream {
@@ -17,7 +19,9 @@ void HypothesisBuffer::insert(std::span<const Word> words, float offset) {
1719
size_t firstFreshWordIdx = 0;
1820
if (!committed_.empty()) {
1921
std::optional<size_t> lastMatchingWordIdx =
20-
findCommittedSuffix(words, 5, 6.F, 5);
22+
findCommittedSuffix(words, params::kStreamCommitedSuffixSearchSize,
23+
params::kStreamMaxOverlapTimestampDiff1,
24+
params::kStreamWordsPerErrorRate);
2125
firstFreshWordIdx = lastMatchingWordIdx.value_or(0);
2226
}
2327

@@ -48,7 +52,7 @@ void HypothesisBuffer::insert(std::span<const Word> words, float offset) {
4852
// which were just repeated after some time.
4953
size_t overlapSize = utils::findLargestOverlapingFragment(
5054
committed_, fresh_, params::kStreamMaxOverlapSize,
51-
params::kStreamMaxOverlapTimestampDiff);
55+
params::kStreamMaxOverlapTimestampDiff2);
5256

5357
if (overlapSize > 0) {
5458
fresh_.erase(fresh_.begin(), fresh_.begin() + overlapSize);
@@ -124,24 +128,24 @@ std::optional<size_t> HypothesisBuffer::findCommittedSuffix(
124128

125129
// Iterate backwards through 'words' to find the most recent occurrence of a
126130
// suffix of 'committed_' (or the full 'committed_' sequence).
127-
for (int i = static_cast<int>(words.size()) - 1; i >= 0; --i) {
131+
for (int32_t i = static_cast<int32_t>(words.size()) - 1; i >= 0; --i) {
128132
bool match = true;
129133
size_t matchedCount = 0;
130134
size_t contentMistakeCount = 0;
131135

132136
// Linearly interpolate tolerance if we are at the beginning and can't check
133137
// all committed words.
134138
float effectiveTolerance = timestampDiffTolerance;
135-
if (i < static_cast<int>(committedToMatchSize) - 1) {
139+
if (i < static_cast<int32_t>(committedToMatchSize) - 1) {
136140
effectiveTolerance *=
137141
static_cast<float>(i + 1) / static_cast<float>(committedToMatchSize);
138142
}
139143

140144
// Try to match backwards from words[i] and committed_.back()
141145
for (size_t j = 0; j < committedToMatchSize; ++j) {
142-
int wordsIdx = i - static_cast<int>(j);
143-
int committedIdx =
144-
static_cast<int>(committed_.size()) - 1 - static_cast<int>(j);
146+
int32_t wordsIdx = i - static_cast<int32_t>(j);
147+
int32_t committedIdx =
148+
static_cast<int32_t>(committed_.size()) - 1 - static_cast<int32_t>(j);
145149

146150
if (wordsIdx < 0) {
147151
// We reached the beginning of the words span.
@@ -153,8 +157,8 @@ std::optional<size_t> HypothesisBuffer::findCommittedSuffix(
153157
const Word &w2 = committed_[committedIdx];
154158

155159
// Check timestamps within tolerance
156-
if (std::abs(w1.start - w2.start) > effectiveTolerance ||
157-
std::abs(w1.end - w2.end) > effectiveTolerance) {
160+
if (std::max(std::abs(w1.start - w2.start), std::abs(w1.end - w2.end)) >
161+
effectiveTolerance) {
158162
match = false;
159163
break;
160164
}

packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp

Lines changed: 35 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ OnlineASR::OnlineASR(const ASR *asr) : asr_(asr) {
2424
}
2525

2626
void OnlineASR::insertAudioChunk(std::span<const float> audio) {
27-
std::lock_guard<std::mutex> lock(audioBufferMutex_);
27+
std::scoped_lock<std::mutex> lock(audioBufferMutex_);
2828
audioBuffer_.insert(audioBuffer_.end(), audio.begin(), audio.end());
2929
}
3030

@@ -33,10 +33,16 @@ bool OnlineASR::isReady() const {
3333
}
3434

3535
ProcessResult OnlineASR::process(const DecodingOptions &options) {
36-
std::unique_lock<std::mutex> lock(audioBufferMutex_);
36+
std::vector<float> audioCopy;
37+
38+
// Copy the audio buffer to avoid keeping the lock during the entire
39+
// transcription process.
40+
{
41+
std::scoped_lock<std::mutex> lock(audioBufferMutex_);
42+
audioCopy = audioBuffer_;
43+
}
3744

3845
std::vector<Segment> transcriptions = asr_->transcribe(audioBuffer_, options);
39-
lock.unlock();
4046

4147
if (transcriptions.empty()) {
4248
return {.committed = {}, .nonCommitted = {}};
@@ -106,30 +112,32 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) {
106112

107113
// Since Whisper does not accept waveforms longer than 30 seconds, we need
108114
// to cut the audio at some safe point.
109-
lock.lock();
110-
const float audioDuration =
111-
static_cast<float>(audioBuffer_.size()) / constants::kSamplingRate;
112-
if (audioDuration > params::kStreamChunkThreshold) {
113-
// Leave some portion of audio in, to improve model behavior
114-
// in future iterations.
115-
const float erasePoint =
116-
hypothesisBuffer_.lastCommittedTime_ == lastSentenceEnd_
117-
? audioDuration
118-
: std::min(lastSentenceEnd_, params::kStreamChunkThreshold);
119-
const float minEraseDuration =
120-
audioDuration - params::kStreamAudioBufferMaxReserve;
121-
const float maxEraseDuration =
122-
audioDuration - params::kStreamAudioBufferMinReserve;
123-
const float eraseDuration = std::clamp(erasePoint - bufferTimeOffset_,
124-
minEraseDuration, maxEraseDuration);
125-
const size_t nSamplesToErase =
126-
static_cast<size_t>(eraseDuration * constants::kSamplingRate);
127-
128-
audioBuffer_.erase(audioBuffer_.begin(),
129-
audioBuffer_.begin() + nSamplesToErase);
130-
bufferTimeOffset_ += eraseDuration;
115+
{
116+
std::scoped_lock<std::mutex> lock(audioBufferMutex_);
117+
118+
const float audioDuration =
119+
static_cast<float>(audioBuffer_.size()) / constants::kSamplingRate;
120+
if (audioDuration > params::kStreamChunkThreshold) {
121+
// Leave some portion of audio in, to improve model behavior
122+
// in future iterations.
123+
const float erasePoint =
124+
hypothesisBuffer_.lastCommittedTime_ == lastSentenceEnd_
125+
? audioDuration
126+
: std::min(lastSentenceEnd_, params::kStreamChunkThreshold);
127+
const float minEraseDuration =
128+
audioDuration - params::kStreamAudioBufferMaxReserve;
129+
const float maxEraseDuration =
130+
audioDuration - params::kStreamAudioBufferMinReserve;
131+
const float eraseDuration = std::clamp(
132+
erasePoint - bufferTimeOffset_, minEraseDuration, maxEraseDuration);
133+
const size_t nSamplesToErase =
134+
static_cast<size_t>(eraseDuration * constants::kSamplingRate);
135+
136+
audioBuffer_.erase(audioBuffer_.begin(),
137+
audioBuffer_.begin() + nSamplesToErase);
138+
bufferTimeOffset_ += eraseDuration;
139+
}
131140
}
132-
lock.unlock();
133141

134142
return {.committed = move_to_vector(committed),
135143
.nonCommitted = move_to_vector(nonCommitted)};
@@ -144,7 +152,7 @@ std::vector<Word> OnlineASR::finish() {
144152
}
145153

146154
void OnlineASR::reset() {
147-
std::lock_guard<std::mutex> lock(audioBufferMutex_);
155+
std::scoped_lock<std::mutex> lock(audioBufferMutex_);
148156

149157
hypothesisBuffer_.reset();
150158
bufferTimeOffset_ = 0.f;

packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ class OnlineASR : public schema::OnlineASR {
6666
// Stores the increasing amounts of streamed audio.
6767
// Cleared from time to time after reaching a threshold size.
6868
std::vector<float> audioBuffer_ = {};
69-
std::mutex audioBufferMutex_;
69+
mutable std::mutex audioBufferMutex_;
7070
float bufferTimeOffset_ = 0.F; // Audio buffer offset
7171

7272
// Helper buffers - hypothesis buffer

packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,17 @@ constexpr static int32_t kChunkBreakBuffer = 2; // [s]
2525
* Determines the maximum timestamp difference available for a word to be
2626
* considered as fresh in streaming algorithm.
2727
*/
28-
constexpr static float kStreamFreshThreshold = 2.F; // [s], originally 0.5
28+
constexpr static float kStreamFreshThreshold = 3.F; // [s], originally 0.5
29+
30+
/**
31+
* The size of the most recent committed suffix searched in
32+
* fresh words string.
33+
*
34+
* For example, if the committed buffer contains ["I", "did", "a", "very", "nasty",
35+
* "thing."], and kStreamCommitedSuffixSearchSize = 3, then we search for
36+
* ["very", "nasty", "thing."] suffix.
37+
*/
38+
constexpr static size_t kStreamCommitedSuffixSearchSize = 5;
2939

3040
/**
3141
* Determines the maximum expected size of overlapping fragments between
@@ -40,8 +50,28 @@ constexpr static size_t kStreamMaxOverlapSize =
4050
/**
4151
* Similar to kMaxStreamOverlapSize, but this one determines
4252
* the maximum allowed timestamp difference between the overlapping fragments.
53+
*
54+
* It's the first, more strict threshold, used when searching for recently
55+
* committed entries.
56+
*/
57+
constexpr static float kStreamMaxOverlapTimestampDiff1 = 6.F; // [s]
58+
59+
/**
60+
* Similar to kMaxStreamOverlapSize, but this one determines
61+
* the maximum allowed timestamp difference between the overlapping fragments.
62+
*
63+
* It's the second, more liberal threshold, used in overlap correction
64+
* algorithm.
65+
*/
66+
constexpr static float kStreamMaxOverlapTimestampDiff2 = 15.F; // [s]
67+
68+
/**
69+
* Number of words per 1 allowed mistake (error correction).
70+
*
71+
* For example, if kStreamWordsPerErrorRate = 4, then we allow maximum 1 mistake
72+
* in a 4 word string.
4373
*/
44-
constexpr static float kStreamMaxOverlapTimestampDiff = 15.F; // [s]
74+
constexpr static size_t kStreamWordsPerErrorRate = 5;
4575

4676
/**
4777
* A threshold which, when exceeded, causes the main streaming audio buffer to be

packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,9 @@ namespace rnexecutorch::models::speech_to_text::whisper::utils {
1010

1111
// Compares two strings without case-sensitivity.
1212
inline bool equalsIgnoreCase(const std::string &a, const std::string &b) {
13-
if (a.size() != b.size())
13+
if (a.size() != b.size()) {
1414
return false;
15+
}
1516
return std::equal(a.begin(), a.end(), b.begin(), [](char c1, char c2) {
1617
return std::tolower(static_cast<unsigned char>(c1)) ==
1718
std::tolower(static_cast<unsigned char>(c2));
@@ -55,13 +56,14 @@ inline size_t findLargestOverlapingFragment(const Container &suffixVec,
5556
if (equalsIgnoreCase(suffixVec[i].content, prefixVec[0].content)) {
5657
size_t calculatedSize = suffixVec.size() - i;
5758

58-
bool isEqual = std::equal(
59-
suffixVec.begin() + i, suffixVec.end(), prefixVec.begin(),
60-
[maxTimestampDiff](const Word &sWord, const Word &pWord) {
61-
return equalsIgnoreCase(sWord.content, pWord.content) &&
62-
std::fabs(sWord.start - pWord.start) <= maxTimestampDiff &&
63-
std::fabs(sWord.end - pWord.end) <= maxTimestampDiff;
64-
});
59+
bool isEqual =
60+
std::equal(suffixVec.begin() + i, suffixVec.end(), prefixVec.begin(),
61+
[maxTimestampDiff](const Word &sWord, const Word &pWord) {
62+
return equalsIgnoreCase(sWord.content, pWord.content) &&
63+
std::max(std::fabs(sWord.start - pWord.start),
64+
std::fabs(sWord.end - pWord.end)) <=
65+
maxTimestampDiff;
66+
});
6567

6668
if (isEqual) {
6769
return calculatedSize;

0 commit comments

Comments
 (0)