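Final nits: guard audioBuffer_ with a mutex, use params::kStreamFreshThreshold (raised to 2 s) instead of a hard-coded 3 s when filtering fresh words, and drop unused locals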

IgorSwat · IgorSwat · commit ae017ef2604f · 2026-03-06T17:12:08.000+01:00
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.cpp
@@ -21,6 +21,7 @@ void HypothesisBuffer::insert(std::span<const Word> words, float offset) {
     firstFreshWordIdx = lastMatchingWordIdx.value_or(0);
   }
 
+  bool isCompletelyFresh = firstFreshWordIdx == 0;
   for (size_t i = firstFreshWordIdx; i < words.size(); i++) {
     const auto &word = words[i];
 
@@ -29,7 +30,8 @@ void HypothesisBuffer::insert(std::span<const Word> words, float offset) {
     const float startGlobal = word.start + offset;
     const float endGlobal = word.end + offset;
 
-    if (startGlobal > lastCommittedTime_ - 3.F) {
+    if (!isCompletelyFresh ||
+        startGlobal > lastCommittedTime_ - params::kStreamFreshThreshold) {
       fresh_.emplace_back(word.content, startGlobal, endGlobal,
                           word.punctations);
     }
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
@@ -24,6 +24,7 @@ OnlineASR::OnlineASR(const ASR *asr) : asr_(asr) {
 }
 
 void OnlineASR::insertAudioChunk(std::span<const float> audio) {
+  std::lock_guard<std::mutex> lock(audioBufferMutex_);
   audioBuffer_.insert(audioBuffer_.end(), audio.begin(), audio.end());
 }
 
@@ -32,7 +33,10 @@ bool OnlineASR::isReady() const {
 }
 
 ProcessResult OnlineASR::process(const DecodingOptions &options) {
+  std::unique_lock<std::mutex> lock(audioBufferMutex_);
+
   std::vector<Segment> transcriptions = asr_->transcribe(audioBuffer_, options);
+  lock.unlock();
 
   if (transcriptions.empty()) {
     return {.committed = {}, .nonCommitted = {}};
@@ -57,9 +61,7 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) {
     const float newEnd = hypothesisBuffer_.fresh_.back().end;
     float shift = 0.F;
     for (size_t i = 0; i < hypothesisBuffer_.fresh_.size(); i++) {
-      const float originalStart = hypothesisBuffer_.fresh_[i].start;
       const float originalEnd = hypothesisBuffer_.fresh_[i].end;
-      const std::string &wordContent = hypothesisBuffer_.fresh_[i].content;
 
       if (i < hypothesisBuffer_.hypothesis_.size() &&
           utils::equalsIgnoreCase(hypothesisBuffer_.fresh_[i].content,
@@ -104,6 +106,7 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) {
 
   // Since Whisper does not accept waveforms longer than 30 seconds, we need
   // to cut the audio at some safe point.
+  lock.lock();
   const float audioDuration =
       static_cast<float>(audioBuffer_.size()) / constants::kSamplingRate;
   if (audioDuration > params::kStreamChunkThreshold) {
@@ -126,6 +129,7 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) {
                        audioBuffer_.begin() + nSamplesToErase);
     bufferTimeOffset_ += eraseDuration;
   }
+  lock.unlock();
 
   return {.committed = move_to_vector(committed),
           .nonCommitted = move_to_vector(nonCommitted)};
@@ -140,6 +144,8 @@ std::vector<Word> OnlineASR::finish() {
 }
 
 void OnlineASR::reset() {
+  std::lock_guard<std::mutex> lock(audioBufferMutex_);
+
   hypothesisBuffer_.reset();
   bufferTimeOffset_ = 0.f;
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h
@@ -7,6 +7,8 @@
 #include "ASR.h"
 #include "HypothesisBuffer.h"
 
+#include <mutex>
+
 namespace rnexecutorch::models::speech_to_text::whisper::stream {
 
 /**
@@ -64,6 +66,7 @@ class OnlineASR : public schema::OnlineASR {
   // Stores the increasing amounts of streamed audio.
   // Cleared from time to time after reaching a threshold size.
   std::vector<float> audioBuffer_ = {};
+  std::mutex audioBufferMutex_;
   float bufferTimeOffset_ = 0.F; // Audio buffer offset
 
   // Helper buffers - hypothesis buffer
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h
@@ -25,7 +25,7 @@ constexpr static int32_t kChunkBreakBuffer = 2; // [s]
  * Determines the maximum timestamp difference available for a word to be
  * considered as fresh in streaming algorithm.
  */
-constexpr static float kStreamFreshThreshold = 1.F; // [s], originally 0.5
+constexpr static float kStreamFreshThreshold = 2.F; // [s], originally 0.5
 
 /**
  * Determines the maximum expected size of overlapping fragments between

Original file line number	Diff line number	Diff line change
`@@ -21,6 +21,7 @@ void HypothesisBuffer::insert(std::span<const Word> words, float offset) {`
`21`	`21`	`firstFreshWordIdx = lastMatchingWordIdx.value_or(0);`
`22`	`22`	`}`
`23`	`23`
	`24`	`+ bool isCompletelyFresh = firstFreshWordIdx == 0;`
`24`	`25`	`for (size_t i = firstFreshWordIdx; i < words.size(); i++) {`
`25`	`26`	`const auto &word = words[i];`
`26`	`27`
`@@ -29,7 +30,8 @@ void HypothesisBuffer::insert(std::span<const Word> words, float offset) {`
`29`	`30`	`const float startGlobal = word.start + offset;`
`30`	`31`	`const float endGlobal = word.end + offset;`
`31`	`32`
`32`		`- if (startGlobal > lastCommittedTime_ - 3.F) {`
	`33`	`+ if (!isCompletelyFresh \|\|`
	`34`	`+ startGlobal > lastCommittedTime_ - params::kStreamFreshThreshold) {`
`33`	`35`	`fresh_.emplace_back(word.content, startGlobal, endGlobal,`
`34`	`36`	`word.punctations);`
`35`	`37`	`}`