Remove special tokens

IgorSwat · IgorSwat · commit 7b1e6ff13851 · 2026-03-02T10:04:12.000+01:00
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
@@ -344,7 +344,6 @@ std::vector<Segment> ASR::calculateWordLevelTimestamps(
       if (words.size()) {
         Segment seg;
         seg.words = std::move(words);
-        // seg.tokens = {};  // WTF ?
         seg.tokens = tokens;
         seg.avgLogprob = avgLogProb;
         seg.temperature = temperature;
@@ -409,8 +408,12 @@ ASR::estimateWordLevelTimestampsLinear(std::span<const uint64_t> tokens,
   std::vector<std::string> wordsStr;
   std::string word;
   while (iss >> word) {
-    wordsStr.emplace_back(" ");
-    wordsStr.back().append(word);
+    // Skip special tokens such as [BLANK_AUDIO]: any word containing an
+    // opening square bracket is excluded from the word list.
+    if (word.find('[') == std::string::npos) {
+      wordsStr.emplace_back(" ");
+      wordsStr.back().append(word);
+    }
   }
 
   size_t numChars = 0;
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
@@ -92,9 +92,9 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) {
       // (assuming some fixed words per second frequency).
       const float freshDuration = newEnd - establishedEnd;
       const float epsilon = std::max(
-          0.F, 0.8F * (freshDuration -
-                       static_cast<float>(noNewWords /
-                                          params::kStreamWordsPerSecond)));
+          0.F, 0.85F * (freshDuration -
+                        static_cast<float>(noNewWords /
+                                           params::kStreamWordsPerSecond)));
       const float beforeScaleStart = hypothesisBuffer_.fresh_[i].start;
       const float beforeScaleEnd = hypothesisBuffer_.fresh_[i].end;
       float scale = (freshDuration - epsilon) / (newEnd - newBegin);
@@ -134,7 +134,7 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) {
 std::vector<Word> OnlineASR::finish() {
   // We always push the last remaining hypothesis, even if it's not
   // confirmed in second iteration.
-  auto remaining = hypothesisBuffer_.hypothesis_;
+  std::deque<Word> remaining = hypothesisBuffer_.hypothesis_;
 
   reset();