Remove special tokens

IgorSwat · IgorSwat · commit 2ee6d1d22533 · 2026-03-02T10:21:28.000+01:00
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
@@ -241,12 +241,6 @@ std::vector<Segment> ASR::generate(std::span<float> waveform,
     }
   }
 
-  rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
-                    "[ASR] Raw transcription results (tokens): ", bestTokens);
-  rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
-                    "[ASR] Raw transcription results (text): ",
-                    tokenizer_->decode(bestTokens, true));
-
   return this->calculateWordLevelTimestamps(bestTokens, waveform,
                                             bestAvgLogProb, bestTemperature,
                                             bestCompressionRatio);
@@ -344,7 +338,6 @@ std::vector<Segment> ASR::calculateWordLevelTimestamps(
       if (words.size()) {
         Segment seg;
         seg.words = std::move(words);
-        // seg.tokens = {};  // WTF ?
         seg.tokens = tokens;
         seg.avgLogprob = avgLogProb;
         seg.temperature = temperature;
@@ -369,17 +362,19 @@ std::vector<Segment> ASR::calculateWordLevelTimestamps(
   const uint64_t end = generatedTokens[generatedTokensSize - 2];
   auto words = this->estimateWordLevelTimestampsLinear(tokens, start, end);
 
+  if (words.empty()) {
+    return {};
+  }
+
   Segment seg;
   seg.words = std::move(words);
   seg.tokens = tokens;
   seg.avgLogprob = avgLogProb;
   seg.temperature = temperature;
   seg.compressionRatio = compressionRatio;
 
-  if (!seg.words.empty()) {
-    seg.start = seg.words.front().start;
-    seg.end = seg.words.back().end;
-  }
+  seg.start = seg.words.front().start;
+  seg.end = seg.words.back().end;
 
   segments.push_back(std::move(seg));
 
@@ -409,8 +404,12 @@ ASR::estimateWordLevelTimestampsLinear(std::span<const uint64_t> tokens,
   std::vector<std::string> wordsStr;
   std::string word;
   while (iss >> word) {
-    wordsStr.emplace_back(" ");
-    wordsStr.back().append(word);
+    // Skip special tokens such as [BLANK_AUDIO]: any word containing an
+    // opening square bracket is dropped.
+    if (word.find('[') == std::string::npos) {
+      wordsStr.emplace_back(" ");
+      wordsStr.back().append(word);
+    }
   }
 
   size_t numChars = 0;
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
@@ -92,11 +92,9 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) {
       // (assuming some fixed words per second frequency).
       const float freshDuration = newEnd - establishedEnd;
       const float epsilon = std::max(
-          0.F, 0.8F * (freshDuration -
-                       static_cast<float>(noNewWords /
-                                          params::kStreamWordsPerSecond)));
-      const float beforeScaleStart = hypothesisBuffer_.fresh_[i].start;
-      const float beforeScaleEnd = hypothesisBuffer_.fresh_[i].end;
+          0.F, 0.85F * (freshDuration -
+                        static_cast<float>(noNewWords /
+                                           params::kStreamWordsPerSecond)));
       float scale = (freshDuration - epsilon) / (newEnd - newBegin);
       hypothesisBuffer_.fresh_[i].start =
           (hypothesisBuffer_.fresh_[i].start - newEnd) * scale + newEnd;
@@ -134,9 +132,7 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) {
 std::vector<Word> OnlineASR::finish() {
   // We always push the last remaining hypothesis, even if it's not
   // confirmed in second iteration.
-  auto remaining = hypothesisBuffer_.hypothesis_;
-
-  reset();
+  std::deque<Word> remaining = hypothesisBuffer_.hypothesis_;
 
   return move_to_vector(remaining);
 }