
Commit ce5a39a

Various STT streaming fixes

Parent: b54e469

6 files changed: 304 additions & 39 deletions

packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp

Lines changed: 30 additions & 8 deletions
@@ -4,11 +4,14 @@
 
 #include "ASR.h"
 #include "Constants.h"
+#include "Params.h"
 #include <executorch/extension/tensor/tensor_ptr.h>
 #include <rnexecutorch/Error.h>
 #include <rnexecutorch/data_processing/Numerical.h>
 #include <rnexecutorch/data_processing/gzip.h>
 
+#include <rnexecutorch/Log.h>
+
 namespace rnexecutorch::models::speech_to_text::whisper {
 
 using executorch::runtime::etensor::ScalarType;
@@ -30,18 +33,24 @@ ASR::ASR(const std::string &modelSource, const std::string &tokenizerSource,
 */
 std::vector<Segment> ASR::transcribe(std::span<float> waveform,
                                      const DecodingOptions &options) const {
-  int32_t seek = 0;
+  // Use floats to prevent downcasting and timestamp mismatches
+  float seek = 0.f;
   std::vector<Segment> results;
 
+  const float waveformSize = static_cast<float>(waveform.size());
+  const float waveformSkipBoundary =
+      static_cast<float>((constants::kChunkSize - params::kChunkBreakBuffer) *
+                         constants::kSamplingRate);
+
   // We loop through the input audio waveform and process it in 30s chunks.
   // This is determined by Whisper models' strict 30s audio length requirement.
-  while (std::cmp_less(seek * constants::kSamplingRate, waveform.size())) {
+  while (seek * constants::kSamplingRate < waveformSize) {
     // Calculate chunk bounds and extract the chunk.
-    int32_t start = seek * constants::kSamplingRate;
+    float start = seek * constants::kSamplingRate;
     const auto end =
-        std::min<int32_t>(static_cast<int32_t>((seek + constants::kChunkSize) *
-                                               constants::kSamplingRate),
-                          static_cast<int32_t>(waveform.size()));
+        std::min<float>(static_cast<float>((seek + constants::kChunkSize) *
+                                           constants::kSamplingRate),
+                        waveformSize);
     auto chunk = waveform.subspan(start, end - start);
 
     if (std::cmp_less(chunk.size(), constants::kMinChunkSamples)) {
@@ -71,7 +80,12 @@ std::vector<Segment> ASR::transcribe(std::span<float> waveform,
     }
 
     if (!segments.empty() && !segments.back().words.empty()) {
-      seek = static_cast<int32_t>(segments.back().words.back().end);
+      // This prevents additional segments from appearing unless the audio
+      // length is very close to the max chunk size, i.e. some words could be
+      // spoken near the breakpoint.
+      seek = waveformSize < waveformSkipBoundary
+                 ? seek + constants::kChunkSize
+                 : segments.back().words.back().end;
     }
     results.insert(results.end(), std::make_move_iterator(segments.begin()),
                    std::make_move_iterator(segments.end()));
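The new seek policy above is the core of the transcribe() fix. Here is a minimal, self-contained sketch of the same chunking loop, with placeholder constants standing in for constants::kChunkSize, constants::kSamplingRate, and params::kChunkBreakBuffer, and a fixed stand-in for the last decoded word's end time (which the real code takes from the model's word-level timestamps):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Placeholder values; the real ones live in Constants.h / Params.h.
constexpr float kChunkSize = 30.0f;       // seconds per Whisper window
constexpr int32_t kSamplingRate = 16000;  // samples per second
constexpr float kChunkBreakBuffer = 2.0f; // hypothetical value, in seconds

int main() {
  std::vector<float> waveform(75 * kSamplingRate); // 75 s of audio
  const float waveformSize = static_cast<float>(waveform.size());
  const float waveformSkipBoundary =
      (kChunkSize - kChunkBreakBuffer) * kSamplingRate;

  // Keeping seek as float means chunk bounds and word timestamps are
  // computed in the same domain, so no downcast can make them disagree.
  float seek = 0.f;
  while (seek * kSamplingRate < waveformSize) {
    const float start = seek * kSamplingRate;
    const float end =
        std::min((seek + kChunkSize) * kSamplingRate, waveformSize);
    std::printf("chunk: [%.0f, %.0f) samples\n", start, end);

    // Stand-in for segments.back().words.back().end.
    const float lastWordEnd = seek + kChunkSize - 0.3f;

    // Short inputs jump a whole chunk; long inputs resume at the last
    // decoded word so speech near the 30 s breakpoint is not lost.
    seek = waveformSize < waveformSkipBoundary ? seek + kChunkSize
                                               : lastWordEnd;
  }
}

For a 75 s input this prints three chunks, each resuming slightly before the previous 30 s boundary.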
@@ -226,6 +240,12 @@ std::vector<Segment> ASR::generate(std::span<float> waveform,
     }
   }
 
+  rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
+                    "[ASR] Raw transcription results (tokens): ", bestTokens);
+  rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
+                    "[ASR] Raw transcription results (text): ",
+                    tokenizer_->decode(bestTokens, true));
+
   return this->calculateWordLevelTimestamps(bestTokens, waveform,
                                             bestAvgLogProb, bestTemperature,
                                             bestCompressionRatio);
@@ -323,7 +343,8 @@ std::vector<Segment> ASR::calculateWordLevelTimestamps(
   if (words.size()) {
     Segment seg;
     seg.words = std::move(words);
-    seg.tokens = {};
+    // Keep the segment's tokens instead of clearing them.
+    seg.tokens = tokens;
     seg.avgLogprob = avgLogProb;
     seg.temperature = temperature;
     seg.compressionRatio = compressionRatio;
@@ -382,6 +403,7 @@ ASR::estimateWordLevelTimestampsLinear(std::span<const uint64_t> tokens,
                                        uint64_t start, uint64_t end) const {
   const std::vector<uint64_t> tokensVec(tokens.begin(), tokens.end());
   const std::string segmentText = tokenizer_->decode(tokensVec, true);
+
   std::istringstream iss(segmentText);
   std::vector<std::string> wordsStr;
   std::string word;
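The transcribe() hunks above pull params::kChunkBreakBuffer from a new Params.h that this commit view does not include, and the HypothesisBuffer diff further below references three more values from the same header. Judging only from the call sites, a plausible shape is sketched here; the identifiers are the ones the diff uses, but every value is a guess, not the commit's actual tuning:

#pragma once

#include <cstddef>

namespace rnexecutorch::models::speech_to_text::whisper::params {

// Seconds subtracted from the 30 s window when deciding whether the input
// is long enough to need word-boundary-aware seeking. (Hypothetical value.)
constexpr float kChunkBreakBuffer = 3.0f;

// Words starting earlier than lastCommittedTime_ minus this threshold are
// dropped as stale on insert; replaces the old 0.5f literal.
// (Hypothetical value.)
constexpr float kStreamFreshThreshold = 1.0f;

// Longest committed/fresh overlap searched for, in words; replaces the old
// literal 5. (Hypothetical value.)
constexpr std::size_t kStreamMaxOverlapSize = 5;

// Maximum start-time difference tolerated when matching overlapping words.
// (Hypothetical value.)
constexpr float kStreamMaxOverlapTimestampDiff = 1.0f;

} // namespace rnexecutorch::models::speech_to_text::whisper::params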

packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h

Lines changed: 5 additions & 3 deletions
@@ -21,15 +21,17 @@ constexpr static int32_t kNumFrames = 1500;
 
 // Sampling rate expected by Whisper and the model's audio pipeline (16 kHz)
 constexpr static int32_t kSamplingRate = 16000;
+constexpr static int32_t kSamplesPerMilisecond = kSamplingRate / 1000;
 
 // Time precision used by Whisper timestamps: each token spans 0.02 seconds
 constexpr static float kTimePrecision = 0.02f;
 
 // Special token constants
 namespace tokens {
-inline const std::string kStartOfTranscript = "<|startoftranscript|>";
-inline const std::string kEndOfTranscript = "<|endoftext|>";
-inline const std::string kBeginTimestamp = "<|0.00|>";
+static const std::string kStartOfTranscript = "<|startoftranscript|>";
+static const std::string kEndOfTranscript = "<|endoftext|>";
+static const std::string kBeginTimestamp = "<|0.00|>";
+static const std::string kBlankAudio = "[BLANK_AUDIO]";
 } // namespace tokens
 
 } // namespace rnexecutorch::models::speech_to_text::whisper::constants
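The new kSamplesPerMilisecond constant (16 at the 16 kHz rate; the identifier keeps the commit's spelling) is a small integer bridge between millisecond timestamps and sample offsets. A quick arithmetic check:

#include <cstdint>
#include <cstdio>

constexpr int32_t kSamplingRate = 16000;
constexpr int32_t kSamplesPerMilisecond = kSamplingRate / 1000; // = 16

int main() {
  const int32_t ms = 250; // a 250 ms span, for example
  std::printf("%d ms -> %d samples\n", ms, ms * kSamplesPerMilisecond);
  // prints: 250 ms -> 4000 samples
}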
HypothesisBuffer.cpp

Lines changed: 58 additions & 26 deletions
@@ -1,45 +1,56 @@
 #include "HypothesisBuffer.h"
+#include "Params.h"
+#include "Utils.h"
+#include <cmath>
+#include <rnexecutorch/Log.h>
 
 namespace rnexecutorch::models::speech_to_text::whisper::stream {
 
 void HypothesisBuffer::insert(std::span<const Word> newWords, float offset) {
+  rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
+                    "[HypothesisBuffer] Inserting " +
+                        std::to_string(newWords.size()) +
+                        " words with offset " + std::to_string(offset) + "s.");
+
   fresh_.clear();
   for (const auto &word : newWords) {
     const float newStart = word.start + offset;
-    if (newStart > lastCommittedTime_ - 0.5f) {
+    // Only accept words that start after or near the last committed time to
+    // avoid stale data
+    if (newStart > lastCommittedTime_ - params::kStreamFreshThreshold) {
       fresh_.emplace_back(word.content, newStart, word.end + offset);
     }
   }
+  rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
+                    "[HypothesisBuffer] Filtered " +
+                        std::to_string(fresh_.size()) +
+                        " words into 'fresh' buffer.");
 
   if (!fresh_.empty() && !committedInBuffer_.empty()) {
     const float a = fresh_.front().start;
-    if (std::fabs(a - lastCommittedTime_) < 1.0f) {
+    // Check for overlap with already committed history to avoid duplicates in
+    // the stream
+    if (std::fabs(a - lastCommittedTime_) < 2.0f) {
       const size_t cn = committedInBuffer_.size();
       const size_t nn = fresh_.size();
-      const std::size_t maxCheck = std::min<std::size_t>({cn, nn, 5});
-      for (size_t i = 1; i <= maxCheck; i++) {
-        std::string c;
-        for (auto it = committedInBuffer_.cend() - i;
-             it != committedInBuffer_.cend(); ++it) {
-          if (!c.empty()) {
-            c += ' ';
-          }
-          c += it->content;
-        }
-
-        std::string tail;
-        auto it = fresh_.cbegin();
-        for (size_t k = 0; k < i; k++, it++) {
-          if (!tail.empty()) {
-            tail += ' ';
-          }
-          tail += it->content;
-        }
-
-        if (c == tail) {
-          fresh_.erase(fresh_.begin(), fresh_.begin() + i);
-          break;
-        }
+
+      rnexecutorch::log(
+          rnexecutorch::LOG_LEVEL::Info,
+          "[HypothesisBuffer] Checking for overlap. cn=" + std::to_string(cn) +
+              ", nn=" + std::to_string(nn) +
+              ", maxCheck=" + std::to_string(params::kStreamMaxOverlapSize));
+
+      size_t overlapSize = utils::findLargestOverlapingFragment(
+          committedInBuffer_, fresh_, params::kStreamMaxOverlapSize,
+          params::kStreamMaxOverlapTimestampDiff);
+
+      if (overlapSize > 0) {
+        rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
+                          "[HypothesisBuffer] Detected overlap of " +
+                              std::to_string(overlapSize) +
+                              " words with committed history. Erasing "
+                              "duplicates from 'fresh'.");
+        fresh_.erase(fresh_.begin(), fresh_.begin() + overlapSize);
       }
     }
   }
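The hand-rolled string-concatenation loop is replaced by utils::findLargestOverlapingFragment from a Utils.h that is not shown in this commit view. A sketch of an implementation consistent with the call site above follows; the Word fields mirror the ones used throughout the diff, and the per-word timestamp check is an assumption (the removed code compared text only):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <deque>
#include <string>

struct Word {
  std::string content;
  float start;
  float end;
};

// Largest n (up to maxSize) such that the last n committed words match the
// first n fresh words, comparing content exactly and start times to within
// maxTimestampDiff seconds. Returns 0 when nothing lines up.
std::size_t findLargestOverlapingFragment(const std::deque<Word> &committed,
                                          const std::deque<Word> &fresh,
                                          std::size_t maxSize,
                                          float maxTimestampDiff) {
  const std::size_t limit = std::min({committed.size(), fresh.size(), maxSize});
  for (std::size_t n = limit; n > 0; --n) {
    bool match = true;
    for (std::size_t k = 0; k < n; ++k) {
      const Word &c = committed[committed.size() - n + k];
      const Word &f = fresh[k];
      if (c.content != f.content ||
          std::fabs(c.start - f.start) > maxTimestampDiff) {
        match = false;
        break;
      }
    }
    if (match) {
      return n;
    }
  }
  return 0;
}

int main() {
  std::deque<Word> committed{{"the", 0.0f, 0.2f}, {"quick", 0.2f, 0.5f}};
  std::deque<Word> fresh{{"quick", 0.25f, 0.5f}, {"brown", 0.5f, 0.8f}};
  std::printf("overlap = %zu\n",
              findLargestOverlapingFragment(committed, fresh, 5, 1.0f));
  // prints: overlap = 1 ("quick" repeats at the window seam)
}

Note the sketch searches largest-first, matching the function's name; the removed loop stopped at the first (smallest) match.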
@@ -48,6 +59,8 @@ void HypothesisBuffer::insert(std::span<const Word> newWords, float offset) {
 std::deque<Word> HypothesisBuffer::flush() {
   std::deque<Word> commit;
 
+  // Find stable prefix: words that haven't changed between last and current
+  // iteration
   while (!fresh_.empty() && !buffer_.empty()) {
     if (fresh_.front().content != buffer_.front().content) {
       break;
@@ -59,19 +72,36 @@ std::deque<Word> HypothesisBuffer::flush() {
 
   if (!commit.empty()) {
     lastCommittedTime_ = commit.back().end;
+    rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
+                      "[HypothesisBuffer] Found stable prefix. Committing " +
+                          std::to_string(commit.size()) +
+                          " words. New lastCommittedTime: " +
+                          std::to_string(lastCommittedTime_) + "s.");
   }
 
+  // Current 'fresh' (remaining) becomes the new 'buffer' for next iteration
+  // comparison
   buffer_ = std::move(fresh_);
   fresh_.clear();
+
   committedInBuffer_.insert(committedInBuffer_.end(), commit.begin(),
                             commit.end());
+
   return commit;
 }
 
 void HypothesisBuffer::popCommitted(float time) {
+  size_t count = 0;
   while (!committedInBuffer_.empty() &&
          committedInBuffer_.front().end <= time) {
     committedInBuffer_.pop_front();
+    count++;
+  }
+  if (count > 0) {
+    rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
+                      "[HypothesisBuffer] Popped " + std::to_string(count) +
+                          " old words from committed history up to " +
+                          std::to_string(time) + "s.");
   }
 }
 
@@ -81,6 +111,8 @@ void HypothesisBuffer::reset() {
   buffer_.clear();
   fresh_.clear();
   committedInBuffer_.clear();
+
+  lastCommittedTime_ = 0.f;
 }
 
 } // namespace rnexecutorch::models::speech_to_text::whisper::stream
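Taken together, insert(), flush(), and popCommitted() implement a local-agreement streaming policy: a word is only committed once two consecutive hypotheses agree on it. A simplified, self-contained demonstration of just the stable-prefix rule in flush() (a stand-in using bare strings, not the class from this commit):

#include <cstdio>
#include <deque>
#include <string>
#include <utility>

std::deque<std::string> buffer; // previous hypothesis, not yet committed
std::deque<std::string> fresh;  // newest hypothesis

std::deque<std::string> flush() {
  std::deque<std::string> commit;
  // Words identical at the front of both hypotheses are considered stable.
  while (!fresh.empty() && !buffer.empty() &&
         fresh.front() == buffer.front()) {
    commit.push_back(fresh.front());
    fresh.pop_front();
    buffer.pop_front();
  }
  buffer = std::move(fresh); // remainder is next iteration's baseline
  fresh.clear();
  return commit;
}

int main() {
  fresh = {"the", "quick", "brown", "fix"};
  flush(); // commits nothing: there is no previous hypothesis to agree with
  fresh = {"the", "quick", "brown", "fox", "jumps"};
  for (const auto &w : flush()) { // commits "the quick brown"
    std::printf("%s ", w.c_str());
  }
  std::printf("\n"); // "fix"/"fox" disagreed, so "fox jumps" stays buffered
}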
