Skip to content

Commit ae017ef

Browse files
committed
Final nits
1 parent cbcb652 commit ae017ef

File tree

4 files changed

+15
-4
lines changed

4 files changed

+15
-4
lines changed

packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ void HypothesisBuffer::insert(std::span<const Word> words, float offset) {
2121
firstFreshWordIdx = lastMatchingWordIdx.value_or(0);
2222
}
2323

24+
bool isCompletelyFresh = firstFreshWordIdx == 0;
2425
for (size_t i = firstFreshWordIdx; i < words.size(); i++) {
2526
const auto &word = words[i];
2627

@@ -29,7 +30,8 @@ void HypothesisBuffer::insert(std::span<const Word> words, float offset) {
2930
const float startGlobal = word.start + offset;
3031
const float endGlobal = word.end + offset;
3132

32-
if (startGlobal > lastCommittedTime_ - 3.F) {
33+
if (!isCompletelyFresh ||
34+
startGlobal > lastCommittedTime_ - params::kStreamFreshThreshold) {
3335
fresh_.emplace_back(word.content, startGlobal, endGlobal,
3436
word.punctations);
3537
}

packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ OnlineASR::OnlineASR(const ASR *asr) : asr_(asr) {
2424
}
2525

2626
void OnlineASR::insertAudioChunk(std::span<const float> audio) {
27+
std::lock_guard<std::mutex> lock(audioBufferMutex_);
2728
audioBuffer_.insert(audioBuffer_.end(), audio.begin(), audio.end());
2829
}
2930

@@ -32,7 +33,10 @@ bool OnlineASR::isReady() const {
3233
}
3334

3435
ProcessResult OnlineASR::process(const DecodingOptions &options) {
36+
std::unique_lock<std::mutex> lock(audioBufferMutex_);
37+
3538
std::vector<Segment> transcriptions = asr_->transcribe(audioBuffer_, options);
39+
lock.unlock();
3640

3741
if (transcriptions.empty()) {
3842
return {.committed = {}, .nonCommitted = {}};
@@ -57,9 +61,7 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) {
5761
const float newEnd = hypothesisBuffer_.fresh_.back().end;
5862
float shift = 0.F;
5963
for (size_t i = 0; i < hypothesisBuffer_.fresh_.size(); i++) {
60-
const float originalStart = hypothesisBuffer_.fresh_[i].start;
6164
const float originalEnd = hypothesisBuffer_.fresh_[i].end;
62-
const std::string &wordContent = hypothesisBuffer_.fresh_[i].content;
6365

6466
if (i < hypothesisBuffer_.hypothesis_.size() &&
6567
utils::equalsIgnoreCase(hypothesisBuffer_.fresh_[i].content,
@@ -104,6 +106,7 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) {
104106

105107
// Since Whisper does not accept waveforms longer than 30 seconds, we need
106108
// to cut the audio at some safe point.
109+
lock.lock();
107110
const float audioDuration =
108111
static_cast<float>(audioBuffer_.size()) / constants::kSamplingRate;
109112
if (audioDuration > params::kStreamChunkThreshold) {
@@ -126,6 +129,7 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) {
126129
audioBuffer_.begin() + nSamplesToErase);
127130
bufferTimeOffset_ += eraseDuration;
128131
}
132+
lock.unlock();
129133

130134
return {.committed = move_to_vector(committed),
131135
.nonCommitted = move_to_vector(nonCommitted)};
@@ -140,6 +144,8 @@ std::vector<Word> OnlineASR::finish() {
140144
}
141145

142146
void OnlineASR::reset() {
147+
std::lock_guard<std::mutex> lock(audioBufferMutex_);
148+
143149
hypothesisBuffer_.reset();
144150
bufferTimeOffset_ = 0.f;
145151

packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
#include "ASR.h"
88
#include "HypothesisBuffer.h"
99

10+
#include <mutex>
11+
1012
namespace rnexecutorch::models::speech_to_text::whisper::stream {
1113

1214
/**
@@ -64,6 +66,7 @@ class OnlineASR : public schema::OnlineASR {
6466
// Stores the increasing amounts of streamed audio.
6567
// Cleared from time to time after reaching a threshold size.
6668
std::vector<float> audioBuffer_ = {};
69+
std::mutex audioBufferMutex_;
6770
float bufferTimeOffset_ = 0.F; // Audio buffer offset
6871

6972
// Helper buffers - hypothesis buffer

packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ constexpr static int32_t kChunkBreakBuffer = 2; // [s]
2525
* Determines the maximum timestamp difference allowed for a word to be
2626
* considered fresh in the streaming algorithm.
2727
*/
28-
constexpr static float kStreamFreshThreshold = 1.F; // [s], originally 0.5
28+
constexpr static float kStreamFreshThreshold = 2.F; // [s], originally 0.5
2929

3030
/**
3131
* Determines the maximum expected size of overlapping fragments between

0 commit comments

Comments
 (0)