software-mansion · jakmro · Sep 15, 2025 · Sep 15, 2025 · Sep 15, 2025
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
@@ -5,6 +5,9 @@
 namespace rnexecutorch::models::speech_to_text {
 
 using namespace ::executorch::extension;
+using namespace asr;
+using namespace types;
+using namespace stream;
 
 SpeechToText::SpeechToText(const std::string &encoderSource,
                            const std::string &decoderSource,

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
@@ -6,10 +6,6 @@ namespace rnexecutorch {
 
 namespace models::speech_to_text {
 
-using namespace asr;
-using namespace types;
-using namespace stream;
-
 class SpeechToText {
 public:
   explicit SpeechToText(const std::string &encoderSource,
@@ -35,14 +31,14 @@ class SpeechToText {
   std::unique_ptr<BaseModel> encoder;
   std::unique_ptr<BaseModel> decoder;
   std::unique_ptr<TokenizerModule> tokenizer;
-  std::unique_ptr<ASR> asr;
+  std::unique_ptr<asr::ASR> asr;
 
   std::shared_ptr<OwningArrayBuffer>
   makeOwningBuffer(std::span<const float> vectorView) const;
 
   // Stream
   std::shared_ptr<react::CallInvoker> callInvoker;
-  std::unique_ptr<OnlineASRProcessor> processor;
+  std::unique_ptr<stream::OnlineASRProcessor> processor;
   bool isStreaming;
   bool readyToProcess;
 

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.cpp
@@ -1,4 +1,5 @@
 #include <random>
+#include <sstream>
 
 #include "ASR.h"
 #include "executorch/extension/tensor/tensor_ptr.h"
@@ -8,6 +9,8 @@
 
 namespace rnexecutorch::models::speech_to_text::asr {
 
+using namespace types;
+
 ASR::ASR(const models::BaseModel *encoder, const models::BaseModel *decoder,
          const TokenizerModule *tokenizer)
     : encoder(encoder), decoder(decoder), tokenizer(tokenizer),

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h
@@ -8,15 +8,14 @@
 
 namespace rnexecutorch::models::speech_to_text::asr {
 
-using namespace types;
-
 class ASR {
 public:
   explicit ASR(const models::BaseModel *encoder,
                const models::BaseModel *decoder,
                const TokenizerModule *tokenizer);
-  std::vector<Segment> transcribe(std::span<const float> waveform,
-                                  const DecodingOptions &options) const;
+  std::vector<types::Segment>
+  transcribe(std::span<const float> waveform,
+             const types::DecodingOptions &options) const;
   std::vector<float> encode(std::span<const float> waveform) const;
   std::vector<float> decode(std::span<int32_t> tokens,
                             std::span<float> encoderOutput) const;
@@ -43,16 +42,18 @@ class ASR {
   // Number of mel frames output by the encoder (derived from input spectrogram)
   constexpr static int32_t kNumFrames = 1500;
 
-  std::vector<int32_t> getInitialSequence(const DecodingOptions &options) const;
-  GenerationResult generate(std::span<const float> waveform, float temperature,
-                            const DecodingOptions &options) const;
-  std::vector<Segment>
+  std::vector<int32_t>
+  getInitialSequence(const types::DecodingOptions &options) const;
+  types::GenerationResult generate(std::span<const float> waveform,
+                                   float temperature,
+                                   const types::DecodingOptions &options) const;
+  std::vector<types::Segment>
   generateWithFallback(std::span<const float> waveform,
-                       const DecodingOptions &options) const;
-  std::vector<Segment>
+                       const types::DecodingOptions &options) const;
+  std::vector<types::Segment>
   calculateWordLevelTimestamps(std::span<const int32_t> tokens,
                                std::span<const float> waveform) const;
-  std::vector<Word>
+  std::vector<types::Word>
   estimateWordLevelTimestampsLinear(std::span<const int32_t> tokens,
                                     int32_t start, int32_t end) const;
   float getCompressionRatio(const std::string &text) const;

diff --git a/...t-native-executorch/common/rnexecutorch/models/speech_to_text/stream/HypothesisBuffer.cpp b/...t-native-executorch/common/rnexecutorch/models/speech_to_text/stream/HypothesisBuffer.cpp
@@ -2,6 +2,8 @@
 
 namespace rnexecutorch::models::speech_to_text::stream {
 
+using namespace types;
+
 void HypothesisBuffer::insert(std::span<const Word> newWords, float offset) {
   this->fresh.clear();
   for (const auto &word : newWords) {

diff --git a/...act-native-executorch/common/rnexecutorch/models/speech_to_text/stream/HypothesisBuffer.h b/...act-native-executorch/common/rnexecutorch/models/speech_to_text/stream/HypothesisBuffer.h
@@ -7,21 +7,19 @@
 
 namespace rnexecutorch::models::speech_to_text::stream {
 
-using namespace types;
-
 class HypothesisBuffer {
 public:
-  void insert(std::span<const Word> newWords, float offset);
-  std::deque<Word> flush();
+  void insert(std::span<const types::Word> newWords, float offset);
+  std::deque<types::Word> flush();
   void popCommitted(float time);
-  std::deque<Word> complete() const;
+  std::deque<types::Word> complete() const;
 
 private:
   float lastCommittedTime = 0.0f;
 
-  std::deque<Word> committedInBuffer;
-  std::deque<Word> buffer;
-  std::deque<Word> fresh;
+  std::deque<types::Word> committedInBuffer;
+  std::deque<types::Word> buffer;
+  std::deque<types::Word> fresh;
 };
 
 } // namespace rnexecutorch::models::speech_to_text::stream
diff --git a/...native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp b/...native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp
@@ -4,6 +4,9 @@
 
 namespace rnexecutorch::models::speech_to_text::stream {
 
+using namespace asr;
+using namespace types;
+
 OnlineASRProcessor::OnlineASRProcessor(const ASR *asr) : asr(asr) {}
 
 void OnlineASRProcessor::insertAudioChunk(std::span<const float> audio) {

diff --git a/...t-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h b/...t-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h
@@ -6,31 +6,28 @@
 
 namespace rnexecutorch::models::speech_to_text::stream {
 
-using namespace asr;
-using namespace types;
-
 class OnlineASRProcessor {
 public:
-  explicit OnlineASRProcessor(const ASR *asr);
+  explicit OnlineASRProcessor(const asr::ASR *asr);
 
   void insertAudioChunk(std::span<const float> audio);
-  ProcessResult processIter(const DecodingOptions &options);
+  types::ProcessResult processIter(const types::DecodingOptions &options);
   std::string finish();
 
   std::vector<float> audioBuffer;
 
 private:
-  const ASR *asr;
+  const asr::ASR *asr;
   constexpr static int32_t kSamplingRate = 16000;
 
   HypothesisBuffer hypothesisBuffer;
   float bufferTimeOffset = 0.0f;
-  std::vector<Word> committed;
+  std::vector<types::Word> committed;
 
-  void chunkCompletedSegment(std::span<const Segment> res);
+  void chunkCompletedSegment(std::span<const types::Segment> res);
   void chunkAt(float time);
 
-  std::string toFlush(const std::deque<Word> &words) const;
+  std::string toFlush(const std::deque<types::Word> &words) const;
 };
 
 } // namespace rnexecutorch::models::speech_to_text::stream
diff --git a/packages/react-native-executorch/package.json b/packages/react-native-executorch/package.json
@@ -1,6 +1,6 @@
 {
   "name": "react-native-executorch",
-  "version": "0.5.4",
+  "version": "0.5.5",
   "description": "An easy way to run AI models in React Native with ExecuTorch",
   "source": "./src/index.ts",
   "main": "./lib/module/index.js",