software-mansion
diff --git a/‎packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h‎
Lines changed: 3 additions & 3 deletions b/‎packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎packages/react-native-executorch/common/rnexecutorch/models/BaseModel.h‎
Lines changed: 1 addition & 1 deletion b/‎packages/react-native-executorch/common/rnexecutorch/models/BaseModel.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp‎
Lines changed: 39 additions & 42 deletions b/‎packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp‎
Lines changed: 39 additions & 42 deletions
diff --git a/‎packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h‎
Lines changed: 18 additions & 18 deletions b/‎packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h‎
Lines changed: 18 additions & 18 deletions
diff --git a/‎packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h‎
Lines changed: 0 additions & 65 deletions b/‎packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h‎
Lines changed: 0 additions & 65 deletions
diff --git a/‎packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/ASR.h‎
Lines changed: 39 additions & 0 deletions b/‎packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/ASR.h‎
Lines changed: 39 additions & 0 deletions
@@ -18,11 +18,11 @@
 #include <rnexecutorch/models/object_detection/Constants.h>
 #include <rnexecutorch/models/object_detection/Types.h>
 #include <rnexecutorch/models/ocr/Types.h>
-#include <rnexecutorch/models/speech_to_text/types/Segment.h>
-#include <rnexecutorch/models/speech_to_text/types/TranscriptionResult.h>
+#include <rnexecutorch/models/speech_to_text/common/types/Segment.h>
+#include <rnexecutorch/models/speech_to_text/common/types/TranscriptionResult.h>
 #include <rnexecutorch/models/voice_activity_detection/Types.h>
 
-using namespace rnexecutorch::models::speech_to_text::types;
+using namespace rnexecutorch::models::speech_to_text;
 
 namespace rnexecutorch::jsi_conversion {
 
 
@@ -3,12 +3,12 @@
 #include <string>
 #include <vector>
 
-#include "rnexecutorch/metaprogramming/ConstructorHelpers.h"
 #include <ReactCommon/CallInvoker.h>
 #include <executorch/extension/module/module.h>
 #include <jsi/jsi.h>
 #include <rnexecutorch/host_objects/JSTensorViewIn.h>
 #include <rnexecutorch/host_objects/JSTensorViewOut.h>
+#include <rnexecutorch/metaprogramming/ConstructorHelpers.h>
 
 namespace rnexecutorch {
 namespace models {
 
@@ -1,55 +1,54 @@
 #include <thread>
 
 #include "SpeechToText.h"
+#include "common/types/TranscriptionResult.h"
+#include "whisper/ASR.h"
+#include "whisper/OnlineASR.h"
 #include <rnexecutorch/Error.h>
 #include <rnexecutorch/ErrorCodes.h>
-#include <rnexecutorch/models/speech_to_text/types/TranscriptionResult.h>
 
 namespace rnexecutorch::models::speech_to_text {
 
-using namespace ::executorch::extension;
-using namespace asr;
-using namespace types;
-using namespace stream;
-
-SpeechToText::SpeechToText(const std::string &encoderSource,
-                           const std::string &decoderSource,
+SpeechToText::SpeechToText(const std::string &modelName,
+                           const std::string &modelSource,
                            const std::string &tokenizerSource,
                            std::shared_ptr<react::CallInvoker> callInvoker)
-    : callInvoker(std::move(callInvoker)),
-      encoder(std::make_unique<BaseModel>(encoderSource, this->callInvoker)),
-      decoder(std::make_unique<BaseModel>(decoderSource, this->callInvoker)),
-      tokenizer(std::make_unique<TokenizerModule>(tokenizerSource,
-                                                  this->callInvoker)),
-      asr(std::make_unique<ASR>(this->encoder.get(), this->decoder.get(),
-                                this->tokenizer.get())),
-      processor(std::make_unique<OnlineASRProcessor>(this->asr.get())),
-      isStreaming(false), readyToProcess(false) {}
-
-void SpeechToText::unload() noexcept {
-  this->encoder->unload();
-  this->decoder->unload();
+    : callInvoker_(std::move(callInvoker)), isStreaming_(false),
+      readyToProcess_(false) {
+  // Select the ASR implementation by model name; unknown names are rejected.
+  if (modelName == "whisper") {
+    auto whisperAsr = std::make_unique<whisper::ASR>(
+        modelSource, tokenizerSource, callInvoker_);
+    streamer_ = std::make_unique<whisper::stream::OnlineASR>(whisperAsr.get());
+    transcriber_ = std::move(whisperAsr); // upcast only, no downcast needed
+  } else {
+    throw rnexecutorch::RnExecutorchError(
+        rnexecutorch::RnExecutorchErrorCode::InvalidConfig,
+        "[SpeechToText]: Invalid model name: " + modelName);
+  }
 }
 
+void SpeechToText::unload() noexcept { transcriber_->unload(); }
+
 std::shared_ptr<OwningArrayBuffer>
 SpeechToText::encode(std::span<float> waveform) const {
-  std::vector<float> encoderOutput = this->asr->encode(waveform);
+  std::vector<float> encoderOutput = transcriber_->encode(waveform);
   return std::make_shared<OwningArrayBuffer>(encoderOutput);
 }
 
 std::shared_ptr<OwningArrayBuffer>
 SpeechToText::decode(std::span<uint64_t> tokens,
                      std::span<float> encoderOutput) const {
   std::vector<float> decoderOutput =
-      this->asr->decode(tokens, 0, encoderOutput);
+      transcriber_->decode(tokens, encoderOutput);
   return std::make_shared<OwningArrayBuffer>(decoderOutput);
 }
 
 TranscriptionResult SpeechToText::transcribe(std::span<float> waveform,
                                              std::string languageOption,
                                              bool verbose) const {
   DecodingOptions options(languageOption, verbose);
-  std::vector<Segment> segments = this->asr->transcribe(waveform, options);
+  std::vector<Segment> segments = transcriber_->transcribe(waveform, options);
 
   std::string fullText;
   for (const auto &segment : segments) {
@@ -71,8 +70,7 @@ TranscriptionResult SpeechToText::transcribe(std::span<float> waveform,
 }
 
 size_t SpeechToText::getMemoryLowerBound() const noexcept {
-  return this->encoder->getMemoryLowerBound() +
-         this->decoder->getMemoryLowerBound();
+  return transcriber_->getMemoryLowerBound();
 }
 
 namespace {
@@ -106,7 +104,7 @@ TranscriptionResult wordsToResult(const std::vector<Word> &words,
 
 void SpeechToText::stream(std::shared_ptr<jsi::Function> callback,
                           std::string languageOption, bool verbose) {
-  if (this->isStreaming) {
+  if (isStreaming_) {
     throw RnExecutorchError(RnExecutorchErrorCode::StreamingInProgress,
                             "Streaming is already in progress!");
   }
@@ -116,7 +114,7 @@ void SpeechToText::stream(std::shared_ptr<jsi::Function> callback,
                                   const TranscriptionResult &nonCommitted,
                                   bool isDone) {
     // This moves execution to the JS thread
-    this->callInvoker->invokeAsync(
+    callInvoker_->invokeAsync(
         [callback, committed, nonCommitted, isDone, verbose](jsi::Runtime &rt) {
           jsi::Value jsiCommitted =
               rnexecutorch::jsi_conversion::getJsiValue(committed, rt);
@@ -128,46 +126,45 @@ void SpeechToText::stream(std::shared_ptr<jsi::Function> callback,
         });
   };
 
-  this->isStreaming = true;
+  isStreaming_ = true;
   DecodingOptions options(languageOption, verbose);
 
-  while (this->isStreaming) {
-    if (!this->readyToProcess ||
-        this->processor->audioBuffer.size() < SpeechToText::kMinAudioSamples) {
+  while (isStreaming_) {
+    if (!readyToProcess_ || !streamer_->ready()) {
       std::this_thread::sleep_for(std::chrono::milliseconds(100));
       continue;
     }
 
-    ProcessResult res = this->processor->processIter(options);
+    ProcessResult res = streamer_->process(options);
 
     TranscriptionResult cRes =
         wordsToResult(res.committed, languageOption, verbose);
     TranscriptionResult ncRes =
         wordsToResult(res.nonCommitted, languageOption, verbose);
 
     nativeCallback(cRes, ncRes, false);
-    this->readyToProcess = false;
+    readyToProcess_ = false;
   }
 
-  std::vector<Word> finalWords = this->processor->finish();
+  std::vector<Word> finalWords = streamer_->finish();
   TranscriptionResult finalRes =
       wordsToResult(finalWords, languageOption, verbose);
 
   nativeCallback(finalRes, {}, true);
-  this->resetStreamState();
+  resetStreamState();
 }
 
-void SpeechToText::streamStop() { this->isStreaming = false; }
+void SpeechToText::streamStop() { isStreaming_ = false; }
 
 void SpeechToText::streamInsert(std::span<float> waveform) {
-  this->processor->insertAudioChunk(waveform);
-  this->readyToProcess = true;
+  streamer_->insertAudioChunk(waveform);
+  readyToProcess_ = true;
 }
 
 void SpeechToText::resetStreamState() {
-  this->isStreaming = false;
-  this->readyToProcess = false;
-  this->processor = std::make_unique<OnlineASRProcessor>(this->asr.get());
+  isStreaming_ = false;
+  readyToProcess_ = false;
+  streamer_->reset();
 }
 
 } // namespace rnexecutorch::models::speech_to_text
@@ -1,19 +1,21 @@
 #pragma once
 
-#include "rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h"
-#include <rnexecutorch/models/speech_to_text/types/TranscriptionResult.h>
 #include <span>
 #include <string>
 #include <vector>
 
+#include "common/schema/ASR.h"
+#include "common/schema/OnlineASR.h"
+#include "common/types/TranscriptionResult.h"
+
 namespace rnexecutorch {
 
 namespace models::speech_to_text {
 
 class SpeechToText {
 public:
-  explicit SpeechToText(const std::string &encoderSource,
-                        const std::string &decoderSource,
+  explicit SpeechToText(const std::string &modelName,
+                        const std::string &modelSource,
                         const std::string &tokenizerSource,
                         std::shared_ptr<react::CallInvoker> callInvoker);
 
@@ -25,9 +27,9 @@ class SpeechToText {
       "Registered non-void function")]] std::shared_ptr<OwningArrayBuffer>
   decode(std::span<uint64_t> tokens, std::span<float> encoderOutput) const;
   [[nodiscard("Registered non-void function")]]
-  types::TranscriptionResult transcribe(std::span<float> waveform,
-                                        std::string languageOption,
-                                        bool verbose) const;
+  TranscriptionResult transcribe(std::span<float> waveform,
+                                 std::string languageOption,
+                                 bool verbose) const;
 
   [[nodiscard("Registered non-void function")]]
   std::vector<char> transcribeStringOnly(std::span<float> waveform,
@@ -42,20 +44,18 @@ class SpeechToText {
   void streamInsert(std::span<float> waveform);
 
 private:
-  std::shared_ptr<react::CallInvoker> callInvoker;
-  std::unique_ptr<BaseModel> encoder;
-  std::unique_ptr<BaseModel> decoder;
-  std::unique_ptr<TokenizerModule> tokenizer;
-  std::unique_ptr<asr::ASR> asr;
+  // Helper functions
+  void resetStreamState();
 
-  // Stream
-  std::unique_ptr<stream::OnlineASRProcessor> processor;
-  bool isStreaming;
-  bool readyToProcess;
+  std::shared_ptr<react::CallInvoker> callInvoker_;
 
-  constexpr static int32_t kMinAudioSamples = 16000; // 1 second
+  // ASR-like module (both static transcription & streaming)
+  std::unique_ptr<schema::ASR> transcriber_ = nullptr;
 
-  void resetStreamState();
+  // Online ASR-like module (streaming only)
+  std::unique_ptr<schema::OnlineASR> streamer_ = nullptr;
+  bool isStreaming_ = false;    // cleared by streamStop() to end stream()
+  bool readyToProcess_ = false; // set by streamInsert(), cleared by stream()
 };
 
 } // namespace models::speech_to_text
 
@@ -0,0 +1,65 @@
+#pragma once
+
+#include <cinttypes>
+#include <cstddef>
+#include <span>
+#include <vector>
+
+#include "../types/DecodingOptions.h"
+#include "../types/Segment.h"
+#include <rnexecutorch/models/BaseModel.h>
+
+namespace rnexecutorch::models::speech_to_text::schema {
+
+/**
+ * @brief Abstract base class for Automatic Speech Recognition (ASR) models.
+ *
+ * Provides a unified interface for speech-to-text models like Whisper, allowing
+ * for transcription of raw audio waveforms into text segments, as well as
+ * access to lower-level model components like encoding and decoding.
+ */
+class ASR {
+public:
+  virtual ~ASR() = default;
+
+  /**
+   * @brief Transcribes a waveform into text segments.
+   *
+   * @param waveform Audio samples to transcribe.
+   * @param options Decoding options applied to the transcription.
+   * @return Segments produced for the given waveform.
+   */
+  virtual std::vector<Segment>
+  transcribe(std::span<float> waveform,
+             const DecodingOptions &options) const = 0;
+
+  /**
+   * @brief Runs the encoding step on a waveform.
+   *
+   * @param waveform Audio samples to encode.
+   * @return Encoder output as a flat vector of floats.
+   */
+  virtual std::vector<float> encode(std::span<float> waveform) const = 0;
+
+  /**
+   * @brief Runs the decoding step for a token sequence.
+   *
+   * @param tokens Token ids fed to the decoder.
+   * @param encoderOutput Encoder output (presumably the result of encode()).
+   * @param startPos Starting position for decoding; the exact semantics are
+   * left to the implementation. Defaults to 0.
+   * @return Decoder output as a flat vector of floats.
+   *
+   * @note Default arguments on virtual functions are resolved from the static
+   * type of the call, so overrides should not declare a different default.
+   */
+  virtual std::vector<float> decode(std::span<uint64_t> tokens,
+                                    std::span<float> encoderOutput,
+                                    uint64_t startPos = 0) const = 0;
+
+  // Standard ExecuTorch model methods for compatibility with the rest of the
+  // API.
+  virtual void unload() noexcept = 0;
+  virtual std::size_t getMemoryLowerBound() const noexcept = 0;
+};
+
+} // namespace rnexecutorch::models::speech_to_text::schema