diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp index d444b9c914..e7d485c3b9 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp @@ -5,6 +5,9 @@ namespace rnexecutorch::models::speech_to_text { using namespace ::executorch::extension; +using namespace asr; +using namespace types; +using namespace stream; SpeechToText::SpeechToText(const std::string &encoderSource, const std::string &decoderSource, diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h index a6f3779e4d..d28e80d0db 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h @@ -6,10 +6,6 @@ namespace rnexecutorch { namespace models::speech_to_text { -using namespace asr; -using namespace types; -using namespace stream; - class SpeechToText { public: explicit SpeechToText(const std::string &encoderSource, @@ -35,14 +31,14 @@ class SpeechToText { std::unique_ptr encoder; std::unique_ptr decoder; std::unique_ptr tokenizer; - std::unique_ptr asr; + std::unique_ptr asr; std::shared_ptr makeOwningBuffer(std::span vectorView) const; // Stream std::shared_ptr callInvoker; - std::unique_ptr processor; + std::unique_ptr processor; bool isStreaming; bool readyToProcess; diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.cpp index 5a56f2d7e9..d0f965cb39 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.cpp @@ -1,4 +1,5 @@ #include +#include #include "ASR.h" #include "executorch/extension/tensor/tensor_ptr.h" @@ -8,6 +9,8 @@ namespace rnexecutorch::models::speech_to_text::asr { +using namespace types; + ASR::ASR(const models::BaseModel *encoder, const models::BaseModel *decoder, const TokenizerModule *tokenizer) : encoder(encoder), decoder(decoder), tokenizer(tokenizer), diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h index 605052363f..20180ebe46 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h @@ -8,15 +8,14 @@ namespace rnexecutorch::models::speech_to_text::asr { -using namespace types; - class ASR { public: explicit ASR(const models::BaseModel *encoder, const models::BaseModel *decoder, const TokenizerModule *tokenizer); - std::vector transcribe(std::span waveform, - const DecodingOptions &options) const; + std::vector + transcribe(std::span waveform, + const types::DecodingOptions &options) const; std::vector encode(std::span waveform) const; std::vector decode(std::span tokens, std::span encoderOutput) const; @@ -43,16 +42,18 @@ class ASR { // Number of mel frames output by the encoder (derived from input spectrogram) constexpr static int32_t kNumFrames = 1500; - std::vector getInitialSequence(const DecodingOptions &options) const; - GenerationResult generate(std::span waveform, float temperature, - const DecodingOptions &options) const; - std::vector + std::vector + getInitialSequence(const types::DecodingOptions &options) const; + types::GenerationResult generate(std::span waveform, + float temperature, + const types::DecodingOptions &options) const; + std::vector generateWithFallback(std::span waveform, - const DecodingOptions &options) const; - std::vector + const types::DecodingOptions &options) const; + std::vector calculateWordLevelTimestamps(std::span tokens, std::span waveform) const; - std::vector + std::vector estimateWordLevelTimestampsLinear(std::span tokens, int32_t start, int32_t end) const; float getCompressionRatio(const std::string &text) const; diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/HypothesisBuffer.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/HypothesisBuffer.cpp index 3e4d6a7cab..31806c1268 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/HypothesisBuffer.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/HypothesisBuffer.cpp @@ -2,6 +2,8 @@ namespace rnexecutorch::models::speech_to_text::stream { +using namespace types; + void HypothesisBuffer::insert(std::span newWords, float offset) { this->fresh.clear(); for (const auto &word : newWords) { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/HypothesisBuffer.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/HypothesisBuffer.h index ea4e73328e..cfa11fd665 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/HypothesisBuffer.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/HypothesisBuffer.h @@ -7,21 +7,19 @@ namespace rnexecutorch::models::speech_to_text::stream { -using namespace types; - class HypothesisBuffer { public: - void insert(std::span newWords, float offset); - std::deque flush(); + void insert(std::span newWords, float offset); + std::deque flush(); void popCommitted(float time); - std::deque complete() const; + std::deque complete() const; private: float lastCommittedTime = 0.0f; - std::deque committedInBuffer; - std::deque buffer; - std::deque fresh; + std::deque committedInBuffer; + std::deque buffer; + std::deque fresh; }; } // namespace rnexecutorch::models::speech_to_text::stream diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp index 63cffd67cd..c6a99e9a2a 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp @@ -4,6 +4,9 @@ namespace rnexecutorch::models::speech_to_text::stream { +using namespace asr; +using namespace types; + OnlineASRProcessor::OnlineASRProcessor(const ASR *asr) : asr(asr) {} void OnlineASRProcessor::insertAudioChunk(std::span audio) { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h index 403cf87d1a..c50b562711 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h @@ -6,31 +6,28 @@ namespace rnexecutorch::models::speech_to_text::stream { -using namespace asr; -using namespace types; - class OnlineASRProcessor { public: - explicit OnlineASRProcessor(const ASR *asr); + explicit OnlineASRProcessor(const asr::ASR *asr); void insertAudioChunk(std::span audio); - ProcessResult processIter(const DecodingOptions &options); + types::ProcessResult processIter(const types::DecodingOptions &options); std::string finish(); std::vector audioBuffer; private: - const ASR *asr; + const asr::ASR *asr; constexpr static int32_t kSamplingRate = 16000; HypothesisBuffer hypothesisBuffer; float bufferTimeOffset = 0.0f; - std::vector committed; + std::vector committed; - void chunkCompletedSegment(std::span res); + void chunkCompletedSegment(std::span res); void chunkAt(float time); - std::string toFlush(const std::deque &words) const; + std::string toFlush(const std::deque &words) const; }; } // namespace rnexecutorch::models::speech_to_text::stream diff --git a/packages/react-native-executorch/package.json b/packages/react-native-executorch/package.json index 7807e420eb..11f433b5fd 100644 --- a/packages/react-native-executorch/package.json +++ b/packages/react-native-executorch/package.json @@ -1,6 +1,6 @@ { "name": "react-native-executorch", - "version": "0.5.4", + "version": "0.5.5", "description": "An easy way to run AI models in React Native with ExecuTorch", "source": "./src/index.ts", "main": "./lib/module/index.js",