diff --git a/apps/speech/App.tsx b/apps/speech/App.tsx index ab036678e8..ddbfe3c983 100644 --- a/apps/speech/App.tsx +++ b/apps/speech/App.tsx @@ -5,6 +5,7 @@ import { SpeechToTextScreen } from './screens/SpeechToTextScreen'; import ColorPalette from './colors'; import ExecutorchLogo from './assets/executorch.svg'; import { Quiz } from './screens/Quiz'; +import { TextToSpeechLLMScreen } from './screens/TextToSpeechLLMScreen'; import { initExecutorch } from 'react-native-executorch'; import { ExpoResourceFetcher } from '@react-native-executorch/expo-resource-fetcher'; @@ -14,7 +15,7 @@ initExecutorch({ export default function App() { const [currentScreen, setCurrentScreen] = useState< - 'menu' | 'speech-to-text' | 'text-to-speech' | 'quiz' + 'menu' | 'speech-to-text' | 'text-to-speech' | 'quiz' | 'text-to-speech-llm' >('menu'); const goToMenu = () => setCurrentScreen('menu'); @@ -31,6 +32,10 @@ export default function App() { return ; } + if (currentScreen === 'text-to-speech-llm') { + return ; + } + return ( @@ -54,6 +59,12 @@ export default function App() { > Text to Speech - Quiz + setCurrentScreen('text-to-speech-llm')} + > + Text to Speech - LLM Streaming + ); diff --git a/apps/speech/screens/TextToSpeechLLMScreen.tsx b/apps/speech/screens/TextToSpeechLLMScreen.tsx new file mode 100644 index 0000000000..04856cf3e7 --- /dev/null +++ b/apps/speech/screens/TextToSpeechLLMScreen.tsx @@ -0,0 +1,323 @@ +import React, { useEffect, useState, useRef } from 'react'; +import { + View, + Text, + StyleSheet, + TouchableOpacity, + ScrollView, +} from 'react-native'; +import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; +import FontAwesome from '@expo/vector-icons/FontAwesome'; +import SWMIcon from '../assets/swm_icon.svg'; +import { + useLLM, + useTextToSpeech, + KOKORO_MEDIUM, + KOKORO_VOICE_AF_HEART, + LLAMA3_2_1B_QLORA, +} from 'react-native-executorch'; +import { + AudioManager, + AudioContext, + AudioBuffer, + AudioBufferSourceNode, +} from 
'react-native-audio-api'; + +interface TextToSpeechLLMProps { + onBack: () => void; +} + +/** + * Converts an audio vector (Float32Array) to an AudioBuffer for playback + * @param audioVector - The generated audio samples from the model + * @param sampleRate - The sample rate (default: 24000 Hz for Kokoro) + * @returns AudioBuffer ready for playback + */ +const createAudioBufferFromVector = ( + audioVector: Float32Array, + audioContext: AudioContext, + sampleRate: number = 24000 +): AudioBuffer => { + const audioBuffer = audioContext.createBuffer( + 1, + audioVector.length, + sampleRate + ); + const channelData = audioBuffer.getChannelData(0); + channelData.set(audioVector); + + return audioBuffer; +}; + +export const TextToSpeechLLMScreen = ({ onBack }: TextToSpeechLLMProps) => { + const [displayText, setDisplayText] = useState(''); + const [isTtsStreaming, setIsTtsStreaming] = useState(false); + const llm = useLLM({ model: LLAMA3_2_1B_QLORA }); + const tts = useTextToSpeech({ + model: KOKORO_MEDIUM, + voice: KOKORO_VOICE_AF_HEART, + }); + + const processedLengthRef = useRef(0); + const audioContextRef = useRef(null); + const sourceRef = useRef(null); + + useEffect(() => { + AudioManager.setAudioSessionOptions({ + iosCategory: 'playAndRecord', + iosMode: 'spokenAudio', + iosOptions: ['defaultToSpeaker'], + }); + + audioContextRef.current = new AudioContext({ sampleRate: 24000 }); + audioContextRef.current.suspend(); + + return () => { + audioContextRef.current?.close(); + audioContextRef.current = null; + }; + }, []); + + // Update displayText gradually as response gets generated and insert new text chunks into TTS stream + useEffect(() => { + if (llm.response && tts.isReady) { + setDisplayText(llm.response); + + const previousLength = processedLengthRef.current; + if (llm.response.length > previousLength && isTtsStreaming) { + const newChunk = llm.response.slice(previousLength); + tts.streamInsert(newChunk); + processedLengthRef.current = llm.response.length; + } 
+ } else { + processedLengthRef.current = 0; + } + }, [llm.response, tts, isTtsStreaming]); + + const handleGenerate = async () => { + setDisplayText(''); + processedLengthRef.current = 0; + setIsTtsStreaming(true); + + const startTTS = async () => { + try { + const audioContext = audioContextRef.current; + if (!audioContext) return; + + if (audioContext.state === 'suspended') { + await audioContext.resume(); + } + + const onNext = async (audioVec: Float32Array) => { + return new Promise((resolve) => { + const audioBuffer = createAudioBufferFromVector( + audioVec, + audioContext, + 24000 + ); + + const source = (sourceRef.current = + audioContext.createBufferSource()); + source.buffer = audioBuffer; + source.connect(audioContext.destination); + + source.onEnded = () => resolve(); + + source.start(); + }); + }; + + await tts.stream({ + speed: 0.9, + stopAutomatically: false, + onNext, + }); + } catch (e) { + console.error('TTS streaming error:', e); + } finally { + setIsTtsStreaming(false); + } + }; + + const ttsPromise = startTTS(); + + try { + await llm.sendMessage( + 'Generate a short story about a robot learning to paint. The story should be around 200 words long.' 
+ ); + } catch (e) { + console.error('Generation failed:', e); + } finally { + tts.streamStop(false); + await ttsPromise; + + if ( + audioContextRef.current && + audioContextRef.current.state === 'running' + ) { + await audioContextRef.current.suspend(); + } + } + }; + + const handleStop = () => { + llm.interrupt(); + tts.streamStop(true); + if (sourceRef.current) { + try { + sourceRef.current.stop(); + } catch (e) { + // Source might have already stopped or disconnected + } + } + }; + + const isProcessing = llm.isGenerating || isTtsStreaming; + const isModelsReady = llm.isReady && tts.isReady; + + const getModelStatus = () => { + if (llm.error) return `LLM Error: ${llm.error.message}`; + if (tts.error) return `TTS Error: ${tts.error.message}`; + if (!llm.isReady) + return `Loading LLM: ${(100 * llm.downloadProgress).toFixed(2)}%`; + if (!tts.isReady) + return `Loading TTS: ${(100 * tts.downloadProgress).toFixed(2)}%`; + if (isProcessing) return 'Generating/Streaming...'; + return 'Ready'; + }; + + return ( + + + + + + + + React Native ExecuTorch + LLM to Speech Demo + + + + Status: {getModelStatus()} + + + + Generated Story + + + + {displayText || + (isModelsReady + ? 'Press the button to generate a story and hear it spoken aloud.' + : 'Please wait for models to load...')} + + + + + + + {isProcessing ? 
( + + + Stop Generation + + ) : ( + + + Generate & Stream Speech + + )} + + + + ); +}; + +const styles = StyleSheet.create({ + container: { + flex: 1, + alignItems: 'center', + backgroundColor: 'white', + paddingHorizontal: 16, + }, + header: { + alignItems: 'center', + position: 'relative', + width: '100%', + }, + backButton: { + position: 'absolute', + left: 0, + top: 10, + padding: 10, + zIndex: 1, + }, + headerText: { + fontSize: 22, + fontWeight: 'bold', + color: '#0f186e', + }, + statusContainer: { + marginTop: 12, + alignItems: 'center', + }, + contentContainer: { + width: '100%', + marginTop: 24, + flex: 1, + marginBottom: 24, + }, + label: { + marginLeft: 12, + marginBottom: 4, + color: '#0f186e', + fontWeight: '600', + }, + responseContainer: { + borderRadius: 12, + borderWidth: 1, + borderColor: '#0f186e', + flex: 1, + }, + responseContent: { + padding: 12, + }, + responseText: { + fontSize: 16, + color: '#333', + lineHeight: 24, + }, + buttonContainer: { + marginBottom: 24, + width: '100%', + }, + actionButton: { + backgroundColor: '#0f186e', + flexDirection: 'row', + justifyContent: 'center', + alignItems: 'center', + padding: 12, + borderRadius: 12, + gap: 8, + }, + stopButton: { + backgroundColor: '#ff4444', + }, + buttonText: { + color: 'white', + fontWeight: '600', + letterSpacing: -0.5, + fontSize: 16, + }, + disabled: { + opacity: 0.5, + }, +}); diff --git a/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md b/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md index 10e9986dee..af0fe8e18c 100644 --- a/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md +++ b/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md @@ -87,7 +87,8 @@ The module provides two ways to generate speech using either raw text or pre-gen ### Using Text 1. [**`forward({ text, speed })`**](../../06-api-reference/interfaces/TextToSpeechType.md#forward): Generates the complete audio waveform at once. 
Returns a promise resolving to a `Float32Array`. -2. [**`stream({ text, speed, onNext, ... })`**](../../06-api-reference/interfaces/TextToSpeechType.md#stream): An async generator that yields chunks of audio as they are computed. This is ideal for reducing the "time to first audio" for long sentences. +2. [**`stream({speed, stopAutomatically, onNext, ...})`**](../../06-api-reference/interfaces/TextToSpeechType.md#stream): An async generator-like functionality (managed via callbacks like `onNext`) that yields chunks of audio as they are computed. + This is ideal for reducing the "time to first audio" for long sentences. You can also dynamically insert text during the generation process using `streamInsert(text)` and stop it with `streamStop(instant)`. ### Using Phonemes diff --git a/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md b/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md index 00fd04f53b..ec0919574c 100644 --- a/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md +++ b/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md @@ -52,14 +52,14 @@ The module provides two ways to generate speech using either raw text or pre-gen ### Using Text 1. [**`forward(text, speed)`**](../../06-api-reference/classes/TextToSpeechModule.md#forward): Generates the complete audio waveform at once. Returns a promise resolving to a `Float32Array`. -2. [**`stream({ text, speed })`**](../../06-api-reference/classes/TextToSpeechModule.md#stream): An async generator that yields chunks of audio as they are computed. This is ideal for reducing the "time to first audio" for long sentences. +2. [**`stream({ speed, stopAutomatically, onNext, ... })`**](../../06-api-reference/classes/TextToSpeechModule.md#stream): An async generator that yields chunks of audio as they are computed. This is ideal for reducing the "time to first audio" for long sentences. 
In contrast to `forward`, it enables inserting text chunks dynamically into the processing buffer with [**`streamInsert(text)`**](../../06-api-reference/classes/TextToSpeechModule.md#streaminsert) and allows stopping generation early with [**`streamStop(instant)`**](../../06-api-reference/classes/TextToSpeechModule.md#streamstop). ### Using Phonemes If you have pre-computed phonemes (e.g., from an external dictionary or a custom G2P model), you can skip the internal phoneme generation step: 1. [**`forwardFromPhonemes(phonemes, speed)`**](../../06-api-reference/classes/TextToSpeechModule.md#forwardfromphonemes): Generates the complete audio waveform from a phoneme string. -2. [**`streamFromPhonemes({ phonemes, speed })`**](../../06-api-reference/classes/TextToSpeechModule.md#streamfromphonemes): Streams audio chunks generated from a phoneme string. +2. [**`streamFromPhonemes({ phonemes, speed, onNext, ... })`**](../../06-api-reference/classes/TextToSpeechModule.md#streamfromphonemes): Streams audio chunks generated from a phoneme string. :::note Since `forward` and `forwardFromPhonemes` process the entire input at once, they might take a significant amount of time to produce audio for long inputs. 
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h index 90241eac03..c13b8991dc 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h @@ -188,17 +188,19 @@ template class ModelHostObject : public JsiHostObject { promiseHostFunction<&Model::stream>, "stream")); addFunctions(JSI_EXPORT_FUNCTION( - ModelHostObject, - synchronousHostFunction<&Model::streamStop>, + ModelHostObject, synchronousHostFunction<&Model::streamStop>, "streamStop")); addFunctions(JSI_EXPORT_FUNCTION( - ModelHostObject, - promiseHostFunction<&Model::generateFromPhonemes>, - "generateFromPhonemes")); - addFunctions(JSI_EXPORT_FUNCTION( - ModelHostObject, - promiseHostFunction<&Model::streamFromPhonemes>, - "streamFromPhonemes")); + ModelHostObject, synchronousHostFunction<&Model::streamInsert>, + "streamInsert")); + addFunctions( + JSI_EXPORT_FUNCTION(ModelHostObject, + promiseHostFunction<&Model::generateFromPhonemes>, + "generateFromPhonemes")); + addFunctions( + JSI_EXPORT_FUNCTION(ModelHostObject, + promiseHostFunction<&Model::streamFromPhonemes>, + "streamFromPhonemes")); } if constexpr (meta::HasGenerateFromString) { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Constants.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Constants.h index 3bc7f7f835..6064191443 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Constants.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Constants.h @@ -34,6 +34,10 @@ inline constexpr int32_t kSamplingRate = 24000; // Corresponds to Kokoro's model audio frequency inline constexpr int32_t kSamplesPerMilisecond = kSamplingRate / 1000; 
+// Special text characters +inline const std::unordered_set kEndOfSentenceCharacters = {'.', '?', '!', + ';'}; + // Special phonemes inline const std::unordered_set kEndOfSentencePhonemes = { U'.', U'?', U'!', U';', U'…'}; @@ -70,4 +74,4 @@ inline const std::unordered_map kVocab = { inline constexpr Token kInvalidToken = -1; inline constexpr Token kPadToken = 0; -} // namespace rnexecutorch::models::text_to_speech::kokoro::constants +} // namespace rnexecutorch::models::text_to_speech::kokoro::constants \ No newline at end of file diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp index d7da13b945..ea43f09d47 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp @@ -7,6 +7,7 @@ #include #include #include +#include namespace rnexecutorch::models::text_to_speech::kokoro { @@ -103,9 +104,8 @@ Kokoro::generateFromPhonemesImpl(const std::u32string &phonemes, float speed) { return audio; } -void Kokoro::streamFromPhonemesImpl( - const std::u32string &phonemes, float speed, - std::shared_ptr callback) { +void Kokoro::streamFromPhonemesImpl(const std::u32string &phonemes, float speed, + std::shared_ptr callback) { auto nativeCallback = [this, callback](const std::vector &audioVec) { if (this->isStreaming_) { this->callInvoker_->invokeAsync( @@ -116,8 +116,6 @@ void Kokoro::streamFromPhonemesImpl( } }; - isStreaming_ = true; - // Use LATENCY strategy to minimize the time-to-first-audio for streaming auto subsentences = partitioner_.divide(phonemes); @@ -152,8 +150,6 @@ void Kokoro::streamFromPhonemesImpl( // Push the audio right away to the JS side nativeCallback(std::move(audioPart)); } - - isStreaming_ = false; } std::vector Kokoro::generate(std::string text, float speed) { 
@@ -162,6 +158,10 @@ std::vector Kokoro::generate(std::string text, float speed) { "Kokoro: maximum input text size exceeded"); } + if (text.empty()) { + return {}; + } + // G2P (Grapheme to Phoneme) conversion auto phonemes = phonemizer_.process(text); @@ -171,24 +171,78 @@ std::vector Kokoro::generate(std::string text, float speed) { std::vector Kokoro::generateFromPhonemes(std::string phonemes, float speed) { if (phonemes.empty()) { - throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, - "Kokoro: phoneme string must not be empty"); + return {}; } + return generateFromPhonemesImpl( phonemis::utilities::string_utils::utf8_to_u32string(phonemes), speed); } -void Kokoro::stream(std::string text, float speed, +void Kokoro::stream(float speed, bool stopOnEmptyBuffer, std::shared_ptr callback) { - if (text.size() > params::kMaxTextSize) { - throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, - "Kokoro: maximum input text size exceeded"); - } + isStreaming_ = true; + stopOnEmptyBuffer_ = stopOnEmptyBuffer; + + // The outer streaming loop is responsible for handling the input buffer. + // The extracted text is then passed to the inner loop, which performs a + // standard streaming on a fixed amount of input text. + while (isStreaming_) { + std::string text; + + // Extract the code relying on input buffer for a separate mutex lock + // section. + { + std::scoped_lock lock(inputTextBufferMutex_); + if (inputTextBuffer_.empty() && stopOnEmptyBuffer_) { + break; + } + + // Try to find the most recent available end of sentence character. + size_t searchLimit = + std::min(inputTextBuffer_.size(), params::kMaxTextSize); + auto eosIt = std::find_first_of( + inputTextBuffer_.rbegin() + (inputTextBuffer_.size() - searchLimit), + inputTextBuffer_.rend(), constants::kEndOfSentenceCharacters.begin(), + constants::kEndOfSentenceCharacters.end()); + size_t chunkSize = (eosIt != inputTextBuffer_.rend()) + ? 
std::distance(eosIt, inputTextBuffer_.rend()) + : 0; + + // To maximize the quality of the speech, we try to avoid processing + // chunks which end in the middle of a sentence. + if (chunkSize > 0 || + streamSkippedIterations >= params::kStreamMaxSkippedIterations) { + // Forced flush: when no sentence boundary was found within the allowed + // number of skipped iterations, take everything available (capped at + // kMaxTextSize) so the buffer can drain and the stream can finish. + const size_t takeSize = (chunkSize > 0) ? chunkSize : searchLimit; + text = inputTextBuffer_.substr(0, takeSize); + inputTextBuffer_.erase(0, takeSize); + streamSkippedIterations = 0; + } else { + streamSkippedIterations++; + } + } - // G2P (Grapheme to Phoneme) conversion - auto phonemes = phonemizer_.process(text); + if (!text.empty()) { + // Now we proceed with a standard streaming logic for fixed-size input. + auto phonemes = phonemizer_.process(text); + streamFromPhonemesImpl(phonemes, speed, callback); + } - streamFromPhonemesImpl(phonemes, speed, callback); + // A little bit of pause to not overload the thread. + if (isStreaming_) { + std::this_thread::sleep_for( + std::chrono::milliseconds(params::kStreamPause)); + } + } + + { + std::scoped_lock lock(inputTextBufferMutex_); + inputTextBuffer_.clear(); + isStreaming_ = false; + streamSkippedIterations = 0; + } } void Kokoro::streamFromPhonemes(std::string phonemes, float speed, @@ -197,12 +251,26 @@ void Kokoro::streamFromPhonemes(std::string phonemes, float speed, throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, "Kokoro: phoneme string must not be empty"); } + + isStreaming_ = true; streamFromPhonemesImpl( phonemis::utilities::string_utils::utf8_to_u32string(phonemes), speed, callback); + isStreaming_ = false; } -void Kokoro::streamStop() noexcept { isStreaming_ = false; } +void Kokoro::streamInsert(std::string textChunk) noexcept { + std::scoped_lock lock(inputTextBufferMutex_); + inputTextBuffer_.append(textChunk); +} + +void Kokoro::streamStop(bool instant) noexcept { + if (instant) { + isStreaming_ = false; + } else { + stopOnEmptyBuffer_ = true; + } +} std::vector Kokoro::synthesize(const std::u32string &phonemes, float speed, size_t paddingMs) { @@ -220,8 +288,8 @@ std::vector 
Kokoro::synthesize(const std::u32string &phonemes, const auto tokens = utils::tokenize(phonemes, {noTokens}); // Select the appropriate voice vector - size_t voiceID = std::min({phonemes.size() - 1, noTokens - 1, - voice_.size() - 1}); + size_t voiceID = + std::min({phonemes.size() - 1, noTokens - 1, voice_.size() - 1}); auto &voice = voice_[voiceID]; // Initialize text mask diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h index 47fdab769b..e33631af61 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h @@ -25,27 +25,46 @@ class Kokoro { const std::string &synthesizerSource, const std::string &voiceSource, std::shared_ptr callInvoker); - // Processes the entire text at once, before sending back to the JS side. + /** + * Processes the entire text at once, before sending back to the JS side. + */ std::vector generate(std::string text, float speed = 1.F); - // Accepts pre-computed phonemes (as a UTF-8 IPA string) and synthesizes - // audio, bypassing the built-in phonemizer. This allows callers to use - // an external G2P system (e.g. the Python `phonemizer` library, espeak-ng, - // or any custom phonemizer). + /** + * Similar to generate(), but accepts pre-computed phonemes (as a UTF-8 IPA + * string) and synthesizes audio, bypassing the built-in phonemizer. + */ std::vector generateFromPhonemes(std::string phonemes, float speed = 1.F); - // Processes text in chunks, sending each chunk individualy to the JS side - // with asynchronous callbacks. - void stream(std::string text, float speed, + /** + * Processes text from inputTextBuffer_ in chunks, sending each chunk + * individually to the JS side with asynchronous callbacks. 
+ * + * Allows an incrementally expanded input by using an input text buffer. + */ + void stream(float speed, bool stopOnEmptyBuffer, std::shared_ptr callback); // Streaming variant that accepts pre-computed phonemes instead of text. void streamFromPhonemes(std::string phonemes, float speed, std::shared_ptr callback); - // Stops the streaming process - void streamStop() noexcept; + /** + * Updates the input streaming buffer by adding more text to be processed. + * + * @param textChunk A new chunk of text, appended to the end of the input buffer. + */ + void streamInsert(std::string textChunk) noexcept; + + /** + * Stops the streaming process. + * + * @param instant If true, stops the streaming as soon as possible by + * switching the isStreaming_ flag. Otherwise allows to process the rest of + * the buffer first, by switching the stopOnEmptyBuffer_ flag. + */ + void streamStop(bool instant) noexcept; std::size_t getMemoryLowerBound() const noexcept; void unload() noexcept; @@ -80,8 +99,12 @@ class Kokoro { // Each row is a style vector for a given input token count. 
std::vector> voice_; - // Extra control variables + // Streaming state control variables + std::string inputTextBuffer_; + mutable std::mutex inputTextBufferMutex_; std::atomic isStreaming_{false}; + std::atomic stopOnEmptyBuffer_{true}; + int32_t streamSkippedIterations = 0; }; } // namespace models::text_to_speech::kokoro diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h index f6b910b03f..f517db0318 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h @@ -20,6 +20,17 @@ namespace rnexecutorch::models::text_to_speech::kokoro::params { */ inline constexpr size_t kMaxTextSize = 2048; +/** + * A number of skipped streaming iterations after which we process the remaining + * input no matter what it looks like. + */ +inline constexpr int32_t kStreamMaxSkippedIterations = 3; + +/** + * A size of pause (in milliseconds) applied after each streaming iteration. + */ +inline constexpr int32_t kStreamPause = 200; + /** * A set of punctation - pause values. Determines how much pause (silence) is * being added at the end of each calculated audio vector. 
This is primarly used diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt index 52ed60685f..426aafc1f3 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt +++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt @@ -84,6 +84,12 @@ target_link_options(opencv_deps INTERFACE -fopenmp -static-openmp) add_library(tokenizers_deps INTERFACE) target_include_directories(tokenizers_deps INTERFACE "${TOKENIZERS_DIR}") +# Phonemis +add_library(phonemis STATIC IMPORTED) + set_target_properties(phonemis PROPERTIES + IMPORTED_LOCATION "${ANDROID_THIRD_PARTY}/phonemis/${ANDROID_ABI}/libphonemis.a" + ) + # Source Definitions set(CORE_SOURCES ${RNEXECUTORCH_DIR}/models/BaseModel.cpp @@ -218,6 +224,16 @@ add_rn_test(SpeechToTextTests integration/SpeechToTextTest.cpp LIBS tokenizers_deps z ) +add_rn_test(TextToSpeechTests integration/TextToSpeechTest.cpp + SOURCES + ${RNEXECUTORCH_DIR}/models/text_to_speech/kokoro/Kokoro.cpp + ${RNEXECUTORCH_DIR}/models/text_to_speech/kokoro/DurationPredictor.cpp + ${RNEXECUTORCH_DIR}/models/text_to_speech/kokoro/Synthesizer.cpp + ${RNEXECUTORCH_DIR}/models/text_to_speech/kokoro/Partitioner.cpp + ${RNEXECUTORCH_DIR}/models/text_to_speech/kokoro/Utils.cpp + LIBS phonemis +) + add_rn_test(LLMTests integration/LLMTest.cpp SOURCES ${RNEXECUTORCH_DIR}/models/llm/LLM.cpp diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextToSpeechTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextToSpeechTest.cpp new file mode 100644 index 0000000000..017c9c3695 --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextToSpeechTest.cpp @@ -0,0 +1,123 @@ +#include "BaseModelTests.h" +#include "utils/TestUtils.h" +#include +#include +#include +#include + +using namespace rnexecutorch; +using namespace 
rnexecutorch::models::text_to_speech::kokoro; + +constexpr auto kValidLang = "en-us"; +constexpr auto kValidTaggerPath = "kokoro_en_tagger.json"; +constexpr auto kValidPhonemizerPath = "kokoro_us_lexicon.json"; +constexpr auto kValidDurationPath = "kokoro_duration_predictor.pte"; +constexpr auto kValidSynthesizerPath = "kokoro_synthesizer.pte"; +constexpr auto kValidVoicePath = "kokoro_af_heart.bin"; + +namespace { +bool isAudioValid(const std::vector &audio) { + if (audio.empty()) { + return false; + } + // Check for non-silence (amplitude greater than an arbitrary small noise + // threshold) + return std::ranges::any_of( + audio, [](float sample) { return std::abs(sample) > 1e-4f; }); +} + +bool isAudioSimilar(const std::vector &audio1, + const std::vector &audio2, float tolerance = 0.1f) { + if (audio1.empty() || audio2.empty()) { + return false; + } + + double sumSqDiff = 0.0; + size_t steps = std::max(audio1.size(), audio2.size()); + + for (size_t i = 0; i < steps; ++i) { + size_t idx1 = + static_cast((static_cast(i) / steps) * audio1.size()); + size_t idx2 = + static_cast((static_cast(i) / steps) * audio2.size()); + + float diff = audio1[idx1] - audio2[idx2]; + sumSqDiff += diff * diff; + } + + double rmse = std::sqrt(sumSqDiff / steps); + if (rmse >= tolerance) { + std::cerr << "Audio structural RMSE difference: " << rmse + << " (tolerance: " << tolerance << ")" << std::endl; + return false; + } + return true; +} + +class KokoroTest : public ::testing::Test { +protected: + void SetUp() override { + try { + model_ = std::make_unique( + kValidLang, kValidTaggerPath, kValidPhonemizerPath, + kValidDurationPath, kValidSynthesizerPath, kValidVoicePath, nullptr); + } catch (...) 
{ + model_ = nullptr; + } + } + + std::unique_ptr model_; +}; +} // namespace + +TEST(TTSCtorTests, InvalidVoicePathThrows) { + EXPECT_THROW(Kokoro(kValidLang, kValidTaggerPath, kValidPhonemizerPath, + kValidDurationPath, kValidSynthesizerPath, + "nonexistent_voice.bin", nullptr), + RnExecutorchError); +} + +TEST_F(KokoroTest, MaxTextSizeExceededThrows) { + if (!model_) { + GTEST_SKIP() << "Model assets not available, skipping test."; + } + std::string hugeText(10000, 'A'); // beyond params::kMaxTextSize + EXPECT_THROW(model_->generate(hugeText, 1.0f), RnExecutorchError); +} + +TEST_F(KokoroTest, EmptyStringReturnsEmptyVector) { + if (!model_) { + GTEST_SKIP() << "Model assets not available, skipping test."; + } + auto result = model_->generate("", 1.0f); + EXPECT_TRUE(result.empty()); +} + +TEST_F(KokoroTest, GenerateReturnsValidAudio) { + if (!model_) { + GTEST_SKIP() << "Model assets not available, skipping test."; + } + auto result = model_->generate("Hello world! How are you doing?", 1.0f); + auto reference = test_utils::loadAudioFromFile("test_speech.raw"); + + ASSERT_FALSE(reference.empty()) + << "Reference audio 'test_speech.raw' not found."; + + // Compare against an audio waveform obtained from the original + // Kokoro model (PyTorch) + EXPECT_TRUE(isAudioSimilar(result, reference)); +} + +TEST_F(KokoroTest, GenerateSpeedAdjustsAudioLength) { + if (!model_) { + GTEST_SKIP() << "Model assets not available, skipping test."; + } + std::string text = "This is a sentence to test the speed modifications."; + auto resultNormal = model_->generate(text, 1.0f); + auto resultFast = model_->generate(text, 1.5f); + + EXPECT_TRUE(isAudioValid(resultNormal)); + EXPECT_TRUE(isAudioValid(resultFast)); + // Fast speech should result in a noticeably shorter output waveform + EXPECT_LT(resultFast.size(), resultNormal.size()); +} \ No newline at end of file diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/assets/test_speech.raw 
b/packages/react-native-executorch/common/rnexecutorch/tests/integration/assets/test_speech.raw new file mode 100644 index 0000000000..2cf55af04f Binary files /dev/null and b/packages/react-native-executorch/common/rnexecutorch/tests/integration/assets/test_speech.raw differ diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh index c46fde9fa9..d12f8fbada 100755 --- a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh +++ b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh @@ -29,6 +29,7 @@ TEST_EXECUTABLES=( "VADTests" "TokenizerModuleTests" "SpeechToTextTests" + "TextToSpeechTests" "LLMTests" "ImageSegmentationTests" "TextToImageTests" @@ -41,6 +42,7 @@ TEST_EXECUTABLES=( # ============================================================================ TEST_ASSETS=( "integration/assets/test_audio_float.raw" + "integration/assets/test_speech.raw" "integration/assets/we_are_software_mansion.jpg" ) @@ -58,6 +60,11 @@ MODELS=( "fsmn-vad_xnnpack.pte|https://huggingface.co/software-mansion/react-native-executorch-fsmn-vad/resolve/main/xnnpack/fsmn-vad_xnnpack.pte" "whisper_tiny_en_xnnpack.pte|https://huggingface.co/software-mansion/react-native-executorch-whisper-tiny.en/resolve/v0.8.0/xnnpack/whisper_tiny_en_xnnpack.pte" "whisper_tokenizer.json|https://huggingface.co/software-mansion/react-native-executorch-whisper-tiny.en/resolve/v0.8.0/tokenizer.json" + "kokoro_duration_predictor.pte|https://huggingface.co/software-mansion/react-native-executorch-kokoro/resolve/main/xnnpack/medium/duration_predictor.pte" + "kokoro_synthesizer.pte|https://huggingface.co/software-mansion/react-native-executorch-kokoro/resolve/main/xnnpack/medium/synthesizer.pte" + "kokoro_af_heart.bin|https://huggingface.co/software-mansion/react-native-executorch-kokoro/resolve/main/voices/af_heart.bin" + 
"kokoro_us_lexicon.json|https://huggingface.co/software-mansion/react-native-executorch-kokoro/resolve/main/phonemizer/us_merged.json" + "kokoro_en_tagger.json|https://huggingface.co/software-mansion/react-native-executorch-kokoro/resolve/main/phonemizer/tags.json" "smolLm2_135M_8da4w.pte|https://huggingface.co/software-mansion/react-native-executorch-smolLm-2/resolve/v0.6.0/smolLm-2-135M/quantized/smolLm2_135M_8da4w.pte" "smollm_tokenizer.json|https://huggingface.co/software-mansion/react-native-executorch-smolLm-2/resolve/v0.6.0/tokenizer.json" "deeplabV3_xnnpack_fp32.pte|https://huggingface.co/software-mansion/react-native-executorch-deeplab-v3/resolve/v0.6.0/xnnpack/deeplabV3_xnnpack_fp32.pte" diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts index 1d48aef34e..b5e03ceb59 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts @@ -5,7 +5,6 @@ import { TextToSpeechInput, TextToSpeechPhonemeInput, TextToSpeechType, - TextToSpeechStreamingCallbacks, TextToSpeechStreamingInput, TextToSpeechStreamingPhonemeInput, } from '../../types/tts'; @@ -91,36 +90,11 @@ export const useTextToSpeech = ({ [isReady, isGenerating, moduleInstance] ); - // Shared streaming orchestration (guards + onBegin/onNext/onEnd lifecycle) - const runStream = useCallback( - async ( - methodName: string, - generator: AsyncGenerator, - callbacks: TextToSpeechStreamingCallbacks - ) => { - guardReady(methodName); - setIsGenerating(true); - try { - await callbacks.onBegin?.(); - for await (const audio of generator) { - if (callbacks.onNext) { - await callbacks.onNext(audio); - } - } - } finally { - await callbacks.onEnd?.(); - setIsGenerating(false); - } - }, - // eslint-disable-next-line react-hooks/exhaustive-deps - 
[isReady, isGenerating, moduleInstance] - ); - const forward = async (input: TextToSpeechInput) => { const instance = guardReady('forward'); try { setIsGenerating(true); - return await instance.forward(input.text, input.speed ?? 1.0); + return await instance.forward(input.text ?? '', input.speed ?? 1.0); } finally { setIsGenerating(false); } @@ -131,7 +105,7 @@ export const useTextToSpeech = ({ try { setIsGenerating(true); return await instance.forwardFromPhonemes( - input.phonemes, + input.phonemes ?? '', input.speed ?? 1.0 ); } finally { @@ -142,28 +116,67 @@ export const useTextToSpeech = ({ const stream = useCallback( async (input: TextToSpeechStreamingInput) => { const instance = guardReady('stream'); - await runStream( - 'stream', - instance.stream({ text: input.text, speed: input.speed ?? 1.0 }), - input - ); + setIsGenerating(true); + try { + if (input.text) { + instance.streamInsert(input.text); + } + + await input.onBegin?.(); + for await (const audio of instance.stream({ + speed: input.speed ?? 1.0, + stopAutomatically: input.stopAutomatically ?? true, + })) { + if (input.onNext) { + await input.onNext(audio); + } + } + } finally { + await input.onEnd?.(); + setIsGenerating(false); + } }, - [guardReady, runStream] + [guardReady] ); const streamFromPhonemes = useCallback( async (input: TextToSpeechStreamingPhonemeInput) => { const instance = guardReady('streamFromPhonemes'); - await runStream( - 'streamFromPhonemes', - instance.streamFromPhonemes({ - phonemes: input.phonemes, + setIsGenerating(true); + try { + await input.onBegin?.(); + for await (const audio of instance.streamFromPhonemes({ + phonemes: input.phonemes ?? '', speed: input.speed ?? 
1.0, - }), - input - ); + })) { + if (input.onNext) { + await input.onNext(audio); + } + } + } finally { + await input.onEnd?.(); + setIsGenerating(false); + } + }, + [guardReady] + ); + + const streamInsert = useCallback( + (text: string) => { + if (moduleInstance) { + moduleInstance.streamInsert(text); + } + }, + [moduleInstance] + ); + + const streamStop = useCallback( + (instant: boolean = true) => { + if (moduleInstance) { + moduleInstance.streamStop(instant); + } }, - [guardReady, runStream] + [moduleInstance] ); return { @@ -174,7 +187,8 @@ export const useTextToSpeech = ({ forwardFromPhonemes, stream, streamFromPhonemes, - streamStop: () => moduleInstance?.streamStop(), + streamInsert, + streamStop, downloadProgress, }; }; diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts index a12285057b..fb27dd29ba 100644 --- a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts @@ -17,6 +17,7 @@ import { Logger } from '../../common/Logger'; */ export class TextToSpeechModule { private nativeModule: any; + private isStreaming: boolean = false; private constructor(nativeModule: unknown) { this.nativeModule = nativeModule; @@ -143,17 +144,23 @@ export class TextToSpeechModule { } /** - * Shared streaming implementation. Wraps a native streaming call in an - * async generator that yields Float32Array audio chunks as they arrive. + * Starts a streaming synthesis session. Yields audio chunks as they are generated. + * + * @param input - Input object containing text and optional speed. + * @returns An async generator yielding Float32Array audio chunks. 
*/ - private async *streamImpl( - nativeCall: (cb: (audio: number[]) => void) => Promise - ): AsyncGenerator { + public async *stream({ + speed, + stopAutomatically, + }: TextToSpeechStreamingInput): AsyncGenerator { + // Stores computed audio segments const queue: Float32Array[] = []; let waiter: (() => void) | null = null; - let finished = false; let error: unknown; + let nativeStreamFinished = false; + + this.isStreaming = true; const wake = () => { waiter?.(); @@ -162,46 +169,36 @@ export class TextToSpeechModule { (async () => { try { - await nativeCall((audio: number[]) => { - queue.push(new Float32Array(audio)); - wake(); - }); - finished = true; + await this.nativeModule.stream( + speed, + stopAutomatically, + (audio: number[]) => { + queue.push(new Float32Array(audio)); + wake(); + } + ); + nativeStreamFinished = true; wake(); } catch (e) { error = e; - finished = true; + nativeStreamFinished = true; wake(); } })(); - while (true) { + while (this.isStreaming) { if (queue.length > 0) { yield queue.shift()!; - if (finished && queue.length === 0) { + if (nativeStreamFinished && queue.length === 0) { return; } continue; } if (error) throw error; - if (finished) return; await new Promise((r) => (waiter = r)); } } - /** - * Starts a streaming synthesis session. Yields audio chunks as they are generated. - * - * @param input - Input object containing text and optional speed. - * @returns An async generator yielding Float32Array audio chunks. - */ - public async *stream({ - text, - speed, - }: TextToSpeechStreamingInput): AsyncGenerator { - yield* this.streamImpl((cb) => this.nativeModule.stream(text, speed, cb)); - } - /** * Starts a streaming synthesis session from pre-computed phonemes. * Bypasses the built-in phonemizer, allowing use of external G2P systems. 
@@ -213,16 +210,68 @@ export class TextToSpeechModule { phonemes, speed, }: TextToSpeechStreamingPhonemeInput): AsyncGenerator { - yield* this.streamImpl((cb) => - this.nativeModule.streamFromPhonemes(phonemes, speed, cb) - ); + const queue: Float32Array[] = []; + + let waiter: (() => void) | null = null; + let error: unknown; + let nativeStreamFinished = false; + + const wake = () => { + waiter?.(); + waiter = null; + }; + + (async () => { + try { + await this.nativeModule.streamFromPhonemes( + phonemes, + speed, + (audio: number[]) => { + queue.push(new Float32Array(audio)); + wake(); + } + ); + nativeStreamFinished = true; + wake(); + } catch (e) { + error = e; + nativeStreamFinished = true; + wake(); + } + })(); + + while (this.isStreaming) { + if (queue.length > 0) { + yield queue.shift()!; + if (nativeStreamFinished && queue.length === 0) { + return; + } + continue; + } + if (error) throw error; + await new Promise((r) => (waiter = r)); + } + } + + /** + * Inserts new text chunk into the buffer to be processed in streaming mode. + */ + public streamInsert(textChunk: string): void { + this.nativeModule.streamInsert(textChunk); } /** * Stops the streaming process if there is any ongoing. + * + * * @param instant If true, stops the streaming as soon as possible. Otherwise + * allows the module to complete processing for the remains of the buffer. 
*/ - public streamStop(): void { - this.nativeModule.streamStop(); + public streamStop(instant: boolean = true): void { + this.nativeModule.streamStop(instant); + + if (instant) { + this.isStreaming = false; + } } /** diff --git a/packages/react-native-executorch/src/types/tts.ts b/packages/react-native-executorch/src/types/tts.ts index 097f35976a..e515d13b42 100644 --- a/packages/react-native-executorch/src/types/tts.ts +++ b/packages/react-native-executorch/src/types/tts.ts @@ -93,7 +93,7 @@ export interface TextToSpeechProps extends TextToSpeechConfig { * @property {number} [speed] - optional speed argument - the higher it is, the faster the speech becomes */ export interface TextToSpeechInput { - text: string; + text?: string; speed?: number; } @@ -179,10 +179,18 @@ export interface TextToSpeechType { input: TextToSpeechStreamingPhonemeInput ) => Promise; + /** + * Inserts new text chunk into the buffer to be processed in streaming mode. + */ + streamInsert: (textChunk: string) => void; + /** * Interrupts and stops the currently active audio generation stream. + * + * @param instant If true, stops the streaming as soon as possible. Otherwise + * allows the module to complete processing for the remains of the buffer. */ - streamStop: () => void; + streamStop: (instant?: boolean) => void; } /** @@ -207,10 +215,16 @@ export interface TextToSpeechStreamingCallbacks { * Actions such as playing the audio should happen within the onNext callback. * Callbacks can be both synchronous or asynchronous. * + * Enables an incrementally expanded input, in other words adding + * new text chunks with streamInsert() as the streaming is running. + * * @category Types + * @property {boolean} [stopAutomatically] - If true, streaming will stop automatically when the buffer is empty. 
*/ export interface TextToSpeechStreamingInput - extends TextToSpeechInput, TextToSpeechStreamingCallbacks {} + extends TextToSpeechInput, TextToSpeechStreamingCallbacks { + stopAutomatically?: boolean; +} /** * Streaming input definition for pre-computed phonemes. diff --git a/packages/react-native-executorch/third-party/android/libs/phonemis/arm64-v8a/libphonemis.a b/packages/react-native-executorch/third-party/android/libs/phonemis/arm64-v8a/libphonemis.a index ab5d24b339..5a38707580 100644 Binary files a/packages/react-native-executorch/third-party/android/libs/phonemis/arm64-v8a/libphonemis.a and b/packages/react-native-executorch/third-party/android/libs/phonemis/arm64-v8a/libphonemis.a differ diff --git a/packages/react-native-executorch/third-party/android/libs/phonemis/x86_64/libphonemis.a b/packages/react-native-executorch/third-party/android/libs/phonemis/x86_64/libphonemis.a index 6f98f2aaca..2306d4647a 100644 Binary files a/packages/react-native-executorch/third-party/android/libs/phonemis/x86_64/libphonemis.a and b/packages/react-native-executorch/third-party/android/libs/phonemis/x86_64/libphonemis.a differ diff --git a/packages/react-native-executorch/third-party/ios/libs/phonemis/physical-arm64-release/libphonemis.a b/packages/react-native-executorch/third-party/ios/libs/phonemis/physical-arm64-release/libphonemis.a index 2b79e22edf..78f5169308 100644 Binary files a/packages/react-native-executorch/third-party/ios/libs/phonemis/physical-arm64-release/libphonemis.a and b/packages/react-native-executorch/third-party/ios/libs/phonemis/physical-arm64-release/libphonemis.a differ diff --git a/packages/react-native-executorch/third-party/ios/libs/phonemis/simulator-arm64-debug/libphonemis.a b/packages/react-native-executorch/third-party/ios/libs/phonemis/simulator-arm64-debug/libphonemis.a index c8dba0c3d4..ccf1d2fa64 100644 Binary files a/packages/react-native-executorch/third-party/ios/libs/phonemis/simulator-arm64-debug/libphonemis.a and 
b/packages/react-native-executorch/third-party/ios/libs/phonemis/simulator-arm64-debug/libphonemis.a differ