diff --git a/apps/speech/App.tsx b/apps/speech/App.tsx
index ab036678e8..ddbfe3c983 100644
--- a/apps/speech/App.tsx
+++ b/apps/speech/App.tsx
@@ -5,6 +5,7 @@ import { SpeechToTextScreen } from './screens/SpeechToTextScreen';
import ColorPalette from './colors';
import ExecutorchLogo from './assets/executorch.svg';
import { Quiz } from './screens/Quiz';
+import { TextToSpeechLLMScreen } from './screens/TextToSpeechLLMScreen';
import { initExecutorch } from 'react-native-executorch';
import { ExpoResourceFetcher } from '@react-native-executorch/expo-resource-fetcher';
@@ -14,7 +15,7 @@ initExecutorch({
export default function App() {
const [currentScreen, setCurrentScreen] = useState<
- 'menu' | 'speech-to-text' | 'text-to-speech' | 'quiz'
+ 'menu' | 'speech-to-text' | 'text-to-speech' | 'quiz' | 'text-to-speech-llm'
>('menu');
const goToMenu = () => setCurrentScreen('menu');
@@ -31,6 +32,10 @@ export default function App() {
return ;
}
+ if (currentScreen === 'text-to-speech-llm') {
+ return ;
+ }
+
return (
@@ -54,6 +59,12 @@ export default function App() {
>
Text to Speech - Quiz
+ setCurrentScreen('text-to-speech-llm')}
+ >
+ Text to Speech - LLM Streaming
+
);
diff --git a/apps/speech/screens/TextToSpeechLLMScreen.tsx b/apps/speech/screens/TextToSpeechLLMScreen.tsx
new file mode 100644
index 0000000000..04856cf3e7
--- /dev/null
+++ b/apps/speech/screens/TextToSpeechLLMScreen.tsx
@@ -0,0 +1,323 @@
+import React, { useEffect, useState, useRef } from 'react';
+import {
+ View,
+ Text,
+ StyleSheet,
+ TouchableOpacity,
+ ScrollView,
+} from 'react-native';
+import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context';
+import FontAwesome from '@expo/vector-icons/FontAwesome';
+import SWMIcon from '../assets/swm_icon.svg';
+import {
+ useLLM,
+ useTextToSpeech,
+ KOKORO_MEDIUM,
+ KOKORO_VOICE_AF_HEART,
+ LLAMA3_2_1B_QLORA,
+} from 'react-native-executorch';
+import {
+ AudioManager,
+ AudioContext,
+ AudioBuffer,
+ AudioBufferSourceNode,
+} from 'react-native-audio-api';
+
+interface TextToSpeechLLMProps {
+ onBack: () => void;
+}
+
+/**
+ * Converts an audio vector (Float32Array) to an AudioBuffer for playback
+ * @param audioVector - The generated audio samples from the model
+ * @param sampleRate - The sample rate (default: 24000 Hz for Kokoro)
+ * @returns AudioBuffer ready for playback
+ */
+const createAudioBufferFromVector = (
+ audioVector: Float32Array,
+ audioContext: AudioContext,
+ sampleRate: number = 24000
+): AudioBuffer => {
+ const audioBuffer = audioContext.createBuffer(
+ 1,
+ audioVector.length,
+ sampleRate
+ );
+ const channelData = audioBuffer.getChannelData(0);
+ channelData.set(audioVector);
+
+ return audioBuffer;
+};
+
+export const TextToSpeechLLMScreen = ({ onBack }: TextToSpeechLLMProps) => {
+ const [displayText, setDisplayText] = useState('');
+ const [isTtsStreaming, setIsTtsStreaming] = useState(false);
+ const llm = useLLM({ model: LLAMA3_2_1B_QLORA });
+ const tts = useTextToSpeech({
+ model: KOKORO_MEDIUM,
+ voice: KOKORO_VOICE_AF_HEART,
+ });
+
+ const processedLengthRef = useRef(0);
+ const audioContextRef = useRef(null);
+ const sourceRef = useRef(null);
+
+ useEffect(() => {
+ AudioManager.setAudioSessionOptions({
+ iosCategory: 'playAndRecord',
+ iosMode: 'spokenAudio',
+ iosOptions: ['defaultToSpeaker'],
+ });
+
+ audioContextRef.current = new AudioContext({ sampleRate: 24000 });
+ audioContextRef.current.suspend();
+
+ return () => {
+ audioContextRef.current?.close();
+ audioContextRef.current = null;
+ };
+ }, []);
+
+ // Update displayText gradually as response gets generated and insert new text chunks into TTS stream
+ useEffect(() => {
+ if (llm.response && tts.isReady) {
+ setDisplayText(llm.response);
+
+ const previousLength = processedLengthRef.current;
+ if (llm.response.length > previousLength && isTtsStreaming) {
+ const newChunk = llm.response.slice(previousLength);
+ tts.streamInsert(newChunk);
+ processedLengthRef.current = llm.response.length;
+ }
+ } else {
+ processedLengthRef.current = 0;
+ }
+ }, [llm.response, tts, isTtsStreaming]);
+
+ const handleGenerate = async () => {
+ setDisplayText('');
+ processedLengthRef.current = 0;
+ setIsTtsStreaming(true);
+
+ const startTTS = async () => {
+ try {
+ const audioContext = audioContextRef.current;
+ if (!audioContext) return;
+
+ if (audioContext.state === 'suspended') {
+ await audioContext.resume();
+ }
+
+ const onNext = async (audioVec: Float32Array) => {
+ return new Promise((resolve) => {
+ const audioBuffer = createAudioBufferFromVector(
+ audioVec,
+ audioContext,
+ 24000
+ );
+
+ const source = (sourceRef.current =
+ audioContext.createBufferSource());
+ source.buffer = audioBuffer;
+ source.connect(audioContext.destination);
+
+ source.onEnded = () => resolve();
+
+ source.start();
+ });
+ };
+
+ await tts.stream({
+ speed: 0.9,
+ stopAutomatically: false,
+ onNext,
+ });
+ } catch (e) {
+ console.error('TTS streaming error:', e);
+ } finally {
+ setIsTtsStreaming(false);
+ }
+ };
+
+ const ttsPromise = startTTS();
+
+ try {
+ await llm.sendMessage(
+ 'Generate a short story about a robot learning to paint. The story should be around 200 words long.'
+ );
+ } catch (e) {
+ console.error('Generation failed:', e);
+ } finally {
+ tts.streamStop(false);
+ await ttsPromise;
+
+ if (
+ audioContextRef.current &&
+ audioContextRef.current.state === 'running'
+ ) {
+ await audioContextRef.current.suspend();
+ }
+ }
+ };
+
+ const handleStop = () => {
+ llm.interrupt();
+ tts.streamStop(true);
+ if (sourceRef.current) {
+ try {
+ sourceRef.current.stop();
+ } catch (e) {
+ // Source might have already stopped or disconnected
+ }
+ }
+ };
+
+ const isProcessing = llm.isGenerating || isTtsStreaming;
+ const isModelsReady = llm.isReady && tts.isReady;
+
+ const getModelStatus = () => {
+ if (llm.error) return `LLM Error: ${llm.error.message}`;
+ if (tts.error) return `TTS Error: ${tts.error.message}`;
+ if (!llm.isReady)
+ return `Loading LLM: ${(100 * llm.downloadProgress).toFixed(2)}%`;
+ if (!tts.isReady)
+ return `Loading TTS: ${(100 * tts.downloadProgress).toFixed(2)}%`;
+ if (isProcessing) return 'Generating/Streaming...';
+ return 'Ready';
+ };
+
+ return (
+
+
+
+
+
+
+
+ React Native ExecuTorch
+ LLM to Speech Demo
+
+
+
+ Status: {getModelStatus()}
+
+
+
+ Generated Story
+
+
+
+ {displayText ||
+ (isModelsReady
+ ? 'Press the button to generate a story and hear it spoken aloud.'
+ : 'Please wait for models to load...')}
+
+
+
+
+
+
+ {isProcessing ? (
+
+
+ Stop Generation
+
+ ) : (
+
+
+ Generate & Stream Speech
+
+ )}
+
+
+
+ );
+};
+
+const styles = StyleSheet.create({
+ container: {
+ flex: 1,
+ alignItems: 'center',
+ backgroundColor: 'white',
+ paddingHorizontal: 16,
+ },
+ header: {
+ alignItems: 'center',
+ position: 'relative',
+ width: '100%',
+ },
+ backButton: {
+ position: 'absolute',
+ left: 0,
+ top: 10,
+ padding: 10,
+ zIndex: 1,
+ },
+ headerText: {
+ fontSize: 22,
+ fontWeight: 'bold',
+ color: '#0f186e',
+ },
+ statusContainer: {
+ marginTop: 12,
+ alignItems: 'center',
+ },
+ contentContainer: {
+ width: '100%',
+ marginTop: 24,
+ flex: 1,
+ marginBottom: 24,
+ },
+ label: {
+ marginLeft: 12,
+ marginBottom: 4,
+ color: '#0f186e',
+ fontWeight: '600',
+ },
+ responseContainer: {
+ borderRadius: 12,
+ borderWidth: 1,
+ borderColor: '#0f186e',
+ flex: 1,
+ },
+ responseContent: {
+ padding: 12,
+ },
+ responseText: {
+ fontSize: 16,
+ color: '#333',
+ lineHeight: 24,
+ },
+ buttonContainer: {
+ marginBottom: 24,
+ width: '100%',
+ },
+ actionButton: {
+ backgroundColor: '#0f186e',
+ flexDirection: 'row',
+ justifyContent: 'center',
+ alignItems: 'center',
+ padding: 12,
+ borderRadius: 12,
+ gap: 8,
+ },
+ stopButton: {
+ backgroundColor: '#ff4444',
+ },
+ buttonText: {
+ color: 'white',
+ fontWeight: '600',
+ letterSpacing: -0.5,
+ fontSize: 16,
+ },
+ disabled: {
+ opacity: 0.5,
+ },
+});
diff --git a/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md b/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md
index 10e9986dee..af0fe8e18c 100644
--- a/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md
+++ b/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md
@@ -87,7 +87,8 @@ The module provides two ways to generate speech using either raw text or pre-gen
### Using Text
1. [**`forward({ text, speed })`**](../../06-api-reference/interfaces/TextToSpeechType.md#forward): Generates the complete audio waveform at once. Returns a promise resolving to a `Float32Array`.
-2. [**`stream({ text, speed, onNext, ... })`**](../../06-api-reference/interfaces/TextToSpeechType.md#stream): An async generator that yields chunks of audio as they are computed. This is ideal for reducing the "time to first audio" for long sentences.
+2. [**`stream({ speed, stopAutomatically, onNext, ... })`**](../../06-api-reference/interfaces/TextToSpeechType.md#stream): Async-generator-like functionality (driven via callbacks such as `onNext`) that yields chunks of audio as they are computed.
+ This is ideal for reducing the "time to first audio" for long sentences. You can also dynamically insert text during the generation process using `streamInsert(text)` and stop it with `streamStop(instant)`.
### Using Phonemes
diff --git a/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md b/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md
index 00fd04f53b..ec0919574c 100644
--- a/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md
+++ b/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md
@@ -52,14 +52,14 @@ The module provides two ways to generate speech using either raw text or pre-gen
### Using Text
1. [**`forward(text, speed)`**](../../06-api-reference/classes/TextToSpeechModule.md#forward): Generates the complete audio waveform at once. Returns a promise resolving to a `Float32Array`.
-2. [**`stream({ text, speed })`**](../../06-api-reference/classes/TextToSpeechModule.md#stream): An async generator that yields chunks of audio as they are computed. This is ideal for reducing the "time to first audio" for long sentences.
+2. [**`stream({ speed, stopAutomatically, onNext, ... })`**](../../06-api-reference/classes/TextToSpeechModule.md#stream): An async generator that yields chunks of audio as they are computed. This is ideal for reducing the "time to first audio" for long sentences. In contrast to `forward`, it enables inserting text chunks dynamically into the processing buffer with [**`streamInsert(text)`**](../../06-api-reference/classes/TextToSpeechModule.md#streaminsert) and allows stopping generation early with [**`streamStop(instant)`**](../../06-api-reference/classes/TextToSpeechModule.md#streamstop).
### Using Phonemes
If you have pre-computed phonemes (e.g., from an external dictionary or a custom G2P model), you can skip the internal phoneme generation step:
1. [**`forwardFromPhonemes(phonemes, speed)`**](../../06-api-reference/classes/TextToSpeechModule.md#forwardfromphonemes): Generates the complete audio waveform from a phoneme string.
-2. [**`streamFromPhonemes({ phonemes, speed })`**](../../06-api-reference/classes/TextToSpeechModule.md#streamfromphonemes): Streams audio chunks generated from a phoneme string.
+2. [**`streamFromPhonemes({ phonemes, speed, onNext, ... })`**](../../06-api-reference/classes/TextToSpeechModule.md#streamfromphonemes): Streams audio chunks generated from a phoneme string.
:::note
Since `forward` and `forwardFromPhonemes` process the entire input at once, they might take a significant amount of time to produce audio for long inputs.
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
index 90241eac03..c13b8991dc 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
@@ -188,17 +188,19 @@ template class ModelHostObject : public JsiHostObject {
promiseHostFunction<&Model::stream>,
"stream"));
addFunctions(JSI_EXPORT_FUNCTION(
- ModelHostObject,
- synchronousHostFunction<&Model::streamStop>,
+ ModelHostObject, synchronousHostFunction<&Model::streamStop>,
"streamStop"));
addFunctions(JSI_EXPORT_FUNCTION(
- ModelHostObject,
- promiseHostFunction<&Model::generateFromPhonemes>,
- "generateFromPhonemes"));
- addFunctions(JSI_EXPORT_FUNCTION(
- ModelHostObject,
- promiseHostFunction<&Model::streamFromPhonemes>,
- "streamFromPhonemes"));
+ ModelHostObject, synchronousHostFunction<&Model::streamInsert>,
+ "streamInsert"));
+ addFunctions(
+ JSI_EXPORT_FUNCTION(ModelHostObject,
+ promiseHostFunction<&Model::generateFromPhonemes>,
+ "generateFromPhonemes"));
+ addFunctions(
+ JSI_EXPORT_FUNCTION(ModelHostObject,
+ promiseHostFunction<&Model::streamFromPhonemes>,
+ "streamFromPhonemes"));
}
if constexpr (meta::HasGenerateFromString) {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Constants.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Constants.h
index 3bc7f7f835..6064191443 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Constants.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Constants.h
@@ -34,6 +34,10 @@ inline constexpr int32_t kSamplingRate =
24000; // Corresponds to Kokoro's model audio frequency
inline constexpr int32_t kSamplesPerMilisecond = kSamplingRate / 1000;
+// Special text characters
+inline const std::unordered_set kEndOfSentenceCharacters = {'.', '?', '!',
+ ';'};
+
// Special phonemes
inline const std::unordered_set kEndOfSentencePhonemes = {
U'.', U'?', U'!', U';', U'…'};
@@ -70,4 +74,4 @@ inline const std::unordered_map kVocab = {
inline constexpr Token kInvalidToken = -1;
inline constexpr Token kPadToken = 0;
-} // namespace rnexecutorch::models::text_to_speech::kokoro::constants
+} // namespace rnexecutorch::models::text_to_speech::kokoro::constants
\ No newline at end of file
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp
index d7da13b945..ea43f09d47 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp
@@ -7,6 +7,7 @@
#include
#include
#include
+#include
namespace rnexecutorch::models::text_to_speech::kokoro {
@@ -103,9 +104,8 @@ Kokoro::generateFromPhonemesImpl(const std::u32string &phonemes, float speed) {
return audio;
}
-void Kokoro::streamFromPhonemesImpl(
- const std::u32string &phonemes, float speed,
- std::shared_ptr callback) {
+void Kokoro::streamFromPhonemesImpl(const std::u32string &phonemes, float speed,
+ std::shared_ptr callback) {
auto nativeCallback = [this, callback](const std::vector &audioVec) {
if (this->isStreaming_) {
this->callInvoker_->invokeAsync(
@@ -116,8 +116,6 @@ void Kokoro::streamFromPhonemesImpl(
}
};
- isStreaming_ = true;
-
// Use LATENCY strategy to minimize the time-to-first-audio for streaming
auto subsentences =
partitioner_.divide(phonemes);
@@ -152,8 +150,6 @@ void Kokoro::streamFromPhonemesImpl(
// Push the audio right away to the JS side
nativeCallback(std::move(audioPart));
}
-
- isStreaming_ = false;
}
std::vector Kokoro::generate(std::string text, float speed) {
@@ -162,6 +158,10 @@ std::vector Kokoro::generate(std::string text, float speed) {
"Kokoro: maximum input text size exceeded");
}
+ if (text.empty()) {
+ return {};
+ }
+
// G2P (Grapheme to Phoneme) conversion
auto phonemes = phonemizer_.process(text);
@@ -171,24 +171,74 @@ std::vector Kokoro::generate(std::string text, float speed) {
std::vector Kokoro::generateFromPhonemes(std::string phonemes,
float speed) {
if (phonemes.empty()) {
- throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
- "Kokoro: phoneme string must not be empty");
+ return {};
}
+
return generateFromPhonemesImpl(
phonemis::utilities::string_utils::utf8_to_u32string(phonemes), speed);
}
-void Kokoro::stream(std::string text, float speed,
+void Kokoro::stream(float speed, bool stopOnEmptyBuffer,
std::shared_ptr callback) {
- if (text.size() > params::kMaxTextSize) {
- throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
- "Kokoro: maximum input text size exceeded");
- }
+ isStreaming_ = true;
+ stopOnEmptyBuffer_ = stopOnEmptyBuffer;
+
+ // The outer streaming loop is responsible for handling the input buffer.
+ // The extracted text is then passed to the inner loop, which performs
+ // standard streaming on a fixed amount of input text.
+ while (isStreaming_) {
+ std::string text;
+
+ // Keep the code that touches the input buffer in a separate
+ // mutex-locked section.
+ {
+ std::scoped_lock lock(inputTextBufferMutex_);
+ if (inputTextBuffer_.empty() && stopOnEmptyBuffer_) {
+ break;
+ }
+
+ // Try to find the most recent available end of sentence character.
+ size_t searchLimit =
+ std::min(inputTextBuffer_.size(), params::kMaxTextSize);
+ auto eosIt = std::find_first_of(
+ inputTextBuffer_.rbegin() + (inputTextBuffer_.size() - searchLimit),
+ inputTextBuffer_.rend(), constants::kEndOfSentenceCharacters.begin(),
+ constants::kEndOfSentenceCharacters.end());
+ size_t chunkSize = (eosIt != inputTextBuffer_.rend())
+ ? std::distance(eosIt, inputTextBuffer_.rend())
+ : 0;
+
+ // To maximize the quality of the speech, we try to avoid processing
+ // chunks which end in the middle of a sentence.
+ if (chunkSize > 0 ||
+ streamSkippedIterations >= params::kStreamMaxSkippedIterations) {
+ text = inputTextBuffer_.substr(0, chunkSize);
+ inputTextBuffer_.erase(0, chunkSize);
+ streamSkippedIterations = 0;
+ } else {
+ streamSkippedIterations++;
+ }
+ }
- // G2P (Grapheme to Phoneme) conversion
- auto phonemes = phonemizer_.process(text);
+ if (!text.empty()) {
+ // Now we proceed with a standard streaming logic for fixed-size input.
+ auto phonemes = phonemizer_.process(text);
+ streamFromPhonemesImpl(phonemes, speed, callback);
+ }
- streamFromPhonemesImpl(phonemes, speed, callback);
+ // Pause briefly so the streaming loop does not busy-wait on the thread.
+ if (isStreaming_) {
+ std::this_thread::sleep_for(
+ std::chrono::milliseconds(params::kStreamPause));
+ }
+ }
+
+ {
+ std::scoped_lock lock(inputTextBufferMutex_);
+ inputTextBuffer_.clear();
+ isStreaming_ = false;
+ streamSkippedIterations = 0;
+ }
}
void Kokoro::streamFromPhonemes(std::string phonemes, float speed,
@@ -197,12 +247,26 @@ void Kokoro::streamFromPhonemes(std::string phonemes, float speed,
throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
"Kokoro: phoneme string must not be empty");
}
+
+ isStreaming_ = true;
streamFromPhonemesImpl(
phonemis::utilities::string_utils::utf8_to_u32string(phonemes), speed,
callback);
+ isStreaming_ = false;
}
-void Kokoro::streamStop() noexcept { isStreaming_ = false; }
+void Kokoro::streamInsert(std::string textChunk) noexcept {
+ std::scoped_lock lock(inputTextBufferMutex_);
+ inputTextBuffer_.append(textChunk);
+}
+
+void Kokoro::streamStop(bool instant) noexcept {
+ if (instant) {
+ isStreaming_ = false;
+ } else {
+ stopOnEmptyBuffer_ = true;
+ }
+}
std::vector Kokoro::synthesize(const std::u32string &phonemes,
float speed, size_t paddingMs) {
@@ -220,8 +284,8 @@ std::vector Kokoro::synthesize(const std::u32string &phonemes,
const auto tokens = utils::tokenize(phonemes, {noTokens});
// Select the appropriate voice vector
- size_t voiceID = std::min({phonemes.size() - 1, noTokens - 1,
- voice_.size() - 1});
+ size_t voiceID =
+ std::min({phonemes.size() - 1, noTokens - 1, voice_.size() - 1});
auto &voice = voice_[voiceID];
// Initialize text mask
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h
index 47fdab769b..e33631af61 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h
@@ -25,27 +25,46 @@ class Kokoro {
const std::string &synthesizerSource, const std::string &voiceSource,
std::shared_ptr callInvoker);
- // Processes the entire text at once, before sending back to the JS side.
+ /**
+ * Processes the entire text at once, before sending back to the JS side.
+ */
std::vector generate(std::string text, float speed = 1.F);
- // Accepts pre-computed phonemes (as a UTF-8 IPA string) and synthesizes
- // audio, bypassing the built-in phonemizer. This allows callers to use
- // an external G2P system (e.g. the Python `phonemizer` library, espeak-ng,
- // or any custom phonemizer).
+ /**
+ * Similar to generate(), but accepts pre-computed phonemes (as a UTF-8 IPA
+ * string) and synthesizes audio, bypassing the built-in phonemizer.
+ */
std::vector generateFromPhonemes(std::string phonemes,
float speed = 1.F);
- // Processes text in chunks, sending each chunk individualy to the JS side
- // with asynchronous callbacks.
- void stream(std::string text, float speed,
+ /**
+ * Processes text from inputTextBuffer_ in chunks, sending each chunk
+ * individually to the JS side with asynchronous callbacks.
+ *
+ * Allows an incrementally expanded input by using an input text buffer.
+ */
+ void stream(float speed, bool stopOnEmptyBuffer,
std::shared_ptr callback);
// Streaming variant that accepts pre-computed phonemes instead of text.
void streamFromPhonemes(std::string phonemes, float speed,
std::shared_ptr callback);
- // Stops the streaming process
- void streamStop() noexcept;
+ /**
+ * Updates the input streaming buffer by adding more text to be processed.
+ *
+ * @param textChunk A new chunk of text, appended to the end of the input buffer.
+ */
+ void streamInsert(std::string textChunk) noexcept;
+
+ /**
+ * Stops the streaming process.
+ *
+ * @param instant If true, stops the streaming as soon as possible by
+ * switching the isStreaming_ flag. Otherwise allows the rest of the buffer
+ * to be processed first, by switching the stopOnEmptyBuffer_ flag.
+ */
+ void streamStop(bool instant) noexcept;
std::size_t getMemoryLowerBound() const noexcept;
void unload() noexcept;
@@ -80,8 +99,12 @@ class Kokoro {
// Each row is a style vector for a given input token count.
std::vector> voice_;
- // Extra control variables
+ // Streaming state control variables
+ std::string inputTextBuffer_;
+ mutable std::mutex inputTextBufferMutex_;
std::atomic isStreaming_{false};
+ std::atomic stopOnEmptyBuffer_{true};
+ int32_t streamSkippedIterations = 0;
};
} // namespace models::text_to_speech::kokoro
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h
index f6b910b03f..f517db0318 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h
@@ -20,6 +20,17 @@ namespace rnexecutorch::models::text_to_speech::kokoro::params {
*/
inline constexpr size_t kMaxTextSize = 2048;
+/**
+ * The number of skipped streaming iterations after which we process the
+ * remaining input regardless of its content.
+ */
+inline constexpr int32_t kStreamMaxSkippedIterations = 3;
+
+/**
+ * The length of the pause (in milliseconds) applied after each streaming iteration.
+ */
+inline constexpr int32_t kStreamPause = 200;
+
/**
* A set of punctation - pause values. Determines how much pause (silence) is
* being added at the end of each calculated audio vector. This is primarly used
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
index 52ed60685f..426aafc1f3 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
@@ -84,6 +84,12 @@ target_link_options(opencv_deps INTERFACE -fopenmp -static-openmp)
add_library(tokenizers_deps INTERFACE)
target_include_directories(tokenizers_deps INTERFACE "${TOKENIZERS_DIR}")
+# Phonemis
+add_library(phonemis STATIC IMPORTED)
+ set_target_properties(phonemis PROPERTIES
+ IMPORTED_LOCATION "${ANDROID_THIRD_PARTY}/phonemis/${ANDROID_ABI}/libphonemis.a"
+ )
+
# Source Definitions
set(CORE_SOURCES
${RNEXECUTORCH_DIR}/models/BaseModel.cpp
@@ -218,6 +224,16 @@ add_rn_test(SpeechToTextTests integration/SpeechToTextTest.cpp
LIBS tokenizers_deps z
)
+add_rn_test(TextToSpeechTests integration/TextToSpeechTest.cpp
+ SOURCES
+ ${RNEXECUTORCH_DIR}/models/text_to_speech/kokoro/Kokoro.cpp
+ ${RNEXECUTORCH_DIR}/models/text_to_speech/kokoro/DurationPredictor.cpp
+ ${RNEXECUTORCH_DIR}/models/text_to_speech/kokoro/Synthesizer.cpp
+ ${RNEXECUTORCH_DIR}/models/text_to_speech/kokoro/Partitioner.cpp
+ ${RNEXECUTORCH_DIR}/models/text_to_speech/kokoro/Utils.cpp
+ LIBS phonemis
+)
+
add_rn_test(LLMTests integration/LLMTest.cpp
SOURCES
${RNEXECUTORCH_DIR}/models/llm/LLM.cpp
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextToSpeechTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextToSpeechTest.cpp
new file mode 100644
index 0000000000..017c9c3695
--- /dev/null
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextToSpeechTest.cpp
@@ -0,0 +1,123 @@
+#include "BaseModelTests.h"
+#include "utils/TestUtils.h"
+#include
+#include
+#include
+#include
+
+using namespace rnexecutorch;
+using namespace rnexecutorch::models::text_to_speech::kokoro;
+
+constexpr auto kValidLang = "en-us";
+constexpr auto kValidTaggerPath = "kokoro_en_tagger.json";
+constexpr auto kValidPhonemizerPath = "kokoro_us_lexicon.json";
+constexpr auto kValidDurationPath = "kokoro_duration_predictor.pte";
+constexpr auto kValidSynthesizerPath = "kokoro_synthesizer.pte";
+constexpr auto kValidVoicePath = "kokoro_af_heart.bin";
+
+namespace {
+bool isAudioValid(const std::vector &audio) {
+ if (audio.empty()) {
+ return false;
+ }
+ // Check for non-silence (amplitude greater than an arbitrary small noise
+ // threshold)
+ return std::ranges::any_of(
+ audio, [](float sample) { return std::abs(sample) > 1e-4f; });
+}
+
+bool isAudioSimilar(const std::vector &audio1,
+ const std::vector &audio2, float tolerance = 0.1f) {
+ if (audio1.empty() || audio2.empty()) {
+ return false;
+ }
+
+ double sumSqDiff = 0.0;
+ size_t steps = std::max(audio1.size(), audio2.size());
+
+ for (size_t i = 0; i < steps; ++i) {
+ size_t idx1 =
+ static_cast((static_cast(i) / steps) * audio1.size());
+ size_t idx2 =
+ static_cast((static_cast(i) / steps) * audio2.size());
+
+ float diff = audio1[idx1] - audio2[idx2];
+ sumSqDiff += diff * diff;
+ }
+
+ double rmse = std::sqrt(sumSqDiff / steps);
+ if (rmse >= tolerance) {
+ std::cerr << "Audio structural RMSE difference: " << rmse
+ << " (tolerance: " << tolerance << ")" << std::endl;
+ return false;
+ }
+ return true;
+}
+
+class KokoroTest : public ::testing::Test {
+protected:
+ void SetUp() override {
+ try {
+ model_ = std::make_unique(
+ kValidLang, kValidTaggerPath, kValidPhonemizerPath,
+ kValidDurationPath, kValidSynthesizerPath, kValidVoicePath, nullptr);
+ } catch (...) {
+ model_ = nullptr;
+ }
+ }
+
+ std::unique_ptr model_;
+};
+} // namespace
+
+TEST(TTSCtorTests, InvalidVoicePathThrows) {
+ EXPECT_THROW(Kokoro(kValidLang, kValidTaggerPath, kValidPhonemizerPath,
+ kValidDurationPath, kValidSynthesizerPath,
+ "nonexistent_voice.bin", nullptr),
+ RnExecutorchError);
+}
+
+TEST_F(KokoroTest, MaxTextSizeExceededThrows) {
+ if (!model_) {
+ GTEST_SKIP() << "Model assets not available, skipping test.";
+ }
+ std::string hugeText(10000, 'A'); // beyond params::kMaxTextSize
+ EXPECT_THROW(model_->generate(hugeText, 1.0f), RnExecutorchError);
+}
+
+TEST_F(KokoroTest, EmptyStringReturnsEmptyVector) {
+ if (!model_) {
+ GTEST_SKIP() << "Model assets not available, skipping test.";
+ }
+ auto result = model_->generate("", 1.0f);
+ EXPECT_TRUE(result.empty());
+}
+
+TEST_F(KokoroTest, GenerateReturnsValidAudio) {
+ if (!model_) {
+ GTEST_SKIP() << "Model assets not available, skipping test.";
+ }
+ auto result = model_->generate("Hello world! How are you doing?", 1.0f);
+ auto reference = test_utils::loadAudioFromFile("test_speech.raw");
+
+ ASSERT_FALSE(reference.empty())
+ << "Reference audio 'test_speech.raw' not found.";
+
+ // Compare against an audio waveform obtained from the original
+ // Kokoro model (PyTorch)
+ EXPECT_TRUE(isAudioSimilar(result, reference));
+}
+
+TEST_F(KokoroTest, GenerateSpeedAdjustsAudioLength) {
+ if (!model_) {
+ GTEST_SKIP() << "Model assets not available, skipping test.";
+ }
+ std::string text = "This is a sentence to test the speed modifications.";
+ auto resultNormal = model_->generate(text, 1.0f);
+ auto resultFast = model_->generate(text, 1.5f);
+
+ EXPECT_TRUE(isAudioValid(resultNormal));
+ EXPECT_TRUE(isAudioValid(resultFast));
+ // Fast speech should result in a noticeably shorter output waveform
+ EXPECT_LT(resultFast.size(), resultNormal.size());
+}
\ No newline at end of file
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/assets/test_speech.raw b/packages/react-native-executorch/common/rnexecutorch/tests/integration/assets/test_speech.raw
new file mode 100644
index 0000000000..2cf55af04f
Binary files /dev/null and b/packages/react-native-executorch/common/rnexecutorch/tests/integration/assets/test_speech.raw differ
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh
index c46fde9fa9..d12f8fbada 100755
--- a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh
@@ -29,6 +29,7 @@ TEST_EXECUTABLES=(
"VADTests"
"TokenizerModuleTests"
"SpeechToTextTests"
+ "TextToSpeechTests"
"LLMTests"
"ImageSegmentationTests"
"TextToImageTests"
@@ -41,6 +42,7 @@ TEST_EXECUTABLES=(
# ============================================================================
TEST_ASSETS=(
"integration/assets/test_audio_float.raw"
+ "integration/assets/test_speech.raw"
"integration/assets/we_are_software_mansion.jpg"
)
@@ -58,6 +60,11 @@ MODELS=(
"fsmn-vad_xnnpack.pte|https://huggingface.co/software-mansion/react-native-executorch-fsmn-vad/resolve/main/xnnpack/fsmn-vad_xnnpack.pte"
"whisper_tiny_en_xnnpack.pte|https://huggingface.co/software-mansion/react-native-executorch-whisper-tiny.en/resolve/v0.8.0/xnnpack/whisper_tiny_en_xnnpack.pte"
"whisper_tokenizer.json|https://huggingface.co/software-mansion/react-native-executorch-whisper-tiny.en/resolve/v0.8.0/tokenizer.json"
+ "kokoro_duration_predictor.pte|https://huggingface.co/software-mansion/react-native-executorch-kokoro/resolve/main/xnnpack/medium/duration_predictor.pte"
+ "kokoro_synthesizer.pte|https://huggingface.co/software-mansion/react-native-executorch-kokoro/resolve/main/xnnpack/medium/synthesizer.pte"
+ "kokoro_af_heart.bin|https://huggingface.co/software-mansion/react-native-executorch-kokoro/resolve/main/voices/af_heart.bin"
+ "kokoro_us_lexicon.json|https://huggingface.co/software-mansion/react-native-executorch-kokoro/resolve/main/phonemizer/us_merged.json"
+ "kokoro_en_tagger.json|https://huggingface.co/software-mansion/react-native-executorch-kokoro/resolve/main/phonemizer/tags.json"
"smolLm2_135M_8da4w.pte|https://huggingface.co/software-mansion/react-native-executorch-smolLm-2/resolve/v0.6.0/smolLm-2-135M/quantized/smolLm2_135M_8da4w.pte"
"smollm_tokenizer.json|https://huggingface.co/software-mansion/react-native-executorch-smolLm-2/resolve/v0.6.0/tokenizer.json"
"deeplabV3_xnnpack_fp32.pte|https://huggingface.co/software-mansion/react-native-executorch-deeplab-v3/resolve/v0.6.0/xnnpack/deeplabV3_xnnpack_fp32.pte"
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts
index 1d48aef34e..b5e03ceb59 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts
@@ -5,7 +5,6 @@ import {
TextToSpeechInput,
TextToSpeechPhonemeInput,
TextToSpeechType,
- TextToSpeechStreamingCallbacks,
TextToSpeechStreamingInput,
TextToSpeechStreamingPhonemeInput,
} from '../../types/tts';
@@ -91,36 +90,11 @@ export const useTextToSpeech = ({
[isReady, isGenerating, moduleInstance]
);
- // Shared streaming orchestration (guards + onBegin/onNext/onEnd lifecycle)
- const runStream = useCallback(
- async (
- methodName: string,
- generator: AsyncGenerator,
- callbacks: TextToSpeechStreamingCallbacks
- ) => {
- guardReady(methodName);
- setIsGenerating(true);
- try {
- await callbacks.onBegin?.();
- for await (const audio of generator) {
- if (callbacks.onNext) {
- await callbacks.onNext(audio);
- }
- }
- } finally {
- await callbacks.onEnd?.();
- setIsGenerating(false);
- }
- },
- // eslint-disable-next-line react-hooks/exhaustive-deps
- [isReady, isGenerating, moduleInstance]
- );
-
const forward = async (input: TextToSpeechInput) => {
const instance = guardReady('forward');
try {
setIsGenerating(true);
- return await instance.forward(input.text, input.speed ?? 1.0);
+ return await instance.forward(input.text ?? '', input.speed ?? 1.0);
} finally {
setIsGenerating(false);
}
@@ -131,7 +105,7 @@ export const useTextToSpeech = ({
try {
setIsGenerating(true);
return await instance.forwardFromPhonemes(
- input.phonemes,
+ input.phonemes ?? '',
input.speed ?? 1.0
);
} finally {
@@ -142,28 +116,67 @@ export const useTextToSpeech = ({
const stream = useCallback(
async (input: TextToSpeechStreamingInput) => {
const instance = guardReady('stream');
- await runStream(
- 'stream',
- instance.stream({ text: input.text, speed: input.speed ?? 1.0 }),
- input
- );
+ setIsGenerating(true);
+ try {
+ if (input.text) {
+ instance.streamInsert(input.text);
+ }
+
+ await input.onBegin?.();
+ for await (const audio of instance.stream({
+ speed: input.speed ?? 1.0,
+ stopAutomatically: input.stopAutomatically ?? true,
+ })) {
+ if (input.onNext) {
+ await input.onNext(audio);
+ }
+ }
+ } finally {
+ await input.onEnd?.();
+ setIsGenerating(false);
+ }
},
- [guardReady, runStream]
+ [guardReady]
);
const streamFromPhonemes = useCallback(
async (input: TextToSpeechStreamingPhonemeInput) => {
const instance = guardReady('streamFromPhonemes');
- await runStream(
- 'streamFromPhonemes',
- instance.streamFromPhonemes({
- phonemes: input.phonemes,
+ setIsGenerating(true);
+ try {
+ await input.onBegin?.();
+ for await (const audio of instance.streamFromPhonemes({
+ phonemes: input.phonemes ?? '',
speed: input.speed ?? 1.0,
- }),
- input
- );
+ })) {
+ if (input.onNext) {
+ await input.onNext(audio);
+ }
+ }
+ } finally {
+ await input.onEnd?.();
+ setIsGenerating(false);
+ }
+ },
+ [guardReady]
+ );
+
+ const streamInsert = useCallback(
+ (text: string) => {
+ if (moduleInstance) {
+ moduleInstance.streamInsert(text);
+ }
+ },
+ [moduleInstance]
+ );
+
+ const streamStop = useCallback(
+ (instant: boolean = true) => {
+ if (moduleInstance) {
+ moduleInstance.streamStop(instant);
+ }
},
- [guardReady, runStream]
+ [moduleInstance]
);
return {
@@ -174,7 +187,8 @@ export const useTextToSpeech = ({
forwardFromPhonemes,
stream,
streamFromPhonemes,
- streamStop: () => moduleInstance?.streamStop(),
+ streamInsert,
+ streamStop,
downloadProgress,
};
};
diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts
index a12285057b..fb27dd29ba 100644
--- a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts
+++ b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts
@@ -17,6 +17,7 @@ import { Logger } from '../../common/Logger';
*/
export class TextToSpeechModule {
private nativeModule: any;
+ private isStreaming: boolean = false;
private constructor(nativeModule: unknown) {
this.nativeModule = nativeModule;
@@ -143,17 +144,23 @@ export class TextToSpeechModule {
}
/**
- * Shared streaming implementation. Wraps a native streaming call in an
- * async generator that yields Float32Array audio chunks as they arrive.
+ * Starts a streaming synthesis session. Yields audio chunks as they are generated.
+ *
+ * @param input - Input object containing optional speed and the stopAutomatically flag.
+ * @returns An async generator yielding Float32Array audio chunks.
*/
- private async *streamImpl(
- nativeCall: (cb: (audio: number[]) => void) => Promise
- ): AsyncGenerator {
+ public async *stream({
+ speed,
+ stopAutomatically,
+ }: TextToSpeechStreamingInput): AsyncGenerator {
+ // Stores computed audio segments
const queue: Float32Array[] = [];
let waiter: (() => void) | null = null;
- let finished = false;
let error: unknown;
+ let nativeStreamFinished = false;
+
+ this.isStreaming = true;
const wake = () => {
waiter?.();
@@ -162,46 +169,36 @@ export class TextToSpeechModule {
(async () => {
try {
- await nativeCall((audio: number[]) => {
- queue.push(new Float32Array(audio));
- wake();
- });
- finished = true;
+ await this.nativeModule.stream(
+ speed,
+ stopAutomatically,
+ (audio: number[]) => {
+ queue.push(new Float32Array(audio));
+ wake();
+ }
+ );
+ nativeStreamFinished = true;
wake();
} catch (e) {
error = e;
- finished = true;
+ nativeStreamFinished = true;
wake();
}
})();
- while (true) {
+ while (this.isStreaming) {
if (queue.length > 0) {
yield queue.shift()!;
- if (finished && queue.length === 0) {
+ if (nativeStreamFinished && queue.length === 0) {
return;
}
continue;
}
if (error) throw error;
- if (finished) return;
await new Promise((r) => (waiter = r));
}
}
- /**
- * Starts a streaming synthesis session. Yields audio chunks as they are generated.
- *
- * @param input - Input object containing text and optional speed.
- * @returns An async generator yielding Float32Array audio chunks.
- */
- public async *stream({
- text,
- speed,
- }: TextToSpeechStreamingInput): AsyncGenerator {
- yield* this.streamImpl((cb) => this.nativeModule.stream(text, speed, cb));
- }
-
/**
* Starts a streaming synthesis session from pre-computed phonemes.
* Bypasses the built-in phonemizer, allowing use of external G2P systems.
@@ -213,16 +210,68 @@ export class TextToSpeechModule {
phonemes,
speed,
}: TextToSpeechStreamingPhonemeInput): AsyncGenerator {
- yield* this.streamImpl((cb) =>
- this.nativeModule.streamFromPhonemes(phonemes, speed, cb)
- );
+ const queue: Float32Array[] = [];
+
+ let waiter: (() => void) | null = null;
+ let error: unknown;
+ let nativeStreamFinished = false;
+
+ const wake = () => {
+ waiter?.();
+ waiter = null;
+ };
+
+ (async () => {
+ try {
+ await this.nativeModule.streamFromPhonemes(
+ phonemes,
+ speed,
+ (audio: number[]) => {
+ queue.push(new Float32Array(audio));
+ wake();
+ }
+ );
+ nativeStreamFinished = true;
+ wake();
+ } catch (e) {
+ error = e;
+ nativeStreamFinished = true;
+ wake();
+ }
+ })();
+
+ while (this.isStreaming) {
+ if (queue.length > 0) {
+ yield queue.shift()!;
+ if (nativeStreamFinished && queue.length === 0) {
+ return;
+ }
+ continue;
+ }
+ if (error) throw error;
+ await new Promise((r) => (waiter = r));
+ }
+ }
+
+ /**
+ * Inserts a new text chunk into the buffer to be processed in streaming mode.
+ */
+ public streamInsert(textChunk: string): void {
+ this.nativeModule.streamInsert(textChunk);
}
/**
* Stops the streaming process if there is any ongoing.
+ *
+ * @param instant If true, stops the streaming as soon as possible. Otherwise
+ * allows the module to complete processing of the remainder of the buffer.
*/
- public streamStop(): void {
- this.nativeModule.streamStop();
+ public streamStop(instant: boolean = true): void {
+ this.nativeModule.streamStop(instant);
+
+ if (instant) {
+ this.isStreaming = false;
+ }
}
/**
diff --git a/packages/react-native-executorch/src/types/tts.ts b/packages/react-native-executorch/src/types/tts.ts
index 097f35976a..e515d13b42 100644
--- a/packages/react-native-executorch/src/types/tts.ts
+++ b/packages/react-native-executorch/src/types/tts.ts
@@ -93,7 +93,7 @@ export interface TextToSpeechProps extends TextToSpeechConfig {
* @property {number} [speed] - optional speed argument - the higher it is, the faster the speech becomes
*/
export interface TextToSpeechInput {
- text: string;
+ text?: string;
speed?: number;
}
@@ -179,10 +179,18 @@ export interface TextToSpeechType {
input: TextToSpeechStreamingPhonemeInput
) => Promise;
+ /**
+ * Inserts a new text chunk into the buffer to be processed in streaming mode.
+ */
+ streamInsert: (textChunk: string) => void;
+
/**
* Interrupts and stops the currently active audio generation stream.
+ *
+ * @param instant If true, stops the streaming as soon as possible. Otherwise
+ * allows the module to complete processing of the remainder of the buffer.
*/
- streamStop: () => void;
+ streamStop: (instant?: boolean) => void;
}
/**
@@ -207,10 +215,16 @@ export interface TextToSpeechStreamingCallbacks {
* Actions such as playing the audio should happen within the onNext callback.
* Callbacks can be both synchronous or asynchronous.
*
+ * Enables incrementally extending the input, i.e. adding
+ * new text chunks with streamInsert() while streaming is running.
+ *
* @category Types
+ * @property {boolean} [stopAutomatically] - If true, streaming will stop automatically when the buffer is empty.
*/
export interface TextToSpeechStreamingInput
- extends TextToSpeechInput, TextToSpeechStreamingCallbacks {}
+ extends TextToSpeechInput, TextToSpeechStreamingCallbacks {
+ stopAutomatically?: boolean;
+}
/**
* Streaming input definition for pre-computed phonemes.
diff --git a/packages/react-native-executorch/third-party/android/libs/phonemis/arm64-v8a/libphonemis.a b/packages/react-native-executorch/third-party/android/libs/phonemis/arm64-v8a/libphonemis.a
index ab5d24b339..5a38707580 100644
Binary files a/packages/react-native-executorch/third-party/android/libs/phonemis/arm64-v8a/libphonemis.a and b/packages/react-native-executorch/third-party/android/libs/phonemis/arm64-v8a/libphonemis.a differ
diff --git a/packages/react-native-executorch/third-party/android/libs/phonemis/x86_64/libphonemis.a b/packages/react-native-executorch/third-party/android/libs/phonemis/x86_64/libphonemis.a
index 6f98f2aaca..2306d4647a 100644
Binary files a/packages/react-native-executorch/third-party/android/libs/phonemis/x86_64/libphonemis.a and b/packages/react-native-executorch/third-party/android/libs/phonemis/x86_64/libphonemis.a differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/phonemis/physical-arm64-release/libphonemis.a b/packages/react-native-executorch/third-party/ios/libs/phonemis/physical-arm64-release/libphonemis.a
index 2b79e22edf..78f5169308 100644
Binary files a/packages/react-native-executorch/third-party/ios/libs/phonemis/physical-arm64-release/libphonemis.a and b/packages/react-native-executorch/third-party/ios/libs/phonemis/physical-arm64-release/libphonemis.a differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/phonemis/simulator-arm64-debug/libphonemis.a b/packages/react-native-executorch/third-party/ios/libs/phonemis/simulator-arm64-debug/libphonemis.a
index c8dba0c3d4..ccf1d2fa64 100644
Binary files a/packages/react-native-executorch/third-party/ios/libs/phonemis/simulator-arm64-debug/libphonemis.a and b/packages/react-native-executorch/third-party/ios/libs/phonemis/simulator-arm64-debug/libphonemis.a differ