Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion apps/speech/App.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import { SpeechToTextScreen } from './screens/SpeechToTextScreen';
import ColorPalette from './colors';
import ExecutorchLogo from './assets/executorch.svg';
import { Quiz } from './screens/Quiz';
import { TextToSpeechLLMScreen } from './screens/TextToSpeechLLMScreen';
import { initExecutorch } from 'react-native-executorch';
import { ExpoResourceFetcher } from '@react-native-executorch/expo-resource-fetcher';

Expand All @@ -14,7 +15,7 @@ initExecutorch({

export default function App() {
const [currentScreen, setCurrentScreen] = useState<
'menu' | 'speech-to-text' | 'text-to-speech' | 'quiz'
'menu' | 'speech-to-text' | 'text-to-speech' | 'quiz' | 'text-to-speech-llm'
>('menu');

const goToMenu = () => setCurrentScreen('menu');
Expand All @@ -31,6 +32,10 @@ export default function App() {
return <Quiz onBack={goToMenu} />;
}

if (currentScreen === 'text-to-speech-llm') {
return <TextToSpeechLLMScreen onBack={goToMenu} />;
}

return (
<View style={styles.container}>
<ExecutorchLogo width={64} height={64} />
Expand All @@ -54,6 +59,12 @@ export default function App() {
>
<Text style={styles.buttonText}>Text to Speech - Quiz</Text>
</TouchableOpacity>
<TouchableOpacity
style={styles.button}
onPress={() => setCurrentScreen('text-to-speech-llm')}
>
<Text style={styles.buttonText}>Text to Speech - LLM Streaming</Text>
</TouchableOpacity>
</View>
</View>
);
Expand Down
323 changes: 323 additions & 0 deletions apps/speech/screens/TextToSpeechLLMScreen.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,323 @@
import React, { useEffect, useState, useRef } from 'react';
import {
View,
Text,
StyleSheet,
TouchableOpacity,
ScrollView,
} from 'react-native';
import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context';
import FontAwesome from '@expo/vector-icons/FontAwesome';
import SWMIcon from '../assets/swm_icon.svg';
import {
useLLM,
useTextToSpeech,
KOKORO_MEDIUM,
KOKORO_VOICE_AF_HEART,
LLAMA3_2_1B_QLORA,
} from 'react-native-executorch';
import {
AudioManager,
AudioContext,
AudioBuffer,
AudioBufferSourceNode,
} from 'react-native-audio-api';

interface TextToSpeechLLMProps {
onBack: () => void;
}

/**
* Converts an audio vector (Float32Array) to an AudioBuffer for playback
* @param audioVector - The generated audio samples from the model
* @param sampleRate - The sample rate (default: 24000 Hz for Kokoro)
* @returns AudioBuffer ready for playback
*/
/**
 * Wraps raw audio samples in a single-channel AudioBuffer so they can be
 * played through the given AudioContext.
 *
 * @param audioVector - Generated audio samples from the TTS model
 * @param audioContext - Context used to allocate the buffer
 * @param sampleRate - Sample rate of the samples (default 24000 Hz, Kokoro's output rate)
 * @returns A mono AudioBuffer containing the samples, ready for playback
 */
const createAudioBufferFromVector = (
  audioVector: Float32Array,
  audioContext: AudioContext,
  sampleRate: number = 24000
): AudioBuffer => {
  const buffer = audioContext.createBuffer(1, audioVector.length, sampleRate);
  buffer.getChannelData(0).set(audioVector);
  return buffer;
};

export const TextToSpeechLLMScreen = ({ onBack }: TextToSpeechLLMProps) => {
  // Text currently rendered in the scroll view; mirrors llm.response.
  const [displayText, setDisplayText] = useState('');
  // True from the moment streaming starts until tts.stream() settles.
  const [isTtsStreaming, setIsTtsStreaming] = useState(false);
  const llm = useLLM({ model: LLAMA3_2_1B_QLORA });
  const tts = useTextToSpeech({
    model: KOKORO_MEDIUM,
    voice: KOKORO_VOICE_AF_HEART,
  });

  // Count of llm.response characters already forwarded to the TTS stream.
  const processedLengthRef = useRef(0);
  const audioContextRef = useRef<AudioContext | null>(null);
  // Last playback node started; kept so handleStop() can halt audio immediately.
  const sourceRef = useRef<AudioBufferSourceNode | null>(null);

  useEffect(() => {
    AudioManager.setAudioSessionOptions({
      iosCategory: 'playAndRecord',
      iosMode: 'spokenAudio',
      iosOptions: ['defaultToSpeaker'],
    });

    // Kokoro outputs 24 kHz audio. Keep the context suspended until playback
    // actually starts so the audio session is not held open while idle.
    audioContextRef.current = new AudioContext({ sampleRate: 24000 });
    void audioContextRef.current.suspend();

    return () => {
      // Fire-and-forget: nothing to await during unmount cleanup.
      void audioContextRef.current?.close();
      audioContextRef.current = null;
    };
  }, []);

  // Update displayText gradually as response gets generated and insert new text chunks into TTS stream
  useEffect(() => {
    if (llm.response && tts.isReady) {
      setDisplayText(llm.response);

      // Only feed the TTS stream the characters it has not seen yet.
      const previousLength = processedLengthRef.current;
      if (llm.response.length > previousLength && isTtsStreaming) {
        const newChunk = llm.response.slice(previousLength);
        tts.streamInsert(newChunk);
        processedLengthRef.current = llm.response.length;
      }
    } else {
      // Response was cleared (e.g. a new generation started) — reset tracking.
      processedLengthRef.current = 0;
    }
  }, [llm.response, tts, isTtsStreaming]);

  /**
   * Starts LLM generation and, concurrently, a TTS stream that speaks the
   * response as it is produced. Resolves once both generation and playback
   * have finished.
   */
  const handleGenerate = async () => {
    setDisplayText('');
    processedLengthRef.current = 0;
    setIsTtsStreaming(true);

    // Consumes the TTS stream: each synthesized chunk is wrapped in an
    // AudioBuffer and played to completion before the next one is requested.
    const startTTS = async () => {
      try {
        const audioContext = audioContextRef.current;
        if (!audioContext) return;

        if (audioContext.state === 'suspended') {
          await audioContext.resume();
        }

        // Resolve only when the chunk finishes playing so consecutive
        // chunks are played back-to-back without overlapping.
        const onNext = async (audioVec: Float32Array) => {
          return new Promise<void>((resolve) => {
            const audioBuffer = createAudioBufferFromVector(
              audioVec,
              audioContext,
              24000
            );

            const source = (sourceRef.current =
              audioContext.createBufferSource());
            source.buffer = audioBuffer;
            source.connect(audioContext.destination);

            source.onEnded = () => resolve();

            source.start();
          });
        };

        // stopAutomatically: false keeps the stream alive while the LLM is
        // still inserting text; we stop it explicitly below once generation ends.
        await tts.stream({
          speed: 0.9,
          stopAutomatically: false,
          onNext,
        });
      } catch (e) {
        console.error('TTS streaming error:', e);
      } finally {
        setIsTtsStreaming(false);
      }
    };

    // Run TTS playback concurrently with LLM generation.
    const ttsPromise = startTTS();

    try {
      await llm.sendMessage(
        'Generate a short story about a robot learning to paint. The story should be around 200 words long.'
      );
    } catch (e) {
      console.error('Generation failed:', e);
    } finally {
      // Generation is done: let TTS finish speaking the buffered text
      // (instant = false), then wait for playback to drain.
      tts.streamStop(false);
      await ttsPromise;

      if (
        audioContextRef.current &&
        audioContextRef.current.state === 'running'
      ) {
        await audioContextRef.current.suspend();
      }
    }
  };

  // Aborts LLM generation and halts TTS playback immediately.
  const handleStop = () => {
    llm.interrupt();
    tts.streamStop(true);
    if (sourceRef.current) {
      try {
        sourceRef.current.stop();
      } catch (e) {
        // Source might have already stopped or disconnected
      }
    }
  };

  const isProcessing = llm.isGenerating || isTtsStreaming;
  const isModelsReady = llm.isReady && tts.isReady;

  // Human-readable status line covering errors, download progress and activity.
  const getModelStatus = () => {
    if (llm.error) return `LLM Error: ${llm.error.message}`;
    if (tts.error) return `TTS Error: ${tts.error.message}`;
    if (!llm.isReady)
      return `Loading LLM: ${(100 * llm.downloadProgress).toFixed(2)}%`;
    if (!tts.isReady)
      return `Loading TTS: ${(100 * tts.downloadProgress).toFixed(2)}%`;
    if (isProcessing) return 'Generating/Streaming...';
    return 'Ready';
  };

  return (
    <SafeAreaProvider>
      <SafeAreaView style={styles.container}>
        <View style={styles.header}>
          <TouchableOpacity style={styles.backButton} onPress={onBack}>
            <FontAwesome name="chevron-left" size={20} color="#0f186e" />
          </TouchableOpacity>
          <SWMIcon width={60} height={60} />
          <Text style={styles.headerText}>React Native ExecuTorch</Text>
          <Text style={styles.headerText}>LLM to Speech Demo</Text>
        </View>

        <View style={styles.statusContainer}>
          <Text>Status: {getModelStatus()}</Text>
        </View>

        <View style={styles.contentContainer}>
          <Text style={styles.label}>Generated Story</Text>
          <View style={styles.responseContainer}>
            <ScrollView contentContainerStyle={styles.responseContent}>
              <Text style={styles.responseText}>
                {displayText ||
                  (isModelsReady
                    ? 'Press the button to generate a story and hear it spoken aloud.'
                    : 'Please wait for models to load...')}
              </Text>
            </ScrollView>
          </View>
        </View>

        <View style={styles.buttonContainer}>
          {isProcessing ? (
            <TouchableOpacity
              style={[styles.actionButton, styles.stopButton]}
              onPress={handleStop}
            >
              <FontAwesome name="stop" size={20} color="white" />
              <Text style={styles.buttonText}>Stop Generation</Text>
            </TouchableOpacity>
          ) : (
            <TouchableOpacity
              disabled={!isModelsReady}
              onPress={handleGenerate}
              style={[styles.actionButton, !isModelsReady && styles.disabled]}
            >
              <FontAwesome name="magic" size={20} color="white" />
              <Text style={styles.buttonText}>Generate & Stream Speech</Text>
            </TouchableOpacity>
          )}
        </View>
      </SafeAreaView>
    </SafeAreaProvider>
  );
};

// Screen-local styles. The navy (#0f186e) / red (#ff4444) palette matches the
// other demo screens in this app.
const styles = StyleSheet.create({
  container: {
    flex: 1,
    alignItems: 'center',
    backgroundColor: 'white',
    paddingHorizontal: 16,
  },
  // Relative positioning so backButton can be absolutely placed at the left edge.
  header: {
    alignItems: 'center',
    position: 'relative',
    width: '100%',
  },
  backButton: {
    position: 'absolute',
    left: 0,
    top: 10,
    padding: 10,
    zIndex: 1, // keep the tap target above the header contents
  },
  headerText: {
    fontSize: 22,
    fontWeight: 'bold',
    color: '#0f186e',
  },
  statusContainer: {
    marginTop: 12,
    alignItems: 'center',
  },
  // Flexible middle region that holds the scrollable response box.
  contentContainer: {
    width: '100%',
    marginTop: 24,
    flex: 1,
    marginBottom: 24,
  },
  label: {
    marginLeft: 12,
    marginBottom: 4,
    color: '#0f186e',
    fontWeight: '600',
  },
  // Bordered box the ScrollView fills; flex: 1 makes it take remaining height.
  responseContainer: {
    borderRadius: 12,
    borderWidth: 1,
    borderColor: '#0f186e',
    flex: 1,
  },
  responseContent: {
    padding: 12,
  },
  responseText: {
    fontSize: 16,
    color: '#333',
    lineHeight: 24,
  },
  buttonContainer: {
    marginBottom: 24,
    width: '100%',
  },
  // Base style for both the generate and stop buttons (icon + label row).
  actionButton: {
    backgroundColor: '#0f186e',
    flexDirection: 'row',
    justifyContent: 'center',
    alignItems: 'center',
    padding: 12,
    borderRadius: 12,
    gap: 8,
  },
  // Overrides actionButton's background while processing.
  stopButton: {
    backgroundColor: '#ff4444',
  },
  buttonText: {
    color: 'white',
    fontWeight: '600',
    letterSpacing: -0.5,
    fontSize: 16,
  },
  // Applied to the generate button while models are still loading.
  disabled: {
    opacity: 0.5,
  },
});
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,8 @@ The module provides two ways to generate speech using either raw text or pre-gen
### Using Text

1. [**`forward({ text, speed })`**](../../06-api-reference/interfaces/TextToSpeechType.md#forward): Generates the complete audio waveform at once. Returns a promise resolving to a `Float32Array`.
2. [**`stream({ text, speed, onNext, ... })`**](../../06-api-reference/interfaces/TextToSpeechType.md#stream): An async generator that yields chunks of audio as they are computed. This is ideal for reducing the "time to first audio" for long sentences.
2. [**`stream({ speed, stopAutomatically, onNext, ... })`**](../../06-api-reference/interfaces/TextToSpeechType.md#stream): An async, callback-driven API (via `onNext`) that delivers chunks of audio as they are computed.
This is ideal for reducing the "time to first audio" for long sentences. You can also insert text dynamically during generation using `streamInsert(text)` and stop generation with `streamStop(instant)`.

### Using Phonemes

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,14 @@ The module provides two ways to generate speech using either raw text or pre-gen
### Using Text

1. [**`forward(text, speed)`**](../../06-api-reference/classes/TextToSpeechModule.md#forward): Generates the complete audio waveform at once. Returns a promise resolving to a `Float32Array`.
2. [**`stream({ text, speed })`**](../../06-api-reference/classes/TextToSpeechModule.md#stream): An async generator that yields chunks of audio as they are computed. This is ideal for reducing the "time to first audio" for long sentences.
2. [**`stream({ speed, stopAutomatically, onNext, ... })`**](../../06-api-reference/classes/TextToSpeechModule.md#stream): An async generator that yields chunks of audio as they are computed. This is ideal for reducing the "time to first audio" for long sentences. In contrast to `forward`, it enables inserting text chunks dynamically into the processing buffer with [**`streamInsert(text)`**](../../06-api-reference/classes/TextToSpeechModule.md#streaminsert) and allows stopping generation early with [**`streamStop(instant)`**](../../06-api-reference/classes/TextToSpeechModule.md#streamstop).

### Using Phonemes

If you have pre-computed phonemes (e.g., from an external dictionary or a custom G2P model), you can skip the internal phoneme generation step:

1. [**`forwardFromPhonemes(phonemes, speed)`**](../../06-api-reference/classes/TextToSpeechModule.md#forwardfromphonemes): Generates the complete audio waveform from a phoneme string.
2. [**`streamFromPhonemes({ phonemes, speed })`**](../../06-api-reference/classes/TextToSpeechModule.md#streamfromphonemes): Streams audio chunks generated from a phoneme string.
2. [**`streamFromPhonemes({ phonemes, speed, onNext, ... })`**](../../06-api-reference/classes/TextToSpeechModule.md#streamfromphonemes): Streams audio chunks generated from a phoneme string.

:::note
Since `forward` and `forwardFromPhonemes` process the entire input at once, they might take a significant amount of time to produce audio for long inputs.
Expand Down
Loading
Loading