@@ -31,8 +31,6 @@ It is recommended to use models provided by us, which are available at our [Hugg
3131
3232## Reference
3333
34- ### File transcription
35-
3634You can obtain a waveform from audio in any way most suitable to you; in the snippet below we utilize the `react-native-audio-api` library to process an mp3 file.
3735
3836``` typescript
@@ -65,72 +63,6 @@ if (error) {
6563}
6664```
6765
68- ### Live data (microphone) transcription
69-
70- ``` typescript
71- import { STREAMING_ACTION , useSpeechToText } from ' react-native-executorch' ;
72- import LiveAudioStream from ' react-native-live-audio-stream' ;
73- import { useState } from ' react' ;
74- import { Buffer } from ' buffer' ;
75-
76- const audioStreamOptions = {
77- sampleRate: 16000 ,
78- channels: 1 ,
79- bitsPerSample: 16 ,
80- audioSource: 1 ,
81- bufferSize: 16000 ,
82- };
83-
84- const startStreamingAudio = (options : any , onChunk : (data : string ) => void ) => {
85- LiveAudioStream .init (options );
86- LiveAudioStream .on (' data' , onChunk );
87- LiveAudioStream .start ();
88- };
89-
90- const float32ArrayFromPCMBinaryBuffer = (b64EncodedBuffer : string ) => {
91- const b64DecodedChunk = Buffer .from (b64EncodedBuffer , ' base64' );
92- const int16Array = new Int16Array (b64DecodedChunk .buffer );
93-
94- const float32Array = new Float32Array (int16Array .length );
95- for (let i = 0 ; i < int16Array .length ; i ++ ) {
96- float32Array [i ] = Math .max (
97- - 1 ,
98- Math .min (1 , (int16Array [i ] / audioStreamOptions .bufferSize ) * 8 )
99- );
100- }
101- return float32Array ;
102- };
103-
104- const [isRecording, setIsRecording] = useState (false );
105-
106- const speechToText = useSpeechToText ({
107- modelName: ' moonshine' ,
108- windowSize: 3 ,
109- overlapSeconds: 1.2 ,
110- });
111-
112- const onChunk = (data : string ) => {
113- const float32Chunk = float32ArrayFromPCMBinaryBuffer (data );
114- speechToText .streamingTranscribe (
115- STREAMING_ACTION .DATA ,
116- Array .from (float32Chunk )
117- );
118- };
119-
120- const handleRecordPress = async () => {
121- if (isRecording ) {
122- setIsRecording (false );
123- LiveAudioStream .stop ();
124- messageRecorded .current = true ;
125- await speechToText .streamingTranscribe (STREAMING_ACTION .STOP );
126- } else {
127- setIsRecording (true );
128- startStreamingAudio (audioStreamOptions , onChunk );
129- await speechToText .streamingTranscribe (STREAMING_ACTION .START );
130- }
131- };
132- ```
133-
13466### Streaming
13567
13668Given that STT models can process audio no longer than 30 seconds, there is a need to chunk the input audio. Chunking audio may result in cutting speech mid-sentence, which might be hard to understand for the model. To make it work, we employed an algorithm (adapted for mobile devices from [whisper-streaming](https://aclanthology.org/2023.ijcnlp-demo.3.pdf)) that uses overlapping audio chunks. This might introduce some overhead, but allows for processing audio inputs of arbitrary length.
@@ -302,11 +234,97 @@ function App() {
302234 title = " Transcribe"
303235 />
304236 <Text>{error ? error : sequence }</Text>
305- </View>****
237+ </View>
306238 );
307239}
308240` ` `
309241
242+ ### Live data (microphone) transcription
243+
244+ ```typescript
245+ import { STREAMING_ACTION, useSpeechToText } from 'react-native-executorch';
246+ import LiveAudioStream from 'react-native-live-audio-stream';
247+ import { useState } from 'react';
248+ import { Buffer } from 'buffer';
249+
250+ const audioStreamOptions = {
251+   sampleRate: 16000,
252+   channels: 1,
253+   bitsPerSample: 16,
254+   audioSource: 1,
255+   bufferSize: 16000,
256+ };
257+
258+ const startStreamingAudio = (options: any, onChunk: (data: string) => void) => {
259+   LiveAudioStream.init(options);
260+   LiveAudioStream.on('data', onChunk);
261+   LiveAudioStream.start();
262+ };
263+
264+ const float32ArrayFromPCMBinaryBuffer = (b64EncodedBuffer: string) => {
265+   const b64DecodedChunk = Buffer.from(b64EncodedBuffer, 'base64');
266+   const int16Array = new Int16Array(b64DecodedChunk.buffer);
267+
268+   const float32Array = new Float32Array(int16Array.length);
269+   for (let i = 0; i < int16Array.length; i++) {
270+     float32Array[i] = Math.max(
271+       -1,
272+       Math.min(1, (int16Array[i] / audioStreamOptions.bufferSize) * 8)
273+     );
274+   }
275+   return float32Array;
276+ };
277+
278+ function App() {
279+   const [isRecording, setIsRecording] = useState(false);
280+   const speechToText = useSpeechToText({
281+     modelName: 'moonshine',
282+     windowSize: 3,
283+     overlapSeconds: 1.2,
284+   });
285+
286+   const onChunk = (data: string) => {
287+     const float32Chunk = float32ArrayFromPCMBinaryBuffer(data);
288+     speechToText.streamingTranscribe(
289+       STREAMING_ACTION.DATA,
290+       Array.from(float32Chunk)
291+     );
292+   };
293+
294+   const handleRecordPress = async () => {
295+     if (isRecording) {
296+       setIsRecording(false);
297+       LiveAudioStream.stop();
298+       messageRecorded.current = true;
299+       await speechToText.streamingTranscribe(STREAMING_ACTION.STOP);
300+     } else {
301+       setIsRecording(true);
302+       startStreamingAudio(audioStreamOptions, onChunk);
303+       await speechToText.streamingTranscribe(STREAMING_ACTION.START);
304+     }
305+   };
306+
307+   return (
308+     <View>
309+       <Text>{speechToText.sequence}</Text>
310+       <TouchableOpacity
311+         style={
312+           !isRecording ? styles.recordTouchable : styles.recordingInfo
313+         }
314+         onPress={handleRecordPress}
315+       >
316+         {isRecording ? (
317+           <StopIcon height={40} width={40} padding={4} margin={8} />
318+         ) : (
319+           <MicIcon height={40} width={40} padding={4} margin={8} />
320+         )}
321+       </TouchableOpacity>
322+     </View>
323+   );
324+ }
325+
326+ ```
327+
310328## Supported models
311329
312330| Model | Language |
0 commit comments