Skip to content

Commit a1556b9

Browse files
committed
feat!: Add time stamping to speech to text (#742)
<!-- Provide a concise and descriptive summary of the changes implemented in this PR. --> - [x] Yes - [ ] No This PR introduces a breaking change: the return types of the `transcribe` and `stream` methods are now based on the `TranscriptionResult` type. Also, the hook no longer has the committed / nonCommitted properties. `stream` is now an async generator. - [ ] Bug fix (change which fixes an issue) - [x] New feature (change which adds functionality) - [ ] Documentation update (improves or adds clarity to existing documentation) - [ ] Other (chores, tests, code style improvements etc.) - [x] iOS - [x] Android * Run the demo app in `apps/speech` and run transcription in both time-stamping and regular mode (both from a URL and from real-time audio, to test both the `transcribe` and `stream` methods). * Run the voice chat in `apps/llm` to check that the transcription appears. *NOTE* This example seems to be a bit buggy. * You need to run this on an **Android device**, since this PR also fixes the `Speech to Text` demo app when a physical Android device is used. Previously, the required microphone permissions weren't granted and the example effectively didn't work. * Check that the documentation for the modified sections is updated and that the API reference is correct as well. * Run the tests and check that they compile and work as before. <!-- Add screenshots here, if applicable --> <!-- Link related issues here using #issue-number --> - [x] I have performed a self-review of my code - [x] I have commented my code, particularly in hard-to-understand areas - [x] I have updated the documentation accordingly - [x] My changes generate no new warnings <!-- Include any additional information, assumptions, or context that reviewers might need to understand this PR. -->
1 parent ea096e1 commit a1556b9

33 files changed

Lines changed: 1401 additions & 359 deletions

File tree

.cspell-wordlist.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,4 +107,6 @@ sublist
107107
TTFT
108108
pogodin
109109
kesha
110-
antonov
110+
antonov
111+
timestamping
112+
logprob

apps/llm/app/voice_chat/index.tsx

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { useContext, useEffect, useRef, useState } from 'react';
1+
import { useContext, useEffect, useState } from 'react';
22
import {
33
Keyboard,
44
KeyboardAvoidingView,
@@ -35,14 +35,16 @@ export default function VoiceChatScreenWrapper() {
3535

3636
function VoiceChatScreen() {
3737
const [isRecording, setIsRecording] = useState(false);
38+
const [liveTranscription, setLiveTranscription] = useState('');
39+
3840
const [recorder] = useState(
3941
() =>
4042
new AudioRecorder({
4143
sampleRate: 16000,
4244
bufferLengthInSamples: 1600,
4345
})
4446
);
45-
const messageRecorded = useRef<boolean>(false);
47+
4648
const { setGlobalGenerating } = useContext(GeneratingContext);
4749

4850
const llm = useLLM({ model: QWEN3_0_6B_QUANTIZED });
@@ -67,16 +69,32 @@ function VoiceChatScreen() {
6769
if (isRecording) {
6870
setIsRecording(false);
6971
recorder.stop();
70-
messageRecorded.current = true;
7172
speechToText.streamStop();
7273
} else {
7374
setIsRecording(true);
75+
setLiveTranscription('');
76+
7477
recorder.onAudioReady(({ buffer }) => {
7578
speechToText.streamInsert(buffer.getChannelData(0));
7679
});
7780
recorder.start();
78-
const transcription = await speechToText.stream();
79-
await llm.sendMessage(transcription);
81+
82+
let finalResult = '';
83+
84+
try {
85+
for await (const result of speechToText.stream()) {
86+
const text = result.committed.text + result.nonCommitted.text;
87+
setLiveTranscription(text);
88+
finalResult = text;
89+
}
90+
} catch (e) {
91+
console.error('Streaming error:', e);
92+
} finally {
93+
if (finalResult.trim().length > 0) {
94+
await llm.sendMessage(finalResult);
95+
setLiveTranscription('');
96+
}
97+
}
8098
}
8199
};
82100

@@ -96,16 +114,17 @@ function VoiceChatScreen() {
96114
<SWMIcon width={45} height={45} />
97115
<Text style={styles.textModelName}>Qwen 3 x Whisper</Text>
98116
</View>
99-
{llm.messageHistory.length || speechToText.committedTranscription ? (
117+
118+
{llm.messageHistory.length > 0 || liveTranscription.length > 0 ? (
100119
<View style={styles.chatContainer}>
101120
<Messages
102121
chatHistory={
103-
speechToText.isGenerating
122+
isRecording && liveTranscription.length > 0
104123
? [
105124
...llm.messageHistory,
106125
{
107126
role: 'user',
108-
content: speechToText.committedTranscription,
127+
content: liveTranscription,
109128
},
110129
]
111130
: llm.messageHistory
@@ -123,6 +142,7 @@ function VoiceChatScreen() {
123142
</Text>
124143
</View>
125144
)}
145+
126146
<View style={styles.bottomContainer}>
127147
{DeviceInfo.isEmulatorSync() ? (
128148
<View style={styles.emulatorBox}>

apps/speech/app.json

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,18 +17,44 @@
1717
"bundleIdentifier": "com.anonymous.speech",
1818
"infoPlist": {
1919
"NSMicrophoneUsageDescription": "This app needs access to your microphone to record audio."
20+
},
21+
"entitlements": {
22+
"com.apple.developer.kernel.increased-memory-limit": true
2023
}
2124
},
2225
"android": {
2326
"adaptiveIcon": {
2427
"foregroundImage": "./assets/adaptive-icon.png",
2528
"backgroundColor": "#ffffff"
2629
},
27-
"package": "com.anonymous.speech"
30+
"package": "com.anonymous.speech",
31+
"permissions": [
32+
"android.permission.RECORD_AUDIO",
33+
"android.permission.MODIFY_AUDIO_SETTINGS",
34+
"android.permission.FOREGROUND_SERVICE",
35+
"android.permission.FOREGROUND_SERVICE_MEDIA_PLAYBACK"
36+
]
2837
},
2938
"web": {
3039
"favicon": "./assets/favicon.png"
3140
},
32-
"plugins": ["expo-font"]
41+
"plugins": [
42+
"expo-font",
43+
[
44+
"react-native-audio-api",
45+
{
46+
"iosBackgroundMode": true,
47+
"iosMicrophonePermission": "This app requires access to the microphone to record audio.",
48+
"androidPermissions": [
49+
"android.permission.MODIFY_AUDIO_SETTINGS",
50+
"android.permission.FOREGROUND_SERVICE",
51+
"android.permission.FOREGROUND_SERVICE_MEDIA_PLAYBACK",
52+
"android.permission.RECORD_AUDIO"
53+
],
54+
"androidForegroundService": true,
55+
"androidFSTypes": ["mediaPlayback", "microphone"]
56+
}
57+
]
58+
]
3359
}
3460
}
Lines changed: 241 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
import React from 'react';
2+
import { View, Text, StyleSheet } from 'react-native';
3+
import { TranscriptionResult } from 'react-native-executorch';
4+
5+
/** Placeholder rendered for a statistic that is not applicable or not provided. */
const STAT_PLACEHOLDER = 'N/A';

/**
 * Formats one per-segment statistic for display.
 *
 * @param value - Raw statistic read from the segment; may be missing.
 * @param digits - Number of decimal places passed to `toFixed`.
 * @param applicable - Whether the statistic should be shown at all.
 * @returns The fixed-point string, or `'N/A'` when the statistic is not
 * applicable or the segment does not carry a numeric value for it.
 */
const formatStat = (
  value: number | null | undefined,
  digits: number,
  applicable: boolean
): string =>
  applicable && typeof value === 'number'
    ? value.toFixed(digits)
    : STAT_PLACEHOLDER;

/** A single label/value pair in the statistics row of a segment card. */
const StatItem = ({ label, value }: { label: string; value: string }) => (
  <View style={styles.statItem}>
    <Text style={styles.statLabel}>{label}</Text>
    <Text style={styles.statValue}>{value}</Text>
  </View>
);

/**
 * Renders a transcription result in detail: the full text, optional
 * language/duration badges and, when segments are present, one card per
 * segment with its time range, text, word start times and statistics.
 *
 * @param data - Transcription result to display. A falsy value renders nothing.
 * @returns The rendered view, or `null` when `data` is falsy.
 */
export const VerboseTranscription = ({
  data,
}: {
  data: TranscriptionResult;
}) => {
  if (!data) return null;

  const hasSegments = Array.isArray(data.segments) && data.segments.length > 0;

  // 'N/A' and blank strings are treated the same as a missing language.
  const hasLanguage =
    !!data.language && data.language !== 'N/A' && data.language.trim() !== '';

  const hasDuration = typeof data.duration === 'number' && data.duration > 0;

  const hasMetadata = hasLanguage || hasDuration;

  // Per-segment statistics are only displayed for the 'transcribe' task;
  // any other task shows the placeholder instead.
  const statsApplicable = data.task === 'transcribe';

  return (
    <View style={styles.container}>
      <View style={styles.metaContainer}>
        <Text style={styles.label}>Full Text:</Text>
        <Text style={styles.text}>{data.text || ''}</Text>

        {hasMetadata && (
          <View style={styles.row}>
            {hasLanguage && (
              <Text style={styles.metaItem}>Language: {data.language}</Text>
            )}
            {hasDuration && (
              <Text style={styles.metaItem}>
                Duration: {data.duration?.toFixed(2)}s
              </Text>
            )}
          </View>
        )}
      </View>

      {hasSegments && (
        <>
          <Text style={styles.sectionHeader}>
            Segments ({data.segments?.length})
          </Text>

          {data.segments?.map((seg, index) => {
            // A missing value falls back to the placeholder instead of
            // rendering an empty cell.
            const avgLogprob = formatStat(seg.avgLogprob, 4, statsApplicable);
            const temperature = formatStat(seg.temperature, 2, statsApplicable);
            const compression = formatStat(
              seg.compressionRatio,
              2,
              statsApplicable
            );

            return (
              <View key={index} style={styles.segmentCard}>
                <View style={styles.segmentHeader}>
                  <Text style={styles.timeBadge}>
                    {seg.start.toFixed(2)}s - {seg.end.toFixed(2)}s
                  </Text>
                  <Text style={styles.segmentId}>ID: {index}</Text>
                </View>

                <Text style={styles.segmentText}>"{seg.text}"</Text>

                {seg.words && seg.words.length > 0 && (
                  <View style={styles.wordsContainer}>
                    <Text style={styles.statLabel}>Word Timestamps:</Text>
                    <View style={styles.wordsGrid}>
                      {seg.words.map((w, wIdx) => (
                        <View key={wIdx} style={styles.wordChip}>
                          <Text style={styles.wordText}>{w.word.trim()}</Text>
                          <Text style={styles.wordTime}>
                            {w.start.toFixed(2)}s
                          </Text>
                        </View>
                      ))}
                    </View>
                  </View>
                )}

                <View style={styles.statsGrid}>
                  <StatItem label="Avg LogProb" value={avgLogprob} />
                  <StatItem label="Temp" value={temperature} />
                  {/*eslint-disable-next-line @cspell/spellchecker*/}
                  <StatItem label="Compr." value={compression} />
                </View>
              </View>
            );
          })}
        </>
      )}
    </View>
  );
};
108+
109+
// Colours shared by several style entries below.
const BRAND_NAVY = '#0f186e';
const BORDER_GRAY = '#e1e4e8';
const TEXT_PRIMARY = '#333';
const TEXT_MUTED = '#888';
const WHITE = '#fff';

/** Styles for the verbose transcription view, grouped by the area they style. */
const styles = StyleSheet.create({
  // Outer wrapper and the "full text + metadata" panel.
  container: { padding: 4 },
  metaContainer: {
    backgroundColor: '#f0f2f5',
    borderRadius: 8,
    padding: 12,
    marginBottom: 16,
  },
  label: {
    color: BRAND_NAVY,
    fontWeight: 'bold',
    marginBottom: 4,
  },
  text: {
    color: TEXT_PRIMARY,
    fontSize: 16,
    marginBottom: 8,
  },
  row: {
    flexDirection: 'row',
    gap: 10,
    marginTop: 8,
  },
  metaItem: {
    color: '#666',
    fontSize: 12,
    backgroundColor: BORDER_GRAY,
    borderRadius: 4,
    overflow: 'hidden',
    paddingHorizontal: 8,
    paddingVertical: 2,
  },

  // Segment list: section title and per-segment card.
  sectionHeader: {
    color: BRAND_NAVY,
    fontSize: 18,
    fontWeight: 'bold',
    marginTop: 8,
    marginBottom: 8,
  },
  segmentCard: {
    backgroundColor: WHITE,
    borderColor: BORDER_GRAY,
    borderRadius: 8,
    borderWidth: 1,
    padding: 12,
    marginBottom: 12,
    shadowColor: '#000',
    shadowOffset: { width: 0, height: 1 },
    shadowOpacity: 0.1,
    shadowRadius: 2,
    elevation: 2,
  },
  segmentHeader: {
    flexDirection: 'row',
    justifyContent: 'space-between',
    marginBottom: 8,
  },
  timeBadge: {
    color: WHITE,
    fontSize: 12,
    fontWeight: 'bold',
    backgroundColor: BRAND_NAVY,
    borderRadius: 12,
    overflow: 'hidden',
    paddingHorizontal: 8,
    paddingVertical: 2,
  },
  segmentId: {
    color: TEXT_MUTED,
    fontSize: 12,
  },
  segmentText: {
    color: TEXT_PRIMARY,
    fontSize: 15,
    fontStyle: 'italic',
    marginBottom: 12,
  },

  // Statistics row at the bottom of a segment card.
  statsGrid: {
    flexDirection: 'row',
    flexWrap: 'wrap',
    gap: 8,
    paddingTop: 8,
    borderTopColor: '#f0f0f0',
    borderTopWidth: 1,
  },
  statItem: {
    flex: 1,
    flexDirection: 'row',
    justifyContent: 'space-between',
    minWidth: '45%',
  },
  statLabel: {
    color: TEXT_MUTED,
    fontSize: 11,
  },
  statValue: {
    color: '#444',
    fontSize: 11,
    fontWeight: '600',
  },

  // Word-level timestamp chips.
  wordsContainer: {
    backgroundColor: '#f8f9fa',
    borderRadius: 6,
    padding: 8,
    marginVertical: 8,
  },
  wordsGrid: {
    flexDirection: 'row',
    flexWrap: 'wrap',
    gap: 6,
    marginTop: 4,
  },
  wordChip: {
    alignItems: 'center',
    backgroundColor: '#ffffff',
    borderColor: BORDER_GRAY,
    borderRadius: 4,
    borderWidth: 1,
    paddingHorizontal: 6,
    paddingVertical: 2,
  },
  wordText: {
    color: TEXT_PRIMARY,
    fontSize: 12,
  },
  wordTime: {
    color: TEXT_MUTED,
    fontSize: 9,
    marginTop: 1,
  },
});

apps/speech/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
"metro-config": "^0.81.5",
2121
"react": "19.1.0",
2222
"react-native": "0.81.5",
23-
"react-native-audio-api": "0.6.5",
23+
"react-native-audio-api": "0.11.3",
2424
"react-native-device-info": "^14.0.4",
2525
"react-native-executorch": "workspace:*",
2626
"react-native-reanimated": "~4.1.1",

0 commit comments

Comments
 (0)