Skip to content

Commit 89bdd17

Browse files
committed
Apply review suggestions
1 parent f6e774c commit 89bdd17

8 files changed

Lines changed: 58 additions & 58 deletions

File tree

apps/speech/screens/SpeechToTextScreen.tsx

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,10 @@ import {
1717
useSpeechToText,
1818
TranscriptionResult,
1919
SpeechToTextProps,
20-
FSMN_VAD,
2120
} from 'react-native-executorch';
2221
import { ModelPicker, ModelOption } from '../components/ModelPicker';
2322
const speechToText = models.speech_to_text;
23+
const vad = models.vad;
2424

2525
type STTModelSources = SpeechToTextProps['model'];
2626

@@ -44,11 +44,6 @@ import ErrorBanner from '../components/ErrorBanner';
4444

4545
const isSimulator = DeviceInfo.isEmulatorSync();
4646

47-
const DEFAULT_MODEL =
48-
Platform.OS === 'ios' && !isSimulator
49-
? WHISPER_BASE_EN_COREML
50-
: WHISPER_TINY_EN;
51-
5247
export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
5348
const [selectedModel, setSelectedModel] = useState<STTModelSources>(
5449
Platform.OS === 'ios'
@@ -58,7 +53,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
5853

5954
const model = useSpeechToText({
6055
model: selectedModel,
61-
vad: FSMN_VAD,
56+
vad: vad.fsmn_vad()
6257
});
6358

6459
const [transcription, setTranscription] =
@@ -408,15 +403,19 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
408403
<Text
409404
style={[
410405
styles.vadButtonLabel,
411-
{ color: useVAD ? 'white' : '#64748b' },
406+
useVAD
407+
? styles.vadButtonLabelActive
408+
: styles.vadButtonLabelInactive,
412409
]}
413410
>
414411
VAD
415412
</Text>
416413
<Text
417414
style={[
418415
styles.vadButtonState,
419-
{ color: useVAD ? '#bbf7d0' : '#94a3b8' },
416+
useVAD
417+
? styles.vadButtonStateActive
418+
: styles.vadButtonStateInactive,
420419
]}
421420
>
422421
{useVAD ? 'ON' : 'OFF'}
@@ -579,11 +578,23 @@ const styles = StyleSheet.create({
579578
fontSize: 13,
580579
letterSpacing: 0.5,
581580
},
581+
vadButtonLabelActive: {
582+
color: 'white',
583+
},
584+
vadButtonLabelInactive: {
585+
color: '#64748b',
586+
},
582587
vadButtonState: {
583588
fontWeight: '700',
584589
fontSize: 10,
585590
letterSpacing: 1,
586591
},
592+
vadButtonStateActive: {
593+
color: '#bbf7d0',
594+
},
595+
vadButtonStateInactive: {
596+
color: '#94a3b8',
597+
},
587598
disabled: {
588599
opacity: 0.5,
589600
},

apps/speech/screens/VoiceActivityDetectionScreen.tsx

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@ import {
88
Platform,
99
} from 'react-native';
1010
import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context';
11-
import { useVAD, FSMN_VAD } from 'react-native-executorch';
11+
import {
12+
models,
13+
useVAD
14+
} from 'react-native-executorch';
1215
import FontAwesome from '@expo/vector-icons/FontAwesome';
1316
import { AudioManager, AudioRecorder } from 'react-native-audio-api';
1417
import SWMIcon from '../assets/swm_icon.svg';
@@ -23,7 +26,7 @@ export const VoiceActivityDetectionScreen = ({
2326
onBack: () => void;
2427
}) => {
2528
const model = useVAD({
26-
model: FSMN_VAD,
29+
model: models.vad.fsmn_vad(),
2730
});
2831

2932
const [isSpeaking, setIsSpeaking] = useState(false);

docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md

Lines changed: 18 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ import { AudioContext } from 'react-native-audio-api';
4949
import * as FileSystem from 'expo-file-system';
5050

5151
const model = useSpeechToText({
52-
model: models.speech_to_text.whisper_tiny_en(),
52+
model: models.speech_to_text.whisper_tiny_en(), // Use whisper_tiny_en for English or whisper_tiny for multilingual support
5353
});
5454

5555
// 1. Get audio file
@@ -89,8 +89,13 @@ The `stream()` function accepts several optional parameters:
8989

9090
- `language`: The language code (e.g., `'es'`, `'fr'`). Required for multilingual models.
9191
- `verbose`: If `true`, includes word-level timestamps and segment metadata in the result objects.
92+
- `useVAD`: Enable the Voice Activity Detection submodule (if configured in `useSpeechToText` props) to optimize performance by filtering silence. Defaults to `false`.
9293
- `timeout`: (Advanced) The interval (in milliseconds) between processing consecutive audio chunks in streaming mode. Lower values provide more frequent updates and lower latency, while higher values reduce CPU consumption. Defaults to `100`.
93-
- `useVAD`: Enable the Voice Activity Detection submodule (if configured in `useSpeechToText` props) to optimize performance by filtering silence.
94+
- `vadDetectionMargin`: (Advanced) The duration of silence (in milliseconds) required after speech is detected before "committing" a segment. Defaults to `500`. Only active when the VAD module is used.
95+
96+
### Voice Activity Detection (VAD)
97+
98+
Integrating a VAD submodule is highly recommended for streaming. It improves performance by automatically removing silence, which reduces CPU usage, saves battery, and prevents the model from "hallucinating" text during silent periods.
9499

95100
### Example
96101

@@ -103,6 +108,7 @@ import { AudioManager, AudioRecorder } from 'react-native-audio-api';
103108
export default function LiveTranscriber() {
104109
const model = useSpeechToText({
105110
model: models.speech_to_text.whisper_tiny_en(),
111+
vad: models.vad.fsmn_vad(),
106112
});
107113
const [text, setText] = useState('');
108114
const isRecordingRef = useRef(false);
@@ -112,18 +118,22 @@ export default function LiveTranscriber() {
112118
isRecordingRef.current = true;
113119
setText('');
114120

115-
// 1. Capture microphone input
121+
// 2. Capture microphone input
116122
recorder.onAudioReady(
117123
{ sampleRate: 16000, bufferLength: 1600, channelCount: 1 },
118124
(chunk) => model.streamInsert(chunk.buffer.getChannelData(0))
119125
);
120126

121127
await recorder.start();
122128

123-
// 2. Process the stream
129+
// 3. Process the stream with VAD enabled
124130
try {
125131
let finalizedText = '';
126-
const streamIter = model.stream({ verbose: false });
132+
const streamIter = model.stream({
133+
verbose: false,
134+
useVAD: true, // Enable VAD filter
135+
vadDetectionMargin: 500, // Wait for 500ms of silence before committing
136+
});
127137

128138
for await (const { committed, nonCommitted } of streamIter) {
129139
if (!isRecordingRef.current) break;
@@ -158,39 +168,15 @@ export default function LiveTranscriber() {
158168

159169
## Advanced Features
160170

161-
### VAD Integration (Recommended for Live)
162-
163-
Integrating **Voice Activity Detection (VAD)** as a submodule improves streaming performance by automatically removing silence. This reduces CPU usage, saves battery, and prevents hallucinations during silent periods.
164-
165-
To use it, provide the `vad` model in the hook props and enable `useVAD` in the stream options:
166-
167-
```typescript
168-
import {
169-
useSpeechToText,
170-
WHISPER_TINY_EN,
171-
FSMN_VAD,
172-
} from 'react-native-executorch';
173-
174-
const model = useSpeechToText({
175-
model: WHISPER_TINY_EN,
176-
vad: FSMN_VAD, // Integrating VAD submodule
177-
});
178-
179-
const startLiveStreaming = async () => {
180-
const streamIter = model.stream({
181-
useVAD: true, // Enable VAD logic in the stream context
182-
vadDetectionMargin: 500, // Wait for 500ms of silence before committing (for stability)
183-
});
184-
};
185-
```
186-
187171
### Multilingual Transcription
188172

189173
To transcribe languages other than English, use a multilingual model (e.g., `models.speech_to_text.whisper_tiny()`) and specify the corresponding language code:
190174

191175
```typescript
192176
// Transcribe in Spanish
193-
const model = useSpeechToText({ model: WHISPER_TINY });
177+
const model = useSpeechToText({
178+
model: models.speech_to_text.whisper_tiny(),
179+
});
194180
const result = await model.transcribe(spanishAudio, { language: 'es' });
195181
```
196182

docs/docs/03-hooks/01-natural-language-processing/useVAD.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@ It is recommended to use models provided by us, which are available at our [Hugg
1818
This mode is best suited for processing pre-recorded audio files or existing buffers. You provide a full waveform to the `forward` method, which returns an array of detected speech segments.
1919

2020
```typescript
21-
import { useVAD, FSMN_VAD } from 'react-native-executorch';
21+
import { useVAD, models } from 'react-native-executorch';
2222

23-
const model = useVAD({ model: FSMN_VAD });
23+
const model = useVAD({ model: models.vad.fsmn_vad() });
2424

2525
// ... obtain audioBuffer (Float32Array) at 16kHz ...
2626

@@ -55,9 +55,9 @@ You can fine-tune the streaming behavior via the `options` object:
5555
- **`detectionMargin`** (default: `100`ms): Specifies the maximum allowed gap between the last detected speech segment and the current time to still consider the speech as "ongoing." This value determines how much silence is tolerated before `onSpeechEnd` is triggered.
5656

5757
```tsx
58-
import { useVAD, FSMN_VAD } from 'react-native-executorch';
58+
import { useVAD, models } from 'react-native-executorch';
5959

60-
const model = useVAD({ model: FSMN_VAD });
60+
const model = useVAD({ model: models.vad.fsmn_vad() });
6161

6262
const startLiveVAD = async () => {
6363
// Start the continuous streaming listener

docs/docs/04-typescript-api/01-natural-language-processing/VADModule.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ For more information on loading resources, take a look at the [loading models](.
4040

4141
## Running the model
4242

43-
### Batch Processing
43+
### File Processing
4444

4545
To process a full audio buffer at once, use the [`forward`](../../06-api-reference/classes/VADModule.md#forward) method. Before calling [`forward`](../../06-api-reference/classes/VADModule.md#forward), ensure you have the audio waveform sampled at 16 kHz. Pass the waveform as an argument; the method returns a promise that resolves to an array of detected speech segments.
4646

packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/Constants.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,14 @@
66
namespace rnexecutorch::models::voice_activity_detection::constants {
77

88
inline constexpr uint32_t kSampleRate = 16000;
9-
inline constexpr uint32_t kSampleRateMiliseconds = 16;
10-
inline constexpr auto kMstoSecond = 0.001f;
9+
inline constexpr uint32_t kSamplesPerMs = kSampleRate / 1000;
10+
inline constexpr auto kMsToSeconds = 0.001f;
1111
inline constexpr uint32_t kWindowSizeMs = 25;
1212
inline constexpr uint32_t kHopLengthMs = 10;
1313
inline constexpr auto kWindowSize =
14-
static_cast<uint32_t>(kMstoSecond * kWindowSizeMs * kSampleRate); // 400
14+
static_cast<uint32_t>(kMsToSeconds * kWindowSizeMs * kSampleRate); // 400
1515
inline constexpr auto kHopLength =
16-
static_cast<uint32_t>(kMstoSecond * kHopLengthMs * kSampleRate); // 160
16+
static_cast<uint32_t>(kMsToSeconds * kHopLengthMs * kSampleRate); // 160
1717
inline constexpr auto kPreemphasisCoeff = 0.97f;
1818
inline constexpr auto kLeftPadding = (kWindowSize - 1) / 2;
1919
inline constexpr auto kRightPadding = kWindowSize / 2;

packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/Utils.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ mergeSegments(const std::vector<types::Segment> &segments, size_t maxMergeGap) {
2626
auto &lastMerged = mergedSegments.back();
2727
const auto &current = segments[i];
2828

29-
if (current.start - lastMerged.end <= maxMergeGap) {
29+
if (current.start < lastMerged.end ||
30+
current.start - lastMerged.end <= maxMergeGap) {
3031
lastMerged.end = current.end;
3132
} else {
3233
mergedSegments.push_back(current);

packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.cpp

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -102,8 +102,6 @@ void VoiceActivityDetection::stream(std::shared_ptr<jsi::Function> callback,
102102
});
103103
};
104104

105-
isStreaming_ = true;
106-
107105
while (isStreaming_) {
108106
// Make sure that the audio buffer does not exceed its max size
109107
// BEFORE running model inference, so as to potentially save 1 unnecessary
@@ -134,8 +132,9 @@ void VoiceActivityDetection::stream(std::shared_ptr<jsi::Function> callback,
134132
auto lastSegment = detection.back();
135133
auto speechEnd = lastSegment.end;
136134

137-
uint32_t diffMs = (audioBuffer_.size() - speechEnd) /
138-
constants::kSampleRateMiliseconds; // [ms]
135+
std::scoped_lock lock(audioBufferMutex_);
136+
uint32_t diffMs =
137+
(audioBuffer_.size() - speechEnd) / constants::kSamplesPerMs; // [ms]
139138

140139
speaking = diffMs <= detectionMargin;
141140
}
@@ -239,7 +238,7 @@ VoiceActivityDetection::postprocess(const std::vector<float> &scores,
239238
}
240239

241240
// Merge tightly placed segments according to the max allowed gap parameter.
242-
size_t maxMergeGap = mergeGap * constants::kSampleRateMiliseconds;
241+
size_t maxMergeGap = mergeGap * constants::kSamplesPerMs;
243242
return utils::mergeSegments(speechSegments, maxMergeGap);
244243
}
245244

0 commit comments

Comments
 (0)