Apply review suggestions

IgorSwat · IgorSwat · commit 89bdd1719d4d · 2026-05-22T11:35:36.000+02:00
diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx
@@ -17,10 +17,10 @@ import {
   useSpeechToText,
   TranscriptionResult,
   SpeechToTextProps,
-  FSMN_VAD,
 } from 'react-native-executorch';
 import { ModelPicker, ModelOption } from '../components/ModelPicker';
 const speechToText = models.speech_to_text;
+const vad = models.vad;
 
 type STTModelSources = SpeechToTextProps['model'];
 
@@ -44,11 +44,6 @@ import ErrorBanner from '../components/ErrorBanner';
 
 const isSimulator = DeviceInfo.isEmulatorSync();
 
-const DEFAULT_MODEL =
-  Platform.OS === 'ios' && !isSimulator
-    ? WHISPER_BASE_EN_COREML
-    : WHISPER_TINY_EN;
-
 export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
   const [selectedModel, setSelectedModel] = useState<STTModelSources>(
     Platform.OS === 'ios'
@@ -58,7 +53,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
 
   const model = useSpeechToText({
     model: selectedModel,
-    vad: FSMN_VAD,
+    vad: vad.fsmn_vad()
   });
 
   const [transcription, setTranscription] =
@@ -408,15 +403,19 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
                     <Text
                       style={[
                         styles.vadButtonLabel,
-                        { color: useVAD ? 'white' : '#64748b' },
+                        useVAD
+                          ? styles.vadButtonLabelActive
+                          : styles.vadButtonLabelInactive,
                       ]}
                     >
                       VAD
                     </Text>
                     <Text
                       style={[
                         styles.vadButtonState,
-                        { color: useVAD ? '#bbf7d0' : '#94a3b8' },
+                        useVAD
+                          ? styles.vadButtonStateActive
+                          : styles.vadButtonStateInactive,
                       ]}
                     >
                       {useVAD ? 'ON' : 'OFF'}
@@ -579,11 +578,23 @@ const styles = StyleSheet.create({
     fontSize: 13,
     letterSpacing: 0.5,
   },
+  vadButtonLabelActive: {
+    color: 'white',
+  },
+  vadButtonLabelInactive: {
+    color: '#64748b',
+  },
   vadButtonState: {
     fontWeight: '700',
     fontSize: 10,
     letterSpacing: 1,
   },
+  vadButtonStateActive: {
+    color: '#bbf7d0',
+  },
+  vadButtonStateInactive: {
+    color: '#94a3b8',
+  },
   disabled: {
     opacity: 0.5,
   },
diff --git a/apps/speech/screens/VoiceActivityDetectionScreen.tsx b/apps/speech/screens/VoiceActivityDetectionScreen.tsx
@@ -8,7 +8,10 @@ import {
   Platform,
 } from 'react-native';
 import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context';
-import { useVAD, FSMN_VAD } from 'react-native-executorch';
+import {
+  models,
+  useVAD 
+} from 'react-native-executorch';
 import FontAwesome from '@expo/vector-icons/FontAwesome';
 import { AudioManager, AudioRecorder } from 'react-native-audio-api';
 import SWMIcon from '../assets/swm_icon.svg';
@@ -23,7 +26,7 @@ export const VoiceActivityDetectionScreen = ({
   onBack: () => void;
 }) => {
   const model = useVAD({
-    model: FSMN_VAD,
+    model: models.vad.fsmn_vad(),
   });
 
   const [isSpeaking, setIsSpeaking] = useState(false);
diff --git a/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md b/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md
@@ -49,7 +49,7 @@ import { AudioContext } from 'react-native-audio-api';
 import * as FileSystem from 'expo-file-system';
 
 const model = useSpeechToText({
-  model: models.speech_to_text.whisper_tiny_en(),
+  model: models.speech_to_text.whisper_tiny_en(), // Use whisper_tiny_en for English or whisper_tiny for multilingual support
 });
 
 // 1. Get audio file
@@ -89,8 +89,13 @@ The `stream()` function accepts several optional parameters:
 
 - `language`: The language code (e.g., `'es'`, `'fr'`). Required for multilingual models.
 - `verbose`: If `true`, includes word-level timestamps and segment metadata in the result objects.
+- `useVAD`: Enable the Voice Activity Detection submodule (if configured in `useSpeechToText` props) to optimize performance by filtering silence. Defaults to `false`.
 - `timeout`: (Advanced) The interval (in milliseconds) between processing consecutive audio chunks in streaming mode. Lower values provide more frequent updates and lower latency, while higher values reduce CPU consumption. Defaults to `100`.
-- `useVAD`: Enable the Voice Activity Detection submodule (if configured in `useSpeechToText` props) to optimize performance by filtering silence.
+- `vadDetectionMargin`: (Advanced) The duration of silence (in milliseconds) required after speech is detected before "committing" a segment. Defaults to `500`. Only active when the VAD module is used.
+
+### Voice Activity Detection (VAD)
+
+Integrating a VAD submodule is highly recommended for streaming. It improves performance by automatically removing silence, which reduces CPU usage, saves battery, and prevents the model from "hallucinating" text during silent periods.
 
 ### Example
 
@@ -103,6 +108,7 @@ import { AudioManager, AudioRecorder } from 'react-native-audio-api';
 export default function LiveTranscriber() {
   const model = useSpeechToText({
     model: models.speech_to_text.whisper_tiny_en(),
+    vad: models.vad.fsmn_vad(),
   });
   const [text, setText] = useState('');
   const isRecordingRef = useRef(false);
@@ -112,18 +118,22 @@ export default function LiveTranscriber() {
     isRecordingRef.current = true;
     setText('');
 
-    // 1. Capture microphone input
+    // 2. Capture microphone input
     recorder.onAudioReady(
       { sampleRate: 16000, bufferLength: 1600, channelCount: 1 },
       (chunk) => model.streamInsert(chunk.buffer.getChannelData(0))
     );
 
     await recorder.start();
 
-    // 2. Process the stream
+    // 3. Process the stream with VAD enabled
     try {
       let finalizedText = '';
-      const streamIter = model.stream({ verbose: false });
+      const streamIter = model.stream({
+        verbose: false,
+        useVAD: true, // Enable VAD filter
+        vadDetectionMargin: 500, // Wait for 500ms of silence before committing
+      });
 
       for await (const { committed, nonCommitted } of streamIter) {
         if (!isRecordingRef.current) break;
@@ -158,39 +168,15 @@ export default function LiveTranscriber() {
 
 ## Advanced Features
 
-### VAD Integration (Recommended for Live)
-
-Integrating **Voice Activity Detection (VAD)** as a submodule improves streaming performance by automatically removing silence. This reduces CPU usage, saves battery, and prevents hallucinations during silent periods.
-
-To use it, provide the `vad` model in the hook props and enable `useVAD` in the stream options:
-
-```typescript
-import {
-  useSpeechToText,
-  WHISPER_TINY_EN,
-  FSMN_VAD,
-} from 'react-native-executorch';
-
-const model = useSpeechToText({
-  model: WHISPER_TINY_EN,
-  vad: FSMN_VAD, // Integrating VAD submodule
-});
-
-const startLiveStreaming = async () => {
-  const streamIter = model.stream({
-    useVAD: true, // Enable VAD logic in the stream context
-    vadDetectionMargin: 500, // Wait for 500ms of silence before committing (for stability)
-  });
-};
-```
-
 ### Multilingual Transcription
 
 To transcribe languages other than English, use a multilingual model (e.g., `models.speech_to_text.whisper_tiny()`) and specify the corresponding language code:
 
 ```typescript
 // Transcribe in Spanish
-const model = useSpeechToText({ model: WHISPER_TINY });
+const model = useSpeechToText({
+  model: models.speech_to_text.whisper_tiny(),
+});
 const result = await model.transcribe(spanishAudio, { language: 'es' });
 ```
 
diff --git a/docs/docs/03-hooks/01-natural-language-processing/useVAD.md b/docs/docs/03-hooks/01-natural-language-processing/useVAD.md
@@ -18,9 +18,9 @@ It is recommended to use models provided by us, which are available at our [Hugg
 This mode is best suited for processing pre-recorded audio files or existing buffers. You provide a full waveform to the `forward` method, which returns an array of detected speech segments.
 
 ```typescript
-import { useVAD, FSMN_VAD } from 'react-native-executorch';
+import { useVAD, models } from 'react-native-executorch';
 
-const model = useVAD({ model: FSMN_VAD });
+const model = useVAD({ model: models.vad.fsmn_vad() });
 
 // ... obtain audioBuffer (Float32Array) at 16kHz ...
 
@@ -55,9 +55,9 @@ You can fine-tune the streaming behavior via the `options` object:
 - **`detectionMargin`** (default: `100`ms): Specifies the maximum allowed gap between the last detected speech segment and the current time to still consider the speech as "ongoing." This value determines how much silence is tolerated before `onSpeechEnd` is triggered.
 
 ```tsx
-import { useVAD, FSMN_VAD } from 'react-native-executorch';
+import { useVAD, models } from 'react-native-executorch';
 
-const model = useVAD({ model: FSMN_VAD });
+const model = useVAD({ model: models.vad.fsmn_vad() });
 
 const startLiveVAD = async () => {
   // Start the continuous streaming listener
diff --git a/docs/docs/04-typescript-api/01-natural-language-processing/VADModule.md b/docs/docs/04-typescript-api/01-natural-language-processing/VADModule.md
@@ -40,7 +40,7 @@ For more information on loading resources, take a look at the [loading models](.
 
 ## Running the model
 
-### Batch Processing
+### File Processing
 
 To process a full audio buffer at once, use the [`forward`](../../06-api-reference/classes/VADModule.md#forward) method. Before calling [`forward`](../../06-api-reference/classes/VADModule.md#forward), ensure you have the audio waveform sampled at 16 kHz. Pass the waveform as an argument; the method returns a promise that resolves to an array of detected speech segments.
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/Constants.h b/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/Constants.h
@@ -6,14 +6,14 @@
 namespace rnexecutorch::models::voice_activity_detection::constants {
 
 inline constexpr uint32_t kSampleRate = 16000;
-inline constexpr uint32_t kSampleRateMiliseconds = 16;
-inline constexpr auto kMstoSecond = 0.001f;
+inline constexpr uint32_t kSamplesPerMs = kSampleRate / 1000;
+inline constexpr auto kMsToSeconds = 0.001f;
 inline constexpr uint32_t kWindowSizeMs = 25;
 inline constexpr uint32_t kHopLengthMs = 10;
 inline constexpr auto kWindowSize =
-    static_cast<uint32_t>(kMstoSecond * kWindowSizeMs * kSampleRate); // 400
+    static_cast<uint32_t>(kMsToSeconds * kWindowSizeMs * kSampleRate); // 400
 inline constexpr auto kHopLength =
-    static_cast<uint32_t>(kMstoSecond * kHopLengthMs * kSampleRate); // 160
+    static_cast<uint32_t>(kMsToSeconds * kHopLengthMs * kSampleRate); // 160
 inline constexpr auto kPreemphasisCoeff = 0.97f;
 inline constexpr auto kLeftPadding = (kWindowSize - 1) / 2;
 inline constexpr auto kRightPadding = kWindowSize / 2;
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/Utils.cpp b/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/Utils.cpp
@@ -26,7 +26,8 @@ mergeSegments(const std::vector<types::Segment> &segments, size_t maxMergeGap) {
     auto &lastMerged = mergedSegments.back();
     const auto &current = segments[i];
 
-    if (current.start - lastMerged.end <= maxMergeGap) {
+    if (current.start < lastMerged.end ||
+        current.start - lastMerged.end <= maxMergeGap) {
       lastMerged.end = current.end;
     } else {
       mergedSegments.push_back(current);
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.cpp b/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.cpp
@@ -102,8 +102,6 @@ void VoiceActivityDetection::stream(std::shared_ptr<jsi::Function> callback,
     });
   };
 
-  isStreaming_ = true;
-
   while (isStreaming_) {
     // Make sure that audio buffer does not exceed it's max size
     // BEFORE infering the model, such that potentially save 1 unnecessary
@@ -134,8 +132,9 @@ void VoiceActivityDetection::stream(std::shared_ptr<jsi::Function> callback,
       auto lastSegment = detection.back();
       auto speechEnd = lastSegment.end;
 
-      uint32_t diffMs = (audioBuffer_.size() - speechEnd) /
-                        constants::kSampleRateMiliseconds; // [ms]
+      std::scoped_lock lock(audioBufferMutex_);
+      uint32_t diffMs =
+          (audioBuffer_.size() - speechEnd) / constants::kSamplesPerMs; // [ms]
 
       speaking = diffMs <= detectionMargin;
     }
@@ -239,7 +238,7 @@ VoiceActivityDetection::postprocess(const std::vector<float> &scores,
   }
 
   // Merge tightly placed segments according to the max allowed gap parameter.
-  size_t maxMergeGap = mergeGap * constants::kSampleRateMiliseconds;
+  size_t maxMergeGap = mergeGap * constants::kSamplesPerMs;
   return utils::mergeSegments(speechSegments, maxMergeGap);
 }