Accumulate full speech segments before wake phrase check

seyeong-han · seyeong-han · commit 005ecdcebfb5 · 2026-03-26T11:31:47.000-07:00
Instead of triggering on the first 2 frames of speech (64ms) and
stopping VAD, keep VAD running through the entire utterance and
fire only when speech ends. The complete audio segment — including
"hey" through "torch" — is then fed to Voxtral at once.

VAD keeps running while a segment is being checked, so there is no
restart gap between attempts. If the segment doesn't contain the wake
keyword, VAD is already listening for the next utterance; it is stopped
only once the keyword matches (or when the wake phrase is disabled).

This lets users say "hey torch" naturally as a single phrase
without needing to make noise first to "wake up" the VAD.

Made-with: Cursor
diff --git a/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift b/voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift
@@ -301,9 +301,9 @@ final class DictationManager {
         switch event {
         case .ready:
             store.wakeState = .listening
-        case let .speechDetected(preRollSamples):
+        case let .speechSegment(samples):
             guard state == .idle else { return }
-            await beginWakePhraseCheck(preRollSamples: preRollSamples)
+            await checkWakeSegment(samples: samples)
         case .silenceDetected:
             log.warning("VAD detected microphone silence — stopping wake listening")
             await vadService.stop()
@@ -319,48 +319,51 @@ final class DictationManager {
         }
     }
 
-    private func beginWakePhraseCheck(preRollSamples: [Float]) async {
-        await vadService.stop()
+    private func checkWakeSegment(samples: [Float]) async {
         wakeCheckTask?.cancel()
 
-        targetApp = NSWorkspace.shared.frontmostApplication
-        wakeTriggeredForCurrentSession = false
-        dictationStartedAt = .now
-        state = .listening
-        store.wakeState = .checkingPhrase
-
-        await store.startDictation(initialSamples: preRollSamples, skipMicCheck: true)
-
         if !preferences.enableWakePhrase {
+            await vadService.stop()
+            targetApp = NSWorkspace.shared.frontmostApplication
+            wakeTriggeredForCurrentSession = false
+            dictationStartedAt = .now
+            state = .listening
             store.wakeState = .active
             showPanel()
+            await store.startDictation(initialSamples: samples, skipMicCheck: true)
             startSilenceMonitor()
             return
         }
 
-        let requiredPhrase = store.normalizeWakePhrase(preferences.wakePhrase)
-        let keywords = Self.wakeKeywords(from: requiredPhrase)
+        store.wakeState = .checkingPhrase
+
+        await store.startDictation(initialSamples: samples, skipMicCheck: true)
+
+        let keywords = Self.wakeKeywords(from: store.normalizeWakePhrase(preferences.wakePhrase))
         let checkDurationNs = UInt64(preferences.wakeCheckSeconds * 1_000_000_000)
         let deadline = DispatchTime.now().uptimeNanoseconds + checkDurationNs
 
         wakeCheckTask = Task { @MainActor [weak self] in
             guard let self else { return }
-            while !Task.isCancelled && self.state == .listening {
+            while !Task.isCancelled {
                 let normalized = self.store.normalizeWakePhrase(self.store.dictationText)
                 if !keywords.isEmpty && keywords.allSatisfy({ normalized.contains($0) }) {
+                    await self.vadService.stop()
+                    self.targetApp = NSWorkspace.shared.frontmostApplication
                     self.wakeTriggeredForCurrentSession = true
+                    self.dictationStartedAt = .now
+                    self.state = .listening
                     self.store.stripLeadingWakePhrase(self.preferences.wakePhrase)
                     self.store.wakeState = .active
                     self.showPanel()
                     self.startSilenceMonitor()
+                    log.info("Wake keyword matched — dictation active")
                     return
                 }
                 if DispatchTime.now().uptimeNanoseconds >= deadline {
-                    log.info("Wake phrase not matched within \(self.preferences.wakeCheckSeconds)s — returning to idle")
                     _ = await self.store.stopDictation()
-                    self.state = .idle
                     self.store.wakeState = .listening
-                    await self.startWakeListeningIfNeeded()
+                    log.info("Speech segment did not contain wake keyword — continuing to listen")
                     return
                 }
                 try? await Task.sleep(for: .milliseconds(100))
diff --git a/voxtral_realtime/macos/VoxtralRealtime/Services/VadService.swift b/voxtral_realtime/macos/VoxtralRealtime/Services/VadService.swift
@@ -15,7 +15,7 @@ private let vadLog = Logger(subsystem: "org.pytorch.executorch.VoxtralRealtime",
 actor VadService {
     enum Event: Sendable {
         case ready
-        case speechDetected(preRollSamples: [Float])
+        case speechSegment(samples: [Float])
         case silenceDetected
         case stopped
         case error(String)
@@ -25,21 +25,25 @@ actor VadService {
     private var stdinPipe: Pipe?
     private var engine: AVAudioEngine?
     private var recentSamples: [Float] = []
-    private var byteBuffer = Data()
-    private var consecutiveSpeechFrames = 0
-    private var armed = true
     private var eventHandler: (@Sendable (Event) -> Void)?
 
     private var totalSamplesWritten: Int = 0
     private var peakRms: Float = 0
     private var silenceCheckFired = false
-    private static let silenceCheckSamples = 16_000 * 2  // 2s at 16kHz
+    private static let silenceCheckSamples = 16_000 * 2
     private static let silenceRmsThreshold: Float = 1e-6
 
-    private var hangoverFramesRemaining = 0
     private var hangoverFramesMax = 0
     private static let frameDurationMs = 32
 
+    private var inSpeech = false
+    private var hangoverFramesRemaining = 0
+    private var speechSamples: [Float] = []
+    private var preRollSamples: [Float] = []
+    private var speechFrameCount = 0
+    private static let minSpeechFrames = 3
+    private static let preRollBufferSize = 16_000 / 2  // 0.5s at 16kHz
+
     func start(
         runnerPath: String,
         modelPath: String,
@@ -50,15 +54,16 @@ actor VadService {
         await stop()
 
         self.eventHandler = eventHandler
-        armed = true
         recentSamples = []
-        byteBuffer = Data()
-        consecutiveSpeechFrames = 0
-        hangoverFramesRemaining = 0
         hangoverFramesMax = max(0, hangoverMs / Self.frameDurationMs)
         totalSamplesWritten = 0
         peakRms = 0
         silenceCheckFired = false
+        inSpeech = false
+        hangoverFramesRemaining = 0
+        speechSamples = []
+        preRollSamples = []
+        speechFrameCount = 0
 
         let stdoutPipe = Pipe()
         let stdinPipe = Pipe()
@@ -163,7 +168,7 @@ actor VadService {
             let frameCount = Int(converted.frameLength)
             let samples = Array(UnsafeBufferPointer(start: channelData[0], count: frameCount))
             Task {
-                await self.appendRecent(samples)
+                await self.bufferSamples(samples)
                 try? await self.write(samples: samples, to: handle)
             }
         }
@@ -172,6 +177,17 @@ actor VadService {
         self.engine = engine
     }
 
+    private func bufferSamples(_ samples: [Float]) {
+        if inSpeech {
+            speechSamples.append(contentsOf: samples)
+        } else {
+            preRollSamples.append(contentsOf: samples)
+            if preRollSamples.count > Self.preRollBufferSize {
+                preRollSamples.removeFirst(preRollSamples.count - Self.preRollBufferSize)
+            }
+        }
+    }
+
     private func write(samples: [Float], to handle: FileHandle) throws {
         guard !samples.isEmpty else { return }
         let data = samples.withUnsafeBufferPointer { Data(buffer: $0) }
@@ -191,14 +207,6 @@ actor VadService {
         }
     }
 
-    private func appendRecent(_ samples: [Float]) {
-        recentSamples.append(contentsOf: samples)
-        let maxSamples = 16_000 * 2
-        if recentSamples.count > maxSamples {
-            recentSamples.removeFirst(recentSamples.count - maxSamples)
-        }
-    }
-
     private func handleOutputLine(_ line: String, threshold: Float) {
         if line == "READY" {
             emit(.ready)
@@ -211,17 +219,29 @@ actor VadService {
         }
 
         if probability >= threshold {
-            consecutiveSpeechFrames += 1
+            speechFrameCount += 1
             hangoverFramesRemaining = hangoverFramesMax
-        } else if hangoverFramesRemaining > 0 {
-            hangoverFramesRemaining -= 1
-        } else {
-            consecutiveSpeechFrames = 0
-        }
 
-        if armed && consecutiveSpeechFrames >= 2 {
-            armed = false
-            emit(.speechDetected(preRollSamples: recentSamples))
+            if !inSpeech && speechFrameCount >= Self.minSpeechFrames {
+                inSpeech = true
+                speechSamples = preRollSamples
+                vadLog.info("Speech segment started")
+            }
+        } else if inSpeech {
+            if hangoverFramesRemaining > 0 {
+                hangoverFramesRemaining -= 1
+            } else {
+                let segment = speechSamples
+                inSpeech = false
+                speechSamples = []
+                speechFrameCount = 0
+                preRollSamples = []
+
+                vadLog.info("Speech segment ended (\(segment.count) samples, \(String(format: "%.2f", Double(segment.count) / 16000))s)")
+                emit(.speechSegment(samples: segment))
+            }
+        } else {
+            speechFrameCount = 0
         }
     }