Skip to content

Commit 005ecdc

Browse files
committed
Accumulate full speech segments before wake phrase check
Instead of triggering on the first two frames of speech (64 ms) and stopping VAD, keep VAD running through the entire utterance and fire only when speech ends. The complete audio segment — including "hey" through "torch" — is then fed to Voxtral at once. VAD never stops during wake checking, so there is no restart gap between attempts. If the segment doesn't contain the wake keyword, VAD is already listening for the next utterance. This lets users say "hey torch" naturally as a single phrase without needing to make noise first to "wake up" the VAD.
Made-with: Cursor
1 parent 060cc76 commit 005ecdc

2 files changed

Lines changed: 69 additions & 46 deletions

File tree

voxtral_realtime/macos/VoxtralRealtime/Services/DictationManager.swift

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -301,9 +301,9 @@ final class DictationManager {
301301
switch event {
302302
case .ready:
303303
store.wakeState = .listening
304-
case let .speechDetected(preRollSamples):
304+
case let .speechSegment(samples):
305305
guard state == .idle else { return }
306-
await beginWakePhraseCheck(preRollSamples: preRollSamples)
306+
await checkWakeSegment(samples: samples)
307307
case .silenceDetected:
308308
log.warning("VAD detected microphone silence — stopping wake listening")
309309
await vadService.stop()
@@ -319,48 +319,51 @@ final class DictationManager {
319319
}
320320
}
321321

322-
private func beginWakePhraseCheck(preRollSamples: [Float]) async {
323-
await vadService.stop()
322+
private func checkWakeSegment(samples: [Float]) async {
324323
wakeCheckTask?.cancel()
325324

326-
targetApp = NSWorkspace.shared.frontmostApplication
327-
wakeTriggeredForCurrentSession = false
328-
dictationStartedAt = .now
329-
state = .listening
330-
store.wakeState = .checkingPhrase
331-
332-
await store.startDictation(initialSamples: preRollSamples, skipMicCheck: true)
333-
334325
if !preferences.enableWakePhrase {
326+
await vadService.stop()
327+
targetApp = NSWorkspace.shared.frontmostApplication
328+
wakeTriggeredForCurrentSession = false
329+
dictationStartedAt = .now
330+
state = .listening
335331
store.wakeState = .active
336332
showPanel()
333+
await store.startDictation(initialSamples: samples, skipMicCheck: true)
337334
startSilenceMonitor()
338335
return
339336
}
340337

341-
let requiredPhrase = store.normalizeWakePhrase(preferences.wakePhrase)
342-
let keywords = Self.wakeKeywords(from: requiredPhrase)
338+
store.wakeState = .checkingPhrase
339+
340+
await store.startDictation(initialSamples: samples, skipMicCheck: true)
341+
342+
let keywords = Self.wakeKeywords(from: store.normalizeWakePhrase(preferences.wakePhrase))
343343
let checkDurationNs = UInt64(preferences.wakeCheckSeconds * 1_000_000_000)
344344
let deadline = DispatchTime.now().uptimeNanoseconds + checkDurationNs
345345

346346
wakeCheckTask = Task { @MainActor [weak self] in
347347
guard let self else { return }
348-
while !Task.isCancelled && self.state == .listening {
348+
while !Task.isCancelled {
349349
let normalized = self.store.normalizeWakePhrase(self.store.dictationText)
350350
if !keywords.isEmpty && keywords.allSatisfy({ normalized.contains($0) }) {
351+
await self.vadService.stop()
352+
self.targetApp = NSWorkspace.shared.frontmostApplication
351353
self.wakeTriggeredForCurrentSession = true
354+
self.dictationStartedAt = .now
355+
self.state = .listening
352356
self.store.stripLeadingWakePhrase(self.preferences.wakePhrase)
353357
self.store.wakeState = .active
354358
self.showPanel()
355359
self.startSilenceMonitor()
360+
log.info("Wake keyword matched — dictation active")
356361
return
357362
}
358363
if DispatchTime.now().uptimeNanoseconds >= deadline {
359-
log.info("Wake phrase not matched within \(self.preferences.wakeCheckSeconds)s — returning to idle")
360364
_ = await self.store.stopDictation()
361-
self.state = .idle
362365
self.store.wakeState = .listening
363-
await self.startWakeListeningIfNeeded()
366+
log.info("Speech segment did not contain wake keyword — continuing to listen")
364367
return
365368
}
366369
try? await Task.sleep(for: .milliseconds(100))

voxtral_realtime/macos/VoxtralRealtime/Services/VadService.swift

Lines changed: 48 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ private let vadLog = Logger(subsystem: "org.pytorch.executorch.VoxtralRealtime",
1515
actor VadService {
1616
enum Event: Sendable {
1717
case ready
18-
case speechDetected(preRollSamples: [Float])
18+
case speechSegment(samples: [Float])
1919
case silenceDetected
2020
case stopped
2121
case error(String)
@@ -25,21 +25,25 @@ actor VadService {
2525
private var stdinPipe: Pipe?
2626
private var engine: AVAudioEngine?
2727
private var recentSamples: [Float] = []
28-
private var byteBuffer = Data()
29-
private var consecutiveSpeechFrames = 0
30-
private var armed = true
3128
private var eventHandler: (@Sendable (Event) -> Void)?
3229

3330
private var totalSamplesWritten: Int = 0
3431
private var peakRms: Float = 0
3532
private var silenceCheckFired = false
36-
private static let silenceCheckSamples = 16_000 * 2 // 2s at 16kHz
33+
private static let silenceCheckSamples = 16_000 * 2
3734
private static let silenceRmsThreshold: Float = 1e-6
3835

39-
private var hangoverFramesRemaining = 0
4036
private var hangoverFramesMax = 0
4137
private static let frameDurationMs = 32
4238

39+
private var inSpeech = false
40+
private var hangoverFramesRemaining = 0
41+
private var speechSamples: [Float] = []
42+
private var preRollSamples: [Float] = []
43+
private var speechFrameCount = 0
44+
private static let minSpeechFrames = 3
45+
private static let preRollBufferSize = 16_000 / 2 // 0.5s at 16kHz
46+
4347
func start(
4448
runnerPath: String,
4549
modelPath: String,
@@ -50,15 +54,16 @@ actor VadService {
5054
await stop()
5155

5256
self.eventHandler = eventHandler
53-
armed = true
5457
recentSamples = []
55-
byteBuffer = Data()
56-
consecutiveSpeechFrames = 0
57-
hangoverFramesRemaining = 0
5858
hangoverFramesMax = max(0, hangoverMs / Self.frameDurationMs)
5959
totalSamplesWritten = 0
6060
peakRms = 0
6161
silenceCheckFired = false
62+
inSpeech = false
63+
hangoverFramesRemaining = 0
64+
speechSamples = []
65+
preRollSamples = []
66+
speechFrameCount = 0
6267

6368
let stdoutPipe = Pipe()
6469
let stdinPipe = Pipe()
@@ -163,7 +168,7 @@ actor VadService {
163168
let frameCount = Int(converted.frameLength)
164169
let samples = Array(UnsafeBufferPointer(start: channelData[0], count: frameCount))
165170
Task {
166-
await self.appendRecent(samples)
171+
await self.bufferSamples(samples)
167172
try? await self.write(samples: samples, to: handle)
168173
}
169174
}
@@ -172,6 +177,17 @@ actor VadService {
172177
self.engine = engine
173178
}
174179

180+
private func bufferSamples(_ samples: [Float]) {
181+
if inSpeech {
182+
speechSamples.append(contentsOf: samples)
183+
} else {
184+
preRollSamples.append(contentsOf: samples)
185+
if preRollSamples.count > Self.preRollBufferSize {
186+
preRollSamples.removeFirst(preRollSamples.count - Self.preRollBufferSize)
187+
}
188+
}
189+
}
190+
175191
private func write(samples: [Float], to handle: FileHandle) throws {
176192
guard !samples.isEmpty else { return }
177193
let data = samples.withUnsafeBufferPointer { Data(buffer: $0) }
@@ -191,14 +207,6 @@ actor VadService {
191207
}
192208
}
193209

194-
private func appendRecent(_ samples: [Float]) {
195-
recentSamples.append(contentsOf: samples)
196-
let maxSamples = 16_000 * 2
197-
if recentSamples.count > maxSamples {
198-
recentSamples.removeFirst(recentSamples.count - maxSamples)
199-
}
200-
}
201-
202210
private func handleOutputLine(_ line: String, threshold: Float) {
203211
if line == "READY" {
204212
emit(.ready)
@@ -211,17 +219,29 @@ actor VadService {
211219
}
212220

213221
if probability >= threshold {
214-
consecutiveSpeechFrames += 1
222+
speechFrameCount += 1
215223
hangoverFramesRemaining = hangoverFramesMax
216-
} else if hangoverFramesRemaining > 0 {
217-
hangoverFramesRemaining -= 1
218-
} else {
219-
consecutiveSpeechFrames = 0
220-
}
221224

222-
if armed && consecutiveSpeechFrames >= 2 {
223-
armed = false
224-
emit(.speechDetected(preRollSamples: recentSamples))
225+
if !inSpeech && speechFrameCount >= Self.minSpeechFrames {
226+
inSpeech = true
227+
speechSamples = preRollSamples
228+
vadLog.info("Speech segment started")
229+
}
230+
} else if inSpeech {
231+
if hangoverFramesRemaining > 0 {
232+
hangoverFramesRemaining -= 1
233+
} else {
234+
let segment = speechSamples
235+
inSpeech = false
236+
speechSamples = []
237+
speechFrameCount = 0
238+
preRollSamples = []
239+
240+
vadLog.info("Speech segment ended (\(segment.count) samples, \(String(format: "%.2f", Double(segment.count) / 16000))s)")
241+
emit(.speechSegment(samples: segment))
242+
}
243+
} else {
244+
speechFrameCount = 0
225245
}
226246
}
227247

0 commit comments

Comments
 (0)