@@ -27,6 +27,12 @@ final class FluidAudioProvider: TranscriptionProvider {
2727
2828 private var streamingAsrManager : AsrManager ?
2929 private var finalAsrManager : AsrManager ?
30+ private var latestStreamingPreviewText : String = " "
31+ private var latestStreamingPreviewSampleCount : Int = 0
32+ private var latestStreamingPreviewFinishedAt : TimeInterval ?
33+ private let fastPreviewMinimumSamples = 32_000
34+ private let fastPreviewTailSilenceRMS : Float = 0.002
35+ private let fastPreviewTailAudioToleranceMs = 300
3036 private( set) var isReady : Bool = false
3137 private( set) var isWordBoostingActive : Bool = false
3238 private( set) var boostedVocabularyTermsCount : Int = 0
@@ -142,6 +148,9 @@ final class FluidAudioProvider: TranscriptionProvider {
142148
143149 self . streamingAsrManager = streamingManager
144150 self . finalAsrManager = finalManager
151+ self . latestStreamingPreviewText = " "
152+ self . latestStreamingPreviewSampleCount = 0
153+ self . latestStreamingPreviewFinishedAt = nil
145154 await progressSink. emit ( 0.98 )
146155
147156 self . isReady = true
@@ -156,16 +165,39 @@ final class FluidAudioProvider: TranscriptionProvider {
156165 try await self . transcribeFinal ( samples)
157166 }
158167
/// Clears the cached streaming-preview state.
///
/// After this call the cache holds no completion timestamp, a sample count of zero,
/// and the placeholder text it is also initialised with.
func resetStreamingPreviewCache() {
    latestStreamingPreviewFinishedAt = nil
    latestStreamingPreviewSampleCount = 0
    latestStreamingPreviewText = " "
}
173+
/// Transcribes `samples` with the streaming manager and caches the result as the latest preview.
///
/// Side effects: overwrites `latestStreamingPreviewText` (trimmed),
/// `latestStreamingPreviewSampleCount` and `latestStreamingPreviewFinishedAt`, and emits one
/// `ASR_BENCH provider_streaming_done` benchmark log line.
///
/// - Parameter samples: Audio samples to transcribe. The benchmark treats them as 16 kHz audio
///   when converting the sample count to milliseconds.
/// - Returns: The transcription with the manager's untrimmed text and confidence.
/// - Throws: An `NSError` with code `-1` when no streaming manager is set; otherwise whatever
///   the manager's `transcribe` call throws.
func transcribeStreaming(_ samples: [Float]) async throws -> ASRTranscriptionResult {
    guard let fullPreviewManager = self.streamingAsrManager else {
        throw NSError(
            domain: " FluidAudioProvider ",
            code: -1,
            userInfo: [NSLocalizedDescriptionKey: " ASR manager not initialized "]
        )
    }

    // Time the call with the monotonic uptime clock: a wall-clock adjustment in the middle of a
    // transcription would otherwise yield a bogus (even negative) elapsedMs and rtf.
    let startedUptime = ProcessInfo.processInfo.systemUptime
    let result = try await fullPreviewManager.transcribe(samples, source: AudioSource.microphone)
    let elapsedSeconds = ProcessInfo.processInfo.systemUptime - startedUptime

    let text = result.text.trimmingCharacters(in: .whitespacesAndNewlines)
    self.latestStreamingPreviewText = text
    self.latestStreamingPreviewSampleCount = samples.count
    // Deliberately wall-clock: the preview freshness check compares this value against
    // Date().timeIntervalSince1970.
    self.latestStreamingPreviewFinishedAt = Date().timeIntervalSince1970

    let elapsedMs = Int((elapsedSeconds * 1000).rounded())
    let audioMs = Int((Double(samples.count) / 16_000.0 * 1000).rounded())
    // Real-time factor: processing time divided by audio duration (0 for an empty buffer).
    let rtf = audioMs > 0 ? Double(elapsedMs) / Double(audioMs) : 0
    let finalizationMode = SettingsStore.shared.parakeetFinalizationMode.rawValue
    // `text` is already trimmed, so its count is the trimmed character count.
    DebugLogger.shared.info(
        """
        ASR_BENCH provider_streaming_done samples= \(samples.count) audioMs= \(audioMs) \
        elapsedMs= \(elapsedMs) textChars= \(text.count) \
        rtf= \(String(format: " %.3f ", rtf)) finalizationMode= \(finalizationMode)
        """,
        source: " ASRBenchmark "
    )
    return ASRTranscriptionResult(text: result.text, confidence: result.confidence)
}
171203
@@ -178,10 +210,19 @@ final class FluidAudioProvider: TranscriptionProvider {
178210 )
179211 }
180212
213+ let startedAt = Date ( ) . timeIntervalSince1970
214+ if SettingsStore . shared. parakeetFinalizationMode == . tokenTimedChunkMerge {
215+ if let previewResult = await self . cachedStreamingPreviewResult ( for: samples, startedAt: startedAt) {
216+ return previewResult
217+ }
218+ }
219+
181220 // If the boosted final manager fails, fall back to the unboosted streaming
182221 // manager so the user still gets a transcription (just without CTC rescoring).
183222 do {
223+ let startedAt = Date ( ) . timeIntervalSince1970
184224 let result = try await manager. transcribe ( samples, source: AudioSource . microphone)
225+ self . logFinalBenchmark ( samples: samples, text: result. text, startedAt: startedAt, usedFallback: false )
185226 return ASRTranscriptionResult ( text: result. text, confidence: result. confidence)
186227 } catch {
187228 guard let fallback = self . streamingAsrManager, fallback !== manager else {
@@ -191,11 +232,128 @@ final class FluidAudioProvider: TranscriptionProvider {
191232 " FluidAudioProvider: Boosted final transcription failed ( \( error. localizedDescription) ), retrying without vocab boost " ,
192233 source: " FluidAudioProvider "
193234 )
235+ let startedAt = Date ( ) . timeIntervalSince1970
194236 let result = try await fallback. transcribe ( samples, source: AudioSource . microphone)
237+ self . logFinalBenchmark ( samples: samples, text: result. text, startedAt: startedAt, usedFallback: true )
195238 return ASRTranscriptionResult ( text: result. text, confidence: result. confidence)
196239 }
197240 }
198241
/// Returns the cached streaming preview for `samples` when the fast-preview path applies.
///
/// - Parameter samples: The final audio buffer the cached preview is checked against.
/// - Returns: `nil` unless the finalization mode is `.tokenTimedChunkMerge`; otherwise whatever
///   `cachedStreamingPreviewResult(for:startedAt:)` returns, which may also be `nil`.
func transcribeCachedStreamingPreviewIfAvailable(_ samples: [Float]) async -> ASRTranscriptionResult? {
    if SettingsStore.shared.parakeetFinalizationMode != .tokenTimedChunkMerge {
        return nil
    }
    // The lookup start time is only forwarded for benchmark logging.
    let lookupStartedAt = Date().timeIntervalSince1970
    return await cachedStreamingPreviewResult(for: samples, startedAt: lookupStartedAt)
}
249+
/// Returns the cached streaming preview as the final transcription when it is judged to
/// still describe `samples`; otherwise logs the miss reason and returns `nil`.
///
/// The cache is accepted only when all of the following hold, checked in this order:
/// 1. a preview has completed and recorded a positive sample count,
/// 2. the cached text is non-empty after trimming,
/// 3. the preview finished between 0 and 3000 ms ago,
/// 4. the final buffer has at least `fastPreviewMinimumSamples` samples,
/// 5. the preview covers at least 88 % of the final buffer,
/// 6. the uncovered tail is at most 1800 ms long,
/// 7. the tail is empty, no longer than `fastPreviewTailAudioToleranceMs`, or has an RMS at or
///    below `fastPreviewTailSilenceRMS`.
///
/// - Parameters:
///   - samples: The final audio buffer. Durations are derived by dividing by 16_000, i.e. the
///     buffer is treated as 16 kHz audio.
///   - startedAt: Wall-clock start time (seconds since 1970), forwarded to the final benchmark log.
/// - Returns: The cached text with a fixed confidence of `0.95`, or `nil` on any miss.
private func cachedStreamingPreviewResult(for samples: [Float], startedAt: TimeInterval) async -> ASRTranscriptionResult? {
    // Acceptance thresholds, named here so each guard below reads as a rule.
    let maximumPreviewAgeMs = 3000
    let minimumCoverage = 0.88
    let maximumTailMs = 1800

    let text = self.latestStreamingPreviewText.trimmingCharacters(in: .whitespacesAndNewlines)
    let finalSampleCount = samples.count
    // NOTE(review): clamping hides the case where the cached preview saw MORE samples than the
    // final buffer (coverage then reads 1.0) — confirm that situation cannot come from a
    // different recording.
    let previewSampleCount = min(self.latestStreamingPreviewSampleCount, finalSampleCount)
    let tailSamples = max(0, finalSampleCount - previewSampleCount)
    let tailMs = Int((Double(tailSamples) / 16_000.0 * 1000).rounded())
    let tailRMS = self.rms(samples: samples, startIndex: previewSampleCount)
    let coverage = finalSampleCount > 0 ? Double(previewSampleCount) / Double(finalSampleCount) : 0
    let ageMs: Int
    if let latestStreamingPreviewFinishedAt {
        ageMs = Int(((Date().timeIntervalSince1970 - latestStreamingPreviewFinishedAt) * 1000).rounded())
    } else {
        // No preview has finished yet: report the largest possible age.
        ageMs = Int.max
    }

    DebugLogger.shared.info(
        """
        ASR_BENCH provider_fast_preview_check finalSamples= \(finalSampleCount) previewSamples= \(previewSampleCount) \
        tailMs= \(tailMs) coverage= \(String(format: " %.3f ", coverage)) ageMs= \(ageMs) \
        tailRMS= \(String(format: " %.5f ", tailRMS)) textChars= \(text.count) wordBoosting= \(self.isWordBoostingActive)
        """,
        source: " ASRBenchmark "
    )

    // Logs the miss with the metrics computed above and yields the `nil` result.
    func miss(_ reason: String) -> ASRTranscriptionResult? {
        self.logFastPreviewMiss(reason: reason, tailMs: tailMs, coverage: coverage, ageMs: ageMs, tailRMS: tailRMS)
        return nil
    }

    // Checked before the text: with no preview the text is empty too, so testing the text
    // first would report every missing preview as "empty" and leave this reason unused.
    guard self.latestStreamingPreviewFinishedAt != nil, self.latestStreamingPreviewSampleCount > 0 else {
        return miss(" missing_preview ")
    }
    guard !text.isEmpty else {
        return miss(" empty ")
    }
    // A negative age means the wall clock moved backwards since the preview finished; its real
    // age is unknown, so treat it as stale instead of letting it pass the upper bound.
    guard ageMs >= 0, ageMs <= maximumPreviewAgeMs else {
        return miss(" stale ")
    }
    guard finalSampleCount >= self.fastPreviewMinimumSamples else {
        return miss(" short_recording ")
    }
    guard coverage >= minimumCoverage else {
        return miss(" low_coverage ")
    }
    guard tailMs <= maximumTailMs else {
        return miss(" large_tail ")
    }
    // Audio the preview never saw is tolerated only if it is absent, very short, or quiet.
    guard tailSamples == 0 || tailMs <= self.fastPreviewTailAudioToleranceMs || tailRMS <= self.fastPreviewTailSilenceRMS else {
        return miss(" tail_has_audio ")
    }

    DebugLogger.shared.info(
        """
        ASR_BENCH provider_fast_preview_hit tailMs= \(tailMs) coverage= \(String(format: " %.3f ", coverage)) \
        ageMs= \(ageMs) tailRMS= \(String(format: " %.5f ", tailRMS)) textChars= \(text.count)
        """,
        source: " ASRBenchmark "
    )
    self.logFinalBenchmark(samples: samples, text: text, startedAt: startedAt, usedFallback: false, source: " livePreview ")
    return ASRTranscriptionResult(text: text, confidence: 0.95)
}
313+
/// Computes the root-mean-square amplitude of `samples` from `startIndex` to the end.
///
/// - Parameters:
///   - samples: The audio buffer.
///   - startIndex: Index of the first sample to include. Negative values are treated as `0`.
/// - Returns: The RMS of the selected samples, or `0` when `startIndex` is at or past the end
///   of the buffer (including an empty buffer).
private func rms(samples: [Float], startIndex: Int) -> Float {
    // Clamp so a negative index cannot trap when slicing.
    let firstIndex = max(startIndex, 0)
    guard firstIndex < samples.count else { return 0 }

    let tail = samples[firstIndex...]
    // Accumulate in Double and divide by the integer element count: a Float counter stops
    // incrementing at 2^24 elements, and a Float sum of squares loses precision on long buffers.
    let sumOfSquares = tail.reduce(into: Double(0)) { partialSum, sample in
        let value = Double(sample)
        partialSum += value * value
    }
    return Float((sumOfSquares / Double(tail.count)).squareRoot())
}
325+
/// Emits one `ASR_BENCH provider_fast_preview_miss` log line.
///
/// - Parameters:
///   - reason: Short tag naming the check that rejected the cached preview.
///   - tailMs: Milliseconds of final audio not covered by the preview.
///   - coverage: Fraction of the final buffer covered by the preview; logged with 3 decimals.
///   - ageMs: Milliseconds since the preview finished.
///   - tailRMS: RMS amplitude of the uncovered tail; logged with 5 decimals.
private func logFastPreviewMiss(reason: String, tailMs: Int, coverage: Double, ageMs: Int, tailRMS: Float) {
    let coverageField = String(format: " %.3f ", coverage)
    let tailRMSField = String(format: " %.5f ", tailRMS)
    DebugLogger.shared.info(
        """
        ASR_BENCH provider_fast_preview_miss reason= \(reason) tailMs= \(tailMs) \
        coverage= \(coverageField) ageMs= \(ageMs) tailRMS= \(tailRMSField)
        """,
        source: " ASRBenchmark "
    )
}
335+
/// Emits one `ASR_BENCH provider_final_done` log line for a finished final transcription.
///
/// - Parameters:
///   - samples: The transcribed audio; its count is converted to milliseconds by dividing by 16_000.
///   - text: The resulting transcript; only its trimmed character count is logged.
///   - startedAt: Start time in seconds since 1970, compared against the current `Date()`.
///   - usedFallback: Logged as the `fallback` field.
///   - source: Logged as the trailing `source` field; defaults to the "full" tag.
private func logFinalBenchmark(
    samples: [Float],
    text: String,
    startedAt: TimeInterval,
    usedFallback: Bool,
    source: String = " full "
) {
    let now = Date().timeIntervalSince1970
    let elapsedMs = Int(((now - startedAt) * 1000).rounded())
    let audioMs = Int((Double(samples.count) / 16_000.0 * 1000).rounded())

    // Real-time factor: processing time over audio duration, or 0 when there is no audio.
    let rtf: Double
    if audioMs > 0 {
        rtf = Double(elapsedMs) / Double(audioMs)
    } else {
        rtf = 0
    }

    let trimmedCharacterCount = text.trimmingCharacters(in: .whitespacesAndNewlines).count
    let rtfField = String(format: " %.3f ", rtf)
    let finalizationMode = SettingsStore.shared.parakeetFinalizationMode.rawValue
    DebugLogger.shared.info(
        """
        ASR_BENCH provider_final_done samples= \(samples.count) audioMs= \(audioMs) \
        elapsedMs= \(elapsedMs) textChars= \(trimmedCharacterCount) \
        rtf= \(rtfField) fallback= \(usedFallback) finalizationMode= \(finalizationMode) source= \(source)
        """,
        source: " ASRBenchmark "
    )
}
356+
199357 func modelsExistOnDisk( ) -> Bool {
200358 let baseCacheDir = AsrModels . defaultCacheDirectory ( ) . deletingLastPathComponent ( )
201359 let selectedModel = self . modelOverride ?? SettingsStore . shared. selectedSpeechModel
@@ -332,5 +490,11 @@ final class FluidAudioProvider: TranscriptionProvider {
/// Reports no boosted terms: this implementation ignores both `text` and `limit`.
///
/// - Parameters:
///   - text: Unused.
///   - limit: Unused; defaults to `2`.
/// - Returns: Always an empty array.
func detectBoostedTerms(in text: String, limit: Int = 2) -> [String] {
    return [String]()
}
493+
/// Does nothing in this implementation.
func resetStreamingPreviewCache() {
    // Intentionally empty.
}
495+
/// Never supplies a cached preview: this implementation ignores `samples`.
///
/// - Parameter samples: Unused.
/// - Returns: Always `nil`.
func transcribeCachedStreamingPreviewIfAvailable(_ samples: [Float]) async -> ASRTranscriptionResult? {
    return nil
}
335499}
336500#endif
0 commit comments