Skip to content

Commit cf8b5bf

Browse files
authored
Merge pull request #339 from altic-dev/B/model-benchmarking
Speed up fast dictation finalization
2 parents 9ec89ed + 9d6c063 commit cf8b5bf

7 files changed

Lines changed: 421 additions & 47 deletions

File tree

Fluid.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Package.resolved

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import Foundation
2+
3+
/// Strategy used to finalize a Parakeet dictation.
///
/// The enum is string-backed, so a mode can be stored and restored through its raw value.
enum ParakeetFinalizationMode: String, CaseIterable, Codable, Identifiable {
    case stableFullFinal
    case tokenTimedChunkMerge

    /// Identity for `Identifiable`; identical to the raw value.
    var id: String { self.rawValue }

    /// Short label for this mode.
    var displayName: String {
        let label: String
        switch self {
        case .stableFullFinal: label = "Standard"
        case .tokenTimedChunkMerge: label = "Fast"
        }
        return label
    }

    /// One-line description of the trade-off this mode makes.
    var detailText: String {
        let detail: String
        switch self {
        case .stableFullFinal: detail = "Most reliable."
        case .tokenTimedChunkMerge: detail = "Faster, but maybe inaccurate."
        }
        return detail
    }
}

Sources/Fluid/Persistence/SettingsStore.swift

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1304,6 +1304,16 @@ final class SettingsStore: ObservableObject {
13041304
}
13051305
}
13061306

1307+
/// Finalization mode for Parakeet dictation, stored in `defaults` as the mode's raw value.
///
/// Reading falls back to `.stableFullFinal` when no string is stored or the stored
/// string is not a known raw value. Writing sends `objectWillChange` before the
/// new raw value is saved.
var parakeetFinalizationMode: ParakeetFinalizationMode {
    get {
        guard
            let storedValue = self.defaults.string(forKey: Keys.parakeetFinalizationMode),
            let storedMode = ParakeetFinalizationMode(rawValue: storedValue)
        else {
            return .stableFullFinal
        }
        return storedMode
    }
    set {
        self.objectWillChange.send()
        let rawValue = newValue.rawValue
        self.defaults.set(rawValue, forKey: Keys.parakeetFinalizationMode)
    }
}
1316+
13071317
var copyTranscriptionToClipboard: Bool {
13081318
get { self.defaults.bool(forKey: Keys.copyTranscriptionToClipboard) }
13091319
set { self.defaults.set(newValue, forKey: Keys.copyTranscriptionToClipboard) }
@@ -3616,6 +3626,7 @@ private extension SettingsStore {
36163626
static let hotkeyMode = "HotkeyMode"
36173627
static let enableStreamingPreview = "EnableStreamingPreview"
36183628
static let enableAIStreaming = "EnableAIStreaming"
3629+
static let parakeetFinalizationMode = "ParakeetFinalizationMode"
36193630
static let copyTranscriptionToClipboard = "CopyTranscriptionToClipboard"
36203631
static let textInsertionMode = "TextInsertionMode"
36213632
static let autoUpdateCheckEnabled = "AutoUpdateCheckEnabled"

Sources/Fluid/Services/ASRService.swift

Lines changed: 175 additions & 43 deletions
Large diffs are not rendered by default.

Sources/Fluid/Services/FluidAudioProvider.swift

Lines changed: 166 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,12 @@ final class FluidAudioProvider: TranscriptionProvider {
2727

2828
private var streamingAsrManager: AsrManager?
2929
private var finalAsrManager: AsrManager?
30+
private var latestStreamingPreviewText: String = ""
31+
private var latestStreamingPreviewSampleCount: Int = 0
32+
private var latestStreamingPreviewFinishedAt: TimeInterval?
33+
private let fastPreviewMinimumSamples = 32_000
34+
private let fastPreviewTailSilenceRMS: Float = 0.002
35+
private let fastPreviewTailAudioToleranceMs = 300
3036
private(set) var isReady: Bool = false
3137
private(set) var isWordBoostingActive: Bool = false
3238
private(set) var boostedVocabularyTermsCount: Int = 0
@@ -142,6 +148,9 @@ final class FluidAudioProvider: TranscriptionProvider {
142148

143149
self.streamingAsrManager = streamingManager
144150
self.finalAsrManager = finalManager
151+
self.latestStreamingPreviewText = ""
152+
self.latestStreamingPreviewSampleCount = 0
153+
self.latestStreamingPreviewFinishedAt = nil
145154
await progressSink.emit(0.98)
146155

147156
self.isReady = true
@@ -156,16 +165,39 @@ final class FluidAudioProvider: TranscriptionProvider {
156165
try await self.transcribeFinal(samples)
157166
}
158167

168+
/// Clears the cached streaming preview: its completion timestamp, the sample
/// count it covered, and its text.
func resetStreamingPreviewCache() {
    self.latestStreamingPreviewFinishedAt = nil
    self.latestStreamingPreviewSampleCount = 0
    self.latestStreamingPreviewText = ""
}
173+
159174
/// Transcribes `samples` with the streaming manager and records the result as the latest preview.
///
/// The trimmed transcript, the number of samples it covered and the completion time are
/// stored in the `latestStreamingPreview*` properties, and one `ASR_BENCH` line is logged
/// per call.
///
/// - Parameter samples: Audio samples to transcribe.
///   NOTE(review): the benchmark math divides by 16_000, i.e. assumes 16 kHz input — confirm.
/// - Returns: The manager's text (untrimmed) and confidence.
/// - Throws: An `NSError` in domain `FluidAudioProvider` (code -1) when the streaming manager
///   has not been set, or any error thrown by the manager's `transcribe`.
func transcribeStreaming(_ samples: [Float]) async throws -> ASRTranscriptionResult {
    guard let fullPreviewManager = self.streamingAsrManager else {
        throw NSError(
            domain: "FluidAudioProvider",
            code: -1,
            userInfo: [NSLocalizedDescriptionKey: "ASR manager not initialized"]
        )
    }

    let startedAt = Date().timeIntervalSince1970
    let result = try await fullPreviewManager.transcribe(samples, source: AudioSource.microphone)
    // Read the clock once so the cached completion time and the logged elapsed time
    // describe the same instant.
    let finishedAt = Date().timeIntervalSince1970
    let text = result.text.trimmingCharacters(in: .whitespacesAndNewlines)

    self.latestStreamingPreviewText = text
    self.latestStreamingPreviewSampleCount = samples.count
    self.latestStreamingPreviewFinishedAt = finishedAt

    let elapsedMs = Int(((finishedAt - startedAt) * 1000).rounded())
    let audioMs = Int((Double(samples.count) / 16_000.0 * 1000).rounded())
    // Real-time factor: processing time relative to audio duration (0 for empty audio).
    let rtf = audioMs > 0 ? Double(elapsedMs) / Double(audioMs) : 0
    let finalizationMode = SettingsStore.shared.parakeetFinalizationMode.rawValue
    // `text` is already trimmed, so its count is the trimmed character count.
    DebugLogger.shared.info(
        """
        ASR_BENCH provider_streaming_done samples=\(samples.count) audioMs=\(audioMs) \
        elapsedMs=\(elapsedMs) textChars=\(text.count) \
        rtf=\(String(format: "%.3f", rtf)) finalizationMode=\(finalizationMode)
        """,
        source: "ASRBenchmark"
    )
    return ASRTranscriptionResult(text: result.text, confidence: result.confidence)
}
171203

@@ -178,10 +210,19 @@ final class FluidAudioProvider: TranscriptionProvider {
178210
)
179211
}
180212

213+
let startedAt = Date().timeIntervalSince1970
214+
if SettingsStore.shared.parakeetFinalizationMode == .tokenTimedChunkMerge {
215+
if let previewResult = await self.cachedStreamingPreviewResult(for: samples, startedAt: startedAt) {
216+
return previewResult
217+
}
218+
}
219+
181220
// If the boosted final manager fails, fall back to the unboosted streaming
182221
// manager so the user still gets a transcription (just without CTC rescoring).
183222
do {
223+
let startedAt = Date().timeIntervalSince1970
184224
let result = try await manager.transcribe(samples, source: AudioSource.microphone)
225+
self.logFinalBenchmark(samples: samples, text: result.text, startedAt: startedAt, usedFallback: false)
185226
return ASRTranscriptionResult(text: result.text, confidence: result.confidence)
186227
} catch {
187228
guard let fallback = self.streamingAsrManager, fallback !== manager else {
@@ -191,11 +232,128 @@ final class FluidAudioProvider: TranscriptionProvider {
191232
"FluidAudioProvider: Boosted final transcription failed (\(error.localizedDescription)), retrying without vocab boost",
192233
source: "FluidAudioProvider"
193234
)
235+
let startedAt = Date().timeIntervalSince1970
194236
let result = try await fallback.transcribe(samples, source: AudioSource.microphone)
237+
self.logFinalBenchmark(samples: samples, text: result.text, startedAt: startedAt, usedFallback: true)
195238
return ASRTranscriptionResult(text: result.text, confidence: result.confidence)
196239
}
197240
}
198241

242+
/// Offers the cached streaming preview as a final result when the `.tokenTimedChunkMerge`
/// finalization mode is selected.
///
/// - Parameter samples: Audio samples to check the cached preview against.
/// - Returns: `nil` when another finalization mode is selected or when
///   `cachedStreamingPreviewResult` rejects the cached preview.
func transcribeCachedStreamingPreviewIfAvailable(_ samples: [Float]) async -> ASRTranscriptionResult? {
    let selectedMode = SettingsStore.shared.parakeetFinalizationMode
    if selectedMode != .tokenTimedChunkMerge {
        return nil
    }
    let requestStartedAt = Date().timeIntervalSince1970
    return await self.cachedStreamingPreviewResult(for: samples, startedAt: requestStartedAt)
}
249+
250+
/// Decides whether the most recent streaming preview can be returned in place of a
/// full final transcription of `samples`.
///
/// Checks run in this order and the first failure is logged as the miss reason:
/// non-empty cached text, a recorded preview, age of at most 3000 ms, at least
/// `fastPreviewMinimumSamples` samples, coverage of at least 0.88, a tail of at most
/// 1800 ms, and a tail that is empty, no longer than `fastPreviewTailAudioToleranceMs`,
/// or no louder than `fastPreviewTailSilenceRMS`.
///
/// - Parameters:
///   - samples: Audio being finalized; samples past the preview's sample count form the "tail".
///   - startedAt: Epoch seconds at which finalization began, forwarded to the benchmark log.
/// - Returns: The cached text with a fixed confidence of 0.95, or `nil` on a miss.
private func cachedStreamingPreviewResult(for samples: [Float], startedAt: TimeInterval) async -> ASRTranscriptionResult? {
    let cachedText = self.latestStreamingPreviewText.trimmingCharacters(in: .whitespacesAndNewlines)
    let totalSamples = samples.count
    let coveredSamples = min(self.latestStreamingPreviewSampleCount, totalSamples)
    let tailSamples = max(0, totalSamples - coveredSamples)
    let tailMs = Int((Double(tailSamples) / 16_000.0 * 1000).rounded())
    let tailRMS = self.rms(samples: samples, startIndex: coveredSamples)
    let coverage = totalSamples > 0 ? Double(coveredSamples) / Double(totalSamples) : 0
    // With no completion timestamp the age is reported as Int.max, which fails the age check.
    let ageMs = self.latestStreamingPreviewFinishedAt.map { finishedAt in
        Int(((Date().timeIntervalSince1970 - finishedAt) * 1000).rounded())
    } ?? Int.max

    let coverageText = String(format: "%.3f", coverage)
    let tailRMSText = String(format: "%.5f", tailRMS)

    DebugLogger.shared.info(
        """
        ASR_BENCH provider_fast_preview_check finalSamples=\(totalSamples) previewSamples=\(coveredSamples) \
        tailMs=\(tailMs) coverage=\(coverageText) ageMs=\(ageMs) \
        tailRMS=\(tailRMSText) textChars=\(cachedText.count) wordBoosting=\(self.isWordBoostingActive)
        """,
        source: "ASRBenchmark"
    )

    let hasRecordedPreview = self.latestStreamingPreviewFinishedAt != nil && self.latestStreamingPreviewSampleCount > 0
    // The tail is fine when there is none, when it is short enough to ignore, or when it is silent.
    let tailIsAcceptable = tailSamples == 0
        || tailMs <= self.fastPreviewTailAudioToleranceMs
        || tailRMS <= self.fastPreviewTailSilenceRMS

    // First failing check wins; the order determines which reason is reported.
    let missReason: String?
    if cachedText.isEmpty {
        missReason = "empty"
    } else if !hasRecordedPreview {
        missReason = "missing_preview"
    } else if ageMs > 3000 {
        missReason = "stale"
    } else if totalSamples < self.fastPreviewMinimumSamples {
        missReason = "short_recording"
    } else if coverage < 0.88 {
        missReason = "low_coverage"
    } else if tailMs > 1800 {
        missReason = "large_tail"
    } else if !tailIsAcceptable {
        missReason = "tail_has_audio"
    } else {
        missReason = nil
    }

    if let missReason {
        self.logFastPreviewMiss(reason: missReason, tailMs: tailMs, coverage: coverage, ageMs: ageMs, tailRMS: tailRMS)
        return nil
    }

    DebugLogger.shared.info(
        """
        ASR_BENCH provider_fast_preview_hit tailMs=\(tailMs) coverage=\(coverageText) \
        ageMs=\(ageMs) tailRMS=\(tailRMSText) textChars=\(cachedText.count)
        """,
        source: "ASRBenchmark"
    )
    self.logFinalBenchmark(samples: samples, text: cachedText, startedAt: startedAt, usedFallback: false, source: "livePreview")
    return ASRTranscriptionResult(text: cachedText, confidence: 0.95)
}
313+
314+
/// Root-mean-square level of `samples` from `startIndex` through the end of the array.
///
/// - Parameters:
///   - samples: Audio samples to measure.
///   - startIndex: First index to include. Values below zero are clamped to zero.
/// - Returns: The RMS of the selected range, or 0 when the range is empty.
private func rms(samples: [Float], startIndex: Int) -> Float {
    // Clamp so a negative index cannot trap when slicing.
    let lowerBound = max(0, startIndex)
    guard lowerBound < samples.count else { return 0 }

    let tail = samples[lowerBound...]
    // Accumulate in Double and take the element count from the slice: a Float running
    // sum sheds low-order bits on long ranges, and a Float counter stops advancing
    // once it reaches 2^24.
    let sumOfSquares = tail.reduce(0.0) { partial, sample in
        partial + Double(sample) * Double(sample)
    }
    return Float((sumOfSquares / Double(tail.count)).squareRoot())
}
325+
326+
/// Logs one `ASR_BENCH provider_fast_preview_miss` line with the rejection reason and
/// the measurements that were checked.
///
/// - Parameters:
///   - reason: Short tag naming the check that failed.
///   - tailMs: Tail length in milliseconds.
///   - coverage: Coverage ratio, logged with three decimals.
///   - ageMs: Preview age in milliseconds.
///   - tailRMS: Tail RMS level, logged with five decimals.
private func logFastPreviewMiss(reason: String, tailMs: Int, coverage: Double, ageMs: Int, tailRMS: Float) {
    let coverageText = String(format: "%.3f", coverage)
    let tailRMSText = String(format: "%.5f", tailRMS)
    let message = """
    ASR_BENCH provider_fast_preview_miss reason=\(reason) tailMs=\(tailMs) \
    coverage=\(coverageText) ageMs=\(ageMs) tailRMS=\(tailRMSText)
    """
    DebugLogger.shared.info(message, source: "ASRBenchmark")
}
335+
336+
/// Logs one `ASR_BENCH provider_final_done` line for a finished final transcription.
///
/// - Parameters:
///   - samples: Audio that was finalized; its count is used for the audio duration.
///     NOTE(review): the duration math divides by 16_000, i.e. assumes 16 kHz — confirm.
///   - text: Resulting transcript; only its trimmed character count is logged.
///   - startedAt: Epoch seconds at which the work began.
///   - usedFallback: Logged as `fallback=`.
///   - source: Logged as `source=`; defaults to `"full"`.
private func logFinalBenchmark(
    samples: [Float],
    text: String,
    startedAt: TimeInterval,
    usedFallback: Bool,
    source: String = "full"
) {
    // Sample the clock first so the remaining bookkeeping is not counted as elapsed time.
    let now = Date().timeIntervalSince1970
    let elapsedMs = Int(((now - startedAt) * 1000).rounded())
    let audioMs = Int((Double(samples.count) / 16_000.0 * 1000).rounded())
    // Real-time factor: processing time relative to audio duration (0 for empty audio).
    let rtf = audioMs > 0 ? Double(elapsedMs) / Double(audioMs) : 0
    let rtfText = String(format: "%.3f", rtf)
    let trimmedCharCount = text.trimmingCharacters(in: .whitespacesAndNewlines).count
    let finalizationMode = SettingsStore.shared.parakeetFinalizationMode.rawValue
    let message = """
    ASR_BENCH provider_final_done samples=\(samples.count) audioMs=\(audioMs) \
    elapsedMs=\(elapsedMs) textChars=\(trimmedCharCount) \
    rtf=\(rtfText) fallback=\(usedFallback) finalizationMode=\(finalizationMode) source=\(source)
    """
    DebugLogger.shared.info(message, source: "ASRBenchmark")
}
356+
199357
func modelsExistOnDisk() -> Bool {
200358
let baseCacheDir = AsrModels.defaultCacheDirectory().deletingLastPathComponent()
201359
let selectedModel = self.modelOverride ?? SettingsStore.shared.selectedSpeechModel
@@ -332,5 +490,11 @@ final class FluidAudioProvider: TranscriptionProvider {
332490
/// Always returns an empty list; `text` and `limit` are not consulted.
func detectBoostedTerms(in text: String, limit: Int = 2) -> [String] {
    return [String]()
}
493+
494+
/// Does nothing.
/// NOTE(review): presumably the placeholder used when the real provider is compiled out
/// (an `#endif` follows this type) — confirm against the enclosing `#if`.
func resetStreamingPreviewCache() {}
495+
496+
/// Always returns `nil`; `samples` is not consulted.
func transcribeCachedStreamingPreviewIfAvailable(_ samples: [Float]) async -> ASRTranscriptionResult? {
    return nil
}
335499
}
336500
#endif

Sources/Fluid/UI/SettingsView.swift

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1385,6 +1385,47 @@ struct SettingsView: View {
13851385
}
13861386
.padding(16)
13871387
}
1388+
1389+
// Experimental Card
1390+
ThemedCard(style: .standard) {
1391+
VStack(alignment: .leading, spacing: 14) {
1392+
Label("Experimental Settings", systemImage: "exclamationmark.triangle")
1393+
.font(.headline)
1394+
.foregroundStyle(.primary)
1395+
1396+
VStack(alignment: .leading, spacing: 8) {
1397+
HStack(alignment: .center) {
1398+
VStack(alignment: .leading, spacing: 2) {
1399+
Text("Dictation Processing Speed")
1400+
.font(.body)
1401+
}
1402+
1403+
Spacer()
1404+
1405+
Picker("", selection: self.$settings.parakeetFinalizationMode) {
1406+
ForEach(ParakeetFinalizationMode.allCases) { mode in
1407+
Text(mode.displayName).tag(mode)
1408+
}
1409+
}
1410+
.pickerStyle(.menu)
1411+
.frame(width: 170, alignment: .trailing)
1412+
.disabled(self.asr.isRunning)
1413+
}
1414+
1415+
Text("Standard: most reliable. Fast: faster, but maybe inaccurate.")
1416+
.font(.caption)
1417+
.foregroundStyle(.secondary)
1418+
1419+
if self.asr.isRunning {
1420+
Text("Settings are disabled during active recording")
1421+
.font(.caption)
1422+
.foregroundStyle(.secondary)
1423+
.italic()
1424+
}
1425+
}
1426+
}
1427+
.padding(16)
1428+
}
13881429
}
13891430
.padding(16)
13901431
}

0 commit comments

Comments
 (0)