Skip to content

Commit fdcbab8

Browse files
committed
Add ExecuWhisper macOS dictation app with LFM2.5 smart formatter
Introduce ExecuWhisper, a Superwhisper-style on-device dictation app backed by ExecuTorch. Audio is captured locally with AVAudioEngine, transcribed by Parakeet TDT (Metal backend), and optionally rewritten by LFM2.5-350M (MLX backend) before paste/save. Both helpers run as warm subprocesses over a JSONL stdin/stdout protocol so first paste latency stays low across consecutive dictations.

App features:
- Global Ctrl+Space (configurable) overlay dictation that pastes the formatted text back into the active app via a stable Accessibility paste helper installed under Application Support, so Xcode rebuilds do not invalidate the granted permission.
- Single smart formatting prompt with safety net: rewrites dictation into final text, never answers spoken questions, falls back to the raw transcript when the formatter output is suspicious (length-ratio guard, prompt-echo guard, "Mode:" / "Sure" / "Here is" guards).
- Auto-preload of the Parakeet helper on launch and after foreground health checks, with a toolbar spinner while warming.
- Replacements (case-aware, longest-match-first, word-boundary aware).
- Session history with rename, pinning, recency grouping, search, and export to txt / json / srt.
- Built-in microphone selection, silence detection, and dictation shortcut recorder.

Build / packaging:
- XcodeGen project (project.yml) with hardened runtime, deployment target macOS 14.0, dead-code-stripping, explicit ENABLE_USER_SCRIPT_SANDBOXING=NO so the post-compile script can bundle the parakeet/lfm25 helpers and mlx.metallib next to the app.
- scripts/build.sh for lightweight or --bundle-models releases.
- scripts/create_dmg.sh validates required helpers and the paste helper sub-app, then emits a verified UDZO DMG.
- ExecuWhisper Paste Helper.app sub-bundle gives macOS a stable bundle identifier (org.pytorch.executorch.ExecuWhisper.PasteHelper) for Accessibility, so we are not asking for a new grant per build.

Models:
- First-launch download from younghan-meta/Parakeet-TDT-ExecuTorch-Metal and younghan-meta/LFM2.5-ExecuTorch-MLX into Application Support.
- Optional --bundle-models build for fully offline distribution.

Tests:
- 68 Swift Testing cases covering replacements, prompt safety, formatter fallback paths (question-answering, prompt echo, metadata echo), preload state machine, session export, and helper protocol wire format.

Made-with: Cursor
1 parent 56f5fbf commit fdcbab8

64 files changed

Lines changed: 10016 additions & 0 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

ExecuWhisper/.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
ExecuWhisper.xcodeproj/
2+
docs/superpowers/
3+
4+
# Local-only dictation samples and prompt-quality corpus.
5+
test_audio/
6+
evaluation/
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
3+
<plist version="1.0">
4+
<dict>
5+
<key>com.apple.security.cs.disable-library-validation</key>
6+
<true/>
7+
<key>com.apple.security.device.audio-input</key>
8+
<true/>
9+
<key>com.apple.security.network.client</key>
10+
<true/>
11+
</dict>
12+
</plist>
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
import AppKit
10+
import SwiftUI
11+
12+
/// Application entry point for ExecuWhisper.
///
/// Builds the shared model objects once in `init()` and hands them to the main
/// window and the Settings scene through the SwiftUI environment. Also defines
/// the "Transcription" and "Dictation" menus.
@main
struct ExecuWhisperApp: App {
    // All five models are created exactly once in `init()`. They are declared
    // without default values so that no throwaway instance is constructed and
    // then immediately replaced by the `State(initialValue:)` assignments.
    @State private var preferences: Preferences
    @State private var downloader: ModelDownloader
    @State private var replacementStore: ReplacementStore
    @State private var store: TranscriptStore
    @State private var dictationManager: DictationManager

    /// Wires the object graph: preferences, downloader and replacements feed
    /// the text pipeline and transcript store, which in turn back the
    /// dictation manager.
    init() {
        let prefs = Preferences()
        let downloader = ModelDownloader()
        let replacementStore = ReplacementStore()
        let formatterBridge = FormatterBridge()
        // The trailing closure reads the formatter paths from `prefs` each
        // time the pipeline invokes it, rather than snapshotting them here.
        let textPipeline = TextPipeline(
            replacementStore: replacementStore,
            formatterBridge: formatterBridge
        ) {
            TextPipeline.FormatterPaths(
                runnerPath: prefs.formatterRunnerPath,
                modelPath: prefs.formatterModelPath,
                tokenizerPath: prefs.formatterTokenizerPath,
                tokenizerConfigPath: prefs.formatterTokenizerConfigPath
            )
        }
        let store = TranscriptStore(
            preferences: prefs,
            downloader: downloader,
            textPipeline: textPipeline
        )
        let dictationManager = DictationManager(store: store, preferences: prefs)

        // Seed the @State wrappers with the instances that were wired together
        // above, so the views observe the same objects the store and manager use.
        _preferences = State(initialValue: prefs)
        _downloader = State(initialValue: downloader)
        _replacementStore = State(initialValue: replacementStore)
        _store = State(initialValue: store)
        _dictationManager = State(initialValue: dictationManager)
    }

    var body: some Scene {
        WindowGroup {
            ContentView()
                .environment(store)
                .environment(preferences)
                .environment(downloader)
                .environment(replacementStore)
                .environment(dictationManager)
                .frame(minWidth: 700, minHeight: 460)
                // Re-run the health check every time the app becomes active.
                .onReceive(NotificationCenter.default.publisher(for: NSApplication.didBecomeActiveNotification)) { _ in
                    Task { await store.runHealthCheck() }
                }
        }
        .defaultSize(width: 960, height: 640)
        .windowToolbarStyle(.unified)
        .commands {
            // Replace the standard "New" menu group with nothing.
            CommandGroup(replacing: .newItem) {}

            CommandMenu("Transcription") {
                // Only one of the record/stop buttons exists at a time, so both
                // can share the same Cmd+Shift+R shortcut.
                switch store.sessionState {
                case .idle:
                    Button("Start Recording") {
                        Task { await store.startRecording() }
                    }
                    .keyboardShortcut("R", modifiers: [.command, .shift])
                    .disabled(!store.isModelReady)

                case .recording:
                    Button("Stop and Transcribe") {
                        Task { await store.stopRecordingAndTranscribe() }
                    }
                    .keyboardShortcut("R", modifiers: [.command, .shift])

                case .transcribing:
                    // Disabled placeholder shown while transcription is running.
                    Button("Transcribing...") {}
                        .disabled(true)
                }

                Button("Import Audio...") {
                    store.importAudioFileWithPanel()
                }
                .disabled(store.hasActiveSession || downloader.isDownloading)

                if store.healthResult?.shouldOfferModelDownload == true && !downloader.isDownloading {
                    Divider()
                    Button("Download Model") {
                        Task { await store.downloadModel() }
                    }
                }

                // Preload / unload controls are shown only when resources are
                // ready and no session is active.
                if store.resourcesReady && !store.hasActiveSession {
                    Divider()
                    switch store.helperState {
                    case .unloaded:
                        Button("Preload Model") {
                            Task { await store.preloadModel() }
                        }
                        .keyboardShortcut("L", modifiers: [.command, .shift])

                    case .loading:
                        Button("Warming Model...") {}
                            .disabled(true)

                    case .warm:
                        Button("Unload Model") {
                            Task { await store.unloadModel() }
                        }
                        .keyboardShortcut("U", modifiers: [.command, .shift])

                    case .failed:
                        Button("Retry Preload") {
                            Task { await store.preloadModel() }
                        }
                    }
                }

                Divider()

                Button("Copy Transcript") {
                    let text = currentTranscript
                    guard !text.isEmpty else { return }
                    NSPasteboard.general.clearContents()
                    NSPasteboard.general.setString(text, forType: .string)
                }
                .keyboardShortcut("C", modifiers: [.command, .shift])
                .disabled(currentTranscript.isEmpty)
            }

            CommandMenu("Dictation") {
                Button(dictationManager.isListening ? "Stop Dictation" : "Start Dictation") {
                    Task { await dictationManager.toggle() }
                }
                .disabled(store.isTranscribing)
            }
        }

        Settings {
            SettingsView(usesFixedWindowSize: true)
                .environment(preferences)
                .environment(dictationManager)
        }
    }

    /// Text used by "Copy Transcript": the live transcript while a session is
    /// active, otherwise the transcript of the selected session, or an empty
    /// string when nothing is selected or the selection cannot be found.
    private var currentTranscript: String {
        if store.hasActiveSession {
            return store.liveTranscript
        }
        guard let id = store.selectedSessionID else { return "" }
        return store.sessions.first(where: { $0.id == id })?.transcript ?? ""
    }
}
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
3+
<plist version="1.0">
4+
<dict>
5+
<key>NSMicrophoneUsageDescription</key>
6+
<string>ExecuWhisper needs microphone access to record audio for on-device transcription.</string>
7+
</dict>
8+
</plist>
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
import AppKit
10+
import Carbon.HIToolbox
11+
import Foundation
12+
13+
/// A keyboard shortcut for dictation: a key code plus Carbon modifier bits.
///
/// Also stores a human-readable name for the key so the shortcut can be
/// rendered as text via `displayString`.
struct DictationShortcut: Codable, Equatable, Sendable {
    /// Virtual key code of the non-modifier key (an `NSEvent.keyCode` or `kVK_*` value).
    var keyCode: UInt32
    /// Bitmask built from the Carbon `controlKey`, `optionKey`, `shiftKey` and `cmdKey` constants.
    var carbonModifiers: UInt32
    /// Name of the key without modifiers, e.g. "Space" or "K".
    var keyDisplay: String

    /// Control + Space preset.
    static let controlSpace = DictationShortcut(
        keyCode: UInt32(kVK_Space),
        carbonModifiers: UInt32(controlKey),
        keyDisplay: "Space"
    )

    /// Creates a shortcut from explicit values.
    ///
    /// Declared explicitly because the failable `init?(event:)` below
    /// suppresses the synthesized memberwise initializer.
    init(keyCode: UInt32, carbonModifiers: UInt32, keyDisplay: String) {
        self.keyCode = keyCode
        self.carbonModifiers = carbonModifiers
        self.keyDisplay = keyDisplay
    }

    /// Creates a shortcut from a key event.
    ///
    /// Returns `nil` when the event carries none of control/option/shift/command,
    /// or when no display name can be derived for the key.
    init?(event: NSEvent) {
        let carbonModifiers = Self.carbonModifiers(from: event.modifierFlags)
        guard carbonModifiers != 0 else { return nil }
        guard let keyDisplay = Self.keyDisplay(for: event) else { return nil }
        self.init(
            keyCode: UInt32(event.keyCode),
            carbonModifiers: carbonModifiers,
            keyDisplay: keyDisplay
        )
    }

    /// The shortcut rendered as text: modifier glyphs in the order control,
    /// option, shift, command, followed by `keyDisplay` (e.g. "⌃Space").
    var displayString: String {
        // Glyphs are written as Unicode escapes so they cannot be silently
        // stripped by tools that mishandle non-ASCII source text.
        var value = ""
        if carbonModifiers & UInt32(controlKey) != 0 {
            value += "\u{2303}" // ⌃ control
        }
        if carbonModifiers & UInt32(optionKey) != 0 {
            value += "\u{2325}" // ⌥ option
        }
        if carbonModifiers & UInt32(shiftKey) != 0 {
            value += "\u{21E7}" // ⇧ shift
        }
        if carbonModifiers & UInt32(cmdKey) != 0 {
            value += "\u{2318}" // ⌘ command
        }
        return value + keyDisplay
    }

    /// Converts AppKit modifier flags into a Carbon modifier bitmask.
    ///
    /// Only control, option, shift and command are translated; every other
    /// flag is ignored.
    ///
    /// - Parameter flags: Modifier flags, typically from `NSEvent.modifierFlags`.
    /// - Returns: The OR of the matching Carbon constants, or `0` if none are set.
    static func carbonModifiers(from flags: NSEvent.ModifierFlags) -> UInt32 {
        // Mask off device-dependent bits before testing individual modifiers.
        let sanitized = flags.intersection(.deviceIndependentFlagsMask)
        var value: UInt32 = 0
        if sanitized.contains(.control) {
            value |= UInt32(controlKey)
        }
        if sanitized.contains(.option) {
            value |= UInt32(optionKey)
        }
        if sanitized.contains(.shift) {
            value |= UInt32(shiftKey)
        }
        if sanitized.contains(.command) {
            value |= UInt32(cmdKey)
        }
        return value
    }

    /// Derives a display name for the key in `event`.
    ///
    /// A fixed set of special keys gets a fixed name. Any other key uses its
    /// uppercased `charactersIgnoringModifiers`, with surrounding whitespace
    /// trimmed.
    ///
    /// - Returns: The display name, or `nil` when the event yields no
    ///   non-whitespace characters.
    private static func keyDisplay(for event: NSEvent) -> String? {
        switch Int(event.keyCode) {
        case kVK_Space:
            return "Space"
        case kVK_Return:
            return "Return"
        case kVK_Tab:
            return "Tab"
        case kVK_Delete:
            return "Delete"
        case kVK_ForwardDelete:
            return "Fn-Delete"
        case kVK_Escape:
            return "Esc"
        case kVK_LeftArrow:
            return "Left"
        case kVK_RightArrow:
            return "Right"
        case kVK_UpArrow:
            return "Up"
        case kVK_DownArrow:
            return "Down"
        default:
            // Not a special key: fall through to the character-based name.
            break
        }

        guard let characters = event.charactersIgnoringModifiers?
            .trimmingCharacters(in: .whitespacesAndNewlines),
            !characters.isEmpty
        else {
            return nil
        }
        return characters.uppercased()
    }
}

0 commit comments

Comments
 (0)