Skip to content

Commit ea38a8c

Browse files
committed
feat: add avsynth (macOS) and sapi (Windows) native TTS engines
- avsynth_engine: AVSpeechSynthesizer via objc crate, supports voice listing, rate/pitch/volume, pause/resume/stop
- sapi_engine: Windows SAPI via windows crate, supports voice listing, SSML pitch control, pause/resume/stop
- New Cargo features: avsynth (macOS), sapi (Windows)
- Both engines implement TtsEngine trait with estimated word boundaries
- Factory registers engines with cfg-conditional compilation
1 parent 35db264 commit ea38a8c

5 files changed

Lines changed: 510 additions & 1 deletion

File tree

Cargo.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ sherpaonnx = ["sherpa-onnx", "serde_json"]
1919
tungstenite = ["dep:tungstenite"]
2020
uuid = ["dep:uuid"]
2121
url = ["dep:url"]
22+
avsynth = ["objc", "objc-foundation"]
23+
sapi = ["windows"]
2224

2325
[dependencies]
2426
speech-dispatcher = { version = "0.16", optional = true }
@@ -32,6 +34,12 @@ anyhow = "1"
3234
tungstenite = { version = "0.29.0", features = ["rustls-tls-webpki-roots"], optional = true }
3335
uuid = { version = "1.23.2", features = ["v4"], optional = true }
3436
url = { version = "2.5.8", optional = true }
37+
objc = { version = "0.2", optional = true }
38+
objc-foundation = { version = "0.1", optional = true }
39+
block2 = { version = "0.6", optional = true }
40+
41+
[target.'cfg(target_os = "windows")'.dependencies]
42+
windows = { version = "0.61", features = ["Win32_Media_Speech", "Win32_System_Com", "Win32_System_Ole"], optional = true }
3543

3644
[build-dependencies]
3745
cbindgen = "0.28"

src/avsynth_engine.rs

Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
//! macOS AVSpeechSynthesizer engine via raw objc calls.
2+
3+
use crate::engine::{estimate_word_boundaries, TtsEngine};
4+
use crate::types::{TtsError, TtsResult, Voice};
5+
use objc::runtime::Object;
6+
use objc::{class, msg_send, sel, sel_impl};
7+
use std::ffi::c_void;
8+
use std::sync::{Arc, Mutex};
9+
10+
#[derive(Debug)]
11+
pub struct AvSynthEngine {
12+
synth: Arc<Mutex<Option<*mut Object>>>,
13+
voice_id: Mutex<Option<String>>,
14+
}
15+
16+
unsafe impl Send for AvSynthEngine {}
17+
unsafe impl Sync for AvSynthEngine {}
18+
19+
impl AvSynthEngine {
20+
pub fn new() -> Self {
21+
let synth = unsafe {
22+
let cls = class!(AVSpeechSynthesizer);
23+
let obj: *mut Object = msg_send![cls, alloc];
24+
let obj: *mut Object = msg_send![obj, init];
25+
if obj.is_null() {
26+
None
27+
} else {
28+
Some(obj)
29+
}
30+
};
31+
AvSynthEngine {
32+
synth: Arc::new(Mutex::new(synth)),
33+
voice_id: Mutex::new(None),
34+
}
35+
}
36+
}
37+
38+
/// Builds an `NSString` from the UTF-8 bytes of `s`.
///
/// The object is created with `alloc` followed by
/// `initWithBytes:length:encoding:`, so the caller owns the returned reference
/// and is responsible for sending it `release`.
///
/// # Safety
/// The Objective-C runtime must have the `NSString` class registered
/// (otherwise `class!` panics), and the returned pointer must be treated as an
/// Objective-C object that may be nil.
unsafe fn to_nsstring(s: &str) -> *mut Object {
    // Numeric value of `NSUTF8StringEncoding`.
    const UTF8_ENCODING: usize = 4;

    let data = s.as_ptr().cast::<c_void>();
    let allocated: *mut Object = msg_send![class!(NSString), alloc];
    let initialized: *mut Object = msg_send![allocated,
        initWithBytes: data
        length: s.len()
        encoding: UTF8_ENCODING
    ];
    initialized
}
50+
51+
unsafe fn from_nsstring(ns: *mut Object) -> String {
52+
if ns.is_null() {
53+
return String::new();
54+
}
55+
let len: usize = msg_send![ns, lengthOfBytesUsingEncoding: 4usize];
56+
if len == 0 {
57+
return String::new();
58+
}
59+
let mut buf = vec![0u8; len];
60+
let _: usize = msg_send![ns,
61+
getBytes: buf.as_mut_ptr()
62+
maxLength: len
63+
encoding: 4usize
64+
options: 1usize
65+
range: (0usize, len)
66+
remainingRange: std::ptr::null::<(usize, usize)>()
67+
];
68+
String::from_utf8_lossy(&buf).into_owned()
69+
}
70+
71+
/// Converts a speech-rate multiplier into an `AVSpeechUtterance` rate.
///
/// `AVSpeechUtterance.rate` only accepts values between
/// `AVSpeechUtteranceMinimumSpeechRate` (0.0) and
/// `AVSpeechUtteranceMaximumSpeechRate` (1.0), with
/// `AVSpeechUtteranceDefaultSpeechRate` (0.5) as the normal speed. The input is
/// first limited to the accepted multiplier range 0.1–10.0 and then scaled so
/// that 1.0 maps onto the default rate; anything faster than 2x saturates at
/// the maximum.
///
/// NOTE(review): assumes the caller's `rate` is a multiplier where 1.0 means
/// normal speed — confirm against the `TtsEngine` contract.
fn rate_to_avsynth(rate: f32) -> f32 {
    const AV_MIN_RATE: f32 = 0.0;
    const AV_DEFAULT_RATE: f32 = 0.5;
    const AV_MAX_RATE: f32 = 1.0;

    (rate.clamp(0.1, 10.0) * AV_DEFAULT_RATE).clamp(AV_MIN_RATE, AV_MAX_RATE)
}
74+
75+
/// Limits `pitch` to the range 0.5–2.0 before it is passed to
/// `setPitchMultiplier:`.
///
/// Values inside the range are returned unchanged; a NaN input is returned as
/// NaN.
fn pitch_to_avsynth(pitch: f32) -> f32 {
    const LOWEST: f32 = 0.5;
    const HIGHEST: f32 = 2.0;

    if pitch < LOWEST {
        LOWEST
    } else if pitch > HIGHEST {
        HIGHEST
    } else {
        pitch
    }
}
78+
79+
/// Limits `volume` to the range 0.0–1.0 before it is passed to `setVolume:`.
///
/// Values inside the range are returned unchanged; a NaN input is returned as
/// NaN.
fn volume_to_avsynth(volume: f32) -> f32 {
    const SILENT: f32 = 0.0;
    const FULL: f32 = 1.0;

    if volume < SILENT {
        SILENT
    } else if volume > FULL {
        FULL
    } else {
        volume
    }
}
82+
83+
impl TtsEngine for AvSynthEngine {
84+
fn speak(
85+
&self,
86+
text: &str,
87+
voice: Option<&str>,
88+
rate: f32,
89+
pitch: f32,
90+
volume: f32,
91+
_on_audio: Option<crate::engine::OnAudioCallback>,
92+
mut on_boundary: Option<crate::engine::OnBoundaryCallback>,
93+
) -> TtsResult<()> {
94+
let guard = self.synth.lock().unwrap();
95+
let synth = guard
96+
.ok_or_else(|| TtsError("AVSpeechSynthesizer not initialized".into()))?;
97+
98+
unsafe {
99+
let ns_text = to_nsstring(text);
100+
let utterance_cls = class!(AVSpeechUtterance);
101+
let u: *mut Object = msg_send![utterance_cls, alloc];
102+
let u: *mut Object = msg_send![u, initWithString: ns_text];
103+
104+
if !u.is_null() {
105+
let _: () = msg_send![u, setRate: rate_to_avsynth(rate)];
106+
let _: () = msg_send![u, setPitchMultiplier: pitch_to_avsynth(pitch)];
107+
let _: () = msg_send![u, setVolume: volume_to_avsynth(volume)];
108+
109+
let voice_to_use = voice
110+
.map(|v| v.to_string())
111+
.or_else(|| self.voice_id.lock().unwrap().clone());
112+
113+
if let Some(ref vid) = voice_to_use {
114+
let ns_vid = to_nsstring(vid);
115+
let voice_cls = class!(AVSpeechSynthesisVoice);
116+
let av_voice: *mut Object = msg_send![voice_cls, voiceWithIdentifier: ns_vid];
117+
if !av_voice.is_null() {
118+
let _: () = msg_send![u, setVoice: av_voice];
119+
}
120+
}
121+
122+
let _: () = msg_send![synth, speakUtterance: u];
123+
let _: () = msg_send![u, release];
124+
}
125+
let _: () = msg_send![ns_text, release];
126+
}
127+
128+
if let Some(cb) = on_boundary.as_mut() {
129+
let estimated = estimate_word_boundaries(text);
130+
for b in &estimated {
131+
#[allow(clippy::cast_precision_loss)]
132+
let start = b.offset as f32 / 1000.0;
133+
#[allow(clippy::cast_precision_loss)]
134+
let end = (b.offset + b.duration) as f32 / 1000.0;
135+
cb(&b.text, start, end);
136+
}
137+
}
138+
139+
Ok(())
140+
}
141+
142+
fn speak_sync(
143+
&self,
144+
text: &str,
145+
voice: Option<&str>,
146+
rate: f32,
147+
pitch: f32,
148+
volume: f32,
149+
on_audio: Option<crate::engine::OnAudioCallback>,
150+
on_boundary: Option<crate::engine::OnBoundaryCallback>,
151+
) -> TtsResult<()> {
152+
self.speak(text, voice, rate, pitch, volume, on_audio, on_boundary)
153+
}
154+
155+
fn stop(&self) -> TtsResult<()> {
156+
let guard = self.synth.lock().unwrap();
157+
if let Some(synth) = *guard {
158+
unsafe {
159+
let _: () = msg_send![synth, stopSpeakingAtBoundary: 0i32];
160+
}
161+
}
162+
Ok(())
163+
}
164+
165+
fn pause(&self) -> TtsResult<()> {
166+
let guard = self.synth.lock().unwrap();
167+
if let Some(synth) = *guard {
168+
unsafe {
169+
let _: () = msg_send![synth, pauseSpeakingAtBoundary: 0i32];
170+
}
171+
}
172+
Ok(())
173+
}
174+
175+
fn resume(&self) -> TtsResult<()> {
176+
let guard = self.synth.lock().unwrap();
177+
if let Some(synth) = *guard {
178+
unsafe {
179+
let _: () = msg_send![synth, continueSpeaking];
180+
}
181+
}
182+
Ok(())
183+
}
184+
185+
fn get_voices(&self) -> TtsResult<Vec<Voice>> {
186+
unsafe {
187+
let voice_cls = class!(AVSpeechSynthesisVoice);
188+
let voices: *mut Object = msg_send![voice_cls, speechVoices];
189+
if voices.is_null() {
190+
return Ok(vec![]);
191+
}
192+
193+
let count: usize = msg_send![voices, count];
194+
let mut result = Vec::with_capacity(count);
195+
196+
for i in 0..count {
197+
let v: *mut Object = msg_send![voices, objectAtIndex: i];
198+
if !v.is_null() {
199+
let id_ptr: *mut Object = msg_send![v, identifier];
200+
let name_ptr: *mut Object = msg_send![v, name];
201+
let lang_ptr: *mut Object = msg_send![v, language];
202+
203+
let id = from_nsstring(id_ptr);
204+
let name = from_nsstring(name_ptr);
205+
let lang = from_nsstring(lang_ptr);
206+
207+
result.push(Voice {
208+
id,
209+
name,
210+
gender: crate::types::Gender::Unknown,
211+
provider: "avsynth".to_string(),
212+
language_codes: vec![crate::types::LanguageCode {
213+
bcp47: lang.clone(),
214+
iso639_3: lang.split('-').next().unwrap_or("en").to_string(),
215+
display: lang,
216+
}],
217+
});
218+
}
219+
}
220+
Ok(result)
221+
}
222+
}
223+
224+
fn engine_id(&self) -> &'static str {
225+
"avsynth"
226+
}
227+
}
228+
229+
impl Drop for AvSynthEngine {
230+
fn drop(&mut self) {
231+
if let Ok(mut guard) = self.synth.lock() {
232+
if let Some(ptr) = guard.take() {
233+
unsafe {
234+
let _: () = msg_send![ptr, release];
235+
}
236+
}
237+
}
238+
}
239+
}

src/factory.rs

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@ use crate::cloud_engine;
1010
use crate::sherpaonnx_engine::SherpaOnnxEngine;
1111
#[cfg(feature = "system")]
1212
use crate::system_engine::SystemEngine;
13+
#[cfg(feature = "avsynth")]
14+
use crate::avsynth_engine::AvSynthEngine;
15+
#[cfg(feature = "sapi")]
16+
use crate::sapi_engine::SapiEngine;
1317

1418
/// Create an engine by its string identifier.
1519
///
@@ -22,6 +26,12 @@ pub fn create_engine(engine_id: &str, credentials_json: &str) -> Option<Box<dyn
2226
#[cfg(feature = "system")]
2327
"system" => Some(Box::new(SystemEngine::new())),
2428

29+
#[cfg(feature = "avsynth")]
30+
"avsynth" => Some(Box::new(AvSynthEngine::new())),
31+
32+
#[cfg(feature = "sapi")]
33+
"sapi" => Some(Box::new(SapiEngine::new())),
34+
2535
#[cfg(feature = "sherpaonnx")]
2636
"sherpaonnx" => Some(Box::new(SherpaOnnxEngine::new(credentials_json))),
2737

@@ -48,7 +58,23 @@ pub fn engine_list() -> Vec<EngineDescriptor> {
4858
#[cfg(feature = "system")]
4959
engines.push(EngineDescriptor {
5060
id: "system".into(),
51-
name: "System".into(),
61+
name: "System (Speech Dispatcher)".into(),
62+
needs_credentials: false,
63+
credential_keys_json: "[]".into(),
64+
});
65+
66+
#[cfg(feature = "avsynth")]
67+
engines.push(EngineDescriptor {
68+
id: "avsynth".into(),
69+
name: "macOS AVSpeechSynthesizer".into(),
70+
needs_credentials: false,
71+
credential_keys_json: "[]".into(),
72+
});
73+
74+
#[cfg(feature = "sapi")]
75+
engines.push(EngineDescriptor {
76+
id: "sapi".into(),
77+
name: "Windows SAPI".into(),
5278
needs_credentials: false,
5379
credential_keys_json: "[]".into(),
5480
});

src/lib.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ pub mod factory;
3737
mod sherpaonnx_engine;
3838
#[cfg(feature = "system")]
3939
mod system_engine;
40+
#[cfg(feature = "avsynth")]
41+
mod avsynth_engine;
42+
#[cfg(feature = "sapi")]
43+
mod sapi_engine;
4044
pub mod types;
4145

4246
use std::ffi::{CStr, CString};

0 commit comments

Comments
 (0)