1- use std:: { env, time:: Duration } ;
1+ use std:: { env, path :: Path , time:: Duration } ;
22
3- use anyhow:: { Context , Result } ;
3+ use anyhow:: { Context , Result , bail } ;
44use cpal:: traits:: { DeviceTrait , HostTrait , StreamTrait } ;
55use tokio:: {
66 select,
77 sync:: mpsc:: { channel, unbounded_channel} ,
88} ;
99
10- use context_switch:: { InputModality , OutputModality , services:: AzureTranscribe } ;
10+ use context_switch:: { AudioConsumer , InputModality , OutputModality , services:: AzureTranscribe } ;
1111use context_switch_core:: {
1212 AudioFormat , AudioFrame , audio,
1313 conversation:: { Conversation , Input } ,
1414 service:: Service ,
1515} ;
1616
17+ const LANGUAGE : & str = "de-DE" ;
18+
1719#[ tokio:: main]
1820async fn main ( ) -> Result < ( ) > {
1921 dotenvy:: dotenv_override ( ) ?;
2022 tracing_subscriber:: fmt:: init ( ) ;
2123
24+ let mut args = env:: args ( ) ;
25+ match args. len ( ) {
26+ 1 => recognize_from_microphone ( ) . await ?,
27+ 2 => recognize_from_wav ( Path :: new ( & args. nth ( 1 ) . unwrap ( ) ) ) . await ?,
28+ _ => bail ! ( "Invalid number of arguments, expect zero or one" ) ,
29+ }
30+ Ok ( ( ) )
31+ }
32+
33+ async fn recognize_from_wav ( file : & Path ) -> Result < ( ) > {
34+ // For now we always convert to 16khz single channel (this is what we use internally for
35+ // testing).
36+ let format = AudioFormat {
37+ channels : 1 ,
38+ sample_rate : 16000 ,
39+ } ;
40+
41+ let frames = playback:: audio_file_to_frames ( file, format) ?;
42+ if frames. is_empty ( ) {
43+ bail ! ( "No frames in the audio file" )
44+ }
45+
46+ let ( producer, input_consumer) = format. new_channel ( ) ;
47+
48+ for frame in frames {
49+ producer. produce ( frame) ?;
50+ }
51+
52+ recognize ( format, input_consumer) . await
53+ }
54+
55+ async fn recognize_from_microphone ( ) -> Result < ( ) > {
2256 let host = cpal:: default_host ( ) ;
2357 let device = host
2458 . default_input_device ( )
@@ -33,7 +67,7 @@ async fn main() -> Result<()> {
3367 let sample_rate = config. sample_rate ( ) ;
3468 let format = AudioFormat :: new ( channels, sample_rate. 0 ) ;
3569
36- let ( producer, mut input_consumer) = format. new_channel ( ) ;
70+ let ( producer, input_consumer) = format. new_channel ( ) ;
3771
3872 // Create and run the input stream
3973 let stream = device
@@ -56,19 +90,23 @@ async fn main() -> Result<()> {
5690
5791 stream. play ( ) . expect ( "Failed to play stream" ) ;
5892
59- let language = "de-DE" ;
93+ recognize ( format, input_consumer) . await
94+ }
6095
96+ async fn recognize ( format : AudioFormat , mut input_consumer : AudioConsumer ) -> Result < ( ) > {
6197 // TODO: clarify how to access configurations.
6298 let params = azure:: transcribe:: Params {
6399 host : None ,
64100 region : Some ( env:: var ( "AZURE_REGION" ) . expect ( "AZURE_REGION undefined" ) ) ,
65101 subscription_key : env:: var ( "AZURE_SUBSCRIPTION_KEY" )
66102 . expect ( "AZURE_SUBSCRIPTION_KEY undefined" ) ,
67- language : language. into ( ) ,
103+ language : LANGUAGE . into ( ) ,
104+ speech_gate : false ,
68105 } ;
69106
70107 let ( output_producer, mut output_consumer) = unbounded_channel ( ) ;
71- let ( conv_input_producer, conv_input_consumer) = channel ( 32 ) ;
108+ // For now this is more or less unbounded, because we push complete audio files for recognition.
109+ let ( conv_input_producer, conv_input_consumer) = channel ( 16384 ) ;
72110
73111 let azure = AzureTranscribe ;
74112 let mut conversation = azure. conversation (
0 commit comments