Skip to content

Commit 0d1ab01

Browse files
committed
feat: impl stream vad & asr when use ParaformerRealtimeV2Asr
1 parent faea950 commit 0d1ab01

4 files changed

Lines changed: 319 additions & 27 deletions

File tree

src/ai/bailian/realtime_asr.rs

Lines changed: 98 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,10 @@ impl ParaformerRealtimeV2Asr {
9393
Ok(())
9494
}
9595

96-
pub async fn start_pcm_recognition(&mut self) -> anyhow::Result<()> {
96+
pub async fn start_pcm_recognition(
97+
&mut self,
98+
semantic_punctuation_enabled: bool,
99+
) -> anyhow::Result<()> {
97100
let task_id = Uuid::new_v4().to_string();
98101
log::info!("Starting asr task with ID: {}", task_id);
99102
self.task_id = task_id;
@@ -112,6 +115,7 @@ impl ParaformerRealtimeV2Asr {
112115
"parameters": {
113116
"format": "pcm",
114117
"sample_rate": self.sample_rate,
118+
"semantic_punctuation_enabled": semantic_punctuation_enabled,
115119
},
116120
"input": {}
117121
},
@@ -163,6 +167,7 @@ impl ParaformerRealtimeV2Asr {
163167
"streaming": "duplex"
164168
},
165169
"payload": {
170+
"task_group": "audio",
166171
"input": {}
167172
}
168173
});
@@ -197,6 +202,7 @@ impl ParaformerRealtimeV2Asr {
197202
} else if let Some(output) = response.payload.output {
198203
return Ok(Some(output.sentence));
199204
} else {
205+
log::error!("ASR response has no output: {:?}", text);
200206
return Err(anyhow::anyhow!("ASR error: {:?}", text));
201207
}
202208
}
@@ -226,31 +232,116 @@ async fn test_paraformer_asr() {
226232
let mut asr = ParaformerRealtimeV2Asr::connect("", token, head.sample_rate)
227233
.await
228234
.unwrap();
229-
asr.start_pcm_recognition().await.unwrap();
235+
asr.start_pcm_recognition(false).await.unwrap();
230236

231237
asr.send_audio(audio_data.clone()).await.unwrap();
232238
asr.finish_task().await.unwrap();
233239

234240
loop {
235241
if let Ok(Some(sentence)) = asr.next_result().await {
236-
println!("{:?}", sentence);
242+
log::info!("{:?}", sentence);
237243
if sentence.sentence_end {
238-
println!();
244+
log::info!("Final sentence received, ending recognition session.");
239245
}
240246
} else {
241247
break;
242248
}
243249
}
244250

245-
asr.start_pcm_recognition().await.unwrap();
251+
asr.start_pcm_recognition(false).await.unwrap();
246252
asr.send_audio(audio_data).await.unwrap();
247253
asr.finish_task().await.unwrap();
248254

249255
loop {
250256
if let Ok(Some(sentence)) = asr.next_result().await {
251-
println!("{:?}", sentence);
257+
log::info!("{:?}", sentence);
258+
if sentence.sentence_end {
259+
log::info!("Final sentence received, ending recognition session.");
260+
}
261+
} else {
262+
break;
263+
}
264+
}
265+
}
266+
267+
// cargo test --package echokit_server --bin echokit_server -- ai::bailian::realtime_asr::test_paraformer_stream_asr --exact --show-output
268+
#[tokio::test]
269+
async fn test_paraformer_stream_asr() {
270+
env_logger::init();
271+
let token = std::env::var("COSYVOICE_TOKEN").unwrap();
272+
273+
let data = std::fs::read("./resources/test/out.wav").unwrap();
274+
let mut reader = wav_io::reader::Reader::from_vec(data).expect("Failed to create WAV reader");
275+
let header = reader.read_header().unwrap();
276+
let mut samples = crate::util::get_samples_f32(&mut reader).unwrap();
277+
278+
// pad 10 seconds of silence
279+
samples.extend_from_slice(&[0.0; 16000 * 10]);
280+
281+
let samples = crate::util::convert_samples_f32_to_i16_bytes(&samples);
282+
let audio_data = bytes::Bytes::from(samples);
283+
284+
let mut asr = ParaformerRealtimeV2Asr::connect("", token, header.sample_rate)
285+
.await
286+
.unwrap();
287+
asr.start_pcm_recognition(true).await.unwrap();
288+
289+
let mut ms = 0;
290+
291+
for chunk in audio_data.chunks(3200) {
292+
ms += 100;
293+
log::info!("Sending audio chunk at {} ms", ms);
294+
asr.send_audio(Bytes::copy_from_slice(chunk)).await.unwrap();
295+
// tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
296+
let wait_asr_fut = asr.next_result();
297+
298+
let (sentence, has_result) = tokio::select! {
299+
res = wait_asr_fut => {
300+
(res.unwrap(),true)
301+
}
302+
_ = async {} => {
303+
(None,false)
304+
}
305+
};
306+
307+
if has_result {
308+
log::info!("{:?} {ms}", sentence);
309+
}
310+
311+
if let Some(s) = sentence {
312+
if s.sentence_end {
313+
break;
314+
}
315+
}
316+
}
317+
318+
asr.finish_task().await.unwrap();
319+
320+
loop {
321+
if let Ok(Some(sentence)) = asr.next_result().await {
322+
log::info!("{:?}", sentence);
323+
if sentence.sentence_end {
324+
log::info!("End of sentence");
325+
}
326+
} else {
327+
break;
328+
}
329+
}
330+
331+
asr.start_pcm_recognition(true).await.unwrap();
332+
333+
ms = 0;
334+
for chunk in audio_data.chunks(3200) {
335+
ms += 100;
336+
log::info!("Sending audio chunk at {} ms", ms);
337+
asr.send_audio(Bytes::copy_from_slice(chunk)).await.unwrap();
338+
// tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
339+
}
340+
loop {
341+
if let Ok(Some(sentence)) = asr.next_result().await {
342+
log::info!("{:?}", sentence);
252343
if sentence.sentence_end {
253-
println!();
344+
log::info!("End of sentence");
254345
}
255346
} else {
256347
break;

0 commit comments

Comments
 (0)