diff --git a/changelog/4076.fixed.md b/changelog/4076.fixed.md new file mode 100644 index 0000000000..ffde7dd07a --- /dev/null +++ b/changelog/4076.fixed.md @@ -0,0 +1 @@ +Fixed SimliVideoService forwarding near-silent audio frames (quantization noise) as TTSAudioRawFrame. Added an RMS threshold check that skips frames whose RMS is below 1.0, which is well above the observed noise floor (~0.41) but far below real speech. Without this fix, BotStoppedSpeaking will never trigger. \ No newline at end of file diff --git a/src/pipecat/services/simli/video.py b/src/pipecat/services/simli/video.py index f994ef0dc3..9aa56bef90 100644 --- a/src/pipecat/services/simli/video.py +++ b/src/pipecat/services/simli/video.py @@ -261,8 +261,9 @@ async def _consume_and_process_audio(self): resampled_frames = self._pipecat_resampler.resample(audio_frame) for resampled_frame in resampled_frames: audio_array = resampled_frame.to_ndarray() - # Only push frame is there is audio (e.g. not silence) - if audio_array.any(): + # Simli pushes very low-volume frames (total silence for practical purposes) when the avatar is not speaking, so we skip them; otherwise BotStoppedSpeaking will never trigger. + rms = np.sqrt(np.mean(audio_array.astype(np.float32) ** 2)) + if rms >= 1.0: await self.push_frame( TTSAudioRawFrame( audio=audio_array.tobytes(),