Fix AudioOutputStream handling for stereo

dangusev · dangusev · commit 635c69ba9f7b · 2026-05-05T18:54:26.000+02:00
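send_nowait() kept accumulating the carry-over buffer indefinitely because it measured chunk length with len(), which for 2D channel-major numpy arrays returns the channel count rather than the per-channel sample count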
diff --git a/agents-core/vision_agents/core/agents/inference/audio.py b/agents-core/vision_agents/core/agents/inference/audio.py
@@ -79,7 +79,10 @@ def send_nowait(self, item: AudioOutputChunk | AudioOutputFlush) -> None:
             self._carry = None
 
         for pcm_chunk in pcm.chunks(chunk_size):
-            if len(pcm_chunk.samples) < chunk_size:
+            # `samples.shape[-1]` is the per-channel sample count for both
+            # mono 1D and stereo channel-major 2D arrays. `len(samples)` for
+            # 2D returns the channel count, not the sample count.
+            if pcm_chunk.samples.shape[-1] < chunk_size:
                 self._carry = pcm_chunk
             else:
                 super().send_nowait(AudioOutputChunk(data=pcm_chunk))
@@ -98,7 +101,7 @@ def send_nowait(self, item: AudioOutputChunk | AudioOutputFlush) -> None:
             )
 
     def _flush_carry(self) -> None:
-        if self._carry is not None and len(self._carry.samples) > 0:
+        if self._carry is not None and self._carry.samples.shape[-1] > 0:
             chunk_size = self._carry.sample_rate // self._chunk_frac
             padded = next(self._carry.chunks(chunk_size, pad_last=True))
             super().send_nowait(AudioOutputChunk(data=padded))
diff --git a/tests/test_agents/test_inference/test_audio.py b/tests/test_agents/test_inference/test_audio.py
@@ -9,114 +9,147 @@
 )
 
 
-def make_pcm(ms: int, sample_rate: int = 16000, fill: int = 1) -> PcmData:
+def make_pcm(
+    ms: int, sample_rate: int = 16000, fill: int = 1, channels: int = 1
+) -> PcmData:
     num_samples = int(sample_rate * ms / 1000)
-    samples = np.full(num_samples, fill, dtype=np.int16)
-    return PcmData(samples=samples, sample_rate=sample_rate, format=AudioFormat.S16)
+    if channels == 1:
+        samples = np.full(num_samples, fill, dtype=np.int16)
+    else:
+        # Channel-major shape (channels, num_samples) — the convention used
+        # downstream by PcmData.chunks() and PcmData.append() for multi-channel.
+        samples = np.full((channels, num_samples), fill, dtype=np.int16)
+    return PcmData(
+        samples=samples,
+        sample_rate=sample_rate,
+        format=AudioFormat.S16,
+        channels=channels,
+    )
 
 
 @pytest.fixture
 def stream() -> AudioOutputStream:
     return AudioOutputStream()
 
 
+@pytest.mark.parametrize("channels", [1, 2])
 class TestAudioOutputStream:
     async def test_exact_multiple_of_20ms_emits_that_many_chunks(
-        self, stream: AudioOutputStream
+        self, stream: AudioOutputStream, channels: int
     ):
-        stream.send_nowait(AudioOutputChunk(data=make_pcm(40, fill=7)))
+        stream.send_nowait(
+            AudioOutputChunk(data=make_pcm(40, fill=7, channels=channels))
+        )
         items = stream.peek()
         assert len(items) == 2
         for item in items:
             assert isinstance(item, AudioOutputChunk)
             assert item.data is not None
-            assert len(item.data.samples) == 320
+            assert item.data.samples.shape[-1] == 320
             assert np.all(item.data.samples == 7)
 
-    async def test_sub_20ms_input_emits_nothing(self, stream: AudioOutputStream):
-        stream.send_nowait(AudioOutputChunk(data=make_pcm(10)))
+    async def test_sub_20ms_input_emits_nothing(
+        self, stream: AudioOutputStream, channels: int
+    ):
+        stream.send_nowait(AudioOutputChunk(data=make_pcm(10, channels=channels)))
         assert stream.empty()
 
-    async def test_carry_is_prepended_on_next_send(self, stream: AudioOutputStream):
-        stream.send_nowait(AudioOutputChunk(data=make_pcm(10)))
-        stream.send_nowait(AudioOutputChunk(data=make_pcm(15)))
+    async def test_carry_is_prepended_on_next_send(
+        self, stream: AudioOutputStream, channels: int
+    ):
+        stream.send_nowait(AudioOutputChunk(data=make_pcm(10, channels=channels)))
+        stream.send_nowait(AudioOutputChunk(data=make_pcm(15, channels=channels)))
         items = stream.peek()
         assert len(items) == 1
         assert isinstance(items[0], AudioOutputChunk)
         assert items[0].data is not None
-        assert len(items[0].data.samples) == 320
+        assert items[0].data.samples.shape[-1] == 320
 
-    async def test_chunk_size_tracks_sample_rate(self, stream: AudioOutputStream):
-        stream.send_nowait(AudioOutputChunk(data=make_pcm(20, sample_rate=48000)))
+    async def test_chunk_size_tracks_sample_rate(
+        self, stream: AudioOutputStream, channels: int
+    ):
+        stream.send_nowait(
+            AudioOutputChunk(data=make_pcm(20, sample_rate=48000, channels=channels))
+        )
         items = stream.peek()
         assert len(items) == 1
         assert isinstance(items[0], AudioOutputChunk)
         assert items[0].data is not None
-        assert len(items[0].data.samples) == 960
+        assert items[0].data.samples.shape[-1] == 960
 
     async def test_final_with_carry_pads_then_emits_terminal_marker(
-        self, stream: AudioOutputStream
+        self, stream: AudioOutputStream, channels: int
     ):
-        stream.send_nowait(AudioOutputChunk(data=make_pcm(5, fill=100), final=True))
+        stream.send_nowait(
+            AudioOutputChunk(
+                data=make_pcm(5, fill=100, channels=channels), final=True
+            )
+        )
         items = stream.peek()
         assert len(items) == 2
 
         padded, terminal = items
         assert isinstance(padded, AudioOutputChunk)
         assert padded.final is False
         assert padded.data is not None
-        assert len(padded.data.samples) == 320
-        assert np.all(padded.data.samples[:80] == 100)
-        assert np.all(padded.data.samples[80:] == 0)
+        assert padded.data.samples.shape[-1] == 320
+        assert np.all(padded.data.samples[..., :80] == 100)
+        assert np.all(padded.data.samples[..., 80:] == 0)
 
         assert isinstance(terminal, AudioOutputChunk)
         assert terminal.final is True
         assert terminal.data is not None
-        assert len(terminal.data.samples) == 0
+        assert terminal.data.samples.shape[-1] == 0
 
     async def test_final_with_no_carry_emits_chunk_plus_marker(
-        self, stream: AudioOutputStream
+        self, stream: AudioOutputStream, channels: int
     ):
-        stream.send_nowait(AudioOutputChunk(data=make_pcm(20, fill=9), final=True))
+        stream.send_nowait(
+            AudioOutputChunk(data=make_pcm(20, fill=9, channels=channels), final=True)
+        )
         items = stream.peek()
         assert len(items) == 2
 
         full, terminal = items
         assert isinstance(full, AudioOutputChunk)
         assert full.final is False
         assert full.data is not None
-        assert len(full.data.samples) == 320
+        assert full.data.samples.shape[-1] == 320
         assert np.all(full.data.samples == 9)
 
         assert isinstance(terminal, AudioOutputChunk)
         assert terminal.final is True
         assert terminal.data is not None
-        assert len(terminal.data.samples) == 0
+        assert terminal.data.samples.shape[-1] == 0
 
-    async def test_carry_is_reset_after_final(self, stream: AudioOutputStream):
-        stream.send_nowait(AudioOutputChunk(data=make_pcm(5), final=True))
+    async def test_carry_is_reset_after_final(
+        self, stream: AudioOutputStream, channels: int
+    ):
+        stream.send_nowait(AudioOutputChunk(data=make_pcm(5, channels=channels), final=True))
         stream.clear()
-        stream.send_nowait(AudioOutputChunk(data=make_pcm(10)))
+        stream.send_nowait(AudioOutputChunk(data=make_pcm(10, channels=channels)))
         assert stream.empty()
 
-    async def test_flush_passes_through_unchanged(self, stream: AudioOutputStream):
+    async def test_flush_passes_through_unchanged(
+        self, stream: AudioOutputStream, channels: int
+    ):
         flush = AudioOutputFlush()
         stream.send_nowait(flush)
         assert stream.peek() == [flush]
 
     async def test_chunk_with_none_data_passes_through_unchanged(
-        self, stream: AudioOutputStream
+        self, stream: AudioOutputStream, channels: int
     ):
         signal = AudioOutputChunk(data=None, final=True)
         stream.send_nowait(signal)
         assert stream.peek() == [signal]
 
     async def test_final_marker_with_no_data_flushes_pending_carry(
-        self, stream: AudioOutputStream
+        self, stream: AudioOutputStream, channels: int
     ):
         # Build up a sub-20ms carry, then send a data-less final marker
         # (as the realtime flow does on RealtimeAudioOutputDone).
-        stream.send_nowait(AudioOutputChunk(data=make_pcm(5, fill=42)))
+        stream.send_nowait(AudioOutputChunk(data=make_pcm(5, fill=42, channels=channels)))
         assert stream.empty()  # carry only, nothing emitted yet
 
         signal = AudioOutputChunk(data=None, final=True)
@@ -129,51 +162,59 @@ async def test_final_marker_with_no_data_flushes_pending_carry(
         assert isinstance(padded, AudioOutputChunk)
         assert padded.final is False
         assert padded.data is not None
-        assert len(padded.data.samples) == 320
-        assert np.all(padded.data.samples[:80] == 42)
-        assert np.all(padded.data.samples[80:] == 0)
+        assert padded.data.samples.shape[-1] == 320
+        assert np.all(padded.data.samples[..., :80] == 42)
+        assert np.all(padded.data.samples[..., 80:] == 0)
 
         assert final is signal
 
         # Carry must be cleared so a fresh utterance does not inherit it.
         stream.clear()
-        stream.send_nowait(AudioOutputChunk(data=make_pcm(10)))
+        stream.send_nowait(AudioOutputChunk(data=make_pcm(10, channels=channels)))
         assert stream.empty()
 
-    async def test_clear_drops_the_carry(self, stream: AudioOutputStream):
-        stream.send_nowait(AudioOutputChunk(data=make_pcm(25)))
+    async def test_clear_drops_the_carry(
+        self, stream: AudioOutputStream, channels: int
+    ):
+        stream.send_nowait(AudioOutputChunk(data=make_pcm(25, channels=channels)))
         stream.clear()
-        stream.send_nowait(AudioOutputChunk(data=make_pcm(15)))
+        stream.send_nowait(AudioOutputChunk(data=make_pcm(15, channels=channels)))
         assert stream.empty()
 
-    async def test_buffered_reports_pending_seconds(self, stream: AudioOutputStream):
+    async def test_buffered_reports_pending_seconds(
+        self, stream: AudioOutputStream, channels: int
+    ):
         assert stream.buffered == 0.0
 
-        stream.send_nowait(AudioOutputChunk(data=make_pcm(40)))
+        stream.send_nowait(AudioOutputChunk(data=make_pcm(40, channels=channels)))
         assert stream.buffered == pytest.approx(0.04)
 
-        stream.send_nowait(AudioOutputChunk(data=make_pcm(20)))
+        stream.send_nowait(AudioOutputChunk(data=make_pcm(20, channels=channels)))
         assert stream.buffered == pytest.approx(0.06)
 
-    async def test_buffered_includes_carry(self, stream: AudioOutputStream):
-        stream.send_nowait(AudioOutputChunk(data=make_pcm(10)))
+    async def test_buffered_includes_carry(
+        self, stream: AudioOutputStream, channels: int
+    ):
+        stream.send_nowait(AudioOutputChunk(data=make_pcm(10, channels=channels)))
         assert stream.empty()
         assert stream.buffered == pytest.approx(0.01)
 
-    async def test_buffered_ignores_flush(self, stream: AudioOutputStream):
+    async def test_buffered_ignores_flush(
+        self, stream: AudioOutputStream, channels: int
+    ):
         stream.send_nowait(AudioOutputFlush())
         assert stream.buffered == 0.0
 
     async def test_buffered_ignores_chunk_with_none_data(
-        self, stream: AudioOutputStream
+        self, stream: AudioOutputStream, channels: int
     ):
         stream.send_nowait(AudioOutputChunk(data=None, final=True))
         assert stream.buffered == 0.0
 
     async def test_buffered_after_final_excludes_terminal_marker(
-        self, stream: AudioOutputStream
+        self, stream: AudioOutputStream, channels: int
     ):
-        stream.send_nowait(AudioOutputChunk(data=make_pcm(20), final=True))
+        stream.send_nowait(AudioOutputChunk(data=make_pcm(20, channels=channels), final=True))
         # Stream now holds the real 20ms chunk plus a zero-sample terminal marker.
         assert len(stream.peek()) == 2
         # Only the real chunk contributes to buffered duration.