Skip to content

Commit 635c69b

Browse files
committed
Fix AudioOutputStream handling for stereo
send_nowait() was infinitely accumulating carryover buffer because of incorrect handling of 2d numpy arrays
1 parent 115e40f commit 635c69b

2 files changed

Lines changed: 94 additions & 50 deletions

File tree

agents-core/vision_agents/core/agents/inference/audio.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,10 @@ def send_nowait(self, item: AudioOutputChunk | AudioOutputFlush) -> None:
7979
self._carry = None
8080

8181
for pcm_chunk in pcm.chunks(chunk_size):
82-
if len(pcm_chunk.samples) < chunk_size:
82+
# `samples.shape[-1]` is the per-channel sample count for both
83+
# mono 1D and stereo channel-major 2D arrays. `len(samples)` for
84+
# 2D returns the channel count, not the sample count.
85+
if pcm_chunk.samples.shape[-1] < chunk_size:
8386
self._carry = pcm_chunk
8487
else:
8588
super().send_nowait(AudioOutputChunk(data=pcm_chunk))
@@ -98,7 +101,7 @@ def send_nowait(self, item: AudioOutputChunk | AudioOutputFlush) -> None:
98101
)
99102

100103
def _flush_carry(self) -> None:
101-
if self._carry is not None and len(self._carry.samples) > 0:
104+
if self._carry is not None and self._carry.samples.shape[-1] > 0:
102105
chunk_size = self._carry.sample_rate // self._chunk_frac
103106
padded = next(self._carry.chunks(chunk_size, pad_last=True))
104107
super().send_nowait(AudioOutputChunk(data=padded))

tests/test_agents/test_inference/test_audio.py

Lines changed: 89 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -9,114 +9,147 @@
99
)
1010

1111

12-
def make_pcm(ms: int, sample_rate: int = 16000, fill: int = 1) -> PcmData:
12+
def make_pcm(
13+
ms: int, sample_rate: int = 16000, fill: int = 1, channels: int = 1
14+
) -> PcmData:
1315
num_samples = int(sample_rate * ms / 1000)
14-
samples = np.full(num_samples, fill, dtype=np.int16)
15-
return PcmData(samples=samples, sample_rate=sample_rate, format=AudioFormat.S16)
16+
if channels == 1:
17+
samples = np.full(num_samples, fill, dtype=np.int16)
18+
else:
19+
# Channel-major shape (channels, num_samples) — the convention used
20+
# downstream by PcmData.chunks() and PcmData.append() for multi-channel.
21+
samples = np.full((channels, num_samples), fill, dtype=np.int16)
22+
return PcmData(
23+
samples=samples,
24+
sample_rate=sample_rate,
25+
format=AudioFormat.S16,
26+
channels=channels,
27+
)
1628

1729

1830
@pytest.fixture
1931
def stream() -> AudioOutputStream:
2032
return AudioOutputStream()
2133

2234

35+
@pytest.mark.parametrize("channels", [1, 2])
2336
class TestAudioOutputStream:
2437
async def test_exact_multiple_of_20ms_emits_that_many_chunks(
25-
self, stream: AudioOutputStream
38+
self, stream: AudioOutputStream, channels: int
2639
):
27-
stream.send_nowait(AudioOutputChunk(data=make_pcm(40, fill=7)))
40+
stream.send_nowait(
41+
AudioOutputChunk(data=make_pcm(40, fill=7, channels=channels))
42+
)
2843
items = stream.peek()
2944
assert len(items) == 2
3045
for item in items:
3146
assert isinstance(item, AudioOutputChunk)
3247
assert item.data is not None
33-
assert len(item.data.samples) == 320
48+
assert item.data.samples.shape[-1] == 320
3449
assert np.all(item.data.samples == 7)
3550

36-
async def test_sub_20ms_input_emits_nothing(self, stream: AudioOutputStream):
37-
stream.send_nowait(AudioOutputChunk(data=make_pcm(10)))
51+
async def test_sub_20ms_input_emits_nothing(
52+
self, stream: AudioOutputStream, channels: int
53+
):
54+
stream.send_nowait(AudioOutputChunk(data=make_pcm(10, channels=channels)))
3855
assert stream.empty()
3956

40-
async def test_carry_is_prepended_on_next_send(self, stream: AudioOutputStream):
41-
stream.send_nowait(AudioOutputChunk(data=make_pcm(10)))
42-
stream.send_nowait(AudioOutputChunk(data=make_pcm(15)))
57+
async def test_carry_is_prepended_on_next_send(
58+
self, stream: AudioOutputStream, channels: int
59+
):
60+
stream.send_nowait(AudioOutputChunk(data=make_pcm(10, channels=channels)))
61+
stream.send_nowait(AudioOutputChunk(data=make_pcm(15, channels=channels)))
4362
items = stream.peek()
4463
assert len(items) == 1
4564
assert isinstance(items[0], AudioOutputChunk)
4665
assert items[0].data is not None
47-
assert len(items[0].data.samples) == 320
66+
assert items[0].data.samples.shape[-1] == 320
4867

49-
async def test_chunk_size_tracks_sample_rate(self, stream: AudioOutputStream):
50-
stream.send_nowait(AudioOutputChunk(data=make_pcm(20, sample_rate=48000)))
68+
async def test_chunk_size_tracks_sample_rate(
69+
self, stream: AudioOutputStream, channels: int
70+
):
71+
stream.send_nowait(
72+
AudioOutputChunk(data=make_pcm(20, sample_rate=48000, channels=channels))
73+
)
5174
items = stream.peek()
5275
assert len(items) == 1
5376
assert isinstance(items[0], AudioOutputChunk)
5477
assert items[0].data is not None
55-
assert len(items[0].data.samples) == 960
78+
assert items[0].data.samples.shape[-1] == 960
5679

5780
async def test_final_with_carry_pads_then_emits_terminal_marker(
58-
self, stream: AudioOutputStream
81+
self, stream: AudioOutputStream, channels: int
5982
):
60-
stream.send_nowait(AudioOutputChunk(data=make_pcm(5, fill=100), final=True))
83+
stream.send_nowait(
84+
AudioOutputChunk(
85+
data=make_pcm(5, fill=100, channels=channels), final=True
86+
)
87+
)
6188
items = stream.peek()
6289
assert len(items) == 2
6390

6491
padded, terminal = items
6592
assert isinstance(padded, AudioOutputChunk)
6693
assert padded.final is False
6794
assert padded.data is not None
68-
assert len(padded.data.samples) == 320
69-
assert np.all(padded.data.samples[:80] == 100)
70-
assert np.all(padded.data.samples[80:] == 0)
95+
assert padded.data.samples.shape[-1] == 320
96+
assert np.all(padded.data.samples[..., :80] == 100)
97+
assert np.all(padded.data.samples[..., 80:] == 0)
7198

7299
assert isinstance(terminal, AudioOutputChunk)
73100
assert terminal.final is True
74101
assert terminal.data is not None
75-
assert len(terminal.data.samples) == 0
102+
assert terminal.data.samples.shape[-1] == 0
76103

77104
async def test_final_with_no_carry_emits_chunk_plus_marker(
78-
self, stream: AudioOutputStream
105+
self, stream: AudioOutputStream, channels: int
79106
):
80-
stream.send_nowait(AudioOutputChunk(data=make_pcm(20, fill=9), final=True))
107+
stream.send_nowait(
108+
AudioOutputChunk(data=make_pcm(20, fill=9, channels=channels), final=True)
109+
)
81110
items = stream.peek()
82111
assert len(items) == 2
83112

84113
full, terminal = items
85114
assert isinstance(full, AudioOutputChunk)
86115
assert full.final is False
87116
assert full.data is not None
88-
assert len(full.data.samples) == 320
117+
assert full.data.samples.shape[-1] == 320
89118
assert np.all(full.data.samples == 9)
90119

91120
assert isinstance(terminal, AudioOutputChunk)
92121
assert terminal.final is True
93122
assert terminal.data is not None
94-
assert len(terminal.data.samples) == 0
123+
assert terminal.data.samples.shape[-1] == 0
95124

96-
async def test_carry_is_reset_after_final(self, stream: AudioOutputStream):
97-
stream.send_nowait(AudioOutputChunk(data=make_pcm(5), final=True))
125+
async def test_carry_is_reset_after_final(
126+
self, stream: AudioOutputStream, channels: int
127+
):
128+
stream.send_nowait(AudioOutputChunk(data=make_pcm(5, channels=channels), final=True))
98129
stream.clear()
99-
stream.send_nowait(AudioOutputChunk(data=make_pcm(10)))
130+
stream.send_nowait(AudioOutputChunk(data=make_pcm(10, channels=channels)))
100131
assert stream.empty()
101132

102-
async def test_flush_passes_through_unchanged(self, stream: AudioOutputStream):
133+
async def test_flush_passes_through_unchanged(
134+
self, stream: AudioOutputStream, channels: int
135+
):
103136
flush = AudioOutputFlush()
104137
stream.send_nowait(flush)
105138
assert stream.peek() == [flush]
106139

107140
async def test_chunk_with_none_data_passes_through_unchanged(
108-
self, stream: AudioOutputStream
141+
self, stream: AudioOutputStream, channels: int
109142
):
110143
signal = AudioOutputChunk(data=None, final=True)
111144
stream.send_nowait(signal)
112145
assert stream.peek() == [signal]
113146

114147
async def test_final_marker_with_no_data_flushes_pending_carry(
115-
self, stream: AudioOutputStream
148+
self, stream: AudioOutputStream, channels: int
116149
):
117150
# Build up a sub-20ms carry, then send a data-less final marker
118151
# (as the realtime flow does on RealtimeAudioOutputDone).
119-
stream.send_nowait(AudioOutputChunk(data=make_pcm(5, fill=42)))
152+
stream.send_nowait(AudioOutputChunk(data=make_pcm(5, fill=42, channels=channels)))
120153
assert stream.empty() # carry only, nothing emitted yet
121154

122155
signal = AudioOutputChunk(data=None, final=True)
@@ -129,51 +162,59 @@ async def test_final_marker_with_no_data_flushes_pending_carry(
129162
assert isinstance(padded, AudioOutputChunk)
130163
assert padded.final is False
131164
assert padded.data is not None
132-
assert len(padded.data.samples) == 320
133-
assert np.all(padded.data.samples[:80] == 42)
134-
assert np.all(padded.data.samples[80:] == 0)
165+
assert padded.data.samples.shape[-1] == 320
166+
assert np.all(padded.data.samples[..., :80] == 42)
167+
assert np.all(padded.data.samples[..., 80:] == 0)
135168

136169
assert final is signal
137170

138171
# Carry must be cleared so a fresh utterance does not inherit it.
139172
stream.clear()
140-
stream.send_nowait(AudioOutputChunk(data=make_pcm(10)))
173+
stream.send_nowait(AudioOutputChunk(data=make_pcm(10, channels=channels)))
141174
assert stream.empty()
142175

143-
async def test_clear_drops_the_carry(self, stream: AudioOutputStream):
144-
stream.send_nowait(AudioOutputChunk(data=make_pcm(25)))
176+
async def test_clear_drops_the_carry(
177+
self, stream: AudioOutputStream, channels: int
178+
):
179+
stream.send_nowait(AudioOutputChunk(data=make_pcm(25, channels=channels)))
145180
stream.clear()
146-
stream.send_nowait(AudioOutputChunk(data=make_pcm(15)))
181+
stream.send_nowait(AudioOutputChunk(data=make_pcm(15, channels=channels)))
147182
assert stream.empty()
148183

149-
async def test_buffered_reports_pending_seconds(self, stream: AudioOutputStream):
184+
async def test_buffered_reports_pending_seconds(
185+
self, stream: AudioOutputStream, channels: int
186+
):
150187
assert stream.buffered == 0.0
151188

152-
stream.send_nowait(AudioOutputChunk(data=make_pcm(40)))
189+
stream.send_nowait(AudioOutputChunk(data=make_pcm(40, channels=channels)))
153190
assert stream.buffered == pytest.approx(0.04)
154191

155-
stream.send_nowait(AudioOutputChunk(data=make_pcm(20)))
192+
stream.send_nowait(AudioOutputChunk(data=make_pcm(20, channels=channels)))
156193
assert stream.buffered == pytest.approx(0.06)
157194

158-
async def test_buffered_includes_carry(self, stream: AudioOutputStream):
159-
stream.send_nowait(AudioOutputChunk(data=make_pcm(10)))
195+
async def test_buffered_includes_carry(
196+
self, stream: AudioOutputStream, channels: int
197+
):
198+
stream.send_nowait(AudioOutputChunk(data=make_pcm(10, channels=channels)))
160199
assert stream.empty()
161200
assert stream.buffered == pytest.approx(0.01)
162201

163-
async def test_buffered_ignores_flush(self, stream: AudioOutputStream):
202+
async def test_buffered_ignores_flush(
203+
self, stream: AudioOutputStream, channels: int
204+
):
164205
stream.send_nowait(AudioOutputFlush())
165206
assert stream.buffered == 0.0
166207

167208
async def test_buffered_ignores_chunk_with_none_data(
168-
self, stream: AudioOutputStream
209+
self, stream: AudioOutputStream, channels: int
169210
):
170211
stream.send_nowait(AudioOutputChunk(data=None, final=True))
171212
assert stream.buffered == 0.0
172213

173214
async def test_buffered_after_final_excludes_terminal_marker(
174-
self, stream: AudioOutputStream
215+
self, stream: AudioOutputStream, channels: int
175216
):
176-
stream.send_nowait(AudioOutputChunk(data=make_pcm(20), final=True))
217+
stream.send_nowait(AudioOutputChunk(data=make_pcm(20, channels=channels), final=True))
177218
# Stream now holds the real 20ms chunk plus a zero-sample terminal marker.
178219
assert len(stream.peek()) == 2
179220
# Only the real chunk contributes to buffered duration.

0 commit comments

Comments
 (0)