Skip to content

Commit f9fa49e

Browse files
committed
Merge branch 'T11662-RUS-stt-leading-silence-preroll' into 'main'
T11662 RUS: ignore leading STT startup silence See merge request bizzappdev/ai/polytalkio/polytalk!8
2 parents d18ae37 + 6b33bce commit f9fa49e

7 files changed

Lines changed: 285 additions & 13 deletions

File tree

.env.example

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,9 @@ STT_EMIT_INTERVAL_SECONDS=4.5
5050
# Flush the current speech window after this much trailing silence, even if the
5151
# normal stream window or emit thresholds have not been reached. Set 0 to disable.
5252
STT_PAUSE_FLUSH_SECONDS=1.2
53+
# Keep a small amount of audio before first detected speech, but do not let
54+
# leading tab-share silence fill the first STT window.
55+
STT_LEADING_SILENCE_PREROLL_SECONDS=0.2
5356

5457
# Silence/hallucination guards for streaming STT. These balanced defaults work
5558
# well for typical microphone input: raise RMS/no-speech strictness if Whisper

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,7 @@ The main latency knobs are:
282282
| `STT_EMIT_MIN_CHARS` | `120` | Minimum new transcript text before STT emits an update to PolyTalk. Increase this if live chunks are too small. |
283283
| `STT_EMIT_INTERVAL_SECONDS` | `4.5` | Maximum time to hold pending transcript text before emitting it. |
284284
| `STT_PAUSE_FLUSH_SECONDS` | `1.2` | Flush and emit the current speech window after this much trailing silence. Set `0` to disable pause flushing. |
285+
| `STT_LEADING_SILENCE_PREROLL_SECONDS` | `0.2` | Seconds of audio kept before the first detected speech; longer tab-share startup silence is discarded. Set `0` to discard all leading silence. |
285286
| `STT_SILENCE_RMS_THRESHOLD` | `0.003` | Skip STT for very quiet audio windows. Raise this if Whisper hallucinates while nobody is speaking. |
286287
| `STT_NO_SPEECH_PROB_THRESHOLD` | `0.50` | Drop faster-whisper segments classified as likely no-speech. |
287288
| `STT_LOG_PROB_THRESHOLD` | `-1.0` | Drop low-confidence faster-whisper segments. |
@@ -319,6 +320,7 @@ STT_STREAM_CHUNK_SECONDS=3.0
319320
STT_EMIT_MIN_CHARS=120
320321
STT_EMIT_INTERVAL_SECONDS=4.5
321322
STT_PAUSE_FLUSH_SECONDS=1.2
323+
STT_LEADING_SILENCE_PREROLL_SECONDS=0.2
322324
```
323325

324326
```yaml

docker-compose.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ services:
2525
- STT_EMIT_MIN_CHARS=${STT_EMIT_MIN_CHARS:-40}
2626
- STT_EMIT_INTERVAL_SECONDS=${STT_EMIT_INTERVAL_SECONDS:-2.0}
2727
- STT_PAUSE_FLUSH_SECONDS=${STT_PAUSE_FLUSH_SECONDS:-1.0}
28+
- STT_LEADING_SILENCE_PREROLL_SECONDS=${STT_LEADING_SILENCE_PREROLL_SECONDS:-0.2}
2829
- STT_SILENCE_RMS_THRESHOLD=${STT_SILENCE_RMS_THRESHOLD:-0.003}
2930
- STT_NO_SPEECH_PROB_THRESHOLD=${STT_NO_SPEECH_PROB_THRESHOLD:-0.50}
3031
- STT_LOG_PROB_THRESHOLD=${STT_LOG_PROB_THRESHOLD:--1.0}

stt/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ docker run -p 8000:8000 polytalk-stt
4747
| `STT_EMIT_MIN_CHARS` | `120` | Minimum new transcript text before emitting an update to PolyTalk. Increase this if live chunks are too small. |
4848
| `STT_EMIT_INTERVAL_SECONDS` | `4.5` | Maximum time to hold pending transcript text before emitting it. |
4949
| `STT_PAUSE_FLUSH_SECONDS` | `1.2` | Flush and emit the current speech window after this much trailing silence. Set `0` to disable pause flushing. |
50+
| `STT_LEADING_SILENCE_PREROLL_SECONDS` | `0.2` | Seconds of audio kept before the first detected speech; longer tab-share startup silence is discarded. Set `0` to discard all leading silence. |
5051
| `STT_SILENCE_RMS_THRESHOLD` | `0.003` | Skip model inference for very quiet audio windows. Raise this if Whisper hallucinates during silence. |
5152
| `STT_NO_SPEECH_PROB_THRESHOLD` | `0.50` | Drop segments classified as likely no-speech by faster-whisper. |
5253
| `STT_LOG_PROB_THRESHOLD` | `-1.0` | Drop low-confidence faster-whisper segments. |

stt/app/main.py

Lines changed: 65 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,9 @@ async def lifespan(app: FastAPI):
6060
EMIT_MIN_CHARS = int(os.environ.get("STT_EMIT_MIN_CHARS", "40"))
6161
EMIT_INTERVAL_SECONDS = float(os.environ.get("STT_EMIT_INTERVAL_SECONDS", "2.0"))
6262
PAUSE_FLUSH_SECONDS = float(os.environ.get("STT_PAUSE_FLUSH_SECONDS", "1.0"))
63+
LEADING_SILENCE_PREROLL_SECONDS = float(
64+
os.environ.get("STT_LEADING_SILENCE_PREROLL_SECONDS", "0.2")
65+
)
6366
PRELOAD_MODEL = os.environ.get("STT_PRELOAD_MODEL", "true").lower() == "true"
6467
SILENCE_RMS_THRESHOLD = float(os.environ.get("STT_SILENCE_RMS_THRESHOLD", "0.003"))
6568
NO_SPEECH_PROB_THRESHOLD = float(os.environ.get("STT_NO_SPEECH_PROB_THRESHOLD", "0.50"))
@@ -217,6 +220,30 @@ def _trim_pause_flush_audio(
217220
return audio_bytes[:-trim_bytes]
218221

219222

223+
def _append_bounded_audio_preroll(
224+
chunks: list[bytes], audio_data: bytes, max_bytes: int, current_total_bytes: int
225+
) -> int:
226+
"""Keep only the most recent audio bytes for pre-speech context."""
227+
if max_bytes <= 0:
228+
chunks.clear()
229+
return 0
230+
231+
chunks.append(audio_data)
232+
current_total_bytes += len(audio_data)
233+
while chunks and current_total_bytes > max_bytes:
234+
overflow_bytes = current_total_bytes - max_bytes
235+
first_chunk = chunks[0]
236+
if len(first_chunk) <= overflow_bytes:
237+
current_total_bytes -= len(first_chunk)
238+
chunks.pop(0)
239+
continue
240+
241+
chunks[0] = first_chunk[overflow_bytes:]
242+
current_total_bytes -= overflow_bytes
243+
244+
return current_total_bytes
245+
246+
220247
def _build_wav_buffer(audio_bytes: bytes) -> io.BytesIO:
221248
"""Wrap raw PCM audio bytes in an in-memory WAV container."""
222249
wav_buffer = io.BytesIO()
@@ -490,6 +517,14 @@ async def receive_audio() -> None:
490517
nonlocal current_language, current_task, total_size, next_sequence
491518
current_window_has_voice = False
492519
trailing_silence_bytes = 0
520+
leading_silence_preroll_chunks: list[bytes] = []
521+
leading_silence_preroll_total_bytes = 0
522+
leading_silence_preroll_bytes = int(
523+
LEADING_SILENCE_PREROLL_SECONDS * bytes_per_second
524+
)
525+
leading_silence_preroll_bytes -= (
526+
leading_silence_preroll_bytes % SAMPLE_WIDTH_BYTES
527+
)
493528
pause_flush_bytes = int(PAUSE_FLUSH_SECONDS * bytes_per_second)
494529
pause_flush_bytes -= pause_flush_bytes % SAMPLE_WIDTH_BYTES
495530

@@ -597,15 +632,42 @@ async def enqueue_audio_window(
597632
stop_event.set()
598633
break
599634

600-
audio_chunks.append(audio_data)
635+
# This is a total received-byte guard, not the current
636+
# audio_chunks buffer size. Startup silence used later as
637+
# preroll is already counted here when received.
601638
total_size += len(audio_data)
602639

603-
if _calculate_rms(audio_data) >= SILENCE_RMS_THRESHOLD:
640+
audio_rms = _calculate_rms(audio_data)
641+
has_voice = audio_rms >= SILENCE_RMS_THRESHOLD
642+
643+
if not current_window_has_voice and not has_voice:
644+
leading_silence_preroll_total_bytes = _append_bounded_audio_preroll(
645+
leading_silence_preroll_chunks,
646+
audio_data,
647+
leading_silence_preroll_bytes,
648+
leading_silence_preroll_total_bytes,
649+
)
650+
# The active speech window stays empty during startup silence;
651+
# recent silence is kept separately as bounded preroll.
652+
audio_chunks.clear()
653+
continue
654+
655+
if not current_window_has_voice and has_voice:
656+
if leading_silence_preroll_chunks:
657+
audio_chunks.extend(leading_silence_preroll_chunks)
658+
logger.debug(
659+
"[STT_LEADING_SILENCE] prepended preroll "
660+
f"audio_seconds={leading_silence_preroll_total_bytes / bytes_per_second:.2f}"
661+
)
662+
leading_silence_preroll_chunks.clear()
663+
leading_silence_preroll_total_bytes = 0
604664
current_window_has_voice = True
605665
trailing_silence_bytes = 0
606-
elif current_window_has_voice:
666+
elif current_window_has_voice and not has_voice:
607667
trailing_silence_bytes += len(audio_data)
608668

669+
audio_chunks.append(audio_data)
670+
609671
audio_bytes = b"".join(audio_chunks)
610672
should_flush_for_pause = (
611673
current_window_has_voice

tests/test_app.py

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1132,6 +1132,202 @@ def test_stt_pause_flush_seconds_env_override(self):
11321132

11331133
assert stt_main.PAUSE_FLUSH_SECONDS == 1.75
11341134

1135+
def test_stt_leading_silence_preroll_default(self):
    """Without an env override the leading-silence pre-roll is 0.2 seconds."""
    module = load_stt_main_module()
    default_preroll = module.LEADING_SILENCE_PREROLL_SECONDS

    assert default_preroll == 0.2
1140+
1141+
def test_stt_leading_silence_preroll_env_override(self):
    """The pre-roll length is read from STT_LEADING_SILENCE_PREROLL_SECONDS."""
    env_overrides = {"STT_LEADING_SILENCE_PREROLL_SECONDS": "0.35"}
    module = load_stt_main_module(env_overrides)

    assert module.LEADING_SILENCE_PREROLL_SECONDS == 0.35
1146+
1147+
def test_stt_leading_silence_preroll_keeps_recent_audio_only(self):
    """Appending past the byte cap drops the oldest bytes and keeps the newest."""
    stt_main = load_stt_main_module()
    preroll = []
    running_total = 0

    # Three 4-byte chunks against a 10-byte cap: the first chunk loses 2 bytes.
    for fill in (b"a", b"b", b"c"):
        running_total = stt_main._append_bounded_audio_preroll(
            preroll, fill * 4, max_bytes=10, current_total_bytes=running_total
        )

    assert b"".join(preroll) == b"aabbbbcccc"
    assert running_total == 10
1165+
1166+
def test_stt_leading_silence_preroll_can_be_disabled(self):
    """A zero byte cap empties the buffer and reports nothing retained."""
    stt_main = load_stt_main_module()
    already_buffered = b"silence"
    preroll = [already_buffered]

    retained = stt_main._append_bounded_audio_preroll(
        preroll,
        b"more",
        max_bytes=0,
        current_total_bytes=len(already_buffered),
    )

    assert preroll == []
    assert retained == 0
1177+
1178+
def test_stt_stream_prepends_leading_silence_preroll_before_first_voice(self):
    """A silent frame sent before the first voice frame reaches STT as pre-roll."""
    from fastapi.testclient import TestClient

    env_overrides = {
        "STT_SAMPLE_RATE": "10",
        "STT_SAMPLE_WIDTH_BYTES": "2",
        "STT_STREAM_CHUNK_SECONDS": "0.3",
        "STT_LEADING_SILENCE_PREROLL_SECONDS": "0.2",
        "STT_EMIT_MIN_CHARS": "1",
        "STT_TRANSCRIBE_WORKERS": "1",
    }
    stt_main = load_stt_main_module(env_overrides)

    quiet_frame = b"\x00\x00" * 2
    loud_frame = (10000).to_bytes(2, "little", signed=True) * 2
    jobs_audio = []

    def record_job_audio(model, job):
        # Stand-in for inference: remember the submitted audio, reply with a fixed transcript.
        jobs_audio.append(job.audio_bytes)
        return stt_main.TranscribeResult(
            sequence=job.sequence,
            transcript="hello",
            has_speech=True,
            detected_language="en",
        )

    stt_main._get_model = lambda: object()
    stt_main._process_transcribe_job = record_job_audio

    with TestClient(stt_main.app) as client, client.websocket_connect(
        "/v1/stream/transcriptions"
    ) as ws:
        ws.send_bytes(quiet_frame)
        ws.send_bytes(loud_frame)
        reply = ws.receive_json()
        assert reply["text"] == "hello"

    # Exactly one job, carrying the silence followed by the voice.
    assert jobs_audio == [quiet_frame + loud_frame]
1215+
1216+
def test_stt_stream_prepends_partial_leading_silence_preroll(self):
    """Silence shorter than the configured pre-roll is still prepended to the voice."""
    from fastapi.testclient import TestClient

    env_overrides = {
        "STT_SAMPLE_RATE": "10",
        "STT_SAMPLE_WIDTH_BYTES": "2",
        "STT_STREAM_CHUNK_SECONDS": "0.4",
        "STT_LEADING_SILENCE_PREROLL_SECONDS": "0.4",
        "STT_EMIT_MIN_CHARS": "1",
        "STT_TRANSCRIBE_WORKERS": "1",
    }
    stt_main = load_stt_main_module(env_overrides)

    quiet_frame = b"\x00\x00" * 1
    loud_frame = (10000).to_bytes(2, "little", signed=True) * 3
    jobs_audio = []

    def record_job_audio(model, job):
        # Stand-in for inference: remember the submitted audio, reply with a fixed transcript.
        jobs_audio.append(job.audio_bytes)
        return stt_main.TranscribeResult(
            sequence=job.sequence,
            transcript="hello",
            has_speech=True,
            detected_language="en",
        )

    stt_main._get_model = lambda: object()
    stt_main._process_transcribe_job = record_job_audio

    with TestClient(stt_main.app) as client, client.websocket_connect(
        "/v1/stream/transcriptions"
    ) as ws:
        ws.send_bytes(quiet_frame)
        ws.send_bytes(loud_frame)
        reply = ws.receive_json()
        assert reply["text"] == "hello"

    # Exactly one job, carrying the short silence followed by the voice.
    assert jobs_audio == [quiet_frame + loud_frame]
1253+
1254+
def test_stt_stream_skips_preroll_when_voice_starts_immediately(self):
    """With no silence before the first voice frame, STT receives the voice alone."""
    from fastapi.testclient import TestClient

    env_overrides = {
        "STT_SAMPLE_RATE": "10",
        "STT_SAMPLE_WIDTH_BYTES": "2",
        "STT_STREAM_CHUNK_SECONDS": "0.2",
        "STT_LEADING_SILENCE_PREROLL_SECONDS": "0.2",
        "STT_EMIT_MIN_CHARS": "1",
        "STT_TRANSCRIBE_WORKERS": "1",
    }
    stt_main = load_stt_main_module(env_overrides)

    loud_frame = (10000).to_bytes(2, "little", signed=True) * 2
    jobs_audio = []

    def record_job_audio(model, job):
        # Stand-in for inference: remember the submitted audio, reply with a fixed transcript.
        jobs_audio.append(job.audio_bytes)
        return stt_main.TranscribeResult(
            sequence=job.sequence,
            transcript="hello",
            has_speech=True,
            detected_language="en",
        )

    stt_main._get_model = lambda: object()
    stt_main._process_transcribe_job = record_job_audio

    with TestClient(stt_main.app) as client, client.websocket_connect(
        "/v1/stream/transcriptions"
    ) as ws:
        ws.send_bytes(loud_frame)
        reply = ws.receive_json()
        assert reply["text"] == "hello"

    assert jobs_audio == [loud_frame]
1289+
1290+
def test_stt_stream_pause_flush_still_trims_trailing_silence(self):
    """Voice followed by silence is pause-flushed with the trailing silence removed."""
    from fastapi.testclient import TestClient

    env_overrides = {
        "STT_SAMPLE_RATE": "10",
        "STT_SAMPLE_WIDTH_BYTES": "2",
        "STT_STREAM_CHUNK_SECONDS": "10",
        "STT_PAUSE_FLUSH_SECONDS": "0.2",
        "STT_VAD_SPEECH_PAD_MS": "0",
        "STT_EMIT_MIN_CHARS": "1",
        "STT_TRANSCRIBE_WORKERS": "1",
    }
    stt_main = load_stt_main_module(env_overrides)

    loud_frame = (10000).to_bytes(2, "little", signed=True) * 2
    quiet_frame = b"\x00\x00" * 2
    jobs_audio = []

    def record_job_audio(model, job):
        # Stand-in for inference: remember the audio and echo the job's force_emit flag.
        jobs_audio.append(job.audio_bytes)
        return stt_main.TranscribeResult(
            sequence=job.sequence,
            transcript="hello",
            has_speech=True,
            detected_language="en",
            force_emit=job.force_emit,
        )

    stt_main._get_model = lambda: object()
    stt_main._process_transcribe_job = record_job_audio

    with TestClient(stt_main.app) as client, client.websocket_connect(
        "/v1/stream/transcriptions"
    ) as ws:
        ws.send_bytes(loud_frame)
        ws.send_bytes(quiet_frame)
        reply = ws.receive_json()

    assert reply["metrics"]["force_emit"] is True
    # Only the voice reaches STT; the silence that triggered the flush is trimmed.
    assert jobs_audio == [loud_frame]
1330+
11351331
def test_stt_transcribe_job_preserves_pause_force_emit(self):
11361332
"""Test pause-flushed jobs remain emit-eligible after inference."""
11371333
stt_main = load_stt_main_module()

0 commit comments

Comments
 (0)