Merge branch 'T11662-RUS-stt-leading-silence-preroll' into 'main'

bizzappdev · bizzappdev · commit f9fa49ecbb71 · 2026-06-11T14:26:05.000Z
T11662 RUS: ignore leading STT startup silence

See merge request bizzappdev/ai/polytalkio/polytalk!8
diff --git a/.env.example b/.env.example
@@ -50,6 +50,9 @@ STT_EMIT_INTERVAL_SECONDS=4.5
 # Flush the current speech window after this much trailing silence, even if the
 # normal stream window or emit thresholds have not been reached. Set 0 to disable.
 STT_PAUSE_FLUSH_SECONDS=1.2
+# Keep at most this many seconds of audio from before the first detected speech;
+# longer leading silence (e.g. tab-share startup) is discarded. Set 0 to keep none.
+STT_LEADING_SILENCE_PREROLL_SECONDS=0.2
 
 # Silence/hallucination guards for streaming STT. These balanced defaults work
 # well for typical microphone input: raise RMS/no-speech strictness if Whisper
diff --git a/README.md b/README.md
@@ -282,6 +282,7 @@ The main latency knobs are:
 | `STT_EMIT_MIN_CHARS` | `120` | Minimum new transcript text before STT emits an update to PolyTalk. Increase this if live chunks are too small. |
 | `STT_EMIT_INTERVAL_SECONDS` | `4.5` | Maximum time to hold pending transcript text before emitting it. |
 | `STT_PAUSE_FLUSH_SECONDS` | `1.2` | Flush and emit the current speech window after this much trailing silence. Set `0` to disable pause flushing. |
+| `STT_LEADING_SILENCE_PREROLL_SECONDS` | `0.2` | Keep at most this much audio from before the first detected speech; any longer leading silence (e.g. tab-share startup) is discarded. Set `0` to keep no leading silence. |
 | `STT_SILENCE_RMS_THRESHOLD` | `0.003` | Skip STT for very quiet audio windows. Raise this if Whisper hallucinates while nobody is speaking. |
 | `STT_NO_SPEECH_PROB_THRESHOLD` | `0.50` | Drop faster-whisper segments classified as likely no-speech. |
 | `STT_LOG_PROB_THRESHOLD` | `-1.0` | Drop low-confidence faster-whisper segments. |
@@ -319,6 +320,7 @@ STT_STREAM_CHUNK_SECONDS=3.0
 STT_EMIT_MIN_CHARS=120
 STT_EMIT_INTERVAL_SECONDS=4.5
 STT_PAUSE_FLUSH_SECONDS=1.2
+STT_LEADING_SILENCE_PREROLL_SECONDS=0.2
 ```
 
 ```yaml
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -25,6 +25,7 @@ services:
       - STT_EMIT_MIN_CHARS=${STT_EMIT_MIN_CHARS:-40}
       - STT_EMIT_INTERVAL_SECONDS=${STT_EMIT_INTERVAL_SECONDS:-2.0}
       - STT_PAUSE_FLUSH_SECONDS=${STT_PAUSE_FLUSH_SECONDS:-1.0}
+      - STT_LEADING_SILENCE_PREROLL_SECONDS=${STT_LEADING_SILENCE_PREROLL_SECONDS:-0.2}
       - STT_SILENCE_RMS_THRESHOLD=${STT_SILENCE_RMS_THRESHOLD:-0.003}
       - STT_NO_SPEECH_PROB_THRESHOLD=${STT_NO_SPEECH_PROB_THRESHOLD:-0.50}
       - STT_LOG_PROB_THRESHOLD=${STT_LOG_PROB_THRESHOLD:--1.0}
diff --git a/stt/README.md b/stt/README.md
@@ -47,6 +47,7 @@ docker run -p 8000:8000 polytalk-stt
 | `STT_EMIT_MIN_CHARS` | `120` | Minimum new transcript text before emitting an update to PolyTalk. Increase this if live chunks are too small. |
 | `STT_EMIT_INTERVAL_SECONDS` | `4.5` | Maximum time to hold pending transcript text before emitting it. |
 | `STT_PAUSE_FLUSH_SECONDS` | `1.2` | Flush and emit the current speech window after this much trailing silence. Set `0` to disable pause flushing. |
+| `STT_LEADING_SILENCE_PREROLL_SECONDS` | `0.2` | Keep at most this much audio from before the first detected speech; any longer leading silence (e.g. tab-share startup) is discarded. Set `0` to keep no leading silence. |
 | `STT_SILENCE_RMS_THRESHOLD` | `0.003` | Skip model inference for very quiet audio windows. Raise this if Whisper hallucinates during silence. |
 | `STT_NO_SPEECH_PROB_THRESHOLD` | `0.50` | Drop segments classified as likely no-speech by faster-whisper. |
 | `STT_LOG_PROB_THRESHOLD` | `-1.0` | Drop low-confidence faster-whisper segments. |
diff --git a/stt/app/main.py b/stt/app/main.py
@@ -60,6 +60,9 @@ async def lifespan(app: FastAPI):
 EMIT_MIN_CHARS = int(os.environ.get("STT_EMIT_MIN_CHARS", "40"))
 EMIT_INTERVAL_SECONDS = float(os.environ.get("STT_EMIT_INTERVAL_SECONDS", "2.0"))
 PAUSE_FLUSH_SECONDS = float(os.environ.get("STT_PAUSE_FLUSH_SECONDS", "1.0"))
+LEADING_SILENCE_PREROLL_SECONDS = float(
+    os.environ.get("STT_LEADING_SILENCE_PREROLL_SECONDS", "0.2")
+)
 PRELOAD_MODEL = os.environ.get("STT_PRELOAD_MODEL", "true").lower() == "true"
 SILENCE_RMS_THRESHOLD = float(os.environ.get("STT_SILENCE_RMS_THRESHOLD", "0.003"))
 NO_SPEECH_PROB_THRESHOLD = float(os.environ.get("STT_NO_SPEECH_PROB_THRESHOLD", "0.50"))
@@ -217,6 +220,30 @@ def _trim_pause_flush_audio(
     return audio_bytes[:-trim_bytes]
 
 
+def _append_bounded_audio_preroll(  # append audio_data to chunks, then trim the oldest bytes so the total is <= max_bytes
+    chunks: list[bytes], audio_data: bytes, max_bytes: int, current_total_bytes: int  # chunks is mutated in place
+) -> int:  # returns the updated byte total; the caller feeds it back in as current_total_bytes on the next call
+    """Keep only the most recent audio bytes for pre-speech context."""
+    if max_bytes <= 0:  # non-positive cap disables preroll: drop anything buffered and ignore audio_data
+        chunks.clear()
+        return 0
+
+    chunks.append(audio_data)
+    current_total_bytes += len(audio_data)  # running total is taken from the caller, not recomputed from chunks
+    while chunks and current_total_bytes > max_bytes:  # trim from the oldest end until within the cap
+        overflow_bytes = current_total_bytes - max_bytes
+        first_chunk = chunks[0]
+        if len(first_chunk) <= overflow_bytes:  # oldest chunk lies entirely inside the overflow: drop it whole
+            current_total_bytes -= len(first_chunk)
+            chunks.pop(0)
+            continue
+
+        chunks[0] = first_chunk[overflow_bytes:]  # otherwise cut just the overflow off the front of the oldest chunk
+        current_total_bytes -= overflow_bytes  # total now equals max_bytes, so the loop exits
+
+    return current_total_bytes
+
+
 def _build_wav_buffer(audio_bytes: bytes) -> io.BytesIO:
     """Wrap raw PCM audio bytes in an in-memory WAV container."""
     wav_buffer = io.BytesIO()
@@ -490,6 +517,14 @@ async def receive_audio() -> None:
         nonlocal current_language, current_task, total_size, next_sequence
         current_window_has_voice = False
         trailing_silence_bytes = 0
+        leading_silence_preroll_chunks: list[bytes] = []
+        leading_silence_preroll_total_bytes = 0
+        leading_silence_preroll_bytes = int(
+            LEADING_SILENCE_PREROLL_SECONDS * bytes_per_second
+        )
+        leading_silence_preroll_bytes -= (
+            leading_silence_preroll_bytes % SAMPLE_WIDTH_BYTES
+        )
         pause_flush_bytes = int(PAUSE_FLUSH_SECONDS * bytes_per_second)
         pause_flush_bytes -= pause_flush_bytes % SAMPLE_WIDTH_BYTES
 
@@ -597,15 +632,42 @@ async def enqueue_audio_window(
                     stop_event.set()
                     break
 
-                audio_chunks.append(audio_data)
+                # This is a total received-byte guard, not the current
+                # audio_chunks buffer size. Startup silence used later as
+                # preroll is already counted here when received.
                 total_size += len(audio_data)
 
-                if _calculate_rms(audio_data) >= SILENCE_RMS_THRESHOLD:
+                audio_rms = _calculate_rms(audio_data)
+                has_voice = audio_rms >= SILENCE_RMS_THRESHOLD
+
+                if not current_window_has_voice and not has_voice:
+                    leading_silence_preroll_total_bytes = _append_bounded_audio_preroll(
+                        leading_silence_preroll_chunks,
+                        audio_data,
+                        leading_silence_preroll_bytes,
+                        leading_silence_preroll_total_bytes,
+                    )
+                    # The active speech window stays empty during startup silence;
+                    # recent silence is kept separately as bounded preroll.
+                    audio_chunks.clear()
+                    continue
+
+                if not current_window_has_voice and has_voice:
+                    if leading_silence_preroll_chunks:
+                        audio_chunks.extend(leading_silence_preroll_chunks)
+                        logger.debug(
+                            "[STT_LEADING_SILENCE] prepended preroll "
+                            f"audio_seconds={leading_silence_preroll_total_bytes / bytes_per_second:.2f}"
+                        )
+                        leading_silence_preroll_chunks.clear()
+                        leading_silence_preroll_total_bytes = 0
                     current_window_has_voice = True
                     trailing_silence_bytes = 0
-                elif current_window_has_voice:
+                elif current_window_has_voice and not has_voice:
                     trailing_silence_bytes += len(audio_data)
 
+                audio_chunks.append(audio_data)
+
                 audio_bytes = b"".join(audio_chunks)
                 should_flush_for_pause = (
                     current_window_has_voice
diff --git a/tests/test_app.py b/tests/test_app.py
@@ -1132,6 +1132,202 @@ def test_stt_pause_flush_seconds_env_override(self):
 
         assert stt_main.PAUSE_FLUSH_SECONDS == 1.75
 
+    def test_stt_leading_silence_preroll_default(self):
+        """Test leading silence keeps only a short default pre-roll."""
+        stt_main = load_stt_main_module()  # no STT_LEADING_SILENCE_PREROLL_SECONDS override is passed here
+
+        assert stt_main.LEADING_SILENCE_PREROLL_SECONDS == 0.2  # the "0.2" fallback of the os.environ.get call
+
+    def test_stt_leading_silence_preroll_env_override(self):
+        """Test leading silence pre-roll is configurable."""
+        stt_main = load_stt_main_module({"STT_LEADING_SILENCE_PREROLL_SECONDS": "0.35"})  # presumably applied as env vars before import — helper not shown
+
+        assert stt_main.LEADING_SILENCE_PREROLL_SECONDS == 0.35  # the string value is converted with float() at module level
+
+    def test_stt_leading_silence_preroll_keeps_recent_audio_only(self):
+        """Test startup silence is bounded before first detected speech."""
+        stt_main = load_stt_main_module()
+        chunks = []
+
+        total_bytes = 0  # running total is threaded through every call below
+        total_bytes = stt_main._append_bounded_audio_preroll(
+            chunks, b"a" * 4, max_bytes=10, current_total_bytes=total_bytes
+        )  # 4 bytes buffered, under the 10-byte cap
+        total_bytes = stt_main._append_bounded_audio_preroll(
+            chunks, b"b" * 4, max_bytes=10, current_total_bytes=total_bytes
+        )  # 8 bytes buffered, still under the cap
+        total_bytes = stt_main._append_bounded_audio_preroll(
+            chunks, b"c" * 4, max_bytes=10, current_total_bytes=total_bytes
+        )  # 12 bytes would exceed the cap by 2, so the front of the oldest chunk is trimmed
+
+        assert b"".join(chunks) == b"aabbbbcccc"  # only the two oldest "a" bytes were dropped
+        assert total_bytes == 10  # returned total equals the cap
+
+    def test_stt_leading_silence_preroll_can_be_disabled(self):
+        """Test zero pre-roll discards all leading silence."""
+        stt_main = load_stt_main_module()
+        chunks = [b"silence"]  # pre-populated so the test also proves existing content is cleared
+
+        total_bytes = stt_main._append_bounded_audio_preroll(
+            chunks, b"more", max_bytes=0, current_total_bytes=len(b"silence")
+        )  # max_bytes=0 takes the early-return branch
+
+        assert chunks == []  # neither the old chunk nor b"more" is retained
+        assert total_bytes == 0
+
+    def test_stt_stream_prepends_leading_silence_preroll_before_first_voice(self):
+        """Test stream transcription prepends bounded startup silence to speech."""
+        from fastapi.testclient import TestClient
+
+        stt_main = load_stt_main_module(
+            {
+                "STT_SAMPLE_RATE": "10",  # tiny rate keeps the byte counts in this test small
+                "STT_SAMPLE_WIDTH_BYTES": "2",
+                "STT_STREAM_CHUNK_SECONDS": "0.3",
+                "STT_LEADING_SILENCE_PREROLL_SECONDS": "0.2",  # NOTE(review): presumably a 4-byte cap (10 Hz x 2 bytes, if mono) — confirm
+                "STT_EMIT_MIN_CHARS": "1",
+                "STT_TRANSCRIBE_WORKERS": "1",
+            }
+        )
+        captured_audio = []  # filled by the fake job handler below
+        silence = b"\x00\x00" * 2  # two zero-valued 2-byte samples (4 bytes)
+        voice = (10000).to_bytes(2, "little", signed=True) * 2  # two samples of value 10000; presumably above the RMS threshold — _calculate_rms not shown
+
+        def fake_process_transcribe_job(model, job):  # records job.audio_bytes and returns a canned result
+            captured_audio.append(job.audio_bytes)
+            return stt_main.TranscribeResult(
+                sequence=job.sequence,
+                transcript="hello",
+                has_speech=True,
+                detected_language="en",
+            )
+
+        stt_main._get_model = lambda: object()  # replace the model getter with one returning a plain object
+        stt_main._process_transcribe_job = fake_process_transcribe_job
+
+        with TestClient(stt_main.app) as client:
+            with client.websocket_connect("/v1/stream/transcriptions") as websocket:
+                websocket.send_bytes(silence)  # quiet chunk first ...
+                websocket.send_bytes(voice)  # ... then the first voiced chunk
+                assert websocket.receive_json()["text"] == "hello"
+
+        assert captured_audio == [silence + voice]  # one job whose audio is the silence followed by the voice
+
+    def test_stt_stream_prepends_partial_leading_silence_preroll(self):
+        """Test speech receives partial preroll when voice starts before it fills."""
+        from fastapi.testclient import TestClient
+
+        stt_main = load_stt_main_module(
+            {
+                "STT_SAMPLE_RATE": "10",
+                "STT_SAMPLE_WIDTH_BYTES": "2",
+                "STT_STREAM_CHUNK_SECONDS": "0.4",
+                "STT_LEADING_SILENCE_PREROLL_SECONDS": "0.4",  # NOTE(review): presumably an 8-byte cap (10 Hz x 2 bytes, if mono) — confirm
+                "STT_EMIT_MIN_CHARS": "1",
+                "STT_TRANSCRIBE_WORKERS": "1",
+            }
+        )
+        captured_audio = []  # filled by the fake job handler below
+        silence = b"\x00\x00" * 1  # a single zero sample (2 bytes); per the docstring, meant to be shorter than the preroll cap
+        voice = (10000).to_bytes(2, "little", signed=True) * 3  # three samples of value 10000 (6 bytes)
+
+        def fake_process_transcribe_job(model, job):  # records job.audio_bytes and returns a canned result
+            captured_audio.append(job.audio_bytes)
+            return stt_main.TranscribeResult(
+                sequence=job.sequence,
+                transcript="hello",
+                has_speech=True,
+                detected_language="en",
+            )
+
+        stt_main._get_model = lambda: object()  # replace the model getter with one returning a plain object
+        stt_main._process_transcribe_job = fake_process_transcribe_job
+
+        with TestClient(stt_main.app) as client:
+            with client.websocket_connect("/v1/stream/transcriptions") as websocket:
+                websocket.send_bytes(silence)
+                websocket.send_bytes(voice)
+                assert websocket.receive_json()["text"] == "hello"
+
+        assert captured_audio == [silence + voice]  # the short silence is kept in full ahead of the voice
+
+    def test_stt_stream_skips_preroll_when_voice_starts_immediately(self):
+        """Test stream transcription does not prepend silence without startup silence."""
+        from fastapi.testclient import TestClient
+
+        stt_main = load_stt_main_module(
+            {
+                "STT_SAMPLE_RATE": "10",
+                "STT_SAMPLE_WIDTH_BYTES": "2",
+                "STT_STREAM_CHUNK_SECONDS": "0.2",
+                "STT_LEADING_SILENCE_PREROLL_SECONDS": "0.2",  # preroll stays enabled; the test sends no silence to buffer
+                "STT_EMIT_MIN_CHARS": "1",
+                "STT_TRANSCRIBE_WORKERS": "1",
+            }
+        )
+        captured_audio = []  # filled by the fake job handler below
+        voice = (10000).to_bytes(2, "little", signed=True) * 2  # two samples of value 10000 (4 bytes)
+
+        def fake_process_transcribe_job(model, job):  # records job.audio_bytes and returns a canned result
+            captured_audio.append(job.audio_bytes)
+            return stt_main.TranscribeResult(
+                sequence=job.sequence,
+                transcript="hello",
+                has_speech=True,
+                detected_language="en",
+            )
+
+        stt_main._get_model = lambda: object()  # replace the model getter with one returning a plain object
+        stt_main._process_transcribe_job = fake_process_transcribe_job
+
+        with TestClient(stt_main.app) as client:
+            with client.websocket_connect("/v1/stream/transcriptions") as websocket:
+                websocket.send_bytes(voice)  # the very first chunk is already voiced
+                assert websocket.receive_json()["text"] == "hello"
+
+        assert captured_audio == [voice]  # nothing was prepended to the voiced audio
+
+    def test_stt_stream_pause_flush_still_trims_trailing_silence(self):
+        """Test leading-silence preroll does not break pause-flush trimming."""
+        from fastapi.testclient import TestClient
+
+        stt_main = load_stt_main_module(
+            {
+                "STT_SAMPLE_RATE": "10",
+                "STT_SAMPLE_WIDTH_BYTES": "2",
+                "STT_STREAM_CHUNK_SECONDS": "10",  # NOTE(review): presumably large so only the pause flush can trigger a job — confirm
+                "STT_PAUSE_FLUSH_SECONDS": "0.2",
+                "STT_VAD_SPEECH_PAD_MS": "0",
+                "STT_EMIT_MIN_CHARS": "1",
+                "STT_TRANSCRIBE_WORKERS": "1",
+            }
+        )
+        captured_audio = []  # filled by the fake job handler below
+        voice = (10000).to_bytes(2, "little", signed=True) * 2  # two samples of value 10000 (4 bytes)
+        silence = b"\x00\x00" * 2  # two zero samples (4 bytes), sent after the voice as trailing silence
+
+        def fake_process_transcribe_job(model, job):  # records job.audio_bytes and echoes the job's force_emit flag
+            captured_audio.append(job.audio_bytes)
+            return stt_main.TranscribeResult(
+                sequence=job.sequence,
+                transcript="hello",
+                has_speech=True,
+                detected_language="en",
+                force_emit=job.force_emit,
+            )
+
+        stt_main._get_model = lambda: object()  # replace the model getter with one returning a plain object
+        stt_main._process_transcribe_job = fake_process_transcribe_job
+
+        with TestClient(stt_main.app) as client:
+            with client.websocket_connect("/v1/stream/transcriptions") as websocket:
+                websocket.send_bytes(voice)  # voiced chunk first ...
+                websocket.send_bytes(silence)  # ... then silence, expected to trigger the pause flush
+                result = websocket.receive_json()
+
+        assert result["metrics"]["force_emit"] is True  # the emitted result carries the force_emit flag
+        assert captured_audio == [voice]  # trailing silence was removed before the job was queued
+
     def test_stt_transcribe_job_preserves_pause_force_emit(self):
         """Test pause-flushed jobs remain emit-eligible after inference."""
         stt_main = load_stt_main_module()
diff --git a/tests/test_services.py b/tests/test_services.py