@@ -1132,6 +1132,202 @@ def test_stt_pause_flush_seconds_env_override(self):
11321132
11331133 assert stt_main .PAUSE_FLUSH_SECONDS == 1.75
11341134
def test_stt_leading_silence_preroll_default(self):
    """Check the pre-roll constant is 0.2 when the module is loaded with no overrides."""
    default_preroll = load_stt_main_module().LEADING_SILENCE_PREROLL_SECONDS

    assert default_preroll == 0.2
1140+
def test_stt_leading_silence_preroll_env_override(self):
    """Check the pre-roll constant follows STT_LEADING_SILENCE_PREROLL_SECONDS."""
    overrides = {"STT_LEADING_SILENCE_PREROLL_SECONDS": "0.35"}
    module = load_stt_main_module(overrides)

    # The string "0.35" passed to the loader must surface as the float 0.35.
    assert module.LEADING_SILENCE_PREROLL_SECONDS == 0.35
1146+
def test_stt_leading_silence_preroll_keeps_recent_audio_only(self):
    """Check the bounded pre-roll buffer retains only the newest ``max_bytes``.

    Three 4-byte chunks (12 bytes in total) are appended with a 10-byte cap;
    the two oldest bytes are expected to be dropped from the front.
    """
    stt_main = load_stt_main_module()
    buffered = []
    running_total = 0

    # Feed the helper its own previous return value as the running byte count.
    for fill in (b"a", b"b", b"c"):
        running_total = stt_main._append_bounded_audio_preroll(
            buffered,
            fill * 4,
            max_bytes=10,
            current_total_bytes=running_total,
        )

    assert b"".join(buffered) == b"aabbbbcccc"
    assert running_total == 10
1165+
def test_stt_leading_silence_preroll_can_be_disabled(self):
    """Check that ``max_bytes=0`` empties the pre-roll buffer entirely."""
    stt_main = load_stt_main_module()
    already_buffered = b"silence"
    buffered = [already_buffered]

    remaining_bytes = stt_main._append_bounded_audio_preroll(
        buffered,
        b"more",
        max_bytes=0,
        current_total_bytes=len(already_buffered),
    )

    # Both the pre-existing chunk and the newly offered one must be gone.
    assert buffered == []
    assert remaining_bytes == 0
1177+
def test_stt_stream_prepends_leading_silence_preroll_before_first_voice(self):
    """Test stream transcription prepends bounded startup silence to speech.

    Sends one all-zero frame followed by one loud frame over the streaming
    websocket and checks that the audio handed to the (stubbed) transcribe
    job is the silence immediately followed by the voice.
    """
    from fastapi.testclient import TestClient

    stt_main = load_stt_main_module(
        {
            "STT_SAMPLE_RATE": "10",
            "STT_SAMPLE_WIDTH_BYTES": "2",
            "STT_STREAM_CHUNK_SECONDS": "0.3",
            "STT_LEADING_SILENCE_PREROLL_SECONDS": "0.2",
            "STT_EMIT_MIN_CHARS": "1",
            "STT_TRANSCRIBE_WORKERS": "1",
        }
    )
    captured_audio = []
    # Two zero-valued 2-byte samples. The literal must hold exactly two NUL
    # bytes per sample: stray bytes would make the frame non-silent and
    # break the 2-byte sample width configured above.
    silence = b"\x00\x00" * 2
    # Two 2-byte little-endian signed samples at amplitude 10000.
    voice = (10000).to_bytes(2, "little", signed=True) * 2

    def fake_process_transcribe_job(model, job):
        # Record what the pipeline submitted instead of running inference.
        captured_audio.append(job.audio_bytes)
        return stt_main.TranscribeResult(
            sequence=job.sequence,
            transcript="hello",
            has_speech=True,
            detected_language="en",
        )

    # Stub out model loading and inference on the freshly loaded module.
    stt_main._get_model = lambda: object()
    stt_main._process_transcribe_job = fake_process_transcribe_job

    with TestClient(stt_main.app) as client:
        with client.websocket_connect("/v1/stream/transcriptions") as websocket:
            websocket.send_bytes(silence)
            websocket.send_bytes(voice)
            assert websocket.receive_json()["text"] == "hello"

    assert captured_audio == [silence + voice]
1215+
def test_stt_stream_prepends_partial_leading_silence_preroll(self):
    """Test speech receives partial preroll when voice starts before it fills.

    Only a single silent sample is sent before the voice, which is less than
    the configured 0.4 s pre-roll; the submitted audio is still expected to
    be that silence followed by the voice.
    """
    from fastapi.testclient import TestClient

    stt_main = load_stt_main_module(
        {
            "STT_SAMPLE_RATE": "10",
            "STT_SAMPLE_WIDTH_BYTES": "2",
            "STT_STREAM_CHUNK_SECONDS": "0.4",
            "STT_LEADING_SILENCE_PREROLL_SECONDS": "0.4",
            "STT_EMIT_MIN_CHARS": "1",
            "STT_TRANSCRIBE_WORKERS": "1",
        }
    )
    captured_audio = []
    # One zero-valued 2-byte sample. The literal must be exactly two NUL
    # bytes so the frame is truly silent and matches the sample width.
    silence = b"\x00\x00" * 1
    # Three 2-byte little-endian signed samples at amplitude 10000.
    voice = (10000).to_bytes(2, "little", signed=True) * 3

    def fake_process_transcribe_job(model, job):
        # Record what the pipeline submitted instead of running inference.
        captured_audio.append(job.audio_bytes)
        return stt_main.TranscribeResult(
            sequence=job.sequence,
            transcript="hello",
            has_speech=True,
            detected_language="en",
        )

    # Stub out model loading and inference on the freshly loaded module.
    stt_main._get_model = lambda: object()
    stt_main._process_transcribe_job = fake_process_transcribe_job

    with TestClient(stt_main.app) as client:
        with client.websocket_connect("/v1/stream/transcriptions") as websocket:
            websocket.send_bytes(silence)
            websocket.send_bytes(voice)
            assert websocket.receive_json()["text"] == "hello"

    assert captured_audio == [silence + voice]
1253+
def test_stt_stream_skips_preroll_when_voice_starts_immediately(self):
    """Check nothing is prepended when the very first frame is already voice."""
    from fastapi.testclient import TestClient

    settings = {
        "STT_SAMPLE_RATE": "10",
        "STT_SAMPLE_WIDTH_BYTES": "2",
        "STT_STREAM_CHUNK_SECONDS": "0.2",
        "STT_LEADING_SILENCE_PREROLL_SECONDS": "0.2",
        "STT_EMIT_MIN_CHARS": "1",
        "STT_TRANSCRIBE_WORKERS": "1",
    }
    stt_main = load_stt_main_module(settings)

    # Two 2-byte little-endian signed samples at amplitude 10000.
    loud_sample = (10000).to_bytes(2, "little", signed=True)
    voice = loud_sample * 2
    submitted_audio = []

    def record_transcribe_job(model, job):
        # Stand-in for inference: remember the audio and return a canned result.
        submitted_audio.append(job.audio_bytes)
        return stt_main.TranscribeResult(
            sequence=job.sequence,
            transcript="hello",
            has_speech=True,
            detected_language="en",
        )

    stt_main._get_model = lambda: object()
    stt_main._process_transcribe_job = record_transcribe_job

    with TestClient(stt_main.app) as client, client.websocket_connect(
        "/v1/stream/transcriptions"
    ) as websocket:
        websocket.send_bytes(voice)
        reply = websocket.receive_json()
        assert reply["text"] == "hello"

    assert submitted_audio == [voice]
1289+
def test_stt_stream_pause_flush_still_trims_trailing_silence(self):
    """Test leading-silence preroll does not break pause-flush trimming.

    Sends a loud frame followed by an all-zero frame and checks that the
    emitted result is flagged ``force_emit`` and that the audio handed to
    the (stubbed) transcribe job contains the voice only.
    """
    from fastapi.testclient import TestClient

    stt_main = load_stt_main_module(
        {
            "STT_SAMPLE_RATE": "10",
            "STT_SAMPLE_WIDTH_BYTES": "2",
            # Chunk length far above the audio sent, presumably so that only
            # the pause flush can trigger a job -- TODO confirm against stt_main.
            "STT_STREAM_CHUNK_SECONDS": "10",
            "STT_PAUSE_FLUSH_SECONDS": "0.2",
            "STT_VAD_SPEECH_PAD_MS": "0",
            "STT_EMIT_MIN_CHARS": "1",
            "STT_TRANSCRIBE_WORKERS": "1",
        }
    )
    captured_audio = []
    # Two 2-byte little-endian signed samples at amplitude 10000.
    voice = (10000).to_bytes(2, "little", signed=True) * 2
    # Two zero-valued 2-byte samples. The literal must hold exactly two NUL
    # bytes per sample: stray bytes would make the frame non-silent and
    # break the 2-byte sample width configured above.
    silence = b"\x00\x00" * 2

    def fake_process_transcribe_job(model, job):
        # Record what the pipeline submitted and echo the job's force_emit
        # flag so it can be observed in the websocket response.
        captured_audio.append(job.audio_bytes)
        return stt_main.TranscribeResult(
            sequence=job.sequence,
            transcript="hello",
            has_speech=True,
            detected_language="en",
            force_emit=job.force_emit,
        )

    # Stub out model loading and inference on the freshly loaded module.
    stt_main._get_model = lambda: object()
    stt_main._process_transcribe_job = fake_process_transcribe_job

    with TestClient(stt_main.app) as client:
        with client.websocket_connect("/v1/stream/transcriptions") as websocket:
            websocket.send_bytes(voice)
            websocket.send_bytes(silence)
            result = websocket.receive_json()

    assert result["metrics"]["force_emit"] is True
    assert captured_audio == [voice]
1330+
11351331 def test_stt_transcribe_job_preserves_pause_force_emit (self ):
11361332 """Test pause-flushed jobs remain emit-eligible after inference."""
11371333 stt_main = load_stt_main_module ()
0 commit comments