1+ #!/usr/bin/env python3
2+ """
3+ Example: Voice-Activity-Detection bot (Silero VAD)
4+
5+ The script joins a Stream video call with a bot that detects when anyone
6+ speaks, using the Silero VAD plugin (`getstream.plugins.vad.silero`).
7+ Each complete speech turn is logged with a timestamp and duration.
8+
9+ Run:
10+ python main.py
11+
Environment: copy `examples/env.example` to `examples/.env` and fill in
`STREAM_API_KEY`, `STREAM_API_SECRET` (and optionally `STREAM_BASE_URL`).
14+ """
15+
16+ from __future__ import annotations
17+
18+ import asyncio
19+ import logging
20+ import os
21+ import time
22+ from typing import Any
23+ from uuid import uuid4
24+
25+ from dotenv import load_dotenv
26+
27+ from examples .utils import create_user , open_browser
28+ from getstream .stream import Stream
29+ from getstream .video import rtc
30+ from getstream .video .rtc .track_util import PcmData
31+ from getstream .plugins .vad .silero import Silero
32+
# ---------------------------------------------------------------------------
# Root logger configuration: INFO level, so join / leave messages and the
# like show up on the console, each prefixed with a timestamp and its level.
# ---------------------------------------------------------------------------
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
40+
41+
async def main() -> None:
    """Create a call, join it with a Silero VAD bot and report detected speech.

    Steps, in order:

    1. Load environment variables from the ``.env`` file one directory above
       this script and build the Stream client from them.
    2. Create a temporary human user and a bot user, create a call and open
       it in the browser for the human.
    3. Join the call as the bot, forward every incoming audio frame to the
       VAD, and print the VAD's ``"audio"`` (complete speech turn) and
       ``"partial"`` (in-progress) events.
    4. When the session ends (Ctrl-C / task cancellation included), close the
       VAD, print a summary of the recorded turns and delete both users.

    The temporary users are deleted even if call setup, the session itself or
    ``vad.close()`` raises, so an aborted run does not leave users behind.
    """

    # Load env from examples/.env
    load_dotenv(os.path.join(os.path.dirname(__file__), "..", ".env"))

    client = Stream.from_env()

    # Keep the IDs free of whitespace.
    # NOTE(review): Stream is understood to restrict user IDs to
    # a-z, 0-9, "@", "_" and "-" — confirm against the Stream API docs.
    human_id = f"user-{uuid4()}"
    bot_id = f"vad-bot-{uuid4()}"

    # One entry per complete speech turn reported by the VAD.
    speech_segments: list[dict[str, Any]] = []

    create_user(client, human_id, "Human")
    create_user(client, bot_id, "VAD Bot")

    # Everything from here on runs under try/finally so the two users created
    # above are always removed again, whatever goes wrong afterwards.
    try:
        token = client.create_token(human_id, expiration=3600)

        call_id = str(uuid4())
        call = client.video.call("default", call_id)
        call.get_or_create(data={"created_by_id": bot_id})

        logging.info("📞 Call ready: %s", call_id)

        open_browser(client.api_key, token, call_id)

        vad = Silero()

        print("\n 🤖 VAD bot starting – speak in the call and watch the console.\n ")

        try:
            async with await rtc.join(call, bot_id) as connection:
                logging.info("🤖 Bot joined call: %s", call_id)

                # Forward audio frames to the VAD engine
                @connection.on("audio")
                async def _on_pcm(pcm: PcmData, user: Any) -> None:
                    await vad.process_audio(pcm, user)

                # Complete speech turns
                @vad.on("audio")  # type: ignore[arg-type]
                async def _on_turn(pcm: PcmData, user: Any) -> None:
                    duration = pcm.duration
                    ts = time.strftime("%H:%M:%S")
                    print(f"[{ts} ] Speech from {user} — {duration:.2f} s")
                    speech_segments.append(
                        {
                            "timestamp": ts,
                            "duration": duration,
                            "user": user,
                        }
                    )

                # Optional: in-progress indicator
                @vad.on("partial")  # type: ignore[arg-type]
                async def _on_partial(_: PcmData, user: Any) -> None:
                    # The line ends in "\r" rather than "\n", so it has to be
                    # flushed explicitly or a line-buffered terminal would
                    # not show it until the next newline is printed.
                    print(f" {user} … speaking", end="\r ", flush=True)

                print("🎧 Listening… press Ctrl-C to stop")
                await connection.wait()

        except (asyncio.CancelledError, KeyboardInterrupt):
            # Interruption is the expected way to stop this example, so it is
            # reported and absorbed instead of surfacing as a traceback.
            print("\n ⏹️ Stopping VAD bot…")
        finally:
            await vad.close()

            # Report whatever was captured, on every exit path.
            print(f"Detected {len(speech_segments)} speech segments")
            total_duration = sum(segment["duration"] for segment in speech_segments)
            print(f"Total speech duration: {total_duration:.2f} seconds")
    finally:
        client.delete_users([human_id, bot_id])
        print("🧹 Cleanup completed")
112+
113+
if __name__ == "__main__":
    # Script entry point: build the coroutine for the bot session and hand
    # it to asyncio, which runs it to completion on a fresh event loop.
    bot_session = main()
    asyncio.run(bot_session)
0 commit comments