@@ -131,6 +131,21 @@ export default function ChatPage() {
const remoteAudioRef = useRef(null)
const voiceModeStartTimeRef = useRef(null)

// Holds the text of the latest `llm_result` voice event; starts out empty.
const lastSpokenTextRef = useRef("")

/**
 * Switches the first audio track of the WebRTC client's media stream on or
 * off and passes the inverse of the new state to `setIsMuted`.
 *
 * Does nothing when there is no client, no media stream, no audio track, or
 * when the track is already in the requested state.
 *
 * @param {boolean} enabled - Desired `enabled` value for the audio track.
 */
const setMicrophoneEnabled = useCallback((enabled) => {
	const stream = webrtcClientRef.current?.mediaStream
	if (!stream) return

	const tracks = stream.getAudioTracks()
	if (!(tracks.length > 0)) return

	const micTrack = tracks[0]
	// Skip the write (and the state update) when nothing would change.
	if (micTrack.enabled === enabled) return

	micTrack.enabled = enabled
	setIsMuted(!enabled)
}, [])
148+
134149 const fetchInitialMessages = useCallback ( async ( ) => {
135150 setIsLoading ( true )
136151 try {
@@ -632,74 +647,113 @@ export default function ChatPage() {
632647 }
633648
634649 // --- Voice Mode Handlers ---
// Handle of the single pending "unmute the microphone later" timer, if any.
// It lives in a ref so that a later status change or voice event can cancel
// an unmute that is no longer valid instead of letting it fire anyway.
const pendingUnmuteTimerRef = useRef(null)

/**
 * Reacts to a change of the voice connection status.
 *
 * Stores the status, stops the ringtone once the status is no longer
 * "connecting", and updates the status text. On "connected" it plays the
 * connected sound, mutes the microphone and schedules an unmute after a
 * short stabilization delay. Any previously scheduled unmute is cancelled
 * on every status change, so a call that drops during the delay is not
 * flipped back to "Listening...".
 *
 * @param {string} status - "connecting", "connected" or "disconnected";
 *   other values only update the stored status and stop the ringtone.
 */
const handleStatusChange = useCallback(
	(status) => {
		// How long the mic stays muted after the connection is reported.
		const STABILIZATION_DELAY_MS = 4000

		setConnectionStatus(status)

		// A delayed unmute scheduled under the previous status is stale now.
		clearTimeout(pendingUnmuteTimerRef.current)
		pendingUnmuteTimerRef.current = null

		if (status !== "connecting" && ringtoneAudioRef.current) {
			ringtoneAudioRef.current.pause()
			ringtoneAudioRef.current.currentTime = 0
		}

		if (status === "connected") {
			if (connectedAudioRef.current) {
				connectedAudioRef.current.volume = 0.4
				connectedAudioRef.current
					.play()
					.catch((e) => console.error("Error playing sound:", e))
			}
			// Add a delay to allow ICE connection to stabilize
			setVoiceStatusText("Please wait a moment...")
			setMicrophoneEnabled(false) // Mute mic during stabilization
			pendingUnmuteTimerRef.current = setTimeout(() => {
				pendingUnmuteTimerRef.current = null
				setVoiceStatusText("Listening...")
				setMicrophoneEnabled(true) // Unmute after delay
			}, STABILIZATION_DELAY_MS)
		} else if (status === "disconnected") {
			setVoiceStatusText("Click to start call")
		} else if (status === "connecting") {
			setVoiceStatusText("Connecting...")
		}
	},
	[setMicrophoneEnabled]
)

/**
 * Handles an event coming from the voice session.
 *
 * - "stt_result" / "llm_result" with text: appends a user / assistant
 *   message to the displayed messages. The assistant text is also kept in
 *   `lastSpokenTextRef` to estimate how long playback will take.
 * - "status": updates the status text. "thinking", "speaking" and
 *   "transcribing" mute the microphone and cancel any pending unmute.
 *   "listening" schedules the unmute after the estimated playback time.
 * - "error": shows a toast and an error status text.
 *
 * @param {{type: string, text?: string, message?: string, messageId?: string}} event
 */
const handleVoiceEvent = useCallback(
	(event) => {
		// Estimate duration: ~18 chars/sec -> ~55ms/char, plus a small buffer.
		const MS_PER_CHAR = 55
		const PLAYBACK_BUFFER_MS = 250

		const cancelPendingUnmute = () => {
			clearTimeout(pendingUnmuteTimerRef.current)
			pendingUnmuteTimerRef.current = null
		}

		// Used by every status that must keep the mic closed: also drops a
		// stale timer so it cannot re-open the mic mid-turn.
		const muteWithStatus = (text) => {
			cancelPendingUnmute()
			setVoiceStatusText(text)
			setMicrophoneEnabled(false)
		}

		if (event.type === "stt_result" && event.text) {
			setDisplayedMessages((prev) => [
				...prev,
				{
					id: `user_${Date.now()}`,
					role: "user",
					content: event.text,
					timestamp: new Date().toISOString()
				}
			])
		} else if (event.type === "llm_result" && event.text) {
			lastSpokenTextRef.current = event.text // Store the text for duration calculation
			setDisplayedMessages((prev) => [
				...prev,
				{
					id: event.messageId || `assistant_${Date.now()}`,
					role: "assistant",
					content: event.text,
					timestamp: new Date().toISOString()
				}
			])
		} else if (event.type === "status") {
			const { message } = event

			if (message === "thinking") {
				muteWithStatus("Thinking...")
			} else if (message === "speaking") {
				muteWithStatus("Speaking...")
			} else if (message === "listening") {
				// The server sends 'listening' when it's done sending audio,
				// but client-side buffering can cause a delay. We estimate
				// the speaking duration based on the text length from the
				// `llm_result` event to avoid unmuting the mic too early.
				const estimatedDuration =
					lastSpokenTextRef.current.length * MS_PER_CHAR +
					PLAYBACK_BUFFER_MS

				// Only one unmute may be pending at a time.
				cancelPendingUnmute()
				pendingUnmuteTimerRef.current = setTimeout(() => {
					pendingUnmuteTimerRef.current = null
					if (
						webrtcClientRef.current?.peerConnection
							?.connectionState === "connected"
					) {
						setVoiceStatusText("Listening...")
						setMicrophoneEnabled(true)
					}
				}, estimatedDuration)

				// Reset for the next turn
				lastSpokenTextRef.current = ""
			} else if (message === "transcribing") {
				// Mute as soon as transcription starts
				muteWithStatus("Transcribing...")
			} else if (message === "choosing_tools") {
				setVoiceStatusText("Choosing tools...")
			} else if (
				typeof message === "string" &&
				message.startsWith("using_tool_")
			) {
				const toolName = message
					.replace("using_tool_", "")
					.replace("_server", "")
					.replace("_mcp", "")
				setVoiceStatusText(
					`Using ${
						toolName.charAt(0).toUpperCase() + toolName.slice(1)
					}...`
				)
			}
		} else if (event.type === "error") {
			toast.error(`Voice Error: ${event.message}`)
			setVoiceStatusText("Error. Click to retry.")
		}
	},
	[setMicrophoneEnabled]
)
703757
704758 const handleAudioLevel = useCallback ( ( level ) => {
705759 setAudioLevel ( ( prev ) => prev * 0.7 + level * 0.3 )
0 commit comments