@@ -131,6 +131,21 @@ export default function ChatPage() {
131131 const remoteAudioRef = useRef ( null )
132132 const voiceModeStartTimeRef = useRef ( null )
133133
// Text of the latest assistant reply. Written when an `llm_result` voice
// event arrives and read back when estimating how long playback will take.
const lastSpokenTextRef = useRef("")

/**
 * Turns the microphone on or off by toggling the first audio track of the
 * WebRTC client's media stream, and mirrors the result into `isMuted`.
 *
 * Does nothing when there is no client, no media stream, no audio track, or
 * when the track is already in the requested state.
 *
 * @param {boolean} enabled - `true` to open the microphone, `false` to mute it.
 */
const setMicrophoneEnabled = useCallback((enabled) => {
	const stream = webrtcClientRef.current?.mediaStream
	if (!stream) return

	const [micTrack] = stream.getAudioTracks()
	if (!micTrack) return

	// Skip the write (and the state update) when nothing would change.
	if (micTrack.enabled === enabled) return

	micTrack.enabled = enabled
	setIsMuted(!enabled)
}, [])
148+
134149 const fetchInitialMessages = useCallback ( async ( ) => {
135150 setIsLoading ( true )
136151 try {
@@ -597,74 +612,113 @@ export default function ChatPage() {
597612 }
598613
599614 // --- Voice Mode Handlers ---
// Handle of the pending "connection stabilized" timer, kept so that a later
// status change can cancel it before it re-opens the microphone.
const stabilizationTimerRef = useRef(null)

/**
 * Reacts to a connection status reported by the voice client.
 *
 * - Stores the status and stops the ringtone once the status is no longer
 *   "connecting".
 * - On "connected": plays the connected sound, mutes the microphone, and
 *   re-opens it after a fixed stabilization delay.
 * - On "disconnected" / "connecting": updates the status text only.
 *
 * Every call first cancels a stabilization timer left over from an earlier
 * "connected" status, so a call that drops (or reconnects) inside the delay
 * window cannot have its status text overwritten or its microphone re-enabled
 * by a stale callback.
 *
 * @param {string} status - Connection status, e.g. "connecting", "connected",
 *   "disconnected".
 */
const handleStatusChange = useCallback(
	(status) => {
		// A new status always supersedes a pending stabilization timer.
		if (stabilizationTimerRef.current !== null) {
			clearTimeout(stabilizationTimerRef.current)
			stabilizationTimerRef.current = null
		}

		setConnectionStatus(status)
		if (status !== "connecting" && ringtoneAudioRef.current) {
			ringtoneAudioRef.current.pause()
			ringtoneAudioRef.current.currentTime = 0
		}
		if (status === "connected") {
			if (connectedAudioRef.current) {
				connectedAudioRef.current.volume = 0.4
				connectedAudioRef.current
					.play()
					.catch((e) => console.error("Error playing sound:", e))
			}
			// Add a delay to allow ICE connection to stabilize
			setVoiceStatusText("Please wait a moment...")
			setMicrophoneEnabled(false) // Mute mic during stabilization
			stabilizationTimerRef.current = setTimeout(() => {
				stabilizationTimerRef.current = null
				setVoiceStatusText("Listening...")
				setMicrophoneEnabled(true) // Unmute after delay
			}, 4000)
		} else if (status === "disconnected") {
			setVoiceStatusText("Click to start call")
		} else if (status === "connecting") {
			setVoiceStatusText("Connecting...")
		}
	},
	[setMicrophoneEnabled]
)
644+
// Handle of the delayed "re-open the microphone" timer started by a
// `listening` status event, kept so that newer events can cancel it.
const unmuteTimerRef = useRef(null)

/**
 * Handles an event emitted by the voice client.
 *
 * - `stt_result` / `llm_result` with text: appends a user / assistant message
 *   to the displayed messages. The assistant text is also remembered so the
 *   playback duration can be estimated later.
 * - `status`: updates the status text and mutes the microphone while the
 *   assistant is transcribing, thinking or speaking. On `listening` the
 *   microphone is re-opened after an estimated playback delay.
 * - `error`: shows a toast and an error status text.
 *
 * Any known status or error event cancels a still-pending delayed unmute, so
 * an old timer cannot re-open the microphone or overwrite the status text in
 * the middle of a later turn.
 *
 * @param {{type: string, text?: string, message?: string, messageId?: string}} event
 *   Event object; only the fields listed here are read.
 */
const handleVoiceEvent = useCallback(
	(event) => {
		// Drops a scheduled unmute so it cannot fire after the state moved on.
		const cancelPendingUnmute = () => {
			if (unmuteTimerRef.current !== null) {
				clearTimeout(unmuteTimerRef.current)
				unmuteTimerRef.current = null
			}
		}

		if (event.type === "stt_result" && event.text) {
			setDisplayedMessages((prev) => [
				...prev,
				{
					id: `user_${Date.now()}`,
					role: "user",
					content: event.text,
					timestamp: new Date().toISOString()
				}
			])
		} else if (event.type === "llm_result" && event.text) {
			lastSpokenTextRef.current = event.text // Store the text for duration calculation
			setDisplayedMessages((prev) => [
				...prev,
				{
					id: event.messageId || `assistant_${Date.now()}`,
					role: "assistant",
					content: event.text,
					timestamp: new Date().toISOString()
				}
			])
		} else if (event.type === "status") {
			const { message } = event
			if (message === "thinking") {
				cancelPendingUnmute()
				setVoiceStatusText("Thinking...")
				setMicrophoneEnabled(false)
			} else if (message === "speaking") {
				cancelPendingUnmute()
				setVoiceStatusText("Speaking...")
				setMicrophoneEnabled(false)
			} else if (message === "listening") {
				// The server sends 'listening' when it's done sending audio,
				// but client-side buffering can cause a delay. We estimate
				// the speaking duration based on the text length from the
				// `llm_result` event to avoid unmuting the mic too early.
				const textToMeasure = lastSpokenTextRef.current
				// Estimate duration: ~18 chars/sec -> ~55ms/char. Add a smaller buffer.
				const estimatedDuration = textToMeasure.length * 55 + 250 // ms

				// Replace (never stack) a previously scheduled unmute.
				cancelPendingUnmute()
				unmuteTimerRef.current = setTimeout(() => {
					unmuteTimerRef.current = null
					if (
						webrtcClientRef.current?.peerConnection
							?.connectionState === "connected"
					) {
						setVoiceStatusText("Listening...")
						setMicrophoneEnabled(true)
					}
				}, estimatedDuration)

				// Reset for the next turn
				lastSpokenTextRef.current = ""
			} else if (message === "transcribing") {
				cancelPendingUnmute()
				setVoiceStatusText("Transcribing...")
				setMicrophoneEnabled(false) // Mute as soon as transcription starts
			} else if (message === "choosing_tools") {
				cancelPendingUnmute()
				setVoiceStatusText("Choosing tools...")
			} else if (
				typeof message === "string" &&
				message.startsWith("using_tool_")
			) {
				cancelPendingUnmute()
				const toolName = message
					.replace("using_tool_", "")
					.replace("_server", "")
					.replace("_mcp", "")
				setVoiceStatusText(
					`Using ${
						toolName.charAt(0).toUpperCase() + toolName.slice(1)
					}...`
				)
			}
		} else if (event.type === "error") {
			// Keep the error text visible; a stale unmute would overwrite it.
			cancelPendingUnmute()
			toast.error(`Voice Error: ${event.message}`)
			setVoiceStatusText("Error. Click to retry.")
		}
	},
	[setMicrophoneEnabled]
)
668722
669723 const handleAudioLevel = useCallback ( ( level ) => {
670724 setAudioLevel ( ( prev ) => prev * 0.7 + level * 0.3 )
0 commit comments