@@ -107,8 +107,9 @@ public async Task Connect(
107107 JsonOptions = _jsonOptions
108108 } ) ;
109109
110+ var uri = BuildWebsocketUri ( modelSettings . ApiKey , "v1beta" ) ;
110111 await _session . ConnectAsync (
111- uri : new Uri ( $ "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent?key= { modelSettings . ApiKey } " ) ,
112+ uri : uri ,
112113 cancellationToken : CancellationToken . None ) ;
113114
114115 await onModelReady ( ) ;
@@ -148,9 +149,12 @@ private async Task ReceiveMessage(
148149 Func < string , Task > onModelAudioTranscriptDone ,
149150 Func < List < RoleDialogModel > , Task > onModelResponseDone ,
150151 Func < string , Task > onConversationItemCreated ,
151- Func < RoleDialogModel , Task > onInputAudioTranscriptionCompleted ,
152+ Func < RoleDialogModel , Task > onInputAudioTranscriptionDone ,
152153 Func < Task > onInterruptionDetected )
153154 {
155+ var inputTranscription = string . Empty ;
156+ var outputTranscription = string . Empty ;
157+
154158 await foreach ( ChatSessionUpdate update in _session . ReceiveUpdatesAsync ( CancellationToken . None ) )
155159 {
156160 var receivedText = update ? . RawResponse ;
@@ -163,7 +167,6 @@ private async Task ReceiveMessage(
163167 try
164168 {
165169 var response = JsonSerializer . Deserialize < RealtimeServerResponse > ( receivedText , _jsonOptions ) ;
166-
167170 if ( response == null )
168171 {
169172 continue ;
@@ -175,10 +178,29 @@ private async Task ReceiveMessage(
175178 }
176179 else if ( response . ServerContent != null )
177180 {
181+ if ( response . ServerContent . InputTranscription ? . Text != null )
182+ {
183+ outputTranscription = string . Empty ;
184+ inputTranscription += response . ServerContent . InputTranscription . Text ;
185+ }
186+
187+ if ( response . ServerContent . OutputTranscription ? . Text != null )
188+ {
189+ outputTranscription += response . ServerContent . OutputTranscription . Text ;
190+ }
191+
178192 if ( response . ServerContent . ModelTurn != null )
179193 {
180194 _logger . LogInformation ( $ "Model audio delta received.") ;
181195 var parts = response . ServerContent . ModelTurn . Parts ;
196+
197+ if ( ! string . IsNullOrEmpty ( inputTranscription ) )
198+ {
199+ var message = await OnUserAudioTranscriptionCompleted ( conn , inputTranscription ) ;
200+ await onInputAudioTranscriptionDone ( message ) ;
201+ inputTranscription = string . Empty ;
202+ }
203+
182204 if ( ! parts . IsNullOrEmpty ( ) )
183205 {
184206 foreach ( var part in parts )
@@ -197,13 +219,23 @@ private async Task ReceiveMessage(
197219 else if ( response . ServerContent . TurnComplete == true )
198220 {
199221 _logger . LogInformation ( $ "Model turn completed.") ;
222+
223+ if ( ! string . IsNullOrEmpty ( outputTranscription ) )
224+ {
225+ var messages = await OnResponseDone ( conn , outputTranscription , response . UsageMetaData ) ;
226+ await onModelResponseDone ( messages ) ;
227+
228+ // Reset input/output transcription
229+ inputTranscription = string . Empty ;
230+ outputTranscription = string . Empty ;
231+ }
200232 }
201233 }
202234 }
203235 catch ( Exception ex )
204236 {
205- _logger . LogError ( ex , $ "Error when deserializing server response.") ;
206- continue ;
237+ _logger . LogError ( ex , $ "Error when deserializing server response. { ex . Message } ") ;
238+ break ;
207239 }
208240 }
209241
@@ -288,7 +320,7 @@ private Task AttachEvents(MultiModalLiveClient client)
288320 client . Connected += ( sender , e ) =>
289321 {
290322 _logger . LogInformation ( "Google Realtime Client connected." ) ;
291- _onModelReady ( ) ;
323+ _onModelReady ( ) . ConfigureAwait ( false ) . GetAwaiter ( ) . GetResult ( ) ;
292324 } ;
293325
294326 client . Disconnected += ( sender , e ) =>
@@ -301,39 +333,39 @@ private Task AttachEvents(MultiModalLiveClient client)
301333 _logger . LogInformation ( "User message received." ) ;
302334 if ( e . Payload . SetupComplete != null )
303335 {
304- _onConversationItemCreated ( _client . ConnectionId . ToString ( ) ) ;
336+ _onConversationItemCreated ( _client . ConnectionId . ToString ( ) ) . ConfigureAwait ( false ) . GetAwaiter ( ) . GetResult ( ) ;
305337 }
306338
307339 if ( e . Payload . ServerContent != null )
308340 {
309341 if ( e . Payload . ServerContent . TurnComplete == true )
310342 {
311343 var responseDone = await ResponseDone ( _conn , e . Payload . ServerContent ) ;
312- _onModelResponseDone ( responseDone ) ;
344+ _onModelResponseDone ( responseDone ) . ConfigureAwait ( false ) . GetAwaiter ( ) . GetResult ( ) ;
313345 }
314346 }
315347 } ;
316348
317349 client . AudioChunkReceived += ( sender , e ) =>
318350 {
319- _onModelAudioDeltaReceived ( Convert . ToBase64String ( e . Buffer ) , Guid . NewGuid ( ) . ToString ( ) ) ;
351+ _onModelAudioDeltaReceived ( Convert . ToBase64String ( e . Buffer ) , Guid . NewGuid ( ) . ToString ( ) ) . ConfigureAwait ( false ) . GetAwaiter ( ) . GetResult ( ) ;
320352 } ;
321353
322354 client . TextChunkReceived += ( sender , e ) =>
323355 {
324- _onInputAudioTranscriptionDone ( new RoleDialogModel ( AgentRole . Assistant , e . Text ) ) ;
356+ _onInputAudioTranscriptionDone ( new RoleDialogModel ( AgentRole . Assistant , e . Text ) ) . ConfigureAwait ( false ) . GetAwaiter ( ) . GetResult ( ) ;
325357 } ;
326358
327359 client . GenerationInterrupted += ( sender , e ) =>
328360 {
329361 _logger . LogInformation ( "Audio generation interrupted." ) ;
330- _onUserInterrupted ( ) ;
362+ _onUserInterrupted ( ) . ConfigureAwait ( false ) . GetAwaiter ( ) . GetResult ( ) ;
331363 } ;
332364
333365 client . AudioReceiveCompleted += ( sender , e ) =>
334366 {
335367 _logger . LogInformation ( "Audio receive completed." ) ;
336- _onModelAudioResponseDone ( ) ;
368+ _onModelAudioResponseDone ( ) . ConfigureAwait ( false ) . GetAwaiter ( ) . GetResult ( ) ;
337369 } ;
338370
339371 client . ErrorOccurred += ( sender , e ) =>
@@ -345,6 +377,43 @@ private Task AttachEvents(MultiModalLiveClient client)
345377 return Task . CompletedTask ;
346378 }
347379
/// <summary>
/// Wraps <paramref name="text"/> in a single assistant message and, when usage
/// metadata is supplied, passes token statistics to every registered
/// <see cref="IContentGeneratingHook"/>.
/// </summary>
/// <param name="conn">Realtime connection; its <c>CurrentAgentId</c> is stamped on the messages.</param>
/// <param name="text">Assistant text for the returned message; also sent as <c>Prompt</c> in the token stats.</param>
/// <param name="useage">Usage metadata; when null no hook is invoked.</param>
/// <returns>A list containing exactly one assistant message.</returns>
private async Task<List<RoleDialogModel>> OnResponseDone(RealtimeHubConnection conn, string text, RealtimeUsageMetaData? useage)
{
    var reply = new RoleDialogModel(AgentRole.Assistant, text)
    {
        CurrentAgentId = conn.CurrentAgentId,
        MessageId = Guid.NewGuid().ToString(),
        MessageType = MessageTypeName.Plain
    };
    var outputs = new List<RoleDialogModel> { reply };

    // Without usage metadata there is nothing to report to the hooks.
    if (useage == null)
    {
        return outputs;
    }

    // Modality names used to pick the matching entry from the token detail lists.
    var textModality = Modality.TEXT.ToString();
    var audioModality = Modality.AUDIO.ToString();

    foreach (var hook in _services.GetServices<IContentGeneratingHook>())
    {
        // Each hook gets its own message and stats instances.
        var generated = new RoleDialogModel(AgentRole.Assistant, text)
        {
            CurrentAgentId = conn.CurrentAgentId
        };

        var stats = new TokenStatsModel
        {
            Provider = Provider,
            Model = _model,
            Prompt = text,
            TextInputTokens = useage.PromptTokensDetails?.FirstOrDefault(x => x.Modality == textModality)?.TokenCount ?? 0,
            AudioInputTokens = useage.PromptTokensDetails?.FirstOrDefault(x => x.Modality == audioModality)?.TokenCount ?? 0,
            TextOutputTokens = useage.ResponseTokensDetails?.FirstOrDefault(x => x.Modality == textModality)?.TokenCount ?? 0,
            AudioOutputTokens = useage.ResponseTokensDetails?.FirstOrDefault(x => x.Modality == audioModality)?.TokenCount ?? 0
        };

        await hook.AfterGenerated(generated, stats);
    }

    return outputs;
}
416+
348417 private async Task < List < RoleDialogModel > > ResponseDone ( RealtimeHubConnection conn ,
349418 BidiGenerateContentServerContent serverContent )
350419 {
@@ -401,8 +470,6 @@ await hook.AfterGenerated(new RoleDialogModel(AgentRole.Assistant, "response.don
401470
402471 public async Task SendEventToModel ( object message )
403472 {
404- //todo Send Audio Chunks to Model, Botsharp RealTime Implementation seems to be incomplete
405-
406473 if ( _session == null ) return ;
407474
408475 await _session . SendEventToModel ( message ) ;
@@ -419,9 +486,9 @@ public async Task<string> UpdateSession(RealtimeHubConnection conn, bool isInit
419486 var ( prompt , request ) = PrepareOptions ( agent , [ ] ) ;
420487
421488 var config = request . GenerationConfig ;
422- //Output Modality can either be text or audio
423489 if ( config != null )
424490 {
491+ //Output Modality can either be text or audio
425492 config . ResponseModalities = [ Modality . AUDIO ] ;
426493
427494 var words = new List < string > ( ) ;
@@ -467,14 +534,16 @@ await HookEmitter.Emit<IContentGeneratingHook>(_services,
467534 // //Tools = request.Tools?.ToArray(),
468535 //});
469536
470- await SendEventToModel ( new BidiClientPayload
537+ await SendEventToModel ( new RealtimeClientPayload
471538 {
472- Setup = new BidiGenerateContentSetup ( )
539+ Setup = new RealtimeGenerateContentSetup ( )
473540 {
474541 GenerationConfig = config ,
475542 Model = Model . ToModelId ( ) ,
476543 SystemInstruction = request . SystemInstruction ,
477- Tools = [ ]
544+ Tools = [ ] ,
545+ InputAudioTranscription = new ( ) ,
546+ OutputAudioTranscription = new ( )
478547 }
479548 } ) ;
480549
@@ -532,7 +601,7 @@ await SendEventToModel(new BidiClientPayload
532601 }
533602 else
534603 {
535- throw new NotImplementedException ( " ") ;
604+ throw new NotImplementedException ( $ "Unrecognized role { message . Role } . ") ;
536605 }
537606 }
538607
@@ -542,9 +611,9 @@ public async Task<List<RoleDialogModel>> OnResponsedDone(RealtimeHubConnection c
542611 }
543612
544613
/// <summary>
/// Builds a user-role dialog message from the supplied text.
/// </summary>
/// <param name="conn">Realtime connection; not read by this method.</param>
/// <param name="text">Content of the message.</param>
/// <returns>A new <see cref="RoleDialogModel"/> created with <c>AgentRole.User</c> and <paramref name="text"/>.</returns>
public async Task<RoleDialogModel> OnConversationItemCreated(RealtimeHubConnection conn, string text)
{
    var userMessage = new RoleDialogModel(AgentRole.User, text);
    return await Task.FromResult(userMessage);
}
549618
550619 private ( string , GenerateContentRequest ) PrepareOptions ( Agent agent ,
@@ -688,4 +757,18 @@ private string GetPrompt(IEnumerable<string> systemPrompts, IEnumerable<string>
688757
689758 return prompt ;
690759 }
760+
761+
/// <summary>
/// Builds a user-role dialog message from transcribed text and stamps it with
/// the connection's current agent id.
/// </summary>
/// <param name="conn">Realtime connection; supplies <c>CurrentAgentId</c>.</param>
/// <param name="text">Transcribed text used as the message content.</param>
/// <returns>An already-completed task holding the new message.</returns>
private Task<RoleDialogModel> OnUserAudioTranscriptionCompleted(RealtimeHubConnection conn, string text)
{
    var message = new RoleDialogModel(AgentRole.User, text)
    {
        CurrentAgentId = conn.CurrentAgentId
    };

    // Nothing is awaited here, so return a completed task instead of declaring
    // the method async (avoids compiler warning CS1998 and a needless state machine).
    return Task.FromResult(message);
}
769+
/// <summary>
/// Builds the WebSocket endpoint URI for the BidiGenerateContent service.
/// </summary>
/// <param name="apiKey">API key sent as the <c>key</c> query parameter; percent-encoded before use.</param>
/// <param name="version">API version segment inserted into the service path (defaults to "v1alpha").</param>
/// <returns>The <c>wss://</c> URI for the requested API version.</returns>
private Uri BuildWebsocketUri(string apiKey, string version = "v1alpha")
{
    // Percent-encode the key so characters such as '&', '=', '#' or spaces cannot
    // break or truncate the query string. A null key yields an empty value, as before.
    var escapedKey = Uri.EscapeDataString(apiKey ?? string.Empty);

    return new Uri($"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.{version}.GenerativeService.BidiGenerateContent?key={escapedKey}");
}
691774}
0 commit comments