Skip to content

Commit d9fe6da

Browse files
author
Jicheng Lu
committed
temp save
1 parent da9c3c6 commit d9fe6da

5 files changed

Lines changed: 161 additions & 22 deletions

File tree

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
using GenerativeAI.Types;
2+
3+
namespace BotSharp.Plugin.GoogleAI.Models.Realtime;
4+
5+
/// <summary>
/// Envelope for a single client-to-model message on the realtime connection.
/// Every member is nullable, so a payload may carry any subset of the four
/// top-level JSON fields declared here.
/// </summary>
internal class RealtimeClientPayload
{
    /// <summary>Session setup message; JSON field <c>setup</c>.</summary>
    [JsonPropertyName("setup")] public RealtimeGenerateContentSetup? Setup { get; set; }

    /// <summary>Client content message; JSON field <c>clientContent</c>.</summary>
    [JsonPropertyName("clientContent")] public BidiGenerateContentClientContent? ClientContent { get; set; }

    /// <summary>Realtime input message; JSON field <c>realtimeInput</c>.</summary>
    [JsonPropertyName("realtimeInput")] public BidiGenerateContentRealtimeInput? RealtimeInput { get; set; }

    /// <summary>Tool response message; JSON field <c>toolResponse</c>.</summary>
    [JsonPropertyName("toolResponse")] public BidiGenerateContentToolResponse? ToolResponse { get; set; }
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
using GenerativeAI.Types;
2+
3+
namespace BotSharp.Plugin.GoogleAI.Models.Realtime;
4+
5+
/// <summary>
/// Body of the <c>setup</c> message: model id, generation settings, system
/// instruction, tools and the two audio-transcription switches.
/// All members are optional (nullable).
/// </summary>
internal class RealtimeGenerateContentSetup
{
    /// <summary>Model identifier; JSON field <c>model</c>.</summary>
    [JsonPropertyName("model")] public string? Model { get; set; }

    /// <summary>Generation configuration; JSON field <c>generationConfig</c>.</summary>
    [JsonPropertyName("generationConfig")] public GenerationConfig? GenerationConfig { get; set; }

    /// <summary>System instruction content; JSON field <c>systemInstruction</c>.</summary>
    [JsonPropertyName("systemInstruction")] public Content? SystemInstruction { get; set; }

    /// <summary>Tools made available to the model; JSON field <c>tools</c>.</summary>
    [JsonPropertyName("tools")] public Tool[]? Tools { get; set; }

    /// <summary>
    /// Input-audio transcription settings; JSON field <c>inputAudioTranscription</c>.
    /// NOTE(review): presumably a non-null value turns transcription on — confirm against the Live API docs.
    /// </summary>
    [JsonPropertyName("inputAudioTranscription")] public AudioTranscriptionConfig? InputAudioTranscription { get; set; }

    /// <summary>
    /// Output-audio transcription settings; JSON field <c>outputAudioTranscription</c>.
    /// NOTE(review): presumably a non-null value turns transcription on — confirm against the Live API docs.
    /// </summary>
    [JsonPropertyName("outputAudioTranscription")] public AudioTranscriptionConfig? OutputAudioTranscription { get; set; }
}
25+
26+
/// <summary>
/// Member-less configuration object for audio transcription.
/// It has no properties, so it serializes to an empty JSON object.
/// </summary>
internal class AudioTranscriptionConfig
{
}

src/Plugins/BotSharp.Plugin.GoogleAI/Models/Realtime/RealtimeServerResponse.cs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,12 @@ internal class RealtimeGenerateContentServerContent
3030

3131
[JsonPropertyName("modelTurn")]
3232
public Content? ModelTurn { get; set; }
33+
34+
[JsonPropertyName("inputTranscription")]
35+
public RealtimeGenerateContentTranscription? InputTranscription { get; set; }
36+
37+
[JsonPropertyName("outputTranscription")]
38+
public RealtimeGenerateContentTranscription? OutputTranscription { get; set; }
3339
}
3440

3541
internal class RealtimeUsageMetaData
@@ -58,4 +64,10 @@ internal class RealtimeTokenDetail
5864

5965
[JsonPropertyName("tokenCount")]
6066
public int? TokenCount { get; set; }
67+
}
68+
69+
/// <summary>
/// Transcription fragment carried in a server-content message.
/// </summary>
internal class RealtimeGenerateContentTranscription
{
    /// <summary>Transcribed text; JSON field <c>text</c>. Null when the field is absent.</summary>
    [JsonPropertyName("text")] public string? Text { get; set; }
}

src/Plugins/BotSharp.Plugin.GoogleAI/Providers/Realtime/RealTimeCompletionProvider.cs

Lines changed: 104 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -107,8 +107,9 @@ public async Task Connect(
107107
JsonOptions = _jsonOptions
108108
});
109109

110+
var uri = BuildWebsocketUri(modelSettings.ApiKey, "v1beta");
110111
await _session.ConnectAsync(
111-
uri: new Uri($"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent?key={modelSettings.ApiKey}"),
112+
uri: uri,
112113
cancellationToken: CancellationToken.None);
113114

114115
await onModelReady();
@@ -148,9 +149,12 @@ private async Task ReceiveMessage(
148149
Func<string, Task> onModelAudioTranscriptDone,
149150
Func<List<RoleDialogModel>, Task> onModelResponseDone,
150151
Func<string, Task> onConversationItemCreated,
151-
Func<RoleDialogModel, Task> onInputAudioTranscriptionCompleted,
152+
Func<RoleDialogModel, Task> onInputAudioTranscriptionDone,
152153
Func<Task> onInterruptionDetected)
153154
{
155+
var inputTranscription = string.Empty;
156+
var outputTranscription = string.Empty;
157+
154158
await foreach (ChatSessionUpdate update in _session.ReceiveUpdatesAsync(CancellationToken.None))
155159
{
156160
var receivedText = update?.RawResponse;
@@ -163,7 +167,6 @@ private async Task ReceiveMessage(
163167
try
164168
{
165169
var response = JsonSerializer.Deserialize<RealtimeServerResponse>(receivedText, _jsonOptions);
166-
167170
if (response == null)
168171
{
169172
continue;
@@ -175,10 +178,29 @@ private async Task ReceiveMessage(
175178
}
176179
else if (response.ServerContent != null)
177180
{
181+
if (response.ServerContent.InputTranscription?.Text != null)
182+
{
183+
outputTranscription = string.Empty;
184+
inputTranscription += response.ServerContent.InputTranscription.Text;
185+
}
186+
187+
if (response.ServerContent.OutputTranscription?.Text != null)
188+
{
189+
outputTranscription += response.ServerContent.OutputTranscription.Text;
190+
}
191+
178192
if (response.ServerContent.ModelTurn != null)
179193
{
180194
_logger.LogInformation($"Model audio delta received.");
181195
var parts = response.ServerContent.ModelTurn.Parts;
196+
197+
if (!string.IsNullOrEmpty(inputTranscription))
198+
{
199+
var message = await OnUserAudioTranscriptionCompleted(conn, inputTranscription);
200+
await onInputAudioTranscriptionDone(message);
201+
inputTranscription = string.Empty;
202+
}
203+
182204
if (!parts.IsNullOrEmpty())
183205
{
184206
foreach (var part in parts)
@@ -197,13 +219,23 @@ private async Task ReceiveMessage(
197219
else if (response.ServerContent.TurnComplete == true)
198220
{
199221
_logger.LogInformation($"Model turn completed.");
222+
223+
if (!string.IsNullOrEmpty(outputTranscription))
224+
{
225+
var messages = await OnResponseDone(conn, outputTranscription, response.UsageMetaData);
226+
await onModelResponseDone(messages);
227+
228+
// Reset input/output transcription
229+
inputTranscription = string.Empty;
230+
outputTranscription = string.Empty;
231+
}
200232
}
201233
}
202234
}
203235
catch (Exception ex)
204236
{
205-
_logger.LogError(ex, $"Error when deserializing server response.");
206-
continue;
237+
_logger.LogError(ex, $"Error when deserializing server response. {ex.Message}");
238+
break;
207239
}
208240
}
209241

@@ -288,7 +320,7 @@ private Task AttachEvents(MultiModalLiveClient client)
288320
client.Connected += (sender, e) =>
289321
{
290322
_logger.LogInformation("Google Realtime Client connected.");
291-
_onModelReady();
323+
_onModelReady().ConfigureAwait(false).GetAwaiter().GetResult();
292324
};
293325

294326
client.Disconnected += (sender, e) =>
@@ -301,39 +333,39 @@ private Task AttachEvents(MultiModalLiveClient client)
301333
_logger.LogInformation("User message received.");
302334
if (e.Payload.SetupComplete != null)
303335
{
304-
_onConversationItemCreated(_client.ConnectionId.ToString());
336+
_onConversationItemCreated(_client.ConnectionId.ToString()).ConfigureAwait(false).GetAwaiter().GetResult();
305337
}
306338

307339
if (e.Payload.ServerContent != null)
308340
{
309341
if (e.Payload.ServerContent.TurnComplete == true)
310342
{
311343
var responseDone = await ResponseDone(_conn, e.Payload.ServerContent);
312-
_onModelResponseDone(responseDone);
344+
_onModelResponseDone(responseDone).ConfigureAwait(false).GetAwaiter().GetResult();
313345
}
314346
}
315347
};
316348

317349
client.AudioChunkReceived += (sender, e) =>
318350
{
319-
_onModelAudioDeltaReceived(Convert.ToBase64String(e.Buffer), Guid.NewGuid().ToString());
351+
_onModelAudioDeltaReceived(Convert.ToBase64String(e.Buffer), Guid.NewGuid().ToString()).ConfigureAwait(false).GetAwaiter().GetResult();
320352
};
321353

322354
client.TextChunkReceived += (sender, e) =>
323355
{
324-
_onInputAudioTranscriptionDone(new RoleDialogModel(AgentRole.Assistant, e.Text));
356+
_onInputAudioTranscriptionDone(new RoleDialogModel(AgentRole.Assistant, e.Text)).ConfigureAwait(false).GetAwaiter().GetResult();
325357
};
326358

327359
client.GenerationInterrupted += (sender, e) =>
328360
{
329361
_logger.LogInformation("Audio generation interrupted.");
330-
_onUserInterrupted();
362+
_onUserInterrupted().ConfigureAwait(false).GetAwaiter().GetResult();
331363
};
332364

333365
client.AudioReceiveCompleted += (sender, e) =>
334366
{
335367
_logger.LogInformation("Audio receive completed.");
336-
_onModelAudioResponseDone();
368+
_onModelAudioResponseDone().ConfigureAwait(false).GetAwaiter().GetResult();
337369
};
338370

339371
client.ErrorOccurred += (sender, e) =>
@@ -345,6 +377,43 @@ private Task AttachEvents(MultiModalLiveClient client)
345377
return Task.CompletedTask;
346378
}
347379

380+
/// <summary>
/// Builds the assistant message for a finished model turn and, when usage metadata is
/// present, passes token statistics to every registered <c>IContentGeneratingHook</c>.
/// </summary>
/// <param name="conn">Realtime connection; its <c>CurrentAgentId</c> is stamped on the messages.</param>
/// <param name="text">Text of the model response.</param>
/// <param name="useage">Usage metadata; when null no hook is invoked.</param>
/// <returns>A list holding exactly one assistant message with a fresh message id.</returns>
private async Task<List<RoleDialogModel>> OnResponseDone(RealtimeHubConnection conn, string text, RealtimeUsageMetaData? useage)
{
    var reply = new RoleDialogModel(AgentRole.Assistant, text)
    {
        CurrentAgentId = conn.CurrentAgentId,
        MessageId = Guid.NewGuid().ToString(),
        MessageType = MessageTypeName.Plain
    };
    var outputs = new List<RoleDialogModel> { reply };

    // Without usage metadata there is nothing to report to the hooks.
    if (useage == null)
    {
        return outputs;
    }

    var textModality = Modality.TEXT.ToString();
    var audioModality = Modality.AUDIO.ToString();

    foreach (var hook in _services.GetServices<IContentGeneratingHook>())
    {
        // Each hook receives its own message and stats instances.
        var generated = new RoleDialogModel(AgentRole.Assistant, text)
        {
            CurrentAgentId = conn.CurrentAgentId
        };

        // NOTE(review): Prompt is filled with the response text — confirm this is intended.
        var stats = new TokenStatsModel
        {
            Provider = Provider,
            Model = _model,
            Prompt = text,
            TextInputTokens = useage.PromptTokensDetails?.FirstOrDefault(x => x.Modality == textModality)?.TokenCount ?? 0,
            AudioInputTokens = useage.PromptTokensDetails?.FirstOrDefault(x => x.Modality == audioModality)?.TokenCount ?? 0,
            TextOutputTokens = useage.ResponseTokensDetails?.FirstOrDefault(x => x.Modality == textModality)?.TokenCount ?? 0,
            AudioOutputTokens = useage.ResponseTokensDetails?.FirstOrDefault(x => x.Modality == audioModality)?.TokenCount ?? 0
        };

        await hook.AfterGenerated(generated, stats);
    }

    return outputs;
}
416+
348417
private async Task<List<RoleDialogModel>> ResponseDone(RealtimeHubConnection conn,
349418
BidiGenerateContentServerContent serverContent)
350419
{
@@ -401,8 +470,6 @@ await hook.AfterGenerated(new RoleDialogModel(AgentRole.Assistant, "response.don
401470

402471
public async Task SendEventToModel(object message)
403472
{
404-
//todo Send Audio Chunks to Model, Botsharp RealTime Implementation seems to be incomplete
405-
406473
if (_session == null) return;
407474

408475
await _session.SendEventToModel(message);
@@ -419,9 +486,9 @@ public async Task<string> UpdateSession(RealtimeHubConnection conn, bool isInit
419486
var (prompt, request) = PrepareOptions(agent, []);
420487

421488
var config = request.GenerationConfig;
422-
//Output Modality can either be text or audio
423489
if (config != null)
424490
{
491+
//Output Modality can either be text or audio
425492
config.ResponseModalities = [Modality.AUDIO];
426493

427494
var words = new List<string>();
@@ -467,14 +534,16 @@ await HookEmitter.Emit<IContentGeneratingHook>(_services,
467534
// //Tools = request.Tools?.ToArray(),
468535
//});
469536

470-
await SendEventToModel(new BidiClientPayload
537+
await SendEventToModel(new RealtimeClientPayload
471538
{
472-
Setup = new BidiGenerateContentSetup()
539+
Setup = new RealtimeGenerateContentSetup()
473540
{
474541
GenerationConfig = config,
475542
Model = Model.ToModelId(),
476543
SystemInstruction = request.SystemInstruction,
477-
Tools = []
544+
Tools = [],
545+
InputAudioTranscription = new(),
546+
OutputAudioTranscription = new()
478547
}
479548
});
480549

@@ -532,7 +601,7 @@ await SendEventToModel(new BidiClientPayload
532601
}
533602
else
534603
{
535-
throw new NotImplementedException("");
604+
throw new NotImplementedException($"Unrecognized role {message.Role}.");
536605
}
537606
}
538607

@@ -542,9 +611,9 @@ public async Task<List<RoleDialogModel>> OnResponsedDone(RealtimeHubConnection c
542611
}
543612

544613

545-
public async Task<RoleDialogModel> OnConversationItemCreated(RealtimeHubConnection conn, string response)
614+
public async Task<RoleDialogModel> OnConversationItemCreated(RealtimeHubConnection conn, string text)
546615
{
547-
return await Task.FromResult(new RoleDialogModel(AgentRole.User, response));
616+
return await Task.FromResult(new RoleDialogModel(AgentRole.User, text));
548617
}
549618

550619
private (string, GenerateContentRequest) PrepareOptions(Agent agent,
@@ -688,4 +757,18 @@ private string GetPrompt(IEnumerable<string> systemPrompts, IEnumerable<string>
688757

689758
return prompt;
690759
}
760+
761+
762+
/// <summary>
/// Wraps a user audio transcription in a user-role message tagged with the
/// connection's current agent id.
/// </summary>
/// <param name="conn">Realtime connection supplying <c>CurrentAgentId</c>.</param>
/// <param name="text">Transcribed user speech.</param>
/// <returns>An already-completed task holding the user message.</returns>
private Task<RoleDialogModel> OnUserAudioTranscriptionCompleted(RealtimeHubConnection conn, string text)
{
    // No asynchronous work happens here, so the method is not marked async
    // (avoids compiler warning CS1998 and an unnecessary state machine).
    var message = new RoleDialogModel(AgentRole.User, text)
    {
        CurrentAgentId = conn.CurrentAgentId
    };

    return Task.FromResult(message);
}
769+
770+
/// <summary>
/// Builds the WebSocket endpoint for the BidiGenerateContent service.
/// </summary>
/// <param name="apiKey">API key placed in the <c>key</c> query parameter.</param>
/// <param name="version">API version segment of the service name; defaults to <c>v1alpha</c>.</param>
/// <returns>The <c>wss://</c> URI to connect to.</returns>
private static Uri BuildWebsocketUri(string apiKey, string version = "v1alpha")
{
    // Escape the key so characters such as '&', '=' or '#' cannot alter the query string.
    // A null key yields an empty value, matching the previous interpolation behavior.
    var encodedKey = Uri.EscapeDataString(apiKey ?? string.Empty);

    return new Uri($"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.{version}.GenerativeService.BidiGenerateContent?key={encodedKey}");
}
691774
}

src/Plugins/BotSharp.Plugin.OpenAI/Providers/Realtime/RealTimeCompletionProvider.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -402,7 +402,7 @@ public async Task InsertConversationItem(RoleDialogModel message)
402402
}
403403
else
404404
{
405-
throw new NotImplementedException("");
405+
throw new NotImplementedException($"Unrecognized role {message.Role}.");
406406
}
407407
}
408408

0 commit comments

Comments
 (0)