Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Azure.AI.OpenAI" Version="2.1.0" />
<PackageReference Include="Azure.Identity" Version="1.14.2" />
<PackageReference Include="Microsoft.Extensions.Configuration.UserSecrets" Version="9.0.7" />
<PackageReference Include="Azure.AI.OpenAI" Version="2.3.0-beta.2" />
<PackageReference Include="Azure.Identity" Version="1.15.0" />
<PackageReference Include="Microsoft.Extensions.Configuration.UserSecrets" Version="9.0.9" />
<PackageReference Include="NAudio" Version="2.2.1" />
</ItemGroup>

Expand Down
104 changes: 68 additions & 36 deletions 03-CoreGenerativeAITechniques/src/Audio-02-RealTimeAudio/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,45 +2,34 @@
using Azure.Identity;
using Microsoft.Extensions.Configuration;
using OpenAI;
using OpenAI.RealtimeConversation;
using OpenAI.Realtime;
using System.ClientModel;

#pragma warning disable OPENAI002
#pragma warning disable AOAI001, OPENAI002

public class Program
{
private static string finishedConversationToolName = "user_wants_to_finish_conversation";

public static async Task Main(string[] args)
{
// First, we create a client according to configured environment variables (see end of file) and then start
// a new conversation session.
RealtimeConversationClient client = GetConfiguredClient();
using RealtimeConversationSession session = await client.StartConversationSessionAsync();

// We'll add a simple function tool that enables the model to interpret user input to figure out when it
// might be a good time to stop the interaction.
ConversationFunctionTool finishConversationTool = new()
{
Name = "user_wants_to_finish_conversation",
Description = "Invoked when the user says goodbye, expresses being finished, or otherwise seems to want to stop the interaction.",
Parameters = BinaryData.FromString("{}")
};
RealtimeClient client = GetConfiguredClient();
var realtimeModel = GetModel();
using RealtimeSession session = await client.StartConversationSessionAsync(realtimeModel);

// Now we configure the session using the tool we created along with transcription options that enable input
// audio transcription with whisper.
await session.ConfigureSessionAsync(new ConversationSessionOptions()
{
Tools = { finishConversationTool },
InputTranscriptionOptions = new()
{
Model = "whisper-1",
},
});
var prompt = "you are a useful chat that helps the user.";
ConversationSessionOptions conversationSessionOptions = CreateConversationSessionOptions(prompt);
await session.ConfigureConversationSessionAsync(conversationSessionOptions);

// For convenience, we'll proactively start playback to the speakers now. Nothing will play until it's enqueued.
SpeakerOutput speakerOutput = new();

// With the session configured, we start processing commands received from the service.
await foreach (ConversationUpdate update in session.ReceiveUpdatesAsync())
await foreach (RealtimeUpdate update in session.ReceiveUpdatesAsync())
{
// session.created is the very first command on a session and lets us know that connection was successful.
if (update is ConversationSessionStartedUpdate)
Expand All @@ -61,7 +50,7 @@ await session.ConfigureSessionAsync(new ConversationSessionOptions()

// input_audio_buffer.speech_started tells us that the beginning of speech was detected in the input audio
// we're sending from the microphone.
if (update is ConversationInputSpeechStartedUpdate speechStartedUpdate)
if (update is InputAudioSpeechStartedUpdate speechStartedUpdate)
{
Console.WriteLine($" <<< Start of speech detected @ {speechStartedUpdate.AudioStartTime}");
// Like any good listener, we can use the cue that the user started speaking as a hint that the app
Expand All @@ -72,22 +61,22 @@ await session.ConfigureSessionAsync(new ConversationSessionOptions()

// input_audio_buffer.speech_stopped tells us that the end of speech was detected in the input audio sent
// from the microphone. It'll automatically tell the model to start generating a response to reply back.
if (update is ConversationInputSpeechFinishedUpdate speechFinishedUpdate)
if (update is InputAudioSpeechFinishedUpdate speechFinishedUpdate)
{
Console.WriteLine($" <<< End of speech detected @ {speechFinishedUpdate.AudioEndTime}");
}

// conversation.item.input_audio_transcription.completed will only arrive if input transcription was
// configured for the session. It provides a written representation of what the user said, which can
// provide good feedback about what the model will use to respond.
if (update is ConversationInputTranscriptionFinishedUpdate transcriptionFinishedUpdate)
if (update is InputAudioTranscriptionFinishedUpdate transcriptionFinishedUpdate)
{
Console.WriteLine($" >>> USER: {transcriptionFinishedUpdate.Transcript}");
}

// Item streaming delta updates provide a combined view into incremental item data, including the audio
// response transcript, function arguments, and audio data.
if (update is ConversationItemStreamingPartDeltaUpdate deltaUpdate)
if (update is OutputDeltaUpdate deltaUpdate)
{
Console.Write(deltaUpdate.AudioTranscript);
Console.Write(deltaUpdate.Text);
Expand All @@ -96,18 +85,18 @@ await session.ConfigureSessionAsync(new ConversationSessionOptions()

// response.output_item.done tells us that a model-generated item with streaming content is completed.
// That's a good signal to provide a visual break and perform final evaluation of tool calls.
// NOTE: this must match the *finished* update; matching the *started* update would run this handler
// when an item begins streaming rather than when it is done, contradicting the intent described above.
if (update is OutputStreamingFinishedUpdate itemFinishedUpdate)
{
    Console.WriteLine();

    // The finish tool is identified purely by name; it takes no arguments.
    if (itemFinishedUpdate.FunctionName == finishedConversationToolName)
    {
        Console.WriteLine($" <<< Finish tool invoked -- ending conversation!");
        break;
    }
}

// error commands, as the name implies, are raised when something goes wrong.
if (update is ConversationErrorUpdate errorUpdate)
if (update is RealtimeErrorUpdate errorUpdate)
{
Console.WriteLine();
Console.WriteLine();
Expand All @@ -118,7 +107,49 @@ await session.ConfigureSessionAsync(new ConversationSessionOptions()
}
}

private static RealtimeConversationClient GetConfiguredClient()
/// <summary>
/// Builds the options used to configure the realtime conversation session: the supplied instructions,
/// the Alloy voice, PCM16 input/output audio, whisper-1 input transcription, and the finish-conversation tool.
/// </summary>
/// <param name="instructions">Instructions (system prompt) assigned to the session options.</param>
/// <returns>A fully populated <see cref="ConversationSessionOptions"/> instance.</returns>
private static ConversationSessionOptions CreateConversationSessionOptions(string instructions)
{
    // A simple function tool that lets the model interpret user input and decide when it
    // might be a good time to stop the interaction. It takes no parameters.
    ConversationFunctionTool endConversationTool = new(finishedConversationToolName)
    {
        Description = "Invoked when the user says goodbye, expresses being finished, or otherwise seems to want to stop the interaction.",
        Parameters = BinaryData.FromString("{}")
    };

    return new ConversationSessionOptions
    {
        Instructions = instructions,
        Voice = ConversationVoice.Alloy,
        InputAudioFormat = RealtimeAudioFormat.Pcm16,
        OutputAudioFormat = RealtimeAudioFormat.Pcm16,
        // Input transcription options must be provided to enable transcribed feedback for input audio.
        InputTranscriptionOptions = new() { Model = "whisper-1" },
        Tools = { endConversationTool },
    };
}

#region Configuration

/// <summary>
/// Resolves the realtime model/deployment name. The AZURE_OPENAI_DEPLOYMENT environment variable wins;
/// otherwise the same key is read from user secrets; if neither yields a non-empty value, "gpt-realtime" is used.
/// </summary>
/// <returns>The non-empty model or deployment name to start the session with.</returns>
private static string GetModel()
{
    string? fromEnvironment = Environment.GetEnvironmentVariable("AZURE_OPENAI_DEPLOYMENT");
    if (!string.IsNullOrEmpty(fromEnvironment))
    {
        return fromEnvironment;
    }

    // Only build the user-secrets configuration when the environment variable is absent or empty.
    var secrets = new ConfigurationBuilder().AddUserSecrets<Program>().Build();
    string? fromSecrets = secrets["AZURE_OPENAI_DEPLOYMENT"];

    if (string.IsNullOrEmpty(fromSecrets))
    {
        return "gpt-realtime";
    }

    return fromSecrets;
}

private static RealtimeClient GetConfiguredClient()
{
string? aoaiEndpoint = Environment.GetEnvironmentVariable("AZURE_OPENAI_ENDPOINT");
string? aoaiUseEntra = Environment.GetEnvironmentVariable("AZURE_OPENAI_USE_ENTRA");
Expand Down Expand Up @@ -163,7 +194,7 @@ private static RealtimeConversationClient GetConfiguredClient()
}
}

private static RealtimeConversationClient GetConfiguredClientForAzureOpenAIWithEntra(
private static RealtimeClient GetConfiguredClientForAzureOpenAIWithEntra(
string aoaiEndpoint,
string? aoaiDeployment)
{
Expand All @@ -174,10 +205,10 @@ private static RealtimeConversationClient GetConfiguredClientForAzureOpenAIWithE
: $" * Using deployment (AZURE_OPENAI_DEPLOYMENT): {aoaiDeployment}");

AzureOpenAIClient aoaiClient = new(new Uri(aoaiEndpoint), new DefaultAzureCredential());
return aoaiClient.GetRealtimeConversationClient(aoaiDeployment);
return aoaiClient.GetRealtimeClient();
}

private static RealtimeConversationClient GetConfiguredClientForAzureOpenAIWithKey(
private static RealtimeClient GetConfiguredClientForAzureOpenAIWithKey(
string aoaiEndpoint,
string? aoaiDeployment,
string aoaiApiKey)
Expand All @@ -189,16 +220,17 @@ private static RealtimeConversationClient GetConfiguredClientForAzureOpenAIWithK
: $" * Using deployment (AZURE_OPENAI_DEPLOYMENT): {aoaiDeployment}");

AzureOpenAIClient aoaiClient = new(new Uri(aoaiEndpoint), new ApiKeyCredential(aoaiApiKey));
return aoaiClient.GetRealtimeConversationClient(aoaiDeployment);
return aoaiClient.GetRealtimeClient();
}

/// <summary>
/// Creates a realtime client that talks to the public OpenAI endpoint using an API key.
/// </summary>
/// <param name="oaiApiKey">The OpenAI API key; only a short prefix of it is echoed to the console.</param>
/// <returns>The <see cref="RealtimeClient"/> obtained from a new <see cref="OpenAIClient"/>.</returns>
private static RealtimeClient GetConfiguredClientForOpenAIWithKey(string oaiApiKey)
{
    string oaiEndpoint = "https://api.openai.com/v1";
    Console.WriteLine($" * Connecting to OpenAI endpoint (OPENAI_ENDPOINT): {oaiEndpoint}");

    // Clamp the preview length so a key shorter than five characters does not throw
    // ArgumentOutOfRangeException from the range operator.
    int previewLength = Math.Min(5, oaiApiKey.Length);
    Console.WriteLine($" * Using API key (OPENAI_API_KEY): {oaiApiKey[..previewLength]}**");

    OpenAIClient oaiClient = new(new ApiKeyCredential(oaiApiKey));
    return oaiClient.GetRealtimeClient();
}
#endregion
}
Loading