From 1f91c0c09656f91894cdbec10433daa4c6d4cb5d Mon Sep 17 00:00:00 2001
From: Sulyman Alani
Date: Wed, 18 Jun 2025 10:10:08 +0100
Subject: [PATCH 1/3] Enhance AzureClientCore with response and audio options

Introduce methods to handle response modalities and audio options in
AzureClientCore. Add checks for executionSettings.Modalities and
executionSettings.Audio to dynamically configure options based on user
settings. Implement GetResponseModalities and GetAudioOptions methods to
support various input formats, improving flexibility and robustness.
---
 .../Core/AzureClientCore.ChatCompletion.cs | 121 +++++++++++++++++-
 1 file changed, 117 insertions(+), 4 deletions(-)

diff --git a/dotnet/src/Connectors/Connectors.AzureOpenAI/Core/AzureClientCore.ChatCompletion.cs b/dotnet/src/Connectors/Connectors.AzureOpenAI/Core/AzureClientCore.ChatCompletion.cs
index 7e1e2f2c2a79..a2309042a74d 100644
--- a/dotnet/src/Connectors/Connectors.AzureOpenAI/Core/AzureClientCore.ChatCompletion.cs
+++ b/dotnet/src/Connectors/Connectors.AzureOpenAI/Core/AzureClientCore.ChatCompletion.cs
@@ -2,7 +2,9 @@
 
 using System;
 using System.ClientModel.Primitives;
+using System.Collections.Generic;
 using System.Diagnostics;
+using System.Text.Json;
 using Azure.AI.OpenAI.Chat;
 using Microsoft.SemanticKernel.ChatCompletion;
 using Microsoft.SemanticKernel.Connectors.OpenAI;
@@ -52,7 +54,16 @@ protected override ChatCompletionOptions CreateChatCompletionOptions(
         options.IncludeLogProbabilities = executionSettings.Logprobs;
         options.StoredOutputEnabled = executionSettings.Store;
         options.ReasoningEffortLevel = GetEffortLevel(executionSettings);
-        options.ResponseModalities = ChatResponseModalities.Default;
+
+        if (executionSettings.Modalities is not null)
+        {
+            options.ResponseModalities = GetResponseModalities(executionSettings);
+        }
+
+        if (executionSettings.Audio is not null)
+        {
+            options.AudioOptions = GetAudioOptions(executionSettings);
+        }
 
         if (azureSettings.SetNewMaxCompletionTokensEnabled)
         {
@@ -91,6 +102,11 @@ protected override ChatCompletionOptions CreateChatCompletionOptions(
 #pragma warning restore AOAI001 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed.
         }
 
+        if (toolCallingConfig.Options?.AllowParallelCalls is not null)
+        {
+            options.AllowParallelToolCalls = toolCallingConfig.Options.AllowParallelCalls;
+        }
+
         if (executionSettings.TokenSelectionBiases is not null)
        {
             foreach (var keyValue in executionSettings.TokenSelectionBiases)
@@ -115,11 +131,108 @@ protected override ChatCompletionOptions CreateChatCompletionOptions(
             }
         }
 
-        if (toolCallingConfig.Options?.AllowParallelCalls is not null)
+        return options;
+    }
+
+    /// <summary>
+    /// Gets the response modalities from the execution settings.
+    /// </summary>
+    /// <param name="executionSettings">The execution settings.</param>
+    /// <returns>The response modalities as a <see cref="ChatResponseModalities"/> flags enum.</returns>
+    private static ChatResponseModalities GetResponseModalities(OpenAIPromptExecutionSettings executionSettings)
+    {
+        static ChatResponseModalities ParseResponseModalitiesEnumerable(IEnumerable<string> responseModalitiesStrings)
         {
-            options.AllowParallelToolCalls = toolCallingConfig.Options.AllowParallelCalls;
+            ChatResponseModalities result = ChatResponseModalities.Default;
+            foreach (var modalityString in responseModalitiesStrings)
+            {
+                if (Enum.TryParse<ChatResponseModalities>(modalityString, true, out var parsedModality))
+                {
+                    result |= parsedModality;
+                }
+                else
+                {
+                    throw new NotSupportedException($"The provided response modalities '{modalityString}' is not supported.");
+                }
+            }
+
+            return result;
         }
 
-        return options;
+        if (executionSettings.Modalities is null)
+        {
+            return ChatResponseModalities.Default;
+        }
+
+        if (executionSettings.Modalities is ChatResponseModalities responseModalities)
+        {
+            return responseModalities;
+        }
+
+        if (executionSettings.Modalities is IEnumerable<string> responseModalitiesStrings)
+        {
+            return ParseResponseModalitiesEnumerable(responseModalitiesStrings);
+        }
+
+        if (executionSettings.Modalities is string responseModalitiesString)
+        {
+            if (Enum.TryParse<ChatResponseModalities>(responseModalitiesString, true, out var parsedResponseModalities))
+            {
+                return parsedResponseModalities;
+            }
+
+            throw new NotSupportedException($"The provided response modalities '{responseModalitiesString}' is not supported.");
+        }
+
+        if (executionSettings.Modalities is JsonElement responseModalitiesElement)
+        {
+            if (responseModalitiesElement.ValueKind == JsonValueKind.String &&
+                Enum.TryParse<ChatResponseModalities>(responseModalitiesElement.GetString(), true, out var parsedResponseModalities))
+            {
+                return parsedResponseModalities;
+            }
+
+            if (responseModalitiesElement.ValueKind == JsonValueKind.Array)
+            {
+                var modalitiesEnumeration = JsonSerializer.Deserialize<IEnumerable<string>>(responseModalitiesElement.GetRawText())!;
+                return ParseResponseModalitiesEnumerable(modalitiesEnumeration);
+            }
+
+            throw new NotSupportedException($"The provided response modalities '{executionSettings.Modalities?.GetType()}' is not supported.");
+        }
+
+        return ChatResponseModalities.Default;
+    }
+
+    /// <summary>
+    /// Gets the audio options from the execution settings.
+    /// </summary>
+    /// <param name="executionSettings">The execution settings.</param>
+    /// <returns>The audio options as a <see cref="ChatAudioOptions"/> object.</returns>
+    private static ChatAudioOptions GetAudioOptions(OpenAIPromptExecutionSettings executionSettings)
+    {
+        if (executionSettings.Audio is ChatAudioOptions audioOptions)
+        {
+            return audioOptions;
+        }
+
+        if (executionSettings.Audio is JsonElement audioOptionsElement)
+        {
+            var result = ModelReaderWriter.Read<ChatAudioOptions>(BinaryData.FromString(audioOptionsElement.GetRawText()));
+            if (result != null)
+            {
+                return result;
+            }
+        }
+
+        if (executionSettings.Audio is string audioOptionsString)
+        {
+            var result = ModelReaderWriter.Read<ChatAudioOptions>(BinaryData.FromString(audioOptionsString));
+            if (result != null)
+            {
+                return result;
+            }
+        }
+
+        throw new NotSupportedException($"The provided audio options '{executionSettings.Audio?.GetType()}' is not supported.");
+    }
 }

From d61dcc952b7854cc11d0b8a4bfec4d0e19b70e10 Mon Sep 17 00:00:00 2001
From: Sulyman Alani
Date: Wed, 18 Jun 2025 16:23:01 +0100
Subject: [PATCH 2/3] Add audio content handling tests to AzureOpenAIChat

Implemented tests in `AzureOpenAIChatCompletionServiceTests` to verify the
correct handling of audio content in requests and responses. This includes
checks for sending audio content, processing audio responses, and handling
audio metadata. Introduced new theory data members for validating response
modalities and audio options.
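For orientation before the test diff, a minimal usage sketch of the settings surface this patch series exercises. This is illustrative and not part of the patch; the deployment, endpoint, and key values are placeholders, as in the unit tests below.

    // Sketch only: exercising the new Modalities/Audio settings end to end.
    var service = new AzureOpenAIChatCompletionService("deployment", "https://endpoint", "api-key", "model-id");

    var settings = new AzureOpenAIPromptExecutionSettings
    {
        // Accepted shapes: the ChatResponseModalities flags enum, a string ("Text"),
        // an IEnumerable<string>, or a JsonElement holding a string or string array.
        Modalities = ChatResponseModalities.Text | ChatResponseModalities.Audio,

        // Accepted shapes: a ChatAudioOptions instance, a JSON string, or a JsonElement.
        Audio = new ChatAudioOptions(ChatOutputAudioVoice.Alloy, ChatOutputAudioFormat.Mp3)
    };

    var reply = await service.GetChatMessageContentAsync(new ChatHistory("What's in this audio?"), settings);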
---
 .../AzureOpenAIChatCompletionServiceTests.cs | 337 ++++++++++++++++++
 1 file changed, 337 insertions(+)

diff --git a/dotnet/src/Connectors/Connectors.AzureOpenAI.UnitTests/Services/AzureOpenAIChatCompletionServiceTests.cs b/dotnet/src/Connectors/Connectors.AzureOpenAI.UnitTests/Services/AzureOpenAIChatCompletionServiceTests.cs
index c5c55fa6eb58..ed453295da5b 100644
--- a/dotnet/src/Connectors/Connectors.AzureOpenAI.UnitTests/Services/AzureOpenAIChatCompletionServiceTests.cs
+++ b/dotnet/src/Connectors/Connectors.AzureOpenAI.UnitTests/Services/AzureOpenAIChatCompletionServiceTests.cs
@@ -1809,6 +1809,343 @@ public async Task GetStreamingChatMessageContentsWithFunctionCallAndEmptyArgumen
         Assert.Equal(1, functionCallCount);
     }
 
+    // Sample audio content for testing
+    private static readonly byte[] s_sampleAudioBytes = { 0x01, 0x02, 0x03, 0x04 };
+
+    [Fact]
+    public async Task ItSendsAudioContentCorrectlyAsync()
+    {
+        // Arrange
+        var service = new AzureOpenAIChatCompletionService("deployment", "https://endpoint", "api-key", "model-id", this._httpClient);
+
+        using var responseMessage = new HttpResponseMessage(HttpStatusCode.OK)
+        {
+            Content = new StringContent(AzureOpenAITestHelper.GetTestResponse("chat_completion_test_response.json"))
+        };
+        this._messageHandlerStub.ResponsesToReturn.Add(responseMessage);
+
+        var chatHistory = new ChatHistory();
+        chatHistory.AddUserMessage([
+            new TextContent("What's in this audio?"),
+            new AudioContent(s_sampleAudioBytes, "audio/mp3")
+        ]);
+
+        // Act
+        await service.GetChatMessageContentsAsync(chatHistory);
+
+        // Assert
+        var actualRequestContent = Encoding.UTF8.GetString(this._messageHandlerStub.RequestContents[0]!);
+        Assert.NotNull(actualRequestContent);
+        var optionsJson = JsonSerializer.Deserialize<JsonElement>(actualRequestContent);
+
+        var messages = optionsJson.GetProperty("messages");
+        Assert.Equal(1, messages.GetArrayLength());
+
+        var contentItems = messages[0].GetProperty("content");
+        Assert.Equal(2, contentItems.GetArrayLength());
+
+        Assert.Equal("text", contentItems[0].GetProperty("type").GetString());
+        Assert.Equal("What's in this audio?", contentItems[0].GetProperty("text").GetString());
+
+        Assert.Equal("input_audio", contentItems[1].GetProperty("type").GetString());
+
+        // Check for the audio data
+        Assert.True(contentItems[1].TryGetProperty("input_audio", out var audioData));
+        Assert.Equal(JsonValueKind.Object, audioData.ValueKind);
+        Assert.True(audioData.TryGetProperty("data", out var dataProperty));
+        var base64Audio = dataProperty.GetString();
+        Assert.True(audioData.TryGetProperty("format", out var formatProperty));
+        Assert.Equal("mp3", formatProperty.GetString());
+
+        Assert.NotNull(base64Audio);
+        Assert.Equal(Convert.ToBase64String(s_sampleAudioBytes), base64Audio);
+    }
+
+    [Fact]
+    public async Task ItHandlesAudioContentInResponseAsync()
+    {
+        // Arrange
+        var service = new AzureOpenAIChatCompletionService("deployment", "https://endpoint", "api-key", "model-id", this._httpClient);
+
+        // Create a response with audio content
+        var responseJson = """
+            {
+                "model": "gpt-4o",
+                "choices": [
+                    {
+                        "message": {
+                            "role": "assistant",
+                            "content": "This is the text response.",
+                            "audio": {
+                                "data": "AQIDBA=="
+                            }
+                        },
+                        "finish_reason": "stop"
+                    }
+                ],
+                "usage": {
+                    "prompt_tokens": 10,
+                    "completion_tokens": 20,
+                    "total_tokens": 30
+                }
+            }
+            """;
+
+        using var responseMessage = new HttpResponseMessage(HttpStatusCode.OK)
+        {
+            Content = new StringContent(responseJson)
+        };
+        this._messageHandlerStub.ResponsesToReturn.Add(responseMessage);
+
+        var settings = new AzureOpenAIPromptExecutionSettings
+        {
+            Modalities = ChatResponseModalities.Text | ChatResponseModalities.Audio,
+            Audio = new ChatAudioOptions(ChatOutputAudioVoice.Alloy, ChatOutputAudioFormat.Mp3)
+        };
+
+        // Act
+        var result = await service.GetChatMessageContentAsync(new ChatHistory("test"), settings);
+
+        // Assert
+        Assert.NotNull(result);
+        Assert.Equal("This is the text response.", result.Content);
+        Assert.Equal(2, result.Items.Count);
+
+        var textContent = result.Items[0] as TextContent;
+        Assert.NotNull(textContent);
+        Assert.Equal("This is the text response.", textContent.Text);
+
+        var audioContent = result.Items[1] as AudioContent;
+        Assert.NotNull(audioContent);
+        Assert.NotNull(audioContent.Data);
+        Assert.Equal(4, audioContent.Data.Value.Length);
+        Assert.Equal(s_sampleAudioBytes[0], audioContent.Data.Value.Span[0]);
+        Assert.Equal(s_sampleAudioBytes[1], audioContent.Data.Value.Span[1]);
+        Assert.Equal(s_sampleAudioBytes[2], audioContent.Data.Value.Span[2]);
+        Assert.Equal(s_sampleAudioBytes[3], audioContent.Data.Value.Span[3]);
+    }
+
+    [Fact]
+    public async Task ItHandlesAudioContentWithMetadataInResponseAsync()
+    {
+        // Arrange
+        var service = new AzureOpenAIChatCompletionService("deployment", "https://endpoint", "api-key", "model-id", this._httpClient);
+
+        // Create a response with audio content including metadata
+        var responseJson = """
+            {
+                "model": "gpt-4o",
+                "choices": [
+                    {
+                        "message": {
+                            "role": "assistant",
+                            "content": "This is the text response.",
+                            "audio": {
+                                "id": "audio-123456",
+                                "data": "AQIDBA==",
+                                "transcript": "This is the audio transcript.",
+                                "expires_at": 1698765432
+                            }
+                        },
+                        "finish_reason": "stop"
+                    }
+                ],
+                "usage": {
+                    "prompt_tokens": 10,
+                    "completion_tokens": 20,
+                    "total_tokens": 30
+                }
+            }
+            """;
+
+        using var responseMessage = new HttpResponseMessage(HttpStatusCode.OK)
+        {
+            Content = new StringContent(responseJson)
+        };
+        this._messageHandlerStub.ResponsesToReturn.Add(responseMessage);
+
+        var settings = new AzureOpenAIPromptExecutionSettings
+        {
+            Modalities = ChatResponseModalities.Text | ChatResponseModalities.Audio,
+            Audio = new ChatAudioOptions(ChatOutputAudioVoice.Alloy, ChatOutputAudioFormat.Mp3)
+        };
+
+        // Act
+        var result = await service.GetChatMessageContentAsync(new ChatHistory("test"), settings);
+
+        // Assert
+        Assert.NotNull(result);
+        Assert.Equal("This is the text response.", result.Content);
+        Assert.Equal(2, result.Items.Count);
+
+        var textContent = result.Items[0] as TextContent;
+        Assert.NotNull(textContent);
+        Assert.Equal("This is the text response.", textContent.Text);
+
+        var audioContent = result.Items[1] as AudioContent;
+        Assert.NotNull(audioContent);
+        Assert.NotNull(audioContent.Data);
+        Assert.Equal(4, audioContent.Data.Value.Length);
+        Assert.Equal(s_sampleAudioBytes[0], audioContent.Data.Value.Span[0]);
+        Assert.Equal(s_sampleAudioBytes[1], audioContent.Data.Value.Span[1]);
+        Assert.Equal(s_sampleAudioBytes[2], audioContent.Data.Value.Span[2]);
+        Assert.Equal(s_sampleAudioBytes[3], audioContent.Data.Value.Span[3]);
+
+        // Verify audio metadata
+        Assert.NotNull(audioContent.Metadata);
+        Assert.Equal("audio-123456", audioContent.Metadata["Id"]);
+        Assert.Equal("This is the audio transcript.", audioContent.Metadata["Transcript"]);
+        Assert.NotNull(audioContent.Metadata["ExpiresAt"]);
+        // The ExpiresAt value is converted to a DateTime object, so we can't directly compare it to the Unix timestamp
+    }
+
+    [Theory]
+    [MemberData(nameof(ResponseModalitiesData))]
+    public async Task ItCreatesCorrectResponseModalitiesAsync(object responseModalities, string expectedJson)
+    {
+        // Arrange
+        var service = new AzureOpenAIChatCompletionService("deployment", "https://endpoint", "api-key", "model-id", this._httpClient);
+
+        using var responseMessage = new HttpResponseMessage(HttpStatusCode.OK)
+        {
+            Content = new StringContent(AzureOpenAITestHelper.GetTestResponse("chat_completion_test_response.json"))
+        };
+        this._messageHandlerStub.ResponsesToReturn.Add(responseMessage);
+
+        var settings = new AzureOpenAIPromptExecutionSettings
+        {
+            Modalities = responseModalities
+        };
+
+        // Act
+        await service.GetChatMessageContentsAsync(new ChatHistory("test"), settings);
+
+        // Assert
+        var actualRequestContent = Encoding.UTF8.GetString(this._messageHandlerStub.RequestContents[0]!);
+        Assert.NotNull(actualRequestContent);
+        var optionsJson = JsonSerializer.Deserialize<JsonElement>(actualRequestContent);
+        Assert.True(optionsJson.TryGetProperty("modalities", out var property));
+        Assert.Equal(expectedJson, property.GetRawText());
+    }
+
+    [Theory]
+    [MemberData(nameof(ResponseModalitiesData))]
+    public async Task ItCreatesCorrectResponseModalitiesStreamingAsync(object responseModalities, string expectedJson)
+    {
+        // Arrange
+        var service = new AzureOpenAIChatCompletionService("deployment", "https://endpoint", "api-key", "model-id", this._httpClient);
+        using var stream = new MemoryStream(Encoding.UTF8.GetBytes(AzureOpenAITestHelper.GetTestResponse("chat_completion_streaming_test_response.txt")));
+
+        using var responseMessage = new HttpResponseMessage(HttpStatusCode.OK)
+        {
+            Content = new StreamContent(stream)
+        };
+        this._messageHandlerStub.ResponsesToReturn.Add(responseMessage);
+
+        var settings = new AzureOpenAIPromptExecutionSettings
+        {
+            Modalities = responseModalities
+        };
+
+        // Act
+        var asyncEnumerable = service.GetStreamingChatMessageContentsAsync(new ChatHistory("test"), settings);
+        await asyncEnumerable.GetAsyncEnumerator().MoveNextAsync();
+
+        // Assert
+        var actualRequestContent = Encoding.UTF8.GetString(this._messageHandlerStub.RequestContents[0]!);
+        Assert.NotNull(actualRequestContent);
+        var optionsJson = JsonSerializer.Deserialize<JsonElement>(actualRequestContent);
+        Assert.True(optionsJson.TryGetProperty("modalities", out var property));
+        Assert.Equal(expectedJson, property.GetRawText());
+    }
+
+    [Theory]
+    [MemberData(nameof(AudioOptionsData))]
+    public async Task ItCreatesCorrectAudioOptionsAsync(object audioOptions, string expectedJson)
+    {
+        // Arrange
+        var service = new AzureOpenAIChatCompletionService("deployment", "https://endpoint", "api-key", "model-id", this._httpClient);
+
+        using var responseMessage = new HttpResponseMessage(HttpStatusCode.OK)
+        {
+            Content = new StringContent(AzureOpenAITestHelper.GetTestResponse("chat_completion_test_response.json"))
+        };
+        this._messageHandlerStub.ResponsesToReturn.Add(responseMessage);
+
+        var settings = new AzureOpenAIPromptExecutionSettings
+        {
+            Audio = audioOptions
+        };
+
+        // Act
+        await service.GetChatMessageContentsAsync(new ChatHistory("test"), settings);
+
+        // Assert
+        var actualRequestContent = Encoding.UTF8.GetString(this._messageHandlerStub.RequestContents[0]!);
+        Assert.NotNull(actualRequestContent);
+        var optionsJson = JsonSerializer.Deserialize<JsonElement>(actualRequestContent);
+        Assert.True(optionsJson.TryGetProperty("audio", out var property));
+        Assert.Equal(JsonValueKind.Object, property.ValueKind);
+        Assert.Equal(expectedJson, property.GetRawText());
+    }
+
+    [Theory]
+    [MemberData(nameof(AudioOptionsData))]
+    public async Task ItCreatesCorrectAudioOptionsStreamingAsync(object audioOptions, string expectedJson)
+    {
+        // Arrange
+        var service = new AzureOpenAIChatCompletionService("deployment", "https://endpoint", "api-key", "model-id", this._httpClient);
+        using var stream = new MemoryStream(Encoding.UTF8.GetBytes(AzureOpenAITestHelper.GetTestResponse("chat_completion_streaming_test_response.txt")));
+
+        using var responseMessage = new HttpResponseMessage(HttpStatusCode.OK)
+        {
+            Content = new StreamContent(stream)
+        };
+        this._messageHandlerStub.ResponsesToReturn.Add(responseMessage);
+
+        var settings = new AzureOpenAIPromptExecutionSettings
+        {
+            Audio = audioOptions
+        };
+
+        // Act
+        var asyncEnumerable = service.GetStreamingChatMessageContentsAsync(new ChatHistory("test"), settings);
+        await asyncEnumerable.GetAsyncEnumerator().MoveNextAsync();
+
+        // Assert
+        var actualRequestContent = Encoding.UTF8.GetString(this._messageHandlerStub.RequestContents[0]!);
+        Assert.NotNull(actualRequestContent);
+        var optionsJson = JsonSerializer.Deserialize<JsonElement>(actualRequestContent);
+        Assert.True(optionsJson.TryGetProperty("audio", out var property));
+        Assert.Equal(JsonValueKind.Object, property.ValueKind);
+        Assert.Equal(expectedJson, property.GetRawText());
+    }
+
+    // Theory data for response modalities and audio options
+    public static TheoryData<object, string> ResponseModalitiesData => new()
+    {
+        { ChatResponseModalities.Text, "[\"text\"]" },
+        { ChatResponseModalities.Audio, "[\"audio\"]" },
+        { ChatResponseModalities.Text | ChatResponseModalities.Audio, "[\"text\",\"audio\"]" },
+        { new[] { "text" }, "[\"text\"]" },
+        { new[] { "audio" }, "[\"audio\"]" },
+        { new[] { "text", "audio" }, "[\"text\",\"audio\"]" },
+        { "Text", "[\"text\"]" },
+        { "Audio", "[\"audio\"]" },
+        { JsonSerializer.Deserialize<JsonElement>("\"text\""), "[\"text\"]" },
+        { JsonSerializer.Deserialize<JsonElement>("\"audio\""), "[\"audio\"]" },
+        { JsonSerializer.Deserialize<JsonElement>("[\"text\", \"audio\"]"), "[\"text\",\"audio\"]" },
+    };
+
+    public static TheoryData<object, string> AudioOptionsData => new()
+    {
+        { new ChatAudioOptions(ChatOutputAudioVoice.Alloy, ChatOutputAudioFormat.Mp3), "{\"voice\":\"alloy\",\"format\":\"mp3\"}" },
+        { new ChatAudioOptions(ChatOutputAudioVoice.Echo, ChatOutputAudioFormat.Opus), "{\"voice\":\"echo\",\"format\":\"opus\"}" },
+        { JsonSerializer.Deserialize<JsonElement>("{\"voice\":\"alloy\",\"format\":\"mp3\"}"), "{\"voice\":\"alloy\",\"format\":\"mp3\"}" },
+        { "{\"voice\":\"echo\",\"format\":\"opus\"}", "{\"voice\":\"echo\",\"format\":\"opus\"}" },
+    };
+
     public static TheoryData<string, string> Versions => new()
     {
         { "V2025_03_01_preview", "2025-03-01-preview" },

From aaee01d505c8665ae8327e692000270bb48e513a Mon Sep 17 00:00:00 2001
From: Sulyman Alani
Date: Fri, 20 Jun 2025 18:46:16 +0100
Subject: [PATCH 3/3] Improve error handling for modality and audio parsing

Enhance error handling by introducing try-catch blocks for JSON
deserialization, providing clearer exception messages for unsupported
modalities and invalid audio options. Refactor parsing logic for string
modalities to improve code readability and maintainability.
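To make the new failure mode concrete before the diff, a hedged sketch in the style of the tests above (illustrative, not part of the patch): after this change, a malformed modalities array surfaces as a NotSupportedException instead of an unhandled serializer error.

    // Sketch only: a JSON array containing a non-string element.
    var settings = new AzureOpenAIPromptExecutionSettings
    {
        Modalities = JsonSerializer.Deserialize<JsonElement>("[\"text\", 42]")
    };

    // The JsonException thrown while deserializing 42 into a string is caught and
    // rethrown as NotSupportedException with a message naming the bad input.
    await Assert.ThrowsAsync<NotSupportedException>(
        () => service.GetChatMessageContentsAsync(new ChatHistory("test"), settings));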
---
 .../Core/AzureClientCore.ChatCompletion.cs | 49 ++++++++++++++-----
 1 file changed, 38 insertions(+), 11 deletions(-)

diff --git a/dotnet/src/Connectors/Connectors.AzureOpenAI/Core/AzureClientCore.ChatCompletion.cs b/dotnet/src/Connectors/Connectors.AzureOpenAI/Core/AzureClientCore.ChatCompletion.cs
index a2309042a74d..504982dc1304 100644
--- a/dotnet/src/Connectors/Connectors.AzureOpenAI/Core/AzureClientCore.ChatCompletion.cs
+++ b/dotnet/src/Connectors/Connectors.AzureOpenAI/Core/AzureClientCore.ChatCompletion.cs
@@ -185,16 +185,28 @@ static ChatResponseModalities ParseResponseModalitiesEnumerable(IEnumerable
-            if (responseModalitiesElement.ValueKind == JsonValueKind.String &&
-                Enum.TryParse<ChatResponseModalities>(responseModalitiesElement.GetString(), true, out var parsedResponseModalities))
+            if (responseModalitiesElement.ValueKind == JsonValueKind.String)
             {
-                return parsedResponseModalities;
+                var modalityString = responseModalitiesElement.GetString();
+                if (Enum.TryParse<ChatResponseModalities>(modalityString, true, out var parsedResponseModalities))
+                {
+                    return parsedResponseModalities;
+                }
+
+                throw new NotSupportedException($"The provided response modalities '{modalityString}' is not supported.");
             }
 
             if (responseModalitiesElement.ValueKind == JsonValueKind.Array)
             {
-                var modalitiesEnumeration = JsonSerializer.Deserialize<IEnumerable<string>>(responseModalitiesElement.GetRawText())!;
-                return ParseResponseModalitiesEnumerable(modalitiesEnumeration);
+                try
+                {
+                    var modalitiesEnumeration = JsonSerializer.Deserialize<IEnumerable<string>>(responseModalitiesElement.GetRawText())!;
+                    return ParseResponseModalitiesEnumerable(modalitiesEnumeration);
+                }
+                catch (JsonException ex)
+                {
+                    throw new NotSupportedException("The provided response modalities JSON array may only contain strings.", ex);
+                }
             }
 
             throw new NotSupportedException($"The provided response modalities '{executionSettings.Modalities?.GetType()}' is not supported.");
@@ -203,6 +215,7 @@ static ChatResponseModalities ParseResponseModalitiesEnumerable(IEnumerable
 
+
     /// <summary>
     /// Gets the audio options from the execution settings.
     /// </summary>
@@ -217,19 +230,33 @@ private static ChatAudioOptions GetAudioOptions(OpenAIPromptExecutionSettings ex
 
         if (executionSettings.Audio is JsonElement audioOptionsElement)
         {
-            var result = ModelReaderWriter.Read<ChatAudioOptions>(BinaryData.FromString(audioOptionsElement.GetRawText()));
-            if (result != null)
+            try
             {
-                return result;
+                var result = ModelReaderWriter.Read<ChatAudioOptions>(BinaryData.FromString(audioOptionsElement.GetRawText()));
+                if (result != null)
+                {
+                    return result;
+                }
+            }
+            catch (Exception ex)
+            {
+                throw new NotSupportedException("Failed to parse the provided audio options from JSON. Ensure the JSON structure matches ChatAudioOptions format.", ex);
             }
         }
 
         if (executionSettings.Audio is string audioOptionsString)
         {
-            var result = ModelReaderWriter.Read<ChatAudioOptions>(BinaryData.FromString(audioOptionsString));
-            if (result != null)
+            try
+            {
+                var result = ModelReaderWriter.Read<ChatAudioOptions>(BinaryData.FromString(audioOptionsString));
+                if (result != null)
+                {
+                    return result;
+                }
+            }
+            catch (Exception ex)
             {
-                return result;
+                throw new NotSupportedException("Failed to parse the provided audio options from string. Ensure the string is valid JSON that matches ChatAudioOptions format.", ex);
             }
         }
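Taken together, the parsing paths in GetAudioOptions accept three equivalent shapes for the Audio setting. A closing sketch mirroring the AudioOptionsData theory members (illustrative, not part of the patch):

    // Sketch only: each of these resolves to the same ChatAudioOptions via
    // ModelReaderWriter.Read<ChatAudioOptions>(...) and can be assigned to
    // AzureOpenAIPromptExecutionSettings.Audio.
    object[] equivalentAudioSettings =
    [
        new ChatAudioOptions(ChatOutputAudioVoice.Echo, ChatOutputAudioFormat.Opus),
        JsonSerializer.Deserialize<JsonElement>("{\"voice\":\"echo\",\"format\":\"opus\"}"),
        "{\"voice\":\"echo\",\"format\":\"opus\"}",
    ];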