diff --git a/dotnet/src/Connectors/Connectors.AzureOpenAI.UnitTests/Services/AzureOpenAIChatCompletionServiceTests.cs b/dotnet/src/Connectors/Connectors.AzureOpenAI.UnitTests/Services/AzureOpenAIChatCompletionServiceTests.cs
index c5c55fa6eb58..ed453295da5b 100644
--- a/dotnet/src/Connectors/Connectors.AzureOpenAI.UnitTests/Services/AzureOpenAIChatCompletionServiceTests.cs
+++ b/dotnet/src/Connectors/Connectors.AzureOpenAI.UnitTests/Services/AzureOpenAIChatCompletionServiceTests.cs
@@ -1809,6 +1809,343 @@ public async Task GetStreamingChatMessageContentsWithFunctionCallAndEmptyArgumen
         Assert.Equal(1, functionCallCount);
     }
 
+    // Sample audio content for testing
+    private static readonly byte[] s_sampleAudioBytes = { 0x01, 0x02, 0x03, 0x04 };
+
+    [Fact]
+    public async Task ItSendsAudioContentCorrectlyAsync()
+    {
+        // Arrange
+        var service = new AzureOpenAIChatCompletionService("deployment", "https://endpoint", "api-key", "model-id", this._httpClient);
+
+        using var responseMessage = new HttpResponseMessage(HttpStatusCode.OK)
+        {
+            Content = new StringContent(AzureOpenAITestHelper.GetTestResponse("chat_completion_test_response.json"))
+        };
+        this._messageHandlerStub.ResponsesToReturn.Add(responseMessage);
+
+        var chatHistory = new ChatHistory();
+        chatHistory.AddUserMessage([
+            new TextContent("What's in this audio?"),
+            new AudioContent(s_sampleAudioBytes, "audio/mp3")
+        ]);
+
+        // Act
+        await service.GetChatMessageContentsAsync(chatHistory);
+
+        // Assert
+        var actualRequestContent = Encoding.UTF8.GetString(this._messageHandlerStub.RequestContents[0]!);
+        Assert.NotNull(actualRequestContent);
+        var optionsJson = JsonSerializer.Deserialize<JsonElement>(actualRequestContent);
+
+        var messages = optionsJson.GetProperty("messages");
+        Assert.Equal(1, messages.GetArrayLength());
+
+        var contentItems = messages[0].GetProperty("content");
+        Assert.Equal(2, contentItems.GetArrayLength());
+
+        Assert.Equal("text", contentItems[0].GetProperty("type").GetString());
+        Assert.Equal("What's in this audio?", contentItems[0].GetProperty("text").GetString());
+
+        Assert.Equal("input_audio", contentItems[1].GetProperty("type").GetString());
+
+        // Check for the audio data
+        Assert.True(contentItems[1].TryGetProperty("input_audio", out var audioData));
+        Assert.Equal(JsonValueKind.Object, audioData.ValueKind);
+        Assert.True(audioData.TryGetProperty("data", out var dataProperty));
+        var base64Audio = dataProperty.GetString();
+        Assert.True(audioData.TryGetProperty("format", out var formatProperty));
+        Assert.Equal("mp3", formatProperty.GetString());
+
+        Assert.NotNull(base64Audio);
+        Assert.Equal(Convert.ToBase64String(s_sampleAudioBytes), base64Audio);
+    }
+
+    [Fact]
+    public async Task ItHandlesAudioContentInResponseAsync()
+    {
+        // Arrange
+        var service = new AzureOpenAIChatCompletionService("deployment", "https://endpoint", "api-key", "model-id", this._httpClient);
+
+        // Create a response with audio content
+        var responseJson = """
+            {
+                "model": "gpt-4o",
+                "choices": [
+                    {
+                        "message": {
+                            "role": "assistant",
+                            "content": "This is the text response.",
+                            "audio": {
+                                "data": "AQIDBA=="
+                            }
+                        },
+                        "finish_reason": "stop"
+                    }
+                ],
+                "usage": {
+                    "prompt_tokens": 10,
+                    "completion_tokens": 20,
+                    "total_tokens": 30
+                }
+            }
+            """;
+
+        using var responseMessage = new HttpResponseMessage(HttpStatusCode.OK)
+        {
+            Content = new StringContent(responseJson)
+        };
+        this._messageHandlerStub.ResponsesToReturn.Add(responseMessage);
+
+        var settings = new AzureOpenAIPromptExecutionSettings
+        {
+            Modalities = ChatResponseModalities.Text | ChatResponseModalities.Audio,
+            Audio = new ChatAudioOptions(ChatOutputAudioVoice.Alloy, ChatOutputAudioFormat.Mp3)
+        };
+
+        // Act
+        var result = await service.GetChatMessageContentAsync(new ChatHistory("test"), settings);
+
+        // Assert
+        Assert.NotNull(result);
+        Assert.Equal("This is the text response.", result.Content);
+        Assert.Equal(2, result.Items.Count);
+
+        var textContent = result.Items[0] as TextContent;
+        Assert.NotNull(textContent);
+        Assert.Equal("This is the text response.", textContent.Text);
+
+        var audioContent = result.Items[1] as AudioContent;
+        Assert.NotNull(audioContent);
+        Assert.NotNull(audioContent.Data);
+        Assert.Equal(4, audioContent.Data.Value.Length);
+        Assert.Equal(s_sampleAudioBytes[0], audioContent.Data.Value.Span[0]);
+        Assert.Equal(s_sampleAudioBytes[1], audioContent.Data.Value.Span[1]);
+        Assert.Equal(s_sampleAudioBytes[2], audioContent.Data.Value.Span[2]);
+        Assert.Equal(s_sampleAudioBytes[3], audioContent.Data.Value.Span[3]);
+    }
+
+    [Fact]
+    public async Task ItHandlesAudioContentWithMetadataInResponseAsync()
+    {
+        // Arrange
+        var service = new AzureOpenAIChatCompletionService("deployment", "https://endpoint", "api-key", "model-id", this._httpClient);
+
+        // Create a response with audio content including metadata
+        var responseJson = """
+            {
+                "model": "gpt-4o",
+                "choices": [
+                    {
+                        "message": {
+                            "role": "assistant",
+                            "content": "This is the text response.",
+                            "audio": {
+                                "id": "audio-123456",
+                                "data": "AQIDBA==",
+                                "transcript": "This is the audio transcript.",
+                                "expires_at": 1698765432
+                            }
+                        },
+                        "finish_reason": "stop"
+                    }
+                ],
+                "usage": {
+                    "prompt_tokens": 10,
+                    "completion_tokens": 20,
+                    "total_tokens": 30
+                }
+            }
+            """;
+
+        using var responseMessage = new HttpResponseMessage(HttpStatusCode.OK)
+        {
+            Content = new StringContent(responseJson)
+        };
+        this._messageHandlerStub.ResponsesToReturn.Add(responseMessage);
+
+        var settings = new AzureOpenAIPromptExecutionSettings
+        {
+            Modalities = ChatResponseModalities.Text | ChatResponseModalities.Audio,
+            Audio = new ChatAudioOptions(ChatOutputAudioVoice.Alloy, ChatOutputAudioFormat.Mp3)
+        };
+
+        // Act
+        var result = await service.GetChatMessageContentAsync(new ChatHistory("test"), settings);
+
+        // Assert
+        Assert.NotNull(result);
+        Assert.Equal("This is the text response.", result.Content);
+        Assert.Equal(2, result.Items.Count);
+
+        var textContent = result.Items[0] as TextContent;
+        Assert.NotNull(textContent);
+        Assert.Equal("This is the text response.", textContent.Text);
+
+        var audioContent = result.Items[1] as AudioContent;
+        Assert.NotNull(audioContent);
+        Assert.NotNull(audioContent.Data);
+        Assert.Equal(4, audioContent.Data.Value.Length);
+        Assert.Equal(s_sampleAudioBytes[0], audioContent.Data.Value.Span[0]);
+        Assert.Equal(s_sampleAudioBytes[1], audioContent.Data.Value.Span[1]);
+        Assert.Equal(s_sampleAudioBytes[2], audioContent.Data.Value.Span[2]);
+        Assert.Equal(s_sampleAudioBytes[3], audioContent.Data.Value.Span[3]);
+
+        // Verify audio metadata
+        Assert.NotNull(audioContent.Metadata);
+        Assert.Equal("audio-123456", audioContent.Metadata["Id"]);
+        Assert.Equal("This is the audio transcript.", audioContent.Metadata["Transcript"]);
+        Assert.NotNull(audioContent.Metadata["ExpiresAt"]);
+        // The ExpiresAt value is converted to a DateTime object, so we can't directly compare it to the Unix timestamp
+    }
+
+    [Theory]
+    [MemberData(nameof(ResponseModalitiesData))]
+    public async Task ItCreatesCorrectResponseModalitiesAsync(object responseModalities, string expectedJson)
+    {
+        // Arrange
+        var service = new AzureOpenAIChatCompletionService("deployment", "https://endpoint", "api-key", "model-id", this._httpClient);
+
+        using var responseMessage = new HttpResponseMessage(HttpStatusCode.OK)
+        {
+            Content = new StringContent(AzureOpenAITestHelper.GetTestResponse("chat_completion_test_response.json"))
+        };
+        this._messageHandlerStub.ResponsesToReturn.Add(responseMessage);
+
+        var settings = new AzureOpenAIPromptExecutionSettings
+        {
+            Modalities = responseModalities
+        };
+
+        // Act
+        await service.GetChatMessageContentsAsync(new ChatHistory("test"), settings);
+
+        // Assert
+        var actualRequestContent = Encoding.UTF8.GetString(this._messageHandlerStub.RequestContents[0]!);
+        Assert.NotNull(actualRequestContent);
+        var optionsJson = JsonSerializer.Deserialize<JsonElement>(actualRequestContent);
+        Assert.True(optionsJson.TryGetProperty("modalities", out var property));
+        Assert.Equal(expectedJson, property.GetRawText());
+    }
+
+    [Theory]
+    [MemberData(nameof(ResponseModalitiesData))]
+    public async Task ItCreatesCorrectResponseModalitiesStreamingAsync(object responseModalities, string expectedJson)
+    {
+        // Arrange
+        var service = new AzureOpenAIChatCompletionService("deployment", "https://endpoint", "api-key", "model-id", this._httpClient);
+        using var stream = new MemoryStream(Encoding.UTF8.GetBytes(AzureOpenAITestHelper.GetTestResponse("chat_completion_streaming_test_response.txt")));
+
+        using var responseMessage = new HttpResponseMessage(HttpStatusCode.OK)
+        {
+            Content = new StreamContent(stream)
+        };
+        this._messageHandlerStub.ResponsesToReturn.Add(responseMessage);
+
+        var settings = new AzureOpenAIPromptExecutionSettings
+        {
+            Modalities = responseModalities
+        };
+
+        // Act
+        var asyncEnumerable = service.GetStreamingChatMessageContentsAsync(new ChatHistory("test"), settings);
+        await asyncEnumerable.GetAsyncEnumerator().MoveNextAsync();
+
+        // Assert
+        var actualRequestContent = Encoding.UTF8.GetString(this._messageHandlerStub.RequestContents[0]!);
+        Assert.NotNull(actualRequestContent);
+        var optionsJson = JsonSerializer.Deserialize<JsonElement>(actualRequestContent);
+        Assert.True(optionsJson.TryGetProperty("modalities", out var property));
+        Assert.Equal(expectedJson, property.GetRawText());
+    }
+
+    [Theory]
+    [MemberData(nameof(AudioOptionsData))]
+    public async Task ItCreatesCorrectAudioOptionsAsync(object audioOptions, string expectedJson)
+    {
+        // Arrange
+        var service = new AzureOpenAIChatCompletionService("deployment", "https://endpoint", "api-key", "model-id", this._httpClient);
+
+        using var responseMessage = new HttpResponseMessage(HttpStatusCode.OK)
+        {
+            Content = new StringContent(AzureOpenAITestHelper.GetTestResponse("chat_completion_test_response.json"))
+        };
+        this._messageHandlerStub.ResponsesToReturn.Add(responseMessage);
+
+        var settings = new AzureOpenAIPromptExecutionSettings
+        {
+            Audio = audioOptions
+        };
+
+        // Act
+        await service.GetChatMessageContentsAsync(new ChatHistory("test"), settings);
+
+        // Assert
+        var actualRequestContent = Encoding.UTF8.GetString(this._messageHandlerStub.RequestContents[0]!);
+        Assert.NotNull(actualRequestContent);
+        var optionsJson = JsonSerializer.Deserialize<JsonElement>(actualRequestContent);
+        Assert.True(optionsJson.TryGetProperty("audio", out var property));
+        Assert.Equal(JsonValueKind.Object, property.ValueKind);
+        Assert.Equal(expectedJson, property.GetRawText());
+    }
+
+    [Theory]
+    [MemberData(nameof(AudioOptionsData))]
+    public async Task ItCreatesCorrectAudioOptionsStreamingAsync(object audioOptions, string expectedJson)
+    {
+        // Arrange
+        var service = new AzureOpenAIChatCompletionService("deployment", "https://endpoint", "api-key", "model-id", this._httpClient);
+        using var stream = new MemoryStream(Encoding.UTF8.GetBytes(AzureOpenAITestHelper.GetTestResponse("chat_completion_streaming_test_response.txt")));
+
+        using var responseMessage = new HttpResponseMessage(HttpStatusCode.OK)
+        {
+            Content = new StreamContent(stream)
+        };
+        this._messageHandlerStub.ResponsesToReturn.Add(responseMessage);
+
+        var settings = new AzureOpenAIPromptExecutionSettings
+        {
+            Audio = audioOptions
+        };
+
+        // Act
+        var asyncEnumerable = service.GetStreamingChatMessageContentsAsync(new ChatHistory("test"), settings);
+        await asyncEnumerable.GetAsyncEnumerator().MoveNextAsync();
+
+        // Assert
+        var actualRequestContent = Encoding.UTF8.GetString(this._messageHandlerStub.RequestContents[0]!);
+        Assert.NotNull(actualRequestContent);
+        var optionsJson = JsonSerializer.Deserialize<JsonElement>(actualRequestContent);
+        Assert.True(optionsJson.TryGetProperty("audio", out var property));
+        Assert.Equal(JsonValueKind.Object, property.ValueKind);
+        Assert.Equal(expectedJson, property.GetRawText());
+    }
+
+    public static TheoryData<object, string> ResponseModalitiesData => new()
+    {
+        { ChatResponseModalities.Text, "[\"text\"]" },
+        { ChatResponseModalities.Audio, "[\"audio\"]" },
+        { ChatResponseModalities.Text | ChatResponseModalities.Audio, "[\"text\",\"audio\"]" },
+        { new[] { "text" }, "[\"text\"]" },
+        { new[] { "audio" }, "[\"audio\"]" },
+        { new[] { "text", "audio" }, "[\"text\",\"audio\"]" },
+        { "Text", "[\"text\"]" },
+        { "Audio", "[\"audio\"]" },
+        { JsonSerializer.Deserialize<JsonElement>("\"text\""), "[\"text\"]" },
+        { JsonSerializer.Deserialize<JsonElement>("\"audio\""), "[\"audio\"]" },
+        { JsonSerializer.Deserialize<JsonElement>("[\"text\", \"audio\"]"), "[\"text\",\"audio\"]" },
+    };
+
+    public static TheoryData<object, string> AudioOptionsData => new()
+    {
+        { new ChatAudioOptions(ChatOutputAudioVoice.Alloy, ChatOutputAudioFormat.Mp3), "{\"voice\":\"alloy\",\"format\":\"mp3\"}" },
+        { new ChatAudioOptions(ChatOutputAudioVoice.Echo, ChatOutputAudioFormat.Opus), "{\"voice\":\"echo\",\"format\":\"opus\"}" },
+        { JsonSerializer.Deserialize<JsonElement>("{\"voice\":\"alloy\",\"format\":\"mp3\"}"), "{\"voice\":\"alloy\",\"format\":\"mp3\"}" },
+        { "{\"voice\":\"echo\",\"format\":\"opus\"}", "{\"voice\":\"echo\",\"format\":\"opus\"}" },
+    };
+
     public static TheoryData<string, string> Versions => new()
     {
         { "V2025_03_01_preview", "2025-03-01-preview" },
diff --git a/dotnet/src/Connectors/Connectors.AzureOpenAI/Core/AzureClientCore.ChatCompletion.cs b/dotnet/src/Connectors/Connectors.AzureOpenAI/Core/AzureClientCore.ChatCompletion.cs
index 7e1e2f2c2a79..504982dc1304 100644
--- a/dotnet/src/Connectors/Connectors.AzureOpenAI/Core/AzureClientCore.ChatCompletion.cs
+++ b/dotnet/src/Connectors/Connectors.AzureOpenAI/Core/AzureClientCore.ChatCompletion.cs
@@ -2,7 +2,9 @@
 
 using System;
 using System.ClientModel.Primitives;
+using System.Collections.Generic;
 using System.Diagnostics;
+using System.Text.Json;
 using Azure.AI.OpenAI.Chat;
 using Microsoft.SemanticKernel.ChatCompletion;
 using Microsoft.SemanticKernel.Connectors.OpenAI;
@@ -52,7 +54,16 @@ protected override ChatCompletionOptions CreateChatCompletionOptions(
         options.IncludeLogProbabilities = executionSettings.Logprobs;
         options.StoredOutputEnabled = executionSettings.Store;
         options.ReasoningEffortLevel = GetEffortLevel(executionSettings);
-        options.ResponseModalities = ChatResponseModalities.Default;
+
+        if (executionSettings.Modalities is not null)
+        {
+            options.ResponseModalities = GetResponseModalities(executionSettings);
+        }
+
+        if (executionSettings.Audio is not null)
+        {
+            options.AudioOptions = GetAudioOptions(executionSettings);
+        }
 
         if (azureSettings.SetNewMaxCompletionTokensEnabled)
         {
@@ -91,6 +102,11 @@ protected override ChatCompletionOptions CreateChatCompletionOptions(
 #pragma warning restore AOAI001 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed.
         }
 
+        if (toolCallingConfig.Options?.AllowParallelCalls is not null)
+        {
+            options.AllowParallelToolCalls = toolCallingConfig.Options.AllowParallelCalls;
+        }
+
         if (executionSettings.TokenSelectionBiases is not null)
         {
             foreach (var keyValue in executionSettings.TokenSelectionBiases)
@@ -115,11 +131,135 @@ protected override ChatCompletionOptions CreateChatCompletionOptions(
             }
         }
 
-        if (toolCallingConfig.Options?.AllowParallelCalls is not null)
+        return options;
+    }
+
+    /// <summary>
+    /// Gets the response modalities from the execution settings.
+    /// </summary>
+    /// <param name="executionSettings">The execution settings.</param>
+    /// <returns>The response modalities as a <see cref="ChatResponseModalities"/> flags enum.</returns>
+    private static ChatResponseModalities GetResponseModalities(OpenAIPromptExecutionSettings executionSettings)
+    {
+        static ChatResponseModalities ParseResponseModalitiesEnumerable(IEnumerable<string> responseModalitiesStrings)
         {
-            options.AllowParallelToolCalls = toolCallingConfig.Options.AllowParallelCalls;
+            ChatResponseModalities result = ChatResponseModalities.Default;
+            foreach (var modalityString in responseModalitiesStrings)
+            {
+                if (Enum.TryParse<ChatResponseModalities>(modalityString, true, out var parsedModality))
+                {
+                    result |= parsedModality;
+                }
+                else
+                {
+                    throw new NotSupportedException($"The provided response modality '{modalityString}' is not supported.");
+                }
+            }
+
+            return result;
         }
 
-        return options;
+        if (executionSettings.Modalities is null)
+        {
+            return ChatResponseModalities.Default;
+        }
+
+        if (executionSettings.Modalities is ChatResponseModalities responseModalities)
+        {
+            return responseModalities;
+        }
+
+        if (executionSettings.Modalities is IEnumerable<string> responseModalitiesStrings)
+        {
+            return ParseResponseModalitiesEnumerable(responseModalitiesStrings);
+        }
+
+        if (executionSettings.Modalities is string responseModalitiesString)
+        {
+            if (Enum.TryParse<ChatResponseModalities>(responseModalitiesString, true, out var parsedResponseModalities))
+            {
+                return parsedResponseModalities;
+            }
+
+            throw new NotSupportedException($"The provided response modalities '{responseModalitiesString}' is not supported.");
+        }
+
+        if (executionSettings.Modalities is JsonElement responseModalitiesElement)
+        {
+            if (responseModalitiesElement.ValueKind == JsonValueKind.String)
+            {
+                var modalityString = responseModalitiesElement.GetString();
+                if (Enum.TryParse<ChatResponseModalities>(modalityString, true, out var parsedResponseModalities))
+                {
+                    return parsedResponseModalities;
+                }
+
+                throw new NotSupportedException($"The provided response modalities '{modalityString}' is not supported.");
+            }
+
+            if (responseModalitiesElement.ValueKind == JsonValueKind.Array)
+            {
+                try
+                {
+                    var modalitiesEnumeration = JsonSerializer.Deserialize<IEnumerable<string>>(responseModalitiesElement.GetRawText())!;
+                    return ParseResponseModalitiesEnumerable(modalitiesEnumeration);
+                }
+                catch (JsonException ex)
+                {
+                    throw new NotSupportedException("The provided response modalities JSON array may only contain strings.", ex);
+                }
+            }
+
+            throw new NotSupportedException($"The provided response modalities '{executionSettings.Modalities?.GetType()}' is not supported.");
+        }
+
+        return ChatResponseModalities.Default;
+    }
+
+    /// <summary>
+    /// Gets the audio options from the execution settings.
+    /// </summary>
+    /// <param name="executionSettings">The execution settings.</param>
+    /// <returns>The audio options as a <see cref="ChatAudioOptions"/> object.</returns>
+    private static ChatAudioOptions GetAudioOptions(OpenAIPromptExecutionSettings executionSettings)
+    {
+        if (executionSettings.Audio is ChatAudioOptions audioOptions)
+        {
+            return audioOptions;
+        }
+
+        if (executionSettings.Audio is JsonElement audioOptionsElement)
+        {
+            try
+            {
+                var result = ModelReaderWriter.Read<ChatAudioOptions>(BinaryData.FromString(audioOptionsElement.GetRawText()));
+                if (result != null)
+                {
+                    return result;
+                }
+            }
+            catch (Exception ex)
+            {
+                throw new NotSupportedException("Failed to parse the provided audio options from JSON. Ensure the JSON structure matches the ChatAudioOptions format.", ex);
+            }
+        }
+
+        if (executionSettings.Audio is string audioOptionsString)
+        {
+            try
+            {
+                var result = ModelReaderWriter.Read<ChatAudioOptions>(BinaryData.FromString(audioOptionsString));
+                if (result != null)
+                {
+                    return result;
+                }
+            }
+            catch (Exception ex)
+            {
+                throw new NotSupportedException("Failed to parse the provided audio options from string. Ensure the string is valid JSON that matches the ChatAudioOptions format.", ex);
+            }
+        }
+
+        throw new NotSupportedException($"The provided audio options '{executionSettings.Audio?.GetType()}' is not supported.");
     }
 }