refactor(openai): change voice parameter to string in OpenAI Audio Speech API (#2395)

dev-jonghoonpark · tzolov · commit 14e7033a8e4d · 2025-04-09T21:10:31.000+02:00
This change modifies the voice parameter in OpenAI Audio Speech API from using the
Voice enum directly to using the string value of the enum. This provides more
flexibility for handling voice options, especially for custom voices or when voice
names come from configuration.

- Change voice parameter type from Voice enum to String
- Add overloaded methods to accept both enum and string values
- Update tests and documentation to reflect these changes

Signed-off-by: jonghoon park <dev@jonghoonpark.com>
diff --git a/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioSpeechProperties.java b/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioSpeechProperties.java
@@ -29,6 +29,7 @@
  *
  * @author Ahmed Yousri
  * @author Stefan Vassilev
+ * @author Jonghoon Park
  */
 @ConfigurationProperties(OpenAiAudioSpeechProperties.CONFIG_PREFIX)
 public class OpenAiAudioSpeechProperties extends OpenAiParentProperties {
@@ -39,7 +40,7 @@ public class OpenAiAudioSpeechProperties extends OpenAiParentProperties {
 
 	private static final Float SPEED = 1.0f;
 
-	private static final OpenAiAudioApi.SpeechRequest.Voice VOICE = OpenAiAudioApi.SpeechRequest.Voice.ALLOY;
+	private static final String VOICE = OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue();
 
 	private static final OpenAiAudioApi.SpeechRequest.AudioResponseFormat DEFAULT_RESPONSE_FORMAT = OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3;
 
diff --git a/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/test/java/org/springframework/ai/model/openai/autoconfigure/OpenAiPropertiesTests.java b/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/test/java/org/springframework/ai/model/openai/autoconfigure/OpenAiPropertiesTests.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023-2024 the original author or authors.
+ * Copyright 2023-2025 the original author or authors.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -40,6 +40,7 @@
  *
  * @author Christian Tzolov
  * @author Thomas Vitale
+ * @author Jonghoon Park
  * @since 0.8.0
  */
 public class OpenAiPropertiesTests {
@@ -177,7 +178,7 @@ public void speechProperties() {
 
 				assertThat(speechProperties.getOptions().getModel()).isEqualTo("TTS_1");
 				assertThat(speechProperties.getOptions().getVoice())
-					.isEqualTo(OpenAiAudioApi.SpeechRequest.Voice.ALLOY);
+					.isEqualTo(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue());
 				assertThat(speechProperties.getOptions().getResponseFormat())
 					.isEqualTo(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3);
 				assertThat(speechProperties.getOptions().getSpeed()).isEqualTo(0.75f);
@@ -205,7 +206,7 @@ public void speechPropertiesTest() {
 
 				assertThat(speechProperties.getOptions().getModel()).isEqualTo("TTS_1");
 				assertThat(speechProperties.getOptions().getVoice())
-					.isEqualTo(OpenAiAudioApi.SpeechRequest.Voice.ALLOY);
+					.isEqualTo(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue());
 				assertThat(speechProperties.getOptions().getResponseFormat())
 					.isEqualTo(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3);
 				assertThat(speechProperties.getOptions().getSpeed()).isEqualTo(0.75f);
@@ -237,7 +238,8 @@ public void speechOverrideConnectionPropertiesTest() {
 				assertThat(speechProperties.getBaseUrl()).isEqualTo("TEST_BASE_URL2");
 
 				assertThat(speechProperties.getOptions().getModel()).isEqualTo("TTS_2");
-				assertThat(speechProperties.getOptions().getVoice()).isEqualTo(OpenAiAudioApi.SpeechRequest.Voice.ECHO);
+				assertThat(speechProperties.getOptions().getVoice())
+					.isEqualTo(OpenAiAudioApi.SpeechRequest.Voice.ECHO.getValue());
 				assertThat(speechProperties.getOptions().getResponseFormat())
 					.isEqualTo(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.OPUS);
 				assertThat(speechProperties.getOptions().getSpeed()).isEqualTo(0.5f);
diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechModel.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechModel.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023-2024 the original author or authors.
+ * Copyright 2023-2025 the original author or authors.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -42,6 +42,7 @@
  * @author Ahmed Yousri
  * @author Hyunjoon Choi
  * @author Thomas Vitale
+ * @author Jonghoon Park
  * @see OpenAiAudioApi
  * @since 1.0.0-M1
  */
@@ -81,7 +82,7 @@ public OpenAiAudioSpeechModel(OpenAiAudioApi audioApi) {
 				OpenAiAudioSpeechOptions.builder()
 					.model(OpenAiAudioApi.TtsModel.TTS_1.getValue())
 					.responseFormat(AudioResponseFormat.MP3)
-					.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY)
+					.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
 					.speed(SPEED)
 					.build());
 	}
diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechOptions.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechOptions.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023-2024 the original author or authors.
+ * Copyright 2023-2025 the original author or authors.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,14 +29,15 @@
  * @author Ahmed Yousri
  * @author Hyunjoon Choi
  * @author Ilayaperumal Gopinathan
+ * @author Jonghoon Park
  * @since 1.0.0-M1
  */
 @JsonInclude(JsonInclude.Include.NON_NULL)
 public class OpenAiAudioSpeechOptions implements ModelOptions {
 
 	/**
-	 * ID of the model to use for generating the audio. One of the available TTS models:
-	 * tts-1 or tts-1-hd.
+	 * ID of the model to use for generating the audio. For OpenAI's TTS API, use one of
+	 * the available models: tts-1 or tts-1-hd.
 	 */
 	@JsonProperty("model")
 	private String model;
@@ -48,11 +49,11 @@ public class OpenAiAudioSpeechOptions implements ModelOptions {
 	private String input;
 
 	/**
-	 * The voice to use for synthesis. One of the available voices for the chosen model:
-	 * 'alloy', 'echo', 'fable', 'onyx', 'nova', and 'shimmer'.
+	 * The voice to use for synthesis. For OpenAI's TTS API, one of the available voices
+	 * for the chosen model: 'alloy', 'echo', 'fable', 'onyx', 'nova', and 'shimmer'.
 	 */
 	@JsonProperty("voice")
-	private Voice voice;
+	private String voice;
 
 	/**
 	 * The format of the audio output. Supported formats are mp3, opus, aac, and flac.
@@ -88,14 +89,18 @@ public void setInput(String input) {
 		this.input = input;
 	}
 
-	public Voice getVoice() {
+	public String getVoice() {
 		return this.voice;
 	}
 
-	public void setVoice(Voice voice) {
+	public void setVoice(String voice) {
 		this.voice = voice;
 	}
 
+	public void setVoice(Voice voice) {
+		this.voice = voice.getValue();
+	}
+
 	public AudioResponseFormat getResponseFormat() {
 		return this.responseFormat;
 	}
@@ -197,11 +202,16 @@ public Builder input(String input) {
 			return this;
 		}
 
-		public Builder voice(Voice voice) {
+		public Builder voice(String voice) {
 			this.options.voice = voice;
 			return this;
 		}
 
+		public Builder voice(Voice voice) {
+			this.options.voice = voice.getValue();
+			return this;
+		}
+
 		public Builder responseFormat(AudioResponseFormat responseFormat) {
 			this.options.responseFormat = responseFormat;
 			return this;
diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023-2024 the original author or authors.
+ * Copyright 2023-2025 the original author or authors.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -47,6 +47,7 @@
  *
  * @author Christian Tzolov
  * @author Ilayaperumal Gopinathan
+ * @author Jonghoon Park
  * @since 0.8.1
  */
 public class OpenAiAudioApi {
@@ -330,7 +331,7 @@ public record SpeechRequest(
 	// @formatter:off
 		@JsonProperty("model") String model,
 		@JsonProperty("input") String input,
-		@JsonProperty("voice") Voice voice,
+		@JsonProperty("voice") String voice,
 		@JsonProperty("response_format") AudioResponseFormat responseFormat,
 		@JsonProperty("speed") Float speed) {
 		// @formatter:on
@@ -419,7 +420,7 @@ public static class Builder {
 
 			private String input;
 
-			private Voice voice;
+			private String voice;
 
 			private AudioResponseFormat responseFormat = AudioResponseFormat.MP3;
 
@@ -435,11 +436,16 @@ public Builder input(String input) {
 				return this;
 			}
 
-			public Builder voice(Voice voice) {
+			public Builder voice(String voice) {
 				this.voice = voice;
 				return this;
 			}
 
+			public Builder voice(Voice voice) {
+				this.voice = voice.getValue();
+				return this;
+			}
+
 			public Builder responseFormat(AudioResponseFormat responseFormat) {
 				this.responseFormat = responseFormat;
 				return this;
diff --git a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023-2024 the original author or authors.
+ * Copyright 2023-2025 the original author or authors.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -37,6 +37,7 @@
 
 /**
  * @author Christian Tzolov
+ * @author Jonghoon Park
  */
 @EnabledIfEnvironmentVariable(named = "OPENAI_API_KEY", matches = ".+")
 public class OpenAiAudioApiIT {
@@ -53,7 +54,7 @@ void speechTranscriptionAndTranslation() throws IOException {
 			.createSpeech(SpeechRequest.builder()
 				.model(TtsModel.TTS_1_HD.getValue())
 				.input("Hello, my name is Chris and I love Spring A.I.")
-				.voice(Voice.ONYX)
+				.voice(Voice.ONYX.getValue())
 				.build())
 			.getBody();
 
diff --git a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioModelNoOpApiKeysIT.java b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioModelNoOpApiKeysIT.java
@@ -31,6 +31,7 @@
 
 /**
  * @author Ilayaperumal Gopinathan
+ * @author Jonghoon Park
  */
 @SpringBootTest(classes = OpenAiAudioModelNoOpApiKeysIT.Config.class)
 @EnabledIfEnvironmentVariable(named = "OPENAI_API_KEY", matches = ".+")
@@ -46,7 +47,7 @@ void checkNoOpKey() {
 				.createSpeech(OpenAiAudioApi.SpeechRequest.builder()
 					.model(OpenAiAudioApi.TtsModel.TTS_1_HD.getValue())
 					.input("Hello, my name is Chris and I love Spring A.I.")
-					.voice(OpenAiAudioApi.SpeechRequest.Voice.ONYX)
+					.voice(OpenAiAudioApi.SpeechRequest.Voice.ONYX.getValue())
 					.build())
 				.getBody();
 		}).isInstanceOf(NonTransientAiException.class);
diff --git a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelIT.java b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelIT.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023-2024 the original author or authors.
+ * Copyright 2023-2025 the original author or authors.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -33,6 +33,10 @@
 
 import static org.assertj.core.api.Assertions.assertThat;
 
+/**
+ * @author Ahmed Yousri
+ * @author Jonghoon Park
+ */
 @SpringBootTest(classes = OpenAiTestConfiguration.class)
 @EnabledIfEnvironmentVariable(named = "OPENAI_API_KEY", matches = ".+")
 class OpenAiSpeechModelIT extends AbstractIT {
@@ -57,7 +61,7 @@ void shouldProduceAudioBytesDirectlyFromMessage() {
 	@Test
 	void shouldGenerateNonEmptyMp3AudioFromSpeechPrompt() {
 		OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder()
-			.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY)
+			.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
 			.speed(SPEED)
 			.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
 			.model(OpenAiAudioApi.TtsModel.TTS_1.value)
@@ -93,7 +97,7 @@ void shouldGenerateNonEmptyWavAudioFromSpeechPrompt() {
 	@Test
 	void speechRateLimitTest() {
 		OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder()
-			.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY)
+			.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
 			.speed(SPEED)
 			.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
 			.model(OpenAiAudioApi.TtsModel.TTS_1.value)
@@ -113,7 +117,7 @@ void speechRateLimitTest() {
 	void shouldStreamNonEmptyResponsesForValidSpeechPrompts() {
 
 		OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder()
-			.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY)
+			.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
 			.speed(SPEED)
 			.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
 			.model(OpenAiAudioApi.TtsModel.TTS_1.value)
@@ -135,7 +139,7 @@ void shouldStreamNonEmptyResponsesForValidSpeechPrompts() {
 	@ValueSource(strings = { "alloy", "echo", "fable", "onyx", "nova", "shimmer", "sage", "coral", "ash" })
 	void speechVoicesTest(String voice) {
 		OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder()
-			.voice(OpenAiAudioApi.SpeechRequest.Voice.valueOf(voice.toUpperCase()))
+			.voice(voice)
 			.speed(SPEED)
 			.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
 			.model(OpenAiAudioApi.TtsModel.TTS_1.value)
diff --git a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelWithSpeechResponseMetadataTests.java b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelWithSpeechResponseMetadataTests.java
@@ -46,6 +46,7 @@
 
 /**
  * @author Ahmed Yousri
+ * @author Jonghoon Park
  */
 @RestClientTest(OpenAiSpeechModelWithSpeechResponseMetadataTests.Config.class)
 public class OpenAiSpeechModelWithSpeechResponseMetadataTests {
@@ -71,7 +72,7 @@ void aiResponseContainsImageResponseMetadata() {
 		prepareMock();
 
 		OpenAiAudioSpeechOptions speechOptions = OpenAiAudioSpeechOptions.builder()
-			.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY)
+			.voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue())
 			.speed(SPEED)
 			.responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3)
 			.model(OpenAiAudioApi.TtsModel.TTS_1.value)
diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/speech/openai-speech.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/audio/speech/openai-speech.adoc