Add documentation for OpenAI Transcription

markpollack · markpollack · commit 4fd15b11c52b · 2024-03-12T12:23:25.000-04:00
Fixes #401
diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionClient.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionClient.java
@@ -68,6 +68,11 @@ public class OpenAiAudioTranscriptionClient
 
 	private final OpenAiAudioApi audioApi;
 
+	/**
+	 * Creates a client for the OpenAI Audio Transcription API that uses default
+	 * transcription options and the default retry template.
+	 * @param audioApi The OpenAiAudioApi instance to be used for making API calls.
+	 */
 	public OpenAiAudioTranscriptionClient(OpenAiAudioApi audioApi) {
 		this(audioApi,
 				OpenAiAudioTranscriptionOptions.builder()
@@ -78,6 +83,25 @@ public OpenAiAudioTranscriptionClient(OpenAiAudioApi audioApi) {
 				RetryUtils.DEFAULT_RETRY_TEMPLATE);
 	}
 
+	/**
+	 * Creates a client for the OpenAI Audio Transcription API that uses the given
+	 * options and the default retry template.
+	 * @param audioApi The OpenAiAudioApi instance to be used for making API calls.
+	 * @param options The OpenAiAudioTranscriptionOptions instance for configuring the
+	 * audio transcription.
+	 */
+	public OpenAiAudioTranscriptionClient(OpenAiAudioApi audioApi, OpenAiAudioTranscriptionOptions options) {
+		this(audioApi, options, RetryUtils.DEFAULT_RETRY_TEMPLATE);
+	}
+
+	/**
+	 * Creates a client for the OpenAI Audio Transcription API that uses the given
+	 * options and retry template.
+	 * @param audioApi The OpenAiAudioApi instance to be used for making API calls.
+	 * @param options The OpenAiAudioTranscriptionOptions instance for configuring the
+	 * audio transcription.
+	 * @param retryTemplate The RetryTemplate instance for retrying failed API calls.
+	 */
 	public OpenAiAudioTranscriptionClient(OpenAiAudioApi audioApi, OpenAiAudioTranscriptionOptions options,
 			RetryTemplate retryTemplate) {
 		Assert.notNull(audioApi, "OpenAiAudioApi must not be null");
diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionOptions.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionOptions.java
@@ -38,8 +38,7 @@ public class OpenAiAudioTranscriptionOptions implements ModelOptions {
 	private @JsonProperty("model") String model;
 
 	/**
-	 * An object specifying the format that the model must output. Setting to { "type":
-	 * "json_object" } enables JSON mode, which guarantees the message the model generates is valid JSON.
+	 * The format of the transcript output, in one of these options: json, text, srt, verbose_json, or vtt.
 	 */
 	private @JsonProperty("response_format") TranscriptResponseFormat responseFormat;
 
diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/nav.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/nav.adoc
@@ -2,17 +2,6 @@
 * xref:concepts.adoc[AI Concepts]
 * xref:getting-started.adoc[Getting Started]
 * xref:api/index.adoc[]
-** xref:api/embeddings.adoc[]
-*** xref:api/embeddings/openai-embeddings.adoc[OpenAI]
-*** xref:api/embeddings/ollama-embeddings.adoc[Ollama]
-*** xref:api/embeddings/azure-openai-embeddings.adoc[Azure OpenAI]
-*** xref:api/embeddings/postgresml-embeddings.adoc[PostgresML]
-*** xref:api/embeddings/vertexai-embeddings.adoc[Google VertexAI PaLM2]
-*** xref:api/bedrock.adoc[Amazon Bedrock]
-**** xref:api/embeddings/bedrock-cohere-embedding.adoc[Cohere]
-**** xref:api/embeddings/bedrock-titan-embedding.adoc[Titan]
-*** xref:api/embeddings/onnx.adoc[Transformers (ONNX)]
-*** xref:api/embeddings/mistralai-embeddings.adoc[Mistral AI]
 ** xref:api/chatclient.adoc[]
 *** xref:api/clients/openai-chat.adoc[OpenAI]
 **** xref:api/clients/functions/openai-chat-functions.adoc[Function Calling]
@@ -31,9 +20,22 @@
 ***** xref:api/clients/functions/vertexai-gemini-chat-functions.adoc[Function Calling]
 *** xref:api/clients/mistralai-chat.adoc[Mistral AI]
 **** xref:api/clients/functions/mistralai-chat-functions.adoc[Function Calling]
+** xref:api/embeddings.adoc[]
+*** xref:api/embeddings/openai-embeddings.adoc[OpenAI]
+*** xref:api/embeddings/ollama-embeddings.adoc[Ollama]
+*** xref:api/embeddings/azure-openai-embeddings.adoc[Azure OpenAI]
+*** xref:api/embeddings/postgresml-embeddings.adoc[PostgresML]
+*** xref:api/embeddings/vertexai-embeddings.adoc[Google VertexAI PaLM2]
+*** xref:api/bedrock.adoc[Amazon Bedrock]
+**** xref:api/embeddings/bedrock-cohere-embedding.adoc[Cohere]
+**** xref:api/embeddings/bedrock-titan-embedding.adoc[Titan]
+*** xref:api/embeddings/onnx.adoc[Transformers (ONNX)]
+*** xref:api/embeddings/mistralai-embeddings.adoc[Mistral AI]
 ** xref:api/imageclient.adoc[]
 *** xref:api/clients/image/openai-image.adoc[OpenAI]
 *** xref:api/clients/image/stabilityai-image.adoc[Stability]
+** xref:api/transcriptions.adoc[]
+*** xref:api/transcriptions/openai-transcriptions.adoc[OpenAI]
 ** xref:api/vectordbs.adoc[]
 *** xref:api/vectordbs/azure.adoc[]
 *** xref:api/vectordbs/chroma.adoc[]
diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/clients/bedrock/bedrock-titan.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/clients/bedrock/bedrock-titan.adoc
@@ -1,6 +1,6 @@
 = Titan Chat
 
-link:https://aws.amazon.com/bedrock/titan/[Amazon Titan] foundation models (FMs) provide customers with a breadth of high-performing image, multimodal, and text model choices, via a fully managed API.
+link:https://aws.amazon.com/bedrock/titan/[Amazon Titan] foundation models (FMs) provide customers with a breadth of high-performing image, multimodal embeddings, and text model choices, via a fully managed API.
 Amazon Titan models are created by AWS and pretrained on large datasets, making them powerful, general-purpose models built to support a variety of use cases, while also supporting the responsible use of AI.
 Use them as is or privately customize them with your own data.
 
diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/clients/image/openai-image.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/clients/image/openai-image.adoc
@@ -1,7 +1,7 @@
 = OpenAI Image Generation
 
 
-Spring AI supports ChatGPT's DALL-E, the Image generation model from OpenAI.
+Spring AI supports DALL-E, the Image generation model from OpenAI.
 
 == Prerequisites
 
@@ -41,20 +41,24 @@ TIP: Refer to the xref:getting-started.adoc#dependency-management[Dependency Man
 
 === Image Generation Properties
 
-==== Retry Properties
 
-The prefix `spring.ai.retry` is used as the property prefix that lets you configure the retry mechanism for the OpenAI Image client.
+The prefix `spring.ai.openai.image` is the property prefix that lets you configure the `ImageClient` implementation for OpenAI.
 
 [cols="3,5,1"]
 |====
 | Property | Description | Default
-
-| spring.ai.retry.max-attempts   | Maximum number of retry attempts. |  10
-| spring.ai.retry.backoff.initial-interval | Initial sleep duration for the exponential backoff policy. |  2 sec.
-| spring.ai.retry.backoff.multiplier | Backoff interval multiplier. |  5
-| spring.ai.retry.backoff.max-interval | Maximum backoff duration. |  3 min.
-| spring.ai.retry.on-client-errors | If false, throw a NonTransientAiException, and do not attempt retry for `4xx` client error codes | false
-| spring.ai.retry.exclude-on-http-codes | List of HTTP status codes that should not trigger a retry (e.g. to throw NonTransientAiException). | empty
+| spring.ai.openai.image.enabled | Enable OpenAI image client.  | true
+| spring.ai.openai.image.base-url              | Optionally overrides the spring.ai.openai.base-url to provide an image-specific URL |  -
+| spring.ai.openai.image.api-key               | Optionally overrides the spring.ai.openai.api-key to provide an image-specific API key |  -
+| spring.ai.openai.image.options.n            | The number of images to generate. Must be between 1 and 10. For dall-e-3, only n=1 is supported.  | -
+| spring.ai.openai.image.options.model        | The model to use for image generation.  | OpenAiImageApi.DEFAULT_IMAGE_MODEL
+| spring.ai.openai.image.options.quality      | The quality of the image that will be generated. HD creates images with finer details and greater consistency across the image. This parameter is only supported for dall-e-3. | -
+| spring.ai.openai.image.options.response_format | The format in which the generated images are returned. Must be one of URL or b64_json. | -
+| `spring.ai.openai.image.options.size`       | The size of the generated images. Must be one of 256x256, 512x512, or 1024x1024 for dall-e-2. Must be one of 1024x1024, 1792x1024, or 1024x1792 for dall-e-3 models. | -
+| `spring.ai.openai.image.options.size_width` | The width of the generated images. Must be one of 256, 512, or 1024 for dall-e-2.  | -
+| `spring.ai.openai.image.options.size_height`| The height of the generated images. Must be one of 256, 512, or 1024 for dall-e-2. | -
+| `spring.ai.openai.image.options.style`      | The style of the generated images. Must be one of vivid or natural. Vivid causes the model to lean towards generating hyper-real and dramatic images. Natural causes the model to produce more natural, less hyper-real looking images. This parameter is only supported for dall-e-3. | -
+| `spring.ai.openai.image.options.user`       | A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse. | -
 |====
 
 ==== Connection Properties
@@ -70,32 +74,31 @@ The prefix `spring.ai.openai` is used as the property prefix that lets you conne
 
 ==== Configuration Properties
 
-The prefix `spring.ai.openai.image` is the property prefix that lets you configure the `ImageClient` implementation for OpenAI.
+
+==== Retry Properties
+
+The prefix `spring.ai.retry` is used as the property prefix that lets you configure the retry mechanism for the OpenAI Image client.
 
 [cols="3,5,1"]
 |====
 | Property | Description | Default
-| spring.ai.openai.image.enabled | Enable OpenAI image client.  | true
-| spring.ai.openai.image.base-url              | Optional overrides the spring.ai.openai.base-url to provide chat specific url |  -
-| spring.ai.openai.image.api-key               | Optional overrides the spring.ai.openai.api-key to provide chat specific api-key |  -
-| spring.ai.openai.image.options.n            | The number of images to generate. Must be between 1 and 10. For dall-e-3, only n=1 is supported.  | -
-| spring.ai.openai.image.options.model        | The model to use for image generation.  | OpenAiImageApi.DEFAULT_IMAGE_MODEL
-| spring.ai.openai.image.options.quality      | The quality of the image that will be generated. HD creates images with finer details and greater consistency across the image. This parameter is only supported for dall-e-3. | -
-| spring.ai.openai.image.options.response_format | The format in which the generated images are returned. Must be one of URL or b64_json. | -
-| `spring.ai.openai.image.options.size`       | The size of the generated images. Must be one of 256x256, 512x512, or 1024x1024 for dall-e-2. Must be one of 1024x1024, 1792x1024, or 1024x1792 for dall-e-3 models. | -
-| `spring.ai.openai.image.options.size_width` | The width of the generated images. Must be one of 256, 512, or 1024 for dall-e-2.  | -
-| `spring.ai.openai.image.options.size_height`| The height of the generated images. Must be one of 256, 512, or 1024 for dall-e-2. | -
-| `spring.ai.openai.image.options.style`      | The style of the generated images. Must be one of vivid or natural. Vivid causes the model to lean towards generating hyper-real and dramatic images. Natural causes the model to produce more natural, less hyper-real looking images. This parameter is only supported for dall-e-3. | -
-| `spring.ai.openai.image.options.user`       | A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse. | -
+
+| spring.ai.retry.max-attempts   | Maximum number of retry attempts. |  10
+| spring.ai.retry.backoff.initial-interval | Initial sleep duration for the exponential backoff policy. |  2 sec.
+| spring.ai.retry.backoff.multiplier | Backoff interval multiplier. |  5
+| spring.ai.retry.backoff.max-interval | Maximum backoff duration. |  3 min.
+| spring.ai.retry.on-client-errors | If false, throw a NonTransientAiException, and do not attempt retry for `4xx` client error codes | false
+| spring.ai.retry.exclude-on-http-codes | List of HTTP status codes that should not trigger a retry (e.g. to throw NonTransientAiException). | empty
 |====
 
-=== Image Options [[image-options]]
+
+== Runtime Options [[image-options]]
 
 The https://github.com/spring-projects/spring-ai/blob/main/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiImageOptions.java[OpenAiImageOptions.java] provides model configurations, such as the model to use, the quality, the size, etc.
 
 On start-up, the default options can be configured with the `OpenAiImageClient(OpenAiImageApi openAiImageApi)` constructor and the `withDefaultOptions(OpenAiImageOptions defaultOptions)` method.  Alternatively, use the `spring.ai.openai.image.options.*` properties described previously.
 
-At run-time you can override the default options by adding new, request specific, options to the `ImagePrompt` call.
+At runtime, you can override the default options by adding new, request specific, options to the `ImagePrompt` call.
 For example to override the OpenAI specific options such as quality and the number of images to create, use the following code example:
 
 [source,java]
diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/clients/image/stabilityai-image.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/clients/image/stabilityai-image.adoc
@@ -80,7 +80,7 @@ The https://github.com/spring-projects/spring-ai/blob/main/models/spring-ai-stab
 
 On start-up, the default options can be configured with the `StabilityAiImageClient(StabilityAiApi stabilityAiApi, StabilityAiImageOptions options)` constructor. Alternatively, use the `spring.ai.openai.image.options.*` properties described previously.
 
-At run-time you can override the default options by adding new, request specific, options to the `ImagePrompt` call.
+At runtime, you can override the default options by adding new, request specific, options to the `ImagePrompt` call.
 For example to override the Stability AI specific options such as quality and the number of images to create, use the following code example:
 
 [source,java]
diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/clients/openai-chat.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/clients/openai-chat.adoc
@@ -262,7 +262,7 @@ Flux<ChatCompletionChunk> streamResponse = openAiApi.chatCompletionStream(
 
 Follow the https://github.com/spring-projects/spring-ai/blob/main/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiApi.java[OpenAiApi.java]'s JavaDoc for further information.
 
-==== OpenAiApi Samples
+== Example Code
 * The link:https://github.com/spring-projects/spring-ai/blob/main/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/chat/api/OpenAiApiIT.java[OpenAiApiIT.java] test provides some general examples how to use the lightweight library.
 
 * The link:https://github.com/spring-projects/spring-ai/blob/main/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/chat/api/tool/OpenAiApiToolFunctionCallIT.java[OpenAiApiToolFunctionCallIT.java] test shows how to use the low-level API to call tool functions.
diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/clients/vertexai-gemini-chat.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/clients/vertexai-gemini-chat.adoc
@@ -1,7 +1,5 @@
 = VertexAI Gemini Chat
 
-
-
 The https://cloud.google.com/vertex-ai/docs/generative-ai/multimodal/overview[Vertex AI Gemini API] allows developers to build generative AI applications using the Gemini model.
 The Vertex AI Gemini API supports multimodal prompts as input and output text or code.
 A multimodal model is a model that is capable of processing information from multiple modalities, including images, videos, and text. For example, you can send the model a photo of a plate of cookies and ask it to give you a recipe for those cookies.
@@ -79,13 +77,13 @@ The prefix `spring.ai.vertex.ai.gemini.chat` is the property prefix that lets yo
 
 TIP: All properties prefixed with `spring.ai.vertex.ai.gemini.chat.options` can be overridden at runtime by adding a request specific <<chat-options>> to the `Prompt` call.
 
-=== Chat Options [[chat-options]]
+== Runtime Options [[chat-options]]
 
 The https://github.com/spring-projects/spring-ai/blob/main/models/spring-ai-vertex-ai-gemini/src/main/java/org/springframework/ai/vertexai/gemini/VertexAiGeminiChatOptions.java[VertexAiGeminiChatOptions.java] provides model configurations, such as the temperature, the topK, etc.
 
 On start-up, the default options can be configured with the `VertexAiGeminiChatClient(api, options)` constructor or the `spring.ai.vertex.ai.chat.options.*` properties.
 
-At run-time you can override the default options by adding new, request specific, options to the `Prompt` call.
+At runtime, you can override the default options by adding new, request specific, options to the `Prompt` call.
 For example to override the default temperature for a specific request:
 
 [source,java]
@@ -101,21 +99,21 @@ ChatResponse response = chatClient.call(
 
 TIP: In addition to the model specific `VertexAiChatPaLm2Options` you can use a portable https://github.com/spring-projects/spring-ai/blob/main/spring-ai-core/src/main/java/org/springframework/ai/chat/ChatOptions.java[ChatOptions] instance, created with the https://github.com/spring-projects/spring-ai/blob/main/spring-ai-core/src/main/java/org/springframework/ai/chat/ChatOptionsBuilder.java[ChatOptionsBuilder#builder()].
 
-=== Function Calling
+== Function Calling
 
 You can register custom Java functions with the VertexAiGeminiChatClient and have the Gemini Pro model intelligently choose to output a JSON object containing arguments to call one or many of the registered functions.
 This is a powerful technique to connect the LLM capabilities with external tools and APIs.
 Read more about xref:api/clients/functions/vertexai-gemini-chat-functions.adoc[Vertex AI Gemini Function Calling].
 
-=== Multimodal Example
+== Multimodal
 Multimodality refers to a model's ability to simultaneously understand and process information from various sources, including text, images, audio, and other data formats. This paradigm represents a significant advancement in AI models.
 
-Google's Gemini AI models support this capability by comprehending and integrating text, code, audio, images, and video. For more details, refer to the blog post [Introducing Gemini](https://blog.google/technology/ai/google-gemini-ai/#introducing-gemini).
+Google's Gemini AI models support this capability by comprehending and integrating text, code, audio, images, and video. For more details, refer to the blog post https://blog.google/technology/ai/google-gemini-ai/#introducing-gemini[Introducing Gemini].
 
 Spring AI's `Message` interface supports multimodal AI models by introducing the Media type.
 This type contains data and information about media attachments in messages, using Spring's `org.springframework.util.MimeType` and a `java.lang.Object` for the raw media data.
 
-Below is a simple code example extracted from [VertexAiGeminiChatClientIT.java](https://github.com/spring-projects/spring-ai/blob/main/models/spring-ai-vertex-ai-gemini/src/test/java/org/springframework/ai/vertexai/gemini/VertexAiGeminiChatClientIT.java), demonstrating the combination of user text with an image.
+Below is a simple code example extracted from https://github.com/spring-projects/spring-ai/blob/main/models/spring-ai-vertex-ai-gemini/src/test/java/org/springframework/ai/vertexai/gemini/VertexAiGeminiChatClientIT.java[VertexAiGeminiChatClientIT.java], demonstrating the combination of user text with an image.
 
 
 [source,java]
@@ -128,7 +126,7 @@ var userMessage = new UserMessage("Explain what do you see o this picture?",
 ChatResponse response = chatClient.call(new Prompt(List.of(userMessage)));
 ----
 
-=== Sample Controller (Auto-configuration)
+== Sample Controller
 
 https://start.spring.io/[Create] a new Spring Boot project and add the `spring-ai-vertex-ai-palm2-spring-boot-starter` to your pom (or gradle) dependencies.
 
diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/embeddings/bedrock-titan-embedding.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/embeddings/bedrock-titan-embedding.adoc
@@ -1,7 +1,7 @@
 = Titan Embeddings
 
 Provides Bedrock Titan Embedding client.
-link:https://aws.amazon.com/bedrock/titan/[Amazon Titan] foundation models (FMs) provide customers with a breadth of high-performing image, multimodal, and text model choices, via a fully managed API.
+link:https://aws.amazon.com/bedrock/titan/[Amazon Titan] foundation models (FMs) provide customers with a breadth of high-performing image, multimodal embeddings, and text model choices, via a fully managed API.
 Amazon Titan models are created by AWS and pretrained on large datasets, making them powerful, general-purpose models built to support a variety of use cases, while also supporting the responsible use of AI.
 Use them as is or privately customize them with your own data.
 
diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/transcriptions.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/transcriptions.adoc
@@ -0,0 +1,5 @@
+[[Transcription]]
+= Transcription API
+
+Spring AI provides support for OpenAI's Transcription API.
+When additional providers for Transcription are implemented, a common `AudioTranscriptionClient` interface will be extracted.
diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/transcriptions/openai-transcriptions.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/transcriptions/openai-transcriptions.adoc