diff --git a/async-openai/src/types/realtime/client_event.rs b/async-openai/src/types/realtime/client_event.rs
index 87ff7010..9fb3d2a4 100644
--- a/async-openai/src/types/realtime/client_event.rs
+++ b/async-openai/src/types/realtime/client_event.rs
@@ -1,7 +1,76 @@
 use serde::{Deserialize, Serialize};
 use tokio_tungstenite::tungstenite::Message;
 
-use super::{item::Item, session_resource::SessionResource};
+use super::{
+    item::Item,
+    session_resource::{
+        AudioFormat, MaxResponseOutputTokens, Modality, RealtimeVoice, SessionResource, ToolChoice,
+        ToolDefinition,
+    },
+};
+
+/// Configuration for a response in the OpenAI Realtime API.
+/// This is used in the `response.create` event.
+#[derive(Debug, Serialize, Deserialize, Clone, Default)]
+pub struct ResponseConfig {
+    /// Controls which conversation the response is added to. Currently supports "auto" and "none",
+    /// with "auto" as the default value. The "auto" value means that the contents of the response
+    /// will be added to the default conversation. Set this to "none" to create an out-of-band response
+    /// which will not add items to default conversation.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub conversation: Option<String>,
+
+    /// Input items to include in the prompt for the model. Using this field creates a new context
+    /// for this Response instead of using the default conversation. An empty array [] will clear
+    /// the context for this Response. Note that this can include references to items from the default conversation.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub input: Option<Vec<Item>>,
+
+    /// The default system instructions (i.e. system message) prepended to model calls.
+    /// This field allows the client to guide the model on desired responses.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub instructions: Option<String>,
+
+    /// Maximum number of output tokens for a single assistant response, inclusive of tool calls.
+    /// Provide an integer between 1 and 4096 to limit output tokens, or "inf" for the maximum available tokens for a given model.
+    /// Defaults to "inf".
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub max_response_output_tokens: Option<MaxResponseOutputTokens>,
+
+    /// Set of 16 key-value pairs that can be attached to an object.
+    /// This can be useful for storing additional information about the object in a structured format.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub metadata: Option<std::collections::HashMap<String, String>>,
+
+    /// The set of modalities the model can respond with. To disable audio, set this to ["text"].
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub modalities: Option<Vec<Modality>>,
+
+    /// The format of output audio. Options are "pcm16", "g711_ulaw", or "g711_alaw".
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub output_audio_format: Option<AudioFormat>,
+
+    /// Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub temperature: Option<f32>,
+
+    /// How the model chooses tools.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub tool_choice: Option<ToolChoice>,
+
+    /// Tools (functions) available to the model.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub tools: Option<Vec<ToolDefinition>>,
+
+    /// The voice the model uses to respond. Cannot be changed once the model has responded with audio at least once.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub voice: Option<RealtimeVoice>,
+
+    /// The speed of the model's spoken response. 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed.
+    /// This value can only be changed in between model turns, not while a response is in progress.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub speed: Option<f32>,
+}
 
 #[derive(Debug, Serialize, Deserialize, Clone, Default)]
 pub struct SessionUpdateEvent {
@@ -35,6 +104,13 @@ pub struct InputAudioBufferClearEvent {
     pub event_id: Option<String>,
 }
 
+#[derive(Debug, Serialize, Deserialize, Clone, Default)]
+pub struct OutputAudioBufferClearEvent {
+    /// Optional client-generated ID used to identify this event.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub event_id: Option<String>,
+}
+
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct ConversationItemCreateEvent {
     /// Optional client-generated ID used to identify this event.
@@ -75,6 +151,16 @@ pub struct ConversationItemDeleteEvent {
     pub item_id: String,
 }
 
+#[derive(Debug, Serialize, Deserialize, Clone, Default)]
+pub struct ConversationItemRetrieveEvent {
+    /// Optional client-generated ID used to identify this event.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub event_id: Option<String>,
+
+    /// The ID of the item to retrieve.
+    pub item_id: String,
+}
+
 #[derive(Debug, Serialize, Deserialize, Clone, Default)]
 pub struct ResponseCreateEvent {
     /// Optional client-generated ID used to identify this event.
@@ -82,7 +168,7 @@ pub struct ResponseCreateEvent {
     pub event_id: Option<String>,
 
     /// Configuration for the response.
-    pub response: Option<SessionResource>,
+    pub response: Option<ResponseConfig>,
 }
 
 #[derive(Debug, Serialize, Deserialize, Clone, Default)]
@@ -90,6 +176,9 @@ pub struct ResponseCancelEvent {
     /// Optional client-generated ID used to identify this event.
    #[serde(skip_serializing_if = "Option::is_none")]
     pub event_id: Option<String>,
+
+    /// A specific response ID to cancel - if not provided, will cancel an in-progress response in the default conversation.
+    pub response_id: String,
 }
 
 /// These are events that the OpenAI Realtime WebSocket server will accept from the client.
@@ -112,6 +201,10 @@ pub enum ClientEvent {
     #[serde(rename = "input_audio_buffer.clear")]
     InputAudioBufferClear(InputAudioBufferClearEvent),
 
+    /// WebRTC Only: Send this event to cut off the current audio response.
+    #[serde(rename = "output_audio_buffer.clear")]
+    OutputAudioBufferClear(OutputAudioBufferClearEvent),
+
     /// Send this event when adding an item to the conversation.
     #[serde(rename = "conversation.item.create")]
     ConversationItemCreate(ConversationItemCreateEvent),
@@ -124,6 +217,10 @@ pub enum ClientEvent {
     #[serde(rename = "conversation.item.delete")]
     ConversationItemDelete(ConversationItemDeleteEvent),
 
+    /// Send this event when you want to retrieve the server's representation of a specific item in the conversation history.
+    #[serde(rename = "conversation.item.retrieve")]
+    ConversationItemRetrieve(ConversationItemRetrieveEvent),
+
     /// Send this event to trigger a response generation.
     #[serde(rename = "response.create")]
     ResponseCreate(ResponseCreateEvent),
 
@@ -181,6 +278,11 @@ event_from!(
     ClientEvent,
     InputAudioBufferClear
 );
+event_from!(
+    OutputAudioBufferClearEvent,
+    ClientEvent,
+    OutputAudioBufferClear
+);
 event_from!(
     ConversationItemCreateEvent,
     ClientEvent,
@@ -198,14 +300,21 @@ event_from!(
 );
 event_from!(ResponseCreateEvent, ClientEvent, ResponseCreate);
 event_from!(ResponseCancelEvent, ClientEvent, ResponseCancel);
+event_from!(
+    ConversationItemRetrieveEvent,
+    ClientEvent,
+    ConversationItemRetrieve
+);
 
 message_from_event!(SessionUpdateEvent, ClientEvent);
 message_from_event!(InputAudioBufferAppendEvent, ClientEvent);
 message_from_event!(InputAudioBufferCommitEvent, ClientEvent);
 message_from_event!(InputAudioBufferClearEvent, ClientEvent);
+message_from_event!(OutputAudioBufferClearEvent, ClientEvent);
 message_from_event!(ConversationItemCreateEvent, ClientEvent);
 message_from_event!(ConversationItemTruncateEvent, ClientEvent);
 message_from_event!(ConversationItemDeleteEvent, ClientEvent);
+message_from_event!(ConversationItemRetrieveEvent, ClientEvent);
 message_from_event!(ResponseCreateEvent, ClientEvent);
 message_from_event!(ResponseCancelEvent, ClientEvent);
diff --git a/async-openai/src/types/realtime/response_resource.rs b/async-openai/src/types/realtime/response_resource.rs
index a6c6c32f..922c1fad 100644
--- a/async-openai/src/types/realtime/response_resource.rs
+++ b/async-openai/src/types/realtime/response_resource.rs
@@ -31,6 +31,17 @@ pub enum IncompleteReason {
     Interruption,
     MaxOutputTokens,
     ContentFilter,
+    TokenLimit,
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+#[serde(rename_all = "snake_case")]
+pub enum FinishReason {
+    Stop,
+    Length,
+    ToolCalls,
+    ContentFilter,
+    FunctionCall,
 }
 
 #[derive(Debug, Serialize, Deserialize, Clone)]
@@ -58,4 +69,10 @@ pub struct ResponseResource {
     pub output: Vec<Item>,
     /// Usage statistics for the response.
     pub usage: Option<Usage>,
+    /// The Unix timestamp (in seconds) for when the response was created.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub created_at: Option<u64>,
+    /// The reason the model stopped generating tokens, if applicable.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub finish_reason: Option<FinishReason>,
 }
diff --git a/async-openai/src/types/realtime/server_event.rs b/async-openai/src/types/realtime/server_event.rs
index 8795f6e4..fadeaa2d 100644
--- a/async-openai/src/types/realtime/server_event.rs
+++ b/async-openai/src/types/realtime/server_event.rs
@@ -38,7 +38,7 @@ pub struct ConversationCreatedEvent {
 }
 
 #[derive(Debug, Serialize, Deserialize, Clone)]
-pub struct InputAudioBufferCommitedEvent {
+pub struct InputAudioBufferCommittedEvent {
     /// The unique ID of the server event.
     pub event_id: String,
     /// The ID of the preceding item after which the new item will be inserted.
@@ -53,6 +53,12 @@ pub struct InputAudioBufferClearedEvent {
     pub event_id: String,
 }
 
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct OutputAudioBufferClearedEvent {
+    /// The unique ID of the server event.
+    pub event_id: String,
+}
+
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct InputAudioBufferSpeechStartedEvent {
     /// The unique ID of the server event.
@@ -154,6 +160,14 @@ pub struct ConversationItemDeletedEvent {
     pub item_id: String,
 }
 
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ConversationItemRetrievedEvent {
+    /// The unique ID of the server event.
+    pub event_id: String,
+    /// The item that was retrieved.
+    pub item: Item,
+}
+
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct ResponseCreatedEvent {
     /// The unique ID of the server event.
@@ -381,12 +395,16 @@ pub enum ServerEvent {
 
     /// Returned when an input audio buffer is committed, either by the client or automatically in server VAD mode.
     #[serde(rename = "input_audio_buffer.committed")]
-    InputAudioBufferCommited(InputAudioBufferCommitedEvent),
+    InputAudioBufferCommitted(InputAudioBufferCommittedEvent),
 
     /// Returned when the input audio buffer is cleared by the client.
     #[serde(rename = "input_audio_buffer.cleared")]
     InputAudioBufferCleared(InputAudioBufferClearedEvent),
 
+    /// Returned when the output audio buffer is cleared by the client (WebRTC specific).
+    #[serde(rename = "output_audio_buffer.cleared")]
+    OutputAudioBufferCleared(OutputAudioBufferClearedEvent),
+
     /// Returned in server turn detection mode when speech is detected.
     #[serde(rename = "input_audio_buffer.speech_started")]
     InputAudioBufferSpeechStarted(InputAudioBufferSpeechStartedEvent),
@@ -422,6 +440,10 @@ pub enum ServerEvent {
     #[serde(rename = "conversation.item.deleted")]
     ConversationItemDeleted(ConversationItemDeletedEvent),
 
+    /// Returned when an item in the conversation is retrieved.
+    #[serde(rename = "conversation.item.retrieved")]
+    ConversationItemRetrieved(ConversationItemRetrievedEvent),
+
     /// Returned when a new Response is created. The first event of response creation, where the response is in an initial state of "in_progress".
     #[serde(rename = "response.created")]
     ResponseCreated(ResponseCreatedEvent),
diff --git a/async-openai/src/types/realtime/session_resource.rs b/async-openai/src/types/realtime/session_resource.rs
index 2fe1e5b1..f84ecff9 100644
--- a/async-openai/src/types/realtime/session_resource.rs
+++ b/async-openai/src/types/realtime/session_resource.rs
@@ -10,6 +10,22 @@ pub enum AudioFormat {
     G711ALAW,
 }
 
+#[derive(Debug, Serialize, Deserialize, Clone)]
+#[serde(rename_all = "snake_case")]
+pub enum NoiseReductionType {
+    NearField,
+    FarField,
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct InputAudioNoiseReduction {
+    /// Type of noise reduction. `near_field` is for close-talking microphones such as
+    /// headphones, `far_field` is for far-field microphones such as laptop or
+    /// conference room microphones.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub r#type: Option<NoiseReductionType>,
+}
+
 #[derive(Debug, Default, Serialize, Deserialize, Clone)]
 pub struct AudioTranscription {
     /// The language of the input audio. Supplying the input language in ISO-639-1 (e.g. en) format will improve accuracy and latency.
@@ -74,6 +90,32 @@ pub enum MaxResponseOutputTokens {
     Num(u16),
 }
 
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct TracingConfiguration {
+    /// The group id to attach to this trace to enable filtering and grouping in the traces dashboard.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub group_id: Option<String>,
+
+    /// The arbitrary metadata to attach to this trace to enable filtering in the traces dashboard.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub metadata: Option<serde_json::Value>,
+
+    /// The name of the workflow to attach to this trace. This is used to name the trace in the traces dashboard.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub workflow_name: Option<String>,
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+#[serde(untagged)]
+pub enum TracingOption {
+    /// Auto tracing with default values
+    #[serde(rename = "auto")]
+    Auto,
+    /// Granular tracing configuration
+    #[serde(rename = "config")]
+    Config(TracingConfiguration),
+}
+
 #[derive(Debug, Serialize, Deserialize, Clone)]
 #[serde(tag = "type")]
 pub enum ToolDefinition {
@@ -118,19 +160,43 @@ pub enum RealtimeVoice {
     Fable,
     Onyx,
     Nova,
+    Sage,
     Shimmer,
     Verse,
 }
 
+#[derive(Debug, Serialize, Deserialize, Clone)]
+#[serde(rename_all = "lowercase")]
+pub enum Modality {
+    Text,
+    Audio,
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub enum RealtimeModel {
+    #[serde(rename = "gpt-4o-realtime-preview")]
+    GPT4ORealtimePreview,
+    #[serde(rename = "gpt-4o-realtime-preview-2024-10-01")]
+    GPT4ORealtimePreview20241001,
+    #[serde(rename = "gpt-4o-realtime-preview-2024-12-17")]
+    GPT4ORealtimePreview20241217,
+    #[serde(rename = "gpt-4o-realtime-preview-2025-06-03")]
+    GPT4ORealtimePreview20250603,
+    #[serde(rename = "gpt-4o-mini-realtime-preview")]
+    GPT4OMiniRealtimePreview,
+    #[serde(rename = "gpt-4o-mini-realtime-preview-2024-12-17")]
+    GPT4OMiniRealtimePreview20241217,
+}
+
 #[derive(Debug, Serialize, Deserialize, Clone, Default)]
 pub struct SessionResource {
     /// The default model used for this session.
     #[serde(skip_serializing_if = "Option::is_none")]
-    pub model: Option<String>,
+    pub model: Option<RealtimeModel>,
 
     /// The set of modalities the model can respond with. To disable audio, set this to ["text"].
     #[serde(skip_serializing_if = "Option::is_none")]
-    pub modalities: Option<Vec<String>>,
+    pub modalities: Option<Vec<Modality>>,
 
     //// The default system instructions prepended to model calls.
     #[serde(skip_serializing_if = "Option::is_none")]
@@ -140,10 +206,22 @@ pub struct SessionResource {
     #[serde(skip_serializing_if = "Option::is_none")]
     pub voice: Option<RealtimeVoice>,
 
+    /// The speed of the model's spoken response. 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed.
+    /// This value can only be changed in between model turns, not while a response is in progress.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub speed: Option<f32>,
+
     /// The format of input audio. Options are "pcm16", "g711_ulaw", or "g711_alaw".
     #[serde(skip_serializing_if = "Option::is_none")]
     pub input_audio_format: Option<AudioFormat>,
 
+    /// Configuration for input audio noise reduction. This can be set to `null` to turn off.
+    /// Noise reduction filters audio added to the input audio buffer before it is sent to VAD and the model.
+    /// Filtering the audio can improve VAD and turn detection accuracy (reducing false positives)
+    /// and model performance by improving perception of the input audio.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub input_audio_noise_reduction: Option<InputAudioNoiseReduction>,
+
     /// The format of output audio. Options are "pcm16", "g711_ulaw", or "g711_alaw".
     #[serde(skip_serializing_if = "Option::is_none")]
     pub output_audio_format: Option<AudioFormat>,
@@ -168,6 +246,9 @@ pub struct SessionResource {
     /// Sampling temperature for the model.
     pub temperature: Option<f32>,
 
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub tracing: Option<TracingOption>,
+
     /// Maximum number of output tokens for a single assistant response, inclusive of tool calls.
     /// Provide an integer between 1 and 4096 to limit output tokens, or "inf" for the maximum available tokens for a given model.
     /// Defaults to "inf".
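
For reviewers, a minimal usage sketch of the new `ResponseConfig` on a `response.create` event. It assumes the realtime types are re-exported under `async_openai::types::realtime` (matching how the existing client events are exposed) and that `serde_json` is available; the field values are illustrative only.

```rust
// Sketch only: exercises the new ResponseConfig added in this diff.
use async_openai::types::realtime::{ClientEvent, Modality, ResponseConfig, ResponseCreateEvent};

fn main() {
    // Request an out-of-band response with its own instructions and text-only output.
    let event: ClientEvent = ResponseCreateEvent {
        event_id: None,
        response: Some(ResponseConfig {
            conversation: Some("none".to_string()),
            modalities: Some(vec![Modality::Text]),
            instructions: Some("Answer in one short sentence.".to_string()),
            temperature: Some(0.8),
            ..Default::default()
        }),
    }
    .into(); // From<ResponseCreateEvent> for ClientEvent, provided by event_from!

    // Serialize to the JSON payload sent over the realtime WebSocket.
    println!("{}", serde_json::to_string_pretty(&event).unwrap());
}
```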
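
A second sketch covers the new session fields (`speed`, `input_audio_noise_reduction`, `tracing`, and the typed `model`). The `session` field name on `SessionUpdateEvent` follows the existing struct and is not shown in this diff, so treat it as an assumption.

```rust
// Sketch only: configures the new SessionResource fields on a session.update event.
use async_openai::types::realtime::{
    ClientEvent, InputAudioNoiseReduction, NoiseReductionType, RealtimeModel, SessionResource,
    SessionUpdateEvent, TracingOption,
};

fn main() {
    let session = SessionResource {
        model: Some(RealtimeModel::GPT4OMiniRealtimePreview),
        speed: Some(1.25),
        input_audio_noise_reduction: Some(InputAudioNoiseReduction {
            r#type: Some(NoiseReductionType::NearField),
        }),
        tracing: Some(TracingOption::Auto),
        ..Default::default()
    };

    let event: ClientEvent = SessionUpdateEvent {
        event_id: None,
        session, // assumed field name on the existing SessionUpdateEvent struct
    }
    .into();

    println!("{}", serde_json::to_string(&event).unwrap());
}
```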