From 6395a6cf288858e99da9dd6b8308f98897afa2bb Mon Sep 17 00:00:00 2001 From: Chris Raethke Date: Mon, 23 Jun 2025 16:28:34 +1000 Subject: [PATCH 1/4] feat: enhance realtime response types and audio transcription options - Added `Cancelled` variant to `ResponseStatusDetail` enum for better handling of cancelled responses. - Introduced `LogProb` struct to capture log probability information for transcribed tokens. - Updated `ConversationItemInputAudioTranscriptionCompletedEvent` and `ConversationItemInputAudioTranscriptionDeltaEvent` to include optional `logprobs` for per-token log probability data. - Enhanced `AudioTranscription` struct with optional fields for `language`, `model`, and `prompt` to improve transcription accuracy and customization. - Added new `SemanticVAD` option in the `TurnDetection` enum to control model response eagerness. - Expanded `RealtimeVoice` enum with additional voice options for more variety in audio responses. --- .../src/types/realtime/response_resource.rs | 2 ++ .../src/types/realtime/server_event.rs | 30 ++++++++++++++++ .../src/types/realtime/session_resource.rs | 34 +++++++++++++++---- 3 files changed, 60 insertions(+), 6 deletions(-) diff --git a/async-openai/src/types/realtime/response_resource.rs b/async-openai/src/types/realtime/response_resource.rs index 4a500890..a6c6c32f 100644 --- a/async-openai/src/types/realtime/response_resource.rs +++ b/async-openai/src/types/realtime/response_resource.rs @@ -40,6 +40,8 @@ pub enum ResponseStatusDetail { Incomplete { reason: IncompleteReason }, #[serde(rename = "failed")] Failed { error: Option }, + #[serde(rename = "cancelled")] + Cancelled { reason: String }, } #[derive(Debug, Serialize, Deserialize, Clone)] diff --git a/async-openai/src/types/realtime/server_event.rs b/async-openai/src/types/realtime/server_event.rs index 3ba5f552..8795f6e4 100644 --- a/async-openai/src/types/realtime/server_event.rs +++ b/async-openai/src/types/realtime/server_event.rs @@ -83,6 +83,17 @@ pub struct ConversationItemCreatedEvent { pub item: Item, } +#[derive(Debug, Serialize, Deserialize, Clone)] +/// Log probability information for a transcribed token. +pub struct LogProb { + /// Raw UTF-8 bytes for the token. + pub bytes: Vec, + /// The log probability of the token. + pub logprob: f64, + /// The token string. + pub token: String, +} + #[derive(Debug, Serialize, Deserialize, Clone)] pub struct ConversationItemInputAudioTranscriptionCompletedEvent { /// The unique ID of the server event. @@ -93,6 +104,22 @@ pub struct ConversationItemInputAudioTranscriptionCompletedEvent { pub content_index: u32, /// The transcribed text. pub transcript: String, + /// Optional per-token log probability data. + pub logprobs: Option>, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct ConversationItemInputAudioTranscriptionDeltaEvent { + /// The unique ID of the server event. + pub event_id: String, + /// The ID of the user message item. + pub item_id: String, + /// The index of the content part containing the audio. + pub content_index: u32, + /// The text delta. + pub delta: String, + /// Optional per-token log probability data. + pub logprobs: Option>, } #[derive(Debug, Serialize, Deserialize, Clone)] @@ -378,6 +405,9 @@ pub enum ServerEvent { ConversationItemInputAudioTranscriptionCompletedEvent, ), + #[serde(rename = "conversation.item.input_audio_transcription.delta")] + ConversationItemInputAudioTranscriptionDelta(ConversationItemInputAudioTranscriptionDeltaEvent), + /// Returned when input audio transcription is configured, and a transcription request for a user message failed. #[serde(rename = "conversation.item.input_audio_transcription.failed")] ConversationItemInputAudioTranscriptionFailed( diff --git a/async-openai/src/types/realtime/session_resource.rs b/async-openai/src/types/realtime/session_resource.rs index 10472414..89be7133 100644 --- a/async-openai/src/types/realtime/session_resource.rs +++ b/async-openai/src/types/realtime/session_resource.rs @@ -10,12 +10,19 @@ pub enum AudioFormat { G711ALAW, } -#[derive(Debug, Serialize, Deserialize, Clone)] +#[derive(Debug, Default, Serialize, Deserialize, Clone)] pub struct AudioTranscription { - /// Whether to enable input audio transcription. - pub enabled: bool, - /// The model to use for transcription (e.g., "whisper-1"). - pub model: String, + /// The language of the input audio. Supplying the input language in ISO-639-1 (e.g. en) format will improve accuracy and latency. + #[serde(skip_serializing_if = "Option::is_none")] + pub language: Option, + /// The model to use for transcription, current options are gpt-4o-transcribe, gpt-4o-mini-transcribe, and whisper-1. + #[serde(skip_serializing_if = "Option::is_none")] + pub model: Option, + /// An optional text to guide the model's style or continue a previous audio segment. + /// For whisper-1, the prompt is a list of keywords. For gpt-4o-transcribe models, + /// the prompt is a free text string, for example "expect words related to technology". + #[serde(skip_serializing_if = "Option::is_none")] + pub prompt: Option, } #[derive(Debug, Serialize, Deserialize, Clone)] @@ -31,6 +38,14 @@ pub enum TurnDetection { /// Duration of silence to detect speech stop (in milliseconds). silence_duration_ms: u32, }, + + #[serde(rename = "semantic_vad")] + SemanticVAD { + /// The eagerness of the model to respond. + /// `low` will wait longer for the user to continue speaking, + /// `high`` will respond more quickly. `auto`` is the default and is equivalent to `medium` + eagerness: String, + }, } #[derive(Debug, Serialize, Deserialize, Clone)] @@ -78,8 +93,15 @@ pub enum ToolChoice { #[serde(rename_all = "lowercase")] pub enum RealtimeVoice { Alloy, - Shimmer, + Ash, + Ballad, + Coral, Echo, + Fable, + Onyx, + Nova, + Shimmer, + Verse, } #[derive(Debug, Serialize, Deserialize, Clone, Default)] From daeb8c7c1686e7b3cfe1395b6b0b1dcabe19a4e6 Mon Sep 17 00:00:00 2001 From: Chris Raethke Date: Mon, 23 Jun 2025 17:40:57 +1000 Subject: [PATCH 2/4] feat: update audio format enum values for consistency - Changed enum variants for `AudioFormat` to use underscores instead of hyphens in their serialized names. - Updated `G711ULAW` from `g711-ulaw` to `g711_law` and `G711ALAW` from `g711-alaw` to `g711_alaw` for improved clarity and adherence to naming conventions. --- async-openai/src/types/realtime/session_resource.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/async-openai/src/types/realtime/session_resource.rs b/async-openai/src/types/realtime/session_resource.rs index 89be7133..e2e4067f 100644 --- a/async-openai/src/types/realtime/session_resource.rs +++ b/async-openai/src/types/realtime/session_resource.rs @@ -4,9 +4,9 @@ use serde::{Deserialize, Serialize}; pub enum AudioFormat { #[serde(rename = "pcm16")] PCM16, - #[serde(rename = "g711-ulaw")] + #[serde(rename = "g711_law")] G711ULAW, - #[serde(rename = "g711-alaw")] + #[serde(rename = "g711_alaw")] G711ALAW, } From 2bb05e3904aafd6174188320a1a25306aa0bdf52 Mon Sep 17 00:00:00 2001 From: Chris Raethke Date: Thu, 26 Jun 2025 11:57:08 +1000 Subject: [PATCH 3/4] feat: add auto-response options to VAD configurations --- .../src/types/realtime/session_resource.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/async-openai/src/types/realtime/session_resource.rs b/async-openai/src/types/realtime/session_resource.rs index e2e4067f..2fe1e5b1 100644 --- a/async-openai/src/types/realtime/session_resource.rs +++ b/async-openai/src/types/realtime/session_resource.rs @@ -37,6 +37,15 @@ pub enum TurnDetection { prefix_padding_ms: u32, /// Duration of silence to detect speech stop (in milliseconds). silence_duration_ms: u32, + + /// Whether or not to automatically generate a response when a VAD stop event occurs. + #[serde(skip_serializing_if = "Option::is_none")] + create_response: Option, + + /// Whether or not to automatically interrupt any ongoing response with output to + /// the default conversation (i.e. conversation of auto) when a VAD start event occurs. + #[serde(skip_serializing_if = "Option::is_none")] + interrupt_response: Option, }, #[serde(rename = "semantic_vad")] @@ -45,6 +54,15 @@ pub enum TurnDetection { /// `low` will wait longer for the user to continue speaking, /// `high`` will respond more quickly. `auto`` is the default and is equivalent to `medium` eagerness: String, + + /// Whether or not to automatically generate a response when a VAD stop event occurs. + #[serde(skip_serializing_if = "Option::is_none", default)] + create_response: Option, + + /// Whether or not to automatically interrupt any ongoing response with output to + /// the default conversation (i.e. conversation of auto) when a VAD start event occurs. + #[serde(skip_serializing_if = "Option::is_none", default)] + interrupt_response: Option, }, } From 479bf1e3274fd49d7e7e0ee0634ee778c4835a45 Mon Sep 17 00:00:00 2001 From: Chris Raethke Date: Mon, 30 Jun 2025 14:58:31 +1000 Subject: [PATCH 4/4] feat: add realtime API types and event handling for audio, tracing, and response management --- .../src/types/realtime/client_event.rs | 113 +++++++++++++++++- .../src/types/realtime/response_resource.rs | 17 +++ .../src/types/realtime/server_event.rs | 26 +++- .../src/types/realtime/session_resource.rs | 87 +++++++++++++- 4 files changed, 236 insertions(+), 7 deletions(-) diff --git a/async-openai/src/types/realtime/client_event.rs b/async-openai/src/types/realtime/client_event.rs index 87ff7010..9fb3d2a4 100644 --- a/async-openai/src/types/realtime/client_event.rs +++ b/async-openai/src/types/realtime/client_event.rs @@ -1,7 +1,76 @@ use serde::{Deserialize, Serialize}; use tokio_tungstenite::tungstenite::Message; -use super::{item::Item, session_resource::SessionResource}; +use super::{ + item::Item, + session_resource::{ + AudioFormat, MaxResponseOutputTokens, Modality, RealtimeVoice, SessionResource, ToolChoice, + ToolDefinition, + }, +}; + +/// Configuration for a response in the OpenAI Realtime API. +/// This is used in the `response.create` event. +#[derive(Debug, Serialize, Deserialize, Clone, Default)] +pub struct ResponseConfig { + /// Controls which conversation the response is added to. Currently supports "auto" and "none", + /// with "auto" as the default value. The "auto" value means that the contents of the response + /// will be added to the default conversation. Set this to "none" to create an out-of-band response + /// which will not add items to default conversation. + #[serde(skip_serializing_if = "Option::is_none")] + pub conversation: Option, + + /// Input items to include in the prompt for the model. Using this field creates a new context + /// for this Response instead of using the default conversation. An empty array [] will clear + /// the context for this Response. Note that this can include references to items from the default conversation. + #[serde(skip_serializing_if = "Option::is_none")] + pub input: Option>, + + /// The default system instructions (i.e. system message) prepended to model calls. + /// This field allows the client to guide the model on desired responses. + #[serde(skip_serializing_if = "Option::is_none")] + pub instructions: Option, + + /// Maximum number of output tokens for a single assistant response, inclusive of tool calls. + /// Provide an integer between 1 and 4096 to limit output tokens, or "inf" for the maximum available tokens for a given model. + /// Defaults to "inf". + #[serde(skip_serializing_if = "Option::is_none")] + pub max_response_output_tokens: Option, + + /// Set of 16 key-value pairs that can be attached to an object. + /// This can be useful for storing additional information about the object in a structured format. + #[serde(skip_serializing_if = "Option::is_none")] + pub metadata: Option>, + + /// The set of modalities the model can respond with. To disable audio, set this to ["text"]. + #[serde(skip_serializing_if = "Option::is_none")] + pub modalities: Option>, + + /// The format of output audio. Options are "pcm16", "g711_ulaw", or "g711_alaw". + #[serde(skip_serializing_if = "Option::is_none")] + pub output_audio_format: Option, + + /// Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. + #[serde(skip_serializing_if = "Option::is_none")] + pub temperature: Option, + + /// How the model chooses tools. + #[serde(skip_serializing_if = "Option::is_none")] + pub tool_choice: Option, + + /// Tools (functions) available to the model. + #[serde(skip_serializing_if = "Option::is_none")] + pub tools: Option>, + + /// The voice the model uses to respond. Cannot be changed once the model has responded with audio at least once. + #[serde(skip_serializing_if = "Option::is_none")] + pub voice: Option, + + /// The speed of the model's spoken response. 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed. + /// This value can only be changed in between model turns, not while a response is in progress. + #[serde(skip_serializing_if = "Option::is_none")] + pub speed: Option, +} #[derive(Debug, Serialize, Deserialize, Clone, Default)] pub struct SessionUpdateEvent { @@ -35,6 +104,13 @@ pub struct InputAudioBufferClearEvent { pub event_id: Option, } +#[derive(Debug, Serialize, Deserialize, Clone, Default)] +pub struct OutputAudioBufferClearEvent { + /// Optional client-generated ID used to identify this event. + #[serde(skip_serializing_if = "Option::is_none")] + pub event_id: Option, +} + #[derive(Debug, Serialize, Deserialize, Clone)] pub struct ConversationItemCreateEvent { /// Optional client-generated ID used to identify this event. @@ -75,6 +151,16 @@ pub struct ConversationItemDeleteEvent { pub item_id: String, } +#[derive(Debug, Serialize, Deserialize, Clone, Default)] +pub struct ConversationItemRetrieveEvent { + /// Optional client-generated ID used to identify this event. + #[serde(skip_serializing_if = "Option::is_none")] + pub event_id: Option, + + /// The ID of the item to retrieve. + pub item_id: String, +} + #[derive(Debug, Serialize, Deserialize, Clone, Default)] pub struct ResponseCreateEvent { /// Optional client-generated ID used to identify this event. @@ -82,7 +168,7 @@ pub struct ResponseCreateEvent { pub event_id: Option, /// Configuration for the response. - pub response: Option, + pub response: Option, } #[derive(Debug, Serialize, Deserialize, Clone, Default)] @@ -90,6 +176,9 @@ pub struct ResponseCancelEvent { /// Optional client-generated ID used to identify this event. #[serde(skip_serializing_if = "Option::is_none")] pub event_id: Option, + + /// A specific response ID to cancel - if not provided, will cancel an in-progress response in the default conversation. + pub response_id: String, } /// These are events that the OpenAI Realtime WebSocket server will accept from the client. @@ -112,6 +201,10 @@ pub enum ClientEvent { #[serde(rename = "input_audio_buffer.clear")] InputAudioBufferClear(InputAudioBufferClearEvent), + /// WebRTC Only: Send this event to cut off the current audio response. + #[serde(rename = "output_audio_buffer.clear")] + OutputAudioBufferClear(OutputAudioBufferClearEvent), + /// Send this event when adding an item to the conversation. #[serde(rename = "conversation.item.create")] ConversationItemCreate(ConversationItemCreateEvent), @@ -124,6 +217,10 @@ pub enum ClientEvent { #[serde(rename = "conversation.item.delete")] ConversationItemDelete(ConversationItemDeleteEvent), + /// Send this event when you want to retrieve the server's representation of a specific item in the conversation history. + #[serde(rename = "conversation.item.retrieve")] + ConversationItemRetrieve(ConversationItemRetrieveEvent), + /// Send this event to trigger a response generation. #[serde(rename = "response.create")] ResponseCreate(ResponseCreateEvent), @@ -181,6 +278,11 @@ event_from!( ClientEvent, InputAudioBufferClear ); +event_from!( + OutputAudioBufferClearEvent, + ClientEvent, + OutputAudioBufferClear +); event_from!( ConversationItemCreateEvent, ClientEvent, @@ -198,14 +300,21 @@ event_from!( ); event_from!(ResponseCreateEvent, ClientEvent, ResponseCreate); event_from!(ResponseCancelEvent, ClientEvent, ResponseCancel); +event_from!( + ConversationItemRetrieveEvent, + ClientEvent, + ConversationItemRetrieve +); message_from_event!(SessionUpdateEvent, ClientEvent); message_from_event!(InputAudioBufferAppendEvent, ClientEvent); message_from_event!(InputAudioBufferCommitEvent, ClientEvent); message_from_event!(InputAudioBufferClearEvent, ClientEvent); +message_from_event!(OutputAudioBufferClearEvent, ClientEvent); message_from_event!(ConversationItemCreateEvent, ClientEvent); message_from_event!(ConversationItemTruncateEvent, ClientEvent); message_from_event!(ConversationItemDeleteEvent, ClientEvent); +message_from_event!(ConversationItemRetrieveEvent, ClientEvent); message_from_event!(ResponseCreateEvent, ClientEvent); message_from_event!(ResponseCancelEvent, ClientEvent); diff --git a/async-openai/src/types/realtime/response_resource.rs b/async-openai/src/types/realtime/response_resource.rs index a6c6c32f..922c1fad 100644 --- a/async-openai/src/types/realtime/response_resource.rs +++ b/async-openai/src/types/realtime/response_resource.rs @@ -31,6 +31,17 @@ pub enum IncompleteReason { Interruption, MaxOutputTokens, ContentFilter, + TokenLimit, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "snake_case")] +pub enum FinishReason { + Stop, + Length, + ToolCalls, + ContentFilter, + FunctionCall, } #[derive(Debug, Serialize, Deserialize, Clone)] @@ -58,4 +69,10 @@ pub struct ResponseResource { pub output: Vec, /// Usage statistics for the response. pub usage: Option, + /// The Unix timestamp (in seconds) for when the response was created. + #[serde(skip_serializing_if = "Option::is_none")] + pub created_at: Option, + /// The reason the model stopped generating tokens, if applicable. + #[serde(skip_serializing_if = "Option::is_none")] + pub finish_reason: Option, } diff --git a/async-openai/src/types/realtime/server_event.rs b/async-openai/src/types/realtime/server_event.rs index 8795f6e4..fadeaa2d 100644 --- a/async-openai/src/types/realtime/server_event.rs +++ b/async-openai/src/types/realtime/server_event.rs @@ -38,7 +38,7 @@ pub struct ConversationCreatedEvent { } #[derive(Debug, Serialize, Deserialize, Clone)] -pub struct InputAudioBufferCommitedEvent { +pub struct InputAudioBufferCommittedEvent { /// The unique ID of the server event. pub event_id: String, /// The ID of the preceding item after which the new item will be inserted. @@ -53,6 +53,12 @@ pub struct InputAudioBufferClearedEvent { pub event_id: String, } +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct OutputAudioBufferClearedEvent { + /// The unique ID of the server event. + pub event_id: String, +} + #[derive(Debug, Serialize, Deserialize, Clone)] pub struct InputAudioBufferSpeechStartedEvent { /// The unique ID of the server event. @@ -154,6 +160,14 @@ pub struct ConversationItemDeletedEvent { pub item_id: String, } +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct ConversationItemRetrievedEvent { + /// The unique ID of the server event. + pub event_id: String, + /// The item that was retrieved. + pub item: Item, +} + #[derive(Debug, Serialize, Deserialize, Clone)] pub struct ResponseCreatedEvent { /// The unique ID of the server event. @@ -381,12 +395,16 @@ pub enum ServerEvent { /// Returned when an input audio buffer is committed, either by the client or automatically in server VAD mode. #[serde(rename = "input_audio_buffer.committed")] - InputAudioBufferCommited(InputAudioBufferCommitedEvent), + InputAudioBufferCommitted(InputAudioBufferCommittedEvent), /// Returned when the input audio buffer is cleared by the client. #[serde(rename = "input_audio_buffer.cleared")] InputAudioBufferCleared(InputAudioBufferClearedEvent), + /// Returned when the output audio buffer is cleared by the client (WebRTC specific). + #[serde(rename = "output_audio_buffer.cleared")] + OutputAudioBufferCleared(OutputAudioBufferClearedEvent), + /// Returned in server turn detection mode when speech is detected. #[serde(rename = "input_audio_buffer.speech_started")] InputAudioBufferSpeechStarted(InputAudioBufferSpeechStartedEvent), @@ -422,6 +440,10 @@ pub enum ServerEvent { #[serde(rename = "conversation.item.deleted")] ConversationItemDeleted(ConversationItemDeletedEvent), + /// Returned when an item in the conversation is retrieved. + #[serde(rename = "conversation.item.retrieved")] + ConversationItemRetrieved(ConversationItemRetrievedEvent), + /// Returned when a new Response is created. The first event of response creation, where the response is in an initial state of "in_progress". #[serde(rename = "response.created")] ResponseCreated(ResponseCreatedEvent), diff --git a/async-openai/src/types/realtime/session_resource.rs b/async-openai/src/types/realtime/session_resource.rs index 2fe1e5b1..fec64848 100644 --- a/async-openai/src/types/realtime/session_resource.rs +++ b/async-openai/src/types/realtime/session_resource.rs @@ -4,12 +4,28 @@ use serde::{Deserialize, Serialize}; pub enum AudioFormat { #[serde(rename = "pcm16")] PCM16, - #[serde(rename = "g711_law")] + #[serde(rename = "g711_ulaw")] G711ULAW, #[serde(rename = "g711_alaw")] G711ALAW, } +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "snake_case")] +pub enum NoiseReductionType { + NearField, + FarField, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct InputAudioNoiseReduction { + /// Type of noise reduction. `near_field` is for close-talking microphones such as + /// headphones, `far_field` is for far-field microphones such as laptop or + /// conference room microphones. + #[serde(skip_serializing_if = "Option::is_none")] + pub r#type: Option, +} + #[derive(Debug, Default, Serialize, Deserialize, Clone)] pub struct AudioTranscription { /// The language of the input audio. Supplying the input language in ISO-639-1 (e.g. en) format will improve accuracy and latency. @@ -74,6 +90,32 @@ pub enum MaxResponseOutputTokens { Num(u16), } +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct TracingConfiguration { + /// The group id to attach to this trace to enable filtering and grouping in the traces dashboard. + #[serde(skip_serializing_if = "Option::is_none")] + pub group_id: Option, + + /// The arbitrary metadata to attach to this trace to enable filtering in the traces dashboard. + #[serde(skip_serializing_if = "Option::is_none")] + pub metadata: Option, + + /// The name of the workflow to attach to this trace. This is used to name the trace in the traces dashboard. + #[serde(skip_serializing_if = "Option::is_none")] + pub workflow_name: Option, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(untagged)] +pub enum TracingOption { + /// Auto tracing with default values + #[serde(rename = "auto")] + Auto, + /// Granular tracing configuration + #[serde(rename = "config")] + Config(TracingConfiguration), +} + #[derive(Debug, Serialize, Deserialize, Clone)] #[serde(tag = "type")] pub enum ToolDefinition { @@ -118,19 +160,43 @@ pub enum RealtimeVoice { Fable, Onyx, Nova, + Sage, Shimmer, Verse, } +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "lowercase")] +pub enum Modality { + Text, + Audio, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub enum RealtimeModel { + #[serde(rename = "gpt-4o-realtime-preview")] + GPT4ORealtimePreview, + #[serde(rename = "gpt-4o-realtime-preview-2024-10-01")] + GPT4ORealtimePreview20241001, + #[serde(rename = "gpt-4o-realtime-preview-2024-12-17")] + GPT4ORealtimePreview20241217, + #[serde(rename = "gpt-4o-realtime-preview-2025-06-03")] + GPT4ORealtimePreview20250603, + #[serde(rename = "gpt-4o-mini-realtime-preview")] + GPT4OMiniRealtimePreview, + #[serde(rename = "gpt-4o-mini-realtime-preview-2024-12-17")] + GPT4OMiniRealtimePreview20241217, +} + #[derive(Debug, Serialize, Deserialize, Clone, Default)] pub struct SessionResource { /// The default model used for this session. #[serde(skip_serializing_if = "Option::is_none")] - pub model: Option, + pub model: Option, /// The set of modalities the model can respond with. To disable audio, set this to ["text"]. #[serde(skip_serializing_if = "Option::is_none")] - pub modalities: Option>, + pub modalities: Option>, //// The default system instructions prepended to model calls. #[serde(skip_serializing_if = "Option::is_none")] @@ -140,10 +206,22 @@ pub struct SessionResource { #[serde(skip_serializing_if = "Option::is_none")] pub voice: Option, + /// The speed of the model's spoken response. 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed. + /// This value can only be changed in between model turns, not while a response is in progress. + #[serde(skip_serializing_if = "Option::is_none")] + pub speed: Option, + /// The format of input audio. Options are "pcm16", "g711_ulaw", or "g711_alaw". #[serde(skip_serializing_if = "Option::is_none")] pub input_audio_format: Option, + /// Configuration for input audio noise reduction. This can be set to `null` to turn off. + /// Noise reduction filters audio added to the input audio buffer before it is sent to VAD and the model. + /// Filtering the audio can improve VAD and turn detection accuracy (reducing false positives) + /// and model performance by improving perception of the input audio. + #[serde(skip_serializing_if = "Option::is_none")] + pub input_audio_noise_reduction: Option, + /// The format of output audio. Options are "pcm16", "g711_ulaw", or "g711_alaw". #[serde(skip_serializing_if = "Option::is_none")] pub output_audio_format: Option, @@ -168,6 +246,9 @@ pub struct SessionResource { /// Sampling temperature for the model. pub temperature: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub tracing: Option, + /// Maximum number of output tokens for a single assistant response, inclusive of tool calls. /// Provide an integer between 1 and 4096 to limit output tokens, or "inf" for the maximum available tokens for a given model. /// Defaults to "inf".