From 6395a6cf288858e99da9dd6b8308f98897afa2bb Mon Sep 17 00:00:00 2001
From: Chris Raethke <codesoda@users.noreply.github.com>
Date: Mon, 23 Jun 2025 16:28:34 +1000
Subject: [PATCH 1/4] feat: enhance realtime response types and audio
 transcription options

- Added `Cancelled` variant to `ResponseStatusDetail` enum for better handling of cancelled responses.
- Introduced `LogProb` struct to capture log probability information for transcribed tokens.
- Updated `ConversationItemInputAudioTranscriptionCompletedEvent` and `ConversationItemInputAudioTranscriptionDeltaEvent` to include optional `logprobs` for per-token log probability data.
- Enhanced `AudioTranscription` struct with optional fields for `language`, `model`, and `prompt` to improve transcription accuracy and customization.
- Added new `SemanticVAD` option in the `TurnDetection` enum to control model response eagerness.
- Expanded `RealtimeVoice` enum with additional voice options for more variety in audio responses.
---
 .../src/types/realtime/response_resource.rs   |  2 ++
 .../src/types/realtime/server_event.rs        | 30 ++++++++++++++++
 .../src/types/realtime/session_resource.rs    | 34 +++++++++++++++----
 3 files changed, 60 insertions(+), 6 deletions(-)
diff --git a/async-openai/src/types/realtime/response_resource.rs b/async-openai/src/types/realtime/response_resource.rs
index 4a500890..a6c6c32f 100644
--- a/async-openai/src/types/realtime/response_resource.rs
+++ b/async-openai/src/types/realtime/response_resource.rs
@@ -40,6 +40,8 @@ pub enum ResponseStatusDetail {
     Incomplete { reason: IncompleteReason },
     #[serde(rename = "failed")]
     Failed { error: Option<FailedError> },
+    #[serde(rename = "cancelled")]
+    Cancelled { reason: String },
 }
 
 #[derive(Debug, Serialize, Deserialize, Clone)]
diff --git a/async-openai/src/types/realtime/server_event.rs b/async-openai/src/types/realtime/server_event.rs
index 3ba5f552..8795f6e4 100644
--- a/async-openai/src/types/realtime/server_event.rs
+++ b/async-openai/src/types/realtime/server_event.rs
@@ -83,6 +83,17 @@ pub struct ConversationItemCreatedEvent {
     pub item: Item,
 }
 
+#[derive(Debug, Serialize, Deserialize, Clone)]
+/// Log probability information for a transcribed token.
+pub struct LogProb {
+    /// Raw UTF-8 bytes for the token.
+    pub bytes: Vec<u8>,
+    /// The log probability of the token.
+    pub logprob: f64,
+    /// The token string.
+    pub token: String,
+}
+
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct ConversationItemInputAudioTranscriptionCompletedEvent {
     /// The unique ID of the server event.
@@ -93,6 +104,22 @@ pub struct ConversationItemInputAudioTranscriptionCompletedEvent {
     pub content_index: u32,
     /// The transcribed text.
     pub transcript: String,
+    /// Optional per-token log probability data.
+    pub logprobs: Option<Vec<LogProb>>,
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ConversationItemInputAudioTranscriptionDeltaEvent {
+    /// The unique ID of the server event.
+    pub event_id: String,
+    /// The ID of the user message item.
+    pub item_id: String,
+    /// The index of the content part containing the audio.
+    pub content_index: u32,
+    /// The text delta.
+    pub delta: String,
+    /// Optional per-token log probability data.
+    pub logprobs: Option<Vec<LogProb>>,
 }
 
 #[derive(Debug, Serialize, Deserialize, Clone)]
@@ -378,6 +405,9 @@ pub enum ServerEvent {
         ConversationItemInputAudioTranscriptionCompletedEvent,
     ),
 
+    #[serde(rename = "conversation.item.input_audio_transcription.delta")]
+    ConversationItemInputAudioTranscriptionDelta(ConversationItemInputAudioTranscriptionDeltaEvent),
+
     /// Returned when input audio transcription is configured, and a transcription request for a user message failed.
     #[serde(rename = "conversation.item.input_audio_transcription.failed")]
     ConversationItemInputAudioTranscriptionFailed(
diff --git a/async-openai/src/types/realtime/session_resource.rs b/async-openai/src/types/realtime/session_resource.rs
index 10472414..89be7133 100644
--- a/async-openai/src/types/realtime/session_resource.rs
+++ b/async-openai/src/types/realtime/session_resource.rs
@@ -10,12 +10,19 @@ pub enum AudioFormat {
     G711ALAW,
 }
 
-#[derive(Debug, Serialize, Deserialize, Clone)]
+#[derive(Debug, Default, Serialize, Deserialize, Clone)]
 pub struct AudioTranscription {
-    /// Whether to enable input audio transcription.
-    pub enabled: bool,
-    /// The model to use for transcription (e.g., "whisper-1").
-    pub model: String,
+    /// The language of the input audio. Supplying the input language in ISO-639-1 (e.g. en) format will improve accuracy and latency.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub language: Option<String>,
+    /// The model to use for transcription, current options are gpt-4o-transcribe, gpt-4o-mini-transcribe, and whisper-1.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub model: Option<String>,
+    /// An optional text to guide the model's style or continue a previous audio segment.
+    /// For whisper-1, the prompt is a list of keywords. For gpt-4o-transcribe models,
+    /// the prompt is a free text string, for example "expect words related to technology".
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub prompt: Option<String>,
 }
 
 #[derive(Debug, Serialize, Deserialize, Clone)]
@@ -31,6 +38,14 @@ pub enum TurnDetection {
         /// Duration of silence to detect speech stop (in milliseconds).
         silence_duration_ms: u32,
     },
+
+    #[serde(rename = "semantic_vad")]
+    SemanticVAD {
+        /// The eagerness of the model to respond.
+        /// `low` will wait longer for the user to continue speaking,
+        /// `high`` will respond more quickly. `auto`` is the default and is equivalent to `medium`
+        eagerness: String,
+    },
 }
 
 #[derive(Debug, Serialize, Deserialize, Clone)]
@@ -78,8 +93,15 @@ pub enum ToolChoice {
 #[serde(rename_all = "lowercase")]
 pub enum RealtimeVoice {
     Alloy,
-    Shimmer,
+    Ash,
+    Ballad,
+    Coral,
     Echo,
+    Fable,
+    Onyx,
+    Nova,
+    Shimmer,
+    Verse,
 }
 
 #[derive(Debug, Serialize, Deserialize, Clone, Default)]

From daeb8c7c1686e7b3cfe1395b6b0b1dcabe19a4e6 Mon Sep 17 00:00:00 2001
From: Chris Raethke <codesoda@users.noreply.github.com>
Date: Mon, 23 Jun 2025 17:40:57 +1000
Subject: [PATCH 2/4] feat: update audio format enum values for consistency

- Changed enum variants for `AudioFormat` to use underscores instead of hyphens in their serialized names.
- Updated `G711ULAW` from `g711-ulaw` to `g711_law` and `G711ALAW` from `g711-alaw` to `g711_alaw` for improved clarity and adherence to naming conventions.
- Note: `g711_law` is a typo for `g711_ulaw`; the serialized name is corrected in patch 4/4.
---
 async-openai/src/types/realtime/session_resource.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/async-openai/src/types/realtime/session_resource.rs b/async-openai/src/types/realtime/session_resource.rs
index 89be7133..e2e4067f 100644
--- a/async-openai/src/types/realtime/session_resource.rs
+++ b/async-openai/src/types/realtime/session_resource.rs
@@ -4,9 +4,9 @@ use serde::{Deserialize, Serialize};
 pub enum AudioFormat {
     #[serde(rename = "pcm16")]
     PCM16,
-    #[serde(rename = "g711-ulaw")]
+    #[serde(rename = "g711_law")]
     G711ULAW,
-    #[serde(rename = "g711-alaw")]
+    #[serde(rename = "g711_alaw")]
     G711ALAW,
 }
 

From 2bb05e3904aafd6174188320a1a25306aa0bdf52 Mon Sep 17 00:00:00 2001
From: Chris Raethke <codesoda@users.noreply.github.com>
Date: Thu, 26 Jun 2025 11:57:08 +1000
Subject: [PATCH 3/4] feat: add auto-response options to VAD configurations

---
 .../src/types/realtime/session_resource.rs     | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/async-openai/src/types/realtime/session_resource.rs b/async-openai/src/types/realtime/session_resource.rs
index e2e4067f..2fe1e5b1 100644
--- a/async-openai/src/types/realtime/session_resource.rs
+++ b/async-openai/src/types/realtime/session_resource.rs
@@ -37,6 +37,15 @@ pub enum TurnDetection {
         prefix_padding_ms: u32,
         /// Duration of silence to detect speech stop (in milliseconds).
         silence_duration_ms: u32,
+
+        /// Whether or not to automatically generate a response when a VAD stop event occurs.
+        #[serde(skip_serializing_if = "Option::is_none")]
+        create_response: Option<bool>,
+
+        /// Whether or not to automatically interrupt any ongoing response with output to
+        /// the default conversation (i.e. conversation of auto) when a VAD start event occurs.
+        #[serde(skip_serializing_if = "Option::is_none")]
+        interrupt_response: Option<bool>,
     },
 
     #[serde(rename = "semantic_vad")]
@@ -45,6 +54,15 @@ pub enum TurnDetection {
         /// `low` will wait longer for the user to continue speaking,
         /// `high`` will respond more quickly. `auto`` is the default and is equivalent to `medium`
         eagerness: String,
+
+        /// Whether or not to automatically generate a response when a VAD stop event occurs.
+        #[serde(skip_serializing_if = "Option::is_none", default)]
+        create_response: Option<bool>,
+
+        /// Whether or not to automatically interrupt any ongoing response with output to
+        /// the default conversation (i.e. conversation of auto) when a VAD start event occurs.
+        #[serde(skip_serializing_if = "Option::is_none", default)]
+        interrupt_response: Option<bool>,
     },
 }
 

From 479bf1e3274fd49d7e7e0ee0634ee778c4835a45 Mon Sep 17 00:00:00 2001
From: Chris Raethke <codesoda@users.noreply.github.com>
Date: Mon, 30 Jun 2025 14:58:31 +1000
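Subject: [PATCH 4/4] feat: add realtime API types and event handling for
 audio, tracing, and response management

- Added `ResponseConfig` and switched `ResponseCreateEvent::response` from `SessionResource` to it.
- Added the `output_audio_buffer.clear` and `conversation.item.retrieve` client events and the matching `output_audio_buffer.cleared` and `conversation.item.retrieved` server events.
- Added `response_id` to `ResponseCancelEvent`.
- Added `Modality`, `RealtimeModel`, `TracingOption`, `TracingConfiguration`, `InputAudioNoiseReduction` and `FinishReason`; `SessionResource::model` and `SessionResource::modalities` now use the typed enums instead of strings.
- Added `speed`, `input_audio_noise_reduction` and `tracing` to `SessionResource`, and `created_at` / `finish_reason` to `ResponseResource`.
- Fixed the serialized name of `AudioFormat::G711ULAW` (`g711_law` -> `g711_ulaw`).
- Breaking: renamed `InputAudioBufferCommitedEvent` and `ServerEvent::InputAudioBufferCommited` to the correctly spelled `InputAudioBufferCommittedEvent` and `ServerEvent::InputAudioBufferCommitted`.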
---
 .../src/types/realtime/client_event.rs        | 113 +++++++++++++++++-
 .../src/types/realtime/response_resource.rs   |  17 +++
 .../src/types/realtime/server_event.rs        |  26 +++-
 .../src/types/realtime/session_resource.rs    |  87 +++++++++++++-
 4 files changed, 236 insertions(+), 7 deletions(-)

diff --git a/async-openai/src/types/realtime/client_event.rs b/async-openai/src/types/realtime/client_event.rs
index 87ff7010..9fb3d2a4 100644
--- a/async-openai/src/types/realtime/client_event.rs
+++ b/async-openai/src/types/realtime/client_event.rs
@@ -1,7 +1,76 @@
 use serde::{Deserialize, Serialize};
 use tokio_tungstenite::tungstenite::Message;
 
-use super::{item::Item, session_resource::SessionResource};
+use super::{
+    item::Item,
+    session_resource::{
+        AudioFormat, MaxResponseOutputTokens, Modality, RealtimeVoice, SessionResource, ToolChoice,
+        ToolDefinition,
+    },
+};
+
+/// Configuration for a response in the OpenAI Realtime API.
+/// This is used in the `response.create` event.
+#[derive(Debug, Serialize, Deserialize, Clone, Default)]
+pub struct ResponseConfig {
+    /// Controls which conversation the response is added to. Currently supports "auto" and "none",
+    /// with "auto" as the default value. The "auto" value means that the contents of the response
+    /// will be added to the default conversation. Set this to "none" to create an out-of-band response
+    /// which will not add items to default conversation.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub conversation: Option<String>,
+
+    /// Input items to include in the prompt for the model. Using this field creates a new context
+    /// for this Response instead of using the default conversation. An empty array [] will clear
+    /// the context for this Response. Note that this can include references to items from the default conversation.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub input: Option<Vec<Item>>,
+
+    /// The default system instructions (i.e. system message) prepended to model calls.
+    /// This field allows the client to guide the model on desired responses.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub instructions: Option<String>,
+
+    /// Maximum number of output tokens for a single assistant response, inclusive of tool calls.
+    /// Provide an integer between 1 and 4096 to limit output tokens, or "inf" for the maximum available tokens for a given model.
+    /// Defaults to "inf".
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub max_response_output_tokens: Option<MaxResponseOutputTokens>,
+
+    /// Set of 16 key-value pairs that can be attached to an object.
+    /// This can be useful for storing additional information about the object in a structured format.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub metadata: Option<std::collections::HashMap<String, String>>,
+
+    /// The set of modalities the model can respond with. To disable audio, set this to ["text"].
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub modalities: Option<Vec<Modality>>,
+
+    /// The format of output audio. Options are "pcm16", "g711_ulaw", or "g711_alaw".
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub output_audio_format: Option<AudioFormat>,
+
+    /// Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub temperature: Option<f32>,
+
+    /// How the model chooses tools.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub tool_choice: Option<ToolChoice>,
+
+    /// Tools (functions) available to the model.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub tools: Option<Vec<ToolDefinition>>,
+
+    /// The voice the model uses to respond. Cannot be changed once the model has responded with audio at least once.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub voice: Option<RealtimeVoice>,
+
+    /// The speed of the model's spoken response. 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed.
+    /// This value can only be changed in between model turns, not while a response is in progress.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub speed: Option<f32>,
+}
 
 #[derive(Debug, Serialize, Deserialize, Clone, Default)]
 pub struct SessionUpdateEvent {
@@ -35,6 +104,13 @@ pub struct InputAudioBufferClearEvent {
     pub event_id: Option<String>,
 }
 
+#[derive(Debug, Serialize, Deserialize, Clone, Default)]
+pub struct OutputAudioBufferClearEvent {
+    /// Optional client-generated ID used to identify this event.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub event_id: Option<String>,
+}
+
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct ConversationItemCreateEvent {
     /// Optional client-generated ID used to identify this event.
@@ -75,6 +151,16 @@ pub struct ConversationItemDeleteEvent {
     pub item_id: String,
 }
 
+#[derive(Debug, Serialize, Deserialize, Clone, Default)]
+pub struct ConversationItemRetrieveEvent {
+    /// Optional client-generated ID used to identify this event.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub event_id: Option<String>,
+
+    /// The ID of the item to retrieve.
+    pub item_id: String,
+}
+
 #[derive(Debug, Serialize, Deserialize, Clone, Default)]
 pub struct ResponseCreateEvent {
     /// Optional client-generated ID used to identify this event.
@@ -82,7 +168,7 @@ pub struct ResponseCreateEvent {
     pub event_id: Option<String>,
 
     /// Configuration for the response.
-    pub response: Option<SessionResource>,
+    pub response: Option<ResponseConfig>,
 }
 
 #[derive(Debug, Serialize, Deserialize, Clone, Default)]
@@ -90,6 +176,9 @@ pub struct ResponseCancelEvent {
     /// Optional client-generated ID used to identify this event.
     #[serde(skip_serializing_if = "Option::is_none")]
     pub event_id: Option<String>,
+    /// A specific response ID to cancel - if not provided, will cancel an in-progress response in the default conversation.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub response_id: Option<String>,
 }
 
 /// These are events that the OpenAI Realtime WebSocket server will accept from the client.
@@ -112,6 +201,10 @@ pub enum ClientEvent {
     #[serde(rename = "input_audio_buffer.clear")]
     InputAudioBufferClear(InputAudioBufferClearEvent),
 
+    /// WebRTC Only: Send this event to cut off the current audio response.
+    #[serde(rename = "output_audio_buffer.clear")]
+    OutputAudioBufferClear(OutputAudioBufferClearEvent),
+
     /// Send this event when adding an item to the conversation.
     #[serde(rename = "conversation.item.create")]
     ConversationItemCreate(ConversationItemCreateEvent),
@@ -124,6 +217,10 @@ pub enum ClientEvent {
     #[serde(rename = "conversation.item.delete")]
     ConversationItemDelete(ConversationItemDeleteEvent),
 
+    /// Send this event when you want to retrieve the server's representation of a specific item in the conversation history.
+    #[serde(rename = "conversation.item.retrieve")]
+    ConversationItemRetrieve(ConversationItemRetrieveEvent),
+
     /// Send this event to trigger a response generation.
     #[serde(rename = "response.create")]
     ResponseCreate(ResponseCreateEvent),
@@ -181,6 +278,11 @@ event_from!(
     ClientEvent,
     InputAudioBufferClear
 );
+event_from!(
+    OutputAudioBufferClearEvent,
+    ClientEvent,
+    OutputAudioBufferClear
+);
 event_from!(
     ConversationItemCreateEvent,
     ClientEvent,
@@ -198,14 +300,21 @@ event_from!(
 );
 event_from!(ResponseCreateEvent, ClientEvent, ResponseCreate);
 event_from!(ResponseCancelEvent, ClientEvent, ResponseCancel);
+event_from!(
+    ConversationItemRetrieveEvent,
+    ClientEvent,
+    ConversationItemRetrieve
+);
 
 message_from_event!(SessionUpdateEvent, ClientEvent);
 message_from_event!(InputAudioBufferAppendEvent, ClientEvent);
 message_from_event!(InputAudioBufferCommitEvent, ClientEvent);
 message_from_event!(InputAudioBufferClearEvent, ClientEvent);
+message_from_event!(OutputAudioBufferClearEvent, ClientEvent);
 message_from_event!(ConversationItemCreateEvent, ClientEvent);
 message_from_event!(ConversationItemTruncateEvent, ClientEvent);
 message_from_event!(ConversationItemDeleteEvent, ClientEvent);
+message_from_event!(ConversationItemRetrieveEvent, ClientEvent);
 message_from_event!(ResponseCreateEvent, ClientEvent);
 message_from_event!(ResponseCancelEvent, ClientEvent);
 
diff --git a/async-openai/src/types/realtime/response_resource.rs b/async-openai/src/types/realtime/response_resource.rs
index a6c6c32f..922c1fad 100644
--- a/async-openai/src/types/realtime/response_resource.rs
+++ b/async-openai/src/types/realtime/response_resource.rs
@@ -31,6 +31,17 @@ pub enum IncompleteReason {
     Interruption,
     MaxOutputTokens,
     ContentFilter,
+    TokenLimit,
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+#[serde(rename_all = "snake_case")]
+pub enum FinishReason {
+    Stop,
+    Length,
+    ToolCalls,
+    ContentFilter,
+    FunctionCall,
 }
 
 #[derive(Debug, Serialize, Deserialize, Clone)]
@@ -58,4 +69,10 @@ pub struct ResponseResource {
     pub output: Vec<Item>,
     /// Usage statistics for the response.
     pub usage: Option<Usage>,
+    /// The Unix timestamp (in seconds) for when the response was created.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub created_at: Option<u64>,
+    /// The reason the model stopped generating tokens, if applicable.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub finish_reason: Option<FinishReason>,
 }
diff --git a/async-openai/src/types/realtime/server_event.rs b/async-openai/src/types/realtime/server_event.rs
index 8795f6e4..fadeaa2d 100644
--- a/async-openai/src/types/realtime/server_event.rs
+++ b/async-openai/src/types/realtime/server_event.rs
@@ -38,7 +38,7 @@ pub struct ConversationCreatedEvent {
 }
 
 #[derive(Debug, Serialize, Deserialize, Clone)]
-pub struct InputAudioBufferCommitedEvent {
+pub struct InputAudioBufferCommittedEvent {
     /// The unique ID of the server event.
     pub event_id: String,
     /// The ID of the preceding item after which the new item will be inserted.
@@ -53,6 +53,12 @@ pub struct InputAudioBufferClearedEvent {
     pub event_id: String,
 }
 
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct OutputAudioBufferClearedEvent {
+    /// The unique ID of the server event.
+    pub event_id: String,
+}
+
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct InputAudioBufferSpeechStartedEvent {
     /// The unique ID of the server event.
@@ -154,6 +160,14 @@ pub struct ConversationItemDeletedEvent {
     pub item_id: String,
 }
 
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ConversationItemRetrievedEvent {
+    /// The unique ID of the server event.
+    pub event_id: String,
+    /// The item that was retrieved.
+    pub item: Item,
+}
+
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct ResponseCreatedEvent {
     /// The unique ID of the server event.
@@ -381,12 +395,16 @@ pub enum ServerEvent {
 
     /// Returned when an input audio buffer is committed, either by the client or automatically in server VAD mode.
     #[serde(rename = "input_audio_buffer.committed")]
-    InputAudioBufferCommited(InputAudioBufferCommitedEvent),
+    InputAudioBufferCommitted(InputAudioBufferCommittedEvent),
 
     /// Returned when the input audio buffer is cleared by the client.
     #[serde(rename = "input_audio_buffer.cleared")]
     InputAudioBufferCleared(InputAudioBufferClearedEvent),
 
+    /// Returned when the output audio buffer is cleared by the client (WebRTC specific).
+    #[serde(rename = "output_audio_buffer.cleared")]
+    OutputAudioBufferCleared(OutputAudioBufferClearedEvent),
+
     /// Returned in server turn detection mode when speech is detected.
     #[serde(rename = "input_audio_buffer.speech_started")]
     InputAudioBufferSpeechStarted(InputAudioBufferSpeechStartedEvent),
@@ -422,6 +440,10 @@ pub enum ServerEvent {
     #[serde(rename = "conversation.item.deleted")]
     ConversationItemDeleted(ConversationItemDeletedEvent),
 
+    /// Returned when an item in the conversation is retrieved.
+    #[serde(rename = "conversation.item.retrieved")]
+    ConversationItemRetrieved(ConversationItemRetrievedEvent),
+
     /// Returned when a new Response is created. The first event of response creation, where the response is in an initial state of "in_progress".
     #[serde(rename = "response.created")]
     ResponseCreated(ResponseCreatedEvent),
diff --git a/async-openai/src/types/realtime/session_resource.rs b/async-openai/src/types/realtime/session_resource.rs
index 2fe1e5b1..fec64848 100644
--- a/async-openai/src/types/realtime/session_resource.rs
+++ b/async-openai/src/types/realtime/session_resource.rs
@@ -4,12 +4,28 @@ use serde::{Deserialize, Serialize};
 pub enum AudioFormat {
     #[serde(rename = "pcm16")]
     PCM16,
-    #[serde(rename = "g711_law")]
+    #[serde(rename = "g711_ulaw")]
     G711ULAW,
     #[serde(rename = "g711_alaw")]
     G711ALAW,
 }
 
+#[derive(Debug, Serialize, Deserialize, Clone)]
+#[serde(rename_all = "snake_case")]
+pub enum NoiseReductionType {
+    NearField,
+    FarField,
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct InputAudioNoiseReduction {
+    /// Type of noise reduction. `near_field` is for close-talking microphones such as
+    /// headphones, `far_field` is for far-field microphones such as laptop or
+    /// conference room microphones.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub r#type: Option<NoiseReductionType>,
+}
+
 #[derive(Debug, Default, Serialize, Deserialize, Clone)]
 pub struct AudioTranscription {
     /// The language of the input audio. Supplying the input language in ISO-639-1 (e.g. en) format will improve accuracy and latency.
@@ -74,6 +90,32 @@ pub enum MaxResponseOutputTokens {
     Num(u16),
 }
 
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct TracingConfiguration {
+    /// The group id to attach to this trace to enable filtering and grouping in the traces dashboard.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub group_id: Option<String>,
+
+    /// The arbitrary metadata to attach to this trace to enable filtering in the traces dashboard.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub metadata: Option<serde_json::Value>,
+
+    /// The name of the workflow to attach to this trace. This is used to name the trace in the traces dashboard.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub workflow_name: Option<String>,
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+/// Tracing setting for a session: either the string `"auto"` or a [`TracingConfiguration`] object.
+pub enum TracingOption {
+    /// Auto tracing with default values; (de)serialized as the plain string `"auto"`.
+    #[serde(rename = "auto")]
+    Auto,
+    /// Granular tracing configuration; (de)serialized as the bare object, so it must stay the last variant.
+    #[serde(untagged)]
+    Config(TracingConfiguration),
+}
+
 #[derive(Debug, Serialize, Deserialize, Clone)]
 #[serde(tag = "type")]
 pub enum ToolDefinition {
@@ -118,19 +160,43 @@ pub enum RealtimeVoice {
     Fable,
     Onyx,
     Nova,
+    Sage,
     Shimmer,
     Verse,
 }
 
+#[derive(Debug, Serialize, Deserialize, Clone)]
+#[serde(rename_all = "lowercase")]
+pub enum Modality {
+    Text,
+    Audio,
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub enum RealtimeModel {
+    #[serde(rename = "gpt-4o-realtime-preview")]
+    GPT4ORealtimePreview,
+    #[serde(rename = "gpt-4o-realtime-preview-2024-10-01")]
+    GPT4ORealtimePreview20241001,
+    #[serde(rename = "gpt-4o-realtime-preview-2024-12-17")]
+    GPT4ORealtimePreview20241217,
+    #[serde(rename = "gpt-4o-realtime-preview-2025-06-03")]
+    GPT4ORealtimePreview20250603,
+    #[serde(rename = "gpt-4o-mini-realtime-preview")]
+    GPT4OMiniRealtimePreview,
+    #[serde(rename = "gpt-4o-mini-realtime-preview-2024-12-17")]
+    GPT4OMiniRealtimePreview20241217,
+}
+
 #[derive(Debug, Serialize, Deserialize, Clone, Default)]
 pub struct SessionResource {
     /// The default model used for this session.
     #[serde(skip_serializing_if = "Option::is_none")]
-    pub model: Option<String>,
+    pub model: Option<RealtimeModel>,
 
     /// The set of modalities the model can respond with. To disable audio, set this to ["text"].
     #[serde(skip_serializing_if = "Option::is_none")]
-    pub modalities: Option<Vec<String>>,
+    pub modalities: Option<Vec<Modality>>,
 
     //// The default system instructions prepended to model calls.
     #[serde(skip_serializing_if = "Option::is_none")]
@@ -140,10 +206,22 @@ pub struct SessionResource {
     #[serde(skip_serializing_if = "Option::is_none")]
     pub voice: Option<RealtimeVoice>,
 
+    /// The speed of the model's spoken response. 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed.
+    /// This value can only be changed in between model turns, not while a response is in progress.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub speed: Option<f32>,
+
     /// The format of input audio. Options are "pcm16", "g711_ulaw", or "g711_alaw".
     #[serde(skip_serializing_if = "Option::is_none")]
     pub input_audio_format: Option<AudioFormat>,
 
+    /// Configuration for input audio noise reduction. This can be set to `null` to turn off.
+    /// Noise reduction filters audio added to the input audio buffer before it is sent to VAD and the model.
+    /// Filtering the audio can improve VAD and turn detection accuracy (reducing false positives)
+    /// and model performance by improving perception of the input audio.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub input_audio_noise_reduction: Option<InputAudioNoiseReduction>,
+
     /// The format of output audio. Options are "pcm16", "g711_ulaw", or "g711_alaw".
     #[serde(skip_serializing_if = "Option::is_none")]
     pub output_audio_format: Option<AudioFormat>,
@@ -168,6 +246,9 @@ pub struct SessionResource {
     /// Sampling temperature for the model.
     pub temperature: Option<f32>,
 
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub tracing: Option<TracingOption>,
+
     /// Maximum number of output tokens for a single assistant response, inclusive of tool calls.
     /// Provide an integer between 1 and 4096 to limit output tokens, or "inf" for the maximum available tokens for a given model.
     /// Defaults to "inf".