feat: enhance realtime response types and audio transcription options (#391)

codesoda · web-flow · commit be059c228cd5 · 2025-06-28T18:05:21.000-07:00
* feat: enhance realtime response types and audio transcription options

- Added `Cancelled` variant to `ResponseStatusDetail` enum for better handling of cancelled responses.
- Introduced `LogProb` struct to capture log probability information for transcribed tokens.
- Updated `ConversationItemInputAudioTranscriptionCompletedEvent` and `ConversationItemInputAudioTranscriptionDeltaEvent` to include optional `logprobs` for per-token log probability data.
- Enhanced `AudioTranscription` struct with optional fields for `language`, `model`, and `prompt` to improve transcription accuracy and customization.
- Added new `SemanticVAD` option in the `TurnDetection` enum to control model response eagerness.
- Expanded `RealtimeVoice` enum with additional voice options for more variety in audio responses.

* feat: update audio format enum values for consistency

- Changed enum variants for `AudioFormat` to use underscores instead of hyphens in their serialized names.
- Updated `G711ULAW` from `g711-ulaw` to `g711_law` and `G711ALAW` from `g711-alaw` to `g711_alaw` for improved clarity and adherence to naming conventions.

* feat: add auto-response options to VAD configurations

---------

Co-authored-by: Chris Raethke &lt;codesoda@users.noreply.github.com&gt;
diff --git a/async-openai/src/types/realtime/response_resource.rs b/async-openai/src/types/realtime/response_resource.rs
@@ -40,6 +40,8 @@ pub enum ResponseStatusDetail {
     Incomplete { reason: IncompleteReason },
     #[serde(rename = "failed")]
     Failed { error: Option<FailedError> },
+    #[serde(rename = "cancelled")]
+    Cancelled { reason: String },
 }
 
 #[derive(Debug, Serialize, Deserialize, Clone)]
diff --git a/async-openai/src/types/realtime/server_event.rs b/async-openai/src/types/realtime/server_event.rs
@@ -83,6 +83,17 @@ pub struct ConversationItemCreatedEvent {
     pub item: Item,
 }
 
+#[derive(Debug, Serialize, Deserialize, Clone)]
+/// Log probability information for a transcribed token.
+pub struct LogProb {
+    /// Raw UTF-8 bytes for the token.
+    pub bytes: Vec<u8>,
+    /// The log probability of the token.
+    pub logprob: f64,
+    /// The token string.
+    pub token: String,
+}
+
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct ConversationItemInputAudioTranscriptionCompletedEvent {
     /// The unique ID of the server event.
@@ -93,6 +104,22 @@ pub struct ConversationItemInputAudioTranscriptionCompletedEvent {
     pub content_index: u32,
     /// The transcribed text.
     pub transcript: String,
+    /// Optional per-token log probability data.
+    pub logprobs: Option<Vec<LogProb>>,
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ConversationItemInputAudioTranscriptionDeltaEvent {
+    /// The unique ID of the server event.
+    pub event_id: String,
+    /// The ID of the user message item.
+    pub item_id: String,
+    /// The index of the content part containing the audio.
+    pub content_index: u32,
+    /// The text delta.
+    pub delta: String,
+    /// Optional per-token log probability data.
+    pub logprobs: Option<Vec<LogProb>>,
 }
 
 #[derive(Debug, Serialize, Deserialize, Clone)]
@@ -378,6 +405,9 @@ pub enum ServerEvent {
         ConversationItemInputAudioTranscriptionCompletedEvent,
     ),
 
+    #[serde(rename = "conversation.item.input_audio_transcription.delta")]
+    ConversationItemInputAudioTranscriptionDelta(ConversationItemInputAudioTranscriptionDeltaEvent),
+
     /// Returned when input audio transcription is configured, and a transcription request for a user message failed.
     #[serde(rename = "conversation.item.input_audio_transcription.failed")]
     ConversationItemInputAudioTranscriptionFailed(
diff --git a/async-openai/src/types/realtime/session_resource.rs b/async-openai/src/types/realtime/session_resource.rs
@@ -4,18 +4,25 @@ use serde::{Deserialize, Serialize};
 pub enum AudioFormat {
     #[serde(rename = "pcm16")]
     PCM16,
-    #[serde(rename = "g711-ulaw")]
+    #[serde(rename = "g711_law")]
     G711ULAW,
-    #[serde(rename = "g711-alaw")]
+    #[serde(rename = "g711_alaw")]
     G711ALAW,
 }
 
-#[derive(Debug, Serialize, Deserialize, Clone)]
+#[derive(Debug, Default, Serialize, Deserialize, Clone)]
 pub struct AudioTranscription {
-    /// Whether to enable input audio transcription.
-    pub enabled: bool,
-    /// The model to use for transcription (e.g., "whisper-1").
-    pub model: String,
+    /// The language of the input audio. Supplying the input language in ISO-639-1 (e.g. en) format will improve accuracy and latency.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub language: Option<String>,
+    /// The model to use for transcription, current options are gpt-4o-transcribe, gpt-4o-mini-transcribe, and whisper-1.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub model: Option<String>,
+    /// An optional text to guide the model's style or continue a previous audio segment.
+    /// For whisper-1, the prompt is a list of keywords. For gpt-4o-transcribe models,
+    /// the prompt is a free text string, for example "expect words related to technology".
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub prompt: Option<String>,
 }
 
 #[derive(Debug, Serialize, Deserialize, Clone)]
@@ -30,6 +37,32 @@ pub enum TurnDetection {
         prefix_padding_ms: u32,
         /// Duration of silence to detect speech stop (in milliseconds).
         silence_duration_ms: u32,
+
+        /// Whether or not to automatically generate a response when a VAD stop event occurs.
+        #[serde(skip_serializing_if = "Option::is_none")]
+        create_response: Option<bool>,
+
+        /// Whether or not to automatically interrupt any ongoing response with output to
+        /// the default conversation (i.e. conversation of auto) when a VAD start event occurs.
+        #[serde(skip_serializing_if = "Option::is_none")]
+        interrupt_response: Option<bool>,
+    },
+
+    #[serde(rename = "semantic_vad")]
+    SemanticVAD {
+        /// The eagerness of the model to respond.
+        /// `low` will wait longer for the user to continue speaking,
+        /// `high`` will respond more quickly. `auto`` is the default and is equivalent to `medium`
+        eagerness: String,
+
+        /// Whether or not to automatically generate a response when a VAD stop event occurs.
+        #[serde(skip_serializing_if = "Option::is_none", default)]
+        create_response: Option<bool>,
+
+        /// Whether or not to automatically interrupt any ongoing response with output to
+        /// the default conversation (i.e. conversation of auto) when a VAD start event occurs.
+        #[serde(skip_serializing_if = "Option::is_none", default)]
+        interrupt_response: Option<bool>,
     },
 }
 
@@ -78,8 +111,15 @@ pub enum ToolChoice {
 #[serde(rename_all = "lowercase")]
 pub enum RealtimeVoice {
     Alloy,
-    Shimmer,
+    Ash,
+    Ballad,
+    Coral,
     Echo,
+    Fable,
+    Onyx,
+    Nova,
+    Shimmer,
+    Verse,
 }
 
 #[derive(Debug, Serialize, Deserialize, Clone, Default)]

Original file line number	Diff line number	Diff line change
`@@ -40,6 +40,8 @@ pub enum ResponseStatusDetail {`
`40`	`40`	`Incomplete { reason: IncompleteReason },`
`41`	`41`	`#[serde(rename = "failed")]`
`42`	`42`	`Failed { error: Option<FailedError> },`
	`43`	`+ #[serde(rename = "cancelled")]`
	`44`	`+ Cancelled { reason: String },`
`43`	`45`	`}`
`44`	`46`
`45`	`47`	`#[derive(Debug, Serialize, Deserialize, Clone)]`