Skip to content

Commit be059c2

Browse files
authored
feat: enhance realtime response types and audio transcription options (#391)
* feat: enhance realtime response types and audio transcription options - Added `Cancelled` variant to `ResponseStatusDetail` enum for better handling of cancelled responses. - Introduced `LogProb` struct to capture log probability information for transcribed tokens. - Updated `ConversationItemInputAudioTranscriptionCompletedEvent` and `ConversationItemInputAudioTranscriptionDeltaEvent` to include optional `logprobs` for per-token log probability data. - Enhanced `AudioTranscription` struct with optional fields for `language`, `model`, and `prompt` to improve transcription accuracy and customization. - Added new `SemanticVAD` option in the `TurnDetection` enum to control model response eagerness. - Expanded `RealtimeVoice` enum with additional voice options for more variety in audio responses. * feat: update audio format enum values for consistency - Changed enum variants for `AudioFormat` to use underscores instead of hyphens in their serialized names. - Updated `G711ULAW` from `g711-ulaw` to `g711_law` and `G711ALAW` from `g711-alaw` to `g711_alaw` for improved clarity and adherence to naming conventions. * feat: add auto-response options to VAD configurations --------- Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
1 parent 482344a commit be059c2

File tree

3 files changed

+80
-8
lines changed

3 files changed

+80
-8
lines changed

async-openai/src/types/realtime/response_resource.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ pub enum ResponseStatusDetail {
4040
Incomplete { reason: IncompleteReason },
4141
#[serde(rename = "failed")]
4242
Failed { error: Option<FailedError> },
43+
#[serde(rename = "cancelled")]
44+
Cancelled { reason: String },
4345
}
4446

4547
#[derive(Debug, Serialize, Deserialize, Clone)]

async-openai/src/types/realtime/server_event.rs

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,17 @@ pub struct ConversationItemCreatedEvent {
8383
pub item: Item,
8484
}
8585

86+
#[derive(Debug, Serialize, Deserialize, Clone)]
87+
/// Log probability information for a transcribed token.
88+
pub struct LogProb {
89+
/// Raw UTF-8 bytes for the token.
90+
pub bytes: Vec<u8>,
91+
/// The log probability of the token.
92+
pub logprob: f64,
93+
/// The token string.
94+
pub token: String,
95+
}
96+
8697
#[derive(Debug, Serialize, Deserialize, Clone)]
8798
pub struct ConversationItemInputAudioTranscriptionCompletedEvent {
8899
/// The unique ID of the server event.
@@ -93,6 +104,22 @@ pub struct ConversationItemInputAudioTranscriptionCompletedEvent {
93104
pub content_index: u32,
94105
/// The transcribed text.
95106
pub transcript: String,
107+
/// Optional per-token log probability data.
108+
pub logprobs: Option<Vec<LogProb>>,
109+
}
110+
111+
#[derive(Debug, Serialize, Deserialize, Clone)]
112+
pub struct ConversationItemInputAudioTranscriptionDeltaEvent {
113+
/// The unique ID of the server event.
114+
pub event_id: String,
115+
/// The ID of the user message item.
116+
pub item_id: String,
117+
/// The index of the content part containing the audio.
118+
pub content_index: u32,
119+
/// The text delta.
120+
pub delta: String,
121+
/// Optional per-token log probability data.
122+
pub logprobs: Option<Vec<LogProb>>,
96123
}
97124

98125
#[derive(Debug, Serialize, Deserialize, Clone)]
@@ -378,6 +405,9 @@ pub enum ServerEvent {
378405
ConversationItemInputAudioTranscriptionCompletedEvent,
379406
),
380407

408+
#[serde(rename = "conversation.item.input_audio_transcription.delta")]
409+
ConversationItemInputAudioTranscriptionDelta(ConversationItemInputAudioTranscriptionDeltaEvent),
410+
381411
/// Returned when input audio transcription is configured, and a transcription request for a user message failed.
382412
#[serde(rename = "conversation.item.input_audio_transcription.failed")]
383413
ConversationItemInputAudioTranscriptionFailed(

async-openai/src/types/realtime/session_resource.rs

Lines changed: 48 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,25 @@ use serde::{Deserialize, Serialize};
44
pub enum AudioFormat {
55
#[serde(rename = "pcm16")]
66
PCM16,
7-
#[serde(rename = "g711-ulaw")]
7+
#[serde(rename = "g711_law")]
88
G711ULAW,
9-
#[serde(rename = "g711-alaw")]
9+
#[serde(rename = "g711_alaw")]
1010
G711ALAW,
1111
}
1212

13-
#[derive(Debug, Serialize, Deserialize, Clone)]
13+
#[derive(Debug, Default, Serialize, Deserialize, Clone)]
1414
pub struct AudioTranscription {
15-
/// Whether to enable input audio transcription.
16-
pub enabled: bool,
17-
/// The model to use for transcription (e.g., "whisper-1").
18-
pub model: String,
15+
/// The language of the input audio. Supplying the input language in ISO-639-1 (e.g. en) format will improve accuracy and latency.
16+
#[serde(skip_serializing_if = "Option::is_none")]
17+
pub language: Option<String>,
18+
/// The model to use for transcription. Current options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1`.
19+
#[serde(skip_serializing_if = "Option::is_none")]
20+
pub model: Option<String>,
21+
/// An optional text to guide the model's style or continue a previous audio segment.
22+
/// For whisper-1, the prompt is a list of keywords. For gpt-4o-transcribe models,
23+
/// the prompt is a free text string, for example "expect words related to technology".
24+
#[serde(skip_serializing_if = "Option::is_none")]
25+
pub prompt: Option<String>,
1926
}
2027

2128
#[derive(Debug, Serialize, Deserialize, Clone)]
@@ -30,6 +37,32 @@ pub enum TurnDetection {
3037
prefix_padding_ms: u32,
3138
/// Duration of silence to detect speech stop (in milliseconds).
3239
silence_duration_ms: u32,
40+
41+
/// Whether or not to automatically generate a response when a VAD stop event occurs.
42+
#[serde(skip_serializing_if = "Option::is_none")]
43+
create_response: Option<bool>,
44+
45+
/// Whether or not to automatically interrupt any ongoing response with output to
46+
/// the default conversation (i.e. `conversation` of `auto`) when a VAD start event occurs.
47+
#[serde(skip_serializing_if = "Option::is_none")]
48+
interrupt_response: Option<bool>,
49+
},
50+
51+
#[serde(rename = "semantic_vad")]
52+
SemanticVAD {
53+
/// The eagerness of the model to respond.
54+
/// `low` will wait longer for the user to continue speaking,
55+
/// `high` will respond more quickly. `auto` is the default and is equivalent to `medium`.
56+
eagerness: String,
57+
58+
/// Whether or not to automatically generate a response when a VAD stop event occurs.
59+
#[serde(skip_serializing_if = "Option::is_none", default)]
60+
create_response: Option<bool>,
61+
62+
/// Whether or not to automatically interrupt any ongoing response with output to
63+
/// the default conversation (i.e. `conversation` of `auto`) when a VAD start event occurs.
64+
#[serde(skip_serializing_if = "Option::is_none", default)]
65+
interrupt_response: Option<bool>,
3366
},
3467
}
3568

@@ -78,8 +111,15 @@ pub enum ToolChoice {
78111
#[serde(rename_all = "lowercase")]
79112
pub enum RealtimeVoice {
80113
Alloy,
81-
Shimmer,
114+
Ash,
115+
Ballad,
116+
Coral,
82117
Echo,
118+
Fable,
119+
Onyx,
120+
Nova,
121+
Shimmer,
122+
Verse,
83123
}
84124

85125
#[derive(Debug, Serialize, Deserialize, Clone, Default)]

0 commit comments

Comments
 (0)