Skip to content

Update more realtime spec #397

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 111 additions & 2 deletions async-openai/src/types/realtime/client_event.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,76 @@
use serde::{Deserialize, Serialize};
use tokio_tungstenite::tungstenite::Message;

use super::{item::Item, session_resource::SessionResource};
use super::{
item::Item,
session_resource::{
AudioFormat, MaxResponseOutputTokens, Modality, RealtimeVoice, SessionResource, ToolChoice,
ToolDefinition,
},
};

/// Configuration for a response in the OpenAI Realtime API.
/// This is used in the `response.create` event.
///
/// Every field is optional; a `None` field carries `skip_serializing_if`
/// and is omitted from the serialized JSON, so the server-side default
/// for that field stays in effect.
#[derive(Debug, Serialize, Deserialize, Clone, Default)]
pub struct ResponseConfig {
    /// Controls which conversation the response is added to. Currently supports "auto" and "none",
    /// with "auto" as the default value. The "auto" value means that the contents of the response
    /// will be added to the default conversation. Set this to "none" to create an out-of-band response
    /// which will not add items to default conversation.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub conversation: Option<String>,

    /// Input items to include in the prompt for the model. Using this field creates a new context
    /// for this Response instead of using the default conversation. An empty array [] will clear
    /// the context for this Response. Note that this can include references to items from the default conversation.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub input: Option<Vec<Item>>,

    /// The default system instructions (i.e. system message) prepended to model calls.
    /// This field allows the client to guide the model on desired responses.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub instructions: Option<String>,

    /// Maximum number of output tokens for a single assistant response, inclusive of tool calls.
    /// Provide an integer between 1 and 4096 to limit output tokens, or "inf" for the maximum available tokens for a given model.
    /// Defaults to "inf".
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_response_output_tokens: Option<MaxResponseOutputTokens>,

    /// Set of 16 key-value pairs that can be attached to an object.
    /// This can be useful for storing additional information about the object in a structured format.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<std::collections::HashMap<String, String>>,

    /// The set of modalities the model can respond with. To disable audio, set this to ["text"].
    #[serde(skip_serializing_if = "Option::is_none")]
    pub modalities: Option<Vec<Modality>>,

    /// The format of output audio. Options are "pcm16", "g711_ulaw", or "g711_alaw".
    #[serde(skip_serializing_if = "Option::is_none")]
    pub output_audio_format: Option<AudioFormat>,

    /// Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f32>,

    /// How the model chooses tools.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_choice: Option<ToolChoice>,

    /// Tools (functions) available to the model.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tools: Option<Vec<ToolDefinition>>,

    /// The voice the model uses to respond. Cannot be changed once the model has responded with audio at least once.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub voice: Option<RealtimeVoice>,

    /// The speed of the model's spoken response. 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed.
    /// This value can only be changed in between model turns, not while a response is in progress.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub speed: Option<f32>,
}

#[derive(Debug, Serialize, Deserialize, Clone, Default)]
pub struct SessionUpdateEvent {
Expand Down Expand Up @@ -35,6 +104,13 @@ pub struct InputAudioBufferClearEvent {
pub event_id: Option<String>,
}

/// WebRTC only: sent as the `output_audio_buffer.clear` client event to cut off
/// the current audio response (see `ClientEvent::OutputAudioBufferClear`).
#[derive(Debug, Serialize, Deserialize, Clone, Default)]
pub struct OutputAudioBufferClearEvent {
    /// Optional client-generated ID used to identify this event.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub event_id: Option<String>,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct ConversationItemCreateEvent {
/// Optional client-generated ID used to identify this event.
Expand Down Expand Up @@ -75,21 +151,34 @@ pub struct ConversationItemDeleteEvent {
pub item_id: String,
}

/// Sent as the `conversation.item.retrieve` client event to ask the server for
/// its representation of a specific item in the conversation history
/// (see `ClientEvent::ConversationItemRetrieve`).
#[derive(Debug, Serialize, Deserialize, Clone, Default)]
pub struct ConversationItemRetrieveEvent {
    /// Optional client-generated ID used to identify this event.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub event_id: Option<String>,

    /// The ID of the item to retrieve.
    pub item_id: String,
}

#[derive(Debug, Serialize, Deserialize, Clone, Default)]
pub struct ResponseCreateEvent {
/// Optional client-generated ID used to identify this event.
#[serde(skip_serializing_if = "Option::is_none")]
pub event_id: Option<String>,

/// Configuration for the response.
pub response: Option<SessionResource>,
pub response: Option<ResponseConfig>,
}

/// Sent as the `response.cancel` client event to cancel a response
/// (see `ClientEvent::ResponseCancel`).
#[derive(Debug, Serialize, Deserialize, Clone, Default)]
pub struct ResponseCancelEvent {
    /// Optional client-generated ID used to identify this event.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub event_id: Option<String>,

    /// A specific response ID to cancel - if not provided, will cancel an in-progress response in the default conversation.
    // NOTE(review): the doc above says this is optional, but the type is a required
    // `String` — with `Default` an empty string would be serialized when unset.
    // Consider `Option<String>` + `skip_serializing_if`; confirm against the API
    // spec and existing callers before changing the public field type.
    pub response_id: String,
}

/// These are events that the OpenAI Realtime WebSocket server will accept from the client.
Expand All @@ -112,6 +201,10 @@ pub enum ClientEvent {
#[serde(rename = "input_audio_buffer.clear")]
InputAudioBufferClear(InputAudioBufferClearEvent),

/// WebRTC Only: Send this event to cut off the current audio response.
#[serde(rename = "output_audio_buffer.clear")]
OutputAudioBufferClear(OutputAudioBufferClearEvent),

/// Send this event when adding an item to the conversation.
#[serde(rename = "conversation.item.create")]
ConversationItemCreate(ConversationItemCreateEvent),
Expand All @@ -124,6 +217,10 @@ pub enum ClientEvent {
#[serde(rename = "conversation.item.delete")]
ConversationItemDelete(ConversationItemDeleteEvent),

/// Send this event when you want to retrieve the server's representation of a specific item in the conversation history.
#[serde(rename = "conversation.item.retrieve")]
ConversationItemRetrieve(ConversationItemRetrieveEvent),

/// Send this event to trigger a response generation.
#[serde(rename = "response.create")]
ResponseCreate(ResponseCreateEvent),
Expand Down Expand Up @@ -181,6 +278,11 @@ event_from!(
ClientEvent,
InputAudioBufferClear
);
// Presumably expands to `impl From<OutputAudioBufferClearEvent> for ClientEvent`
// (the `event_from!` macro is defined elsewhere in this file) — mirrors the
// invocations for the other client-event structs. TODO confirm macro expansion.
event_from!(
    OutputAudioBufferClearEvent,
    ClientEvent,
    OutputAudioBufferClear
);
event_from!(
ConversationItemCreateEvent,
ClientEvent,
Expand All @@ -198,14 +300,21 @@ event_from!(
);
// Conversions from the individual event structs into `ClientEvent` variants
// (the `event_from!` macro is defined elsewhere in this file — presumably a
// `From` impl per invocation; confirm against the macro definition).
event_from!(ResponseCreateEvent, ClientEvent, ResponseCreate);
event_from!(ResponseCancelEvent, ClientEvent, ResponseCancel);
event_from!(
    ConversationItemRetrieveEvent,
    ClientEvent,
    ConversationItemRetrieve
);

// One invocation per client-event struct; the `message_from_event!` macro is
// defined elsewhere in this file — presumably converting each event into a
// websocket `Message` via `ClientEvent` (confirm against the macro definition).
message_from_event!(SessionUpdateEvent, ClientEvent);
message_from_event!(InputAudioBufferAppendEvent, ClientEvent);
message_from_event!(InputAudioBufferCommitEvent, ClientEvent);
message_from_event!(InputAudioBufferClearEvent, ClientEvent);
message_from_event!(OutputAudioBufferClearEvent, ClientEvent);
message_from_event!(ConversationItemCreateEvent, ClientEvent);
message_from_event!(ConversationItemTruncateEvent, ClientEvent);
message_from_event!(ConversationItemDeleteEvent, ClientEvent);
message_from_event!(ConversationItemRetrieveEvent, ClientEvent);
message_from_event!(ResponseCreateEvent, ClientEvent);
message_from_event!(ResponseCancelEvent, ClientEvent);

Expand Down
17 changes: 17 additions & 0 deletions async-openai/src/types/realtime/response_resource.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,17 @@ pub enum IncompleteReason {
Interruption,
MaxOutputTokens,
ContentFilter,
TokenLimit,
}

/// The reason the model stopped generating tokens for a response
/// (see `ResponseResource::finish_reason`).
/// Variants are serialized in snake_case, e.g. `tool_calls`, `content_filter`.
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "snake_case")]
pub enum FinishReason {
    Stop,
    Length,
    ToolCalls,
    ContentFilter,
    FunctionCall,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
Expand Down Expand Up @@ -58,4 +69,10 @@ pub struct ResponseResource {
pub output: Vec<Item>,
/// Usage statistics for the response.
pub usage: Option<Usage>,
/// The Unix timestamp (in seconds) for when the response was created.
#[serde(skip_serializing_if = "Option::is_none")]
pub created_at: Option<u64>,
/// The reason the model stopped generating tokens, if applicable.
#[serde(skip_serializing_if = "Option::is_none")]
pub finish_reason: Option<FinishReason>,
}
26 changes: 24 additions & 2 deletions async-openai/src/types/realtime/server_event.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ pub struct ConversationCreatedEvent {
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct InputAudioBufferCommitedEvent {
pub struct InputAudioBufferCommittedEvent {
/// The unique ID of the server event.
pub event_id: String,
/// The ID of the preceding item after which the new item will be inserted.
Expand All @@ -53,6 +53,12 @@ pub struct InputAudioBufferClearedEvent {
pub event_id: String,
}

/// Returned as the `output_audio_buffer.cleared` server event when the output
/// audio buffer is cleared by the client (WebRTC specific;
/// see `ServerEvent::OutputAudioBufferCleared`).
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct OutputAudioBufferClearedEvent {
    /// The unique ID of the server event.
    pub event_id: String,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct InputAudioBufferSpeechStartedEvent {
/// The unique ID of the server event.
Expand Down Expand Up @@ -154,6 +160,14 @@ pub struct ConversationItemDeletedEvent {
pub item_id: String,
}

/// Returned as the `conversation.item.retrieved` server event when an item in
/// the conversation is retrieved (see `ServerEvent::ConversationItemRetrieved`),
/// in response to a `conversation.item.retrieve` client event.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct ConversationItemRetrievedEvent {
    /// The unique ID of the server event.
    pub event_id: String,
    /// The item that was retrieved.
    pub item: Item,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct ResponseCreatedEvent {
/// The unique ID of the server event.
Expand Down Expand Up @@ -381,12 +395,16 @@ pub enum ServerEvent {

/// Returned when an input audio buffer is committed, either by the client or automatically in server VAD mode.
#[serde(rename = "input_audio_buffer.committed")]
InputAudioBufferCommited(InputAudioBufferCommitedEvent),
InputAudioBufferCommitted(InputAudioBufferCommittedEvent),

/// Returned when the input audio buffer is cleared by the client.
#[serde(rename = "input_audio_buffer.cleared")]
InputAudioBufferCleared(InputAudioBufferClearedEvent),

/// Returned when the output audio buffer is cleared by the client (WebRTC specific).
#[serde(rename = "output_audio_buffer.cleared")]
OutputAudioBufferCleared(OutputAudioBufferClearedEvent),

/// Returned in server turn detection mode when speech is detected.
#[serde(rename = "input_audio_buffer.speech_started")]
InputAudioBufferSpeechStarted(InputAudioBufferSpeechStartedEvent),
Expand Down Expand Up @@ -422,6 +440,10 @@ pub enum ServerEvent {
#[serde(rename = "conversation.item.deleted")]
ConversationItemDeleted(ConversationItemDeletedEvent),

/// Returned when an item in the conversation is retrieved.
#[serde(rename = "conversation.item.retrieved")]
ConversationItemRetrieved(ConversationItemRetrievedEvent),

/// Returned when a new Response is created. The first event of response creation, where the response is in an initial state of "in_progress".
#[serde(rename = "response.created")]
ResponseCreated(ResponseCreatedEvent),
Expand Down
Loading