Skip to content

Implement vector store search, retrieve file content operations #360

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
251 changes: 249 additions & 2 deletions async-openai/src/types/vector_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,8 @@ pub struct UpdateVectorStoreRequest {
pub struct ListVectorStoreFilesResponse {
pub object: String,
pub data: Vec<VectorStoreFileObject>,
pub first_id: String,
pub last_id: String,
pub first_id: Option<String>,
pub last_id: Option<String>,
pub has_more: bool,
}

Expand Down Expand Up @@ -209,7 +209,10 @@ pub enum VectorStoreFileObjectChunkingStrategy {
pub struct CreateVectorStoreFileRequest {
/// A [File](https://platform.openai.com/docs/api-reference/files) ID that the vector store should use. Useful for tools like `file_search` that can access files.
pub file_id: String,
/// Optional chunking strategy for the file. Left out of the serialized request when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub chunking_strategy: Option<VectorStoreChunkingStrategy>,
/// Optional key/value attributes to attach to the file; each value is an [`AttributeValue`].
/// Left out of the serialized request when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub attributes: Option<HashMap<String, AttributeValue>>,
}

#[derive(Debug, Deserialize, Clone, PartialEq, Serialize)]
Expand Down Expand Up @@ -269,3 +272,247 @@ pub struct VectorStoreFileBatchObject {
pub status: VectorStoreFileBatchStatus,
pub file_counts: VectorStoreFileBatchCounts,
}

/// A page of parsed content for a vector store file.
///
/// Wraps a list of [`VectorStoreFileContentObject`] items together with paging information.
#[derive(Debug, Deserialize, Clone, PartialEq, Serialize)]
pub struct VectorStoreFileContentResponse {
/// The object type, which is always `vector_store.file_content.page`
pub object: String,

/// Parsed content of the file, as a list of content items.
pub data: Vec<VectorStoreFileContentObject>,

/// Indicates if there are more content pages to fetch.
pub has_more: bool,

/// The token for the next page, if any.
pub next_page: Option<String>,
}

/// A single item of parsed content inside a [`VectorStoreFileContentResponse`].
#[derive(Debug, Deserialize, Clone, PartialEq, Serialize)]
pub struct VectorStoreFileContentObject {
/// The content type (currently only `"text"`)
pub r#type: String,

/// The text content
pub text: String,
}

/// Request body for searching a vector store.
///
/// Can be assembled with the generated `VectorStoreSearchRequestArgs` builder. Every optional
/// field is left out of the serialized request when it is `None`.
// NOTE(review): the builder is declared with `default`, so a `query` that is never set
// presumably builds as the empty text query from `VectorStoreSearchQuery::default()` — confirm
// whether an empty query should instead be rejected at build time.
#[derive(Debug, Serialize, Default, Clone, Builder, PartialEq, Deserialize)]
#[builder(name = "VectorStoreSearchRequestArgs")]
#[builder(pattern = "mutable")]
#[builder(setter(into, strip_option), default)]
#[builder(derive(Debug))]
#[builder(build_fn(error = "OpenAIError"))]
pub struct VectorStoreSearchRequest {
/// The query to search for: either a single string or a list of strings.
pub query: VectorStoreSearchQuery,

/// Whether to rewrite the natural language query for vector search.
#[serde(skip_serializing_if = "Option::is_none")]
pub rewrite_query: Option<bool>,

/// The maximum number of results to return. This number should be between 1 and 50 inclusive.
#[serde(skip_serializing_if = "Option::is_none")]
pub max_num_results: Option<u8>,

/// A filter to apply based on file attributes.
#[serde(skip_serializing_if = "Option::is_none")]
pub filters: Option<VectorStoreSearchFilter>,

/// Ranking options for search.
#[serde(skip_serializing_if = "Option::is_none")]
pub ranking_options: Option<RankingOptions>,
}

/// The query of a vector store search: one string or a list of strings.
///
/// The enum is untagged, so it is (de)serialized as a bare JSON string or a bare JSON array
/// of strings, without any variant name.
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
#[serde(untagged)]
pub enum VectorStoreSearchQuery {
/// A single query to search for.
Text(String),
/// A list of queries to search for.
Array(Vec<String>),
}

impl Default for VectorStoreSearchQuery {
fn default() -> Self {
Self::Text(String::new())
}
}

impl From<String> for VectorStoreSearchQuery {
fn from(query: String) -> Self {
Self::Text(query)
}
}

impl From<&str> for VectorStoreSearchQuery {
fn from(query: &str) -> Self {
Self::Text(query.to_string())
}
}

impl From<Vec<String>> for VectorStoreSearchQuery {
fn from(query: Vec<String>) -> Self {
Self::Array(query)
}
}

/// A filter on file attributes for a vector store search.
///
/// Either a single comparison or a compound of further filters. The enum is untagged, so no
/// variant name appears in the JSON; on deserialization the variants are tried in declaration
/// order (`Comparison` first, then `Compound`).
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
#[serde(untagged)]
pub enum VectorStoreSearchFilter {
/// Compares one attribute key against a value.
Comparison(ComparisonFilter),
/// Combines several filters with `and` / `or`.
Compound(CompoundFilter),
}

impl From<ComparisonFilter> for VectorStoreSearchFilter {
fn from(filter: ComparisonFilter) -> Self {
Self::Comparison(filter)
}
}

impl From<CompoundFilter> for VectorStoreSearchFilter {
fn from(filter: CompoundFilter) -> Self {
Self::Compound(filter)
}
}

/// A filter used to compare a specified attribute key to a given value using a defined comparison operation.
///
/// Convertible into a [`VectorStoreSearchFilter`] via `From`/`Into`.
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
pub struct ComparisonFilter {
/// Specifies the comparison operator: `eq`, `ne`, `gt`, `gte`, `lt`, `lte`.
pub r#type: ComparisonType,

/// The key to compare against the value.
pub key: String,

/// The value to compare against the attribute key; supports string, number, or boolean types.
pub value: AttributeValue,
}

/// Specifies the comparison operator: `eq`, `ne`, `gt`, `gte`, `lt`, `lte`.
///
/// Variant names are (de)serialized in lowercase.
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum ComparisonType {
/// `eq`: equal to.
Eq,
/// `ne`: not equal to.
Ne,
/// `gt`: greater than.
Gt,
/// `gte`: greater than or equal to.
Gte,
/// `lt`: less than.
Lt,
/// `lte`: less than or equal to.
Lte,
}

/// The value to compare against the attribute key; supports string, number, or boolean types.
///
/// The enum is untagged, so values are (de)serialized as bare JSON scalars. On deserialization
/// the variants are tried in declaration order: a JSON integer that fits in `i64` becomes
/// [`AttributeValue::Number`], and any other JSON number (for example `4.5`) becomes
/// [`AttributeValue::Float`]. Without the `Float` variant a fractional attribute value would
/// match no variant and make the whole enclosing response fail to deserialize.
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
#[serde(untagged)]
pub enum AttributeValue {
    /// A string value.
    String(String),
    /// An integer value.
    Number(i64),
    /// A number that is not representable as `i64` (fractional or out of range).
    // Must stay after `Number` so that integers keep deserializing as `Number`.
    Float(f64),
    /// A boolean value.
    Boolean(bool),
}

impl From<f64> for AttributeValue {
    /// Wraps a floating-point number as an attribute value.
    fn from(value: f64) -> Self {
        Self::Float(value)
    }
}

impl From<String> for AttributeValue {
fn from(value: String) -> Self {
Self::String(value)
}
}

impl From<i64> for AttributeValue {
fn from(value: i64) -> Self {
Self::Number(value)
}
}

impl From<bool> for AttributeValue {
fn from(value: bool) -> Self {
Self::Boolean(value)
}
}

impl From<&str> for AttributeValue {
fn from(value: &str) -> Self {
Self::String(value.to_string())
}
}

/// Ranking options for search.
///
/// Both fields are optional and are left out of the serialized request when `None`.
#[derive(Debug, Serialize, Default, Deserialize, Clone, PartialEq)]
pub struct RankingOptions {
/// The ranker to use, see [`Ranker`].
#[serde(skip_serializing_if = "Option::is_none")]
pub ranker: Option<Ranker>,

/// Score threshold for the search.
// NOTE(review): presumably in the same 0..=1 range as `VectorStoreSearchResultItem::score`
// and used as a lower bound on returned results — confirm against the API reference.
#[serde(skip_serializing_if = "Option::is_none")]
pub score_threshold: Option<f32>,
}

/// The ranker selected in [`RankingOptions`].
///
/// Derives `Copy` like the other fieldless option enums in this module (e.g. `ComparisonType`),
/// so values can be passed around without explicit clones.
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
pub enum Ranker {
    /// Serialized as `auto`.
    #[serde(rename = "auto")]
    Auto,
    /// Serialized as `default-2024-11-15`.
    #[serde(rename = "default-2024-11-15")]
    Default20241115,
}

/// Combine multiple filters using `and` or `or`.
///
/// The nested items are themselves [`VectorStoreSearchFilter`] values, so compound filters
/// can be nested inside one another.
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
pub struct CompoundFilter {
/// Type of operation: `and` or `or`.
pub r#type: CompoundFilterType,

/// Array of filters to combine. Items can be `ComparisonFilter` or `CompoundFilter`
pub filters: Vec<VectorStoreSearchFilter>,
}

/// Type of operation: `and` or `or`.
///
/// Variant names are (de)serialized in lowercase. Derives `Copy` for consistency with
/// `ComparisonType`, the other fieldless operator enum in this module.
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum CompoundFilterType {
    /// `and`: combine the nested filters with a logical AND.
    And,
    /// `or`: combine the nested filters with a logical OR.
    Or,
}

/// A page of results returned by a vector store search.
#[derive(Debug, Deserialize, Clone, PartialEq, Serialize)]
pub struct VectorStoreSearchResultsPage {
/// The object type, which is always `vector_store.search_results.page`.
pub object: String,

/// The query used for this search, as a list of query strings.
pub search_query: Vec<String>,

/// The list of search result items.
pub data: Vec<VectorStoreSearchResultItem>,

/// Indicates if there are more results to fetch.
pub has_more: bool,

/// The token for the next page, if any.
pub next_page: Option<String>,
}

/// One search hit: a matching file together with its score, attributes and content chunks.
#[derive(Debug, Deserialize, Clone, PartialEq, Serialize)]
pub struct VectorStoreSearchResultItem {
/// The ID of the vector store file.
pub file_id: String,

/// The name of the vector store file.
pub filename: String,

/// The similarity score for the result, between 0 and 1.
pub score: f32, // minimum: 0, maximum: 1

/// Attributes of the vector store file, keyed by attribute name.
// NOTE(review): this field is required and non-optional, so a response that omits it or
// sends `null` would fail to deserialize — confirm the API always returns an object here.
pub attributes: HashMap<String, AttributeValue>,

/// Content chunks from the file.
pub content: Vec<VectorStoreSearchResultContentObject>,
}

/// A single content chunk of a [`VectorStoreSearchResultItem`].
#[derive(Debug, Deserialize, Clone, PartialEq, Serialize)]
pub struct VectorStoreSearchResultContentObject {
/// The type of content
pub r#type: String,

/// The text content returned from search.
pub text: String,
}
16 changes: 15 additions & 1 deletion async-openai/src/vector_store_files.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use crate::{
error::OpenAIError,
types::{
CreateVectorStoreFileRequest, DeleteVectorStoreFileResponse, ListVectorStoreFilesResponse,
VectorStoreFileObject,
VectorStoreFileContentResponse, VectorStoreFileObject,
},
Client,
};
Expand Down Expand Up @@ -78,6 +78,20 @@ impl<'c, C: Config> VectorStoreFiles<'c, C> {
)
.await
}

/// Fetches the parsed contents of the file `file_id` belonging to this vector store.
///
/// Issues a GET to `/vector_stores/{vector_store_id}/files/{file_id}/content` and returns
/// the deserialized response, or the client's error.
#[crate::byot(T0 = std::fmt::Display, R = serde::de::DeserializeOwned)]
pub async fn retrieve_file_content(
    &self,
    file_id: &str,
) -> Result<VectorStoreFileContentResponse, OpenAIError> {
    let vector_store_id = &self.vector_store_id;
    let path = format!("/vector_stores/{vector_store_id}/files/{file_id}/content");
    self.client.get(&path).await
}
}

#[cfg(test)]
Expand Down
15 changes: 14 additions & 1 deletion async-openai/src/vector_stores.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ use crate::{
error::OpenAIError,
types::{
CreateVectorStoreRequest, DeleteVectorStoreResponse, ListVectorStoresResponse,
UpdateVectorStoreRequest, VectorStoreObject,
UpdateVectorStoreRequest, VectorStoreObject, VectorStoreSearchRequest,
VectorStoreSearchResultsPage,
},
vector_store_file_batches::VectorStoreFileBatches,
Client, VectorStoreFiles,
Expand Down Expand Up @@ -78,4 +79,16 @@ impl<'c, C: Config> VectorStores<'c, C> {
.post(&format!("/vector_stores/{vector_store_id}"), request)
.await
}

/// Runs a search against the vector store identified by `vector_store_id`.
///
/// Posts `request` to `/vector_stores/{vector_store_id}/search` and returns the deserialized
/// results page, or the client's error.
#[crate::byot(T0 = std::fmt::Display, T1 = serde::Serialize, R = serde::de::DeserializeOwned)]
pub async fn search(
    &self,
    vector_store_id: &str,
    request: VectorStoreSearchRequest,
) -> Result<VectorStoreSearchResultsPage, OpenAIError> {
    let path = format!("/vector_stores/{vector_store_id}/search");
    self.client.post(&path, request).await
}
}
9 changes: 9 additions & 0 deletions examples/vector-store-retrieval/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[package]
name = "vector-store-retrieval"
version = "0.1.0"
edition = "2021"
publish = false

[dependencies]
async-openai = { path = "../../async-openai" }
tokio = { version = "1.43.0", features = ["full"] }
33 changes: 33 additions & 0 deletions examples/vector-store-retrieval/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
## Intro

This example is based on https://platform.openai.com/docs/guides/retrieval


## Data

Uber Annual Report obtained from https://investor.uber.com/financials/

Lyft Annual Report obtained from https://investor.lyft.com/financials-and-reports/annual-reports/default.aspx


## Output

```
Waiting for vector store to be ready...
Search results: VectorStoreSearchResultsPage {
object: "vector_store.search_results.page",
search_query: [
"uber profit",
],
data: [
VectorStoreSearchResultItem {
file_id: "file-1XFoSYUzJudwJLkAazLdjd",
filename: "uber-10k.pdf",
score: 0.5618923,
attributes: {},
content: [
VectorStoreSearchResultContentObject {
type: "text",
text: "(In millions) Q1 2022 Q2 2022 Q3 2022 Q4 2022 Q1 2023 Q2 2023 Q3 2023 Q4 2023\n\nMobility $ 10,723 $ 13,364 $ 13,684 $ 14,894 $ 14,981 $ 16,728 $ 17,903 $ 19,285 \nDelivery 13,903 13,876 13,684 14,315 15,026 15,595 16,094 17,011 \nFreight 1,823 1,838 1,751 1,540 1,401 1,278 1,284 1,279 \n\nAdjusted EBITDA.
...
```
Binary file not shown.
Binary file not shown.
Loading