diff --git a/Cargo.toml b/Cargo.toml
index 7bea1e1e5..5aa7287a3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,6 +8,13 @@ members = [
     "llm/ollama",
     "llm/openai",
     "llm/openrouter",
+    "search/search",
+    "search/elasticsearch",
+    "search/opensearch",
+    "search/algolia",
+    "search/typesense",
+    "search/meilisearch",
+    "test/components-rust/test-search",
 ]
 
 [profile.release]
@@ -19,10 +26,16 @@ opt-level = 's'
 golem-rust = "1.6.0"
 log = "0.4.27"
 golem-llm = { path = "llm/llm", version = "0.0.0", default-features = false }
+golem-search = { path = "search/search", version = "0.0.0", default-features = false }
 reqwest = { git = "https://github.com/golemcloud/reqwest", branch = "update-may-2025", features = [
     "json",
 ] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = { version = "1.0" }
 wit-bindgen-rt = { version = "0.40.0", features = ["bitflags"] }
+wit-bindgen = { version = "0.40.0" }
 base64 = { version = "0.22.1" }
+sha2 = { version = "0.10" }
+hmac = { version = "0.12" }
+hex = { version = "0.4" }
+chrono = { version = "0.4", features = ["serde"] }
diff --git a/Makefile.toml b/Makefile.toml
index cc443bc6a..5c30e1391 100644
--- a/Makefile.toml
+++ b/Makefile.toml
@@ -13,7 +13,7 @@ args = ["test"]
 [tasks.build]
 script_runner = "@duckscript"
 script = '''
-domains = array llm
+domains = array llm search
 
 # if there is no domain passed run for every domain
 if is_empty ${1}
@@ -28,7 +28,7 @@ end
 [tasks.release-build]
 script_runner = "@duckscript"
 script = '''
-domains = array llm
+domains = array llm search
 
 # if there is no domain passed run for every domain
 if is_empty ${1}
@@ -44,7 +44,7 @@ end
 script_runner = "@duckscript"
 script = '''
 #!/bin/bash
-domains = array llm
+domains = array llm search
 
 # if there is no domain passed run for every domain
 if is_empty ${1}
@@ -60,7 +60,7 @@ end
 script_runner = "@duckscript"
 script = '''
 #!/bin/bash
-domains = array llm
+domains = array llm search
 
 # if there is no domain passed run for every domain
 if is_empty ${1}
@@ -75,7 +75,7 @@ end
 [tasks.wit]
 script_runner = "@duckscript"
 script = '''
-domains = array llm
+domains = array llm search
 
 # if there is no domain passed run for every domain
 if is_empty ${1}
@@ -91,7 +91,7 @@ end
 description = "Builds all test components with golem-cli"
 script_runner = "@duckscript"
 script = '''
-domains = array llm
+domains = array llm search
 
 # if there is no domain passed run for every domain
 if is_empty ${1}
@@ -137,7 +137,7 @@ script = '''
 
 is_portable = eq ${1} "--portable"
 
-targets = array llm_openai llm_anthropic llm_grok llm_openrouter llm_ollama
+targets = array llm_openai llm_anthropic llm_grok llm_openrouter llm_ollama search_elasticsearch search_opensearch search_algolia search_typesense search_meilisearch
 for target in ${targets}
     if is_portable
         cp target/wasm32-wasip1/debug/golem_${target}.wasm components/debug/golem_${target}-portable.wasm
@@ -153,7 +153,7 @@ script = '''
 
 is_portable = eq ${1} "--portable"
 
-targets = array llm_openai llm_anthropic llm_grok llm_openrouter llm_ollama
+targets = array llm_openai llm_anthropic llm_grok llm_openrouter llm_ollama search_elasticsearch search_opensearch search_algolia search_typesense search_meilisearch
 for target in ${targets}
     if is_portable
        cp target/wasm32-wasip1/release/golem_${target}.wasm components/release/golem_${target}-portable.wasm
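For context: each duckscript block above dispatches on an optional domain argument. Assuming the repository's usual cargo-make setup, `cargo make build` covers every domain, while `cargo make build search` builds only the new search providers; the same applies to `release-build`, `wit`, and `build-test-components`.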
diff --git a/search/Makefile.toml b/search/Makefile.toml
new file mode 100644
index 000000000..15a5053fc
--- /dev/null
+++ b/search/Makefile.toml
@@ -0,0 +1,185 @@
+[config]
+default_to_workspace = false
+skip_core_tasks = true
+
+[tasks.build]
+run_task = { name = [
+    "build-elasticsearch",
+    "build-opensearch",
+    "build-algolia",
+    "build-typesense",
+    "build-meilisearch",
+] }
+
+[tasks.build-portable]
+run_task = { name = [
+    "build-elasticsearch-portable",
+    "build-opensearch-portable",
+    "build-algolia-portable",
+    "build-typesense-portable",
+    "build-meilisearch-portable",
+] }
+
+[tasks.release-build]
+run_task = { name = [
+    "release-build-elasticsearch",
+    "release-build-opensearch",
+    "release-build-algolia",
+    "release-build-typesense",
+    "release-build-meilisearch",
+] }
+
+[tasks.release-build-portable]
+run_task = { name = [
+    "release-build-elasticsearch-portable",
+    "release-build-opensearch-portable",
+    "release-build-algolia-portable",
+    "release-build-typesense-portable",
+    "release-build-meilisearch-portable",
+] }
+
+[tasks.build-elasticsearch]
+install_crate = { crate_name = "cargo-component", version = "0.20.0" }
+command = "cargo-component"
+args = ["build", "-p", "golem-search-elasticsearch"]
+
+[tasks.build-elasticsearch-portable]
+install_crate = { crate_name = "cargo-component", version = "0.20.0" }
+command = "cargo-component"
+args = ["build", "-p", "golem-search-elasticsearch", "--no-default-features"]
+
+[tasks.build-opensearch]
+install_crate = { crate_name = "cargo-component", version = "0.20.0" }
+command = "cargo-component"
+args = ["build", "-p", "golem-search-opensearch"]
+
+[tasks.build-opensearch-portable]
+install_crate = { crate_name = "cargo-component", version = "0.20.0" }
+command = "cargo-component"
+args = ["build", "-p", "golem-search-opensearch", "--no-default-features"]
+
+[tasks.build-algolia]
+install_crate = { crate_name = "cargo-component", version = "0.20.0" }
+command = "cargo-component"
+args = ["build", "-p", "golem-search-algolia"]
+
+[tasks.build-algolia-portable]
+install_crate = { crate_name = "cargo-component", version = "0.20.0" }
+command = "cargo-component"
+args = ["build", "-p", "golem-search-algolia", "--no-default-features"]
+
+[tasks.build-typesense]
+install_crate = { crate_name = "cargo-component", version = "0.20.0" }
+command = "cargo-component"
+args = ["build", "-p", "golem-search-typesense"]
+
+[tasks.build-typesense-portable]
+install_crate = { crate_name = "cargo-component", version = "0.20.0" }
+command = "cargo-component"
+args = ["build", "-p", "golem-search-typesense", "--no-default-features"]
+
+[tasks.build-meilisearch]
+install_crate = { crate_name = "cargo-component", version = "0.20.0" }
+command = "cargo-component"
+args = ["build", "-p", "golem-search-meilisearch"]
+
+[tasks.build-meilisearch-portable]
+install_crate = { crate_name = "cargo-component", version = "0.20.0" }
+command = "cargo-component"
+args = ["build", "-p", "golem-search-meilisearch", "--no-default-features"]
+
+[tasks.release-build-elasticsearch]
+install_crate = { crate_name = "cargo-component", version = "0.20.0" }
+command = "cargo-component"
+args = ["build", "-p", "golem-search-elasticsearch", "--release"]
+
+[tasks.release-build-elasticsearch-portable]
+install_crate = { crate_name = "cargo-component", version = "0.20.0" }
+command = "cargo-component"
+args = ["build", "-p", "golem-search-elasticsearch", "--release", "--no-default-features"]
+
+[tasks.release-build-opensearch]
+install_crate = { crate_name = "cargo-component", version = "0.20.0" }
+command = "cargo-component"
+args = ["build", "-p", "golem-search-opensearch", "--release"]
"cargo-component", version = "0.20.0" } +command = "cargo-component" +args = ["build", "-p", "golem-search-opensearch", "--release", "--no-default-features"] + +[tasks.release-build-algolia] +install_crate = { crate_name = "cargo-component", version = "0.20.0" } +command = "cargo-component" +args = ["build", "-p", "golem-search-algolia", "--release"] + +[tasks.release-build-algolia-portable] +install_crate = { crate_name = "cargo-component", version = "0.20.0" } +command = "cargo-component" +args = ["build", "-p", "golem-search-algolia", "--release", "--no-default-features"] + +[tasks.release-build-typesense] +install_crate = { crate_name = "cargo-component", version = "0.20.0" } +command = "cargo-component" +args = ["build", "-p", "golem-search-typesense", "--release"] + +[tasks.release-build-typesense-portable] +install_crate = { crate_name = "cargo-component", version = "0.20.0" } +command = "cargo-component" +args = ["build", "-p", "golem-search-typesense", "--release", "--no-default-features"] + +[tasks.release-build-meilisearch] +install_crate = { crate_name = "cargo-component", version = "0.20.0" } +command = "cargo-component" +args = ["build", "-p", "golem-search-meilisearch", "--release"] + +[tasks.release-build-meilisearch-portable] +install_crate = { crate_name = "cargo-component", version = "0.20.0" } +command = "cargo-component" +args = ["build", "-p", "golem-search-meilisearch", "--release", "--no-default-features"] + +[tasks.wit-update] +install_crate = { crate_name = "wit-deps-cli" } +command = "wit-deps" +args = ["update"] + +[tasks.wit] +dependencies = ["wit-update"] + +script_runner = "@duckscript" +script = """ +modules = array search elasticsearch opensearch algolia typesense meilisearch + +for module in ${modules} + rm -r ${module}/wit/deps + mkdir ${module}/wit/deps/golem-search + cp wit/golem-search.wit ${module}/wit/deps/golem-search/golem-search.wit + cp wit/deps/wasi:io ${module}/wit/deps + + echo "Copied WIT for module search::${module}" +end + +echo "Copied WIT for module search" +""" + +[tasks.build-test-components] +dependencies = ["build"] +install_crate = "cargo-binstall" +description = "Builds search test components with golem-cli" +script = ''' +cargo-binstall golem-cli@1.2.2-dev.11 --locked --no-confirm +cargo-binstall wac-cli --locked --no-confirm +cd ../test + +golem-cli --version +golem-cli app clean +golem-cli app build -b elasticsearch-debug +golem-cli app clean +golem-cli app build -b opensearch-debug +golem-cli app clean +golem-cli app build -b algolia-debug +golem-cli app clean +golem-cli app build -b typesense-debug +golem-cli app clean +golem-cli app build -b meilisearch-debug +''' diff --git a/search/algolia/Cargo.toml b/search/algolia/Cargo.toml new file mode 100644 index 000000000..7297dd0e5 --- /dev/null +++ b/search/algolia/Cargo.toml @@ -0,0 +1,41 @@ +[package] +name = "golem-search-algolia" +version = "0.0.0" +edition = "2021" +license = "Apache-2.0" +homepage = "https://golem.cloud" +repository = "https://github.com/golemcloud/golem-llm" +description = "WebAssembly component for working with Algolia APIs, with special support for Golem Cloud" + +[lib] +path = "src/lib.rs" +crate-type = ["cdylib"] + +[features] +default = ["durability"] +durability = ["golem-rust/durability", "golem-search/durability"] + +[dependencies] +golem-search = { path = "../search" } +golem-rust = "1.6.0" +log = "0.4.27" +reqwest = { git = "https://github.com/golemcloud/reqwest", branch = "update-may-2025", features = ["json", "blocking"] } +serde = { version = 
"1.0", features = ["derive"] } +serde_json = "1.0" +wit-bindgen-rt = { version = "0.40.0", features = ["bitflags"] } + +[package.metadata.component] +package = "golem:search-algolia" + +[package.metadata.component.bindings] +generate_unused_types = true + +[package.metadata.component.bindings.with] +"golem:search/core@1.0.0" = "golem_search::golem::search::core" + +[package.metadata.component.target] +path = "wit" + +[package.metadata.component.target.dependencies] +"golem:search" = { path = "wit/deps/golem-search" } +"wasi:io" = { path = "wit/deps/wasi:io" } diff --git a/search/algolia/src/client.rs b/search/algolia/src/client.rs new file mode 100644 index 000000000..0f7911c45 --- /dev/null +++ b/search/algolia/src/client.rs @@ -0,0 +1,375 @@ +use golem_search::golem::search::types::{Doc, Schema, SearchError}; +use log::{debug, trace}; +use reqwest::{Client, Response}; +use serde::{Deserialize, Serialize}; +use serde_json::{json, Value}; +use std::collections::HashMap; +use std::time::Duration; + +#[derive(Debug, Clone)] +pub struct AlgoliaApi { + client: Client, + app_id: String, + api_key: String, + base_url: String, +} + +#[derive(Debug, Deserialize)] +pub struct AlgoliaError { + pub message: String, + pub status: u16, +} + +#[derive(Debug, Deserialize)] +pub struct AlgoliaSearchResponse { + pub hits: Vec, + #[serde(rename = "nbHits")] + pub nb_hits: u64, + pub page: u32, + #[serde(rename = "nbPages")] + pub nb_pages: u32, + #[serde(rename = "hitsPerPage")] + pub hits_per_page: u32, + #[serde(rename = "processingTimeMS")] + pub processing_time_ms: u32, + pub cursor: Option, + pub facets: Option>>, +} + +#[derive(Debug, Deserialize)] +pub struct AlgoliaHit { + #[serde(rename = "objectID")] + pub object_id: String, + #[serde(rename = "_highlightResult")] + pub highlight_result: Option>, + #[serde(flatten)] + pub attributes: Value, +} + +#[derive(Debug, Deserialize)] +pub struct AlgoliaHighlight { + pub value: String, + #[serde(rename = "matchLevel")] + pub match_level: String, +} + +#[derive(Debug, Serialize)] +pub struct AlgoliaDoc { + #[serde(rename = "objectID")] + pub object_id: String, + #[serde(flatten)] + pub content: Value, +} + +#[derive(Debug, Deserialize)] +pub struct AlgoliaBrowseResponse { + pub hits: Vec, + pub cursor: Option, + #[serde(rename = "processingTimeMS")] + pub processing_time_ms: u32, +} + +#[derive(Debug, Deserialize)] +pub struct IndexListResponse { + pub items: Vec, +} + +#[derive(Debug, Deserialize)] +pub struct IndexInfo { + pub name: String, +} + +#[derive(Debug, Deserialize)] +pub struct TaskResponse { + #[serde(rename = "taskID")] + pub task_id: u64, +} + +impl AlgoliaApi { + pub fn new(app_id: &str, api_key: &str) -> Self { + let client = Client::builder() + .timeout(Duration::from_secs(30)) + .build() + .unwrap(); + + Self { + client, + app_id: app_id.to_string(), + api_key: api_key.to_string(), + base_url: format!("https://{}-dsn.algolia.net/1", app_id), + } + } + + pub fn empty() -> Self { + Self { + client: Client::new(), + app_id: String::new(), + api_key: String::new(), + base_url: String::new(), + } + } + + async fn request(&self, method: reqwest::Method, path: &str, body: Option) -> Result { + let url = format!("{}/{}", self.base_url, path.trim_start_matches('/')); + let mut request = self.client.request(method, &url); + + request = request + .header("X-Algolia-Application-Id", &self.app_id) + .header("X-Algolia-API-Key", &self.api_key) + .header("Content-Type", "application/json"); + + if let Some(body) = body { + request = request.json(&body); 
diff --git a/search/algolia/src/client.rs b/search/algolia/src/client.rs
new file mode 100644
index 000000000..0f7911c45
--- /dev/null
+++ b/search/algolia/src/client.rs
@@ -0,0 +1,375 @@
+use golem_search::golem::search::types::{Doc, SearchError};
+use log::{debug, trace};
+use reqwest::{Client, Response};
+use serde::{Deserialize, Serialize};
+use serde_json::{json, Value};
+use std::collections::HashMap;
+use std::time::Duration;
+
+#[derive(Debug, Clone)]
+pub struct AlgoliaApi {
+    client: Client,
+    app_id: String,
+    api_key: String,
+    base_url: String,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct AlgoliaError {
+    pub message: String,
+    pub status: u16,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct AlgoliaSearchResponse {
+    pub hits: Vec<AlgoliaHit>,
+    #[serde(rename = "nbHits")]
+    pub nb_hits: u64,
+    pub page: u32,
+    #[serde(rename = "nbPages")]
+    pub nb_pages: u32,
+    #[serde(rename = "hitsPerPage")]
+    pub hits_per_page: u32,
+    #[serde(rename = "processingTimeMS")]
+    pub processing_time_ms: u32,
+    pub cursor: Option<String>,
+    pub facets: Option<HashMap<String, HashMap<String, u64>>>,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct AlgoliaHit {
+    #[serde(rename = "objectID")]
+    pub object_id: String,
+    #[serde(rename = "_highlightResult")]
+    pub highlight_result: Option<HashMap<String, AlgoliaHighlight>>,
+    #[serde(flatten)]
+    pub attributes: Value,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct AlgoliaHighlight {
+    pub value: String,
+    #[serde(rename = "matchLevel")]
+    pub match_level: String,
+}
+
+#[derive(Debug, Serialize)]
+pub struct AlgoliaDoc {
+    #[serde(rename = "objectID")]
+    pub object_id: String,
+    #[serde(flatten)]
+    pub content: Value,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct AlgoliaBrowseResponse {
+    pub hits: Vec<AlgoliaHit>,
+    pub cursor: Option<String>,
+    #[serde(rename = "processingTimeMS")]
+    pub processing_time_ms: u32,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct IndexListResponse {
+    pub items: Vec<IndexInfo>,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct IndexInfo {
+    pub name: String,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct TaskResponse {
+    #[serde(rename = "taskID")]
+    pub task_id: u64,
+}
+
+impl AlgoliaApi {
+    pub fn new(app_id: &str, api_key: &str) -> Self {
+        let client = Client::builder()
+            .timeout(Duration::from_secs(30))
+            .build()
+            .unwrap();
+
+        Self {
+            client,
+            app_id: app_id.to_string(),
+            api_key: api_key.to_string(),
+            base_url: format!("https://{}-dsn.algolia.net/1", app_id),
+        }
+    }
+
+    pub fn empty() -> Self {
+        Self {
+            client: Client::new(),
+            app_id: String::new(),
+            api_key: String::new(),
+            base_url: String::new(),
+        }
+    }
+
+    async fn request(&self, method: reqwest::Method, path: &str, body: Option<Value>) -> Result<Response, SearchError> {
+        let url = format!("{}/{}", self.base_url, path.trim_start_matches('/'));
+        let mut request = self.client.request(method, &url);
+
+        request = request
+            .header("X-Algolia-Application-Id", &self.app_id)
+            .header("X-Algolia-API-Key", &self.api_key)
+            .header("Content-Type", "application/json");
+
+        if let Some(body) = body {
+            request = request.json(&body);
+        }
+
+        trace!("Making Algolia request to: {}", url);
+
+        let response = request.send().await
+            .map_err(|e| SearchError::Internal(format!("Request failed: {}", e)))?;
+
+        if response.status().is_success() {
+            Ok(response)
+        } else {
+            let status = response.status().as_u16();
+            let error_text = response.text().await.unwrap_or_default();
+
+            if let Ok(algolia_error) = serde_json::from_str::<AlgoliaError>(&error_text) {
+                match status {
+                    404 => Err(SearchError::IndexNotFound),
+                    400 => Err(SearchError::InvalidQuery(algolia_error.message)),
+                    _ => Err(SearchError::Internal(format!("Algolia error: {}", algolia_error.message)))
+                }
+            } else {
+                Err(SearchError::Internal(format!("HTTP {}: {}", status, error_text)))
+            }
+        }
+    }
+
+    pub fn create_index(&self, name: &str) -> Result<(), SearchError> {
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?;
+
+        rt.block_on(async {
+            // Algolia creates indexes automatically when first document is added
+            // For explicit creation, we can set empty settings
+            let body = json!({});
+            self.request(reqwest::Method::POST, &format!("indexes/{}/settings", name), Some(body)).await?;
+            debug!("Created Algolia index: {}", name);
+            Ok(())
+        })
+    }
+
+    pub fn create_index_with_schema(&self, name: &str, settings: Value) -> Result<(), SearchError> {
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?;
+
+        rt.block_on(async {
+            self.request(reqwest::Method::PUT, &format!("indexes/{}/settings", name), Some(settings)).await?;
+            debug!("Created Algolia index with settings: {}", name);
+            Ok(())
+        })
+    }
+
+    pub fn delete_index(&self, name: &str) -> Result<(), SearchError> {
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?;
+
+        rt.block_on(async {
+            self.request(reqwest::Method::DELETE, &format!("indexes/{}", name), None).await?;
+            debug!("Deleted Algolia index: {}", name);
+            Ok(())
+        })
+    }
+
+    pub fn list_indexes(&self) -> Result<Vec<String>, SearchError> {
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?;
+
+        rt.block_on(async {
+            let response = self.request(reqwest::Method::GET, "indexes", None).await?;
+            let list_response: IndexListResponse = response.json().await
+                .map_err(|e| SearchError::Internal(format!("Failed to parse response: {}", e)))?;
+
+            let names = list_response.items.into_iter()
+                .map(|index| index.name)
+                .collect();
+
+            Ok(names)
+        })
+    }
+
+    pub fn index_document(&self, index: &str, doc: AlgoliaDoc) -> Result<(), SearchError> {
+        let path = format!("indexes/{}", index);
+
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?;
+
+        rt.block_on(async {
+            self.request(reqwest::Method::POST, &path, Some(serde_json::to_value(&doc).unwrap())).await?;
+            debug!("Indexed document: {}/{}", index, doc.object_id);
+            Ok(())
+        })
+    }
{}", docs.len(), index); + Ok(()) + }) + } + + pub fn delete_document(&self, index: &str, id: &str) -> Result<(), SearchError> { + let path = format!("indexes/{}/{}", index, id); + + let rt = tokio::runtime::Runtime::new() + .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?; + + rt.block_on(async { + self.request(reqwest::Method::DELETE, &path, None).await?; + debug!("Deleted document: {}/{}", index, id); + Ok(()) + }) + } + + pub fn bulk_delete(&self, index: &str, ids: Vec) -> Result<(), SearchError> { + if ids.is_empty() { + return Ok(()); + } + + let path = format!("indexes/{}/batch", index); + let body = json!({ + "requests": ids.iter().map(|id| { + json!({ + "action": "deleteObject", + "body": { + "objectID": id + } + }) + }).collect::>() + }); + + let rt = tokio::runtime::Runtime::new() + .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?; + + rt.block_on(async { + self.request(reqwest::Method::POST, &path, Some(body)).await?; + debug!("Bulk deleted {} documents from {}", ids.len(), index); + Ok(()) + }) + } + + pub fn get_document(&self, index: &str, id: &str) -> Result, SearchError> { + let path = format!("indexes/{}/{}", index, id); + + let rt = tokio::runtime::Runtime::new() + .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?; + + rt.block_on(async { + match self.request(reqwest::Method::GET, &path, None).await { + Ok(response) => { + let mut doc_value: Value = response.json().await + .map_err(|e| SearchError::Internal(format!("Failed to parse response: {}", e)))?; + + // Remove objectID from content to match Doc structure + let object_id = doc_value.get("objectID") + .and_then(|v| v.as_str()) + .unwrap_or(id) + .to_string(); + + if let Some(obj) = doc_value.as_object_mut() { + obj.remove("objectID"); + } + + Ok(Some(Doc { + id: object_id, + content: serde_json::to_string(&doc_value) + .map_err(|e| SearchError::Internal(format!("JSON serialization error: {}", e)))?, + })) + } + Err(SearchError::Internal(msg)) if msg.contains("404") => Ok(None), + Err(e) => Err(e), + } + }) + } + + pub fn search(&self, index: &str, query: Value) -> Result { + let path = format!("indexes/{}/query", index); + + let rt = tokio::runtime::Runtime::new() + .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?; + + rt.block_on(async { + let response = self.request(reqwest::Method::POST, &path, Some(query)).await?; + let search_response: AlgoliaSearchResponse = response.json().await + .map_err(|e| SearchError::Internal(format!("Failed to parse search response: {}", e)))?; + + debug!("Algolia search completed in {}ms", search_response.processing_time_ms); + Ok(search_response) + }) + } + + pub fn browse(&self, index: &str, cursor: Option<&str>) -> Result { + let mut path = format!("indexes/{}/browse", index); + if let Some(cursor) = cursor { + path.push_str(&format!("?cursor={}", cursor)); + } + + let rt = tokio::runtime::Runtime::new() + .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?; + + rt.block_on(async { + let response = self.request(reqwest::Method::POST, &path, Some(json!({}))).await?; + let browse_response: AlgoliaBrowseResponse = response.json().await + .map_err(|e| SearchError::Internal(format!("Failed to parse browse response: {}", e)))?; + + debug!("Algolia browse completed"); + Ok(browse_response) + }) + } + + pub fn get_settings(&self, index: &str) -> Result { + let path = format!("indexes/{}/settings", index); + + let rt = tokio::runtime::Runtime::new() + .map_err(|e| 
SearchError::Internal(format!("Runtime error: {}", e)))?; + + rt.block_on(async { + let response = self.request(reqwest::Method::GET, &path, None).await?; + let settings: Value = response.json().await + .map_err(|e| SearchError::Internal(format!("Failed to parse settings response: {}", e)))?; + + Ok(settings) + }) + } + + pub fn update_settings(&self, index: &str, settings: Value) -> Result<(), SearchError> { + let path = format!("indexes/{}/settings", index); + + let rt = tokio::runtime::Runtime::new() + .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?; + + rt.block_on(async { + self.request(reqwest::Method::PUT, &path, Some(settings)).await?; + debug!("Updated settings for Algolia index: {}", index); + Ok(()) + }) + } +} diff --git a/search/algolia/src/conversions.rs b/search/algolia/src/conversions.rs new file mode 100644 index 000000000..dceba013c --- /dev/null +++ b/search/algolia/src/conversions.rs @@ -0,0 +1,347 @@ +use crate::client::{AlgoliaDoc, AlgoliaHit, AlgoliaSearchResponse}; +use golem_search::golem::search::types::{ + Doc, FieldType, Schema, SchemaField, SearchError, SearchHit, SearchQuery, SearchResults, +}; +use log::trace; +use serde_json::{json, Map, Value}; +use std::collections::HashMap; + +pub fn query_to_algolia_query(query: &SearchQuery) -> Result { + let mut algolia_query = json!({}); + + // Main query text + if let Some(q) = &query.q { + if !q.trim().is_empty() { + algolia_query["query"] = json!(q); + } + } + + // Filters - Algolia uses a different syntax + if !query.filters.is_empty() { + let mut filter_parts = Vec::new(); + + for filter in &query.filters { + if let Ok(filter_value) = serde_json::from_str::(filter) { + // Direct JSON filter + if let Some(filter_str) = filter_value.as_str() { + filter_parts.push(filter_str.to_string()); + } + } else { + let parts: Vec<&str> = filter.splitn(3, ':').collect(); + if parts.len() == 3 { + let field = parts[0]; + let op = parts[1]; + let value = parts[2]; + + let filter_clause = match op { + "eq" => format!("{}:{}", field, value), + "ne" => format!("NOT {}:{}", field, value), + "gt" => format!("{} > {}", field, value), + "gte" => format!("{} >= {}", field, value), + "lt" => format!("{} < {}", field, value), + "lte" => format!("{} <= {}", field, value), + "in" => { + let values: Vec<&str> = value.split(',').collect(); + format!("({})", values.iter() + .map(|v| format!("{}:{}", field, v)) + .collect::>() + .join(" OR ")) + } + "exists" => format!("{} > 0", field), // Algolia doesn't have direct exists + "prefix" => format!("{}:{}", field, value), // Algolia handles prefix naturally + _ => return Err(SearchError::InvalidQuery(format!("Unknown filter operator: {}", op))) + }; + filter_parts.push(filter_clause); + } else { + return Err(SearchError::InvalidQuery(format!("Invalid filter format: {}", filter))); + } + } + } + + if !filter_parts.is_empty() { + algolia_query["filters"] = json!(filter_parts.join(" AND ")); + } + } + + // Facets + if !query.facets.is_empty() { + algolia_query["facets"] = json!(query.facets); + } + + // Pagination + let hits_per_page = query.per_page.unwrap_or(10); + algolia_query["hitsPerPage"] = json!(hits_per_page); + + if let Some(page) = query.page { + algolia_query["page"] = json!(page - 1); // Algolia is 0-indexed + } else if let Some(offset) = query.offset { + let page = offset / hits_per_page; + algolia_query["page"] = json!(page); + } + + // Highlighting + if let Some(highlight) = &query.highlight { + let mut highlight_config = json!({}); + + if 
+
+    // Facets
+    if !query.facets.is_empty() {
+        algolia_query["facets"] = json!(query.facets);
+    }
+
+    // Pagination
+    let hits_per_page = query.per_page.unwrap_or(10);
+    algolia_query["hitsPerPage"] = json!(hits_per_page);
+
+    if let Some(page) = query.page {
+        // Algolia is 0-indexed; saturate so a caller passing page 0 cannot underflow
+        algolia_query["page"] = json!(page.saturating_sub(1));
+    } else if let Some(offset) = query.offset {
+        let page = offset / hits_per_page;
+        algolia_query["page"] = json!(page);
+    }
+
+    // Highlighting
+    if let Some(highlight) = &query.highlight {
+        let mut highlight_config = json!({});
+
+        if !highlight.fields.is_empty() {
+            highlight_config["attributesToHighlight"] = json!(highlight.fields);
+        }
+
+        if let Some(pre_tag) = &highlight.pre_tag {
+            highlight_config["highlightPreTag"] = json!(pre_tag);
+        }
+
+        if let Some(post_tag) = &highlight.post_tag {
+            highlight_config["highlightPostTag"] = json!(post_tag);
+        }
+
+        // Merge highlight config into main query
+        if let Some(obj) = highlight_config.as_object() {
+            for (key, value) in obj {
+                algolia_query[key] = value.clone();
+            }
+        }
+    }
+
+    // Attributes to retrieve
+    if let Some(config) = &query.config {
+        if !config.attributes_to_retrieve.is_empty() {
+            algolia_query["attributesToRetrieve"] = json!(config.attributes_to_retrieve);
+        }
+    }
+
+    // Sorting - Algolia uses ranking/custom ranking
+    if !query.sort.is_empty() {
+        // Algolia requires pre-configured ranking attributes
+        // For dynamic sorting, would need replica indexes
+        algolia_query["customRanking"] = json!(query.sort);
+    }
+
+    trace!("Generated Algolia query: {}", serde_json::to_string_pretty(&algolia_query).unwrap_or_default());
+    Ok(algolia_query)
+}
+
+pub fn algolia_hit_to_search_hit(hit: AlgoliaHit) -> SearchHit {
+    // Extract content excluding objectID
+    let mut content_value = hit.attributes.clone();
+    if let Some(obj) = content_value.as_object_mut() {
+        obj.remove("objectID");
+    }
+    let content = Some(serde_json::to_string(&content_value).unwrap_or_default());
+
+    // Extract highlights
+    let highlights = if let Some(highlight_map) = hit.highlight_result {
+        let mut highlights = Map::new();
+        for (field, highlight) in highlight_map {
+            highlights.insert(field, json!(highlight.value));
+        }
+        Some(serde_json::to_string(&highlights).unwrap_or_default())
+    } else {
+        None
+    };
+
+    SearchHit {
+        id: hit.object_id,
+        score: None, // Algolia doesn't expose raw scores
+        content,
+        highlights,
+    }
+}
+
+pub fn algolia_response_to_results(response: AlgoliaSearchResponse, _query: &SearchQuery) -> SearchResults {
+    let hits = response.hits.into_iter().map(algolia_hit_to_search_hit).collect();
+    let total = Some(response.nb_hits as u32);
+
+    let page = Some(response.page + 1); // Convert back to 1-indexed
+    let per_page = Some(response.hits_per_page);
+    let took_ms = Some(response.processing_time_ms);
+
+    // Convert facets
+    let facets = if let Some(algolia_facets) = response.facets {
+        let mut facet_map = Map::new();
+        for (facet_name, facet_values) in algolia_facets {
+            let mut values = Map::new();
+            for (value, count) in facet_values {
+                values.insert(value, json!(count));
+            }
+            facet_map.insert(facet_name, json!(values));
+        }
+        Some(serde_json::to_string(&facet_map).unwrap_or_default())
+    } else {
+        None
+    };
+
+    SearchResults {
+        total,
+        page,
+        per_page,
+        hits,
+        facets,
+        took_ms,
+    }
+}
+ settings["attributesForFaceting"] = json!(attributes_for_faceting); + } + + if !ranking.is_empty() { + settings["customRanking"] = json!(ranking); + } + + // Algolia default settings + settings["attributesToHighlight"] = json!(["*"]); + settings["attributesToSnippet"] = json!(["*:20"]); + settings["hitsPerPage"] = json!(20); + + settings +} + +pub fn algolia_settings_to_schema(settings: &Value) -> Result { + let mut fields = Vec::new(); + let mut primary_key = Some("objectID".to_string()); + + // Extract searchable attributes + if let Some(searchable) = settings.get("searchableAttributes") + .and_then(|v| v.as_array()) { + for attr in searchable { + if let Some(attr_name) = attr.as_str() { + fields.push(SchemaField { + name: attr_name.to_string(), + type_: FieldType::Text, // Default to text for searchable + required: false, + facet: false, + sort: false, + index: true, + }); + } + } + } + + // Extract facetable attributes + if let Some(facets) = settings.get("attributesForFaceting") + .and_then(|v| v.as_array()) { + for facet in facets { + if let Some(facet_str) = facet.as_str() { + let field_name = if facet_str.starts_with("filterOnly(") { + facet_str.trim_start_matches("filterOnly(") + .trim_end_matches(")") + .to_string() + } else { + facet_str.to_string() + }; + + // Check if field already exists + if let Some(existing_field) = fields.iter_mut().find(|f| f.name == field_name) { + existing_field.facet = true; + } else { + fields.push(SchemaField { + name: field_name, + type_: FieldType::Keyword, // Default to keyword for facets + required: false, + facet: true, + sort: false, + index: true, + }); + } + } + } + } + + // Extract custom ranking (sort fields) + if let Some(ranking) = settings.get("customRanking") + .and_then(|v| v.as_array()) { + for rank in ranking { + if let Some(rank_str) = rank.as_str() { + let field_name = if rank_str.starts_with("desc(") { + rank_str.trim_start_matches("desc(") + .trim_end_matches(")") + .to_string() + } else if rank_str.starts_with("asc(") { + rank_str.trim_start_matches("asc(") + .trim_end_matches(")") + .to_string() + } else { + rank_str.to_string() + }; + + // Check if field already exists + if let Some(existing_field) = fields.iter_mut().find(|f| f.name == field_name) { + existing_field.sort = true; + } else { + fields.push(SchemaField { + name: field_name, + type_: FieldType::Integer, // Assume numeric for ranking + required: false, + facet: false, + sort: true, + index: true, + }); + } + } + } + } + + // Add objectID field if not present + if !fields.iter().any(|f| f.name == "objectID") { + fields.insert(0, SchemaField { + name: "objectID".to_string(), + type_: FieldType::Keyword, + required: true, + facet: false, + sort: false, + index: false, + }); + } + + Ok(Schema { + fields, + primary_key, + }) +} + +pub fn doc_to_algolia_doc(doc: Doc) -> Result { + let mut content: Value = serde_json::from_str(&doc.content) + .map_err(|e| SearchError::InvalidQuery(format!("Invalid JSON in document: {}", e)))?; + + // Ensure objectID is set + if let Some(obj) = content.as_object_mut() { + obj.insert("objectID".to_string(), json!(doc.id)); + } + + Ok(AlgoliaDoc { + object_id: doc.id, + content, + }) +} diff --git a/search/algolia/src/lib.rs b/search/algolia/src/lib.rs new file mode 100644 index 000000000..165a50249 --- /dev/null +++ b/search/algolia/src/lib.rs @@ -0,0 +1,272 @@ +use crate::client::AlgoliaApi; +use crate::conversions::{ + doc_to_algolia_doc, algolia_hit_to_search_hit, algolia_response_to_results, query_to_algolia_query, + 
diff --git a/search/algolia/src/lib.rs b/search/algolia/src/lib.rs
new file mode 100644
index 000000000..165a50249
--- /dev/null
+++ b/search/algolia/src/lib.rs
@@ -0,0 +1,272 @@
+use crate::client::AlgoliaApi;
+use crate::conversions::{
+    doc_to_algolia_doc, algolia_hit_to_search_hit, algolia_response_to_results, query_to_algolia_query,
+    schema_to_algolia_settings, algolia_settings_to_schema,
+};
+use golem_search::config::with_config_key;
+use golem_search::durability::{DurableSearch, ExtendedGuest};
+use golem_search::golem::search::core::Guest;
+use golem_search::golem::search::types::{
+    Doc, Schema, SearchError, SearchHit, SearchQuery, SearchResults,
+};
+use golem_search::{SearchStream, SearchStreamState, LOGGING_STATE};
+use log::trace;
+use std::cell::RefCell;
+
+mod client;
+mod conversions;
+
+struct AlgoliaStream {
+    client: AlgoliaApi,
+    index: String,
+    query: SearchQuery,
+    cursor: RefCell<Option<String>>,
+    finished: RefCell<bool>,
+    failure: Option<SearchError>,
+}
+
+impl AlgoliaStream {
+    fn new(client: AlgoliaApi, index: String, query: SearchQuery) -> Self {
+        Self {
+            client,
+            index,
+            query,
+            cursor: RefCell::new(None),
+            finished: RefCell::new(false),
+            failure: None,
+        }
+    }
+
+    fn failed(error: SearchError) -> Self {
+        Self {
+            client: AlgoliaApi::empty(),
+            index: String::new(),
+            query: SearchQuery {
+                q: None,
+                filters: vec![],
+                sort: vec![],
+                facets: vec![],
+                page: None,
+                per_page: None,
+                offset: None,
+                highlight: None,
+                config: None,
+            },
+            cursor: RefCell::new(None),
+            finished: RefCell::new(true),
+            failure: Some(error),
+        }
+    }
+}
+
+impl SearchStreamState for AlgoliaStream {
+    fn failure(&self) -> &Option<SearchError> {
+        &self.failure
+    }
+
+    fn is_finished(&self) -> bool {
+        *self.finished.borrow()
+    }
+
+    fn set_finished(&self) {
+        *self.finished.borrow_mut() = true;
+    }
+
+    fn get_next_hits(&self) -> Result<Vec<SearchHit>, SearchError> {
+        let cursor = self.cursor.borrow().clone();
+
+        // Use browse API for streaming
+        match self.client.browse(&self.index, cursor.as_deref()) {
+            Ok(response) => {
+                if response.hits.is_empty() {
+                    self.set_finished();
+                    Ok(vec![])
+                } else {
+                    // Check end-of-stream before the cursor is moved into the RefCell
+                    if response.cursor.is_none() {
+                        self.set_finished();
+                    }
+                    *self.cursor.borrow_mut() = response.cursor;
+                    Ok(response.hits.into_iter().map(algolia_hit_to_search_hit).collect())
+                }
+            }
+            Err(error) => Err(error),
+        }
+    }
+}
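+// Note on the stream above: Algolia's browse API hands back an opaque cursor
+// while more pages exist and omits it on the last page, so the RefCell fields
+// carry that cursor (and the finished flag) across successive get-next calls.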
+
+struct AlgoliaComponent;
+
+impl AlgoliaComponent {
+    const APP_ID_VAR: &'static str = "ALGOLIA_APPLICATION_ID";
+    const API_KEY_VAR: &'static str = "ALGOLIA_API_KEY";
+}
+
+impl Guest for AlgoliaComponent {
+    type SearchHitStream = SearchStream;
+
+    fn create_index(name: String, schema: Option<Schema>) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::APP_ID_VAR, Err, |app_id| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = AlgoliaApi::new(app_id, api_key);
+
+                if let Some(schema) = schema {
+                    let settings = schema_to_algolia_settings(&schema);
+                    client.create_index_with_schema(&name, settings)
+                } else {
+                    client.create_index(&name)
+                }
+            })
+        })
+    }
+
+    fn delete_index(name: String) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::APP_ID_VAR, Err, |app_id| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = AlgoliaApi::new(app_id, api_key);
+                client.delete_index(&name)
+            })
+        })
+    }
+
+    fn list_indexes() -> Result<Vec<String>, SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::APP_ID_VAR, Err, |app_id| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = AlgoliaApi::new(app_id, api_key);
+                client.list_indexes()
+            })
+        })
+    }
+
+    fn upsert(index: String, doc: Doc) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::APP_ID_VAR, Err, |app_id| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = AlgoliaApi::new(app_id, api_key);
+                let algolia_doc = doc_to_algolia_doc(doc)?;
+                client.index_document(&index, algolia_doc)
+            })
+        })
+    }
+
+    fn upsert_many(index: String, docs: Vec<Doc>) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::APP_ID_VAR, Err, |app_id| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = AlgoliaApi::new(app_id, api_key);
+                let algolia_docs: Result<Vec<_>, _> = docs.into_iter().map(doc_to_algolia_doc).collect();
+                client.bulk_index(&index, algolia_docs?)
+            })
+        })
+    }
+
+    fn delete(index: String, id: String) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::APP_ID_VAR, Err, |app_id| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = AlgoliaApi::new(app_id, api_key);
+                client.delete_document(&index, &id)
+            })
+        })
+    }
+
+    fn delete_many(index: String, ids: Vec<String>) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::APP_ID_VAR, Err, |app_id| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = AlgoliaApi::new(app_id, api_key);
+                client.bulk_delete(&index, ids)
+            })
+        })
+    }
+
+    fn get(index: String, id: String) -> Result<Option<Doc>, SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::APP_ID_VAR, Err, |app_id| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = AlgoliaApi::new(app_id, api_key);
+                client.get_document(&index, &id)
+            })
+        })
+    }
+
+    fn search(index: String, query: SearchQuery) -> Result<SearchResults, SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::APP_ID_VAR, Err, |app_id| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = AlgoliaApi::new(app_id, api_key);
+                let algolia_query = query_to_algolia_query(&query)?;
+                trace!("Executing Algolia search query: {:?}", algolia_query);
+
+                match client.search(&index, algolia_query) {
+                    Ok(response) => Ok(algolia_response_to_results(response, &query)),
+                    Err(error) => Err(error),
+                }
+            })
+        })
+    }
+
+    fn stream_search(
+        index: String,
+        query: SearchQuery,
+    ) -> Result<Self::SearchHitStream, SearchError> {
+        Self::unwrapped_stream_search(index, query)
+    }
+
+    fn get_schema(index: String) -> Result<Schema, SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::APP_ID_VAR, Err, |app_id| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = AlgoliaApi::new(app_id, api_key);
+                let settings = client.get_settings(&index)?;
+                algolia_settings_to_schema(&settings)
+            })
+        })
+    }
+
+    fn update_schema(index: String, schema: Schema) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::APP_ID_VAR, Err, |app_id| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = AlgoliaApi::new(app_id, api_key);
+                let settings = schema_to_algolia_settings(&schema);
+                client.update_settings(&index, settings)
+            })
+        })
+    }
+}
+
+impl ExtendedGuest for AlgoliaComponent {
+    type SearchHitStream = SearchStream;
+
+    fn unwrapped_stream_search(
+        index: String,
+        query: SearchQuery,
+    ) -> Result<Self::SearchHitStream, SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::APP_ID_VAR, |error| Ok(SearchStream::new(AlgoliaStream::failed(error))), |app_id| {
+            with_config_key(Self::API_KEY_VAR, |error| Ok(SearchStream::new(AlgoliaStream::failed(error))), |api_key| {
+                let client = AlgoliaApi::new(app_id, api_key);
+                Ok(SearchStream::new(AlgoliaStream::new(client, index, query)))
+            })
+        })
+    }
+}
+
+type DurableAlgoliaComponent = DurableSearch<AlgoliaComponent>;
+
+golem_search::export_search!(DurableAlgoliaComponent with_types_in golem_search);
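`with_config_key` comes from the shared `golem-search` crate and is not part of this diff, so the following is only a sketch of its assumed contract (the exact error variant and empty-value handling are guesses): read a key from the environment, hand the value to the success continuation, and route the failure through the second argument, which the call sites above satisfy with plain `Err`.

```rust
#[derive(Debug)]
enum SearchError {
    Internal(String),
}

// Assumed shape of the helper; the real implementation lives in golem-search.
fn with_config_key<T>(
    key: &str,
    fail: impl FnOnce(SearchError) -> Result<T, SearchError>,
    run: impl FnOnce(&str) -> Result<T, SearchError>,
) -> Result<T, SearchError> {
    match std::env::var(key) {
        Ok(value) if !value.is_empty() => run(&value),
        _ => fail(SearchError::Internal(format!("missing config key: {key}"))),
    }
}

fn main() {
    // Mirrors how the component derives its base URL from the app id.
    let base_url = with_config_key("ALGOLIA_APPLICATION_ID", Err, |app_id| {
        Ok(format!("https://{app_id}-dsn.algolia.net/1"))
    });
    println!("{base_url:?}");
}
```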
diff --git a/search/algolia/wit/algolia.wit b/search/algolia/wit/algolia.wit
new file mode 100644
index 000000000..16067eca0
--- /dev/null
+++ b/search/algolia/wit/algolia.wit
@@ -0,0 +1,6 @@
+package golem:search-algolia;
+
+world algolia-provider {
+  import golem:search/core@1.0.0;
+  export golem:search/core@1.0.0;
+}
diff --git a/search/algolia/wit/deps/golem-search/golem-search.wit b/search/algolia/wit/deps/golem-search/golem-search.wit
new file mode 100644
index 000000000..64c822dfa
--- /dev/null
+++ b/search/algolia/wit/deps/golem-search/golem-search.wit
@@ -0,0 +1,134 @@
+package golem:search@1.0.0;
+
+/// Core types and error handling for universal search interfaces
+interface types {
+  /// Common structured errors for search operations
+  variant search-error {
+    index-not-found,
+    invalid-query(string),
+    unsupported,
+    internal(string),
+    timeout,
+    rate-limited,
+  }
+
+  /// Identifier types
+  type index-name = string;
+  type document-id = string;
+  type json = string;
+
+  /// Document payload
+  record doc {
+    id: document-id,
+    content: json,
+  }
+
+  /// Highlight configuration
+  record highlight-config {
+    fields: list<string>,
+    pre-tag: option<string>,
+    post-tag: option<string>,
+    max-length: option<u32>,
+  }
+
+  /// Advanced search tuning
+  record search-config {
+    timeout-ms: option<u32>,
+    boost-fields: list<tuple<string, f32>>,
+    attributes-to-retrieve: list<string>,
+    language: option<string>,
+    typo-tolerance: option<bool>,
+    exact-match-boost: option<f32>,
+    provider-params: option<json>,
+  }
+
+  /// Search request
+  record search-query {
+    q: option<string>,
+    filters: list<string>,
+    sort: list<string>,
+    facets: list<string>,
+    page: option<u32>,
+    per-page: option<u32>,
+    offset: option<u32>,
+    highlight: option<highlight-config>,
+    config: option<search-config>,
+  }
+
+  /// Search hit
+  record search-hit {
+    id: document-id,
+    score: option<f64>,
+    content: option<json>,
+    highlights: option<json>,
+  }
+
+  /// Search result set
+  record search-results {
+    total: option<u32>,
+    page: option<u32>,
+    per-page: option<u32>,
+    hits: list<search-hit>,
+    facets: option<json>,
+    took-ms: option<u32>,
+  }
+
+  /// Field schema types
+  enum field-type {
+    text,
+    keyword,
+    integer,
+    float,
+    boolean,
+    date,
+    geo-point,
+  }
+
+  /// Field definition
+  record schema-field {
+    name: string,
+    type: field-type,
+    required: bool,
+    facet: bool,
+    sort: bool,
+    index: bool,
+  }
+
+  /// Index schema
+  record schema {
+    fields: list<schema-field>,
+    primary-key: option<string>,
+  }
+}
+
+/// Unified search interface
+interface core {
+  use types.{
+    index-name, document-id, doc, search-query, search-results,
+    search-hit, schema, search-error
+  };
+
+  // Index lifecycle
+  create-index: func(name: index-name, schema: option<schema>) -> result<_, search-error>;
+  delete-index: func(name: index-name) -> result<_, search-error>;
+  list-indexes: func() -> result<list<index-name>, search-error>;
+
+  // Document operations
+  upsert: func(index: index-name, doc: doc) -> result<_, search-error>;
+  upsert-many: func(index: index-name, docs: list<doc>) -> result<_, search-error>;
+  delete: func(index: index-name, id: document-id) -> result<_, search-error>;
+  delete-many: func(index: index-name, ids: list<document-id>) -> result<_, search-error>;
+  get: func(index: index-name, id: document-id) -> result<option<doc>, search-error>;
+
+  // Query
+  search: func(index: index-name, query: search-query) -> result<search-results, search-error>;
+  stream-search: func(index: index-name, query: search-query) -> result<stream<search-hit>, search-error>;
+
+  // Schema inspection
+  get-schema: func(index: index-name) -> result<schema, search-error>;
+  update-schema: func(index: index-name, schema: schema) -> result<_, search-error>;
+}
+
+world search-library {
+  export core;
+}
diff --git a/search/algolia/wit/deps/wasi:io/error.wit b/search/algolia/wit/deps/wasi:io/error.wit
new file mode 100644
index 000000000..97c606877
--- /dev/null
+++ b/search/algolia/wit/deps/wasi:io/error.wit
@@ -0,0 +1,34 @@
+package wasi:io@0.2.3;
+
+@since(version = 0.2.0)
+interface error {
+    /// A resource which represents some error information.
+    ///
+    /// The only method provided by this resource is `to-debug-string`,
+    /// which provides some human-readable information about the error.
+    ///
+    /// In the `wasi:io` package, this resource is returned through the
+    /// `wasi:io/streams/stream-error` type.
+    ///
+    /// To provide more specific error information, other interfaces may
+    /// offer functions to "downcast" this error into more specific types. For example,
+    /// errors returned from streams derived from filesystem types can be described using
+    /// the filesystem's own error-code type. This is done using the function
+    /// `wasi:filesystem/types/filesystem-error-code`, which takes a `borrow<error>`
+    /// parameter and returns an `option<filesystem-error-code>`.
+    ///
+    /// The set of functions which can "downcast" an `error` into a more
+    /// concrete type is open.
+    @since(version = 0.2.0)
+    resource error {
+        /// Returns a string that is suitable to assist humans in debugging
+        /// this error.
+        ///
+        /// WARNING: The returned string should not be consumed mechanically!
+        /// It may change across platforms, hosts, or other implementation
+        /// details. Parsing this string is a major platform-compatibility
+        /// hazard.
+        @since(version = 0.2.0)
+        to-debug-string: func() -> string;
+    }
+}
diff --git a/search/algolia/wit/deps/wasi:io/poll.wit b/search/algolia/wit/deps/wasi:io/poll.wit
new file mode 100644
index 000000000..9bcbe8e03
--- /dev/null
+++ b/search/algolia/wit/deps/wasi:io/poll.wit
@@ -0,0 +1,47 @@
+package wasi:io@0.2.3;
+
+/// A poll API intended to let users wait for I/O events on multiple handles
+/// at once.
+@since(version = 0.2.0)
+interface poll {
+    /// `pollable` represents a single I/O event which may be ready, or not.
+    @since(version = 0.2.0)
+    resource pollable {
+
+        /// Return the readiness of a pollable. This function never blocks.
+        ///
+        /// Returns `true` when the pollable is ready, and `false` otherwise.
+        @since(version = 0.2.0)
+        ready: func() -> bool;
+
+        /// `block` returns immediately if the pollable is ready, and otherwise
+        /// blocks until ready.
+        ///
+        /// This function is equivalent to calling `poll.poll` on a list
+        /// containing only this pollable.
+        @since(version = 0.2.0)
+        block: func();
+    }
+
+    /// Poll for completion on a set of pollables.
+    ///
+    /// This function takes a list of pollables, which identify I/O sources of
+    /// interest, and waits until one or more of the events is ready for I/O.
+    ///
+    /// The result `list<u32>` contains one or more indices of handles in the
+    /// argument list that is ready for I/O.
+    ///
+    /// This function traps if either:
+    /// - the list is empty, or:
+    /// - the list contains more elements than can be indexed with a `u32` value.
+    ///
+    /// A timeout can be implemented by adding a pollable from the
+    /// wasi-clocks API to the list.
+    ///
+    /// This function does not return a `result`; polling in itself does not
+    /// do any I/O so it doesn't fail. If any of the I/O sources identified by
+    /// the pollables has an error, it is indicated by marking the source as
+    /// being ready for I/O.
+    @since(version = 0.2.0)
+    poll: func(in: list<borrow<pollable>>) -> list<u32>;
+}
diff --git a/search/algolia/wit/deps/wasi:io/streams.wit b/search/algolia/wit/deps/wasi:io/streams.wit
new file mode 100644
index 000000000..0de084629
--- /dev/null
+++ b/search/algolia/wit/deps/wasi:io/streams.wit
@@ -0,0 +1,290 @@
+package wasi:io@0.2.3;
+
+/// WASI I/O is an I/O abstraction API which is currently focused on providing
+/// stream types.
+///
+/// In the future, the component model is expected to add built-in stream types;
+/// when it does, they are expected to subsume this API.
+@since(version = 0.2.0)
+interface streams {
+    @since(version = 0.2.0)
+    use error.{error};
+    @since(version = 0.2.0)
+    use poll.{pollable};
+
+    /// An error for input-stream and output-stream operations.
+    @since(version = 0.2.0)
+    variant stream-error {
+        /// The last operation (a write or flush) failed before completion.
+        ///
+        /// More information is available in the `error` payload.
+        ///
+        /// After this, the stream will be closed. All future operations return
+        /// `stream-error::closed`.
+        last-operation-failed(error),
+        /// The stream is closed: no more input will be accepted by the
+        /// stream. A closed output-stream will return this error on all
+        /// future operations.
+        closed
+    }
+
+    /// An input bytestream.
+    ///
+    /// `input-stream`s are *non-blocking* to the extent practical on underlying
+    /// platforms. I/O operations always return promptly; if fewer bytes are
+    /// promptly available than requested, they return the number of bytes promptly
+    /// available, which could even be zero. To wait for data to be available,
+    /// use the `subscribe` function to obtain a `pollable` which can be polled
+    /// for using `wasi:io/poll`.
+    @since(version = 0.2.0)
+    resource input-stream {
+        /// Perform a non-blocking read from the stream.
+        ///
+        /// When the source of a `read` is binary data, the bytes from the source
+        /// are returned verbatim. When the source of a `read` is known to the
+        /// implementation to be text, bytes containing the UTF-8 encoding of the
+        /// text are returned.
+        ///
+        /// This function returns a list of bytes containing the read data,
+        /// when successful. The returned list will contain up to `len` bytes;
+        /// it may return fewer than requested, but not more. The list is
+        /// empty when no bytes are available for reading at this time. The
+        /// pollable given by `subscribe` will be ready when more bytes are
+        /// available.
+        ///
+        /// This function fails with a `stream-error` when the operation
+        /// encounters an error, giving `last-operation-failed`, or when the
+        /// stream is closed, giving `closed`.
+        ///
+        /// When the caller gives a `len` of 0, it represents a request to
+        /// read 0 bytes. If the stream is still open, this call should
+        /// succeed and return an empty list, or otherwise fail with `closed`.
+        ///
+        /// The `len` parameter is a `u64`, which could represent a list of u8 which
+        /// is not possible to allocate in wasm32, or not desirable to allocate as
+        /// as a return value by the callee. The callee may return a list of bytes
+        /// less than `len` in size while more bytes are available for reading.
+        @since(version = 0.2.0)
+        read: func(
+            /// The maximum number of bytes to read
+            len: u64
+        ) -> result<list<u8>, stream-error>;
+
+        /// Read bytes from a stream, after blocking until at least one byte can
+        /// be read. Except for blocking, behavior is identical to `read`.
+        @since(version = 0.2.0)
+        blocking-read: func(
+            /// The maximum number of bytes to read
+            len: u64
+        ) -> result<list<u8>, stream-error>;
+
+        /// Skip bytes from a stream. Returns number of bytes skipped.
+        ///
+        /// Behaves identical to `read`, except instead of returning a list
+        /// of bytes, returns the number of bytes consumed from the stream.
+        @since(version = 0.2.0)
+        skip: func(
+            /// The maximum number of bytes to skip.
+            len: u64,
+        ) -> result<u64, stream-error>;
+
+        /// Skip bytes from a stream, after blocking until at least one byte
+        /// can be skipped. Except for blocking behavior, identical to `skip`.
+        @since(version = 0.2.0)
+        blocking-skip: func(
+            /// The maximum number of bytes to skip.
+            len: u64,
+        ) -> result<u64, stream-error>;
+
+        /// Create a `pollable` which will resolve once either the specified stream
+        /// has bytes available to read or the other end of the stream has been
+        /// closed.
+        /// The created `pollable` is a child resource of the `input-stream`.
+        /// Implementations may trap if the `input-stream` is dropped before
+        /// all derived `pollable`s created with this function are dropped.
+        @since(version = 0.2.0)
+        subscribe: func() -> pollable;
+    }
+
+
+    /// An output bytestream.
+    ///
+    /// `output-stream`s are *non-blocking* to the extent practical on
+    /// underlying platforms. Except where specified otherwise, I/O operations also
+    /// always return promptly, after the number of bytes that can be written
+    /// promptly, which could even be zero. To wait for the stream to be ready to
+    /// accept data, the `subscribe` function to obtain a `pollable` which can be
+    /// polled for using `wasi:io/poll`.
+    ///
+    /// Dropping an `output-stream` while there's still an active write in
+    /// progress may result in the data being lost. Before dropping the stream,
+    /// be sure to fully flush your writes.
+    @since(version = 0.2.0)
+    resource output-stream {
+        /// Check readiness for writing. This function never blocks.
+        ///
+        /// Returns the number of bytes permitted for the next call to `write`,
+        /// or an error. Calling `write` with more bytes than this function has
+        /// permitted will trap.
+        ///
+        /// When this function returns 0 bytes, the `subscribe` pollable will
+        /// become ready when this function will report at least 1 byte, or an
+        /// error.
+        @since(version = 0.2.0)
+        check-write: func() -> result<u64, stream-error>;
+
+        /// Perform a write. This function never blocks.
+        ///
+        /// When the destination of a `write` is binary data, the bytes from
+        /// `contents` are written verbatim. When the destination of a `write` is
+        /// known to the implementation to be text, the bytes of `contents` are
+        /// transcoded from UTF-8 into the encoding of the destination and then
+        /// written.
+        ///
+        /// Precondition: check-write gave permit of Ok(n) and contents has a
+        /// length of less than or equal to n. Otherwise, this function will trap.
+        ///
+        /// returns Err(closed) without writing if the stream has closed since
+        /// the last call to check-write provided a permit.
+        @since(version = 0.2.0)
+        write: func(
+            contents: list<u8>
+        ) -> result<_, stream-error>;
+
+        /// Perform a write of up to 4096 bytes, and then flush the stream. Block
+        /// until all of these operations are complete, or an error occurs.
+        ///
+        /// This is a convenience wrapper around the use of `check-write`,
+        /// `subscribe`, `write`, and `flush`, and is implemented with the
+        /// following pseudo-code:
+        ///
+        /// ```text
+        /// let pollable = this.subscribe();
+        /// while !contents.is_empty() {
+        ///     // Wait for the stream to become writable
+        ///     pollable.block();
+        ///     let Ok(n) = this.check-write(); // eliding error handling
+        ///     let len = min(n, contents.len());
+        ///     let (chunk, rest) = contents.split_at(len);
+        ///     this.write(chunk ); // eliding error handling
+        ///     contents = rest;
+        /// }
+        /// this.flush();
+        /// // Wait for completion of `flush`
+        /// pollable.block();
+        /// // Check for any errors that arose during `flush`
+        /// let _ = this.check-write(); // eliding error handling
+        /// ```
+        @since(version = 0.2.0)
+        blocking-write-and-flush: func(
+            contents: list<u8>
+        ) -> result<_, stream-error>;
+
+        /// Request to flush buffered output. This function never blocks.
+        ///
+        /// This tells the output-stream that the caller intends any buffered
+        /// output to be flushed. the output which is expected to be flushed
+        /// is all that has been passed to `write` prior to this call.
+        ///
+        /// Upon calling this function, the `output-stream` will not accept any
+        /// writes (`check-write` will return `ok(0)`) until the flush has
+        /// completed. The `subscribe` pollable will become ready when the
+        /// flush has completed and the stream can accept more writes.
+        @since(version = 0.2.0)
+        flush: func() -> result<_, stream-error>;
+
+        /// Request to flush buffered output, and block until flush completes
+        /// and stream is ready for writing again.
+        @since(version = 0.2.0)
+        blocking-flush: func() -> result<_, stream-error>;
+
+        /// Create a `pollable` which will resolve once the output-stream
+        /// is ready for more writing, or an error has occurred. When this
+        /// pollable is ready, `check-write` will return `ok(n)` with n>0, or an
+        /// error.
+        ///
+        /// If the stream is closed, this pollable is always ready immediately.
+        ///
+        /// The created `pollable` is a child resource of the `output-stream`.
+        /// Implementations may trap if the `output-stream` is dropped before
+        /// all derived `pollable`s created with this function are dropped.
+        @since(version = 0.2.0)
+        subscribe: func() -> pollable;
+
+        /// Write zeroes to a stream.
+        ///
+        /// This should be used precisely like `write` with the exact same
+        /// preconditions (must use check-write first), but instead of
+        /// passing a list of bytes, you simply pass the number of zero-bytes
+        /// that should be written.
+        @since(version = 0.2.0)
+        write-zeroes: func(
+            /// The number of zero-bytes to write
+            len: u64
+        ) -> result<_, stream-error>;
+
+        /// Perform a write of up to 4096 zeroes, and then flush the stream.
+        /// Block until all of these operations are complete, or an error
+        /// occurs.
+        ///
+        /// This is a convenience wrapper around the use of `check-write`,
+        /// `subscribe`, `write-zeroes`, and `flush`, and is implemented with
+        /// the following pseudo-code:
+        ///
+        /// ```text
+        /// let pollable = this.subscribe();
+        /// while num_zeroes != 0 {
+        ///     // Wait for the stream to become writable
+        ///     pollable.block();
+        ///     let Ok(n) = this.check-write(); // eliding error handling
+        ///     let len = min(n, num_zeroes);
+        ///     this.write-zeroes(len); // eliding error handling
+        ///     num_zeroes -= len;
+        /// }
+        /// this.flush();
+        /// // Wait for completion of `flush`
+        /// pollable.block();
+        /// // Check for any errors that arose during `flush`
+        /// let _ = this.check-write(); // eliding error handling
+        /// ```
+        @since(version = 0.2.0)
+        blocking-write-zeroes-and-flush: func(
+            /// The number of zero-bytes to write
+            len: u64
+        ) -> result<_, stream-error>;
+
+        /// Read from one stream and write to another.
+        ///
+        /// The behavior of splice is equivalent to:
+        /// 1. calling `check-write` on the `output-stream`
+        /// 2. calling `read` on the `input-stream` with the smaller of the
+        /// `check-write` permitted length and the `len` provided to `splice`
+        /// 3. calling `write` on the `output-stream` with that read data.
+        ///
+        /// Any error reported by the call to `check-write`, `read`, or
+        /// `write` ends the splice and reports that error.
+        ///
+        /// This function returns the number of bytes transferred; it may be less
+        /// than `len`.
+        @since(version = 0.2.0)
+        splice: func(
+            /// The stream to read from
+            src: borrow<input-stream>,
+            /// The number of bytes to splice
+            len: u64,
+        ) -> result<u64, stream-error>;
+
+        /// Read from one stream and write to another, with blocking.
+        ///
+        /// This is similar to `splice`, except that it blocks until the
+        /// `output-stream` is ready for writing, and the `input-stream`
+        /// is ready for reading, before performing the `splice`.
+        @since(version = 0.2.0)
+        blocking-splice: func(
+            /// The stream to read from
+            src: borrow<input-stream>,
+            /// The number of bytes to splice
+            len: u64,
+        ) -> result<u64, stream-error>;
+    }
+}
diff --git a/search/algolia/wit/deps/wasi:io/world.wit b/search/algolia/wit/deps/wasi:io/world.wit
new file mode 100644
index 000000000..f1d2102dc
--- /dev/null
+++ b/search/algolia/wit/deps/wasi:io/world.wit
@@ -0,0 +1,10 @@
+package wasi:io@0.2.3;
+
+@since(version = 0.2.0)
+world imports {
+    @since(version = 0.2.0)
+    import streams;
+
+    @since(version = 0.2.0)
+    import poll;
+}
diff --git a/search/elasticsearch/Cargo.toml b/search/elasticsearch/Cargo.toml
new file mode 100644
index 000000000..8346bb702
--- /dev/null
+++ b/search/elasticsearch/Cargo.toml
@@ -0,0 +1,42 @@
+[package]
+name = "golem-search-elasticsearch"
+version = "0.0.0"
+edition = "2021"
+license = "Apache-2.0"
+homepage = "https://golem.cloud"
+repository = "https://github.com/golemcloud/golem-llm"
+description = "WebAssembly component for working with Elasticsearch APIs, with special support for Golem Cloud"
+
+[lib]
+path = "src/lib.rs"
+crate-type = ["cdylib"]
+
+[features]
+default = ["durability"]
+durability = ["golem-rust/durability", "golem-search/durability"]
+
+[dependencies]
+golem-search = { workspace = true }
+golem-rust = { workspace = true }
+log = { workspace = true }
+reqwest = { workspace = true, features = ["json", "blocking"] }
+serde = { workspace = true, features = ["derive"] }
+serde_json = { workspace = true }
+wit-bindgen-rt = { workspace = true }
+base64 = { workspace = true }
+
+[package.metadata.component]
+package = "golem:search-elasticsearch"
+
+[package.metadata.component.bindings]
+generate_unused_types = true
+
+[package.metadata.component.bindings.with]
+"golem:search/core@1.0.0" = "golem_search::golem::search::core"
+
+[package.metadata.component.target]
+path = "wit"
+
+[package.metadata.component.target.dependencies]
+"golem:search" = { path = "wit/deps/golem-search" }
+"wasi:io" = { path = "wit/deps/wasi:io" }
diff --git a/search/elasticsearch/src/client.rs b/search/elasticsearch/src/client.rs
new file mode 100644
index 000000000..e53102ffd
--- /dev/null
+++ b/search/elasticsearch/src/client.rs
@@ -0,0 +1,435 @@
+use base64::{engine::general_purpose::STANDARD, Engine as _};
+use golem_search::golem::search::types::{Doc, Schema, SearchError};
+use log::{debug, trace};
+use reqwest::{Client, Response};
+use serde::{Deserialize, Serialize};
+use serde_json::{json, Value};
+use std::collections::HashMap;
+use std::time::Duration;
+
+#[derive(Debug, Clone)]
+pub struct ElasticSearchApi {
+    client: Client,
+    base_url: String,
+    auth: Option<String>,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct ElasticSearchError {
+    #[serde(default)]
+    pub error: ErrorDetails,
+}
+
+#[derive(Debug, Deserialize, Default)]
+pub struct ErrorDetails {
+    #[serde(rename = "type")]
+    pub error_type: String,
+    pub reason: String,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct EsSearchResponse {
+    pub hits: EsHits,
+    #[serde(rename = "_scroll_id")]
+    pub scroll_id: Option<String>,
+    pub took: Option<u64>,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct EsHits {
+    pub total: EsTotal,
+    pub hits: Vec<EsHit>,
+}
+
+#[derive(Debug, Deserialize)]
+#[serde(untagged)]
+pub enum EsTotal {
+    Simple(u64),
+    Detailed { value: u64 },
+}
+
+impl EsTotal {
+    pub fn value(&self) -> u64 {
+        match self {
+            EsTotal::Simple(v) => *v,
+            EsTotal::Detailed { value } => *value,
+        }
+    }
+}
+
+#[derive(Debug, Deserialize)]
+pub struct EsHit {
+    #[serde(rename = "_id")]
+    pub id: String,
+    #[serde(rename = "_score")]
+    pub score: Option<f64>,
"_source")] + pub source: Option, + pub highlight: Option>>, +} + +#[derive(Debug, Serialize)] +pub struct EsDoc { + pub id: String, + pub content: Value, +} + +#[derive(Debug, Deserialize)] +pub struct IndexResponse { + #[serde(rename = "acknowledged")] + pub acknowledged: bool, +} + +#[derive(Debug, Deserialize)] +pub struct GetResponse { + #[serde(rename = "_id")] + pub id: String, + #[serde(rename = "_source")] + pub source: Option, + pub found: bool, +} + +impl ElasticSearchApi { + pub fn new(endpoint: &str, username: &str, password: &str) -> Self { + let auth = format!("{}:{}", username, password); + let auth = base64::encode(auth); + + let client = Client::builder() + .timeout(Duration::from_secs(30)) + .build() + .unwrap(); + + Self { + client, + base_url: endpoint.trim_end_matches('/').to_string(), + auth: Some(auth), + } + } + + pub fn empty() -> Self { + Self { + client: Client::new(), + base_url: String::new(), + auth: None, + } + } + + fn get_auth_header(&self) -> Option { + self.auth.as_ref().map(|auth| format!("Basic {}", auth)) + } + + async fn request(&self, method: reqwest::Method, path: &str, body: Option) -> Result { + let url = format!("{}/{}", self.base_url, path.trim_start_matches('/')); + let mut request = self.client.request(method, &url); + + if let Some(auth) = self.get_auth_header() { + request = request.header("Authorization", auth); + } + + request = request.header("Content-Type", "application/json"); + + if let Some(body) = body { + request = request.json(&body); + } + + trace!("Making request to: {}", url); + + let response = request.send().await + .map_err(|e| SearchError::Internal(format!("Request failed: {}", e)))?; + + if response.status().is_success() { + Ok(response) + } else { + let status = response.status().as_u16(); + let error_text = response.text().await.unwrap_or_default(); + + if let Ok(es_error) = serde_json::from_str::(&error_text) { + match es_error.error.error_type.as_str() { + "index_not_found_exception" => Err(SearchError::IndexNotFound), + "parsing_exception" | "query_parsing_exception" => { + Err(SearchError::InvalidQuery(es_error.error.reason)) + } + _ => Err(SearchError::Internal(format!("ES error: {}", es_error.error.reason))) + } + } else { + Err(SearchError::Internal(format!("HTTP {}: {}", status, error_text))) + } + } + } + + pub fn create_index(&self, name: &str) -> Result<(), SearchError> { + let body = json!({ + "settings": { + "index": { + "number_of_shards": 1, + "number_of_replicas": 0 + } + } + }); + + let rt = tokio::runtime::Runtime::new() + .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?; + + rt.block_on(async { + self.request(reqwest::Method::PUT, name, Some(body)).await?; + debug!("Created index: {}", name); + Ok(()) + }) + } + + pub fn create_index_with_mapping(&self, name: &str, mapping: Value) -> Result<(), SearchError> { + let body = json!({ + "settings": { + "index": { + "number_of_shards": 1, + "number_of_replicas": 0 + } + }, + "mappings": mapping + }); + + let rt = tokio::runtime::Runtime::new() + .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?; + + rt.block_on(async { + self.request(reqwest::Method::PUT, name, Some(body)).await?; + debug!("Created index with mapping: {}", name); + Ok(()) + }) + } + + pub fn delete_index(&self, name: &str) -> Result<(), SearchError> { + let rt = tokio::runtime::Runtime::new() + .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?; + + rt.block_on(async { + self.request(reqwest::Method::DELETE, name, None).await?; + 
debug!("Deleted index: {}", name); + Ok(()) + }) + } + + pub fn list_indexes(&self) -> Result, SearchError> { + let rt = tokio::runtime::Runtime::new() + .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?; + + rt.block_on(async { + let response = self.request(reqwest::Method::GET, "_cat/indices?format=json", None).await?; + let indices: Vec = response.json().await + .map_err(|e| SearchError::Internal(format!("Failed to parse response: {}", e)))?; + + let names = indices.into_iter() + .filter_map(|index| index.get("index").and_then(|v| v.as_str().map(|s| s.to_string()))) + .collect(); + + Ok(names) + }) + } + + pub fn index_document(&self, index: &str, id: &str, doc: Value) -> Result<(), SearchError> { + let path = format!("{}/_doc/{}", index, id); + + let rt = tokio::runtime::Runtime::new() + .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?; + + rt.block_on(async { + self.request(reqwest::Method::PUT, &path, Some(doc)).await?; + debug!("Indexed document: {}/{}", index, id); + Ok(()) + }) + } + + pub fn bulk_index(&self, index: &str, docs: Vec) -> Result<(), SearchError> { + if docs.is_empty() { + return Ok(()); + } + + let mut body = String::new(); + for doc in &docs { + let index_action = json!({"index": {"_index": index, "_id": doc.id}}); + body.push_str(&serde_json::to_string(&index_action).unwrap()); + body.push('\n'); + body.push_str(&serde_json::to_string(&doc.content).unwrap()); + body.push('\n'); + } + + let url = format!("{}/_bulk", self.base_url); + let mut request = self.client.post(&url); + + if let Some(auth) = self.get_auth_header() { + request = request.header("Authorization", auth); + } + + request = request + .header("Content-Type", "application/x-ndjson") + .body(body); + + let response = request.send() + .map_err(|e| SearchError::Internal(format!("Bulk request failed: {}", e)))?; + + if !response.status().is_success() { + let error_text = response.text().unwrap_or_default(); + return Err(SearchError::Internal(format!("Bulk indexing failed: {}", error_text))); + } + + debug!("Bulk indexed {} documents to {}", docs.len(), index); + Ok(()) + } + + pub fn delete_document(&self, index: &str, id: &str) -> Result<(), SearchError> { + let path = format!("{}/_doc/{}", index, id); + + let rt = tokio::runtime::Runtime::new() + .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?; + + rt.block_on(async { + self.request(reqwest::Method::DELETE, &path, None).await?; + debug!("Deleted document: {}/{}", index, id); + Ok(()) + }) + } + + pub fn bulk_delete(&self, index: &str, ids: Vec) -> Result<(), SearchError> { + if ids.is_empty() { + return Ok(()); + } + + let mut body = String::new(); + for id in ids { + let delete_action = json!({"delete": {"_index": index, "_id": id}}); + body.push_str(&serde_json::to_string(&delete_action).unwrap()); + body.push('\n'); + } + + let rt = tokio::runtime::Runtime::new() + .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?; + + rt.block_on(async { + let url = format!("{}/_bulk", self.base_url); + let mut request = self.client.post(&url); + + if let Some(auth) = self.get_auth_header() { + request = request.header("Authorization", auth); + } + + request = request + .header("Content-Type", "application/x-ndjson") + .body(body); + + let response = request.send().await + .map_err(|e| SearchError::Internal(format!("Bulk delete failed: {}", e)))?; + + if !response.status().is_success() { + let error_text = response.text().await.unwrap_or_default(); + return 
+            return Err(SearchError::Internal(format!("Bulk delete failed: {}", error_text)));
+        }
+
+        debug!("Bulk deleted {} documents from {}", ids.len(), index);
+        Ok(())
+    }
+
+    pub fn get_document(&self, index: &str, id: &str) -> Result<Option<Doc>, SearchError> {
+        let path = format!("{}/_doc/{}", index, id);
+
+        match self.request(reqwest::Method::GET, &path, None) {
+            Ok(response) => {
+                let get_response: GetResponse = response
+                    .json()
+                    .map_err(|e| SearchError::Internal(format!("Failed to parse response: {}", e)))?;
+
+                if get_response.found {
+                    if let Some(source) = get_response.source {
+                        Ok(Some(Doc {
+                            id: get_response.id,
+                            content: serde_json::to_string(&source)
+                                .map_err(|e| SearchError::Internal(format!("JSON serialization error: {}", e)))?,
+                        }))
+                    } else {
+                        Ok(None)
+                    }
+                } else {
+                    Ok(None)
+                }
+            }
+            // A missing document surfaces as an HTTP 404; treat it as "not found".
+            Err(SearchError::Internal(msg)) if msg.contains("404") => Ok(None),
+            Err(e) => Err(e),
+        }
+    }
+
+    pub fn search(&self, index: &str, query: Value) -> Result<EsSearchResponse, SearchError> {
+        let path = format!("{}/_search", index);
+
+        let response = self.request(reqwest::Method::POST, &path, Some(query))?;
+        let search_response: EsSearchResponse = response
+            .json()
+            .map_err(|e| SearchError::Internal(format!("Failed to parse search response: {}", e)))?;
+
+        debug!("Search completed in {}ms", search_response.took.unwrap_or(0));
+        Ok(search_response)
+    }
+
+    pub fn search_with_scroll(&self, index: &str, query: Value) -> Result<EsSearchResponse, SearchError> {
+        let path = format!("{}/_search?scroll=1m", index);
+
+        let response = self.request(reqwest::Method::POST, &path, Some(query))?;
+        let search_response: EsSearchResponse = response
+            .json()
+            .map_err(|e| SearchError::Internal(format!("Failed to parse scroll search response: {}", e)))?;
+
+        debug!("Scroll search initiated");
+        Ok(search_response)
+    }
+
+    pub fn scroll(&self, scroll_id: &str) -> Result<EsSearchResponse, SearchError> {
+        let body = json!({
+            "scroll": "1m",
+            "scroll_id": scroll_id
+        });
+
+        let response = self.request(reqwest::Method::POST, "_search/scroll", Some(body))?;
+        let search_response: EsSearchResponse = response
+            .json()
+            .map_err(|e| SearchError::Internal(format!("Failed to parse scroll response: {}", e)))?;
+
+        Ok(search_response)
+    }
+
+    pub fn get_mapping(&self, index: &str) -> Result<Schema, SearchError> {
+        let path = format!("{}/_mapping", index);
+
+        let response = self.request(reqwest::Method::GET, &path, None)?;
+        let mapping: Value = response
+            .json()
+            .map_err(|e| SearchError::Internal(format!("Failed to parse mapping response: {}", e)))?;
+
+        crate::conversions::mapping_to_schema(&mapping)
+    }
+
+    pub fn update_mapping(&self, index: &str, mapping: Value) -> Result<(), SearchError> {
+        let path = format!("{}/_mapping", index);
+
+        self.request(reqwest::Method::PUT, &path, Some(mapping))?;
+        debug!("Updated mapping for index: {}", index);
+        Ok(())
+    }
+}
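
For reference, a minimal sketch of how this client is exercised end to end. This is illustrative only: the real entry points are the WIT exports wired up in `lib.rs` below, the index name and document are made up, and error handling is reduced to `?`:

```rust
use golem_search::golem::search::types::SearchError;
use serde_json::json;

// Hypothetical smoke test for ElasticSearchApi (not part of the component).
fn demo(api: &crate::client::ElasticSearchApi) -> Result<(), SearchError> {
    api.create_index("products")?;
    api.index_document("products", "1", json!({"name": "WASM widget", "price": 9.99}))?;

    // A match_all query returns everything in the index.
    let results = api.search("products", json!({"query": {"match_all": {}}}))?;
    log::debug!("total hits: {}", results.hits.total.value());

    api.delete_index("products")
}
```
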
diff --git a/search/elasticsearch/src/conversions.rs b/search/elasticsearch/src/conversions.rs
new file mode 100644
index 000000000..c44d658d2
--- /dev/null
+++ b/search/elasticsearch/src/conversions.rs
@@ -0,0 +1,298 @@
+use crate::client::{EsDoc, EsHit, EsSearchResponse};
+use golem_search::golem::search::types::{
+    Doc, FieldType, Schema, SchemaField, SearchError, SearchHit, SearchQuery, SearchResults,
+};
+use log::trace;
+use serde_json::{json, Map, Value};
+
+pub fn query_to_es_query(query: &SearchQuery) -> Result<Value, SearchError> {
+    let mut es_query = json!({});
+    let mut bool_query = json!({});
+    let mut must_clauses = Vec::new();
+    let mut filter_clauses = Vec::new();
+
+    if let Some(q) = &query.q {
+        if !q.trim().is_empty() {
+            must_clauses.push(json!({
+                "multi_match": {
+                    "query": q,
+                    "type": "best_fields",
+                    "fields": ["*"],
+                    "fuzziness": "AUTO"
+                }
+            }));
+        }
+    }
+
+    for filter in &query.filters {
+        // Filters that are valid JSON are passed through as raw ES clauses;
+        // everything else is parsed as the `field:op:value` mini-language.
+        if let Ok(filter_value) = serde_json::from_str::<Value>(filter) {
+            filter_clauses.push(filter_value);
+        } else {
+            let parts: Vec<&str> = filter.splitn(3, ':').collect();
+            if parts.len() == 3 {
+                let field = parts[0];
+                let op = parts[1];
+                let value = parts[2];
+
+                let filter_clause = match op {
+                    "eq" => json!({"term": {field: value}}),
+                    "ne" => json!({"bool": {"must_not": {"term": {field: value}}}}),
+                    "gt" => json!({"range": {field: {"gt": value}}}),
+                    "gte" => json!({"range": {field: {"gte": value}}}),
+                    "lt" => json!({"range": {field: {"lt": value}}}),
+                    "lte" => json!({"range": {field: {"lte": value}}}),
+                    "in" => {
+                        let values: Vec<&str> = value.split(',').collect();
+                        json!({"terms": {field: values}})
+                    }
+                    "exists" => json!({"exists": {"field": field}}),
+                    "prefix" => json!({"prefix": {field: value}}),
+                    "wildcard" => json!({"wildcard": {field: value}}),
+                    _ => return Err(SearchError::InvalidQuery(format!("Unknown filter operator: {}", op))),
+                };
+                filter_clauses.push(filter_clause);
+            } else {
+                return Err(SearchError::InvalidQuery(format!("Invalid filter format: {}", filter)));
+            }
+        }
+    }
+
+    if !must_clauses.is_empty() || !filter_clauses.is_empty() {
+        if !must_clauses.is_empty() {
+            bool_query["must"] = json!(must_clauses);
+        }
+        if !filter_clauses.is_empty() {
+            bool_query["filter"] = json!(filter_clauses);
+        }
+        es_query["query"] = json!({"bool": bool_query});
+    } else {
+        es_query["query"] = json!({"match_all": {}});
+    }
+
+    if !query.sort.is_empty() {
+        let mut sort_clauses = Vec::new();
+        for sort_field in &query.sort {
+            // A leading '-' requests descending order.
+            if sort_field.starts_with('-') {
+                let field = &sort_field[1..];
+                sort_clauses.push(json!({field: {"order": "desc"}}));
+            } else {
+                sort_clauses.push(json!({sort_field: {"order": "asc"}}));
+            }
+        }
+        es_query["sort"] = json!(sort_clauses);
+    }
+
+    if let Some(highlight) = &query.highlight {
+        let mut highlight_config = json!({
+            "fields": {}
+        });
+
+        for field in &highlight.fields {
+            highlight_config["fields"][field] = json!({});
+        }
+
+        if let Some(pre_tag) = &highlight.pre_tag {
+            highlight_config["pre_tags"] = json!([pre_tag]);
+        }
+
+        if let Some(post_tag) = &highlight.post_tag {
+            highlight_config["post_tags"] = json!([post_tag]);
+        }
+
+        if let Some(max_length) = highlight.max_length {
+            highlight_config["fragment_size"] = json!(max_length);
+        }
+
+        es_query["highlight"] = highlight_config;
+    }
+
+    let size = query.per_page.unwrap_or(10) as usize;
+    es_query["size"] = json!(size);
+
+    if let Some(page) = query.page {
+        // saturating_sub avoids an underflow panic when page == 0
+        let from = (page.saturating_sub(1) * query.per_page.unwrap_or(10)) as usize;
+        es_query["from"] = json!(from);
+    } else if let Some(offset) = query.offset {
+        es_query["from"] = json!(offset);
+    }
+
+    if !query.facets.is_empty() {
+        let mut aggs = json!({});
+        for facet in &query.facets {
+            aggs[facet] = json!({
+                "terms": {
+                    "field": facet
+                }
+            });
+        }
+        es_query["aggs"] = aggs;
+    }
+
+    if let Some(config) = &query.config {
+        if let Some(timeout_ms) = config.timeout_ms {
+            es_query["timeout"] = json!(format!("{}ms", timeout_ms));
+        }
+
+        if !config.attributes_to_retrieve.is_empty() {
+            es_query["_source"] = json!(config.attributes_to_retrieve);
+        }
+    }
+
+    trace!("Generated ES query: {}", serde_json::to_string_pretty(&es_query).unwrap_or_default());
+    Ok(es_query)
+}
+
+pub fn es_hit_to_search_hit(hit: EsHit) -> SearchHit {
+    let content = hit.source.map(|s| serde_json::to_string(&s).unwrap_or_default());
+
+    let highlights = if let Some(highlight_map) = hit.highlight {
+        let mut highlights = Map::new();
+        for (field, fragments) in highlight_map {
+            highlights.insert(field, json!(fragments));
+        }
+        Some(serde_json::to_string(&highlights).unwrap_or_default())
+    } else {
+        None
+    };
+
+    SearchHit {
+        id: hit.id,
+        score: hit.score,
+        content,
+        highlights,
+    }
+}
+
+pub fn es_response_to_results(response: EsSearchResponse, query: &SearchQuery) -> SearchResults {
+    let hits = response.hits.hits.into_iter().map(es_hit_to_search_hit).collect();
+    let total = Some(response.hits.total.value() as u32);
+
+    let page = query.page;
+    let per_page = query.per_page;
+    let took_ms = response.took.map(|t| t as u32);
+
+    SearchResults {
+        total,
+        page,
+        per_page,
+        hits,
+        facets: None,
+        took_ms,
+    }
+}
+
+pub fn schema_to_mapping(schema: &Schema) -> Value {
+    let mut properties = json!({});
+
+    for field in &schema.fields {
+        let mut field_mapping = json!({});
+
+        match field.type_ {
+            FieldType::Text => {
+                field_mapping["type"] = json!("text");
+                if field.facet {
+                    // Facetable text fields get a keyword sub-field.
+                    field_mapping["fields"] = json!({
+                        "keyword": {
+                            "type": "keyword"
+                        }
+                    });
+                }
+            }
+            FieldType::Keyword => {
+                field_mapping["type"] = json!("keyword");
+            }
+            FieldType::Integer => {
+                field_mapping["type"] = json!("long");
+            }
+            FieldType::Float => {
+                field_mapping["type"] = json!("double");
+            }
+            FieldType::Boolean => {
+                field_mapping["type"] = json!("boolean");
+            }
+            FieldType::Date => {
+                field_mapping["type"] = json!("date");
+            }
+            FieldType::GeoPoint => {
+                field_mapping["type"] = json!("geo_point");
+            }
+        }
+
+        if !field.index {
+            field_mapping["index"] = json!(false);
+        }
+
+        properties[&field.name] = field_mapping;
+    }
+
+    json!({
+        "properties": properties
+    })
+}
+
+pub fn mapping_to_schema(mapping: &Value) -> Result<Schema, SearchError> {
+    let mut fields = Vec::new();
+    let mut primary_key = None;
+
+    if let Some(index_mappings) = mapping.as_object() {
+        for (_index_name, index_mapping) in index_mappings {
+            if let Some(mappings) = index_mapping.get("mappings") {
+                if let Some(properties) = mappings.get("properties") {
+                    if let Some(props) = properties.as_object() {
+                        for (field_name, field_def) in props {
+                            if let Some(field_type_str) = field_def.get("type").and_then(|v| v.as_str()) {
+                                let field_type = match field_type_str {
+                                    "text" => FieldType::Text,
+                                    "keyword" => FieldType::Keyword,
+                                    "long" | "integer" | "short" | "byte" => FieldType::Integer,
+                                    "double" | "float" => FieldType::Float,
+                                    "boolean" => FieldType::Boolean,
+                                    "date" => FieldType::Date,
+                                    "geo_point" => FieldType::GeoPoint,
+                                    _ => FieldType::Text,
+                                };
+
+                                let index = field_def.get("index")
+                                    .and_then(|v| v.as_bool())
+                                    .unwrap_or(true);
+
field_def.get("fields") + .and_then(|v| v.get("keyword")) + .is_some() || field_type == FieldType::Keyword; + + fields.push(SchemaField { + name: field_name.clone(), + type_: field_type, + required: false, + facet, + sort: true, + index, + }); + + if field_name == "_id" || field_name == "id" { + primary_key = Some(field_name.clone()); + } + } + } + } + } + } + } + } + + Ok(Schema { + fields, + primary_key, + }) +} + +pub fn doc_to_es_doc(doc: Doc) -> Result { + let content: Value = serde_json::from_str(&doc.content) + .map_err(|e| SearchError::InvalidQuery(format!("Invalid JSON in document: {}", e)))?; + + Ok(EsDoc { + id: doc.id, + content, + }) +} diff --git a/search/elasticsearch/src/lib.rs b/search/elasticsearch/src/lib.rs new file mode 100644 index 000000000..3ef6e0fa7 --- /dev/null +++ b/search/elasticsearch/src/lib.rs @@ -0,0 +1,297 @@ +use crate::client::ElasticSearchApi; +use crate::conversions::{ + doc_to_es_doc, es_hit_to_search_hit, es_response_to_results, query_to_es_query, + schema_to_mapping, +}; +use golem_search::config::with_config_key; +use golem_search::durability::{DurableSearch, ExtendedGuest}; +use golem_search::golem::search::core::Guest; +use golem_search::golem::search::types::{ + Doc, Schema, SearchError, SearchHit, SearchQuery, SearchResults, +}; +use golem_search::{SearchStream, SearchStreamState, LOGGING_STATE}; +use log::trace; +use std::cell::RefCell; + +mod client; +mod conversions; + +struct ElasticSearchStream { + client: ElasticSearchApi, + index: String, + query: SearchQuery, + scroll_id: RefCell>, + finished: RefCell, + failure: Option, +} + +impl ElasticSearchStream { + fn new(client: ElasticSearchApi, index: String, query: SearchQuery) -> Self { + Self { + client, + index, + query, + scroll_id: RefCell::new(None), + finished: RefCell::new(false), + failure: None, + } + } + + fn failed(error: SearchError) -> Self { + Self { + client: ElasticSearchApi::empty(), + index: String::new(), + query: SearchQuery { + q: None, + filters: vec![], + sort: vec![], + facets: vec![], + page: None, + per_page: None, + offset: None, + highlight: None, + config: None, + }, + scroll_id: RefCell::new(None), + finished: RefCell::new(true), + failure: Some(error), + } + } +} + +impl SearchStreamState for ElasticSearchStream { + fn failure(&self) -> &Option { + &self.failure + } + + fn is_finished(&self) -> bool { + *self.finished.borrow() + } + + fn set_finished(&self) { + *self.finished.borrow_mut() = true; + } + + fn get_next_hits(&self) -> Result, SearchError> { + let scroll_id = self.scroll_id.borrow().clone(); + + if let Some(scroll_id) = scroll_id { + match self.client.scroll(&scroll_id) { + Ok(response) => { + if response.hits.hits.is_empty() { + self.set_finished(); + Ok(vec![]) + } else { + *self.scroll_id.borrow_mut() = response.scroll_id; + Ok(response.hits.hits.into_iter().map(es_hit_to_search_hit).collect()) + } + } + Err(error) => Err(error), + } + } else { + let es_query = query_to_es_query(&self.query)?; + match self.client.search_with_scroll(&self.index, es_query) { + Ok(response) => { + if response.hits.hits.is_empty() { + self.set_finished(); + Ok(vec![]) + } else { + *self.scroll_id.borrow_mut() = response.scroll_id; + Ok(response.hits.hits.into_iter().map(es_hit_to_search_hit).collect()) + } + } + Err(error) => Err(error), + } + } + } +} + +struct ElasticSearchComponent; + +impl ElasticSearchComponent { + const ENDPOINT_VAR: &'static str = "ELASTIC_ENDPOINT"; + const PASSWORD_VAR: &'static str = "ELASTIC_PASSWORD"; + const USERNAME_VAR: &'static str = 
"ELASTIC_USERNAME"; +} + +impl Guest for ElasticSearchComponent { + type SearchHitStream = SearchStream; + + fn create_index(name: String, schema: Option) -> Result<(), SearchError> { + LOGGING_STATE.with_borrow_mut(|state| state.init()); + + with_config_key(Self::ENDPOINT_VAR, Err, |endpoint| { + with_config_key(Self::API_KEY_VAR, Err, |api_key| { + let client = ElasticSearchApi::new(endpoint, api_key); + + if let Some(schema) = schema { + let mapping = schema_to_mapping(&schema); + client.create_index_with_mapping(&name, mapping) + } else { + client.create_index(&name) + } + }) + }) + }) + } + + fn delete_index(name: String) -> Result<(), SearchError> { + LOGGING_STATE.with_borrow_mut(|state| state.init()); + + with_config_key(Self::ENDPOINT_VAR, Err, |endpoint| { + with_config_key(Self::API_KEY_VAR, Err, |api_key| { + let client = ElasticSearchApi::new(endpoint, api_key); + client.delete_index(&name) + }) + }) + }) + } + + fn list_indexes() -> Result, SearchError> { + LOGGING_STATE.with_borrow_mut(|state| state.init()); + + with_config_key(Self::ENDPOINT_VAR, Err, |endpoint| { + with_config_key(Self::API_KEY_VAR, Err, |api_key| { + let client = ElasticSearchApi::new(endpoint, api_key); + client.list_indexes() + }) + }) + }) + } + + fn upsert(index: String, doc: Doc) -> Result<(), SearchError> { + LOGGING_STATE.with_borrow_mut(|state| state.init()); + + with_config_key(Self::ENDPOINT_VAR, Err, |endpoint| { + with_config_key(Self::API_KEY_VAR, Err, |api_key| { + let client = ElasticSearchApi::new(endpoint, api_key); + let es_doc = doc_to_es_doc(doc)?; + client.index_document(&index, &es_doc.id, es_doc.content) + }) + }) + }) + } + + fn upsert_many(index: String, docs: Vec) -> Result<(), SearchError> { + LOGGING_STATE.with_borrow_mut(|state| state.init()); + + with_config_key(Self::ENDPOINT_VAR, Err, |endpoint| { + with_config_key(Self::API_KEY_VAR, Err, |api_key| { + let client = ElasticSearchApi::new(endpoint, api_key); + let es_docs: Result, _> = docs.into_iter().map(doc_to_es_doc).collect(); + client.bulk_index(&index, es_docs?) 
+
+    fn delete(index: String, id: String) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::ENDPOINT_VAR, Err, |endpoint| {
+            with_config_key(Self::USERNAME_VAR, Err, |username| {
+                with_config_key(Self::PASSWORD_VAR, Err, |password| {
+                    let client = ElasticSearchApi::new(endpoint, username, password);
+                    client.delete_document(&index, &id)
+                })
+            })
+        })
+    }
+
+    fn delete_many(index: String, ids: Vec<String>) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::ENDPOINT_VAR, Err, |endpoint| {
+            with_config_key(Self::USERNAME_VAR, Err, |username| {
+                with_config_key(Self::PASSWORD_VAR, Err, |password| {
+                    let client = ElasticSearchApi::new(endpoint, username, password);
+                    client.bulk_delete(&index, ids)
+                })
+            })
+        })
+    }
+
+    fn get(index: String, id: String) -> Result<Option<Doc>, SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::ENDPOINT_VAR, Err, |endpoint| {
+            with_config_key(Self::USERNAME_VAR, Err, |username| {
+                with_config_key(Self::PASSWORD_VAR, Err, |password| {
+                    let client = ElasticSearchApi::new(endpoint, username, password);
+                    client.get_document(&index, &id)
+                })
+            })
+        })
+    }
+
+    fn search(index: String, query: SearchQuery) -> Result<SearchResults, SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::ENDPOINT_VAR, Err, |endpoint| {
+            with_config_key(Self::USERNAME_VAR, Err, |username| {
+                with_config_key(Self::PASSWORD_VAR, Err, |password| {
+                    let client = ElasticSearchApi::new(endpoint, username, password);
+                    let es_query = query_to_es_query(&query)?;
+                    trace!("Executing search query: {:?}", es_query);
+
+                    match client.search(&index, es_query) {
+                        Ok(response) => Ok(es_response_to_results(response, &query)),
+                        Err(error) => Err(error),
+                    }
+                })
+            })
+        })
+    }
+
+    fn stream_search(
+        index: String,
+        query: SearchQuery,
+    ) -> Result<Self::SearchHitStream, SearchError> {
+        Self::unwrapped_stream_search(index, query)
+    }
+
+    fn get_schema(index: String) -> Result<Schema, SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::ENDPOINT_VAR, Err, |endpoint| {
+            with_config_key(Self::USERNAME_VAR, Err, |username| {
+                with_config_key(Self::PASSWORD_VAR, Err, |password| {
+                    let client = ElasticSearchApi::new(endpoint, username, password);
+                    client.get_mapping(&index)
+                })
+            })
+        })
+    }
+
+    fn update_schema(index: String, schema: Schema) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::ENDPOINT_VAR, Err, |endpoint| {
+            with_config_key(Self::USERNAME_VAR, Err, |username| {
+                with_config_key(Self::PASSWORD_VAR, Err, |password| {
+                    let client = ElasticSearchApi::new(endpoint, username, password);
+                    let mapping = schema_to_mapping(&schema);
+                    client.update_mapping(&index, mapping)
+                })
+            })
+        })
+    }
+}
+
+impl ExtendedGuest for ElasticSearchComponent {
+    type SearchHitStream = SearchStream<ElasticSearchStream>;
+
+    fn unwrapped_stream_search(
+        index: String,
+        query: SearchQuery,
+    ) -> Result<SearchStream<ElasticSearchStream>, SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::ENDPOINT_VAR, |error| Ok(SearchStream::new(ElasticSearchStream::failed(error))), |endpoint| {
+            with_config_key(Self::USERNAME_VAR, |error| Ok(SearchStream::new(ElasticSearchStream::failed(error))), |username| {
+                with_config_key(Self::PASSWORD_VAR, |error| Ok(SearchStream::new(ElasticSearchStream::failed(error))), |password| {
+                    let client = ElasticSearchApi::new(endpoint, username, password);
+                    Ok(SearchStream::new(ElasticSearchStream::new(client, index, query)))
+                })
+            })
+        })
+    }
+}
+
+type DurableElasticSearchComponent = DurableSearch<ElasticSearchComponent>;
+
+golem_search::export_search!(DurableElasticSearchComponent with_types_in golem_search);
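
The scroll state machine above is what the `SearchStream` wrapper drives: the first `get_next_hits` call opens a scroll via `search_with_scroll`, later calls continue it, and an empty page marks the end. A sketch of the consuming loop a wrapper is expected to run (`SearchStream` itself lives in the shared `golem-search` crate, so driving `SearchStreamState` directly like this is an assumption):

```rust
use golem_search::golem::search::types::{SearchError, SearchHit};
use golem_search::SearchStreamState;

// Illustrative driver: drain a provider stream state until the scroll is
// exhausted or an error surfaces.
fn drain(state: &impl SearchStreamState) -> Result<Vec<SearchHit>, SearchError> {
    // Streams built with `ElasticSearchStream::failed` report their error here.
    if let Some(error) = state.failure() {
        return Err(error.clone());
    }

    let mut all_hits = Vec::new();
    while !state.is_finished() {
        // First call: search_with_scroll; subsequent calls: scroll.
        all_hits.extend(state.get_next_hits()?);
    }
    Ok(all_hits)
}
```
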
diff --git a/search/elasticsearch/wit/deps/golem-search/golem-search.wit b/search/elasticsearch/wit/deps/golem-search/golem-search.wit
new file mode 100644
index 000000000..64c822dfa
--- /dev/null
+++ b/search/elasticsearch/wit/deps/golem-search/golem-search.wit
@@ -0,0 +1,134 @@
+package golem:search@1.0.0;
+
+/// Core types and error handling for universal search interfaces
interface types {
+    /// Common structured errors for search operations
+    variant search-error {
+        index-not-found,
+        invalid-query(string),
+        unsupported,
+        internal(string),
+        timeout,
+        rate-limited,
+    }
+
+    /// Identifier types
+    type index-name = string;
+    type document-id = string;
+    type json = string;
+
+    /// Document payload
+    record doc {
+        id: document-id,
+        content: json,
+    }
+
+    /// Highlight configuration
+    record highlight-config {
+        fields: list<string>,
+        pre-tag: option<string>,
+        post-tag: option<string>,
+        max-length: option<u32>,
+    }
+
+    /// Advanced search tuning
+    record search-config {
+        timeout-ms: option<u32>,
+        boost-fields: list<tuple<string, f64>>,
+        attributes-to-retrieve: list<string>,
+        language: option<string>,
+        typo-tolerance: option<bool>,
+        exact-match-boost: option<f64>,
+        provider-params: option<json>,
+    }
+
+    /// Search request
+    record search-query {
+        q: option<string>,
+        filters: list<string>,
+        sort: list<string>,
+        facets: list<string>,
+        page: option<u32>,
+        per-page: option<u32>,
+        offset: option<u32>,
+        highlight: option<highlight-config>,
+        config: option<search-config>,
+    }
+
+    /// Search hit
+    record search-hit {
+        id: document-id,
+        score: option<f64>,
+        content: option<json>,
+        highlights: option<json>,
+    }
+
+    /// Search result set
+    record search-results {
+        total: option<u32>,
+        page: option<u32>,
+        per-page: option<u32>,
+        hits: list<search-hit>,
+        facets: option<json>,
+        took-ms: option<u32>,
+    }
+
+    /// Field schema types
+    enum field-type {
+        text,
+        keyword,
+        integer,
+        float,
+        boolean,
+        date,
+        geo-point,
+    }
+
+    /// Field definition
+    record schema-field {
+        name: string,
+        type: field-type,
+        required: bool,
+        facet: bool,
+        sort: bool,
+        index: bool,
+    }
+
+    /// Index schema
+    record schema {
+        fields: list<schema-field>,
+        primary-key: option<string>,
+    }
+}
+
+/// Unified search interface
+interface core {
+    use types.{
+        index-name, document-id, doc, search-query, search-results,
+        search-hit, schema, search-error
+    };
+
+    /// Streaming handle over search hits
+    resource search-hit-stream {
+        get-next: func() -> result<list<search-hit>, search-error>;
+    }
+
+    // Index lifecycle
+    create-index: func(name: index-name, schema: option<schema>) -> result<_, search-error>;
+    delete-index: func(name: index-name) -> result<_, search-error>;
+    list-indexes: func() -> result<list<index-name>, search-error>;
+
+    // Document operations
+    upsert: func(index: index-name, doc: doc) -> result<_, search-error>;
+    upsert-many: func(index: index-name, docs: list<doc>) -> result<_, search-error>;
+    delete: func(index: index-name, id: document-id) -> result<_, search-error>;
+    delete-many: func(index: index-name, ids: list<document-id>) -> result<_, search-error>;
+    get: func(index: index-name, id: document-id) -> result<option<doc>, search-error>;
+
+    // Query
+    search: func(index: index-name, query: search-query) -> result<search-results, search-error>;
+    stream-search: func(index: index-name, query: search-query) -> result<own<search-hit-stream>, search-error>;
+
+    // Schema inspection
+    get-schema: func(index: index-name) -> result<schema, search-error>;
+    update-schema: func(index: index-name, schema: schema) -> result<_, search-error>;
+}
+
+world search-library {
+    export core;
+}
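
From the Rust side, the `search-query` record above surfaces through the generated bindings roughly as follows. A sketch with made-up values; the binding type names such as `HighlightConfig` are the usual wit-bindgen renderings, not something this diff pins down:

```rust
use golem_search::golem::search::types::{HighlightConfig, SearchQuery};

// Illustrative: a full-text query with one filter, a descending sort,
// pagination, and highlighting, mirroring the WIT `search-query` record.
fn example_query() -> SearchQuery {
    SearchQuery {
        q: Some("wireless headphones".to_string()),
        filters: vec!["category:eq:electronics".to_string()],
        sort: vec!["-price".to_string()],
        facets: vec!["brand".to_string()],
        page: Some(1),
        per_page: Some(20),
        offset: None,
        highlight: Some(HighlightConfig {
            fields: vec!["name".to_string()],
            pre_tag: Some("<em>".to_string()),
            post_tag: Some("</em>".to_string()),
            max_length: None,
        }),
        config: None,
    }
}
```
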
diff --git a/search/elasticsearch/wit/deps/wasi:io/error.wit b/search/elasticsearch/wit/deps/wasi:io/error.wit
new file mode 100644
index 000000000..97c606877
--- /dev/null
+++ b/search/elasticsearch/wit/deps/wasi:io/error.wit
@@ -0,0 +1,34 @@
+package wasi:io@0.2.3;
+
+@since(version = 0.2.0)
+interface error {
+    /// A resource which represents some error information.
+    ///
+    /// The only method provided by this resource is `to-debug-string`,
+    /// which provides some human-readable information about the error.
+    ///
+    /// In the `wasi:io` package, this resource is returned through the
+    /// `wasi:io/streams/stream-error` type.
+    ///
+    /// To provide more specific error information, other interfaces may
+    /// offer functions to "downcast" this error into more specific types. For example,
+    /// errors returned from streams derived from filesystem types can be described using
+    /// the filesystem's own error-code type. This is done using the function
+    /// `wasi:filesystem/types/filesystem-error-code`, which takes a `borrow<error>`
+    /// parameter and returns an `option<filesystem-error-code>`.
+    ///
+    /// The set of functions which can "downcast" an `error` into a more
+    /// concrete type is open.
+    @since(version = 0.2.0)
+    resource error {
+        /// Returns a string that is suitable to assist humans in debugging
+        /// this error.
+        ///
+        /// WARNING: The returned string should not be consumed mechanically!
+        /// It may change across platforms, hosts, or other implementation
+        /// details. Parsing this string is a major platform-compatibility
+        /// hazard.
+        @since(version = 0.2.0)
+        to-debug-string: func() -> string;
+    }
+}
diff --git a/search/elasticsearch/wit/deps/wasi:io/poll.wit b/search/elasticsearch/wit/deps/wasi:io/poll.wit
new file mode 100644
index 000000000..9bcbe8e03
--- /dev/null
+++ b/search/elasticsearch/wit/deps/wasi:io/poll.wit
@@ -0,0 +1,47 @@
+package wasi:io@0.2.3;
+
+/// A poll API intended to let users wait for I/O events on multiple handles
+/// at once.
+@since(version = 0.2.0)
+interface poll {
+    /// `pollable` represents a single I/O event which may be ready, or not.
+    @since(version = 0.2.0)
+    resource pollable {
+
+        /// Return the readiness of a pollable. This function never blocks.
+        ///
+        /// Returns `true` when the pollable is ready, and `false` otherwise.
+        @since(version = 0.2.0)
+        ready: func() -> bool;
+
+        /// `block` returns immediately if the pollable is ready, and otherwise
+        /// blocks until ready.
+        ///
+        /// This function is equivalent to calling `poll.poll` on a list
+        /// containing only this pollable.
+        @since(version = 0.2.0)
+        block: func();
+    }
+
+    /// Poll for completion on a set of pollables.
+    ///
+    /// This function takes a list of pollables, which identify I/O sources of
+    /// interest, and waits until one or more of the events is ready for I/O.
+    ///
+    /// The result `list<u32>` contains one or more indices of handles in the
+    /// argument list that are ready for I/O.
+    ///
+    /// This function traps if either:
+    /// - the list is empty, or:
+    /// - the list contains more elements than can be indexed with a `u32` value.
+    ///
+    /// A timeout can be implemented by adding a pollable from the
+    /// wasi-clocks API to the list.
+    ///
+    /// This function does not return a `result`; polling in itself does not
+    /// do any I/O so it doesn't fail. If any of the I/O sources identified by
+    /// the pollables has an error, it is indicated by marking the source as
+    /// being ready for I/O.
+    @since(version = 0.2.0)
+    poll: func(in: list<borrow<pollable>>) -> list<u32>;
+}
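
As a sketch of how `poll` is typically driven from Rust guest code (wit-bindgen-style binding paths are an assumption here; the stream pollables themselves come from the `streams` interface vendored next):

```rust
// Illustrative: wait on two pollables at once; `poll` returns the indices
// of the entries that are ready.
use crate::bindings::wasi::io::poll::{poll, Pollable};

fn wait_for_either(a: &Pollable, b: &Pollable) {
    // `poll` traps on an empty list, so always pass at least one pollable.
    let ready = poll(&[a, b]);
    for index in ready {
        match index {
            0 => { /* `a` is ready */ }
            1 => { /* `b` is ready */ }
            _ => unreachable!(),
        }
    }
}
```
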
diff --git a/search/elasticsearch/wit/deps/wasi:io/streams.wit b/search/elasticsearch/wit/deps/wasi:io/streams.wit
new file mode 100644
index 000000000..0de084629
--- /dev/null
+++ b/search/elasticsearch/wit/deps/wasi:io/streams.wit
@@ -0,0 +1,290 @@
+package wasi:io@0.2.3;
+
+/// WASI I/O is an I/O abstraction API which is currently focused on providing
+/// stream types.
+///
+/// In the future, the component model is expected to add built-in stream types;
+/// when it does, they are expected to subsume this API.
+@since(version = 0.2.0)
+interface streams {
+    @since(version = 0.2.0)
+    use error.{error};
+    @since(version = 0.2.0)
+    use poll.{pollable};
+
+    /// An error for input-stream and output-stream operations.
+    @since(version = 0.2.0)
+    variant stream-error {
+        /// The last operation (a write or flush) failed before completion.
+        ///
+        /// More information is available in the `error` payload.
+        ///
+        /// After this, the stream will be closed. All future operations return
+        /// `stream-error::closed`.
+        last-operation-failed(error),
+        /// The stream is closed: no more input will be accepted by the
+        /// stream. A closed output-stream will return this error on all
+        /// future operations.
+        closed
+    }
+
+    /// An input bytestream.
+    ///
+    /// `input-stream`s are *non-blocking* to the extent practical on underlying
+    /// platforms. I/O operations always return promptly; if fewer bytes are
+    /// promptly available than requested, they return the number of bytes promptly
+    /// available, which could even be zero. To wait for data to be available,
+    /// use the `subscribe` function to obtain a `pollable` which can be polled
+    /// for using `wasi:io/poll`.
+    @since(version = 0.2.0)
+    resource input-stream {
+        /// Perform a non-blocking read from the stream.
+        ///
+        /// When the source of a `read` is binary data, the bytes from the source
+        /// are returned verbatim. When the source of a `read` is known to the
+        /// implementation to be text, bytes containing the UTF-8 encoding of the
+        /// text are returned.
+        ///
+        /// This function returns a list of bytes containing the read data,
+        /// when successful. The returned list will contain up to `len` bytes;
+        /// it may return fewer than requested, but not more. The list is
+        /// empty when no bytes are available for reading at this time. The
+        /// pollable given by `subscribe` will be ready when more bytes are
+        /// available.
+        ///
+        /// This function fails with a `stream-error` when the operation
+        /// encounters an error, giving `last-operation-failed`, or when the
+        /// stream is closed, giving `closed`.
+        ///
+        /// When the caller gives a `len` of 0, it represents a request to
+        /// read 0 bytes. If the stream is still open, this call should
+        /// succeed and return an empty list, or otherwise fail with `closed`.
+        ///
+        /// The `len` parameter is a `u64`, which could represent a list of u8 which
+        /// is not possible to allocate in wasm32, or not desirable to allocate
+        /// as a return value by the callee. The callee may return a list of bytes
+        /// less than `len` in size while more bytes are available for reading.
+        @since(version = 0.2.0)
+        read: func(
+            /// The maximum number of bytes to read
+            len: u64
+        ) -> result<list<u8>, stream-error>;
+
+        /// Read bytes from a stream, after blocking until at least one byte can
+        /// be read. Except for blocking, behavior is identical to `read`.
+        @since(version = 0.2.0)
+        blocking-read: func(
+            /// The maximum number of bytes to read
+            len: u64
+        ) -> result<list<u8>, stream-error>;
+
+        /// Skip bytes from a stream. Returns number of bytes skipped.
+        ///
+        /// Behaves identical to `read`, except instead of returning a list
+        /// of bytes, returns the number of bytes consumed from the stream.
+        @since(version = 0.2.0)
+        skip: func(
+            /// The maximum number of bytes to skip.
+            len: u64,
+        ) -> result<u64, stream-error>;
+
+        /// Skip bytes from a stream, after blocking until at least one byte
+        /// can be skipped. Except for blocking behavior, identical to `skip`.
+        @since(version = 0.2.0)
+        blocking-skip: func(
+            /// The maximum number of bytes to skip.
+            len: u64,
+        ) -> result<u64, stream-error>;
+
+        /// Create a `pollable` which will resolve once either the specified stream
+        /// has bytes available to read or the other end of the stream has been
+        /// closed.
+        /// The created `pollable` is a child resource of the `input-stream`.
+        /// Implementations may trap if the `input-stream` is dropped before
+        /// all derived `pollable`s created with this function are dropped.
+        @since(version = 0.2.0)
+        subscribe: func() -> pollable;
+    }
+
+
+    /// An output bytestream.
+    ///
+    /// `output-stream`s are *non-blocking* to the extent practical on
+    /// underlying platforms. Except where specified otherwise, I/O operations also
+    /// always return promptly, after the number of bytes that can be written
+    /// promptly, which could even be zero. To wait for the stream to be ready to
+    /// accept data, use the `subscribe` function to obtain a `pollable` which can
+    /// be polled for using `wasi:io/poll`.
+    ///
+    /// Dropping an `output-stream` while there's still an active write in
+    /// progress may result in the data being lost. Before dropping the stream,
+    /// be sure to fully flush your writes.
+    @since(version = 0.2.0)
+    resource output-stream {
+        /// Check readiness for writing. This function never blocks.
+        ///
+        /// Returns the number of bytes permitted for the next call to `write`,
+        /// or an error. Calling `write` with more bytes than this function has
+        /// permitted will trap.
+        ///
+        /// When this function returns 0 bytes, the `subscribe` pollable will
+        /// become ready when this function will report at least 1 byte, or an
+        /// error.
+        @since(version = 0.2.0)
+        check-write: func() -> result<u64, stream-error>;
+
+        /// Perform a write. This function never blocks.
+        ///
+        /// When the destination of a `write` is binary data, the bytes from
+        /// `contents` are written verbatim. When the destination of a `write` is
+        /// known to the implementation to be text, the bytes of `contents` are
+        /// transcoded from UTF-8 into the encoding of the destination and then
+        /// written.
+        ///
+        /// Precondition: check-write gave permit of Ok(n) and contents has a
+        /// length of less than or equal to n. Otherwise, this function will trap.
+        ///
+        /// returns Err(closed) without writing if the stream has closed since
+        /// the last call to check-write provided a permit.
+        @since(version = 0.2.0)
+        write: func(
+            contents: list<u8>
+        ) -> result<_, stream-error>;
+
+        /// Perform a write of up to 4096 bytes, and then flush the stream. Block
+        /// until all of these operations are complete, or an error occurs.
+        ///
+        /// This is a convenience wrapper around the use of `check-write`,
+        /// `subscribe`, `write`, and `flush`, and is implemented with the
+        /// following pseudo-code:
+        ///
+        /// ```text
+        /// let pollable = this.subscribe();
+        /// while !contents.is_empty() {
+        ///     // Wait for the stream to become writable
+        ///     pollable.block();
+        ///     let Ok(n) = this.check-write(); // eliding error handling
+        ///     let len = min(n, contents.len());
+        ///     let (chunk, rest) = contents.split_at(len);
+        ///     this.write(chunk); // eliding error handling
+        ///     contents = rest;
+        /// }
+        /// this.flush();
+        /// // Wait for completion of `flush`
+        /// pollable.block();
+        /// // Check for any errors that arose during `flush`
+        /// let _ = this.check-write(); // eliding error handling
+        /// ```
+        @since(version = 0.2.0)
+        blocking-write-and-flush: func(
+            contents: list<u8>
+        ) -> result<_, stream-error>;
+
+        /// Request to flush buffered output. This function never blocks.
+        ///
+        /// This tells the output-stream that the caller intends any buffered
+        /// output to be flushed. The output which is expected to be flushed
+        /// is all that has been passed to `write` prior to this call.
+        ///
+        /// Upon calling this function, the `output-stream` will not accept any
+        /// writes (`check-write` will return `ok(0)`) until the flush has
+        /// completed. The `subscribe` pollable will become ready when the
+        /// flush has completed and the stream can accept more writes.
+        @since(version = 0.2.0)
+        flush: func() -> result<_, stream-error>;
+
+        /// Request to flush buffered output, and block until flush completes
+        /// and stream is ready for writing again.
+        @since(version = 0.2.0)
+        blocking-flush: func() -> result<_, stream-error>;
+
+        /// Create a `pollable` which will resolve once the output-stream
+        /// is ready for more writing, or an error has occurred. When this
+        /// pollable is ready, `check-write` will return `ok(n)` with n>0, or an
+        /// error.
+        ///
+        /// If the stream is closed, this pollable is always ready immediately.
+        ///
+        /// The created `pollable` is a child resource of the `output-stream`.
+        /// Implementations may trap if the `output-stream` is dropped before
+        /// all derived `pollable`s created with this function are dropped.
+        @since(version = 0.2.0)
+        subscribe: func() -> pollable;
+
+        /// Write zeroes to a stream.
+        ///
+        /// This should be used precisely like `write` with the exact same
+        /// preconditions (must use check-write first), but instead of
+        /// passing a list of bytes, you simply pass the number of zero-bytes
+        /// that should be written.
+        @since(version = 0.2.0)
+        write-zeroes: func(
+            /// The number of zero-bytes to write
+            len: u64
+        ) -> result<_, stream-error>;
+
+        /// Perform a write of up to 4096 zeroes, and then flush the stream.
+        /// Block until all of these operations are complete, or an error
+        /// occurs.
+        ///
+        /// This is a convenience wrapper around the use of `check-write`,
+        /// `subscribe`, `write-zeroes`, and `flush`, and is implemented with
+        /// the following pseudo-code:
+        ///
+        /// ```text
+        /// let pollable = this.subscribe();
+        /// while num_zeroes != 0 {
+        ///     // Wait for the stream to become writable
+        ///     pollable.block();
+        ///     let Ok(n) = this.check-write(); // eliding error handling
+        ///     let len = min(n, num_zeroes);
+        ///     this.write-zeroes(len); // eliding error handling
+        ///     num_zeroes -= len;
+        /// }
+        /// this.flush();
+        /// // Wait for completion of `flush`
+        /// pollable.block();
+        /// // Check for any errors that arose during `flush`
+        /// let _ = this.check-write(); // eliding error handling
+        /// ```
+        @since(version = 0.2.0)
+        blocking-write-zeroes-and-flush: func(
+            /// The number of zero-bytes to write
+            len: u64
+        ) -> result<_, stream-error>;
+
+        /// Read from one stream and write to another.
+        ///
+        /// The behavior of splice is equivalent to:
+        /// 1. calling `check-write` on the `output-stream`
+        /// 2. calling `read` on the `input-stream` with the smaller of the
+        /// `check-write` permitted length and the `len` provided to `splice`
+        /// 3. calling `write` on the `output-stream` with that read data.
+        ///
+        /// Any error reported by the call to `check-write`, `read`, or
+        /// `write` ends the splice and reports that error.
+        ///
+        /// This function returns the number of bytes transferred; it may be less
+        /// than `len`.
+        @since(version = 0.2.0)
+        splice: func(
+            /// The stream to read from
+            src: borrow<input-stream>,
+            /// The number of bytes to splice
+            len: u64,
+        ) -> result<u64, stream-error>;
+
+        /// Read from one stream and write to another, with blocking.
+        ///
+        /// This is similar to `splice`, except that it blocks until the
+        /// `output-stream` is ready for writing, and the `input-stream`
+        /// is ready for reading, before performing the `splice`.
+        @since(version = 0.2.0)
+        blocking-splice: func(
+            /// The stream to read from
+            src: borrow<input-stream>,
+            /// The number of bytes to splice
+            len: u64,
+        ) -> result<u64, stream-error>;
+    }
+}
diff --git a/search/elasticsearch/wit/deps/wasi:io/world.wit b/search/elasticsearch/wit/deps/wasi:io/world.wit
new file mode 100644
index 000000000..f1d2102dc
--- /dev/null
+++ b/search/elasticsearch/wit/deps/wasi:io/world.wit
@@ -0,0 +1,10 @@
+package wasi:io@0.2.3;
+
+@since(version = 0.2.0)
+world imports {
+    @since(version = 0.2.0)
+    import streams;
+
+    @since(version = 0.2.0)
+    import poll;
+}
diff --git a/search/elasticsearch/wit/elasticsearch.wit b/search/elasticsearch/wit/elasticsearch.wit
new file mode 100644
index 000000000..843fa3a61
--- /dev/null
+++ b/search/elasticsearch/wit/elasticsearch.wit
@@ -0,0 +1,6 @@
+package golem:search-elasticsearch;
+
+world elasticsearch-provider {
+    import golem:search/core@1.0.0;
+    export golem:search/core@1.0.0;
+}
diff --git a/search/meilisearch/Cargo.toml b/search/meilisearch/Cargo.toml
new file mode 100644
index 000000000..8bdefb768
--- /dev/null
+++ b/search/meilisearch/Cargo.toml
@@ -0,0 +1,41 @@
+[package]
+name = "golem-search-meilisearch"
+version = "0.0.0"
+edition = "2021"
+license = "Apache-2.0"
+homepage = "https://golem.cloud"
+repository = "https://github.com/golemcloud/golem-llm"
+description = "WebAssembly component for working with Meilisearch APIs, with special support for Golem Cloud"
+
+[lib]
+path = "src/lib.rs"
+crate-type = ["cdylib"]
+
+[features]
+default = ["durability"]
+durability = ["golem-rust/durability", "golem-search/durability"]
+
+[dependencies]
+golem-search = { workspace = true }
+golem-rust = { workspace = true }
+log = { workspace = true }
+reqwest = { workspace = true, features = ["json", "blocking"] }
+serde = { workspace = true, features = ["derive"] }
+serde_json = { workspace = true }
+wit-bindgen-rt = { workspace = true }
+
+[package.metadata.component]
+package = "golem:search-meilisearch"
+
+[package.metadata.component.bindings]
+generate_unused_types = true
+
+[package.metadata.component.bindings.with]
+"golem:search/core@1.0.0" = "golem_search::golem::search::core"
+
+[package.metadata.component.target]
+path = "wit"
+
+[package.metadata.component.target.dependencies]
+"golem:search" = { path = "wit/deps/golem-search" }
+"wasi:io" = { path = "wit/deps/wasi:io" }
diff --git a/search/meilisearch/src/client.rs b/search/meilisearch/src/client.rs
new file mode 100644
index 000000000..ae722c412
--- /dev/null
+++ b/search/meilisearch/src/client.rs
@@ -0,0 +1,457 @@
+use golem_search::golem::search::types::{Doc, Schema, SearchError};
+use log::{debug, trace};
+use reqwest::{Client, Response};
+use serde::{Deserialize, Serialize};
+use serde_json::{json, Value};
+use std::collections::HashMap;
+use std::time::Duration;
+
+#[derive(Debug, Clone)]
+pub struct MeilisearchApi {
+    client: Client,
+    base_url: String,
+    api_key: String,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct MeilisearchError {
+    pub message: String,
+    pub code: String,
+    #[serde(rename = "type")]
+    pub error_type: String,
+    pub link: String,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct MeilisearchSearchResponse {
+    pub hits: Vec<MeilisearchHit>,
+    #[serde(rename = "processingTimeMs")]
+    pub processing_time_ms: u64,
+    pub query: String,
+    pub limit: u32,
+    pub offset: u32,
+    #[serde(rename = "estimatedTotalHits")]
+    pub estimated_total_hits: Option<u64>,
+    #[serde(rename = "facetDistribution")]
+    pub facet_distribution: Option<HashMap<String, HashMap<String, u64>>>,
+}
+
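+// Illustrative response body for `MeilisearchSearchResponse` above (field
+// names follow the Meilisearch search API; the values are made up):
+//
+//   {
+//     "hits": [{"id": "1", "title": "WASM widget", "_formatted": {"title": "<em>WASM</em> widget"}}],
+//     "query": "wasm",
+//     "processingTimeMs": 2,
+//     "limit": 20,
+//     "offset": 0,
+//     "estimatedTotalHits": 42,
+//     "facetDistribution": {"brand": {"acme": 12}}
+//   }
+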
+#[derive(Debug, Deserialize)]
+pub struct MeilisearchHit {
+    #[serde(flatten)]
+    pub source: Value,
+    #[serde(rename = "_formatted")]
+    pub formatted: Option<Value>,
+}
+
+#[derive(Debug, Serialize)]
+pub struct MeilisearchSearchQuery {
+    pub q: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub offset: Option<u32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub limit: Option<u32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub filter: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub sort: Option<Vec<String>>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub facets: Option<Vec<String>>,
+    #[serde(rename = "attributesToHighlight", skip_serializing_if = "Option::is_none")]
+    pub attributes_to_highlight: Option<Vec<String>>,
+    #[serde(rename = "highlightPreTag", skip_serializing_if = "Option::is_none")]
+    pub highlight_pre_tag: Option<String>,
+    #[serde(rename = "highlightPostTag", skip_serializing_if = "Option::is_none")]
+    pub highlight_post_tag: Option<String>,
+}
+
+#[derive(Debug, Serialize)]
+pub struct MeilisearchDoc {
+    pub id: String,
+    #[serde(flatten)]
+    pub content: Value,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct MeilisearchTask {
+    #[serde(rename = "taskUid")]
+    pub task_uid: u64,
+    #[serde(rename = "indexUid")]
+    pub index_uid: String,
+    pub status: String,
+    #[serde(rename = "type")]
+    pub task_type: String,
+    #[serde(rename = "enqueuedAt")]
+    pub enqueued_at: String,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct MeilisearchIndexes {
+    pub results: Vec<MeilisearchIndex>,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct MeilisearchIndex {
+    pub uid: String,
+    #[serde(rename = "primaryKey")]
+    pub primary_key: Option<String>,
+    #[serde(rename = "createdAt")]
+    pub created_at: String,
+    #[serde(rename = "updatedAt")]
+    pub updated_at: String,
+}
+
+// Serialized for updates, deserialized when settings are fetched back in
+// `get_settings`.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct MeilisearchIndexSettings {
+    #[serde(rename = "searchableAttributes", skip_serializing_if = "Option::is_none")]
+    pub searchable_attributes: Option<Vec<String>>,
+    #[serde(rename = "filterableAttributes", skip_serializing_if = "Option::is_none")]
+    pub filterable_attributes: Option<Vec<String>>,
+    #[serde(rename = "sortableAttributes", skip_serializing_if = "Option::is_none")]
+    pub sortable_attributes: Option<Vec<String>>,
+    #[serde(rename = "rankingRules", skip_serializing_if = "Option::is_none")]
+    pub ranking_rules: Option<Vec<String>>,
+}
+
+impl MeilisearchApi {
+    pub fn new(host: String, api_key: String) -> Self {
+        let client = Client::builder()
+            .timeout(Duration::from_secs(30))
+            .build()
+            .expect("Failed to create HTTP client");
+
+        Self {
+            client,
+            base_url: host.trim_end_matches('/').to_string(),
+            api_key,
+        }
+    }
+
+    pub fn empty() -> Self {
+        Self {
+            client: Client::new(),
+            base_url: String::new(),
+            api_key: String::new(),
+        }
+    }
+
+    // Map Meilisearch errors onto the `search-error` variants defined in the
+    // golem:search WIT.
+    fn handle_error(&self, response: Response) -> SearchError {
+        let status = response.status();
+        let status_code = status.as_u16();
+
+        if status_code == 429 {
+            return SearchError::RateLimited;
+        }
+
+        match response.text() {
+            Ok(body) => {
+                if let Ok(error) = serde_json::from_str::<MeilisearchError>(&body) {
+                    match error.code.as_str() {
+                        "index_not_found" => SearchError::IndexNotFound,
+                        _ => SearchError::Internal(format!("HTTP {}: {}: {}", status_code, error.code, error.message)),
+                    }
+                } else {
+                    SearchError::Internal(format!("HTTP {}: {}", status_code, body))
+                }
+            }
+            Err(_) => SearchError::Internal(format!(
+                "HTTP {}: {}",
+                status_code,
+                status.canonical_reason().unwrap_or("Unknown error")
+            )),
+        }
+    }
+
+    pub fn create_index(&self, name: &str) -> Result<(), SearchError> {
+        debug!("Creating Meilisearch index: {}", name);
+
format!("{}/indexes", self.base_url); + let payload = json!({ + "uid": name, + "primaryKey": "id" + }); + + let response = self + .client + .post(&url) + .header("Authorization", format!("Bearer {}", self.api_key)) + .header("Content-Type", "application/json") + .json(&payload) + .send() + .map_err(|e| SearchError::ConnectionError { + message: e.to_string(), + })?; + + if response.status().is_success() { + trace!("Successfully created index: {}", name); + Ok(()) + } else { + Err(self.handle_error(response)) + } + } + + pub fn create_index_with_settings(&self, name: &str, settings: MeilisearchIndexSettings) -> Result<(), SearchError> { + self.create_index(name)?; + self.update_settings(name, settings)?; + Ok(()) + } + + pub fn delete_index(&self, name: &str) -> Result<(), SearchError> { + debug!("Deleting Meilisearch index: {}", name); + + let url = format!("{}/indexes/{}", self.base_url, name); + + let response = self + .client + .delete(&url) + .header("Authorization", format!("Bearer {}", self.api_key)) + .send() + .map_err(|e| SearchError::ConnectionError { + message: e.to_string(), + })?; + + if response.status().is_success() { + trace!("Successfully deleted index: {}", name); + Ok(()) + } else { + Err(self.handle_error(response)) + } + } + + pub fn list_indexes(&self) -> Result, SearchError> { + debug!("Listing Meilisearch indexes"); + + let url = format!("{}/indexes", self.base_url); + + let response = self + .client + .get(&url) + .header("Authorization", format!("Bearer {}", self.api_key)) + .send() + .map_err(|e| SearchError::ConnectionError { + message: e.to_string(), + })?; + + if response.status().is_success() { + let indexes: MeilisearchIndexes = response.json().map_err(|e| SearchError::ParseError { + message: e.to_string(), + })?; + + Ok(indexes.results.into_iter().map(|idx| idx.uid).collect()) + } else { + Err(self.handle_error(response)) + } + } + + pub fn index_document(&self, index: &str, doc_id: &str, content: Value) -> Result<(), SearchError> { + debug!("Indexing document {} in index {}", doc_id, index); + + let url = format!("{}/indexes/{}/documents", self.base_url, index); + let mut doc = content; + if let Value::Object(ref mut map) = doc { + map.insert("id".to_string(), Value::String(doc_id.to_string())); + } + + let response = self + .client + .post(&url) + .header("Authorization", format!("Bearer {}", self.api_key)) + .header("Content-Type", "application/json") + .json(&[doc]) + .send() + .map_err(|e| SearchError::ConnectionError { + message: e.to_string(), + })?; + + if response.status().is_success() { + trace!("Successfully indexed document: {}", doc_id); + Ok(()) + } else { + Err(self.handle_error(response)) + } + } + + pub fn bulk_index(&self, index: &str, docs: Vec) -> Result<(), SearchError> { + debug!("Bulk indexing {} documents in index {}", docs.len(), index); + + let url = format!("{}/indexes/{}/documents", self.base_url, index); + + let response = self + .client + .post(&url) + .header("Authorization", format!("Bearer {}", self.api_key)) + .header("Content-Type", "application/json") + .json(&docs) + .send() + .map_err(|e| SearchError::ConnectionError { + message: e.to_string(), + })?; + + if response.status().is_success() { + trace!("Successfully bulk indexed {} documents", docs.len()); + Ok(()) + } else { + Err(self.handle_error(response)) + } + } + + pub fn delete_document(&self, index: &str, doc_id: &str) -> Result<(), SearchError> { + debug!("Deleting document {} from index {}", doc_id, index); + + let url = format!("{}/indexes/{}/documents/{}", 
+
+    pub fn delete_document(&self, index: &str, doc_id: &str) -> Result<(), SearchError> {
+        debug!("Deleting document {} from index {}", doc_id, index);
+
+        let url = format!("{}/indexes/{}/documents/{}", self.base_url, index, doc_id);
+
+        let response = self
+            .client
+            .delete(&url)
+            .header("Authorization", format!("Bearer {}", self.api_key))
+            .send()
+            .map_err(|e| SearchError::Internal(format!("Request failed: {}", e)))?;
+
+        if response.status().is_success() {
+            trace!("Successfully deleted document: {}", doc_id);
+            Ok(())
+        } else {
+            Err(self.handle_error(response))
+        }
+    }
+
+    pub fn bulk_delete(&self, index: &str, ids: Vec<String>) -> Result<(), SearchError> {
+        debug!("Bulk deleting {} documents from index {}", ids.len(), index);
+
+        let url = format!("{}/indexes/{}/documents/delete-batch", self.base_url, index);
+
+        let response = self
+            .client
+            .post(&url)
+            .header("Authorization", format!("Bearer {}", self.api_key))
+            .header("Content-Type", "application/json")
+            .json(&ids)
+            .send()
+            .map_err(|e| SearchError::Internal(format!("Request failed: {}", e)))?;
+
+        if response.status().is_success() {
+            trace!("Successfully bulk deleted {} documents", ids.len());
+            Ok(())
+        } else {
+            Err(self.handle_error(response))
+        }
+    }
+
+    pub fn get_document(&self, index: &str, doc_id: &str) -> Result<Option<Doc>, SearchError> {
+        debug!("Getting document {} from index {}", doc_id, index);
+
+        let url = format!("{}/indexes/{}/documents/{}", self.base_url, index, doc_id);
+
+        let response = self
+            .client
+            .get(&url)
+            .header("Authorization", format!("Bearer {}", self.api_key))
+            .send()
+            .map_err(|e| SearchError::Internal(format!("Request failed: {}", e)))?;
+
+        if response.status().is_success() {
+            let doc: Value = response
+                .json()
+                .map_err(|e| SearchError::Internal(format!("Failed to parse response: {}", e)))?;
+
+            if let Some(id) = doc.get("id").and_then(|v| v.as_str()) {
+                // `doc.content` is a JSON string in the WIT interface
+                let content = serde_json::to_string(&doc)
+                    .map_err(|e| SearchError::Internal(format!("JSON serialization error: {}", e)))?;
+                Ok(Some(Doc {
+                    id: id.to_string(),
+                    content,
+                }))
+            } else {
+                Err(SearchError::Internal(
+                    "Document missing id field".to_string(),
+                ))
+            }
+        } else if response.status().as_u16() == 404 {
+            Ok(None)
+        } else {
+            Err(self.handle_error(response))
+        }
+    }
+
+    pub fn search(&self, index: &str, query: MeilisearchSearchQuery) -> Result<MeilisearchSearchResponse, SearchError> {
+        debug!("Searching index {} with query: {:?}", index, query);
+
+        let url = format!("{}/indexes/{}/search", self.base_url, index);
+
+        let response = self
+            .client
+            .post(&url)
+            .header("Authorization", format!("Bearer {}", self.api_key))
+            .header("Content-Type", "application/json")
+            .json(&query)
+            .send()
+            .map_err(|e| SearchError::Internal(format!("Request failed: {}", e)))?;
+
+        if response.status().is_success() {
+            let search_response: MeilisearchSearchResponse = response
+                .json()
+                .map_err(|e| SearchError::Internal(format!("Failed to parse response: {}", e)))?;
+
+            trace!(
+                "Search completed in {}ms, found {} hits",
+                search_response.processing_time_ms,
+                search_response.hits.len()
+            );
+
+            Ok(search_response)
+        } else {
+            Err(self.handle_error(response))
+        }
+    }
+
+    pub fn get_settings(&self, index: &str) -> Result<MeilisearchIndexSettings, SearchError> {
+        debug!("Getting settings for index {}", index);
+
+        let url = format!("{}/indexes/{}/settings", self.base_url, index);
+
+        let response = self
+            .client
+            .get(&url)
+            .header("Authorization", format!("Bearer {}", self.api_key))
+            .send()
+            .map_err(|e| SearchError::Internal(format!("Request failed: {}", e)))?;
+
+        if response.status().is_success() {
+            let settings: MeilisearchIndexSettings = response
+                .json()
+                .map_err(|e| SearchError::Internal(format!("Failed to parse response: {}", e)))?;
+
+            Ok(settings)
+        } else {
+            Err(self.handle_error(response))
+        }
+    }
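+
+    // Illustrative: the settings payload exchanged by `get_settings` and
+    // `update_settings` is the subset of Meilisearch index settings this
+    // provider manages, e.g.
+    //   {"searchableAttributes":["title"],"filterableAttributes":["year"],
+    //    "sortableAttributes":["year"],"rankingRules":["words","typo", ...]}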
debug!("Updating settings for index {}", index); + + let url = format!("{}/indexes/{}/settings", self.base_url, index); + + let response = self + .client + .patch(&url) + .header("Authorization", format!("Bearer {}", self.api_key)) + .header("Content-Type", "application/json") + .json(&settings) + .send() + .map_err(|e| SearchError::ConnectionError { + message: e.to_string(), + })?; + + if response.status().is_success() { + trace!("Successfully updated settings for index: {}", index); + Ok(()) + } else { + Err(self.handle_error(response)) + } + } +} diff --git a/search/meilisearch/src/conversions.rs b/search/meilisearch/src/conversions.rs new file mode 100644 index 000000000..ac2f21698 --- /dev/null +++ b/search/meilisearch/src/conversions.rs @@ -0,0 +1,511 @@ +use crate::client::{MeilisearchDoc, MeilisearchHit, MeilisearchSearchQuery, MeilisearchSearchResponse, MeilisearchIndexSettings}; +use golem_search::golem::search::types::{ + Doc, FieldType, Schema, SchemaField, SearchError, SearchHit, SearchQuery, SearchResults, +}; +use log::trace; +use serde_json::{json, Map, Value}; +use std::collections::HashMap; + +pub fn query_to_meilisearch_query(query: &SearchQuery) -> Result { + let mut meilisearch_query = MeilisearchSearchQuery { + q: query.q.clone(), + offset: None, + limit: None, + filter: None, + sort: None, + facets: None, + attributes_to_highlight: None, + highlight_pre_tag: None, + highlight_post_tag: None, + }; + + // Handle pagination + if let Some(page) = query.page { + let per_page = query.per_page.unwrap_or(20); + meilisearch_query.offset = Some((page.saturating_sub(1)) * per_page); + meilisearch_query.limit = Some(per_page); + } else if let Some(offset) = query.offset { + meilisearch_query.offset = Some(offset); + if let Some(per_page) = query.per_page { + meilisearch_query.limit = Some(per_page); + } + } else if let Some(per_page) = query.per_page { + meilisearch_query.limit = Some(per_page); + } + + // Handle filters + if !query.filters.is_empty() { + let filter_expressions: Vec = query + .filters + .iter() + .map(|filter| convert_filter(filter)) + .collect::, _>>()?; + + if !filter_expressions.is_empty() { + meilisearch_query.filter = Some(filter_expressions.join(" AND ")); + } + } + + // Handle sorting + if !query.sort.is_empty() { + let sort_clauses: Vec = query + .sort + .iter() + .map(|sort| convert_sort(sort)) + .collect::, _>>()?; + + if !sort_clauses.is_empty() { + meilisearch_query.sort = Some(sort_clauses); + } + } + + // Handle facets + if !query.facets.is_empty() { + meilisearch_query.facets = Some(query.facets.clone()); + } + + // Handle highlighting + if let Some(highlight) = &query.highlight { + if let Ok(highlight_config) = serde_json::from_str::(highlight) { + if let Some(fields) = highlight_config.get("fields").and_then(|v| v.as_array()) { + let field_names: Vec = fields + .iter() + .filter_map(|v| v.as_str().map(|s| s.to_string())) + .collect(); + + if !field_names.is_empty() { + meilisearch_query.attributes_to_highlight = Some(field_names); + } + } + + if let Some(pre_tag) = highlight_config.get("pre_tag").and_then(|v| v.as_str()) { + meilisearch_query.highlight_pre_tag = Some(pre_tag.to_string()); + } + + if let Some(post_tag) = highlight_config.get("post_tag").and_then(|v| v.as_str()) { + meilisearch_query.highlight_post_tag = Some(post_tag.to_string()); + } + } else { + // Default highlighting for all fields + meilisearch_query.attributes_to_highlight = Some(vec!["*".to_string()]); + } + } + + Ok(meilisearch_query) +} + +fn convert_filter(filter: &str) 
+
+fn convert_filter(filter: &str) -> Result<String, SearchError> {
+    // Try to parse as JSON first
+    if let Ok(filter_value) = serde_json::from_str::<Value>(filter) {
+        return convert_json_filter(&filter_value);
+    }
+
+    // Parse simple filter format: field:op:value
+    let parts: Vec<&str> = filter.splitn(3, ':').collect();
+    if parts.len() != 3 {
+        return Err(SearchError::InvalidQuery(format!(
+            "Invalid filter format: {}",
+            filter
+        )));
+    }
+
+    let field = parts[0];
+    let op = parts[1];
+    let value = parts[2];
+
+    match op {
+        "eq" => Ok(format!("{} = \"{}\"", field, escape_filter_value(value))),
+        "ne" => Ok(format!("{} != \"{}\"", field, escape_filter_value(value))),
+        "gt" => Ok(format!("{} > {}", field, value)),
+        "gte" => Ok(format!("{} >= {}", field, value)),
+        "lt" => Ok(format!("{} < {}", field, value)),
+        "lte" => Ok(format!("{} <= {}", field, value)),
+        "in" => {
+            let values: Vec<&str> = value.split(',').collect();
+            let quoted_values: Vec<String> = values
+                .iter()
+                .map(|v| format!("\"{}\"", escape_filter_value(v)))
+                .collect();
+            Ok(format!("{} IN [{}]", field, quoted_values.join(", ")))
+        }
+        "exists" => Ok(format!("{} EXISTS", field)),
+        "prefix" => Ok(format!("{} = \"{}*\"", field, escape_filter_value(value))),
+        _ => Err(SearchError::InvalidQuery(format!(
+            "Unsupported filter operator: {}",
+            op
+        ))),
+    }
+}
+
+fn convert_json_filter(filter: &Value) -> Result<String, SearchError> {
+    // Handle complex nested filters
+    if let Some(obj) = filter.as_object() {
+        if let Some(bool_filter) = obj.get("bool") {
+            return convert_bool_filter(bool_filter);
+        }
+
+        if let Some(term_filter) = obj.get("term") {
+            return convert_term_filter(term_filter);
+        }
+
+        if let Some(range_filter) = obj.get("range") {
+            return convert_range_filter(range_filter);
+        }
+
+        if let Some(terms_filter) = obj.get("terms") {
+            return convert_terms_filter(terms_filter);
+        }
+
+        if let Some(exists_filter) = obj.get("exists") {
+            return convert_exists_filter(exists_filter);
+        }
+    }
+
+    Err(SearchError::InvalidQuery(
+        "Unsupported filter format".to_string(),
+    ))
+}
+
+fn convert_bool_filter(bool_filter: &Value) -> Result<String, SearchError> {
+    let mut clauses = Vec::new();
+
+    if let Some(must) = bool_filter.get("must").and_then(|v| v.as_array()) {
+        for clause in must {
+            clauses.push(convert_json_filter(clause)?);
+        }
+    }
+
+    if let Some(filter) = bool_filter.get("filter").and_then(|v| v.as_array()) {
+        for clause in filter {
+            clauses.push(convert_json_filter(clause)?);
+        }
+    }
+
+    if let Some(must_not) = bool_filter.get("must_not").and_then(|v| v.as_array()) {
+        for clause in must_not {
+            clauses.push(format!("NOT ({})", convert_json_filter(clause)?));
+        }
+    }
+
+    if clauses.is_empty() {
+        return Err(SearchError::InvalidQuery("Empty bool filter".to_string()));
+    }
+
+    Ok(format!("({})", clauses.join(" AND ")))
+}
+
+fn convert_term_filter(term_filter: &Value) -> Result<String, SearchError> {
+    if let Some(obj) = term_filter.as_object() {
+        for (field, value) in obj {
+            if let Some(val_str) = value.as_str() {
+                return Ok(format!("{} = \"{}\"", field, escape_filter_value(val_str)));
+            } else if let Some(val_num) = value.as_number() {
+                return Ok(format!("{} = {}", field, val_num));
+            }
+        }
+    }
+
+    Err(SearchError::InvalidQuery("Invalid term filter".to_string()))
+}
+
+fn convert_range_filter(range_filter: &Value) -> Result<String, SearchError> {
+    if let Some(obj) = range_filter.as_object() {
+        for (field, range) in obj {
+            if let Some(range_obj) = range.as_object() {
+                let mut conditions = Vec::new();
+
+                if let Some(gte) = range_obj.get("gte") {
+                    conditions.push(format!("{} >= {}", field, gte));
+                }
+                if let Some(gt) = range_obj.get("gt") {
+                    conditions.push(format!("{} > {}", field, gt));
+                }
+                if let Some(lte) = range_obj.get("lte") {
+                    conditions.push(format!("{} <= {}", field, lte));
+                }
+                if let Some(lt) = range_obj.get("lt") {
+                    conditions.push(format!("{} < {}", field, lt));
+                }
+
+                if !conditions.is_empty() {
+                    return Ok(conditions.join(" AND "));
+                }
+            }
+        }
+    }
+
+    Err(SearchError::InvalidQuery("Invalid range filter".to_string()))
+}
+
+fn convert_terms_filter(terms_filter: &Value) -> Result<String, SearchError> {
+    if let Some(obj) = terms_filter.as_object() {
+        for (field, values) in obj {
+            if let Some(values_array) = values.as_array() {
+                let quoted_values: Vec<String> = values_array
+                    .iter()
+                    .filter_map(|v| v.as_str())
+                    .map(|v| format!("\"{}\"", escape_filter_value(v)))
+                    .collect();
+
+                if !quoted_values.is_empty() {
+                    return Ok(format!("{} IN [{}]", field, quoted_values.join(", ")));
+                }
+            }
+        }
+    }
+
+    Err(SearchError::InvalidQuery("Invalid terms filter".to_string()))
+}
+
+fn convert_exists_filter(exists_filter: &Value) -> Result<String, SearchError> {
+    if let Some(field) = exists_filter.get("field").and_then(|v| v.as_str()) {
+        Ok(format!("{} EXISTS", field))
+    } else {
+        Err(SearchError::InvalidQuery(
+            "Invalid exists filter".to_string(),
+        ))
+    }
+}
+
+fn convert_sort(sort: &str) -> Result<String, SearchError> {
+    // Handle JSON sort format
+    if let Ok(sort_value) = serde_json::from_str::<Value>(sort) {
+        if let Some(obj) = sort_value.as_object() {
+            for (field, order) in obj {
+                if let Some(order_obj) = order.as_object() {
+                    if let Some(order_str) = order_obj.get("order").and_then(|v| v.as_str()) {
+                        return Ok(match order_str {
+                            "desc" => format!("{}:desc", field),
+                            _ => format!("{}:asc", field),
+                        });
+                    }
+                } else if let Some(order_str) = order.as_str() {
+                    return Ok(match order_str {
+                        "desc" => format!("{}:desc", field),
+                        _ => format!("{}:asc", field),
+                    });
+                }
+            }
+        }
+    }
+
+    // Handle simple format: field:direction
+    let parts: Vec<&str> = sort.split(':').collect();
+    match parts.len() {
+        1 => Ok(format!("{}:asc", parts[0])),
+        2 => {
+            let field = parts[0];
+            let direction = match parts[1] {
+                "desc" | "DESC" => "desc",
+                _ => "asc",
+            };
+            Ok(format!("{}:{}", field, direction))
+        }
+        _ => Err(SearchError::InvalidQuery(format!(
+            "Invalid sort format: {}",
+            sort
+        ))),
+    }
+}
+
+fn escape_filter_value(value: &str) -> String {
+    value.replace('"', "\\\"")
+}
+
+pub fn doc_to_meilisearch_doc(doc: Doc) -> Result<MeilisearchDoc, SearchError> {
+    // `doc.content` is a JSON string in the WIT interface; Meilisearch needs
+    // an actual JSON object
+    let content: Value = serde_json::from_str(&doc.content)
+        .map_err(|e| SearchError::Internal(format!("Document content is not valid JSON: {}", e)))?;
+
+    Ok(MeilisearchDoc {
+        id: doc.id,
+        content,
+    })
+}
+
+pub fn meilisearch_hit_to_search_hit(hit: MeilisearchHit) -> SearchHit {
+    let id = hit
+        .source
+        .get("id")
+        .and_then(|v| v.as_str())
+        .unwrap_or_default()
+        .to_string();
+
+    // Meilisearch doesn't provide relevance scores in the same way
+    let score = 1.0;
+
+    SearchHit {
+        id,
+        score: Some(score),
+        content: serde_json::to_string(&hit.source).ok(),
+        highlights: hit.formatted.and_then(|f| serde_json::to_string(&f).ok()),
+    }
+}
+
+pub fn meilisearch_response_to_results(
+    response: MeilisearchSearchResponse,
+    query: &SearchQuery,
+) -> SearchResults {
+    let hits: Vec<SearchHit> = response
+        .hits
+        .into_iter()
+        .map(meilisearch_hit_to_search_hit)
+        .collect();
+
+    let total = response.estimated_total_hits.unwrap_or(hits.len() as u64);
+
+    // Serialize the facet distribution, when present, into the JSON facets payload
+    let facets = response
+        .facet_distribution
+        .and_then(|distribution| serde_json::to_string(&distribution).ok());
+
+    SearchResults {
+        total: Some(total as u32),
+        page: query.page.or(Some(1)),
+        per_page: query.per_page.or(Some(20)),
+        hits,
+        facets,
+        took_ms: Some(response.processing_time_ms as u32),
+    }
+}
+
+pub fn schema_to_meilisearch_settings(schema: &Schema) -> MeilisearchIndexSettings {
+    let mut searchable_attributes = Vec::new();
+    let mut filterable_attributes = Vec::new();
+    let mut sortable_attributes = Vec::new();
+
+    for field in &schema.fields {
+        // Full-text and keyword fields participate in search; numeric and date
+        // fields are natural sort keys.
+        match field.field_type {
+            FieldType::Text | FieldType::Keyword => {
+                searchable_attributes.push(field.name.clone());
+            }
+            FieldType::Integer | FieldType::Float | FieldType::Date => {
+                sortable_attributes.push(field.name.clone());
+            }
+            FieldType::Boolean | FieldType::GeoPoint => {}
+        }
+
+        // `facet` and `index` both map onto Meilisearch's filterable
+        // attributes; `sort` maps onto sortable attributes.
+        if field.facet || field.index {
+            filterable_attributes.push(field.name.clone());
+        }
+        if field.sort && !sortable_attributes.contains(&field.name) {
+            sortable_attributes.push(field.name.clone());
+        }
+    }
+
+    MeilisearchIndexSettings {
+        searchable_attributes: if searchable_attributes.is_empty() {
+            None
+        } else {
+            Some(searchable_attributes)
+        },
+        filterable_attributes: if filterable_attributes.is_empty() {
+            None
+        } else {
+            Some(filterable_attributes)
+        },
+        sortable_attributes: if sortable_attributes.is_empty() {
+            None
+        } else {
+            Some(sortable_attributes)
+        },
+        ranking_rules: Some(vec![
+            "words".to_string(),
+            "typo".to_string(),
+            "proximity".to_string(),
+            "attribute".to_string(),
+            "sort".to_string(),
+            "exactness".to_string(),
+        ]),
+    }
+}
+
+pub fn meilisearch_settings_to_schema(settings: &MeilisearchIndexSettings) -> Schema {
+    let mut fields = Vec::new();
+
+    // Add searchable fields as text fields
+    if let Some(searchable) = &settings.searchable_attributes {
+        for field_name in searchable {
+            if field_name != "*" {
+                fields.push(SchemaField {
+                    name: field_name.clone(),
+                    field_type: FieldType::Text,
+                    required: false,
+                    facet: settings
+                        .filterable_attributes
+                        .as_ref()
+                        .map(|f| f.contains(field_name))
+                        .unwrap_or(false),
+                    sort: settings
+                        .sortable_attributes
+                        .as_ref()
+                        .map(|f| f.contains(field_name))
+                        .unwrap_or(false),
+                    index: true,
+                });
+            }
+        }
+    }
+
+    // Add filterable-only fields
+    if let Some(filterable) = &settings.filterable_attributes {
+        for field_name in filterable {
+            if !fields.iter().any(|f| f.name == *field_name) {
+                fields.push(SchemaField {
+                    name: field_name.clone(),
+                    field_type: FieldType::Keyword, // Default assumption
+                    required: false,
+                    facet: true,
+                    sort: settings
+                        .sortable_attributes
+                        .as_ref()
+                        .map(|f| f.contains(field_name))
+                        .unwrap_or(false),
+                    index: true,
+                });
+            }
+        }
+    }
+
+    // The Meilisearch settings endpoint does not expose the primary key
+    Schema {
+        fields,
+        primary_key: None,
+    }
+}
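+
+// Minimal sketches of the translation rules above, kept as unit tests; they
+// exercise only the simple `field:op:value` and `field:direction` syntaxes.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn simple_filters_translate_to_meilisearch_expressions() {
+        assert_eq!(convert_filter("price:gte:100").unwrap(), "price >= 100");
+        assert_eq!(convert_filter("status:eq:active").unwrap(), "status = \"active\"");
+        assert_eq!(convert_filter("tag:in:a,b").unwrap(), "tag IN [\"a\", \"b\"]");
+    }
+
+    #[test]
+    fn sorts_translate_to_field_direction_pairs() {
+        assert_eq!(convert_sort("year:desc").unwrap(), "year:desc");
+        assert_eq!(convert_sort("title").unwrap(), "title:asc");
+    }
+}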
diff --git a/search/meilisearch/src/lib.rs b/search/meilisearch/src/lib.rs
new file mode 100644
index 000000000..cafe5257c
--- /dev/null
+++ b/search/meilisearch/src/lib.rs
@@ -0,0 +1,289 @@
+use crate::client::MeilisearchApi;
+use crate::conversions::{
+    doc_to_meilisearch_doc, meilisearch_hit_to_search_hit, meilisearch_response_to_results,
+    query_to_meilisearch_query, schema_to_meilisearch_settings, meilisearch_settings_to_schema,
+};
+use golem_search::config::with_config_key;
+use golem_search::durability::{DurableSearch, ExtendedGuest};
+use golem_search::golem::search::core::Guest;
+use golem_search::golem::search::types::{
+    Doc, Schema, SearchError, SearchHit, SearchQuery, SearchResults,
+};
+use golem_search::{SearchStream, SearchStreamState, LOGGING_STATE};
+use log::trace;
+use std::cell::RefCell;
+
+mod client;
+mod conversions;
+
+struct MeilisearchStream {
+    client: MeilisearchApi,
+    index: String,
+    query: SearchQuery,
+    current_offset: RefCell<u32>,
+    per_page: u32,
+    finished: RefCell<bool>,
+    failure: Option<SearchError>,
+}
+
+impl MeilisearchStream {
+    fn new(client: MeilisearchApi, index: String, query: SearchQuery) -> Self {
+        let per_page = query.per_page.unwrap_or(20);
+        let initial_offset = query.offset.unwrap_or(0);
+
+        Self {
+            client,
+            index,
+            query,
+            current_offset: RefCell::new(initial_offset),
+            per_page,
+            finished: RefCell::new(false),
+            failure: None,
+        }
+    }
+
+    fn failed(error: SearchError) -> Self {
+        Self {
+            client: MeilisearchApi::empty(),
+            index: String::new(),
+            query: SearchQuery {
+                q: None,
+                filters: vec![],
+                sort: vec![],
+                facets: vec![],
+                page: None,
+                per_page: None,
+                offset: None,
+                highlight: None,
+                config: None,
+            },
+            current_offset: RefCell::new(0),
+            per_page: 20,
+            finished: RefCell::new(true),
+            failure: Some(error),
+        }
+    }
+}
+
+impl SearchStreamState for MeilisearchStream {
+    fn failure(&self) -> &Option<SearchError> {
+        &self.failure
+    }
+
+    fn is_finished(&self) -> bool {
+        *self.finished.borrow()
+    }
+
+    fn set_finished(&self) {
+        *self.finished.borrow_mut() = true;
+    }
+
+    fn get_next_hits(&self) -> Result<Vec<SearchHit>, SearchError> {
+        let current_offset = *self.current_offset.borrow();
+
+        // Create a modified query with current pagination
+        let mut paginated_query = self.query.clone();
+        paginated_query.offset = Some(current_offset);
+        paginated_query.per_page = Some(self.per_page);
+
+        let meilisearch_query = query_to_meilisearch_query(&paginated_query)?;
+
+        match self.client.search(&self.index, meilisearch_query) {
+            Ok(response) => {
+                let hits: Vec<SearchHit> = response
+                    .hits
+                    .into_iter()
+                    .map(meilisearch_hit_to_search_hit)
+                    .collect();
+
+                if hits.len() < self.per_page as usize {
+                    // This was the last page
+                    self.set_finished();
+                } else {
+                    // Update offset for next page
+                    *self.current_offset.borrow_mut() = current_offset + self.per_page;
+                }
+
+                Ok(hits)
+            }
+            Err(error) => Err(error),
+        }
+    }
+}
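+
+// Streaming sketch: `get_next_hits` pages through results by re-issuing the
+// search with an advancing offset, so a stream with `per_page = 20` yields
+// hits 0..20, then 20..40, ... until a short page marks the stream finished.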
+
+struct MeilisearchComponent;
+
+impl MeilisearchComponent {
+    const HOST_VAR: &'static str = "SEARCH_PROVIDER_ENDPOINT";
+    const API_KEY_VAR: &'static str = "MEILISEARCH_API_KEY";
+}
+
+impl Guest for MeilisearchComponent {
+    type SearchHitStream = SearchStream;
+
+    fn create_index(name: String, schema: Option<Schema>) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::HOST_VAR, Err, |host| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = MeilisearchApi::new(host, api_key);
+
+                if let Some(schema) = schema {
+                    let settings = schema_to_meilisearch_settings(&schema);
+                    client.create_index_with_settings(&name, settings)
+                } else {
+                    client.create_index(&name)
+                }
+            })
+        })
+    }
+
+    fn delete_index(name: String) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::HOST_VAR, Err, |host| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = MeilisearchApi::new(host, api_key);
+                client.delete_index(&name)
+            })
+        })
+    }
+
+    fn list_indexes() -> Result<Vec<String>, SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::HOST_VAR, Err, |host| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = MeilisearchApi::new(host, api_key);
+                client.list_indexes()
+            })
+        })
+    }
+
+    fn upsert(index: String, doc: Doc) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::HOST_VAR, Err, |host| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = MeilisearchApi::new(host, api_key);
+                let meilisearch_doc = doc_to_meilisearch_doc(doc)?;
+                client.index_document(&index, &meilisearch_doc.id, meilisearch_doc.content)
+            })
+        })
+    }
+
+    fn upsert_many(index: String, docs: Vec<Doc>) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::HOST_VAR, Err, |host| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = MeilisearchApi::new(host, api_key);
+                let meilisearch_docs: Result<Vec<_>, _> =
+                    docs.into_iter().map(doc_to_meilisearch_doc).collect();
+                client.bulk_index(&index, meilisearch_docs?)
+            })
+        })
+    }
+
+    fn delete(index: String, id: String) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::HOST_VAR, Err, |host| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = MeilisearchApi::new(host, api_key);
+                client.delete_document(&index, &id)
+            })
+        })
+    }
+
+    fn delete_many(index: String, ids: Vec<String>) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::HOST_VAR, Err, |host| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = MeilisearchApi::new(host, api_key);
+                client.bulk_delete(&index, ids)
+            })
+        })
+    }
+
+    fn get(index: String, id: String) -> Result<Option<Doc>, SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::HOST_VAR, Err, |host| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = MeilisearchApi::new(host, api_key);
+                client.get_document(&index, &id)
+            })
+        })
+    }
+
+    fn search(index: String, query: SearchQuery) -> Result<SearchResults, SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::HOST_VAR, Err, |host| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = MeilisearchApi::new(host, api_key);
+                let meilisearch_query = query_to_meilisearch_query(&query)?;
+                trace!("Executing Meilisearch query: {:?}", meilisearch_query);
+
+                match client.search(&index, meilisearch_query) {
+                    Ok(response) => Ok(meilisearch_response_to_results(response, &query)),
+                    Err(error) => Err(error),
+                }
+            })
+        })
+    }
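+
+    // Configuration sketch: like the other providers in this repository, this
+    // component is configured purely through environment variables, e.g.
+    //   SEARCH_PROVIDER_ENDPOINT=http://localhost:7700
+    //   MEILISEARCH_API_KEY=<admin or master key>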
+
+    fn stream_search(
+        index: String,
+        query: SearchQuery,
+    ) -> Result<Self::SearchHitStream, SearchError> {
+        Self::unwrapped_stream_search(index, query)
+    }
+
+    fn get_schema(index: String) -> Result<Schema, SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::HOST_VAR, Err, |host| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = MeilisearchApi::new(host, api_key);
+                let settings = client.get_settings(&index)?;
+                Ok(meilisearch_settings_to_schema(&settings))
+            })
+        })
+    }
+
+    fn update_schema(index: String, schema: Schema) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::HOST_VAR, Err, |host| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = MeilisearchApi::new(host, api_key);
+                let settings = schema_to_meilisearch_settings(&schema);
+                client.update_settings(&index, settings)
+            })
+        })
+    }
+}
+
+impl ExtendedGuest for MeilisearchComponent {
+    type SearchHitStream = SearchStream;
+
+    fn unwrapped_stream_search(
+        index: String,
+        query: SearchQuery,
+    ) -> Result<Self::SearchHitStream, SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::HOST_VAR, |error| Ok(SearchStream::new(MeilisearchStream::failed(error))), |host| {
+            with_config_key(Self::API_KEY_VAR, |error| Ok(SearchStream::new(MeilisearchStream::failed(error))), |api_key| {
+                let client = MeilisearchApi::new(host, api_key);
+                Ok(SearchStream::new(MeilisearchStream::new(client, index, query)))
+            })
+        })
+    }
+}
+
+type DurableMeilisearchComponent = DurableSearch<MeilisearchComponent>;
+
+golem_search::export_search!(DurableMeilisearchComponent with_types_in golem_search);
diff --git a/search/meilisearch/wit/deps/golem-search/golem-search.wit b/search/meilisearch/wit/deps/golem-search/golem-search.wit
new file mode 100644
index 000000000..64c822dfa
--- /dev/null
+++ b/search/meilisearch/wit/deps/golem-search/golem-search.wit
@@ -0,0 +1,134 @@
+package golem:search@1.0.0;
+
+/// Core types and error handling for universal search interfaces
+interface types {
+  /// Common structured errors for search operations
+  variant search-error {
+    index-not-found,
+    invalid-query(string),
+    unsupported,
+    internal(string),
+    timeout,
+    rate-limited,
+  }
+
+  /// Identifier types
+  type index-name = string;
+  type document-id = string;
+  type json = string;
+
+  /// Document payload
+  record doc {
+    id: document-id,
+    content: json,
+  }
+
+  /// Highlight configuration
+  record highlight-config {
+    fields: list<string>,
+    pre-tag: option<string>,
+    post-tag: option<string>,
+    max-length: option<u32>,
+  }
+
+  /// Advanced search tuning
+  record search-config {
+    timeout-ms: option<u32>,
+    boost-fields: list<tuple<string, f64>>,
+    attributes-to-retrieve: list<string>,
+    language: option<string>,
+    typo-tolerance: option<bool>,
+    exact-match-boost: option<f64>,
+    provider-params: option<json>,
+  }
+
+  /// Search request
+  record search-query {
+    q: option<string>,
+    filters: list<string>,
+    sort: list<string>,
+    facets: list<string>,
+    page: option<u32>,
+    per-page: option<u32>,
+    offset: option<u32>,
+    highlight: option<highlight-config>,
+    config: option<search-config>,
+  }
+
+  /// Search hit
+  record search-hit {
+    id: document-id,
+    score: option<f64>,
+    content: option<json>,
+    highlights: option<json>,
+  }
+
+  /// Search result set
+  record search-results {
+    total: option<u32>,
+    page: option<u32>,
+    per-page: option<u32>,
+    hits: list<search-hit>,
+    facets: option<json>,
+    took-ms: option<u32>,
+  }
+
+  /// Field schema types
+  enum field-type {
+    text,
+    keyword,
+    integer,
+    float,
+    boolean,
+    date,
+    geo-point,
+  }
+
+  /// Field definition
+  record schema-field {
+    name: string,
+    field-type: field-type,
+    required: bool,
+    facet: bool,
+    sort: bool,
+    index: bool,
+  }
+
+  /// Index schema
+  record schema {
+    fields: list<schema-field>,
+    primary-key: option<string>,
+  }
+}
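+
+// Example (illustrative): a request for the second page of matches for
+// "golem", filtered and highlighted, would be a `search-query` carrying
+//   q = some("golem"), filters = ["status:eq:published"],
+//   page = some(2), per-page = some(20),
+//   highlight = some({fields: ["title"], pre-tag: some("<em>"),
+//                     post-tag: some("</em>"), max-length: none})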
+
+/// Unified search interface
+interface core {
+  use types.{
+    index-name, document-id, doc, search-query, search-results,
+    search-hit, schema, search-error
+  };
+
+  /// Pull-based stream of hits backing `stream-search`; `get-next` returns
+  /// `none` once the stream is exhausted.
+  resource search-hit-stream {
+    get-next: func() -> option<list<search-hit>>;
+  }
+
+  // Index lifecycle
+  create-index: func(name: index-name, schema: option<schema>) -> result<_, search-error>;
+  delete-index: func(name: index-name) -> result<_, search-error>;
+  list-indexes: func() -> result<list<index-name>, search-error>;
+
+  // Document operations
+  upsert: func(index: index-name, doc: doc) -> result<_, search-error>;
+  upsert-many: func(index: index-name, docs: list<doc>) -> result<_, search-error>;
+  delete: func(index: index-name, id: document-id) -> result<_, search-error>;
+  delete-many: func(index: index-name, ids: list<document-id>) -> result<_, search-error>;
+  get: func(index: index-name, id: document-id) -> result<option<doc>, search-error>;
+
+  // Query
+  search: func(index: index-name, query: search-query) -> result<search-results, search-error>;
+  stream-search: func(index: index-name, query: search-query) -> result<search-hit-stream, search-error>;
+
+  // Schema inspection
+  get-schema: func(index: index-name) -> result<schema, search-error>;
+  update-schema: func(index: index-name, schema: schema) -> result<_, search-error>;
+}
+
+world search-library {
+  export core;
+}
diff --git a/search/meilisearch/wit/deps/wasi:io/error.wit b/search/meilisearch/wit/deps/wasi:io/error.wit
new file mode 100644
index 000000000..97c606877
--- /dev/null
+++ b/search/meilisearch/wit/deps/wasi:io/error.wit
@@ -0,0 +1,34 @@
+package wasi:io@0.2.3;
+
+@since(version = 0.2.0)
+interface error {
+  /// A resource which represents some error information.
+  ///
+  /// The only method provided by this resource is `to-debug-string`,
+  /// which provides some human-readable information about the error.
+  ///
+  /// In the `wasi:io` package, this resource is returned through the
+  /// `wasi:io/streams/stream-error` type.
+  ///
+  /// To provide more specific error information, other interfaces may
+  /// offer functions to "downcast" this error into more specific types. For example,
+  /// errors returned from streams derived from filesystem types can be described using
+  /// the filesystem's own error-code type. This is done using the function
+  /// `wasi:filesystem/types/filesystem-error-code`, which takes a `borrow<error>`
+  /// parameter and returns an `option<filesystem-error-code>`.
+  ///
+  /// The set of functions which can "downcast" an `error` into a more
+  /// concrete type is open.
+  @since(version = 0.2.0)
+  resource error {
+    /// Returns a string that is suitable to assist humans in debugging
+    /// this error.
+    ///
+    /// WARNING: The returned string should not be consumed mechanically!
+    /// It may change across platforms, hosts, or other implementation
+    /// details. Parsing this string is a major platform-compatibility
+    /// hazard.
+    @since(version = 0.2.0)
+    to-debug-string: func() -> string;
+  }
+}
diff --git a/search/meilisearch/wit/deps/wasi:io/poll.wit b/search/meilisearch/wit/deps/wasi:io/poll.wit
new file mode 100644
index 000000000..9bcbe8e03
--- /dev/null
+++ b/search/meilisearch/wit/deps/wasi:io/poll.wit
@@ -0,0 +1,47 @@
+package wasi:io@0.2.3;
+
+/// A poll API intended to let users wait for I/O events on multiple handles
+/// at once.
+@since(version = 0.2.0)
+interface poll {
+  /// `pollable` represents a single I/O event which may be ready, or not.
+  @since(version = 0.2.0)
+  resource pollable {
+
+    /// Return the readiness of a pollable. This function never blocks.
+    ///
+    /// Returns `true` when the pollable is ready, and `false` otherwise.
+    @since(version = 0.2.0)
+    ready: func() -> bool;
+
+    /// `block` returns immediately if the pollable is ready, and otherwise
+    /// blocks until ready.
+    ///
+    /// This function is equivalent to calling `poll.poll` on a list
+    /// containing only this pollable.
+    @since(version = 0.2.0)
+    block: func();
+  }
+
+  /// Poll for completion on a set of pollables.
+  ///
+  /// This function takes a list of pollables, which identify I/O sources of
+  /// interest, and waits until one or more of the events is ready for I/O.
+  ///
+  /// The result `list<u32>` contains one or more indices of handles in the
+  /// argument list that is ready for I/O.
+  ///
+  /// This function traps if either:
+  /// - the list is empty, or:
+  /// - the list contains more elements than can be indexed with a `u32` value.
+  ///
+  /// A timeout can be implemented by adding a pollable from the
+  /// wasi-clocks API to the list.
+  ///
+  /// This function does not return a `result`; polling in itself does not
+  /// do any I/O so it doesn't fail. If any of the I/O sources identified by
+  /// the pollables has an error, it is indicated by marking the source as
+  /// being ready for I/O.
+  @since(version = 0.2.0)
+  poll: func(in: list<borrow<pollable>>) -> list<u32>;
+}
diff --git a/search/meilisearch/wit/deps/wasi:io/streams.wit b/search/meilisearch/wit/deps/wasi:io/streams.wit
new file mode 100644
index 000000000..0de084629
--- /dev/null
+++ b/search/meilisearch/wit/deps/wasi:io/streams.wit
@@ -0,0 +1,290 @@
+package wasi:io@0.2.3;
+
+/// WASI I/O is an I/O abstraction API which is currently focused on providing
+/// stream types.
+///
+/// In the future, the component model is expected to add built-in stream types;
+/// when it does, they are expected to subsume this API.
+@since(version = 0.2.0)
+interface streams {
+  @since(version = 0.2.0)
+  use error.{error};
+  @since(version = 0.2.0)
+  use poll.{pollable};
+
+  /// An error for input-stream and output-stream operations.
+  @since(version = 0.2.0)
+  variant stream-error {
+    /// The last operation (a write or flush) failed before completion.
+    ///
+    /// More information is available in the `error` payload.
+    ///
+    /// After this, the stream will be closed. All future operations return
+    /// `stream-error::closed`.
+    last-operation-failed(error),
+    /// The stream is closed: no more input will be accepted by the
+    /// stream. A closed output-stream will return this error on all
+    /// future operations.
+    closed
+  }
+
+  /// An input bytestream.
+  ///
+  /// `input-stream`s are *non-blocking* to the extent practical on underlying
+  /// platforms. I/O operations always return promptly; if fewer bytes are
+  /// promptly available than requested, they return the number of bytes promptly
+  /// available, which could even be zero. To wait for data to be available,
+  /// use the `subscribe` function to obtain a `pollable` which can be polled
+  /// for using `wasi:io/poll`.
+  @since(version = 0.2.0)
+  resource input-stream {
+    /// Perform a non-blocking read from the stream.
+    ///
+    /// When the source of a `read` is binary data, the bytes from the source
+    /// are returned verbatim. When the source of a `read` is known to the
+    /// implementation to be text, bytes containing the UTF-8 encoding of the
+    /// text are returned.
+    ///
+    /// This function returns a list of bytes containing the read data,
+    /// when successful. The returned list will contain up to `len` bytes;
+    /// it may return fewer than requested, but not more. The list is
+    /// empty when no bytes are available for reading at this time. The
+    /// pollable given by `subscribe` will be ready when more bytes are
+    /// available.
+    ///
+    /// This function fails with a `stream-error` when the operation
+    /// encounters an error, giving `last-operation-failed`, or when the
+    /// stream is closed, giving `closed`.
+    ///
+    /// When the caller gives a `len` of 0, it represents a request to
+    /// read 0 bytes. If the stream is still open, this call should
+    /// succeed and return an empty list, or otherwise fail with `closed`.
+    ///
+    /// The `len` parameter is a `u64`, which could represent a list of u8 which
+    /// is not possible to allocate in wasm32, or not desirable to allocate as
+    /// a return value by the callee. The callee may return a list of bytes
+    /// less than `len` in size while more bytes are available for reading.
+    @since(version = 0.2.0)
+    read: func(
+      /// The maximum number of bytes to read
+      len: u64
+    ) -> result<list<u8>, stream-error>;
+
+    /// Read bytes from a stream, after blocking until at least one byte can
+    /// be read. Except for blocking, behavior is identical to `read`.
+    @since(version = 0.2.0)
+    blocking-read: func(
+      /// The maximum number of bytes to read
+      len: u64
+    ) -> result<list<u8>, stream-error>;
+
+    /// Skip bytes from a stream. Returns number of bytes skipped.
+    ///
+    /// Behaves identical to `read`, except instead of returning a list
+    /// of bytes, returns the number of bytes consumed from the stream.
+    @since(version = 0.2.0)
+    skip: func(
+      /// The maximum number of bytes to skip.
+      len: u64,
+    ) -> result<u64, stream-error>;
+
+    /// Skip bytes from a stream, after blocking until at least one byte
+    /// can be skipped. Except for blocking behavior, identical to `skip`.
+    @since(version = 0.2.0)
+    blocking-skip: func(
+      /// The maximum number of bytes to skip.
+      len: u64,
+    ) -> result<u64, stream-error>;
+
+    /// Create a `pollable` which will resolve once either the specified stream
+    /// has bytes available to read or the other end of the stream has been
+    /// closed.
+    /// The created `pollable` is a child resource of the `input-stream`.
+    /// Implementations may trap if the `input-stream` is dropped before
+    /// all derived `pollable`s created with this function are dropped.
+    @since(version = 0.2.0)
+    subscribe: func() -> pollable;
+  }
+
+
+  /// An output bytestream.
+  ///
+  /// `output-stream`s are *non-blocking* to the extent practical on
+  /// underlying platforms. Except where specified otherwise, I/O operations also
+  /// always return promptly, after the number of bytes that can be written
+  /// promptly, which could even be zero. To wait for the stream to be ready to
+  /// accept data, use the `subscribe` function to obtain a `pollable` which can be
+  /// polled for using `wasi:io/poll`.
+  ///
+  /// Dropping an `output-stream` while there's still an active write in
+  /// progress may result in the data being lost. Before dropping the stream,
+  /// be sure to fully flush your writes.
+  @since(version = 0.2.0)
+  resource output-stream {
+    /// Check readiness for writing. This function never blocks.
+    ///
+    /// Returns the number of bytes permitted for the next call to `write`,
+    /// or an error. Calling `write` with more bytes than this function has
+    /// permitted will trap.
+    ///
+    /// When this function returns 0 bytes, the `subscribe` pollable will
+    /// become ready when this function will report at least 1 byte, or an
+    /// error.
+    @since(version = 0.2.0)
+    check-write: func() -> result<u64, stream-error>;
+
+    /// Perform a write. This function never blocks.
+    ///
+    /// When the destination of a `write` is binary data, the bytes from
+    /// `contents` are written verbatim. When the destination of a `write` is
+    /// known to the implementation to be text, the bytes of `contents` are
+    /// transcoded from UTF-8 into the encoding of the destination and then
+    /// written.
+    ///
+    /// Precondition: check-write gave permit of Ok(n) and contents has a
+    /// length of less than or equal to n. Otherwise, this function will trap.
+    ///
+    /// returns Err(closed) without writing if the stream has closed since
+    /// the last call to check-write provided a permit.
+    @since(version = 0.2.0)
+    write: func(
+      contents: list<u8>
+    ) -> result<_, stream-error>;
+
+    /// Perform a write of up to 4096 bytes, and then flush the stream. Block
+    /// until all of these operations are complete, or an error occurs.
+    ///
+    /// This is a convenience wrapper around the use of `check-write`,
+    /// `subscribe`, `write`, and `flush`, and is implemented with the
+    /// following pseudo-code:
+    ///
+    /// ```text
+    /// let pollable = this.subscribe();
+    /// while !contents.is_empty() {
+    ///     // Wait for the stream to become writable
+    ///     pollable.block();
+    ///     let Ok(n) = this.check-write(); // eliding error handling
+    ///     let len = min(n, contents.len());
+    ///     let (chunk, rest) = contents.split_at(len);
+    ///     this.write(chunk);              // eliding error handling
+    ///     contents = rest;
+    /// }
+    /// this.flush();
+    /// // Wait for completion of `flush`
+    /// pollable.block();
+    /// // Check for any errors that arose during `flush`
+    /// let _ = this.check-write();         // eliding error handling
+    /// ```
+    @since(version = 0.2.0)
+    blocking-write-and-flush: func(
+      contents: list<u8>
+    ) -> result<_, stream-error>;
+
+    /// Request to flush buffered output. This function never blocks.
+    ///
+    /// This tells the output-stream that the caller intends any buffered
+    /// output to be flushed. the output which is expected to be flushed
+    /// is all that has been passed to `write` prior to this call.
+    ///
+    /// Upon calling this function, the `output-stream` will not accept any
+    /// writes (`check-write` will return `ok(0)`) until the flush has
+    /// completed. The `subscribe` pollable will become ready when the
+    /// flush has completed and the stream can accept more writes.
+    @since(version = 0.2.0)
+    flush: func() -> result<_, stream-error>;
+
+    /// Request to flush buffered output, and block until flush completes
+    /// and stream is ready for writing again.
+    @since(version = 0.2.0)
+    blocking-flush: func() -> result<_, stream-error>;
+
+    /// Create a `pollable` which will resolve once the output-stream
+    /// is ready for more writing, or an error has occurred. When this
+    /// pollable is ready, `check-write` will return `ok(n)` with n>0, or an
+    /// error.
+    ///
+    /// If the stream is closed, this pollable is always ready immediately.
+    ///
+    /// The created `pollable` is a child resource of the `output-stream`.
+    /// Implementations may trap if the `output-stream` is dropped before
+    /// all derived `pollable`s created with this function are dropped.
+    @since(version = 0.2.0)
+    subscribe: func() -> pollable;
+
+    /// Write zeroes to a stream.
+    ///
+    /// This should be used precisely like `write` with the exact same
+    /// preconditions (must use check-write first), but instead of
+    /// passing a list of bytes, you simply pass the number of zero-bytes
+    /// that should be written.
+    @since(version = 0.2.0)
+    write-zeroes: func(
+      /// The number of zero-bytes to write
+      len: u64
+    ) -> result<_, stream-error>;
+
+    /// Perform a write of up to 4096 zeroes, and then flush the stream.
+    /// Block until all of these operations are complete, or an error
+    /// occurs.
+    ///
+    /// This is a convenience wrapper around the use of `check-write`,
+    /// `subscribe`, `write-zeroes`, and `flush`, and is implemented with
+    /// the following pseudo-code:
+    ///
+    /// ```text
+    /// let pollable = this.subscribe();
+    /// while num_zeroes != 0 {
+    ///     // Wait for the stream to become writable
+    ///     pollable.block();
+    ///     let Ok(n) = this.check-write(); // eliding error handling
+    ///     let len = min(n, num_zeroes);
+    ///     this.write-zeroes(len);         // eliding error handling
+    ///     num_zeroes -= len;
+    /// }
+    /// this.flush();
+    /// // Wait for completion of `flush`
+    /// pollable.block();
+    /// // Check for any errors that arose during `flush`
+    /// let _ = this.check-write();         // eliding error handling
+    /// ```
+    @since(version = 0.2.0)
+    blocking-write-zeroes-and-flush: func(
+      /// The number of zero-bytes to write
+      len: u64
+    ) -> result<_, stream-error>;
+
+    /// Read from one stream and write to another.
+    ///
+    /// The behavior of splice is equivalent to:
+    /// 1. calling `check-write` on the `output-stream`
+    /// 2. calling `read` on the `input-stream` with the smaller of the
+    /// `check-write` permitted length and the `len` provided to `splice`
+    /// 3. calling `write` on the `output-stream` with that read data.
+    ///
+    /// Any error reported by the call to `check-write`, `read`, or
+    /// `write` ends the splice and reports that error.
+    ///
+    /// This function returns the number of bytes transferred; it may be less
+    /// than `len`.
+    @since(version = 0.2.0)
+    splice: func(
+      /// The stream to read from
+      src: borrow<input-stream>,
+      /// The number of bytes to splice
+      len: u64,
+    ) -> result<u64, stream-error>;
+
+    /// Read from one stream and write to another, with blocking.
+    ///
+    /// This is similar to `splice`, except that it blocks until the
+    /// `output-stream` is ready for writing, and the `input-stream`
+    /// is ready for reading, before performing the `splice`.
+    @since(version = 0.2.0)
+    blocking-splice: func(
+      /// The stream to read from
+      src: borrow<input-stream>,
+      /// The number of bytes to splice
+      len: u64,
+    ) -> result<u64, stream-error>;
+  }
+}
diff --git a/search/meilisearch/wit/deps/wasi:io/world.wit b/search/meilisearch/wit/deps/wasi:io/world.wit
new file mode 100644
index 000000000..f1d2102dc
--- /dev/null
+++ b/search/meilisearch/wit/deps/wasi:io/world.wit
@@ -0,0 +1,10 @@
+package wasi:io@0.2.3;
+
+@since(version = 0.2.0)
+world imports {
+  @since(version = 0.2.0)
+  import streams;
+
+  @since(version = 0.2.0)
+  import poll;
+}
diff --git a/search/meilisearch/wit/meilisearch.wit b/search/meilisearch/wit/meilisearch.wit
new file mode 100644
index 000000000..3cdbafae2
--- /dev/null
+++ b/search/meilisearch/wit/meilisearch.wit
@@ -0,0 +1,6 @@
+package golem:search-meilisearch;
+
+world meilisearch-provider {
+  import golem:search/core@1.0.0;
+  export golem:search/core@1.0.0;
+}
diff --git a/search/opensearch/Cargo.toml b/search/opensearch/Cargo.toml
new file mode 100644
index 000000000..3253d3c86
--- /dev/null
+++ b/search/opensearch/Cargo.toml
@@ -0,0 +1,46 @@
+[package]
+name = "golem-search-opensearch"
+version = "0.0.0"
+edition = "2021"
+license = "Apache-2.0"
+homepage = "https://golem.cloud"
+repository = "https://github.com/golemcloud/golem-llm"
+description = "WebAssembly component for working with OpenSearch APIs, with special support for Golem Cloud"
+
+[lib]
+path = "src/lib.rs"
+crate-type = ["cdylib"]
+
+[features]
+default = ["durability"]
+durability = ["golem-rust/durability", "golem-search/durability"]
+
+[dependencies]
+golem-search = { path = "../search" }
+golem-rust = "1.6.0"
+log = "0.4.27"
+reqwest = { git = "https://github.com/golemcloud/reqwest", branch = "update-may-2025", features = ["json", "blocking"] }
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"
+wit-bindgen-rt = { version = "0.40.0", features = ["bitflags"] }
+base64 = "0.22.1"
+sha2 = "0.10"
+hmac = "0.12"
+hex = "0.4"
+chrono = { version = "0.4", features = ["serde"] }
+
+[package.metadata.component]
+package = "golem:search-opensearch"
+
+[package.metadata.component.bindings]
+generate_unused_types = true
+
+[package.metadata.component.bindings.with]
+"golem:search/core@1.0.0" = "golem_search::golem::search::core"
+
+[package.metadata.component.target]
+path = "wit"
+
+[package.metadata.component.target.dependencies]
+"golem:search" = { path = "wit/deps/golem-search" }
+"wasi:io" = { path = "wit/deps/wasi:io" }
diff --git a/search/opensearch/src/client.rs b/search/opensearch/src/client.rs
new file mode 100644
index 000000000..4175ecbde
--- /dev/null
+++ b/search/opensearch/src/client.rs
@@ -0,0 +1,496 @@
+use golem_search::golem::search::types::{Doc, Schema, SearchError};
+use log::{debug, trace};
+use reqwest::{Client, Response};
+use serde::{Deserialize, Serialize};
+use serde_json::{json, Value};
+use std::collections::HashMap;
+use std::time::Duration;
+use chrono::{DateTime, Utc};
+use hmac::{Hmac, Mac};
+use sha2::{Sha256, Digest};
+
+#[derive(Debug, Clone)]
+pub struct OpenSearchApi {
+    client: Client,
+    base_url: String,
+    access_key: String,
+    secret_key: String,
+    region: String,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct ElasticSearchError {
+    #[serde(default)]
+    pub error: ErrorDetails,
+}
+
+#[derive(Debug, Deserialize, Default)]
+pub struct ErrorDetails {
+    #[serde(rename = "type")]
+    pub error_type: String,
+    pub reason: String,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct EsSearchResponse {
+    pub hits: EsHits,
+    #[serde(rename = "_scroll_id")]
+    pub scroll_id: Option<String>,
+    pub took: Option<u64>,
+}
"_scroll_id")] + pub scroll_id: Option, + pub took: Option, +} + +#[derive(Debug, Deserialize)] +pub struct EsHits { + pub total: EsTotal, + pub hits: Vec, +} + +#[derive(Debug, Deserialize)] +#[serde(untagged)] +pub enum EsTotal { + Simple(u64), + Detailed { value: u64 }, +} + +impl EsTotal { + pub fn value(&self) -> u64 { + match self { + EsTotal::Simple(v) => *v, + EsTotal::Detailed { value } => *value, + } + } +} + +#[derive(Debug, Deserialize)] +pub struct EsHit { + #[serde(rename = "_id")] + pub id: String, + #[serde(rename = "_score")] + pub score: Option, + #[serde(rename = "_source")] + pub source: Option, + pub highlight: Option>>, +} + +#[derive(Debug, Serialize)] +pub struct EsDoc { + pub id: String, + pub content: Value, +} + +#[derive(Debug, Deserialize)] +pub struct IndexResponse { + #[serde(rename = "acknowledged")] + pub acknowledged: bool, +} + +#[derive(Debug, Deserialize)] +pub struct GetResponse { + #[serde(rename = "_id")] + pub id: String, + #[serde(rename = "_source")] + pub source: Option, + pub found: bool, +} + +impl OpenSearchApi { + pub fn new(endpoint: &str, access_key: &str, secret_key: &str, region: &str) -> Self { + let client = Client::builder() + .timeout(Duration::from_secs(30)) + .build() + .unwrap(); + + Self { + client, + base_url: endpoint.trim_end_matches('/').to_string(), + access_key: access_key.to_string(), + secret_key: secret_key.to_string(), + region: region.to_string(), + } + } + + pub fn empty() -> Self { + Self { + client: Client::new(), + base_url: String::new(), + access_key: String::new(), + secret_key: String::new(), + region: String::new(), + } + } + + fn sign_request(&self, method: &str, path: &str, body: &str, date: &DateTime) -> String { + let service = "es"; + let algorithm = "AWS4-HMAC-SHA256"; + let date_stamp = date.format("%Y%m%d").to_string(); + let amz_date = date.format("%Y%m%dT%H%M%SZ").to_string(); + + // Create canonical request + let canonical_uri = path; + let canonical_querystring = ""; + let host = self.base_url.replace("https://", "").replace("http://", ""); + let canonical_headers = format!("host:{}\nx-amz-date:{}\n", host, amz_date); + let signed_headers = "host;x-amz-date"; + + let payload_hash = hex::encode(Sha256::digest(body.as_bytes())); + let canonical_request = format!( + "{}\n{}\n{}\n{}\n{}\n{}", + method, canonical_uri, canonical_querystring, canonical_headers, signed_headers, payload_hash + ); + + // Create string to sign + let credential_scope = format!("{}/{}/{}/aws4_request", date_stamp, self.region, service); + let string_to_sign = format!( + "{}\n{}\n{}\n{}", + algorithm, + amz_date, + credential_scope, + hex::encode(Sha256::digest(canonical_request.as_bytes())) + ); + + // Create signing key + let k_date = hmac_sha256(format!("AWS4{}", self.secret_key).as_bytes(), date_stamp.as_bytes()); + let k_region = hmac_sha256(&k_date, self.region.as_bytes()); + let k_service = hmac_sha256(&k_region, service.as_bytes()); + let k_signing = hmac_sha256(&k_service, b"aws4_request"); + + // Create signature + let signature = hex::encode(hmac_sha256(&k_signing, string_to_sign.as_bytes())); + + format!( + "{} Credential={}/{}, SignedHeaders={}, Signature={}", + algorithm, self.access_key, credential_scope, signed_headers, signature + ) + } + + async fn request(&self, method: reqwest::Method, path: &str, body: Option) -> Result { + let url = format!("{}/{}", self.base_url, path.trim_start_matches('/')); + let now = Utc::now(); + let body_str = body.as_ref().map(|b| 
+
+    fn request(&self, method: reqwest::Method, path: &str, body: Option<Value>) -> Result<Response, SearchError> {
+        let url = format!("{}/{}", self.base_url, path.trim_start_matches('/'));
+        let now = Utc::now();
+        let body_str = body
+            .as_ref()
+            .map(|b| serde_json::to_string(b))
+            .transpose()
+            .map_err(|e| SearchError::Internal(format!("JSON serialization error: {}", e)))?
+            .unwrap_or_default();
+
+        let authorization = self.sign_request(method.as_str(), path, &body_str, &now);
+
+        let mut request = self.client.request(method, &url);
+        request = request.header("Authorization", authorization);
+        request = request.header("x-amz-date", now.format("%Y%m%dT%H%M%SZ").to_string());
+        request = request.header("Content-Type", "application/json");
+
+        if let Some(body) = body {
+            request = request.json(&body);
+        }
+
+        trace!("Making request to: {}", url);
+
+        let response = request
+            .send()
+            .map_err(|e| SearchError::Internal(format!("Request failed: {}", e)))?;
+
+        if response.status().is_success() {
+            Ok(response)
+        } else {
+            let status = response.status().as_u16();
+            let error_text = response.text().unwrap_or_default();
+
+            if let Ok(es_error) = serde_json::from_str::<ElasticSearchError>(&error_text) {
+                match es_error.error.error_type.as_str() {
+                    "index_not_found_exception" => Err(SearchError::IndexNotFound),
+                    "parsing_exception" | "query_parsing_exception" => {
+                        Err(SearchError::InvalidQuery(es_error.error.reason))
+                    }
+                    _ => Err(SearchError::Internal(format!(
+                        "OpenSearch error: {}",
+                        es_error.error.reason
+                    ))),
+                }
+            } else {
+                Err(SearchError::Internal(format!("HTTP {}: {}", status, error_text)))
+            }
+        }
+    }
+
+    pub fn create_index(&self, name: &str) -> Result<(), SearchError> {
+        let body = json!({
+            "settings": {
+                "index": {
+                    "number_of_shards": 1,
+                    "number_of_replicas": 0
+                }
+            }
+        });
+
+        self.request(reqwest::Method::PUT, name, Some(body))?;
+        debug!("Created index: {}", name);
+        Ok(())
+    }
+
+    pub fn create_index_with_mapping(&self, name: &str, mapping: Value) -> Result<(), SearchError> {
+        let body = json!({
+            "settings": {
+                "index": {
+                    "number_of_shards": 1,
+                    "number_of_replicas": 0
+                }
+            },
+            "mappings": mapping
+        });
+
+        self.request(reqwest::Method::PUT, name, Some(body))?;
+        debug!("Created index with mapping: {}", name);
+        Ok(())
+    }
+
+    pub fn delete_index(&self, name: &str) -> Result<(), SearchError> {
+        self.request(reqwest::Method::DELETE, name, None)?;
+        debug!("Deleted index: {}", name);
+        Ok(())
+    }
+
+    pub fn list_indexes(&self) -> Result<Vec<String>, SearchError> {
+        let response = self.request(reqwest::Method::GET, "_cat/indices?format=json", None)?;
+        let indices: Vec<Value> = response
+            .json()
+            .map_err(|e| SearchError::Internal(format!("Failed to parse response: {}", e)))?;
+
+        let names = indices
+            .into_iter()
+            .filter_map(|index| index.get("index").and_then(|v| v.as_str().map(|s| s.to_string())))
+            .collect();
+
+        Ok(names)
+    }
+
+    pub fn index_document(&self, index: &str, id: &str, doc: Value) -> Result<(), SearchError> {
+        let path = format!("{}/_doc/{}", index, id);
+
+        self.request(reqwest::Method::PUT, &path, Some(doc))?;
+        debug!("Indexed document: {}/{}", index, id);
+        Ok(())
+    }
+
+    pub fn bulk_index(&self, index: &str, docs: Vec<EsDoc>) -> Result<(), SearchError> {
+        if docs.is_empty() {
+            return Ok(());
+        }
+
+        let mut body = String::new();
+        for doc in &docs {
+            let index_action = json!({"index": {"_index": index, "_id": doc.id}});
+            body.push_str(&serde_json::to_string(&index_action).unwrap());
+            body.push('\n');
+            body.push_str(&serde_json::to_string(&doc.content).unwrap());
+            body.push('\n');
+        }
+
+        let url = format!("{}/_bulk", self.base_url);
+        let now = Utc::now();
+        let authorization = self.sign_request("POST", "/_bulk", &body, &now);
+
+        let response = self
+            .client
+            .post(&url)
+            .header("Authorization", authorization)
+            .header("x-amz-date", now.format("%Y%m%dT%H%M%SZ").to_string())
+            .header("Content-Type", "application/x-ndjson")
+            .body(body)
+            .send()
+            .map_err(|e| SearchError::Internal(format!("Bulk request failed: {}", e)))?;
+
+        if !response.status().is_success() {
+            let error_text = response.text().unwrap_or_default();
+            return Err(SearchError::Internal(format!("Bulk indexing failed: {}", error_text)));
+        }
+
+        debug!("Bulk indexed {} documents to {}", docs.len(), index);
+        Ok(())
+    }
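+
+    // Illustrative: the `_bulk` body built above is newline-delimited JSON
+    // (NDJSON), alternating action and document lines, e.g.
+    //   {"index":{"_index":"movies","_id":"1"}}
+    //   {"title":"Carol","year":2015}
+    //   {"index":{"_index":"movies","_id":"2"}}
+    //   {"title":"Heat","year":1995}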
+                                id: get_response.id,
+                                content: serde_json::to_string(&source)
+                                    .map_err(|e| SearchError::Internal(format!("JSON serialization error: {}", e)))?,
+                            }))
+                        } else {
+                            Ok(None)
+                        }
+                    } else {
+                        Ok(None)
+                    }
+                }
+                Err(SearchError::Internal(msg)) if msg.contains("404") => Ok(None),
+                Err(e) => Err(e),
+            }
+        })
+    }
+
+    pub fn search(&self, index: &str, query: Value) -> Result<EsSearchResponse, SearchError> {
+        let path = format!("{}/_search", index);
+
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?;
+
+        rt.block_on(async {
+            let response = self.request(reqwest::Method::POST, &path, Some(query)).await?;
+            let search_response: EsSearchResponse = response.json().await
+                .map_err(|e| SearchError::Internal(format!("Failed to parse search response: {}", e)))?;
+
+            debug!("Search completed in {}ms", search_response.took.unwrap_or(0));
+            Ok(search_response)
+        })
+    }
+
+    pub fn search_with_scroll(&self, index: &str, query: Value) -> Result<EsSearchResponse, SearchError> {
+        let path = format!("{}/_search?scroll=1m", index);
+
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?;
+
+        rt.block_on(async {
+            let response = self.request(reqwest::Method::POST, &path, Some(query)).await?;
+            let search_response: EsSearchResponse = response.json().await
+                .map_err(|e| SearchError::Internal(format!("Failed to parse scroll search response: {}", e)))?;
+
+            debug!("Scroll search initiated");
+            Ok(search_response)
+        })
+    }
+
+    pub fn scroll(&self, scroll_id: &str) -> Result<EsSearchResponse, SearchError> {
+        let body = json!({
+            "scroll": "1m",
+            "scroll_id": scroll_id
+        });
+
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?;
+
+        rt.block_on(async {
+            let response = self.request(reqwest::Method::POST, "_search/scroll", Some(body)).await?;
+            let search_response: EsSearchResponse = response.json().await
+                .map_err(|e| SearchError::Internal(format!("Failed to parse scroll response: {}", e)))?;
+
+            Ok(search_response)
+        })
+    }
+
+    pub fn get_mapping(&self, index: &str) -> Result<Schema, SearchError> {
+        let path = format!("{}/_mapping", index);
+
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?;
+
+        rt.block_on(async {
+            let response = self.request(reqwest::Method::GET, &path, None).await?;
+            let mapping: Value = response.json().await
+                .map_err(|e| SearchError::Internal(format!("Failed to parse mapping response: {}", e)))?;
+
+            crate::conversions::mapping_to_schema(&mapping)
+        })
+    }
+
+    pub fn update_mapping(&self, index: &str, mapping: Value) -> Result<(), SearchError> {
+        let path = format!("{}/_mapping", index);
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?;
+
+        rt.block_on(async {
+            self.request(reqwest::Method::PUT, &path, Some(mapping)).await?;
+            debug!("Updated mapping for index: {}", index);
+            Ok(())
+        })
+    }
+}
+
+fn hmac_sha256(key: &[u8], data: &[u8]) -> Vec<u8> {
+    let mut mac = Hmac::<Sha256>::new_from_slice(key).expect("HMAC can take key of any size");
+    mac.update(data);
+    mac.finalize().into_bytes().to_vec()
+}
diff --git a/search/opensearch/src/conversions.rs b/search/opensearch/src/conversions.rs
new file mode 100644
index 000000000..bc0111890
--- /dev/null
+++ b/search/opensearch/src/conversions.rs
@@ -0,0 +1,298 @@
+use crate::client::{EsDoc, EsHit, EsSearchResponse, EsTotal};
+use golem_search::golem::search::types::{
+    Doc, FieldType, Schema, SchemaField, SearchError, SearchHit, SearchQuery,
+    SearchResults,
+};
+use log::trace;
+use serde_json::{json, Map, Value};
+use std::collections::HashMap;
+
+pub fn query_to_es_query(query: &SearchQuery) -> Result<Value, SearchError> {
+    let mut es_query = json!({});
+    let mut bool_query = json!({});
+    let mut must_clauses = Vec::new();
+    let mut filter_clauses = Vec::new();
+
+    if let Some(q) = &query.q {
+        if !q.trim().is_empty() {
+            must_clauses.push(json!({
+                "multi_match": {
+                    "query": q,
+                    "type": "best_fields",
+                    "fields": ["*"],
+                    "fuzziness": "AUTO"
+                }
+            }));
+        }
+    }
+
+    for filter in &query.filters {
+        if let Ok(filter_value) = serde_json::from_str::<Value>(filter) {
+            filter_clauses.push(filter_value);
+        } else {
+            let parts: Vec<&str> = filter.splitn(3, ':').collect();
+            if parts.len() == 3 {
+                let field = parts[0];
+                let op = parts[1];
+                let value = parts[2];
+
+                let filter_clause = match op {
+                    "eq" => json!({"term": {field: value}}),
+                    "ne" => json!({"bool": {"must_not": {"term": {field: value}}}}),
+                    "gt" => json!({"range": {field: {"gt": value}}}),
+                    "gte" => json!({"range": {field: {"gte": value}}}),
+                    "lt" => json!({"range": {field: {"lt": value}}}),
+                    "lte" => json!({"range": {field: {"lte": value}}}),
+                    "in" => {
+                        let values: Vec<&str> = value.split(',').collect();
+                        json!({"terms": {field: values}})
+                    }
+                    "exists" => json!({"exists": {"field": field}}),
+                    "prefix" => json!({"prefix": {field: value}}),
+                    "wildcard" => json!({"wildcard": {field: value}}),
+                    _ => return Err(SearchError::InvalidQuery(format!("Unknown filter operator: {}", op)))
+                };
+                filter_clauses.push(filter_clause);
+            } else {
+                return Err(SearchError::InvalidQuery(format!("Invalid filter format: {}", filter)));
+            }
+        }
+    }
+
+    if !must_clauses.is_empty() || !filter_clauses.is_empty() {
+        if !must_clauses.is_empty() {
+            bool_query["must"] = json!(must_clauses);
+        }
+        if !filter_clauses.is_empty() {
+            bool_query["filter"] = json!(filter_clauses);
+        }
+        es_query["query"] = json!({"bool": bool_query});
+    } else {
+        es_query["query"] = json!({"match_all": {}});
+    }
+
+    if !query.sort.is_empty() {
+        let mut sort_clauses = Vec::new();
+        for sort_field in &query.sort {
+            if sort_field.starts_with('-') {
+                let field = &sort_field[1..];
+                sort_clauses.push(json!({field: {"order": "desc"}}));
+            } else {
+                sort_clauses.push(json!({sort_field: {"order": "asc"}}));
+            }
+        }
+        es_query["sort"] = json!(sort_clauses);
+    }
+
+    if let Some(highlight) = &query.highlight {
+        let mut highlight_config = json!({
+            "fields": {}
+        });
+
+        for field in &highlight.fields {
+            highlight_config["fields"][field] = json!({});
+        }
+
+        if let Some(pre_tag) = &highlight.pre_tag {
+            highlight_config["pre_tags"] = json!([pre_tag]);
+        }
+
+        if let Some(post_tag) = &highlight.post_tag {
+            highlight_config["post_tags"] = json!([post_tag]);
+        }
+
+        if let Some(max_length) = highlight.max_length {
+            highlight_config["fragment_size"] = json!(max_length);
+        }
+
+        es_query["highlight"] = highlight_config;
+    }
+
+    let size = query.per_page.unwrap_or(10) as usize;
+    es_query["size"] = json!(size);
+
+    if let Some(page) = query.page {
+        let from = ((page - 1) * query.per_page.unwrap_or(10)) as usize;
+        es_query["from"] = json!(from);
+    } else if let Some(offset) = query.offset {
+        es_query["from"] = json!(offset);
+    }
+
+    if !query.facets.is_empty() {
+        let mut aggs = json!({});
+        for facet in &query.facets {
+            aggs[facet] = json!({
+                "terms": {
+                    "field": facet
+                }
+            });
+        }
+        es_query["aggs"] = aggs;
+    }
+
+    if let Some(config) = &query.config {
+        if let Some(timeout_ms) = config.timeout_ms {
+            es_query["timeout"] = json!(format!("{}ms", timeout_ms));
+        }
+
+        if !config.attributes_to_retrieve.is_empty() {
+            es_query["_source"] = json!(config.attributes_to_retrieve);
+        }
+    }
+
+    trace!("Generated OpenSearch query: {}", serde_json::to_string_pretty(&es_query).unwrap_or_default());
+    Ok(es_query)
+}
+
+pub fn es_hit_to_search_hit(hit: EsHit) -> SearchHit {
+    let content = hit.source.map(|s| serde_json::to_string(&s).unwrap_or_default());
+
+    let highlights = if let Some(highlight_map) = hit.highlight {
+        let mut highlights = Map::new();
+        for (field, fragments) in highlight_map {
+            highlights.insert(field, json!(fragments));
+        }
+        Some(serde_json::to_string(&highlights).unwrap_or_default())
+    } else {
+        None
+    };
+
+    SearchHit {
+        id: hit.id,
+        score: hit.score,
+        content,
+        highlights,
+    }
+}
+
+pub fn es_response_to_results(response: EsSearchResponse, query: &SearchQuery) -> SearchResults {
+    let hits = response.hits.hits.into_iter().map(es_hit_to_search_hit).collect();
+    let total = Some(response.hits.total.value() as u32);
+
+    let page = query.page;
+    let per_page = query.per_page;
+    let took_ms = response.took.map(|t| t as u32);
+
+    SearchResults {
+        total,
+        page,
+        per_page,
+        hits,
+        facets: None,
+        took_ms,
+    }
+}
+
+pub fn schema_to_mapping(schema: &Schema) -> Value {
+    let mut properties = json!({});
+
+    for field in &schema.fields {
+        let mut field_mapping = json!({});
+
+        match field.type_ {
+            FieldType::Text => {
+                field_mapping["type"] = json!("text");
+                if field.facet {
+                    field_mapping["fields"] = json!({
+                        "keyword": {
+                            "type": "keyword"
+                        }
+                    });
+                }
+            }
+            FieldType::Keyword => {
+                field_mapping["type"] = json!("keyword");
+            }
+            FieldType::Integer => {
+                field_mapping["type"] = json!("long");
+            }
+            FieldType::Float => {
+                field_mapping["type"] = json!("double");
+            }
+            FieldType::Boolean => {
+                field_mapping["type"] = json!("boolean");
+            }
+            FieldType::Date => {
+                field_mapping["type"] = json!("date");
+            }
+            FieldType::GeoPoint => {
+                field_mapping["type"] = json!("geo_point");
+            }
+        }
+
+        if !field.index {
+            field_mapping["index"] = json!(false);
+        }
+
+        properties[&field.name] = field_mapping;
+    }
+
+    json!({
+        "properties": properties
+    })
+}
+
+pub fn mapping_to_schema(mapping: &Value) -> Result<Schema, SearchError> {
+    let mut fields = Vec::new();
+    let mut primary_key = None;
+
+    if let Some(index_mappings) = mapping.as_object() {
+        for (index_name, index_mapping) in index_mappings {
+            if let Some(mappings) = index_mapping.get("mappings") {
+                if let Some(properties) = mappings.get("properties") {
+                    if let Some(props) = properties.as_object() {
+                        for (field_name, field_def) in props {
+                            if let Some(field_type_str) = field_def.get("type").and_then(|v| v.as_str()) {
+                                let field_type = match field_type_str {
+                                    "text" => FieldType::Text,
+                                    "keyword" => FieldType::Keyword,
+                                    "long" | "integer" | "short" | "byte" => FieldType::Integer,
+                                    "double" | "float" => FieldType::Float,
+                                    "boolean" => FieldType::Boolean,
+                                    "date" => FieldType::Date,
+                                    "geo_point" => FieldType::GeoPoint,
+                                    _ => FieldType::Text,
+                                };
+
+                                let index = field_def.get("index")
+                                    .and_then(|v| v.as_bool())
+                                    .unwrap_or(true);
+
+                                let facet = field_def.get("fields")
+                                    .and_then(|v| v.get("keyword"))
+                                    .is_some() || field_type == FieldType::Keyword;
+
+                                fields.push(SchemaField {
+                                    name: field_name.clone(),
+                                    type_: field_type,
+                                    required: false,
+                                    facet,
+                                    sort: true,
+                                    index,
+                                });
+
+                                if field_name == "_id" || field_name == "id" {
+                                    primary_key = Some(field_name.clone());
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    Ok(Schema {
+        fields,
+        primary_key,
+    })
+}
+
+pub fn doc_to_es_doc(doc: Doc) -> Result<EsDoc, SearchError> {
+    let content: Value = serde_json::from_str(&doc.content)
+        .map_err(|e| SearchError::InvalidQuery(format!("Invalid JSON in document: {}", e)))?;
+
+    Ok(EsDoc {
+        id: doc.id,
+        content,
+    })
+}
diff --git a/search/opensearch/src/lib.rs b/search/opensearch/src/lib.rs
new file mode 100644
index 000000000..3b8f2da72
--- /dev/null
+++ b/search/opensearch/src/lib.rs
@@ -0,0 +1,333 @@
+use crate::client::OpenSearchApi;
+use crate::conversions::{
+    doc_to_es_doc, es_hit_to_search_hit, es_response_to_results, query_to_es_query,
+    schema_to_mapping,
+};
+use golem_search::config::with_config_key;
+use golem_search::durability::{DurableSearch, ExtendedGuest};
+use golem_search::golem::search::core::Guest;
+use golem_search::golem::search::types::{
+    Doc, Schema, SearchError, SearchHit, SearchQuery, SearchResults,
+};
+use golem_search::{SearchStream, SearchStreamState, LOGGING_STATE};
+use log::trace;
+use std::cell::RefCell;
+
+mod client;
+mod conversions;
+
+struct OpenSearchStream {
+    client: OpenSearchApi,
+    index: String,
+    query: SearchQuery,
+    scroll_id: RefCell<Option<String>>,
+    finished: RefCell<bool>,
+    failure: Option<SearchError>,
+}
+
+impl OpenSearchStream {
+    fn new(client: OpenSearchApi, index: String, query: SearchQuery) -> Self {
+        Self {
+            client,
+            index,
+            query,
+            scroll_id: RefCell::new(None),
+            finished: RefCell::new(false),
+            failure: None,
+        }
+    }
+
+    fn failed(error: SearchError) -> Self {
+        Self {
+            client: OpenSearchApi::empty(),
+            index: String::new(),
+            query: SearchQuery {
+                q: None,
+                filters: vec![],
+                sort: vec![],
+                facets: vec![],
+                page: None,
+                per_page: None,
+                offset: None,
+                highlight: None,
+                config: None,
+            },
+            scroll_id: RefCell::new(None),
+            finished: RefCell::new(true),
+            failure: Some(error),
+        }
+    }
+}
+
+impl SearchStreamState for OpenSearchStream {
+    fn failure(&self) -> &Option<SearchError> {
+        &self.failure
+    }
+
+    fn is_finished(&self) -> bool {
+        *self.finished.borrow()
+    }
+
+    fn set_finished(&self) {
+        *self.finished.borrow_mut() = true;
+    }
+
+    fn get_next_hits(&self) -> Result<Vec<SearchHit>, SearchError> {
+        let scroll_id = self.scroll_id.borrow().clone();
+
+        if let Some(scroll_id) = scroll_id {
+            match self.client.scroll(&scroll_id) {
+                Ok(response) => {
+                    if response.hits.hits.is_empty() {
+                        self.set_finished();
+                        Ok(vec![])
+                    } else {
+                        *self.scroll_id.borrow_mut() = response.scroll_id;
+                        Ok(response.hits.hits.into_iter().map(es_hit_to_search_hit).collect())
+                    }
+                }
+                Err(error) => Err(error),
+            }
+        } else {
+            let es_query = query_to_es_query(&self.query)?;
+            match self.client.search_with_scroll(&self.index, es_query) {
+                Ok(response) => {
+                    if response.hits.hits.is_empty() {
+                        self.set_finished();
+                        Ok(vec![])
+                    } else {
+                        *self.scroll_id.borrow_mut() = response.scroll_id;
+                        Ok(response.hits.hits.into_iter().map(es_hit_to_search_hit).collect())
+                    }
+                }
+                Err(error) => Err(error),
+            }
+        }
+    }
+}
+
+struct OpenSearchComponent;
+
+impl OpenSearchComponent {
+    const ENDPOINT_VAR: &'static str = "OPENSEARCH_ENDPOINT";
+    const ACCESS_KEY_VAR: &'static str = "AWS_ACCESS_KEY_ID";
+    const SECRET_KEY_VAR: &'static str = "AWS_SECRET_ACCESS_KEY";
+    const REGION_VAR: &'static str = "AWS_REGION";
+}
+
+impl Guest for OpenSearchComponent {
+    type SearchHitStream = SearchStream<OpenSearchStream>;
+
+    fn create_index(name: String, schema: Option<Schema>) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::ENDPOINT_VAR, Err, |endpoint| {
+            with_config_key(Self::ACCESS_KEY_VAR, Err, |access_key| {
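+                // Note: `with_config_key` (defined in search/search/src/config.rs
+                // later in this diff) reads a single environment variable and either
+                // passes the value to the continuation closure or routes a
+                // SearchError through the error handler (`Err` here), which is why
+                // one nested call appears per required setting: OPENSEARCH_ENDPOINT,
+                // AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_REGION.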
+                with_config_key(Self::SECRET_KEY_VAR, Err, |secret_key| {
+                    with_config_key(Self::REGION_VAR, Err, |region| {
+                        let client = OpenSearchApi::new(endpoint, access_key, secret_key, region);
+
+                        if let Some(schema) = schema {
+                            let mapping = schema_to_mapping(&schema);
+                            client.create_index_with_mapping(&name, mapping)
+                        } else {
+                            client.create_index(&name)
+                        }
+                    })
+                })
+            })
+        })
+    }
+
+    fn delete_index(name: String) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::ENDPOINT_VAR, Err, |endpoint| {
+            with_config_key(Self::ACCESS_KEY_VAR, Err, |access_key| {
+                with_config_key(Self::SECRET_KEY_VAR, Err, |secret_key| {
+                    with_config_key(Self::REGION_VAR, Err, |region| {
+                        let client = OpenSearchApi::new(endpoint, access_key, secret_key, region);
+                        client.delete_index(&name)
+                    })
+                })
+            })
+        })
+    }
+
+    fn list_indexes() -> Result<Vec<String>, SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::ENDPOINT_VAR, Err, |endpoint| {
+            with_config_key(Self::ACCESS_KEY_VAR, Err, |access_key| {
+                with_config_key(Self::SECRET_KEY_VAR, Err, |secret_key| {
+                    with_config_key(Self::REGION_VAR, Err, |region| {
+                        let client = OpenSearchApi::new(endpoint, access_key, secret_key, region);
+                        client.list_indexes()
+                    })
+                })
+            })
+        })
+    }
+
+    fn upsert(index: String, doc: Doc) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::ENDPOINT_VAR, Err, |endpoint| {
+            with_config_key(Self::ACCESS_KEY_VAR, Err, |access_key| {
+                with_config_key(Self::SECRET_KEY_VAR, Err, |secret_key| {
+                    with_config_key(Self::REGION_VAR, Err, |region| {
+                        let client = OpenSearchApi::new(endpoint, access_key, secret_key, region);
+                        let es_doc = doc_to_es_doc(doc)?;
+                        client.index_document(&index, &es_doc.id, es_doc.content)
+                    })
+                })
+            })
+        })
+    }
+
+    fn upsert_many(index: String, docs: Vec<Doc>) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::ENDPOINT_VAR, Err, |endpoint| {
+            with_config_key(Self::ACCESS_KEY_VAR, Err, |access_key| {
+                with_config_key(Self::SECRET_KEY_VAR, Err, |secret_key| {
+                    with_config_key(Self::REGION_VAR, Err, |region| {
+                        let client = OpenSearchApi::new(endpoint, access_key, secret_key, region);
+                        let es_docs: Result<Vec<_>, _> = docs.into_iter().map(doc_to_es_doc).collect();
+                        client.bulk_index(&index, es_docs?)
+                    })
+                })
+            })
+        })
+    }
+
+    fn delete(index: String, id: String) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::ENDPOINT_VAR, Err, |endpoint| {
+            with_config_key(Self::ACCESS_KEY_VAR, Err, |access_key| {
+                with_config_key(Self::SECRET_KEY_VAR, Err, |secret_key| {
+                    with_config_key(Self::REGION_VAR, Err, |region| {
+                        let client = OpenSearchApi::new(endpoint, access_key, secret_key, region);
+                        client.delete_document(&index, &id)
+                    })
+                })
+            })
+        })
+    }
+
+    fn delete_many(index: String, ids: Vec<String>) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::ENDPOINT_VAR, Err, |endpoint| {
+            with_config_key(Self::ACCESS_KEY_VAR, Err, |access_key| {
+                with_config_key(Self::SECRET_KEY_VAR, Err, |secret_key| {
+                    with_config_key(Self::REGION_VAR, Err, |region| {
+                        let client = OpenSearchApi::new(endpoint, access_key, secret_key, region);
+                        client.bulk_delete(&index, ids)
+                    })
+                })
+            })
+        })
+    }
+
+    fn get(index: String, id: String) -> Result<Option<Doc>, SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::ENDPOINT_VAR, Err, |endpoint| {
+            with_config_key(Self::ACCESS_KEY_VAR, Err, |access_key| {
+                with_config_key(Self::SECRET_KEY_VAR, Err, |secret_key| {
+                    with_config_key(Self::REGION_VAR, Err, |region| {
+                        let client = OpenSearchApi::new(endpoint, access_key, secret_key, region);
+                        client.get_document(&index, &id)
+                    })
+                })
+            })
+        })
+    }
+
+    fn search(index: String, query: SearchQuery) -> Result<SearchResults, SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::ENDPOINT_VAR, Err, |endpoint| {
+            with_config_key(Self::ACCESS_KEY_VAR, Err, |access_key| {
+                with_config_key(Self::SECRET_KEY_VAR, Err, |secret_key| {
+                    with_config_key(Self::REGION_VAR, Err, |region| {
+                        let client = OpenSearchApi::new(endpoint, access_key, secret_key, region);
+                        let es_query = query_to_es_query(&query)?;
+                        trace!("Executing search query: {:?}", es_query);
+
+                        match client.search(&index, es_query) {
+                            Ok(response) => Ok(es_response_to_results(response, &query)),
+                            Err(error) => Err(error),
+                        }
+                    })
+                })
+            })
+        })
+    }
+
+    fn stream_search(
+        index: String,
+        query: SearchQuery,
+    ) -> Result<Self::SearchHitStream, SearchError> {
+        Self::unwrapped_stream_search(index, query)
+    }
+
+    fn get_schema(index: String) -> Result<Schema, SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::ENDPOINT_VAR, Err, |endpoint| {
+            with_config_key(Self::ACCESS_KEY_VAR, Err, |access_key| {
+                with_config_key(Self::SECRET_KEY_VAR, Err, |secret_key| {
+                    with_config_key(Self::REGION_VAR, Err, |region| {
+                        let client = OpenSearchApi::new(endpoint, access_key, secret_key, region);
+                        client.get_mapping(&index)
+                    })
+                })
+            })
+        })
+    }
+
+    fn update_schema(index: String, schema: Schema) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::ENDPOINT_VAR, Err, |endpoint| {
+            with_config_key(Self::ACCESS_KEY_VAR, Err, |access_key| {
+                with_config_key(Self::SECRET_KEY_VAR, Err, |secret_key| {
+                    with_config_key(Self::REGION_VAR, Err, |region| {
+                        let client = OpenSearchApi::new(endpoint, access_key, secret_key, region);
+                        let mapping = schema_to_mapping(&schema);
+                        client.update_mapping(&index, mapping)
+                    })
+                })
+            })
+        })
+    }
+}
+
+impl ExtendedGuest for OpenSearchComponent {
+    type SearchHitStream = SearchStream<OpenSearchStream>;
+
+    fn unwrapped_stream_search(
+        index: String,
+        query: SearchQuery,
+    ) -> Result<Self::SearchHitStream, SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
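+        // The `|error| Ok(SearchStream::new(OpenSearchStream::failed(error)))`
+        // fallbacks below mean a missing environment variable never fails stream
+        // construction: the caller receives a stream that is already marked
+        // finished and simply yields no pages (see SearchStream::get_next in
+        // search/search/src/lib.rs).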
+        with_config_key(Self::ENDPOINT_VAR, |error| Ok(SearchStream::new(OpenSearchStream::failed(error))), |endpoint| {
+            with_config_key(Self::ACCESS_KEY_VAR, |error| Ok(SearchStream::new(OpenSearchStream::failed(error))), |access_key| {
+                with_config_key(Self::SECRET_KEY_VAR, |error| Ok(SearchStream::new(OpenSearchStream::failed(error))), |secret_key| {
+                    with_config_key(Self::REGION_VAR, |error| Ok(SearchStream::new(OpenSearchStream::failed(error))), |region| {
+                        let client = OpenSearchApi::new(endpoint, access_key, secret_key, region);
+                        Ok(SearchStream::new(OpenSearchStream::new(client, index, query)))
+                    })
+                })
+            })
+        })
+    }
+}
+
+type DurableOpenSearchComponent = DurableSearch<OpenSearchComponent>;
+
+golem_search::export_search!(DurableOpenSearchComponent with_types_in golem_search);
diff --git a/search/opensearch/wit/deps/golem-search/golem-search.wit b/search/opensearch/wit/deps/golem-search/golem-search.wit
new file mode 100644
index 000000000..64c822dfa
--- /dev/null
+++ b/search/opensearch/wit/deps/golem-search/golem-search.wit
@@ -0,0 +1,134 @@
+package golem:search@1.0.0;
+
+/// Core types and error handling for universal search interfaces
+interface types {
+  /// Common structured errors for search operations
+  variant search-error {
+    index-not-found,
+    invalid-query(string),
+    unsupported,
+    internal(string),
+    timeout,
+    rate-limited,
+  }
+
+  /// Identifier types
+  type index-name = string;
+  type document-id = string;
+  type json = string;
+
+  /// Document payload
+  record doc {
+    id: document-id,
+    content: json,
+  }
+
+  /// Highlight configuration
+  record highlight-config {
+    fields: list<string>,
+    pre-tag: option<string>,
+    post-tag: option<string>,
+    max-length: option<u32>,
+  }
+
+  /// Advanced search tuning
+  record search-config {
+    timeout-ms: option<u32>,
+    boost-fields: list<tuple<string, f32>>,
+    attributes-to-retrieve: list<string>,
+    language: option<string>,
+    typo-tolerance: option<bool>,
+    exact-match-boost: option<f32>,
+    provider-params: option<json>,
+  }
+
+  /// Search request
+  record search-query {
+    q: option<string>,
+    filters: list<string>,
+    sort: list<string>,
+    facets: list<string>,
+    page: option<u32>,
+    per-page: option<u32>,
+    offset: option<u32>,
+    highlight: option<highlight-config>,
+    config: option<search-config>,
+  }
+
+  /// Search hit
+  record search-hit {
+    id: document-id,
+    score: option<f64>,
+    content: option<json>,
+    highlights: option<json>,
+  }
+
+  /// Search result set
+  record search-results {
+    total: option<u32>,
+    page: option<u32>,
+    per-page: option<u32>,
+    hits: list<search-hit>,
+    facets: option<json>,
+    took-ms: option<u32>,
+  }
+
+  /// Field schema types
+  enum field-type {
+    text,
+    keyword,
+    integer,
+    float,
+    boolean,
+    date,
+    geo-point,
+  }
+
+  /// Field definition
+  record schema-field {
+    name: string,
+    type: field-type,
+    required: bool,
+    facet: bool,
+    sort: bool,
+    index: bool,
+  }
+
+  /// Index schema
+  record schema {
+    fields: list<schema-field>,
+    primary-key: option<string>,
+  }
+}
+
+/// Unified search interface
+interface core {
+  use types.{
+    index-name, document-id, doc, search-query, search-results,
+    search-hit, schema, search-error
+  };
+
+  // Streamed hits, returned page by page
+  resource search-hit-stream {
+    get-next: func() -> option<list<search-hit>>;
+    blocking-get-next: func() -> list<search-hit>;
+  }
+
+  // Index lifecycle
+  create-index: func(name: index-name, schema: option<schema>) -> result<_, search-error>;
+  delete-index: func(name: index-name) -> result<_, search-error>;
+  list-indexes: func() -> result<list<index-name>, search-error>;
+
+  // Document operations
+  upsert: func(index: index-name, doc: doc) -> result<_, search-error>;
+  upsert-many: func(index: index-name, docs: list<doc>) -> result<_, search-error>;
+  delete: func(index: index-name, id: document-id) -> result<_, search-error>;
+  delete-many: func(index: index-name, ids: list<document-id>) -> result<_, search-error>;
+  get: func(index: index-name, id: document-id) -> result<option<doc>, search-error>;
+
+  // Query
+  search: func(index: index-name, query: search-query) -> result<search-results, search-error>;
+  stream-search: func(index: index-name, query: search-query) -> result<search-hit-stream, search-error>;
+
+  // Schema inspection
+  get-schema: func(index: index-name) -> result<schema, search-error>;
+  update-schema: func(index: index-name, schema: schema) -> result<_, search-error>;
+}
+
+world search-library {
+  export core;
+}
diff --git a/search/opensearch/wit/deps/wasi:io/error.wit b/search/opensearch/wit/deps/wasi:io/error.wit
new file mode 100644
index 000000000..97c606877
--- /dev/null
+++ b/search/opensearch/wit/deps/wasi:io/error.wit
@@ -0,0 +1,34 @@
+package wasi:io@0.2.3;
+
+@since(version = 0.2.0)
+interface error {
+  /// A resource which represents some error information.
+  ///
+  /// The only method provided by this resource is `to-debug-string`,
+  /// which provides some human-readable information about the error.
+  ///
+  /// In the `wasi:io` package, this resource is returned through the
+  /// `wasi:io/streams/stream-error` type.
+  ///
+  /// To provide more specific error information, other interfaces may
+  /// offer functions to "downcast" this error into more specific types. For example,
+  /// errors returned from streams derived from filesystem types can be described using
+  /// the filesystem's own error-code type. This is done using the function
+  /// `wasi:filesystem/types/filesystem-error-code`, which takes a `borrow<error>`
+  /// parameter and returns an `option<filesystem-error-code>`.
+  ///
+  /// The set of functions which can "downcast" an `error` into a more
+  /// concrete type is open.
+  @since(version = 0.2.0)
+  resource error {
+    /// Returns a string that is suitable to assist humans in debugging
+    /// this error.
+    ///
+    /// WARNING: The returned string should not be consumed mechanically!
+    /// It may change across platforms, hosts, or other implementation
+    /// details. Parsing this string is a major platform-compatibility
+    /// hazard.
+    @since(version = 0.2.0)
+    to-debug-string: func() -> string;
+  }
+}
diff --git a/search/opensearch/wit/deps/wasi:io/poll.wit b/search/opensearch/wit/deps/wasi:io/poll.wit
new file mode 100644
index 000000000..9bcbe8e03
--- /dev/null
+++ b/search/opensearch/wit/deps/wasi:io/poll.wit
@@ -0,0 +1,47 @@
+package wasi:io@0.2.3;
+
+/// A poll API intended to let users wait for I/O events on multiple handles
+/// at once.
+@since(version = 0.2.0)
+interface poll {
+  /// `pollable` represents a single I/O event which may be ready, or not.
+  @since(version = 0.2.0)
+  resource pollable {
+
+    /// Return the readiness of a pollable. This function never blocks.
+    ///
+    /// Returns `true` when the pollable is ready, and `false` otherwise.
+    @since(version = 0.2.0)
+    ready: func() -> bool;
+
+    /// `block` returns immediately if the pollable is ready, and otherwise
+    /// blocks until ready.
+    ///
+    /// This function is equivalent to calling `poll.poll` on a list
+    /// containing only this pollable.
+    @since(version = 0.2.0)
+    block: func();
+  }
+
+  /// Poll for completion on a set of pollables.
+  ///
+  /// This function takes a list of pollables, which identify I/O sources of
+  /// interest, and waits until one or more of the events is ready for I/O.
+  ///
+  /// The result `list<u32>` contains one or more indices of handles in the
+  /// argument list that is ready for I/O.
+  ///
+  /// This function traps if either:
+  /// - the list is empty, or:
+  /// - the list contains more elements than can be indexed with a `u32` value.
+ /// + /// A timeout can be implemented by adding a pollable from the + /// wasi-clocks API to the list. + /// + /// This function does not return a `result`; polling in itself does not + /// do any I/O so it doesn't fail. If any of the I/O sources identified by + /// the pollables has an error, it is indicated by marking the source as + /// being ready for I/O. + @since(version = 0.2.0) + poll: func(in: list>) -> list; +} diff --git a/search/opensearch/wit/deps/wasi:io/streams.wit b/search/opensearch/wit/deps/wasi:io/streams.wit new file mode 100644 index 000000000..0de084629 --- /dev/null +++ b/search/opensearch/wit/deps/wasi:io/streams.wit @@ -0,0 +1,290 @@ +package wasi:io@0.2.3; + +/// WASI I/O is an I/O abstraction API which is currently focused on providing +/// stream types. +/// +/// In the future, the component model is expected to add built-in stream types; +/// when it does, they are expected to subsume this API. +@since(version = 0.2.0) +interface streams { + @since(version = 0.2.0) + use error.{error}; + @since(version = 0.2.0) + use poll.{pollable}; + + /// An error for input-stream and output-stream operations. + @since(version = 0.2.0) + variant stream-error { + /// The last operation (a write or flush) failed before completion. + /// + /// More information is available in the `error` payload. + /// + /// After this, the stream will be closed. All future operations return + /// `stream-error::closed`. + last-operation-failed(error), + /// The stream is closed: no more input will be accepted by the + /// stream. A closed output-stream will return this error on all + /// future operations. + closed + } + + /// An input bytestream. + /// + /// `input-stream`s are *non-blocking* to the extent practical on underlying + /// platforms. I/O operations always return promptly; if fewer bytes are + /// promptly available than requested, they return the number of bytes promptly + /// available, which could even be zero. To wait for data to be available, + /// use the `subscribe` function to obtain a `pollable` which can be polled + /// for using `wasi:io/poll`. + @since(version = 0.2.0) + resource input-stream { + /// Perform a non-blocking read from the stream. + /// + /// When the source of a `read` is binary data, the bytes from the source + /// are returned verbatim. When the source of a `read` is known to the + /// implementation to be text, bytes containing the UTF-8 encoding of the + /// text are returned. + /// + /// This function returns a list of bytes containing the read data, + /// when successful. The returned list will contain up to `len` bytes; + /// it may return fewer than requested, but not more. The list is + /// empty when no bytes are available for reading at this time. The + /// pollable given by `subscribe` will be ready when more bytes are + /// available. + /// + /// This function fails with a `stream-error` when the operation + /// encounters an error, giving `last-operation-failed`, or when the + /// stream is closed, giving `closed`. + /// + /// When the caller gives a `len` of 0, it represents a request to + /// read 0 bytes. If the stream is still open, this call should + /// succeed and return an empty list, or otherwise fail with `closed`. + /// + /// The `len` parameter is a `u64`, which could represent a list of u8 which + /// is not possible to allocate in wasm32, or not desirable to allocate as + /// as a return value by the callee. The callee may return a list of bytes + /// less than `len` in size while more bytes are available for reading. 
+ @since(version = 0.2.0) + read: func( + /// The maximum number of bytes to read + len: u64 + ) -> result, stream-error>; + + /// Read bytes from a stream, after blocking until at least one byte can + /// be read. Except for blocking, behavior is identical to `read`. + @since(version = 0.2.0) + blocking-read: func( + /// The maximum number of bytes to read + len: u64 + ) -> result, stream-error>; + + /// Skip bytes from a stream. Returns number of bytes skipped. + /// + /// Behaves identical to `read`, except instead of returning a list + /// of bytes, returns the number of bytes consumed from the stream. + @since(version = 0.2.0) + skip: func( + /// The maximum number of bytes to skip. + len: u64, + ) -> result; + + /// Skip bytes from a stream, after blocking until at least one byte + /// can be skipped. Except for blocking behavior, identical to `skip`. + @since(version = 0.2.0) + blocking-skip: func( + /// The maximum number of bytes to skip. + len: u64, + ) -> result; + + /// Create a `pollable` which will resolve once either the specified stream + /// has bytes available to read or the other end of the stream has been + /// closed. + /// The created `pollable` is a child resource of the `input-stream`. + /// Implementations may trap if the `input-stream` is dropped before + /// all derived `pollable`s created with this function are dropped. + @since(version = 0.2.0) + subscribe: func() -> pollable; + } + + + /// An output bytestream. + /// + /// `output-stream`s are *non-blocking* to the extent practical on + /// underlying platforms. Except where specified otherwise, I/O operations also + /// always return promptly, after the number of bytes that can be written + /// promptly, which could even be zero. To wait for the stream to be ready to + /// accept data, the `subscribe` function to obtain a `pollable` which can be + /// polled for using `wasi:io/poll`. + /// + /// Dropping an `output-stream` while there's still an active write in + /// progress may result in the data being lost. Before dropping the stream, + /// be sure to fully flush your writes. + @since(version = 0.2.0) + resource output-stream { + /// Check readiness for writing. This function never blocks. + /// + /// Returns the number of bytes permitted for the next call to `write`, + /// or an error. Calling `write` with more bytes than this function has + /// permitted will trap. + /// + /// When this function returns 0 bytes, the `subscribe` pollable will + /// become ready when this function will report at least 1 byte, or an + /// error. + @since(version = 0.2.0) + check-write: func() -> result; + + /// Perform a write. This function never blocks. + /// + /// When the destination of a `write` is binary data, the bytes from + /// `contents` are written verbatim. When the destination of a `write` is + /// known to the implementation to be text, the bytes of `contents` are + /// transcoded from UTF-8 into the encoding of the destination and then + /// written. + /// + /// Precondition: check-write gave permit of Ok(n) and contents has a + /// length of less than or equal to n. Otherwise, this function will trap. + /// + /// returns Err(closed) without writing if the stream has closed since + /// the last call to check-write provided a permit. + @since(version = 0.2.0) + write: func( + contents: list + ) -> result<_, stream-error>; + + /// Perform a write of up to 4096 bytes, and then flush the stream. Block + /// until all of these operations are complete, or an error occurs. 
+ /// + /// This is a convenience wrapper around the use of `check-write`, + /// `subscribe`, `write`, and `flush`, and is implemented with the + /// following pseudo-code: + /// + /// ```text + /// let pollable = this.subscribe(); + /// while !contents.is_empty() { + /// // Wait for the stream to become writable + /// pollable.block(); + /// let Ok(n) = this.check-write(); // eliding error handling + /// let len = min(n, contents.len()); + /// let (chunk, rest) = contents.split_at(len); + /// this.write(chunk ); // eliding error handling + /// contents = rest; + /// } + /// this.flush(); + /// // Wait for completion of `flush` + /// pollable.block(); + /// // Check for any errors that arose during `flush` + /// let _ = this.check-write(); // eliding error handling + /// ``` + @since(version = 0.2.0) + blocking-write-and-flush: func( + contents: list + ) -> result<_, stream-error>; + + /// Request to flush buffered output. This function never blocks. + /// + /// This tells the output-stream that the caller intends any buffered + /// output to be flushed. the output which is expected to be flushed + /// is all that has been passed to `write` prior to this call. + /// + /// Upon calling this function, the `output-stream` will not accept any + /// writes (`check-write` will return `ok(0)`) until the flush has + /// completed. The `subscribe` pollable will become ready when the + /// flush has completed and the stream can accept more writes. + @since(version = 0.2.0) + flush: func() -> result<_, stream-error>; + + /// Request to flush buffered output, and block until flush completes + /// and stream is ready for writing again. + @since(version = 0.2.0) + blocking-flush: func() -> result<_, stream-error>; + + /// Create a `pollable` which will resolve once the output-stream + /// is ready for more writing, or an error has occurred. When this + /// pollable is ready, `check-write` will return `ok(n)` with n>0, or an + /// error. + /// + /// If the stream is closed, this pollable is always ready immediately. + /// + /// The created `pollable` is a child resource of the `output-stream`. + /// Implementations may trap if the `output-stream` is dropped before + /// all derived `pollable`s created with this function are dropped. + @since(version = 0.2.0) + subscribe: func() -> pollable; + + /// Write zeroes to a stream. + /// + /// This should be used precisely like `write` with the exact same + /// preconditions (must use check-write first), but instead of + /// passing a list of bytes, you simply pass the number of zero-bytes + /// that should be written. + @since(version = 0.2.0) + write-zeroes: func( + /// The number of zero-bytes to write + len: u64 + ) -> result<_, stream-error>; + + /// Perform a write of up to 4096 zeroes, and then flush the stream. + /// Block until all of these operations are complete, or an error + /// occurs. 
+ /// + /// This is a convenience wrapper around the use of `check-write`, + /// `subscribe`, `write-zeroes`, and `flush`, and is implemented with + /// the following pseudo-code: + /// + /// ```text + /// let pollable = this.subscribe(); + /// while num_zeroes != 0 { + /// // Wait for the stream to become writable + /// pollable.block(); + /// let Ok(n) = this.check-write(); // eliding error handling + /// let len = min(n, num_zeroes); + /// this.write-zeroes(len); // eliding error handling + /// num_zeroes -= len; + /// } + /// this.flush(); + /// // Wait for completion of `flush` + /// pollable.block(); + /// // Check for any errors that arose during `flush` + /// let _ = this.check-write(); // eliding error handling + /// ``` + @since(version = 0.2.0) + blocking-write-zeroes-and-flush: func( + /// The number of zero-bytes to write + len: u64 + ) -> result<_, stream-error>; + + /// Read from one stream and write to another. + /// + /// The behavior of splice is equivalent to: + /// 1. calling `check-write` on the `output-stream` + /// 2. calling `read` on the `input-stream` with the smaller of the + /// `check-write` permitted length and the `len` provided to `splice` + /// 3. calling `write` on the `output-stream` with that read data. + /// + /// Any error reported by the call to `check-write`, `read`, or + /// `write` ends the splice and reports that error. + /// + /// This function returns the number of bytes transferred; it may be less + /// than `len`. + @since(version = 0.2.0) + splice: func( + /// The stream to read from + src: borrow, + /// The number of bytes to splice + len: u64, + ) -> result; + + /// Read from one stream and write to another, with blocking. + /// + /// This is similar to `splice`, except that it blocks until the + /// `output-stream` is ready for writing, and the `input-stream` + /// is ready for reading, before performing the `splice`. 
+ @since(version = 0.2.0) + blocking-splice: func( + /// The stream to read from + src: borrow, + /// The number of bytes to splice + len: u64, + ) -> result; + } +} diff --git a/search/opensearch/wit/deps/wasi:io/world.wit b/search/opensearch/wit/deps/wasi:io/world.wit new file mode 100644 index 000000000..f1d2102dc --- /dev/null +++ b/search/opensearch/wit/deps/wasi:io/world.wit @@ -0,0 +1,10 @@ +package wasi:io@0.2.3; + +@since(version = 0.2.0) +world imports { + @since(version = 0.2.0) + import streams; + + @since(version = 0.2.0) + import poll; +} diff --git a/search/opensearch/wit/opensearch.wit b/search/opensearch/wit/opensearch.wit new file mode 100644 index 000000000..a02f094e4 --- /dev/null +++ b/search/opensearch/wit/opensearch.wit @@ -0,0 +1,6 @@ +package golem:search-opensearch; + +world opensearch-provider { + import golem:search/core@1.0.0; + export golem:search/core@1.0.0; +} diff --git a/search/search/Cargo.toml b/search/search/Cargo.toml new file mode 100644 index 000000000..9dc3445b8 --- /dev/null +++ b/search/search/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "golem-search" +version = "0.0.0" +edition = "2021" +license = "Apache-2.0" +homepage = "https://golem.cloud" +repository = "https://github.com/golemcloud/golem-llm" +description = "WebAssembly components for working with search APIs, with special support for Golem Cloud" + +[lib] +path = "src/lib.rs" +crate-type = ["rlib"] + +[dependencies] +golem-rust = { workspace = true } +log = { workspace = true } +reqwest = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +thiserror = "2.0.12" +wasi-logger = "0.1.2" +wit-bindgen = { version = "0.40.0" } + +[features] +default = ["durability"] +durability = ["golem-rust/durability"] diff --git a/search/search/src/config.rs b/search/search/src/config.rs new file mode 100644 index 000000000..c3eaa47f6 --- /dev/null +++ b/search/search/src/config.rs @@ -0,0 +1,34 @@ +use crate::golem::search::types::SearchError; +use std::env; + +pub fn with_config_key(key: &str, error_fn: F, func: impl FnOnce(String) -> R) -> R +where + F: FnOnce(SearchError) -> T, + R: From, +{ + match env::var(key) { + Ok(value) if !value.is_empty() => func(value), + _ => { + let error = SearchError::Internal(format!("Missing env var: {}", key)); + R::from(error_fn(error)) + } + } +} + +pub fn get_endpoint() -> Option { + env::var("SEARCH_PROVIDER_ENDPOINT").ok() +} + +pub fn get_timeout_ms() -> u32 { + env::var("SEARCH_PROVIDER_TIMEOUT") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(30000) +} + +pub fn get_max_retries() -> u32 { + env::var("SEARCH_PROVIDER_MAX_RETRIES") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(3) +} diff --git a/search/search/src/durability.rs b/search/search/src/durability.rs new file mode 100644 index 000000000..b7ce20975 --- /dev/null +++ b/search/search/src/durability.rs @@ -0,0 +1,197 @@ +#[cfg(feature = "durability")] +use golem_rust::*; + +#[cfg(feature = "durability")] +use crate::golem::search::core::Guest; +use crate::golem::search::types::{Doc, Schema, SearchError, SearchQuery, SearchResults}; + +pub trait ExtendedGuest: Guest { + type SearchHitStream; + fn unwrapped_stream_search( + index: String, + query: SearchQuery, + ) -> Result; +} + +#[cfg(feature = "durability")] +pub struct DurableSearch { + _phantom: std::marker::PhantomData, +} + +#[cfg(feature = "durability")] +impl Guest for DurableSearch { + type SearchHitStream = T::SearchHitStream; + + fn create_index(name: String, schema: Option) -> Result<(), SearchError> 
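+    // With the "durability" feature enabled, each operation below is routed
+    // through golem-rust's durable host-function wrapper, so completed search
+    // calls can be persisted and replayed by Golem rather than re-executed on
+    // recovery; the cfg(not(feature = "durability")) impl further down forwards
+    // straight to the underlying provider implementation instead.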
{ + durable_host_function!( + "search-create-index", + |name: String, schema: Option| -> Result<(), SearchError> { + T::create_index(name, schema) + }, + name, + schema + ) + } + + fn delete_index(name: String) -> Result<(), SearchError> { + durable_host_function!( + "search-delete-index", + |name: String| -> Result<(), SearchError> { T::delete_index(name) }, + name + ) + } + + fn list_indexes() -> Result, SearchError> { + durable_host_function!( + "search-list-indexes", + || -> Result, SearchError> { T::list_indexes() } + ) + } + + fn upsert(index: String, doc: Doc) -> Result<(), SearchError> { + durable_host_function!( + "search-upsert", + |index: String, doc: Doc| -> Result<(), SearchError> { T::upsert(index, doc) }, + index, + doc + ) + } + + fn upsert_many(index: String, docs: Vec) -> Result<(), SearchError> { + durable_host_function!( + "search-upsert-many", + |index: String, docs: Vec| -> Result<(), SearchError> { + T::upsert_many(index, docs) + }, + index, + docs + ) + } + + fn delete(index: String, id: String) -> Result<(), SearchError> { + durable_host_function!( + "search-delete", + |index: String, id: String| -> Result<(), SearchError> { T::delete(index, id) }, + index, + id + ) + } + + fn delete_many(index: String, ids: Vec) -> Result<(), SearchError> { + durable_host_function!( + "search-delete-many", + |index: String, ids: Vec| -> Result<(), SearchError> { + T::delete_many(index, ids) + }, + index, + ids + ) + } + + fn get(index: String, id: String) -> Result, SearchError> { + durable_host_function!( + "search-get", + |index: String, id: String| -> Result, SearchError> { T::get(index, id) }, + index, + id + ) + } + + fn search(index: String, query: SearchQuery) -> Result { + durable_host_function!( + "search-query", + |index: String, query: SearchQuery| -> Result { + T::search(index, query) + }, + index, + query + ) + } + + fn stream_search( + index: String, + query: SearchQuery, + ) -> Result { + T::unwrapped_stream_search(index, query) + } + + fn get_schema(index: String) -> Result { + durable_host_function!( + "search-get-schema", + |index: String| -> Result { T::get_schema(index) }, + index + ) + } + + fn update_schema(index: String, schema: Schema) -> Result<(), SearchError> { + durable_host_function!( + "search-update-schema", + |index: String, schema: Schema| -> Result<(), SearchError> { + T::update_schema(index, schema) + }, + index, + schema + ) + } +} + +#[cfg(not(feature = "durability"))] +pub struct DurableSearch { + _phantom: std::marker::PhantomData, +} + +#[cfg(not(feature = "durability"))] +impl Guest for DurableSearch { + type SearchHitStream = T::SearchHitStream; + + fn create_index(name: String, schema: Option) -> Result<(), SearchError> { + T::create_index(name, schema) + } + + fn delete_index(name: String) -> Result<(), SearchError> { + T::delete_index(name) + } + + fn list_indexes() -> Result, SearchError> { + T::list_indexes() + } + + fn upsert(index: String, doc: Doc) -> Result<(), SearchError> { + T::upsert(index, doc) + } + + fn upsert_many(index: String, docs: Vec) -> Result<(), SearchError> { + T::upsert_many(index, docs) + } + + fn delete(index: String, id: String) -> Result<(), SearchError> { + T::delete(index, id) + } + + fn delete_many(index: String, ids: Vec) -> Result<(), SearchError> { + T::delete_many(index, ids) + } + + fn get(index: String, id: String) -> Result, SearchError> { + T::get(index, id) + } + + fn search(index: String, query: SearchQuery) -> Result { + T::search(index, query) + } + + fn stream_search( + index: String, + query: 
SearchQuery, + ) -> Result { + T::unwrapped_stream_search(index, query) + } + + fn get_schema(index: String) -> Result { + T::get_schema(index) + } + + fn update_schema(index: String, schema: Schema) -> Result<(), SearchError> { + T::update_schema(index, schema) + } +} diff --git a/search/search/src/error.rs b/search/search/src/error.rs new file mode 100644 index 000000000..be06d1e6e --- /dev/null +++ b/search/search/src/error.rs @@ -0,0 +1,19 @@ +use crate::golem::search::types::SearchError; + +pub fn parse_http_error(status: u16, body: &str) -> SearchError { + match status { + 400 => SearchError::InvalidQuery(body.to_string()), + 404 => SearchError::IndexNotFound, + 429 => SearchError::RateLimited, + 500..=599 => SearchError::Internal(format!("Server error: {}", body)), + _ => SearchError::Internal(format!("HTTP error {}: {}", status, body)), + } +} + +pub fn network_error(error: &str) -> SearchError { + if error.contains("timeout") { + SearchError::Timeout + } else { + SearchError::Internal(format!("Network error: {}", error)) + } +} diff --git a/search/search/src/lib.rs b/search/search/src/lib.rs new file mode 100644 index 000000000..9cd2f599f --- /dev/null +++ b/search/search/src/lib.rs @@ -0,0 +1,182 @@ +use std::cell::RefCell; + +pub mod config; +pub mod durability; +pub mod error; + +wit_bindgen::generate!({ + path: "../wit", + with: {} +}); + +use crate::golem::search::types::{SearchError, SearchHit}; + +thread_local! { + pub static LOGGING_STATE: RefCell = RefCell::new(LoggingState::new()); +} + +struct LoggingState { + initialized: bool, +} + +impl LoggingState { + fn new() -> Self { + Self { initialized: false } + } + + pub fn init(&mut self) { + if !self.initialized { + wasi_logger::Logger::install().unwrap(); + log::set_max_level( + std::env::var("GOLEM_SEARCH_LOG") + .unwrap_or_else(|_| "warn".to_string()) + .parse() + .unwrap_or(log::LevelFilter::Warn), + ); + self.initialized = true; + } + } +} + +pub struct SearchStream { + inner: T, +} + +impl SearchStream { + pub fn new(inner: T) -> Self { + Self { inner } + } +} + +pub trait SearchStreamState { + fn failure(&self) -> &Option; + fn is_finished(&self) -> bool; + fn set_finished(&self); + fn get_next_hits(&self) -> Result, SearchError>; +} + +impl SearchStream { + pub fn get_next(&self) -> Option> { + if self.inner.is_finished() { + return None; + } + + if let Some(error) = self.inner.failure() { + self.inner.set_finished(); + return None; + } + + match self.inner.get_next_hits() { + Ok(hits) => { + if hits.is_empty() { + self.inner.set_finished(); + None + } else { + Some(hits) + } + } + Err(_) => { + self.inner.set_finished(); + None + } + } + } + + pub fn blocking_get_next(&self) -> Vec { + self.get_next().unwrap_or_default() + } +} + +#[macro_export] +macro_rules! 
export_search { + ($component:ty with_types_in $types:path) => { + use $types::exports::golem::search::core::Guest; + + struct Component; + + impl Guest for Component { + type SearchHitStream = <$component as Guest>::SearchHitStream; + + fn create_index( + name: String, + schema: Option<$types::golem::search::types::Schema>, + ) -> Result<(), $types::golem::search::types::SearchError> { + <$component as Guest>::create_index(name, schema) + } + + fn delete_index( + name: String, + ) -> Result<(), $types::golem::search::types::SearchError> { + <$component as Guest>::delete_index(name) + } + + fn list_indexes() -> Result, $types::golem::search::types::SearchError> { + <$component as Guest>::list_indexes() + } + + fn upsert( + index: String, + doc: $types::golem::search::types::Doc, + ) -> Result<(), $types::golem::search::types::SearchError> { + <$component as Guest>::upsert(index, doc) + } + + fn upsert_many( + index: String, + docs: Vec<$types::golem::search::types::Doc>, + ) -> Result<(), $types::golem::search::types::SearchError> { + <$component as Guest>::upsert_many(index, docs) + } + + fn delete( + index: String, + id: String, + ) -> Result<(), $types::golem::search::types::SearchError> { + <$component as Guest>::delete(index, id) + } + + fn delete_many( + index: String, + ids: Vec, + ) -> Result<(), $types::golem::search::types::SearchError> { + <$component as Guest>::delete_many(index, ids) + } + + fn get( + index: String, + id: String, + ) -> Result, $types::golem::search::types::SearchError> { + <$component as Guest>::get(index, id) + } + + fn search( + index: String, + query: $types::golem::search::types::SearchQuery, + ) -> Result<$types::golem::search::types::SearchResults, $types::golem::search::types::SearchError> { + <$component as Guest>::search(index, query) + } + + fn stream_search( + index: String, + query: $types::golem::search::types::SearchQuery, + ) -> Result { + <$component as Guest>::stream_search(index, query) + } + + fn get_schema( + index: String, + ) -> Result<$types::golem::search::types::Schema, $types::golem::search::types::SearchError> { + <$component as Guest>::get_schema(index) + } + + fn update_schema( + index: String, + schema: $types::golem::search::types::Schema, + ) -> Result<(), $types::golem::search::types::SearchError> { + <$component as Guest>::update_schema(index, schema) + } + } + + $types::export!(Component with_types_in $types); + }; +} diff --git a/search/typesense/Cargo.toml b/search/typesense/Cargo.toml new file mode 100644 index 000000000..d9207ab20 --- /dev/null +++ b/search/typesense/Cargo.toml @@ -0,0 +1,42 @@ +[package] +name = "golem-search-typesense" +version = "0.0.0" +edition = "2021" +license = "Apache-2.0" +homepage = "https://golem.cloud" +repository = "https://github.com/golemcloud/golem-llm" +description = "WebAssembly component for working with Typesense APIs, with special support for Golem Cloud" + +[lib] +path = "src/lib.rs" +crate-type = ["cdylib"] + +[features] +default = ["durability"] +durability = ["golem-rust/durability", "golem-search/durability"] + +[dependencies] +golem-search = { path = "../search" } +golem-rust = "1.6.0" +log = "0.4.27" +reqwest = { git = "https://github.com/golemcloud/reqwest", branch = "update-may-2025", features = ["json", "blocking"] } +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +wit-bindgen-rt = { version = "0.40.0", features = ["bitflags"] } +urlencoding = "2.1" + +[package.metadata.component] +package = "golem:search-typesense" + 
+[package.metadata.component.bindings] +generate_unused_types = true + +[package.metadata.component.bindings.with] +"golem:search/core@1.0.0" = "golem_search::golem::search::core" + +[package.metadata.component.target] +path = "wit" + +[package.metadata.component.target.dependencies] +"golem:search" = { path = "wit/deps/golem-search" } +"wasi:io" = { path = "wit/deps/wasi:io" } diff --git a/search/typesense/src/client.rs b/search/typesense/src/client.rs new file mode 100644 index 000000000..e95e8d902 --- /dev/null +++ b/search/typesense/src/client.rs @@ -0,0 +1,431 @@ +use golem_search::golem::search::types::{Doc, Schema, SearchError}; +use log::{debug, trace}; +use reqwest::{Client, Response}; +use serde::{Deserialize, Serialize}; +use serde_json::{json, Value}; +use std::collections::HashMap; +use std::time::Duration; + +#[derive(Debug, Clone)] +pub struct TypesenseApi { + client: Client, + base_url: String, + api_key: String, +} + +#[derive(Debug, Deserialize)] +pub struct TypesenseError { + pub message: String, +} + +#[derive(Debug, Deserialize)] +pub struct TypesenseSearchResponse { + pub hits: Vec, + pub found: u64, + pub search_time_ms: u64, + pub page: Option, +} + +#[derive(Debug, Deserialize)] +pub struct TypesenseHit { + pub document: Value, + #[serde(default)] + pub highlights: Vec, + pub text_match: Option, +} + +#[derive(Debug, Deserialize)] +pub struct TypesenseHighlight { + pub field: String, + pub snippet: String, + pub value: String, +} + +#[derive(Debug, Serialize)] +pub struct TypesenseDoc { + pub id: String, + #[serde(flatten)] + pub fields: Value, +} + +#[derive(Debug, Deserialize)] +pub struct CollectionResponse { + pub name: String, + pub num_documents: u64, + pub fields: Vec, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct TypesenseField { + pub name: String, + #[serde(rename = "type")] + pub field_type: String, + #[serde(default)] + pub facet: bool, + #[serde(default)] + pub index: bool, + #[serde(default)] + pub sort: bool, + #[serde(default)] + pub optional: bool, +} + +#[derive(Debug, Serialize)] +pub struct CreateCollectionRequest { + pub name: String, + pub fields: Vec, +} + +#[derive(Debug, Deserialize)] +pub struct GetDocumentResponse { + pub id: String, + #[serde(flatten)] + pub fields: Value, +} + +#[derive(Debug, Deserialize)] +pub struct ExportResponse { + #[serde(flatten)] + pub document: Value, +} + +impl TypesenseApi { + pub fn new(host: &str, api_key: &str) -> Self { + let client = Client::builder() + .timeout(Duration::from_secs(30)) + .build() + .unwrap(); + + Self { + client, + base_url: host.trim_end_matches('/').to_string(), + api_key: api_key.to_string(), + } + } + + pub fn empty() -> Self { + Self { + client: Client::new(), + base_url: String::new(), + api_key: String::new(), + } + } + + async fn request(&self, method: reqwest::Method, path: &str, body: Option) -> Result { + let url = format!("{}/{}", self.base_url, path.trim_start_matches('/')); + + let mut request = self.client.request(method, &url); + request = request.header("X-TYPESENSE-API-KEY", &self.api_key); + request = request.header("Content-Type", "application/json"); + + if let Some(body) = body { + request = request.json(&body); + } + + trace!("Making request to: {}", url); + + let response = request.send().await + .map_err(|e| SearchError::Internal(format!("Request failed: {}", e)))?; + + if response.status().is_success() { + Ok(response) + } else { + let status = response.status().as_u16(); + let error_text = response.text().await.unwrap_or_default(); + + if let 
+
+    pub fn create_collection(&self, name: &str, schema: Option<&Schema>) -> Result<(), SearchError> {
+        let fields = if let Some(schema) = schema {
+            crate::conversions::schema_to_typesense_fields(schema)
+        } else {
+            vec![
+                TypesenseField {
+                    name: "id".to_string(),
+                    field_type: "string".to_string(),
+                    facet: false,
+                    index: true,
+                    sort: false,
+                    optional: false,
+                },
+                TypesenseField {
+                    name: ".*".to_string(),
+                    field_type: "auto".to_string(),
+                    facet: false,
+                    index: true,
+                    sort: false,
+                    optional: true,
+                },
+            ]
+        };
+
+        let collection_request = CreateCollectionRequest {
+            name: name.to_string(),
+            fields,
+        };
+
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?;
+
+        rt.block_on(async {
+            self.request(reqwest::Method::POST, "collections", Some(serde_json::to_value(collection_request).unwrap())).await?;
+            debug!("Created collection: {}", name);
+            Ok(())
+        })
+    }
+
+    pub fn delete_collection(&self, name: &str) -> Result<(), SearchError> {
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?;
+
+        rt.block_on(async {
+            let path = format!("collections/{}", name);
+            self.request(reqwest::Method::DELETE, &path, None).await?;
+            debug!("Deleted collection: {}", name);
+            Ok(())
+        })
+    }
+
+    pub fn list_collections(&self) -> Result<Vec<String>, SearchError> {
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?;
+
+        rt.block_on(async {
+            let response = self.request(reqwest::Method::GET, "collections", None).await?;
+            let collections: Vec<CollectionResponse> = response.json().await
+                .map_err(|e| SearchError::Internal(format!("Failed to parse response: {}", e)))?;
+
+            let names = collections.into_iter().map(|c| c.name).collect();
+            Ok(names)
+        })
+    }
+
+    pub fn index_document(&self, collection: &str, doc: TypesenseDoc) -> Result<(), SearchError> {
+        let path = format!("collections/{}/documents", collection);
+
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?;
+
+        rt.block_on(async {
+            self.request(reqwest::Method::POST, &path, Some(serde_json::to_value(doc).unwrap())).await?;
+            debug!("Indexed document to collection: {}", collection);
+            Ok(())
+        })
+    }
+
+    pub fn upsert_document(&self, collection: &str, doc: TypesenseDoc) -> Result<(), SearchError> {
+        let path = format!("collections/{}/documents?action=upsert", collection);
+
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?;
+
+        rt.block_on(async {
+            self.request(reqwest::Method::POST, &path, Some(serde_json::to_value(doc).unwrap())).await?;
+            debug!("Upserted document to collection: {}", collection);
+            Ok(())
+        })
+    }
+
+    pub fn bulk_upsert(&self, collection: &str, docs: Vec<TypesenseDoc>) -> Result<(), SearchError> {
+        if docs.is_empty() {
+            return Ok(());
+        }
+
+        let path = format!("collections/{}/documents/import?action=upsert", collection);
+
+        // Convert to JSONL format
+        let mut body = String::new();
+        for doc in &docs {
+            let doc_json = serde_json::to_string(&doc)
+                .map_err(|e| SearchError::Internal(format!("JSON serialization error: {}", e)))?;
+            body.push_str(&doc_json);
+            body.push('\n');
+        }
+
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?;
+
+        rt.block_on(async {
+            let url = format!("{}/{}", self.base_url, path);
+            let request = self.client.post(&url)
+                .header("X-TYPESENSE-API-KEY", &self.api_key)
+                .header("Content-Type", "application/octet-stream")
+                .body(body);
+
+            let response = request.send().await
+                .map_err(|e| SearchError::Internal(format!("Bulk request failed: {}", e)))?;
+
+            if !response.status().is_success() {
+                let error_text = response.text().await.unwrap_or_default();
+                return Err(SearchError::Internal(format!("Bulk upsert failed: {}", error_text)));
+            }
+
+            debug!("Bulk upserted {} documents to {}", docs.len(), collection);
+            Ok(())
+        })
+    }
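`bulk_upsert` builds its request body by hand because Typesense's import endpoint consumes newline-delimited JSON (one document per line) rather than a JSON array. A standalone sketch of that body format, using plain `serde_json` values in place of `TypesenseDoc`:

```rust
use serde_json::json;

fn main() {
    let docs = vec![
        json!({"id": "1", "title": "First Document"}),
        json!({"id": "2", "title": "Second Document"}),
    ];

    // One JSON object per line, no surrounding array and no commas between lines.
    let body = docs
        .iter()
        .map(|d| d.to_string())
        .collect::<Vec<_>>()
        .join("\n");

    assert_eq!(body.lines().count(), 2);
    assert!(body.starts_with(r#"{"id":"1""#));
}
```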
+
+    pub fn delete_document(&self, collection: &str, id: &str) -> Result<(), SearchError> {
+        let path = format!("collections/{}/documents/{}", collection, id);
+
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?;
+
+        rt.block_on(async {
+            self.request(reqwest::Method::DELETE, &path, None).await?;
+            debug!("Deleted document: {}/{}", collection, id);
+            Ok(())
+        })
+    }
+
+    pub fn bulk_delete(&self, collection: &str, ids: Vec<String>) -> Result<(), SearchError> {
+        if ids.is_empty() {
+            return Ok(());
+        }
+
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?;
+
+        rt.block_on(async {
+            for id in ids {
+                let path = format!("collections/{}/documents/{}", collection, id);
+                self.request(reqwest::Method::DELETE, &path, None).await?;
+            }
+            debug!("Bulk deleted documents from {}", collection);
+            Ok(())
+        })
+    }
+
+    pub fn get_document(&self, collection: &str, id: &str) -> Result<Option<Doc>, SearchError> {
+        let path = format!("collections/{}/documents/{}", collection, id);
+
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?;
+
+        rt.block_on(async {
+            match self.request(reqwest::Method::GET, &path, None).await {
+                Ok(response) => {
+                    let doc_response: GetDocumentResponse = response.json().await
+                        .map_err(|e| SearchError::Internal(format!("Failed to parse response: {}", e)))?;
+
+                    Ok(Some(Doc {
+                        id: doc_response.id,
+                        content: serde_json::to_string(&doc_response.fields)
+                            .map_err(|e| SearchError::Internal(format!("JSON serialization error: {}", e)))?,
+                    }))
+                }
+                Err(SearchError::IndexNotFound) => Ok(None),
+                Err(SearchError::Internal(msg)) if msg.contains("404") => Ok(None),
+                Err(e) => Err(e),
+            }
+        })
+    }
+
+    pub fn search(&self, collection: &str, params: &HashMap<String, String>) -> Result<TypesenseSearchResponse, SearchError> {
+        let path = format!("collections/{}/documents/search", collection);
+
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?;
+
+        rt.block_on(async {
+            let url = format!("{}/{}?{}", self.base_url, path,
+                params.iter().map(|(k, v)| format!("{}={}", k, urlencoding::encode(v))).collect::<Vec<_>>().join("&"));
+
+            let request = self.client.get(&url)
+                .header("X-TYPESENSE-API-KEY", &self.api_key);
+
+            let response = request.send().await
+                .map_err(|e| SearchError::Internal(format!("Search request failed: {}", e)))?;
+
+            if response.status().is_success() {
+                let search_response: TypesenseSearchResponse = response.json().await
+                    .map_err(|e| SearchError::Internal(format!("Failed to parse search response: {}", e)))?;
+
+                debug!("Search completed in {}ms", search_response.search_time_ms);
+                Ok(search_response)
+            } else {
+                let status = response.status().as_u16();
+                let error_text = response.text().await.unwrap_or_default();
+                Err(SearchError::Internal(format!("Search failed HTTP {}: {}", status, error_text)))
+            }
+        })
+    }
+
+    pub fn export(&self, collection: &str, params: &HashMap<String, String>) -> Result<Vec<Doc>, SearchError> {
+        let path = format!("collections/{}/documents/export", collection);
+
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?;
+
+        rt.block_on(async {
+            let url = format!("{}/{}?{}", self.base_url, path,
+                params.iter().map(|(k, v)| format!("{}={}", k, urlencoding::encode(v))).collect::<Vec<_>>().join("&"));
+
+            let request = self.client.get(&url)
+                .header("X-TYPESENSE-API-KEY", &self.api_key);
+
+            let response = request.send().await
+                .map_err(|e| SearchError::Internal(format!("Export request failed: {}", e)))?;
+
+            if response.status().is_success() {
+                let response_text = response.text().await
+                    .map_err(|e| SearchError::Internal(format!("Failed to get response text: {}", e)))?;
+
+                let mut docs = Vec::new();
+                for line in response_text.lines() {
+                    if !line.trim().is_empty() {
+                        let doc_data: Value = serde_json::from_str(line)
+                            .map_err(|e| SearchError::Internal(format!("Failed to parse export line: {}", e)))?;
+
+                        if let Some(id) = doc_data.get("id").and_then(|v| v.as_str()) {
+                            docs.push(Doc {
+                                id: id.to_string(),
+                                content: serde_json::to_string(&doc_data)
+                                    .map_err(|e| SearchError::Internal(format!("JSON serialization error: {}", e)))?,
+                            });
+                        }
+                    }
+                }
+
+                Ok(docs)
+            } else {
+                let status = response.status().as_u16();
+                let error_text = response.text().await.unwrap_or_default();
+                Err(SearchError::Internal(format!("Export failed HTTP {}: {}", status, error_text)))
+            }
+        })
+    }
+
+    pub fn get_collection_schema(&self, collection: &str) -> Result<Schema, SearchError> {
+        let path = format!("collections/{}", collection);
+
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| SearchError::Internal(format!("Runtime error: {}", e)))?;
+
+        rt.block_on(async {
+            let response = self.request(reqwest::Method::GET, &path, None).await?;
+            let collection_info: CollectionResponse = response.json().await
+                .map_err(|e| SearchError::Internal(format!("Failed to parse collection response: {}", e)))?;
+
+            Ok(crate::conversions::typesense_fields_to_schema(&collection_info.fields))
+        })
+    }
+
+    pub fn update_collection_schema(&self, _collection: &str, _schema: &Schema) -> Result<(), SearchError> {
+        // Typesense doesn't support direct schema updates; the collection would
+        // need to be recreated. For now, return an error suggesting recreation.
+        Err(SearchError::Internal(
+            "Typesense doesn't support schema updates. Please recreate the collection with the new schema.".to_string(),
+        ))
+    }
+}
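Taken together, the client exposes a blocking surface over async requests. A hypothetical end-to-end use of it from inside this crate (the endpoint, API key, and collection name are made up; in a deployed component these calls are reached through the exported `golem:search/core` functions, not directly):

```rust
use crate::client::{TypesenseApi, TypesenseDoc};
use golem_search::golem::search::types::SearchError;
use serde_json::json;

fn demo() -> Result<(), SearchError> {
    let api = TypesenseApi::new("http://localhost:8108", "local-dev-key");

    api.create_collection("books", None)?;
    api.upsert_document(
        "books",
        TypesenseDoc { id: "1".to_string(), fields: json!({"title": "Dune"}) },
    )?;

    // The collection should now be visible.
    assert!(api.list_collections()?.contains(&"books".to_string()));
    api.delete_collection("books")
}
```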
diff --git a/search/typesense/src/conversions.rs b/search/typesense/src/conversions.rs
new file mode 100644
index 000000000..2a8daa466
--- /dev/null
+++ b/search/typesense/src/conversions.rs
@@ -0,0 +1,309 @@
+use crate::client::{TypesenseDoc, TypesenseField, TypesenseHit, TypesenseSearchResponse};
+use golem_search::golem::search::types::{
+    Doc, FieldType, Schema, SchemaField, SearchError, SearchHit, SearchQuery, SearchResults,
+};
+use log::trace;
+use serde_json::{json, Map, Value};
+use std::collections::HashMap;
+
+pub fn query_to_typesense_params(query: &SearchQuery) -> Result<HashMap<String, String>, SearchError> {
+    let mut params = HashMap::new();
+
+    // Query string
+    if let Some(q) = &query.q {
+        if !q.trim().is_empty() {
+            params.insert("q".to_string(), q.clone());
+            params.insert("query_by".to_string(), "*".to_string());
+        }
+    } else {
+        params.insert("q".to_string(), "*".to_string());
+    }
+
+    // Filters
+    if !query.filters.is_empty() {
+        let mut filter_parts = Vec::new();
+
+        for filter in &query.filters {
+            if let Ok(filter_value) = serde_json::from_str::<Value>(filter) {
+                // Handle JSON filters - convert to Typesense filter syntax
+                if let Some(obj) = filter_value.as_object() {
+                    for (field, conditions) in obj {
+                        if let Some(cond_obj) = conditions.as_object() {
+                            for (op, value) in cond_obj {
+                                let filter_str = match op.as_str() {
+                                    "eq" => format!("{}:={}", field, value_to_string(value)),
+                                    "ne" => format!("{}:!={}", field, value_to_string(value)),
+                                    "gt" => format!("{}:>{}", field, value_to_string(value)),
+                                    "gte" => format!("{}:>={}", field, value_to_string(value)),
+                                    "lt" => format!("{}:<{}", field, value_to_string(value)),
+                                    "lte" => format!("{}:<={}", field, value_to_string(value)),
+                                    "in" => {
+                                        if let Some(arr) = value.as_array() {
+                                            let values: Vec<String> = arr.iter().map(value_to_string).collect();
+                                            format!("{}:[{}]", field, values.join(","))
+                                        } else {
+                                            continue;
+                                        }
+                                    }
+                                    _ => continue,
+                                };
+                                filter_parts.push(filter_str);
+                            }
+                        }
+                    }
+                }
+            } else {
+                // Handle string filters in field:op:value format
+                let parts: Vec<&str> = filter.splitn(3, ':').collect();
+                if parts.len() == 3 {
+                    let field = parts[0];
+                    let op = parts[1];
+                    let value = parts[2];
+
+                    let filter_str = match op {
+                        "eq" => format!("{}:={}", field, value),
+                        "ne" => format!("{}:!={}", field, value),
+                        "gt" => format!("{}:>{}", field, value),
+                        "gte" => format!("{}:>={}", field, value),
+                        "lt" => format!("{}:<{}", field, value),
+                        "lte" => format!("{}:<={}", field, value),
+                        "in" => {
+                            let values: Vec<&str> = value.split(',').collect();
+                            format!("{}:[{}]", field, values.join(","))
+                        }
+                        "exists" => format!("{}:!=''", field),
+                        "prefix" => format!("{}:{}*", field, value),
+                        "wildcard" => format!("{}:{}", field, value), // Typesense supports wildcards natively
+                        _ => return Err(SearchError::InvalidQuery(format!("Unknown filter operator: {}", op))),
+                    };
+                    filter_parts.push(filter_str);
+                } else {
+                    return Err(SearchError::InvalidQuery(format!("Invalid filter format: {}", filter)));
+                }
+            }
+        }
+
+        if !filter_parts.is_empty() {
+            params.insert("filter_by".to_string(), filter_parts.join(" && "));
+        }
+    }
+
+    // Sorting
+    if !query.sort.is_empty() {
+        let mut sort_clauses = Vec::new();
+        for sort_field in &query.sort {
+            if sort_field.starts_with('-') {
+                let field = &sort_field[1..];
+                sort_clauses.push(format!("{}:desc", field));
+            } else {
+                sort_clauses.push(format!("{}:asc", sort_field));
+            }
+        }
+        params.insert("sort_by".to_string(), sort_clauses.join(","));
+    }
+
+    // Pagination
+    let per_page = query.per_page.unwrap_or(10);
+    params.insert("per_page".to_string(), per_page.to_string());
+
+    if let Some(page) = query.page {
+        params.insert("page".to_string(), page.to_string());
+    } else if let Some(offset) = query.offset {
+        let page = (offset / per_page) + 1;
+        params.insert("page".to_string(), page.to_string());
+    }
+
+    // Faceting
+    if !query.facets.is_empty() {
+        params.insert("facet_by".to_string(), query.facets.join(","));
+    }
+
+    // Highlighting
+    if let Some(highlight) = &query.highlight {
+        if !highlight.fields.is_empty() {
+            params.insert("highlight_fields".to_string(), highlight.fields.join(","));
+
+            if let Some(pre_tag) = &highlight.pre_tag {
+                params.insert("highlight_start_tag".to_string(), pre_tag.clone());
+            }
+
+            if let Some(post_tag) = &highlight.post_tag {
+                params.insert("highlight_end_tag".to_string(), post_tag.clone());
+            }
+
+            if let Some(max_length) = highlight.max_length {
+                params.insert("snippet_threshold".to_string(), max_length.to_string());
+            }
+        }
+    }
+
+    // Configuration
+    if let Some(config) = &query.config {
+        if let Some(timeout_ms) = config.timeout_ms {
+            params.insert("search_cutoff_ms".to_string(), timeout_ms.to_string());
+        }
+
+        if !config.attributes_to_retrieve.is_empty() {
+            params.insert("include_fields".to_string(), config.attributes_to_retrieve.join(","));
+        }
+    }
+
+    trace!("Generated Typesense parameters: {:?}", params);
+    Ok(params)
+}
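`query_to_typesense_params` accepts two filter spellings, a JSON object (`{"price": {"lte": 20}}`) and a compact `field:op:value` string, and lowers both to Typesense's `filter_by` grammar. A self-contained sketch of just the string form; the match arms mirror the ones above:

```rust
fn translate(filter: &str) -> Option<String> {
    let mut parts = filter.splitn(3, ':');
    let (field, op, value) = (parts.next()?, parts.next()?, parts.next()?);
    Some(match op {
        "eq" => format!("{field}:={value}"),
        "ne" => format!("{field}:!={value}"),
        "gt" => format!("{field}:>{value}"),
        "gte" => format!("{field}:>={value}"),
        "lt" => format!("{field}:<{value}"),
        "lte" => format!("{field}:<={value}"),
        "in" => format!("{field}:[{value}]"),
        _ => return None,
    })
}

fn main() {
    assert_eq!(translate("status:eq:processing").as_deref(), Some("status:=processing"));
    assert_eq!(translate("price:lte:20").as_deref(), Some("price:<=20"));
    assert_eq!(translate("category:in:books,films").as_deref(), Some("category:[books,films]"));
    assert_eq!(translate("bad-filter"), None);
}
```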
+
+fn value_to_string(value: &Value) -> String {
+    match value {
+        Value::String(s) => s.clone(),
+        Value::Number(n) => n.to_string(),
+        Value::Bool(b) => b.to_string(),
+        _ => value.to_string(),
+    }
+}
+
+pub fn typesense_hit_to_search_hit(hit: TypesenseHit) -> SearchHit {
+    let content = Some(serde_json::to_string(&hit.document).unwrap_or_default());
+
+    let highlights = if !hit.highlights.is_empty() {
+        let mut highlight_map = Map::new();
+        for highlight in hit.highlights {
+            highlight_map.insert(highlight.field, json!([highlight.snippet]));
+        }
+        Some(serde_json::to_string(&highlight_map).unwrap_or_default())
+    } else {
+        None
+    };
+
+    // Typesense doesn't return a traditional score, but we can use text_match as a proxy
+    let score = hit.text_match.map(|tm| tm as f64 / 100.0);
+
+    SearchHit {
+        id: hit.document.get("id")
+            .and_then(|v| v.as_str())
+            .unwrap_or_default()
+            .to_string(),
+        score,
+        content,
+        highlights,
+    }
+}
+
+pub fn typesense_response_to_results(response: TypesenseSearchResponse, query: &SearchQuery) -> SearchResults {
+    let hits = response.hits.into_iter().map(typesense_hit_to_search_hit).collect();
+    let total = Some(response.found as u32);
+
+    let page = query.page.or(response.page);
+    let per_page = query.per_page;
+    let took_ms = Some(response.search_time_ms as u32);
+
+    SearchResults {
+        total,
+        page,
+        per_page,
+        hits,
+        facets: None, // TODO: Add facets support
+        took_ms,
+    }
+}
+
+pub fn schema_to_typesense_fields(schema: &Schema) -> Vec<TypesenseField> {
+    let mut fields = Vec::new();
+
+    for field in &schema.fields {
+        let field_type = match field.type_ {
+            FieldType::Text => "string",
+            FieldType::Keyword => "string",
+            FieldType::Integer => "int64",
+            FieldType::Float => "float",
+            FieldType::Boolean => "bool",
+            FieldType::Date => "string", // Typesense handles dates as strings with auto parsing
+            FieldType::GeoPoint => "geopoint",
+        };
+
+        fields.push(TypesenseField {
+            name: field.name.clone(),
+            field_type: field_type.to_string(),
+            facet: field.facet,
+            index: field.index,
+            sort: field.sort,
+            optional: !field.required,
+        });
+    }
+
+    fields
+}
+
+pub fn typesense_fields_to_schema(fields: &[TypesenseField]) -> Schema {
+    let mut schema_fields = Vec::new();
+    let mut primary_key = None;
+
+    for field in fields {
+        if field.name == ".*" {
+            continue; // Skip auto fields
+        }
+
+        let field_type = match field.field_type.as_str() {
+            "string" => FieldType::Text,
+            "int32" | "int64" => FieldType::Integer,
+            "float" => FieldType::Float,
+            "bool" => FieldType::Boolean,
+            "geopoint" => FieldType::GeoPoint,
+            _ => FieldType::Text,
+        };
+
+        schema_fields.push(SchemaField {
+            name: field.name.clone(),
+            type_: field_type,
+            required: !field.optional,
+            facet: field.facet,
+            sort: field.sort,
+            index: field.index,
+        });
+
+        if field.name == "id" {
+            primary_key = Some(field.name.clone());
+        }
+    }
+
+    Schema {
+        fields: schema_fields,
+        primary_key,
+    }
+}
+
+pub fn doc_to_typesense_doc(doc: Doc) -> Result<TypesenseDoc, SearchError> {
+    let mut fields: Value = serde_json::from_str(&doc.content)
+        .map_err(|e| SearchError::InvalidQuery(format!("Invalid JSON in document: {}", e)))?;
+
+    // Ensure the id field is set
+    if let Some(obj) = fields.as_object_mut() {
+        obj.insert("id".to_string(), json!(doc.id.clone()));
+    }
+
+    Ok(TypesenseDoc {
+        id: doc.id,
+        fields,
+    })
+}
+
+pub fn export_params_from_query(query: &SearchQuery) -> HashMap<String, String> {
+    let mut params = HashMap::new();
+
+    // Apply filters if any
+    if !query.filters.is_empty() {
+        if let Ok(ts_params) = query_to_typesense_params(query) {
+            if let Some(filter_by) = ts_params.get("filter_by") {
+                params.insert("filter_by".to_string(), filter_by.clone());
+            }
+        }
+    }
+
+    // For export, we want all fields unless specified otherwise
+    if let Some(config) = &query.config {
+        if !config.attributes_to_retrieve.is_empty() {
+            params.insert("include_fields".to_string(), config.attributes_to_retrieve.join(","));
+        }
+    }
+
+    params
+}
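A detail worth isolating from `doc_to_typesense_doc`: `Doc.content` is a JSON *string* (the `json` type alias in the WIT), and the converter re-injects the document id into the parsed object so the flattened `TypesenseDoc` always carries it. A standalone sketch of that convention:

```rust
use serde_json::{json, Value};

fn main() {
    let id = "42";
    let content = r#"{"title": "Dune", "price": 9.99}"#; // Doc.content is a JSON string

    let mut fields: Value = serde_json::from_str(content).expect("content must be valid JSON");
    if let Some(obj) = fields.as_object_mut() {
        obj.insert("id".to_string(), json!(id)); // the id field wins over any id in content
    }

    assert_eq!(fields["id"], json!("42"));
    assert_eq!(fields["title"], json!("Dune"));
}
```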
diff --git a/search/typesense/src/lib.rs b/search/typesense/src/lib.rs
new file mode 100644
index 000000000..9cfc2c21d
--- /dev/null
+++ b/search/typesense/src/lib.rs
@@ -0,0 +1,287 @@
+use crate::client::{TypesenseApi, TypesenseDoc};
+use crate::conversions::{
+    doc_to_typesense_doc, export_params_from_query, query_to_typesense_params,
+    typesense_hit_to_search_hit, typesense_response_to_results,
+};
+use golem_search::config::with_config_key;
+use golem_search::durability::{DurableSearch, ExtendedGuest};
+use golem_search::golem::search::core::Guest;
+use golem_search::golem::search::types::{
+    Doc, Schema, SearchError, SearchHit, SearchQuery, SearchResults,
+};
+use golem_search::{SearchStream, SearchStreamState, LOGGING_STATE};
+use log::trace;
+use std::cell::RefCell;
+
+mod client;
+mod conversions;
+
+struct TypesenseStream {
+    client: TypesenseApi,
+    collection: String,
+    query: SearchQuery,
+    current_page: RefCell<u32>,
+    finished: RefCell<bool>,
+    failure: Option<SearchError>,
+}
+
+impl TypesenseStream {
+    fn new(client: TypesenseApi, collection: String, query: SearchQuery) -> Self {
+        Self {
+            client,
+            collection,
+            query,
+            current_page: RefCell::new(1),
+            finished: RefCell::new(false),
+            failure: None,
+        }
+    }
+
+    fn failed(error: SearchError) -> Self {
+        Self {
+            client: TypesenseApi::empty(),
+            collection: String::new(),
+            query: SearchQuery {
+                q: None,
+                filters: vec![],
+                sort: vec![],
+                facets: vec![],
+                page: None,
+                per_page: None,
+                offset: None,
+                highlight: None,
+                config: None,
+            },
+            current_page: RefCell::new(1),
+            finished: RefCell::new(true),
+            failure: Some(error),
+        }
+    }
+}
+
+impl SearchStreamState for TypesenseStream {
+    fn failure(&self) -> &Option<SearchError> {
+        &self.failure
+    }
+
+    fn is_finished(&self) -> bool {
+        *self.finished.borrow()
+    }
+
+    fn set_finished(&self) {
+        *self.finished.borrow_mut() = true;
+    }
+
+    fn get_next_hits(&self) -> Result<Vec<SearchHit>, SearchError> {
+        let current_page = *self.current_page.borrow();
+
+        // For streaming, we use the export API for better performance
+        if current_page == 1 {
+            // First page - use regular search for better ranking
+            let mut params = query_to_typesense_params(&self.query)?;
+            params.insert("page".to_string(), current_page.to_string());
+
+            match self.client.search(&self.collection, &params) {
+                Ok(response) => {
+                    if response.hits.is_empty() {
+                        self.set_finished();
+                        Ok(vec![])
+                    } else {
+                        *self.current_page.borrow_mut() = current_page + 1;
+                        Ok(response.hits.into_iter().map(typesense_hit_to_search_hit).collect())
+                    }
+                }
+                Err(error) => Err(error),
+            }
+        } else {
+            // Subsequent pages - use export API for efficiency
+            let export_params = export_params_from_query(&self.query);
+            match self.client.export(&self.collection, &export_params) {
+                Ok(docs) => {
+                    if docs.is_empty() {
+                        self.set_finished();
+                        Ok(vec![])
+                    } else {
+                        self.set_finished(); // Export gets all remaining results
+                        Ok(docs.into_iter().map(|doc| SearchHit {
+                            id: doc.id,
+                            score: None,
+                            content: Some(doc.content),
+                            highlights: None,
+                        }).collect())
+                    }
+                }
+                Err(error) => Err(error),
+            }
+        }
+    }
+}
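The stream state above is deliberately two-phase: page 1 goes through the ranked search endpoint, and every later pull falls back to a single export pass that drains the remainder. A toy model of that control flow, with integers standing in for hits:

```rust
// Assumption: this only illustrates the shape of get_next_hits above,
// with Vec<i32> standing in for Vec<SearchHit>.
struct TwoPhase { page: u32, done: bool }

impl TwoPhase {
    fn next_batch(&mut self) -> Vec<i32> {
        if self.done { return vec![]; }
        if self.page == 1 {
            self.page += 1;
            vec![1, 2]          // ranked first page from the search endpoint
        } else {
            self.done = true;   // the export pass drains everything left
            vec![3, 4, 5]
        }
    }
}

fn main() {
    let mut s = TwoPhase { page: 1, done: false };
    let mut all = vec![];
    loop {
        let batch = s.next_batch();
        if batch.is_empty() { break; }
        all.extend(batch);
    }
    assert_eq!(all, vec![1, 2, 3, 4, 5]);
}
```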
+
+struct TypesenseComponent;
+
+impl TypesenseComponent {
+    const HOST_VAR: &'static str = "TYPESENSE_ENDPOINT";
+    const API_KEY_VAR: &'static str = "TYPESENSE_API_KEY";
+}
+
+impl Guest for TypesenseComponent {
+    type SearchHitStream = SearchStream<TypesenseStream>;
+
+    fn create_index(name: String, schema: Option<Schema>) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::HOST_VAR, Err, |host| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = TypesenseApi::new(host, api_key);
+                client.create_collection(&name, schema.as_ref())
+            })
+        })
+    }
+
+    fn delete_index(name: String) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::HOST_VAR, Err, |host| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = TypesenseApi::new(host, api_key);
+                client.delete_collection(&name)
+            })
+        })
+    }
+
+    fn list_indexes() -> Result<Vec<String>, SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::HOST_VAR, Err, |host| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = TypesenseApi::new(host, api_key);
+                client.list_collections()
+            })
+        })
+    }
+
+    fn upsert(index: String, doc: Doc) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::HOST_VAR, Err, |host| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = TypesenseApi::new(host, api_key);
+                let ts_doc = doc_to_typesense_doc(doc)?;
+                client.upsert_document(&index, ts_doc)
+            })
+        })
+    }
+
+    fn upsert_many(index: String, docs: Vec<Doc>) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::HOST_VAR, Err, |host| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = TypesenseApi::new(host, api_key);
+                let ts_docs: Result<Vec<TypesenseDoc>, _> = docs.into_iter().map(doc_to_typesense_doc).collect();
+                client.bulk_upsert(&index, ts_docs?)
+            })
+        })
+    }
+
+    fn delete(index: String, id: String) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::HOST_VAR, Err, |host| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = TypesenseApi::new(host, api_key);
+                client.delete_document(&index, &id)
+            })
+        })
+    }
+
+    fn delete_many(index: String, ids: Vec<String>) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::HOST_VAR, Err, |host| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = TypesenseApi::new(host, api_key);
+                client.bulk_delete(&index, ids)
+            })
+        })
+    }
+
+    fn get(index: String, id: String) -> Result<Option<Doc>, SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::HOST_VAR, Err, |host| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = TypesenseApi::new(host, api_key);
+                client.get_document(&index, &id)
+            })
+        })
+    }
+
+    fn search(index: String, query: SearchQuery) -> Result<SearchResults, SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::HOST_VAR, Err, |host| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = TypesenseApi::new(host, api_key);
+                let params = query_to_typesense_params(&query)?;
+                trace!("Executing search query: {:?}", params);
+
+                match client.search(&index, &params) {
+                    Ok(response) => Ok(typesense_response_to_results(response, &query)),
+                    Err(error) => Err(error),
+                }
+            })
+        })
+    }
+
+    fn stream_search(
+        index: String,
+        query: SearchQuery,
+    ) -> Result<Self::SearchHitStream, SearchError> {
+        Self::unwrapped_stream_search(index, query)
+    }
+
+    fn get_schema(index: String) -> Result<Schema, SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::HOST_VAR, Err, |host| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = TypesenseApi::new(host, api_key);
+                client.get_collection_schema(&index)
+            })
+        })
+    }
+
+    fn update_schema(index: String, schema: Schema) -> Result<(), SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::HOST_VAR, Err, |host| {
+            with_config_key(Self::API_KEY_VAR, Err, |api_key| {
+                let client = TypesenseApi::new(host, api_key);
+                client.update_collection_schema(&index, &schema)
+            })
+        })
+    }
+}
+
+impl ExtendedGuest for TypesenseComponent {
+    type SearchHitStream = SearchStream<TypesenseStream>;
+
+    fn unwrapped_stream_search(
+        index: String,
+        query: SearchQuery,
+    ) -> Result<Self::SearchHitStream, SearchError> {
+        LOGGING_STATE.with_borrow_mut(|state| state.init());
+
+        with_config_key(Self::HOST_VAR, |error| Ok(SearchStream::new(TypesenseStream::failed(error))), |host| {
+            with_config_key(Self::API_KEY_VAR, |error| Ok(SearchStream::new(TypesenseStream::failed(error))), |api_key| {
+                let client = TypesenseApi::new(host, api_key);
+                Ok(SearchStream::new(TypesenseStream::new(client, index, query)))
+            })
+        })
+    }
+}
+
+type DurableTypesenseComponent = DurableSearch<TypesenseComponent>;
+
+golem_search::export_search!(DurableTypesenseComponent with_types_in golem_search);
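Every exported function goes through `with_config_key`, which resolves an environment variable and routes a missing value into a caller-supplied failure handler (plain `Err` at most call sites above, a pre-failed stream in `unwrapped_stream_search`). A minimal sketch of that pattern; the real helper lives in the shared `golem-search` crate, so the exact signature here is an assumption:

```rust
#[derive(Debug)]
enum SearchError { Internal(String) }

// Hypothetical reimplementation of the helper's shape.
fn with_config_key<T>(
    key: &str,
    fail: impl FnOnce(SearchError) -> Result<T, SearchError>,
    succeed: impl FnOnce(&str) -> Result<T, SearchError>,
) -> Result<T, SearchError> {
    match std::env::var(key) {
        Ok(value) => succeed(&value),
        Err(_) => fail(SearchError::Internal(format!("missing config key: {key}"))),
    }
}

fn main() {
    std::env::set_var("TYPESENSE_ENDPOINT", "http://localhost:8108");
    // `Err` itself works as the failure handler, exactly as in the call sites above.
    let url = with_config_key("TYPESENSE_ENDPOINT", Err, |host| Ok(host.to_string()));
    assert!(url.is_ok());
}
```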
diff --git a/search/typesense/wit/deps/golem-search/golem-search.wit b/search/typesense/wit/deps/golem-search/golem-search.wit
new file mode 100644
index 000000000..64c822dfa
--- /dev/null
+++ b/search/typesense/wit/deps/golem-search/golem-search.wit
@@ -0,0 +1,134 @@
+package golem:search@1.0.0;
+
+/// Core types and error handling for universal search interfaces
+interface types {
+    /// Common structured errors for search operations
+    variant search-error {
+        index-not-found,
+        invalid-query(string),
+        unsupported,
+        internal(string),
+        timeout,
+        rate-limited,
+    }
+
+    /// Identifier types
+    type index-name = string;
+    type document-id = string;
+    type json = string;
+
+    /// Document payload
+    record doc {
+        id: document-id,
+        content: json,
+    }
+
+    /// Highlight configuration
+    record highlight-config {
+        fields: list<string>,
+        pre-tag: option<string>,
+        post-tag: option<string>,
+        max-length: option<u32>,
+    }
+
+    /// Advanced search tuning
+    record search-config {
+        timeout-ms: option<u32>,
+        boost-fields: list<tuple<string, f32>>,
+        attributes-to-retrieve: list<string>,
+        language: option<string>,
+        typo-tolerance: option<bool>,
+        exact-match-boost: option<f32>,
+        provider-params: option<json>,
+    }
+
+    /// Search request
+    record search-query {
+        q: option<string>,
+        filters: list<string>,
+        sort: list<string>,
+        facets: list<string>,
+        page: option<u32>,
+        per-page: option<u32>,
+        offset: option<u32>,
+        highlight: option<highlight-config>,
+        config: option<search-config>,
+    }
+
+    /// Search hit
+    record search-hit {
+        id: document-id,
+        score: option<f64>,
+        content: option<json>,
+        highlights: option<json>,
+    }
+
+    /// Search result set
+    record search-results {
+        total: option<u32>,
+        page: option<u32>,
+        per-page: option<u32>,
+        hits: list<search-hit>,
+        facets: option<json>,
+        took-ms: option<u32>,
+    }
+
+    /// Field schema types
+    enum field-type {
+        text,
+        keyword,
+        integer,
+        float,
+        boolean,
+        date,
+        geo-point,
+    }
+
+    /// Field definition
+    record schema-field {
+        name: string,
+        type: field-type,
+        required: bool,
+        facet: bool,
+        sort: bool,
+        index: bool,
+    }
+
+    /// Index schema
+    record schema {
+        fields: list<schema-field>,
+        primary-key: option<string>,
+    }
+}
+
+/// Unified search interface
+interface core {
+    use types.{
+        index-name, document-id, doc, search-query, search-results,
+        search-hit, schema, search-error
+    };
+
+    // Index lifecycle
+    create-index: func(name: index-name, schema: option<schema>) -> result<_, search-error>;
+    delete-index: func(name: index-name) -> result<_, search-error>;
+    list-indexes: func() -> result<list<index-name>, search-error>;
+
+    // Document operations
+    upsert: func(index: index-name, doc: doc) -> result<_, search-error>;
+    upsert-many: func(index: index-name, docs: list<doc>) -> result<_, search-error>;
+    delete: func(index: index-name, id: document-id) -> result<_, search-error>;
+    delete-many: func(index: index-name, ids: list<document-id>) -> result<_, search-error>;
+    get: func(index: index-name, id: document-id) -> result<option<doc>, search-error>;
+
+    // Query
+    search: func(index: index-name, query: search-query) -> result<search-results, search-error>;
+    stream-search: func(index: index-name, query: search-query) -> result<list<search-hit>, search-error>;
+
+    // Schema inspection
+    get-schema: func(index: index-name) -> result<schema, search-error>;
+    update-schema: func(index: index-name, schema: schema) -> result<_, search-error>;
+}
+
+world search-library {
+    export core;
+}
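For orientation, the records above surface in Rust roughly as follows once the bindings are generated (illustrative shapes only, not the actual generator output: kebab-case becomes snake_case, `json` is just `String`, and the reserved `type` field comes out as `type_` or `r#type` depending on the generator):

```rust
// Approximate generated shapes for the WIT records above.
pub struct Doc {
    pub id: String,      // document-id
    pub content: String, // json payload, serialized by the caller
}

pub struct SearchHit {
    pub id: String,
    pub score: Option<f64>,
    pub content: Option<String>,
    pub highlights: Option<String>,
}

pub struct SearchResults {
    pub total: Option<u32>,
    pub page: Option<u32>,
    pub per_page: Option<u32>,
    pub hits: Vec<SearchHit>,
    pub facets: Option<String>,
    pub took_ms: Option<u32>,
}

fn main() {
    let hit = SearchHit { id: "1".into(), score: Some(0.9), content: None, highlights: None };
    assert_eq!(hit.id, "1");
}
```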
diff --git a/search/typesense/wit/deps/wasi:io/error.wit b/search/typesense/wit/deps/wasi:io/error.wit
new file mode 100644
index 000000000..97c606877
--- /dev/null
+++ b/search/typesense/wit/deps/wasi:io/error.wit
@@ -0,0 +1,34 @@
+package wasi:io@0.2.3;
+
+@since(version = 0.2.0)
+interface error {
+    /// A resource which represents some error information.
+    ///
+    /// The only method provided by this resource is `to-debug-string`,
+    /// which provides some human-readable information about the error.
+    ///
+    /// In the `wasi:io` package, this resource is returned through the
+    /// `wasi:io/streams/stream-error` type.
+    ///
+    /// To provide more specific error information, other interfaces may
+    /// offer functions to "downcast" this error into more specific types. For example,
+    /// errors returned from streams derived from filesystem types can be described using
+    /// the filesystem's own error-code type. This is done using the function
+    /// `wasi:filesystem/types/filesystem-error-code`, which takes a `borrow<error>`
+    /// parameter and returns an `option<filesystem-error-code>`.
+    ///
+    /// The set of functions which can "downcast" an `error` into a more
+    /// concrete type is open.
+    @since(version = 0.2.0)
+    resource error {
+        /// Returns a string that is suitable to assist humans in debugging
+        /// this error.
+        ///
+        /// WARNING: The returned string should not be consumed mechanically!
+        /// It may change across platforms, hosts, or other implementation
+        /// details. Parsing this string is a major platform-compatibility
+        /// hazard.
+        @since(version = 0.2.0)
+        to-debug-string: func() -> string;
+    }
+}
diff --git a/search/typesense/wit/deps/wasi:io/poll.wit b/search/typesense/wit/deps/wasi:io/poll.wit
new file mode 100644
index 000000000..9bcbe8e03
--- /dev/null
+++ b/search/typesense/wit/deps/wasi:io/poll.wit
@@ -0,0 +1,47 @@
+package wasi:io@0.2.3;
+
+/// A poll API intended to let users wait for I/O events on multiple handles
+/// at once.
+@since(version = 0.2.0)
+interface poll {
+    /// `pollable` represents a single I/O event which may be ready, or not.
+    @since(version = 0.2.0)
+    resource pollable {
+
+        /// Return the readiness of a pollable. This function never blocks.
+        ///
+        /// Returns `true` when the pollable is ready, and `false` otherwise.
+        @since(version = 0.2.0)
+        ready: func() -> bool;
+
+        /// `block` returns immediately if the pollable is ready, and otherwise
+        /// blocks until ready.
+        ///
+        /// This function is equivalent to calling `poll.poll` on a list
+        /// containing only this pollable.
+        @since(version = 0.2.0)
+        block: func();
+    }
+
+    /// Poll for completion on a set of pollables.
+    ///
+    /// This function takes a list of pollables, which identify I/O sources of
+    /// interest, and waits until one or more of the events is ready for I/O.
+    ///
+    /// The result `list<u32>` contains one or more indices of handles in the
+    /// argument list that is ready for I/O.
+    ///
+    /// This function traps if either:
+    /// - the list is empty, or:
+    /// - the list contains more elements than can be indexed with a `u32` value.
+    ///
+    /// A timeout can be implemented by adding a pollable from the
+    /// wasi-clocks API to the list.
+    ///
+    /// This function does not return a `result`; polling in itself does not
+    /// do any I/O so it doesn't fail. If any of the I/O sources identified by
+    /// the pollables has an error, it is indicated by marking the source as
+    /// being ready for I/O.
+    @since(version = 0.2.0)
+    poll: func(in: list<borrow<pollable>>) -> list<u32>;
+}
diff --git a/search/typesense/wit/deps/wasi:io/streams.wit b/search/typesense/wit/deps/wasi:io/streams.wit
new file mode 100644
index 000000000..0de084629
--- /dev/null
+++ b/search/typesense/wit/deps/wasi:io/streams.wit
@@ -0,0 +1,290 @@
+package wasi:io@0.2.3;
+
+/// WASI I/O is an I/O abstraction API which is currently focused on providing
+/// stream types.
+///
+/// In the future, the component model is expected to add built-in stream types;
+/// when it does, they are expected to subsume this API.
+@since(version = 0.2.0)
+interface streams {
+    @since(version = 0.2.0)
+    use error.{error};
+    @since(version = 0.2.0)
+    use poll.{pollable};
+
+    /// An error for input-stream and output-stream operations.
+    @since(version = 0.2.0)
+    variant stream-error {
+        /// The last operation (a write or flush) failed before completion.
+        ///
+        /// More information is available in the `error` payload.
+        ///
+        /// After this, the stream will be closed. All future operations return
+        /// `stream-error::closed`.
+        last-operation-failed(error),
+        /// The stream is closed: no more input will be accepted by the
+        /// stream. A closed output-stream will return this error on all
+        /// future operations.
+        closed
+    }
+
+    /// An input bytestream.
+    ///
+    /// `input-stream`s are *non-blocking* to the extent practical on underlying
+    /// platforms. I/O operations always return promptly; if fewer bytes are
+    /// promptly available than requested, they return the number of bytes promptly
+    /// available, which could even be zero. To wait for data to be available,
+    /// use the `subscribe` function to obtain a `pollable` which can be polled
+    /// for using `wasi:io/poll`.
+    @since(version = 0.2.0)
+    resource input-stream {
+        /// Perform a non-blocking read from the stream.
+        ///
+        /// When the source of a `read` is binary data, the bytes from the source
+        /// are returned verbatim. When the source of a `read` is known to the
+        /// implementation to be text, bytes containing the UTF-8 encoding of the
+        /// text are returned.
+        ///
+        /// This function returns a list of bytes containing the read data,
+        /// when successful. The returned list will contain up to `len` bytes;
+        /// it may return fewer than requested, but not more. The list is
+        /// empty when no bytes are available for reading at this time. The
+        /// pollable given by `subscribe` will be ready when more bytes are
+        /// available.
+        ///
+        /// This function fails with a `stream-error` when the operation
+        /// encounters an error, giving `last-operation-failed`, or when the
+        /// stream is closed, giving `closed`.
+        ///
+        /// When the caller gives a `len` of 0, it represents a request to
+        /// read 0 bytes. If the stream is still open, this call should
+        /// succeed and return an empty list, or otherwise fail with `closed`.
+        ///
+        /// The `len` parameter is a `u64`, which could represent a list of u8 which
+        /// is not possible to allocate in wasm32, or not desirable to allocate as
+        /// as a return value by the callee. The callee may return a list of bytes
+        /// less than `len` in size while more bytes are available for reading.
+        @since(version = 0.2.0)
+        read: func(
+            /// The maximum number of bytes to read
+            len: u64
+        ) -> result<list<u8>, stream-error>;
+
+        /// Read bytes from a stream, after blocking until at least one byte can
+        /// be read. Except for blocking, behavior is identical to `read`.
+        @since(version = 0.2.0)
+        blocking-read: func(
+            /// The maximum number of bytes to read
+            len: u64
+        ) -> result<list<u8>, stream-error>;
+
+        /// Skip bytes from a stream. Returns number of bytes skipped.
+        ///
+        /// Behaves identical to `read`, except instead of returning a list
+        /// of bytes, returns the number of bytes consumed from the stream.
+        @since(version = 0.2.0)
+        skip: func(
+            /// The maximum number of bytes to skip.
+            len: u64,
+        ) -> result<u64, stream-error>;
+
+        /// Skip bytes from a stream, after blocking until at least one byte
+        /// can be skipped. Except for blocking behavior, identical to `skip`.
+        @since(version = 0.2.0)
+        blocking-skip: func(
+            /// The maximum number of bytes to skip.
+            len: u64,
+        ) -> result<u64, stream-error>;
+
+        /// Create a `pollable` which will resolve once either the specified stream
+        /// has bytes available to read or the other end of the stream has been
+        /// closed.
+        /// The created `pollable` is a child resource of the `input-stream`.
+        /// Implementations may trap if the `input-stream` is dropped before
+        /// all derived `pollable`s created with this function are dropped.
+        @since(version = 0.2.0)
+        subscribe: func() -> pollable;
+    }
+
+
+    /// An output bytestream.
+    ///
+    /// `output-stream`s are *non-blocking* to the extent practical on
+    /// underlying platforms. Except where specified otherwise, I/O operations also
+    /// always return promptly, after the number of bytes that can be written
+    /// promptly, which could even be zero. To wait for the stream to be ready to
+    /// accept data, the `subscribe` function to obtain a `pollable` which can be
+    /// polled for using `wasi:io/poll`.
+    ///
+    /// Dropping an `output-stream` while there's still an active write in
+    /// progress may result in the data being lost. Before dropping the stream,
+    /// be sure to fully flush your writes.
+    @since(version = 0.2.0)
+    resource output-stream {
+        /// Check readiness for writing. This function never blocks.
+        ///
+        /// Returns the number of bytes permitted for the next call to `write`,
+        /// or an error. Calling `write` with more bytes than this function has
+        /// permitted will trap.
+        ///
+        /// When this function returns 0 bytes, the `subscribe` pollable will
+        /// become ready when this function will report at least 1 byte, or an
+        /// error.
+        @since(version = 0.2.0)
+        check-write: func() -> result<u64, stream-error>;
+
+        /// Perform a write. This function never blocks.
+        ///
+        /// When the destination of a `write` is binary data, the bytes from
+        /// `contents` are written verbatim. When the destination of a `write` is
+        /// known to the implementation to be text, the bytes of `contents` are
+        /// transcoded from UTF-8 into the encoding of the destination and then
+        /// written.
+        ///
+        /// Precondition: check-write gave permit of Ok(n) and contents has a
+        /// length of less than or equal to n. Otherwise, this function will trap.
+        ///
+        /// returns Err(closed) without writing if the stream has closed since
+        /// the last call to check-write provided a permit.
+        @since(version = 0.2.0)
+        write: func(
+            contents: list<u8>
+        ) -> result<_, stream-error>;
+
+        /// Perform a write of up to 4096 bytes, and then flush the stream. Block
+        /// until all of these operations are complete, or an error occurs.
+        ///
+        /// This is a convenience wrapper around the use of `check-write`,
+        /// `subscribe`, `write`, and `flush`, and is implemented with the
+        /// following pseudo-code:
+        ///
+        /// ```text
+        /// let pollable = this.subscribe();
+        /// while !contents.is_empty() {
+        ///     // Wait for the stream to become writable
+        ///     pollable.block();
+        ///     let Ok(n) = this.check-write(); // eliding error handling
+        ///     let len = min(n, contents.len());
+        ///     let (chunk, rest) = contents.split_at(len);
+        ///     this.write(chunk); // eliding error handling
+        ///     contents = rest;
+        /// }
+        /// this.flush();
+        /// // Wait for completion of `flush`
+        /// pollable.block();
+        /// // Check for any errors that arose during `flush`
+        /// let _ = this.check-write(); // eliding error handling
+        /// ```
+        @since(version = 0.2.0)
+        blocking-write-and-flush: func(
+            contents: list<u8>
+        ) -> result<_, stream-error>;
+
+        /// Request to flush buffered output. This function never blocks.
+        ///
+        /// This tells the output-stream that the caller intends any buffered
+        /// output to be flushed. the output which is expected to be flushed
+        /// is all that has been passed to `write` prior to this call.
+        ///
+        /// Upon calling this function, the `output-stream` will not accept any
+        /// writes (`check-write` will return `ok(0)`) until the flush has
+        /// completed. The `subscribe` pollable will become ready when the
+        /// flush has completed and the stream can accept more writes.
+        @since(version = 0.2.0)
+        flush: func() -> result<_, stream-error>;
+
+        /// Request to flush buffered output, and block until flush completes
+        /// and stream is ready for writing again.
+        @since(version = 0.2.0)
+        blocking-flush: func() -> result<_, stream-error>;
+
+        /// Create a `pollable` which will resolve once the output-stream
+        /// is ready for more writing, or an error has occurred. When this
+        /// pollable is ready, `check-write` will return `ok(n)` with n>0, or an
+        /// error.
+        ///
+        /// If the stream is closed, this pollable is always ready immediately.
+        ///
+        /// The created `pollable` is a child resource of the `output-stream`.
+        /// Implementations may trap if the `output-stream` is dropped before
+        /// all derived `pollable`s created with this function are dropped.
+        @since(version = 0.2.0)
+        subscribe: func() -> pollable;
+
+        /// Write zeroes to a stream.
+        ///
+        /// This should be used precisely like `write` with the exact same
+        /// preconditions (must use check-write first), but instead of
+        /// passing a list of bytes, you simply pass the number of zero-bytes
+        /// that should be written.
+        @since(version = 0.2.0)
+        write-zeroes: func(
+            /// The number of zero-bytes to write
+            len: u64
+        ) -> result<_, stream-error>;
+
+        /// Perform a write of up to 4096 zeroes, and then flush the stream.
+        /// Block until all of these operations are complete, or an error
+        /// occurs.
+        ///
+        /// This is a convenience wrapper around the use of `check-write`,
+        /// `subscribe`, `write-zeroes`, and `flush`, and is implemented with
+        /// the following pseudo-code:
+        ///
+        /// ```text
+        /// let pollable = this.subscribe();
+        /// while num_zeroes != 0 {
+        ///     // Wait for the stream to become writable
+        ///     pollable.block();
+        ///     let Ok(n) = this.check-write(); // eliding error handling
+        ///     let len = min(n, num_zeroes);
+        ///     this.write-zeroes(len); // eliding error handling
+        ///     num_zeroes -= len;
+        /// }
+        /// this.flush();
+        /// // Wait for completion of `flush`
+        /// pollable.block();
+        /// // Check for any errors that arose during `flush`
+        /// let _ = this.check-write(); // eliding error handling
+        /// ```
+        @since(version = 0.2.0)
+        blocking-write-zeroes-and-flush: func(
+            /// The number of zero-bytes to write
+            len: u64
+        ) -> result<_, stream-error>;
+
+        /// Read from one stream and write to another.
+        ///
+        /// The behavior of splice is equivalent to:
+        /// 1. calling `check-write` on the `output-stream`
+        /// 2. calling `read` on the `input-stream` with the smaller of the
+        /// `check-write` permitted length and the `len` provided to `splice`
+        /// 3. calling `write` on the `output-stream` with that read data.
+        ///
+        /// Any error reported by the call to `check-write`, `read`, or
+        /// `write` ends the splice and reports that error.
+        ///
+        /// This function returns the number of bytes transferred; it may be less
+        /// than `len`.
+        @since(version = 0.2.0)
+        splice: func(
+            /// The stream to read from
+            src: borrow<input-stream>,
+            /// The number of bytes to splice
+            len: u64,
+        ) -> result<u64, stream-error>;
+
+        /// Read from one stream and write to another, with blocking.
+        ///
+        /// This is similar to `splice`, except that it blocks until the
+        /// `output-stream` is ready for writing, and the `input-stream`
+        /// is ready for reading, before performing the `splice`.
+        @since(version = 0.2.0)
+        blocking-splice: func(
+            /// The stream to read from
+            src: borrow<input-stream>,
+            /// The number of bytes to splice
+            len: u64,
+        ) -> result<u64, stream-error>;
+    }
+}
diff --git a/search/typesense/wit/deps/wasi:io/world.wit b/search/typesense/wit/deps/wasi:io/world.wit
new file mode 100644
index 000000000..f1d2102dc
--- /dev/null
+++ b/search/typesense/wit/deps/wasi:io/world.wit
@@ -0,0 +1,10 @@
+package wasi:io@0.2.3;
+
+@since(version = 0.2.0)
+world imports {
+    @since(version = 0.2.0)
+    import streams;
+
+    @since(version = 0.2.0)
+    import poll;
+}
diff --git a/search/typesense/wit/typesense.wit b/search/typesense/wit/typesense.wit
new file mode 100644
index 000000000..279d1f3f7
--- /dev/null
+++ b/search/typesense/wit/typesense.wit
@@ -0,0 +1,6 @@
+package golem:search-typesense;
+
+world typesense-provider {
+    import golem:search/core@1.0.0;
+    export golem:search/core@1.0.0;
+}
diff --git a/search/wit/golem-search.wit b/search/wit/golem-search.wit
new file mode 100644
index 000000000..64c822dfa
--- /dev/null
+++ b/search/wit/golem-search.wit
@@ -0,0 +1,134 @@
+package golem:search@1.0.0;
+
+/// Core types and error handling for universal search interfaces
+interface types {
+    /// Common structured errors for search operations
+    variant search-error {
+        index-not-found,
+        invalid-query(string),
+        unsupported,
+        internal(string),
+        timeout,
+        rate-limited,
+    }
+
+    /// Identifier types
+    type index-name = string;
+    type document-id = string;
+    type json = string;
+
+    /// Document payload
+    record doc {
+        id: document-id,
+        content: json,
+    }
+
+    /// Highlight configuration
+    record highlight-config {
+        fields: list<string>,
+        pre-tag: option<string>,
+        post-tag: option<string>,
+        max-length: option<u32>,
+    }
+
+    /// Advanced search tuning
+    record search-config {
+        timeout-ms: option<u32>,
+        boost-fields: list<tuple<string, f32>>,
+        attributes-to-retrieve: list<string>,
+        language: option<string>,
+        typo-tolerance: option<bool>,
+        exact-match-boost: option<f32>,
+        provider-params: option<json>,
+    }
+
+    /// Search request
+    record search-query {
+        q: option<string>,
+        filters: list<string>,
+        sort: list<string>,
+        facets: list<string>,
+        page: option<u32>,
+        per-page: option<u32>,
+        offset: option<u32>,
+        highlight: option<highlight-config>,
+        config: option<search-config>,
+    }
+
+    /// Search hit
+    record search-hit {
+        id: document-id,
+        score: option<f64>,
+        content: option<json>,
+        highlights: option<json>,
+    }
+
+    /// Search result set
+    record search-results {
+        total: option<u32>,
+        page: option<u32>,
+        per-page: option<u32>,
+        hits: list<search-hit>,
+        facets: option<json>,
+        took-ms: option<u32>,
+    }
+
+    /// Field schema types
+    enum field-type {
+        text,
+        keyword,
+        integer,
+        float,
+        boolean,
+        date,
+        geo-point,
+    }
+
+    /// Field definition
+    record schema-field {
+        name: string,
+        type: field-type,
+        required: bool,
+        facet: bool,
+        sort: bool,
+        index: bool,
+    }
+
+    /// Index schema
+    record schema {
+        fields: list<schema-field>,
+        primary-key: option<string>,
+    }
+}
+
+/// Unified search interface
+interface core {
+    use types.{
+        index-name, document-id, doc, search-query, search-results,
+        search-hit, schema, search-error
+    };
+
+    // Index lifecycle
+    create-index: func(name: index-name, schema: option<schema>) -> result<_, search-error>;
+    delete-index: func(name: index-name) -> result<_, search-error>;
+    list-indexes: func() -> result<list<index-name>, search-error>;
+
+    // Document operations
+    upsert: func(index: index-name, doc: doc) -> result<_, search-error>;
+    upsert-many: func(index: index-name, docs: list<doc>) -> result<_, search-error>;
+    delete: func(index: index-name, id: document-id) -> result<_, search-error>;
+    delete-many: func(index: index-name, ids: list<document-id>) -> result<_, search-error>;
+    get: func(index: index-name, id: document-id) -> result<option<doc>, search-error>;
+
+    // Query
+    search: func(index: index-name, query: search-query) -> result<search-results, search-error>;
+    stream-search: func(index: index-name, query: search-query) -> result<list<search-hit>, search-error>;
+
+    // Schema inspection
+    get-schema: func(index: index-name) -> result<schema, search-error>;
+    update-schema: func(index: index-name, schema: schema) -> result<_, search-error>;
+}
+
+world search-library {
+    export core;
+}
diff --git a/test/components-rust/test-search/Cargo.toml b/test/components-rust/test-search/Cargo.toml
new file mode 100644
index 000000000..df8cbc621
--- /dev/null
+++ b/test/components-rust/test-search/Cargo.toml
@@ -0,0 +1,28 @@
+[package]
+name = "test-search"
+version = "0.0.0"
+edition = "2021"
+license = "Apache-2.0"
+
+[lib]
+path = "src/lib.rs"
+crate-type = ["cdylib"]
+
+[dependencies]
+wit-bindgen = { workspace = true }
+serde_json = { workspace = true }
+
+[package.metadata.component]
+package = "test:search"
+
+[package.metadata.component.bindings]
+generate_unused_types = true
+
+[package.metadata.component.bindings.with]
+"golem:search/core@1.0.0" = "bindings::golem::search::core"
+
+[package.metadata.component.target]
+path = "wit"
+
+[package.metadata.component.target.dependencies]
+"golem:search" = { path = "wit/deps/golem-search" }
diff --git a/test/components-rust/test-search/src/lib.rs b/test/components-rust/test-search/src/lib.rs
new file mode 100644
index 000000000..066c665d9
--- /dev/null
+++ b/test/components-rust/test-search/src/lib.rs
@@ -0,0 +1,296 @@
+wit_bindgen::generate!({
+    path: "wit",
+    with: {}
+});
+
+use bindings::golem::search::core::{
+    create_index, delete_index, get, list_indexes, search, stream_search, upsert, upsert_many
+};
+use bindings::golem::search::types::{
+    Doc, FieldType, HighlightConfig, Schema, SchemaField, SearchQuery
+};
+use bindings::exports::test::search::test::Guest;
+
+struct Component;
+
+impl Guest for Component {
+    fn test_basic_search() -> Result<String, String> {
+        let index_name = "test-basic-search".to_string();
+
+        let schema = Schema {
+            fields: vec![
+                SchemaField {
+                    name: "title".to_string(),
+                    r#type: FieldType::Text,
+                    required: false,
+                    facet: false,
+                    sort: false,
+                    index: true,
+                },
+                SchemaField {
+                    name: "content".to_string(),
+                    r#type: FieldType::Text,
+                    required: false,
+                    facet: false,
+                    sort: false,
+                    index: true,
+                },
+            ],
+            primary_key: Some("id".to_string()),
+        };
+
+        create_index(index_name.clone(), Some(schema))
+            .map_err(|e| format!("Failed to create index: {:?}", e))?;
+
+        let docs = vec![
+            Doc {
+                id: "1".to_string(),
+                content: r#"{"title": "First Document", "content": "This is the content of the first document"}"#.to_string(),
+            },
+            Doc {
+                id: "2".to_string(),
+                content: r#"{"title": "Second Document", "content": "This is the content of the second document"}"#.to_string(),
+            },
+        ];
+
+        upsert_many(index_name.clone(), docs)
+            .map_err(|e| format!("Failed to upsert documents: {:?}", e))?;
+
+        let query = SearchQuery {
+            q: Some("first".to_string()),
+            filters: vec![],
+            sort: vec![],
+            facets: vec![],
+            page: Some(1),
+            per_page: Some(10),
+            offset: None,
+            highlight: None,
+            config: None,
+        };
+
+        let results = search(index_name.clone(), query)
+            .map_err(|e| format!("Failed to search: {:?}", e))?;
+
+        delete_index(index_name)
+            .map_err(|e| format!("Failed to delete index: {:?}", e))?;
+
+        Ok(format!("Basic search test passed. Found {} hits", results.hits.len()))
+    }
+
+    fn test_index_operations() -> Result<String, String> {
+        let index_name = "test-index-ops".to_string();
+
+        create_index(index_name.clone(), None)
+            .map_err(|e| format!("Failed to create index: {:?}", e))?;
+
+        let indexes = list_indexes()
+            .map_err(|e| format!("Failed to list indexes: {:?}", e))?;
+
+        if !indexes.contains(&index_name) {
+            return Err("Created index not found in list".to_string());
+        }
+
+        delete_index(index_name.clone())
+            .map_err(|e| format!("Failed to delete index: {:?}", e))?;
+
+        let indexes_after = list_indexes()
+            .map_err(|e| format!("Failed to list indexes after deletion: {:?}", e))?;
+
+        if indexes_after.contains(&index_name) {
+            return Err("Index still found after deletion".to_string());
+        }
+
+        Ok("Index operations test passed".to_string())
+    }
+
+    fn test_document_operations() -> Result<String, String> {
+        let index_name = "test-doc-ops".to_string();
+
+        create_index(index_name.clone(), None)
+            .map_err(|e| format!("Failed to create index: {:?}", e))?;
+
+        let doc = Doc {
+            id: "test-doc-1".to_string(),
+            content: r#"{"title": "Test Document", "content": "Test content"}"#.to_string(),
+        };
+
+        upsert(index_name.clone(), doc.clone())
+            .map_err(|e| format!("Failed to upsert document: {:?}", e))?;
+
+        let retrieved = get(index_name.clone(), doc.id.clone())
+            .map_err(|e| format!("Failed to get document: {:?}", e))?;
+
+        if retrieved.is_none() {
+            return Err("Document not found after upsert".to_string());
+        }
+
+        delete_index(index_name)
+            .map_err(|e| format!("Failed to delete index: {:?}", e))?;
+
+        Ok("Document operations test passed".to_string())
+    }
+
+    fn test_schema_operations() -> Result<String, String> {
+        let index_name = "durability-test".to_string();
+
+        create_index(index_name.clone(), None)
+            .map_err(|e| format!("Failed to create index: {:?}", e))?;
+
+        let docs = vec![
+            Doc {
+                id: "durable-1".to_string(),
+                content: r#"{"status": "processing", "user_id": "user123"}"#.to_string(),
+            },
+        ];
+
+        upsert_many(index_name.clone(), docs)
+            .map_err(|e| format!("Failed to upsert documents: {:?}", e))?;
+
+        let query = SearchQuery {
+            q: Some("*".to_string()),
+            filters: vec!["status:eq:processing".to_string()],
+            sort: vec![],
+            facets: vec![],
+            page: Some(1),
+            per_page: Some(10),
+            offset: None,
+            highlight: None,
+            config: None,
+        };
+
+        let results = search(index_name.clone(), query)
+            .map_err(|e| format!("Failed to search: {:?}", e))?;
+        delete_index(index_name)
+            .map_err(|e| format!("Failed to delete index: {:?}", e))?;
+
+        Ok(format!(
+            "Durability test passed: search succeeded after recovery, found {} results",
+            results.hits.len()
+        ))
+    }
+
+    fn test_streaming_search() -> Result<String, String> {
+        let index_name = "test-streaming".to_string();
+
+        create_index(index_name.clone(), None)
+            .map_err(|e| format!("Failed to create index: {:?}", e))?;
+
+        let docs = vec![
+            Doc {
+                id: "1".to_string(),
+                content: r#"{"title": "Document 1", "content": "Content one"}"#.to_string(),
+            },
+            Doc {
+                id: "2".to_string(),
+                content: r#"{"title": "Document 2", "content": "Content two"}"#.to_string(),
+            },
+            Doc {
+                id: "3".to_string(),
+                content: r#"{"title": "Document 3", "content": "Content three"}"#.to_string(),
+            },
+        ];
+
+        upsert_many(index_name.clone(), docs)
+            .map_err(|e| format!("Failed to upsert documents: {:?}", e))?;
+
+        let query = SearchQuery {
+            q: Some("content".to_string()),
+            filters: vec![],
+            sort: vec![],
+            facets: vec![],
+            page: None,
+            per_page: Some(2),
+            offset: None,
+            highlight: None,
+            config: None,
+        };
+
+        let stream = stream_search(index_name.clone(), query)
+            .map_err(|e| format!("Failed to create search stream: {:?}", e))?;
+
+        let mut total_hits = 0;
+        while let Some(hits) = stream.get_next() {
+            total_hits += hits.len();
+        }
+
+        delete_index(index_name)
+            .map_err(|e| format!("Failed to delete index: {:?}", e))?;
+
+        Ok(format!("Streaming search test passed. Streamed {} hits", total_hits))
+    }
+
+    fn test_facets_and_filters() -> Result<String, String> {
+        let index_name = "test-facets".to_string();
+
+        let schema = Schema {
+            fields: vec![
+                SchemaField {
+                    name: "category".to_string(),
+                    r#type: FieldType::Keyword,
+                    required: false,
+                    facet: true,
+                    sort: true,
+                    index: true,
+                },
+                SchemaField {
+                    name: "price".to_string(),
+                    r#type: FieldType::Float,
+                    required: false,
+                    facet: false,
+                    sort: true,
+                    index: true,
+                },
+                SchemaField {
+                    name: "title".to_string(),
+                    r#type: FieldType::Text,
+                    required: false,
+                    facet: false,
+                    sort: false,
+                    index: true,
+                },
+            ],
+            primary_key: Some("id".to_string()),
+        };
+
+        create_index(index_name.clone(), Some(schema))
+            .map_err(|e| format!("Failed to create index: {:?}", e))?;
+
+        let docs = vec![
+            Doc {
+                id: "1".to_string(),
+                content: r#"{"title": "Book A", "category": "books", "price": 19.99}"#.to_string(),
+            },
+            Doc {
+                id: "2".to_string(),
+                content: r#"{"title": "Book B", "category": "books", "price": 29.99}"#.to_string(),
+            },
+            Doc {
+                id: "3".to_string(),
+                content: r#"{"title": "Electronics A", "category": "electronics", "price": 99.99}"#.to_string(),
+            },
+        ];
+
+        upsert_many(index_name.clone(), docs)
+            .map_err(|e| format!("Failed to upsert documents: {:?}", e))?;
+
+        let query = SearchQuery {
+            q: Some("*".to_string()),
+            filters: vec!["category:eq:books".to_string()],
+            sort: vec!["price".to_string()],
+            facets: vec!["category".to_string()],
+            page: Some(1),
+            per_page: Some(10),
+            offset: None,
+            highlight: Some(HighlightConfig {
+                fields: vec!["title".to_string()],
+                pre_tag: Some("<b>".to_string()),
+                post_tag: Some("</b>".to_string()),
+                max_length: Some(100),
+            }),
+            config: None,
+        };
+
+        let results = search(index_name.clone(), query)
+            .map_err(|e| format!("Failed to search with filters: {:?}", e))?;
+
+        delete_index(index_name)
+            .map_err(|e| format!("Failed to delete index: {:?}", e))?;
+
+        Ok(format!("Facets and filters test passed. Found {} hits with facets", results.hits.len()))
+    }
+}
+
+bindings::export!(Component with_types_in bindings);
diff --git a/test/components-rust/test-search/wit/deps/golem-search/golem-search.wit b/test/components-rust/test-search/wit/deps/golem-search/golem-search.wit
new file mode 100644
index 000000000..64c822dfa
--- /dev/null
+++ b/test/components-rust/test-search/wit/deps/golem-search/golem-search.wit
@@ -0,0 +1,134 @@
+package golem:search@1.0.0;
+
+/// Core types and error handling for universal search interfaces
+interface types {
+    /// Common structured errors for search operations
+    variant search-error {
+        index-not-found,
+        invalid-query(string),
+        unsupported,
+        internal(string),
+        timeout,
+        rate-limited,
+    }
+
+    /// Identifier types
+    type index-name = string;
+    type document-id = string;
+    type json = string;
+
+    /// Document payload
+    record doc {
+        id: document-id,
+        content: json,
+    }
+
+    /// Highlight configuration
+    record highlight-config {
+        fields: list<string>,
+        pre-tag: option<string>,
+        post-tag: option<string>,
+        max-length: option<u32>,
+    }
+
+    /// Advanced search tuning
+    record search-config {
+        timeout-ms: option<u32>,
+        boost-fields: list<tuple<string, f32>>,
+        attributes-to-retrieve: list<string>,
+        language: option<string>,
+        typo-tolerance: option<bool>,
+        exact-match-boost: option<f32>,
+        provider-params: option<json>,
+    }
+
+    /// Search request
+    record search-query {
+        q: option<string>,
+        filters: list<string>,
+        sort: list<string>,
+        facets: list<string>,
+        page: option<u32>,
+        per-page: option<u32>,
+        offset: option<u32>,
+        highlight: option<highlight-config>,
+        config: option<search-config>,
+    }
+
+    /// Search hit
+    record search-hit {
+        id: document-id,
+        score: option<f64>,
+        content: option<json>,
+        highlights: option<json>,
+    }
+
+    /// Search result set
+    record search-results {
+        total: option<u32>,
+        page: option<u32>,
+        per-page: option<u32>,
+        hits: list<search-hit>,
+        facets: option<json>,
+        took-ms: option<u32>,
+    }
+
+    /// Field schema types
+    enum field-type {
+        text,
+        keyword,
+        integer,
+        float,
+        boolean,
+        date,
+        geo-point,
+    }
+
+    /// Field definition
+    record schema-field {
+        name: string,
+        type: field-type,
+        required: bool,
+        facet: bool,
+        sort: bool,
+        index: bool,
+    }
+
+    /// Index schema
+    record schema {
+        fields: list<schema-field>,
+        primary-key: option<string>,
+    }
+}
+
+/// Unified search interface
+interface core {
+    use types.{
+        index-name, document-id, doc, search-query, search-results,
+        search-hit, schema, search-error
+    };
+
+    // Index lifecycle
+    create-index: func(name: index-name, schema: option<schema>) -> result<_, search-error>;
+    delete-index: func(name: index-name) -> result<_, search-error>;
+    list-indexes: func() -> result<list<index-name>, search-error>;
+
+    // Document operations
+    upsert: func(index: index-name, doc: doc) -> result<_, search-error>;
+    upsert-many: func(index: index-name, docs: list<doc>) -> result<_, search-error>;
+    delete: func(index: index-name, id: document-id) -> result<_, search-error>;
+    delete-many: func(index: index-name, ids: list<document-id>) -> result<_, search-error>;
+    get: func(index: index-name, id: document-id) -> result<option<doc>, search-error>;
+
+    // Query
+    search: func(index: index-name, query: search-query) -> result<search-results, search-error>;
+    stream-search: func(index: index-name, query: search-query) -> result<list<search-hit>, search-error>;
+
+    // Schema inspection
+    get-schema: func(index: index-name) -> result<schema, search-error>;
+    update-schema: func(index: index-name, schema: schema) -> result<_, search-error>;
+}
+
+world search-library {
+    export core;
+}
diff --git a/test/components-rust/test-search/wit/deps/wasi:io/error.wit b/test/components-rust/test-search/wit/deps/wasi:io/error.wit
new file mode 100644
index 000000000..97c606877
--- /dev/null
+++ b/test/components-rust/test-search/wit/deps/wasi:io/error.wit
diff --git a/test/components-rust/test-search/wit/deps/wasi:io/error.wit b/test/components-rust/test-search/wit/deps/wasi:io/error.wit
new file mode 100644
index 000000000..97c606877
--- /dev/null
+++ b/test/components-rust/test-search/wit/deps/wasi:io/error.wit
@@ -0,0 +1,34 @@
+package wasi:io@0.2.3;
+
+@since(version = 0.2.0)
+interface error {
+    /// A resource which represents some error information.
+    ///
+    /// The only method provided by this resource is `to-debug-string`,
+    /// which provides some human-readable information about the error.
+    ///
+    /// In the `wasi:io` package, this resource is returned through the
+    /// `wasi:io/streams/stream-error` type.
+    ///
+    /// To provide more specific error information, other interfaces may
+    /// offer functions to "downcast" this error into more specific types. For example,
+    /// errors returned from streams derived from filesystem types can be described using
+    /// the filesystem's own error-code type. This is done using the function
+    /// `wasi:filesystem/types/filesystem-error-code`, which takes a `borrow<error>`
+    /// parameter and returns an `option<filesystem-error-code>`.
+    ///
+    /// The set of functions which can "downcast" an `error` into a more
+    /// concrete type is open.
+    @since(version = 0.2.0)
+    resource error {
+        /// Returns a string that is suitable to assist humans in debugging
+        /// this error.
+        ///
+        /// WARNING: The returned string should not be consumed mechanically!
+        /// It may change across platforms, hosts, or other implementation
+        /// details. Parsing this string is a major platform-compatibility
+        /// hazard.
+        @since(version = 0.2.0)
+        to-debug-string: func() -> string;
+    }
+}
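Editorial note: a hedged sketch of how a guest consumes this opaque error resource through wit-bindgen-generated Rust bindings (the `bindings::wasi::io` module path is an assumption):

```rust
// Sketch: assumes wit-bindgen has generated wasi:io stream bindings under
// this module path; the path itself is not defined by this PR.
use crate::bindings::wasi::io::streams::{InputStream, StreamError};

fn read_some(stream: &InputStream) -> Result<Vec<u8>, String> {
    match stream.read(4096) {
        Ok(bytes) => Ok(bytes),
        // `closed` is terminal: the stream will never yield more data.
        Err(StreamError::Closed) => Ok(Vec::new()),
        // The opaque error resource only promises a human-readable debug
        // string; per the docs above, never parse it mechanically.
        Err(StreamError::LastOperationFailed(err)) => Err(err.to_debug_string()),
    }
}
```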
diff --git a/test/components-rust/test-search/wit/deps/wasi:io/poll.wit b/test/components-rust/test-search/wit/deps/wasi:io/poll.wit
new file mode 100644
index 000000000..9bcbe8e03
--- /dev/null
+++ b/test/components-rust/test-search/wit/deps/wasi:io/poll.wit
@@ -0,0 +1,47 @@
+package wasi:io@0.2.3;
+
+/// A poll API intended to let users wait for I/O events on multiple handles
+/// at once.
+@since(version = 0.2.0)
+interface poll {
+    /// `pollable` represents a single I/O event which may be ready, or not.
+    @since(version = 0.2.0)
+    resource pollable {
+
+        /// Return the readiness of a pollable. This function never blocks.
+        ///
+        /// Returns `true` when the pollable is ready, and `false` otherwise.
+        @since(version = 0.2.0)
+        ready: func() -> bool;
+
+        /// `block` returns immediately if the pollable is ready, and otherwise
+        /// blocks until ready.
+        ///
+        /// This function is equivalent to calling `poll.poll` on a list
+        /// containing only this pollable.
+        @since(version = 0.2.0)
+        block: func();
+    }
+
+    /// Poll for completion on a set of pollables.
+    ///
+    /// This function takes a list of pollables, which identify I/O sources of
+    /// interest, and waits until one or more of the events is ready for I/O.
+    ///
+    /// The result `list<u32>` contains one or more indices of handles in the
+    /// argument list that are ready for I/O.
+    ///
+    /// This function traps if either:
+    /// - the list is empty, or:
+    /// - the list contains more elements than can be indexed with a `u32` value.
+    ///
+    /// A timeout can be implemented by adding a pollable from the
+    /// wasi-clocks API to the list.
+    ///
+    /// This function does not return a `result`; polling in itself does not
+    /// do any I/O so it doesn't fail. If any of the I/O sources identified by
+    /// the pollables has an error, it is indicated by marking the source as
+    /// being ready for I/O.
+    @since(version = 0.2.0)
+    poll: func(in: list<borrow<pollable>>) -> list<u32>;
+}
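Editorial note: a sketch of a readiness wait over several sources using the generated Rust bindings (module path assumed as in the earlier examples):

```rust
// Sketch: assumes wit-bindgen-generated `wasi::io::poll` bindings.
use crate::bindings::wasi::io::poll::{poll, Pollable};

/// Wait until at least one of the given sources is ready, returning the
/// indices of the ready ones. Per the docs above, a timeout can be modelled
/// by pushing a wasi-clocks pollable into `sources` before calling this.
fn wait_any(sources: &[&Pollable]) -> Vec<u32> {
    // `poll` traps on an empty list, so guard against that here.
    assert!(!sources.is_empty(), "poll requires at least one pollable");
    poll(sources)
}
```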
diff --git a/test/components-rust/test-search/wit/deps/wasi:io/streams.wit b/test/components-rust/test-search/wit/deps/wasi:io/streams.wit
new file mode 100644
index 000000000..0de084629
--- /dev/null
+++ b/test/components-rust/test-search/wit/deps/wasi:io/streams.wit
@@ -0,0 +1,290 @@
+package wasi:io@0.2.3;
+
+/// WASI I/O is an I/O abstraction API which is currently focused on providing
+/// stream types.
+///
+/// In the future, the component model is expected to add built-in stream types;
+/// when it does, they are expected to subsume this API.
+@since(version = 0.2.0)
+interface streams {
+    @since(version = 0.2.0)
+    use error.{error};
+    @since(version = 0.2.0)
+    use poll.{pollable};
+
+    /// An error for input-stream and output-stream operations.
+    @since(version = 0.2.0)
+    variant stream-error {
+        /// The last operation (a write or flush) failed before completion.
+        ///
+        /// More information is available in the `error` payload.
+        ///
+        /// After this, the stream will be closed. All future operations return
+        /// `stream-error::closed`.
+        last-operation-failed(error),
+        /// The stream is closed: no more input will be accepted by the
+        /// stream. A closed output-stream will return this error on all
+        /// future operations.
+        closed
+    }
+
+    /// An input bytestream.
+    ///
+    /// `input-stream`s are *non-blocking* to the extent practical on underlying
+    /// platforms. I/O operations always return promptly; if fewer bytes are
+    /// promptly available than requested, they return the number of bytes promptly
+    /// available, which could even be zero. To wait for data to be available,
+    /// use the `subscribe` function to obtain a `pollable` which can be polled
+    /// for using `wasi:io/poll`.
+    @since(version = 0.2.0)
+    resource input-stream {
+        /// Perform a non-blocking read from the stream.
+        ///
+        /// When the source of a `read` is binary data, the bytes from the source
+        /// are returned verbatim. When the source of a `read` is known to the
+        /// implementation to be text, bytes containing the UTF-8 encoding of the
+        /// text are returned.
+        ///
+        /// This function returns a list of bytes containing the read data,
+        /// when successful. The returned list will contain up to `len` bytes;
+        /// it may return fewer than requested, but not more. The list is
+        /// empty when no bytes are available for reading at this time. The
+        /// pollable given by `subscribe` will be ready when more bytes are
+        /// available.
+        ///
+        /// This function fails with a `stream-error` when the operation
+        /// encounters an error, giving `last-operation-failed`, or when the
+        /// stream is closed, giving `closed`.
+        ///
+        /// When the caller gives a `len` of 0, it represents a request to
+        /// read 0 bytes. If the stream is still open, this call should
+        /// succeed and return an empty list, or otherwise fail with `closed`.
+        ///
+        /// The `len` parameter is a `u64`, which could represent a list of u8 which
+        /// is not possible to allocate in wasm32, or not desirable to allocate as
+        /// a return value by the callee. The callee may return a list of bytes
+        /// less than `len` in size while more bytes are available for reading.
+        @since(version = 0.2.0)
+        read: func(
+            /// The maximum number of bytes to read
+            len: u64
+        ) -> result<list<u8>, stream-error>;
+
+        /// Read bytes from a stream, after blocking until at least one byte can
+        /// be read. Except for blocking, behavior is identical to `read`.
+        @since(version = 0.2.0)
+        blocking-read: func(
+            /// The maximum number of bytes to read
+            len: u64
+        ) -> result<list<u8>, stream-error>;
+
+        /// Skip bytes from a stream. Returns number of bytes skipped.
+        ///
+        /// Behaves identical to `read`, except instead of returning a list
+        /// of bytes, returns the number of bytes consumed from the stream.
+        @since(version = 0.2.0)
+        skip: func(
+            /// The maximum number of bytes to skip.
+            len: u64,
+        ) -> result<u64, stream-error>;
+
+        /// Skip bytes from a stream, after blocking until at least one byte
+        /// can be skipped. Except for blocking behavior, identical to `skip`.
+        @since(version = 0.2.0)
+        blocking-skip: func(
+            /// The maximum number of bytes to skip.
+            len: u64,
+        ) -> result<u64, stream-error>;
+
+        /// Create a `pollable` which will resolve once either the specified stream
+        /// has bytes available to read or the other end of the stream has been
+        /// closed.
+        /// The created `pollable` is a child resource of the `input-stream`.
+        /// Implementations may trap if the `input-stream` is dropped before
+        /// all derived `pollable`s created with this function are dropped.
+        @since(version = 0.2.0)
+        subscribe: func() -> pollable;
+    }
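Editorial note: the `read`/`subscribe` contract above implies a standard drain loop; a sketch in Rust, with the binding paths assumed as before:

```rust
// Sketch of the documented "subscribe, block, read" pattern.
use crate::bindings::wasi::io::streams::{InputStream, StreamError};

/// Read an entire stream to its end, blocking whenever no bytes are
/// promptly available. An empty chunk just means "nothing yet", so
/// readiness is awaited via the stream's child pollable.
fn read_to_end(stream: &InputStream) -> Result<Vec<u8>, String> {
    let mut out = Vec::new();
    let ready = stream.subscribe();
    loop {
        match stream.read(64 * 1024) {
            Ok(chunk) if chunk.is_empty() => ready.block(), // wait for more input
            Ok(chunk) => out.extend_from_slice(&chunk),
            Err(StreamError::Closed) => return Ok(out), // end of stream
            Err(StreamError::LastOperationFailed(e)) => return Err(e.to_debug_string()),
        }
    }
}
```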
+
+
+    /// An output bytestream.
+    ///
+    /// `output-stream`s are *non-blocking* to the extent practical on
+    /// underlying platforms. Except where specified otherwise, I/O operations also
+    /// always return promptly, after the number of bytes that can be written
+    /// promptly, which could even be zero. To wait for the stream to be ready to
+    /// accept data, use the `subscribe` function to obtain a `pollable` which can
+    /// be polled for using `wasi:io/poll`.
+    ///
+    /// Dropping an `output-stream` while there's still an active write in
+    /// progress may result in the data being lost. Before dropping the stream,
+    /// be sure to fully flush your writes.
+    @since(version = 0.2.0)
+    resource output-stream {
+        /// Check readiness for writing. This function never blocks.
+        ///
+        /// Returns the number of bytes permitted for the next call to `write`,
+        /// or an error. Calling `write` with more bytes than this function has
+        /// permitted will trap.
+        ///
+        /// When this function returns 0 bytes, the `subscribe` pollable will
+        /// become ready when this function will report at least 1 byte, or an
+        /// error.
+        @since(version = 0.2.0)
+        check-write: func() -> result<u64, stream-error>;
+
+        /// Perform a write. This function never blocks.
+        ///
+        /// When the destination of a `write` is binary data, the bytes from
+        /// `contents` are written verbatim. When the destination of a `write` is
+        /// known to the implementation to be text, the bytes of `contents` are
+        /// transcoded from UTF-8 into the encoding of the destination and then
+        /// written.
+        ///
+        /// Precondition: check-write gave permit of Ok(n) and contents has a
+        /// length of less than or equal to n. Otherwise, this function will trap.
+        ///
+        /// Returns Err(closed) without writing if the stream has closed since
+        /// the last call to check-write provided a permit.
+        @since(version = 0.2.0)
+        write: func(
+            contents: list<u8>
+        ) -> result<_, stream-error>;
+
+        /// Perform a write of up to 4096 bytes, and then flush the stream. Block
+        /// until all of these operations are complete, or an error occurs.
+        ///
+        /// This is a convenience wrapper around the use of `check-write`,
+        /// `subscribe`, `write`, and `flush`, and is implemented with the
+        /// following pseudo-code:
+        ///
+        /// ```text
+        /// let pollable = this.subscribe();
+        /// while !contents.is_empty() {
+        ///     // Wait for the stream to become writable
+        ///     pollable.block();
+        ///     let Ok(n) = this.check-write(); // eliding error handling
+        ///     let len = min(n, contents.len());
+        ///     let (chunk, rest) = contents.split_at(len);
+        ///     this.write(chunk); // eliding error handling
+        ///     contents = rest;
+        /// }
+        /// this.flush();
+        /// // Wait for completion of `flush`
+        /// pollable.block();
+        /// // Check for any errors that arose during `flush`
+        /// let _ = this.check-write(); // eliding error handling
+        /// ```
+        @since(version = 0.2.0)
+        blocking-write-and-flush: func(
+            contents: list<u8>
+        ) -> result<_, stream-error>;
+
+        /// Request to flush buffered output. This function never blocks.
+        ///
+        /// This tells the output-stream that the caller intends any buffered
+        /// output to be flushed. The output which is expected to be flushed
+        /// is all that has been passed to `write` prior to this call.
+        ///
+        /// Upon calling this function, the `output-stream` will not accept any
+        /// writes (`check-write` will return `ok(0)`) until the flush has
+        /// completed. The `subscribe` pollable will become ready when the
+        /// flush has completed and the stream can accept more writes.
+        @since(version = 0.2.0)
+        flush: func() -> result<_, stream-error>;
+
+        /// Request to flush buffered output, and block until flush completes
+        /// and stream is ready for writing again.
+        @since(version = 0.2.0)
+        blocking-flush: func() -> result<_, stream-error>;
+
+        /// Create a `pollable` which will resolve once the output-stream
+        /// is ready for more writing, or an error has occurred. When this
+        /// pollable is ready, `check-write` will return `ok(n)` with n>0, or an
+        /// error.
+        ///
+        /// If the stream is closed, this pollable is always ready immediately.
+        ///
+        /// The created `pollable` is a child resource of the `output-stream`.
+        /// Implementations may trap if the `output-stream` is dropped before
+        /// all derived `pollable`s created with this function are dropped.
+        @since(version = 0.2.0)
+        subscribe: func() -> pollable;
+
+        /// Write zeroes to a stream.
+        ///
+        /// This should be used precisely like `write` with the exact same
+        /// preconditions (must use check-write first), but instead of
+        /// passing a list of bytes, you simply pass the number of zero-bytes
+        /// that should be written.
+        @since(version = 0.2.0)
+        write-zeroes: func(
+            /// The number of zero-bytes to write
+            len: u64
+        ) -> result<_, stream-error>;
+
+        /// Perform a write of up to 4096 zeroes, and then flush the stream.
+        /// Block until all of these operations are complete, or an error
+        /// occurs.
+        ///
+        /// This is a convenience wrapper around the use of `check-write`,
+        /// `subscribe`, `write-zeroes`, and `flush`, and is implemented with
+        /// the following pseudo-code:
+        ///
+        /// ```text
+        /// let pollable = this.subscribe();
+        /// while num_zeroes != 0 {
+        ///     // Wait for the stream to become writable
+        ///     pollable.block();
+        ///     let Ok(n) = this.check-write(); // eliding error handling
+        ///     let len = min(n, num_zeroes);
+        ///     this.write-zeroes(len); // eliding error handling
+        ///     num_zeroes -= len;
+        /// }
+        /// this.flush();
+        /// // Wait for completion of `flush`
+        /// pollable.block();
+        /// // Check for any errors that arose during `flush`
+        /// let _ = this.check-write(); // eliding error handling
+        /// ```
+        @since(version = 0.2.0)
+        blocking-write-zeroes-and-flush: func(
+            /// The number of zero-bytes to write
+            len: u64
+        ) -> result<_, stream-error>;
+
+        /// Read from one stream and write to another.
+        ///
+        /// The behavior of splice is equivalent to:
+        /// 1. calling `check-write` on the `output-stream`
+        /// 2. calling `read` on the `input-stream` with the smaller of the
+        ///    `check-write` permitted length and the `len` provided to `splice`
+        /// 3. calling `write` on the `output-stream` with that read data.
+        ///
+        /// Any error reported by the call to `check-write`, `read`, or
+        /// `write` ends the splice and reports that error.
+        ///
+        /// This function returns the number of bytes transferred; it may be less
+        /// than `len`.
+        @since(version = 0.2.0)
+        splice: func(
+            /// The stream to read from
+            src: borrow<input-stream>,
+            /// The number of bytes to splice
+            len: u64,
+        ) -> result<u64, stream-error>;
+
+        /// Read from one stream and write to another, with blocking.
+        ///
+        /// This is similar to `splice`, except that it blocks until the
+        /// `output-stream` is ready for writing, and the `input-stream`
+        /// is ready for reading, before performing the `splice`.
+        @since(version = 0.2.0)
+        blocking-splice: func(
+            /// The stream to read from
+            src: borrow<input-stream>,
+            /// The number of bytes to splice
+            len: u64,
+        ) -> result<u64, stream-error>;
+    }
+}
diff --git a/test/components-rust/test-search/wit/deps/wasi:io/world.wit b/test/components-rust/test-search/wit/deps/wasi:io/world.wit
new file mode 100644
index 000000000..f1d2102dc
--- /dev/null
+++ b/test/components-rust/test-search/wit/deps/wasi:io/world.wit
@@ -0,0 +1,10 @@
+package wasi:io@0.2.3;
+
+@since(version = 0.2.0)
+world imports {
+    @since(version = 0.2.0)
+    import streams;
+
+    @since(version = 0.2.0)
+    import poll;
+}
diff --git a/test/components-rust/test-search/wit/test-search.wit b/test/components-rust/test-search/wit/test-search.wit
new file mode 100644
index 000000000..697776540
--- /dev/null
+++ b/test/components-rust/test-search/wit/test-search.wit
@@ -0,0 +1,15 @@
+package test:search;
+
+world test-search {
+  import golem:search/core@1.0.0;
+  export test;
+}
+
+interface test {
+  test-basic-search: func() -> result<string, string>;
+  test-index-operations: func() -> result<string, string>;
+  test-document-operations: func() -> result<string, string>;
+  test-schema-operations: func() -> result<string, string>;
+  test-streaming-search: func() -> result<string, string>;
+  test-facets-and-filters: func() -> result<string, string>;
+}
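Editorial note: the `test` interface above is what the Rust component earlier in this PR implements. In skeleton form it looks like the following; the generated trait path follows cargo-component's usual `exports::<package>::<interface>` naming, which is an assumption here rather than something this diff spells out:

```rust
// Skeleton guest component for the test-search world; stub bodies only.
#[allow(warnings)]
mod bindings;

// Assumed generated path for the exported `test` interface.
use crate::bindings::exports::test::search::test::Guest;

struct Component;

impl Guest for Component {
    fn test_basic_search() -> Result<String, String> { Ok("stub".to_string()) }
    fn test_index_operations() -> Result<String, String> { Ok("stub".to_string()) }
    fn test_document_operations() -> Result<String, String> { Ok("stub".to_string()) }
    fn test_schema_operations() -> Result<String, String> { Ok("stub".to_string()) }
    fn test_streaming_search() -> Result<String, String> { Ok("stub".to_string()) }
    fn test_facets_and_filters() -> Result<String, String> { Ok("stub".to_string()) }
}

bindings::export!(Component with_types_in bindings);
```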