
Commit 55af917

Merge pull request #139 from StarlightSearch/add-siglip
Add-siglip
2 parents (ccb3bf9 + 67c1d39), commit 55af917

File tree

18 files changed, +820 -145 lines changed


Cargo.lock

Lines changed: 1 addition & 0 deletions
Diff not rendered (generated file).

examples/clip.py

Lines changed: 10 additions & 5 deletions
@@ -8,7 +8,7 @@
 # Load the model.
 model = embed_anything.EmbeddingModel.from_pretrained_hf(
     embed_anything.WhichModel.Clip,
-    model_id="openai/clip-vit-base-patch16",
+    model_id="google/siglip-base-patch16-224",
 )
 data: list[EmbedData] = embed_anything.embed_image_directory(
     "test_files", embedder=model
@@ -17,10 +17,8 @@
 # Convert the embeddings to a numpy array
 embeddings = np.array([data.embedding for data in data])
 
-print(data[0])
-
 # Embed a query
-query = ["Photo of a monkey?"]
+query = ["Photo of a monkey"]
 query_embedding = np.array(
     embed_anything.embed_query(query, embedder=model)[0].embedding
 )
@@ -31,7 +29,14 @@
 # Find the index of the most similar embedding
 max_index = np.argmax(similarities)
 
+print("Descending order of similarity: ")
+indices = np.argsort(similarities)[::-1]
+for idx in indices:
+    print(data[idx].text)
+
+print("----------- ")
+
 # Print the most similar image
-print(data[max_index].text)
+print("Most similar image: ", data[max_index].text)
 end = time.time()
 print("Time taken: ", end - start)

examples/cohere_pdf.py

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+from embed_anything import EmbeddingModel, TextEmbedConfig, WhichModel
+import numpy as np
+from pathlib import Path
+from tabulate import tabulate
+from embed_anything import EmbedData
+from pdf2image import convert_from_path
+
+
+# Initialize the model once
+model: EmbeddingModel = EmbeddingModel.from_pretrained_cloud(
+    WhichModel.CohereVision, model_id="embed-v4.0"
+)
+
+
+# Get all PDF files in the directory
+directory = Path("test_files")
+files = directory.glob("*.pdf")
+# files = [Path("test_files/attention.pdf")]
+
+file_embed_data: list[EmbedData] = []
+for file in files:
+    try:
+        embedding: list[EmbedData] = model.embed_file(
+            str(file), TextEmbedConfig(batch_size=8)
+        )
+        file_embed_data.extend(embedding)
+    except Exception as e:
+        print(f"Error embedding file {file}: {e}")
+
+# Define the query
+query = "What are the Bleu score results for the attention paper?"
+
+# Scoring
+file_embeddings = np.array([e.embedding for e in file_embed_data])
+query_embedding = model.embed_query([query])
+query_embeddings = np.array([e.embedding for e in query_embedding])
+print(file_embeddings.shape)
+print(query_embeddings.shape)
+
+
+scores = np.dot(query_embeddings, file_embeddings.T).squeeze()
+
+# Get top pages
+top_pages = np.argsort(scores)[-5:][::-1].tolist()  # Convert to list
+
+print(top_pages)
+# Extract file names and page numbers
+table = [
+    [
+        file_embed_data[int(page)].metadata["file_path"],
+        file_embed_data[int(page)].metadata["page_number"],
+    ]
+    for page in top_pages
+]
+
+# Print the results in a table
+print(tabulate(table, headers=["File Name", "Page Number"], tablefmt="grid"))
+
+images = [file_embed_data[int(page)].metadata["image"] for page in top_pages]

processors/src/markdown_processor.rs

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ impl MarkdownProcessor {
 impl DocumentProcessor for MarkdownProcessor {
 
     fn process_document(&self, content: &str) -> anyhow::Result<Document> {
-        let chunks = self.splitter.chunks(content).into_iter()
+        let chunks = self.splitter.chunks(content)
             .map(|x| x.to_string())
             .collect();
         Ok(Document {

python/python/embed_anything/_embed_anything.pyi

Lines changed: 23 additions & 5 deletions
@@ -239,7 +239,7 @@ def embed_html(
         file_name: The path to the HTML file to embed.
         embedder: The embedding model to use.
         origin: The origin of the HTML file.
-        config: The configuration for the embedding model. 
+        config: The configuration for the embedding model.
         adapter: The adapter to use for storing the embeddings.
 
     Returns:
@@ -259,7 +259,6 @@ def embed_html(
     ```
     """
 
-
 def embed_audio_file(
     file_path: str,
     audio_decoder: AudioDecoderModel,
@@ -542,19 +541,26 @@ class ImageEmbedConfig:
 
     Attributes:
         buffer_size: The buffer size for the Image Embedding model. Default is 100.
+        batch_size: The batch size for processing the embeddings. Default is 32. Based on the memory, you can increase or decrease the batch size.
     """
 
-    def __init__(self, buffer_size: int | None = None):
+    def __init__(self, buffer_size: int | None = None, batch_size: int | None = None):
         self.buffer_size = buffer_size
+        self.batch_size = batch_size
     buffer_size: int | None
+    batch_size: int | None
 
 class EmbeddingModel:
     """
     Represents an embedding model.
     """
 
     def from_pretrained_hf(
-        model: WhichModel, model_id: str, revision: str | None = None, token: str | None = None, dtype: Dtype | None = None
+        model: WhichModel,
+        model_id: str,
+        revision: str | None = None,
+        token: str | None = None,
+        dtype: Dtype | None = None,
     ) -> EmbeddingModel:
         """
         Loads an embedding model from the Hugging Face model hub.
@@ -586,9 +592,12 @@ class EmbeddingModel:
         Attributes:
             model (WhichModel): The cloud service to use. Currently supports WhichModel.OpenAI and WhichModel.Cohere.
             model_id (str): The ID of the model to use.
+
                 - For OpenAI, see available models at https://platform.openai.com/docs/guides/embeddings/embedding-models
                 - For Cohere, see available models at https://docs.cohere.com/docs/cohere-embed
+                - For CohereVision, see available models at https://docs.cohere.com/docs/cohere-embed
             api_key (str | None, optional): The API key for accessing the model. If not provided, it is taken from the environment variable:
+
                 - For OpenAI: OPENAI_API_KEY
                 - For Cohere: CO_API_KEY
 
@@ -680,6 +689,7 @@ class EmbeddingModel:
         Returns:
            A list of EmbedData objects.
         """
+
    def embed_files_batch(
        self,
        files: list[str],
@@ -697,6 +707,7 @@ class EmbeddingModel:
         Returns:
            A list of EmbedData objects.
         """
+
    def embed_audio_file(
        self,
        audio_file: str,
@@ -714,6 +725,7 @@ class EmbeddingModel:
         Returns:
            A list of EmbedData objects.
         """
+
    def embed_query(
        self,
        query: list[str],
@@ -747,6 +759,7 @@ class EmbeddingModel:
         Returns:
            A list of EmbedData objects.
         """
+
    def embed_directory(
        self,
        directory: str,
@@ -764,6 +777,7 @@ class EmbeddingModel:
         Returns:
            A list of EmbedData objects.
         """
+
    def embed_directory_stream(
        self,
        directory: str,
@@ -781,6 +795,7 @@ class EmbeddingModel:
         Returns:
            A list of EmbedData objects.
         """
+
    def embed_webpage(
        self,
        url: str,
@@ -798,6 +813,7 @@ class EmbeddingModel:
         Returns:
            A list of EmbedData objects.
         """
+
 class AudioDecoderModel:
     """
     Represents an audio decoder model.
@@ -835,13 +851,15 @@ class AudioDecoderModel:
 class WhichModel(Enum):
     OpenAI = ("OpenAI",)
     Cohere = ("Cohere",)
+    CohereVision = ("CohereVision",)
     Bert = ("Bert",)
     Jina = ("Jina",)
     Clip = ("Clip",)
     Colpali = ("Colpali",)
     ColBert = ("ColBert",)
     SparseBert = ("SparseBert",)
     ModernBert = ("ModernBert",)
+
 class ONNXModel(Enum):
     """
     Enum representing various ONNX models.
@@ -952,4 +970,4 @@ class ONNXModel(Enum):
 
     SPLADEPPENV2 = "SPLADEPPENV2"
 
-    ModernBERTBase = "ModernBERTBase"
+    ModernBERTBase = "ModernBERTBase"
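
For context, a minimal sketch of how the new batch_size option on ImageEmbedConfig could be used from Python. The `config` keyword on embed_image_directory is an assumption based on the `config` parameter visible in the python/src/lib.rs binding further down, not something shown in this stub file:

import embed_anything
from embed_anything import EmbeddingModel, ImageEmbedConfig, WhichModel

# Load a vision model, as in examples/clip.py above (SigLIP via the Clip loader).
model = EmbeddingModel.from_pretrained_hf(
    WhichModel.Clip, model_id="google/siglip-base-patch16-224"
)

# New knob from this commit: lower batch_size if memory is tight.
config = ImageEmbedConfig(buffer_size=100, batch_size=16)

# Assumes embed_image_directory accepts the config via a `config` keyword argument.
data = embed_anything.embed_image_directory("test_files", embedder=model, config=config)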

python/src/config.rs

Lines changed: 8 additions & 3 deletions
@@ -73,15 +73,20 @@ pub struct ImageEmbedConfig {
 #[pymethods]
 impl ImageEmbedConfig {
     #[new]
-    #[pyo3(signature = (buffer_size=None))]
-    pub fn new(buffer_size: Option<usize>) -> Self {
+    #[pyo3(signature = (buffer_size=None, batch_size=None))]
+    pub fn new(buffer_size: Option<usize>, batch_size: Option<usize>) -> Self {
         Self {
-            inner: embed_anything::config::ImageEmbedConfig::new(buffer_size),
+            inner: embed_anything::config::ImageEmbedConfig::new(buffer_size, batch_size),
         }
     }
 
     #[getter]
     pub fn buffer_size(&self) -> Option<usize> {
         self.inner.buffer_size
     }
+
+    #[getter]
+    pub fn batch_size(&self) -> Option<usize> {
+        self.inner.batch_size
+    }
 }

python/src/lib.rs

Lines changed: 13 additions & 3 deletions
@@ -82,6 +82,7 @@ impl EmbedData {
 pub enum WhichModel {
     OpenAI,
     Cohere,
+    CohereVision,
     Bert,
     SparseBert,
     ColBert,
@@ -275,6 +276,18 @@ impl EmbeddingModel {
                     inner: Arc::new(model),
                 })
             }
+            WhichModel::CohereVision => {
+                let model_id = model_id.unwrap_or("embed-v4.0");
+                let model = Embedder::Vision(VisionEmbedder::Cohere(
+                    embed_anything::embeddings::cloud::cohere::CohereEmbedder::new(
+                        model_id.to_string(),
+                        api_key,
+                    ),
+                ));
+                Ok(EmbeddingModel {
+                    inner: Arc::new(model),
+                })
+            }
             _ => panic!("Invalid model"),
         }
     }
@@ -668,7 +681,6 @@ pub fn embed_directory(
     let embedding_model = &embedder.inner;
 
     let rt = Builder::new_multi_thread().enable_all().build().unwrap();
-    println!("Runtime created");
     let adapter = match adapter {
         Some(adapter) => {
             let callback = move |data: Vec<embed_anything::embeddings::embed::EmbedData>| {
@@ -725,8 +737,6 @@ pub fn embed_image_directory(
     let embedding_model = &embedder.inner;
     let config = config.map(|c| &c.inner);
     let rt = Builder::new_multi_thread().enable_all().build().unwrap();
-    println!("Runtime created");
-
     let adapter = match adapter {
         Some(adapter) => {
             let callback = move |data: Vec<embed_anything::embeddings::embed::EmbedData>| {
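
The new CohereVision arm is reached from Python through from_pretrained_cloud, exactly as examples/cohere_pdf.py above does. A minimal sketch, assuming the CO_API_KEY environment variable is set (the same variable the existing Cohere model documents):

from embed_anything import EmbeddingModel, WhichModel

# "embed-v4.0" matches the default model_id in the new match arm.
model = EmbeddingModel.from_pretrained_cloud(WhichModel.CohereVision, model_id="embed-v4.0")

# Query embedding from the cloud vision model, as in examples/cohere_pdf.py.
query_embedding = model.embed_query(["Photo of a monkey"])[0].embedding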

rust/Cargo.toml

Lines changed: 3 additions & 0 deletions
@@ -69,6 +69,9 @@ statistical = "1.0.0"
 half = "2.4.1"
 candle-flash-attn = { workspace = true, optional = true }
 
+# Logging
+log = "0.4"
+
 [dev-dependencies]
 tempdir = "0.3.7"
 lazy_static = "1.4.0"

rust/examples/clip.rs

Lines changed: 11 additions & 10 deletions
@@ -11,7 +11,7 @@ async fn main() {
 
     let model = EmbedderBuilder::new()
         .model_architecture("clip")
-        .model_id(Some("openai/clip-vit-base-patch32"))
+        .model_id(Some("google/siglip-base-patch16-224"))
         .revision(None)
         .token(None)
         .from_pretrained_hf()
@@ -22,7 +22,8 @@ async fn main() {
         .unwrap()
         .unwrap();
 
-    let query_emb_data = embed_query(&["Photo of a monkey"], &model, None)
+
+    let query_emb_data = embed_query(&["Photo of a monkey?"], &model, None)
         .await
         .unwrap();
     let n_vectors = out.len();
@@ -68,18 +69,18 @@ async fn main() {
         .unwrap()
         .to_vec1::<f32>()
         .unwrap();
+
     let mut indices: Vec<usize> = (0..similarities.len()).collect();
     indices.sort_by(|a, b| similarities[*b].partial_cmp(&similarities[*a]).unwrap());
+
+    println!("Descending order of similarity: ");
+    for idx in &indices {
+        println!("{}", image_paths[*idx]);
+    }
 
-    let top_3_indices = indices[0..3].to_vec();
-    let top_3_image_paths = top_3_indices
-        .iter()
-        .map(|i| image_paths[*i].clone())
-        .collect::<Vec<String>>();
-
-    let similar_image = top_3_image_paths[0].clone();
+    println!("-----------");
 
-    println!("{:?}", similar_image);
+    println!("Most similar image: {}", image_paths[indices[0]]);
 
     let elapsed_time = now.elapsed();
     println!("Elapsed Time: {}", elapsed_time.as_secs_f32());
