weaviate vector search filter support

kausmeows · kausmeows · commit 8c114c4dfef8 · 2025-05-15T02:52:00.000+05:30
diff --git a/cookbook/agent_concepts/knowledge/filters/filtering_chroma_db.py b/cookbook/agent_concepts/knowledge/filters/filtering_chroma_db.py
@@ -8,11 +8,11 @@
 
 # Download all sample CVs and get their paths
 downloaded_cv_paths = download_knowledge_filters_sample_data(
-    num_files=5, file_extension=SampleDataFileExtension.PDF)
+    num_files=5, file_extension=SampleDataFileExtension.PDF
+)
 
 # Initialize ChromaDB
-vector_db = ChromaDb(collection="recipes",
-                     path="tmp/chromadb", persistent_client=True)
+vector_db = ChromaDb(collection="recipes", path="tmp/chromadb", persistent_client=True)
 
 # Step 1: Initialize knowledge base with documents and metadata
 # ------------------------------------------------------------------------------
diff --git a/cookbook/agent_concepts/knowledge/filters/filtering_weaviate.py b/cookbook/agent_concepts/knowledge/filters/filtering_weaviate.py
@@ -0,0 +1,91 @@
+from os import getenv
+
+from agno.agent import Agent
+from agno.knowledge.pdf import PDFKnowledgeBase
+from agno.utils.media import (
+    SampleDataFileExtension,
+    download_knowledge_filters_sample_data,
+)
+from agno.vectordb.search import SearchType
+from agno.vectordb.weaviate import Distance, VectorIndex, Weaviate
+
+# Download all sample CVs and get their paths
+downloaded_cv_paths = download_knowledge_filters_sample_data(
+    num_files=5, file_extension=SampleDataFileExtension.PDF)
+
+# Step 1: Initialize knowledge base with documents and metadata
+# ------------------------------------------------------------------------------
+# When initializing the knowledge base, we can attach metadata that will be used for filtering
+# This metadata can include user IDs, document types, dates, or any other attributes
+
+vector_db = Weaviate(
+    collection="recipes",
+    vector_index=VectorIndex.HNSW,
+    distance=Distance.COSINE,
+    local=False,  # Set to False if using Weaviate Cloud and True if using local instance
+)
+
+knowledge_base = PDFKnowledgeBase(
+    path=[
+        {
+            "path": downloaded_cv_paths[0],
+            "metadata": {
+                "user_id": "jordan_mitchell",
+                "document_type": "cv",
+                "year": 2025,
+            },
+        },
+        {
+            "path": downloaded_cv_paths[1],
+            "metadata": {
+                "user_id": "taylor_brooks",
+                "document_type": "cv",
+                "year": 2025,
+            },
+        },
+        {
+            "path": downloaded_cv_paths[2],
+            "metadata": {
+                "user_id": "morgan_lee",
+                "document_type": "cv",
+                "year": 2025,
+            },
+        },
+        {
+            "path": downloaded_cv_paths[3],
+            "metadata": {
+                "user_id": "casey_jordan",
+                "document_type": "cv",
+                "year": 2025,
+            },
+        },
+        {
+            "path": downloaded_cv_paths[4],
+            "metadata": {
+                "user_id": "alex_rivera",
+                "document_type": "cv",
+                "year": 2025,
+            },
+        },
+    ],
+    vector_db=vector_db,
+)
+
+# Load all documents into the vector database
+knowledge_base.load(recreate=True)
+
+# Step 2: Query the knowledge base with different filter combinations
+# ------------------------------------------------------------------------------
+
+# Filters are supplied per request via `knowledge_filters` on print_response()
+# Initialize the Agent with the knowledge base (no filters are set on the Agent itself)
+agent = Agent(
+    knowledge=knowledge_base,
+    search_knowledge=True,
+)
+
+agent.print_response(
+    "Tell me about Jordan Mitchell's experience and skills",
+    knowledge_filters={"user_id": "jordan_mitchell"},
+    markdown=True,
+)
diff --git a/libs/agno/agno/test.py b/libs/agno/agno/test.py
@@ -0,0 +1,90 @@
+from os import getenv
+
+from agno.agent import Agent
+from agno.knowledge.pdf import PDFKnowledgeBase
+from agno.utils.media import (
+    SampleDataFileExtension,
+    download_knowledge_filters_sample_data,
+)
+from agno.vectordb.search import SearchType
+from agno.vectordb.weaviate import Distance, VectorIndex, Weaviate
+
+# Download all sample CVs and get their paths
+downloaded_cv_paths = download_knowledge_filters_sample_data(num_files=5, file_extension=SampleDataFileExtension.PDF)
+
+# Step 1: Initialize knowledge base with documents and metadata
+# ------------------------------------------------------------------------------
+# When initializing the knowledge base, we can attach metadata that will be used for filtering
+# This metadata can include user IDs, document types, dates, or any other attributes
+
+vector_db = Weaviate(
+    collection="recipes",
+    vector_index=VectorIndex.HNSW,
+    distance=Distance.COSINE,
+    local=False,  # Set to False if using Weaviate Cloud and True if using local instance
+)
+
+knowledge_base = PDFKnowledgeBase(
+    path=[
+        {
+            "path": downloaded_cv_paths[0],
+            "metadata": {
+                "user_id": "jordan_mitchell",
+                "document_type": "cv",
+                "year": 2025,
+            },
+        },
+        {
+            "path": downloaded_cv_paths[1],
+            "metadata": {
+                "user_id": "taylor_brooks",
+                "document_type": "cv",
+                "year": 2025,
+            },
+        },
+        {
+            "path": downloaded_cv_paths[2],
+            "metadata": {
+                "user_id": "morgan_lee",
+                "document_type": "cv",
+                "year": 2025,
+            },
+        },
+        {
+            "path": downloaded_cv_paths[3],
+            "metadata": {
+                "user_id": "casey_jordan",
+                "document_type": "cv",
+                "year": 2025,
+            },
+        },
+        {
+            "path": downloaded_cv_paths[4],
+            "metadata": {
+                "user_id": "alex_rivera",
+                "document_type": "cv",
+                "year": 2025,
+            },
+        },
+    ],
+    vector_db=vector_db,
+)
+
+# Load all documents into the vector database
+knowledge_base.load(recreate=True)
+
+# Step 2: Query the knowledge base with different filter combinations
+# ------------------------------------------------------------------------------
+
+# Filters are supplied per request via `knowledge_filters` on print_response()
+# Initialize the Agent without filters; note the "hi" user_id used below is not one assigned above
+agent = Agent(
+    knowledge=knowledge_base,
+    search_knowledge=True,
+)
+
+agent.print_response(
+    "Tell me about Jordan Mitchell's experience and skills",
+    knowledge_filters={"user_id": "hi"},
+    markdown=True,
+)
diff --git a/libs/agno/agno/vectordb/chroma/chromadb.py b/libs/agno/agno/vectordb/chroma/chromadb.py
@@ -157,12 +157,12 @@ def insert(self, documents: List[Document], filters: Optional[Dict[str, Any]] =
             document.embed(embedder=self.embedder)
             cleaned_content = document.content.replace("\x00", "\ufffd")
             doc_id = md5(cleaned_content.encode()).hexdigest()
-            
+
             # Handle metadata and filters
             metadata = document.meta_data or {}
             if filters:
                 metadata.update(filters)
-            
+
             docs_embeddings.append(document.embedding)
             docs.append(cleaned_content)
             ids.append(doc_id)
@@ -282,21 +282,21 @@ def search(self, query: str, limit: int = 5, filters: Optional[Dict[str, Any]] =
 
         if self.reranker:
             search_results = self.reranker.rerank(query=query, documents=search_results)
-            
+
         log_info(f"Found {len(search_results)} documents")
         return search_results
 
     def _convert_filters(self, filters: Dict[str, Any]) -> Dict[str, Any]:
         """Convert simple filters to ChromaDB's filter format.
-        
+
         Handles conversion of simple key-value filters to ChromaDB's operator format
         when needed.
         """
         if not filters:
             return {}
 
         # If filters already use ChromaDB operators ($eq, $ne, etc.), return as is
-        if any(key.startswith('$') for key in filters.keys()):
+        if any(key.startswith("$") for key in filters.keys()):
             return filters
 
         # Convert simple key-value pairs to ChromaDB's format
diff --git a/libs/agno/agno/vectordb/weaviate/weaviate.py b/libs/agno/agno/vectordb/weaviate/weaviate.py
@@ -258,8 +258,13 @@ def insert(self, documents: List[Document], filters: Optional[Dict[str, Any]] =
             content_hash = md5(cleaned_content.encode()).hexdigest()
             doc_uuid = uuid.UUID(hex=content_hash[:32])
 
+            # Merge filters with metadata
+            meta_data = document.meta_data or {}
+            if filters:
+                meta_data.update(filters)
+
             # Serialize meta_data to JSON string
-            meta_data_str = json.dumps(document.meta_data) if document.meta_data else None
+            meta_data_str = json.dumps(meta_data) if meta_data else None
 
             collection.data.insert(
                 properties={
@@ -270,7 +275,7 @@ def insert(self, documents: List[Document], filters: Optional[Dict[str, Any]] =
                 vector=document.embedding,
                 uuid=doc_uuid,
             )
-            log_debug(f"Inserted document: {document.name} ({document.meta_data})")
+            log_debug(f"Inserted document: {document.name} ({meta_data})")
 
     async def async_insert(self, documents: List[Document], filters: Optional[Dict[str, Any]] = None) -> None:
         """
@@ -390,7 +395,7 @@ def search(self, query: str, limit: int = 5, filters: Optional[Dict[str, Any]] =
             List[Document]: List of matching documents.
         """
         if self.search_type == SearchType.vector:
-            return self.vector_search(query, limit)
+            return self.vector_search(query, limit, filters)
         elif self.search_type == SearchType.keyword:
             return self.keyword_search(query, limit)
         elif self.search_type == SearchType.hybrid:
@@ -423,74 +428,81 @@ async def async_search(
             logger.error(f"Invalid search type '{self.search_type}'.")
             return []
 
-    def vector_search(self, query: str, limit: int = 5) -> List[Document]:
+    def vector_search(self, query: str, limit: int = 5, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
         """
         Perform a vector search in Weaviate.
 
         Args:
             query (str): The search query.
             limit (int): Maximum number of results to return.
+            filters (Optional[Dict[str, Any]]): Key/value pairs that must all be present in a
+                document's stored meta_data. Matching is a wildcard `like` on the JSON-serialized
+                meta_data string, so values are compared in their JSON form.
 
         Returns:
             List[Document]: List of matching documents.
+                An empty list is returned when the query cannot be embedded or on any error.
         """
-        query_embedding = self.embedder.get_embedding(query)
-        if query_embedding is None:
-            logger.error(f"Error getting embedding for query: {query}")
-            return []
-
-        collection = self.get_client().collections.get(self.collection)
-        response = collection.query.near_vector(
-            near_vector=query_embedding,
-            limit=limit,
-            return_properties=["name", "content", "meta_data"],
-            include_vector=True,
-        )
-
-        search_results: List[Document] = self.get_search_results(response)
-
-        if self.reranker:
-            search_results = self.reranker.rerank(query=query, documents=search_results)
-
-        self.get_client().close()
-        return search_results
+        # Keep the client handle so the connection that was opened is the one released in `finally`
+        client = None
+        try:
+            query_embedding = self.embedder.get_embedding(query)
+            if query_embedding is None:
+                logger.error(f"Error getting embedding for query: {query}")
+                return []
 
-    async def async_vector_search(self, query: str, limit: int = 5) -> List[Document]:
-        """
-        Perform a vector search in Weaviate asynchronously.
+            client = self.get_client()
+            collection = client.collections.get(self.collection)
 
-        Args:
-            query (str): The search query.
-            limit (int): Maximum number of results to return.
+            # Build filter expression if filters are provided
+            filter_expr = None
+            if filters:
+                try:
+                    # meta_data is stored as a JSON string, so each filter is matched as a
+                    # `"key": <json value>` fragment. json.dumps renders the value the same way
+                    # insert() serialized it (quoted strings, bare numbers/booleans, escapes).
+                    # NOTE(review): `*` / `?` inside a value act as `like` wildcards, and a
+                    # numeric value also matches longer numbers sharing its prefix.
+                    filter_conditions = [
+                        Filter.by_property("meta_data").like(f'*"{key}": {json.dumps(value)}*')
+                        for key, value in filters.items()
+                    ]
+
+                    # AND all conditions together; `filters` is non-empty here, so [0] exists
+                    filter_expr = filter_conditions[0]
+                    for condition in filter_conditions[1:]:
+                        filter_expr = filter_expr & condition
 
-        Returns:
-            List[Document]: List of matching documents.
-        """
-        query_embedding = self.embedder.get_embedding(query)
-        if query_embedding is None:
-            logger.error(f"Error getting embedding for query: {query}")
-            return []
+                except Exception as e:
+                    logger.error(f"Error building filter expression: {e}")
+                    return []
 
-        search_results = []
-        client = await self.get_async_client()
-        try:
-            collection = client.collections.get(self.collection)
-            response = await collection.query.near_vector(
+            response = collection.query.near_vector(
                 near_vector=query_embedding,
                 limit=limit,
                 return_properties=["name", "content", "meta_data"],
                 include_vector=True,
+                filters=filter_expr,
             )
 
-            search_results = self.get_search_results(response)
+            search_results: List[Document] = self.get_search_results(response)
 
             if self.reranker:
                 search_results = self.reranker.rerank(query=query, documents=search_results)
 
-        finally:
-            await client.close()
+            log_info(f"Found {len(search_results)} documents")
+            return search_results
 
-        return search_results
+        except Exception as e:
+            logger.error(f"Error searching for documents: {e}")
+            return []
+        finally:
+            # Release the connection on every path (success, early return, or error)
+            if client is not None:
+                try:
+                    client.close()
+                except Exception as e:
+                    logger.error(f"Error closing Weaviate client: {e}")
 
     def keyword_search(self, query: str, limit: int = 5) -> List[Document]:
         """