Skip to content

Commit 8c114c4

Browse files
committed
weaviate vector search filter support
1 parent 20c157b commit 8c114c4

File tree

5 files changed

+246
-54
lines changed

5 files changed

+246
-54
lines changed

cookbook/agent_concepts/knowledge/filters/filtering_chroma_db.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@
88

99
# Download all sample CVs and get their paths
1010
downloaded_cv_paths = download_knowledge_filters_sample_data(
11-
num_files=5, file_extension=SampleDataFileExtension.PDF)
11+
num_files=5, file_extension=SampleDataFileExtension.PDF
12+
)
1213

1314
# Initialize ChromaDB
14-
vector_db = ChromaDb(collection="recipes",
15-
path="tmp/chromadb", persistent_client=True)
15+
vector_db = ChromaDb(collection="recipes", path="tmp/chromadb", persistent_client=True)
1616

1717
# Step 1: Initialize knowledge base with documents and metadata
1818
# ------------------------------------------------------------------------------
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
from os import getenv
2+
3+
from agno.agent import Agent
4+
from agno.knowledge.pdf import PDFKnowledgeBase
5+
from agno.utils.media import (
6+
SampleDataFileExtension,
7+
download_knowledge_filters_sample_data,
8+
)
9+
from agno.vectordb.search import SearchType
10+
from agno.vectordb.weaviate import Distance, VectorIndex, Weaviate
11+
12+
# Download all sample CVs and get their paths
13+
downloaded_cv_paths = download_knowledge_filters_sample_data(
14+
num_files=5, file_extension=SampleDataFileExtension.PDF)
15+
16+
# Step 1: Initialize knowledge base with documents and metadata
17+
# ------------------------------------------------------------------------------
18+
# When initializing the knowledge base, we can attach metadata that will be used for filtering
19+
# This metadata can include user IDs, document types, dates, or any other attributes
20+
21+
vector_db = Weaviate(
22+
collection="recipes",
23+
vector_index=VectorIndex.HNSW,
24+
distance=Distance.COSINE,
25+
local=False, # Set to False if using Weaviate Cloud and True if using local instance
26+
)
27+
28+
knowledge_base = PDFKnowledgeBase(
29+
path=[
30+
{
31+
"path": downloaded_cv_paths[0],
32+
"metadata": {
33+
"user_id": "jordan_mitchell",
34+
"document_type": "cv",
35+
"year": 2025,
36+
},
37+
},
38+
{
39+
"path": downloaded_cv_paths[1],
40+
"metadata": {
41+
"user_id": "taylor_brooks",
42+
"document_type": "cv",
43+
"year": 2025,
44+
},
45+
},
46+
{
47+
"path": downloaded_cv_paths[2],
48+
"metadata": {
49+
"user_id": "morgan_lee",
50+
"document_type": "cv",
51+
"year": 2025,
52+
},
53+
},
54+
{
55+
"path": downloaded_cv_paths[3],
56+
"metadata": {
57+
"user_id": "casey_jordan",
58+
"document_type": "cv",
59+
"year": 2025,
60+
},
61+
},
62+
{
63+
"path": downloaded_cv_paths[4],
64+
"metadata": {
65+
"user_id": "alex_rivera",
66+
"document_type": "cv",
67+
"year": 2025,
68+
},
69+
},
70+
],
71+
vector_db=vector_db,
72+
)
73+
74+
# Load all documents into the vector database
75+
knowledge_base.load(recreate=True)
76+
77+
# Step 2: Query the knowledge base with different filter combinations
78+
# ------------------------------------------------------------------------------
79+
80+
# Option 1: Filters on the Agent
81+
# Initialize the Agent with the knowledge base; filters are passed per query below
82+
agent = Agent(
83+
knowledge=knowledge_base,
84+
search_knowledge=True,
85+
)
86+
87+
agent.print_response(
88+
"Tell me about Jordan Mitchell's experience and skills",
89+
knowledge_filters={"user_id": "jordan_mitchell"},
90+
markdown=True,
91+
)

libs/agno/agno/test.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
from os import getenv
2+
3+
from agno.agent import Agent
4+
from agno.knowledge.pdf import PDFKnowledgeBase
5+
from agno.utils.media import (
6+
SampleDataFileExtension,
7+
download_knowledge_filters_sample_data,
8+
)
9+
from agno.vectordb.search import SearchType
10+
from agno.vectordb.weaviate import Distance, VectorIndex, Weaviate
11+
12+
# Download all sample CVs and get their paths
13+
downloaded_cv_paths = download_knowledge_filters_sample_data(num_files=5, file_extension=SampleDataFileExtension.PDF)
14+
15+
# Step 1: Initialize knowledge base with documents and metadata
16+
# ------------------------------------------------------------------------------
17+
# When initializing the knowledge base, we can attach metadata that will be used for filtering
18+
# This metadata can include user IDs, document types, dates, or any other attributes
19+
20+
vector_db = Weaviate(
21+
collection="recipes",
22+
vector_index=VectorIndex.HNSW,
23+
distance=Distance.COSINE,
24+
local=False, # Set to False if using Weaviate Cloud and True if using local instance
25+
)
26+
27+
knowledge_base = PDFKnowledgeBase(
28+
path=[
29+
{
30+
"path": downloaded_cv_paths[0],
31+
"metadata": {
32+
"user_id": "jordan_mitchell",
33+
"document_type": "cv",
34+
"year": 2025,
35+
},
36+
},
37+
{
38+
"path": downloaded_cv_paths[1],
39+
"metadata": {
40+
"user_id": "taylor_brooks",
41+
"document_type": "cv",
42+
"year": 2025,
43+
},
44+
},
45+
{
46+
"path": downloaded_cv_paths[2],
47+
"metadata": {
48+
"user_id": "morgan_lee",
49+
"document_type": "cv",
50+
"year": 2025,
51+
},
52+
},
53+
{
54+
"path": downloaded_cv_paths[3],
55+
"metadata": {
56+
"user_id": "casey_jordan",
57+
"document_type": "cv",
58+
"year": 2025,
59+
},
60+
},
61+
{
62+
"path": downloaded_cv_paths[4],
63+
"metadata": {
64+
"user_id": "alex_rivera",
65+
"document_type": "cv",
66+
"year": 2025,
67+
},
68+
},
69+
],
70+
vector_db=vector_db,
71+
)
72+
73+
# Load all documents into the vector database
74+
knowledge_base.load(recreate=True)
75+
76+
# Step 2: Query the knowledge base with different filter combinations
77+
# ------------------------------------------------------------------------------
78+
79+
# Option 1: Filters on the Agent
80+
# Initialize the Agent with the knowledge base; filters are passed per query below
81+
agent = Agent(
82+
knowledge=knowledge_base,
83+
search_knowledge=True,
84+
)
85+
86+
agent.print_response(
87+
"Tell me about Jordan Mitchell's experience and skills",
88+
knowledge_filters={"user_id": "hi"},
89+
markdown=True,
90+
)

libs/agno/agno/vectordb/chroma/chromadb.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -157,12 +157,12 @@ def insert(self, documents: List[Document], filters: Optional[Dict[str, Any]] =
157157
document.embed(embedder=self.embedder)
158158
cleaned_content = document.content.replace("\x00", "\ufffd")
159159
doc_id = md5(cleaned_content.encode()).hexdigest()
160-
160+
161161
# Handle metadata and filters
162162
metadata = document.meta_data or {}
163163
if filters:
164164
metadata.update(filters)
165-
165+
166166
docs_embeddings.append(document.embedding)
167167
docs.append(cleaned_content)
168168
ids.append(doc_id)
@@ -282,21 +282,21 @@ def search(self, query: str, limit: int = 5, filters: Optional[Dict[str, Any]] =
282282

283283
if self.reranker:
284284
search_results = self.reranker.rerank(query=query, documents=search_results)
285-
285+
286286
log_info(f"Found {len(search_results)} documents")
287287
return search_results
288288

289289
def _convert_filters(self, filters: Dict[str, Any]) -> Dict[str, Any]:
290290
"""Convert simple filters to ChromaDB's filter format.
291-
291+
292292
Handles conversion of simple key-value filters to ChromaDB's operator format
293293
when needed.
294294
"""
295295
if not filters:
296296
return {}
297297

298298
# If filters already use ChromaDB operators ($eq, $ne, etc.), return as is
299-
if any(key.startswith('$') for key in filters.keys()):
299+
if any(key.startswith("$") for key in filters.keys()):
300300
return filters
301301

302302
# Convert simple key-value pairs to ChromaDB's format

libs/agno/agno/vectordb/weaviate/weaviate.py

Lines changed: 57 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -258,8 +258,13 @@ def insert(self, documents: List[Document], filters: Optional[Dict[str, Any]] =
258258
content_hash = md5(cleaned_content.encode()).hexdigest()
259259
doc_uuid = uuid.UUID(hex=content_hash[:32])
260260

261+
# Merge filters with metadata
262+
meta_data = document.meta_data or {}
263+
if filters:
264+
meta_data.update(filters)
265+
261266
# Serialize meta_data to JSON string
262-
meta_data_str = json.dumps(document.meta_data) if document.meta_data else None
267+
meta_data_str = json.dumps(meta_data) if meta_data else None
263268

264269
collection.data.insert(
265270
properties={
@@ -270,7 +275,7 @@ def insert(self, documents: List[Document], filters: Optional[Dict[str, Any]] =
270275
vector=document.embedding,
271276
uuid=doc_uuid,
272277
)
273-
log_debug(f"Inserted document: {document.name} ({document.meta_data})")
278+
log_debug(f"Inserted document: {document.name} ({meta_data})")
274279

275280
async def async_insert(self, documents: List[Document], filters: Optional[Dict[str, Any]] = None) -> None:
276281
"""
@@ -390,7 +395,7 @@ def search(self, query: str, limit: int = 5, filters: Optional[Dict[str, Any]] =
390395
List[Document]: List of matching documents.
391396
"""
392397
if self.search_type == SearchType.vector:
393-
return self.vector_search(query, limit)
398+
return self.vector_search(query, limit, filters)
394399
elif self.search_type == SearchType.keyword:
395400
return self.keyword_search(query, limit)
396401
elif self.search_type == SearchType.hybrid:
@@ -423,74 +428,80 @@ async def async_search(
423428
logger.error(f"Invalid search type '{self.search_type}'.")
424429
return []
425430

426-
def vector_search(self, query: str, limit: int = 5) -> List[Document]:
431+
def vector_search(self, query: str, limit: int = 5, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
427432
"""
428433
Perform a vector search in Weaviate.
429434
430435
Args:
431436
query (str): The search query.
432437
limit (int): Maximum number of results to return.
438+
filters (Optional[Dict[str, Any]]): Filters to apply to the search.
433439
434440
Returns:
435441
List[Document]: List of matching documents.
436442
"""
437-
query_embedding = self.embedder.get_embedding(query)
438-
if query_embedding is None:
439-
logger.error(f"Error getting embedding for query: {query}")
440-
return []
441-
442-
collection = self.get_client().collections.get(self.collection)
443-
response = collection.query.near_vector(
444-
near_vector=query_embedding,
445-
limit=limit,
446-
return_properties=["name", "content", "meta_data"],
447-
include_vector=True,
448-
)
449-
450-
search_results: List[Document] = self.get_search_results(response)
451-
452-
if self.reranker:
453-
search_results = self.reranker.rerank(query=query, documents=search_results)
454-
455-
self.get_client().close()
456-
return search_results
443+
try:
444+
query_embedding = self.embedder.get_embedding(query)
445+
if query_embedding is None:
446+
logger.error(f"Error getting embedding for query: {query}")
447+
return []
457448

458-
async def async_vector_search(self, query: str, limit: int = 5) -> List[Document]:
459-
"""
460-
Perform a vector search in Weaviate asynchronously.
449+
collection = self.get_client().collections.get(self.collection)
461450

462-
Args:
463-
query (str): The search query.
464-
limit (int): Maximum number of results to return.
451+
# Build filter expression if filters are provided
452+
filter_expr = None
453+
if filters:
454+
try:
455+
# Create a filter for each key-value pair
456+
filter_conditions = []
457+
for key, value in filters.items():
458+
# Create a pattern to match in the JSON string
459+
if isinstance(value, (list, tuple)):
460+
# For list values
461+
pattern = f'"{key}": {json.dumps(value)}'
462+
else:
463+
# For single values
464+
pattern = f'"{key}": "{value}"'
465+
466+
# Add the filter condition using like operator
467+
filter_conditions.append(
468+
Filter.by_property("meta_data").like(f"*{pattern}*")
469+
)
470+
471+
# If we have multiple conditions, combine them
472+
if len(filter_conditions) > 1:
473+
# Use the first condition as base and chain the rest
474+
filter_expr = filter_conditions[0]
475+
for condition in filter_conditions[1:]:
476+
filter_expr = filter_expr & condition
477+
elif filter_conditions:
478+
filter_expr = filter_conditions[0]
465479

466-
Returns:
467-
List[Document]: List of matching documents.
468-
"""
469-
query_embedding = self.embedder.get_embedding(query)
470-
if query_embedding is None:
471-
logger.error(f"Error getting embedding for query: {query}")
472-
return []
480+
except Exception as e:
481+
logger.error(f"Error building filter expression: {e}")
482+
return []
473483

474-
search_results = []
475-
client = await self.get_async_client()
476-
try:
477-
collection = client.collections.get(self.collection)
478-
response = await collection.query.near_vector(
484+
response = collection.query.near_vector(
479485
near_vector=query_embedding,
480486
limit=limit,
481487
return_properties=["name", "content", "meta_data"],
482488
include_vector=True,
489+
filters=filter_expr,
483490
)
484491

485-
search_results = self.get_search_results(response)
492+
search_results: List[Document] = self.get_search_results(response)
486493

487494
if self.reranker:
488495
search_results = self.reranker.rerank(query=query, documents=search_results)
489496

490-
finally:
491-
await client.close()
497+
log_info(f"Found {len(search_results)} documents")
498+
499+
self.get_client().close()
500+
return search_results
492501

493-
return search_results
502+
except Exception as e:
503+
logger.error(f"Error searching for documents: {e}")
504+
return []
494505

495506
def keyword_search(self, query: str, limit: int = 5) -> List[Document]:
496507
"""

0 commit comments

Comments
 (0)