Skip to content

Commit da3096b

Browse files
authored
Merge pull request #31 from SolaceLabs/ap/DATAGO-102867/add_hybrid_search
Ap/datago 102867/add hybrid search
2 parents 74bb986 + 66e2826 commit da3096b

File tree

17 files changed

+3710
-1582
lines changed

17 files changed

+3710
-1582
lines changed

sam-rag/configs/agents/rag.yaml

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,11 @@ flows:
4343
llm_service_topic: ${SOLACE_AGENT_MESH_NAMESPACE}solace-agent-mesh/v1/llm-service/request/general-good/
4444
embedding_service_topic: ${SOLACE_AGENT_MESH_NAMESPACE}solace-agent-mesh/v1/embedding-service/request/text/
4545
agent_name: rag
46+
47+
# Hybrid Search Configuration
48+
hybrid_search:
49+
enabled: ${HYBRID_SEARCH_ENABLED} # Global toggle for hybrid search
50+
4651
# Scanner configuration
4752
scanner:
4853
batch: true
@@ -276,6 +281,10 @@ flows:
276281
batch_size: 32
277282
additional_kwargs: {}
278283
normalize_embeddings: True
284+
hybrid_search: # Configuration specific to hybrid search embedding
285+
sparse_model_config: # Configuration for sparse vector model (e.g., BM25, SPLADE)
286+
type: "tfidf" # Sparse model type; "tfidf" matches the current implementation
287+
params: {} # Model-specific parameters
279288

280289
# Vector database configuration
281290
vector_db:
@@ -286,6 +295,10 @@ flows:
286295
api_key: ${QDRANT_API_KEY}
287296
collection_name: ${QDRANT_COLLECTION}
288297
embedding_dimension: ${QDRANT_EMBEDDING_DIMENSION}
298+
hybrid_search_params: # Qdrant specific params, active if global hybrid_search.enabled is true
299+
sparse_vector_name: "sparse_db" # Example name for the sparse vector in Qdrant
300+
# fusion_method: "rrf" # Example: set this if you use the Query API directly and need to specify a fusion method
301+
# Other Qdrant specific hybrid query params can go here
289302

290303
# Chroma DB configuration
291304
# db_type: "chroma"
@@ -307,7 +320,21 @@ flows:
307320
# metric: ${PINECONE_METRIC}
308321
# cloud: ${PINECONE_CLOUD}
309322
# region: ${PINECONE_REGION}
310-
323+
# hybrid_search_params: # Pinecone specific params, active if global hybrid_search.enabled is true
324+
# alpha: 0.5 # Example: 0.0 for pure sparse, 1.0 for pure dense. Default often 0.5
325+
326+
# Redis configuration (placeholder)
327+
# db_type: "redis"
328+
# db_params:
329+
# url: ${REDIS_URL} # e.g., redis://localhost:6379
330+
# index_name: ${REDIS_INDEX_NAME} # e.g., "rag_idx"
331+
# embedding_dimension: ${REDIS_EMBEDDING_DIMENSION}
332+
# text_field_name: "content" # Name of the text field in Redis for FT search
333+
# vector_field_name: "embedding" # Name of the vector field in Redis
334+
# hybrid_search_params: # Redis specific params, active if global hybrid_search.enabled is true
335+
# text_score_weight: 0.3 # Example weight for full-text search score
336+
# vector_score_weight: 0.7 # Example weight for vector similarity score
337+
# # Other params like HYBRID_POLICY (e.g., "WEIGHTED") if applicable via client
311338

312339
# # PostgreSQL with pgvector
313340
# db_type: "pgvector"

sam-rag/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ dependencies = [
2828
"watchdog==6.0.0", # Fixed syntax for watchdog dependency
2929
"ujson==5.10.0", # For faster JSON processing
3030
"odfpy==1.4.1",
31+
"scikit-learn>=1.0", # For TF-IDF and other machine learning utilities
3132
]
3233

3334
[project.optional-dependencies]

sam-rag/src/agents/rag/services/database/vector_db_base.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,21 @@ class VectorDBBase(ABC):
1111
Abstract base class for vector databases.
1212
"""
1313

14-
def __init__(self, config: Dict[str, Any] = None):
14+
def __init__(
15+
self,
16+
config: Dict[str, Any] = None,
17+
hybrid_search_config: Optional[Dict[str, Any]] = None,
18+
):
1519
"""
1620
Initialize the vector database with the given configuration.
1721
1822
Args:
19-
config: A dictionary containing configuration parameters.
23+
config: A dictionary containing configuration parameters for the specific database.
24+
hybrid_search_config: Optional dictionary containing hybrid search configuration.
2025
"""
2126
self.config = config or {}
27+
self.hybrid_search_config = hybrid_search_config or {}
28+
self.hybrid_search_enabled = self.hybrid_search_config.get("enabled", False)
2229

2330
@abstractmethod
2431
def add_documents(
@@ -27,15 +34,17 @@ def add_documents(
2734
embeddings: List[List[float]],
2835
metadatas: Optional[List[Dict[str, Any]]] = None,
2936
ids: Optional[List[str]] = None,
37+
sparse_vectors: Optional[List[Optional[Dict[int, float]]]] = None,
3038
) -> List[str]:
3139
"""
3240
Add documents to the vector database.
3341
3442
Args:
3543
documents: The documents to add.
36-
embeddings: The embeddings of the documents.
44+
embeddings: The dense embeddings of the documents.
3745
metadatas: Optional metadata for each document.
3846
ids: Optional IDs for each document.
47+
sparse_vectors: Optional sparse vector representations for each document.
3948
4049
Returns:
4150
The IDs of the added documents.
@@ -48,14 +57,18 @@ def search(
4857
query_embedding: List[float],
4958
top_k: int = 5,
5059
filter: Optional[Dict[str, Any]] = None,
60+
query_sparse_vector: Optional[Dict[int, float]] = None,
61+
request_hybrid: bool = False,
5162
) -> List[Dict[str, Any]]:
5263
"""
5364
Search for documents similar to the query embedding.
5465
5566
Args:
56-
query_embedding: The query embedding.
67+
query_embedding: The dense query embedding.
5768
top_k: The number of results to return.
5869
filter: Optional filter to apply to the search.
70+
query_sparse_vector: Optional sparse vector for the query.
71+
request_hybrid: Flag to request hybrid search if available and enabled.
5972
6073
Returns:
6174
A list of dictionaries containing the search results.
@@ -92,6 +105,7 @@ def update(
92105
documents: Optional[List[str]] = None,
93106
embeddings: Optional[List[List[float]]] = None,
94107
metadatas: Optional[List[Dict[str, Any]]] = None,
108+
sparse_vectors: Optional[List[Optional[Dict[int, float]]]] = None,
95109
) -> None:
96110
"""
97111
Update documents in the vector database.
@@ -101,6 +115,7 @@ def update(
101115
documents: Optional new document contents.
102116
embeddings: Optional new embeddings.
103117
metadatas: Optional new metadata.
118+
sparse_vectors: Optional sparse vector representations for each document.
104119
"""
105120
pass
106121

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
"""
2+
Vector DB Implementations Package.
3+
4+
This package contains specific implementations for various vector databases.
5+
"""
6+
7+
from .pinecone_db import PineconeDB
8+
from .qdrant_db import QdrantDB
9+
from .redis_legacy_db import RedisDB as RedisLegacyDB # Alias to avoid name clash
10+
from .pgvector_db import PgVectorDB
11+
from .chroma_db import ChromaDB
12+
from .redis_vl_db import RedisDB as RedisVLDB # Alias for the redisvl version
13+
14+
# __all__ lists the public names that are imported
15+
# when a client does 'from <this package> import *'.
16+
__all__ = [
17+
"PineconeDB",
18+
"QdrantDB",
19+
"RedisLegacyDB", # Use the alias
20+
"PgVectorDB",
21+
"ChromaDB",
22+
"RedisVLDB", # Use the alias
23+
]
24+
25+
# Optional: A dictionary mapping names to classes for easier dynamic loading
26+
IMPLEMENTATIONS = {
27+
"pinecone": PineconeDB,
28+
"qdrant": QdrantDB,
29+
"redis_legacy": RedisLegacyDB,
30+
"pgvector": PgVectorDB,
31+
"chroma": ChromaDB,
32+
"redis_vl": RedisVLDB,
33+
}

0 commit comments

Comments
 (0)