How can I create a CustomRetriever with more parameters? #26142
Replies: 1 comment 8 replies
-
To create a custom retriever with additional parameters, here is an example of how to properly initialize it with the required parameters:
import time
import os
from dotenv import load_dotenv
from langchain.vectorstores import VectorStore
from langchain.schema import Document
from langchain.schema import BaseRetriever
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from utils.telemetry import log_custom_metric
from config.config import NUMBER_OF_CHUNKS_TO_RETURN
from typing import List
class MyUserRetrieverPolicyLCEL(BaseRetriever):
    """Retriever that filters vector-store hits by user and logs search time.

    Non-admin users only receive documents whose ``usernames_result`` field
    matches the local part of their username; users listed in the ``ADMINS``
    environment variable are not filtered. The duration of every search is
    reported through ``log_custom_metric``.
    """

    vectorstore: VectorStore
    username: str
    instrumentation_key: str  # Passed to log_custom_metric
    session_id: str  # Passed to log_custom_metric

    def __init__(self, vectorstore: VectorStore, username: str, instrumentation_key: str, session_id: str):
        """Build the retriever.

        Args:
            vectorstore: Store that is queried with ``similarity_search``.
            username: User identifier; the part before ``@`` is used for
                the admin check and the access filter.
            instrumentation_key: Key forwarded to ``log_custom_metric``.
            session_id: Session identifier forwarded to ``log_custom_metric``.
        """
        # BaseRetriever is a pydantic model: declared fields are validated
        # inside the base __init__, so they must be handed over as keyword
        # arguments. Calling super().__init__() with no arguments and
        # assigning the attributes afterwards raises "field required" for
        # every field.
        super().__init__(
            vectorstore=vectorstore,
            username=username,
            instrumentation_key=instrumentation_key,
            session_id=session_id,
        )

    def clean_metadata(self, doc):
        """Reduce a document's metadata to the fields exposed downstream.

        Parameters:
            doc (object): Document whose ``metadata`` mapping contains
                ``title``, ``chunk_id``, ``document_title_result``,
                ``document_owning_department_result`` and ``manual_result``.

        Returns:
            dict: The cleaned metadata.

        Raises:
            KeyError: If one of the expected metadata keys is missing.
            ValueError: If the last ``_``-separated piece of ``chunk_id``
                is not an integer.
        """
        metadata = doc.metadata
        # The page number is the trailing integer of chunk_id plus one;
        # compute it once and reuse it for both fields.
        page_number = str(int(metadata["chunk_id"].split("_")[-1]) + 1)
        return {
            "file_id": metadata["title"],
            "source": metadata["title"] + "_page=" + page_number,
            "page_number": page_number,
            "document_title": metadata["document_title_result"],
            "document_owning_department": metadata["document_owning_department_result"],
            "manual": metadata["manual_result"],
        }

    def _get_relevant_documents(self, query: str, *, run_manager: CallbackManagerForRetrieverRun) -> List[Document]:
        """Retrieve documents for ``query`` and log how long the search took.

        Args:
            query (str): The query to search for relevant documents.
            run_manager: Callback manager supplied by the caller; it is not
                used by this implementation.

        Returns:
            list: Documents with their original page content and cleaned
            metadata.
        """
        docs = []
        is_match_filter = ""
        load_dotenv()

        # Ignore surrounding whitespace and empty entries so that
        # "alice, bob" matches "bob" and an unset ADMINS matches nobody.
        admins_list = [
            name.strip()
            for name in os.getenv('ADMINS', '').split(',')
            if name.strip()
        ]
        user_local_part = self.username.split('@')[0]
        is_admin = user_local_part in admins_list
        if not is_admin:
            # OData string literals escape a single quote by doubling it;
            # without this a quote in the username breaks out of the literal.
            # NOTE(review): other query-syntax special characters in the
            # username are not handled here - confirm whether they can occur.
            escaped_user = user_local_part.replace("'", "''")
            is_match_filter = f"search.ismatch('{escaped_user}', 'usernames_result')"

        # Time the retrieval with a monotonic clock so the measured duration
        # is not affected by system clock adjustments.
        retrieval_start_time = time.perf_counter()
        try:
            for doc in self.vectorstore.similarity_search(query, search_type="hybrid", k=NUMBER_OF_CHUNKS_TO_RETURN, filters=is_match_filter):
                cleaned_metadata = self.clean_metadata(doc)
                docs.append(Document(
                    page_content=doc.page_content,
                    metadata=cleaned_metadata))
        finally:
            # Runs on success and on failure, so the metric is always logged.
            retrieval_end_time = time.perf_counter()
            log_custom_metric(self.instrumentation_key, 'retriever_execution_time', retrieval_end_time - retrieval_start_time, self.session_id)
        return docs
# Example initialization
# The four values below are placeholders that are passed by keyword to the
# retriever constructor.
# NOTE(review): VectorStore appears to be an abstract base class in LangChain,
# so `VectorStore()` is not expected to run as written - substitute a concrete
# vector store instance.
vectorstore = VectorStore() # Replace with actual VectorStore instance
username = "user@example.com"
instrumentation_key = "your_instrumentation_key"
session_id = "your_session_id"
retriever = MyUserRetrieverPolicyLCEL(
vectorstore=vectorstore,
username=username,
instrumentation_key=instrumentation_key,
session_id=session_id
) Make sure to replace the placeholder instances ( |
Beta Was this translation helpful? Give feedback.
Uh oh!
There was an error while loading. Please reload this page.
-
Checked other resources
Commit to Help
Example Code
Description
I am trying to create a new custom retriever, but inside `_get_relevant_documents` I want to measure the search time. That's why I need to pass the session_id and the instrumentation_key; however, it fails on the line `super().__init__()`
with this error:
vectorstore
field required (type=value_error.missing)
username
field required (type=value_error.missing)
instrumentation_key
field required (type=value_error.missing)
session_id
field required (type=value_error.missing)
System Info
Package Information
Beta Was this translation helpful? Give feedback.
All reactions