No chunks/doc are retrieved for some documents. #29847
-
hey @mrutunjay-kinagi, I'm encountering the same issue. No chunks or documents are being retrieved for some inputs. I've tried verifying the document loaders, chunk sizes, and retrieval logic without success. Any insights or updates would be appreciated!
-
I found the root cause of the issue. VectorSearchDB's default search config params used in the LangChain integration code don't work for scale scenarios. Those parameters run the KNN search at file scope and then filter the results down to the specific document. A better way would be to set the search scope to the document first and then do the KNN search within that scope. I tried this:

import boto3
import json
from langchain.chains import RetrievalQA, create_retrieval_chain
from langchain.callbacks import StdOutCallbackHandler
from langchain.embeddings import BedrockEmbeddings
from langchain.vectorstores import OpenSearchVectorSearch
from opensearchpy import Urllib3AWSV4SignerAuth
from langchain_community.chat_models import BedrockChat
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import PromptTemplate
from langchain import __version__
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
print(f"langchain version: {__version__}")

def search_documents(account_id, prompt, index_id, num_of_chunks, source=False):
    response = None
    try:
        qa_chain = get_retrieval_chain(
            account_id, prompt, index_id, num_of_chunks, source
        )
        response = qa_chain.invoke({"input": prompt})
        print("Complete Response: ", response)
        # Note: create_retrieval_chain returns "answer" and "context",
        # not the RetrievalQA-style "result" / "source_documents" keys.
        print("result :", response["answer"])
        print("source documents:", response["context"])
        final_response = build_final_search_response(response)
        return final_response
    except Exception as e:
        print("DEBUG: in search_documents, Error:", e)
        raise e

def get_embeddings():
    return BedrockEmbeddings(region_name="us-east-1")

def get_opensearch_vectorstore(account_id, index_id, source=False):
    print("Inside get_opensearch_vectorstore")
    opensearch_domain_endpoint = get_opensearch_endpoint_from_parameter_store(
        account_id
    )
    # NOTE: condition assumed; target a specific index when index_id is given
    if index_id:
        search_index = "index_name" + index_id
    else:
        search_index = "index_name*"
    # Setting AWS credentials ...
    # (`auth` and `metadata_query` are defined in this elided setup.)
    vectorstore = OpenSearchVectorSearch(
        index_name=search_index,
        embedding_function=get_embeddings(),
        opensearch_url=opensearch_domain_endpoint,
        http_auth=auth,
        metadata_query=metadata_query,
    )
    print("DEBUG: vectorstore created")
    return vectorstore

def get_retrieval_chain(account_id, prompt, index_id, num_of_chunks, source):
    llm = BedrockChat(
        model_id="anthropic.claude-3-haiku-20240307-v1:0", region_name="us-east-1"
    )
    handler = StdOutCallbackHandler()
    vectorstore = get_opensearch_vectorstore(account_id, index_id, source)
    # Embed the query
    query_embedding = get_embeddings().embed_query(prompt)
    try:
        # search_kwargs with script_score and cosine similarity
        search_kwargs = {
            "size": num_of_chunks,
            "query": {
                "bool": {
                    "should": [
                        {
                            "script_score": {
                                "query": {"match_all": {}},
                                "script": {
                                    "source": "cosineSimilarity(params.query_vector, doc['vector_field']) + 1.0",
                                    "params": {"query_vector": query_embedding},
                                },
                            }
                        }
                    ]
                }
            },
        }
        # Apply metadata filter ONLY IF source is provided
        if source:
            search_kwargs["query"]["bool"]["filter"] = [
                {"term": {"metadata.source.keyword": source}}
            ]
        print(f"search_kwargs: {search_kwargs}")
        print("Number of documents:", num_of_chunks)
        # Create retriever and set search_kwargs properly
        retriever = vectorstore.as_retriever()
        retriever.search_kwargs = search_kwargs
        # Define a structured prompt
        # (create_retrieval_chain passes the user question as "input")
        prompt_template = PromptTemplate(
            input_variables=["context", "input"],
            template="Use the following retrieved documents to answer the question:\n\n{context}\n\nQuestion: {input}\n\nAnswer:",
        )
        # Create document combination chain
        combine_docs_chain = create_stuff_documents_chain(
            llm=llm, prompt=prompt_template
        )
        # Create retrieval chain that integrates document retrieval + answering
        retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)
        return retrieval_chain
    except Exception as e:
        logger.error("An error occurred: %s", str(e))
        raise

But now I'm facing this error:

Error: OpenSearchVectorSearch.similarity_search() got multiple values for argument 'query'
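For what it's worth, the retriever forwards search_kwargs verbatim into OpenSearchVectorSearch.similarity_search(query, ...), so a "query" key in that dict would collide with the positional query argument, which matches the error above. A possible workaround, assuming the installed langchain_community version accepts the k, search_type, space_type and pre_filter kwargs for script scoring (check the similarity_search docstring for your version), is to let the vector store build the query DSL itself and only supply a pre-filter. That also gives the "restrict to the document first, then run KNN" behaviour described above. The helper name get_scoped_retriever is only for illustration:

```python
def get_scoped_retriever(vectorstore, num_of_chunks, source=None):
    # Let OpenSearchVectorSearch build the script-score query itself instead of
    # passing a raw DSL body (which collides with the positional `query` argument).
    search_kwargs = {
        "k": num_of_chunks,
        "search_type": "script_scoring",  # apply the filter first, then score
        "space_type": "cosinesimil",      # cosine similarity scoring
    }
    if source:
        # Narrow the search to one document's chunks before scoring.
        search_kwargs["pre_filter"] = {
            "bool": {"filter": [{"term": {"metadata.source.keyword": source}}]}
        }
    return vectorstore.as_retriever(search_kwargs=search_kwargs)
```

With this, the retriever never receives a conflicting "query" key, and the metadata filter is evaluated before the similarity scoring instead of after.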
-
@dosu-bot any suggestions?
-
Description
When I try to query a successfully indexed document, no chunks are retrieved. But for the same documents, when I run the same query from the AWS jumpbox/bastion host, I get the correct response.
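One way to narrow this down (a minimal sketch, assuming the same OpenSearchVectorSearch instance as in the code above; the "index_name*" pattern is a placeholder) is to bypass the chain entirely and query the vector store directly from the application environment, then compare against what the bastion host sees:

```python
# Bypass the retrieval chain and hit the vector store directly.
docs = vectorstore.similarity_search("same query that works on the bastion host", k=4)
print(f"retrieved {len(docs)} chunks")
for d in docs:
    print(d.metadata.get("source"), d.page_content[:120])

# Check the raw document count the application's client actually sees;
# a mismatch with the bastion host points at endpoint, index pattern or auth
# differences rather than at the chain itself.
print(vectorstore.client.count(index="index_name*"))
```

If similarity_search returns chunks here but the chain does not, the problem is in the retriever configuration; if it returns nothing, the application is most likely talking to a different endpoint or index than the bastion host.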
System Info
langchain==0.2.16
python version 3.11