LangChain throws an exception when using with AzureCosmosNoSQLDB #25469

balachander1964 · 2024-08-16T03:24:03Z

balachander1964
Aug 16, 2024

Checked other resources

I added a very descriptive title to this question.
I searched the LangChain documentation with the integrated search.
I used the GitHub search to find a similar question and didn't find it.

Commit to Help

I commit to help with one of those options 👆

Example Code

#Imports
_ONLY_CLEAR = False
from html import entities
import warnings
#warnings.filterwarnings('error')
import spacy
import os, sys, datetime, pathlib
from os import path
import fitz
import srsly
import langchain_openai
import pypdf
import tracemalloc
import openai
from openai import OpenAI, embeddings
from azure.cosmos import CosmosClient, PartitionKey, DatabaseProxy, container

import langchain, langchain_core, langchain_community 
from langchain import chains, schema
from langchain_core.documents import Document
from langchain.schema import document as Document
from langchain.chains import summarize, retrieval_qa
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain.chains.summarize import load_summarize_chain, map_reduce_prompt, refine_prompts
from langchain.chains import mapreduce 
from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain
from langchain.chains.combine_documents.reduce import ReduceDocumentsChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.text_splitter import NLTKTextSplitter, RecursiveCharacterTextSplitter
from langchain_text_splitters import SpacyTextSplitter
from langchain.chains.llm import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain import hub
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import AzureCosmosDBVectorSearch, azure_cosmos_db, AzureCosmosDBNoSqlVectorSearch
from langchain_community.llms import OpenAI
from langchain_openai import ChatOpenAI

'''Prompts'''
#Map
_MAP_PROMPT_TEMPLATE = '''Write a concise summary of the following chunk of text that includes the main points and any important details.
{text}
helpful answer:'''

# Reduce
_COMBINED_PROMPT_TEMPLATE = """Condense the following text into several key points of interest as required.
Return your response as separate lines of text, each covering a single key point.
{text}
helpful answer:"""

_CRITICAL_TERM_SYSTEM_PROMPT = {
  'role' : 'system',
  'content' : 'The AI assistant is very intelligent and helpful and will return precise answers as exact text without extra generative descriptions and rambling.',
  }

_CRITICAL_TERM_USER_MESSAGE = '''Return the catchy critical words / phrases from the following TEXT separated by commas.
\nTEXT: {}\n'''

_CRITICAL_TERM_ASSISTANT_PROMPT = {
  'role' : 'assistant',
  'content' : '?',
  }

api_key = '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' 

nlp = spacy.load("en_core_web_lg")

# Set the API key for the OpenAI client
openai.api_key = api_key
if api_key is None:
    raise ValueError("API key is not set. Please set the OPENAI_API_KEY environment variable.")

_CLIENT = openai.OpenAI(api_key=api_key)
#Add your open AI key to the system variable and access it.

_MODEL = 'gpt-4o'
_EMBEDDING_MODEL = 'text-embedding-ada-002'
_SPACY_MODEL = 'en_core_web_lg'
_LLM = ChatOpenAI(temperature=0, model_name= _MODEL, openai_api_key=api_key)
_COSMOS_URL = '''https://my-langchain.documents.azure.com:221/'''
_COSMOS_DB_NAME = '''my-langchain'''
_COSMOS_CONTAINER_NAME = '''langchain-data'''
_PARTITION_KEY = PartitionKey(path="/partitionKey")

_COSMOS_PRIMARY_KEY = '''cui1dGVxDmQIHyS5mS6zpCnZO5BXybZYPlGOipZHcGVFC6dThCWn7VL0BpmnMEHNY5uWloRcBecpACDbtEz1jw=='''

#DB policies.
_CONTAINER_POLICY = { 
  "vectorEmbeddings": [
    {
      "path" : "/*",
      "dataType" : "float32",
      "distanceFunction" : "cosine",
      "dimensions": 1536,
      }
    ]
  }

_INDEX_POLICY = {
  "indexingMode" : "consistent",
  "automatic" : True,
  "includedPaths" : [
    {
    "path" : "/*"
    }
  ],
  "vectorIndexes": [
    {
      "path": "/*",
      "type": "flat"
    }
  ]
}

_DB_PROPERTIES = {
  'id' : _COSMOS_DB_NAME,
  }

_CONTAINER_PROPERTIES = {
  'id' : _COSMOS_CONTAINER_NAME,
  'partition_key' : _PARTITION_KEY,
  'indexingPolicy' : _INDEX_POLICY,
  }

#DB declarations.
#First, initialize the azure Cosmos vector database.
_COSMOS_CLIENT = CosmosClient(_COSMOS_URL, _COSMOS_PRIMARY_KEY)
_COSMOS_DB = _COSMOS_CLIENT.create_database_if_not_exists(id = _COSMOS_DB_NAME)
_COSMOS_CONTAINER = _COSMOS_DB.create_container_if_not_exists(
  id = _COSMOS_CONTAINER_NAME,
  partition_key = _PARTITION_KEY,
  indexing_policy = _INDEX_POLICY,
  vector_embedding_policy = _CONTAINER_POLICY,
  offer_throughput = 400
  )
_TEXT_SPLITTER = RecursiveCharacterTextSplitter(chunk_size = 1024, chunk_overlap = 256)

_SRC_PATH = './Yuvaraj Res.pdf'
#_TMP_TEXT_FILE = Pathlib.Path('./Yuvaraj Res.txt')
#_METADATA_REFERENCE = 'Yuvaraj Res.pdf ; {}'
_QUESTIONS = [
    'What technologies did the candidate work with?',
    'What is the candidate\'s educational background?',
    'How many years of experience does the candidate have in software development?',
    'Has the candidate worked in any managerial roles?',
    'What projects has the candidate been involved in?'
]
def main():
  '''Main Entry'''
  #Load the PDF document.
  #Load the pdf document and read page by page.
  pdf_document = fitz.open(_SRC_PATH)

  
  
  #Iterate through pages.
  for page_num in range(len(pdf_document)):
    #Read the page
    page =pdf_document.load_page(page_num)
    page_text = page.get_text()

    #First summarise the text
    summary = summarise_text(page_text)

    #Break the summary into separate docs.
    lines = summary.split('\n')
    #[print(line) for line in lines]
        #Create the chunks, embed the chunks, and insert them in the database.
    line_no = 0
    chunk_no = 0
    for line in lines:
      chunks = split_and_chunk_text(line)
      #print('\n', line)
      for chunk in chunks:
        #print(chunk)
        #Generate the embeddings
        embeddings = generate_embeddings(chunk)

        #Insert the vectors into the cosmos database.
        insert_vector(line_no, chunk_no, chunk, embeddings)
        chunk_no += 1

      line_no += 1

  # Initialize LangChain components
  openai_embeddings = OpenAIEmbeddings(model=_EMBEDDING_MODEL, api_key=api_key)
  vector_store =  AzureCosmosDBNoSqlVectorSearch(
    cosmos_client = _COSMOS_CLIENT,
    embedding = openai_embeddings,
    vector_embedding_policy = _CONTAINER_POLICY,
    indexing_policy = _INDEX_POLICY,
    cosmos_container_properties = _CONTAINER_PROPERTIES,
    cosmos_database_properties = _DB_PROPERTIES,
    database_name = _COSMOS_DB_NAME,
    container_name = _COSMOS_CONTAINER_NAME,
    create_container = False
    )

  db_retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k":2})
  chain = RetrievalQA.from_chain_type(llm = _LLM, chain_type = 'map_reduce', retriever =db_retriever, return_source_documents = True)

  for question in _QUESTIONS:
    answer = chain.invoke({'query' : question})
    print('q: ', question)
    print('Ans: ', answer)

  return

def summarise_text(page_text : str) -> str:
  '''Loads the specified text into a Langchain document object, splits into pages, summarizes each page, and returns a combined output.'''
  condensed_text = ''
  doc = Document.Document(page_content = page_text)
  docs = [doc]

  #Split the document using Spacy text splitter and use it.
  doc_splitter = SpacyTextSplitter(pipeline = _SPACY_MODEL, max_length = 10000)
  pages = doc_splitter.split_documents(docs)

  #Map Reduce code  
  map_prompt = PromptTemplate.from_template(template = _MAP_PROMPT_TEMPLATE)
  combine_prompt = PromptTemplate.from_template( template = _COMBINED_PROMPT_TEMPLATE)

  #Run the Chain.
  # Generate summaries using MapReduce method

  #After  defining prompts, initialize the associated map_reduce_chain.
  map_reduce_chain = load_summarize_chain(
    llm = _LLM,
    chain_type="map_reduce",
    map_prompt=map_prompt,
    combine_prompt=combine_prompt,
    return_intermediate_steps=True,
    )

  #Then, you generate summaries using the chain. Notice that LangChain use a tokenizer (from transformer library) with 1024 token limit by default.
  map_reduce_outputs  = map_reduce_chain({"input_documents": pages}, return_only_outputs = True)
  condensed_text = get_text_from_output(map_reduce_outputs)

  return condensed_text

def get_text_from_output(map_reduce_outputs):
  '''Extracts the text from the output from LangChain invoke'''
  condensed_text = ''
  chars = map_reduce_outputs['output_text']
  for c in chars:
    condensed_text += c
  #print(condensed_text)
  return condensed_text

def generate_embeddings(text):
  '''Function to generate embeddings'''
  ret_embeddings = []

  response =  _CLIENT.embeddings.create(input = text, model =_EMBEDDING_MODEL)
  for embedding in response.data:
    ret_embeddings.append(embedding.embedding)

  return ret_embeddings

def vector_exists(embedding):
  '''Function to check if vector already exists in Cosmos DB'''  
  query = "SELECT * FROM c WHERE c.embedding = @embedding"
  parameters = [{"name": "@embedding", "value": embedding}]
  items = list(_COSMOS_CONTAINER.query_items(query=query, parameters=parameters, enable_cross_partition_query=True))
  return len(items) > 0

def insert_vector(line_no, ent_no, text, embedding):
  '''# Function to insert vector into Cosmos DB'''  
  dt = datetime.datetime.now()
  time_info = ('{}{}{}{}{}{}').format(dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
  
  srcfile = _SRC_PATH.replace('.', '').replace('/', '')
  data_id = ('{}{}{}{}').format(time_info, srcfile, line_no, ent_no)
  #print(data_id)
  if not vector_exists(embedding):
    _COSMOS_CONTAINER.create_item(body={
      'id' : str(data_id),
      'text': text,
      'embedding': embedding
    })
  return

def get_critical_terms(text):
  '''Chat GPT call to get the critical phrases from the document.'''
 
  message = _CRITICAL_TERM_USER_MESSAGE.format(text)
  
  prompt = []
  
  prompt.append(_CRITICAL_TERM_SYSTEM_PROMPT)
  prompt.append({
    'role' : 'user',
    'content' : message,
    })
  prompt.append(_CRITICAL_TERM_ASSISTANT_PROMPT)

  response = _CLIENT.chat.completions.create(
    model = _MODEL,
    messages = prompt,
    temperature = 0,
      max_tokens = 2000,
    top_p = 1,
    frequency_penalty = 0,
    presence_penalty = 0
    )

  response_text  =  response.choices[0].message.content
  
  #print(text, '\n\t', response_text)
  return response_text

def split_and_chunk_text(text):
  '''Splits and chunks the specified text'''  
  return _TEXT_SPLITTER.split_text(text)

def delete_items_from_all_partitions():
  for item in _COSMOS_CONTAINER.query_items(
    query = 'SELECT DISTINCT c.partitionKey FROM c', #'SELECT * FROM c',
    enable_cross_partition_query=True):
    #_COSMOS_CONTAINER.delete_item(item, partition_key = _PARTITION_KEY)
    _COSMOS_CONTAINER.delete_all_items_by_partition_key(item)
    print(f'Deleted partition: ', item)

  return

'''
  Required for all python programs.
'''
if __name__ == '__main__':
  tracemalloc.start()
  print('Starting')
  if not _ONLY_CLEAR:
    main()
  
  #Release all the vectors.
  if _COSMOS_CONTAINER != None:
    delete_items_from_all_partitions()
  tracemalloc.stop()
  
  input('Done')



### Description

I use the LangChain python package with AzureCosmosNoSQL Vector Database for Question Ansering. I get the following exception thrown:

Traceback (most recent call last):
File "D:\Projects2017\langchain_gpt_01\langchain_gpt_01\summarizer_qa.py", line 335, in
main()
File "D:\Projects2017\langchain_gpt_01\langchain_gpt_01\summarizer_qa.py", line 205, in main
answer = chain.invoke({'query' : question})
File "D:\python\anaconda3\envs\Official39\lib\site-packages\langchain\chains\base.py", line 164, in invoke
raise e
File "D:\python\anaconda3\envs\Official39\lib\site-packages\langchain\chains\base.py", line 154, in invoke
self._call(inputs, run_manager=run_manager)
File "D:\python\anaconda3\envs\Official39\lib\site-packages\langchain\chains\retrieval_qa\base.py", line 150, in _call
docs = self._get_docs(question, run_manager=_run_manager)
File "D:\python\anaconda3\envs\Official39\lib\site-packages\langchain\chains\retrieval_qa\base.py", line 270, in _get_docs
return self.retriever.invoke(
File "D:\python\anaconda3\envs\Official39\lib\site-packages\langchain_core\retrievers.py", line 251, in invoke
raise e
File "D:\python\anaconda3\envs\Official39\lib\site-packages\langchain_core\retrievers.py", line 244, in invoke
result = self._get_relevant_documents(
File "D:\python\anaconda3\envs\Official39\lib\site-packages\langchain_core\vectorstores\base.py", line 1040, in _get_relevant_documents
docs = self.vectorstore.similarity_search(query, **self.search_kwargs)
File "D:\python\anaconda3\envs\Official39\lib\site-packages\langchain_community\vectorstores\azure_cosmos_db_no_sql.py", line 338, in similarity_search
docs_and_scores = self.similarity_search_with_score(
File "D:\python\anaconda3\envs\Official39\lib\site-packages\langchain_community\vectorstores\azure_cosmos_db_no_sql.py", line 322, in similarity_search_with_score
docs_and_scores = self._similarity_search_with_score(
File "D:\python\anaconda3\envs\Official39\lib\site-packages\langchain_community\vectorstores\azure_cosmos_db_no_sql.py", line 298, in _similarity_search_with_score
items = list(
File "D:\python\anaconda3\envs\Official39\lib\site-packages\azure\core\paging.py", line 123, in next
return next(self._page_iterator)
File "D:\python\anaconda3\envs\Official39\lib\site-packages\azure\core\paging.py", line 75, in next
self._response = self._get_next(self.continuation_token)
File "D:\python\anaconda3\envs\Official39\lib\site-packages\azure\cosmos_query_iterable.py", line 99, in _fetch_next
block = self._ex_context.fetch_next_block()
File "D:\python\anaconda3\envs\Official39\lib\site-packages\azure\cosmos_execution_context\execution_dispatcher.py", line 110, in fetch_next_block
raise e
File "D:\python\anaconda3\envs\Official39\lib\site-packages\azure\cosmos_execution_context\execution_dispatcher.py", line 102, in fetch_next_block
return self._execution_context.fetch_next_block()
File "D:\python\anaconda3\envs\Official39\lib\site-packages\azure\cosmos_execution_context\base_execution_context.py", line 79, in fetch_next_block
self._ensure()
File "D:\python\anaconda3\envs\Official39\lib\site-packages\azure\cosmos_execution_context\base_execution_context.py", line 64, in _ensure
results = self._fetch_next_block()
File "D:\python\anaconda3\envs\Official39\lib\site-packages\azure\cosmos_execution_context\base_execution_context.py", line 175, in _fetch_next_block
return self._fetch_items_helper_with_retries(self._fetch_function)
File "D:\python\anaconda3\envs\Official39\lib\site-packages\azure\cosmos_execution_context\base_execution_context.py", line 147, in _fetch_items_helper_with_retries
return _retry_utility.Execute(self._client, self._client._global_endpoint_manager, callback)
File "D:\python\anaconda3\envs\Official39\lib\site-packages\azure\cosmos_retry_utility.py", line 87, in Execute
result = ExecuteFunction(function, *args, **kwargs)
File "D:\python\anaconda3\envs\Official39\lib\site-packages\azure\cosmos_retry_utility.py", line 149, in ExecuteFunction
return function(*args, **kwargs)
File "D:\python\anaconda3\envs\Official39\lib\site-packages\azure\cosmos_execution_context\base_execution_context.py", line 145, in callback
return self._fetch_items_helper_no_retries(fetch_function)
File "D:\python\anaconda3\envs\Official39\lib\site-packages\azure\cosmos_execution_context\base_execution_context.py", line 126, in _fetch_items_helper_no_retries
(fetched_items, response_headers) = fetch_function(new_options)
File "D:\python\anaconda3\envs\Official39\lib\site-packages\azure\cosmos_cosmos_client_connection.py", line 1065, in fetch_fn
return self.__QueryFeed(
File "D:\python\anaconda3\envs\Official39\lib\site-packages\azure\cosmos_cosmos_client_connection.py", line 3092, in __QueryFeed
result, last_response_headers = self.__Post(path, request_params, query, req_headers, **kwargs)
File "D:\python\anaconda3\envs\Official39\lib\site-packages\azure\cosmos_cosmos_client_connection.py", line 2811, in __Post
return synchronized_request.SynchronizedRequest(
File "D:\python\anaconda3\envs\Official39\lib\site-packages\azure\cosmos_synchronized_request.py", line 204, in SynchronizedRequest
return _retry_utility.Execute(
File "D:\python\anaconda3\envs\Official39\lib\site-packages\azure\cosmos_retry_utility.py", line 85, in Execute
result = ExecuteFunction(function, global_endpoint_manager, *args, **kwargs)
File "D:\python\anaconda3\envs\Official39\lib\site-packages\azure\cosmos_retry_utility.py", line 149, in ExecuteFunction
return function(*args, **kwargs)
File "D:\python\anaconda3\envs\Official39\lib\site-packages\azure\cosmos_synchronized_request.py", line 155, in _Request
raise exceptions.CosmosHttpResponseError(message=data, response=response)
azure.cosmos.exceptions.CosmosHttpResponseError: (BadRequest) One of the input values is invalid.
ActivityId: df33e145-e00e-47c7-9f85-be32341127b7, Windows/10.0.20348 cosmos-netstandard-sdk/3.18.0
Code: BadRequest
Message: One of the input values is invalid.
ActivityId: df33e145-e00e-47c7-9f85-be32341127b7, Windows/10.0.20348 cosmos-netstandard-sdk/3.18.0


I will appreciate if any one provides a solution so that I can fix this issue. I appreciate your quick help in advance. Thanks Bala

### System Info

The Following is the System information:
Windows 11,
LangChain : 0.2.13
langchain-core : 0.2.30
langchain-community : 0.2.12
azure-cosmos : 4.7.0

Answered by dosubot[bot]

Aug 16, 2024

To resolve the "CosmosHttpResponseError: (BadRequest) One of the input values is invalid" exception when using the LangChain Python package with Azure Cosmos NoSQL Vector Database for Question Answering, ensure that the following conditions are met:

Indexing Policy: The indexing_policy must have a non-null and non-empty vectorIndexes field.
Vector Embedding Policy: The vector_embedding_policy must have a non-null and non-empty vectorEmbeddings field.
Container Properties: The cosmos_container_properties must have a non-null partition_key.
Vector Search Configuration: Ensure that the vector_search configuration is properly set up with the necessary algorithms and profiles.

Here is the r…

View full answer

@dosu · 2024-08-16T03:27:02Z

dosubot[bot]
bot Aug 16, 2024

To resolve the "CosmosHttpResponseError: (BadRequest) One of the input values is invalid" exception when using the LangChain Python package with Azure Cosmos NoSQL Vector Database for Question Answering, ensure that the following conditions are met:

Indexing Policy: The indexing_policy must have a non-null and non-empty vectorIndexes field.
Vector Embedding Policy: The vector_embedding_policy must have a non-null and non-empty vectorEmbeddings field.
Container Properties: The cosmos_container_properties must have a non-null partition_key.
Vector Search Configuration: Ensure that the vector_search configuration is properly set up with the necessary algorithms and profiles.

Here is the relevant code snippet that checks these conditions:

if self._create_container:
    if (
        indexing_policy["vectorIndexes"] is None
        or len(indexing_policy["vectorIndexes"]) == 0
    ):
        raise ValueError(
            "vectorIndexes cannot be null or empty in the indexing_policy."
        )
    if (
        vector_embedding_policy is None
        or len(vector_embedding_policy["vectorEmbeddings"]) == 0
    ):
        raise ValueError(
            "vectorEmbeddings cannot be null "
            "or empty in the vector_embedding_policy."
        )
    if self._cosmos_container_properties["partition_key"] is None:
        raise ValueError(
            "partition_key cannot be null or empty for a container."
        )
    if vector_search is None:
        vector_search = VectorSearch(
            algorithms=[
                HnswAlgorithmConfiguration(
                    name="default",
                    kind=VectorSearchAlgorithmKind.HNSW,
                    parameters=HnswParameters(
                        m=4,
                        ef_construction=400,
                        ef_search=500,
                        metric=VectorSearchAlgorithmMetric.COSINE,
                    ),
                ),
                ExhaustiveKnnAlgorithmConfiguration(
                    name="default_exhaustive_knn",
                    kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
                    parameters=ExhaustiveKnnParameters(
                        metric=VectorSearchAlgorithmMetric.COSINE
                    ),
                ),
            ],
            profiles=[
                VectorSearchProfile(
                    name="myHnswProfile",
                    algorithm_configuration_name="default",
                ),
                VectorSearchProfile(
                    name="myExhaustiveKnnProfile",
                    algorithm_configuration_name="default_exhaustive_knn",
                ),
            ],
        )

Additionally, ensure that the AzureCosmosDBVectorSearch class is correctly instantiated and configured. Here is an example of how to set up the AzureCosmosDBVectorSearch:

from langchain_community.vectorstores import AzureCosmosDBVectorSearch
from langchain_community.embeddings.openai import OpenAIEmbeddings
from pymongo import MongoClient

mongo_client = MongoClient("<YOUR-CONNECTION-STRING>")
collection = mongo_client["<db_name>"]["<collection_name>"]
embeddings = OpenAIEmbeddings()
vectorstore = AzureCosmosDBVectorSearch(collection, embeddings)

Ensure that these fields and configurations are correctly set in your setup to avoid the "BadRequest" error ^[1]^[2]^[3].

To continue talking to Dosu, mention @dosu.

1 reply

cow-engineer Aug 16, 2024

Is that your actual key ?

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

LangChain throws an exception when using with AzureCosmosNoSQLDB #25469

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{editor}}'s edit

{{editor}}'s edit

Uh oh!

Replies: 1 comment 1 reply

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{title}}

Uh oh!

Select a reply

Uh oh!

LangChain throws an exception when using with AzureCosmosNoSQLDB #25469

Uh oh!

Uh oh!

balachander1964 Aug 16, 2024

Checked other resources

Commit to Help

Example Code

Replies: 1 comment · 1 reply

Uh oh!

dosubot[bot] bot Aug 16, 2024

Uh oh!

cow-engineer Aug 16, 2024

balachander1964
Aug 16, 2024

Replies: 1 comment 1 reply

dosubot[bot]
bot Aug 16, 2024