LLM Endpoint stops streaming chunks #25385

levalencia · 2024-08-14T13:43:00Z

levalencia
Aug 14, 2024

Checked other resources

I added a very descriptive title to this question.
I searched the LangChain documentation with the integrated search.
I used the GitHub search to find a similar question and didn't find it.

Commit to Help

I commit to help with one of those options 👆

Example Code

@app.route('/policy', methods=['POST'])
async def policy():
    
    """
    Endpoint to generate tokens based on a prompt for HR Documents

    Returns:
        Response: Response object containing generated tokens.

    """
    
    data = await request.get_json()
    prompt = data.get("prompt", "What are the company values?")
    chat_history = data.get("chat_history", [])
    username = data.get("username", "")
    inputs = [item.get('input') for item in chat_history]
    outputs = [item.get('output') for item in chat_history] 
    access_token_keyvault = request.headers.get('Authorization-KeyVault', '').split(' ')[1]
    access_token_openai = request.headers.get('Authorization-OpenAI', '').split(' ')[1]
    access_token_search = request.headers.get('Authorization-Search', '').split(' ')[1]
    instrumentation_key = config.SHD_INSTRUMENTATION_KEY(access_token_keyvault)
    async def generate():
        start_time = time.time()
        try:
            async for chunk in policy_gpt_chain_lcel(prompt, username, inputs, outputs, access_token_keyvault, access_token_openai, access_token_search):
                yield chunk
        except Exception as e:
            # If an error occurs, log it and yield an error message to the client
            log_exception(instrumentation_key, e)
            yield f"An error occurred: {e}".encode()
        finally:
            end_time = time.time()
            execution_time = (end_time - start_time) * 1000  # Convert to milliseconds
            log_custom_metric(instrumentation_key, 'policy_lcel_quart_execution_time', execution_time)
    return Response(generate(), content_type='text/event-stream')


async def retry_with_backoff_in_stream(fn, max_retries=3, initial_delay=1, backoff_factor=2, jitter=0.1):
    delay = initial_delay
    for attempt in range(max_retries):
        try:
            logging.info("First retry attempt")
            async for chunk in fn():
                yield chunk
            return  # Exit if successful
        except (TimeoutError, ConnectionError) as e:
            logging.warning(f"Attempt {attempt + 1} failed with {str(e)}. Retrying after {delay} seconds...")
            await asyncio.sleep(delay + random.uniform(-jitter, jitter))
            delay *= backoff_factor
        except Exception as e:
            logging.error(f"Unexpected error occurred during streaming: {str(e)}")
            raise
    raise RuntimeError(f"All {max_retries} retry attempts failed.")

async def policy_gpt_chain_lcel(prompt, username, inputs, outputs,access_token_keyvault, access_token_openai, access_token_search):
    async def llm_thread_gpt_lcel(prompt, username, inputs, outputs):
       
        try:
            SHD_AZURE_OPENAI_ENDPOINT = config.SHD_AZURE_OPENAI_ENDPOINT(access_token_keyvault)
            SHD_OPENAI_DEPLOYMENT_NAME = config.SHD_OPENAI_DEPLOYMENT_NAME(access_token_keyvault)
            SHD_OPENAI_GPT_MODEL_NAME = config.SHD_OPENAI_GPT_MODEL_NAME(access_token_keyvault)
   
            llm = AzureChatOpenAI(
                azure_endpoint=SHD_AZURE_OPENAI_ENDPOINT,
                openai_api_version="2023-03-15-preview",
                deployment_name=SHD_OPENAI_DEPLOYMENT_NAME,
                azure_ad_token = access_token_openai,
                openai_api_type="Azure",
                model_name=SHD_OPENAI_GPT_MODEL_NAME,
                streaming=True,  # Set ChainStreamHandler as callback
                temperature=0)     

            embeddings = setup_embeddings(access_token_keyvault, access_token_openai)
            vector_store = setup_vector_store(embeddings, access_token_keyvault, access_token_search)
            retriever = ArgenxUserRetrieverPolicyLCEL(vectorstore=vector_store, username=username)
            
            demo_ephemeral_chat_history = ChatMessageHistory()
            for i in range(len(inputs)):
                demo_ephemeral_chat_history.add_user_message(inputs[i])
                demo_ephemeral_chat_history.add_ai_message(outputs[i])
                
            contextualize_q_system_prompt = (
                "Given a chat history and the latest user question "
                "which might reference context in the chat history, "
                "formulate a standalone question which can be understood "
                "without the chat history. Do NOT answer the question, "
                "just reformulate it if needed and otherwise return it as is."
            )
            contextualize_q_prompt = ChatPromptTemplate.from_messages(
                [
                    ("system", contextualize_q_system_prompt),
                    MessagesPlaceholder("chat_history"),
                    ("human", "{input} Please answer in HTML format"),
                ]
            )
            history_aware_retriever = create_history_aware_retriever(llm, retriever, contextualize_q_prompt) 
            qa_prompt = ChatPromptTemplate.from_messages(
                [
                    ("system", ANSWER_PROMPT_POLICY),
                    MessagesPlaceholder("chat_history"),
                    ("human", "{input} Please answer in HTML format"),
                ]
            )

            # Define your legacy DOCUMENT_PROMPT
            document_prompt = PromptTemplate(
                input_variables=["page_content", "source"],
                template="{page_content} (Source: {source})"
            )
            question_answer_chain = create_stuff_documents_chain(llm, qa_prompt, document_prompt=document_prompt, document_variable_name="context")  | StrOutputParser()
                
            rag_chain_from_docs = create_retrieval_chain(history_aware_retriever, question_answer_chain) 
            conversational_rag_chain = RunnableWithMessageHistory(
                rag_chain_from_docs,
                lambda session_id: demo_ephemeral_chat_history, 
                input_messages_key="input",
                history_messages_key="chat_history",
                output_messages_key="answer",
            )

            async def fetch_and_stream():
                async for chunk in accumulate_and_yield(conversational_rag_chain.astream({"input": prompt}, config={"configurable": {"session_id": "unbound"}})):
                    yield chunk

            # Retry logic embedded in the stream processing
            async for chunk in retry_with_backoff_in_stream(fetch_and_stream):
                yield chunk
        except Exception as e:
            logging.error(f"An error occurred in llm_thread_gpt_lcel: {e}")
            yield f"An error occurred: {e}".encode()
            
    try:
        async for chunk in llm_thread_gpt_lcel(prompt, username, inputs, outputs):
            yield chunk
    except Exception as e:
        logging.error(f"An error occurred in policy_gpt_chain_lcel: {e}")
        yield f"An error occurred: {e}".encode()

Description

sometimes the response gets cut off, and the streaming stops, I suspect its related to rate limit errors on azure open ai, but I dont see any errors in the logs. how can I make sure if this is the real error?. How can I improve the code to catch rate limit errors?

System Info

langchain_core: 0.2.21
langchain: 0.2.9
langchain_community: 0.2.7
langsmith: 0.1.92
langchain_openai: 0.1.17
langchain_text_splitters: 0.2.2
langchainhub: 0.1.20

@dosu · 2024-08-14T13:46:46Z

dosubot[bot]
bot Aug 14, 2024

To confirm if rate limit errors on Azure OpenAI are causing your endpoint to stop streaming chunks, you can check the response payload for specific error messages related to rate limiting. Additionally, you can improve your code to catch these errors by adding error handling in the _stream and _astream methods.

Here is an example of how you can modify the _stream and _astream methods to catch rate limit errors:

def _stream(
    self,
    messages: List[BaseMessage],
    stop: Optional[List[str]] = None,
    run_manager: Optional[CallbackManagerForLLMRun] = None,
    **kwargs: Any,
) -> Iterator[ChatGenerationChunk]:
    self.endpoint_url = self.endpoint_url.replace("/chat/completions", "")
    timeout = None if "timeout" not in kwargs else kwargs["timeout"]

    import openai

    params = {}
    client_params = {
        "api_key": self.endpoint_api_key.get_secret_value(),
        "base_url": self.endpoint_url,
        "timeout": timeout,
        "default_headers": None,
        "default_query": None,
        "http_client": None,
    }

    client = openai.OpenAI(**client_params)
    message_dicts = [
        CustomOpenAIChatContentFormatter._convert_message_to_dict(m)
        for m in messages
    ]
    params = {"stream": True, "stop": stop, "model": None, **kwargs}
    default_chunk_class = AIMessageChunk

    try:
        for chunk in client.chat.completions.create(messages=message_dicts, **params):
            if not isinstance(chunk, dict):
                chunk = chunk.dict()
            if len(chunk["choices"]) == 0:
                continue
            choice = chunk["choices"][0]
            chunk = _convert_delta_to_message_chunk(
                choice["delta"], default_chunk_class
            )
            generation_info = {}
            if finish_reason := choice.get("finish_reason"):
                generation_info["finish_reason"] = finish_reason
            logprobs = choice.get("logprobs")
            if logprobs:
                generation_info["logprobs"] = logprobs
            default_chunk_class = chunk.__class__
            chunk = ChatGenerationChunk(
                message=chunk, generation_info=generation_info or None
            )
            if run_manager:
                run_manager.on_llm_new_token(chunk.text, chunk=chunk, logprobs=logprobs)
            yield chunk
    except openai.error.RateLimitError as e:
        print(f"Rate limit error: {e}")
        # Handle rate limit error (e.g., retry after some time)
    except Exception as e:
        print(f"An error occurred: {e}")
        # Handle other errors

async def _astream(
    self,
    messages: List[BaseMessage],
    stop: Optional[List[str]] = None,
    run_manager: Optional[AsyncCallbackManagerForLLMRun] = None,
    **kwargs: Any,
) -> AsyncIterator[ChatGenerationChunk]:
    self.endpoint_url = self.endpoint_url.replace("/chat/completions", "")
    timeout = None if "timeout" not in kwargs else kwargs["timeout"]

    import openai

    params = {}
    client_params = {
        "api_key": self.endpoint_api_key.get_secret_value(),
        "base_url": self.endpoint_url,
        "timeout": timeout,
        "default_headers": None,
        "default_query": None,
        "http_client": None,
    }
    async_client = openai.AsyncOpenAI(**client_params)
    message_dicts = [
        CustomOpenAIChatContentFormatter._convert_message_to_dict(m)
        for m in messages
    ]
    params = {"stream": True, "stop": stop, "model": None, **kwargs}

    default_chunk_class = AIMessageChunk
    try:
        async for chunk in await async_client.chat.completions.create(
            messages=message_dicts, **params
        ):
            if not isinstance(chunk, dict):
                chunk = chunk.dict()
            if len(chunk["choices"]) == 0:
                continue
            choice = chunk["choices"][0]
            chunk = _convert_delta_to_message_chunk(
                choice["delta"], default_chunk_class
            )
            generation_info = {}
            if finish_reason := choice.get("finish_reason"):
                generation_info["finish_reason"] = finish_reason
            logprobs = choice.get("logprobs")
            if logprobs:
                generation_info["logprobs"] = logprobs
            default_chunk_class = chunk.__class__
            chunk = ChatGenerationChunk(
                message=chunk, generation_info=generation_info or None
            )
            if run_manager:
                await run_manager.on_llm_new_token(
                    token=chunk.text, chunk=chunk, logprobs=logprobs
                )
            yield chunk
    except openai.error.RateLimitError as e:
        print(f"Rate limit error: {e}")
        # Handle rate limit error (e.g., retry after some time)
    except Exception as e:
        print(f"An error occurred: {e}")
        # Handle other errors

This code adds error handling for RateLimitError and other exceptions, allowing you to catch and handle rate limit errors appropriately ^[1].

To continue talking to Dosu, mention @dosu.

4 replies

levalencia Aug 14, 2024
Author

@dosu but thats langchain code, which is not in my app, that would mean to modify the langchain library, is there another way?

dosubot[bot] bot Aug 14, 2024

To handle rate limit errors without modifying the LangChain library, you can wrap the LangChain calls in your own retry logic. Here's how you can do it:

Create a Retry Decorator: This decorator will handle retries with exponential backoff.
Wrap LangChain Calls: Use the retry decorator to wrap the LangChain calls in your application.

Here's an example implementation:

import asyncio
import logging
import random
from functools import wraps

logger = logging.getLogger(__name__)

def retry_with_backoff(max_retries=3, initial_delay=1, backoff_factor=2, jitter=0.1):
    def decorator(func):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            delay = initial_delay
            for attempt in range(max_retries):
                try:
                    return await func(*args, **kwargs)
                except (TimeoutError, ConnectionError, openai.error.RateLimitError) as e:
                    logger.warning(f"Attempt {attempt + 1} failed with {str(e)}. Retrying after {delay} seconds...")
                    await asyncio.sleep(delay + random.uniform(-jitter, jitter))
                    delay *= backoff_factor
                except Exception as e:
                    logger.error(f"Unexpected error occurred: {str(e)}")
                    raise
            raise RuntimeError(f"All {max_retries} retry attempts failed.")
        return wrapper
    return decorator

@retry_with_backoff()
async def call_langchain_function(*args, **kwargs):
    # Replace this with the actual LangChain function call
    async for chunk in policy_gpt_chain_lcel(*args, **kwargs):
        yield chunk

@app.route('/policy', methods=['POST'])
async def policy():
    data = await request.get_json()
    prompt = data.get("prompt", "What are the company values?")
    chat_history = data.get("chat_history", [])
    username = data.get("username", "")
    inputs = [item.get('input') for item in chat_history]
    outputs = [item.get('output') for item in chat_history]
    access_token_keyvault = request.headers.get('Authorization-KeyVault', '').split(' ')[1]
    access_token_openai = request.headers.get('Authorization-OpenAI', '').split(' ')[1]
    access_token_search = request.headers.get('Authorization-Search', '').split(' ')[1]
    instrumentation_key = config.SHD_INSTRUMENTATION_KEY(access_token_keyvault)

    async def generate():
        start_time = time.time()
        try:
            async for chunk in call_langchain_function(prompt, username, inputs, outputs, access_token_keyvault, access_token_openai, access_token_search):
                yield chunk
        except Exception as e:
            log_exception(instrumentation_key, e)
            yield f"An error occurred: {e}".encode()
        finally:
            end_time = time.time()
            execution_time = (end_time - start_time) * 1000  # Convert to milliseconds
            log_custom_metric(instrumentation_key, 'policy_lcel_quart_execution_time', execution_time)

    return Response(generate(), content_type='text/event-stream')

This approach allows you to handle rate limit errors and other exceptions without modifying the LangChain library ^[1]^[2].

To continue talking to Dosu, mention @dosu.

levalencia Aug 14, 2024
Author

@dosu How can I log the finish_reason of every chunk. I know that OPENAI API returns a metadata object for each chunk with a finish_reason within

dosubot[bot] bot Aug 14, 2024

To log the finish_reason for each chunk, you can modify your policy_gpt_chain_lcel function to include logging of the finish_reason for each chunk. Here's how you can do it:

Update the policy_gpt_chain_lcel function: Add logging for the finish_reason for each chunk.

Here's an example of how you can achieve this:

import logging

async def policy_gpt_chain_lcel(prompt, username, inputs, outputs, access_token_keyvault, access_token_openai, access_token_search):
    async def llm_thread_gpt_lcel(prompt, username, inputs, outputs):
        try:
            SHD_AZURE_OPENAI_ENDPOINT = config.SHD_AZURE_OPENAI_ENDPOINT(access_token_keyvault)
            SHD_OPENAI_DEPLOYMENT_NAME = config.SHD_OPENAI_DEPLOYMENT_NAME(access_token_keyvault)
            SHD_OPENAI_GPT_MODEL_NAME = config.SHD_OPENAI_GPT_MODEL_NAME(access_token_keyvault)
   
            llm = AzureChatOpenAI(
                azure_endpoint=SHD_AZURE_OPENAI_ENDPOINT,
                openai_api_version="2023-03-15-preview",
                deployment_name=SHD_OPENAI_DEPLOYMENT_NAME,
                azure_ad_token=access_token_openai,
                openai_api_type="Azure",
                model_name=SHD_OPENAI_GPT_MODEL_NAME,
                streaming=True,
                temperature=0
            )

            embeddings = setup_embeddings(access_token_keyvault, access_token_openai)
            vector_store = setup_vector_store(embeddings, access_token_keyvault, access_token_search)
            retriever = ArgenxUserRetrieverPolicyLCEL(vectorstore=vector_store, username=username)
            
            demo_ephemeral_chat_history = ChatMessageHistory()
            for i in range(len(inputs)):
                demo_ephemeral_chat_history.add_user_message(inputs[i])
                demo_ephemeral_chat_history.add_ai_message(outputs[i])
                
            contextualize_q_system_prompt = (
                "Given a chat history and the latest user question "
                "which might reference context in the chat history, "
                "formulate a standalone question which can be understood "
                "without the chat history. Do NOT answer the question, "
                "just reformulate it if needed and otherwise return it as is."
            )
            contextualize_q_prompt = ChatPromptTemplate.from_messages(
                [
                    ("system", contextualize_q_system_prompt),
                    MessagesPlaceholder("chat_history"),
                    ("human", "{input} Please answer in HTML format"),
                ]
            )
            history_aware_retriever = create_history_aware_retriever(llm, retriever, contextualize_q_prompt) 
            qa_prompt = ChatPromptTemplate.from_messages(
                [
                    ("system", ANSWER_PROMPT_POLICY),
                    MessagesPlaceholder("chat_history"),
                    ("human", "{input} Please answer in HTML format"),
                ]
            )

            document_prompt = PromptTemplate(
                input_variables=["page_content", "source"],
                template="{page_content} (Source: {source})"
            )
            question_answer_chain = create_stuff_documents_chain(llm, qa_prompt, document_prompt=document_prompt, document_variable_name="context") | StrOutputParser()
                
            rag_chain_from_docs = create_retrieval_chain(history_aware_retriever, question_answer_chain) 
            conversational_rag_chain = RunnableWithMessageHistory(
                rag_chain_from_docs,
                lambda session_id: demo_ephemeral_chat_history, 
                input_messages_key="input",
                history_messages_key="chat_history",
                output_messages_key="answer",
            )

            async def fetch_and_stream():
                async for chunk in accumulate_and_yield(conversational_rag_chain.astream({"input": prompt}, config={"configurable": {"session_id": "unbound"}})):
                    finish_reason = chunk.get("choices", [{}])[0].get("finish_reason", "unknown")
                    logging.info(f"Chunk finish_reason: {finish_reason}")
                    yield chunk

            async for chunk in retry_with_backoff_in_stream(fetch_and_stream):
                yield chunk
        except Exception as e:
            logging.error(f"An error occurred in llm_thread_gpt_lcel: {e}")
            yield f"An error occurred: {e}".encode()
            
    try:
        async for chunk in llm_thread_gpt_lcel(prompt, username, inputs, outputs):
            yield chunk
    except Exception as e:
        logging.error(f"An error occurred in policy_gpt_chain_lcel: {e}")
        yield f"An error occurred: {e}".encode()

This modification adds logging for the finish_reason for each chunk. You can replace logging.info with any other logging mechanism if needed. This approach ensures that you capture and log the finish_reason for each chunk in a structured manner ^[1].

To continue talking to Dosu, mention @dosu.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

LLM Endpoint stops streaming chunks #25385

Uh oh!

{{title}}

Uh oh!

Replies: 1 comment 4 replies

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{title}}

Uh oh!

Select a reply

Uh oh!

LLM Endpoint stops streaming chunks #25385

Uh oh!

levalencia Aug 14, 2024

Checked other resources

Commit to Help

Example Code

Description

System Info

Replies: 1 comment · 4 replies

Uh oh!

dosubot[bot] bot Aug 14, 2024

Uh oh!

levalencia Aug 14, 2024 Author

Uh oh!

dosubot[bot] bot Aug 14, 2024

Uh oh!

levalencia Aug 14, 2024 Author

Uh oh!

dosubot[bot] bot Aug 14, 2024

levalencia
Aug 14, 2024

Replies: 1 comment 4 replies

dosubot[bot]
bot Aug 14, 2024

levalencia Aug 14, 2024
Author

levalencia Aug 14, 2024
Author