From f998309bfcdef379b6c221a054d563bdda414678 Mon Sep 17 00:00:00 2001
From: rajveer43
Date: Tue, 2 Apr 2024 22:09:57 +0530
Subject: [PATCH] add main.py

---
 src/genai/main.py | 199 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 199 insertions(+)
 create mode 100644 src/genai/main.py

diff --git a/src/genai/main.py b/src/genai/main.py
new file mode 100644
index 0000000..ec118d7
--- /dev/null
+++ b/src/genai/main.py
@@ -0,0 +1,199 @@
+import streamlit as st
+from operator import itemgetter
+from langchain.llms import LlamaCpp
+from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain.memory import ConversationBufferMemory
+# from langchain.callbacks.manager import CallbackManager
+from langchain.callbacks.base import BaseCallbackHandler
+from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
+from huggingface_hub import hf_hub_download
+
+
+# StreamHandler intercepts streaming output from the LLM.
+# This makes it appear that the language model is "typing"
+# in real time.
+class StreamHandler(BaseCallbackHandler):
+    def __init__(self, container, initial_text=""):
+        self.container = container
+        self.text = initial_text
+
+    def on_llm_new_token(self, token: str, **kwargs) -> None:
+        self.text += token
+        self.container.markdown(self.text)
+
+
+@st.cache_resource
+def create_chain(system_prompt):
+    # --- Disabled ---
+    # A stream handler to direct streaming output to the chat screen.
+    # This will need to be handled somewhat differently,
+    # but it demonstrates the potential it carries.
+    # stream_handler = StreamHandler(st.empty())
+
+    # --- Disabled ---
+    # A callback manager is a way to intercept streaming output from the
+    # LLM and take some action on it. Here we are giving it our custom
+    # stream handler to make it appear as if the LLM is typing the
+    # responses in real time.
+    # callback_manager = CallbackManager([stream_handler])
+
+    (repo_id, model_file_name) = ("TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
+                                  "mistral-7b-instruct-v0.1.Q4_0.gguf")
+
+    model_path = hf_hub_download(repo_id=repo_id,
+                                 filename=model_file_name,
+                                 repo_type="model")
+
+    # Initialize the LlamaCpp LLM.
+    # n_gpu_layers, n_batch, and n_ctx enable GPU support.
+    # When they are not set, the CPU will be used.
+    # Set n_gpu_layers to 1 for a Mac M2, and higher values based on your GPU.
+    llm = LlamaCpp(
+        model_path=model_path,
+        temperature=0,
+        max_tokens=512,
+        top_p=1,
+        # callback_manager=callback_manager,
+        # n_gpu_layers=1,
+        # n_batch=512,
+        # n_ctx=4096,
+        verbose=False,
+        streaming=True,
+        stop=["Human:"]
+    )
+
+    # system_prompt contains instructions for the LLM. It might also describe
+    # the persona that we want the LLM to assume.
+    # We then add a placeholder for the chat history and the name of the input
+    # variable that we will use to pass the history into the template.
+    # Next, we specify the placeholder for the user prompt as {human_input}.
+    # Lastly, we include an empty "ai" message to indicate the end of the user
+    # input and the start of the AI response.
+    # We create a prompt from the template so we can use it with LangChain.
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", system_prompt),
+        MessagesPlaceholder(variable_name="chat_history"),
+        ("human", "{human_input}"),
+        ("ai", ""),
+    ])
+
+    # ConversationBufferMemory will keep track of the conversation in memory.
+    # It will use "chat_history" as the name of the key.
+    memory = ConversationBufferMemory(memory_key="chat_history",
+                                      return_messages=True)
+
+    # Utility function that takes the previous user prompt and the generated
+    # AI response and stores them in the conversational memory.
+    def save_memory(inputs_outputs):
+        inputs = {"human": inputs_outputs["human"]}
+        outputs = {"ai": inputs_outputs["ai"]}
+        memory.save_context(inputs, outputs)
+
+    # Utility function to print the chat history to the console after every
+    # interaction.
+    def debug_memory():
+        print("\n", "#"*10, "\n")
+        print(memory.load_memory_variables({}))
+        print("\n", "#"*10, "\n")
+
+    # Utility function to extract the AI response and return it. There may be
+    # a better way to handle this, but I couldn't find any examples or
+    # documentation on how to do it, so I created this function instead.
+    def extract_response(chain_response):
+        # debug_memory()
+        return chain_response["ai"]
+
+    # We create the inner LLM chain first. It takes our input and the chat
+    # history, wraps them in a dictionary, and passes that as input to our
+    # prompt. The prompt is then passed to our LLM to generate an AI response.
+    llm_chain = {
+        "human_input": RunnablePassthrough(),
+        "chat_history": (
+            RunnableLambda(memory.load_memory_variables) |
+            itemgetter("chat_history")
+        )
+    } | prompt | llm
+
+    # Since we need to manually inject our input and the AI response into the
+    # memory, we need to keep track of the initial prompt that we send through
+    # the chain so we can save it to the memory alongside the generated AI
+    # response. To do that, we create a parallel dummy "chain" that serves as
+    # a passthrough for our prompt, while the second chain generates an AI
+    # response based on our prompt and the chat history using the previous
+    # "llm_chain". We then combine both chains in a dictionary and pass it to
+    # two more chains running in parallel: the first saves our prompt and the
+    # AI response to the chat history, and the second extracts the AI response
+    # and returns it as the output of the chain.
+    chain_with_memory = RunnablePassthrough() | {
+        "human": RunnablePassthrough(),
+        "ai": llm_chain
+    } | {
+        "save_memory": RunnableLambda(save_memory),
+        "ai": itemgetter("ai")
+    } | RunnableLambda(extract_response)
+
+    return chain_with_memory
+
+
+# Set the webpage title.
+st.set_page_config(
+    page_title="Your own Chat!"
+)
+
+# Create a header element.
+st.header("Your own Chat!")
+
+# This sets the LLM's personality for each prompt.
+# The initial personality provided is basic.
+# Try something interesting and notice how the LLM's responses are affected.
+system_prompt = st.text_area(
+    label="System Prompt",
+    value="You are a helpful AI assistant who answers questions in short sentences.",
+    key="system_prompt")
+
+# Create the LLM chain to use for our chatbot.
+llm_chain = create_chain(system_prompt)
+
+# We store the conversation in the session state.
+# This will be used to render the chat conversation.
+# We initialize it with the first message we want the user to be greeted with.
+if "messages" not in st.session_state:
+    st.session_state.messages = [
+        {"role": "assistant", "content": "How may I help you today?"}
+    ]
+
+if "current_response" not in st.session_state:
+    st.session_state.current_response = ""
+
+# We loop through each message in the session state and render it as
+# a chat message.
+for message in st.session_state.messages:
+    with st.chat_message(message["role"]):
+        st.markdown(message["content"])
+
+# We take questions/instructions from the chat input to pass to the LLM.
+if user_prompt := st.chat_input("Your message here", key="user_input"):
+
+    # Add our input to the session state.
+    st.session_state.messages.append(
+        {"role": "user", "content": user_prompt}
+    )
+
+    # Add our input to the chat window.
+    with st.chat_message("user"):
+        st.markdown(user_prompt)
+
+    # Pass our input to the LLM chain and capture the final response.
+    # Note that, when enabled, the StreamHandler receives the streaming
+    # response while the LLM is generating. We get our response here once
+    # the LLM has finished generating the complete response.
+    response = llm_chain.invoke(user_prompt)
+
+    # Add the response to the session state.
+    st.session_state.messages.append(
+        {"role": "assistant", "content": response}
+    )
+
+    # Add the response to the chat window.
+    with st.chat_message("assistant"):
+        st.markdown(response)
\ No newline at end of file
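
A minimal, self-contained sketch of the memory-injection pattern used in create_chain() above, with the real `prompt | llm` pipeline swapped for a stub runnable so the data flow (load the history, generate, save the turn, return only the answer) can be exercised without Streamlit or a model download. The stub LLM and its "stub answer" output are illustrative assumptions, not part of the patch.

from operator import itemgetter

from langchain.memory import ConversationBufferMemory
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough

memory = ConversationBufferMemory(memory_key="chat_history",
                                  return_messages=True)

# Stand-in for `prompt | llm`; a real chain would generate text here.
stub_llm = RunnableLambda(lambda _inputs: "stub answer")

# Inner chain: wrap the user input and the loaded history, then "generate".
inner_chain = {
    "human_input": RunnablePassthrough(),
    "chat_history": (RunnableLambda(memory.load_memory_variables) |
                     itemgetter("chat_history")),
} | stub_llm

# Outer chain: keep the original prompt next to the generated answer,
# save both to memory, then return only the answer.
chain = RunnablePassthrough() | {
    "human": RunnablePassthrough(),
    "ai": inner_chain,
} | {
    "save_memory": RunnableLambda(
        lambda turn: memory.save_context({"human": turn["human"]},
                                         {"ai": turn["ai"]})),
    "ai": itemgetter("ai"),
} | RunnableLambda(itemgetter("ai"))

print(chain.invoke("hello"))             # -> "stub answer"
print(memory.load_memory_variables({}))  # history now holds both turns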