feat: Remove LangChain from question answer tool #939

Merged: 9 commits, May 20, 2024
Changes from 4 commits
code/backend/batch/utilities/tools/question_answer_tool.py (143 changes: 67 additions, 76 deletions)
@@ -2,26 +2,13 @@
 import logging
 import warnings
 
-from ..common.answer import Answer
 from ..common.source_document import SourceDocument
-from ..search.search import Search
-from .answering_tool_base import AnsweringToolBase
-
-from langchain.chains.llm import LLMChain
-from langchain.prompts import (
-    AIMessagePromptTemplate,
-    ChatPromptTemplate,
-    FewShotChatMessagePromptTemplate,
-    HumanMessagePromptTemplate,
-    MessagesPlaceholder,
-    PromptTemplate,
-)
-from langchain_community.callbacks import get_openai_callback
-from langchain_core.messages import SystemMessage
-
 from ..helpers.config.config_helper import ConfigHelper
-from ..helpers.llm_helper import LLMHelper
 from ..helpers.env_helper import EnvHelper
+from ..common.answer import Answer
+from ..helpers.llm_helper import LLMHelper
+from ..search.search import Search
+from .answering_tool_base import AnsweringToolBase
 
 logger = logging.getLogger(__name__)

@@ -46,27 +33,36 @@ def json_remove_whitespace(obj: str) -> str:
         except json.JSONDecodeError:
             return obj
 
-    def generate_llm_chain(self, question: str, sources: list[dict]):
-        answering_prompt = PromptTemplate(
-            template=self.config.prompts.answering_user_prompt,
-            input_variables=["question", "sources"],
-        )
+    @staticmethod
+    def clean_chat_history(chat_history: list[dict]) -> list[dict]:
+        return [
+            {
+                "content": message["content"],
+                "role": message["role"],
+            }
+            for message in chat_history
+        ]
+
+    def generate_messages(self, question: str, sources: list[SourceDocument]):
         sources_text = "\n\n".join(
             [f"[doc{i+1}]: {source.content}" for i, source in enumerate(sources)]
         )
 
-        return answering_prompt, {
-            "sources": sources_text,
-            "question": question,
-        }
+        return [
+            {
+                "content": self.config.prompts.answering_user_prompt.format(
+                    question=question, sources=sources_text
+                ),
+                "role": "user",
+            },
+        ]
 
-    def generate_on_your_data_llm_chain(
+    def generate_on_your_data_messages(
         self,
         question: str,
         chat_history: list[dict],
         sources: list[SourceDocument],
-    ):
+    ) -> list[dict]:
         examples = []
 
         few_shot_example = {
@@ -82,38 +78,28 @@ def generate_on_your_data_llm_chain(
         if any(few_shot_example.values()):
             if all((few_shot_example.values())):
-                examples.append(few_shot_example)
+                examples.append(
+                    {
+                        "content": self.config.prompts.answering_user_prompt.format(
+                            sources=few_shot_example["sources"],
+                            question=few_shot_example["question"],
+                        ),
+                        "name": "example_user",
+                        "role": "system",
+                    }
+                )
+                examples.append(
+                    {
+                        "content": few_shot_example["answer"],
+                        "name": "example_assistant",
+                        "role": "system",
+                    }
+                )
             else:
                 warnings.warn(
                     "Not all example fields are set in the config. Skipping few-shot example."
                 )
 
-        example_prompt = ChatPromptTemplate.from_messages(
-            [
-                HumanMessagePromptTemplate.from_template(
-                    self.config.prompts.answering_user_prompt
-                ),
-                AIMessagePromptTemplate.from_template("{answer}"),
-            ]
-        )
-
-        few_shot_prompt = FewShotChatMessagePromptTemplate(
-            example_prompt=example_prompt,
-            examples=examples,
-        )
-
-        answering_prompt = ChatPromptTemplate.from_messages(
-            [
-                SystemMessage(content=self.config.prompts.answering_system_prompt),
-                few_shot_prompt,
-                SystemMessage(content=self.env_helper.AZURE_OPENAI_SYSTEM_MESSAGE),
-                MessagesPlaceholder("chat_history"),
-                HumanMessagePromptTemplate.from_template(
-                    self.config.prompts.answering_user_prompt
-                ),
-            ]
-        )
-
         documents = json.dumps(
             {
                 "retrieved_documents": [
@@ -124,47 +110,52 @@
             separators=(",", ":"),
         )
 
-        return answering_prompt, {
-            "sources": documents,
-            "question": question,
-            "chat_history": chat_history,
-        }
+        return [
+            {
+                "content": self.config.prompts.answering_system_prompt,
+                "role": "system",
+            },
+            *examples,
+            {
+                "content": self.env_helper.AZURE_OPENAI_SYSTEM_MESSAGE,
+                "role": "system",
+            },
+            *QuestionAnswerTool.clean_chat_history(chat_history),
+            {
+                "content": self.config.prompts.answering_user_prompt.format(
+                    sources=documents,
+                    question=question,
+                ),
+                "role": "user",
+            },
+        ]
 
-    def answer_question(
-        self, question: str, chat_history: list[SourceDocument], **kwargs
-    ):
+    def answer_question(self, question: str, chat_history: list[dict], **kwargs):
         source_documents = Search.get_source_documents(self.search_handler, question)
 
         if self.config.prompts.use_on_your_data_format:
-            answering_prompt, input = self.generate_on_your_data_llm_chain(
+            messages = self.generate_on_your_data_messages(
                 question, chat_history, source_documents
             )
         else:
             warnings.warn(
                 "Azure OpenAI On Your Data prompt format is recommended and should be enabled in the Admin app.",
             )
-            answering_prompt, input = self.generate_llm_chain(
-                question, source_documents
-            )
+            messages = self.generate_messages(question, source_documents)
 
         llm_helper = LLMHelper()
 
-        answer_generator = LLMChain(
-            llm=llm_helper.get_llm(), prompt=answering_prompt, verbose=self.verbose
-        )
-
-        with get_openai_callback() as cb:
-            result = answer_generator(input)
+        response = llm_helper.get_chat_completion(messages)
 
-        answer = result["text"]
+        answer = response.choices[0].message.content
         logger.debug(f"Answer: {answer}")
 
         # Generate Answer Object
         clean_answer = Answer(
             question=question,
             answer=answer,
             source_documents=source_documents,
-            prompt_tokens=cb.prompt_tokens,
-            completion_tokens=cb.completion_tokens,
+            prompt_tokens=response.usage.prompt_tokens,
+            completion_tokens=response.usage.completion_tokens,
         )
         return clean_answer
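
The heart of this change is swapping LangChain's `LLMChain` and prompt-template plumbing for plain role/content message dicts sent through `LLMHelper.get_chat_completion`, with token usage read off the response object instead of `get_openai_callback()`. Below is a minimal, self-contained sketch of that pattern, assuming an Azure OpenAI v1 client underneath; the endpoint, key, deployment name, and prompt strings are placeholders, not values from this repo.

```python
# Minimal sketch of the post-change flow: build plain message dicts and call
# the chat-completions API directly. Placeholders throughout; the repo wraps
# this call in LLMHelper.get_chat_completion.
from openai import AzureOpenAI

client = AzureOpenAI(
    azure_endpoint="https://example.openai.azure.com/",  # placeholder endpoint
    api_key="<key>",  # placeholder credential
    api_version="2024-02-01",
)


def answer_question(question: str, sources_text: str) -> tuple[str, int, int]:
    messages = [
        {
            "role": "system",
            "content": "You are an AI assistant that helps people find information.",
        },
        {
            "role": "user",
            "content": f"## Retrieved Documents\n{sources_text}\n\n## User Question\n{question}",
        },
    ]
    response = client.chat.completions.create(
        model="<deployment-name>",  # placeholder; the repo reads AZURE_OPENAI_MODEL
        messages=messages,
    )
    # Usage now comes from the response itself rather than a LangChain callback.
    return (
        response.choices[0].message.content,
        response.usage.prompt_tokens,
        response.usage.completion_tokens,
    )
```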
@@ -542,11 +542,13 @@ def test_post_makes_correct_call_to_openai_chat_completions_with_documents(
                 },
                 {
                     "content": '## Retrieved Documents\n{"retrieved_documents":[{"[doc1]":{"content":"content"}}]}\n\n## User Question\nuser question',
-                    "role": "user",
+                    "name": "example_user",
+                    "role": "system",
                 },
                 {
                     "content": "answer",
-                    "role": "assistant",
+                    "name": "example_assistant",
+                    "role": "system",
                 },
                 {
                     "content": "You are an AI assistant that helps people find information.",
@@ -559,11 +561,8 @@ def test_post_makes_correct_call_to_openai_chat_completions_with_documents(
                     "role": "user",
                 },
             ],
-            "model": "gpt-3.5-turbo",  # This is hardcoded in LangChain
+            "model": app_config.get("AZURE_OPENAI_MODEL"),
            "max_tokens": int(app_config.get("AZURE_OPENAI_MAX_TOKENS")),
-            "n": 1,
-            "stream": False,
            "temperature": float(app_config.get("AZURE_OPENAI_TEMPERATURE")),
        },
        headers={
            "Accept": "application/json",
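
The test updates above pin down the new few-shot encoding: instead of the real user/assistant turns that LangChain's `FewShotChatMessagePromptTemplate` emitted, the example pair is now sent as two system messages tagged with `name` fields, matching the Azure OpenAI On Your Data example convention. A small sketch of that shape; the helper name and prompt strings are illustrative, not from the repo.

```python
# Illustrative helper showing the message shape the updated tests assert:
# few-shot turns become "system" messages with "name" markers, so the model
# treats them as examples rather than as real conversation history.
def few_shot_messages(example_prompt: str, example_answer: str) -> list[dict]:
    return [
        {"role": "system", "name": "example_user", "content": example_prompt},
        {"role": "system", "name": "example_assistant", "content": example_answer},
    ]


messages = [
    {"role": "system", "content": "answering system prompt"},  # placeholder
    *few_shot_messages(
        '## Retrieved Documents\n{"retrieved_documents":[...]}\n\n## User Question\nuser question',
        "answer",
    ),
    {
        "role": "system",
        "content": "You are an AI assistant that helps people find information.",
    },
    {"role": "user", "content": "user question"},
]
```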
@@ -228,11 +228,13 @@ def test_post_makes_correct_call_to_openai_chat_completions_in_question_answer_t
                 },
                 {
                     "content": '## Retrieved Documents\n{"retrieved_documents":[{"[doc1]":{"content":"Dual Transformer Encoder (DTE) DTE (https://dev.azure.com/TScience/TSciencePublic/_wiki/wikis/TSciencePublic.wiki/82/Dual-Transformer-Encoder) DTE is a general pair-oriented sentence representation learning framework based on transformers. It provides training, inference and evaluation for sentence similarity models. Model Details DTE can be used to train a model for sentence similarity with the following features: - Build upon existing transformer-based text representations (e.g.TNLR, BERT, RoBERTa, BAG-NLR) - Apply smoothness inducing technology to improve the representation robustness - SMART (https://arxiv.org/abs/1911.03437) SMART - Apply NCE (Noise Contrastive Estimation) based similarity learning to speed up training of 100M pairs We use pretrained DTE model"}},{"[doc2]":{"content":"trained on internal data. You can find more details here - Models.md (https://dev.azure.com/TScience/_git/TSciencePublic?path=%2FDualTransformerEncoder%2FMODELS.md&version=GBmaster&_a=preview) Models.md DTE-pretrained for In-context Learning Research suggests that finetuned transformers can be used to retrieve semantically similar exemplars for e.g. KATE (https://arxiv.org/pdf/2101.06804.pdf) KATE . They show that finetuned models esp. tuned on related tasks give the maximum boost to GPT-3 in-context performance. DTE have lot of pretrained models that are trained on intent classification tasks. We can use these model embedding to find natural language utterances which are similar to our test utterances at test time. The steps are: 1. Embed"}},{"[doc3]":{"content":"train and test utterances using DTE model 2. For each test embedding, find K-nearest neighbors. 3. Prefix the prompt with nearest embeddings. The following diagram from the above paper (https://arxiv.org/pdf/2101.06804.pdf) the above paper visualizes this process: DTE-Finetuned This is an extension of DTE-pretrained method where we further finetune the embedding models for prompt crafting task. In summary, we sample random prompts from our training data and use them for GPT-3 inference for the another part of training data. Some prompts work better and lead to right results whereas other prompts lead"}},{"[doc4]":{"content":"to wrong completions. We finetune the model on the downstream task of whether a prompt is good or not based on whether it leads to right or wrong completion. This approach is similar to this paper: Learning To Retrieve Prompts for In-Context Learning (https://arxiv.org/pdf/2112.08633.pdf) this paper: Learning To Retrieve Prompts for In-Context Learning . This method is very general but it may require a lot of data to actually finetune a model to learn how to retrieve examples suitable for the downstream inference model like GPT-3."}}]}\n\n## User Question\nWhat features does the Dual Transformer Encoder (DTE) provide for sentence similarity models and in-context learning?',
-                    "role": "user",
+                    "name": "example_user",
+                    "role": "system",
                 },
                 {
                     "content": "The Dual Transformer Encoder (DTE) is a framework for sentence representation learning that can be used to train, infer, and evaluate sentence similarity models[doc1][doc2]. It builds upon existing transformer-based text representations and applies smoothness inducing technology and Noise Contrastive Estimation for improved robustness and faster training[doc1]. DTE also offers pretrained models for in-context learning, which can be used to find semantically similar natural language utterances[doc2]. These models can be further finetuned for specific tasks, such as prompt crafting, to enhance the performance of downstream inference models like GPT-3[doc2][doc3][doc4]. However, this finetuning may require a significant amount of data[doc3][doc4].",
-                    "role": "assistant",
+                    "name": "example_assistant",
+                    "role": "system",
                 },
                 {
                     "content": "You are an AI assistant that helps people find information.",
@@ -245,11 +247,8 @@ def test_post_makes_correct_call_to_openai_chat_completions_in_question_answer_t
                     "role": "user",
                 },
             ],
-            "model": "gpt-3.5-turbo",  # This is hardcoded in LangChain
+            "model": app_config.get("AZURE_OPENAI_MODEL"),
            "max_tokens": int(app_config.get("AZURE_OPENAI_MAX_TOKENS")),
-            "n": 1,
-            "stream": False,
            "temperature": float(app_config.get("AZURE_OPENAI_TEMPERATURE")),
        },
        headers={
            "Accept": "application/json",
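
Both test files also capture the request-parameter changes: the `model` now comes from configuration rather than LangChain's hardcoded `gpt-3.5-turbo` default, and `n`/`stream` disappear from the payload because the bare OpenAI client omits optional parameters it is not given. A sketch of the expected body under those assumptions; `app_config` here stands in for the test fixture of the same name, and all values are placeholders.

```python
# Sketch of the request payload shape the updated tests assert against the
# mocked chat-completions endpoint. Values are placeholders.
app_config = {
    "AZURE_OPENAI_MODEL": "my-gpt4-deployment",
    "AZURE_OPENAI_MAX_TOKENS": "1000",
    "AZURE_OPENAI_TEMPERATURE": "0",
}

expected_body = {
    "messages": [{"role": "user", "content": "user question"}],  # abbreviated
    # Model is read from configuration; LangChain always sent "gpt-3.5-turbo".
    "model": app_config.get("AZURE_OPENAI_MODEL"),
    "max_tokens": int(app_config.get("AZURE_OPENAI_MAX_TOKENS")),
    # "n": 1 and "stream": False are gone: the plain client only serializes
    # parameters that are explicitly provided.
    "temperature": float(app_config.get("AZURE_OPENAI_TEMPERATURE")),
}
```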