Merge pull request #8 from lpm0073/next

lpm0073 · web-flow · commit 789455a8301a · 2023-12-01T13:22:47.000-06:00
switch vector store to sparse text using BM25Encoder
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,15 +1,13 @@
 # [1.1.0](https://github.com/lpm0073/netec-llm/compare/v1.0.0...v1.1.0) (2023-12-01)
 
-
 ### Bug Fixes
 
-* fix load problem with existing index ([62cd18f](https://github.com/lpm0073/netec-llm/commit/62cd18f8088873a794ec363c4e18770dfdc41ea5))
-
+- fix load problem with existing index ([62cd18f](https://github.com/lpm0073/netec-llm/commit/62cd18f8088873a794ec363c4e18770dfdc41ea5))
 
 ### Features
 
-* perfect load(). revert rag() to openai only calls ([8de793d](https://github.com/lpm0073/netec-llm/commit/8de793dcca77ec23f09e68ca9e8dba7f64623b3c))
-* ssm.rag() w load, split, embed, store ([2335d22](https://github.com/lpm0073/netec-llm/commit/2335d22c5fd9092642ff1eb67a34fbcd712d7f9b))
+- perfect load(). revert rag() to openai only calls ([8de793d](https://github.com/lpm0073/netec-llm/commit/8de793dcca77ec23f09e68ca9e8dba7f64623b3c))
+- ssm.rag() w load, split, embed, store ([2335d22](https://github.com/lpm0073/netec-llm/commit/2335d22c5fd9092642ff1eb67a34fbcd712d7f9b))
 
 # 1.0.0 (2023-11-30)
 
diff --git a/README.md b/README.md
@@ -54,6 +54,16 @@ export PINECONE_API_KEY=SET-ME-PLEASE
 export PINECONE_ENVIRONMENT=SET-ME-PLEASE
 ```
 
+### Pinecone setup
+
+You'll need to manually create an index with the following characteristics:
+
+- Environment: gcp-starter
+- Index name: netec-rag
+- Metric: dotproduct
+- Dimensions: 1536
+- Pod Type: starter
+
 ## Contributing
 
 This project uses a mostly automated pull request and unit testing process. See the resources in .github for additional details. You additionally should ensure that pre-commit is installed and working correctly on your dev machine by running the following command from the root of the repo.
diff --git a/models/__version__.py b/models/__version__.py
@@ -1,2 +1,2 @@
 # -*- coding: utf-8 -*-
-__version__ = "1.1.0"
+__version__ = "1.1.1"
diff --git a/models/ssm.py b/models/ssm.py
@@ -4,10 +4,12 @@
 Sales Support Model (SSM) for the LangChain project.
 See: https://python.langchain.com/docs/modules/model_io/llms/llm_caching
      https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf
+     https://python.langchain.com/docs/integrations/retrievers/pinecone_hybrid_search
 """
 
 import glob
 import os
+import textwrap
 from typing import List  # ClassVar
 
 # pinecone integration
@@ -27,9 +29,11 @@
 from langchain.globals import set_llm_cache
 from langchain.llms.openai import OpenAI
 from langchain.prompts import PromptTemplate
+from langchain.retrievers import PineconeHybridSearchRetriever
 from langchain.schema import HumanMessage, SystemMessage
-from langchain.text_splitter import Document, RecursiveCharacterTextSplitter
+from langchain.text_splitter import Document
 from langchain.vectorstores.pinecone import Pinecone
+from pinecone_text.sparse import BM25Encoder
 
 # this project
 from models.const import Credentials
@@ -46,6 +50,24 @@
 set_llm_cache(InMemoryCache())
 
 
+class TextSplitter:
+    """
+    Custom text splitter that add metadata to the Document object
+    which is required by PineconeHybridSearchRetriever.
+    """
+
+    # ...
+
+    def create_documents(self, texts):
+        """Create documents"""
+        documents = []
+        for text in texts:
+            # Create a Document object with the text and metadata
+            document = Document(page_content=text, metadata={"context": text})
+            documents.append(document)
+        return documents
+
+
 class SalesSupportModel:
     """Sales Support Model (SSM)."""
 
@@ -60,15 +82,14 @@ class SalesSupportModel:
     )
 
     # embeddings
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=100,
-        chunk_overlap=0,
-    )
-    openai_embedding = OpenAIEmbeddings()
-    pinecone_index = Pinecone.from_existing_index(
-        Credentials.PINECONE_INDEX_NAME,
-        embedding=openai_embedding,
+    openai_embeddings = OpenAIEmbeddings(
+        api_key=Credentials.OPENAI_API_KEY, organization=Credentials.OPENAI_API_ORGANIZATION
     )
+    pinecone_index = pinecone.Index(index_name=Credentials.PINECONE_INDEX_NAME)
+    vector_store = Pinecone(index=pinecone_index, embedding=openai_embeddings, text_key="lc_id")
+
+    text_splitter = TextSplitter()
+    bm25_encoder = BM25Encoder().default()
 
     def cached_chat_request(self, system_message: str, human_message: str) -> SystemMessage:
         """Cached chat request."""
@@ -86,24 +107,54 @@ def prompt_with_template(self, prompt: PromptTemplate, concept: str, model: str
         retval = llm(prompt.format(concept=concept))
         return retval
 
-    # FIX NOTE: DEPRECATED
     def split_text(self, text: str) -> List[Document]:
-        """Split text."""
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=100,
-            chunk_overlap=0,
-        )
-        retval = text_splitter.create_documents([text])
+        """Split text. Leaving this here for now, since it exposes the return type."""
+        retval = self.text_splitter.create_documents([text])
         return retval
 
+    def fit_tf_idf_values(self, corpus: List[str]):
+        """Fit TF-IDF values.
+        1. Fit the BM25 encoder on the corpus
+        2. Encode the corpus
+        3. Store the encoded corpus in Pinecone
+        """
+        corpus = ["foo", "bar", "world", "hello"]
+
+        # fit tf-idf values on your corpus
+        self.bm25_encoder.fit(corpus)
+
+        # persist the values to a json file
+        self.bm25_encoder.dump("bm25_values.json")
+        self.bm25_encoder = BM25Encoder().load("bm25_values.json")
+        self.bm25_encoder.fit(corpus)
+
     def load(self, filepath: str):
         """
         Embed PDF.
         1. Load PDF document text data
         2. Split into pages
         3. Embed each page
         4. Store in Pinecone
+
+        Note: it's important to make sure that the "context" field that holds the document text
+        in the metadata is not indexed. Currently you need to specify explicitly the fields you
+        do want to index. For more information checkout
+        https://docs.pinecone.io/docs/manage-indexes#selective-metadata-indexing
         """
+        try:
+            print("Deleting index...")
+            pinecone.delete_index(Credentials.PINECONE_INDEX_NAME)
+        except pinecone.exceptions.PineconeException:
+            print("Index does not exist. Continuing...")
+
+        metadata_config = {
+            "indexed": ["lc_id", "lc_type"],
+            "context": ["lc_text"],
+        }
+        print("Creating index. This may take a few minutes...")
+        pinecone.create_index(
+            Credentials.PINECONE_INDEX_NAME, dimension=1536, metric="dotproduct", metadata_config=metadata_config
+        )
 
         pdf_files = glob.glob(os.path.join(filepath, "*.pdf"))
         i = 0
@@ -117,12 +168,10 @@ def load(self, filepath: str):
             for doc in docs:
                 k += 1
                 print(k * "-", end="\r")
-                texts_splitter_results = self.text_splitter.create_documents([doc.page_content])
-                self.pinecone_index.from_existing_index(
-                    index_name=Credentials.PINECONE_INDEX_NAME,
-                    embedding=self.openai_embedding,
-                    text_key=texts_splitter_results,
-                )
+                documents = self.text_splitter.create_documents([doc.page_content])
+                document_texts = [doc.page_content for doc in documents]
+                embeddings = self.openai_embeddings.embed_documents(document_texts)
+                self.vector_store.add_documents(documents=documents, embeddings=embeddings)
 
         print("Finished loading PDFs")
 
@@ -133,26 +182,42 @@ def rag(self, prompt: str):
            from storage using a Retriever.
         2. Generate: A ChatModel / LLM produces an answer using a prompt that includes
            the question and the retrieved data
-        """
 
-        # pylint: disable=unused-variable
-        def format_docs(docs):
-            """Format docs."""
-            return "\n\n".join(doc.page_content for doc in docs)
+        To prompt OpenAI's GPT-3 model to consider the embeddings from the Pinecone
+        vector database, you would typically need to convert the embeddings back
+        into a format that GPT-3 can understand, such as text. However, GPT-3 does
+        not natively support direct input of embeddings.
 
-        retriever = self.pinecone_index.as_retriever()
-
-        # Use the retriever to get relevant documents
+        The typical workflow is to use the embeddings to retrieve relevant documents,
+        and then use the text of these documents as part of the prompt for GPT-3.
+        """
+        retriever = PineconeHybridSearchRetriever(
+            embeddings=self.openai_embeddings, sparse_encoder=self.bm25_encoder, index=self.pinecone_index
+        )
         documents = retriever.get_relevant_documents(query=prompt)
         print(f"Retrieved {len(documents)} related documents from Pinecone")
 
-        # Generate a prompt from the retrieved documents
-        prompt += " ".join(doc.page_content for doc in documents)
-        print(f"Prompt contains {len(prompt.split())} words")
-        print("Prompt:", prompt)
-        print(doc for doc in documents)
+        # Extract the text from the documents
+        document_texts = [doc.page_content for doc in documents]
+        leader = textwrap.dedent(
+            """\
+            You can assume that the following is true,
+            and you should attempt to incorporate these facts
+            in your response:
+        """
+        )
+
+        # Create a prompt that includes the document texts
+        prompt_with_relevant_documents = f"{prompt + leader} {'. '.join(document_texts)}"
+
+        print(f"Prompt contains {len(prompt_with_relevant_documents.split())} words")
+        print("Prompt:", prompt_with_relevant_documents)
 
         # Get a response from the GPT-3.5-turbo model
-        response = self.cached_chat_request(system_message="You are a helpful assistant.", human_message=prompt)
+        response = self.cached_chat_request(
+            system_message="You are a helpful assistant.", human_message=prompt_with_relevant_documents
+        )
 
+        print("Response:")
+        print("------------------------------------------------------")
         return response
diff --git a/models/tests/test_openai.py b/models/tests/test_openai.py
@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+# flake8: noqa: F401
+# pylint: disable=too-few-public-methods
+"""
+Test integrity of base class.
+"""
+import pytest  # pylint: disable=unused-import
+
+from ..ssm import SalesSupportModel
+
+
+class TestOpenAI:
+    """Test SalesSupportModel class."""
+
+    def test_03_test_openai_connectivity(self):
+        """Ensure that we have connectivity to OpenAI."""
+
+        ssm = SalesSupportModel()
+        retval = ssm.cached_chat_request(
+            "your are a helpful assistant", "please return the value 'CORRECT' in all upper case."
+        )
+        assert retval == "CORRECT"
diff --git a/models/tests/test_pinecone.py b/models/tests/test_pinecone.py
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+# flake8: noqa: F401
+"""
+Test integrity of base class.
+"""
+
+import pinecone
+import pytest  # pylint: disable=unused-import
+from langchain.embeddings import OpenAIEmbeddings
+from langchain.vectorstores.pinecone import Pinecone
+
+from ..const import Credentials
+
+
+class TestPinecone:
+    """Test SalesSupportModel class."""
+
+    def test_01_test_pinecone_connectivity(self):
+        """Ensure that we have connectivity to Pinecone."""
+        # pylint: disable=broad-except
+        try:
+            pinecone.init(api_key=Credentials.PINECONE_API_KEY, environment=Credentials.PINECONE_ENVIRONMENT)
+        except Exception as e:
+            assert False, f"pinecone.init() failed with exception: {e}"
+
+    def test_02_test_pinecone_index(self):
+        """Ensure that the Pinecone index exists and that we can connect to it."""
+        pinecone.init(api_key=Credentials.PINECONE_API_KEY, environment=Credentials.PINECONE_ENVIRONMENT)
+        openai_embedding = OpenAIEmbeddings()
+
+        # pylint: disable=broad-except
+        try:
+            Pinecone.from_existing_index(
+                Credentials.PINECONE_INDEX_NAME,
+                embedding=openai_embedding,
+            )
+        except Exception as e:
+            assert (
+                False
+            ), f"Pinecone initialization of index {Credentials.PINECONE_INDEX_NAME,} failed with exception: {e}"
diff --git a/models/tests/test_prompts.py b/models/tests/test_prompts.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
 # flake8: noqa: F401
-# pylint: disable=too-few-public-methods
 """
 Test integrity of base class.
 """
diff --git a/models/tests/test_ssm.py b/models/tests/test_ssm.py
@@ -1,18 +1,14 @@
 # -*- coding: utf-8 -*-
 # flake8: noqa: F401
-# pylint: disable=too-few-public-methods
 """
 Test integrity of base class.
 """
-import pinecone
 import pytest  # pylint: disable=unused-import
 from langchain.chat_models import ChatOpenAI
 from langchain.embeddings import OpenAIEmbeddings
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.vectorstores.pinecone import Pinecone
+from pinecone import Index
 
-from ..const import Credentials
-from ..ssm import SalesSupportModel
+from models.ssm import SalesSupportModel, TextSplitter
 
 
 class TestSalesSupportModel:
@@ -21,30 +17,17 @@ class TestSalesSupportModel:
     def test_01_basic(self):
         """Ensure that we can instantiate the class."""
 
-        SalesSupportModel()
+        # pylint: disable=broad-except
+        try:
+            SalesSupportModel()
+        except Exception as e:
+            assert False, f"initialization of SalesSupportModel() failed with exception: {e}"
 
     def test_02_class_aatribute_types(self):
         """ensure that class attributes are of the correct type"""
 
         ssm = SalesSupportModel()
         assert isinstance(ssm.chat, ChatOpenAI)
-        assert isinstance(ssm.pinecone_index, Pinecone)
-        assert isinstance(ssm.text_splitter, RecursiveCharacterTextSplitter)
-        assert isinstance(ssm.openai_embedding, OpenAIEmbeddings)
-
-    def test_03_test_openai_connectivity(self):
-        """Ensure that we have connectivity to OpenAI."""
-
-        ssm = SalesSupportModel()
-        retval = ssm.cached_chat_request(
-            "your are a helpful assistant", "please return the value 'CORRECT' in all upper case."
-        )
-        assert retval == "CORRECT"
-
-    def test_04_test_pinecone_connectivity(self):
-        """Ensure that we have connectivity to Pinecone."""
-        # pylint: disable=broad-except
-        try:
-            pinecone.init(api_key=Credentials.PINECONE_API_KEY, environment=Credentials.PINECONE_ENVIRONMENT)
-        except Exception as e:
-            assert False, f"pinecone.init() failed with exception: {e}"
+        assert isinstance(ssm.pinecone_index, Index)
+        assert isinstance(ssm.text_splitter, TextSplitter)
+        assert isinstance(ssm.openai_embeddings, OpenAIEmbeddings)
diff --git a/requirements.txt b/requirements.txt
@@ -23,6 +23,7 @@ langchain==0.0.343
 langchainhub==0.1.14
 openai==1.3.5
 pinecone-client==2.2.4
+pinecone-text==0.7.0
 pydantic==2.5.2
 pypdf==3.17.1
 python-dotenv==1.0.0

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`# -- coding: utf-8 --`
`2`		`-__version__ = "1.1.0"`
	`2`	`+__version__ = "1.1.1"`