Sales Support Model (SSM) for the LangChain project.
See: https://python.langchain.com/docs/modules/model_io/llms/llm_caching
https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf
+ https://python.langchain.com/docs/integrations/retrievers/pinecone_hybrid_search
"""

import glob
import os
+ import textwrap
from typing import List  # ClassVar

# pinecone integration
from langchain.globals import set_llm_cache
from langchain.llms.openai import OpenAI
from langchain.prompts import PromptTemplate
+ from langchain.retrievers import PineconeHybridSearchRetriever
from langchain.schema import HumanMessage, SystemMessage
- from langchain.text_splitter import Document, RecursiveCharacterTextSplitter
+ from langchain.text_splitter import Document
from langchain.vectorstores.pinecone import Pinecone
+ from pinecone_text.sparse import BM25Encoder

# this project
from models.const import Credentials

set_llm_cache(InMemoryCache())
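
# Editorial sketch (not part of the original commit): with an InMemoryCache
# installed via set_llm_cache(), a second LLM call with a byte-identical prompt
# is answered from the in-process cache instead of the OpenAI API, e.g.:
#
#   llm = OpenAI()
#   llm("What is 2 + 2?")  # first call: hits the API and populates the cache
#   llm("What is 2 + 2?")  # identical prompt: served from the cache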


+ class TextSplitter:
+     """
+     Custom text splitter that adds metadata to the Document object,
+     which is required by PineconeHybridSearchRetriever.
+     """
+
+     # ...
+
+     def create_documents(self, texts):
+         """Create documents"""
+         documents = []
+         for text in texts:
+             # Create a Document object with the text and metadata
+             document = Document(page_content=text, metadata={"context": text})
+             documents.append(document)
+         return documents
+
+
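+ # Editorial sketch (illustrative, not part of the original commit): each input
+ # text is mirrored into the "context" metadata field, which is where
+ # PineconeHybridSearchRetriever expects to find the document text at query time:
+ #
+ #   docs = TextSplitter().create_documents(["hello world"])
+ #   assert docs[0].page_content == "hello world"
+ #   assert docs[0].metadata == {"context": "hello world"}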
class SalesSupportModel:
    """Sales Support Model (SSM)."""

@@ -60,15 +82,14 @@ class SalesSupportModel:
    )

    # embeddings
-     text_splitter = RecursiveCharacterTextSplitter(
-         chunk_size=100,
-         chunk_overlap=0,
-     )
-     openai_embedding = OpenAIEmbeddings()
-     pinecone_index = Pinecone.from_existing_index(
-         Credentials.PINECONE_INDEX_NAME,
-         embedding=openai_embedding,
+     openai_embeddings = OpenAIEmbeddings(
+         api_key=Credentials.OPENAI_API_KEY, organization=Credentials.OPENAI_API_ORGANIZATION
    )
+     pinecone_index = pinecone.Index(index_name=Credentials.PINECONE_INDEX_NAME)
+     vector_store = Pinecone(index=pinecone_index, embedding=openai_embeddings, text_key="lc_id")
+
+     text_splitter = TextSplitter()
+     bm25_encoder = BM25Encoder().default()

    def cached_chat_request(self, system_message: str, human_message: str) -> SystemMessage:
        """Cached chat request."""
@@ -86,24 +107,54 @@ def prompt_with_template(self, prompt: PromptTemplate, concept: str, model: str
        retval = llm(prompt.format(concept=concept))
        return retval

-     # FIX NOTE: DEPRECATED
    def split_text(self, text: str) -> List[Document]:
-         """Split text."""
-         text_splitter = RecursiveCharacterTextSplitter(
-             chunk_size=100,
-             chunk_overlap=0,
-         )
-         retval = text_splitter.create_documents([text])
+         """Split text. Leaving this here for now, since it exposes the return type."""
+         retval = self.text_splitter.create_documents([text])
        return retval

+     def fit_tf_idf_values(self, corpus: List[str]):
+         """Fit TF-IDF values.
+         1. Fit the BM25 encoder on the corpus
+         2. Persist the fitted values to a JSON file
+         3. Reload the encoder from the persisted values
+         """
+         # fit tf-idf values on the corpus passed by the caller
+         self.bm25_encoder.fit(corpus)
+
+         # persist the values to a json file, then reload the encoder from it
+         self.bm25_encoder.dump("bm25_values.json")
+         self.bm25_encoder = BM25Encoder().load("bm25_values.json")
+
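+     # Editorial sketch (hypothetical usage, not part of the original commit):
+     #
+     #   ssm = SalesSupportModel()
+     #   ssm.fit_tf_idf_values(corpus=["page one text", "page two text"])
+     #   # bm25_values.json now holds the fitted tf-idf statistics and can be
+     #   # reloaded in another process with BM25Encoder().load("bm25_values.json")
+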
    def load(self, filepath: str):
        """
        Embed PDF.
        1. Load PDF document text data
        2. Split into pages
        3. Embed each page
        4. Store in Pinecone
+
+         Note: it's important to make sure that the "context" field that holds the document
+         text in the metadata is not indexed. Currently you need to explicitly specify the
+         fields you do want to index. For more information, check out
+         https://docs.pinecone.io/docs/manage-indexes#selective-metadata-indexing
        """
+         try:
+             print("Deleting index...")
+             pinecone.delete_index(Credentials.PINECONE_INDEX_NAME)
+         except pinecone.exceptions.PineconeException:
+             print("Index does not exist. Continuing...")
+
+         metadata_config = {
+             "indexed": ["lc_id", "lc_type"],
+             "context": ["lc_text"],
+         }
+         print("Creating index. This may take a few minutes...")
+         pinecone.create_index(
+             Credentials.PINECONE_INDEX_NAME, dimension=1536, metric="dotproduct", metadata_config=metadata_config
+         )
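+         # Editorial note (not in the original commit): dimension=1536 matches the
+         # vectors produced by OpenAI's text-embedding-ada-002 model, and the
+         # "dotproduct" metric is the one Pinecone supports for hybrid
+         # (dense + sparse) queries; other metrics reject sparse values.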

        pdf_files = glob.glob(os.path.join(filepath, "*.pdf"))
        i = 0
@@ -117,12 +168,10 @@ def load(self, filepath: str):
            for doc in docs:
                k += 1
                print(k * "-", end="\r")
-                 texts_splitter_results = self.text_splitter.create_documents([doc.page_content])
-                 self.pinecone_index.from_existing_index(
-                     index_name=Credentials.PINECONE_INDEX_NAME,
-                     embedding=self.openai_embedding,
-                     text_key=texts_splitter_results,
-                 )
+                 documents = self.text_splitter.create_documents([doc.page_content])
+                 document_texts = [doc.page_content for doc in documents]
+                 embeddings = self.openai_embeddings.embed_documents(document_texts)
+                 self.vector_store.add_documents(documents=documents, embeddings=embeddings)
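+                 # Editorial note (assumption): embed_documents() returns one
+                 # embedding vector per page text, and add_documents() upserts the
+                 # pages into the index with their text carried in the metadata.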

        print("Finished loading PDFs")

@@ -133,26 +182,42 @@ def rag(self, prompt: str):
        from storage using a Retriever.
        2. Generate: A ChatModel / LLM produces an answer using a prompt that includes
        the question and the retrieved data
-         """

-         # pylint: disable=unused-variable
-         def format_docs(docs):
-             """Format docs."""
-             return "\n\n".join(doc.page_content for doc in docs)
+         To prompt OpenAI's GPT-3 model to consider the embeddings from the Pinecone
+         vector database, you would typically need to convert the embeddings back
+         into a format that GPT-3 can understand, such as text. However, GPT-3 does
+         not natively support direct input of embeddings.

-         retriever = self.pinecone_index.as_retriever()
-
-         # Use the retriever to get relevant documents
+         The typical workflow is to use the embeddings to retrieve relevant documents,
+         and then use the text of these documents as part of the prompt for GPT-3.
+         """
+         retriever = PineconeHybridSearchRetriever(
+             embeddings=self.openai_embeddings, sparse_encoder=self.bm25_encoder, index=self.pinecone_index
+         )
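+         # Editorial note (not in the original commit): the hybrid retriever scores
+         # each match as a convex combination of dense (OpenAI embedding) similarity
+         # and sparse (BM25) term overlap; PineconeHybridSearchRetriever exposes this
+         # through its `alpha` parameter (alpha=1 -> pure dense, alpha=0 -> pure sparse).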
        documents = retriever.get_relevant_documents(query=prompt)
        print(f"Retrieved {len(documents)} related documents from Pinecone")

-         # Generate a prompt from the retrieved documents
-         prompt += " ".join(doc.page_content for doc in documents)
-         print(f"Prompt contains {len(prompt.split())} words")
-         print("Prompt:", prompt)
-         print(doc for doc in documents)
+         # Extract the text from the documents
+         document_texts = [doc.page_content for doc in documents]
+         leader = textwrap.dedent(
+             """\
+             You can assume that the following is true,
+             and you should attempt to incorporate these facts
+             in your response:
+             """
+         )
+
+         # Create a prompt that includes the document texts
+         prompt_with_relevant_documents = f"{prompt + leader} {'. '.join(document_texts)}"
+
+         print(f"Prompt contains {len(prompt_with_relevant_documents.split())} words")
+         print("Prompt:", prompt_with_relevant_documents)

        # Get a response from the GPT-3.5-turbo model
-         response = self.cached_chat_request(system_message="You are a helpful assistant.", human_message=prompt)
+         response = self.cached_chat_request(
+             system_message="You are a helpful assistant.", human_message=prompt_with_relevant_documents
+         )

+         print("Response:")
+         print("------------------------------------------------------")
        return response