# -*- coding: utf-8 -*-
# pylint: disable=too-few-public-methods
"""
Hybrid Search Retriever. A class that combines the following:

- OpenAI prompting and ChatModel

https://python.langchain.com/docs/integrations/retrievers/pinecone_hybrid_search
"""
18
17
19
- # document loading
20
- import glob
21
-
22
18
# general purpose imports
23
19
import logging
24
- import os
25
20
import textwrap
26
21
from typing import Union
27
22
28
23
# pinecone integration
29
- import pinecone
30
24
from langchain .cache import InMemoryCache
31
25
from langchain .chat_models import ChatOpenAI
32
- from langchain .document_loaders import PyPDFLoader
33
26
34
27
# embedding
35
- from langchain .embeddings import OpenAIEmbeddings
36
28
from langchain .globals import set_llm_cache
37
29
38
30
# prompting and chat
42
34
# hybrid search capability
43
35
from langchain .retrievers import PineconeHybridSearchRetriever
44
36
from langchain .schema import BaseMessage , HumanMessage , SystemMessage
45
- from langchain .text_splitter import Document
46
- from langchain .vectorstores .pinecone import Pinecone
47
37
from pinecone_text .sparse import BM25Encoder
48
38
49
39
# this project
50
40
from models .const import Config , Credentials
41
+ from models .pinecone import PineconeIndex
51
42
52
43
53
44
###############################################################################
# Module configuration: log verbosity follows the project-wide DEBUG_MODE flag.
logging.basicConfig(level=logging.DEBUG if Config.DEBUG_MODE else logging.INFO)
59
class TextSplitter:
    """
    Custom text splitter that attaches metadata to each Document object,
    as required by PineconeHybridSearchRetriever.
    """

    def create_documents(self, texts):
        """Wrap every text in a Document whose metadata carries the raw text under "context"."""
        return [Document(page_content=text, metadata={"context": text}) for text in texts]
73
-
74
-
75
50
class HybridSearchRetriever:
    """Hybrid Search Retriever"""

    # Backing fields for the lazy read-only properties defined below.
    _chat: ChatOpenAI = None
    _openai_embeddings: OpenAIEmbeddings = None
    _pinecone_index: pinecone.Index = None
    _vector_store: Pinecone = None
    _text_splitter: TextSplitter = None
    _b25_encoder: BM25Encoder = None
    _pinecone: PineconeIndex = None
    _retriever: PineconeHybridSearchRetriever = None
84
57
85
58
def __init__ (self ):
86
59
"""Constructor"""
87
- pinecone .init (api_key = Credentials .PINECONE_API_KEY , environment = Config .PINECONE_ENVIRONMENT )
88
60
set_llm_cache (InMemoryCache ())
89
61
62
+ @property
63
+ def pinecone (self ) -> PineconeIndex :
64
+ """PineconeIndex lazy read-only property."""
65
+ if self ._pinecone is None :
66
+ self ._pinecone = PineconeIndex ()
67
+ return self ._pinecone
68
+
90
69
# prompting wrapper
91
70
@property
92
71
def chat (self ) -> ChatOpenAI :
@@ -102,48 +81,22 @@ def chat(self) -> ChatOpenAI:
102
81
)
103
82
return self ._chat
104
83
105
- # embeddings
106
- @property
107
- def openai_embeddings (self ) -> OpenAIEmbeddings :
108
- """OpenAIEmbeddings lazy read-only property."""
109
- if self ._openai_embeddings is None :
110
- self ._openai_embeddings = OpenAIEmbeddings (
111
- api_key = Credentials .OPENAI_API_KEY , organization = Credentials .OPENAI_API_ORGANIZATION
112
- )
113
- return self ._openai_embeddings
114
-
115
- @property
116
- def pinecone_index (self ) -> pinecone .Index :
117
- """pinecone.Index lazy read-only property."""
118
- if self ._pinecone_index is None :
119
- self ._pinecone_index = pinecone .Index (index_name = Config .PINECONE_INDEX_NAME )
120
- return self ._pinecone_index
121
-
122
- @property
123
- def vector_store (self ) -> Pinecone :
124
- """Pinecone lazy read-only property."""
125
- if self ._vector_store is None :
126
- self ._vector_store = Pinecone (
127
- index = self .pinecone_index ,
128
- embedding = self .openai_embeddings ,
129
- text_key = Config .PINECONE_VECTORSTORE_TEXT_KEY ,
130
- )
131
- return self ._vector_store
132
-
133
- @property
134
- def text_splitter (self ) -> TextSplitter :
135
- """TextSplitter lazy read-only property."""
136
- if self ._text_splitter is None :
137
- self ._text_splitter = TextSplitter ()
138
- return self ._text_splitter
139
-
140
84
@property
141
85
def bm25_encoder (self ) -> BM25Encoder :
142
86
"""BM25Encoder lazy read-only property."""
143
87
if self ._b25_encoder is None :
144
88
self ._b25_encoder = BM25Encoder ().default ()
145
89
return self ._b25_encoder
146
90
91
+ @property
92
+ def retriever (self ) -> PineconeHybridSearchRetriever :
93
+ """PineconeHybridSearchRetriever lazy read-only property."""
94
+ if self ._retriever is None :
95
+ self ._retriever = PineconeHybridSearchRetriever (
96
+ embeddings = self .pinecone .openai_embeddings , sparse_encoder = self .bm25_encoder , index = self .pinecone .index
97
+ )
98
+ return self ._retriever
99
+
147
100
def cached_chat_request (
148
101
self , system_message : Union [str , SystemMessage ], human_message : Union [str , HumanMessage ]
149
102
) -> BaseMessage :
@@ -169,54 +122,8 @@ def prompt_with_template(
169
122
return retval
170
123
171
124
def load (self , filepath : str ):
172
- """
173
- Embed PDF.
174
- 1. Load PDF document text data
175
- 2. Split into pages
176
- 3. Embed each page
177
- 4. Store in Pinecone
178
-
179
- Note: it's important to make sure that the "context" field that holds the document text
180
- in the metadata is not indexed. Currently you need to specify explicitly the fields you
181
- do want to index. For more information checkout
182
- https://docs.pinecone.io/docs/manage-indexes#selective-metadata-indexing
183
- """
184
- try :
185
- logging .info ("Deleting index..." )
186
- pinecone .delete_index (Config .PINECONE_INDEX_NAME )
187
- except pinecone .exceptions .PineconeException :
188
- logging .info ("Index does not exist. Continuing..." )
189
-
190
- metadata_config = {
191
- "indexed" : [Config .PINECONE_VECTORSTORE_TEXT_KEY , "lc_type" ],
192
- "context" : ["lc_text" ],
193
- }
194
- logging .info ("Creating index. This may take a few minutes..." )
195
- pinecone .create_index (
196
- Config .PINECONE_INDEX_NAME ,
197
- dimension = Config .PINECONE_DIMENSIONS ,
198
- metric = Config .PINECONE_METRIC ,
199
- metadata_config = metadata_config ,
200
- )
201
-
202
- pdf_files = glob .glob (os .path .join (filepath , "*.pdf" ))
203
- i = 0
204
- for pdf_file in pdf_files :
205
- i += 1
206
- j = len (pdf_files )
207
- logging .info ("Loading PDF %s of %s: %s" , i , j , pdf_file )
208
- loader = PyPDFLoader (file_path = pdf_file )
209
- docs = loader .load ()
210
- k = 0
211
- for doc in docs :
212
- k += 1
213
- logging .info (k * "-" , end = "\r " )
214
- documents = self .text_splitter .create_documents ([doc .page_content ])
215
- document_texts = [doc .page_content for doc in documents ]
216
- embeddings = self .openai_embeddings .embed_documents (document_texts )
217
- self .vector_store .add_documents (documents = documents , embeddings = embeddings )
218
-
219
- logging .info ("Finished loading PDFs" )
125
+ """Pdf loader."""
126
+ self .pinecone .pdf_loader (filepath = filepath )
220
127
221
128
def rag (self , human_message : Union [str , HumanMessage ]):
222
129
"""
@@ -241,10 +148,8 @@ def rag(self, human_message: Union[str, HumanMessage]):
241
148
# ---------------------------------------------------------------------
242
149
# 1.) Retrieve relevant documents from Pinecone vector database
243
150
# ---------------------------------------------------------------------
244
- retriever = PineconeHybridSearchRetriever (
245
- embeddings = self .openai_embeddings , sparse_encoder = self .bm25_encoder , index = self .pinecone_index
246
- )
247
- documents = retriever .get_relevant_documents (query = human_message .content )
151
+ # documents = self.retriever.get_relevant_documents(query=human_message.content)
152
+ documents = self .pinecone .vector_store .similarity_search (query = human_message .content )
248
153
249
154
# Extract the text from the documents
250
155
document_texts = [doc .page_content for doc in documents ]
@@ -261,14 +166,15 @@ def rag(self, human_message: Union[str, HumanMessage]):
261
166
# finished with hybrid search setup
262
167
# ---------------------------------------------------------------------
263
168
264
- # 2.) get a response from the chat model
265
- response = self .cached_chat_request (system_message = system_message , human_message = human_message )
266
-
267
169
logging .debug ("------------------------------------------------------" )
268
170
logging .debug ("rag() Retrieval Augmented Generation prompt" )
269
171
logging .debug ("Diagnostic information:" )
270
172
logging .debug (" Retrieved %i related documents from Pinecone" , len (documents ))
271
173
logging .debug (" System messages contains %i words" , len (system_message .content .split ()))
272
174
logging .debug (" Prompt: %s" , system_message .content )
273
175
logging .debug ("------------------------------------------------------" )
176
+
177
+ # 2.) get a response from the chat model
178
+ response = self .cached_chat_request (system_message = system_message , human_message = human_message )
179
+
274
180
return response .content
0 commit comments