1
1
# -*- coding: utf-8 -*-
2
- # pylint: disable=too-few-public-methods
3
2
"""
4
3
Hybrid Search Retriever. A class that combines the following:
5
4
- OpenAI prompting and ChatModel
16
15
https://python.langchain.com/docs/integrations/retrievers/pinecone_hybrid_search
17
16
"""
18
17
19
- # document loading
20
- import glob
21
-
22
18
# general purpose imports
23
19
import logging
24
- import os
25
20
import textwrap
26
21
from typing import Union
27
22
28
23
# pinecone integration
29
24
import pinecone
30
25
from langchain .cache import InMemoryCache
31
26
from langchain .chat_models import ChatOpenAI
32
- from langchain .document_loaders import PyPDFLoader
33
27
34
28
# embedding
35
29
from langchain .embeddings import OpenAIEmbeddings
42
36
# hybrid search capability
43
37
from langchain .retrievers import PineconeHybridSearchRetriever
44
38
from langchain .schema import BaseMessage , HumanMessage , SystemMessage
45
- from langchain .text_splitter import Document
46
39
from langchain .vectorstores .pinecone import Pinecone
47
40
from pinecone_text .sparse import BM25Encoder
48
41
49
42
# this project
50
43
from models .const import Config , Credentials
44
+ from models .pinecone import PineConeIndex , TextSplitter
51
45
52
46
53
47
###############################################################################
56
50
logging .basicConfig (level = logging .DEBUG if Config .DEBUG_MODE else logging .INFO )
57
51
58
52
59
- class TextSplitter :
60
- """
61
- Custom text splitter that adds metadata to the Document object
62
- which is required by PineconeHybridSearchRetriever.
63
- """
64
-
65
- def create_documents (self , texts ):
66
- """Create documents"""
67
- documents = []
68
- for text in texts :
69
- # Create a Document object with the text and metadata
70
- document = Document (page_content = text , metadata = {"context" : text })
71
- documents .append (document )
72
- return documents
73
-
74
-
75
53
class HybridSearchRetriever :
76
54
"""Hybrid Search Retriever"""
77
55
78
56
_chat : ChatOpenAI = None
79
57
_openai_embeddings : OpenAIEmbeddings = None
80
- _pinecone_index : pinecone .Index = None
81
58
_vector_store : Pinecone = None
82
59
_text_splitter : TextSplitter = None
83
60
_b25_encoder : BM25Encoder = None
61
+ _pinecone : PineConeIndex = None
62
+ _retriever : PineconeHybridSearchRetriever = None
84
63
85
64
def __init__ (self ):
86
65
"""Constructor"""
87
66
pinecone .init (api_key = Credentials .PINECONE_API_KEY , environment = Config .PINECONE_ENVIRONMENT )
88
67
set_llm_cache (InMemoryCache ())
89
68
69
+ @property
70
+ def pinecone (self ) -> PineConeIndex :
71
+ """PineConeIndex lazy read-only property."""
72
+ if self ._pinecone is None :
73
+ self ._pinecone = PineConeIndex ()
74
+ return self ._pinecone
75
+
90
76
# prompting wrapper
91
77
@property
92
78
def chat (self ) -> ChatOpenAI :
@@ -112,19 +98,12 @@ def openai_embeddings(self) -> OpenAIEmbeddings:
112
98
)
113
99
return self ._openai_embeddings
114
100
115
- @property
116
- def pinecone_index (self ) -> pinecone .Index :
117
- """pinecone.Index lazy read-only property."""
118
- if self ._pinecone_index is None :
119
- self ._pinecone_index = pinecone .Index (index_name = Config .PINECONE_INDEX_NAME )
120
- return self ._pinecone_index
121
-
122
101
@property
123
102
def vector_store (self ) -> Pinecone :
124
103
"""Pinecone lazy read-only property."""
125
104
if self ._vector_store is None :
126
105
self ._vector_store = Pinecone (
127
- index = self .pinecone_index ,
106
+ index = self .pinecone . index ,
128
107
embedding = self .openai_embeddings ,
129
108
text_key = Config .PINECONE_VECTORSTORE_TEXT_KEY ,
130
109
)
@@ -144,6 +123,15 @@ def bm25_encoder(self) -> BM25Encoder:
144
123
self ._b25_encoder = BM25Encoder ().default ()
145
124
return self ._b25_encoder
146
125
126
+ @property
127
+ def retriever (self ) -> PineconeHybridSearchRetriever :
128
+ """PineconeHybridSearchRetriever lazy read-only property."""
129
+ if self ._retriever is None :
130
+ self ._retriever = PineconeHybridSearchRetriever (
131
+ embeddings = self .openai_embeddings , sparse_encoder = self .bm25_encoder , index = self .pinecone .index
132
+ )
133
+ return self ._retriever
134
+
147
135
def cached_chat_request (
148
136
self , system_message : Union [str , SystemMessage ], human_message : Union [str , HumanMessage ]
149
137
) -> BaseMessage :
@@ -169,54 +157,8 @@ def prompt_with_template(
169
157
return retval
170
158
171
159
def load (self , filepath : str ):
172
- """
173
- Embed PDF.
174
- 1. Load PDF document text data
175
- 2. Split into pages
176
- 3. Embed each page
177
- 4. Store in Pinecone
178
-
179
- Note: it's important to make sure that the "context" field that holds the document text
180
- in the metadata is not indexed. Currently you need to specify explicitly the fields you
181
- do want to index. For more information checkout
182
- https://docs.pinecone.io/docs/manage-indexes#selective-metadata-indexing
183
- """
184
- try :
185
- logging .info ("Deleting index..." )
186
- pinecone .delete_index (Config .PINECONE_INDEX_NAME )
187
- except pinecone .exceptions .PineconeException :
188
- logging .info ("Index does not exist. Continuing..." )
189
-
190
- metadata_config = {
191
- "indexed" : [Config .PINECONE_VECTORSTORE_TEXT_KEY , "lc_type" ],
192
- "context" : ["lc_text" ],
193
- }
194
- logging .info ("Creating index. This may take a few minutes..." )
195
- pinecone .create_index (
196
- Config .PINECONE_INDEX_NAME ,
197
- dimension = Config .PINECONE_DIMENSIONS ,
198
- metric = Config .PINECONE_METRIC ,
199
- metadata_config = metadata_config ,
200
- )
201
-
202
- pdf_files = glob .glob (os .path .join (filepath , "*.pdf" ))
203
- i = 0
204
- for pdf_file in pdf_files :
205
- i += 1
206
- j = len (pdf_files )
207
- logging .info ("Loading PDF %s of %s: %s" , i , j , pdf_file )
208
- loader = PyPDFLoader (file_path = pdf_file )
209
- docs = loader .load ()
210
- k = 0
211
- for doc in docs :
212
- k += 1
213
- logging .info (k * "-" , end = "\r " )
214
- documents = self .text_splitter .create_documents ([doc .page_content ])
215
- document_texts = [doc .page_content for doc in documents ]
216
- embeddings = self .openai_embeddings .embed_documents (document_texts )
217
- self .vector_store .add_documents (documents = documents , embeddings = embeddings )
218
-
219
- logging .info ("Finished loading PDFs" )
160
+ """Pdf loader."""
161
+ self .pinecone .pdf_loader (filepath = filepath )
220
162
221
163
def rag (self , human_message : Union [str , HumanMessage ]):
222
164
"""
@@ -241,10 +183,7 @@ def rag(self, human_message: Union[str, HumanMessage]):
241
183
# ---------------------------------------------------------------------
242
184
# 1.) Retrieve relevant documents from Pinecone vector database
243
185
# ---------------------------------------------------------------------
244
- retriever = PineconeHybridSearchRetriever (
245
- embeddings = self .openai_embeddings , sparse_encoder = self .bm25_encoder , index = self .pinecone_index
246
- )
247
- documents = retriever .get_relevant_documents (query = human_message .content )
186
+ documents = self .retriever .get_relevant_documents (query = human_message .content )
248
187
249
188
# Extract the text from the documents
250
189
document_texts = [doc .page_content for doc in documents ]
0 commit comments