Skip to content

Commit 2b8585b

Browse files
committed
feat: refactor pinecone logic and add pinecone unit tests
1 parent 42e86dd commit 2b8585b

File tree

6 files changed

+301
-105
lines changed

6 files changed

+301
-105
lines changed

models/hybrid_search_retreiver.py

Lines changed: 23 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
# -*- coding: utf-8 -*-
2-
# pylint: disable=too-few-public-methods
32
"""
43
Hybrid Search Retriever. A class that combines the following:
54
- OpenAI prompting and ChatModel
@@ -16,20 +15,15 @@
1615
https://python.langchain.com/docs/integrations/retrievers/pinecone_hybrid_search
1716
"""
1817

19-
# document loading
20-
import glob
21-
2218
# general purpose imports
2319
import logging
24-
import os
2520
import textwrap
2621
from typing import Union
2722

2823
# pinecone integration
2924
import pinecone
3025
from langchain.cache import InMemoryCache
3126
from langchain.chat_models import ChatOpenAI
32-
from langchain.document_loaders import PyPDFLoader
3327

3428
# embedding
3529
from langchain.embeddings import OpenAIEmbeddings
@@ -42,12 +36,12 @@
4236
# hybrid search capability
4337
from langchain.retrievers import PineconeHybridSearchRetriever
4438
from langchain.schema import BaseMessage, HumanMessage, SystemMessage
45-
from langchain.text_splitter import Document
4639
from langchain.vectorstores.pinecone import Pinecone
4740
from pinecone_text.sparse import BM25Encoder
4841

4942
# this project
5043
from models.const import Config, Credentials
44+
from models.pinecone import PineConeIndex, TextSplitter
5145

5246

5347
###############################################################################
@@ -56,37 +50,29 @@
5650
logging.basicConfig(level=logging.DEBUG if Config.DEBUG_MODE else logging.INFO)
5751

5852

59-
class TextSplitter:
60-
"""
61-
Custom text splitter that adds metadata to the Document object
62-
which is required by PineconeHybridSearchRetriever.
63-
"""
64-
65-
def create_documents(self, texts):
66-
"""Create documents"""
67-
documents = []
68-
for text in texts:
69-
# Create a Document object with the text and metadata
70-
document = Document(page_content=text, metadata={"context": text})
71-
documents.append(document)
72-
return documents
73-
74-
7553
class HybridSearchRetriever:
7654
"""Hybrid Search Retriever"""
7755

7856
_chat: ChatOpenAI = None
7957
_openai_embeddings: OpenAIEmbeddings = None
80-
_pinecone_index: pinecone.Index = None
8158
_vector_store: Pinecone = None
8259
_text_splitter: TextSplitter = None
8360
_b25_encoder: BM25Encoder = None
61+
_pinecone: PineConeIndex = None
62+
_retriever: PineconeHybridSearchRetriever = None
8463

8564
def __init__(self):
    """Initialize the Pinecone client and register an in-memory LLM cache."""
    pinecone.init(
        api_key=Credentials.PINECONE_API_KEY,
        environment=Config.PINECONE_ENVIRONMENT,
    )
    llm_cache = InMemoryCache()
    set_llm_cache(llm_cache)
8968

69+
@property
def pinecone(self) -> PineConeIndex:
    """Return the PineConeIndex helper, constructing it on first access."""
    helper = self._pinecone
    if helper is None:
        # First access: build the helper once and cache it on the instance.
        helper = PineConeIndex()
        self._pinecone = helper
    return helper
75+
9076
# prompting wrapper
9177
@property
9278
def chat(self) -> ChatOpenAI:
@@ -112,19 +98,12 @@ def openai_embeddings(self) -> OpenAIEmbeddings:
11298
)
11399
return self._openai_embeddings
114100

115-
@property
116-
def pinecone_index(self) -> pinecone.Index:
117-
"""pinecone.Index lazy read-only property."""
118-
if self._pinecone_index is None:
119-
self._pinecone_index = pinecone.Index(index_name=Config.PINECONE_INDEX_NAME)
120-
return self._pinecone_index
121-
122101
@property
123102
def vector_store(self) -> Pinecone:
124103
"""Pinecone lazy read-only property."""
125104
if self._vector_store is None:
126105
self._vector_store = Pinecone(
127-
index=self.pinecone_index,
106+
index=self.pinecone.index,
128107
embedding=self.openai_embeddings,
129108
text_key=Config.PINECONE_VECTORSTORE_TEXT_KEY,
130109
)
@@ -144,6 +123,15 @@ def bm25_encoder(self) -> BM25Encoder:
144123
self._b25_encoder = BM25Encoder().default()
145124
return self._b25_encoder
146125

126+
@property
def retriever(self) -> PineconeHybridSearchRetriever:
    """Return the hybrid search retriever, constructing it on first access."""
    if self._retriever is not None:
        return self._retriever

    # Built from the embeddings, the BM25 sparse encoder and the Pinecone
    # index exposed by this object's other lazy properties.
    self._retriever = PineconeHybridSearchRetriever(
        embeddings=self.openai_embeddings,
        sparse_encoder=self.bm25_encoder,
        index=self.pinecone.index,
    )
    return self._retriever
134+
147135
def cached_chat_request(
148136
self, system_message: Union[str, SystemMessage], human_message: Union[str, HumanMessage]
149137
) -> BaseMessage:
@@ -169,54 +157,8 @@ def prompt_with_template(
169157
return retval
170158

171159
def load(self, filepath: str):
172-
"""
173-
Embed PDF.
174-
1. Load PDF document text data
175-
2. Split into pages
176-
3. Embed each page
177-
4. Store in Pinecone
178-
179-
Note: it's important to make sure that the "context" field that holds the document text
180-
in the metadata is not indexed. Currently you need to specify explicitly the fields you
181-
do want to index. For more information checkout
182-
https://docs.pinecone.io/docs/manage-indexes#selective-metadata-indexing
183-
"""
184-
try:
185-
logging.info("Deleting index...")
186-
pinecone.delete_index(Config.PINECONE_INDEX_NAME)
187-
except pinecone.exceptions.PineconeException:
188-
logging.info("Index does not exist. Continuing...")
189-
190-
metadata_config = {
191-
"indexed": [Config.PINECONE_VECTORSTORE_TEXT_KEY, "lc_type"],
192-
"context": ["lc_text"],
193-
}
194-
logging.info("Creating index. This may take a few minutes...")
195-
pinecone.create_index(
196-
Config.PINECONE_INDEX_NAME,
197-
dimension=Config.PINECONE_DIMENSIONS,
198-
metric=Config.PINECONE_METRIC,
199-
metadata_config=metadata_config,
200-
)
201-
202-
pdf_files = glob.glob(os.path.join(filepath, "*.pdf"))
203-
i = 0
204-
for pdf_file in pdf_files:
205-
i += 1
206-
j = len(pdf_files)
207-
logging.info("Loading PDF %s of %s: %s", i, j, pdf_file)
208-
loader = PyPDFLoader(file_path=pdf_file)
209-
docs = loader.load()
210-
k = 0
211-
for doc in docs:
212-
k += 1
213-
logging.info(k * "-", end="\r")
214-
documents = self.text_splitter.create_documents([doc.page_content])
215-
document_texts = [doc.page_content for doc in documents]
216-
embeddings = self.openai_embeddings.embed_documents(document_texts)
217-
self.vector_store.add_documents(documents=documents, embeddings=embeddings)
218-
219-
logging.info("Finished loading PDFs")
160+
"""Pdf loader."""
161+
self.pinecone.pdf_loader(filepath=filepath)
220162

221163
def rag(self, human_message: Union[str, HumanMessage]):
222164
"""
@@ -241,10 +183,7 @@ def rag(self, human_message: Union[str, HumanMessage]):
241183
# ---------------------------------------------------------------------
242184
# 1.) Retrieve relevant documents from Pinecone vector database
243185
# ---------------------------------------------------------------------
244-
retriever = PineconeHybridSearchRetriever(
245-
embeddings=self.openai_embeddings, sparse_encoder=self.bm25_encoder, index=self.pinecone_index
246-
)
247-
documents = retriever.get_relevant_documents(query=human_message.content)
186+
documents = self.retriever.get_relevant_documents(query=human_message.content)
248187

249188
# Extract the text from the documents
250189
document_texts = [doc.page_content for doc in documents]

models/pinecone.py

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
# -*- coding: utf-8 -*-
2+
"""Pinecone helper functions."""
3+
4+
# document loading
5+
import glob
6+
7+
# general purpose imports
8+
import logging
9+
import os
10+
11+
# pinecone integration
12+
import pinecone
13+
from langchain.document_loaders import PyPDFLoader
14+
from langchain.embeddings import OpenAIEmbeddings
15+
from langchain.text_splitter import Document
16+
from langchain.vectorstores.pinecone import Pinecone as LCPinecone
17+
18+
# this project
19+
from models.const import Config, Credentials
20+
21+
22+
# pylint: disable=too-few-public-methods
23+
class TextSplitter:
    """
    Custom text splitter that wraps each text in a Document and copies the
    text into the Document metadata, which is required by
    PineconeHybridSearchRetriever.
    """

    def create_documents(self, texts):
        """Return one Document per entry of ``texts``, in the same order."""
        # The text is stored twice on purpose: as the page content and under
        # the "context" metadata key.
        return [Document(page_content=text, metadata={"context": text}) for text in texts]
37+
38+
39+
class PineConeIndex:
    """
    Pinecone helper class.

    Wraps one Pinecone index, identified by ``index_name``, and lazily builds
    the objects used to populate it: the pinecone.Index handle, the OpenAI
    embeddings client, the LangChain vector store and the text splitter.
    """

    _index: pinecone.Index = None
    _index_name: str = None
    _text_splitter: TextSplitter = None
    _openai_embeddings: OpenAIEmbeddings = None
    _vector_store: LCPinecone = None

    def __init__(self, index_name: str = None):
        """
        Store the index name and initialize the Pinecone client.

        index_name: name of the Pinecone index. Falls back to
            Config.PINECONE_INDEX_NAME when omitted or empty.
        """
        self._index_name = index_name or Config.PINECONE_INDEX_NAME
        self.init()

    @property
    def vector_store(self) -> LCPinecone:
        """Pinecone lazy read-only property."""
        if self._vector_store is None:
            self._vector_store = LCPinecone(
                index=self.index,
                embedding=self.openai_embeddings,
                text_key=Config.PINECONE_VECTORSTORE_TEXT_KEY,
            )
        return self._vector_store

    @property
    def openai_embeddings(self) -> OpenAIEmbeddings:
        """OpenAIEmbeddings lazy read-only property."""
        if self._openai_embeddings is None:
            self._openai_embeddings = OpenAIEmbeddings(
                api_key=Credentials.OPENAI_API_KEY, organization=Credentials.OPENAI_API_ORGANIZATION
            )
        return self._openai_embeddings

    @property
    def text_splitter(self) -> TextSplitter:
        """TextSplitter lazy read-only property."""
        if self._text_splitter is None:
            self._text_splitter = TextSplitter()
        return self._text_splitter

    @property
    def index_name(self) -> str:
        """Name of the Pinecone index this helper operates on."""
        return self._index_name

    @index_name.setter
    def index_name(self, value: str) -> None:
        """
        Set the index name.

        When the name actually changes, initialize() is called, which deletes
        and re-creates the index with the new name and drops cached handles.
        """
        if self._index_name != value:
            self._index_name = value
            self.initialize()

    @property
    def index(self) -> pinecone.Index:
        """pinecone.Index lazy read-only property."""
        if self._index is None:
            try:
                self._index = pinecone.Index(index_name=self.index_name)
            except pinecone.exceptions.PineconeException:
                # NOTE(review): this fallback assumes that constructing
                # pinecone.Index raises when the index does not exist -- confirm
                # against the installed pinecone client; if it does not, the
                # index is never auto-created here.
                self.create()
                self._index = pinecone.Index(index_name=self.index_name)
        return self._index

    def init(self):
        """Initialize the Pinecone client with the configured API key and environment."""
        pinecone.init(api_key=Credentials.PINECONE_API_KEY, environment=Config.PINECONE_ENVIRONMENT)

    def delete(self):
        """Delete the index; a PineconeException is logged and ignored."""
        logging.info("Deleting index...")
        try:
            pinecone.delete_index(self.index_name)
        except pinecone.exceptions.PineconeException:
            logging.info("Index does not exist. Continuing...")

    def create(self):
        """Create the index using the dimensions and metric from Config."""
        metadata_config = {
            "indexed": [Config.PINECONE_VECTORSTORE_TEXT_KEY, "lc_type"],
            "context": ["lc_text"],
        }
        logging.info("Creating index. This may take a few minutes...")

        pinecone.create_index(
            self.index_name,
            dimension=Config.PINECONE_DIMENSIONS,
            metric=Config.PINECONE_METRIC,
            metadata_config=metadata_config,
        )

    def initialize(self):
        """Delete and re-create the index, discarding cached handles."""
        # The cached index handle and vector store were built for the previous
        # index (possibly under a different name); drop them so the lazy
        # properties rebuild against the freshly created one.
        self._index = None
        self._vector_store = None
        self.delete()
        self.create()

    def pdf_loader(self, filepath: str):
        """
        Embed PDF.
        1. Load PDF document text data
        2. Split into pages
        3. Embed each page
        4. Store in Pinecone

        The index is deleted and re-created (see initialize()) before any PDF
        is loaded, so existing content of the index is discarded.

        filepath: directory that is searched (non-recursively) for "*.pdf" files.

        Note: it's important to make sure that the "context" field that holds the document text
        in the metadata is not indexed. Currently you need to specify explicitly the fields you
        do want to index. For more information check out
        https://docs.pinecone.io/docs/manage-indexes#selective-metadata-indexing
        """
        self.initialize()

        pdf_files = glob.glob(os.path.join(filepath, "*.pdf"))
        pdf_count = len(pdf_files)
        for pdf_number, pdf_file in enumerate(pdf_files, start=1):
            logging.info("Loading PDF %s of %s: %s", pdf_number, pdf_count, pdf_file)
            loader = PyPDFLoader(file_path=pdf_file)
            docs = loader.load()
            page_count = len(docs)
            for page_number, doc in enumerate(docs, start=1):
                # Logging calls accept no ``end`` keyword (that is a print()
                # argument) and raise TypeError if given one, so progress is
                # reported as a regular debug record instead.
                logging.debug("Embedding page %s of %s", page_number, page_count)
                documents = self.text_splitter.create_documents([doc.page_content])
                document_texts = [document.page_content for document in documents]
                embeddings = self.openai_embeddings.embed_documents(document_texts)
                self.vector_store.add_documents(documents=documents, embeddings=embeddings)

        logging.info("Finished loading PDFs")

models/tests/test_hsr.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
import pytest # pylint: disable=unused-import
77
from langchain.chat_models import ChatOpenAI
88
from langchain.embeddings import OpenAIEmbeddings
9-
from pinecone import Index
109

1110
from models.hybrid_search_retreiver import HybridSearchRetriever, TextSplitter
11+
from models.pinecone import PineConeIndex
1212

1313

1414
class TestSalesSupportModel:
@@ -28,6 +28,6 @@ def test_02_class_aatribute_types(self):
2828

2929
hsr = HybridSearchRetriever()
3030
assert isinstance(hsr.chat, ChatOpenAI)
31-
assert isinstance(hsr.pinecone_index, Index)
31+
assert isinstance(hsr.pinecone, PineConeIndex)
3232
assert isinstance(hsr.text_splitter, TextSplitter)
3333
assert isinstance(hsr.openai_embeddings, OpenAIEmbeddings)

0 commit comments

Comments
 (0)