Skip to content

Commit 84bd16b

Browse files
committed
chore: code, test, debug PineconeIndex
1 parent 1c9699d commit 84bd16b

File tree

9 files changed

+100
-60
lines changed

9 files changed

+100
-60
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ $(shell echo -e "OPENAI_API_ORGANIZATION=PLEASE-ADD-ME\n\
77
OPENAI_API_KEY=PLEASE-ADD-ME\n\
88
PINECONE_API_KEY=PLEASE-ADD-ME\n\
99
PINECONE_ENVIRONMENT=gcp-starter\n\
10-
PINECONE_INDEX_NAME=hsr\n\
10+
PINECONE_INDEX_NAME=rag\n\
1111
PINECONE_VECTORSTORE_TEXT_KEY=lc_id\n\
1212
PINECONE_METRIC=dotproduct\n\
1313
PINECONE_DIMENSIONS=1536\n\

README.md

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,9 @@ python3 -m models.examples.training_services "Microsoft certified Azure AI engin
4242
# example 4 - prompted assistant
4343
python3 -m models.examples.training_services_oracle "Oracle database administrator"
4444

45-
# example 5 - Load PDF documents
45+
# example 5 - Retrieval Augmented Generation
46+
python3 -m models.examples.pinecone_init
4647
python3 -m models.examples.load "./data/"
47-
48-
# example 6 - Retrieval Augmented Generation
4948
python3 -m models.examples.rag "What analytics and accounting courses does Wharton offer?"
5049
```
5150

@@ -67,7 +66,7 @@ DEBUG_MODE=False
6766
You'll need to manually create an index with the following characteristics
6867

6968
- Environment: gcp-starter
70-
- Index name: netec-rag
69+
- Index name: rag
7170
- Metric: dotproduct
7271
- Dimensions: 1536
7372
- Pod Type: starter

models/examples/pinecone_init.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# -*- coding: utf-8 -*-
2+
"""Initialize the Pinecone index used for Retrieval Augmented Generation (RAG) and print its stats."""
3+
4+
from models.pinecone import PineconeIndex
5+
6+
7+
pinecone = PineconeIndex()
8+
9+
if __name__ == "__main__":
10+
pinecone.initialize()
11+
print("Pinecone index initialized. name: ", pinecone.index_name)
12+
print(pinecone.index.describe_index_stats())

models/examples/rag.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,6 @@
1414
parser.add_argument("prompt", type=str, help="A question about the PDF contents")
1515
args = parser.parse_args()
1616

17-
human_message = HumanMessage(text=args.prompt)
17+
human_message = HumanMessage(content=args.prompt)
1818
result = hsr.rag(human_message=human_message)
1919
print(result)

models/hybrid_search_retreiver.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838

3939
# this project
4040
from models.const import Config, Credentials
41-
from models.pinecone import PineConeIndex
41+
from models.pinecone import PineconeIndex
4242

4343

4444
###############################################################################
@@ -52,18 +52,18 @@ class HybridSearchRetriever:
5252

5353
_chat: ChatOpenAI = None
5454
_b25_encoder: BM25Encoder = None
55-
_pinecone: PineConeIndex = None
55+
_pinecone: PineconeIndex = None
5656
_retriever: PineconeHybridSearchRetriever = None
5757

5858
def __init__(self):
5959
"""Constructor"""
6060
set_llm_cache(InMemoryCache())
6161

6262
@property
63-
def pinecone(self) -> PineConeIndex:
64-
"""PineConeIndex lazy read-only property."""
63+
def pinecone(self) -> PineconeIndex:
64+
"""PineconeIndex lazy read-only property."""
6565
if self._pinecone is None:
66-
self._pinecone = PineConeIndex()
66+
self._pinecone = PineconeIndex()
6767
return self._pinecone
6868

6969
# prompting wrapper
@@ -148,7 +148,8 @@ def rag(self, human_message: Union[str, HumanMessage]):
148148
# ---------------------------------------------------------------------
149149
# 1.) Retrieve relevant documents from Pinecone vector database
150150
# ---------------------------------------------------------------------
151-
documents = self.retriever.get_relevant_documents(query=human_message.content)
151+
# documents = self.retriever.get_relevant_documents(query=human_message.content)
152+
documents = self.pinecone.vector_store.similarity_search(query=human_message.content)
152153

153154
# Extract the text from the documents
154155
document_texts = [doc.page_content for doc in documents]
@@ -165,14 +166,15 @@ def rag(self, human_message: Union[str, HumanMessage]):
165166
# finished with hybrid search setup
166167
# ---------------------------------------------------------------------
167168

168-
# 2.) get a response from the chat model
169-
response = self.cached_chat_request(system_message=system_message, human_message=human_message)
170-
171169
logging.debug("------------------------------------------------------")
172170
logging.debug("rag() Retrieval Augmented Generation prompt")
173171
logging.debug("Diagnostic information:")
174172
logging.debug(" Retrieved %i related documents from Pinecone", len(documents))
175173
logging.debug(" System messages contains %i words", len(system_message.content.split()))
176174
logging.debug(" Prompt: %s", system_message.content)
177175
logging.debug("------------------------------------------------------")
176+
177+
# 2.) get a response from the chat model
178+
response = self.cached_chat_request(system_message=system_message, human_message=human_message)
179+
178180
return response.content

models/pinecone.py

Lines changed: 54 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import glob
66

77
# general purpose imports
8+
import json
89
import logging
910
import os
1011

@@ -36,7 +37,7 @@ def create_documents(self, texts):
3637
return documents
3738

3839

39-
class PineConeIndex:
40+
class PineconeIndex:
4041
"""Pinecone helper class."""
4142

4243
_index: pinecone.Index = None
@@ -46,13 +47,42 @@ class PineConeIndex:
4647
_vector_store: LCPinecone = None
4748

4849
def __init__(self, index_name: str = None):
49-
self._index_name = index_name or Config.PINECONE_INDEX_NAME
5050
self.init()
51+
self.index_name = index_name or Config.PINECONE_INDEX_NAME
52+
53+
@property
54+
def index_name(self) -> str:
55+
"""index name."""
56+
return self._index_name
57+
58+
@index_name.setter
59+
def index_name(self, value: str) -> None:
60+
"""Set index name."""
61+
if self._index_name != value:
62+
self.init()
63+
self._index_name = value
64+
self.init_index()
65+
66+
@property
67+
def index(self) -> pinecone.Index:
68+
"""pinecone.Index lazy read-only property."""
69+
if self._index is None:
70+
self.init_index()
71+
self._index = pinecone.Index(index_name=self.index_name)
72+
return self._index
73+
74+
@property
75+
def initialized(self) -> bool:
76+
"""Read-only property: True if an index named self.index_name exists in Pinecone."""
77+
indexes = pinecone.manage.list_indexes()
78+
return self.index_name in indexes
5179

5280
@property
5381
def vector_store(self) -> LCPinecone:
5482
"""Pinecone lazy read-only property."""
5583
if self._vector_store is None:
84+
if not self.initialized:
85+
self.init_index()
5686
self._vector_store = LCPinecone(
5787
index=self.index,
5888
embedding=self.openai_embeddings,
@@ -76,41 +106,29 @@ def text_splitter(self) -> TextSplitter:
76106
self._text_splitter = TextSplitter()
77107
return self._text_splitter
78108

79-
@property
80-
def index_name(self) -> str:
81-
"""index name."""
82-
return self._index_name
83-
84-
@index_name.setter
85-
def index_name(self, value: str) -> None:
86-
"""Set index name."""
87-
if self._index_name != value:
88-
self._index_name = value
89-
self.initialize()
90-
91-
@property
92-
def index(self) -> pinecone.Index:
93-
"""pinecone.Index lazy read-only property."""
94-
if self._index is None:
95-
try:
96-
self._index = pinecone.Index(index_name=self.index_name)
97-
except pinecone.exceptions.PineconeException:
98-
# index does not exist, so create it.
99-
self.create()
100-
self._index = pinecone.Index(index_name=self.index_name)
101-
return self._index
109+
def init_index(self):
110+
"""Verify that an index named self.index_name exists in Pinecone. If not, create it."""
111+
indexes = pinecone.manage.list_indexes()
112+
if self.index_name not in indexes:
113+
logging.info("Index does not exist.")
114+
self.create()
102115

103116
def init(self):
104117
"""Initialize Pinecone."""
105118
pinecone.init(api_key=Credentials.PINECONE_API_KEY, environment=Config.PINECONE_ENVIRONMENT)
119+
self._index = None
120+
self._index_name = None
121+
self._text_splitter = None
122+
self._openai_embeddings = None
123+
self._vector_store = None
106124

107125
def delete(self):
108126
"""Delete index."""
109-
try:
110-
logging.info("Deleting index...")
111-
pinecone.delete_index(self.index_name)
112-
except pinecone.exceptions.PineconeException:
113-
logging.info("Index does not exist. Continuing...")
127+
if not self.initialized:
128+
logging.info("Index does not exist. Nothing to delete.")
129+
return
130+
logging.info("Deleting index...")
131+
pinecone.delete_index(self.index_name)
114132

115133
def create(self):
116134
"""Create index."""
@@ -121,11 +139,12 @@ def create(self):
121139
logging.info("Creating index. This may take a few minutes...")
122140

123141
pinecone.create_index(
124-
self.index_name,
142+
name=self.index_name,
125143
dimension=Config.PINECONE_DIMENSIONS,
126144
metric=Config.PINECONE_METRIC,
127145
metadata_config=metadata_config,
128146
)
147+
logging.info("Index created.")
129148

130149
def initialize(self):
131150
"""Initialize index."""
@@ -152,16 +171,17 @@ def pdf_loader(self, filepath: str):
152171
for pdf_file in pdf_files:
153172
i += 1
154173
j = len(pdf_files)
155-
logging.info("Loading PDF %s of %s: %s", i, j, pdf_file)
174+
print("Loading PDF %s of %s: %s", i, j, pdf_file)
156175
loader = PyPDFLoader(file_path=pdf_file)
157176
docs = loader.load()
158177
k = 0
159178
for doc in docs:
160179
k += 1
161-
logging.info(k * "-", end="\r")
180+
print(k * "-", end="\r")
162181
documents = self.text_splitter.create_documents([doc.page_content])
163182
document_texts = [doc.page_content for doc in documents]
164183
embeddings = self.openai_embeddings.embed_documents(document_texts)
165184
self.vector_store.add_documents(documents=documents, embeddings=embeddings)
166185

167-
logging.info("Finished loading PDFs")
186+
index_stats_string = json.dumps(self.index.describe_index_stats(), indent=4)
187+
print("Finished loading PDFs. \n" + index_stats_string)

models/tests/test_hsr.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from langchain.chat_models import ChatOpenAI
88

99
from models.hybrid_search_retreiver import HybridSearchRetriever
10-
from models.pinecone import PineConeIndex
10+
from models.pinecone import PineconeIndex
1111

1212

1313
class TestSalesSupportModel:
@@ -27,4 +27,4 @@ def test_02_class_aatribute_types(self):
2727

2828
hsr = HybridSearchRetriever()
2929
assert isinstance(hsr.chat, ChatOpenAI)
30-
assert isinstance(hsr.pinecone, PineConeIndex)
30+
assert isinstance(hsr.pinecone, PineconeIndex)

models/tests/test_pinecone.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import pytest # pylint: disable=unused-import
1111

1212
from models.const import Config
13-
from models.pinecone import PineConeIndex
13+
from models.pinecone import PineconeIndex
1414

1515

1616
class TestPinecone:
@@ -20,13 +20,13 @@ def test_01_can_instantiate(self):
2020
"""Ensure that we instantiate the object."""
2121
# pylint: disable=broad-except
2222
try:
23-
PineConeIndex()
23+
PineconeIndex()
2424
except Exception as e:
2525
assert False, f"Pinecone() failed with exception: {e}"
2626

2727
def test_02_init(self):
2828
"""Ensure that we can initialize Pinecone."""
29-
pinecone = PineConeIndex()
29+
pinecone = PineconeIndex()
3030
# pylint: disable=broad-except
3131
try:
3232
pinecone.init()
@@ -35,12 +35,12 @@ def test_02_init(self):
3535

3636
def test_03_index(self):
3737
"""Test that the index name is correct."""
38-
pinecone = PineConeIndex()
38+
pinecone = PineconeIndex()
3939
assert pinecone.index_name == Config.PINECONE_INDEX_NAME
4040

4141
def test_04_initialize(self):
4242
"""Test that the index initializes."""
43-
pinecone = PineConeIndex()
43+
pinecone = PineconeIndex()
4444
# pylint: disable=broad-except
4545
try:
4646
pinecone.initialize()
@@ -50,7 +50,9 @@ def test_04_initialize(self):
5050

5151
def test_05_delete(self):
5252
"""Test that the index can be deleted."""
53-
pinecone = PineConeIndex()
53+
pinecone = PineconeIndex()
54+
indexes = oem_pinecone.manage.list_indexes()
55+
assert pinecone.index_name in indexes
5456
# pylint: disable=broad-except
5557
try:
5658
pinecone.delete()
@@ -59,20 +61,25 @@ def test_05_delete(self):
5961

6062
def test_06_create(self):
6163
"""Test that the index can be created."""
62-
pinecone = PineConeIndex()
64+
pinecone = PineconeIndex()
65+
indexes = oem_pinecone.manage.list_indexes()
66+
if pinecone.index_name in indexes:
67+
pinecone.delete()
68+
6369
# pylint: disable=broad-except
6470
try:
6571
pinecone.create()
6672
except Exception as e:
6773
assert False, f"Pinecone.create() failed with exception: {e}"
74+
assert isinstance(pinecone.index, oem_pinecone.Index)
6875
pinecone.delete()
6976

7077
def test_07_load_pdf(self):
7178
"""Test that we can load a PDF document to the index."""
7279
if not os.path.exists("./data/test_07_load.pdf"):
7380
pytest.skip("File './data/test_07_load.pdf' does not exist")
7481

75-
pinecone = PineConeIndex()
82+
pinecone = PineconeIndex()
7683
# pylint: disable=broad-except
7784
try:
7885
pinecone.pdf_loader(filepath="./data/test_07_load.pdf")

models/yt.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ def test_06_embeddings_b(self):
172172

173173
# 7.) pinecone client. minute 11:00
174174
self.pinecone_search = Pinecone.from_documents(
175-
self.texts_splitter_results,
175+
documents=self.texts_splitter_results,
176176
embedding=self.openai_embedding,
177177
index_name=self.PINECONE_INDEX_NAME,
178178
)

0 commit comments

Comments
 (0)