
Commit ef2227a

Merge pull request #19 from lpm0073/next
new PineconeIndex class
2 parents 42e86dd + 84bd16b commit ef2227a

12 files changed: +323 −154 lines

Makefile

+1 −1
@@ -7,7 +7,7 @@ $(shell echo -e "OPENAI_API_ORGANIZATION=PLEASE-ADD-ME\n\
 OPENAI_API_KEY=PLEASE-ADD-ME\n\
 PINECONE_API_KEY=PLEASE-ADD-ME\n\
 PINECONE_ENVIRONMENT=gcp-starter\n\
-PINECONE_INDEX_NAME=hsr\n\
+PINECONE_INDEX_NAME=rag\n\
 PINECONE_VECTORSTORE_TEXT_KEY=lc_id\n\
 PINECONE_METRIC=dotproduct\n\
 PINECONE_DIMENSIONS=1536\n\

README.md

+3 −4
@@ -42,10 +42,9 @@ python3 -m models.examples.training_services "Microsoft certified Azure AI engin
 # example 4 - prompted assistant
 python3 -m models.examples.training_services_oracle "Oracle database administrator"
 
-# example 5 - Load PDF documents
+# example 5 - Retrieval Augmented Generation
+python3 -m models.examples.pinecone_init
 python3 -m models.examples.load "./data/"
-
-# example 6 - Retrieval Augmented Generation
 python3 -m models.examples.rag "What analytics and accounting courses does Wharton offer?"
 ```

@@ -67,7 +66,7 @@ DEBUG_MODE=False
 You'll need to manually create an index with the following characteristics
 
 - Environment: gcp-starter
-- Index name: netec-rag
+- Index name: rag
 - Metric: dotproduct
 - Dimensions: 1536
 - Pod Type: starter

models/__version__.py

+1 −1
@@ -1,2 +1,2 @@
 # -*- coding: utf-8 -*-
-__version__ = "1.1.3"
+__version__ = "1.2.0"

models/const.py

+18 −0
@@ -7,6 +7,14 @@
 from dotenv import find_dotenv, load_dotenv
 
 
+class ConfigurationError(Exception):
+    """Exception raised for errors in the configuration."""
+
+    def __init__(self, message):
+        self.message = message
+        super().__init__(self.message)
+
+
 # pylint: disable=duplicate-code
 dotenv_path = find_dotenv()
 if os.path.exists(dotenv_path):
@@ -25,6 +33,16 @@
     OPENAI_CHAT_MAX_RETRIES = int(os.environ.get("OPENAI_CHAT_MAX_RETRIES", 3))
     OPENAI_CHAT_CACHE = bool(os.environ.get("OPENAI_CHAT_CACHE", True))
     DEBUG_MODE = os.environ.get("DEBUG_MODE", "False") == "True"
+
+    if OPENAI_API_KEY == "PLEASE-ADD-ME":
+        raise ConfigurationError("OPENAI_API_KEY is not set. Please add your OpenAI API key to the .env file.")
+    if OPENAI_API_ORGANIZATION == "PLEASE-ADD-ME":
+        raise ConfigurationError(
+            "OPENAI_API_ORGANIZATION is not set. Please add your OpenAI API organization to the .env file."
+        )
+    if PINECONE_API_KEY == "PLEASE-ADD-ME":
+        raise ConfigurationError("PINECONE_API_KEY is not set. Please add your Pinecone API key to the .env file.")
+
 else:
     raise FileNotFoundError("No .env file found in root directory of repository")
 
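
These checks make a misconfigured `.env` fail at import time, before the first OpenAI or Pinecone call. A minimal sketch of what a caller sees while a placeholder value is still present (the try/except wrapper is illustrative, not part of this PR):

```python
# Importing models.const runs the validation added above. We catch the broad
# Exception because ConfigurationError is defined inside models.const itself,
# so it cannot be imported if the module import fails.
try:
    import models.const  # raises if a credential is still "PLEASE-ADD-ME"
except Exception as err:
    print(f"configuration problem: {err}")
```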

models/examples/pinecone_init.py

+12 −0
@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+"""Sales Support Model (hsr) Retrieval Augmented Generation (RAG)"""
+
+from models.pinecone import PineconeIndex
+
+
+pinecone = PineconeIndex()
+
+if __name__ == "__main__":
+    pinecone.initialize()
+    print("Pinecone index initialized. name: ", pinecone.index_name)
+    print(pinecone.index.describe_index_stats())
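
The new module is wired into the README as example 5 (`python3 -m models.examples.pinecone_init`). A sketch of the equivalent interactive session, assuming only the members the script itself exercises (`initialize()`, `index_name`, and the underlying `index`):

```python
# Mirrors models/examples/pinecone_init.py; PineconeIndex comes from the new
# models/pinecone.py, whose implementation is not shown in this commit view.
from models.pinecone import PineconeIndex

idx = PineconeIndex()
idx.initialize()                         # set up the index (exact behavior lives in models/pinecone.py)
print("name:", idx.index_name)           # "rag" by default, per the Makefile/README changes
print(idx.index.describe_index_stats())  # stats from the underlying pinecone.Index
```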

models/examples/rag.py

+1 −1
@@ -14,6 +14,6 @@
 parser.add_argument("prompt", type=str, help="A question about the PDF contents")
 args = parser.parse_args()
 
-human_message = HumanMessage(text=args.prompt)
+human_message = HumanMessage(content=args.prompt)
 result = hsr.rag(human_message=human_message)
 print(result)
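
This one-line change fixes a real bug: LangChain's message classes carry their payload in a required `content` field, so `HumanMessage(text=...)` fails validation. A minimal confirmation, reusing the `langchain.schema` import that appears elsewhere in this PR:

```python
# HumanMessage stores its text in `content`; there is no `text` field.
from langchain.schema import HumanMessage

msg = HumanMessage(content="What analytics and accounting courses does Wharton offer?")
print(msg.content)
```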

models/hybrid_search_retreiver.py

+27 −121
@@ -1,5 +1,4 @@
 # -*- coding: utf-8 -*-
-# pylint: disable=too-few-public-methods
 """
 Hybrid Search Retriever. A class that combines the following:
 - OpenAI prompting and ChatModel
@@ -16,23 +15,16 @@
 https://python.langchain.com/docs/integrations/retrievers/pinecone_hybrid_search
 """
 
-# document loading
-import glob
-
 # general purpose imports
 import logging
-import os
 import textwrap
 from typing import Union
 
 # pinecone integration
-import pinecone
 from langchain.cache import InMemoryCache
 from langchain.chat_models import ChatOpenAI
-from langchain.document_loaders import PyPDFLoader
 
 # embedding
-from langchain.embeddings import OpenAIEmbeddings
 from langchain.globals import set_llm_cache
 
 # prompting and chat
@@ -42,12 +34,11 @@
 # hybrid search capability
 from langchain.retrievers import PineconeHybridSearchRetriever
 from langchain.schema import BaseMessage, HumanMessage, SystemMessage
-from langchain.text_splitter import Document
-from langchain.vectorstores.pinecone import Pinecone
 from pinecone_text.sparse import BM25Encoder
 
 # this project
 from models.const import Config, Credentials
+from models.pinecone import PineconeIndex
 
 
 ###############################################################################
@@ -56,37 +47,25 @@
 logging.basicConfig(level=logging.DEBUG if Config.DEBUG_MODE else logging.INFO)
 
 
-class TextSplitter:
-    """
-    Custom text splitter that adds metadata to the Document object
-    which is required by PineconeHybridSearchRetriever.
-    """
-
-    def create_documents(self, texts):
-        """Create documents"""
-        documents = []
-        for text in texts:
-            # Create a Document object with the text and metadata
-            document = Document(page_content=text, metadata={"context": text})
-            documents.append(document)
-        return documents
-
-
 class HybridSearchRetriever:
     """Hybrid Search Retriever"""
 
     _chat: ChatOpenAI = None
-    _openai_embeddings: OpenAIEmbeddings = None
-    _pinecone_index: pinecone.Index = None
-    _vector_store: Pinecone = None
-    _text_splitter: TextSplitter = None
     _b25_encoder: BM25Encoder = None
+    _pinecone: PineconeIndex = None
+    _retriever: PineconeHybridSearchRetriever = None
 
     def __init__(self):
         """Constructor"""
-        pinecone.init(api_key=Credentials.PINECONE_API_KEY, environment=Config.PINECONE_ENVIRONMENT)
         set_llm_cache(InMemoryCache())
 
+    @property
+    def pinecone(self) -> PineconeIndex:
+        """PineconeIndex lazy read-only property."""
+        if self._pinecone is None:
+            self._pinecone = PineconeIndex()
+        return self._pinecone
+
     # prompting wrapper
     @property
     def chat(self) -> ChatOpenAI:
@@ -102,48 +81,22 @@ def chat(self) -> ChatOpenAI:
         )
         return self._chat
 
-    # embeddings
-    @property
-    def openai_embeddings(self) -> OpenAIEmbeddings:
-        """OpenAIEmbeddings lazy read-only property."""
-        if self._openai_embeddings is None:
-            self._openai_embeddings = OpenAIEmbeddings(
-                api_key=Credentials.OPENAI_API_KEY, organization=Credentials.OPENAI_API_ORGANIZATION
-            )
-        return self._openai_embeddings
-
-    @property
-    def pinecone_index(self) -> pinecone.Index:
-        """pinecone.Index lazy read-only property."""
-        if self._pinecone_index is None:
-            self._pinecone_index = pinecone.Index(index_name=Config.PINECONE_INDEX_NAME)
-        return self._pinecone_index
-
-    @property
-    def vector_store(self) -> Pinecone:
-        """Pinecone lazy read-only property."""
-        if self._vector_store is None:
-            self._vector_store = Pinecone(
-                index=self.pinecone_index,
-                embedding=self.openai_embeddings,
-                text_key=Config.PINECONE_VECTORSTORE_TEXT_KEY,
-            )
-        return self._vector_store
-
-    @property
-    def text_splitter(self) -> TextSplitter:
-        """TextSplitter lazy read-only property."""
-        if self._text_splitter is None:
-            self._text_splitter = TextSplitter()
-        return self._text_splitter
-
     @property
     def bm25_encoder(self) -> BM25Encoder:
         """BM25Encoder lazy read-only property."""
         if self._b25_encoder is None:
             self._b25_encoder = BM25Encoder().default()
         return self._b25_encoder
 
+    @property
+    def retriever(self) -> PineconeHybridSearchRetriever:
+        """PineconeHybridSearchRetriever lazy read-only property."""
+        if self._retriever is None:
+            self._retriever = PineconeHybridSearchRetriever(
+                embeddings=self.pinecone.openai_embeddings, sparse_encoder=self.bm25_encoder, index=self.pinecone.index
+            )
+        return self._retriever
+
     def cached_chat_request(
         self, system_message: Union[str, SystemMessage], human_message: Union[str, HumanMessage]
     ) -> BaseMessage:
@@ -169,54 +122,8 @@ def prompt_with_template(
         return retval
 
     def load(self, filepath: str):
-        """
-        Embed PDF.
-        1. Load PDF document text data
-        2. Split into pages
-        3. Embed each page
-        4. Store in Pinecone
-
-        Note: it's important to make sure that the "context" field that holds the document text
-        in the metadata is not indexed. Currently you need to specify explicitly the fields you
-        do want to index. For more information checkout
-        https://docs.pinecone.io/docs/manage-indexes#selective-metadata-indexing
-        """
-        try:
-            logging.info("Deleting index...")
-            pinecone.delete_index(Config.PINECONE_INDEX_NAME)
-        except pinecone.exceptions.PineconeException:
-            logging.info("Index does not exist. Continuing...")
-
-        metadata_config = {
-            "indexed": [Config.PINECONE_VECTORSTORE_TEXT_KEY, "lc_type"],
-            "context": ["lc_text"],
-        }
-        logging.info("Creating index. This may take a few minutes...")
-        pinecone.create_index(
-            Config.PINECONE_INDEX_NAME,
-            dimension=Config.PINECONE_DIMENSIONS,
-            metric=Config.PINECONE_METRIC,
-            metadata_config=metadata_config,
-        )
-
-        pdf_files = glob.glob(os.path.join(filepath, "*.pdf"))
-        i = 0
-        for pdf_file in pdf_files:
-            i += 1
-            j = len(pdf_files)
-            logging.info("Loading PDF %s of %s: %s", i, j, pdf_file)
-            loader = PyPDFLoader(file_path=pdf_file)
-            docs = loader.load()
-            k = 0
-            for doc in docs:
-                k += 1
-                logging.info(k * "-", end="\r")
-                documents = self.text_splitter.create_documents([doc.page_content])
-                document_texts = [doc.page_content for doc in documents]
-                embeddings = self.openai_embeddings.embed_documents(document_texts)
-                self.vector_store.add_documents(documents=documents, embeddings=embeddings)
-
-        logging.info("Finished loading PDFs")
+        """Pdf loader."""
+        self.pinecone.pdf_loader(filepath=filepath)
 
     def rag(self, human_message: Union[str, HumanMessage]):
         """
@@ -241,10 +148,8 @@ def rag(self, human_message: Union[str, HumanMessage]):
         # ---------------------------------------------------------------------
         # 1.) Retrieve relevant documents from Pinecone vector database
         # ---------------------------------------------------------------------
-        retriever = PineconeHybridSearchRetriever(
-            embeddings=self.openai_embeddings, sparse_encoder=self.bm25_encoder, index=self.pinecone_index
-        )
-        documents = retriever.get_relevant_documents(query=human_message.content)
+        # documents = self.retriever.get_relevant_documents(query=human_message.content)
+        documents = self.pinecone.vector_store.similarity_search(query=human_message.content)
 
         # Extract the text from the documents
         document_texts = [doc.page_content for doc in documents]
@@ -261,14 +166,15 @@ def rag(self, human_message: Union[str, HumanMessage]):
         # finished with hybrid search setup
         # ---------------------------------------------------------------------
 
-        # 2.) get a response from the chat model
-        response = self.cached_chat_request(system_message=system_message, human_message=human_message)
-
         logging.debug("------------------------------------------------------")
         logging.debug("rag() Retrieval Augmented Generation prompt")
         logging.debug("Diagnostic information:")
         logging.debug("  Retrieved %i related documents from Pinecone", len(documents))
         logging.debug("  System messages contains %i words", len(system_message.content.split()))
         logging.debug("  Prompt: %s", system_message.content)
         logging.debug("------------------------------------------------------")
+
+        # 2.) get a response from the chat model
+        response = self.cached_chat_request(system_message=system_message, human_message=human_message)
+
         return response.content
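
`models/pinecone.py` itself is presumably among the 12 changed files but is not shown in this view. Its surface can be read off the hunks above; the sketch below records only that inferred interface (names come from the diff, the stub bodies are placeholders, not the PR's actual implementation):

```python
# Inferred interface of the new PineconeIndex class, based solely on the
# members this diff references. Method bodies are intentionally omitted.
import pinecone
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.pinecone import Pinecone


class PineconeIndex:
    """Owns the Pinecone index plus the embeddings and vector store built on it."""

    index_name: str                      # printed by models/examples/pinecone_init.py
    index: pinecone.Index                # passed to PineconeHybridSearchRetriever
    openai_embeddings: OpenAIEmbeddings  # embeddings for the hybrid retriever
    vector_store: Pinecone               # used by rag() via similarity_search()

    def initialize(self) -> None:
        """Create or reset the index; called by models/examples/pinecone_init.py."""

    def pdf_loader(self, filepath: str) -> None:
        """Load, embed, and store PDFs; replaces the old HybridSearchRetriever.load() body."""
```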
