Skip to content

Commit fbc4b20

Browse files
authored
Merge pull request #17 from lpm0073/next
HybridSearchRetriever refactoring
2 parents 77e79d5 + 48d12ab commit fbc4b20

File tree

5 files changed

+127
-43
lines changed

5 files changed

+127
-43
lines changed

Makefile

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,19 @@ SHELL := /bin/bash
33
ifneq ("$(wildcard .env)","")
44
include .env
55
else
6-
$(shell echo -e "OPENAI_API_ORGANIZATION=PLEASE-ADD-ME\nOPENAI_API_KEY=PLEASE-ADD-ME\nPINECONE_API_KEY=PLEASE-ADD-ME\nPINECONE_ENVIRONMENT=gcp-starter\nPINECONE_INDEX_NAME=hsr\nOPENAI_CHAT_MODEL_NAME=gpt-3.5-turbo\nOPENAI_PROMPT_MODEL_NAME=text-davinci-003\nOPENAI_CHAT_TEMPERATURE=0.0\nOPENAI_CHAT_MAX_RETRIES=3\nDEBUG_MODE=True\n" >> .env)
6+
$(shell echo -e "OPENAI_API_ORGANIZATION=PLEASE-ADD-ME\n\
7+
OPENAI_API_KEY=PLEASE-ADD-ME\n\
8+
PINECONE_API_KEY=PLEASE-ADD-ME\n\
9+
PINECONE_ENVIRONMENT=gcp-starter\n\
10+
PINECONE_INDEX_NAME=hsr\n\
11+
PINECONE_VECTORSTORE_TEXT_KEY=lc_id\n\
12+
PINECONE_METRIC=dotproduct\n\
13+
PINECONE_DIMENSIONS=1536\n\
14+
OPENAI_CHAT_MODEL_NAME=gpt-3.5-turbo\n\
15+
OPENAI_PROMPT_MODEL_NAME=text-davinci-003\n\
16+
OPENAI_CHAT_TEMPERATURE=0.0\n\
17+
OPENAI_CHAT_MAX_RETRIES=3\n\
18+
DEBUG_MODE=True\n" >> .env)
719
endif
820

921
.PHONY: analyze init activate test lint clean

models/const.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@
1616
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
1717
PINECONE_ENVIRONMENT = os.environ["PINECONE_ENVIRONMENT"]
1818
PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME", "hsr")
19+
PINECONE_VECTORSTORE_TEXT_KEY = os.environ.get("PINECONE_VECTORSTORE_TEXT_KEY", "lc_id")
20+
PINECONE_METRIC = os.environ.get("PINECONE_METRIC", "dotproduct")
21+
PINECONE_DIMENSIONS = int(os.environ.get("PINECONE_DIMENSIONS", 1536))
1922
OPENAI_CHAT_MODEL_NAME = os.environ.get("OPENAI_CHAT_MODEL_NAME", "gpt-3.5-turbo")
2023
OPENAI_PROMPT_MODEL_NAME = os.environ.get("OPENAI_PROMPT_MODEL_NAME", "text-davinci-003")
2124
OPENAI_CHAT_TEMPERATURE = float(os.environ.get("OPENAI_CHAT_TEMPERATURE", 0.0))
@@ -26,7 +29,16 @@
2629
raise FileNotFoundError("No .env file found in root directory of repository")
2730

2831

29-
class Config:
32+
class ReadOnly(type):
    """Metaclass to make all class attributes read-only.

    Attributes that already exist in the class's own ``__dict__`` can be
    neither rebound nor deleted through the class. Names that are not yet
    present may still be assigned once; after that they are protected too.

    Raises:
        TypeError: on an attempt to rebind or delete an existing attribute.
    """

    def __setattr__(cls, name, value):
        # Only names already stored on this class are frozen; first-time
        # assignment of a new name is passed through to ``type``.
        if name in cls.__dict__:
            raise TypeError(f"Cannot change a read-only attribute {name}")
        super().__setattr__(name, value)

    def __delattr__(cls, name):
        # Without this guard, ``del Cls.X`` followed by ``Cls.X = value``
        # would bypass the ``__setattr__`` check above.
        if name in cls.__dict__:
            raise TypeError(f"Cannot delete a read-only attribute {name}")
        super().__delattr__(name)
39+
40+
41+
class Config(metaclass=ReadOnly):
3042
"""Configuration parameters."""
3143

3244
DEBUG_MODE: bool = DEBUG_MODE
@@ -35,13 +47,16 @@ class Config:
3547
OPENAI_CHAT_TEMPERATURE: float = OPENAI_CHAT_TEMPERATURE
3648
OPENAI_CHAT_MAX_RETRIES: int = OPENAI_CHAT_MAX_RETRIES
3749
OPENAI_CHAT_CACHE: bool = OPENAI_CHAT_CACHE
50+
PINECONE_ENVIRONMENT = PINECONE_ENVIRONMENT
51+
PINECONE_INDEX_NAME = PINECONE_INDEX_NAME
52+
PINECONE_VECTORSTORE_TEXT_KEY: str = PINECONE_VECTORSTORE_TEXT_KEY
53+
PINECONE_METRIC: str = PINECONE_METRIC
54+
PINECONE_DIMENSIONS: int = PINECONE_DIMENSIONS
3855

3956

40-
class Credentials:
57+
class Credentials(metaclass=ReadOnly):
4158
"""Credentials."""
4259

4360
OPENAI_API_KEY = OPENAI_API_KEY
4461
OPENAI_API_ORGANIZATION = OPENAI_API_ORGANIZATION
4562
PINECONE_API_KEY = PINECONE_API_KEY
46-
PINECONE_ENVIRONMENT = PINECONE_ENVIRONMENT
47-
PINECONE_INDEX_NAME = PINECONE_INDEX_NAME

models/hybrid_search_retreiver.py

Lines changed: 87 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -53,9 +53,6 @@
5353
###############################################################################
5454
# initializations
5555
###############################################################################
56-
DEFAULT_MODEL_NAME = Config.OPENAI_PROMPT_MODEL_NAME
57-
pinecone.init(api_key=Credentials.PINECONE_API_KEY, environment=Credentials.PINECONE_ENVIRONMENT)
58-
set_llm_cache(InMemoryCache())
5956
logging.basicConfig(level=logging.DEBUG if Config.DEBUG_MODE else logging.INFO)
6057

6158

@@ -78,25 +75,74 @@ def create_documents(self, texts):
7875
class HybridSearchRetriever:
7976
"""Hybrid Search Retriever (OpenAI + Pinecone)"""
8077

78+
_chat: ChatOpenAI = None
79+
_openai_embeddings: OpenAIEmbeddings = None
80+
_pinecone_index: pinecone.Index = None
81+
_vector_store: Pinecone = None
82+
_text_splitter: TextSplitter = None
83+
_b25_encoder: BM25Encoder = None
84+
85+
def __init__(self):
    """Initialize the Pinecone client and install an in-memory LLM cache."""
    pinecone.init(
        api_key=Credentials.PINECONE_API_KEY,
        environment=Config.PINECONE_ENVIRONMENT,
    )
    llm_cache = InMemoryCache()
    set_llm_cache(llm_cache)
89+
8190
# prompting wrapper
82-
chat = ChatOpenAI(
83-
api_key=Credentials.OPENAI_API_KEY,
84-
organization=Credentials.OPENAI_API_ORGANIZATION,
85-
cache=Config.OPENAI_CHAT_CACHE,
86-
max_retries=Config.OPENAI_CHAT_MAX_RETRIES,
87-
model=Config.OPENAI_CHAT_MODEL_NAME,
88-
temperature=Config.OPENAI_CHAT_TEMPERATURE,
89-
)
91+
@property
def chat(self) -> ChatOpenAI:
    """Return the ChatOpenAI client, building it on first access.

    The instance is created once from ``Credentials`` and ``Config`` and
    then stored on ``self._chat`` for every later access.
    """
    if self._chat is not None:
        return self._chat

    chat_kwargs = {
        "api_key": Credentials.OPENAI_API_KEY,
        "organization": Credentials.OPENAI_API_ORGANIZATION,
        "cache": Config.OPENAI_CHAT_CACHE,
        "max_retries": Config.OPENAI_CHAT_MAX_RETRIES,
        "model": Config.OPENAI_CHAT_MODEL_NAME,
        "temperature": Config.OPENAI_CHAT_TEMPERATURE,
    }
    self._chat = ChatOpenAI(**chat_kwargs)
    return self._chat
90104

91105
# embeddings
92-
openai_embeddings = OpenAIEmbeddings(
93-
api_key=Credentials.OPENAI_API_KEY, organization=Credentials.OPENAI_API_ORGANIZATION
94-
)
95-
pinecone_index = pinecone.Index(index_name=Credentials.PINECONE_INDEX_NAME)
96-
vector_store = Pinecone(index=pinecone_index, embedding=openai_embeddings, text_key="lc_id")
97-
98-
text_splitter = TextSplitter()
99-
bm25_encoder = BM25Encoder().default()
106+
@property
def openai_embeddings(self) -> OpenAIEmbeddings:
    """Return the OpenAIEmbeddings client, building it on first access."""
    if self._openai_embeddings is not None:
        return self._openai_embeddings

    self._openai_embeddings = OpenAIEmbeddings(
        api_key=Credentials.OPENAI_API_KEY,
        organization=Credentials.OPENAI_API_ORGANIZATION,
    )
    return self._openai_embeddings
114+
115+
@property
def pinecone_index(self) -> pinecone.Index:
    """Return the pinecone.Index for the configured index name, built on first access."""
    index = self._pinecone_index
    if index is None:
        index = pinecone.Index(index_name=Config.PINECONE_INDEX_NAME)
        self._pinecone_index = index
    return index
121+
122+
@property
def vector_store(self) -> Pinecone:
    """Return the Pinecone vector store, building it on first access.

    The store is assembled from this object's ``pinecone_index`` and
    ``openai_embeddings`` properties plus the configured text key.
    """
    if self._vector_store is not None:
        return self._vector_store

    # Resolve the dependencies in the same order as the keyword arguments.
    index = self.pinecone_index
    embedding = self.openai_embeddings
    self._vector_store = Pinecone(
        index=index,
        embedding=embedding,
        text_key=Config.PINECONE_VECTORSTORE_TEXT_KEY,
    )
    return self._vector_store
132+
133+
@property
def text_splitter(self) -> TextSplitter:
    """Return the TextSplitter, building it on first access."""
    splitter = self._text_splitter
    if splitter is None:
        splitter = TextSplitter()
        self._text_splitter = splitter
    return splitter
139+
140+
@property
def bm25_encoder(self) -> BM25Encoder:
    """Return the BM25Encoder from ``BM25Encoder().default()``, built on first access."""
    # NOTE(review): the backing field is spelled ``_b25_encoder`` (no "m");
    # it is kept as-is here because it must match the name declared on the class.
    if self._b25_encoder is not None:
        return self._b25_encoder

    self._b25_encoder = BM25Encoder().default()
    return self._b25_encoder
100146

101147
def cached_chat_request(
102148
self, system_message: Union[str, SystemMessage], human_message: Union[str, HumanMessage]
@@ -114,7 +160,9 @@ def cached_chat_request(
114160
retval = self.chat(messages)
115161
return retval
116162

117-
def prompt_with_template(self, prompt: PromptTemplate, concept: str, model: str = DEFAULT_MODEL_NAME) -> str:
163+
def prompt_with_template(
164+
self, prompt: PromptTemplate, concept: str, model: str = Config.OPENAI_PROMPT_MODEL_NAME
165+
) -> str:
118166
"""Prompt with template."""
119167
llm = OpenAI(model=model)
120168
retval = llm(prompt.format(concept=concept))
@@ -135,17 +183,20 @@ def load(self, filepath: str):
135183
"""
136184
try:
137185
logging.debug("Deleting index...")
138-
pinecone.delete_index(Credentials.PINECONE_INDEX_NAME)
186+
pinecone.delete_index(Config.PINECONE_INDEX_NAME)
139187
except pinecone.exceptions.PineconeException:
140188
logging.debug("Index does not exist. Continuing...")
141189

142190
metadata_config = {
143-
"indexed": ["lc_id", "lc_type"],
191+
"indexed": [Config.PINECONE_VECTORSTORE_TEXT_KEY, "lc_type"],
144192
"context": ["lc_text"],
145193
}
146194
logging.debug("Creating index. This may take a few minutes...")
147195
pinecone.create_index(
148-
Credentials.PINECONE_INDEX_NAME, dimension=1536, metric="dotproduct", metadata_config=metadata_config
196+
Config.PINECONE_INDEX_NAME,
197+
dimension=Config.PINECONE_DIMENSIONS,
198+
metric=Config.PINECONE_METRIC,
199+
metadata_config=metadata_config,
149200
)
150201

151202
pdf_files = glob.glob(os.path.join(filepath, "*.pdf"))
@@ -187,11 +238,13 @@ def rag(self, human_message: Union[str, HumanMessage]):
187238
logging.debug("Converting human_message to HumanMessage")
188239
human_message = HumanMessage(content=human_message)
189240

241+
# ---------------------------------------------------------------------
242+
# 1.) Retrieve relevant documents from Pinecone vector database
243+
# ---------------------------------------------------------------------
190244
retriever = PineconeHybridSearchRetriever(
191245
embeddings=self.openai_embeddings, sparse_encoder=self.bm25_encoder, index=self.pinecone_index
192246
)
193247
documents = retriever.get_relevant_documents(query=human_message.content)
194-
logging.debug("Retrieved %i related documents from Pinecone", len(documents))
195248

196249
# Extract the text from the documents
197250
document_texts = [doc.page_content for doc in documents]
@@ -202,13 +255,19 @@ def rag(self, human_message: Union[str, HumanMessage]):
202255
into your responses:\n\n
203256
"""
204257
)
205-
system_message = f"{leader} {'. '.join(document_texts)}"
258+
system_message_content = f"{leader} {'. '.join(document_texts)}"
259+
system_message = SystemMessage(content=system_message_content)
260+
# ---------------------------------------------------------------------
261+
# finished with hybrid search setup
262+
# ---------------------------------------------------------------------
206263

207-
logging.debug("System messages contains %i words", len(system_message.split()))
208-
logging.debug("Prompt: %s", system_message)
209-
system_message = SystemMessage(content=system_message)
264+
# 2.) get a response from the chat model
210265
response = self.cached_chat_request(system_message=system_message, human_message=human_message)
211266

267+
logging.debug("------------------------------------------------------")
268+
logging.debug("Retrieved %i related documents from Pinecone", len(documents))
269+
logging.debug("System messages contains %i words", len(system_message.content.split()))
270+
logging.debug("Prompt: %s", system_message.content)
212271
logging.debug("Response:")
213272
logging.debug("------------------------------------------------------")
214273
return response.content

models/tests/test_pinecone.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from langchain.embeddings import OpenAIEmbeddings
1010
from langchain.vectorstores.pinecone import Pinecone
1111

12-
from ..const import Credentials
12+
from ..const import Config, Credentials
1313

1414

1515
class TestPinecone:
@@ -19,22 +19,20 @@ def test_01_test_pinecone_connectivity(self):
1919
"""Ensure that we have connectivity to Pinecone."""
2020
# pylint: disable=broad-except
2121
try:
22-
pinecone.init(api_key=Credentials.PINECONE_API_KEY, environment=Credentials.PINECONE_ENVIRONMENT)
22+
pinecone.init(api_key=Credentials.PINECONE_API_KEY, environment=Config.PINECONE_ENVIRONMENT)
2323
except Exception as e:
2424
assert False, f"pinecone.init() failed with exception: {e}"
2525

2626
def test_02_test_pinecone_index(self):
    """Ensure that the Pinecone index exists and that we can connect to it.

    Raises:
        AssertionError: if ``Pinecone.from_existing_index`` raises for the
            configured index name.
    """
    pinecone.init(api_key=Credentials.PINECONE_API_KEY, environment=Config.PINECONE_ENVIRONMENT)
    openai_embedding = OpenAIEmbeddings()

    # pylint: disable=broad-except
    try:
        Pinecone.from_existing_index(
            Config.PINECONE_INDEX_NAME,
            embedding=openai_embedding,
        )
    except Exception as e:
        # Raise explicitly instead of ``assert False`` so the failure survives
        # ``python -O``. The index name is interpolated without a trailing
        # comma, which previously rendered it as a one-element tuple.
        raise AssertionError(
            f"Pinecone initialization of index {Config.PINECONE_INDEX_NAME} failed with exception: {e}"
        ) from e

models/yt.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@
3737
# 8.) LangChain agents
3838
from langchain_experimental.agents.agent_toolkits.python.base import create_python_agent
3939

40+
from models.const import Config, Credentials
41+
4042

4143
# Load environment variables from .env file in all folders
4244
# pylint: disable=duplicate-code
@@ -45,8 +47,6 @@
4547
load_dotenv(dotenv_path=dotenv_path, verbose=True)
4648
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
4749
OPENAI_API_ORGANIZATION = os.environ["OPENAI_API_ORGANIZATION"]
48-
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
49-
PINECONE_ENVIRONMENT = os.environ["PINECONE_ENVIRONMENT"]
5050
else:
5151
raise FileNotFoundError("No .env file found in root directory of repository")
5252

@@ -66,7 +66,7 @@ class LangChainDev:
6666
tool=PythonREPL(),
6767
verbose=True,
6868
)
69-
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT) # minute 10:43
69+
pinecone.init(api_key=Credentials.PINECONE_API_KEY, environment=Config.PINECONE_ENVIRONMENT) # minute 10:43
7070

7171
# LLM wrappers. minute 5:46
7272
def test_01_basic(self):

0 commit comments

Comments
 (0)