Skip to content

Commit 789455a

Browse files
authored
Merge pull request #8 from lpm0073/next
switch vector store to sparse text using BM25Encoder
2 parents 0139c0f + 0160ce2 commit 789455a

File tree

9 files changed

+188
-70
lines changed

9 files changed

+188
-70
lines changed

CHANGELOG.md

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,13 @@
11
# [1.1.0](https://github.com/lpm0073/netec-llm/compare/v1.0.0...v1.1.0) (2023-12-01)
22

3-
43
### Bug Fixes
54

6-
* fix load problem with existing index ([62cd18f](https://github.com/lpm0073/netec-llm/commit/62cd18f8088873a794ec363c4e18770dfdc41ea5))
7-
5+
- fix load problem with existing index ([62cd18f](https://github.com/lpm0073/netec-llm/commit/62cd18f8088873a794ec363c4e18770dfdc41ea5))
86

97
### Features
108

11-
* perfect load(). revert rag() to openai only calls ([8de793d](https://github.com/lpm0073/netec-llm/commit/8de793dcca77ec23f09e68ca9e8dba7f64623b3c))
12-
* ssm.rag() w load, split, embed, store ([2335d22](https://github.com/lpm0073/netec-llm/commit/2335d22c5fd9092642ff1eb67a34fbcd712d7f9b))
9+
- perfect load(). revert rag() to openai only calls ([8de793d](https://github.com/lpm0073/netec-llm/commit/8de793dcca77ec23f09e68ca9e8dba7f64623b3c))
10+
- ssm.rag() w load, split, embed, store ([2335d22](https://github.com/lpm0073/netec-llm/commit/2335d22c5fd9092642ff1eb67a34fbcd712d7f9b))
1311

1412
# 1.0.0 (2023-11-30)
1513

README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,16 @@ export PINECONE_API_KEY=SET-ME-PLEASE
5454
export PINECONE_ENVIRONMENT=SET-ME-PLEASE
5555
```
5656

57+
### Pinecone setup
58+
59+
You'll need to manually create an index with the following characteristics:
60+
61+
- Environment: gcp-starter
62+
- Index name: netec-rag
63+
- Metric: dotproduct
64+
- Dimensions: 1536
65+
- Pod Type: starter
66+
5767
## Contributing
5868

5969
This project uses a mostly automated pull request and unit testing process. See the resources in .github for additional details. You should also ensure that pre-commit is installed and working correctly on your dev machine by running the following command from the root of the repo.

models/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
# -*- coding: utf-8 -*-
2-
__version__ = "1.1.0"
2+
__version__ = "1.1.1"

models/ssm.py

Lines changed: 101 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@
44
Sales Support Model (SSM) for the LangChain project.
55
See: https://python.langchain.com/docs/modules/model_io/llms/llm_caching
66
https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf
7+
https://python.langchain.com/docs/integrations/retrievers/pinecone_hybrid_search
78
"""
89

910
import glob
1011
import os
12+
import textwrap
1113
from typing import List # ClassVar
1214

1315
# pinecone integration
@@ -27,9 +29,11 @@
2729
from langchain.globals import set_llm_cache
2830
from langchain.llms.openai import OpenAI
2931
from langchain.prompts import PromptTemplate
32+
from langchain.retrievers import PineconeHybridSearchRetriever
3033
from langchain.schema import HumanMessage, SystemMessage
31-
from langchain.text_splitter import Document, RecursiveCharacterTextSplitter
34+
from langchain.text_splitter import Document
3235
from langchain.vectorstores.pinecone import Pinecone
36+
from pinecone_text.sparse import BM25Encoder
3337

3438
# this project
3539
from models.const import Credentials
@@ -46,6 +50,24 @@
4650
set_llm_cache(InMemoryCache())
4751

4852

53+
class TextSplitter:
54+
"""
55+
Custom text splitter that adds metadata to the Document object
56+
which is required by PineconeHybridSearchRetriever.
57+
"""
58+
59+
# ...
60+
61+
def create_documents(self, texts):
62+
"""Create documents"""
63+
documents = []
64+
for text in texts:
65+
# Create a Document object with the text and metadata
66+
document = Document(page_content=text, metadata={"context": text})
67+
documents.append(document)
68+
return documents
69+
70+
4971
class SalesSupportModel:
5072
"""Sales Support Model (SSM)."""
5173

@@ -60,15 +82,14 @@ class SalesSupportModel:
6082
)
6183

6284
# embeddings
63-
text_splitter = RecursiveCharacterTextSplitter(
64-
chunk_size=100,
65-
chunk_overlap=0,
66-
)
67-
openai_embedding = OpenAIEmbeddings()
68-
pinecone_index = Pinecone.from_existing_index(
69-
Credentials.PINECONE_INDEX_NAME,
70-
embedding=openai_embedding,
85+
openai_embeddings = OpenAIEmbeddings(
86+
api_key=Credentials.OPENAI_API_KEY, organization=Credentials.OPENAI_API_ORGANIZATION
7187
)
88+
pinecone_index = pinecone.Index(index_name=Credentials.PINECONE_INDEX_NAME)
89+
vector_store = Pinecone(index=pinecone_index, embedding=openai_embeddings, text_key="lc_id")
90+
91+
text_splitter = TextSplitter()
92+
bm25_encoder = BM25Encoder().default()
7293

7394
def cached_chat_request(self, system_message: str, human_message: str) -> SystemMessage:
7495
"""Cached chat request."""
@@ -86,24 +107,54 @@ def prompt_with_template(self, prompt: PromptTemplate, concept: str, model: str
86107
retval = llm(prompt.format(concept=concept))
87108
return retval
88109

89-
# FIX NOTE: DEPRECATED
90110
def split_text(self, text: str) -> List[Document]:
91-
"""Split text."""
92-
text_splitter = RecursiveCharacterTextSplitter(
93-
chunk_size=100,
94-
chunk_overlap=0,
95-
)
96-
retval = text_splitter.create_documents([text])
111+
"""Split text. Leaving this here for now, since it exposes the return type."""
112+
retval = self.text_splitter.create_documents([text])
97113
return retval
98114

115+
def fit_tf_idf_values(self, corpus: List[str]):
116+
"""Fit TF-IDF values.
117+
1. Fit the BM25 encoder on the corpus
118+
2. Encode the corpus
119+
3. Store the encoded corpus in Pinecone
120+
"""
121+
corpus = ["foo", "bar", "world", "hello"]
122+
123+
# fit tf-idf values on your corpus
124+
self.bm25_encoder.fit(corpus)
125+
126+
# persist the values to a json file
127+
self.bm25_encoder.dump("bm25_values.json")
128+
self.bm25_encoder = BM25Encoder().load("bm25_values.json")
129+
self.bm25_encoder.fit(corpus)
130+
99131
def load(self, filepath: str):
100132
"""
101133
Embed PDF.
102134
1. Load PDF document text data
103135
2. Split into pages
104136
3. Embed each page
105137
4. Store in Pinecone
138+
139+
Note: it's important to make sure that the "context" field that holds the document text
140+
in the metadata is not indexed. Currently you need to specify explicitly the fields you
141+
do want to index. For more information, check out
142+
https://docs.pinecone.io/docs/manage-indexes#selective-metadata-indexing
106143
"""
144+
try:
145+
print("Deleting index...")
146+
pinecone.delete_index(Credentials.PINECONE_INDEX_NAME)
147+
except pinecone.exceptions.PineconeException:
148+
print("Index does not exist. Continuing...")
149+
150+
metadata_config = {
151+
"indexed": ["lc_id", "lc_type"],
152+
"context": ["lc_text"],
153+
}
154+
print("Creating index. This may take a few minutes...")
155+
pinecone.create_index(
156+
Credentials.PINECONE_INDEX_NAME, dimension=1536, metric="dotproduct", metadata_config=metadata_config
157+
)
107158

108159
pdf_files = glob.glob(os.path.join(filepath, "*.pdf"))
109160
i = 0
@@ -117,12 +168,10 @@ def load(self, filepath: str):
117168
for doc in docs:
118169
k += 1
119170
print(k * "-", end="\r")
120-
texts_splitter_results = self.text_splitter.create_documents([doc.page_content])
121-
self.pinecone_index.from_existing_index(
122-
index_name=Credentials.PINECONE_INDEX_NAME,
123-
embedding=self.openai_embedding,
124-
text_key=texts_splitter_results,
125-
)
171+
documents = self.text_splitter.create_documents([doc.page_content])
172+
document_texts = [doc.page_content for doc in documents]
173+
embeddings = self.openai_embeddings.embed_documents(document_texts)
174+
self.vector_store.add_documents(documents=documents, embeddings=embeddings)
126175

127176
print("Finished loading PDFs")
128177

@@ -133,26 +182,42 @@ def rag(self, prompt: str):
133182
from storage using a Retriever.
134183
2. Generate: A ChatModel / LLM produces an answer using a prompt that includes
135184
the question and the retrieved data
136-
"""
137185
138-
# pylint: disable=unused-variable
139-
def format_docs(docs):
140-
"""Format docs."""
141-
return "\n\n".join(doc.page_content for doc in docs)
186+
To prompt OpenAI's GPT-3 model to consider the embeddings from the Pinecone
187+
vector database, you would typically need to convert the embeddings back
188+
into a format that GPT-3 can understand, such as text. However, GPT-3 does
189+
not natively support direct input of embeddings.
142190
143-
retriever = self.pinecone_index.as_retriever()
144-
145-
# Use the retriever to get relevant documents
191+
The typical workflow is to use the embeddings to retrieve relevant documents,
192+
and then use the text of these documents as part of the prompt for GPT-3.
193+
"""
194+
retriever = PineconeHybridSearchRetriever(
195+
embeddings=self.openai_embeddings, sparse_encoder=self.bm25_encoder, index=self.pinecone_index
196+
)
146197
documents = retriever.get_relevant_documents(query=prompt)
147198
print(f"Retrieved {len(documents)} related documents from Pinecone")
148199

149-
# Generate a prompt from the retrieved documents
150-
prompt += " ".join(doc.page_content for doc in documents)
151-
print(f"Prompt contains {len(prompt.split())} words")
152-
print("Prompt:", prompt)
153-
print(doc for doc in documents)
200+
# Extract the text from the documents
201+
document_texts = [doc.page_content for doc in documents]
202+
leader = textwrap.dedent(
203+
"""\
204+
You can assume that the following is true,
205+
and you should attempt to incorporate these facts
206+
in your response:
207+
"""
208+
)
209+
210+
# Create a prompt that includes the document texts
211+
prompt_with_relevant_documents = f"{prompt + leader} {'. '.join(document_texts)}"
212+
213+
print(f"Prompt contains {len(prompt_with_relevant_documents.split())} words")
214+
print("Prompt:", prompt_with_relevant_documents)
154215

155216
# Get a response from the GPT-3.5-turbo model
156-
response = self.cached_chat_request(system_message="You are a helpful assistant.", human_message=prompt)
217+
response = self.cached_chat_request(
218+
system_message="You are a helpful assistant.", human_message=prompt_with_relevant_documents
219+
)
157220

221+
print("Response:")
222+
print("------------------------------------------------------")
158223
return response

models/tests/test_openai.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# -*- coding: utf-8 -*-
2+
# flake8: noqa: F401
3+
# pylint: disable=too-few-public-methods
4+
"""
5+
Test integrity of base class.
6+
"""
7+
import pytest # pylint: disable=unused-import
8+
9+
from ..ssm import SalesSupportModel
10+
11+
12+
class TestOpenAI:
13+
"""Test SalesSupportModel class."""
14+
15+
def test_03_test_openai_connectivity(self):
16+
"""Ensure that we have connectivity to OpenAI."""
17+
18+
ssm = SalesSupportModel()
19+
retval = ssm.cached_chat_request(
20+
"your are a helpful assistant", "please return the value 'CORRECT' in all upper case."
21+
)
22+
assert retval == "CORRECT"

models/tests/test_pinecone.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# -*- coding: utf-8 -*-
2+
# flake8: noqa: F401
3+
"""
4+
Test integrity of base class.
5+
"""
6+
7+
import pinecone
8+
import pytest # pylint: disable=unused-import
9+
from langchain.embeddings import OpenAIEmbeddings
10+
from langchain.vectorstores.pinecone import Pinecone
11+
12+
from ..const import Credentials
13+
14+
15+
class TestPinecone:
16+
"""Test Pinecone connectivity and index availability."""
17+
18+
def test_01_test_pinecone_connectivity(self):
19+
"""Ensure that we have connectivity to Pinecone."""
20+
# pylint: disable=broad-except
21+
try:
22+
pinecone.init(api_key=Credentials.PINECONE_API_KEY, environment=Credentials.PINECONE_ENVIRONMENT)
23+
except Exception as e:
24+
assert False, f"pinecone.init() failed with exception: {e}"
25+
26+
def test_02_test_pinecone_index(self):
27+
"""Ensure that the Pinecone index exists and that we can connect to it."""
28+
pinecone.init(api_key=Credentials.PINECONE_API_KEY, environment=Credentials.PINECONE_ENVIRONMENT)
29+
openai_embedding = OpenAIEmbeddings()
30+
31+
# pylint: disable=broad-except
32+
try:
33+
Pinecone.from_existing_index(
34+
Credentials.PINECONE_INDEX_NAME,
35+
embedding=openai_embedding,
36+
)
37+
except Exception as e:
38+
assert (
39+
False
40+
), f"Pinecone initialization of index {Credentials.PINECONE_INDEX_NAME,} failed with exception: {e}"

models/tests/test_prompts.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
# -*- coding: utf-8 -*-
22
# flake8: noqa: F401
3-
# pylint: disable=too-few-public-methods
43
"""
54
Test integrity of base class.
65
"""

models/tests/test_ssm.py

Lines changed: 10 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,14 @@
11
# -*- coding: utf-8 -*-
22
# flake8: noqa: F401
3-
# pylint: disable=too-few-public-methods
43
"""
54
Test integrity of base class.
65
"""
7-
import pinecone
86
import pytest # pylint: disable=unused-import
97
from langchain.chat_models import ChatOpenAI
108
from langchain.embeddings import OpenAIEmbeddings
11-
from langchain.text_splitter import RecursiveCharacterTextSplitter
12-
from langchain.vectorstores.pinecone import Pinecone
9+
from pinecone import Index
1310

14-
from ..const import Credentials
15-
from ..ssm import SalesSupportModel
11+
from models.ssm import SalesSupportModel, TextSplitter
1612

1713

1814
class TestSalesSupportModel:
@@ -21,30 +17,17 @@ class TestSalesSupportModel:
2117
def test_01_basic(self):
2218
"""Ensure that we can instantiate the class."""
2319

24-
SalesSupportModel()
20+
# pylint: disable=broad-except
21+
try:
22+
SalesSupportModel()
23+
except Exception as e:
24+
assert False, f"initialization of SalesSupportModel() failed with exception: {e}"
2525

2626
def test_02_class_aatribute_types(self):
2727
"""ensure that class attributes are of the correct type"""
2828

2929
ssm = SalesSupportModel()
3030
assert isinstance(ssm.chat, ChatOpenAI)
31-
assert isinstance(ssm.pinecone_index, Pinecone)
32-
assert isinstance(ssm.text_splitter, RecursiveCharacterTextSplitter)
33-
assert isinstance(ssm.openai_embedding, OpenAIEmbeddings)
34-
35-
def test_03_test_openai_connectivity(self):
36-
"""Ensure that we have connectivity to OpenAI."""
37-
38-
ssm = SalesSupportModel()
39-
retval = ssm.cached_chat_request(
40-
"your are a helpful assistant", "please return the value 'CORRECT' in all upper case."
41-
)
42-
assert retval == "CORRECT"
43-
44-
def test_04_test_pinecone_connectivity(self):
45-
"""Ensure that we have connectivity to Pinecone."""
46-
# pylint: disable=broad-except
47-
try:
48-
pinecone.init(api_key=Credentials.PINECONE_API_KEY, environment=Credentials.PINECONE_ENVIRONMENT)
49-
except Exception as e:
50-
assert False, f"pinecone.init() failed with exception: {e}"
31+
assert isinstance(ssm.pinecone_index, Index)
32+
assert isinstance(ssm.text_splitter, TextSplitter)
33+
assert isinstance(ssm.openai_embeddings, OpenAIEmbeddings)

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ langchain==0.0.343
2323
langchainhub==0.1.14
2424
openai==1.3.5
2525
pinecone-client==2.2.4
26+
pinecone-text==0.7.0
2627
pydantic==2.5.2
2728
pypdf==3.17.1
2829
python-dotenv==1.0.0

0 commit comments

Comments
 (0)