
Commit a4e9c1c

ML/LangChain: Fix notebooks by following upstream changes

1 parent a2761fe

4 files changed: +11 −12 lines

topic/machine-learning/llm-langchain/document_loader.ipynb

Lines changed: 2 additions & 2 deletions

@@ -159,8 +159,8 @@
 "loader = CrateDBLoader(\n",
 " 'SELECT * FROM mlb_teams_2012 ORDER BY \"Team\" LIMIT 5;',\n",
 " db=db,\n",
-" page_content_columns=[\"Team\"],\n",
-" metadata_columns=[\"Payroll (millions)\"],\n",
+" page_content_mapper=lambda row: row[\"Team\"],\n",
+" metadata_mapper=lambda row: {\"Payroll (millions)\": row[\"Payroll (millions)\"]},\n",
 ")\n",
 "documents = loader.load()"
 ]
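
For orientation, a hedged sketch of how the updated loader call could look as a standalone snippet. The import path and the SQLDatabase setup are assumptions (the hunk only shows the loader invocation); the query and the mapper keyword arguments mirror the changed cell.

# Sketch only: imports and connection setup are assumptions, not part of the diff.
from langchain_community.utilities import SQLDatabase
from langchain_cratedb import CrateDBLoader  # assumed import path; match the notebook's import

# Hypothetical connection string; requires the CrateDB SQLAlchemy dialect.
db = SQLDatabase.from_uri("crate://crate@localhost:4200")

loader = CrateDBLoader(
    'SELECT * FROM mlb_teams_2012 ORDER BY "Team" LIMIT 5;',
    db=db,
    # The upstream API now accepts row-mapping callables instead of column-name lists.
    page_content_mapper=lambda row: row["Team"],
    metadata_mapper=lambda row: {"Payroll (millions)": row["Payroll (millions)"]},
)
documents = loader.load()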

topic/machine-learning/llm-langchain/requirements-dev.txt

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 # Real.
 cratedb-toolkit[io]
-pueblo[notebook,testing]==0.0.9
+pueblo[notebook,testing]

 # Development.
 # cratedb-toolkit[io] @ git+https://github.com/crate-workbench/cratedb-toolkit.git@main

topic/machine-learning/llm-langchain/requirements.txt

Lines changed: 1 addition & 1 deletion

@@ -4,7 +4,7 @@ google-cloud-aiplatform<2
 langchain-google-vertexai<3
 langchain-openai<0.3
 langchain-text-splitters<0.4
-pueblo[cli,nlp]==0.0.9
+pueblo[cli,nlp]>=0.0.10
 pydantic>=2,<3
 pypdf<6
 python-dotenv<2

topic/machine-learning/llm-langchain/vector_search.py

Lines changed: 7 additions & 8 deletions

@@ -19,25 +19,24 @@
     # Run program.
     python vector_search.py
 """  # noqa: E501
-from langchain_community.document_loaders import UnstructuredURLLoader
+
 from langchain_community.vectorstores import CrateDBVectorSearch
-from langchain_text_splitters import CharacterTextSplitter
 from langchain_openai import OpenAIEmbeddings

 import nltk
+from pueblo.nlp.resource import CachedWebResource


 def main():

     nltk.download("averaged_perceptron_tagger_eng")
     nltk.download("punkt_tab")

-    # Load the document, split it into chunks, embed each chunk,
-    # and load it into the vector store.
-    state_of_the_union_url = "https://github.com/langchain-ai/langchain/raw/v0.0.325/docs/docs/modules/state_of_the_union.txt"
-    raw_documents = UnstructuredURLLoader(urls=[state_of_the_union_url]).load()
-    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-    documents = text_splitter.split_documents(raw_documents)
+    # Load a document, and split it into chunks.
+    url = "https://github.com/langchain-ai/langchain/raw/v0.0.325/docs/docs/modules/state_of_the_union.txt"
+    documents = CachedWebResource(url).langchain_documents(chunk_size=1000, chunk_overlap=0)
+
+    # Embed each chunk, and load them into the vector store.
     db = CrateDBVectorSearch.from_documents(documents, OpenAIEmbeddings())

     # Invoke a query, and display the first result.
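
For orientation, a hedged sketch of how the script reads once this hunk is applied. The top and bottom of the file are not part of the hunk, so the similarity-search call and its query text are illustrative assumptions; OpenAI credentials and the CrateDB connection string are assumed to come from the environment.

import nltk

from langchain_community.vectorstores import CrateDBVectorSearch
from langchain_openai import OpenAIEmbeddings
from pueblo.nlp.resource import CachedWebResource


def main():

    nltk.download("averaged_perceptron_tagger_eng")
    nltk.download("punkt_tab")

    # Load a document via the pueblo helper, and split it into chunks.
    url = "https://github.com/langchain-ai/langchain/raw/v0.0.325/docs/docs/modules/state_of_the_union.txt"
    documents = CachedWebResource(url).langchain_documents(chunk_size=1000, chunk_overlap=0)

    # Embed each chunk, and load them into the vector store.
    db = CrateDBVectorSearch.from_documents(documents, OpenAIEmbeddings())

    # Invoke a query, and display the first result.
    # (Illustrative only: the query invocation is not shown in the hunk above.)
    results = db.similarity_search("What did the president say about Ketanji Brown Jackson?")
    print(results[0].page_content)


if __name__ == "__main__":
    main()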
