Skip to content

Commit 2588c4a

Browse files
Merge branch 'main' into tools/browserbase
2 parents 7cfd511 + e3d1fe2 commit 2588c4a

File tree

13 files changed

+431
-119
lines changed

13 files changed

+431
-119
lines changed

cookbook/agent_concepts/knowledge/pdf_kb_async.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
# Create a knowledge base with the PDFs from the data/pdf directory
1212
knowledge_base = PDFKnowledgeBase(
13-
path="data/pdf",
13+
path="data/pdf", # for password-protected PDFs, use path=[{"path": "tmp/ThaiRecipes_protected.pdf", "password": "ThaiRecipes"}],
1414
vector_db=vector_db,
1515
reader=PDFReader(chunk=True),
1616
)
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
from agno.agent import Agent
2+
from agno.knowledge.pdf import PDFKnowledgeBase
3+
from agno.utils.media import download_file
4+
from agno.vectordb.pgvector import PgVector
5+
6+
# Connection string for the PgVector database running on localhost.
db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"

# Fetch the password-protected sample PDF into the current working directory.
protected_pdf_url = "https://agno-public.s3.us-east-1.amazonaws.com/recipes/ThaiRecipes_protected.pdf"
protected_pdf_path = "ThaiRecipes_protected.pdf"
download_file(protected_pdf_url, protected_pdf_path)

# Each entry pairs a PDF path with the password required to open it.
protected_pdfs = [{"path": protected_pdf_path, "password": "ThaiRecipes"}]
vector_db = PgVector(table_name="pdf_documents_password", db_url=db_url)

# Create a knowledge base with simplified password handling
knowledge_base = PDFKnowledgeBase(path=protected_pdfs, vector_db=vector_db)

# Load the knowledge base (recreate=True is passed on every run of this example)
knowledge_base.load(recreate=True)

# Create an agent that can search the knowledge base and shows its tool calls
agent = Agent(knowledge=knowledge_base, search_knowledge=True, show_tool_calls=True)

agent.print_response("Give me the recipe for pad thai")
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from agno.agent import Agent
2+
from agno.knowledge.pdf_url import PDFUrlKnowledgeBase
3+
from agno.vectordb.pgvector import PgVector
4+
5+
# Connection string for the PgVector database running on localhost.
db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"

# Each entry pairs a PDF URL with the password required to open it.
protected_pdf_urls = [
    {
        "url": "https://agno-public.s3.us-east-1.amazonaws.com/recipes/ThaiRecipes_protected.pdf",
        "password": "ThaiRecipes",
    }
]
vector_db = PgVector(table_name="pdf_documents_password", db_url=db_url)

# Create a knowledge base with simplified password handling
knowledge_base = PDFUrlKnowledgeBase(urls=protected_pdf_urls, vector_db=vector_db)

# Load the knowledge base (recreate=True is passed on every run of this example)
knowledge_base.load(recreate=True)

# Create an agent that can search the knowledge base and shows its tool calls
agent = Agent(knowledge=knowledge_base, search_knowledge=True, show_tool_calls=True)

agent.print_response("Give me the recipe for pad thai")
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
"""
2+
This example demonstrates using Weaviate as a vector database.
3+
4+
Installation:
5+
pip install weaviate-client
6+
7+
You can use either Weaviate Cloud or a local instance.
8+
9+
Weaviate Cloud Setup:
10+
1. Create account at https://console.weaviate.cloud/
11+
2. Create a cluster and copy the "REST endpoint" and "Admin" API Key. Then set environment variables:
12+
export WCD_URL="your-cluster-url"
13+
export WCD_API_KEY="your-api-key"
14+
15+
Local Development Setup:
16+
1. Install Docker from https://docs.docker.com/get-docker/
17+
2. Run Weaviate locally:
18+
docker run -d \
19+
-p 8080:8080 \
20+
-p 50051:50051 \
21+
--name weaviate \
22+
cr.weaviate.io/semitechnologies/weaviate:1.28.4
23+
or use the script `cookbook/scripts/run_weviate.sh` to start a local instance.
24+
3. Remember to set `local=True` on the Weaviate instantiation.
25+
"""
26+
27+
from agno.knowledge.pdf_url import PDFUrlKnowledgeBase
28+
from agno.knowledge.document import DocumentKnowledgeBase
29+
from agno.document import Document
30+
from agno.vectordb.search import SearchType
31+
from agno.vectordb.weaviate import Distance, VectorIndex, Weaviate
32+
from agno.utils.log import set_log_level_to_debug
33+
34+
from agno.embedder.sentence_transformer import SentenceTransformerEmbedder
35+
embedder = SentenceTransformerEmbedder()

# Settings for the Weaviate collection used by both examples below.
weaviate_settings = {
    "collection": "recipes",
    "search_type": SearchType.hybrid,
    "vector_index": VectorIndex.HNSW,
    "distance": Distance.COSINE,
    "embedder": embedder,
    "local": True,  # Set to False if using Weaviate Cloud and True if using local instance
}
vector_db = Weaviate(**weaviate_settings)

# Every load in this example upserts into the existing collection instead of recreating it.
upsert_options = {"recreate": False, "upsert": True}

# --- First example: load the same PDF twice -------------------------------
# Create knowledge base
knowledge_base = PDFUrlKnowledgeBase(
    urls=["https://agno-public.s3.amazonaws.com/recipes/ThaiRecipes.pdf"],
    vector_db=vector_db,
)

# Drop the collection first, then switch on debug logging for the loads.
vector_db.drop()
set_log_level_to_debug()

knowledge_base.load(**upsert_options)

print("Knowledge base loaded with PDF content. Loading the same data again will not recreate it.")
knowledge_base.load(**upsert_options)

print("First example finished. Now dropping the knowledge base.")
vector_db.drop()

# --- Second example: upsert a document whose content changed --------------
# doc1_modified reuses the name "doc1" with different content.
doc1 = Document(content="my first content", name="doc1")
doc1_modified = Document(content="my first content corrected", name="doc1")
doc2 = Document(content="my second content", name="doc2")

knowledge_base = DocumentKnowledgeBase(documents=[doc1, doc2], vector_db=vector_db)
knowledge_base_changed = DocumentKnowledgeBase(documents=[doc1_modified, doc2], vector_db=vector_db)

print("\n\nStart second example. Load initial data...")
knowledge_base.load(**upsert_options)
print("\nNow uploading the changed data...")
knowledge_base_changed.load(**upsert_options)
print("Example finished. Now dropping the knowledge base.")
vector_db.drop()

libs/agno/agno/agent/agent.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6872,7 +6872,7 @@ def add_to_knowledge(self, query: str, result: str) -> str:
68726872
document_name = query.replace(" ", "_").replace("?", "").replace("!", "").replace(".", "")
68736873
document_content = json.dumps({"query": query, "result": result})
68746874
log_info(f"Adding document to knowledge base: {document_name}: {document_content}")
6875-
self.knowledge.add_document_to_knowledge_base(
6875+
self.knowledge.load_document(
68766876
document=Document(
68776877
name=document_name,
68786878
content=document_content,

libs/agno/agno/document/reader/pdf_reader.py

Lines changed: 69 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from agno.document.base import Document
88
from agno.document.reader.base import Reader
99
from agno.utils.http import async_fetch_with_retry, fetch_with_retry
10-
from agno.utils.log import log_info, logger
10+
from agno.utils.log import log_error, log_info, logger
1111

1212
try:
1313
from pypdf import PdfReader as DocumentReader # noqa: F401
@@ -177,6 +177,7 @@ def __init__(
177177
split_on_pages: bool = True,
178178
page_start_numbering_format: Optional[str] = None,
179179
page_end_numbering_format: Optional[str] = None,
180+
password: Optional[str] = None,
180181
**kwargs,
181182
):
182183
if page_start_numbering_format is None:
@@ -187,6 +188,7 @@ def __init__(
187188
self.split_on_pages = split_on_pages
188189
self.page_start_numbering_format = page_start_numbering_format
189190
self.page_end_numbering_format = page_end_numbering_format
191+
self.password = password
190192

191193
super().__init__(**kwargs)
192194

@@ -196,6 +198,28 @@ def _build_chunked_documents(self, documents: List[Document]) -> List[Document]:
196198
chunked_documents.extend(self.chunk_document(document))
197199
return chunked_documents
198200

201+
def _decrypt_pdf(self, doc_reader: DocumentReader, doc_name: str, password: Optional[str] = None) -> bool:
    """Decrypt ``doc_reader`` in place if it is encrypted.

    Args:
        doc_reader: The PDF reader object to decrypt.
        doc_name: Name of the document, used only in log messages.
        password: Per-call password. Falls back to ``self.password`` when
            it is ``None`` or empty.

    Returns:
        True when the PDF is not encrypted or was decrypted successfully,
        False otherwise. Failures are logged; nothing is raised.
    """
    if not doc_reader.is_encrypted:
        return True

    # Use provided password or fall back to instance password
    pdf_password = password or self.password

    try:
        # Some PDFs are encrypted with an empty user password and open
        # without any credentials, yet still report is_encrypted. Try the
        # empty password before giving up so those documents stay readable.
        decrypted_pdf = doc_reader.decrypt(pdf_password or "")
    except Exception as e:
        log_error(f"Error decrypting PDF {doc_name}: {e}")
        return False

    if decrypted_pdf:
        log_info(f"Successfully decrypted PDF {doc_name} with user password")
        return True

    if not pdf_password:
        # The empty password did not work, so a real password is required.
        log_error(f"PDF {doc_name} is password protected but no password provided")
        return False

    log_error(f"Failed to decrypt PDF {doc_name}: incorrect password")
    return False
222+
199223
def _create_documents(self, pdf_content: List[str], doc_name: str, use_uuid_for_id: bool, page_number_shift):
200224
if self.split_on_pages:
201225
shift = page_number_shift if page_number_shift is not None else 1
@@ -282,7 +306,7 @@ async def _read_pdf_page(page, read_images) -> Tuple[str, str]:
282306
class PDFReader(BasePDFReader):
283307
"""Reader for PDF files"""
284308

285-
def read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
309+
def read(self, pdf: Union[str, Path, IO[Any]], password: Optional[str] = None) -> List[Document]:
286310
try:
287311
if isinstance(pdf, str):
288312
doc_name = pdf.split("/")[-1].split(".")[0].replace(" ", "_")
@@ -299,10 +323,14 @@ def read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
299323
logger.error(f"Error reading PDF: {e}")
300324
return []
301325

326+
# Handle PDF decryption
327+
if not self._decrypt_pdf(pdf_reader, doc_name, password):
328+
return []
329+
302330
# Read and chunk.
303331
return self._pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=True)
304332

305-
async def async_read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
333+
async def async_read(self, pdf: Union[str, Path, IO[Any]], password: Optional[str] = None) -> List[Document]:
306334
try:
307335
if isinstance(pdf, str):
308336
doc_name = pdf.split("/")[-1].split(".")[0].replace(" ", "_")
@@ -319,18 +347,22 @@ async def async_read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
319347
logger.error(f"Error reading PDF: {e}")
320348
return []
321349

350+
# Handle PDF decryption
351+
if not self._decrypt_pdf(pdf_reader, doc_name, password):
352+
return []
353+
322354
# Read and chunk.
323355
return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=True)
324356

325357

326358
class PDFUrlReader(BasePDFReader):
327359
"""Reader for PDF files from URL"""
328360

329-
def __init__(self, proxy: Optional[str] = None, **kwargs):
330-
super().__init__(**kwargs)
361+
def __init__(self, proxy: Optional[str] = None, password: Optional[str] = None, **kwargs):
362+
super().__init__(password=password, **kwargs)
331363
self.proxy = proxy
332364

333-
def read(self, url: str) -> List[Document]:
365+
def read(self, url: str, password: Optional[str] = None) -> List[Document]:
334366
if not url:
335367
raise ValueError("No url provided")
336368

@@ -344,10 +376,14 @@ def read(self, url: str) -> List[Document]:
344376
doc_name = url.split("/")[-1].split(".")[0].replace("/", "_").replace(" ", "_")
345377
pdf_reader = DocumentReader(BytesIO(response.content))
346378

379+
# Handle PDF decryption
380+
if not self._decrypt_pdf(pdf_reader, doc_name, password):
381+
return []
382+
347383
# Read and chunk.
348384
return self._pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=False)
349385

350-
async def async_read(self, url: str) -> List[Document]:
386+
async def async_read(self, url: str, password: Optional[str] = None) -> List[Document]:
351387
if not url:
352388
raise ValueError("No url provided")
353389

@@ -364,14 +400,18 @@ async def async_read(self, url: str) -> List[Document]:
364400
doc_name = url.split("/")[-1].split(".")[0].replace("/", "_").replace(" ", "_")
365401
pdf_reader = DocumentReader(BytesIO(response.content))
366402

403+
# Handle PDF decryption
404+
if not self._decrypt_pdf(pdf_reader, doc_name, password):
405+
return []
406+
367407
# Read and chunk.
368408
return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=False)
369409

370410

371411
class PDFImageReader(BasePDFReader):
372412
"""Reader for PDF files with text and images extraction"""
373413

374-
def read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
414+
def read(self, pdf: Union[str, Path, IO[Any]], password: Optional[str] = None) -> List[Document]:
375415
if not pdf:
376416
raise ValueError("No pdf provided")
377417

@@ -386,10 +426,14 @@ def read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
386426
log_info(f"Reading: {doc_name}")
387427
pdf_reader = DocumentReader(pdf)
388428

429+
# Handle PDF decryption
430+
if not self._decrypt_pdf(pdf_reader, doc_name, password):
431+
return []
432+
389433
# Read and chunk.
390434
return self._pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)
391435

392-
async def async_read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
436+
async def async_read(self, pdf: Union[str, Path, IO[Any]], password: Optional[str] = None) -> List[Document]:
393437
if not pdf:
394438
raise ValueError("No pdf provided")
395439

@@ -404,18 +448,22 @@ async def async_read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
404448
log_info(f"Reading: {doc_name}")
405449
pdf_reader = DocumentReader(pdf)
406450

451+
# Handle PDF decryption
452+
if not self._decrypt_pdf(pdf_reader, doc_name, password):
453+
return []
454+
407455
# Read and chunk.
408456
return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)
409457

410458

411459
class PDFUrlImageReader(BasePDFReader):
412460
"""Reader for PDF files from URL with text and images extraction"""
413461

414-
def __init__(self, proxy: Optional[str] = None, **kwargs):
415-
super().__init__(**kwargs)
462+
def __init__(self, proxy: Optional[str] = None, password: Optional[str] = None, **kwargs):
463+
super().__init__(password=password, **kwargs)
416464
self.proxy = proxy
417465

418-
def read(self, url: str) -> List[Document]:
466+
def read(self, url: str, password: Optional[str] = None) -> List[Document]:
419467
if not url:
420468
raise ValueError("No url provided")
421469

@@ -430,10 +478,14 @@ def read(self, url: str) -> List[Document]:
430478
doc_name = url.split("/")[-1].split(".")[0].replace(" ", "_")
431479
pdf_reader = DocumentReader(BytesIO(response.content))
432480

481+
# Handle PDF decryption
482+
if not self._decrypt_pdf(pdf_reader, doc_name, password):
483+
return []
484+
433485
# Read and chunk.
434486
return self._pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)
435487

436-
async def async_read(self, url: str) -> List[Document]:
488+
async def async_read(self, url: str, password: Optional[str] = None) -> List[Document]:
437489
if not url:
438490
raise ValueError("No url provided")
439491

@@ -451,5 +503,9 @@ async def async_read(self, url: str) -> List[Document]:
451503
doc_name = url.split("/")[-1].split(".")[0].replace(" ", "_")
452504
pdf_reader = DocumentReader(BytesIO(response.content))
453505

506+
# Handle PDF decryption
507+
if not self._decrypt_pdf(pdf_reader, doc_name, password):
508+
return []
509+
454510
# Read and chunk.
455511
return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)

0 commit comments

Comments
 (0)