Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cookbook/agent_concepts/knowledge/pdf_kb_async.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

# Create a knowledge base with the PDFs from the data/pdf directory
knowledge_base = PDFKnowledgeBase(
path="data/pdf",
path="data/pdf", # for password-protected PDFs, use path=[{"path": "tmp/ThaiRecipes_protected.pdf", "password": "ThaiRecipes"}],
vector_db=vector_db,
reader=PDFReader(chunk=True),
)
Expand Down
35 changes: 35 additions & 0 deletions cookbook/agent_concepts/knowledge/pdf_kb_password.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from agno.agent import Agent
from agno.knowledge.pdf import PDFKnowledgeBase
from agno.utils.media import download_file
from agno.vectordb.pgvector import PgVector

# Connection string handed to PgVector below (database on localhost:5532).
db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"

# Fetch the password-protected sample PDF. The second argument is presumably
# the local output path, since the same relative path is read back below.
download_file(
    "https://agno-public.s3.us-east-1.amazonaws.com/recipes/ThaiRecipes_protected.pdf",
    "ThaiRecipes_protected.pdf",
)

# One entry per PDF: "path" locates the file, "password" unlocks it.
protected_pdf = {
    "path": "ThaiRecipes_protected.pdf",
    "password": "ThaiRecipes",
}

# Vector store backing the knowledge base.
vector_db = PgVector(table_name="pdf_documents_password", db_url=db_url)

# Create a knowledge base with simplified password handling
knowledge_base = PDFKnowledgeBase(path=[protected_pdf], vector_db=vector_db)

# Load the knowledge base
knowledge_base.load(recreate=True)

# Create an agent with the knowledge base
agent = Agent(knowledge=knowledge_base, search_knowledge=True, show_tool_calls=True)

agent.print_response("Give me the recipe for pad thai")
30 changes: 30 additions & 0 deletions cookbook/agent_concepts/knowledge/pdf_kb_url_password.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from agno.agent import Agent
from agno.knowledge.pdf_url import PDFUrlKnowledgeBase
from agno.vectordb.pgvector import PgVector

# Connection string handed to PgVector below (database on localhost:5532).
db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"

# One entry per remote PDF: "url" locates the file, "password" unlocks it.
protected_pdf_url = {
    "url": "https://agno-public.s3.us-east-1.amazonaws.com/recipes/ThaiRecipes_protected.pdf",
    "password": "ThaiRecipes",
}

# Vector store backing the knowledge base.
vector_db = PgVector(table_name="pdf_documents_password", db_url=db_url)

# Create a knowledge base with simplified password handling
knowledge_base = PDFUrlKnowledgeBase(urls=[protected_pdf_url], vector_db=vector_db)

# Load the knowledge base
knowledge_base.load(recreate=True)

# Create an agent with the knowledge base
agent = Agent(knowledge=knowledge_base, search_knowledge=True, show_tool_calls=True)

agent.print_response("Give me the recipe for pad thai")
82 changes: 69 additions & 13 deletions libs/agno/agno/document/reader/pdf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from agno.document.base import Document
from agno.document.reader.base import Reader
from agno.utils.http import async_fetch_with_retry, fetch_with_retry
from agno.utils.log import log_info, logger
from agno.utils.log import log_error, log_info, logger

try:
from pypdf import PdfReader as DocumentReader # noqa: F401
Expand Down Expand Up @@ -177,6 +177,7 @@ def __init__(
split_on_pages: bool = True,
page_start_numbering_format: Optional[str] = None,
page_end_numbering_format: Optional[str] = None,
password: Optional[str] = None,
**kwargs,
):
if page_start_numbering_format is None:
Expand All @@ -187,6 +188,7 @@ def __init__(
self.split_on_pages = split_on_pages
self.page_start_numbering_format = page_start_numbering_format
self.page_end_numbering_format = page_end_numbering_format
self.password = password

super().__init__(**kwargs)

Expand All @@ -196,6 +198,28 @@ def _build_chunked_documents(self, documents: List[Document]) -> List[Document]:
chunked_documents.extend(self.chunk_document(document))
return chunked_documents

def _decrypt_pdf(self, doc_reader: DocumentReader, doc_name: str, password: Optional[str] = None) -> bool:
    """Unlock ``doc_reader`` in place when the PDF is encrypted.

    Args:
        doc_reader: Reader wrapping the PDF that may need decrypting.
        doc_name: Name used to identify the PDF in log messages.
        password: Per-call password. When it is empty or None, the
            instance-level ``self.password`` is used instead.

    Returns:
        True when the PDF is not encrypted or was decrypted successfully.
        False when decryption failed; the failure is logged, never raised.
    """
    if not doc_reader.is_encrypted:
        return True

    # Use provided password or fall back to instance password
    pdf_password = password or self.password

    try:
        # With no password configured, still try the empty string: a PDF that
        # is protected only by an owner password has a blank user password and
        # is meant to be readable without credentials, so it must not be
        # rejected just because nothing was supplied.
        decrypted_pdf = doc_reader.decrypt(pdf_password or "")
    except Exception as e:
        log_error(f"Error decrypting PDF {doc_name}: {e}")
        return False

    if decrypted_pdf:
        if pdf_password:
            log_info(f"Successfully decrypted PDF {doc_name} with user password")
        else:
            log_info(f"Successfully decrypted PDF {doc_name} with an empty password")
        return True

    # Decryption was attempted and rejected; report why.
    if not pdf_password:
        log_error(f"PDF {doc_name} is password protected but no password provided")
    else:
        log_error(f"Failed to decrypt PDF {doc_name}: incorrect password")
    return False

def _create_documents(self, pdf_content: List[str], doc_name: str, use_uuid_for_id: bool, page_number_shift):
if self.split_on_pages:
shift = page_number_shift if page_number_shift is not None else 1
Expand Down Expand Up @@ -282,7 +306,7 @@ async def _read_pdf_page(page, read_images) -> Tuple[str, str]:
class PDFReader(BasePDFReader):
"""Reader for PDF files"""

def read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
def read(self, pdf: Union[str, Path, IO[Any]], password: Optional[str] = None) -> List[Document]:
try:
if isinstance(pdf, str):
doc_name = pdf.split("/")[-1].split(".")[0].replace(" ", "_")
Expand All @@ -299,10 +323,14 @@ def read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
logger.error(f"Error reading PDF: {e}")
return []

# Handle PDF decryption
if not self._decrypt_pdf(pdf_reader, doc_name, password):
return []

# Read and chunk.
return self._pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=True)

async def async_read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
async def async_read(self, pdf: Union[str, Path, IO[Any]], password: Optional[str] = None) -> List[Document]:
try:
if isinstance(pdf, str):
doc_name = pdf.split("/")[-1].split(".")[0].replace(" ", "_")
Expand All @@ -319,18 +347,22 @@ async def async_read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
logger.error(f"Error reading PDF: {e}")
return []

# Handle PDF decryption
if not self._decrypt_pdf(pdf_reader, doc_name, password):
return []

# Read and chunk.
return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=True)


class PDFUrlReader(BasePDFReader):
"""Reader for PDF files from URL"""

def __init__(self, proxy: Optional[str] = None, **kwargs):
super().__init__(**kwargs)
def __init__(self, proxy: Optional[str] = None, password: Optional[str] = None, **kwargs):
super().__init__(password=password, **kwargs)
self.proxy = proxy

def read(self, url: str) -> List[Document]:
def read(self, url: str, password: Optional[str] = None) -> List[Document]:
if not url:
raise ValueError("No url provided")

Expand All @@ -344,10 +376,14 @@ def read(self, url: str) -> List[Document]:
doc_name = url.split("/")[-1].split(".")[0].replace("/", "_").replace(" ", "_")
pdf_reader = DocumentReader(BytesIO(response.content))

# Handle PDF decryption
if not self._decrypt_pdf(pdf_reader, doc_name, password):
return []

# Read and chunk.
return self._pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=False)

async def async_read(self, url: str) -> List[Document]:
async def async_read(self, url: str, password: Optional[str] = None) -> List[Document]:
if not url:
raise ValueError("No url provided")

Expand All @@ -364,14 +400,18 @@ async def async_read(self, url: str) -> List[Document]:
doc_name = url.split("/")[-1].split(".")[0].replace("/", "_").replace(" ", "_")
pdf_reader = DocumentReader(BytesIO(response.content))

# Handle PDF decryption
if not self._decrypt_pdf(pdf_reader, doc_name, password):
return []

# Read and chunk.
return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=False)


class PDFImageReader(BasePDFReader):
"""Reader for PDF files with text and images extraction"""

def read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
def read(self, pdf: Union[str, Path, IO[Any]], password: Optional[str] = None) -> List[Document]:
if not pdf:
raise ValueError("No pdf provided")

Expand All @@ -386,10 +426,14 @@ def read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
log_info(f"Reading: {doc_name}")
pdf_reader = DocumentReader(pdf)

# Handle PDF decryption
if not self._decrypt_pdf(pdf_reader, doc_name, password):
return []

# Read and chunk.
return self._pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)

async def async_read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
async def async_read(self, pdf: Union[str, Path, IO[Any]], password: Optional[str] = None) -> List[Document]:
if not pdf:
raise ValueError("No pdf provided")

Expand All @@ -404,18 +448,22 @@ async def async_read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
log_info(f"Reading: {doc_name}")
pdf_reader = DocumentReader(pdf)

# Handle PDF decryption
if not self._decrypt_pdf(pdf_reader, doc_name, password):
return []

# Read and chunk.
return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)


class PDFUrlImageReader(BasePDFReader):
"""Reader for PDF files from URL with text and images extraction"""

def __init__(self, proxy: Optional[str] = None, **kwargs):
super().__init__(**kwargs)
def __init__(self, proxy: Optional[str] = None, password: Optional[str] = None, **kwargs):
super().__init__(password=password, **kwargs)
self.proxy = proxy

def read(self, url: str) -> List[Document]:
def read(self, url: str, password: Optional[str] = None) -> List[Document]:
if not url:
raise ValueError("No url provided")

Expand All @@ -430,10 +478,14 @@ def read(self, url: str) -> List[Document]:
doc_name = url.split("/")[-1].split(".")[0].replace(" ", "_")
pdf_reader = DocumentReader(BytesIO(response.content))

# Handle PDF decryption
if not self._decrypt_pdf(pdf_reader, doc_name, password):
return []

# Read and chunk.
return self._pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)

async def async_read(self, url: str) -> List[Document]:
async def async_read(self, url: str, password: Optional[str] = None) -> List[Document]:
if not url:
raise ValueError("No url provided")

Expand All @@ -451,5 +503,9 @@ async def async_read(self, url: str) -> List[Document]:
doc_name = url.split("/")[-1].split(".")[0].replace(" ", "_")
pdf_reader = DocumentReader(BytesIO(response.content))

# Handle PDF decryption
if not self._decrypt_pdf(pdf_reader, doc_name, password):
return []

# Read and chunk.
return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)
40 changes: 32 additions & 8 deletions libs/agno/agno/knowledge/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,22 @@
from typing import Any, AsyncIterator, Dict, Iterator, List, Optional, Union

from pydantic import Field
from typing_extensions import TypedDict

from agno.document import Document
from agno.document.reader.pdf_reader import PDFImageReader, PDFReader
from agno.knowledge.agent import AgentKnowledge
from agno.utils.log import log_info, logger
from agno.utils.log import log_error, log_info, logger


class PDFConfig(TypedDict, total=False):
    """Per-file entry accepted in ``PDFKnowledgeBase.path`` when it is a list.

    Declared with ``total=False``, so every key is optional at the type level.
    """

    # Location of the PDF file; wrapped in ``Path`` before it is read.
    path: str
    # Password forwarded to the reader for this file; may be omitted or None.
    password: Optional[str]
    # Extra key/value pairs merged into ``meta_data`` of each document read from this file.
    metadata: Optional[Dict[str, Any]]


class PDFKnowledgeBase(AgentKnowledge):
path: Optional[Union[str, Path, List[Dict[str, Union[str, Dict[str, Any]]]]]] = None
path: Optional[Union[str, Path, List[PDFConfig]]] = None
formats: List[str] = [".pdf"]
exclude_files: List[str] = Field(default_factory=list)
reader: Union[PDFReader, PDFImageReader] = PDFReader()
Expand All @@ -24,19 +31,21 @@ def document_lists(self) -> Iterator[List[Document]]:
if isinstance(self.path, list):
for item in self.path:
if isinstance(item, dict) and "path" in item:
# Handle path with metadata
file_path = item["path"]
config = item.get("metadata", {})
file_password = item.get("password")
if file_password is not None and not isinstance(file_password, str):
file_password = None

_pdf_path = Path(file_path) # type: ignore
if self._is_valid_pdf(_pdf_path):
documents = self.reader.read(pdf=_pdf_path)
documents = self.reader.read(pdf=_pdf_path, password=file_password)
if config:
for doc in documents:
log_info(f"Adding metadata {config} to document: {doc.name}")
doc.meta_data.update(config) # type: ignore
yield documents
else:
# Handle single path
_pdf_path = Path(self.path)
if _pdf_path.is_dir():
for _pdf in _pdf_path.glob("**/*.pdf"):
Expand All @@ -47,7 +56,19 @@ def document_lists(self) -> Iterator[List[Document]]:

def _is_valid_pdf(self, path: Path) -> bool:
    """Helper to check if path is a valid PDF file.

    Each failed check logs its specific reason so that a skipped file can be
    diagnosed from the logs.

    Args:
        path: Candidate file to validate.

    Returns:
        True when ``path`` exists, is a regular file, has the ``.pdf`` suffix
        and its file name is not listed in ``self.exclude_files``; False
        otherwise.
    """
    if not path.exists():
        log_error(f"PDF file not found: {path}")
        return False
    if not path.is_file():
        log_error(f"Path is not a file: {path}")
        return False
    if path.suffix != ".pdf":
        log_error(f"File is not a PDF: {path}")
        return False
    if path.name in self.exclude_files:
        log_error(f"PDF file excluded: {path}")
        return False
    return True

@property
async def async_document_lists(self) -> AsyncIterator[List[Document]]:
Expand All @@ -58,12 +79,15 @@ async def async_document_lists(self) -> AsyncIterator[List[Document]]:
if isinstance(self.path, list):
for item in self.path:
if isinstance(item, dict) and "path" in item:
# Handle path with metadata
file_path = item["path"]
config = item.get("metadata", {})
file_password = item.get("password")
if file_password is not None and not isinstance(file_password, str):
file_password = None

_pdf_path = Path(file_path) # type: ignore
if self._is_valid_pdf(_pdf_path):
documents = await self.reader.async_read(pdf=_pdf_path)
documents = await self.reader.async_read(pdf=_pdf_path, password=file_password)
if config:
for doc in documents:
log_info(f"Adding metadata {config} to document: {doc.name}")
Expand Down
Loading