agno-agi · dirkbrnd · Aug 8, 2025 · Aug 5, 2025 · Aug 5, 2025 · Aug 5, 2025
@@ -10,7 +10,7 @@
 
 # Create a knowledge base with the PDFs from the data/pdfs directory
 knowledge_base = PDFKnowledgeBase(
-    path="data/pdf",
+    path="data/pdf",  # for password-protected PDFs, use path=[{"path": "tmp/ThaiRecipes_protected.pdf", "password": "ThaiRecipes"}],
     vector_db=vector_db,
     reader=PDFReader(chunk=True),
 )

@@ -0,0 +1,35 @@
+from agno.agent import Agent
+from agno.knowledge.pdf import PDFKnowledgeBase
+from agno.utils.media import download_file
+from agno.vectordb.pgvector import PgVector
+
+db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"
+download_file(  # args: source URL, then the local file name that is used as "path" below
+    "https://agno-public.s3.us-east-1.amazonaws.com/recipes/ThaiRecipes_protected.pdf",
+    "ThaiRecipes_protected.pdf",
+)
+
+# Create a knowledge base whose path entry supplies the password for the protected PDF
+knowledge_base = PDFKnowledgeBase(
+    path=[
+        {
+            "path": "ThaiRecipes_protected.pdf",
+            "password": "ThaiRecipes",
+        }
+    ],
+    vector_db=PgVector(
+        table_name="pdf_documents_password",
+        db_url=db_url,
+    ),
+)
+# Load the knowledge base
+knowledge_base.load(recreate=True)
+
+# Create an agent that has knowledge search and tool-call display switched on
+agent = Agent(
+    knowledge=knowledge_base,
+    search_knowledge=True,
+    show_tool_calls=True,
+)
+
+agent.print_response("Give me the recipe for pad thai")
@@ -0,0 +1,30 @@
+from agno.agent import Agent
+from agno.knowledge.pdf_url import PDFUrlKnowledgeBase
+from agno.vectordb.pgvector import PgVector
+
+db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"
+
+# Create a knowledge base whose url entry supplies the password for the protected PDF
+knowledge_base = PDFUrlKnowledgeBase(
+    urls=[
+        {
+            "url": "https://agno-public.s3.us-east-1.amazonaws.com/recipes/ThaiRecipes_protected.pdf",
+            "password": "ThaiRecipes",
+        }
+    ],
+    vector_db=PgVector(
+        table_name="pdf_documents_password",
+        db_url=db_url,
+    ),
+)
+# Load the knowledge base
+knowledge_base.load(recreate=True)
+
+# Create an agent that has knowledge search and tool-call display switched on
+agent = Agent(
+    knowledge=knowledge_base,
+    search_knowledge=True,
+    show_tool_calls=True,
+)
+
+agent.print_response("Give me the recipe for pad thai")
@@ -7,7 +7,7 @@
 from agno.document.base import Document
 from agno.document.reader.base import Reader
 from agno.utils.http import async_fetch_with_retry, fetch_with_retry
-from agno.utils.log import log_info, logger
+from agno.utils.log import log_error, log_info, logger
 
 try:
     from pypdf import PdfReader as DocumentReader  # noqa: F401
@@ -177,6 +177,7 @@ def __init__(
         split_on_pages: bool = True,
         page_start_numbering_format: Optional[str] = None,
         page_end_numbering_format: Optional[str] = None,
+        password: Optional[str] = None,
         **kwargs,
     ):
         if page_start_numbering_format is None:
@@ -187,6 +188,7 @@ def __init__(
         self.split_on_pages = split_on_pages
         self.page_start_numbering_format = page_start_numbering_format
         self.page_end_numbering_format = page_end_numbering_format
+        self.password = password
 
         super().__init__(**kwargs)
 
@@ -196,6 +198,28 @@ def _build_chunked_documents(self, documents: List[Document]) -> List[Document]:
             chunked_documents.extend(self.chunk_document(document))
         return chunked_documents
 
+    def _decrypt_pdf(self, doc_reader: DocumentReader, doc_name: str, password: Optional[str] = None) -> bool:
+        """Decrypt doc_reader when needed; return True if readable, else log the reason and return False."""
+        if not doc_reader.is_encrypted:
+            return True
+
+        # A password passed for this call wins; otherwise fall back to the reader-level one
+        pdf_password = password or self.password
+        if not pdf_password:
+            log_error(f"PDF {doc_name} is password protected but no password provided")
+            return False
+
+        try:  # keep only the call that can raise inside the try
+            decrypted_pdf = doc_reader.decrypt(pdf_password)
+        except Exception as e:
+            log_error(f"Error decrypting PDF {doc_name}: {e}")
+            return False
+        if not decrypted_pdf:  # a falsy result means the password was not accepted
+            log_error(f"Failed to decrypt PDF {doc_name}: incorrect password")
+            return False
+        log_info(f"Successfully decrypted PDF {doc_name}")  # may have matched the user or the owner password
+        return True
+
     def _create_documents(self, pdf_content: List[str], doc_name: str, use_uuid_for_id: bool, page_number_shift):
         if self.split_on_pages:
             shift = page_number_shift if page_number_shift is not None else 1
@@ -282,7 +306,7 @@ async def _read_pdf_page(page, read_images) -> Tuple[str, str]:
 class PDFReader(BasePDFReader):
     """Reader for PDF files"""
 
-    def read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
+    def read(self, pdf: Union[str, Path, IO[Any]], password: Optional[str] = None) -> List[Document]:
         try:
             if isinstance(pdf, str):
                 doc_name = pdf.split("/")[-1].split(".")[0].replace(" ", "_")
@@ -299,10 +323,14 @@ def read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
             logger.error(f"Error reading PDF: {e}")
             return []
 
+        # Handle PDF decryption
+        if not self._decrypt_pdf(pdf_reader, doc_name, password):
+            return []
+
         # Read and chunk.
         return self._pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=True)
 
-    async def async_read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
+    async def async_read(self, pdf: Union[str, Path, IO[Any]], password: Optional[str] = None) -> List[Document]:
         try:
             if isinstance(pdf, str):
                 doc_name = pdf.split("/")[-1].split(".")[0].replace(" ", "_")
@@ -319,18 +347,22 @@ async def async_read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
             logger.error(f"Error reading PDF: {e}")
             return []
 
+        # Handle PDF decryption
+        if not self._decrypt_pdf(pdf_reader, doc_name, password):
+            return []
+
         # Read and chunk.
         return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=True)
 
 
 class PDFUrlReader(BasePDFReader):
     """Reader for PDF files from URL"""
 
-    def __init__(self, proxy: Optional[str] = None, **kwargs):
-        super().__init__(**kwargs)
+    def __init__(self, proxy: Optional[str] = None, password: Optional[str] = None, **kwargs):
+        super().__init__(password=password, **kwargs)  # forwarded to the base reader; self.password is the fallback in _decrypt_pdf
         self.proxy = proxy
 
-    def read(self, url: str) -> List[Document]:
+    def read(self, url: str, password: Optional[str] = None) -> List[Document]:
         if not url:
             raise ValueError("No url provided")
 
@@ -344,10 +376,14 @@ def read(self, url: str) -> List[Document]:
         doc_name = url.split("/")[-1].split(".")[0].replace("/", "_").replace(" ", "_")
         pdf_reader = DocumentReader(BytesIO(response.content))
 
+        # Handle PDF decryption
+        if not self._decrypt_pdf(pdf_reader, doc_name, password):
+            return []
+
         # Read and chunk.
         return self._pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=False)
 
-    async def async_read(self, url: str) -> List[Document]:
+    async def async_read(self, url: str, password: Optional[str] = None) -> List[Document]:
         if not url:
             raise ValueError("No url provided")
 
@@ -364,14 +400,18 @@ async def async_read(self, url: str) -> List[Document]:
         doc_name = url.split("/")[-1].split(".")[0].replace("/", "_").replace(" ", "_")
         pdf_reader = DocumentReader(BytesIO(response.content))
 
+        # Handle PDF decryption
+        if not self._decrypt_pdf(pdf_reader, doc_name, password):
+            return []
+
         # Read and chunk.
         return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, use_uuid_for_id=False)
 
 
 class PDFImageReader(BasePDFReader):
     """Reader for PDF files with text and images extraction"""
 
-    def read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
+    def read(self, pdf: Union[str, Path, IO[Any]], password: Optional[str] = None) -> List[Document]:
         if not pdf:
             raise ValueError("No pdf provided")
 
@@ -386,10 +426,14 @@ def read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
         log_info(f"Reading: {doc_name}")
         pdf_reader = DocumentReader(pdf)
 
+        # Handle PDF decryption
+        if not self._decrypt_pdf(pdf_reader, doc_name, password):
+            return []
+
         # Read and chunk.
         return self._pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)
 
-    async def async_read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
+    async def async_read(self, pdf: Union[str, Path, IO[Any]], password: Optional[str] = None) -> List[Document]:
         if not pdf:
             raise ValueError("No pdf provided")
 
@@ -404,18 +448,22 @@ async def async_read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
         log_info(f"Reading: {doc_name}")
         pdf_reader = DocumentReader(pdf)
 
+        # Handle PDF decryption
+        if not self._decrypt_pdf(pdf_reader, doc_name, password):
+            return []
+
         # Read and chunk.
         return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)
 
 
 class PDFUrlImageReader(BasePDFReader):
     """Reader for PDF files from URL with text and images extraction"""
 
-    def __init__(self, proxy: Optional[str] = None, **kwargs):
-        super().__init__(**kwargs)
+    def __init__(self, proxy: Optional[str] = None, password: Optional[str] = None, **kwargs):
+        super().__init__(password=password, **kwargs)  # forwarded to the base reader; self.password is the fallback in _decrypt_pdf
         self.proxy = proxy
 
-    def read(self, url: str) -> List[Document]:
+    def read(self, url: str, password: Optional[str] = None) -> List[Document]:
         if not url:
             raise ValueError("No url provided")
 
@@ -430,10 +478,14 @@ def read(self, url: str) -> List[Document]:
         doc_name = url.split("/")[-1].split(".")[0].replace(" ", "_")
         pdf_reader = DocumentReader(BytesIO(response.content))
 
+        # Handle PDF decryption
+        if not self._decrypt_pdf(pdf_reader, doc_name, password):
+            return []
+
         # Read and chunk.
         return self._pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)
 
-    async def async_read(self, url: str) -> List[Document]:
+    async def async_read(self, url: str, password: Optional[str] = None) -> List[Document]:
         if not url:
             raise ValueError("No url provided")
 
@@ -451,5 +503,9 @@ async def async_read(self, url: str) -> List[Document]:
         doc_name = url.split("/")[-1].split(".")[0].replace(" ", "_")
         pdf_reader = DocumentReader(BytesIO(response.content))
 
+        # Handle PDF decryption
+        if not self._decrypt_pdf(pdf_reader, doc_name, password):
+            return []
+
         # Read and chunk.
         return await self._async_pdf_reader_to_documents(pdf_reader, doc_name, read_images=True, use_uuid_for_id=False)
@@ -2,15 +2,22 @@
 from typing import Any, AsyncIterator, Dict, Iterator, List, Optional, Union
 
 from pydantic import Field
+from typing_extensions import TypedDict
 
 from agno.document import Document
 from agno.document.reader.pdf_reader import PDFImageReader, PDFReader
 from agno.knowledge.agent import AgentKnowledge
-from agno.utils.log import log_info, logger
+from agno.utils.log import log_error, log_info, logger
+
+
+class PDFConfig(TypedDict, total=False):  # total=False: every key is optional at the type level
+    path: str  # location of the PDF; list entries without a "path" key are skipped by document_lists
+    password: Optional[str]  # per-file password handed to reader.read / async_read; non-str values are dropped
+    metadata: Optional[Dict[str, Any]]  # merged into meta_data of every document read from this file
 
 
 class PDFKnowledgeBase(AgentKnowledge):
-    path: Optional[Union[str, Path, List[Dict[str, Union[str, Dict[str, Any]]]]]] = None
+    path: Optional[Union[str, Path, List[PDFConfig]]] = None
     formats: List[str] = [".pdf"]
     exclude_files: List[str] = Field(default_factory=list)
     reader: Union[PDFReader, PDFImageReader] = PDFReader()
@@ -24,19 +31,21 @@ def document_lists(self) -> Iterator[List[Document]]:
         if isinstance(self.path, list):
             for item in self.path:
                 if isinstance(item, dict) and "path" in item:
-                    # Handle path with metadata
                     file_path = item["path"]
                     config = item.get("metadata", {})
+                    file_password = item.get("password")
+                    if file_password is not None and not isinstance(file_password, str):
+                        file_password = None
+
                     _pdf_path = Path(file_path)  # type: ignore
                     if self._is_valid_pdf(_pdf_path):
-                        documents = self.reader.read(pdf=_pdf_path)
+                        documents = self.reader.read(pdf=_pdf_path, password=file_password)
                         if config:
                             for doc in documents:
                                 log_info(f"Adding metadata {config} to document: {doc.name}")
                                 doc.meta_data.update(config)  # type: ignore
                         yield documents
         else:
-            # Handle single path
             _pdf_path = Path(self.path)
             if _pdf_path.is_dir():
                 for _pdf in _pdf_path.glob("**/*.pdf"):
@@ -47,7 +56,19 @@ def document_lists(self) -> Iterator[List[Document]]:
 
     def _is_valid_pdf(self, path: Path) -> bool:
         """Helper to check if path is a valid PDF file."""
-        return path.exists() and path.is_file() and path.suffix == ".pdf" and path.name not in self.exclude_files
+        if not path.exists():  # each rejection is logged so a skipped file is visible to the user
+            log_error(f"PDF file not found: {path}")
+            return False
+        if not path.is_file():
+            log_error(f"Path is not a file: {path}")
+            return False
+        if path.suffix.lower() != ".pdf":  # compare case-insensitively so ".PDF" is accepted too
+            log_error(f"File is not a PDF: {path}")
+            return False
+        if path.name in self.exclude_files:
+            log_info(f"PDF file excluded: {path}")  # deliberate exclusion, so not reported as an error
+            return False
+        return True
 
     @property
     async def async_document_lists(self) -> AsyncIterator[List[Document]]:
@@ -58,12 +79,15 @@ async def async_document_lists(self) -> AsyncIterator[List[Document]]:
         if isinstance(self.path, list):
             for item in self.path:
                 if isinstance(item, dict) and "path" in item:
-                    # Handle path with metadata
                     file_path = item["path"]
                     config = item.get("metadata", {})
+                    file_password = item.get("password")
+                    if file_password is not None and not isinstance(file_password, str):
+                        file_password = None
+
                     _pdf_path = Path(file_path)  # type: ignore
                     if self._is_valid_pdf(_pdf_path):
-                        documents = await self.reader.async_read(pdf=_pdf_path)
+                        documents = await self.reader.async_read(pdf=_pdf_path, password=file_password)
                         if config:
                             for doc in documents:
                                 log_info(f"Adding metadata {config} to document: {doc.name}")