77from agno .document .base import Document
88from agno .document .reader .base import Reader
99from agno .utils .http import async_fetch_with_retry , fetch_with_retry
10- from agno .utils .log import log_info , logger
10+ from agno .utils .log import log_error , log_info , logger
1111
1212try :
1313 from pypdf import PdfReader as DocumentReader # noqa: F401
@@ -177,6 +177,7 @@ def __init__(
177177 split_on_pages : bool = True ,
178178 page_start_numbering_format : Optional [str ] = None ,
179179 page_end_numbering_format : Optional [str ] = None ,
180+ password : Optional [str ] = None ,
180181 ** kwargs ,
181182 ):
182183 if page_start_numbering_format is None :
@@ -187,6 +188,7 @@ def __init__(
187188 self .split_on_pages = split_on_pages
188189 self .page_start_numbering_format = page_start_numbering_format
189190 self .page_end_numbering_format = page_end_numbering_format
191+ self .password = password
190192
191193 super ().__init__ (** kwargs )
192194
@@ -196,6 +198,28 @@ def _build_chunked_documents(self, documents: List[Document]) -> List[Document]:
196198 chunked_documents .extend (self .chunk_document (document ))
197199 return chunked_documents
198200
def _decrypt_pdf(self, doc_reader: DocumentReader, doc_name: str, password: Optional[str] = None) -> bool:
    """Unlock an encrypted PDF so that its pages can be read.

    Args:
        doc_reader: The pypdf reader wrapping the PDF.
        doc_name: Name of the document, used only in log messages.
        password: Per-call password. When empty or None, the password given
            to the reader's constructor (``self.password``) is used instead.

    Returns:
        True if the PDF is not encrypted or was decrypted successfully,
        False otherwise. Failures are logged, never raised.
    """
    if not doc_reader.is_encrypted:
        return True

    # Use provided password or fall back to instance password
    pdf_password = password or self.password

    try:
        # PDFs protected only by an owner (permissions) password are flagged
        # as encrypted but open with an empty user password, so the empty
        # string is still tried when no password has been configured.
        decrypted_pdf = doc_reader.decrypt(pdf_password or "")
    except Exception as e:
        log_error(f"Error decrypting PDF {doc_name}: {e}")
        return False

    # pypdf reports a failed decryption with a falsy result.
    if decrypted_pdf:
        if pdf_password:
            log_info(f"Successfully decrypted PDF {doc_name} with user password")
        else:
            log_info(f"Successfully decrypted PDF {doc_name} with empty password")
        return True

    if not pdf_password:
        log_error(f"PDF {doc_name} is password protected but no password provided")
    else:
        log_error(f"Failed to decrypt PDF {doc_name}: incorrect password")
    return False
222+
199223 def _create_documents (self , pdf_content : List [str ], doc_name : str , use_uuid_for_id : bool , page_number_shift ):
200224 if self .split_on_pages :
201225 shift = page_number_shift if page_number_shift is not None else 1
@@ -282,7 +306,7 @@ async def _read_pdf_page(page, read_images) -> Tuple[str, str]:
282306class PDFReader (BasePDFReader ):
283307 """Reader for PDF files"""
284308
285- def read (self , pdf : Union [str , Path , IO [Any ]]) -> List [Document ]:
309+ def read (self , pdf : Union [str , Path , IO [Any ]], password : Optional [ str ] = None ) -> List [Document ]:
286310 try :
287311 if isinstance (pdf , str ):
288312 doc_name = pdf .split ("/" )[- 1 ].split ("." )[0 ].replace (" " , "_" )
@@ -299,10 +323,14 @@ def read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
299323 logger .error (f"Error reading PDF: { e } " )
300324 return []
301325
326+ # Handle PDF decryption
327+ if not self ._decrypt_pdf (pdf_reader , doc_name , password ):
328+ return []
329+
302330 # Read and chunk.
303331 return self ._pdf_reader_to_documents (pdf_reader , doc_name , use_uuid_for_id = True )
304332
305- async def async_read (self , pdf : Union [str , Path , IO [Any ]]) -> List [Document ]:
333+ async def async_read (self , pdf : Union [str , Path , IO [Any ]], password : Optional [ str ] = None ) -> List [Document ]:
306334 try :
307335 if isinstance (pdf , str ):
308336 doc_name = pdf .split ("/" )[- 1 ].split ("." )[0 ].replace (" " , "_" )
@@ -319,18 +347,22 @@ async def async_read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
319347 logger .error (f"Error reading PDF: { e } " )
320348 return []
321349
350+ # Handle PDF decryption
351+ if not self ._decrypt_pdf (pdf_reader , doc_name , password ):
352+ return []
353+
322354 # Read and chunk.
323355 return await self ._async_pdf_reader_to_documents (pdf_reader , doc_name , use_uuid_for_id = True )
324356
325357
326358class PDFUrlReader (BasePDFReader ):
327359 """Reader for PDF files from URL"""
328360
329- def __init__ (self , proxy : Optional [str ] = None , ** kwargs ):
330- super ().__init__ (** kwargs )
361+ def __init__ (self , proxy : Optional [str ] = None , password : Optional [ str ] = None , ** kwargs ):
362+ super ().__init__ (password = password , ** kwargs )
331363 self .proxy = proxy
332364
333- def read (self , url : str ) -> List [Document ]:
365+ def read (self , url : str , password : Optional [ str ] = None ) -> List [Document ]:
334366 if not url :
335367 raise ValueError ("No url provided" )
336368
@@ -344,10 +376,14 @@ def read(self, url: str) -> List[Document]:
344376 doc_name = url .split ("/" )[- 1 ].split ("." )[0 ].replace ("/" , "_" ).replace (" " , "_" )
345377 pdf_reader = DocumentReader (BytesIO (response .content ))
346378
379+ # Handle PDF decryption
380+ if not self ._decrypt_pdf (pdf_reader , doc_name , password ):
381+ return []
382+
347383 # Read and chunk.
348384 return self ._pdf_reader_to_documents (pdf_reader , doc_name , use_uuid_for_id = False )
349385
350- async def async_read (self , url : str ) -> List [Document ]:
386+ async def async_read (self , url : str , password : Optional [ str ] = None ) -> List [Document ]:
351387 if not url :
352388 raise ValueError ("No url provided" )
353389
@@ -364,14 +400,18 @@ async def async_read(self, url: str) -> List[Document]:
364400 doc_name = url .split ("/" )[- 1 ].split ("." )[0 ].replace ("/" , "_" ).replace (" " , "_" )
365401 pdf_reader = DocumentReader (BytesIO (response .content ))
366402
403+ # Handle PDF decryption
404+ if not self ._decrypt_pdf (pdf_reader , doc_name , password ):
405+ return []
406+
367407 # Read and chunk.
368408 return await self ._async_pdf_reader_to_documents (pdf_reader , doc_name , use_uuid_for_id = False )
369409
370410
371411class PDFImageReader (BasePDFReader ):
372412 """Reader for PDF files with text and images extraction"""
373413
374- def read (self , pdf : Union [str , Path , IO [Any ]]) -> List [Document ]:
414+ def read (self , pdf : Union [str , Path , IO [Any ]], password : Optional [ str ] = None ) -> List [Document ]:
375415 if not pdf :
376416 raise ValueError ("No pdf provided" )
377417
@@ -386,10 +426,14 @@ def read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
386426 log_info (f"Reading: { doc_name } " )
387427 pdf_reader = DocumentReader (pdf )
388428
429+ # Handle PDF decryption
430+ if not self ._decrypt_pdf (pdf_reader , doc_name , password ):
431+ return []
432+
389433 # Read and chunk.
390434 return self ._pdf_reader_to_documents (pdf_reader , doc_name , read_images = True , use_uuid_for_id = False )
391435
392- async def async_read (self , pdf : Union [str , Path , IO [Any ]]) -> List [Document ]:
436+ async def async_read (self , pdf : Union [str , Path , IO [Any ]], password : Optional [ str ] = None ) -> List [Document ]:
393437 if not pdf :
394438 raise ValueError ("No pdf provided" )
395439
@@ -404,18 +448,22 @@ async def async_read(self, pdf: Union[str, Path, IO[Any]]) -> List[Document]:
404448 log_info (f"Reading: { doc_name } " )
405449 pdf_reader = DocumentReader (pdf )
406450
451+ # Handle PDF decryption
452+ if not self ._decrypt_pdf (pdf_reader , doc_name , password ):
453+ return []
454+
407455 # Read and chunk.
408456 return await self ._async_pdf_reader_to_documents (pdf_reader , doc_name , read_images = True , use_uuid_for_id = False )
409457
410458
411459class PDFUrlImageReader (BasePDFReader ):
412460 """Reader for PDF files from URL with text and images extraction"""
413461
414- def __init__ (self , proxy : Optional [str ] = None , ** kwargs ):
415- super ().__init__ (** kwargs )
462+ def __init__ (self , proxy : Optional [str ] = None , password : Optional [ str ] = None , ** kwargs ):
463+ super ().__init__ (password = password , ** kwargs )
416464 self .proxy = proxy
417465
418- def read (self , url : str ) -> List [Document ]:
466+ def read (self , url : str , password : Optional [ str ] = None ) -> List [Document ]:
419467 if not url :
420468 raise ValueError ("No url provided" )
421469
@@ -430,10 +478,14 @@ def read(self, url: str) -> List[Document]:
430478 doc_name = url .split ("/" )[- 1 ].split ("." )[0 ].replace (" " , "_" )
431479 pdf_reader = DocumentReader (BytesIO (response .content ))
432480
481+ # Handle PDF decryption
482+ if not self ._decrypt_pdf (pdf_reader , doc_name , password ):
483+ return []
484+
433485 # Read and chunk.
434486 return self ._pdf_reader_to_documents (pdf_reader , doc_name , read_images = True , use_uuid_for_id = False )
435487
436- async def async_read (self , url : str ) -> List [Document ]:
488+ async def async_read (self , url : str , password : Optional [ str ] = None ) -> List [Document ]:
437489 if not url :
438490 raise ValueError ("No url provided" )
439491
@@ -451,5 +503,9 @@ async def async_read(self, url: str) -> List[Document]:
451503 doc_name = url .split ("/" )[- 1 ].split ("." )[0 ].replace (" " , "_" )
452504 pdf_reader = DocumentReader (BytesIO (response .content ))
453505
506+ # Handle PDF decryption
507+ if not self ._decrypt_pdf (pdf_reader , doc_name , password ):
508+ return []
509+
454510 # Read and chunk.
455511 return await self ._async_pdf_reader_to_documents (pdf_reader , doc_name , read_images = True , use_uuid_for_id = False )
0 commit comments