7
7
from pathlib import Path
8
8
from typing import BinaryIO , Optional , Sequence , Tuple , Union
9
9
10
- import pikepdf
10
+ import pypdfium2 as pdfium
11
11
12
12
from mindee .error .mimetype_error import MimeTypeError
13
13
from mindee .error .mindee_error import MindeeError , MindeeSourceError
@@ -117,14 +117,14 @@ def count_doc_pages(self) -> int:
117
117
:return: the number of pages.
118
118
"""
119
119
self .file_object .seek (0 )
120
- with pikepdf . open (self .file_object ) as pdf :
121
- return len (pdf . pages )
120
+ pdf = pdfium . PdfDocument (self .file_object )
121
+ return len (pdf )
122
122
123
123
def process_pdf (
124
- self ,
125
- behavior : str ,
126
- on_min_pages : int ,
127
- page_indexes : Sequence ,
124
+ self ,
125
+ behavior : str ,
126
+ on_min_pages : int ,
127
+ page_indexes : Sequence ,
128
128
) -> None :
129
129
"""Run any required processing on a PDF file."""
130
130
if self .is_pdf_empty ():
@@ -163,14 +163,13 @@ def merge_pdf_pages(self, page_numbers: set) -> None:
163
163
:return: None
164
164
"""
165
165
self .file_object .seek (0 )
166
- new_pdf = pikepdf .Pdf .new ()
167
- with pikepdf .open (self .file_object ) as pdf :
168
- for page_id in page_numbers :
169
- page = pdf .pages [page_id ]
170
- new_pdf .pages .append (page )
166
+ new_pdf = pdfium .PdfDocument .new ()
167
+ pdf = pdfium .PdfDocument (self .file_object )
168
+ new_pdf .import_pages (pdf , list (page_numbers ))
171
169
self .file_object .close ()
172
- self .file_object = io .BytesIO ()
173
- new_pdf .save (self .file_object )
170
+ bytes_io = io .BytesIO ()
171
+ new_pdf .save (bytes_io )
172
+ self .file_object = bytes_io
174
173
175
174
def is_pdf_empty(self) -> bool:
    """
    Check if the PDF is empty.

    The document is considered empty when none of its pages holds at
    least one page object, as reported by ``page.get_objects()``.

    :return: ``True`` if the PDF is empty
    """
    # Rewind first: the stream may have been read by an earlier operation.
    self.file_object.seek(0)
    pdf = pdfium.PdfDocument(self.file_object)
    # The document and its pages wrap native PDFium handles. Release them
    # explicitly (as the previous ``with``-based implementation did)
    # instead of waiting for garbage collection.
    # NOTE(review): closing the document is expected to leave
    # ``self.file_object`` open, since pypdfium2 does not auto-close
    # caller-supplied buffers by default -- confirm for the pinned version.
    try:
        for page_index in range(len(pdf)):
            page = pdf.get_page(page_index)
            try:
                # A single object is enough to call the page non-empty;
                # ``any`` stops the generator at the first one.
                if any(True for _ in page.get_objects()):
                    return False
            finally:
                page.close()
        return True
    finally:
        pdf.close()
200
192
201
193
def read_contents (self , close_file : bool ) -> Tuple [str , bytes ]:
202
194
"""
0 commit comments