Skip to content

Commit f74bb46

Browse files
♻️ replace pikepdf with pypdfium2 (#226)
1 parent 8d2539a commit f74bb46

File tree

5 files changed

+34
-44
lines changed

5 files changed

+34
-44
lines changed

.pre-commit-config.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@ repos:
4141
exclude: "tests/|examples/|docs/"
4242
additional_dependencies:
4343
- toml
44-
- pikepdf
4544
- types-pytz
4645
- types-requests
4746
- types-setuptools

mindee/input/sources.py

Lines changed: 14 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from pathlib import Path
88
from typing import BinaryIO, Optional, Sequence, Tuple, Union
99

10-
import pikepdf
10+
import pypdfium2 as pdfium
1111

1212
from mindee.error.mimetype_error import MimeTypeError
1313
from mindee.error.mindee_error import MindeeError, MindeeSourceError
@@ -117,8 +117,8 @@ def count_doc_pages(self) -> int:
117117
:return: the number of pages.
118118
"""
119119
self.file_object.seek(0)
120-
with pikepdf.open(self.file_object) as pdf:
121-
return len(pdf.pages)
120+
pdf = pdfium.PdfDocument(self.file_object)
121+
return len(pdf)
122122

123123
def process_pdf(
124124
self,
@@ -163,14 +163,13 @@ def merge_pdf_pages(self, page_numbers: set) -> None:
163163
:return: None
164164
"""
165165
self.file_object.seek(0)
166-
new_pdf = pikepdf.Pdf.new()
167-
with pikepdf.open(self.file_object) as pdf:
168-
for page_id in page_numbers:
169-
page = pdf.pages[page_id]
170-
new_pdf.pages.append(page)
166+
new_pdf = pdfium.PdfDocument.new()
167+
pdf = pdfium.PdfDocument(self.file_object)
168+
new_pdf.import_pages(pdf, list(page_numbers))
171169
self.file_object.close()
172-
self.file_object = io.BytesIO()
173-
new_pdf.save(self.file_object)
170+
bytes_io = io.BytesIO()
171+
new_pdf.save(bytes_io)
172+
self.file_object = bytes_io
174173

175174
def is_pdf_empty(self) -> bool:
176175
"""
@@ -179,24 +178,11 @@ def is_pdf_empty(self) -> bool:
179178
:return: ``True`` if the PDF is empty
180179
"""
181180
self.file_object.seek(0)
182-
with pikepdf.open(self.file_object) as pdf:
183-
for page in pdf.pages:
184-
# mypy incorrectly identifies the "/Length" key's value as
185-
# an object rather than an int.
186-
try:
187-
total_size = page["/Contents"]["/Length"]
188-
except ValueError:
189-
total_size = 0 # type: ignore
190-
for content in page["/Contents"]: # type: ignore
191-
total_size += content["/Length"]
192-
has_data = total_size > 1000 # type: ignore
193-
194-
has_font = "/Font" in page["/Resources"].keys()
195-
has_xobj = "/XObject" in page["/Resources"].keys()
196-
197-
if has_font or has_xobj or has_data:
198-
return False
199-
return True
181+
pdf = pdfium.PdfDocument(self.file_object)
182+
for page in pdf:
183+
for _ in page.get_objects():
184+
return False
185+
return True
200186

201187
def read_contents(self, close_file: bool) -> Tuple[str, bytes]:
202188
"""

pyproject.toml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,17 @@ strict_equality = true
2222
warn_unused_ignores = true
2323
warn_unreachable = true
2424

25+
[[tool.mypy.overrides]]
26+
module = "pypdfium2.*"
27+
ignore_missing_imports = true
28+
2529
[tool.pylic]
2630
safe_licenses = [
2731
"Apache Software License",
2832
"MIT License",
2933
"Mozilla Public License 2.0 (MPL 2.0)",
3034
"BSD License",
31-
"Historical Permission Notice and Disclaimer (HPND)",
35+
"(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty"
3236
]
3337

3438
[tool.pytest.ini_options]

setup.cfg

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,7 @@ packages = find:
3232
include_package_data = True
3333
python_requires = >=3.7
3434
install_requires =
35-
pikepdf~=8.6;python_version>="3.8"
36-
pikepdf==6.2.9;python_version<"3.8"
35+
pypdfium2>=4.0,<5
3736
pytz>=2023.3
3837
requests~=2.31
3938

tests/test_inputs.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import io
22
from pathlib import Path
33

4-
import pikepdf
4+
import pypdfium2 as pdfium
55
import pytest
66

77
from mindee.error.mimetype_error import MimeTypeError
@@ -18,6 +18,7 @@
1818

1919
FILE_TYPES_DIR = Path("./tests/data/file_types")
2020

21+
2122
#
2223
# PDF
2324
#
@@ -55,18 +56,19 @@ def test_pdf_cut_n_pages(numb_pages: int):
5556
)
5657
assert input_obj.count_doc_pages() == numb_pages
5758

58-
# Each page in the PDF has a unique (and increasing) /Content /Length.
59-
# We use this to make sure we have the correct pages
60-
cut_pdf = pikepdf.open(input_obj.file_object)
61-
with pikepdf.open(
62-
FILE_TYPES_DIR / "pdf" / f"multipage_cut-{numb_pages}.pdf"
63-
) as pdf:
64-
for idx, page in enumerate(pdf.pages):
65-
assert (
66-
page["/Contents"]["/Length"]
67-
== cut_pdf.pages[idx]["/Contents"]["/Length"]
68-
)
59+
# Rendering is currently the least verbose way of comparing pages with pypdfium2:
60+
# each page is rasterized to a bitmap, and the bitmaps' raw buffers are then compared byte for byte.
61+
cut_pdf = pdfium.PdfDocument(input_obj.file_object)
62+
pdf = pdfium.PdfDocument(FILE_TYPES_DIR / "pdf" / f"multipage_cut-{numb_pages}.pdf")
63+
for idx in range(len(pdf)):
64+
pdf_page = pdf.get_page(idx)
65+
pdf_page_render = pdfium.PdfPage.render(pdf_page)
66+
cut_pdf_page = cut_pdf.get_page(idx)
67+
cut_pdf_page_render = pdfium.PdfPage.render(cut_pdf_page)
68+
69+
assert bytes(pdf_page_render.buffer) == bytes(cut_pdf_page_render.buffer)
6970
cut_pdf.close()
71+
pdf.close()
7072

7173

7274
def test_pdf_keep_5_first_pages():

0 commit comments

Comments
 (0)