Skip to content

♻️ replace pikepdf with pypdfium2 #226

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
May 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ repos:
exclude: "tests/|examples/|docs/"
additional_dependencies:
- toml
- pikepdf
- types-pytz
- types-requests
- types-setuptools
Expand Down
42 changes: 14 additions & 28 deletions mindee/input/sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pathlib import Path
from typing import BinaryIO, Optional, Sequence, Tuple, Union

import pikepdf
import pypdfium2 as pdfium

from mindee.error.mimetype_error import MimeTypeError
from mindee.error.mindee_error import MindeeError, MindeeSourceError
Expand Down Expand Up @@ -117,8 +117,8 @@ def count_doc_pages(self) -> int:
:return: the number of pages.
"""
self.file_object.seek(0)
with pikepdf.open(self.file_object) as pdf:
return len(pdf.pages)
pdf = pdfium.PdfDocument(self.file_object)
return len(pdf)

def process_pdf(
self,
Expand Down Expand Up @@ -163,14 +163,13 @@ def merge_pdf_pages(self, page_numbers: set) -> None:
:return: None
"""
self.file_object.seek(0)
new_pdf = pikepdf.Pdf.new()
with pikepdf.open(self.file_object) as pdf:
for page_id in page_numbers:
page = pdf.pages[page_id]
new_pdf.pages.append(page)
new_pdf = pdfium.PdfDocument.new()
pdf = pdfium.PdfDocument(self.file_object)
new_pdf.import_pages(pdf, list(page_numbers))
self.file_object.close()
self.file_object = io.BytesIO()
new_pdf.save(self.file_object)
bytes_io = io.BytesIO()
new_pdf.save(bytes_io)
self.file_object = bytes_io

def is_pdf_empty(self) -> bool:
"""
Expand All @@ -179,24 +178,11 @@ def is_pdf_empty(self) -> bool:
:return: ``True`` if the PDF is empty
"""
self.file_object.seek(0)
with pikepdf.open(self.file_object) as pdf:
for page in pdf.pages:
# mypy incorrectly identifies the "/Length" key's value as
# an object rather than an int.
try:
total_size = page["/Contents"]["/Length"]
except ValueError:
total_size = 0 # type: ignore
for content in page["/Contents"]: # type: ignore
total_size += content["/Length"]
has_data = total_size > 1000 # type: ignore

has_font = "/Font" in page["/Resources"].keys()
has_xobj = "/XObject" in page["/Resources"].keys()

if has_font or has_xobj or has_data:
return False
return True
pdf = pdfium.PdfDocument(self.file_object)
for page in pdf:
for _ in page.get_objects():
return False
return True

def read_contents(self, close_file: bool) -> Tuple[str, bytes]:
"""
Expand Down
6 changes: 5 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,17 @@ strict_equality = true
warn_unused_ignores = true
warn_unreachable = true

[[tool.mypy.overrides]]
module = "pypdfium2.*"
ignore_missing_imports = true

[tool.pylic]
safe_licenses = [
"Apache Software License",
"MIT License",
"Mozilla Public License 2.0 (MPL 2.0)",
"BSD License",
"Historical Permission Notice and Disclaimer (HPND)",
"(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty"
]

[tool.pytest.ini_options]
Expand Down
3 changes: 1 addition & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,7 @@ packages = find:
include_package_data = True
python_requires = >=3.7
install_requires =
pikepdf~=8.6;python_version>="3.8"
pikepdf==6.2.9;python_version<"3.8"
pypdfium2>=4.0,<5
pytz>=2023.3
requests~=2.31

Expand Down
26 changes: 14 additions & 12 deletions tests/test_inputs.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import io
from pathlib import Path

import pikepdf
import pypdfium2 as pdfium
import pytest

from mindee.error.mimetype_error import MimeTypeError
Expand All @@ -18,6 +18,7 @@

FILE_TYPES_DIR = Path("./tests/data/file_types")


#
# PDF
#
Expand Down Expand Up @@ -55,18 +56,19 @@ def test_pdf_cut_n_pages(numb_pages: int):
)
assert input_obj.count_doc_pages() == numb_pages

# Each page in the PDF has a unique (and increasing) /Content /Length.
# We use this to make sure we have the correct pages
cut_pdf = pikepdf.open(input_obj.file_object)
with pikepdf.open(
FILE_TYPES_DIR / "pdf" / f"multipage_cut-{numb_pages}.pdf"
) as pdf:
for idx, page in enumerate(pdf.pages):
assert (
page["/Contents"]["/Length"]
== cut_pdf.pages[idx]["/Contents"]["/Length"]
)
# Currently the least verbose way of comparing pages with pypdfium2:
# each page is read and rendered as a rasterized image, and the resulting images are then compared as raw byte sequences.
cut_pdf = pdfium.PdfDocument(input_obj.file_object)
pdf = pdfium.PdfDocument(FILE_TYPES_DIR / "pdf" / f"multipage_cut-{numb_pages}.pdf")
for idx in range(len(pdf)):
pdf_page = pdf.get_page(idx)
pdf_page_render = pdfium.PdfPage.render(pdf_page)
cut_pdf_page = cut_pdf.get_page(idx)
cut_pdf_page_render = pdfium.PdfPage.render(cut_pdf_page)

assert bytes(pdf_page_render.buffer) == bytes(cut_pdf_page_render.buffer)
cut_pdf.close()
pdf.close()


def test_pdf_keep_5_first_pages():
Expand Down
Loading