diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4e993f12..785a98de 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -41,7 +41,6 @@ repos: exclude: "tests/|examples/|docs/" additional_dependencies: - toml - - pikepdf - types-pytz - types-requests - types-setuptools diff --git a/mindee/input/sources.py b/mindee/input/sources.py index 57877d09..5978d17d 100644 --- a/mindee/input/sources.py +++ b/mindee/input/sources.py @@ -7,7 +7,7 @@ from pathlib import Path from typing import BinaryIO, Optional, Sequence, Tuple, Union -import pikepdf +import pypdfium2 as pdfium from mindee.error.mimetype_error import MimeTypeError from mindee.error.mindee_error import MindeeError, MindeeSourceError @@ -117,8 +117,8 @@ def count_doc_pages(self) -> int: :return: the number of pages. """ self.file_object.seek(0) - with pikepdf.open(self.file_object) as pdf: - return len(pdf.pages) + pdf = pdfium.PdfDocument(self.file_object) + return len(pdf) def process_pdf( self, @@ -163,14 +163,13 @@ def merge_pdf_pages(self, page_numbers: set) -> None: :return: None """ self.file_object.seek(0) - new_pdf = pikepdf.Pdf.new() - with pikepdf.open(self.file_object) as pdf: - for page_id in page_numbers: - page = pdf.pages[page_id] - new_pdf.pages.append(page) + new_pdf = pdfium.PdfDocument.new() + pdf = pdfium.PdfDocument(self.file_object) + new_pdf.import_pages(pdf, list(page_numbers)) self.file_object.close() - self.file_object = io.BytesIO() - new_pdf.save(self.file_object) + bytes_io = io.BytesIO() + new_pdf.save(bytes_io) + self.file_object = bytes_io def is_pdf_empty(self) -> bool: """ @@ -179,24 +178,11 @@ def is_pdf_empty(self) -> bool: :return: ``True`` if the PDF is empty """ self.file_object.seek(0) - with pikepdf.open(self.file_object) as pdf: - for page in pdf.pages: - # mypy incorrectly identifies the "/Length" key's value as - # an object rather than an int. - try: - total_size = page["/Contents"]["/Length"] - except ValueError: - total_size = 0 # type: ignore - for content in page["/Contents"]: # type: ignore - total_size += content["/Length"] - has_data = total_size > 1000 # type: ignore - - has_font = "/Font" in page["/Resources"].keys() - has_xobj = "/XObject" in page["/Resources"].keys() - - if has_font or has_xobj or has_data: - return False - return True + pdf = pdfium.PdfDocument(self.file_object) + for page in pdf: + for _ in page.get_objects(): + return False + return True def read_contents(self, close_file: bool) -> Tuple[str, bytes]: """ diff --git a/pyproject.toml b/pyproject.toml index de8930e9..4874a326 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,13 +22,17 @@ strict_equality = true warn_unused_ignores = true warn_unreachable = true +[[tool.mypy.overrides]] +module = "pypdfium2.*" +ignore_missing_imports = true + [tool.pylic] safe_licenses = [ "Apache Software License", "MIT License", "Mozilla Public License 2.0 (MPL 2.0)", "BSD License", - "Historical Permission Notice and Disclaimer (HPND)", + "(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty" ] [tool.pytest.ini_options] diff --git a/setup.cfg b/setup.cfg index e565b108..5d100563 100644 --- a/setup.cfg +++ b/setup.cfg @@ -32,8 +32,7 @@ packages = find: include_package_data = True python_requires = >=3.7 install_requires = - pikepdf~=8.6;python_version>="3.8" - pikepdf==6.2.9;python_version<"3.8" + pypdfium2>=4.0,<5 pytz>=2023.3 requests~=2.31 diff --git a/tests/test_inputs.py b/tests/test_inputs.py index c09b2fe1..1c67e8ef 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -1,7 +1,7 @@ import io from pathlib import Path -import pikepdf +import pypdfium2 as pdfium import pytest from mindee.error.mimetype_error import MimeTypeError @@ -18,6 +18,7 @@ FILE_TYPES_DIR = Path("./tests/data/file_types") + # # PDF # @@ -55,18 +56,19 @@ def test_pdf_cut_n_pages(numb_pages: int): ) assert input_obj.count_doc_pages() == numb_pages - # Each page in the PDF has a unique (and increasing) /Content /Length. - # We use this to make sure we have the correct pages - cut_pdf = pikepdf.open(input_obj.file_object) - with pikepdf.open( - FILE_TYPES_DIR / "pdf" / f"multipage_cut-{numb_pages}.pdf" - ) as pdf: - for idx, page in enumerate(pdf.pages): - assert ( - page["/Contents"]["/Length"] - == cut_pdf.pages[idx]["/Contents"]["/Length"] - ) + # Currently the least verbose way of comparing pages with pypdfium2 + # I.e. each page is read & rendered as a rasterized image. These images are then compared as raw byte sequences. + cut_pdf = pdfium.PdfDocument(input_obj.file_object) + pdf = pdfium.PdfDocument(FILE_TYPES_DIR / "pdf" / f"multipage_cut-{numb_pages}.pdf") + for idx in range(len(pdf)): + pdf_page = pdf.get_page(idx) + pdf_page_render = pdfium.PdfPage.render(pdf_page) + cut_pdf_page = cut_pdf.get_page(idx) + cut_pdf_page_render = pdfium.PdfPage.render(cut_pdf_page) + + assert bytes(pdf_page_render.buffer) == bytes(cut_pdf_page_render.buffer) cut_pdf.close() + pdf.close() def test_pdf_keep_5_first_pages():