From 0c00c07d3e2b24ef55003b8822c6d4a920703c47 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Thu, 7 Mar 2024 16:02:59 +0100 Subject: [PATCH 01/10] temp save --- .pre-commit-config.yaml | 2 +- mindee/input/sources.py | 56 ++++++++++++++++++----------------------- setup.cfg | 3 +-- 3 files changed, 26 insertions(+), 35 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4e993f12..d665f994 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -41,7 +41,7 @@ repos: exclude: "tests/|examples/|docs/" additional_dependencies: - toml - - pikepdf + - pypdfium2 - types-pytz - types-requests - types-setuptools diff --git a/mindee/input/sources.py b/mindee/input/sources.py index 57877d09..c97d75ed 100644 --- a/mindee/input/sources.py +++ b/mindee/input/sources.py @@ -7,7 +7,7 @@ from pathlib import Path from typing import BinaryIO, Optional, Sequence, Tuple, Union -import pikepdf +import pypdfium2 as pdfium from mindee.error.mimetype_error import MimeTypeError from mindee.error.mindee_error import MindeeError, MindeeSourceError @@ -117,14 +117,14 @@ def count_doc_pages(self) -> int: :return: the number of pages. """ self.file_object.seek(0) - with pikepdf.open(self.file_object) as pdf: - return len(pdf.pages) + pdf = pdfium.PdfDocument(self.file_object) + return len(pdf) def process_pdf( - self, - behavior: str, - on_min_pages: int, - page_indexes: Sequence, + self, + behavior: str, + on_min_pages: int, + page_indexes: Sequence, ) -> None: """Run any required processing on a PDF file.""" if self.is_pdf_empty(): @@ -163,14 +163,13 @@ def merge_pdf_pages(self, page_numbers: set) -> None: :return: None """ self.file_object.seek(0) - new_pdf = pikepdf.Pdf.new() - with pikepdf.open(self.file_object) as pdf: - for page_id in page_numbers: - page = pdf.pages[page_id] - new_pdf.pages.append(page) + new_pdf = pdfium.PdfDocument.new() + pdf = pdfium.PdfDocument(self.file_object) + new_pdf.import_pages(pdf, list(page_numbers)) self.file_object.close() - self.file_object = io.BytesIO() - new_pdf.save(self.file_object) + bytes_io = io.BytesIO() + new_pdf.save(bytes_io) + self.file_object = bytes_io def is_pdf_empty(self) -> bool: """ @@ -179,24 +178,17 @@ def is_pdf_empty(self) -> bool: :return: ``True`` if the PDF is empty """ self.file_object.seek(0) - with pikepdf.open(self.file_object) as pdf: - for page in pdf.pages: - # mypy incorrectly identifies the "/Length" key's value as - # an object rather than an int. - try: - total_size = page["/Contents"]["/Length"] - except ValueError: - total_size = 0 # type: ignore - for content in page["/Contents"]: # type: ignore - total_size += content["/Length"] - has_data = total_size > 1000 # type: ignore - - has_font = "/Font" in page["/Resources"].keys() - has_xobj = "/XObject" in page["/Resources"].keys() - - if has_font or has_xobj or has_data: - return False - return True + pdf = pdfium.PdfDocument(self.file_object) + for i in range(len(pdf)): + page = pdf.get_page(i) + + has_objects = False + for _ in page.get_objects(): + has_objects = True + break + if has_objects: + return False + return True def read_contents(self, close_file: bool) -> Tuple[str, bytes]: """ diff --git a/setup.cfg b/setup.cfg index e565b108..006ed8f3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -32,8 +32,7 @@ packages = find: include_package_data = True python_requires = >=3.7 install_requires = - pikepdf~=8.6;python_version>="3.8" - pikepdf==6.2.9;python_version<"3.8" + pypdfium2~=4.7 pytz>=2023.3 requests~=2.31 From e81b2aac014eb183ff3e0752cb1a07e33b309062 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Thu, 7 Mar 2024 18:36:50 +0100 Subject: [PATCH 02/10] :recycle: replace pikepdf by pypdfium2 in dependencies --- .pre-commit-config.yaml | 1 - pyproject.toml | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d665f994..785a98de 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -41,7 +41,6 @@ repos: exclude: "tests/|examples/|docs/" additional_dependencies: - toml - - pypdfium2 - types-pytz - types-requests - types-setuptools diff --git a/pyproject.toml b/pyproject.toml index de8930e9..9e47b1f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,10 @@ strict_equality = true warn_unused_ignores = true warn_unreachable = true +[[tool.mypy.overrides]] +module = "pypdfium2.*" +ignore_missing_imports = true + [tool.pylic] safe_licenses = [ "Apache Software License", From d73e58bffdd122ef547e759f03c9f59f97f8a5b9 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Thu, 7 Mar 2024 18:37:23 +0100 Subject: [PATCH 03/10] :recycle: change internals to match new lib usage --- mindee/input/sources.py | 8 ++++---- tests/test_inputs.py | 26 ++++++++++++++------------ 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/mindee/input/sources.py b/mindee/input/sources.py index c97d75ed..09462484 100644 --- a/mindee/input/sources.py +++ b/mindee/input/sources.py @@ -121,10 +121,10 @@ def count_doc_pages(self) -> int: return len(pdf) def process_pdf( - self, - behavior: str, - on_min_pages: int, - page_indexes: Sequence, + self, + behavior: str, + on_min_pages: int, + page_indexes: Sequence, ) -> None: """Run any required processing on a PDF file.""" if self.is_pdf_empty(): diff --git a/tests/test_inputs.py b/tests/test_inputs.py index c09b2fe1..1c67e8ef 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -1,7 +1,7 @@ import io from pathlib import Path -import pikepdf +import pypdfium2 as pdfium import pytest from mindee.error.mimetype_error import MimeTypeError @@ -18,6 +18,7 @@ FILE_TYPES_DIR = Path("./tests/data/file_types") + # # PDF # @@ -55,18 +56,19 @@ def test_pdf_cut_n_pages(numb_pages: int): ) assert input_obj.count_doc_pages() == numb_pages - # Each page in the PDF has a unique (and increasing) /Content /Length. - # We use this to make sure we have the correct pages - cut_pdf = pikepdf.open(input_obj.file_object) - with pikepdf.open( - FILE_TYPES_DIR / "pdf" / f"multipage_cut-{numb_pages}.pdf" - ) as pdf: - for idx, page in enumerate(pdf.pages): - assert ( - page["/Contents"]["/Length"] - == cut_pdf.pages[idx]["/Contents"]["/Length"] - ) + # Currently the least verbose way of comparing pages with pypdfium2 + # I.e. each page is read & rendered as a rasterized image. These images are then compared as raw byte sequences. + cut_pdf = pdfium.PdfDocument(input_obj.file_object) + pdf = pdfium.PdfDocument(FILE_TYPES_DIR / "pdf" / f"multipage_cut-{numb_pages}.pdf") + for idx in range(len(pdf)): + pdf_page = pdf.get_page(idx) + pdf_page_render = pdfium.PdfPage.render(pdf_page) + cut_pdf_page = cut_pdf.get_page(idx) + cut_pdf_page_render = pdfium.PdfPage.render(cut_pdf_page) + + assert bytes(pdf_page_render.buffer) == bytes(cut_pdf_page_render.buffer) cut_pdf.close() + pdf.close() def test_pdf_keep_5_first_pages(): From b7582d5b2d8f2ec9901e569f055ac6b953befc57 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Thu, 7 Mar 2024 18:43:01 +0100 Subject: [PATCH 04/10] :recycle: restore mypy types rule --- pyproject.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9e47b1f8..de8930e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,10 +22,6 @@ strict_equality = true warn_unused_ignores = true warn_unreachable = true -[[tool.mypy.overrides]] -module = "pypdfium2.*" -ignore_missing_imports = true - [tool.pylic] safe_licenses = [ "Apache Software License", From 3a658a7ccdde55934c3c2c1a1d096b1270271c97 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Thu, 7 Mar 2024 18:51:34 +0100 Subject: [PATCH 05/10] add exception for pypdfium2 to mypy --- pyproject.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index de8930e9..9e47b1f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,10 @@ strict_equality = true warn_unused_ignores = true warn_unreachable = true +[[tool.mypy.overrides]] +module = "pypdfium2.*" +ignore_missing_imports = true + [tool.pylic] safe_licenses = [ "Apache Software License", From e17056ca1be785b1b6aac6a512bd18d28c77e91a Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Mon, 11 Mar 2024 11:54:49 +0100 Subject: [PATCH 06/10] Update pyproject license policy --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 9e47b1f8..b123e438 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ safe_licenses = [ "Mozilla Public License 2.0 (MPL 2.0)", "BSD License", "Historical Permission Notice and Disclaimer (HPND)", + "(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty" ] [tool.pytest.ini_options] From a576f9c7bae908e788bea5aef14b1c1568fc6e57 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Mon, 11 Mar 2024 11:56:33 +0100 Subject: [PATCH 07/10] remove pikepdf license check --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b123e438..4874a326 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,6 @@ safe_licenses = [ "MIT License", "Mozilla Public License 2.0 (MPL 2.0)", "BSD License", - "Historical Permission Notice and Disclaimer (HPND)", "(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty" ] From 42e0fc0216710d1256ba12c684707b5f04a4312a Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Thu, 14 Mar 2024 17:30:38 +0100 Subject: [PATCH 08/10] simplify some syntaxes --- mindee/input/sources.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/mindee/input/sources.py b/mindee/input/sources.py index 09462484..5978d17d 100644 --- a/mindee/input/sources.py +++ b/mindee/input/sources.py @@ -179,14 +179,8 @@ def is_pdf_empty(self) -> bool: """ self.file_object.seek(0) pdf = pdfium.PdfDocument(self.file_object) - for i in range(len(pdf)): - page = pdf.get_page(i) - - has_objects = False + for page in pdf: for _ in page.get_objects(): - has_objects = True - break - if has_objects: return False return True From ff7455c072a4aed379cf5ef31170b960a6d3c62c Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Thu, 23 May 2024 17:33:35 +0200 Subject: [PATCH 09/10] :wrench: loosen pypdfium version --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 006ed8f3..021e9023 100644 --- a/setup.cfg +++ b/setup.cfg @@ -32,7 +32,7 @@ packages = find: include_package_data = True python_requires = >=3.7 install_requires = - pypdfium2~=4.7 + pypdfium2>=4.0 pytz>=2023.3 requests~=2.31 From ef8e866263893687c83b13008afe8438a9b8884f Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Fri, 24 May 2024 10:33:06 +0200 Subject: [PATCH 10/10] stricter version control for pypdfium --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 021e9023..5d100563 100644 --- a/setup.cfg +++ b/setup.cfg @@ -32,7 +32,7 @@ packages = find: include_package_data = True python_requires = >=3.7 install_requires = - pypdfium2>=4.0 + pypdfium2>=4.0,<5 pytz>=2023.3 requests~=2.31