mindee · sebastianMindee · May 24, 2024 · Mar 7, 2024 · Mar 7, 2024 · Mar 7, 2024
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -41,7 +41,6 @@ repos:
         exclude: "tests/|examples/|docs/"
         additional_dependencies:
           - toml
-          - pikepdf
           - types-pytz
           - types-requests
           - types-setuptools

diff --git a/mindee/input/sources.py b/mindee/input/sources.py
@@ -7,7 +7,7 @@
 from pathlib import Path
 from typing import BinaryIO, Optional, Sequence, Tuple, Union
 
-import pikepdf
+import pypdfium2 as pdfium
 
 from mindee.error.mimetype_error import MimeTypeError
 from mindee.error.mindee_error import MindeeError, MindeeSourceError
@@ -117,8 +117,8 @@ def count_doc_pages(self) -> int:
         :return: the number of pages.
         """
         self.file_object.seek(0)
-        with pikepdf.open(self.file_object) as pdf:
-            return len(pdf.pages)
+        pdf = pdfium.PdfDocument(self.file_object)
+        return len(pdf)
 
     def process_pdf(
         self,
@@ -163,14 +163,13 @@ def merge_pdf_pages(self, page_numbers: set) -> None:
         :return: None
         """
         self.file_object.seek(0)
-        new_pdf = pikepdf.Pdf.new()
-        with pikepdf.open(self.file_object) as pdf:
-            for page_id in page_numbers:
-                page = pdf.pages[page_id]
-                new_pdf.pages.append(page)
+        new_pdf = pdfium.PdfDocument.new()
+        pdf = pdfium.PdfDocument(self.file_object)
+        new_pdf.import_pages(pdf, list(page_numbers))
         self.file_object.close()
-        self.file_object = io.BytesIO()
-        new_pdf.save(self.file_object)
+        bytes_io = io.BytesIO()
+        new_pdf.save(bytes_io)
+        self.file_object = bytes_io
 
     def is_pdf_empty(self) -> bool:
         """
@@ -179,24 +178,11 @@ def is_pdf_empty(self) -> bool:
         :return: ``True`` if the PDF is empty
         """
         self.file_object.seek(0)
-        with pikepdf.open(self.file_object) as pdf:
-            for page in pdf.pages:
-                # mypy incorrectly identifies the "/Length" key's value as
-                # an object rather than an int.
-                try:
-                    total_size = page["/Contents"]["/Length"]
-                except ValueError:
-                    total_size = 0  # type: ignore
-                    for content in page["/Contents"]:  # type: ignore
-                        total_size += content["/Length"]
-                has_data = total_size > 1000  # type: ignore
-
-                has_font = "/Font" in page["/Resources"].keys()
-                has_xobj = "/XObject" in page["/Resources"].keys()
-
-                if has_font or has_xobj or has_data:
-                    return False
-            return True
+        pdf = pdfium.PdfDocument(self.file_object)
+        for page in pdf:
+            for _ in page.get_objects():
+                return False
+        return True
 
     def read_contents(self, close_file: bool) -> Tuple[str, bytes]:
         """

diff --git a/pyproject.toml b/pyproject.toml
@@ -22,13 +22,17 @@ strict_equality = true
 warn_unused_ignores = true
 warn_unreachable = true
 
+[[tool.mypy.overrides]]
+module = "pypdfium2.*"
+ignore_missing_imports = true
+
 [tool.pylic]
 safe_licenses = [
   "Apache Software License",
   "MIT License",
   "Mozilla Public License 2.0 (MPL 2.0)",
   "BSD License",
-  "Historical Permission Notice and Disclaimer (HPND)",
+  "(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty"
 ]
 
 [tool.pytest.ini_options]

diff --git a/setup.cfg b/setup.cfg
@@ -32,8 +32,7 @@ packages = find:
 include_package_data = True
 python_requires = >=3.7
 install_requires =
-    pikepdf~=8.6;python_version>="3.8"
-    pikepdf==6.2.9;python_version<"3.8"
+    pypdfium2>=4.0,<5
     pytz>=2023.3
     requests~=2.31
 

diff --git a/tests/test_inputs.py b/tests/test_inputs.py
@@ -1,7 +1,7 @@
 import io
 from pathlib import Path
 
-import pikepdf
+import pypdfium2 as pdfium
 import pytest
 
 from mindee.error.mimetype_error import MimeTypeError
@@ -18,6 +18,7 @@
 
 FILE_TYPES_DIR = Path("./tests/data/file_types")
 
+
 #
 # PDF
 #
@@ -55,18 +56,19 @@ def test_pdf_cut_n_pages(numb_pages: int):
     )
     assert input_obj.count_doc_pages() == numb_pages
 
-    # Each page in the PDF has a unique (and increasing) /Content /Length.
-    # We use this to make sure we have the correct pages
-    cut_pdf = pikepdf.open(input_obj.file_object)
-    with pikepdf.open(
-        FILE_TYPES_DIR / "pdf" / f"multipage_cut-{numb_pages}.pdf"
-    ) as pdf:
-        for idx, page in enumerate(pdf.pages):
-            assert (
-                page["/Contents"]["/Length"]
-                == cut_pdf.pages[idx]["/Contents"]["/Length"]
-            )
+    # Currently the least verbose way of comparing pages with pypdfium2:
+    # each page is rendered to a raster image, and the images are then compared as raw byte sequences.
+    cut_pdf = pdfium.PdfDocument(input_obj.file_object)
+    pdf = pdfium.PdfDocument(FILE_TYPES_DIR / "pdf" / f"multipage_cut-{numb_pages}.pdf")
+    for idx in range(len(pdf)):
+        pdf_page = pdf.get_page(idx)
+        pdf_page_render = pdfium.PdfPage.render(pdf_page)
+        cut_pdf_page = cut_pdf.get_page(idx)
+        cut_pdf_page_render = pdfium.PdfPage.render(cut_pdf_page)
+
+        assert bytes(pdf_page_render.buffer) == bytes(cut_pdf_page_render.buffer)
     cut_pdf.close()
+    pdf.close()
 
 
 def test_pdf_keep_5_first_pages():