Skip to content

Commit 0786d47

Browse files
♻️ change internals to match new lib usage
1 parent 2faec3f commit 0786d47

File tree

2 files changed

+18
-16
lines changed

2 files changed

+18
-16
lines changed

mindee/input/sources.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,10 +121,10 @@ def count_doc_pages(self) -> int:
121121
return len(pdf)
122122

123123
def process_pdf(
124-
self,
125-
behavior: str,
126-
on_min_pages: int,
127-
page_indexes: Sequence,
124+
self,
125+
behavior: str,
126+
on_min_pages: int,
127+
page_indexes: Sequence,
128128
) -> None:
129129
"""Run any required processing on a PDF file."""
130130
if self.is_pdf_empty():

tests/test_inputs.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import io
22
from pathlib import Path
33

4-
import pikepdf
4+
import pypdfium2 as pdfium
55
import pytest
66

77
from mindee.error.mimetype_error import MimeTypeError
@@ -18,6 +18,7 @@
1818

1919
FILE_TYPES_DIR = Path("./tests/data/file_types")
2020

21+
2122
#
2223
# PDF
2324
#
@@ -55,18 +56,19 @@ def test_pdf_cut_n_pages(numb_pages: int):
5556
)
5657
assert input_obj.count_doc_pages() == numb_pages
5758

58-
# Each page in the PDF has a unique (and increasing) /Content /Length.
59-
# We use this to make sure we have the correct pages
60-
cut_pdf = pikepdf.open(input_obj.file_object)
61-
with pikepdf.open(
62-
FILE_TYPES_DIR / "pdf" / f"multipage_cut-{numb_pages}.pdf"
63-
) as pdf:
64-
for idx, page in enumerate(pdf.pages):
65-
assert (
66-
page["/Contents"]["/Length"]
67-
== cut_pdf.pages[idx]["/Contents"]["/Length"]
68-
)
59+
# Currently the least verbose way of comparing pages with pypdfium2:
60+
# each page is read and rendered as a rasterized image, and the rendered images are then compared as raw byte sequences.
61+
cut_pdf = pdfium.PdfDocument(input_obj.file_object)
62+
pdf = pdfium.PdfDocument(FILE_TYPES_DIR / "pdf" / f"multipage_cut-{numb_pages}.pdf")
63+
for idx in range(len(pdf)):
64+
pdf_page = pdf.get_page(idx)
65+
pdf_page_render = pdfium.PdfPage.render(pdf_page)
66+
cut_pdf_page = cut_pdf.get_page(idx)
67+
cut_pdf_page_render = pdfium.PdfPage.render(cut_pdf_page)
68+
69+
assert bytes(pdf_page_render.buffer) == bytes(cut_pdf_page_render.buffer)
6970
cut_pdf.close()
71+
pdf.close()
7072

7173

7274
def test_pdf_keep_5_first_pages():

0 commit comments

Comments
 (0)