Skip to content

Commit 298bde6

Browse files
fix lint, typeignore a lot of pypdfium stuff
1 parent 09dad19 commit 298bde6

File tree

9 files changed

+236
-62
lines changed

9 files changed

+236
-62
lines changed
Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,6 @@
11
from mindee.image_extraction.common.extracted_image import ExtractedImage
2-
from mindee.image_extraction.common.image_extractor import extract_from_page, attach_bitmap_as_new_page, get_image_size
2+
from mindee.image_extraction.common.image_extractor import (
3+
attach_bitmap_as_new_page,
4+
extract_from_page,
5+
get_image_size,
6+
)

mindee/image_extraction/common/extracted_image.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77

88

99
class ExtractedImage:
10+
"""Generic class for image extraction."""
11+
1012
def __init__(self, buffer: bytes, file_name: str):
1113
"""
1214
Initialize the ExtractedImage with a buffer and an internal file name.
@@ -26,13 +28,13 @@ def save_to_file(self, output_path: str):
2628
"""
2729
try:
2830
resolved_path = Path(output_path).resolve()
29-
with open(resolved_path, 'wb') as f:
30-
f.write(self.buffer.read())
31-
logger.info(f"File saved successfully to {resolved_path}.")
32-
except TypeError:
33-
raise MindeeError("Invalid path/filename provided.")
34-
except Exception as e:
35-
raise e
31+
with open(resolved_path, "wb") as file:
32+
file.write(self.buffer.read())
33+
logger.info("File saved successfully to %s.", resolved_path)
34+
except TypeError as exc:
35+
raise MindeeError("Invalid path/filename provided.") from exc
36+
except Exception as exc:
37+
raise MindeeError(f"Could not save file {Path(output_path).name}.") from exc
3638

3739
def as_source(self) -> FileInput:
3840
"""

mindee/image_extraction/common/image_extractor.py

Lines changed: 38 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
import io
2-
from typing import List, BinaryIO, Tuple
2+
import struct
3+
from typing import BinaryIO, List, Tuple
34

45
import pypdfium2 as pdfium
56

67
from mindee.error import MimeTypeError
7-
from mindee.geometry import get_min_max_x, get_min_max_y, Polygon
8-
9-
import struct
8+
from mindee.geometry import Polygon, get_min_max_x, get_min_max_y
109

1110

1211
def get_image_size(data: BinaryIO) -> Tuple[int, int]:
@@ -20,48 +19,53 @@ def get_image_size(data: BinaryIO) -> Tuple[int, int]:
2019
signature = data.read(8)
2120

2221
# Check for PNG signature
23-
if signature[:8] == b'\x89PNG\r\n\x1a\n':
24-
# PNG file
22+
if signature[:8] == b"\x89PNG\r\n\x1a\n":
2523
data.seek(16)
26-
width, height = struct.unpack('>II', data.read(8))
24+
width, height = struct.unpack(">II", data.read(8))
2725
return width, height
2826

29-
# Check for JPEG SOI marker
30-
elif signature[:2] == b'\xff\xd8':
27+
# Check for JPEG SOI marker (also works for jpga)
28+
if signature[:2] == b"\xff\xd8":
3129
data.seek(2)
3230
while True:
33-
marker, = struct.unpack('>H', data.read(2))
34-
if marker == 0xFFC0 or marker == 0xFFC2: # SOF0 or SOF2
31+
(marker,) = struct.unpack(">H", data.read(2))
32+
if marker in (0xFFC0, 0xFFC2): # SOF0 or SOF2
3533
data.seek(3, 1) # Skip length and precision
36-
height, width = struct.unpack('>HH', data.read(4))
34+
height, width = struct.unpack(">HH", data.read(4))
3735
return width, height
38-
else:
39-
length, = struct.unpack('>H', data.read(2))
40-
data.seek(length - 2, 1)
36+
(length,) = struct.unpack(">H", data.read(2))
37+
data.seek(length - 2, 1)
4138
data.close()
4239
raise MimeTypeError("Size could not be retrieved for file.")
4340

4441

45-
def attach_bitmap_as_new_page(pdf_doc: pdfium.PdfDocument, bitmap: pdfium.PdfBitmap, new_width: float,
46-
new_height: float) -> pdfium.PdfDocument:
42+
def attach_bitmap_as_new_page( # type: ignore
43+
pdf_doc: pdfium.PdfDocument,
44+
bitmap: pdfium.PdfBitmap,
45+
new_width: float,
46+
new_height: float,
47+
) -> pdfium.PdfDocument:
4748
"""
4849
Attaches a created PdfBitmap object as a new page in a PdfDocument object.
4950
5051
:param pdf_doc: The PdfDocument to which the new page will be added.
5152
:param bitmap: The PdfBitmap object to be added as a new page.
5253
:param new_width: The width of the new page.
5354
:param new_height: The height of the new page.
55+
:return: A PdfDocument handle.
5456
"""
5557
# Create a new page in the PdfDocument
5658
new_page = pdf_doc.new_page(new_width, new_height)
5759

60+
pdf_obj = pdfium.PdfImage.new(pdf_doc)
61+
pdf_obj.set_bitmap(bitmap)
5862
# Insert the image object holding the bitmap into the new page
59-
new_page.insert_obj(bitmap.buffer)
63+
new_page.insert_obj(pdf_obj)
6064

6165
return pdf_doc
6266

6367

64-
def extract_from_page(pdf_page: pdfium.PdfPage, polygons: List[Polygon]):
68+
def extract_from_page(pdf_page: pdfium.PdfPage, polygons: List[Polygon]): # type: ignore
6569
"""
6670
Extracts elements from a page based on a list of bounding boxes.
6771
@@ -79,17 +83,21 @@ def extract_from_page(pdf_page: pdfium.PdfPage, polygons: List[Polygon]):
7983
min_max_x = get_min_max_x(polygon)
8084
min_max_y = get_min_max_y(polygon)
8185

82-
new_width = width * (min_max_x.max - min_max_x.min)
83-
new_height = height * (min_max_y.max - min_max_y.min)
84-
85-
left = min_max_x.min * width
86-
right = min_max_x.max * width
87-
top = height - (min_max_y.min * height)
88-
bottom = height - (min_max_y.max * height)
89-
90-
cropped_page: pdfium.PdfBitmap = pdf_page.render(crop=(left, bottom, right, top))
91-
92-
temp_pdf = attach_bitmap_as_new_page(temp_pdf, cropped_page, new_width, new_height)
86+
left = min_max_x.min
87+
right = min_max_x.max
88+
top = (height - (min_max_y.min * height)) / height
89+
bottom = (height - (min_max_y.max * height)) / height
90+
91+
cropped_page: pdfium.PdfBitmap = pdf_page.render( # type: ignore
92+
crop=(left, bottom, right, top)
93+
)
94+
95+
temp_pdf = attach_bitmap_as_new_page(
96+
temp_pdf,
97+
cropped_page,
98+
width * (min_max_x.max - min_max_x.min),
99+
height * (min_max_y.max - min_max_y.min),
100+
)
93101

94102
temp_file = io.BytesIO()
95103
temp_pdf.save(temp_file)
Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,6 @@
1-
from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import extract_receipts_from_page
2-
from mindee.image_extraction.multi_receipts_extractor.extracted_mult_receipt_image import ExtractedMultiReceiptImage
1+
from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import (
2+
ExtractedMultiReceiptImage,
3+
)
4+
from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import (
5+
extract_receipts_from_page,
6+
)

mindee/image_extraction/multi_receipts_extractor/extracted_mult_receipt_image.py renamed to mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22

33

44
class ExtractedMultiReceiptImage(ExtractedImage):
5+
"""Wrapper class for extracted multiple-receipts images."""
6+
57
_receipt_id: int
6-
page_id: int
8+
_page_id: int
79

810
def __init__(self, buffer, receipt_id: int, page_id: int):
911
super().__init__(buffer, f"receipt_p{page_id}_{receipt_id}.pdf")
@@ -12,8 +14,18 @@ def __init__(self, buffer, receipt_id: int, page_id: int):
1214

1315
@property
1416
def receipt_id(self):
17+
"""
18+
ID of the receipt on a given page.
19+
20+
:return: The receipt ID.
21+
"""
1522
return self._receipt_id
1623

1724
@property
1825
def page_id(self):
19-
return self.page_id
26+
"""
27+
ID of the page the receipt was found on.
28+
29+
:return: The page ID.
30+
"""
31+
return self._page_id
Lines changed: 81 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,30 @@
1-
from typing import List
1+
from typing import List, Union
22

33
import pypdfium2 as pdfium
44

5-
from mindee.error import MimeTypeError
6-
from mindee.geometry import Polygon
7-
from mindee.image_extraction.common.image_extractor import extract_from_page, attach_bitmap_as_new_page, get_image_size
8-
from mindee.image_extraction.multi_receipts_extractor import ExtractedMultiReceiptImage
5+
from mindee.error import MimeTypeError, MindeeError
6+
from mindee.geometry.point import Point
7+
from mindee.geometry.polygon import Polygon
8+
from mindee.geometry.quadrilateral import Quadrilateral
9+
from mindee.image_extraction.common.image_extractor import (
10+
attach_bitmap_as_new_page,
11+
extract_from_page,
12+
get_image_size,
13+
)
14+
from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import (
15+
ExtractedMultiReceiptImage,
16+
)
917
from mindee.input import LocalInputSource
18+
from mindee.product import MultiReceiptsDetectorV1
1019

1120

12-
def extract_receipts_from_page(pdf_page: pdfium.PdfPage, bounding_boxes: List[Polygon], page_id: int) \
13-
-> List[ExtractedMultiReceiptImage]:
21+
def extract_receipts_from_page( # type: ignore
22+
pdf_page: pdfium.PdfPage,
23+
bounding_boxes: List[Union[List[Point], Polygon, Quadrilateral]],
24+
page_id: int,
25+
) -> List[ExtractedMultiReceiptImage]:
1426
"""
15-
Given a page and a set of coordinates, extracts & assigns individual receipts to an ExtractedMultiReceiptImage
27+
Given a page and a set of coordinates, extracts & assigns individual receipts to an ExtractedMultiReceiptImage\
1628
object.
1729
1830
:param pdf_page: PDF Page to extract from.
@@ -21,26 +33,76 @@ def extract_receipts_from_page(pdf_page: pdfium.PdfPage, bounding_boxes: List[Po
2133
pages.
2234
:return: A list of ExtractedMultiReceiptImage.
2335
"""
24-
extracted_receipts_raw = extract_from_page(pdf_page, bounding_boxes)
36+
extracted_receipts_raw = extract_from_page(pdf_page, bounding_boxes) # type: ignore
2537
extracted_receipts = []
26-
for i in range(len(extracted_receipts_raw)):
27-
extracted_receipts.append(ExtractedMultiReceiptImage(extracted_receipts_raw[i], page_id, i))
38+
for i, extracted_receipt_raw in enumerate(extracted_receipts_raw):
39+
extracted_receipts.append(
40+
ExtractedMultiReceiptImage(extracted_receipt_raw, i, page_id)
41+
)
2842
return extracted_receipts
2943

3044

31-
def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument:
45+
def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: ignore
3246
"""
3347
Loads a PDF document from a local input source.
3448
3549
:param input_file: Local input.
3650
:return: A valid PdfDocument handle.
3751
"""
38-
if input_file.file_mimetype not in ["image/jpeg", "image/jpg", "image/png", "application/pdf"]:
39-
raise MimeTypeError(f"Unsupported file type '{input_file.file_mimetype}'. Currently supported types are '.png',"
40-
f" '.jpg' and '.pdf'.")
52+
if input_file.file_mimetype not in [
53+
"image/jpeg",
54+
"image/jpg",
55+
"image/png",
56+
"application/pdf",
57+
]:
58+
raise MimeTypeError(
59+
f"Unsupported file type '{input_file.file_mimetype}'. Currently supported types are '.png',"
60+
f" '.jpg' and '.pdf'."
61+
)
4162
if input_file.is_pdf():
42-
pdf_document = pdfium.PdfDocument(input_file.file_object)
43-
else:
44-
pdf_document = pdfium.PdfDocument.new()
63+
return pdfium.PdfDocument(input_file.file_object)
64+
pdf_document = pdfium.PdfDocument.new()
65+
height, width = get_image_size(input_file.file_object)
66+
pdf_bitmap = pdfium.PdfBitmap.new_native(width, height, 4)
67+
pdf_bitmap = pdfium.PdfBitmap(
68+
raw=pdf_bitmap,
69+
buffer=input_file.file_object,
70+
height=height,
71+
width=width,
72+
needs_free=True,
73+
rev_byteorder=False,
74+
format=4,
75+
stride=4,
76+
)
77+
# Bitmap format 4 corresponds to BGRA in PDFium (FPDFBitmap_BGRA); see the equivalent enum in:
78+
# https://docs.rs/pdfium-render/latest/pdfium_render/bitmap/enum.PdfBitmapFormat.html
4579

46-
return attach_bitmap_as_new_page(pdf_document, input_file.file_object, get_image_size(input_file.file_object))
80+
return attach_bitmap_as_new_page(pdf_document, pdf_bitmap, height, width)
81+
82+
83+
def extract_receipts(
84+
input_file: LocalInputSource, inference: MultiReceiptsDetectorV1
85+
) -> List[ExtractedMultiReceiptImage]:
86+
"""
87+
Extracts individual receipts from multi-receipts documents.
88+
89+
:param input_file: File to extract sub-receipts from.
90+
:param inference: Results of the inference.
91+
:return: Individual extracted receipts as an array of ExtractedMultiReceiptImage.
92+
"""
93+
images: List[ExtractedMultiReceiptImage] = []
94+
if not inference.prediction.receipts:
95+
raise MindeeError(
96+
"No possible receipts candidates found for MultiReceipts extraction."
97+
)
98+
pdf_doc = load_pdf_doc(input_file)
99+
for page_id in range(len(pdf_doc)):
100+
receipt_positions = [
101+
receipt.bounding_box
102+
for receipt in inference.pages[page_id].prediction.receipts
103+
]
104+
extracted_receipts = extract_receipts_from_page(
105+
pdf_doc.get_page(page_id), receipt_positions, page_id # type: ignore
106+
)
107+
images.extend(extracted_receipts)
108+
return images

mindee/parsing/standard/locale.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,11 @@ def __init__(
2727
:param reconstructed: Bool for reconstructed object (not extracted in the API)
2828
:param page_id: Page number for multi-page document
2929
"""
30-
value_key = "value" if ("value" in raw_prediction and raw_prediction["value"]) else "language"
30+
value_key = (
31+
"value"
32+
if ("value" in raw_prediction and raw_prediction["value"])
33+
else "language"
34+
)
3135

3236
super().__init__(
3337
raw_prediction,

tests/image_extraction/test_image_extractor.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,12 @@
1111
def jpg_file_path():
1212
return FILE_TYPES_DIR / "receipt.jpg"
1313

14+
1415
@pytest.fixture
1516
def txt_file_path():
1617
return FILE_TYPES_DIR / "receipt.txt"
18+
19+
1720
@pytest.fixture
1821
def png_file_path():
1922
return FILE_TYPES_DIR / "receipt.png"

0 commit comments

Comments
 (0)