Skip to content

Commit a9dfc47

Browse files
✨ add support for multi-receipt extraction
1 parent 298bde6 commit a9dfc47

File tree

12 files changed

+115
-137
lines changed

12 files changed

+115
-137
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,4 @@ repos:
4545
- types-requests
4646
- types-setuptools
4747
- importlib-metadata
48+
- types-Pillow

examples/multi_receipts_tutorial.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from mindee import PredictResponse, Client, product
2+
from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import extract_receipts
3+
4+
# Init a new client
5+
mindee_client = Client()
6+
7+
# Load a file from disk
8+
input_doc = mindee_client.source_from_path("path/to/your/file.ext")
9+
result_split: PredictResponse = mindee_client.parse(
10+
product.MultiReceiptsDetectorV1,
11+
input_doc,
12+
close_file=False
13+
)
14+
15+
extracted_receipts = extract_receipts(input_doc, result_split.document.inference)
16+
for receipt in extracted_receipts:
17+
receipt_as_source = receipt.as_source()
18+
# receipt.save_to_file(f"./local_test/{receipt.internal_file_name}.pdf") # Optionally: save each extracted receipt
19+
result_receipt = mindee_client.parse(product.ReceiptV5, receipt.as_source())
20+
print(result_receipt.document)
Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from mindee.image_extraction.common.extracted_image import ExtractedImage
22
from mindee.image_extraction.common.image_extractor import (
3-
attach_bitmap_as_new_page,
3+
attach_image_as_new_file,
44
extract_from_page,
5-
get_image_size,
65
)

mindee/image_extraction/common/extracted_image.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,19 +18,22 @@ def __init__(self, buffer: bytes, file_name: str):
1818
"""
1919
self.buffer = io.BytesIO(buffer)
2020
self.internal_file_name = file_name
21+
self.buffer.name = self.internal_file_name
2122

2223
def save_to_file(self, output_path: str):
2324
"""
2425
Saves the document to a file.
2526
2627
:param output_path: Path to save the file to.
28+
:param file_name: Name of the file.
2729
:raises MindeeError: If an invalid path or filename is provided.
2830
"""
2931
try:
32+
self.buffer.seek(0)
3033
resolved_path = Path(output_path).resolve()
3134
with open(resolved_path, "wb") as file:
3235
file.write(self.buffer.read())
33-
logger.info("File saved successfully to %s.", resolved_path)
36+
logger.info("File saved successfully to '%s'.", resolved_path)
3437
except TypeError as exc:
3538
raise MindeeError("Invalid path/filename provided.") from exc
3639
except Exception as exc:
Lines changed: 37 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -1,71 +1,45 @@
11
import io
2-
import struct
3-
from typing import BinaryIO, List, Tuple
2+
from typing import BinaryIO, List
43

54
import pypdfium2 as pdfium
5+
from PIL import Image
66

7-
from mindee.error import MimeTypeError
87
from mindee.geometry import Polygon, get_min_max_x, get_min_max_y
98

109

11-
def get_image_size(data: BinaryIO) -> Tuple[int, int]:
12-
"""
13-
Read the first few bytes to determine the file type.
14-
15-
:param data: Image input.
16-
:return: A tuple containing the file's height/width.
17-
"""
18-
data.seek(0)
19-
signature = data.read(8)
20-
21-
# Check for PNG signature
22-
if signature[:8] == b"\x89PNG\r\n\x1a\n":
23-
data.seek(16)
24-
width, height = struct.unpack(">II", data.read(8))
25-
return width, height
26-
27-
# Check for JPEG SOI marker (also works for jpga)
28-
if signature[:2] == b"\xff\xd8":
29-
data.seek(2)
30-
while True:
31-
(marker,) = struct.unpack(">H", data.read(2))
32-
if marker in (0xFFC0, 0xFFC2): # SOF0 or SOF2
33-
data.seek(3, 1) # Skip length and precision
34-
height, width = struct.unpack(">HH", data.read(4))
35-
return width, height
36-
(length,) = struct.unpack(">H", data.read(2))
37-
data.seek(length - 2, 1)
38-
data.close()
39-
raise MimeTypeError("Size could not be retrieved for file.")
40-
41-
42-
def attach_bitmap_as_new_page( # type: ignore
43-
pdf_doc: pdfium.PdfDocument,
44-
bitmap: pdfium.PdfBitmap,
45-
new_width: float,
46-
new_height: float,
10+
def attach_image_as_new_file( # type: ignore
11+
input_buffer: BinaryIO,
4712
) -> pdfium.PdfDocument:
4813
"""
49-
Attaches a created PdfBitmap object as a new page in a PdfDocument object.
14+
Creates a new PdfDocument containing the given image as its single page.
5015
51-
:param pdf_doc: The PdfDocument to which the new page will be added.
52-
:param bitmap: The PdfBitmap object to be added as a new page.
53-
:param new_width: The width of the new page.
54-
:param new_height: The height of the new page.
16+
:param input_buffer: Input buffer holding an image. It is opened with PIL and re-encoded as JPEG before being embedded.
5517
:return: A PdfDocument handle.
5618
"""
5719
# Re-encode the input image as JPEG, then embed it as the single page of a new PdfDocument
58-
new_page = pdf_doc.new_page(new_width, new_height)
20+
input_buffer.seek(0)
21+
image = Image.open(input_buffer)
22+
image.convert("RGB")
23+
image_buffer = io.BytesIO()
24+
image.save(image_buffer, format="JPEG")
25+
26+
pdf = pdfium.PdfDocument.new()
27+
28+
image_pdf = pdfium.PdfImage.new(pdf)
29+
image_pdf.load_jpeg(image_buffer)
30+
width, height = image_pdf.get_size()
5931

60-
pdf_obj = pdfium.PdfImage.new(pdf_doc)
61-
pdf_obj.set_bitmap(bitmap)
62-
# Create a device context to render the bitmap onto the new page
63-
new_page.insert_obj(pdf_obj)
32+
matrix = pdfium.PdfMatrix().scale(width, height)
33+
image_pdf.set_matrix(matrix)
6434

65-
return pdf_doc
35+
page = pdf.new_page(width, height)
36+
page.insert_obj(image_pdf)
37+
page.gen_content()
38+
image.close()
39+
return pdf
6640

6741

68-
def extract_from_page(pdf_page: pdfium.PdfPage, polygons: List[Polygon]): # type: ignore
42+
def extract_from_page(pdf_page: pdfium.PdfPage, polygons: List[Polygon]) -> List[bytes]: # type: ignore
6943
"""
7044
Extracts elements from a page based on a list of bounding boxes.
7145
@@ -76,32 +50,23 @@ def extract_from_page(pdf_page: pdfium.PdfPage, polygons: List[Polygon]): # typ
7650
width, height = pdf_page.get_size()
7751

7852
extracted_elements = []
79-
8053
for polygon in polygons:
81-
temp_pdf = pdfium.PdfDocument.new()
82-
8354
min_max_x = get_min_max_x(polygon)
8455
min_max_y = get_min_max_y(polygon)
8556

86-
left = min_max_x.min
87-
right = min_max_x.max
88-
top = (height - (min_max_y.min * height)) / height
89-
bottom = (height - (min_max_y.max * height)) / height
57+
left = min_max_x.min * width
58+
right = min_max_x.max * width
59+
top = min_max_y.min * height
60+
bottom = min_max_y.max * height
9061

91-
cropped_page: pdfium.PdfBitmap = pdf_page.render( # type: ignore
92-
crop=(left, bottom, right, top)
62+
# Note: cropping is done via PIL instead of PyPDFium, which greatly simplifies the operation.
63+
cropped_content_pil = pdf_page.render().to_pil()
64+
cropped_content_pil = cropped_content_pil.crop(
65+
(int(left), int(top), int(right), int(bottom))
9366
)
94-
95-
temp_pdf = attach_bitmap_as_new_page(
96-
temp_pdf,
97-
cropped_page,
98-
width * (min_max_x.max - min_max_x.min),
99-
height * (min_max_y.max - min_max_y.min),
100-
)
101-
102-
temp_file = io.BytesIO()
103-
temp_pdf.save(temp_file)
104-
extracted_elements.append(temp_file.read())
105-
temp_file.close()
67+
jpeg_buffer = io.BytesIO()
68+
cropped_content_pil.save(jpeg_buffer, format="PDF")
69+
jpeg_buffer.seek(0)
70+
extracted_elements.append(jpeg_buffer.read())
10671

10772
return extracted_elements

mindee/image_extraction/multi_receipts_extractor/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import (
2-
ExtractedMultiReceiptImage,
2+
ExtractedMultiReceiptsImage,
33
)
44
from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import (
55
extract_receipts_from_page,

mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from mindee.image_extraction.common import ExtractedImage
22

33

4-
class ExtractedMultiReceiptImage(ExtractedImage):
4+
class ExtractedMultiReceiptsImage(ExtractedImage):
55
"""Wrapper class for extracted multiple-receipts images."""
66

77
_receipt_id: int

mindee/image_extraction/multi_receipts_extractor/mult_receipts_extractor.py

Lines changed: 22 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -7,37 +7,36 @@
77
from mindee.geometry.polygon import Polygon
88
from mindee.geometry.quadrilateral import Quadrilateral
99
from mindee.image_extraction.common.image_extractor import (
10-
attach_bitmap_as_new_page,
10+
attach_image_as_new_file,
1111
extract_from_page,
12-
get_image_size,
1312
)
1413
from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import (
15-
ExtractedMultiReceiptImage,
14+
ExtractedMultiReceiptsImage,
1615
)
1716
from mindee.input import LocalInputSource
18-
from mindee.product import MultiReceiptsDetectorV1
17+
from mindee.parsing.common import Inference
1918

2019

2120
def extract_receipts_from_page( # type: ignore
2221
pdf_page: pdfium.PdfPage,
2322
bounding_boxes: List[Union[List[Point], Polygon, Quadrilateral]],
2423
page_id: int,
25-
) -> List[ExtractedMultiReceiptImage]:
24+
) -> List[ExtractedMultiReceiptsImage]:
2625
"""
27-
Given a page and a set of coordinates, extracts & assigns individual receipts to an ExtractedMultiReceiptImage\
26+
Given a page and a set of coordinates, extracts & assigns individual receipts to an ExtractedMultiReceiptsImage\
2827
object.
2928
3029
:param pdf_page: PDF Page to extract from.
3130
:param bounding_boxes: A set of coordinates delimiting the position of each receipt.
3231
:param page_id: ID of the page the receipt is extracted from. Caution: this starts at 0, unlike the numbering in PDF
3332
pages.
34-
:return: A list of ExtractedMultiReceiptImage.
33+
:return: A list of ExtractedMultiReceiptsImage.
3534
"""
3635
extracted_receipts_raw = extract_from_page(pdf_page, bounding_boxes) # type: ignore
3736
extracted_receipts = []
3837
for i, extracted_receipt_raw in enumerate(extracted_receipts_raw):
3938
extracted_receipts.append(
40-
ExtractedMultiReceiptImage(extracted_receipt_raw, i, page_id)
39+
ExtractedMultiReceiptsImage(extracted_receipt_raw, i, page_id)
4140
)
4241
return extracted_receipts
4342

@@ -50,59 +49,45 @@ def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: i
5049
:return: A valid PdfDocument handle.
5150
"""
5251
if input_file.file_mimetype not in [
53-
"image/jpeg",
54-
"image/jpg",
55-
"image/png",
5652
"application/pdf",
53+
"image/heic",
54+
"image/png",
55+
"image/jpg",
56+
"image/jpeg",
57+
"image/tiff",
58+
"image/webp",
5759
]:
58-
raise MimeTypeError(
59-
f"Unsupported file type '{input_file.file_mimetype}'. Currently supported types are '.png',"
60-
f" '.jpg' and '.pdf'."
61-
)
60+
raise MimeTypeError(f"Unsupported file type '{input_file.file_mimetype}'.")
61+
input_file.file_object.seek(0)
6262
if input_file.is_pdf():
6363
return pdfium.PdfDocument(input_file.file_object)
64-
pdf_document = pdfium.PdfDocument.new()
65-
height, width = get_image_size(input_file.file_object)
66-
pdf_bitmap = pdfium.PdfBitmap.new_native(width, height, 4)
67-
pdf_bitmap = pdfium.PdfBitmap(
68-
raw=pdf_bitmap,
69-
buffer=input_file.file_object,
70-
height=height,
71-
width=width,
72-
needs_free=True,
73-
rev_byteorder=False,
74-
format=4,
75-
stride=4,
76-
)
77-
# Bitmap format 4 should equate to RGBA, assumed to be equivalent to:
78-
# https://docs.rs/pdfium-render/latest/pdfium_render/bitmap/enum.PdfBitmapFormat.html
7964

80-
return attach_bitmap_as_new_page(pdf_document, pdf_bitmap, height, width)
65+
return attach_image_as_new_file(input_file.file_object)
8166

8267

8368
def extract_receipts(
84-
input_file: LocalInputSource, inference: MultiReceiptsDetectorV1
85-
) -> List[ExtractedMultiReceiptImage]:
69+
input_file: LocalInputSource, inference: Inference
70+
) -> List[ExtractedMultiReceiptsImage]:
8671
"""
8772
Extracts individual receipts from multi-receipts documents.
8873
8974
:param input_file: File to extract sub-receipts from.
9075
:param inference: Results of the inference.
91-
:return: Individual extracted receipts as an array of ExtractedMultiReceiptImage.
76+
:return: Individual extracted receipts as an array of ExtractedMultiReceiptsImage.
9277
"""
93-
images: List[ExtractedMultiReceiptImage] = []
78+
images: List[ExtractedMultiReceiptsImage] = []
9479
if not inference.prediction.receipts:
9580
raise MindeeError(
9681
"No possible receipts candidates found for MultiReceipts extraction."
9782
)
9883
pdf_doc = load_pdf_doc(input_file)
99-
for page_id in range(len(pdf_doc)):
84+
for page_id, page in enumerate(pdf_doc):
10085
receipt_positions = [
10186
receipt.bounding_box
10287
for receipt in inference.pages[page_id].prediction.receipts
10388
]
10489
extracted_receipts = extract_receipts_from_page(
105-
pdf_doc.get_page(page_id), receipt_positions, page_id # type: ignore
90+
page, receipt_positions, page_id
10691
)
10792
images.extend(extracted_receipts)
10893
return images

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ safe_licenses = [
3232
"MIT License",
3333
"Mozilla Public License 2.0 (MPL 2.0)",
3434
"BSD License",
35-
"(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty"
35+
"(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty",
36+
"Historical Permission Notice and Disclaimer (HPND) (HPND)"
3637
]
3738

3839
[tool.pytest.ini_options]

setup.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ include_package_data = True
3333
python_requires = >=3.7
3434
install_requires =
3535
pypdfium2>=4.0,<5
36+
Pillow>=9.5.0
3637
pytz>=2023.3
3738
requests~=2.31
3839

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
from io import BytesIO
22

33
import pytest
4+
from PIL import Image
45

56
from mindee.error import MimeTypeError
6-
from mindee.image_extraction.common import get_image_size
77
from tests.test_inputs import FILE_TYPES_DIR
88

99

@@ -24,23 +24,17 @@ def png_file_path():
2424

2525
def test_get_image_size_jpg(jpg_file_path):
2626
with open(jpg_file_path, "rb") as f:
27-
jpg_file = BytesIO(f.read())
28-
jpg_height, jpg_width = get_image_size(jpg_file)
27+
jpg_file = Image.open(jpg_file_path)
28+
jpg_height = jpg_file.size[0]
29+
jpg_width = jpg_file.size[1]
2930
assert jpg_height == 800
3031
assert jpg_width == 1066
3132

3233

3334
def test_get_image_size_png(png_file_path):
3435
with open(png_file_path, "rb") as f:
35-
png_file = BytesIO(f.read())
36-
png_height, png_width = get_image_size(png_file)
36+
png_file = Image.open(png_file_path)
37+
png_height = png_file.size[0]
38+
png_width = png_file.size[1]
3739
assert png_height == 800
3840
assert png_width == 1066
39-
40-
41-
def test_get_image_size_with_invalid_mime(txt_file_path):
42-
with open(txt_file_path, "rb") as f:
43-
txt_file = BytesIO(f.read())
44-
45-
with pytest.raises(MimeTypeError):
46-
get_image_size(txt_file)

0 commit comments

Comments
 (0)