Skip to content

Commit d5a1336

Browse files
✨ add support for multi-receipt extraction (#240)
1 parent 6cfc994 commit d5a1336

File tree

16 files changed

+422
-6
lines changed

16 files changed

+422
-6
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,4 @@ repos:
4545
- types-requests
4646
- types-setuptools
4747
- importlib-metadata
48+
- types-Pillow

examples/multi_receipts_tutorial.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from mindee import Client, PredictResponse, product
from mindee.image_extraction.multi_receipts_extractor.multi_receipts_extractor import (
    extract_receipts,
)

# Init a new client
mindee_client = Client()

# Load a file from disk
input_doc = mindee_client.source_from_path("path/to/your/file.ext")

# The source is passed with close_file=False because the extraction step below
# reads the very same input source again to crop out each receipt.
result_split: PredictResponse = mindee_client.parse(
    product.MultiReceiptsDetectorV1, input_doc, close_file=False
)

extracted_receipts = extract_receipts(input_doc, result_split.document.inference)
for receipt in extracted_receipts:
    # Build the input source once and reuse it for the parsing call.
    receipt_as_source = receipt.as_source()
    # Optionally: save each extracted receipt (the internal name already carries an extension)
    # receipt.save_to_file(f"./{receipt.internal_file_name}")
    result_receipt = mindee_client.parse(product.ReceiptV5, receipt_as_source)
    print(result_receipt.document)

mindee/image_extraction/__init__.py

Whitespace-only changes.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from mindee.image_extraction.common.extracted_image import ExtractedImage
2+
from mindee.image_extraction.common.image_extractor import (
3+
attach_image_as_new_file,
4+
extract_multiple_images_from_source,
5+
)
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
import io
2+
from pathlib import Path
3+
from typing import Optional
4+
5+
from PIL import Image
6+
7+
from mindee.error import MindeeError
8+
from mindee.input import FileInput, LocalInputSource
9+
from mindee.logger import logger
10+
11+
12+
class ExtractedImage:
    """Generic class for image extraction."""

    # Id of the page the image was extracted from.
    _page_id: int
    # Id of the element on a given page.
    _element_id: int

    def __init__(
        self, input_source: LocalInputSource, page_id: int, element_id: int
    ) -> None:
        """
        Initialize the ExtractedImage with a buffer and an internal file name.

        The whole content of ``input_source.file_object`` is copied into an
        in-memory buffer which carries the source's file name.

        :param input_source: Local source for input.
        :param page_id: ID of the page the element was found on.
        :param element_id: ID of the element in a page. ``None`` is stored as 0.
        """
        self.buffer = io.BytesIO(input_source.file_object.read())
        self.buffer.name = input_source.filename
        self.buffer.seek(0)
        self._page_id = page_id
        # Normalise the element id *before* building the file name so that a
        # missing id never leaks into the name as the literal text "None".
        self._element_id = 0 if element_id is None else element_id
        self.internal_file_name = (
            f"{input_source.filename}_p{self._page_id}_{self._element_id}.pdf"
        )

    def save_to_file(self, output_path: str, file_format: Optional[str] = None):
        """
        Saves the document to a file.

        :param output_path: Path to save the file to.
        :param file_format: Optional PIL-compatible format for the file.
            Inferred by PIL from the file extension if not provided.
        :raises MindeeError: If an invalid path or filename is provided, or if
            the file could not be written.
        """
        try:
            resolved_path = Path(output_path).resolve()
            if not file_format and not resolved_path.suffix:
                # Nothing to infer the format from.
                raise ValueError("Invalid file format.")
            self.buffer.seek(0)
            image = Image.open(self.buffer)
            # When no explicit format is given, pass None and let PIL map the
            # extension to a format name: a raw suffix such as ".JPG" is not a
            # valid PIL format identifier.
            image.save(resolved_path, format=file_format or None)
            logger.info("File saved successfully to '%s'.", resolved_path)
        except TypeError as exc:
            raise MindeeError("Invalid path/filename provided.") from exc
        except Exception as exc:
            raise MindeeError(f"Could not save file {Path(output_path).name}.") from exc

    def as_source(self) -> FileInput:
        """
        Return the file as a Mindee-compatible FileInput source.

        The internal buffer is rewound first so the source reads from the start.

        :returns: A FileInput source wrapping the internal buffer.
        """
        self.buffer.seek(0)
        return FileInput(self.buffer)

    @property
    def page_id(self) -> int:
        """
        ID of the page the element was found on.

        :return: The page ID given at construction.
        """
        return self._page_id

    @property
    def element_id(self) -> int:
        """
        Id of the element on a given page.

        :return: The element ID given at construction (0 if it was ``None``).
        """
        return self._element_id
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
import io
2+
from typing import BinaryIO, List
3+
4+
import pypdfium2 as pdfium
5+
from PIL import Image
6+
7+
from mindee.geometry import Point, get_min_max_x, get_min_max_y
8+
from mindee.image_extraction.common import ExtractedImage
9+
from mindee.input import BytesInput, LocalInputSource
10+
11+
12+
def attach_image_as_new_file(  # type: ignore
    input_buffer: BinaryIO,
) -> pdfium.PdfDocument:
    """
    Attaches an image as a new page in a PdfDocument object.

    The image is converted to RGB and re-encoded as JPEG before being embedded,
    and the created page takes the size of the embedded image.

    :param input_buffer: Input buffer holding an image readable by PIL.
    :return: A PdfDocument handle.
    """
    input_buffer.seek(0)
    image = Image.open(input_buffer)
    try:
        # convert() returns a new image and leaves the original untouched, so
        # its result must be kept: JPEG cannot encode modes such as RGBA or P.
        rgb_image = image.convert("RGB")
        image_buffer = io.BytesIO()
        rgb_image.save(image_buffer, format="JPEG")
    finally:
        # Release the source image even if the conversion/encoding fails.
        image.close()

    pdf = pdfium.PdfDocument.new()

    image_pdf = pdfium.PdfImage.new(pdf)
    image_pdf.load_jpeg(image_buffer)
    width, height = image_pdf.get_size()

    # Scale the image object so it covers the whole page created below.
    matrix = pdfium.PdfMatrix().scale(width, height)
    image_pdf.set_matrix(matrix)

    # Create a new page in the PdfDocument
    page = pdf.new_page(width, height)
    page.insert_obj(image_pdf)
    page.gen_content()
    return pdf
42+
43+
44+
def extract_multiple_images_from_source(
    input_source: LocalInputSource, page_id: int, polygons: List[List[Point]]
) -> List[ExtractedImage]:
    """
    Crops one image per polygon out of a single page of the input source.

    Each polygon is reduced to its min/max x and y, and those values are
    multiplied by the page width/height to build the crop box (i.e. the
    coordinates are treated as fractions of the page size).

    :param input_source: Local Input source to extract elements from.
    :param page_id: id of the page to extract from.
    :param polygons: List of coordinates to pull the elements from.
    :return: One JPEG-encoded ExtractedImage per polygon, in input order.
    """
    pdf_page = load_pdf_doc(input_source).get_page(page_id)
    rendered_page = pdf_page.render().to_pil()
    page_width, page_height = pdf_page.get_size()

    crops = []
    for element_id, polygon in enumerate(polygons):
        x_range = get_min_max_x(polygon)
        y_range = get_min_max_y(polygon)
        # (left, upper, right, lower) box, truncated to whole pixels.
        crop_box = (
            int(x_range.min * page_width),
            int(y_range.min * page_height),
            int(x_range.max * page_width),
            int(y_range.max * page_height),
        )

        jpeg_buffer = io.BytesIO()
        rendered_page.crop(crop_box).save(jpeg_buffer, format="JPEG")

        crop_name = f"{input_source.filename}_p{page_id}_e{element_id}.jpg"
        crop_source = BytesInput(jpeg_buffer.getvalue(), crop_name)
        crops.append(ExtractedImage(crop_source, page_id, element_id))

    return crops
87+
88+
89+
def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument:  # type: ignore
    """
    Opens a local input source as a PDF document.

    PDF inputs are opened directly from their file object; anything else is
    handed to ``attach_image_as_new_file`` to be wrapped in a new PDF.

    :param input_file: Local input.
    :return: A valid PdfDocument handle.
    """
    if not input_file.is_pdf():
        return attach_image_as_new_file(input_file.file_object)

    # Rewind so the PDF is read from its first byte.
    input_file.file_object.seek(0)
    return pdfium.PdfDocument(input_file.file_object)
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from mindee.image_extraction.multi_receipts_extractor import multi_receipts_extractor
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from typing import List
2+
3+
from mindee.error import MindeeError
4+
from mindee.image_extraction.common.extracted_image import ExtractedImage
5+
from mindee.image_extraction.common.image_extractor import (
6+
extract_multiple_images_from_source,
7+
)
8+
from mindee.input import LocalInputSource
9+
from mindee.parsing.common import Inference
10+
11+
12+
def extract_receipts(
    input_source: LocalInputSource, inference: Inference
) -> List[ExtractedImage]:
    """
    Splits a multi-receipts document into one image per detected receipt.

    :param input_source: Local Input Source to extract sub-receipts from.
    :param inference: Results of the inference.
    :return: Individual extracted receipts as a list of ExtractedImage.
    :raises MindeeError: If the document-level prediction holds no receipts.
    """
    if not inference.prediction.receipts:
        raise MindeeError(
            "No possible receipts candidates found for MultiReceipts extraction."
        )

    images: List[ExtractedImage] = []
    for page_id in range(input_source.count_doc_pages()):
        # Receipts are read per page so each crop is taken from the right page.
        page_receipts = inference.pages[page_id].prediction.receipts
        receipt_positions = [receipt.bounding_box for receipt in page_receipts]
        images += extract_multiple_images_from_source(
            input_source, page_id, receipt_positions
        )
    return images

mindee/input/local_response.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def is_valid_hmac_signature(
9898
Checks if the hmac signature of the local response is valid.
9999
100100
:param secret_key: Secret key, given as a string.
101-
:param signature:
101+
:param signature: HMAC signature, given as a string.
102102
:return: True if the HMAC signature is valid.
103103
"""
104104
return signature == self.get_hmac_signature(secret_key)

mindee/input/sources.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -116,9 +116,11 @@ def count_doc_pages(self) -> int:
116116
117117
:return: the number of pages.
118118
"""
119-
self.file_object.seek(0)
120-
pdf = pdfium.PdfDocument(self.file_object)
121-
return len(pdf)
119+
if self.is_pdf():
120+
self.file_object.seek(0)
121+
pdf = pdfium.PdfDocument(self.file_object)
122+
return len(pdf)
123+
return 1
122124

123125
def process_pdf(
124126
self,

mindee/parsing/standard/locale.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,11 @@ def __init__(
2727
:param reconstructed: Bool for reconstructed object (not extracted in the API)
2828
:param page_id: Page number for multi-page document
2929
"""
30-
value_key = "value" if "value" in raw_prediction else "language"
30+
value_key = (
31+
"value"
32+
if ("value" in raw_prediction and raw_prediction["value"])
33+
else "language"
34+
)
3135

3236
super().__init__(
3337
raw_prediction,

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ safe_licenses = [
3232
"MIT License",
3333
"Mozilla Public License 2.0 (MPL 2.0)",
3434
"BSD License",
35-
"(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty"
35+
"(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty",
36+
"Historical Permission Notice and Disclaimer (HPND)"
3637
]
3738

3839
[tool.pytest.ini_options]

setup.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ include_package_data = True
3333
python_requires = >=3.7
3434
install_requires =
3535
pypdfium2>=4.0,<5
36+
Pillow>=9.5.0
3637
pytz>=2023.3
3738
requests~=2.31
3839

tests/image_extraction/__init__.py

Whitespace-only changes.
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import json
2+
3+
import pytest
4+
from PIL import Image
5+
6+
from mindee.image_extraction.common import extract_multiple_images_from_source
7+
from mindee.input import PathInput
8+
from mindee.product import BarcodeReaderV1
9+
from tests.test_inputs import PRODUCT_DATA_DIR
10+
11+
12+
@pytest.fixture
def barcode_path():
    """Path to the sample barcode image used as extraction input."""
    sample_dir = PRODUCT_DATA_DIR / "barcode_reader"
    return sample_dir / "default_sample.jpg"
15+
16+
17+
@pytest.fixture
def barcode_json_path():
    """Path to the stored barcode reader API response for the sample image."""
    response_dir = PRODUCT_DATA_DIR / "barcode_reader" / "response_v1"
    return response_dir / "complete.json"
20+
21+
22+
def test_barcode_image_extraction(barcode_path, barcode_json_path):
    """Extracted barcode crops must match the expected count and pixel sizes."""
    with open(barcode_json_path, "rb") as json_file:
        response = json.load(json_file)
    inference = BarcodeReaderV1(response["document"]["inference"])

    polygons_1d = [code_1d.polygon for code_1d in inference.prediction.codes_1d]
    polygons_2d = [code_2d.polygon for code_2d in inference.prediction.codes_2d]

    # Both extractions run against page 0 of the same input source.
    input_source = PathInput(barcode_path)
    extracted_1d = extract_multiple_images_from_source(input_source, 0, polygons_1d)
    extracted_2d = extract_multiple_images_from_source(input_source, 0, polygons_2d)

    assert len(extracted_1d) == 1
    assert len(extracted_2d) == 2

    first_1d, = extracted_1d
    first_2d, second_2d = extracted_2d
    assert Image.open(first_1d.buffer).size == (353, 200)
    assert Image.open(first_2d.buffer).size == (214, 216)
    assert Image.open(second_2d.buffer).size == (193, 201)

0 commit comments

Comments
 (0)