Skip to content

✨ add support for multi-receipt extraction #240

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Jun 12, 2024
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,4 @@ repos:
- types-requests
- types-setuptools
- importlib-metadata
- types-Pillow
20 changes: 20 additions & 0 deletions examples/multi_receipts_tutorial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from mindee import Client, PredictResponse, product
from mindee.image_extraction.multi_receipts_extractor.multi_receipts_extractor import (
extract_receipts,
)

# Init a new client
mindee_client = Client()

# Load a file from disk
input_doc = mindee_client.source_from_path("path/to/your/file.ext")

# First pass: detect where each receipt sits in the document.
# close_file=False is passed because the same input is read again by extract_receipts below.
result_split: PredictResponse = mindee_client.parse(
    product.MultiReceiptsDetectorV1, input_doc, close_file=False
)

# Second pass: crop every detected receipt and parse each one individually.
extracted_receipts = extract_receipts(input_doc, result_split.document.inference)
for receipt in extracted_receipts:
    # Build the source once and reuse it, rather than wrapping the same buffer twice.
    receipt_as_source = receipt.as_source()
    # receipt.save_to_file(f"./{receipt.internal_file_name}.pdf")  # Optionally: save each extracted receipt
    result_receipt = mindee_client.parse(product.ReceiptV5, receipt_as_source)
    print(result_receipt.document)
Empty file.
6 changes: 6 additions & 0 deletions mindee/image_extraction/common/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from mindee.image_extraction.common.extracted_image import ExtractedImage
from mindee.image_extraction.common.image_extractor import (
attach_image_as_new_file,
extract_multiple_images_from_image,
extract_multiple_images_from_page,
)
80 changes: 80 additions & 0 deletions mindee/image_extraction/common/extracted_image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import io
from pathlib import Path
from typing import Optional

from PIL import Image

from mindee.error import MindeeError
from mindee.input import FileInput
from mindee.logger import logger


class ExtractedImage:
    """Generic class for image extraction."""

    _page_id: int
    """Id of the page the image was extracted from."""
    _element_id: int
    """Id of the element on a given page."""

    def __init__(
        self,
        buffer: bytes,
        file_name: str,
        page_id: int,
        element_id: Optional[int] = None,
    ) -> None:
        """
        Initialize the ExtractedImage with a buffer and an internal file name.

        :param buffer: The byte buffer representing the image.
        :param file_name: Base name used to build the internal file name.
        :param page_id: Id of the page the image was extracted from.
        :param element_id: Id of the element on the page. Defaults to 0 when omitted.
        """
        self._page_id = page_id
        # Resolve the default first so the file name never contains the literal "None".
        self._element_id = 0 if element_id is None else element_id
        self.buffer = io.BytesIO(buffer)
        self.internal_file_name = f"{file_name}_p{page_id}_{self._element_id}.pdf"
        self.buffer.name = self.internal_file_name

    def save_to_file(self, output_path: str, file_format: Optional[str] = None):
        """
        Saves the document to a file.

        :param output_path: Path to save the file to.
        :param file_format: Optional PIL-compatible format for the file. When omitted, PIL infers
            the format from the extension of ``output_path``.
        :raises MindeeError: If an invalid path or filename is provided, or if the file cannot be
            written (including a missing or unknown file extension).
        """
        try:
            resolved_path = Path(output_path).resolve()
            if not file_format:
                if len(resolved_path.suffix) < 1:
                    raise ValueError("Invalid file format.")
                # Let PIL map the extension to a format itself. Passing the raw suffix
                # (e.g. ".PDF", with its leading dot) is not a valid PIL format name, and
                # extensions such as ".jpg" do not match their format name ("JPEG") anyway.
                file_format = None
            self.buffer.seek(0)
            image = Image.open(self.buffer)
            image.save(resolved_path, format=file_format)
            logger.info("File saved successfully to '%s'.", resolved_path)
        except TypeError as exc:
            raise MindeeError("Invalid path/filename provided.") from exc
        except Exception as exc:
            raise MindeeError(f"Could not save file {Path(output_path).name}.") from exc

    def as_source(self) -> "FileInput":
        """
        Return the file as a Mindee-compatible FileInput source.

        The buffer is rewound first so the source starts at the beginning of the image,
        even if the buffer was read before (e.g. by ``save_to_file`` or an earlier call).

        :returns: A FileInput source wrapping the internal buffer.
        """
        self.buffer.seek(0)
        return FileInput(self.buffer)

    @property
    def page_id(self):
        """
        ID of the page the image was extracted from.

        :return: The page id given at construction.
        """
        return self._page_id
102 changes: 102 additions & 0 deletions mindee/image_extraction/common/image_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import io
from pathlib import Path
from typing import BinaryIO, List, Union

import pypdfium2 as pdfium
from PIL import Image

from mindee.geometry import Point, get_min_max_x, get_min_max_y
from mindee.input import LocalInputSource


def attach_image_as_new_file(  # type: ignore
    input_buffer: BinaryIO,
) -> pdfium.PdfDocument:
    """
    Attaches an image as a new page in a new PdfDocument object.

    The image is opened with PIL, converted to RGB and re-encoded as JPEG before being
    embedded, so the input does not itself need to be a JPEG.

    :param input_buffer: Input buffer holding an image PIL can open.
    :return: A PdfDocument handle containing a single page sized to the image.
    """
    input_buffer.seek(0)
    source_image = Image.open(input_buffer)
    # convert() returns a new image instead of converting in place; the result must be kept,
    # otherwise modes such as RGBA or P reach the JPEG encoder unconverted and fail to save.
    rgb_image = source_image.convert("RGB")
    image_buffer = io.BytesIO()
    rgb_image.save(image_buffer, format="JPEG")

    pdf = pdfium.PdfDocument.new()

    image_pdf = pdfium.PdfImage.new(pdf)
    image_pdf.load_jpeg(image_buffer)
    width, height = image_pdf.get_size()

    # Scale the image object so that it fills the whole page.
    matrix = pdfium.PdfMatrix().scale(width, height)
    image_pdf.set_matrix(matrix)

    # Create a page matching the image dimensions and place the image on it.
    page = pdf.new_page(width, height)
    page.insert_obj(image_pdf)
    page.gen_content()
    source_image.close()
    return pdf


def extract_multiple_images_from_image(
    image: Union[bytes, str, Path], polygons: List[List[Point]]
) -> List[Image.Image]:
    """
    Extracts elements from an image based on a list of bounding boxes.

    :param image: Image as a path.
        NOTE(review): a ``bytes`` value is passed straight to ``Image.open``; presumably it is
        meant as a path in bytes form rather than raw image data - confirm against callers.
    :param polygons: List of coordinates to pull the elements from.
    :return: List of PIL images, one per polygon, in the same order as ``polygons``.
    """
    # The context manager releases the underlying file handle even when no crop forces the
    # image to be loaded (e.g. an empty polygon list).
    with Image.open(image) as opened_image:
        return extract_multiple_images_from_page(opened_image, polygons)


def extract_multiple_images_from_page(  # type: ignore
    page: Union[pdfium.PdfPage, Image.Image], polygons: List[List[Point]]
) -> List[Image.Image]:
    """
    Crops one image per polygon out of a page.

    Polygon coordinates are multiplied by the page width and height, so they are treated as
    fractions of the page dimensions.

    :param page: Single PDF page or PIL image. A pdfium.PdfPage is rasterized first.
    :param polygons: List of coordinates to pull the elements from.
    :return: List of cropped PIL images, in the same order as ``polygons``.
    """
    if isinstance(page, pdfium.PdfPage):
        raster = page.render().to_pil()
        # NOTE(review): dimensions come from the PDF page, not from the rendered raster;
        # these presumably coincide at the default render scale - confirm.
        page_width, page_height = page.get_size()
    else:
        raster = page
        page_width, page_height = page.size

    crops = []
    for polygon in polygons:
        x_range = get_min_max_x(polygon)
        y_range = get_min_max_y(polygon)
        # PIL crop box order: (left, upper, right, lower), truncated to whole pixels.
        crop_box = (
            int(x_range.min * page_width),
            int(y_range.min * page_height),
            int(x_range.max * page_width),
            int(y_range.max * page_height),
        )
        crops.append(raster.crop(crop_box))

    return crops


def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument:  # type: ignore
    """
    Opens a local input source as a PDF document.

    PDF inputs are opened directly; any other input is wrapped into a new PDF
    through ``attach_image_as_new_file``.

    :param input_file: Local input.
    :return: A valid PdfDocument handle.
    """
    # Rewind first: the file object may already have been read by an earlier call.
    input_file.file_object.seek(0)
    if not input_file.is_pdf():
        return attach_image_as_new_file(input_file.file_object)

    return pdfium.PdfDocument(input_file.file_object)
3 changes: 3 additions & 0 deletions mindee/image_extraction/multi_receipts_extractor/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import (
ExtractedMultiReceiptsImage,
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from mindee.image_extraction.common import ExtractedImage


class ExtractedMultiReceiptsImage(ExtractedImage):
    """Wrapper class for extracted multiple-receipts images."""

    def __init__(self, buffer: bytes, file_name: str, receipt_id: int, page_id: int):
        """
        Initialize the extracted receipt image.

        :param buffer: The byte buffer representing the receipt image.
        :param file_name: Base name used to build the internal file name.
        :param receipt_id: Id of the receipt on its page.
        :param page_id: Id of the page the receipt was extracted from.
        """
        # Forward the receipt id as the element id so that receipts taken from the same page
        # get distinct internal file names instead of all sharing one.
        super().__init__(buffer, file_name, page_id, receipt_id)
        self._receipt_id = receipt_id

    @property
    def receipt_id(self):
        """
        ID of the receipt on a given page.

        :return: The receipt id given at construction.
        """
        return self._receipt_id
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import io
from typing import List

from mindee.error import MindeeError
from mindee.image_extraction.common.image_extractor import (
extract_multiple_images_from_page,
load_pdf_doc,
)
from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import (
ExtractedMultiReceiptsImage,
)
from mindee.input import LocalInputSource
from mindee.parsing.common import Inference


def extract_receipts(
    input_file: LocalInputSource, inference: Inference
) -> List[ExtractedMultiReceiptsImage]:
    """
    Splits a multi-receipts document into one image per detected receipt.

    :param input_file: File to extract sub-receipts from.
    :param inference: Results of the inference.
    :return: Individual extracted receipts as a list of ExtractedMultiReceiptsImage, ordered by
        page and then by receipt position within the page's prediction.
    :raises MindeeError: If the document-level prediction contains no receipts.
    """
    if not inference.prediction.receipts:
        raise MindeeError(
            "No possible receipts candidates found for MultiReceipts extraction."
        )

    extracted: List[ExtractedMultiReceiptsImage] = []
    document = load_pdf_doc(input_file)
    for page_index, pdf_page in enumerate(document):
        page_prediction = inference.pages[page_index].prediction
        bounding_boxes = [found.bounding_box for found in page_prediction.receipts]
        crops = extract_multiple_images_from_page(pdf_page, bounding_boxes)
        for receipt_index, crop in enumerate(crops):
            # Each crop is encoded as JPEG in memory before being wrapped.
            jpeg_stream = io.BytesIO()
            crop.save(jpeg_stream, format="JPEG")
            extracted.append(
                ExtractedMultiReceiptsImage(
                    jpeg_stream.getvalue(),
                    input_file.filename,
                    receipt_index,
                    page_index,
                )
            )
    return extracted
2 changes: 1 addition & 1 deletion mindee/input/local_response.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def is_valid_hmac_signature(
Checks if the hmac signature of the local response is valid.

:param secret_key: Secret key, given as a string.
:param signature:
:param signature: HMAC signature, given as a string.
:return: True if the HMAC signature is valid.
"""
return signature == self.get_hmac_signature(secret_key)
6 changes: 5 additions & 1 deletion mindee/parsing/standard/locale.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@ def __init__(
:param reconstructed: Bool for reconstructed object (not extracted in the API)
:param page_id: Page number for multi-page document
"""
value_key = "value" if "value" in raw_prediction else "language"
value_key = (
"value"
if ("value" in raw_prediction and raw_prediction["value"])
else "language"
)

super().__init__(
raw_prediction,
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ safe_licenses = [
"MIT License",
"Mozilla Public License 2.0 (MPL 2.0)",
"BSD License",
"(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty"
"(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty",
"Historical Permission Notice and Disclaimer (HPND)"
]

[tool.pytest.ini_options]
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ include_package_data = True
python_requires = >=3.7
install_requires =
pypdfium2>=4.0,<5
Pillow>=9.5.0
pytz>=2023.3
requests~=2.31

Expand Down
Empty file.
35 changes: 35 additions & 0 deletions tests/image_extraction/test_image_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import json

import pytest
from PIL import Image

from mindee.image_extraction.common import extract_multiple_images_from_image
from mindee.input import PathInput
from mindee.product import BarcodeReaderV1
from tests.test_inputs import PRODUCT_DATA_DIR


@pytest.fixture
def barcode_path():
    """Path to the sample barcode image."""
    barcode_dir = PRODUCT_DATA_DIR / "barcode_reader"
    return barcode_dir / "default_sample.jpg"


@pytest.fixture
def barcode_json_path():
    """Path to the stored barcode reader response matching the sample image."""
    response_dir = PRODUCT_DATA_DIR / "barcode_reader" / "response_v1"
    return response_dir / "complete.json"


def test_barcode_image_extraction(barcode_path, barcode_json_path):
    """Crops 1D and 2D barcodes out of the sample image and checks their count and size."""
    with open(barcode_json_path, "rb") as json_file:
        raw_response = json.load(json_file)
    prediction = BarcodeReaderV1(raw_response["document"]["inference"]).prediction

    polygons_1d = [barcode.polygon for barcode in prediction.codes_1d]
    polygons_2d = [barcode.polygon for barcode in prediction.codes_2d]
    crops_1d = extract_multiple_images_from_image(barcode_path, polygons_1d)
    crops_2d = extract_multiple_images_from_image(barcode_path, polygons_2d)

    assert len(crops_1d) == 1
    assert len(crops_2d) == 2

    assert crops_1d[0].size == (353, 200)
    assert crops_2d[0].size == (214, 216)
    assert crops_2d[1].size == (193, 201)
Loading
Loading