Skip to content

✨ add support for multi-receipt extraction #240

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Jun 12, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,4 @@ repos:
- types-requests
- types-setuptools
- importlib-metadata
- types-Pillow
20 changes: 20 additions & 0 deletions examples/multi_receipts_tutorial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from mindee import Client, PredictResponse, product
from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import (
extract_receipts,
)

# Init a new client
mindee_client = Client()

# Load a file from disk
input_doc = mindee_client.source_from_path("path/to/your/file.ext")

# Detect the position of every receipt in the document.
# NOTE(review): close_file=False presumably keeps the input readable for the
# extraction step below - confirm against Client.parse.
result_split: PredictResponse = mindee_client.parse(
    product.MultiReceiptsDetectorV1, input_doc, close_file=False
)

# Cut out each detected receipt, then send it to the receipt API on its own.
extracted_receipts = extract_receipts(input_doc, result_split.document.inference)
for receipt in extracted_receipts:
    # Optionally: save each extracted receipt.
    # receipt.save_to_file(f"./{receipt.internal_file_name}")
    receipt_as_source = receipt.as_source()
    result_receipt = mindee_client.parse(product.ReceiptV5, receipt_as_source)
    print(result_receipt.document)
Empty file.
5 changes: 5 additions & 0 deletions mindee/image_extraction/common/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from mindee.image_extraction.common.extracted_image import ExtractedImage
from mindee.image_extraction.common.image_extractor import (
attach_image_as_new_file,
extract_from_page,
)
48 changes: 48 additions & 0 deletions mindee/image_extraction/common/extracted_image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import io
from pathlib import Path

from mindee.error import MindeeError
from mindee.input import FileInput
from mindee.logger import logger


class ExtractedImage:
    """Generic class for image extraction."""

    def __init__(self, buffer: bytes, file_name: str):
        """
        Initialize the ExtractedImage with a buffer and an internal file name.

        :param buffer: The byte buffer representing the image.
        :param file_name: The internal file name of the image.
        """
        self.buffer = io.BytesIO(buffer)
        self.internal_file_name = file_name
        # Expose the name on the stream itself, the way a file opened from disk would.
        self.buffer.name = self.internal_file_name

    def save_to_file(self, output_path: str):
        """
        Save the buffered content to a file.

        :param output_path: Path to save the file to.
        :raises MindeeError: If an invalid path is provided, or if the file cannot be written.
        """
        try:
            resolved_path = Path(output_path).resolve()
            with open(resolved_path, "wb") as file:
                # getvalue() returns the whole content without moving the read
                # position, so saving never leaves the buffer exhausted.
                file.write(self.buffer.getvalue())
            logger.info("File saved successfully to '%s'.", resolved_path)
        except TypeError as exc:
            raise MindeeError("Invalid path/filename provided.") from exc
        except Exception as exc:
            raise MindeeError(f"Could not save file {Path(output_path).name}.") from exc

    def as_source(self) -> FileInput:
        """
        Return the file as a Mindee-compatible FileInput source.

        :returns: A FileInput source wrapping the internal buffer.
        """
        # Rewind first: an earlier read would otherwise leave the stream at EOF.
        self.buffer.seek(0)
        return FileInput(self.buffer)
72 changes: 72 additions & 0 deletions mindee/image_extraction/common/image_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import io
from typing import BinaryIO, List

import pypdfium2 as pdfium
from PIL import Image

from mindee.geometry import Polygon, get_min_max_x, get_min_max_y


def attach_image_as_new_file(  # type: ignore
    input_buffer: BinaryIO,
) -> pdfium.PdfDocument:
    """
    Attaches an image as a new page in a PdfDocument object.

    The image is decoded with Pillow and re-encoded as JPEG before being embedded.

    :param input_buffer: Input buffer holding an image that Pillow can open.
    :return: A PdfDocument handle containing a single page sized to the image.
    """
    input_buffer.seek(0)
    with Image.open(input_buffer) as image:
        # Image.convert() returns a new image rather than converting in place,
        # so the result must be kept for the JPEG encoding below.
        rgb_image = image.convert("RGB")
    image_buffer = io.BytesIO()
    rgb_image.save(image_buffer, format="JPEG")

    pdf = pdfium.PdfDocument.new()

    image_pdf = pdfium.PdfImage.new(pdf)
    image_pdf.load_jpeg(image_buffer)
    width, height = image_pdf.get_size()

    # Scale the image object to its own dimensions so it fills the new page.
    matrix = pdfium.PdfMatrix().scale(width, height)
    image_pdf.set_matrix(matrix)

    page = pdf.new_page(width, height)
    page.insert_obj(image_pdf)
    page.gen_content()
    return pdf


def extract_from_page(pdf_page: pdfium.PdfPage, polygons: List[Polygon]) -> List[bytes]:  # type: ignore
    """
    Extracts elements from a page based on a list of bounding boxes.

    :param pdf_page: Single PDF Page.
    :param polygons: List of coordinates to pull the elements from. The min/max values are multiplied by the page
        width and height before cropping.
    :return: List of byte arrays, each holding one cropped element saved in PDF format.
    """
    width, height = pdf_page.get_size()
    if not polygons:
        return []

    # Note: cropping done via PIL instead of PyPDFium to simplify operations greatly.
    # The page is rendered once; crop() returns a new image and leaves this one untouched.
    # NOTE(review): the crop box is derived from get_size() while the bitmap uses render()'s
    # default scale - this assumes both share the same dimensions; confirm.
    page_image = pdf_page.render().to_pil()

    extracted_elements = []
    for polygon in polygons:
        min_max_x = get_min_max_x(polygon)
        min_max_y = get_min_max_y(polygon)

        left = min_max_x.min * width
        right = min_max_x.max * width
        top = min_max_y.min * height
        bottom = min_max_y.max * height

        cropped_content_pil = page_image.crop(
            (int(left), int(top), int(right), int(bottom))
        )
        pdf_buffer = io.BytesIO()
        cropped_content_pil.save(pdf_buffer, format="PDF")
        extracted_elements.append(pdf_buffer.getvalue())

    return extracted_elements
6 changes: 6 additions & 0 deletions mindee/image_extraction/multi_receipts_extractor/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import (
ExtractedMultiReceiptsImage,
)
from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import (
extract_receipts_from_page,
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from mindee.image_extraction.common import ExtractedImage


class ExtractedMultiReceiptsImage(ExtractedImage):
    """Extracted image holding a single receipt taken from a multi-receipts document."""

    _receipt_id: int
    _page_id: int

    def __init__(self, buffer, receipt_id: int, page_id: int):
        """
        Build the image and derive its internal file name from its position.

        :param buffer: Content of the extracted receipt, passed on to ExtractedImage.
        :param receipt_id: ID of the receipt on its page.
        :param page_id: ID of the page the receipt was found on.
        """
        internal_name = f"receipt_p{page_id}_{receipt_id}.pdf"
        super().__init__(buffer, internal_name)
        self._page_id = page_id
        self._receipt_id = receipt_id

    @property
    def receipt_id(self):
        """
        ID of the receipt on a given page.

        :return: The receipt ID given at construction.
        """
        return self._receipt_id

    @property
    def page_id(self):
        """
        ID of the page the receipt was found on.

        :return: The page ID given at construction.
        """
        return self._page_id
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
from typing import List, Union

import pypdfium2 as pdfium

from mindee.error import MimeTypeError, MindeeError
from mindee.geometry.point import Point
from mindee.geometry.polygon import Polygon
from mindee.geometry.quadrilateral import Quadrilateral
from mindee.image_extraction.common.image_extractor import (
attach_image_as_new_file,
extract_from_page,
)
from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import (
ExtractedMultiReceiptsImage,
)
from mindee.input import LocalInputSource
from mindee.parsing.common import Inference


def extract_receipts_from_page(  # type: ignore
    pdf_page: pdfium.PdfPage,
    bounding_boxes: List[Union[List[Point], Polygon, Quadrilateral]],
    page_id: int,
) -> List[ExtractedMultiReceiptsImage]:
    """
    Crop every receipt out of a page and wrap each one in an ExtractedMultiReceiptsImage.

    :param pdf_page: PDF Page to extract from.
    :param bounding_boxes: A set of coordinates delimiting the position of each receipt.
    :param page_id: ID of the page the receipt is extracted from. Caution: this starts at 0, unlike the numbering in PDF
        pages.
    :return: A list of ExtractedMultiReceiptsImage, numbered in the order of the given bounding boxes.
    """
    raw_receipts = extract_from_page(pdf_page, bounding_boxes)  # type: ignore
    return [
        ExtractedMultiReceiptsImage(raw_receipt, receipt_id, page_id)
        for receipt_id, raw_receipt in enumerate(raw_receipts)
    ]


def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument:  # type: ignore
    """
    Open a local input source as a PDF document.

    PDF inputs are opened directly; any other accepted type is embedded as a page of a new document.

    :param input_file: Local input.
    :raises MimeTypeError: If the mimetype of the input is not one of the accepted types.
    :return: A valid PdfDocument handle.
    """
    accepted_mimetypes = (
        "application/pdf",
        "image/heic",
        "image/png",
        "image/jpg",
        "image/jpeg",
        "image/tiff",
        "image/webp",
    )
    mimetype = input_file.file_mimetype
    if mimetype not in accepted_mimetypes:
        raise MimeTypeError(f"Unsupported file type '{mimetype}'.")

    source = input_file.file_object
    source.seek(0)
    if not input_file.is_pdf():
        return attach_image_as_new_file(source)
    return pdfium.PdfDocument(source)


def extract_receipts(
    input_file: LocalInputSource, inference: Inference
) -> List[ExtractedMultiReceiptsImage]:
    """
    Split a multi-receipts document into its individual receipts.

    :param input_file: File to extract sub-receipts from.
    :param inference: Results of the inference.
    :raises MindeeError: If the document-level prediction holds no receipts.
    :return: Individual extracted receipts as an array of ExtractedMultiReceiptsImage.
    """
    if not inference.prediction.receipts:
        raise MindeeError(
            "No possible receipts candidates found for MultiReceipts extraction."
        )

    document = load_pdf_doc(input_file)
    all_receipts: List[ExtractedMultiReceiptsImage] = []
    for page_index, pdf_page in enumerate(document):
        # Bounding boxes come from the page-level prediction at the same index.
        page_receipts = inference.pages[page_index].prediction.receipts
        boxes = [page_receipt.bounding_box for page_receipt in page_receipts]
        all_receipts += extract_receipts_from_page(pdf_page, boxes, page_index)
    return all_receipts
2 changes: 1 addition & 1 deletion mindee/input/local_response.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def is_valid_hmac_signature(
Checks if the hmac signature of the local response is valid.

:param secret_key: Secret key, given as a string.
:param signature:
:param signature: HMAC signature, given as a string.
:return: True if the HMAC signature is valid.
"""
return signature == self.get_hmac_signature(secret_key)
6 changes: 5 additions & 1 deletion mindee/parsing/standard/locale.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@ def __init__(
:param reconstructed: Bool for reconstructed object (not extracted in the API)
:param page_id: Page number for multi-page document
"""
value_key = "value" if "value" in raw_prediction else "language"
value_key = (
"value"
if ("value" in raw_prediction and raw_prediction["value"])
else "language"
)

super().__init__(
raw_prediction,
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ safe_licenses = [
"MIT License",
"Mozilla Public License 2.0 (MPL 2.0)",
"BSD License",
"(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty"
"(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty",
"Historical Permission Notice and Disclaimer (HPND)"
]

[tool.pytest.ini_options]
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ include_package_data = True
python_requires = >=3.7
install_requires =
pypdfium2>=4.0,<5
Pillow>=9.5.0
pytz>=2023.3
requests~=2.31

Expand Down
Empty file.
40 changes: 40 additions & 0 deletions tests/image_extraction/test_image_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from io import BytesIO

import pytest
from PIL import Image

from mindee.error import MimeTypeError
from tests.test_inputs import FILE_TYPES_DIR


@pytest.fixture
def jpg_file_path():
    """Path to the sample JPEG receipt in the file types directory."""
    return FILE_TYPES_DIR.joinpath("receipt.jpg")


@pytest.fixture
def txt_file_path():
    """Path to the sample text receipt in the file types directory."""
    return FILE_TYPES_DIR.joinpath("receipt.txt")


@pytest.fixture
def png_file_path():
    """Path to the sample PNG receipt in the file types directory."""
    return FILE_TYPES_DIR.joinpath("receipt.png")


def test_get_image_size_jpg(jpg_file_path):
    """Check the dimensions Pillow reports for the sample JPEG receipt."""
    # Image.size is (width, height); the context manager closes the image.
    with Image.open(jpg_file_path) as jpg_file:
        jpg_width, jpg_height = jpg_file.size
    assert jpg_width == 800
    assert jpg_height == 1066


def test_get_image_size_png(png_file_path):
    """Check the dimensions Pillow reports for the sample PNG receipt."""
    # Image.size is (width, height); the context manager closes the image.
    with Image.open(png_file_path) as png_file:
        png_width, png_height = png_file.size
    assert png_width == 800
    assert png_height == 1066
Loading
Loading