-
Notifications
You must be signed in to change notification settings - Fork 5
✨ add support for multi-receipt extraction #240
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 4 commits
Commits
Show all changes
13 commits
Select commit
Hold shift + click to select a range
09dad19
add image size check & test
sebastianMindee 298bde6
fix lint, typeignore a lot of pypdfium stuff
sebastianMindee a9dfc47
:sparkles: add support for multi-receipt extraction
sebastianMindee 8d7841a
fix lint + license check
sebastianMindee 9fe6e25
add better tests
sebastianMindee 5742474
make generic more generic & fix up misc things
sebastianMindee 775fe3e
fix lint
sebastianMindee 87de9b7
fix a few suggestions
sebastianMindee 087287d
make more things generic
sebastianMindee 7d03c61
update test lib
sebastianMindee 836bf65
update syntaxes
sebastianMindee a4810a8
apply many suggestions
sebastianMindee 0e26709
simplify extractedimage creation
sebastianMindee File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,3 +45,4 @@ repos: | |
- types-requests | ||
- types-setuptools | ||
- importlib-metadata | ||
- types-Pillow |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
from mindee import Client, PredictResponse, product | ||
from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import ( | ||
extract_receipts, | ||
) | ||
|
||
# Init a new client
mindee_client = Client()

# Load a file from disk
input_doc = mindee_client.source_from_path("path/to/your/file.ext")
# close_file=False: the same source is read again by extract_receipts() below.
result_split: PredictResponse = mindee_client.parse(
    product.MultiReceiptsDetectorV1, input_doc, close_file=False
)

extracted_receipts = extract_receipts(input_doc, result_split.document.inference)
for receipt in extracted_receipts:
    receipt_as_source = receipt.as_source()
    # Optionally: save each extracted receipt (the internal file name already ends in ".pdf")
    # receipt.save_to_file(f"./{receipt.internal_file_name}")
    result_receipt = mindee_client.parse(product.ReceiptV5, receipt_as_source)
    print(result_receipt.document)
Empty file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
from mindee.image_extraction.common.extracted_image import ExtractedImage | ||
from mindee.image_extraction.common.image_extractor import ( | ||
attach_image_as_new_file, | ||
extract_from_page, | ||
) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
import io | ||
from pathlib import Path | ||
|
||
from mindee.error import MindeeError | ||
from mindee.input import FileInput | ||
from mindee.logger import logger | ||
|
||
|
||
class ExtractedImage:
    """Generic class for image extraction."""

    def __init__(self, buffer: bytes, file_name: str):
        """
        Initialize the ExtractedImage with a buffer and an internal file name.

        :param buffer: The byte buffer representing the image.
        :param file_name: The internal file name of the image.
        """
        self.buffer = io.BytesIO(buffer)
        self.internal_file_name = file_name
        # The in-memory stream carries the file name as its ``name`` attribute.
        self.buffer.name = self.internal_file_name

    def save_to_file(self, output_path: str):
        """
        Saves the document to a file.

        :param output_path: Path to save the file to.
        :raises MindeeError: If an invalid path is provided, or if the file could not be written.
        """
        try:
            resolved_path = Path(output_path).resolve()
            with open(resolved_path, "wb") as file:
                # getvalue() returns the whole content without moving the stream position.
                file.write(self.buffer.getvalue())
            logger.info("File saved successfully to '%s'.", resolved_path)
        except TypeError as exc:
            raise MindeeError("Invalid path/filename provided.") from exc
        except Exception as exc:
            raise MindeeError(f"Could not save file {Path(output_path).name}.") from exc

    def as_source(self) -> FileInput:
        """
        Return the file as a Mindee-compatible FileInput source.

        :returns: A FileInput source wrapping this image's buffer.
        """
        # Rewind so the consumer starts at the first byte, whatever was read from the buffer before.
        self.buffer.seek(0)
        return FileInput(self.buffer)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
import io | ||
from typing import BinaryIO, List | ||
|
||
import pypdfium2 as pdfium | ||
from PIL import Image | ||
|
||
from mindee.geometry import Polygon, get_min_max_x, get_min_max_y | ||
|
||
|
||
def attach_image_as_new_file(  # type: ignore
    input_buffer: BinaryIO,
) -> pdfium.PdfDocument:
    """
    Attaches an image as a new page in a PdfDocument object.

    The image is decoded with PIL and re-encoded as JPEG before being embedded in the PDF.

    :param input_buffer: Input buffer holding the image.
    :return: A PdfDocument handle.
    """
    input_buffer.seek(0)
    image = Image.open(input_buffer)
    # Image.convert() returns a converted copy and leaves the original untouched, so the
    # result has to be kept: the JPEG encoder cannot write alpha or palette modes.
    rgb_image = image.convert("RGB")
    image_buffer = io.BytesIO()
    rgb_image.save(image_buffer, format="JPEG")

    pdf = pdfium.PdfDocument.new()

    image_pdf = pdfium.PdfImage.new(pdf)
    image_pdf.load_jpeg(image_buffer)
    width, height = image_pdf.get_size()

    # Scale the image object to its own dimensions and create a page of the same size.
    matrix = pdfium.PdfMatrix().scale(width, height)
    image_pdf.set_matrix(matrix)

    page = pdf.new_page(width, height)
    page.insert_obj(image_pdf)
    page.gen_content()
    image.close()
    return pdf
|
||
|
||
def extract_from_page(pdf_page: pdfium.PdfPage, polygons: List[Polygon]) -> List[bytes]:  # type: ignore
    """
    Extracts elements from a page based on a list of bounding boxes.

    :param pdf_page: Single PDF Page.
    :param polygons: List of coordinates to pull the elements from.
    :return: List of byte arrays representing the extracted elements, each saved in PDF format.
    """
    if not polygons:
        # Nothing to crop: skip rendering the page altogether.
        return []

    width, height = pdf_page.get_size()

    # The rendered page does not depend on the polygon, so it is rendered once and reused.
    # Note: cropping done via PIL instead of PyPDFium to simplify operations greatly.
    page_image = pdf_page.render().to_pil()

    extracted_elements = []
    for polygon in polygons:
        min_max_x = get_min_max_x(polygon)
        min_max_y = get_min_max_y(polygon)

        # Polygon extrema are scaled by the page size (presumably they are relative
        # coordinates in [0, 1] -- confirm against the geometry module).
        crop_box = (
            int(min_max_x.min * width),
            int(min_max_y.min * height),
            int(min_max_x.max * width),
            int(min_max_y.max * height),
        )
        cropped_content_pil = page_image.crop(crop_box)

        pdf_buffer = io.BytesIO()
        cropped_content_pil.save(pdf_buffer, format="PDF")
        extracted_elements.append(pdf_buffer.getvalue())

    return extracted_elements
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import ( | ||
ExtractedMultiReceiptsImage, | ||
) | ||
from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import ( | ||
extract_receipts_from_page, | ||
) |
31 changes: 31 additions & 0 deletions
31
mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
from mindee.image_extraction.common import ExtractedImage | ||
|
||
|
||
class ExtractedMultiReceiptsImage(ExtractedImage):
    """Wrapper class for extracted multiple-receipts images."""

    _receipt_id: int
    _page_id: int

    def __init__(self, buffer: bytes, receipt_id: int, page_id: int):
        """
        Initialize the extracted receipt and derive its internal file name from its position.

        :param buffer: The byte buffer representing the extracted receipt.
        :param receipt_id: ID of the receipt on its page.
        :param page_id: ID of the page the receipt was found on.
        """
        super().__init__(buffer, f"receipt_p{page_id}_{receipt_id}.pdf")
        self._receipt_id = receipt_id
        self._page_id = page_id

    @property
    def receipt_id(self) -> int:
        """
        ID of the receipt on a given page.

        :return: The receipt ID given at construction.
        """
        return self._receipt_id

    @property
    def page_id(self) -> int:
        """
        ID of the page the receipt was found on.

        :return: The page ID given at construction.
        """
        return self._page_id
93 changes: 93 additions & 0 deletions
93
mindee/image_extraction/multi_receipts_extractor/mult_receipts_extractor.py
sebastianMindee marked this conversation as resolved.
Show resolved
Hide resolved
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
from typing import List, Union | ||
|
||
import pypdfium2 as pdfium | ||
|
||
from mindee.error import MimeTypeError, MindeeError | ||
from mindee.geometry.point import Point | ||
from mindee.geometry.polygon import Polygon | ||
from mindee.geometry.quadrilateral import Quadrilateral | ||
from mindee.image_extraction.common.image_extractor import ( | ||
attach_image_as_new_file, | ||
extract_from_page, | ||
) | ||
from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import ( | ||
ExtractedMultiReceiptsImage, | ||
) | ||
from mindee.input import LocalInputSource | ||
from mindee.parsing.common import Inference | ||
|
||
|
||
def extract_receipts_from_page(  # type: ignore
    pdf_page: pdfium.PdfPage,
    bounding_boxes: List[Union[List[Point], Polygon, Quadrilateral]],
    page_id: int,
) -> List[ExtractedMultiReceiptsImage]:
    """
    Crops every receipt out of a page and wraps each crop in an ExtractedMultiReceiptsImage.

    :param pdf_page: PDF Page to extract from.
    :param bounding_boxes: A set of coordinates delimiting the position of each receipt.
    :param page_id: ID of the page the receipt is extracted from. Caution: this starts at 0, unlike the numbering in PDF
        pages.
    :return: A list of ExtractedMultiReceiptsImage.
    """
    raw_crops = extract_from_page(pdf_page, bounding_boxes)  # type: ignore
    # The position of a crop in the list is used as its receipt ID on this page.
    return [
        ExtractedMultiReceiptsImage(raw_crop, receipt_index, page_id)
        for receipt_index, raw_crop in enumerate(raw_crops)
    ]
|
||
|
||
def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument:  # type: ignore
    """
    Opens a local input source as a PdfDocument.

    PDF inputs are opened directly; image inputs are embedded into a new single-page document.

    :param input_file: Local input.
    :return: A valid PdfDocument handle.
    :raises MimeTypeError: If the MIME type of the input is not one of the accepted types.
    """
    accepted_mimetypes = (
        "application/pdf",
        "image/heic",
        "image/png",
        "image/jpg",
        "image/jpeg",
        "image/tiff",
        "image/webp",
    )
    if input_file.file_mimetype not in accepted_mimetypes:
        raise MimeTypeError(f"Unsupported file type '{input_file.file_mimetype}'.")

    source_stream = input_file.file_object
    # Rewind before handing the stream to either loader.
    source_stream.seek(0)
    if not input_file.is_pdf():
        return attach_image_as_new_file(source_stream)
    return pdfium.PdfDocument(source_stream)
|
||
|
||
def extract_receipts(
    input_file: LocalInputSource, inference: Inference
) -> List[ExtractedMultiReceiptsImage]:
    """
    Splits a multi-receipts document into its individual receipts.

    :param input_file: File to extract sub-receipts from.
    :param inference: Results of the inference.
    :return: Individual extracted receipts as an array of ExtractedMultiReceiptsImage.
    :raises MindeeError: If the inference holds no receipt at document level.
    """
    if not inference.prediction.receipts:
        raise MindeeError(
            "No possible receipts candidates found for MultiReceipts extraction."
        )

    pdf_doc = load_pdf_doc(input_file)
    images: List[ExtractedMultiReceiptsImage] = []
    for page_id, page in enumerate(pdf_doc):
        # Page-level predictions are looked up by the index of the page in the document.
        page_receipts = inference.pages[page_id].prediction.receipts
        receipt_positions = [receipt.bounding_box for receipt in page_receipts]
        images.extend(extract_receipts_from_page(page, receipt_positions, page_id))
    return images
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Submodule data
updated
8 files
Empty file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
from io import BytesIO | ||
|
||
import pytest | ||
from PIL import Image | ||
|
||
from mindee.error import MimeTypeError | ||
from tests.test_inputs import FILE_TYPES_DIR | ||
|
||
|
||
@pytest.fixture
def jpg_file_path():
    """Location of the sample JPEG receipt under FILE_TYPES_DIR."""
    file_name = "receipt.jpg"
    return FILE_TYPES_DIR / file_name
|
||
|
||
@pytest.fixture
def txt_file_path():
    """Location of the sample text receipt under FILE_TYPES_DIR."""
    file_name = "receipt.txt"
    return FILE_TYPES_DIR / file_name
|
||
|
||
@pytest.fixture
def png_file_path():
    """Location of the sample PNG receipt under FILE_TYPES_DIR."""
    file_name = "receipt.png"
    return FILE_TYPES_DIR / file_name
|
||
|
||
def test_get_image_size_jpg(jpg_file_path):
    """Check the pixel dimensions PIL reports for the sample JPEG receipt."""
    # PIL's Image.size is a (width, height) tuple; the context manager releases the file.
    with Image.open(jpg_file_path) as jpg_file:
        jpg_width, jpg_height = jpg_file.size
    assert jpg_width == 800
    assert jpg_height == 1066
|
||
|
||
def test_get_image_size_png(png_file_path):
    """Check the pixel dimensions PIL reports for the sample PNG receipt."""
    # PIL's Image.size is a (width, height) tuple; the context manager releases the file.
    with Image.open(png_file_path) as png_file:
        png_width, png_height = png_file.size
    assert png_width == 800
    assert png_height == 1066
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.