-
Notifications
You must be signed in to change notification settings - Fork 5
✨ add support for multi-receipt extraction #240
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 10 commits
Commits
Show all changes
13 commits
Select commit
Hold shift + click to select a range
09dad19
add image size check & test
sebastianMindee 298bde6
fix lint, typeignore a lot of pypdfium stuff
sebastianMindee a9dfc47
:sparkles: add support for multi-receipt extraction
sebastianMindee 8d7841a
fix lint + license check
sebastianMindee 9fe6e25
add better tests
sebastianMindee 5742474
make generic more generic & fix up misc things
sebastianMindee 775fe3e
fix lint
sebastianMindee 87de9b7
fix a few suggestions
sebastianMindee 087287d
make more things generic
sebastianMindee 7d03c61
update test lib
sebastianMindee 836bf65
update syntaxes
sebastianMindee a4810a8
apply many suggestions
sebastianMindee 0e26709
simplify extractedimage creation
sebastianMindee File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,3 +45,4 @@ repos: | |
- types-requests | ||
- types-setuptools | ||
- importlib-metadata | ||
- types-Pillow |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
from mindee import Client, PredictResponse, product | ||
from mindee.image_extraction.multi_receipts_extractor.multi_receipts_extractor import ( | ||
extract_receipts, | ||
) | ||
|
||
# Init a new client
mindee_client = Client()

# Load a file from disk
input_doc = mindee_client.source_from_path("path/to/your/file.ext")

# Detect the receipts first. The file is kept open (close_file=False) because
# the extraction step below needs to read the same input again.
result_split: PredictResponse = mindee_client.parse(
    product.MultiReceiptsDetectorV1, input_doc, close_file=False
)

# Cut every detected receipt out of the original document, then parse each one.
extracted_receipts = extract_receipts(input_doc, result_split.document.inference)
for receipt in extracted_receipts:
    # Optionally: save each extracted receipt. The internal file name already
    # carries its own extension, so none is appended here.
    # receipt.save_to_file(f"./{receipt.internal_file_name}")
    receipt_as_source = receipt.as_source()
    result_receipt = mindee_client.parse(product.ReceiptV5, receipt_as_source)
    print(result_receipt.document)
Empty file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
from mindee.image_extraction.common.extracted_image import ExtractedImage | ||
from mindee.image_extraction.common.image_extractor import ( | ||
attach_image_as_new_file, | ||
extract_multiple_images_from_image, | ||
extract_multiple_images_from_page, | ||
) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
import io | ||
from pathlib import Path | ||
from typing import Optional | ||
|
||
from PIL import Image | ||
|
||
from mindee.error import MindeeError | ||
from mindee.input import FileInput | ||
from mindee.logger import logger | ||
|
||
|
||
class ExtractedImage:
    """
    Generic class for image extraction.

    Holds the encoded bytes of one extracted element in an in-memory buffer,
    together with the ids of the page and element it was taken from.
    """

    _page_id: int
    """Id of the page the image was extracted from."""
    _element_id: int
    """Id of the element on a given page."""

    def __init__(
        self,
        buffer: bytes,
        file_name: str,
        page_id: int,
        element_id: Optional[int] = None,
    ) -> None:
        """
        Initialize the ExtractedImage with a buffer and an internal file name.

        :param buffer: The byte buffer representing the image.
        :param file_name: Base name used to build the internal file name.
        :param page_id: Id of the page the image was extracted from.
        :param element_id: Id of the element on the page. Defaults to 0 when omitted.
        """
        self._page_id = page_id
        self._element_id = 0 if element_id is None else element_id
        self.buffer = io.BytesIO(buffer)
        # Build the name from the normalized id so that a missing element id
        # yields "..._0.pdf" rather than the literal string "None".
        self.internal_file_name = f"{file_name}_p{page_id}_{self._element_id}.pdf"
        self.buffer.name = self.internal_file_name

    def save_to_file(self, output_path: str, file_format: Optional[str] = None):
        """
        Saves the document to a file.

        :param output_path: Path to save the file to.
        :param file_format: Optional PIL-compatible format for the file. Inferred from file extension if not provided.
        :raises MindeeError: If an invalid path or filename is provided, or if the file could not be saved.
        """
        try:
            resolved_path = Path(output_path).resolve()
            if not file_format:
                if len(resolved_path.suffix) < 1:
                    raise ValueError("Invalid file format.")
                # Path.suffix keeps its leading dot (".jpg"), which is not a
                # format name PIL knows. Passing None lets PIL map the file
                # extension to the right format itself.
                file_format = None
            self.buffer.seek(0)
            image = Image.open(self.buffer)
            image.save(resolved_path, format=file_format)
            logger.info("File saved successfully to '%s'.", resolved_path)
        except TypeError as exc:
            raise MindeeError("Invalid path/filename provided.") from exc
        except Exception as exc:
            raise MindeeError(f"Could not save file {Path(output_path).name}.") from exc

    def as_source(self) -> FileInput:
        """
        Return the file as a Mindee-compatible FileInput source.

        :returns: A FileInput source wrapping the internal buffer.
        """
        # Rewind first: a previous save_to_file() call may have moved the cursor.
        self.buffer.seek(0)
        return FileInput(self.buffer)

    @property
    def page_id(self):
        """
        ID of the page the image was found on.

        :return: The page id given at construction.
        """
        return self._page_id

    @property
    def element_id(self):
        """
        ID of the element on its page.

        :return: The element id given at construction, or 0 if none was given.
        """
        return self._element_id
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
import io | ||
from pathlib import Path | ||
from typing import BinaryIO, List, Union | ||
|
||
import pypdfium2 as pdfium | ||
from PIL import Image | ||
|
||
from mindee.geometry import Point, get_min_max_x, get_min_max_y | ||
from mindee.input import LocalInputSource | ||
|
||
|
||
def attach_image_as_new_file(  # type: ignore
    input_buffer: BinaryIO,
) -> pdfium.PdfDocument:
    """
    Attaches an image as a new page in a PdfDocument object.

    The image is re-encoded as an RGB JPEG before being embedded.

    :param input_buffer: Input buffer holding an image PIL can open.
    :return: A PdfDocument handle containing a single page sized to the image.
    """
    input_buffer.seek(0)
    image = Image.open(input_buffer)
    # Image.convert() returns a new image rather than converting in place, so
    # the result must be kept: JPEG cannot store alpha or palette modes.
    rgb_image = image.convert("RGB")
    image_buffer = io.BytesIO()
    rgb_image.save(image_buffer, format="JPEG")

    pdf = pdfium.PdfDocument.new()

    image_pdf = pdfium.PdfImage.new(pdf)
    image_pdf.load_jpeg(image_buffer)
    width, height = image_pdf.get_size()

    # Scale the image object so that it fills the whole page.
    matrix = pdfium.PdfMatrix().scale(width, height)
    image_pdf.set_matrix(matrix)

    # Create a new page in the PdfDocument and place the image on it.
    page = pdf.new_page(width, height)
    page.insert_obj(image_pdf)
    page.gen_content()
    image.close()
    return pdf
|
||
|
||
def extract_multiple_images_from_image(
    image: Union[bytes, str, Path], polygons: List[List[Point]]
) -> List[Image.Image]:
    """
    Extracts elements from an image based on a list of bounding boxes.

    :param image: Image given either as a path or as raw encoded bytes.
    :param polygons: List of coordinates to pull the elements from.
    :return: List of PIL images, one per polygon, in the same order.
    """
    # Image.open() takes a path or a file object, not raw bytes, so encoded
    # bytes are wrapped in an in-memory stream first.
    image_source = (
        io.BytesIO(image) if isinstance(image, (bytes, bytearray)) else image
    )
    return extract_multiple_images_from_page(Image.open(image_source), polygons)
|
||
|
||
def extract_multiple_images_from_page(  # type: ignore
    page: Union[pdfium.PdfPage, Image.Image], polygons: List[List[Point]]
) -> List[Image.Image]:
    """
    Crops one sub-image per polygon out of a page.

    :param page: Either a pdfium.PdfPage, which is rendered to a PIL image first, or a PIL image used as-is.
    :param polygons: Polygons whose min/max x and y, multiplied by the page width and height, give each crop box.
    :return: List of cropped PIL images, in the same order as the polygons.
    """
    if not isinstance(page, pdfium.PdfPage):
        canvas = page
        page_width, page_height = page.size
    else:
        canvas = page.render().to_pil()
        page_width, page_height = page.get_size()

    def crop_box(polygon):
        """Return the (left, top, right, bottom) pixel box enclosing a polygon."""
        x_range = get_min_max_x(polygon)
        y_range = get_min_max_y(polygon)
        return (
            int(x_range.min * page_width),
            int(y_range.min * page_height),
            int(x_range.max * page_width),
            int(y_range.max * page_height),
        )

    return [canvas.crop(crop_box(polygon)) for polygon in polygons]
|
||
|
||
def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument:  # type: ignore
    """
    Opens a local input source as a PdfDocument.

    The underlying file object is rewound first. PDF inputs are opened
    directly; any other input is embedded as a page of a new document.

    :param input_file: Local input.
    :return: A valid PdfDocument handle.
    """
    input_file.file_object.seek(0)
    if not input_file.is_pdf():
        return attach_image_as_new_file(input_file.file_object)
    return pdfium.PdfDocument(input_file.file_object)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import ( | ||
ExtractedMultiReceiptsImage, | ||
) |
18 changes: 18 additions & 0 deletions
18
mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py
sebastianMindee marked this conversation as resolved.
Show resolved
Hide resolved
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
from mindee.image_extraction.common import ExtractedImage | ||
|
||
|
||
class ExtractedMultiReceiptsImage(ExtractedImage):
    """Wrapper class for extracted multiple-receipts images."""

    def __init__(self, buffer, file_name: str, receipt_id: int, page_id: int):
        """
        Initialize the extracted receipt image.

        :param buffer: The byte buffer representing the receipt image.
        :param file_name: Base name used to build the internal file name.
        :param receipt_id: Id of the receipt on its page.
        :param page_id: Id of the page the receipt was extracted from.
        """
        # Forward the receipt id as the element id so that each receipt on a
        # page gets its own internal file name instead of sharing one.
        super().__init__(buffer, file_name, page_id, receipt_id)
        self._receipt_id = receipt_id

    @property
    def receipt_id(self):
        """
        ID of the receipt on a given page.

        :return: The receipt id given at construction.
        """
        return self._receipt_id
49 changes: 49 additions & 0 deletions
49
mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import io | ||
from typing import List | ||
|
||
from mindee.error import MindeeError | ||
from mindee.image_extraction.common.image_extractor import ( | ||
extract_multiple_images_from_page, | ||
load_pdf_doc, | ||
) | ||
from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import ( | ||
ExtractedMultiReceiptsImage, | ||
) | ||
from mindee.input import LocalInputSource | ||
from mindee.parsing.common import Inference | ||
|
||
|
||
def extract_receipts(
    input_file: LocalInputSource, inference: Inference
) -> List[ExtractedMultiReceiptsImage]:
    """
    Cuts each detected receipt out of a multi-receipts document.

    :param input_file: File to extract sub-receipts from.
    :param inference: Results of the inference.
    :return: One ExtractedMultiReceiptsImage per detected receipt, ordered by page and then by receipt.
    :raises MindeeError: If the inference holds no receipts at document level.
    """
    if not inference.prediction.receipts:
        raise MindeeError(
            "No possible receipts candidates found for MultiReceipts extraction."
        )

    extracted: List[ExtractedMultiReceiptsImage] = []
    pdf_doc = load_pdf_doc(input_file)
    for page_index, pdf_page in enumerate(pdf_doc):
        page_receipts = inference.pages[page_index].prediction.receipts
        bounding_boxes = [entry.bounding_box for entry in page_receipts]
        crops = extract_multiple_images_from_page(pdf_page, bounding_boxes)
        for receipt_index, crop in enumerate(crops):
            # Encode the crop as JPEG in memory and hand over the raw bytes.
            jpeg_stream = io.BytesIO()
            crop.save(jpeg_stream, format="JPEG")
            extracted.append(
                ExtractedMultiReceiptsImage(
                    jpeg_stream.getvalue(),
                    input_file.filename,
                    receipt_index,
                    page_index,
                )
            )
    return extracted
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Submodule data
updated
19 files
Empty file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
import json | ||
|
||
import pytest | ||
from PIL import Image | ||
|
||
from mindee.image_extraction.common import extract_multiple_images_from_image | ||
from mindee.input import PathInput | ||
from mindee.product import BarcodeReaderV1 | ||
from tests.test_inputs import PRODUCT_DATA_DIR | ||
|
||
|
||
@pytest.fixture
def barcode_path():
    """Path to the sample barcode image used by the extraction test."""
    sample_dir = PRODUCT_DATA_DIR / "barcode_reader"
    return sample_dir / "default_sample.jpg"
|
||
|
||
@pytest.fixture
def barcode_json_path():
    """Path to the stored barcode reader v1 response for the sample image."""
    response_dir = PRODUCT_DATA_DIR / "barcode_reader" / "response_v1"
    return response_dir / "complete.json"
|
||
|
||
def test_barcode_image_extraction(barcode_path, barcode_json_path):
    """Crop every 1D and 2D barcode out of the sample image and check the crop sizes."""
    with open(barcode_json_path, "rb") as json_file:
        payload = json.load(json_file)
    prediction = BarcodeReaderV1(payload["document"]["inference"]).prediction

    polygons_1d = [code.polygon for code in prediction.codes_1d]
    polygons_2d = [code.polygon for code in prediction.codes_2d]
    crops_1d = extract_multiple_images_from_image(barcode_path, polygons_1d)
    crops_2d = extract_multiple_images_from_image(barcode_path, polygons_2d)

    assert len(crops_1d) == 1
    assert len(crops_2d) == 2

    first_1d = crops_1d[0]
    first_2d, second_2d = crops_2d
    assert first_1d.size == (353, 200)
    assert first_2d.size == (214, 216)
    assert second_2d.size == (193, 201)
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.