From 09dad196fe7f9c8cf613fe27830bc29f8ea8c204 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Thu, 6 Jun 2024 13:47:29 +0200 Subject: [PATCH 01/13] add image size check & test --- mindee/image_extraction/__init__.py | 0 mindee/image_extraction/common/__init__.py | 2 + .../common/extracted_image.py | 43 ++++++++ .../common/image_extractor.py | 99 +++++++++++++++++++ .../multi_receipts_extractor/__init__.py | 2 + .../extracted_mult_receipt_image.py | 19 ++++ .../mult_receipts_extractor.py | 46 +++++++++ mindee/input/local_response.py | 2 +- mindee/parsing/standard/locale.py | 2 +- tests/data | 2 +- tests/image_extraction/__init__.py | 0 .../image_extraction/test_image_extractor.py | 43 ++++++++ 12 files changed, 257 insertions(+), 3 deletions(-) create mode 100644 mindee/image_extraction/__init__.py create mode 100644 mindee/image_extraction/common/__init__.py create mode 100644 mindee/image_extraction/common/extracted_image.py create mode 100644 mindee/image_extraction/common/image_extractor.py create mode 100644 mindee/image_extraction/multi_receipts_extractor/__init__.py create mode 100644 mindee/image_extraction/multi_receipts_extractor/extracted_mult_receipt_image.py create mode 100644 mindee/image_extraction/multi_receipts_extractor/mult_receipts_extractor.py create mode 100644 tests/image_extraction/__init__.py create mode 100644 tests/image_extraction/test_image_extractor.py diff --git a/mindee/image_extraction/__init__.py b/mindee/image_extraction/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mindee/image_extraction/common/__init__.py b/mindee/image_extraction/common/__init__.py new file mode 100644 index 00000000..22195806 --- /dev/null +++ b/mindee/image_extraction/common/__init__.py @@ -0,0 +1,2 @@ +from mindee.image_extraction.common.extracted_image import ExtractedImage +from mindee.image_extraction.common.image_extractor import extract_from_page, attach_bitmap_as_new_page, get_image_size diff --git a/mindee/image_extraction/common/extracted_image.py b/mindee/image_extraction/common/extracted_image.py new file mode 100644 index 00000000..9d353369 --- /dev/null +++ b/mindee/image_extraction/common/extracted_image.py @@ -0,0 +1,43 @@ +import io +from pathlib import Path + +from mindee.error import MindeeError +from mindee.input import FileInput +from mindee.logger import logger + + +class ExtractedImage: + def __init__(self, buffer: bytes, file_name: str): + """ + Initialize the ExtractedImage with a buffer and an internal file name. + + :param buffer: The byte buffer representing the image. + :param file_name: The internal file name of the image. + """ + self.buffer = io.BytesIO(buffer) + self.internal_file_name = file_name + + def save_to_file(self, output_path: str): + """ + Saves the document to a file. + + :param output_path: Path to save the file to. + :raises MindeeError: If an invalid path or filename is provided. + """ + try: + resolved_path = Path(output_path).resolve() + with open(resolved_path, 'wb') as f: + f.write(self.buffer.read()) + logger.info(f"File saved successfully to {resolved_path}.") + except TypeError: + raise MindeeError("Invalid path/filename provided.") + except Exception as e: + raise e + + def as_source(self) -> FileInput: + """ + Return the file as a Mindee-compatible BufferInput source. + + :returns: A BufferInput source. + """ + return FileInput(self.buffer) diff --git a/mindee/image_extraction/common/image_extractor.py b/mindee/image_extraction/common/image_extractor.py new file mode 100644 index 00000000..5611c6ed --- /dev/null +++ b/mindee/image_extraction/common/image_extractor.py @@ -0,0 +1,99 @@ +import io +from typing import List, BinaryIO, Tuple + +import pypdfium2 as pdfium + +from mindee.error import MimeTypeError +from mindee.geometry import get_min_max_x, get_min_max_y, Polygon + +import struct + + +def get_image_size(data: BinaryIO) -> Tuple[int, int]: + """ + Read the first few bytes to determine the file type. + + :param data: Image input. + :return: A tuple containing the file's height/width. + """ + data.seek(0) + signature = data.read(8) + + # Check for PNG signature + if signature[:8] == b'\x89PNG\r\n\x1a\n': + # PNG file + data.seek(16) + width, height = struct.unpack('>II', data.read(8)) + return width, height + + # Check for JPEG SOI marker + elif signature[:2] == b'\xff\xd8': + data.seek(2) + while True: + marker, = struct.unpack('>H', data.read(2)) + if marker == 0xFFC0 or marker == 0xFFC2: # SOF0 or SOF2 + data.seek(3, 1) # Skip length and precision + height, width = struct.unpack('>HH', data.read(4)) + return width, height + else: + length, = struct.unpack('>H', data.read(2)) + data.seek(length - 2, 1) + data.close() + raise MimeTypeError("Size could not be retrieved for file.") + + +def attach_bitmap_as_new_page(pdf_doc: pdfium.PdfDocument, bitmap: pdfium.PdfBitmap, new_width: float, + new_height: float) -> pdfium.PdfDocument: + """ + Attaches a created PdfBitmap object as a new page in a PdfDocument object. + + :param pdf_doc: The PdfDocument to which the new page will be added. + :param bitmap: The PdfBitmap object to be added as a new page. + :param new_width: The width of the new page. + :param new_height: The height of the new page. + """ + # Create a new page in the PdfDocument + new_page = pdf_doc.new_page(new_width, new_height) + + # Create a device context to render the bitmap onto the new page + new_page.insert_obj(bitmap.buffer) + + return pdf_doc + + +def extract_from_page(pdf_page: pdfium.PdfPage, polygons: List[Polygon]): + """ + Extracts elements from a page based on a list of bounding boxes. + + :param pdf_page: Single PDF Page. + :param polygons: List of coordinates to pull the elements from. + :return: List of byte arrays representing the extracted elements. + """ + width, height = pdf_page.get_size() + + extracted_elements = [] + + for polygon in polygons: + temp_pdf = pdfium.PdfDocument.new() + + min_max_x = get_min_max_x(polygon) + min_max_y = get_min_max_y(polygon) + + new_width = width * (min_max_x.max - min_max_x.min) + new_height = height * (min_max_y.max - min_max_y.min) + + left = min_max_x.min * width + right = min_max_x.max * width + top = height - (min_max_y.min * height) + bottom = height - (min_max_y.max * height) + + cropped_page: pdfium.PdfBitmap = pdf_page.render(crop=(left, bottom, right, top)) + + temp_pdf = attach_bitmap_as_new_page(temp_pdf, cropped_page, new_width, new_height) + + temp_file = io.BytesIO() + temp_pdf.save(temp_file) + extracted_elements.append(temp_file.read()) + temp_file.close() + + return extracted_elements diff --git a/mindee/image_extraction/multi_receipts_extractor/__init__.py b/mindee/image_extraction/multi_receipts_extractor/__init__.py new file mode 100644 index 00000000..365b7e51 --- /dev/null +++ b/mindee/image_extraction/multi_receipts_extractor/__init__.py @@ -0,0 +1,2 @@ +from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import extract_receipts_from_page +from mindee.image_extraction.multi_receipts_extractor.extracted_mult_receipt_image import ExtractedMultiReceiptImage diff --git a/mindee/image_extraction/multi_receipts_extractor/extracted_mult_receipt_image.py b/mindee/image_extraction/multi_receipts_extractor/extracted_mult_receipt_image.py new file mode 100644 index 00000000..2b081998 --- /dev/null +++ b/mindee/image_extraction/multi_receipts_extractor/extracted_mult_receipt_image.py @@ -0,0 +1,19 @@ +from mindee.image_extraction.common import ExtractedImage + + +class ExtractedMultiReceiptImage(ExtractedImage): + _receipt_id: int + page_id: int + + def __init__(self, buffer, receipt_id: int, page_id: int): + super().__init__(buffer, f"receipt_p{page_id}_{receipt_id}.pdf") + self._receipt_id = receipt_id + self._page_id = page_id + + @property + def receipt_id(self): + return self._receipt_id + + @property + def page_id(self): + return self.page_id diff --git a/mindee/image_extraction/multi_receipts_extractor/mult_receipts_extractor.py b/mindee/image_extraction/multi_receipts_extractor/mult_receipts_extractor.py new file mode 100644 index 00000000..6dc885b4 --- /dev/null +++ b/mindee/image_extraction/multi_receipts_extractor/mult_receipts_extractor.py @@ -0,0 +1,46 @@ +from typing import List + +import pypdfium2 as pdfium + +from mindee.error import MimeTypeError +from mindee.geometry import Polygon +from mindee.image_extraction.common.image_extractor import extract_from_page, attach_bitmap_as_new_page, get_image_size +from mindee.image_extraction.multi_receipts_extractor import ExtractedMultiReceiptImage +from mindee.input import LocalInputSource + + +def extract_receipts_from_page(pdf_page: pdfium.PdfPage, bounding_boxes: List[Polygon], page_id: int) \ + -> List[ExtractedMultiReceiptImage]: + """ + Given a page and a set of coordinates, extracts & assigns individual receipts to an ExtractedMultiReceiptImage + object. + + :param pdf_page: PDF Page to extract from. + :param bounding_boxes: A set of coordinates delimiting the position of each receipt. + :param page_id: ID of the page the receipt is extracted from. Caution: this starts at 0, unlike the numbering in PDF + pages. + :return: A list of ExtractedMultiReceiptImage. + """ + extracted_receipts_raw = extract_from_page(pdf_page, bounding_boxes) + extracted_receipts = [] + for i in range(len(extracted_receipts_raw)): + extracted_receipts.append(ExtractedMultiReceiptImage(extracted_receipts_raw[i], page_id, i)) + return extracted_receipts + + +def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: + """ + Loads a PDF document from a local input source. + + :param input_file: Local input. + :return: A valid PdfDocument handle. + """ + if input_file.file_mimetype not in ["image/jpeg", "image/jpg", "image/png", "application/pdf"]: + raise MimeTypeError(f"Unsupported file type '{input_file.file_mimetype}'. Currently supported types are '.png'," + f" '.jpg' and '.pdf'.") + if input_file.is_pdf(): + pdf_document = pdfium.PdfDocument(input_file.file_object) + else: + pdf_document = pdfium.PdfDocument.new() + + return attach_bitmap_as_new_page(pdf_document, input_file.file_object, get_image_size(input_file.file_object)) diff --git a/mindee/input/local_response.py b/mindee/input/local_response.py index 37c2192a..d804d1ed 100644 --- a/mindee/input/local_response.py +++ b/mindee/input/local_response.py @@ -98,7 +98,7 @@ def is_valid_hmac_signature( Checks if the hmac signature of the local response is valid. :param secret_key: Secret key, given as a string. - :param signature: + :param signature: HMAC signature, given as a string. :return: True if the HMAC signature is valid. """ return signature == self.get_hmac_signature(secret_key) diff --git a/mindee/parsing/standard/locale.py b/mindee/parsing/standard/locale.py index 9ba78159..9e3082ce 100644 --- a/mindee/parsing/standard/locale.py +++ b/mindee/parsing/standard/locale.py @@ -27,7 +27,7 @@ def __init__( :param reconstructed: Bool for reconstructed object (not extracted in the API) :param page_id: Page number for multi-page document """ - value_key = "value" if "value" in raw_prediction else "language" + value_key = "value" if ("value" in raw_prediction and raw_prediction["value"]) else "language" super().__init__( raw_prediction, diff --git a/tests/data b/tests/data index abe2a996..6b2f8563 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit abe2a996f71ca3242693af0439f3bf96b4ce7781 +Subproject commit 6b2f85639465c878e70a59337394a8adf14a0b16 diff --git a/tests/image_extraction/__init__.py b/tests/image_extraction/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/image_extraction/test_image_extractor.py b/tests/image_extraction/test_image_extractor.py new file mode 100644 index 00000000..2ad543f3 --- /dev/null +++ b/tests/image_extraction/test_image_extractor.py @@ -0,0 +1,43 @@ +from io import BytesIO + +import pytest + +from mindee.error import MimeTypeError +from mindee.image_extraction.common import get_image_size +from tests.test_inputs import FILE_TYPES_DIR + + +@pytest.fixture +def jpg_file_path(): + return FILE_TYPES_DIR / "receipt.jpg" + +@pytest.fixture +def txt_file_path(): + return FILE_TYPES_DIR / "receipt.txt" +@pytest.fixture +def png_file_path(): + return FILE_TYPES_DIR / "receipt.png" + + +def test_get_image_size_jpg(jpg_file_path): + with open(jpg_file_path, "rb") as f: + jpg_file = BytesIO(f.read()) + jpg_height, jpg_width = get_image_size(jpg_file) + assert jpg_height == 800 + assert jpg_width == 1066 + + +def test_get_image_size_png(png_file_path): + with open(png_file_path, "rb") as f: + png_file = BytesIO(f.read()) + png_height, png_width = get_image_size(png_file) + assert png_height == 800 + assert png_width == 1066 + + +def test_get_image_size_with_invalid_mime(txt_file_path): + with open(txt_file_path, "rb") as f: + txt_file = BytesIO(f.read()) + + with pytest.raises(MimeTypeError): + get_image_size(txt_file) From 298bde67dcd5f7b7acf5576171093fe8dddfee2b Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Thu, 6 Jun 2024 16:20:05 +0200 Subject: [PATCH 02/13] fix lint, typeignore a lot of pypdfium stuff --- mindee/image_extraction/common/__init__.py | 6 +- .../common/extracted_image.py | 16 +-- .../common/image_extractor.py | 68 ++++++------ .../multi_receipts_extractor/__init__.py | 8 +- ...ge.py => extracted_multi_receipt_image.py} | 16 ++- .../mult_receipts_extractor.py | 100 ++++++++++++++---- mindee/parsing/standard/locale.py | 6 +- .../image_extraction/test_image_extractor.py | 3 + .../test_multi_receipts_extractor.py | 75 +++++++++++++ 9 files changed, 236 insertions(+), 62 deletions(-) rename mindee/image_extraction/multi_receipts_extractor/{extracted_mult_receipt_image.py => extracted_multi_receipt_image.py} (60%) create mode 100644 tests/image_extraction/test_multi_receipts_extractor.py diff --git a/mindee/image_extraction/common/__init__.py b/mindee/image_extraction/common/__init__.py index 22195806..ac970ab9 100644 --- a/mindee/image_extraction/common/__init__.py +++ b/mindee/image_extraction/common/__init__.py @@ -1,2 +1,6 @@ from mindee.image_extraction.common.extracted_image import ExtractedImage -from mindee.image_extraction.common.image_extractor import extract_from_page, attach_bitmap_as_new_page, get_image_size +from mindee.image_extraction.common.image_extractor import ( + attach_bitmap_as_new_page, + extract_from_page, + get_image_size, +) diff --git a/mindee/image_extraction/common/extracted_image.py b/mindee/image_extraction/common/extracted_image.py index 9d353369..3878aeda 100644 --- a/mindee/image_extraction/common/extracted_image.py +++ b/mindee/image_extraction/common/extracted_image.py @@ -7,6 +7,8 @@ class ExtractedImage: + """Generic class for image extraction.""" + def __init__(self, buffer: bytes, file_name: str): """ Initialize the ExtractedImage with a buffer and an internal file name. @@ -26,13 +28,13 @@ def save_to_file(self, output_path: str): """ try: resolved_path = Path(output_path).resolve() - with open(resolved_path, 'wb') as f: - f.write(self.buffer.read()) - logger.info(f"File saved successfully to {resolved_path}.") - except TypeError: - raise MindeeError("Invalid path/filename provided.") - except Exception as e: - raise e + with open(resolved_path, "wb") as file: + file.write(self.buffer.read()) + logger.info("File saved successfully to %s.", resolved_path) + except TypeError as exc: + raise MindeeError("Invalid path/filename provided.") from exc + except Exception as exc: + raise MindeeError(f"Could not save file {Path(output_path).name}.") from exc def as_source(self) -> FileInput: """ diff --git a/mindee/image_extraction/common/image_extractor.py b/mindee/image_extraction/common/image_extractor.py index 5611c6ed..35819ba1 100644 --- a/mindee/image_extraction/common/image_extractor.py +++ b/mindee/image_extraction/common/image_extractor.py @@ -1,12 +1,11 @@ import io -from typing import List, BinaryIO, Tuple +import struct +from typing import BinaryIO, List, Tuple import pypdfium2 as pdfium from mindee.error import MimeTypeError -from mindee.geometry import get_min_max_x, get_min_max_y, Polygon - -import struct +from mindee.geometry import Polygon, get_min_max_x, get_min_max_y def get_image_size(data: BinaryIO) -> Tuple[int, int]: @@ -20,30 +19,32 @@ def get_image_size(data: BinaryIO) -> Tuple[int, int]: signature = data.read(8) # Check for PNG signature - if signature[:8] == b'\x89PNG\r\n\x1a\n': - # PNG file + if signature[:8] == b"\x89PNG\r\n\x1a\n": data.seek(16) - width, height = struct.unpack('>II', data.read(8)) + width, height = struct.unpack(">II", data.read(8)) return width, height - # Check for JPEG SOI marker - elif signature[:2] == b'\xff\xd8': + # Check for JPEG SOI marker (also works for jpga) + if signature[:2] == b"\xff\xd8": data.seek(2) while True: - marker, = struct.unpack('>H', data.read(2)) - if marker == 0xFFC0 or marker == 0xFFC2: # SOF0 or SOF2 + (marker,) = struct.unpack(">H", data.read(2)) + if marker in (0xFFC0, 0xFFC2): # SOF0 or SOF2 data.seek(3, 1) # Skip length and precision - height, width = struct.unpack('>HH', data.read(4)) + height, width = struct.unpack(">HH", data.read(4)) return width, height - else: - length, = struct.unpack('>H', data.read(2)) - data.seek(length - 2, 1) + (length,) = struct.unpack(">H", data.read(2)) + data.seek(length - 2, 1) data.close() raise MimeTypeError("Size could not be retrieved for file.") -def attach_bitmap_as_new_page(pdf_doc: pdfium.PdfDocument, bitmap: pdfium.PdfBitmap, new_width: float, - new_height: float) -> pdfium.PdfDocument: +def attach_bitmap_as_new_page( # type: ignore + pdf_doc: pdfium.PdfDocument, + bitmap: pdfium.PdfBitmap, + new_width: float, + new_height: float, +) -> pdfium.PdfDocument: """ Attaches a created PdfBitmap object as a new page in a PdfDocument object. @@ -51,17 +52,20 @@ def attach_bitmap_as_new_page(pdf_doc: pdfium.PdfDocument, bitmap: pdfium.PdfBit :param bitmap: The PdfBitmap object to be added as a new page. :param new_width: The width of the new page. :param new_height: The height of the new page. + :return: A PdfDocument handle. """ # Create a new page in the PdfDocument new_page = pdf_doc.new_page(new_width, new_height) + pdf_obj = pdfium.PdfImage.new(pdf_doc) + pdf_obj.set_bitmap(bitmap) # Create a device context to render the bitmap onto the new page - new_page.insert_obj(bitmap.buffer) + new_page.insert_obj(pdf_obj) return pdf_doc -def extract_from_page(pdf_page: pdfium.PdfPage, polygons: List[Polygon]): +def extract_from_page(pdf_page: pdfium.PdfPage, polygons: List[Polygon]): # type: ignore """ Extracts elements from a page based on a list of bounding boxes. @@ -79,17 +83,21 @@ def extract_from_page(pdf_page: pdfium.PdfPage, polygons: List[Polygon]): min_max_x = get_min_max_x(polygon) min_max_y = get_min_max_y(polygon) - new_width = width * (min_max_x.max - min_max_x.min) - new_height = height * (min_max_y.max - min_max_y.min) - - left = min_max_x.min * width - right = min_max_x.max * width - top = height - (min_max_y.min * height) - bottom = height - (min_max_y.max * height) - - cropped_page: pdfium.PdfBitmap = pdf_page.render(crop=(left, bottom, right, top)) - - temp_pdf = attach_bitmap_as_new_page(temp_pdf, cropped_page, new_width, new_height) + left = min_max_x.min + right = min_max_x.max + top = (height - (min_max_y.min * height)) / height + bottom = (height - (min_max_y.max * height)) / height + + cropped_page: pdfium.PdfBitmap = pdf_page.render( # type: ignore + crop=(left, bottom, right, top) + ) + + temp_pdf = attach_bitmap_as_new_page( + temp_pdf, + cropped_page, + width * (min_max_x.max - min_max_x.min), + height * (min_max_y.max - min_max_y.min), + ) temp_file = io.BytesIO() temp_pdf.save(temp_file) diff --git a/mindee/image_extraction/multi_receipts_extractor/__init__.py b/mindee/image_extraction/multi_receipts_extractor/__init__.py index 365b7e51..400c447a 100644 --- a/mindee/image_extraction/multi_receipts_extractor/__init__.py +++ b/mindee/image_extraction/multi_receipts_extractor/__init__.py @@ -1,2 +1,6 @@ -from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import extract_receipts_from_page -from mindee.image_extraction.multi_receipts_extractor.extracted_mult_receipt_image import ExtractedMultiReceiptImage +from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import ( + ExtractedMultiReceiptImage, +) +from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import ( + extract_receipts_from_page, +) diff --git a/mindee/image_extraction/multi_receipts_extractor/extracted_mult_receipt_image.py b/mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py similarity index 60% rename from mindee/image_extraction/multi_receipts_extractor/extracted_mult_receipt_image.py rename to mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py index 2b081998..b312f96d 100644 --- a/mindee/image_extraction/multi_receipts_extractor/extracted_mult_receipt_image.py +++ b/mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py @@ -2,8 +2,10 @@ class ExtractedMultiReceiptImage(ExtractedImage): + """Wrapper class for extracted multiple-receipts images.""" + _receipt_id: int - page_id: int + _page_id: int def __init__(self, buffer, receipt_id: int, page_id: int): super().__init__(buffer, f"receipt_p{page_id}_{receipt_id}.pdf") @@ -12,8 +14,18 @@ def __init__(self, buffer, receipt_id: int, page_id: int): @property def receipt_id(self): + """ + ID of the receipt on a given page. + + :return: + """ return self._receipt_id @property def page_id(self): - return self.page_id + """ + ID of the page the receipt was found on. + + :return: + """ + return self._page_id diff --git a/mindee/image_extraction/multi_receipts_extractor/mult_receipts_extractor.py b/mindee/image_extraction/multi_receipts_extractor/mult_receipts_extractor.py index 6dc885b4..f79fdde7 100644 --- a/mindee/image_extraction/multi_receipts_extractor/mult_receipts_extractor.py +++ b/mindee/image_extraction/multi_receipts_extractor/mult_receipts_extractor.py @@ -1,18 +1,30 @@ -from typing import List +from typing import List, Union import pypdfium2 as pdfium -from mindee.error import MimeTypeError -from mindee.geometry import Polygon -from mindee.image_extraction.common.image_extractor import extract_from_page, attach_bitmap_as_new_page, get_image_size -from mindee.image_extraction.multi_receipts_extractor import ExtractedMultiReceiptImage +from mindee.error import MimeTypeError, MindeeError +from mindee.geometry.point import Point +from mindee.geometry.polygon import Polygon +from mindee.geometry.quadrilateral import Quadrilateral +from mindee.image_extraction.common.image_extractor import ( + attach_bitmap_as_new_page, + extract_from_page, + get_image_size, +) +from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import ( + ExtractedMultiReceiptImage, +) from mindee.input import LocalInputSource +from mindee.product import MultiReceiptsDetectorV1 -def extract_receipts_from_page(pdf_page: pdfium.PdfPage, bounding_boxes: List[Polygon], page_id: int) \ - -> List[ExtractedMultiReceiptImage]: +def extract_receipts_from_page( # type: ignore + pdf_page: pdfium.PdfPage, + bounding_boxes: List[Union[List[Point], Polygon, Quadrilateral]], + page_id: int, +) -> List[ExtractedMultiReceiptImage]: """ - Given a page and a set of coordinates, extracts & assigns individual receipts to an ExtractedMultiReceiptImage + Given a page and a set of coordinates, extracts & assigns individual receipts to an ExtractedMultiReceiptImage\ object. :param pdf_page: PDF Page to extract from. @@ -21,26 +33,76 @@ def extract_receipts_from_page(pdf_page: pdfium.PdfPage, bounding_boxes: List[Po pages. :return: A list of ExtractedMultiReceiptImage. """ - extracted_receipts_raw = extract_from_page(pdf_page, bounding_boxes) + extracted_receipts_raw = extract_from_page(pdf_page, bounding_boxes) # type: ignore extracted_receipts = [] - for i in range(len(extracted_receipts_raw)): - extracted_receipts.append(ExtractedMultiReceiptImage(extracted_receipts_raw[i], page_id, i)) + for i, extracted_receipt_raw in enumerate(extracted_receipts_raw): + extracted_receipts.append( + ExtractedMultiReceiptImage(extracted_receipt_raw, i, page_id) + ) return extracted_receipts -def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: +def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: ignore """ Loads a PDF document from a local input source. :param input_file: Local input. :return: A valid PdfDocument handle. """ - if input_file.file_mimetype not in ["image/jpeg", "image/jpg", "image/png", "application/pdf"]: - raise MimeTypeError(f"Unsupported file type '{input_file.file_mimetype}'. Currently supported types are '.png'," - f" '.jpg' and '.pdf'.") + if input_file.file_mimetype not in [ + "image/jpeg", + "image/jpg", + "image/png", + "application/pdf", + ]: + raise MimeTypeError( + f"Unsupported file type '{input_file.file_mimetype}'. Currently supported types are '.png'," + f" '.jpg' and '.pdf'." + ) if input_file.is_pdf(): - pdf_document = pdfium.PdfDocument(input_file.file_object) - else: - pdf_document = pdfium.PdfDocument.new() + return pdfium.PdfDocument(input_file.file_object) + pdf_document = pdfium.PdfDocument.new() + height, width = get_image_size(input_file.file_object) + pdf_bitmap = pdfium.PdfBitmap.new_native(width, height, 4) + pdf_bitmap = pdfium.PdfBitmap( + raw=pdf_bitmap, + buffer=input_file.file_object, + height=height, + width=width, + needs_free=True, + rev_byteorder=False, + format=4, + stride=4, + ) + # Bitmap format 4 should equate to RGBA, assumed to be equivalent to: + # https://docs.rs/pdfium-render/latest/pdfium_render/bitmap/enum.PdfBitmapFormat.html - return attach_bitmap_as_new_page(pdf_document, input_file.file_object, get_image_size(input_file.file_object)) + return attach_bitmap_as_new_page(pdf_document, pdf_bitmap, height, width) + + +def extract_receipts( + input_file: LocalInputSource, inference: MultiReceiptsDetectorV1 +) -> List[ExtractedMultiReceiptImage]: + """ + Extracts individual receipts from multi-receipts documents. + + :param input_file: File to extract sub-receipts from. + :param inference: Results of the inference. + :return: Individual extracted receipts as an array of ExtractedMultiReceiptImage. + """ + images: List[ExtractedMultiReceiptImage] = [] + if not inference.prediction.receipts: + raise MindeeError( + "No possible receipts candidates found for MultiReceipts extraction." + ) + pdf_doc = load_pdf_doc(input_file) + for page_id in range(len(pdf_doc)): + receipt_positions = [ + receipt.bounding_box + for receipt in inference.pages[page_id].prediction.receipts + ] + extracted_receipts = extract_receipts_from_page( + pdf_doc.get_page(page_id), receipt_positions, page_id # type: ignore + ) + images.extend(extracted_receipts) + return images diff --git a/mindee/parsing/standard/locale.py b/mindee/parsing/standard/locale.py index 9e3082ce..2b692d91 100644 --- a/mindee/parsing/standard/locale.py +++ b/mindee/parsing/standard/locale.py @@ -27,7 +27,11 @@ def __init__( :param reconstructed: Bool for reconstructed object (not extracted in the API) :param page_id: Page number for multi-page document """ - value_key = "value" if ("value" in raw_prediction and raw_prediction["value"]) else "language" + value_key = ( + "value" + if ("value" in raw_prediction and raw_prediction["value"]) + else "language" + ) super().__init__( raw_prediction, diff --git a/tests/image_extraction/test_image_extractor.py b/tests/image_extraction/test_image_extractor.py index 2ad543f3..866c855c 100644 --- a/tests/image_extraction/test_image_extractor.py +++ b/tests/image_extraction/test_image_extractor.py @@ -11,9 +11,12 @@ def jpg_file_path(): return FILE_TYPES_DIR / "receipt.jpg" + @pytest.fixture def txt_file_path(): return FILE_TYPES_DIR / "receipt.txt" + + @pytest.fixture def png_file_path(): return FILE_TYPES_DIR / "receipt.png" diff --git a/tests/image_extraction/test_multi_receipts_extractor.py b/tests/image_extraction/test_multi_receipts_extractor.py new file mode 100644 index 00000000..3d20d45c --- /dev/null +++ b/tests/image_extraction/test_multi_receipts_extractor.py @@ -0,0 +1,75 @@ +import json +from io import BytesIO + +import pytest + +from mindee.error import MimeTypeError +from mindee.image_extraction.common import get_image_size +from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import extract_receipts +from mindee.input import PathInput +from mindee.product import MultiReceiptsDetectorV1 +from tests.test_inputs import PRODUCT_DATA_DIR + + +@pytest.fixture +def multi_receipts_single_page_path(): + return PRODUCT_DATA_DIR / "multi_receipts_detector" / "default_sample.jpg" + + +@pytest.fixture +def multi_receipts_single_page_json_path(): + return PRODUCT_DATA_DIR / "multi_receipts_detector" / "response_v1" / "complete.json" + + +@pytest.fixture +def multi_receipts_multi_page_path(): + return PRODUCT_DATA_DIR / "multi_receipts_detector" / "multipage_sample.pdf" + + +@pytest.fixture +def multi_receipts_multi_page_json_path(): + return PRODUCT_DATA_DIR / "multi_receipts_detector" / "response_v1" / "multipage_sample.json" + + +def test_single_page_multi_receipt_split(multi_receipts_single_page_path, multi_receipts_single_page_json_path): + input_sample = PathInput(multi_receipts_single_page_path) + with open(multi_receipts_single_page_json_path, "rb") as f: + response = json.load(f) + doc = MultiReceiptsDetectorV1(response["document"]["inference"]) + extracted_receipts = extract_receipts(input_sample, doc) + assert len(extracted_receipts) == 6 + for i in range(len(extracted_receipts)): + assert extracted_receipts[i].buffer is not None + assert extracted_receipts[i].page_id == 0 + assert extracted_receipts[i].receipt_id == i + + +def test_multi_page_receipt_split(multi_receipts_multi_page_path, multi_receipts_multi_page_json_path): + input_sample = PathInput(multi_receipts_multi_page_path) + with open(multi_receipts_multi_page_json_path, "rb") as f: + response = json.load(f) + doc = MultiReceiptsDetectorV1(response["document"]["inference"]) + extracted_receipts = extract_receipts(input_sample, doc) + assert len(extracted_receipts) == 5 + assert extracted_receipts[0].buffer is not None + assert extracted_receipts[0].page_id == 0 + assert extracted_receipts[0].receipt_id == 0 + + assert extracted_receipts[1].buffer is not None + assert extracted_receipts[1].page_id == 0 + assert extracted_receipts[1].receipt_id == 1 + + assert extracted_receipts[2].buffer is not None + assert extracted_receipts[2].page_id == 0 + assert extracted_receipts[2].receipt_id == 2 + + assert extracted_receipts[3].buffer is not None + assert extracted_receipts[3].page_id == 1 + assert extracted_receipts[3].receipt_id == 0 + + assert extracted_receipts[4].buffer is not None + assert extracted_receipts[4].page_id == 1 + assert extracted_receipts[4].receipt_id == 1 + + + From a9dfc470526456cb6f854ca57b5875829aea3711 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Mon, 10 Jun 2024 11:26:42 +0200 Subject: [PATCH 03/13] :sparkles: add support for multi-receipt extraction --- .pre-commit-config.yaml | 1 + examples/multi_receipts_tutorial.py | 20 ++++ mindee/image_extraction/common/__init__.py | 3 +- .../common/extracted_image.py | 5 +- .../common/image_extractor.py | 109 ++++++------------ .../multi_receipts_extractor/__init__.py | 2 +- .../extracted_multi_receipt_image.py | 2 +- .../mult_receipts_extractor.py | 59 ++++------ pyproject.toml | 3 +- setup.cfg | 1 + .../image_extraction/test_image_extractor.py | 20 ++-- .../test_multi_receipts_extractor.py | 27 +++-- 12 files changed, 115 insertions(+), 137 deletions(-) create mode 100644 examples/multi_receipts_tutorial.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 785a98de..79983dee 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -45,3 +45,4 @@ repos: - types-requests - types-setuptools - importlib-metadata + - types-Pillow \ No newline at end of file diff --git a/examples/multi_receipts_tutorial.py b/examples/multi_receipts_tutorial.py new file mode 100644 index 00000000..f76b350f --- /dev/null +++ b/examples/multi_receipts_tutorial.py @@ -0,0 +1,20 @@ +from mindee import PredictResponse, Client, product +from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import extract_receipts + +# Init a new client +mindee_client = Client() + +# Load a file from disk +input_doc = mindee_client.source_from_path("path/to/your/file.ext") +result_split: PredictResponse = mindee_client.parse( + product.MultiReceiptsDetectorV1, + input_doc, + close_file=False +) + +extracted_receipts = extract_receipts(input_doc, result_split.document.inference) +for receipt in extracted_receipts: + receipt_as_source = receipt.as_source() + # receipt.save_to_file(f"./local_test/{receipt.internal_file_name}.pdf") # Optionally: save each extracted receipt + result_receipt = mindee_client.parse(product.ReceiptV5, receipt.as_source()) + print(result_receipt.document) diff --git a/mindee/image_extraction/common/__init__.py b/mindee/image_extraction/common/__init__.py index ac970ab9..a55a69e6 100644 --- a/mindee/image_extraction/common/__init__.py +++ b/mindee/image_extraction/common/__init__.py @@ -1,6 +1,5 @@ from mindee.image_extraction.common.extracted_image import ExtractedImage from mindee.image_extraction.common.image_extractor import ( - attach_bitmap_as_new_page, + attach_image_as_new_file, extract_from_page, - get_image_size, ) diff --git a/mindee/image_extraction/common/extracted_image.py b/mindee/image_extraction/common/extracted_image.py index 3878aeda..b52d6648 100644 --- a/mindee/image_extraction/common/extracted_image.py +++ b/mindee/image_extraction/common/extracted_image.py @@ -18,19 +18,22 @@ def __init__(self, buffer: bytes, file_name: str): """ self.buffer = io.BytesIO(buffer) self.internal_file_name = file_name + self.buffer.name = self.internal_file_name def save_to_file(self, output_path: str): """ Saves the document to a file. :param output_path: Path to save the file to. + :param file_name: Name of the file. :raises MindeeError: If an invalid path or filename is provided. """ try: + self.buffer.seek(0) resolved_path = Path(output_path).resolve() with open(resolved_path, "wb") as file: file.write(self.buffer.read()) - logger.info("File saved successfully to %s.", resolved_path) + logger.info("File saved successfully to '%s'.", resolved_path) except TypeError as exc: raise MindeeError("Invalid path/filename provided.") from exc except Exception as exc: diff --git a/mindee/image_extraction/common/image_extractor.py b/mindee/image_extraction/common/image_extractor.py index 35819ba1..a23f4e8c 100644 --- a/mindee/image_extraction/common/image_extractor.py +++ b/mindee/image_extraction/common/image_extractor.py @@ -1,71 +1,45 @@ import io -import struct -from typing import BinaryIO, List, Tuple +from typing import BinaryIO, List import pypdfium2 as pdfium +from PIL import Image -from mindee.error import MimeTypeError from mindee.geometry import Polygon, get_min_max_x, get_min_max_y -def get_image_size(data: BinaryIO) -> Tuple[int, int]: - """ - Read the first few bytes to determine the file type. - - :param data: Image input. - :return: A tuple containing the file's height/width. - """ - data.seek(0) - signature = data.read(8) - - # Check for PNG signature - if signature[:8] == b"\x89PNG\r\n\x1a\n": - data.seek(16) - width, height = struct.unpack(">II", data.read(8)) - return width, height - - # Check for JPEG SOI marker (also works for jpga) - if signature[:2] == b"\xff\xd8": - data.seek(2) - while True: - (marker,) = struct.unpack(">H", data.read(2)) - if marker in (0xFFC0, 0xFFC2): # SOF0 or SOF2 - data.seek(3, 1) # Skip length and precision - height, width = struct.unpack(">HH", data.read(4)) - return width, height - (length,) = struct.unpack(">H", data.read(2)) - data.seek(length - 2, 1) - data.close() - raise MimeTypeError("Size could not be retrieved for file.") - - -def attach_bitmap_as_new_page( # type: ignore - pdf_doc: pdfium.PdfDocument, - bitmap: pdfium.PdfBitmap, - new_width: float, - new_height: float, +def attach_image_as_new_file( # type: ignore + input_buffer: BinaryIO, ) -> pdfium.PdfDocument: """ - Attaches a created PdfBitmap object as a new page in a PdfDocument object. + Attaches an image as a new page in a PdfDocument object. - :param pdf_doc: The PdfDocument to which the new page will be added. - :param bitmap: The PdfBitmap object to be added as a new page. - :param new_width: The width of the new page. - :param new_height: The height of the new page. + :param input_buffer: Input buffer. Only supports JPEG. :return: A PdfDocument handle. """ # Create a new page in the PdfDocument - new_page = pdf_doc.new_page(new_width, new_height) + input_buffer.seek(0) + image = Image.open(input_buffer) + image.convert("RGB") + image_buffer = io.BytesIO() + image.save(image_buffer, format="JPEG") + + pdf = pdfium.PdfDocument.new() + + image_pdf = pdfium.PdfImage.new(pdf) + image_pdf.load_jpeg(image_buffer) + width, height = image_pdf.get_size() - pdf_obj = pdfium.PdfImage.new(pdf_doc) - pdf_obj.set_bitmap(bitmap) - # Create a device context to render the bitmap onto the new page - new_page.insert_obj(pdf_obj) + matrix = pdfium.PdfMatrix().scale(width, height) + image_pdf.set_matrix(matrix) - return pdf_doc + page = pdf.new_page(width, height) + page.insert_obj(image_pdf) + page.gen_content() + image.close() + return pdf -def extract_from_page(pdf_page: pdfium.PdfPage, polygons: List[Polygon]): # type: ignore +def extract_from_page(pdf_page: pdfium.PdfPage, polygons: List[Polygon]) -> List[bytes]: # type: ignore """ Extracts elements from a page based on a list of bounding boxes. @@ -76,32 +50,23 @@ def extract_from_page(pdf_page: pdfium.PdfPage, polygons: List[Polygon]): # typ width, height = pdf_page.get_size() extracted_elements = [] - for polygon in polygons: - temp_pdf = pdfium.PdfDocument.new() - min_max_x = get_min_max_x(polygon) min_max_y = get_min_max_y(polygon) - left = min_max_x.min - right = min_max_x.max - top = (height - (min_max_y.min * height)) / height - bottom = (height - (min_max_y.max * height)) / height + left = min_max_x.min * width + right = min_max_x.max * width + top = min_max_y.min * height + bottom = min_max_y.max * height - cropped_page: pdfium.PdfBitmap = pdf_page.render( # type: ignore - crop=(left, bottom, right, top) + # Note: cropping done via PIL instead of PyPDFium to simplify operations greatly. + cropped_content_pil = pdf_page.render().to_pil() + cropped_content_pil = cropped_content_pil.crop( + (int(left), int(top), int(right), int(bottom)) ) - - temp_pdf = attach_bitmap_as_new_page( - temp_pdf, - cropped_page, - width * (min_max_x.max - min_max_x.min), - height * (min_max_y.max - min_max_y.min), - ) - - temp_file = io.BytesIO() - temp_pdf.save(temp_file) - extracted_elements.append(temp_file.read()) - temp_file.close() + jpeg_buffer = io.BytesIO() + cropped_content_pil.save(jpeg_buffer, format="PDF") + jpeg_buffer.seek(0) + extracted_elements.append(jpeg_buffer.read()) return extracted_elements diff --git a/mindee/image_extraction/multi_receipts_extractor/__init__.py b/mindee/image_extraction/multi_receipts_extractor/__init__.py index 400c447a..dbf73576 100644 --- a/mindee/image_extraction/multi_receipts_extractor/__init__.py +++ b/mindee/image_extraction/multi_receipts_extractor/__init__.py @@ -1,5 +1,5 @@ from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import ( - ExtractedMultiReceiptImage, + ExtractedMultiReceiptsImage, ) from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import ( extract_receipts_from_page, diff --git a/mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py b/mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py index b312f96d..6c2b2ee1 100644 --- a/mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py +++ b/mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py @@ -1,7 +1,7 @@ from mindee.image_extraction.common import ExtractedImage -class ExtractedMultiReceiptImage(ExtractedImage): +class ExtractedMultiReceiptsImage(ExtractedImage): """Wrapper class for extracted multiple-receipts images.""" _receipt_id: int diff --git a/mindee/image_extraction/multi_receipts_extractor/mult_receipts_extractor.py b/mindee/image_extraction/multi_receipts_extractor/mult_receipts_extractor.py index f79fdde7..89248316 100644 --- a/mindee/image_extraction/multi_receipts_extractor/mult_receipts_extractor.py +++ b/mindee/image_extraction/multi_receipts_extractor/mult_receipts_extractor.py @@ -7,37 +7,36 @@ from mindee.geometry.polygon import Polygon from mindee.geometry.quadrilateral import Quadrilateral from mindee.image_extraction.common.image_extractor import ( - attach_bitmap_as_new_page, + attach_image_as_new_file, extract_from_page, - get_image_size, ) from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import ( - ExtractedMultiReceiptImage, + ExtractedMultiReceiptsImage, ) from mindee.input import LocalInputSource -from mindee.product import MultiReceiptsDetectorV1 +from mindee.parsing.common import Inference def extract_receipts_from_page( # type: ignore pdf_page: pdfium.PdfPage, bounding_boxes: List[Union[List[Point], Polygon, Quadrilateral]], page_id: int, -) -> List[ExtractedMultiReceiptImage]: +) -> List[ExtractedMultiReceiptsImage]: """ - Given a page and a set of coordinates, extracts & assigns individual receipts to an ExtractedMultiReceiptImage\ + Given a page and a set of coordinates, extracts & assigns individual receipts to an ExtractedMultiReceiptsImage\ object. :param pdf_page: PDF Page to extract from. :param bounding_boxes: A set of coordinates delimiting the position of each receipt. :param page_id: ID of the page the receipt is extracted from. Caution: this starts at 0, unlike the numbering in PDF pages. - :return: A list of ExtractedMultiReceiptImage. + :return: A list of ExtractedMultiReceiptsImage. """ extracted_receipts_raw = extract_from_page(pdf_page, bounding_boxes) # type: ignore extracted_receipts = [] for i, extracted_receipt_raw in enumerate(extracted_receipts_raw): extracted_receipts.append( - ExtractedMultiReceiptImage(extracted_receipt_raw, i, page_id) + ExtractedMultiReceiptsImage(extracted_receipt_raw, i, page_id) ) return extracted_receipts @@ -50,59 +49,45 @@ def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: i :return: A valid PdfDocument handle. """ if input_file.file_mimetype not in [ - "image/jpeg", - "image/jpg", - "image/png", "application/pdf", + "image/heic", + "image/png", + "image/jpg", + "image/jpeg", + "image/tiff", + "image/webp", ]: - raise MimeTypeError( - f"Unsupported file type '{input_file.file_mimetype}'. Currently supported types are '.png'," - f" '.jpg' and '.pdf'." - ) + raise MimeTypeError(f"Unsupported file type '{input_file.file_mimetype}'.") + input_file.file_object.seek(0) if input_file.is_pdf(): return pdfium.PdfDocument(input_file.file_object) - pdf_document = pdfium.PdfDocument.new() - height, width = get_image_size(input_file.file_object) - pdf_bitmap = pdfium.PdfBitmap.new_native(width, height, 4) - pdf_bitmap = pdfium.PdfBitmap( - raw=pdf_bitmap, - buffer=input_file.file_object, - height=height, - width=width, - needs_free=True, - rev_byteorder=False, - format=4, - stride=4, - ) - # Bitmap format 4 should equate to RGBA, assumed to be equivalent to: - # https://docs.rs/pdfium-render/latest/pdfium_render/bitmap/enum.PdfBitmapFormat.html - return attach_bitmap_as_new_page(pdf_document, pdf_bitmap, height, width) + return attach_image_as_new_file(input_file.file_object) def extract_receipts( - input_file: LocalInputSource, inference: MultiReceiptsDetectorV1 -) -> List[ExtractedMultiReceiptImage]: + input_file: LocalInputSource, inference: Inference +) -> List[ExtractedMultiReceiptsImage]: """ Extracts individual receipts from multi-receipts documents. :param input_file: File to extract sub-receipts from. :param inference: Results of the inference. - :return: Individual extracted receipts as an array of ExtractedMultiReceiptImage. + :return: Individual extracted receipts as an array of ExtractedMultiReceiptsImage. """ - images: List[ExtractedMultiReceiptImage] = [] + images: List[ExtractedMultiReceiptsImage] = [] if not inference.prediction.receipts: raise MindeeError( "No possible receipts candidates found for MultiReceipts extraction." ) pdf_doc = load_pdf_doc(input_file) - for page_id in range(len(pdf_doc)): + for page_id, page in enumerate(pdf_doc): receipt_positions = [ receipt.bounding_box for receipt in inference.pages[page_id].prediction.receipts ] extracted_receipts = extract_receipts_from_page( - pdf_doc.get_page(page_id), receipt_positions, page_id # type: ignore + page, receipt_positions, page_id ) images.extend(extracted_receipts) return images diff --git a/pyproject.toml b/pyproject.toml index 4874a326..f0e7888d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,8 @@ safe_licenses = [ "MIT License", "Mozilla Public License 2.0 (MPL 2.0)", "BSD License", - "(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty" + "(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty", + "Historical Permission Notice and Disclaimer (HPND) (HPND)" ] [tool.pytest.ini_options] diff --git a/setup.cfg b/setup.cfg index 5d100563..736e4639 100644 --- a/setup.cfg +++ b/setup.cfg @@ -33,6 +33,7 @@ include_package_data = True python_requires = >=3.7 install_requires = pypdfium2>=4.0,<5 + Pillow>=9.5.0 pytz>=2023.3 requests~=2.31 diff --git a/tests/image_extraction/test_image_extractor.py b/tests/image_extraction/test_image_extractor.py index 866c855c..92408b6b 100644 --- a/tests/image_extraction/test_image_extractor.py +++ b/tests/image_extraction/test_image_extractor.py @@ -1,9 +1,9 @@ from io import BytesIO import pytest +from PIL import Image from mindee.error import MimeTypeError -from mindee.image_extraction.common import get_image_size from tests.test_inputs import FILE_TYPES_DIR @@ -24,23 +24,17 @@ def png_file_path(): def test_get_image_size_jpg(jpg_file_path): with open(jpg_file_path, "rb") as f: - jpg_file = BytesIO(f.read()) - jpg_height, jpg_width = get_image_size(jpg_file) + jpg_file = Image.open(jpg_file_path) + jpg_height = jpg_file.size[0] + jpg_width = jpg_file.size[1] assert jpg_height == 800 assert jpg_width == 1066 def test_get_image_size_png(png_file_path): with open(png_file_path, "rb") as f: - png_file = BytesIO(f.read()) - png_height, png_width = get_image_size(png_file) + png_file = Image.open(png_file_path) + png_height = png_file.size[0] + png_width = png_file.size[1] assert png_height == 800 assert png_width == 1066 - - -def test_get_image_size_with_invalid_mime(txt_file_path): - with open(txt_file_path, "rb") as f: - txt_file = BytesIO(f.read()) - - with pytest.raises(MimeTypeError): - get_image_size(txt_file) diff --git a/tests/image_extraction/test_multi_receipts_extractor.py b/tests/image_extraction/test_multi_receipts_extractor.py index 3d20d45c..8b720e69 100644 --- a/tests/image_extraction/test_multi_receipts_extractor.py +++ b/tests/image_extraction/test_multi_receipts_extractor.py @@ -4,8 +4,9 @@ import pytest from mindee.error import MimeTypeError -from mindee.image_extraction.common import get_image_size -from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import extract_receipts +from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import ( + extract_receipts, +) from mindee.input import PathInput from mindee.product import MultiReceiptsDetectorV1 from tests.test_inputs import PRODUCT_DATA_DIR @@ -18,7 +19,9 @@ def multi_receipts_single_page_path(): @pytest.fixture def multi_receipts_single_page_json_path(): - return PRODUCT_DATA_DIR / "multi_receipts_detector" / "response_v1" / "complete.json" + return ( + PRODUCT_DATA_DIR / "multi_receipts_detector" / "response_v1" / "complete.json" + ) @pytest.fixture @@ -28,10 +31,17 @@ def multi_receipts_multi_page_path(): @pytest.fixture def multi_receipts_multi_page_json_path(): - return PRODUCT_DATA_DIR / "multi_receipts_detector" / "response_v1" / "multipage_sample.json" + return ( + PRODUCT_DATA_DIR + / "multi_receipts_detector" + / "response_v1" + / "multipage_sample.json" + ) -def test_single_page_multi_receipt_split(multi_receipts_single_page_path, multi_receipts_single_page_json_path): +def test_single_page_multi_receipt_split( + multi_receipts_single_page_path, multi_receipts_single_page_json_path +): input_sample = PathInput(multi_receipts_single_page_path) with open(multi_receipts_single_page_json_path, "rb") as f: response = json.load(f) @@ -44,7 +54,9 @@ def test_single_page_multi_receipt_split(multi_receipts_single_page_path, multi_ assert extracted_receipts[i].receipt_id == i -def test_multi_page_receipt_split(multi_receipts_multi_page_path, multi_receipts_multi_page_json_path): +def test_multi_page_receipt_split( + multi_receipts_multi_page_path, multi_receipts_multi_page_json_path +): input_sample = PathInput(multi_receipts_multi_page_path) with open(multi_receipts_multi_page_json_path, "rb") as f: response = json.load(f) @@ -70,6 +82,3 @@ def test_multi_page_receipt_split(multi_receipts_multi_page_path, multi_receipts assert extracted_receipts[4].buffer is not None assert extracted_receipts[4].page_id == 1 assert extracted_receipts[4].receipt_id == 1 - - - From 8d7841a98332ca0a11ec4a71934b3661610b5a64 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Mon, 10 Jun 2024 11:30:37 +0200 Subject: [PATCH 04/13] fix lint + license check --- .pre-commit-config.yaml | 2 +- examples/multi_receipts_tutorial.py | 12 ++++++------ pyproject.toml | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 79983dee..50fa2cfb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -45,4 +45,4 @@ repos: - types-requests - types-setuptools - importlib-metadata - - types-Pillow \ No newline at end of file + - types-Pillow diff --git a/examples/multi_receipts_tutorial.py b/examples/multi_receipts_tutorial.py index f76b350f..5bdc7c87 100644 --- a/examples/multi_receipts_tutorial.py +++ b/examples/multi_receipts_tutorial.py @@ -1,5 +1,7 @@ -from mindee import PredictResponse, Client, product -from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import extract_receipts +from mindee import Client, PredictResponse, product +from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import ( + extract_receipts, +) # Init a new client mindee_client = Client() @@ -7,14 +9,12 @@ # Load a file from disk input_doc = mindee_client.source_from_path("path/to/your/file.ext") result_split: PredictResponse = mindee_client.parse( - product.MultiReceiptsDetectorV1, - input_doc, - close_file=False + product.MultiReceiptsDetectorV1, input_doc, close_file=False ) extracted_receipts = extract_receipts(input_doc, result_split.document.inference) for receipt in extracted_receipts: receipt_as_source = receipt.as_source() - # receipt.save_to_file(f"./local_test/{receipt.internal_file_name}.pdf") # Optionally: save each extracted receipt + # receipt.save_to_file(f"./{receipt.internal_file_name}.pdf") # Optionally: save each extracted receipt result_receipt = mindee_client.parse(product.ReceiptV5, receipt.as_source()) print(result_receipt.document) diff --git a/pyproject.toml b/pyproject.toml index f0e7888d..d0d9242d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ safe_licenses = [ "Mozilla Public License 2.0 (MPL 2.0)", "BSD License", "(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty", - "Historical Permission Notice and Disclaimer (HPND) (HPND)" + "Historical Permission Notice and Disclaimer (HPND)" ] [tool.pytest.ini_options] From 9fe6e2507d68419c3182d57d86a65ca9dd9bf39e Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Mon, 10 Jun 2024 15:49:23 +0200 Subject: [PATCH 05/13] add better tests --- .../image_extraction/test_image_extractor.py | 50 +++++++------ .../test_multi_receipts_extractor.py | 75 ++++++++++++++++--- 2 files changed, 90 insertions(+), 35 deletions(-) diff --git a/tests/image_extraction/test_image_extractor.py b/tests/image_extraction/test_image_extractor.py index 92408b6b..bb368e49 100644 --- a/tests/image_extraction/test_image_extractor.py +++ b/tests/image_extraction/test_image_extractor.py @@ -1,40 +1,42 @@ from io import BytesIO +import pypdfium2 as pdfium import pytest from PIL import Image from mindee.error import MimeTypeError -from tests.test_inputs import FILE_TYPES_DIR +from tests.test_inputs import PRODUCT_DATA_DIR @pytest.fixture -def jpg_file_path(): - return FILE_TYPES_DIR / "receipt.jpg" +def single_page_path(): + return PRODUCT_DATA_DIR / "multi_receipts_detector" / "default_sample.jpg" @pytest.fixture -def txt_file_path(): - return FILE_TYPES_DIR / "receipt.txt" +def multiple_pages_path(): + return PRODUCT_DATA_DIR / "multi_receipts_detector" / "multipage_sample.pdf" -@pytest.fixture -def png_file_path(): - return FILE_TYPES_DIR / "receipt.png" - - -def test_get_image_size_jpg(jpg_file_path): - with open(jpg_file_path, "rb") as f: - jpg_file = Image.open(jpg_file_path) +def test_get_images_mono_page(single_page_path): + with open(single_page_path, "rb") as f: + jpg_file = Image.open(single_page_path) jpg_height = jpg_file.size[0] jpg_width = jpg_file.size[1] - assert jpg_height == 800 - assert jpg_width == 1066 - - -def test_get_image_size_png(png_file_path): - with open(png_file_path, "rb") as f: - png_file = Image.open(png_file_path) - png_height = png_file.size[0] - png_width = png_file.size[1] - assert png_height == 800 - assert png_width == 1066 + assert jpg_height == 3628 + assert jpg_width == 1552 + + +def test_get_images_multiple_pages(multiple_pages_path): + with open(multiple_pages_path, "rb") as f: + pdf = pdfium.PdfDocument(f) + pdf_images = [page.render().to_pil() for page in pdf] + height_page_0 = pdf_images[0].size[0] + width_page_0 = pdf_images[0].size[1] + assert height_page_0 == 595 + assert width_page_0 == 842 + + height_page_1 = pdf_images[1].size[0] + width_page_1 = pdf_images[1].size[1] + assert height_page_1 == 595 + assert width_page_1 == 842 diff --git a/tests/image_extraction/test_multi_receipts_extractor.py b/tests/image_extraction/test_multi_receipts_extractor.py index 8b720e69..c1ced83d 100644 --- a/tests/image_extraction/test_multi_receipts_extractor.py +++ b/tests/image_extraction/test_multi_receipts_extractor.py @@ -1,9 +1,8 @@ import json -from io import BytesIO +import pypdfium2 as pdfium import pytest -from mindee.error import MimeTypeError from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import ( extract_receipts, ) @@ -48,10 +47,48 @@ def test_single_page_multi_receipt_split( doc = MultiReceiptsDetectorV1(response["document"]["inference"]) extracted_receipts = extract_receipts(input_sample, doc) assert len(extracted_receipts) == 6 - for i in range(len(extracted_receipts)): - assert extracted_receipts[i].buffer is not None - assert extracted_receipts[i].page_id == 0 - assert extracted_receipts[i].receipt_id == i + + assert extracted_receipts[0].page_id == 0 + assert extracted_receipts[0].receipt_id == 0 + image_buffer_0 = ( + pdfium.PdfDocument(extracted_receipts[0].buffer).get_page(0).render().to_pil() + ) + assert image_buffer_0.size == (341, 505) + + assert extracted_receipts[1].page_id == 0 + assert extracted_receipts[1].receipt_id == 1 + image_buffer_1 = ( + pdfium.PdfDocument(extracted_receipts[1].buffer).get_page(0).render().to_pil() + ) + assert image_buffer_1.size == (461, 908) + + assert extracted_receipts[2].page_id == 0 + assert extracted_receipts[2].receipt_id == 2 + image_buffer_2 = ( + pdfium.PdfDocument(extracted_receipts[2].buffer).get_page(0).render().to_pil() + ) + assert image_buffer_2.size == (471, 790) + + assert extracted_receipts[3].page_id == 0 + assert extracted_receipts[3].receipt_id == 3 + image_buffer_3 = ( + pdfium.PdfDocument(extracted_receipts[3].buffer).get_page(0).render().to_pil() + ) + assert image_buffer_3.size == (464, 1200) + + assert extracted_receipts[4].page_id == 0 + assert extracted_receipts[4].receipt_id == 4 + image_buffer_4 = ( + pdfium.PdfDocument(extracted_receipts[4].buffer).get_page(0).render().to_pil() + ) + assert image_buffer_4.size == (530, 943) + + assert extracted_receipts[5].page_id == 0 + assert extracted_receipts[5].receipt_id == 5 + image_buffer_5 = ( + pdfium.PdfDocument(extracted_receipts[5].buffer).get_page(0).render().to_pil() + ) + assert image_buffer_5.size == (367, 593) def test_multi_page_receipt_split( @@ -63,22 +100,38 @@ def test_multi_page_receipt_split( doc = MultiReceiptsDetectorV1(response["document"]["inference"]) extracted_receipts = extract_receipts(input_sample, doc) assert len(extracted_receipts) == 5 - assert extracted_receipts[0].buffer is not None + assert extracted_receipts[0].page_id == 0 assert extracted_receipts[0].receipt_id == 0 + image_buffer_0 = ( + pdfium.PdfDocument(extracted_receipts[0].buffer).get_page(0).render().to_pil() + ) + assert image_buffer_0.size == (198, 566) - assert extracted_receipts[1].buffer is not None assert extracted_receipts[1].page_id == 0 assert extracted_receipts[1].receipt_id == 1 + image_buffer_1 = ( + pdfium.PdfDocument(extracted_receipts[1].buffer).get_page(0).render().to_pil() + ) + assert image_buffer_1.size == (206, 382) - assert extracted_receipts[2].buffer is not None assert extracted_receipts[2].page_id == 0 assert extracted_receipts[2].receipt_id == 2 + image_buffer_2 = ( + pdfium.PdfDocument(extracted_receipts[2].buffer).get_page(0).render().to_pil() + ) + assert image_buffer_2.size == (195, 231) - assert extracted_receipts[3].buffer is not None assert extracted_receipts[3].page_id == 1 assert extracted_receipts[3].receipt_id == 0 + image_buffer_3 = ( + pdfium.PdfDocument(extracted_receipts[3].buffer).get_page(0).render().to_pil() + ) + assert image_buffer_3.size == (213, 356) - assert extracted_receipts[4].buffer is not None assert extracted_receipts[4].page_id == 1 assert extracted_receipts[4].receipt_id == 1 + image_buffer_4 = ( + pdfium.PdfDocument(extracted_receipts[4].buffer).get_page(0).render().to_pil() + ) + assert image_buffer_4.size == (212, 516) From 5742474ff4ce98dad65f64350d799de38221d234 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Mon, 10 Jun 2024 18:53:22 +0200 Subject: [PATCH 06/13] make generic more generic & fix up misc things --- mindee/image_extraction/common/__init__.py | 3 +- .../common/extracted_image.py | 22 +++++-- .../common/image_extractor.py | 41 ++++++++----- .../multi_receipts_extractor/__init__.py | 3 - .../mult_receipts_extractor.py | 56 +++++------------- .../image_extraction/test_image_extractor.py | 58 +++++++++---------- .../test_multi_receipts_extractor.py | 46 ++++----------- 7 files changed, 97 insertions(+), 132 deletions(-) diff --git a/mindee/image_extraction/common/__init__.py b/mindee/image_extraction/common/__init__.py index a55a69e6..befbed4f 100644 --- a/mindee/image_extraction/common/__init__.py +++ b/mindee/image_extraction/common/__init__.py @@ -1,5 +1,6 @@ from mindee.image_extraction.common.extracted_image import ExtractedImage from mindee.image_extraction.common.image_extractor import ( attach_image_as_new_file, - extract_from_page, + extract_multiple_images_from_image, + extract_multiple_images_from_page, ) diff --git a/mindee/image_extraction/common/extracted_image.py b/mindee/image_extraction/common/extracted_image.py index b52d6648..bcbe36d1 100644 --- a/mindee/image_extraction/common/extracted_image.py +++ b/mindee/image_extraction/common/extracted_image.py @@ -1,5 +1,8 @@ import io from pathlib import Path +from typing import Optional + +from PIL import Image from mindee.error import MindeeError from mindee.input import FileInput @@ -20,20 +23,27 @@ def __init__(self, buffer: bytes, file_name: str): self.internal_file_name = file_name self.buffer.name = self.internal_file_name - def save_to_file(self, output_path: str): + def save_to_file(self, output_path: str, file_format: Optional[str] = None): """ Saves the document to a file. :param output_path: Path to save the file to. - :param file_name: Name of the file. + :param file_format: Optional PIL-compatible format for the file. Inferred from file extension if not provided. :raises MindeeError: If an invalid path or filename is provided. """ try: - self.buffer.seek(0) resolved_path = Path(output_path).resolve() - with open(resolved_path, "wb") as file: - file.write(self.buffer.read()) - logger.info("File saved successfully to '%s'.", resolved_path) + if not file_format: + if len(resolved_path.suffix) < 1: + raise ValueError("Invalid file format.") + file_format = ( + resolved_path.suffix.upper() + ) # technically redundant since PIL applies an upper operation + # to the parameter , but older versions may not do so. + self.buffer.seek(0) + image = Image.open(self.buffer) + image.save(resolved_path, format=file_format) + logger.info("File saved successfully to '%s'.", resolved_path) except TypeError as exc: raise MindeeError("Invalid path/filename provided.") from exc except Exception as exc: diff --git a/mindee/image_extraction/common/image_extractor.py b/mindee/image_extraction/common/image_extractor.py index a23f4e8c..59ad8598 100644 --- a/mindee/image_extraction/common/image_extractor.py +++ b/mindee/image_extraction/common/image_extractor.py @@ -1,10 +1,11 @@ import io -from typing import BinaryIO, List +from pathlib import Path +from typing import BinaryIO, List, Union import pypdfium2 as pdfium from PIL import Image -from mindee.geometry import Polygon, get_min_max_x, get_min_max_y +from mindee.geometry import Point, get_min_max_x, get_min_max_y def attach_image_as_new_file( # type: ignore @@ -39,15 +40,35 @@ def attach_image_as_new_file( # type: ignore return pdf -def extract_from_page(pdf_page: pdfium.PdfPage, polygons: List[Polygon]) -> List[bytes]: # type: ignore +def extract_multiple_images_from_image( + image: Union[bytes, str, Path], polygons: List[List[Point]] +) -> List[Image.Image]: + """ + Extracts elements from an image based on a list of bounding boxes. + + :param image: Image as a path + :param polygons: List of coordinates to pull the elements from. + :return: List of byte arrays representing the extracted elements. + """ + return extract_multiple_images_from_page(Image.open(image), polygons) + + +def extract_multiple_images_from_page( # type: ignore + page: Union[pdfium.PdfPage, Image.Image], polygons: List[List[Point]] +) -> List[Image.Image]: """ Extracts elements from a page based on a list of bounding boxes. - :param pdf_page: Single PDF Page. + :param page: Single PDF Page. If the page is a pdfium.PdfPage, it is rasterized first. :param polygons: List of coordinates to pull the elements from. :return: List of byte arrays representing the extracted elements. """ - width, height = pdf_page.get_size() + if isinstance(page, pdfium.PdfPage): + page_content = page.render().to_pil() + width, height = page.get_size() + else: + page_content = page + width, height = page.size extracted_elements = [] for polygon in polygons: @@ -59,14 +80,8 @@ def extract_from_page(pdf_page: pdfium.PdfPage, polygons: List[Polygon]) -> List top = min_max_y.min * height bottom = min_max_y.max * height - # Note: cropping done via PIL instead of PyPDFium to simplify operations greatly. - cropped_content_pil = pdf_page.render().to_pil() - cropped_content_pil = cropped_content_pil.crop( - (int(left), int(top), int(right), int(bottom)) + extracted_elements.append( + page_content.crop((int(left), int(top), int(right), int(bottom))) ) - jpeg_buffer = io.BytesIO() - cropped_content_pil.save(jpeg_buffer, format="PDF") - jpeg_buffer.seek(0) - extracted_elements.append(jpeg_buffer.read()) return extracted_elements diff --git a/mindee/image_extraction/multi_receipts_extractor/__init__.py b/mindee/image_extraction/multi_receipts_extractor/__init__.py index dbf73576..1169000a 100644 --- a/mindee/image_extraction/multi_receipts_extractor/__init__.py +++ b/mindee/image_extraction/multi_receipts_extractor/__init__.py @@ -1,6 +1,3 @@ from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import ( ExtractedMultiReceiptsImage, ) -from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import ( - extract_receipts_from_page, -) diff --git a/mindee/image_extraction/multi_receipts_extractor/mult_receipts_extractor.py b/mindee/image_extraction/multi_receipts_extractor/mult_receipts_extractor.py index 89248316..2f40194c 100644 --- a/mindee/image_extraction/multi_receipts_extractor/mult_receipts_extractor.py +++ b/mindee/image_extraction/multi_receipts_extractor/mult_receipts_extractor.py @@ -1,14 +1,12 @@ -from typing import List, Union +import io +from typing import List import pypdfium2 as pdfium -from mindee.error import MimeTypeError, MindeeError -from mindee.geometry.point import Point -from mindee.geometry.polygon import Polygon -from mindee.geometry.quadrilateral import Quadrilateral +from mindee.error import MindeeError from mindee.image_extraction.common.image_extractor import ( attach_image_as_new_file, - extract_from_page, + extract_multiple_images_from_page, ) from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import ( ExtractedMultiReceiptsImage, @@ -17,30 +15,6 @@ from mindee.parsing.common import Inference -def extract_receipts_from_page( # type: ignore - pdf_page: pdfium.PdfPage, - bounding_boxes: List[Union[List[Point], Polygon, Quadrilateral]], - page_id: int, -) -> List[ExtractedMultiReceiptsImage]: - """ - Given a page and a set of coordinates, extracts & assigns individual receipts to an ExtractedMultiReceiptsImage\ - object. - - :param pdf_page: PDF Page to extract from. - :param bounding_boxes: A set of coordinates delimiting the position of each receipt. - :param page_id: ID of the page the receipt is extracted from. Caution: this starts at 0, unlike the numbering in PDF - pages. - :return: A list of ExtractedMultiReceiptsImage. - """ - extracted_receipts_raw = extract_from_page(pdf_page, bounding_boxes) # type: ignore - extracted_receipts = [] - for i, extracted_receipt_raw in enumerate(extracted_receipts_raw): - extracted_receipts.append( - ExtractedMultiReceiptsImage(extracted_receipt_raw, i, page_id) - ) - return extracted_receipts - - def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: ignore """ Loads a PDF document from a local input source. @@ -48,16 +22,6 @@ def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: i :param input_file: Local input. :return: A valid PdfDocument handle. """ - if input_file.file_mimetype not in [ - "application/pdf", - "image/heic", - "image/png", - "image/jpg", - "image/jpeg", - "image/tiff", - "image/webp", - ]: - raise MimeTypeError(f"Unsupported file type '{input_file.file_mimetype}'.") input_file.file_object.seek(0) if input_file.is_pdf(): return pdfium.PdfDocument(input_file.file_object) @@ -86,8 +50,14 @@ def extract_receipts( receipt.bounding_box for receipt in inference.pages[page_id].prediction.receipts ] - extracted_receipts = extract_receipts_from_page( - page, receipt_positions, page_id - ) + extracted_receipts = [] + receipts = extract_multiple_images_from_page(page, receipt_positions) + for receipt_id, receipt in enumerate(receipts): + buffer = io.BytesIO() + receipt.save(buffer, format="JPEG") + buffer.seek(0) + extracted_receipts.append( + ExtractedMultiReceiptsImage(buffer.read(), receipt_id, page_id) + ) images.extend(extracted_receipts) return images diff --git a/tests/image_extraction/test_image_extractor.py b/tests/image_extraction/test_image_extractor.py index bb368e49..99118fe0 100644 --- a/tests/image_extraction/test_image_extractor.py +++ b/tests/image_extraction/test_image_extractor.py @@ -1,42 +1,36 @@ -from io import BytesIO +import json -import pypdfium2 as pdfium import pytest from PIL import Image -from mindee.error import MimeTypeError +from mindee.image_extraction.common import extract_multiple_images_from_image +from mindee.input import PathInput +from mindee.product import BarcodeReaderV1 from tests.test_inputs import PRODUCT_DATA_DIR @pytest.fixture -def single_page_path(): - return PRODUCT_DATA_DIR / "multi_receipts_detector" / "default_sample.jpg" - +def barcode_path(): + return PRODUCT_DATA_DIR / "barcode_reader" / "default_sample.jpg" @pytest.fixture -def multiple_pages_path(): - return PRODUCT_DATA_DIR / "multi_receipts_detector" / "multipage_sample.pdf" - - -def test_get_images_mono_page(single_page_path): - with open(single_page_path, "rb") as f: - jpg_file = Image.open(single_page_path) - jpg_height = jpg_file.size[0] - jpg_width = jpg_file.size[1] - assert jpg_height == 3628 - assert jpg_width == 1552 - - -def test_get_images_multiple_pages(multiple_pages_path): - with open(multiple_pages_path, "rb") as f: - pdf = pdfium.PdfDocument(f) - pdf_images = [page.render().to_pil() for page in pdf] - height_page_0 = pdf_images[0].size[0] - width_page_0 = pdf_images[0].size[1] - assert height_page_0 == 595 - assert width_page_0 == 842 - - height_page_1 = pdf_images[1].size[0] - width_page_1 = pdf_images[1].size[1] - assert height_page_1 == 595 - assert width_page_1 == 842 +def barcode_json_path(): + return PRODUCT_DATA_DIR / "barcode_reader" / "response_v1" / "complete.json" + + +def test_barcode_image_extraction( + barcode_path, barcode_json_path +): + with open(barcode_json_path, "rb") as f: + response = json.load(f) + inference = BarcodeReaderV1(response["document"]["inference"]) + barcodes_1 = [code_1d.polygon for code_1d in inference.prediction.codes_1d] + barcodes_2 = [code_2d.polygon for code_2d in inference.prediction.codes_2d] + extracted_barcodes_1d = extract_multiple_images_from_image(barcode_path, barcodes_1) + extracted_barcodes_2d = extract_multiple_images_from_image(barcode_path, barcodes_2) + assert len(extracted_barcodes_1d) == 1 + assert len(extracted_barcodes_2d) == 2 + + assert extracted_barcodes_1d[0].size == (353, 200) + assert extracted_barcodes_2d[0].size == (214, 216) + assert extracted_barcodes_2d[1].size == (193, 201) diff --git a/tests/image_extraction/test_multi_receipts_extractor.py b/tests/image_extraction/test_multi_receipts_extractor.py index c1ced83d..9fc87e52 100644 --- a/tests/image_extraction/test_multi_receipts_extractor.py +++ b/tests/image_extraction/test_multi_receipts_extractor.py @@ -1,7 +1,7 @@ import json -import pypdfium2 as pdfium import pytest +from PIL import Image from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import ( extract_receipts, @@ -50,44 +50,32 @@ def test_single_page_multi_receipt_split( assert extracted_receipts[0].page_id == 0 assert extracted_receipts[0].receipt_id == 0 - image_buffer_0 = ( - pdfium.PdfDocument(extracted_receipts[0].buffer).get_page(0).render().to_pil() - ) + image_buffer_0 = Image.open(extracted_receipts[0].buffer) assert image_buffer_0.size == (341, 505) assert extracted_receipts[1].page_id == 0 assert extracted_receipts[1].receipt_id == 1 - image_buffer_1 = ( - pdfium.PdfDocument(extracted_receipts[1].buffer).get_page(0).render().to_pil() - ) + image_buffer_1 = Image.open(extracted_receipts[1].buffer) assert image_buffer_1.size == (461, 908) assert extracted_receipts[2].page_id == 0 assert extracted_receipts[2].receipt_id == 2 - image_buffer_2 = ( - pdfium.PdfDocument(extracted_receipts[2].buffer).get_page(0).render().to_pil() - ) + image_buffer_2 = Image.open(extracted_receipts[2].buffer) assert image_buffer_2.size == (471, 790) assert extracted_receipts[3].page_id == 0 assert extracted_receipts[3].receipt_id == 3 - image_buffer_3 = ( - pdfium.PdfDocument(extracted_receipts[3].buffer).get_page(0).render().to_pil() - ) + image_buffer_3 = Image.open(extracted_receipts[3].buffer) assert image_buffer_3.size == (464, 1200) assert extracted_receipts[4].page_id == 0 assert extracted_receipts[4].receipt_id == 4 - image_buffer_4 = ( - pdfium.PdfDocument(extracted_receipts[4].buffer).get_page(0).render().to_pil() - ) + image_buffer_4 = Image.open(extracted_receipts[4].buffer) assert image_buffer_4.size == (530, 943) assert extracted_receipts[5].page_id == 0 assert extracted_receipts[5].receipt_id == 5 - image_buffer_5 = ( - pdfium.PdfDocument(extracted_receipts[5].buffer).get_page(0).render().to_pil() - ) + image_buffer_5 = Image.open(extracted_receipts[5].buffer) assert image_buffer_5.size == (367, 593) @@ -103,35 +91,25 @@ def test_multi_page_receipt_split( assert extracted_receipts[0].page_id == 0 assert extracted_receipts[0].receipt_id == 0 - image_buffer_0 = ( - pdfium.PdfDocument(extracted_receipts[0].buffer).get_page(0).render().to_pil() - ) + image_buffer_0 = Image.open(extracted_receipts[0].buffer) assert image_buffer_0.size == (198, 566) assert extracted_receipts[1].page_id == 0 assert extracted_receipts[1].receipt_id == 1 - image_buffer_1 = ( - pdfium.PdfDocument(extracted_receipts[1].buffer).get_page(0).render().to_pil() - ) + image_buffer_1 = Image.open(extracted_receipts[1].buffer) assert image_buffer_1.size == (206, 382) assert extracted_receipts[2].page_id == 0 assert extracted_receipts[2].receipt_id == 2 - image_buffer_2 = ( - pdfium.PdfDocument(extracted_receipts[2].buffer).get_page(0).render().to_pil() - ) + image_buffer_2 = Image.open(extracted_receipts[2].buffer) assert image_buffer_2.size == (195, 231) assert extracted_receipts[3].page_id == 1 assert extracted_receipts[3].receipt_id == 0 - image_buffer_3 = ( - pdfium.PdfDocument(extracted_receipts[3].buffer).get_page(0).render().to_pil() - ) + image_buffer_3 = Image.open(extracted_receipts[3].buffer) assert image_buffer_3.size == (213, 356) assert extracted_receipts[4].page_id == 1 assert extracted_receipts[4].receipt_id == 1 - image_buffer_4 = ( - pdfium.PdfDocument(extracted_receipts[4].buffer).get_page(0).render().to_pil() - ) + image_buffer_4 = Image.open(extracted_receipts[4].buffer) assert image_buffer_4.size == (212, 516) From 775fe3e3b7c9d2dfb43778137913aa5a1929ff67 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Tue, 11 Jun 2024 09:33:04 +0200 Subject: [PATCH 07/13] fix lint --- tests/image_extraction/test_image_extractor.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/image_extraction/test_image_extractor.py b/tests/image_extraction/test_image_extractor.py index 99118fe0..4eb4d311 100644 --- a/tests/image_extraction/test_image_extractor.py +++ b/tests/image_extraction/test_image_extractor.py @@ -13,14 +13,13 @@ def barcode_path(): return PRODUCT_DATA_DIR / "barcode_reader" / "default_sample.jpg" + @pytest.fixture def barcode_json_path(): return PRODUCT_DATA_DIR / "barcode_reader" / "response_v1" / "complete.json" -def test_barcode_image_extraction( - barcode_path, barcode_json_path -): +def test_barcode_image_extraction(barcode_path, barcode_json_path): with open(barcode_json_path, "rb") as f: response = json.load(f) inference = BarcodeReaderV1(response["document"]["inference"]) From 87de9b720cda2ab48cd3da79ed6e5e8124a0cd6f Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Tue, 11 Jun 2024 14:42:54 +0200 Subject: [PATCH 08/13] fix a few suggestions --- examples/multi_receipts_tutorial.py | 2 +- .../image_extraction/common/extracted_image.py | 3 +++ .../image_extraction/common/image_extractor.py | 15 +++++++++++++++ .../extracted_multi_receipt_image.py | 1 - ...xtractor.py => multi_receipts_extractor.py} | 18 +----------------- .../test_multi_receipts_extractor.py | 2 +- 6 files changed, 21 insertions(+), 20 deletions(-) rename mindee/image_extraction/multi_receipts_extractor/{mult_receipts_extractor.py => multi_receipts_extractor.py} (78%) diff --git a/examples/multi_receipts_tutorial.py b/examples/multi_receipts_tutorial.py index 5bdc7c87..67ed1bc4 100644 --- a/examples/multi_receipts_tutorial.py +++ b/examples/multi_receipts_tutorial.py @@ -1,5 +1,5 @@ from mindee import Client, PredictResponse, product -from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import ( +from mindee.image_extraction.multi_receipts_extractor.multi_receipts_extractor import ( extract_receipts, ) diff --git a/mindee/image_extraction/common/extracted_image.py b/mindee/image_extraction/common/extracted_image.py index bcbe36d1..9817e324 100644 --- a/mindee/image_extraction/common/extracted_image.py +++ b/mindee/image_extraction/common/extracted_image.py @@ -12,6 +12,9 @@ class ExtractedImage: """Generic class for image extraction.""" + page_id: int + """Id of the page the image was extracted from.""" + def __init__(self, buffer: bytes, file_name: str): """ Initialize the ExtractedImage with a buffer and an internal file name. diff --git a/mindee/image_extraction/common/image_extractor.py b/mindee/image_extraction/common/image_extractor.py index 59ad8598..88f53466 100644 --- a/mindee/image_extraction/common/image_extractor.py +++ b/mindee/image_extraction/common/image_extractor.py @@ -6,6 +6,7 @@ from PIL import Image from mindee.geometry import Point, get_min_max_x, get_min_max_y +from mindee.input import LocalInputSource def attach_image_as_new_file( # type: ignore @@ -85,3 +86,17 @@ def extract_multiple_images_from_page( # type: ignore ) return extracted_elements + + +def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: ignore + """ + Loads a PDF document from a local input source. + + :param input_file: Local input. + :return: A valid PdfDocument handle. + """ + input_file.file_object.seek(0) + if input_file.is_pdf(): + return pdfium.PdfDocument(input_file.file_object) + + return attach_image_as_new_file(input_file.file_object) diff --git a/mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py b/mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py index 6c2b2ee1..388c3538 100644 --- a/mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py +++ b/mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py @@ -5,7 +5,6 @@ class ExtractedMultiReceiptsImage(ExtractedImage): """Wrapper class for extracted multiple-receipts images.""" _receipt_id: int - _page_id: int def __init__(self, buffer, receipt_id: int, page_id: int): super().__init__(buffer, f"receipt_p{page_id}_{receipt_id}.pdf") diff --git a/mindee/image_extraction/multi_receipts_extractor/mult_receipts_extractor.py b/mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.py similarity index 78% rename from mindee/image_extraction/multi_receipts_extractor/mult_receipts_extractor.py rename to mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.py index 2f40194c..f9252d4f 100644 --- a/mindee/image_extraction/multi_receipts_extractor/mult_receipts_extractor.py +++ b/mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.py @@ -1,12 +1,10 @@ import io from typing import List -import pypdfium2 as pdfium - from mindee.error import MindeeError from mindee.image_extraction.common.image_extractor import ( - attach_image_as_new_file, extract_multiple_images_from_page, + load_pdf_doc, ) from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import ( ExtractedMultiReceiptsImage, @@ -15,20 +13,6 @@ from mindee.parsing.common import Inference -def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: ignore - """ - Loads a PDF document from a local input source. - - :param input_file: Local input. - :return: A valid PdfDocument handle. - """ - input_file.file_object.seek(0) - if input_file.is_pdf(): - return pdfium.PdfDocument(input_file.file_object) - - return attach_image_as_new_file(input_file.file_object) - - def extract_receipts( input_file: LocalInputSource, inference: Inference ) -> List[ExtractedMultiReceiptsImage]: diff --git a/tests/image_extraction/test_multi_receipts_extractor.py b/tests/image_extraction/test_multi_receipts_extractor.py index 9fc87e52..76fbc2e4 100644 --- a/tests/image_extraction/test_multi_receipts_extractor.py +++ b/tests/image_extraction/test_multi_receipts_extractor.py @@ -3,7 +3,7 @@ import pytest from PIL import Image -from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import ( +from mindee.image_extraction.multi_receipts_extractor.multi_receipts_extractor import ( extract_receipts, ) from mindee.input import PathInput From 087287dfbbb194f96056db429cac5eac0f50b59c Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Tue, 11 Jun 2024 14:49:44 +0200 Subject: [PATCH 09/13] make more things generic --- .../common/extracted_image.py | 25 ++++++++++++++++--- .../extracted_multi_receipt_image.py | 16 ++---------- .../multi_receipts_extractor.py | 4 ++- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/mindee/image_extraction/common/extracted_image.py b/mindee/image_extraction/common/extracted_image.py index 9817e324..23ccc1a1 100644 --- a/mindee/image_extraction/common/extracted_image.py +++ b/mindee/image_extraction/common/extracted_image.py @@ -12,10 +12,18 @@ class ExtractedImage: """Generic class for image extraction.""" - page_id: int + _page_id: int """Id of the page the image was extracted from.""" + _element_id: int + """Id of the element on a given page.""" - def __init__(self, buffer: bytes, file_name: str): + def __init__( + self, + buffer: bytes, + file_name: str, + page_id: int, + element_id: Optional[int] = None, + ) -> None: """ Initialize the ExtractedImage with a buffer and an internal file name. @@ -23,8 +31,10 @@ def __init__(self, buffer: bytes, file_name: str): :param file_name: The internal file name of the image. """ self.buffer = io.BytesIO(buffer) - self.internal_file_name = file_name + self.internal_file_name = f"{file_name}_p{page_id}_{element_id}.pdf" self.buffer.name = self.internal_file_name + self._page_id = page_id + self._element_id = 0 if element_id is None else element_id def save_to_file(self, output_path: str, file_format: Optional[str] = None): """ @@ -59,3 +69,12 @@ def as_source(self) -> FileInput: :returns: A BufferInput source. """ return FileInput(self.buffer) + + @property + def page_id(self): + """ + ID of the page the receipt was found on. + + :return: + """ + return self._page_id diff --git a/mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py b/mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py index 388c3538..32a3feab 100644 --- a/mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py +++ b/mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py @@ -4,12 +4,9 @@ class ExtractedMultiReceiptsImage(ExtractedImage): """Wrapper class for extracted multiple-receipts images.""" - _receipt_id: int - - def __init__(self, buffer, receipt_id: int, page_id: int): - super().__init__(buffer, f"receipt_p{page_id}_{receipt_id}.pdf") + def __init__(self, buffer, file_name: str, receipt_id: int, page_id: int): + super().__init__(buffer, file_name, page_id) self._receipt_id = receipt_id - self._page_id = page_id @property def receipt_id(self): @@ -19,12 +16,3 @@ def receipt_id(self): :return: """ return self._receipt_id - - @property - def page_id(self): - """ - ID of the page the receipt was found on. - - :return: - """ - return self._page_id diff --git a/mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.py b/mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.py index f9252d4f..54d92bf9 100644 --- a/mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.py +++ b/mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.py @@ -41,7 +41,9 @@ def extract_receipts( receipt.save(buffer, format="JPEG") buffer.seek(0) extracted_receipts.append( - ExtractedMultiReceiptsImage(buffer.read(), receipt_id, page_id) + ExtractedMultiReceiptsImage( + buffer.read(), input_file.filename, receipt_id, page_id + ) ) images.extend(extracted_receipts) return images From 7d03c61ec889ad7786ebcd552bc1a3c2e0db9801 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Tue, 11 Jun 2024 15:28:14 +0200 Subject: [PATCH 10/13] update test lib --- tests/data | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data b/tests/data index 6b2f8563..1cc324c3 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit 6b2f85639465c878e70a59337394a8adf14a0b16 +Subproject commit 1cc324c3f4b2e9f9417268552532d2860f8edaa4 From 836bf656169c710be3f5787678a996e769290c4f Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Wed, 12 Jun 2024 15:13:22 +0200 Subject: [PATCH 11/13] update syntaxes --- .../common/extracted_image.py | 19 ++++++++----------- .../extracted_multi_receipt_image.py | 10 ++++++++-- .../multi_receipts_extractor.py | 7 +++---- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/mindee/image_extraction/common/extracted_image.py b/mindee/image_extraction/common/extracted_image.py index 23ccc1a1..a20fbe17 100644 --- a/mindee/image_extraction/common/extracted_image.py +++ b/mindee/image_extraction/common/extracted_image.py @@ -5,7 +5,7 @@ from PIL import Image from mindee.error import MindeeError -from mindee.input import FileInput +from mindee.input import FileInput, LocalInputSource from mindee.logger import logger @@ -18,21 +18,18 @@ class ExtractedImage: """Id of the element on a given page.""" def __init__( - self, - buffer: bytes, - file_name: str, - page_id: int, - element_id: Optional[int] = None, + self, input_source: LocalInputSource, page_id: int, element_id: int ) -> None: """ Initialize the ExtractedImage with a buffer and an internal file name. - :param buffer: The byte buffer representing the image. - :param file_name: The internal file name of the image. + :param input_source: Local source for input. + :param page_id: ID of the page the element was found on. + :param element_id: ID of the element in a page. """ - self.buffer = io.BytesIO(buffer) - self.internal_file_name = f"{file_name}_p{page_id}_{element_id}.pdf" - self.buffer.name = self.internal_file_name + self.buffer = input_source.file_object + self.buffer.seek(0) + self.internal_file_name = f"{input_source.filename}_p{page_id}_{element_id}.pdf" self._page_id = page_id self._element_id = 0 if element_id is None else element_id diff --git a/mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py b/mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py index 32a3feab..88a38ca9 100644 --- a/mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py +++ b/mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py @@ -1,11 +1,17 @@ from mindee.image_extraction.common import ExtractedImage +from mindee.input import LocalInputSource class ExtractedMultiReceiptsImage(ExtractedImage): """Wrapper class for extracted multiple-receipts images.""" - def __init__(self, buffer, file_name: str, receipt_id: int, page_id: int): - super().__init__(buffer, file_name, page_id) + def __init__( + self, + input_source: LocalInputSource, + page_id: int, + receipt_id: int, + ): + super().__init__(input_source, page_id, receipt_id) self._receipt_id = receipt_id @property diff --git a/mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.py b/mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.py index 54d92bf9..b68529a4 100644 --- a/mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.py +++ b/mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.py @@ -9,7 +9,7 @@ from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import ( ExtractedMultiReceiptsImage, ) -from mindee.input import LocalInputSource +from mindee.input import BytesInput, LocalInputSource from mindee.parsing.common import Inference @@ -40,10 +40,9 @@ def extract_receipts( buffer = io.BytesIO() receipt.save(buffer, format="JPEG") buffer.seek(0) + input_source = BytesInput(buffer.read(), input_file.filename) extracted_receipts.append( - ExtractedMultiReceiptsImage( - buffer.read(), input_file.filename, receipt_id, page_id - ) + ExtractedMultiReceiptsImage(input_source, page_id, receipt_id) ) images.extend(extracted_receipts) return images From a4810a881f2f74d30b847bf6a757a54ae21851f7 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Wed, 12 Jun 2024 16:14:53 +0200 Subject: [PATCH 12/13] apply many suggestions --- mindee/image_extraction/common/__init__.py | 3 +- .../common/extracted_image.py | 14 +++++++- .../common/image_extractor.py | 34 +++++-------------- .../multi_receipts_extractor/__init__.py | 4 +-- .../extracted_multi_receipt_image.py | 24 ------------- .../multi_receipts_extractor.py | 28 +++++++-------- mindee/input/sources.py | 8 +++-- .../image_extraction/test_image_extractor.py | 11 ++++-- .../test_multi_receipts_extractor.py | 22 ++++++------ 9 files changed, 60 insertions(+), 88 deletions(-) delete mode 100644 mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py diff --git a/mindee/image_extraction/common/__init__.py b/mindee/image_extraction/common/__init__.py index befbed4f..22267c2c 100644 --- a/mindee/image_extraction/common/__init__.py +++ b/mindee/image_extraction/common/__init__.py @@ -1,6 +1,5 @@ from mindee.image_extraction.common.extracted_image import ExtractedImage from mindee.image_extraction.common.image_extractor import ( attach_image_as_new_file, - extract_multiple_images_from_image, - extract_multiple_images_from_page, + extract_multiple_images_from_source, ) diff --git a/mindee/image_extraction/common/extracted_image.py b/mindee/image_extraction/common/extracted_image.py index a20fbe17..b835ab2e 100644 --- a/mindee/image_extraction/common/extracted_image.py +++ b/mindee/image_extraction/common/extracted_image.py @@ -27,7 +27,8 @@ def __init__( :param page_id: ID of the page the element was found on. :param element_id: ID of the element in a page. """ - self.buffer = input_source.file_object + self.buffer = io.BytesIO(input_source.file_object.read()) + self.buffer.name = input_source.filename self.buffer.seek(0) self.internal_file_name = f"{input_source.filename}_p{page_id}_{element_id}.pdf" self._page_id = page_id @@ -42,6 +43,7 @@ def save_to_file(self, output_path: str, file_format: Optional[str] = None): :raises MindeeError: If an invalid path or filename is provided. """ try: + print(f"SAVING {self.internal_file_name}") resolved_path = Path(output_path).resolve() if not file_format: if len(resolved_path.suffix) < 1: @@ -65,6 +67,7 @@ def as_source(self) -> FileInput: :returns: A BufferInput source. """ + self.buffer.seek(0) return FileInput(self.buffer) @property @@ -75,3 +78,12 @@ def page_id(self): :return: """ return self._page_id + + @property + def element_id(self): + """ + Id of the element on a given page. + + :return: + """ + return self._element_id diff --git a/mindee/image_extraction/common/image_extractor.py b/mindee/image_extraction/common/image_extractor.py index 88f53466..6b8e8e2a 100644 --- a/mindee/image_extraction/common/image_extractor.py +++ b/mindee/image_extraction/common/image_extractor.py @@ -1,6 +1,5 @@ import io -from pathlib import Path -from typing import BinaryIO, List, Union +from typing import BinaryIO, List import pypdfium2 as pdfium from PIL import Image @@ -41,35 +40,20 @@ def attach_image_as_new_file( # type: ignore return pdf -def extract_multiple_images_from_image( - image: Union[bytes, str, Path], polygons: List[List[Point]] -) -> List[Image.Image]: - """ - Extracts elements from an image based on a list of bounding boxes. - - :param image: Image as a path - :param polygons: List of coordinates to pull the elements from. - :return: List of byte arrays representing the extracted elements. - """ - return extract_multiple_images_from_page(Image.open(image), polygons) - - -def extract_multiple_images_from_page( # type: ignore - page: Union[pdfium.PdfPage, Image.Image], polygons: List[List[Point]] +def extract_multiple_images_from_source( + input_source: LocalInputSource, page_id: int, polygons: List[List[Point]] ) -> List[Image.Image]: """ Extracts elements from a page based on a list of bounding boxes. - :param page: Single PDF Page. If the page is a pdfium.PdfPage, it is rasterized first. + :param input_source: Local Input source to extract elements from. + :param page_id: id of the page to extract from. :param polygons: List of coordinates to pull the elements from. :return: List of byte arrays representing the extracted elements. """ - if isinstance(page, pdfium.PdfPage): - page_content = page.render().to_pil() - width, height = page.get_size() - else: - page_content = page - width, height = page.size + page = load_pdf_doc(input_source).get_page(page_id) + page_content = page.render().to_pil() + width, height = page.get_size() extracted_elements = [] for polygon in polygons: @@ -95,8 +79,8 @@ def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: i :param input_file: Local input. :return: A valid PdfDocument handle. """ - input_file.file_object.seek(0) if input_file.is_pdf(): + input_file.file_object.seek(0) return pdfium.PdfDocument(input_file.file_object) return attach_image_as_new_file(input_file.file_object) diff --git a/mindee/image_extraction/multi_receipts_extractor/__init__.py b/mindee/image_extraction/multi_receipts_extractor/__init__.py index 1169000a..4c234ce4 100644 --- a/mindee/image_extraction/multi_receipts_extractor/__init__.py +++ b/mindee/image_extraction/multi_receipts_extractor/__init__.py @@ -1,3 +1 @@ -from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import ( - ExtractedMultiReceiptsImage, -) +from mindee.image_extraction.multi_receipts_extractor import multi_receipts_extractor diff --git a/mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py b/mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py deleted file mode 100644 index 88a38ca9..00000000 --- a/mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py +++ /dev/null @@ -1,24 +0,0 @@ -from mindee.image_extraction.common import ExtractedImage -from mindee.input import LocalInputSource - - -class ExtractedMultiReceiptsImage(ExtractedImage): - """Wrapper class for extracted multiple-receipts images.""" - - def __init__( - self, - input_source: LocalInputSource, - page_id: int, - receipt_id: int, - ): - super().__init__(input_source, page_id, receipt_id) - self._receipt_id = receipt_id - - @property - def receipt_id(self): - """ - ID of the receipt on a given page. - - :return: - """ - return self._receipt_id diff --git a/mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.py b/mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.py index b68529a4..a8990ec4 100644 --- a/mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.py +++ b/mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.py @@ -2,47 +2,43 @@ from typing import List from mindee.error import MindeeError +from mindee.image_extraction.common.extracted_image import ExtractedImage from mindee.image_extraction.common.image_extractor import ( - extract_multiple_images_from_page, - load_pdf_doc, -) -from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import ( - ExtractedMultiReceiptsImage, + extract_multiple_images_from_source, ) from mindee.input import BytesInput, LocalInputSource from mindee.parsing.common import Inference def extract_receipts( - input_file: LocalInputSource, inference: Inference -) -> List[ExtractedMultiReceiptsImage]: + input_source: LocalInputSource, inference: Inference +) -> List[ExtractedImage]: """ Extracts individual receipts from multi-receipts documents. - :param input_file: File to extract sub-receipts from. + :param input_source: Local Input Source to extract sub-receipts from. :param inference: Results of the inference. :return: Individual extracted receipts as an array of ExtractedMultiReceiptsImage. """ - images: List[ExtractedMultiReceiptsImage] = [] + images: List[ExtractedImage] = [] if not inference.prediction.receipts: raise MindeeError( "No possible receipts candidates found for MultiReceipts extraction." ) - pdf_doc = load_pdf_doc(input_file) - for page_id, page in enumerate(pdf_doc): + for page_id in range(input_source.count_doc_pages()): receipt_positions = [ receipt.bounding_box for receipt in inference.pages[page_id].prediction.receipts ] extracted_receipts = [] - receipts = extract_multiple_images_from_page(page, receipt_positions) + receipts = extract_multiple_images_from_source( + input_source, page_id, receipt_positions + ) for receipt_id, receipt in enumerate(receipts): buffer = io.BytesIO() receipt.save(buffer, format="JPEG") buffer.seek(0) - input_source = BytesInput(buffer.read(), input_file.filename) - extracted_receipts.append( - ExtractedMultiReceiptsImage(input_source, page_id, receipt_id) - ) + bytes_input = BytesInput(buffer.read(), input_source.filename) + extracted_receipts.append(ExtractedImage(bytes_input, page_id, receipt_id)) images.extend(extracted_receipts) return images diff --git a/mindee/input/sources.py b/mindee/input/sources.py index 5978d17d..fd0f9831 100644 --- a/mindee/input/sources.py +++ b/mindee/input/sources.py @@ -116,9 +116,11 @@ def count_doc_pages(self) -> int: :return: the number of pages. """ - self.file_object.seek(0) - pdf = pdfium.PdfDocument(self.file_object) - return len(pdf) + if self.is_pdf(): + self.file_object.seek(0) + pdf = pdfium.PdfDocument(self.file_object) + return len(pdf) + return 1 def process_pdf( self, diff --git a/tests/image_extraction/test_image_extractor.py b/tests/image_extraction/test_image_extractor.py index 4eb4d311..3dbbb1d1 100644 --- a/tests/image_extraction/test_image_extractor.py +++ b/tests/image_extraction/test_image_extractor.py @@ -3,7 +3,7 @@ import pytest from PIL import Image -from mindee.image_extraction.common import extract_multiple_images_from_image +from mindee.image_extraction.common import extract_multiple_images_from_source from mindee.input import PathInput from mindee.product import BarcodeReaderV1 from tests.test_inputs import PRODUCT_DATA_DIR @@ -25,8 +25,13 @@ def test_barcode_image_extraction(barcode_path, barcode_json_path): inference = BarcodeReaderV1(response["document"]["inference"]) barcodes_1 = [code_1d.polygon for code_1d in inference.prediction.codes_1d] barcodes_2 = [code_2d.polygon for code_2d in inference.prediction.codes_2d] - extracted_barcodes_1d = extract_multiple_images_from_image(barcode_path, barcodes_1) - extracted_barcodes_2d = extract_multiple_images_from_image(barcode_path, barcodes_2) + input_source = PathInput(barcode_path) + extracted_barcodes_1d = extract_multiple_images_from_source( + input_source, 0, barcodes_1 + ) + extracted_barcodes_2d = extract_multiple_images_from_source( + input_source, 0, barcodes_2 + ) assert len(extracted_barcodes_1d) == 1 assert len(extracted_barcodes_2d) == 2 diff --git a/tests/image_extraction/test_multi_receipts_extractor.py b/tests/image_extraction/test_multi_receipts_extractor.py index 76fbc2e4..16e5d994 100644 --- a/tests/image_extraction/test_multi_receipts_extractor.py +++ b/tests/image_extraction/test_multi_receipts_extractor.py @@ -49,32 +49,32 @@ def test_single_page_multi_receipt_split( assert len(extracted_receipts) == 6 assert extracted_receipts[0].page_id == 0 - assert extracted_receipts[0].receipt_id == 0 + assert extracted_receipts[0].element_id == 0 image_buffer_0 = Image.open(extracted_receipts[0].buffer) assert image_buffer_0.size == (341, 505) assert extracted_receipts[1].page_id == 0 - assert extracted_receipts[1].receipt_id == 1 + assert extracted_receipts[1].element_id == 1 image_buffer_1 = Image.open(extracted_receipts[1].buffer) assert image_buffer_1.size == (461, 908) assert extracted_receipts[2].page_id == 0 - assert extracted_receipts[2].receipt_id == 2 + assert extracted_receipts[2].element_id == 2 image_buffer_2 = Image.open(extracted_receipts[2].buffer) assert image_buffer_2.size == (471, 790) assert extracted_receipts[3].page_id == 0 - assert extracted_receipts[3].receipt_id == 3 + assert extracted_receipts[3].element_id == 3 image_buffer_3 = Image.open(extracted_receipts[3].buffer) assert image_buffer_3.size == (464, 1200) assert extracted_receipts[4].page_id == 0 - assert extracted_receipts[4].receipt_id == 4 + assert extracted_receipts[4].element_id == 4 image_buffer_4 = Image.open(extracted_receipts[4].buffer) assert image_buffer_4.size == (530, 943) assert extracted_receipts[5].page_id == 0 - assert extracted_receipts[5].receipt_id == 5 + assert extracted_receipts[5].element_id == 5 image_buffer_5 = Image.open(extracted_receipts[5].buffer) assert image_buffer_5.size == (367, 593) @@ -90,26 +90,26 @@ def test_multi_page_receipt_split( assert len(extracted_receipts) == 5 assert extracted_receipts[0].page_id == 0 - assert extracted_receipts[0].receipt_id == 0 + assert extracted_receipts[0].element_id == 0 image_buffer_0 = Image.open(extracted_receipts[0].buffer) assert image_buffer_0.size == (198, 566) assert extracted_receipts[1].page_id == 0 - assert extracted_receipts[1].receipt_id == 1 + assert extracted_receipts[1].element_id == 1 image_buffer_1 = Image.open(extracted_receipts[1].buffer) assert image_buffer_1.size == (206, 382) assert extracted_receipts[2].page_id == 0 - assert extracted_receipts[2].receipt_id == 2 + assert extracted_receipts[2].element_id == 2 image_buffer_2 = Image.open(extracted_receipts[2].buffer) assert image_buffer_2.size == (195, 231) assert extracted_receipts[3].page_id == 1 - assert extracted_receipts[3].receipt_id == 0 + assert extracted_receipts[3].element_id == 0 image_buffer_3 = Image.open(extracted_receipts[3].buffer) assert image_buffer_3.size == (213, 356) assert extracted_receipts[4].page_id == 1 - assert extracted_receipts[4].receipt_id == 1 + assert extracted_receipts[4].element_id == 1 image_buffer_4 = Image.open(extracted_receipts[4].buffer) assert image_buffer_4.size == (212, 516) From 0e267092a2a30692b6532b6ca92551cb5336f557 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Wed, 12 Jun 2024 16:41:49 +0200 Subject: [PATCH 13/13] simplify extractedimage creation --- .../common/image_extractor.py | 32 +++++++++++++------ .../multi_receipts_extractor.py | 17 +++------- .../image_extraction/test_image_extractor.py | 6 ++-- 3 files changed, 31 insertions(+), 24 deletions(-) diff --git a/mindee/image_extraction/common/image_extractor.py b/mindee/image_extraction/common/image_extractor.py index 6b8e8e2a..f703f95e 100644 --- a/mindee/image_extraction/common/image_extractor.py +++ b/mindee/image_extraction/common/image_extractor.py @@ -5,7 +5,8 @@ from PIL import Image from mindee.geometry import Point, get_min_max_x, get_min_max_y -from mindee.input import LocalInputSource +from mindee.image_extraction.common import ExtractedImage +from mindee.input import BytesInput, LocalInputSource def attach_image_as_new_file( # type: ignore @@ -42,7 +43,7 @@ def attach_image_as_new_file( # type: ignore def extract_multiple_images_from_source( input_source: LocalInputSource, page_id: int, polygons: List[List[Point]] -) -> List[Image.Image]: +) -> List[ExtractedImage]: """ Extracts elements from a page based on a list of bounding boxes. @@ -56,17 +57,30 @@ def extract_multiple_images_from_source( width, height = page.get_size() extracted_elements = [] - for polygon in polygons: + for element_id, polygon in enumerate(polygons): min_max_x = get_min_max_x(polygon) min_max_y = get_min_max_y(polygon) - left = min_max_x.min * width - right = min_max_x.max * width - top = min_max_y.min * height - bottom = min_max_y.max * height - + pillow_page = page_content.crop( + ( + int(min_max_x.min * width), + int(min_max_y.min * height), + int(min_max_x.max * width), + int(min_max_y.max * height), + ) + ) + buffer = io.BytesIO() + pillow_page.save(buffer, format="JPEG") + buffer.seek(0) extracted_elements.append( - page_content.crop((int(left), int(top), int(right), int(bottom))) + ExtractedImage( + BytesInput( + buffer.read(), + f"{input_source.filename}_p{page_id}_e{element_id}.jpg", + ), + page_id, + element_id, + ) ) return extracted_elements diff --git a/mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.py b/mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.py index a8990ec4..cd2008c3 100644 --- a/mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.py +++ b/mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.py @@ -1,4 +1,3 @@ -import io from typing import List from mindee.error import MindeeError @@ -6,7 +5,7 @@ from mindee.image_extraction.common.image_extractor import ( extract_multiple_images_from_source, ) -from mindee.input import BytesInput, LocalInputSource +from mindee.input import LocalInputSource from mindee.parsing.common import Inference @@ -30,15 +29,9 @@ def extract_receipts( receipt.bounding_box for receipt in inference.pages[page_id].prediction.receipts ] - extracted_receipts = [] - receipts = extract_multiple_images_from_source( - input_source, page_id, receipt_positions + images.extend( + extract_multiple_images_from_source( + input_source, page_id, receipt_positions + ) ) - for receipt_id, receipt in enumerate(receipts): - buffer = io.BytesIO() - receipt.save(buffer, format="JPEG") - buffer.seek(0) - bytes_input = BytesInput(buffer.read(), input_source.filename) - extracted_receipts.append(ExtractedImage(bytes_input, page_id, receipt_id)) - images.extend(extracted_receipts) return images diff --git a/tests/image_extraction/test_image_extractor.py b/tests/image_extraction/test_image_extractor.py index 3dbbb1d1..8e0e0404 100644 --- a/tests/image_extraction/test_image_extractor.py +++ b/tests/image_extraction/test_image_extractor.py @@ -35,6 +35,6 @@ def test_barcode_image_extraction(barcode_path, barcode_json_path): assert len(extracted_barcodes_1d) == 1 assert len(extracted_barcodes_2d) == 2 - assert extracted_barcodes_1d[0].size == (353, 200) - assert extracted_barcodes_2d[0].size == (214, 216) - assert extracted_barcodes_2d[1].size == (193, 201) + assert Image.open(extracted_barcodes_1d[0].buffer).size == (353, 200) + assert Image.open(extracted_barcodes_2d[0].buffer).size == (214, 216) + assert Image.open(extracted_barcodes_2d[1].buffer).size == (193, 201)