# Tutorial: split a multi-receipts document into individual receipts, then
# send each extracted receipt to the receipt API.
from mindee import Client, PredictResponse, product
from mindee.image_extraction.multi_receipts_extractor.multi_receipts_extractor import (
    extract_receipts,
)

# Init a new client
mindee_client = Client()

# Load a file from disk
input_doc = mindee_client.source_from_path("path/to/your/file.ext")

# NOTE(review): close_file=False presumably keeps the input open so that
# extract_receipts() can read it again below -- confirm against Client.parse.
result_split: PredictResponse = mindee_client.parse(
    product.MultiReceiptsDetectorV1, input_doc, close_file=False
)

extracted_receipts = extract_receipts(input_doc, result_split.document.inference)
for receipt in extracted_receipts:
    # Optionally, save each extracted receipt to disk first:
    # receipt.save_to_file(f"./{receipt.internal_file_name}")
    receipt_as_source = receipt.as_source()
    result_receipt = mindee_client.parse(product.ReceiptV5, receipt_as_source)
    print(result_receipt.document)
class ExtractedImage:
    """Generic class for image extraction."""

    _page_id: int
    """Id of the page the image was extracted from."""
    _element_id: int
    """Id of the element on a given page."""

    def __init__(
        self, input_source: LocalInputSource, page_id: int, element_id: int
    ) -> None:
        """
        Initialize the ExtractedImage with a buffer and an internal file name.

        The content of the input source is copied into an in-memory buffer.

        :param input_source: Local source for input.
        :param page_id: ID of the page the element was found on.
        :param element_id: ID of the element in a page. ``None`` is treated as 0.
        """
        self._page_id = page_id
        # Normalise before building the file name, so that a missing element ID
        # shows up as "0" in the name as well as in the property.
        self._element_id = 0 if element_id is None else element_id

        # Rewind first: the source may already have been read by the caller.
        input_source.file_object.seek(0)
        self.buffer = io.BytesIO(input_source.file_object.read())
        self.buffer.name = input_source.filename
        self.buffer.seek(0)
        self.internal_file_name = (
            f"{input_source.filename}_p{page_id}_{self._element_id}.pdf"
        )

    def save_to_file(self, output_path: str, file_format: Optional[str] = None):
        """
        Saves the document to a file.

        :param output_path: Path to save the file to.
        :param file_format: Optional PIL-compatible format for the file. Inferred from file extension if not provided.
        :raises MindeeError: If an invalid path or filename is provided, or if the file could not be written.
        """
        try:
            resolved_path = Path(output_path).resolve()
            if not file_format:
                if not resolved_path.suffix:
                    raise ValueError("Invalid file format.")
                # Leave the format unset and let PIL map the extension to a
                # format name: the raw suffix (e.g. ".JPG") keeps its leading
                # dot and is not a valid PIL format identifier.
                file_format = None
            self.buffer.seek(0)
            image = Image.open(self.buffer)
            image.save(resolved_path, format=file_format)
            logger.info("File saved successfully to '%s'.", resolved_path)
        except TypeError as exc:
            raise MindeeError("Invalid path/filename provided.") from exc
        except Exception as exc:
            raise MindeeError(f"Could not save file {Path(output_path).name}.") from exc

    def as_source(self) -> FileInput:
        """
        Return the file as a Mindee-compatible FileInput source.

        The internal buffer is rewound before being wrapped.

        :returns: A FileInput source wrapping the internal buffer.
        """
        self.buffer.seek(0)
        return FileInput(self.buffer)

    @property
    def page_id(self) -> int:
        """
        ID of the page the element was found on.

        :return: The page ID given at construction.
        """
        return self._page_id

    @property
    def element_id(self) -> int:
        """
        Id of the element on a given page.

        :return: The element ID given at construction, or 0 if it was ``None``.
        """
        return self._element_id
def attach_image_as_new_file(  # type: ignore
    input_buffer: BinaryIO,
) -> pdfium.PdfDocument:
    """
    Attaches an image as a new page in a PdfDocument object.

    The image is re-encoded as an RGB JPEG before being embedded.

    :param input_buffer: Input buffer holding an image that PIL can open.
    :return: A PdfDocument handle.
    """
    input_buffer.seek(0)
    image = Image.open(input_buffer)
    # convert() returns a new image instead of converting in place, so the
    # result has to be kept; otherwise non-RGB inputs are saved unconverted.
    image = image.convert("RGB")
    image_buffer = io.BytesIO()
    image.save(image_buffer, format="JPEG")

    pdf = pdfium.PdfDocument.new()

    image_pdf = pdfium.PdfImage.new(pdf)
    image_pdf.load_jpeg(image_buffer)
    width, height = image_pdf.get_size()

    # Scale the image object to its own size; the page below is created with
    # the same dimensions.
    matrix = pdfium.PdfMatrix().scale(width, height)
    image_pdf.set_matrix(matrix)

    page = pdf.new_page(width, height)
    page.insert_obj(image_pdf)
    page.gen_content()
    image.close()
    return pdf


def extract_multiple_images_from_source(
    input_source: LocalInputSource, page_id: int, polygons: List[List[Point]]
) -> List[ExtractedImage]:
    """
    Extracts elements from a page based on a list of bounding boxes.

    :param input_source: Local Input source to extract elements from.
    :param page_id: id of the page to extract from.
    :param polygons: List of coordinates to pull the elements from.
    :return: List of extracted elements, one ExtractedImage per polygon, in input order.
    """
    page = load_pdf_doc(input_source).get_page(page_id)
    # Render the page once; every element is cropped from this same bitmap.
    page_content = page.render().to_pil()
    width, height = page.get_size()

    extracted_elements = []
    for element_id, polygon in enumerate(polygons):
        min_max_x = get_min_max_x(polygon)
        min_max_y = get_min_max_y(polygon)

        # Coordinates are scaled by the page size, so polygons are presumably
        # expressed relative to the page (0 to 1) -- TODO confirm.
        pillow_page = page_content.crop(
            (
                int(min_max_x.min * width),
                int(min_max_y.min * height),
                int(min_max_x.max * width),
                int(min_max_y.max * height),
            )
        )
        buffer = io.BytesIO()
        pillow_page.save(buffer, format="JPEG")
        extracted_elements.append(
            ExtractedImage(
                BytesInput(
                    buffer.getvalue(),
                    f"{input_source.filename}_p{page_id}_e{element_id}.jpg",
                ),
                page_id,
                element_id,
            )
        )

    return extracted_elements


def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument:  # type: ignore
    """
    Loads a PDF document from a local input source.

    PDF inputs are opened directly; any other input is treated as an image and
    wrapped into a new PDF through attach_image_as_new_file().

    :param input_file: Local input.
    :return: A valid PdfDocument handle.
    """
    if input_file.is_pdf():
        input_file.file_object.seek(0)
        return pdfium.PdfDocument(input_file.file_object)

    return attach_image_as_new_file(input_file.file_object)
def extract_receipts(
    input_source: LocalInputSource, inference: Inference
) -> List[ExtractedImage]:
    """
    Extracts individual receipts from multi-receipts documents.

    :param input_source: Local Input Source to extract sub-receipts from.
    :param inference: Results of the inference.
    :return: Individual extracted receipts as a list of ExtractedImage, ordered by page then by position in the page prediction.
    :raises MindeeError: If the inference holds no receipt candidates.
    """
    if not inference.prediction.receipts:
        raise MindeeError(
            "No possible receipts candidates found for MultiReceipts extraction."
        )

    images: List[ExtractedImage] = []
    for page_id in range(input_source.count_doc_pages()):
        receipt_positions = [
            receipt.bounding_box
            for receipt in inference.pages[page_id].prediction.receipts
        ]
        if not receipt_positions:
            # Nothing to extract on this page: skip the extraction call
            # rather than processing the page for an empty result.
            continue
        images.extend(
            extract_multiple_images_from_source(
                input_source, page_id, receipt_positions
            )
        )
    return images
""" - self.file_object.seek(0) - pdf = pdfium.PdfDocument(self.file_object) - return len(pdf) + if self.is_pdf(): + self.file_object.seek(0) + pdf = pdfium.PdfDocument(self.file_object) + return len(pdf) + return 1 def process_pdf( self, diff --git a/mindee/parsing/standard/locale.py b/mindee/parsing/standard/locale.py index 9ba78159..2b692d91 100644 --- a/mindee/parsing/standard/locale.py +++ b/mindee/parsing/standard/locale.py @@ -27,7 +27,11 @@ def __init__( :param reconstructed: Bool for reconstructed object (not extracted in the API) :param page_id: Page number for multi-page document """ - value_key = "value" if "value" in raw_prediction else "language" + value_key = ( + "value" + if ("value" in raw_prediction and raw_prediction["value"]) + else "language" + ) super().__init__( raw_prediction, diff --git a/pyproject.toml b/pyproject.toml index 4874a326..d0d9242d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,8 @@ safe_licenses = [ "MIT License", "Mozilla Public License 2.0 (MPL 2.0)", "BSD License", - "(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty" + "(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty", + "Historical Permission Notice and Disclaimer (HPND)" ] [tool.pytest.ini_options] diff --git a/setup.cfg b/setup.cfg index 5d100563..736e4639 100644 --- a/setup.cfg +++ b/setup.cfg @@ -33,6 +33,7 @@ include_package_data = True python_requires = >=3.7 install_requires = pypdfium2>=4.0,<5 + Pillow>=9.5.0 pytz>=2023.3 requests~=2.31 diff --git a/tests/data b/tests/data index abe2a996..1cc324c3 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit abe2a996f71ca3242693af0439f3bf96b4ce7781 +Subproject commit 1cc324c3f4b2e9f9417268552532d2860f8edaa4 diff --git a/tests/image_extraction/__init__.py b/tests/image_extraction/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/image_extraction/test_image_extractor.py b/tests/image_extraction/test_image_extractor.py new file mode 100644 
@pytest.fixture
def barcode_path():
    """Path to the sample barcode image."""
    return PRODUCT_DATA_DIR / "barcode_reader" / "default_sample.jpg"


@pytest.fixture
def barcode_json_path():
    """Path to the stored API response matching the sample barcode image."""
    return PRODUCT_DATA_DIR / "barcode_reader" / "response_v1" / "complete.json"


def test_barcode_image_extraction(barcode_path, barcode_json_path):
    """Crop every 1D and 2D barcode from page 0 and check the size of each crop."""
    with open(barcode_json_path, "rb") as json_file:
        payload = json.load(json_file)
    prediction = BarcodeReaderV1(payload["document"]["inference"]).prediction
    polygons_1d = [code.polygon for code in prediction.codes_1d]
    polygons_2d = [code.polygon for code in prediction.codes_2d]

    source = PathInput(barcode_path)
    crops_1d = extract_multiple_images_from_source(source, 0, polygons_1d)
    crops_2d = extract_multiple_images_from_source(source, 0, polygons_2d)

    assert len(crops_1d) == 1
    assert len(crops_2d) == 2

    expected_sizes = (
        (crops_1d[0], (353, 200)),
        (crops_2d[0], (214, 216)),
        (crops_2d[1], (193, 201)),
    )
    for crop, size in expected_sizes:
        assert Image.open(crop.buffer).size == size
@pytest.fixture
def multi_receipts_single_page_path():
    """Path to the single-page multi-receipts sample image."""
    return PRODUCT_DATA_DIR / "multi_receipts_detector" / "default_sample.jpg"


@pytest.fixture
def multi_receipts_single_page_json_path():
    """Path to the stored API response for the single-page sample."""
    response_dir = PRODUCT_DATA_DIR / "multi_receipts_detector" / "response_v1"
    return response_dir / "complete.json"


@pytest.fixture
def multi_receipts_multi_page_path():
    """Path to the multi-page multi-receipts sample PDF."""
    return PRODUCT_DATA_DIR / "multi_receipts_detector" / "multipage_sample.pdf"


@pytest.fixture
def multi_receipts_multi_page_json_path():
    """Path to the stored API response for the multi-page sample."""
    response_dir = PRODUCT_DATA_DIR / "multi_receipts_detector" / "response_v1"
    return response_dir / "multipage_sample.json"


def _split_and_check(sample_path, json_path, expected):
    """
    Run the extraction on a sample and compare each result, in order.

    :param sample_path: Path to the input document.
    :param json_path: Path to the stored API response for that document.
    :param expected: One ``(page_id, element_id, size)`` tuple per expected receipt.
    """
    input_sample = PathInput(sample_path)
    with open(json_path, "rb") as json_file:
        response = json.load(json_file)
    doc = MultiReceiptsDetectorV1(response["document"]["inference"])
    extracted_receipts = extract_receipts(input_sample, doc)
    assert len(extracted_receipts) == len(expected)

    for receipt, (page_id, element_id, size) in zip(extracted_receipts, expected):
        assert receipt.page_id == page_id
        assert receipt.element_id == element_id
        assert Image.open(receipt.buffer).size == size


def test_single_page_multi_receipt_split(
    multi_receipts_single_page_path, multi_receipts_single_page_json_path
):
    """A single-page image yields six receipts, all on page 0."""
    _split_and_check(
        multi_receipts_single_page_path,
        multi_receipts_single_page_json_path,
        [
            (0, 0, (341, 505)),
            (0, 1, (461, 908)),
            (0, 2, (471, 790)),
            (0, 3, (464, 1200)),
            (0, 4, (530, 943)),
            (0, 5, (367, 593)),
        ],
    )


def test_multi_page_receipt_split(
    multi_receipts_multi_page_path, multi_receipts_multi_page_json_path
):
    """A two-page PDF yields five receipts; element IDs restart on each page."""
    _split_and_check(
        multi_receipts_multi_page_path,
        multi_receipts_multi_page_json_path,
        [
            (0, 0, (198, 566)),
            (0, 1, (206, 382)),
            (0, 2, (195, 231)),
            (1, 0, (213, 356)),
            (1, 1, (212, 516)),
        ],
    )