✨ add support for multi-receipt extraction (#240)

sebastianMindee · web-flow · commit d5a13367dfab · 2024-06-12T16:58:43.000+02:00
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -45,3 +45,4 @@ repos:
           - types-requests
           - types-setuptools
           - importlib-metadata
+          - types-Pillow
diff --git a/examples/multi_receipts_tutorial.py b/examples/multi_receipts_tutorial.py
@@ -0,0 +1,20 @@
+from mindee import Client, PredictResponse, product
+from mindee.image_extraction.multi_receipts_extractor.multi_receipts_extractor import (
+    extract_receipts,
+)
+
+# Init a new client
+mindee_client = Client()
+
+# Load a file from disk
+input_doc = mindee_client.source_from_path("path/to/your/file.ext")
+# close_file=False: input_doc is read a second time by extract_receipts below.
+result_split: PredictResponse = mindee_client.parse(
+    product.MultiReceiptsDetectorV1, input_doc, close_file=False
+)
+
+# Crop every detected receipt out of the original document
+extracted_receipts = extract_receipts(input_doc, result_split.document.inference)
+for receipt in extracted_receipts:
+    # Build the input source once and reuse it for the parsing call
+    receipt_as_source = receipt.as_source()
+    # receipt.save_to_file(f"./{receipt.internal_file_name}.pdf") # Optionally: save each extracted receipt
+    result_receipt = mindee_client.parse(product.ReceiptV5, receipt_as_source)
+    print(result_receipt.document)
diff --git a/mindee/image_extraction/__init__.py b/mindee/image_extraction/__init__.py
diff --git a/mindee/image_extraction/common/__init__.py b/mindee/image_extraction/common/__init__.py
@@ -0,0 +1,5 @@
+from mindee.image_extraction.common.extracted_image import ExtractedImage
+from mindee.image_extraction.common.image_extractor import (
+    attach_image_as_new_file,
+    extract_multiple_images_from_source,
+)
diff --git a/mindee/image_extraction/common/extracted_image.py b/mindee/image_extraction/common/extracted_image.py
@@ -0,0 +1,89 @@
+import io
+from pathlib import Path
+from typing import Optional
+
+from PIL import Image
+
+from mindee.error import MindeeError
+from mindee.input import FileInput, LocalInputSource
+from mindee.logger import logger
+
+
+class ExtractedImage:
+    """Image extracted from a page, kept in an in-memory buffer."""
+
+    _page_id: int
+    """Id of the page the image was extracted from."""
+    _element_id: int
+    """Id of the element on a given page."""
+
+    def __init__(
+        self, input_source: LocalInputSource, page_id: int, element_id: int
+    ) -> None:
+        """
+        Initialize the ExtractedImage with a buffer and an internal file name.
+
+        :param input_source: Local source for input.
+        :param page_id: ID of the page the element was found on.
+        :param element_id: ID of the element in a page. ``None`` is stored as 0.
+        """
+        # Normalize first so the file name and the ``element_id`` property agree.
+        element_id = 0 if element_id is None else element_id
+        # Rewind before reading: a partially consumed source would otherwise
+        # yield a truncated copy.
+        input_source.file_object.seek(0)
+        self.buffer = io.BytesIO(input_source.file_object.read())
+        self.buffer.name = input_source.filename
+        self.buffer.seek(0)
+        self.internal_file_name = f"{input_source.filename}_p{page_id}_{element_id}.pdf"
+        self._page_id = page_id
+        self._element_id = element_id
+
+    def save_to_file(self, output_path: str, file_format: Optional[str] = None):
+        """
+        Saves the document to a file.
+
+        :param output_path: Path to save the file to.
+        :param file_format: Optional PIL-compatible format for the file. Inferred from file extension if not provided.
+        :raises MindeeError: If an invalid path or filename is provided, or if the file cannot be written.
+        """
+        try:
+            resolved_path = Path(output_path).resolve()
+            if not file_format and not resolved_path.suffix:
+                raise ValueError("Invalid file format.")
+            self.buffer.seek(0)
+            image = Image.open(self.buffer)
+            # With format=None, PIL selects the writer from the file extension.
+            # Passing the raw suffix (e.g. ".PDF") is not a valid PIL format name.
+            image.save(resolved_path, format=file_format or None)
+            logger.info("File saved successfully to '%s'.", resolved_path)
+        except TypeError as exc:
+            raise MindeeError("Invalid path/filename provided.") from exc
+        except Exception as exc:
+            raise MindeeError(f"Could not save file {Path(output_path).name}.") from exc
+
+    def as_source(self) -> FileInput:
+        """
+        Return the file as a Mindee-compatible FileInput source.
+
+        The internal buffer is rewound before being wrapped.
+
+        :returns: A FileInput source backed by the internal buffer.
+        """
+        self.buffer.seek(0)
+        return FileInput(self.buffer)
+
+    @property
+    def page_id(self) -> int:
+        """
+        ID of the page the image was found on.
+
+        :return: The page ID given at construction.
+        """
+        return self._page_id
+
+    @property
+    def element_id(self) -> int:
+        """
+        Id of the element on a given page.
+
+        :return: The element ID given at construction, or 0 if it was ``None``.
+        """
+        return self._element_id
diff --git a/mindee/image_extraction/common/image_extractor.py b/mindee/image_extraction/common/image_extractor.py
@@ -0,0 +1,100 @@
+import io
+from typing import BinaryIO, List
+
+import pypdfium2 as pdfium
+from PIL import Image
+
+from mindee.geometry import Point, get_min_max_x, get_min_max_y
+from mindee.image_extraction.common import ExtractedImage
+from mindee.input import BytesInput, LocalInputSource
+
+
+def attach_image_as_new_file(  # type: ignore
+    input_buffer: BinaryIO,
+) -> pdfium.PdfDocument:
+    """
+    Attaches an image as a new page in a new PdfDocument object.
+
+    The image is converted to RGB, re-encoded as JPEG and inserted on a page
+    of the same size as the image.
+
+    :param input_buffer: Input buffer holding an image that PIL can open.
+    :return: A PdfDocument handle.
+    """
+    input_buffer.seek(0)
+    image = Image.open(input_buffer)
+    # convert() returns a new image and leaves the original untouched. JPEG
+    # cannot store alpha or palette modes, so the converted copy is the one
+    # that has to be encoded.
+    rgb_image = image.convert("RGB")
+    image_buffer = io.BytesIO()
+    rgb_image.save(image_buffer, format="JPEG")
+    rgb_image.close()
+
+    pdf = pdfium.PdfDocument.new()
+
+    image_pdf = pdfium.PdfImage.new(pdf)
+    image_pdf.load_jpeg(image_buffer)
+    width, height = image_pdf.get_size()
+
+    # Scale the image object so it covers the whole page.
+    matrix = pdfium.PdfMatrix().scale(width, height)
+    image_pdf.set_matrix(matrix)
+
+    # Create a new page in the PdfDocument
+    page = pdf.new_page(width, height)
+    page.insert_obj(image_pdf)
+    page.gen_content()
+    image.close()
+    return pdf
+
+
+def extract_multiple_images_from_source(
+    input_source: LocalInputSource, page_id: int, polygons: List[List[Point]]
+) -> List[ExtractedImage]:
+    """
+    Crops one image per polygon out of a single rendered page.
+
+    Each polygon is reduced to its min/max x and y values, which are multiplied
+    by the page width and height to get the crop box. Each crop is encoded as JPEG.
+
+    :param input_source: Local Input source to extract elements from.
+    :param page_id: id of the page to extract from.
+    :param polygons: List of polygons, one per element to extract.
+    :return: List of extracted elements, in the same order as the polygons.
+    """
+    pdf_page = load_pdf_doc(input_source).get_page(page_id)
+    rendered_page = pdf_page.render().to_pil()
+    page_width, page_height = pdf_page.get_size()
+
+    results = []
+    for index, points in enumerate(polygons):
+        x_range = get_min_max_x(points)
+        y_range = get_min_max_y(points)
+        crop_box = (
+            int(x_range.min * page_width),
+            int(y_range.min * page_height),
+            int(x_range.max * page_width),
+            int(y_range.max * page_height),
+        )
+
+        jpeg_buffer = io.BytesIO()
+        rendered_page.crop(crop_box).save(jpeg_buffer, format="JPEG")
+
+        element_name = f"{input_source.filename}_p{page_id}_e{index}.jpg"
+        element_source = BytesInput(jpeg_buffer.getvalue(), element_name)
+        results.append(ExtractedImage(element_source, page_id, index))
+
+    return results
+
+
+def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument:  # type: ignore
+    """
+    Opens a local input source as a PdfDocument.
+
+    A PDF source is rewound and opened directly. Any other source is passed
+    to ``attach_image_as_new_file``.
+
+    :param input_file: Local input.
+    :return: A valid PdfDocument handle.
+    """
+    if not input_file.is_pdf():
+        return attach_image_as_new_file(input_file.file_object)
+
+    source_buffer = input_file.file_object
+    source_buffer.seek(0)
+    return pdfium.PdfDocument(source_buffer)
diff --git a/mindee/image_extraction/multi_receipts_extractor/__init__.py b/mindee/image_extraction/multi_receipts_extractor/__init__.py
@@ -0,0 +1 @@
+from mindee.image_extraction.multi_receipts_extractor import multi_receipts_extractor
diff --git a/mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.py b/mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.py
@@ -0,0 +1,37 @@
+from typing import List
+
+from mindee.error import MindeeError
+from mindee.image_extraction.common.extracted_image import ExtractedImage
+from mindee.image_extraction.common.image_extractor import (
+    extract_multiple_images_from_source,
+)
+from mindee.input import LocalInputSource
+from mindee.parsing.common import Inference
+
+
+def extract_receipts(
+    input_source: LocalInputSource, inference: Inference
+) -> List[ExtractedImage]:
+    """
+    Extracts individual receipts from multi-receipts documents.
+
+    Pages are processed in order, using the bounding box of every receipt
+    predicted on each page.
+
+    :param input_source: Local Input Source to extract sub-receipts from.
+    :param inference: Results of the inference.
+    :return: Individual extracted receipts as a list of ExtractedImage.
+    :raises MindeeError: If the document-level prediction holds no receipts.
+    """
+    if not inference.prediction.receipts:
+        raise MindeeError(
+            "No possible receipts candidates found for MultiReceipts extraction."
+        )
+
+    extracted: List[ExtractedImage] = []
+    page_count = input_source.count_doc_pages()
+    for page_index in range(page_count):
+        page_receipts = inference.pages[page_index].prediction.receipts
+        bounding_boxes = [page_receipt.bounding_box for page_receipt in page_receipts]
+        extracted += extract_multiple_images_from_source(
+            input_source, page_index, bounding_boxes
+        )
+    return extracted
diff --git a/mindee/input/local_response.py b/mindee/input/local_response.py
@@ -98,7 +98,7 @@ def is_valid_hmac_signature(
         Checks if the hmac signature of the local response is valid.
 
         :param secret_key: Secret key, given as a string.
-        :param signature:
+        :param signature: HMAC signature, given as a string.
         :return: True if the HMAC signature is valid.
         """
         return signature == self.get_hmac_signature(secret_key)
diff --git a/mindee/input/sources.py b/mindee/input/sources.py
@@ -116,9 +116,11 @@ def count_doc_pages(self) -> int:
 
         :return: the number of pages.
         """
-        self.file_object.seek(0)
-        pdf = pdfium.PdfDocument(self.file_object)
-        return len(pdf)
+        if self.is_pdf():
+            self.file_object.seek(0)
+            pdf = pdfium.PdfDocument(self.file_object)
+            return len(pdf)
+        return 1
 
     def process_pdf(
         self,
diff --git a/mindee/parsing/standard/locale.py b/mindee/parsing/standard/locale.py
@@ -27,7 +27,11 @@ def __init__(
         :param reconstructed: Bool for reconstructed object (not extracted in the API)
         :param page_id: Page number for multi-page document
         """
-        value_key = "value" if "value" in raw_prediction else "language"
+        value_key = (
+            "value"
+            if ("value" in raw_prediction and raw_prediction["value"])
+            else "language"
+        )
 
         super().__init__(
             raw_prediction,
diff --git a/pyproject.toml b/pyproject.toml
@@ -32,7 +32,8 @@ safe_licenses = [
   "MIT License",
   "Mozilla Public License 2.0 (MPL 2.0)",
   "BSD License",
-  "(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty"
+  "(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty",
+  "Historical Permission Notice and Disclaimer (HPND)"
 ]
 
 [tool.pytest.ini_options]
diff --git a/setup.cfg b/setup.cfg
@@ -33,6 +33,7 @@ include_package_data = True
 python_requires = >=3.7
 install_requires =
     pypdfium2>=4.0,<5
+    Pillow>=9.5.0
     pytz>=2023.3
     requests~=2.31
 
diff --git a/tests/image_extraction/__init__.py b/tests/image_extraction/__init__.py
diff --git a/tests/image_extraction/test_image_extractor.py b/tests/image_extraction/test_image_extractor.py
@@ -0,0 +1,40 @@
+import json
+
+import pytest
+from PIL import Image
+
+from mindee.image_extraction.common import extract_multiple_images_from_source
+from mindee.input import PathInput
+from mindee.product import BarcodeReaderV1
+from tests.test_inputs import PRODUCT_DATA_DIR
+
+
+@pytest.fixture
+def barcode_path():
+    """Path to the sample barcode image."""
+    sample_dir = PRODUCT_DATA_DIR / "barcode_reader"
+    return sample_dir / "default_sample.jpg"
+
+
+@pytest.fixture
+def barcode_json_path():
+    """Path to the stored barcode reader v1 response."""
+    response_dir = PRODUCT_DATA_DIR / "barcode_reader" / "response_v1"
+    return response_dir / "complete.json"
+
+
+def test_barcode_image_extraction(barcode_path, barcode_json_path):
+    """Check the number and pixel size of the barcodes cropped from the sample."""
+    with open(barcode_json_path, "rb") as json_file:
+        raw_response = json.load(json_file)
+    prediction = BarcodeReaderV1(raw_response["document"]["inference"]).prediction
+    polygons_1d = [barcode.polygon for barcode in prediction.codes_1d]
+    polygons_2d = [barcode.polygon for barcode in prediction.codes_2d]
+
+    # The same source is used for both extractions.
+    source = PathInput(barcode_path)
+    crops_1d = extract_multiple_images_from_source(source, 0, polygons_1d)
+    crops_2d = extract_multiple_images_from_source(source, 0, polygons_2d)
+
+    assert len(crops_1d) == 1
+    assert len(crops_2d) == 2
+
+    actual_sizes = [Image.open(crop.buffer).size for crop in crops_1d + crops_2d]
+    assert actual_sizes == [(353, 200), (214, 216), (193, 201)]
diff --git a/tests/image_extraction/test_multi_receipts_extractor.py b/tests/image_extraction/test_multi_receipts_extractor.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+from mindee.image_extraction.multi_receipts_extractor import multi_receipts_extractor`
Original file line number	Diff line number	Diff line change
`@@ -32,7 +32,8 @@ safe_licenses = [`
`32`	`32`	`"MIT License",`
`33`	`33`	`"Mozilla Public License 2.0 (MPL 2.0)",`
`34`	`34`	`"BSD License",`
`35`		`- "(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty"`
	`35`	`+ "(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty",`
	`36`	`+ "Historical Permission Notice and Disclaimer (HPND)"`
`36`	`37`	`]`
`37`	`38`
`38`	`39`	`[tool.pytest.ini_options]`