✨ add support for multi-receipt extraction

sebastianMindee · sebastianMindee · commit a9dfc4705264 · 2024-06-10T11:26:42.000+02:00
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -45,3 +45,4 @@ repos:
           - types-requests
           - types-setuptools
           - importlib-metadata
+          - types-Pillow
diff --git a/examples/multi_receipts_tutorial.py b/examples/multi_receipts_tutorial.py
@@ -0,0 +1,20 @@
+from mindee import PredictResponse, Client, product
+from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import extract_receipts
+
+# Init a new client
+mindee_client = Client()
+
+# Load a file from disk
+input_doc = mindee_client.source_from_path("path/to/your/file.ext")
+result_split: PredictResponse = mindee_client.parse(
+    product.MultiReceiptsDetectorV1,
+    input_doc,
+    close_file=False  # keep the file open: extract_receipts() reads it again below
+)
+
+extracted_receipts = extract_receipts(input_doc, result_split.document.inference)
+for receipt in extracted_receipts:
+    receipt_as_source = receipt.as_source()
+    # receipt.save_to_file(f"./local_test/{receipt.internal_file_name}.pdf") # Optionally: save each extracted receipt
+    result_receipt = mindee_client.parse(product.ReceiptV5, receipt_as_source)
+    print(result_receipt.document)
diff --git a/mindee/image_extraction/common/__init__.py b/mindee/image_extraction/common/__init__.py
@@ -1,6 +1,5 @@
 from mindee.image_extraction.common.extracted_image import ExtractedImage
 from mindee.image_extraction.common.image_extractor import (
-    attach_bitmap_as_new_page,
+    attach_image_as_new_file,
     extract_from_page,
-    get_image_size,
 )
diff --git a/mindee/image_extraction/common/extracted_image.py b/mindee/image_extraction/common/extracted_image.py
@@ -18,19 +18,22 @@ def __init__(self, buffer: bytes, file_name: str):
         """
         self.buffer = io.BytesIO(buffer)
         self.internal_file_name = file_name
+        self.buffer.name = self.internal_file_name
 
     def save_to_file(self, output_path: str):
         """
         Saves the document to a file.
 
         :param output_path: Path to save the file to.
+            The path must include the file name itself.
         :raises MindeeError: If an invalid path or filename is provided.
         """
         try:
+            self.buffer.seek(0)
             resolved_path = Path(output_path).resolve()
             with open(resolved_path, "wb") as file:
                 file.write(self.buffer.read())
-                logger.info("File saved successfully to %s.", resolved_path)
+                logger.info("File saved successfully to '%s'.", resolved_path)
         except TypeError as exc:
             raise MindeeError("Invalid path/filename provided.") from exc
         except Exception as exc:
diff --git a/mindee/image_extraction/common/image_extractor.py b/mindee/image_extraction/common/image_extractor.py
@@ -1,71 +1,45 @@
 import io
-import struct
-from typing import BinaryIO, List, Tuple
+from typing import BinaryIO, List
 
 import pypdfium2 as pdfium
+from PIL import Image
 
-from mindee.error import MimeTypeError
 from mindee.geometry import Polygon, get_min_max_x, get_min_max_y
 
 
-def get_image_size(data: BinaryIO) -> Tuple[int, int]:
-    """
-    Read the first few bytes to determine the file type.
-
-    :param data: Image input.
-    :return: A tuple containing the file's height/width.
-    """
-    data.seek(0)
-    signature = data.read(8)
-
-    # Check for PNG signature
-    if signature[:8] == b"\x89PNG\r\n\x1a\n":
-        data.seek(16)
-        width, height = struct.unpack(">II", data.read(8))
-        return width, height
-
-    # Check for JPEG SOI marker (also works for jpga)
-    if signature[:2] == b"\xff\xd8":
-        data.seek(2)
-        while True:
-            (marker,) = struct.unpack(">H", data.read(2))
-            if marker in (0xFFC0, 0xFFC2):  # SOF0 or SOF2
-                data.seek(3, 1)  # Skip length and precision
-                height, width = struct.unpack(">HH", data.read(4))
-                return width, height
-            (length,) = struct.unpack(">H", data.read(2))
-            data.seek(length - 2, 1)
-    data.close()
-    raise MimeTypeError("Size could not be retrieved for file.")
-
-
-def attach_bitmap_as_new_page(  # type: ignore
-    pdf_doc: pdfium.PdfDocument,
-    bitmap: pdfium.PdfBitmap,
-    new_width: float,
-    new_height: float,
+def attach_image_as_new_file(  # type: ignore
+    input_buffer: BinaryIO,
 ) -> pdfium.PdfDocument:
     """
-    Attaches a created PdfBitmap object as a new page in a PdfDocument object.
+    Attaches an image as a new page in a PdfDocument object.
 
-    :param pdf_doc: The PdfDocument to which the new page will be added.
-    :param bitmap: The PdfBitmap object to be added as a new page.
-    :param new_width: The width of the new page.
-    :param new_height: The height of the new page.
+    :param input_buffer: Input image buffer; it is re-encoded as JPEG before embedding.
     :return: A PdfDocument handle.
     """
     # Create a new page in the PdfDocument
-    new_page = pdf_doc.new_page(new_width, new_height)
+    input_buffer.seek(0)
+    image = Image.open(input_buffer)
+    image = image.convert("RGB")  # convert() returns a copy; JPEG has no alpha
+    image_buffer = io.BytesIO()
+    image.save(image_buffer, format="JPEG")
+
+    pdf = pdfium.PdfDocument.new()
+
+    image_pdf = pdfium.PdfImage.new(pdf)
+    image_pdf.load_jpeg(image_buffer)
+    width, height = image_pdf.get_size()
 
-    pdf_obj = pdfium.PdfImage.new(pdf_doc)
-    pdf_obj.set_bitmap(bitmap)
-    # Create a device context to render the bitmap onto the new page
-    new_page.insert_obj(pdf_obj)
+    matrix = pdfium.PdfMatrix().scale(width, height)
+    image_pdf.set_matrix(matrix)
 
-    return pdf_doc
+    page = pdf.new_page(width, height)
+    page.insert_obj(image_pdf)
+    page.gen_content()
+    image.close()
+    return pdf
 
 
-def extract_from_page(pdf_page: pdfium.PdfPage, polygons: List[Polygon]):  # type: ignore
+def extract_from_page(pdf_page: pdfium.PdfPage, polygons: List[Polygon]) -> List[bytes]:  # type: ignore
     """
     Extracts elements from a page based on a list of bounding boxes.
 
@@ -76,32 +50,23 @@ def extract_from_page(pdf_page: pdfium.PdfPage, polygons: List[Polygon]):  # typ
     width, height = pdf_page.get_size()
 
     extracted_elements = []
-
+    # Note: cropping done via PIL instead of PyPDFium to simplify operations greatly.
+    page_image = pdf_page.render().to_pil()  # rendered once, shared by every crop
     for polygon in polygons:
-        temp_pdf = pdfium.PdfDocument.new()
-
         min_max_x = get_min_max_x(polygon)
         min_max_y = get_min_max_y(polygon)
 
-        left = min_max_x.min
-        right = min_max_x.max
-        top = (height - (min_max_y.min * height)) / height
-        bottom = (height - (min_max_y.max * height)) / height
+        left = min_max_x.min * width
+        right = min_max_x.max * width
+        top = min_max_y.min * height
+        bottom = min_max_y.max * height
 
-        cropped_page: pdfium.PdfBitmap = pdf_page.render(  # type: ignore
-            crop=(left, bottom, right, top)
+        cropped_content_pil = page_image.crop(
+            (int(left), int(top), int(right), int(bottom))
         )
-
-        temp_pdf = attach_bitmap_as_new_page(
-            temp_pdf,
-            cropped_page,
-            width * (min_max_x.max - min_max_x.min),
-            height * (min_max_y.max - min_max_y.min),
-        )
-
-        temp_file = io.BytesIO()
-        temp_pdf.save(temp_file)
-        extracted_elements.append(temp_file.read())
-        temp_file.close()
+        pdf_buffer = io.BytesIO()
+        cropped_content_pil.save(pdf_buffer, format="PDF")
+        pdf_buffer.seek(0)
+        extracted_elements.append(pdf_buffer.read())
 
     return extracted_elements
diff --git a/mindee/image_extraction/multi_receipts_extractor/__init__.py b/mindee/image_extraction/multi_receipts_extractor/__init__.py
@@ -1,5 +1,5 @@
 from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import (
-    ExtractedMultiReceiptImage,
+    ExtractedMultiReceiptsImage,
 )
 from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import (
     extract_receipts_from_page,
diff --git a/mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py b/mindee/image_extraction/multi_receipts_extractor/extracted_multi_receipt_image.py
@@ -1,7 +1,7 @@
 from mindee.image_extraction.common import ExtractedImage
 
 
-class ExtractedMultiReceiptImage(ExtractedImage):
+class ExtractedMultiReceiptsImage(ExtractedImage):
     """Wrapper class for extracted multiple-receipts images."""
 
     _receipt_id: int
diff --git a/mindee/image_extraction/multi_receipts_extractor/mult_receipts_extractor.py b/mindee/image_extraction/multi_receipts_extractor/mult_receipts_extractor.py
@@ -7,37 +7,36 @@
 from mindee.geometry.polygon import Polygon
 from mindee.geometry.quadrilateral import Quadrilateral
 from mindee.image_extraction.common.image_extractor import (
-    attach_bitmap_as_new_page,
+    attach_image_as_new_file,
     extract_from_page,
-    get_image_size,
 )
 from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import (
-    ExtractedMultiReceiptImage,
+    ExtractedMultiReceiptsImage,
 )
 from mindee.input import LocalInputSource
-from mindee.product import MultiReceiptsDetectorV1
+from mindee.parsing.common import Inference
 
 
 def extract_receipts_from_page(  # type: ignore
     pdf_page: pdfium.PdfPage,
     bounding_boxes: List[Union[List[Point], Polygon, Quadrilateral]],
     page_id: int,
-) -> List[ExtractedMultiReceiptImage]:
+) -> List[ExtractedMultiReceiptsImage]:
     """
-    Given a page and a set of coordinates, extracts & assigns individual receipts to an ExtractedMultiReceiptImage\
+    Given a page and a set of coordinates, extracts & assigns individual receipts to an ExtractedMultiReceiptsImage\
     object.
 
     :param pdf_page: PDF Page to extract from.
     :param bounding_boxes: A set of coordinates delimiting the position of each receipt.
     :param page_id: ID of the page the receipt is extracted from. Caution: this starts at 0, unlike the numbering in PDF
     pages.
-    :return: A list of ExtractedMultiReceiptImage.
+    :return: A list of ExtractedMultiReceiptsImage.
     """
     extracted_receipts_raw = extract_from_page(pdf_page, bounding_boxes)  # type: ignore
     extracted_receipts = []
     for i, extracted_receipt_raw in enumerate(extracted_receipts_raw):
         extracted_receipts.append(
-            ExtractedMultiReceiptImage(extracted_receipt_raw, i, page_id)
+            ExtractedMultiReceiptsImage(extracted_receipt_raw, i, page_id)
         )
     return extracted_receipts
 
@@ -50,59 +49,45 @@ def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument:  # type: i
     :return: A valid PdfDocument handle.
     """
     if input_file.file_mimetype not in [
-        "image/jpeg",
-        "image/jpg",
-        "image/png",
         "application/pdf",
+        "image/heic",
+        "image/png",
+        "image/jpg",
+        "image/jpeg",
+        "image/tiff",
+        "image/webp",
     ]:
-        raise MimeTypeError(
-            f"Unsupported file type '{input_file.file_mimetype}'. Currently supported types are '.png',"
-            f" '.jpg' and '.pdf'."
-        )
+        raise MimeTypeError(f"Unsupported file type '{input_file.file_mimetype}'.")
+    input_file.file_object.seek(0)
     if input_file.is_pdf():
         return pdfium.PdfDocument(input_file.file_object)
-    pdf_document = pdfium.PdfDocument.new()
-    height, width = get_image_size(input_file.file_object)
-    pdf_bitmap = pdfium.PdfBitmap.new_native(width, height, 4)
-    pdf_bitmap = pdfium.PdfBitmap(
-        raw=pdf_bitmap,
-        buffer=input_file.file_object,
-        height=height,
-        width=width,
-        needs_free=True,
-        rev_byteorder=False,
-        format=4,
-        stride=4,
-    )
-    # Bitmap format 4 should equate to RGBA, assumed to be equivalent to:
-    # https://docs.rs/pdfium-render/latest/pdfium_render/bitmap/enum.PdfBitmapFormat.html
 
-    return attach_bitmap_as_new_page(pdf_document, pdf_bitmap, height, width)
+    return attach_image_as_new_file(input_file.file_object)
 
 
 def extract_receipts(
-    input_file: LocalInputSource, inference: MultiReceiptsDetectorV1
-) -> List[ExtractedMultiReceiptImage]:
+    input_file: LocalInputSource, inference: Inference
+) -> List[ExtractedMultiReceiptsImage]:
     """
     Extracts individual receipts from multi-receipts documents.
 
     :param input_file: File to extract sub-receipts from.
     :param inference: Results of the inference.
-    :return: Individual extracted receipts as an array of ExtractedMultiReceiptImage.
+    :return: Individual extracted receipts as an array of ExtractedMultiReceiptsImage.
     """
-    images: List[ExtractedMultiReceiptImage] = []
+    images: List[ExtractedMultiReceiptsImage] = []
     if not inference.prediction.receipts:
         raise MindeeError(
             "No possible receipts candidates found for MultiReceipts extraction."
         )
     pdf_doc = load_pdf_doc(input_file)
-    for page_id in range(len(pdf_doc)):
+    for page_id, page in enumerate(pdf_doc):
         receipt_positions = [
             receipt.bounding_box
             for receipt in inference.pages[page_id].prediction.receipts
         ]
         extracted_receipts = extract_receipts_from_page(
-            pdf_doc.get_page(page_id), receipt_positions, page_id  # type: ignore
+            page, receipt_positions, page_id
         )
         images.extend(extracted_receipts)
     return images
diff --git a/pyproject.toml b/pyproject.toml
@@ -32,7 +32,8 @@ safe_licenses = [
   "MIT License",
   "Mozilla Public License 2.0 (MPL 2.0)",
   "BSD License",
-  "(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty"
+  "(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty",
+  "Historical Permission Notice and Disclaimer (HPND) (HPND)"
 ]
 
 [tool.pytest.ini_options]
diff --git a/setup.cfg b/setup.cfg
@@ -33,6 +33,7 @@ include_package_data = True
 python_requires = >=3.7
 install_requires =
     pypdfium2>=4.0,<5
+    Pillow>=9.5.0
     pytz>=2023.3
     requests~=2.31
 
diff --git a/tests/image_extraction/test_image_extractor.py b/tests/image_extraction/test_image_extractor.py
@@ -1,9 +1,9 @@
 from io import BytesIO
 
 import pytest
+from PIL import Image
 
 from mindee.error import MimeTypeError
-from mindee.image_extraction.common import get_image_size
 from tests.test_inputs import FILE_TYPES_DIR
 
 
@@ -24,23 +24,17 @@ def png_file_path():
 
 def test_get_image_size_jpg(jpg_file_path):
     with open(jpg_file_path, "rb") as f:
-        jpg_file = BytesIO(f.read())
-    jpg_height, jpg_width = get_image_size(jpg_file)
+        jpg_file = Image.open(jpg_file_path)
+    jpg_height = jpg_file.size[0]
+    jpg_width = jpg_file.size[1]
     assert jpg_height == 800
     assert jpg_width == 1066
 
 
 def test_get_image_size_png(png_file_path):
     with open(png_file_path, "rb") as f:
-        png_file = BytesIO(f.read())
-    png_height, png_width = get_image_size(png_file)
+        png_file = Image.open(png_file_path)
+    png_height = png_file.size[0]
+    png_width = png_file.size[1]
     assert png_height == 800
     assert png_width == 1066
-
-
-def test_get_image_size_with_invalid_mime(txt_file_path):
-    with open(txt_file_path, "rb") as f:
-        txt_file = BytesIO(f.read())
-
-    with pytest.raises(MimeTypeError):
-        get_image_size(txt_file)
diff --git a/tests/image_extraction/test_multi_receipts_extractor.py b/tests/image_extraction/test_multi_receipts_extractor.py

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,5 @@`
`1`	`1`	`from mindee.image_extraction.common.extracted_image import ExtractedImage`
`2`	`2`	`from mindee.image_extraction.common.image_extractor import (`
`3`		`- attach_bitmap_as_new_page,`
	`3`	`+ attach_image_as_new_file,`
`4`	`4`	`extract_from_page,`
`5`		`- get_image_size,`
`6`	`5`	`)`
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`from mindee.image_extraction.multi_receipts_extractor.extracted_multi_receipt_image import (`
`2`		`- ExtractedMultiReceiptImage,`
	`2`	`+ ExtractedMultiReceiptsImage,`
`3`	`3`	`)`
`4`	`4`	`from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import (`
`5`	`5`	`extract_receipts_from_page,`
Original file line number	Diff line number	Diff line change
`@@ -32,7 +32,8 @@ safe_licenses = [`
`32`	`32`	`"MIT License",`
`33`	`33`	`"Mozilla Public License 2.0 (MPL 2.0)",`
`34`	`34`	`"BSD License",`
`35`		`- "(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty"`
	`35`	`+ "(Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty",`
	`36`	`+ "Historical Permission Notice and Disclaimer (HPND) (HPND)"`
`36`	`37`	`]`
`37`	`38`
`38`	`39`	`[tool.pytest.ini_options]`