add image size check & test

sebastianMindee · sebastianMindee · commit 09dad196fe7f · 2024-06-06T13:47:29.000+02:00
diff --git a/mindee/image_extraction/__init__.py b/mindee/image_extraction/__init__.py
diff --git a/mindee/image_extraction/common/__init__.py b/mindee/image_extraction/common/__init__.py
@@ -0,0 +1,2 @@
+from mindee.image_extraction.common.extracted_image import ExtractedImage
+from mindee.image_extraction.common.image_extractor import extract_from_page, attach_bitmap_as_new_page, get_image_size
diff --git a/mindee/image_extraction/common/extracted_image.py b/mindee/image_extraction/common/extracted_image.py
@@ -0,0 +1,43 @@
+import io
+from pathlib import Path
+
+from mindee.error import MindeeError
+from mindee.input import FileInput
+from mindee.logger import logger
+
+
+class ExtractedImage:
+    """In-memory file extracted from a document, which can be saved to disk or wrapped as an input source."""
+
+    def __init__(self, buffer: bytes, file_name: str) -> None:
+        """
+        Initialize the ExtractedImage with a buffer and an internal file name.
+
+        :param buffer: The byte buffer representing the image.
+        :param file_name: The internal file name of the image.
+        """
+        self.buffer = io.BytesIO(buffer)
+        self.internal_file_name = file_name
+
+    def save_to_file(self, output_path: str) -> None:
+        """
+        Saves the document to a file.
+
+        :param output_path: Path to save the file to.
+        :raises MindeeError: If an invalid path or filename is provided.
+        """
+        try:
+            resolved_path = Path(output_path).resolve()
+        except TypeError as exc:
+            raise MindeeError("Invalid path/filename provided.") from exc
+
+        with open(resolved_path, "wb") as out_file:
+            # getvalue() returns the whole content regardless of the current stream position, so saving
+            # still works after the buffer has been read (or saved) before, and does not consume it.
+            out_file.write(self.buffer.getvalue())
+        logger.info("File saved successfully to %s.", resolved_path)
+
+    def as_source(self) -> FileInput:
+        """
+        Return the file as a Mindee-compatible FileInput source.
+
+        :returns: A FileInput source wrapping the internal buffer.
+        """
+        # Rewind so the consumer starts at the first byte even if the buffer was read earlier.
+        self.buffer.seek(0)
+        # NOTE(review): the wrapped object is an in-memory BytesIO, which carries no file name;
+        # confirm FileInput accepts such a stream.
+        return FileInput(self.buffer)
diff --git a/mindee/image_extraction/common/image_extractor.py b/mindee/image_extraction/common/image_extractor.py
@@ -0,0 +1,99 @@
+import io
+from typing import List, BinaryIO, Tuple
+
+import pypdfium2 as pdfium
+
+from mindee.error import MimeTypeError
+from mindee.geometry import get_min_max_x, get_min_max_y, Polygon
+
+import struct
+
+
+def get_image_size(data: BinaryIO) -> Tuple[int, int]:
+    """
+    Read the header of a PNG or JPEG stream to determine the image dimensions.
+
+    The stream is rewound before reading. It is never closed by this function, so the caller keeps
+    ownership of it whether the call succeeds or fails.
+
+    :param data: Image input, as a seekable binary stream.
+    :return: A tuple containing the file's (width, height), in that order.
+    :raises MimeTypeError: If the stream is neither PNG nor JPEG, or if its header is truncated or malformed.
+    """
+    error_message = "Size could not be retrieved for file."
+    data.seek(0)
+    signature = data.read(8)
+
+    try:
+        # Check for PNG signature
+        if signature == b'\x89PNG\r\n\x1a\n':
+            # Bytes 8-15 hold the first chunk's length and type; width and height follow as big-endian uint32.
+            data.seek(16)
+            width, height = struct.unpack('>II', data.read(8))
+            return width, height
+
+        # Check for JPEG SOI marker
+        if signature[:2] == b'\xff\xd8':
+            data.seek(2)
+            while True:
+                marker, = struct.unpack('>H', data.read(2))
+                if marker == 0xFFFF:
+                    # A marker may be preceded by 0xFF fill bytes: step back one byte and read again.
+                    data.seek(-1, 1)
+                    continue
+                if (marker >> 8) != 0xFF or marker == 0xFFD9:
+                    # Not a marker at all, or end of image reached without any frame header.
+                    break
+                if 0xFFC0 <= marker <= 0xFFCF and marker not in (0xFFC4, 0xFFC8, 0xFFCC):
+                    # Any start-of-frame marker (0xFFC4, 0xFFC8 and 0xFFCC are not frame headers).
+                    data.seek(3, 1)  # Skip length and precision
+                    height, width = struct.unpack('>HH', data.read(4))
+                    return width, height
+                if marker == 0xFF01 or 0xFFD0 <= marker <= 0xFFD8:
+                    # Standalone markers carry no length field and no payload.
+                    continue
+                length, = struct.unpack('>H', data.read(2))
+                if length < 2:
+                    # The length field counts its own two bytes; anything smaller is malformed.
+                    break
+                data.seek(length - 2, 1)
+    except struct.error as exc:
+        # A short read means the header ended before the dimensions were found.
+        raise MimeTypeError(error_message) from exc
+
+    raise MimeTypeError(error_message)
+
+
+def attach_bitmap_as_new_page(pdf_doc: pdfium.PdfDocument, bitmap: pdfium.PdfBitmap, new_width: float,
+                              new_height: float) -> pdfium.PdfDocument:
+    """
+    Attaches a created PdfBitmap object as a new page in a PdfDocument object.
+
+    :param pdf_doc: The PdfDocument to which the new page will be added.
+    :param bitmap: The PdfBitmap object to be added as a new page.
+    :param new_width: The width of the new page.
+    :param new_height: The height of the new page.
+    :return: The PdfDocument that was passed in, with the new page added.
+    """
+    # Create a new page in the PdfDocument
+    new_page = pdf_doc.new_page(new_width, new_height)
+
+    # A page only accepts page objects, so the bitmap is wrapped in an image object tied to the document.
+    image_obj = pdfium.PdfImage.new(pdf_doc)
+    image_obj.set_bitmap(bitmap)
+    # Scale the image object to the page dimensions so that it fills the whole page.
+    image_obj.set_matrix(pdfium.PdfMatrix().scale(new_width, new_height))
+
+    new_page.insert_obj(image_obj)
+    # Generate the page content so the inserted object is part of the document when it is saved.
+    new_page.gen_content()
+
+    return pdf_doc
+
+
+def extract_from_page(pdf_page: pdfium.PdfPage, polygons: List[Polygon]) -> List[bytes]:
+    """
+    Extracts elements from a page based on a list of bounding boxes.
+
+    Each polygon is reduced to its bounding box, the matching area of the page is rendered, and the
+    result is saved as a standalone single-page PDF.
+
+    :param pdf_page: Single PDF Page.
+    :param polygons: List of coordinates to pull the elements from.
+    :return: List of byte arrays representing the extracted elements.
+    """
+    width, height = pdf_page.get_size()
+
+    extracted_elements = []
+
+    for polygon in polygons:
+        min_max_x = get_min_max_x(polygon)
+        min_max_y = get_min_max_y(polygon)
+
+        # NOTE(review): the scaling below assumes polygon coordinates are relative to the page size
+        # (0 to 1) with the vertical axis starting at the top of the page - confirm against callers.
+        new_width = width * (min_max_x.max - min_max_x.min)
+        new_height = height * (min_max_y.max - min_max_y.min)
+
+        # render() takes the amount to cut off from each page border, ordered (left, bottom, right, top),
+        # not the coordinates of the box itself.
+        crop_left = min_max_x.min * width
+        crop_right = width - (min_max_x.max * width)
+        crop_top = min_max_y.min * height
+        crop_bottom = height - (min_max_y.max * height)
+
+        cropped_page: pdfium.PdfBitmap = pdf_page.render(crop=(crop_left, crop_bottom, crop_right, crop_top))
+
+        temp_pdf = attach_bitmap_as_new_page(pdfium.PdfDocument.new(), cropped_page, new_width, new_height)
+
+        with io.BytesIO() as temp_file:
+            temp_pdf.save(temp_file)
+            # After save() the stream position is at the end, so read() would return nothing;
+            # getvalue() returns everything that was written.
+            extracted_elements.append(temp_file.getvalue())
+
+    return extracted_elements
diff --git a/mindee/image_extraction/multi_receipts_extractor/__init__.py b/mindee/image_extraction/multi_receipts_extractor/__init__.py
@@ -0,0 +1,2 @@
+# ExtractedMultiReceiptImage has to be bound on this package before mult_receipts_extractor is imported:
+# that module imports the class back from this package, which fails while the name is still missing.
+from mindee.image_extraction.multi_receipts_extractor.extracted_mult_receipt_image import ExtractedMultiReceiptImage
+from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import extract_receipts_from_page
diff --git a/mindee/image_extraction/multi_receipts_extractor/extracted_mult_receipt_image.py b/mindee/image_extraction/multi_receipts_extractor/extracted_mult_receipt_image.py
@@ -0,0 +1,19 @@
+from mindee.image_extraction.common import ExtractedImage
+
+
+class ExtractedMultiReceiptImage(ExtractedImage):
+    """Single receipt extracted from a page, identified by its page and its index on that page."""
+
+    _receipt_id: int
+    _page_id: int
+
+    def __init__(self, buffer: bytes, receipt_id: int, page_id: int):
+        """
+        Initialize the receipt image and derive its internal file name from both identifiers.
+
+        :param buffer: The byte buffer representing the extracted receipt.
+        :param receipt_id: ID of the receipt.
+        :param page_id: ID of the page the receipt was extracted from.
+        """
+        super().__init__(buffer, f"receipt_p{page_id}_{receipt_id}.pdf")
+        self._receipt_id = receipt_id
+        self._page_id = page_id
+
+    @property
+    def receipt_id(self) -> int:
+        """ID of the receipt, as given at construction."""
+        return self._receipt_id
+
+    @property
+    def page_id(self) -> int:
+        """ID of the page the receipt was extracted from, as given at construction."""
+        # Read the private attribute: returning self.page_id here would call this property again forever.
+        return self._page_id
diff --git a/mindee/image_extraction/multi_receipts_extractor/mult_receipts_extractor.py b/mindee/image_extraction/multi_receipts_extractor/mult_receipts_extractor.py
@@ -0,0 +1,46 @@
+from typing import List
+
+import pypdfium2 as pdfium
+
+from mindee.error import MimeTypeError
+from mindee.geometry import Polygon
+from mindee.image_extraction.common.image_extractor import extract_from_page, attach_bitmap_as_new_page, get_image_size
+from mindee.image_extraction.multi_receipts_extractor import ExtractedMultiReceiptImage
+from mindee.input import LocalInputSource
+
+
+def extract_receipts_from_page(
+        pdf_page: pdfium.PdfPage, bounding_boxes: List[Polygon], page_id: int
+) -> List[ExtractedMultiReceiptImage]:
+    """
+    Given a page and a set of coordinates, extracts & assigns individual receipts to an ExtractedMultiReceiptImage
+    object.
+
+    :param pdf_page: PDF Page to extract from.
+    :param bounding_boxes: A set of coordinates delimiting the position of each receipt.
+    :param page_id: ID of the page the receipt is extracted from. Caution: this starts at 0, unlike the numbering in PDF
+    pages.
+    :return: A list of ExtractedMultiReceiptImage.
+    """
+    extracted_receipts_raw = extract_from_page(pdf_page, bounding_boxes)
+    # Keyword arguments keep the receipt index and the page ID from being swapped: the constructor
+    # takes receipt_id before page_id.
+    return [
+        ExtractedMultiReceiptImage(raw_receipt, receipt_id=receipt_id, page_id=page_id)
+        for receipt_id, raw_receipt in enumerate(extracted_receipts_raw)
+    ]
+
+
+def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument:
+    """
+    Loads a PDF document from a local input source.
+
+    A PDF input is opened directly. An image input is placed on a single page of a new, empty document
+    sized after the image.
+
+    :param input_file: Local input.
+    :return: A valid PdfDocument handle.
+    :raises MimeTypeError: If the input is not a PNG, JPEG or PDF file, or if the size of an image
+        input cannot be read.
+    """
+    if input_file.file_mimetype not in ["image/jpeg", "image/jpg", "image/png", "application/pdf"]:
+        raise MimeTypeError(f"Unsupported file type '{input_file.file_mimetype}'. Currently supported types are '.png',"
+                            f" '.jpg' and '.pdf'.")
+    if input_file.is_pdf():
+        # A PDF needs no conversion; asking for its image size would only raise, as it is not an image.
+        return pdfium.PdfDocument(input_file.file_object)
+
+    # get_image_size returns a (width, height) pair, which has to be unpacked into the two size arguments.
+    width, height = get_image_size(input_file.file_object)
+    # NOTE(review): attach_bitmap_as_new_page is annotated as taking a pdfium.PdfBitmap, but file_object
+    # is the raw input stream - confirm whether the image must be decoded into a bitmap first.
+    return attach_bitmap_as_new_page(pdfium.PdfDocument.new(), input_file.file_object, width, height)
diff --git a/mindee/input/local_response.py b/mindee/input/local_response.py
@@ -98,7 +98,7 @@ def is_valid_hmac_signature(
         Checks if the hmac signature of the local response is valid.
 
         :param secret_key: Secret key, given as a string.
-        :param signature:
+        :param signature: HMAC signature, given as a string.
         :return: True if the HMAC signature is valid.
         """
         return signature == self.get_hmac_signature(secret_key)
diff --git a/mindee/parsing/standard/locale.py b/mindee/parsing/standard/locale.py
@@ -27,7 +27,7 @@ def __init__(
         :param reconstructed: Bool for reconstructed object (not extracted in the API)
         :param page_id: Page number for multi-page document
         """
-        value_key = "value" if "value" in raw_prediction else "language"
+        value_key = "value" if ("value" in raw_prediction and raw_prediction["value"]) else "language"
 
         super().__init__(
             raw_prediction,
diff --git a/tests/data b/tests/data
@@ -1 +1 @@
-Subproject commit abe2a996f71ca3242693af0439f3bf96b4ce7781
+Subproject commit 6b2f85639465c878e70a59337394a8adf14a0b16
diff --git a/tests/image_extraction/__init__.py b/tests/image_extraction/__init__.py
diff --git a/tests/image_extraction/test_image_extractor.py b/tests/image_extraction/test_image_extractor.py
@@ -0,0 +1,43 @@
+from io import BytesIO
+
+import pytest
+
+from mindee.error import MimeTypeError
+from mindee.image_extraction.common import get_image_size
+from tests.test_inputs import FILE_TYPES_DIR
+
+
+@pytest.fixture
+def jpg_file_path():
+    """Path to the `receipt.jpg` sample in the file types directory."""
+    return FILE_TYPES_DIR / "receipt.jpg"
+
+
+@pytest.fixture
+def txt_file_path():
+    """Path to the `receipt.txt` sample in the file types directory."""
+    return FILE_TYPES_DIR / "receipt.txt"
+
+
+@pytest.fixture
+def png_file_path():
+    """Path to the `receipt.png` sample in the file types directory."""
+    return FILE_TYPES_DIR / "receipt.png"
+
+
+def test_get_image_size_jpg(jpg_file_path):
+    """The size of the JPEG sample is read from its header."""
+    with open(jpg_file_path, "rb") as f:
+        jpg_file = BytesIO(f.read())
+    # get_image_size returns the width first, then the height.
+    jpg_width, jpg_height = get_image_size(jpg_file)
+    assert jpg_width == 800
+    assert jpg_height == 1066
+
+
+def test_get_image_size_png(png_file_path):
+    """The size of the PNG sample is read from its header."""
+    with open(png_file_path, "rb") as f:
+        png_file = BytesIO(f.read())
+    # get_image_size returns the width first, then the height.
+    png_width, png_height = get_image_size(png_file)
+    assert png_width == 800
+    assert png_height == 1066
+
+
+def test_get_image_size_with_invalid_mime(txt_file_path):
+    """Reading the size of the text sample raises a MimeTypeError."""
+    with open(txt_file_path, "rb") as handle:
+        payload = handle.read()
+    txt_file = BytesIO(payload)
+
+    with pytest.raises(MimeTypeError):
+        get_image_size(txt_file)

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+from mindee.image_extraction.common.extracted_image import ExtractedImage`
	`2`	`+from mindee.image_extraction.common.image_extractor import extract_from_page, attach_bitmap_as_new_page, get_image_size`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import extract_receipts_from_page`
	`2`	`+from mindee.image_extraction.multi_receipts_extractor.extracted_mult_receipt_image import ExtractedMultiReceiptImage`