Skip to content

Commit 09dad19

Browse files
add image size check & test
1 parent 0a33334 commit 09dad19

File tree

12 files changed

+257
-3
lines changed

12 files changed

+257
-3
lines changed

mindee/image_extraction/__init__.py

Whitespace-only changes.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
from mindee.image_extraction.common.extracted_image import ExtractedImage
2+
from mindee.image_extraction.common.image_extractor import extract_from_page, attach_bitmap_as_new_page, get_image_size
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
import io
2+
from pathlib import Path
3+
4+
from mindee.error import MindeeError
5+
from mindee.input import FileInput
6+
from mindee.logger import logger
7+
8+
9+
class ExtractedImage:
    """
    In-memory image extracted from a document.

    The raw bytes are wrapped in an ``io.BytesIO`` stored in ``buffer``; the name given at construction time is kept
    in ``internal_file_name``.
    """

    def __init__(self, buffer: bytes, file_name: str):
        """
        Initialize the ExtractedImage with a buffer and an internal file name.

        :param buffer: The byte buffer representing the image.
        :param file_name: The internal file name of the image.
        """
        self.buffer = io.BytesIO(buffer)
        self.internal_file_name = file_name

    def save_to_file(self, output_path: str):
        """
        Saves the image to a file.

        The full content of the buffer is written regardless of its current read position, so the method can be
        called several times (or after the buffer was read elsewhere) and always produces the complete file.

        :param output_path: Path to save the file to.
        :raises MindeeError: If an invalid path or filename is provided.
        """
        try:
            resolved_path = Path(output_path).resolve()
            with open(resolved_path, 'wb') as f:
                # getvalue() ignores the stream cursor, unlike read() which returns b"" once the buffer is consumed.
                f.write(self.buffer.getvalue())
            logger.info(f"File saved successfully to {resolved_path}.")
        except TypeError as exc:
            # Path() raises TypeError for non path-like values (e.g. None); any other error propagates untouched.
            raise MindeeError("Invalid path/filename provided.") from exc

    def as_source(self) -> FileInput:
        """
        Return the image as a Mindee-compatible FileInput source.

        The buffer is rewound first so the returned source starts at the beginning of the image data.

        NOTE(review): FileInput is handed a nameless ``io.BytesIO`` here - confirm that FileInput accepts it, or
        whether a bytes-based input source carrying ``internal_file_name`` was intended.

        :returns: A FileInput source wrapping the internal buffer.
        """
        self.buffer.seek(0)
        return FileInput(self.buffer)
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import io
2+
from typing import List, BinaryIO, Tuple
3+
4+
import pypdfium2 as pdfium
5+
6+
from mindee.error import MimeTypeError
7+
from mindee.geometry import get_min_max_x, get_min_max_y, Polygon
8+
9+
import struct
10+
11+
12+
def get_image_size(data: BinaryIO) -> Tuple[int, int]:
    """
    Read the header of a PNG or JPEG stream and extract the image dimensions.

    The stream is rewound before reading. When the size cannot be determined (unknown signature, truncated or
    malformed header), the stream is closed and a MimeTypeError is raised.

    :param data: Image input, as a seekable binary stream.
    :return: A tuple containing the image's (width, height), in that order.
    :raises MimeTypeError: If the size could not be retrieved from the stream.
    """
    png_signature = b'\x89PNG\r\n\x1a\n'
    jpeg_soi = b'\xff\xd8'
    # Every Start-Of-Frame marker carries the frame dimensions. 0xFFC4 (DHT), 0xFFC8 (JPG) and 0xFFCC (DAC) share
    # the 0xFFC0-0xFFCF range but are not frame headers.
    sof_markers = frozenset(range(0xFFC0, 0xFFD0)) - {0xFFC4, 0xFFC8, 0xFFCC}

    data.seek(0)
    signature = data.read(8)

    if signature == png_signature:
        # IHDR is the first chunk: 8 (signature) + 4 (chunk length) + 4 (chunk type) = offset 16.
        data.seek(16)
        dimensions = data.read(8)
        if len(dimensions) == 8:
            width, height = struct.unpack('>II', dimensions)
            return width, height

    elif signature[:2] == jpeg_soi:
        data.seek(2)
        while True:
            segment_header = data.read(4)
            if len(segment_header) < 4:
                break  # Ran out of data before any frame header was found.
            marker, length = struct.unpack('>HH', segment_header)
            if marker >> 8 != 0xFF:
                break  # Not positioned on a marker: the stream is not a well-formed JPEG.
            if marker in sof_markers:
                # Frame header payload starts with: precision (1 byte), height (2 bytes), width (2 bytes).
                frame_header = data.read(5)
                if len(frame_header) < 5:
                    break
                _, height, width = struct.unpack('>BHH', frame_header)
                return width, height
            if length < 2:
                break  # The length includes its own two bytes; anything smaller would seek backwards forever.
            data.seek(length - 2, 1)

    data.close()
    raise MimeTypeError("Size could not be retrieved for file.")
43+
44+
45+
def attach_bitmap_as_new_page(pdf_doc: pdfium.PdfDocument, bitmap: pdfium.PdfBitmap, new_width: float,
                              new_height: float) -> pdfium.PdfDocument:
    """
    Attaches a created PdfBitmap object as a new page in a PdfDocument object.

    :param pdf_doc: The PdfDocument to which the new page will be added.
    :param bitmap: The PdfBitmap object to be added as a new page.
    :param new_width: The width of the new page.
    :param new_height: The height of the new page.
    :return: The same PdfDocument, with the new page appended.
    """
    # Create a new page in the PdfDocument
    new_page = pdf_doc.new_page(new_width, new_height)

    # A page only accepts page objects, so the bitmap is wrapped in an image object owned by the document.
    image = pdfium.PdfImage.new(pdf_doc)
    image.set_bitmap(bitmap)
    # Image objects are 1x1 units by default: scale the image so that it covers the whole page.
    image.set_matrix(pdfium.PdfMatrix().scale(new_width, new_height))

    new_page.insert_obj(image)
    # Inserted objects are only written to the page's content stream once gen_content() is called.
    new_page.gen_content()

    return pdf_doc
62+
63+
64+
def extract_from_page(pdf_page: pdfium.PdfPage, polygons: List[Polygon]) -> List[bytes]:
    """
    Extracts elements from a page based on a list of bounding boxes.

    Each polygon is reduced to its axis-aligned bounding box, the page is rendered with everything outside that box
    cropped away, and the resulting bitmap is stored as the single page of a new PDF.

    NOTE(review): assumes polygon coordinates are relative to the page size (0..1) with the origin at the top-left
    corner - confirm against mindee.geometry.

    :param pdf_page: Single PDF Page.
    :param polygons: List of coordinates to pull the elements from.
    :return: List of byte strings, each one holding the serialized PDF of an extracted element.
    """
    width, height = pdf_page.get_size()

    extracted_elements = []

    for polygon in polygons:
        min_max_x = get_min_max_x(polygon)
        min_max_y = get_min_max_y(polygon)

        new_width = width * (min_max_x.max - min_max_x.min)
        new_height = height * (min_max_y.max - min_max_y.min)

        # render() expects the amount to cut off from each page border, not the coordinates of the kept area.
        crop_left = min_max_x.min * width
        crop_right = width - (min_max_x.max * width)
        crop_top = min_max_y.min * height
        crop_bottom = height - (min_max_y.max * height)

        cropped_page: pdfium.PdfBitmap = pdf_page.render(crop=(crop_left, crop_bottom, crop_right, crop_top))

        temp_pdf = attach_bitmap_as_new_page(pdfium.PdfDocument.new(), cropped_page, new_width, new_height)

        with io.BytesIO() as temp_file:
            temp_pdf.save(temp_file)
            # After save() the cursor sits at the end of the stream, so read() would return b"";
            # getvalue() returns the whole content regardless of the cursor position.
            extracted_elements.append(temp_file.getvalue())

    return extracted_elements
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
from mindee.image_extraction.multi_receipts_extractor.mult_receipts_extractor import extract_receipts_from_page
2+
from mindee.image_extraction.multi_receipts_extractor.extracted_mult_receipt_image import ExtractedMultiReceiptImage
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from mindee.image_extraction.common import ExtractedImage
2+
3+
4+
class ExtractedMultiReceiptImage(ExtractedImage):
    """
    Receipt extracted from a page containing several receipts.

    The internal file name is built as ``receipt_p{page_id}_{receipt_id}.pdf``.
    """

    # Index of the receipt, exposed read-only through ``receipt_id``.
    _receipt_id: int
    # Index of the page the receipt comes from, exposed read-only through ``page_id``.
    _page_id: int

    def __init__(self, buffer, receipt_id: int, page_id: int):
        """
        Initialize the extracted receipt.

        :param buffer: The byte buffer representing the extracted receipt.
        :param receipt_id: ID of the receipt.
        :param page_id: ID of the page the receipt was extracted from.
        """
        super().__init__(buffer, f"receipt_p{page_id}_{receipt_id}.pdf")
        self._receipt_id = receipt_id
        self._page_id = page_id

    @property
    def receipt_id(self) -> int:
        """ID of the receipt."""
        return self._receipt_id

    @property
    def page_id(self) -> int:
        """ID of the page the receipt was extracted from."""
        # Must read the private attribute: returning ``self.page_id`` would call this property again, endlessly.
        return self._page_id
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
from typing import List
2+
3+
import pypdfium2 as pdfium
4+
5+
from mindee.error import MimeTypeError
6+
from mindee.geometry import Polygon
7+
from mindee.image_extraction.common.image_extractor import extract_from_page, attach_bitmap_as_new_page, get_image_size
8+
from mindee.image_extraction.multi_receipts_extractor import ExtractedMultiReceiptImage
9+
from mindee.input import LocalInputSource
10+
11+
12+
def extract_receipts_from_page(pdf_page: pdfium.PdfPage, bounding_boxes: List[Polygon], page_id: int) \
        -> List[ExtractedMultiReceiptImage]:
    """
    Given a page and a set of coordinates, extracts & assigns individual receipts to an ExtractedMultiReceiptImage
    object.

    Receipts are numbered from 0, following the order of ``bounding_boxes``.

    :param pdf_page: PDF Page to extract from.
    :param bounding_boxes: A set of coordinates delimiting the position of each receipt.
    :param page_id: ID of the page the receipt is extracted from. Caution: this starts at 0, unlike the numbering in PDF
        pages.
    :return: A list of ExtractedMultiReceiptImage.
    """
    extracted_receipts_raw = extract_from_page(pdf_page, bounding_boxes)
    # Keyword arguments make it impossible to mix up the two integer IDs, which sit side by side in the constructor.
    return [
        ExtractedMultiReceiptImage(raw_receipt, receipt_id=receipt_id, page_id=page_id)
        for receipt_id, raw_receipt in enumerate(extracted_receipts_raw)
    ]
29+
30+
31+
def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument:
    """
    Loads a PDF document from a local input source.

    PDF inputs are opened as they are. Image inputs (JPEG/PNG) are attached as the single page of a newly created
    document, the page taking the size of the image.

    :param input_file: Local input.
    :return: A valid PdfDocument handle.
    :raises MimeTypeError: If the file type of the input is not supported.
    """
    if input_file.file_mimetype not in ["image/jpeg", "image/jpg", "image/png", "application/pdf"]:
        raise MimeTypeError(f"Unsupported file type '{input_file.file_mimetype}'. Currently supported types are '.png',"
                            f" '.jpg' and '.pdf'.")
    if input_file.is_pdf():
        # Already a PDF: hand the document back untouched, nothing has to be attached to it.
        return pdfium.PdfDocument(input_file.file_object)

    # get_image_size() returns a (width, height) tuple, while the helper below takes two separate arguments.
    width, height = get_image_size(input_file.file_object)
    # NOTE(review): attach_bitmap_as_new_page() is annotated as taking a pdfium.PdfBitmap, but the raw file object
    # is passed here - confirm whether the image must be decoded into a bitmap first.
    return attach_bitmap_as_new_page(pdfium.PdfDocument.new(), input_file.file_object, width, height)

mindee/input/local_response.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def is_valid_hmac_signature(
9898
Checks if the hmac signature of the local response is valid.
9999
100100
:param secret_key: Secret key, given as a string.
101-
:param signature:
101+
:param signature: HMAC signature, given as a string.
102102
:return: True if the HMAC signature is valid.
103103
"""
104104
return signature == self.get_hmac_signature(secret_key)

mindee/parsing/standard/locale.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def __init__(
2727
:param reconstructed: Bool for reconstructed object (not extracted in the API)
2828
:param page_id: Page number for multi-page document
2929
"""
30-
value_key = "value" if "value" in raw_prediction else "language"
30+
value_key = "value" if ("value" in raw_prediction and raw_prediction["value"]) else "language"
3131

3232
super().__init__(
3333
raw_prediction,

tests/image_extraction/__init__.py

Whitespace-only changes.
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
from io import BytesIO
2+
3+
import pytest
4+
5+
from mindee.error import MimeTypeError
6+
from mindee.image_extraction.common import get_image_size
7+
from tests.test_inputs import FILE_TYPES_DIR
8+
9+
10+
@pytest.fixture
def jpg_file_path():
    """Path of the ``receipt.jpg`` sample located under FILE_TYPES_DIR."""
    sample_name = "receipt.jpg"
    return FILE_TYPES_DIR / sample_name
13+
14+
@pytest.fixture
def txt_file_path():
    """Path of the ``receipt.txt`` sample located under FILE_TYPES_DIR."""
    sample_name = "receipt.txt"
    return FILE_TYPES_DIR / sample_name
17+
@pytest.fixture
def png_file_path():
    """Path of the ``receipt.png`` sample located under FILE_TYPES_DIR."""
    sample_name = "receipt.png"
    return FILE_TYPES_DIR / sample_name
20+
21+
22+
def test_get_image_size_jpg(jpg_file_path):
    """The JPEG sample must be reported with its expected (width, height)."""
    with open(jpg_file_path, "rb") as f:
        jpg_file = BytesIO(f.read())
    # get_image_size() returns (width, height), in that order.
    jpg_width, jpg_height = get_image_size(jpg_file)
    assert jpg_width == 800
    assert jpg_height == 1066
28+
29+
30+
def test_get_image_size_png(png_file_path):
    """The PNG sample must be reported with its expected (width, height)."""
    with open(png_file_path, "rb") as f:
        png_file = BytesIO(f.read())
    # get_image_size() returns (width, height), in that order.
    png_width, png_height = get_image_size(png_file)
    assert png_width == 800
    assert png_height == 1066
36+
37+
38+
def test_get_image_size_with_invalid_mime(txt_file_path):
    """Feeding the ``.txt`` sample to get_image_size() must raise a MimeTypeError."""
    with open(txt_file_path, "rb") as handle:
        raw_content = handle.read()
    txt_file = BytesIO(raw_content)

    with pytest.raises(MimeTypeError):
        get_image_size(txt_file)

0 commit comments

Comments
 (0)