Skip to content

Commit 2e0fe16

Browse files
✨ add support for image and pdf compression (#297)
1 parent 349502c commit 2e0fe16

35 files changed

+906
-48
lines changed

examples/auto_invoice_splitter_extraction_example.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from mindee import Client
22
from mindee.extraction.pdf_extractor import PdfExtractor
3-
from mindee.input.sources import PathInput
3+
from mindee.input.sources.path_input import PathInput
44
from mindee.product.invoice.invoice_v4 import InvoiceV4
55
from mindee.product.invoice_splitter.invoice_splitter_v1 import InvoiceSplitterV1
66

mindee/client.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,12 @@
77
from mindee.input import WorkflowOptions
88
from mindee.input.local_response import LocalResponse
99
from mindee.input.page_options import PageOptions
10-
from mindee.input.sources import (
11-
Base64Input,
12-
BytesInput,
13-
FileInput,
14-
LocalInputSource,
15-
PathInput,
16-
UrlInputSource,
17-
)
10+
from mindee.input.sources.base_64_input import Base64Input
11+
from mindee.input.sources.bytes_input import BytesInput
12+
from mindee.input.sources.file_input import FileInput
13+
from mindee.input.sources.local_input_source import LocalInputSource
14+
from mindee.input.sources.path_input import PathInput
15+
from mindee.input.sources.url_input_source import UrlInputSource
1816
from mindee.logger import logger
1917
from mindee.mindee_http.endpoint import CustomEndpoint, Endpoint
2018
from mindee.mindee_http.mindee_api import MindeeApi

mindee/error/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,5 @@
77
MindeeHTTPServerError,
88
handle_error,
99
)
10+
from mindee.error.mindee_image_error import MindeeImageError
11+
from mindee.error.mindee_pdf_error import MindeePDFError

mindee/error/mindee_image_error.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
class MindeeImageError(RuntimeError):
    """
    Error type for failures that occur during image operations.

    Subclasses ``RuntimeError`` and adds no behavior of its own.
    """

mindee/error/mindee_pdf_error.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
class MindeePDFError(RuntimeError):
    """
    Error type for failures that occur during PDF operations.

    Subclasses ``RuntimeError`` and adds no behavior of its own.
    """

mindee/extraction/common/extracted_image.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
from PIL import Image
66

77
from mindee.error.mindee_error import MindeeError
8-
from mindee.input.sources import FileInput, LocalInputSource
8+
from mindee.input.sources.file_input import FileInput
9+
from mindee.input.sources.local_input_source import LocalInputSource
910
from mindee.logger import logger
1011

1112

mindee/extraction/common/image_extractor.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@
88
from mindee.extraction.common.extracted_image import ExtractedImage
99
from mindee.geometry.point import Point
1010
from mindee.geometry.polygon import get_min_max_x, get_min_max_y
11-
from mindee.input.sources import BytesInput, LocalInputSource
11+
from mindee.input.sources.bytes_input import BytesInput
12+
from mindee.input.sources.local_input_source import LocalInputSource
1213

1314

1415
def attach_image_as_new_file( # type: ignore
@@ -158,6 +159,6 @@ def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: i
158159
"""
159160
if input_file.is_pdf():
160161
input_file.file_object.seek(0)
161-
return pdfium.PdfDocument(input_file.file_object)
162+
return pdfium.PdfDocument(input_file.file_object.read())
162163

163164
return attach_image_as_new_file(input_file.file_object)

mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from mindee.extraction.common.image_extractor import (
66
extract_multiple_images_from_source,
77
)
8-
from mindee.input.sources import LocalInputSource
8+
from mindee.input.sources.local_input_source import LocalInputSource
99
from mindee.parsing.common.inference import Inference
1010

1111

mindee/extraction/pdf_extractor/extracted_pdf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import pypdfium2 as pdfium
55

66
from mindee.error.mindee_error import MindeeError
7-
from mindee.input.sources import BytesInput
7+
from mindee.input.sources.bytes_input import BytesInput
88

99

1010
class ExtractedPdf:

mindee/extraction/pdf_extractor/pdf_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from mindee.error.mindee_error import MindeeError
99
from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf
10-
from mindee.input.sources import LocalInputSource
10+
from mindee.input.sources.local_input_source import LocalInputSource
1111
from mindee.product.invoice_splitter.invoice_splitter_v1_page_group import (
1212
InvoiceSplitterV1PageGroup,
1313
)

mindee/image_operations/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from mindee.image_operations.image_compressor import compress_image
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import io
2+
from typing import BinaryIO, Union
3+
4+
from PIL import Image
5+
6+
7+
def compress_image(
8+
image_buffer: Union[BinaryIO, bytes],
9+
quality: int = 85,
10+
max_width: Union[int, float, None] = None,
11+
max_height: Union[int, float, None] = None,
12+
) -> bytes:
13+
"""
14+
Compresses an image with the given parameters.
15+
16+
:param image_buffer: Buffer representation of an image, also accepts BinaryIO.
17+
:param quality: Quality to apply to the image (JPEG compression).
18+
:param max_width: Maximum bound for the width.
19+
:param max_height: Maximum bound for the height.
20+
:return:
21+
"""
22+
if isinstance(image_buffer, bytes):
23+
image_buffer = io.BytesIO(image_buffer)
24+
with Image.open(image_buffer) as img:
25+
original_width, original_height = img.size
26+
max_width = max_width or original_width
27+
max_height = max_height or original_height
28+
if max_width or max_height:
29+
img.thumbnail((int(max_width), int(max_height)), Image.Resampling.LANCZOS)
30+
31+
output_buffer = io.BytesIO()
32+
img.save(output_buffer, format="JPEG", quality=quality, optimize=True)
33+
34+
compressed_image = output_buffer.getvalue()
35+
return compressed_image

mindee/input/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33
from mindee.input.sources.base_64_input import Base64Input
44
from mindee.input.sources.bytes_input import BytesInput
55
from mindee.input.sources.file_input import FileInput
6-
from mindee.input.sources.local_input_source import InputType, LocalInputSource
6+
from mindee.input.sources.input_type import InputType
7+
from mindee.input.sources.local_input_source import LocalInputSource
78
from mindee.input.sources.path_input import PathInput
89
from mindee.input.sources.url_input_source import UrlInputSource
910
from mindee.input.workflow_options import WorkflowOptions

mindee/input/sources/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from mindee.input.sources.base_64_input import Base64Input
22
from mindee.input.sources.bytes_input import BytesInput
33
from mindee.input.sources.file_input import FileInput
4-
from mindee.input.sources.local_input_source import InputType, LocalInputSource
4+
from mindee.input.sources.input_type import InputType
5+
from mindee.input.sources.local_input_source import LocalInputSource
56
from mindee.input.sources.path_input import PathInput
67
from mindee.input.sources.url_input_source import UrlInputSource

mindee/input/sources/base_64_input.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
import base64
22
import io
33

4-
from mindee.input.sources.local_input_source import InputType, LocalInputSource
4+
from mindee.input.sources.input_type import InputType
5+
from mindee.input.sources.local_input_source import LocalInputSource
56

67

78
class Base64Input(LocalInputSource):

mindee/input/sources/bytes_input.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import io
22

3-
from mindee.input.sources.local_input_source import InputType, LocalInputSource
3+
from mindee.input.sources.input_type import InputType
4+
from mindee.input.sources.local_input_source import LocalInputSource
45

56

67
class BytesInput(LocalInputSource):

mindee/input/sources/file_input.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
import os
22
from typing import BinaryIO
33

4-
from mindee.input.sources.local_input_source import InputType, LocalInputSource
4+
from mindee.input.sources.input_type import InputType
5+
from mindee.input.sources.local_input_source import LocalInputSource
56

67

78
class FileInput(LocalInputSource):

mindee/input/sources/input_type.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
from enum import Enum
2+
3+
4+
class InputType(Enum):
    """Kinds of input a source can be built from (internal use only)."""

    # Each member's value is the lowercase string form of its name.
    FILE = "file"
    BASE64 = "base64"
    BYTES = "bytes"
    PATH = "path"
    URL = "url"

mindee/input/sources/local_input_source.py

Lines changed: 44 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
11
import io
22
import mimetypes
33
import tempfile
4-
from enum import Enum
54
from typing import BinaryIO, Optional, Sequence, Tuple
65

76
import pypdfium2 as pdfium
87

98
from mindee.error.mimetype_error import MimeTypeError
109
from mindee.error.mindee_error import MindeeError, MindeeSourceError
10+
from mindee.image_operations.image_compressor import compress_image
1111
from mindee.input.page_options import KEEP_ONLY, REMOVE
12+
from mindee.input.sources.input_type import InputType
1213
from mindee.logger import logger
14+
from mindee.pdf.pdf_compressor import compress_pdf
15+
from mindee.pdf.pdf_utils import has_source_text
1316

1417
mimetypes.add_type("image/heic", ".heic")
1518
mimetypes.add_type("image/heic", ".heif")
@@ -25,16 +28,6 @@
2528
]
2629

2730

28-
class InputType(Enum):
29-
"""The input type, for internal use."""
30-
31-
FILE = "file"
32-
BASE64 = "base64"
33-
BYTES = "bytes"
34-
PATH = "path"
35-
URL = "url"
36-
37-
3831
class LocalInputSource:
3932
"""Base class for all input sources coming from the local machine."""
4033

@@ -202,3 +195,43 @@ def read_contents(self, close_file: bool) -> Tuple[str, bytes]:
202195
def close(self) -> None:
203196
"""Close the file object."""
204197
self.file_object.close()
198+
199+
def has_source_text(self) -> bool:
    """
    If the file is a PDF, checks if it has source text.

    The underlying stream is rewound before it is read, so the whole file is
    inspected even if it was read earlier, and rewound again afterwards so the
    next reader does not start at end-of-file.

    :return: True if the file is a PDF and has source text. False otherwise.
    """
    if not self.is_pdf():
        return False

    self.file_object.seek(0)
    try:
        pdf_bytes = self.file_object.read()
    finally:
        self.file_object.seek(0)
    # Inside a method body this name resolves to the module-level helper,
    # not to this method.
    return has_source_text(pdf_bytes)
208+
209+
def compress(
    self,
    quality: int = 85,
    max_width: Optional[int] = None,
    max_height: Optional[int] = None,
    force_source_text: bool = False,
    disable_source_text: bool = True,
) -> None:
    """
    Compresses the file object, either as a PDF or an image.

    The result replaces ``self.file_object`` with a new in-memory buffer; the
    previous file object is neither closed nor otherwise modified here.

    :param quality: Quality of the compression. For images, this is the JPEG quality.
        For PDFs, this affects image quality within the PDF.
    :param max_width: Maximum width for image resizing. Ignored for PDFs.
    :param max_height: Maximum height for image resizing. Ignored for PDFs.
    :param force_source_text: For PDFs, whether to force compression even if source text is present.
    :param disable_source_text: For PDFs, whether to disable source text during compression.
    """
    is_pdf = self.is_pdf()
    # Rewind so the compressor receives the file from its first byte even if
    # the stream was read earlier.
    self.file_object.seek(0)

    new_file_bytes: bytes
    if is_pdf:
        new_file_bytes = compress_pdf(
            self.file_object, quality, force_source_text, disable_source_text
        )
    else:
        # NOTE(review): compress_image always emits JPEG data; only the file
        # object is updated here - confirm whether the stored file name / MIME
        # type should follow.
        new_file_bytes = compress_image(
            self.file_object, quality, max_width, max_height
        )

    self.file_object = io.BytesIO(new_file_bytes)

mindee/input/sources/path_input.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
from pathlib import Path
33
from typing import Union
44

5-
from mindee.input.sources.local_input_source import InputType, LocalInputSource
5+
from mindee.input.sources.input_type import InputType
6+
from mindee.input.sources.local_input_source import LocalInputSource
67

78

89
class PathInput(LocalInputSource):

mindee/input/sources/url_input_source.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
from mindee.error.mindee_error import MindeeSourceError
1212
from mindee.input.sources.bytes_input import BytesInput
13-
from mindee.input.sources.local_input_source import InputType
13+
from mindee.input.sources.input_type import InputType
1414
from mindee.logger import logger
1515

1616

mindee/mindee_http/endpoint.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
import requests
55
from requests import Response
66

7-
from mindee.input.sources import LocalInputSource, UrlInputSource
7+
from mindee.input.sources.local_input_source import LocalInputSource
8+
from mindee.input.sources.url_input_source import UrlInputSource
89
from mindee.mindee_http.base_endpoint import BaseEndpoint
910
from mindee.mindee_http.mindee_api import MindeeApi
1011
from mindee.parsing.common.string_dict import StringDict

mindee/pdf/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
from mindee.pdf.pdf_char_data import PDFCharData
2+
from mindee.pdf.pdf_compressor import compress_pdf
3+
from mindee.pdf.pdf_utils import (
4+
extract_text_from_pdf,
5+
has_source_text,
6+
lerp,
7+
)

mindee/pdf/pdf_char_data.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
from dataclasses import dataclass
2+
from typing import Tuple
3+
4+
5+
@dataclass
class PDFCharData:
    """Container for a single character found in a PDF, with its bounds and font details."""

    char: str
    """The character itself."""
    left: int
    """Left bound of the character."""
    right: int
    """Right bound of the character."""
    top: int
    """Top bound of the character."""
    bottom: int
    """Bottom bound of the character."""
    font_name: str
    """Name of the font."""
    font_size: float
    """Size of the font, in pt."""
    font_weight: int
    """Weight of the font."""
    font_flags: int
    """Flags of the font."""
    font_stroke_color: Tuple[int, int, int, int]
    """Stroke color of the font, as an RGBA tuple."""
    font_fill_color: Tuple[int, int, int, int]
    """Fill color of the font, as an RGBA tuple."""
    page_id: int
    """ID of the page on which the character was found."""

0 commit comments

Comments
 (0)