Skip to content

✨ add support for image and pdf compression #297

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/auto_invoice_splitter_extraction_example.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from mindee import Client
from mindee.extraction.pdf_extractor import PdfExtractor
from mindee.input.sources import PathInput
from mindee.input.sources.path_input import PathInput
from mindee.product.invoice.invoice_v4 import InvoiceV4
from mindee.product.invoice_splitter.invoice_splitter_v1 import InvoiceSplitterV1

Expand Down
14 changes: 6 additions & 8 deletions mindee/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,12 @@
from mindee.input import WorkflowOptions
from mindee.input.local_response import LocalResponse
from mindee.input.page_options import PageOptions
from mindee.input.sources import (
Base64Input,
BytesInput,
FileInput,
LocalInputSource,
PathInput,
UrlInputSource,
)
from mindee.input.sources.base_64_input import Base64Input
from mindee.input.sources.bytes_input import BytesInput
from mindee.input.sources.file_input import FileInput
from mindee.input.sources.local_input_source import LocalInputSource
from mindee.input.sources.path_input import PathInput
from mindee.input.sources.url_input_source import UrlInputSource
from mindee.logger import logger
from mindee.mindee_http.endpoint import CustomEndpoint, Endpoint
from mindee.mindee_http.mindee_api import MindeeApi
Expand Down
2 changes: 2 additions & 0 deletions mindee/error/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@
MindeeHTTPServerError,
handle_error,
)
from mindee.error.mindee_image_error import MindeeImageError
from mindee.error.mindee_pdf_error import MindeePDFError
2 changes: 2 additions & 0 deletions mindee/error/mindee_image_error.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
class MindeeImageError(RuntimeError):
    """Raised when an image operation fails."""
2 changes: 2 additions & 0 deletions mindee/error/mindee_pdf_error.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
class MindeePDFError(RuntimeError):
    """Raised when a PDF operation fails."""
3 changes: 2 additions & 1 deletion mindee/extraction/common/extracted_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
from PIL import Image

from mindee.error.mindee_error import MindeeError
from mindee.input.sources import FileInput, LocalInputSource
from mindee.input.sources.file_input import FileInput
from mindee.input.sources.local_input_source import LocalInputSource
from mindee.logger import logger


Expand Down
5 changes: 3 additions & 2 deletions mindee/extraction/common/image_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
from mindee.extraction.common.extracted_image import ExtractedImage
from mindee.geometry.point import Point
from mindee.geometry.polygon import get_min_max_x, get_min_max_y
from mindee.input.sources import BytesInput, LocalInputSource
from mindee.input.sources.bytes_input import BytesInput
from mindee.input.sources.local_input_source import LocalInputSource


def attach_image_as_new_file( # type: ignore
Expand Down Expand Up @@ -158,6 +159,6 @@ def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: i
"""
if input_file.is_pdf():
input_file.file_object.seek(0)
return pdfium.PdfDocument(input_file.file_object)
return pdfium.PdfDocument(input_file.file_object.read())

return attach_image_as_new_file(input_file.file_object)
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from mindee.extraction.common.image_extractor import (
extract_multiple_images_from_source,
)
from mindee.input.sources import LocalInputSource
from mindee.input.sources.local_input_source import LocalInputSource
from mindee.parsing.common.inference import Inference


Expand Down
2 changes: 1 addition & 1 deletion mindee/extraction/pdf_extractor/extracted_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pypdfium2 as pdfium

from mindee.error.mindee_error import MindeeError
from mindee.input.sources import BytesInput
from mindee.input.sources.bytes_input import BytesInput


class ExtractedPdf:
Expand Down
2 changes: 1 addition & 1 deletion mindee/extraction/pdf_extractor/pdf_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from mindee.error.mindee_error import MindeeError
from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf
from mindee.input.sources import LocalInputSource
from mindee.input.sources.local_input_source import LocalInputSource
from mindee.product.invoice_splitter.invoice_splitter_v1_page_group import (
InvoiceSplitterV1PageGroup,
)
Expand Down
1 change: 1 addition & 0 deletions mindee/image_operations/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from mindee.image_operations.image_compressor import compress_image
35 changes: 35 additions & 0 deletions mindee/image_operations/image_compressor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import io
from typing import BinaryIO, Union

from PIL import Image


def compress_image(
    image_buffer: Union[BinaryIO, bytes],
    quality: int = 85,
    max_width: Union[int, float, None] = None,
    max_height: Union[int, float, None] = None,
) -> bytes:
    """
    Compresses an image with the given parameters.

    The result is always JPEG-encoded. Images whose pixel mode cannot be written
    as JPEG (alpha channel, palette, ...) are converted first; transparent areas
    are flattened onto a white background.

    :param image_buffer: Buffer representation of an image, also accepts BinaryIO.
    :param quality: Quality to apply to the image (JPEG compression).
    :param max_width: Maximum bound for the width. Falsy values mean "no bound".
    :param max_height: Maximum bound for the height. Falsy values mean "no bound".
    :return: The JPEG-encoded bytes of the compressed image.
    """
    if isinstance(image_buffer, (bytes, bytearray)):
        image_buffer = io.BytesIO(image_buffer)

    with Image.open(image_buffer) as source_image:
        img = source_image

        # JPEG has no alpha channel: saving RGBA/LA/P images directly raises an
        # OSError, so make the image JPEG-compatible before anything else.
        has_alpha = img.mode in ("RGBA", "LA", "PA") or (
            img.mode == "P" and "transparency" in img.info
        )
        if has_alpha:
            rgba = img.convert("RGBA")
            flattened = Image.new("RGB", rgba.size, (255, 255, 255))
            flattened.paste(rgba, mask=rgba.getchannel("A"))
            img = flattened
        elif img.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
            img = img.convert("RGB")

        # Only resize when the caller actually supplied a bound; a missing bound
        # falls back to the current dimension so the aspect ratio is preserved.
        if max_width or max_height:
            current_width, current_height = img.size
            bounds = (
                int(max_width or current_width),
                int(max_height or current_height),
            )
            img.thumbnail(bounds, Image.Resampling.LANCZOS)

        output_buffer = io.BytesIO()
        img.save(output_buffer, format="JPEG", quality=quality, optimize=True)
        return output_buffer.getvalue()
3 changes: 2 additions & 1 deletion mindee/input/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
from mindee.input.sources.base_64_input import Base64Input
from mindee.input.sources.bytes_input import BytesInput
from mindee.input.sources.file_input import FileInput
from mindee.input.sources.local_input_source import InputType, LocalInputSource
from mindee.input.sources.input_type import InputType
from mindee.input.sources.local_input_source import LocalInputSource
from mindee.input.sources.path_input import PathInput
from mindee.input.sources.url_input_source import UrlInputSource
from mindee.input.workflow_options import WorkflowOptions
3 changes: 2 additions & 1 deletion mindee/input/sources/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from mindee.input.sources.base_64_input import Base64Input
from mindee.input.sources.bytes_input import BytesInput
from mindee.input.sources.file_input import FileInput
from mindee.input.sources.local_input_source import InputType, LocalInputSource
from mindee.input.sources.input_type import InputType
from mindee.input.sources.local_input_source import LocalInputSource
from mindee.input.sources.path_input import PathInput
from mindee.input.sources.url_input_source import UrlInputSource
3 changes: 2 additions & 1 deletion mindee/input/sources/base_64_input.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import base64
import io

from mindee.input.sources.local_input_source import InputType, LocalInputSource
from mindee.input.sources.input_type import InputType
from mindee.input.sources.local_input_source import LocalInputSource


class Base64Input(LocalInputSource):
Expand Down
3 changes: 2 additions & 1 deletion mindee/input/sources/bytes_input.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import io

from mindee.input.sources.local_input_source import InputType, LocalInputSource
from mindee.input.sources.input_type import InputType
from mindee.input.sources.local_input_source import LocalInputSource


class BytesInput(LocalInputSource):
Expand Down
3 changes: 2 additions & 1 deletion mindee/input/sources/file_input.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import os
from typing import BinaryIO

from mindee.input.sources.local_input_source import InputType, LocalInputSource
from mindee.input.sources.input_type import InputType
from mindee.input.sources.local_input_source import LocalInputSource


class FileInput(LocalInputSource):
Expand Down
11 changes: 11 additions & 0 deletions mindee/input/sources/input_type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from enum import Enum


class InputType(Enum):
    """Kind of source an input was built from (internal use only)."""

    # An already-opened file object.
    FILE = "file"
    # A base64-encoded string.
    BASE64 = "base64"
    # Raw bytes.
    BYTES = "bytes"
    # A path on the local filesystem.
    PATH = "path"
    # A remote URL.
    URL = "url"
55 changes: 44 additions & 11 deletions mindee/input/sources/local_input_source.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
import io
import mimetypes
import tempfile
from enum import Enum
from typing import BinaryIO, Optional, Sequence, Tuple

import pypdfium2 as pdfium

from mindee.error.mimetype_error import MimeTypeError
from mindee.error.mindee_error import MindeeError, MindeeSourceError
from mindee.image_operations.image_compressor import compress_image
from mindee.input.page_options import KEEP_ONLY, REMOVE
from mindee.input.sources.input_type import InputType
from mindee.logger import logger
from mindee.pdf.pdf_compressor import compress_pdf
from mindee.pdf.pdf_utils import has_source_text

mimetypes.add_type("image/heic", ".heic")
mimetypes.add_type("image/heic", ".heif")
Expand All @@ -25,16 +28,6 @@
]


class InputType(Enum):
    """Kind of source an input was built from (internal use only)."""

    # An already-opened file object.
    FILE = "file"
    # A base64-encoded string.
    BASE64 = "base64"
    # Raw bytes.
    BYTES = "bytes"
    # A path on the local filesystem.
    PATH = "path"
    # A remote URL.
    URL = "url"


class LocalInputSource:
"""Base class for all input sources coming from the local machine."""

Expand Down Expand Up @@ -202,3 +195,43 @@ def read_contents(self, close_file: bool) -> Tuple[str, bytes]:
def close(self) -> None:
    """Release the underlying file object by closing it."""
    handle = self.file_object
    handle.close()

def has_source_text(self) -> bool:
    """
    If the file is a PDF, checks if it has source text.

    The file object is rewound before and after being read, so the result does
    not depend on earlier reads and later readers still see the whole file.

    :return: True if the file is a PDF and has source text. False otherwise.
    """
    if not self.is_pdf():
        return False
    # A previous read may have left the cursor at EOF, which would hand an
    # empty buffer to the PDF text check.
    self.file_object.seek(0)
    pdf_bytes = self.file_object.read()
    # Restore the cursor so subsequent consumers read from the start.
    self.file_object.seek(0)
    return has_source_text(pdf_bytes)

def compress(
    self,
    quality: int = 85,
    max_width: Optional[int] = None,
    max_height: Optional[int] = None,
    force_source_text: bool = False,
    disable_source_text: bool = True,
) -> None:
    """
    Compress the held file in place, as a PDF or as an image depending on its type.

    The current file object is replaced by an in-memory buffer holding the
    compressed bytes.

    :param quality: Compression quality. JPEG quality for images; for PDFs it
        drives the quality of the images embedded in the document.
    :param max_width: Upper bound for the image width. Not used for PDFs.
    :param max_height: Upper bound for the image height. Not used for PDFs.
    :param force_source_text: PDFs only, compress even when source text is present.
    :param disable_source_text: PDFs only, disable source text during compression.
    """
    if not self.is_pdf():
        compressed = compress_image(self.file_object, quality, max_width, max_height)
    else:
        compressed = compress_pdf(
            self.file_object, quality, force_source_text, disable_source_text
        )

    self.file_object = io.BytesIO(compressed)
3 changes: 2 additions & 1 deletion mindee/input/sources/path_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
from pathlib import Path
from typing import Union

from mindee.input.sources.local_input_source import InputType, LocalInputSource
from mindee.input.sources.input_type import InputType
from mindee.input.sources.local_input_source import LocalInputSource


class PathInput(LocalInputSource):
Expand Down
2 changes: 1 addition & 1 deletion mindee/input/sources/url_input_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from mindee.error.mindee_error import MindeeSourceError
from mindee.input.sources.bytes_input import BytesInput
from mindee.input.sources.local_input_source import InputType
from mindee.input.sources.input_type import InputType
from mindee.logger import logger


Expand Down
3 changes: 2 additions & 1 deletion mindee/mindee_http/endpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import requests
from requests import Response

from mindee.input.sources import LocalInputSource, UrlInputSource
from mindee.input.sources.local_input_source import LocalInputSource
from mindee.input.sources.url_input_source import UrlInputSource
from mindee.mindee_http.base_endpoint import BaseEndpoint
from mindee.mindee_http.mindee_api import MindeeApi
from mindee.parsing.common.string_dict import StringDict
Expand Down
7 changes: 7 additions & 0 deletions mindee/pdf/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from mindee.pdf.pdf_char_data import PDFCharData
from mindee.pdf.pdf_compressor import compress_pdf
from mindee.pdf.pdf_utils import (
extract_text_from_pdf,
has_source_text,
lerp,
)
32 changes: 32 additions & 0 deletions mindee/pdf/pdf_char_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from dataclasses import dataclass
from typing import Tuple


@dataclass
class PDFCharData:
    """Character-level data for a single character found in a PDF."""

    # The character itself.
    char: str
    # Left bound.
    left: int
    # Right bound.
    right: int
    # Top bound.
    top: int
    # Bottom bound.
    bottom: int
    # Name of the font.
    font_name: str
    # Font size, in pt.
    font_size: float
    # Font weight.
    font_weight: int
    # Font flags.
    font_flags: int
    # Stroke color of the font, as an RGBA tuple.
    font_stroke_color: Tuple[int, int, int, int]
    # Fill color of the font, as an RGBA tuple.
    font_fill_color: Tuple[int, int, int, int]
    # ID of the page the character was found on.
    page_id: int
Loading
Loading