diff --git a/examples/auto_invoice_splitter_extraction_example.py b/examples/auto_invoice_splitter_extraction_example.py index 161075d0..a9a2bb5a 100644 --- a/examples/auto_invoice_splitter_extraction_example.py +++ b/examples/auto_invoice_splitter_extraction_example.py @@ -1,6 +1,6 @@ from mindee import Client from mindee.extraction.pdf_extractor import PdfExtractor -from mindee.input.sources import PathInput +from mindee.input.sources.path_input import PathInput from mindee.product.invoice.invoice_v4 import InvoiceV4 from mindee.product.invoice_splitter.invoice_splitter_v1 import InvoiceSplitterV1 diff --git a/mindee/client.py b/mindee/client.py index bebd8560..901b68cb 100644 --- a/mindee/client.py +++ b/mindee/client.py @@ -7,14 +7,12 @@ from mindee.input import WorkflowOptions from mindee.input.local_response import LocalResponse from mindee.input.page_options import PageOptions -from mindee.input.sources import ( - Base64Input, - BytesInput, - FileInput, - LocalInputSource, - PathInput, - UrlInputSource, -) +from mindee.input.sources.base_64_input import Base64Input +from mindee.input.sources.bytes_input import BytesInput +from mindee.input.sources.file_input import FileInput +from mindee.input.sources.local_input_source import LocalInputSource +from mindee.input.sources.path_input import PathInput +from mindee.input.sources.url_input_source import UrlInputSource from mindee.logger import logger from mindee.mindee_http.endpoint import CustomEndpoint, Endpoint from mindee.mindee_http.mindee_api import MindeeApi diff --git a/mindee/error/__init__.py b/mindee/error/__init__.py index e8075e37..c49c3cf3 100644 --- a/mindee/error/__init__.py +++ b/mindee/error/__init__.py @@ -7,3 +7,5 @@ MindeeHTTPServerError, handle_error, ) +from mindee.error.mindee_image_error import MindeeImageError +from mindee.error.mindee_pdf_error import MindeePDFError diff --git a/mindee/error/mindee_image_error.py b/mindee/error/mindee_image_error.py new file mode 100644 index 
00000000..1da0abec --- /dev/null +++ b/mindee/error/mindee_image_error.py @@ -0,0 +1,2 @@ +class MindeeImageError(RuntimeError): + """An exception relating to errors during image operations.""" diff --git a/mindee/error/mindee_pdf_error.py b/mindee/error/mindee_pdf_error.py new file mode 100644 index 00000000..52f0b32f --- /dev/null +++ b/mindee/error/mindee_pdf_error.py @@ -0,0 +1,2 @@ +class MindeePDFError(RuntimeError): + """An exception relating to errors during PDF operations.""" diff --git a/mindee/extraction/common/extracted_image.py b/mindee/extraction/common/extracted_image.py index 3d6b6f22..e4013246 100644 --- a/mindee/extraction/common/extracted_image.py +++ b/mindee/extraction/common/extracted_image.py @@ -5,7 +5,8 @@ from PIL import Image from mindee.error.mindee_error import MindeeError -from mindee.input.sources import FileInput, LocalInputSource +from mindee.input.sources.file_input import FileInput +from mindee.input.sources.local_input_source import LocalInputSource from mindee.logger import logger diff --git a/mindee/extraction/common/image_extractor.py b/mindee/extraction/common/image_extractor.py index 046b312c..5bae6d37 100644 --- a/mindee/extraction/common/image_extractor.py +++ b/mindee/extraction/common/image_extractor.py @@ -8,7 +8,8 @@ from mindee.extraction.common.extracted_image import ExtractedImage from mindee.geometry.point import Point from mindee.geometry.polygon import get_min_max_x, get_min_max_y -from mindee.input.sources import BytesInput, LocalInputSource +from mindee.input.sources.bytes_input import BytesInput +from mindee.input.sources.local_input_source import LocalInputSource def attach_image_as_new_file( # type: ignore @@ -158,6 +159,6 @@ def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: i """ if input_file.is_pdf(): input_file.file_object.seek(0) - return pdfium.PdfDocument(input_file.file_object) + return pdfium.PdfDocument(input_file.file_object.read()) return 
attach_image_as_new_file(input_file.file_object) diff --git a/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.py b/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.py index 89ad63b9..7c31ca93 100644 --- a/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.py +++ b/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.py @@ -5,7 +5,7 @@ from mindee.extraction.common.image_extractor import ( extract_multiple_images_from_source, ) -from mindee.input.sources import LocalInputSource +from mindee.input.sources.local_input_source import LocalInputSource from mindee.parsing.common.inference import Inference diff --git a/mindee/extraction/pdf_extractor/extracted_pdf.py b/mindee/extraction/pdf_extractor/extracted_pdf.py index fd02ce90..0e3dcb8d 100644 --- a/mindee/extraction/pdf_extractor/extracted_pdf.py +++ b/mindee/extraction/pdf_extractor/extracted_pdf.py @@ -4,7 +4,7 @@ import pypdfium2 as pdfium from mindee.error.mindee_error import MindeeError -from mindee.input.sources import BytesInput +from mindee.input.sources.bytes_input import BytesInput class ExtractedPdf: diff --git a/mindee/extraction/pdf_extractor/pdf_extractor.py b/mindee/extraction/pdf_extractor/pdf_extractor.py index 1a2023ca..5d5f2e19 100644 --- a/mindee/extraction/pdf_extractor/pdf_extractor.py +++ b/mindee/extraction/pdf_extractor/pdf_extractor.py @@ -7,7 +7,7 @@ from mindee.error.mindee_error import MindeeError from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf -from mindee.input.sources import LocalInputSource +from mindee.input.sources.local_input_source import LocalInputSource from mindee.product.invoice_splitter.invoice_splitter_v1_page_group import ( InvoiceSplitterV1PageGroup, ) diff --git a/mindee/image_operations/__init__.py b/mindee/image_operations/__init__.py new file mode 100644 index 00000000..f92bd401 --- /dev/null +++ b/mindee/image_operations/__init__.py @@ -0,0 +1 @@ +from 
mindee.image_operations.image_compressor import compress_image
diff --git a/mindee/image_operations/image_compressor.py b/mindee/image_operations/image_compressor.py
new file mode 100644
index 00000000..82b0bf87
--- /dev/null
+++ b/mindee/image_operations/image_compressor.py
@@ -0,0 +1,42 @@
+import io
+from typing import BinaryIO, Union
+
+from PIL import Image
+
+
+def compress_image(
+    image_buffer: Union[BinaryIO, bytes],
+    quality: int = 85,
+    max_width: Union[int, float, None] = None,
+    max_height: Union[int, float, None] = None,
+) -> bytes:
+    """
+    Compresses an image with the given parameters, re-encoding it as a JPEG.
+
+    :param image_buffer: Buffer representation of an image, also accepts BinaryIO.
+    :param quality: Quality to apply to the image (JPEG compression).
+    :param max_width: Maximum bound for the width. Defaults to the image's width.
+    :param max_height: Maximum bound for the height. Defaults to the image's height.
+    :return: The compressed image, as JPEG-encoded bytes.
+    """
+    if isinstance(image_buffer, bytes):
+        image_buffer = io.BytesIO(image_buffer)
+    with Image.open(image_buffer) as img:
+        original_width, original_height = img.size
+        max_width = max_width or original_width
+        max_height = max_height or original_height
+        # An unset bound falls back to the current dimension of the image.
+        if max_width or max_height:
+            img.thumbnail((int(max_width), int(max_height)), Image.Resampling.LANCZOS)
+
+        # JPEG has no alpha channel nor palette: saving e.g. an RGBA or P image
+        # as-is raises an OSError, so such images are flattened to RGB first.
+        if img.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
+            jpeg_source = img.convert("RGB")
+        else:
+            jpeg_source = img
+
+        output_buffer = io.BytesIO()
+        jpeg_source.save(output_buffer, format="JPEG", quality=quality, optimize=True)
+
+    return output_buffer.getvalue()
diff --git a/mindee/input/__init__.py b/mindee/input/__init__.py
index 3c75c072..82624650 100644
--- a/mindee/input/__init__.py
+++ b/mindee/input/__init__.py
@@ -3,7 +3,8 @@
 from mindee.input.sources.base_64_input import Base64Input
 from mindee.input.sources.bytes_input import BytesInput
 from mindee.input.sources.file_input import FileInput
-from mindee.input.sources.local_input_source import InputType, LocalInputSource
+from mindee.input.sources.input_type import InputType
+from mindee.input.sources.local_input_source import LocalInputSource
 from
mindee.input.sources.path_input import PathInput from mindee.input.sources.url_input_source import UrlInputSource from mindee.input.workflow_options import WorkflowOptions diff --git a/mindee/input/sources/__init__.py b/mindee/input/sources/__init__.py index 6f8a51e3..c7d9c22a 100644 --- a/mindee/input/sources/__init__.py +++ b/mindee/input/sources/__init__.py @@ -1,6 +1,7 @@ from mindee.input.sources.base_64_input import Base64Input from mindee.input.sources.bytes_input import BytesInput from mindee.input.sources.file_input import FileInput -from mindee.input.sources.local_input_source import InputType, LocalInputSource +from mindee.input.sources.input_type import InputType +from mindee.input.sources.local_input_source import LocalInputSource from mindee.input.sources.path_input import PathInput from mindee.input.sources.url_input_source import UrlInputSource diff --git a/mindee/input/sources/base_64_input.py b/mindee/input/sources/base_64_input.py index b651bd23..b656255b 100644 --- a/mindee/input/sources/base_64_input.py +++ b/mindee/input/sources/base_64_input.py @@ -1,7 +1,8 @@ import base64 import io -from mindee.input.sources.local_input_source import InputType, LocalInputSource +from mindee.input.sources.input_type import InputType +from mindee.input.sources.local_input_source import LocalInputSource class Base64Input(LocalInputSource): diff --git a/mindee/input/sources/bytes_input.py b/mindee/input/sources/bytes_input.py index 13fbf41d..1f2b63fd 100644 --- a/mindee/input/sources/bytes_input.py +++ b/mindee/input/sources/bytes_input.py @@ -1,6 +1,7 @@ import io -from mindee.input.sources.local_input_source import InputType, LocalInputSource +from mindee.input.sources.input_type import InputType +from mindee.input.sources.local_input_source import LocalInputSource class BytesInput(LocalInputSource): diff --git a/mindee/input/sources/file_input.py b/mindee/input/sources/file_input.py index 561fd754..2623a4f3 100644 --- a/mindee/input/sources/file_input.py +++ 
b/mindee/input/sources/file_input.py
@@ -1,7 +1,8 @@
 import os
 from typing import BinaryIO
 
-from mindee.input.sources.local_input_source import InputType, LocalInputSource
+from mindee.input.sources.input_type import InputType
+from mindee.input.sources.local_input_source import LocalInputSource
 
 
 class FileInput(LocalInputSource):
diff --git a/mindee/input/sources/input_type.py b/mindee/input/sources/input_type.py
new file mode 100644
index 00000000..6daf1131
--- /dev/null
+++ b/mindee/input/sources/input_type.py
@@ -0,0 +1,11 @@
+from enum import Enum
+
+
+class InputType(Enum):
+    """The input type, for internal use."""
+
+    FILE = "file"
+    BASE64 = "base64"
+    BYTES = "bytes"
+    PATH = "path"
+    URL = "url"
diff --git a/mindee/input/sources/local_input_source.py b/mindee/input/sources/local_input_source.py
index ef5bcaf5..9f6f5cc6 100644
--- a/mindee/input/sources/local_input_source.py
+++ b/mindee/input/sources/local_input_source.py
@@ -1,15 +1,18 @@
 import io
 import mimetypes
 import tempfile
-from enum import Enum
 from typing import BinaryIO, Optional, Sequence, Tuple
 
 import pypdfium2 as pdfium
 
 from mindee.error.mimetype_error import MimeTypeError
 from mindee.error.mindee_error import MindeeError, MindeeSourceError
+from mindee.image_operations.image_compressor import compress_image
 from mindee.input.page_options import KEEP_ONLY, REMOVE
+from mindee.input.sources.input_type import InputType
 from mindee.logger import logger
+from mindee.pdf.pdf_compressor import compress_pdf
+from mindee.pdf.pdf_utils import has_source_text
 
 mimetypes.add_type("image/heic", ".heic")
 mimetypes.add_type("image/heic", ".heif")
@@ -25,16 +28,6 @@
 ]
 
 
-class InputType(Enum):
-    """The input type, for internal use."""
-
-    FILE = "file"
-    BASE64 = "base64"
-    BYTES = "bytes"
-    PATH = "path"
-    URL = "url"
-
-
 class LocalInputSource:
     """Base class for all input sources coming from the local machine."""
 
@@ -202,3 +195,52 @@ def read_contents(self, close_file: bool) -> Tuple[str, bytes]:
     def close(self) -> None:
         """Close the file object."""
         self.file_object.close()
+
+    def has_source_text(self) -> bool:
+        """
+        If the file is a PDF, checks if it has source text.
+
+        The stream is read from its start and rewound afterwards.
+
+        :return: True if the file is a PDF and has source text. False otherwise.
+        """
+        if not self.is_pdf():
+            return False
+        self.file_object.seek(0)
+        pdf_bytes = self.file_object.read()
+        self.file_object.seek(0)
+        return has_source_text(pdf_bytes)
+
+    def compress(
+        self,
+        quality: int = 85,
+        max_width: Optional[int] = None,
+        max_height: Optional[int] = None,
+        force_source_text: bool = False,
+        disable_source_text: bool = True,
+    ) -> None:
+        """
+        Compresses the file object, either as a PDF or an image.
+
+        The file object is replaced by an in-memory buffer holding the result.
+
+        :param quality: Quality of the compression. For images, this is the JPEG quality.
+            For PDFs, this affects image quality within the PDF.
+        :param max_width: Maximum width for image resizing. Ignored for PDFs.
+        :param max_height: Maximum height for image resizing. Ignored for PDFs.
+        :param force_source_text: For PDFs, whether to force compression even if source text is present.
+        :param disable_source_text: For PDFs, whether to disable source text during compression.
+        """
+        # Compress the whole file, wherever the stream was left by earlier reads.
+        self.file_object.seek(0)
+        new_file_bytes: bytes
+        if self.is_pdf():
+            new_file_bytes = compress_pdf(
+                self.file_object, quality, force_source_text, disable_source_text
+            )
+        else:
+            new_file_bytes = compress_image(
+                self.file_object, quality, max_width, max_height
+            )
+
+        self.file_object = io.BytesIO(new_file_bytes)
diff --git a/mindee/input/sources/path_input.py b/mindee/input/sources/path_input.py
index 3f9698b4..2e7fc736 100644
--- a/mindee/input/sources/path_input.py
+++ b/mindee/input/sources/path_input.py
@@ -2,7 +2,8 @@
 from pathlib import Path
 from typing import Union
 
-from mindee.input.sources.local_input_source import InputType, LocalInputSource
+from mindee.input.sources.input_type import InputType
+from mindee.input.sources.local_input_source import LocalInputSource
 
 
 class PathInput(LocalInputSource):
diff --git a/mindee/input/sources/url_input_source.py b/mindee/input/sources/url_input_source.py
index 983343e5..0e62573a 100644
--- a/mindee/input/sources/url_input_source.py
+++ b/mindee/input/sources/url_input_source.py
@@ -10,7 +10,7 @@
 
 from mindee.error.mindee_error import MindeeSourceError
 from mindee.input.sources.bytes_input import BytesInput
-from mindee.input.sources.local_input_source import InputType
+from mindee.input.sources.input_type import InputType
 from mindee.logger import logger
 
 
diff --git a/mindee/mindee_http/endpoint.py b/mindee/mindee_http/endpoint.py
index fdbd2ae7..227c1e2f 100644
--- a/mindee/mindee_http/endpoint.py
+++ b/mindee/mindee_http/endpoint.py
@@ -4,7 +4,8 @@
 import requests
 from requests import Response
 
-from mindee.input.sources import LocalInputSource, UrlInputSource
+from mindee.input.sources.local_input_source import LocalInputSource
+from mindee.input.sources.url_input_source import UrlInputSource
 from mindee.mindee_http.base_endpoint import BaseEndpoint
 from mindee.mindee_http.mindee_api import MindeeApi
 from mindee.parsing.common.string_dict import StringDict
diff --git a/mindee/pdf/__init__.py b/mindee/pdf/__init__.py
new file
mode 100644
index 00000000..4c55dad2
--- /dev/null
+++ b/mindee/pdf/__init__.py
@@ -0,0 +1,7 @@
+from mindee.pdf.pdf_char_data import PDFCharData
+from mindee.pdf.pdf_compressor import compress_pdf
+from mindee.pdf.pdf_utils import (
+    extract_text_from_pdf,
+    has_source_text,
+    lerp,
+)
diff --git a/mindee/pdf/pdf_char_data.py b/mindee/pdf/pdf_char_data.py
new file mode 100644
index 00000000..58d46db9
--- /dev/null
+++ b/mindee/pdf/pdf_char_data.py
@@ -0,0 +1,32 @@
+from dataclasses import dataclass
+from typing import Tuple
+
+
+@dataclass
+class PDFCharData:
+    """Data class representing character data."""
+
+    char: str
+    """The character."""
+    left: int
+    """Left bound."""
+    right: int
+    """Right bound."""
+    top: int
+    """Top bound."""
+    bottom: int
+    """Bottom bound."""
+    font_name: str
+    """The font name."""
+    font_size: float
+    """The font size in pt."""
+    font_weight: int
+    """The font weight."""
+    font_flags: int
+    """The font flags."""
+    font_stroke_color: Tuple[int, int, int, int]
+    """RGBA representation of the font's stroke color."""
+    font_fill_color: Tuple[int, int, int, int]
+    """RGBA representation of the font's fill color."""
+    page_id: int
+    """ID of the page the character was found on."""
diff --git a/mindee/pdf/pdf_compressor.py b/mindee/pdf/pdf_compressor.py
new file mode 100644
index 00000000..cfaf4254
--- /dev/null
+++ b/mindee/pdf/pdf_compressor.py
@@ -0,0 +1,230 @@
+import io
+import logging
+from ctypes import POINTER, c_char_p, c_ushort, cast
+from threading import RLock
+from typing import BinaryIO, List, Optional, Tuple, Union
+
+import pypdfium2 as pdfium
+import pypdfium2.raw as pdfium_c
+from PIL import Image
+
+from mindee.image_operations.image_compressor import compress_image
+from mindee.pdf.pdf_char_data import PDFCharData
+from mindee.pdf.pdf_utils import (
+    extract_text_from_pdf,
+    has_source_text,
+    lerp,
+)
+
+logger = logging.getLogger(__name__)
+MIN_QUALITY = 1
+
+
+def compress_pdf(
+    pdf_data: Union[BinaryIO, bytes],
+    image_quality: int = 85,
+    force_source_text_compression: bool = False,
+    disable_source_text: bool = True,
+) -> bytes:
+    """
+    Compresses each page of a provided PDF buffer.
+
+    :param pdf_data: The input PDF, as bytes or as a seekable binary stream.
+    :param image_quality: Compression quality (70-100 for most JPG images).
+    :param force_source_text_compression: If true, compresses even if the PDF has source text.
+    :param disable_source_text: If true, doesn't re-apply source text to the output PDF.
+    :return: Compressed PDF as bytes, or the original bytes if nothing was compressed.
+    """
+    if not isinstance(pdf_data, bytes):
+        # Read the stream from its start, and hand it back rewound to the caller.
+        pdf_data.seek(0)
+        pdf_bytes = pdf_data.read()
+        pdf_data.seek(0)
+    else:
+        pdf_bytes = pdf_data
+
+    if has_source_text(pdf_bytes):
+        if force_source_text_compression:
+            if not disable_source_text:
+                logger.warning("Re-writing PDF source-text is an EXPERIMENTAL feature.")
+            else:
+                logger.warning(
+                    "Source file contains text, but disable_source_text flag "
+                    "is set to true. Resulting file will not contain any embedded text."
+                )
+        else:
+            logger.warning(
+                "Found text inside of the provided PDF file. Compression operation aborted since "
+                "force_source_text_compression is set to false."
+            )
+            return pdf_bytes
+
+    extracted_text = (
+        extract_text_from_pdf(pdf_bytes) if not disable_source_text else None
+    )
+
+    compressed_pages = _compress_pdf_pages(pdf_bytes, image_quality)
+
+    if not compressed_pages:
+        logger.warning(
+            "Could not compress PDF to a smaller size. Returning original PDF."
+        )
+        return pdf_bytes
+
+    out_pdf = _collect_images_as_pdf(
+        [compressed_page_image[0] for compressed_page_image in compressed_pages]
+    )
+
+    if not disable_source_text:
+        for i, page in enumerate(out_pdf):
+            add_text_to_pdf_page(page, i, extracted_text)
+
+    out_buffer = io.BytesIO()
+    out_pdf.save(out_buffer)
+    out_buffer.seek(0)
+    return out_buffer.read()
+
+
+def _compress_pdf_pages(
+    pdf_data: bytes,
+    image_quality: int,
+) -> Optional[List[Tuple[bytes, int, int]]]:
+    """
+    Compresses PDF pages, lowering the quality until the result is small enough.
+
+    :param pdf_data: The input PDF as bytes.
+    :param image_quality: Initial compression quality.
+    :return: List of (image bytes, width, height) tuples, one per page, or None if
+        no quality down to MIN_QUALITY yields a small enough result.
+    """
+    original_size = len(pdf_data)
+    image_quality_loop = image_quality
+
+    while image_quality_loop >= MIN_QUALITY:
+        compressed_pages = _compress_pages_with_quality(pdf_data, image_quality_loop)
+        # Only the encoded image counts towards the size: len() of a page entry
+        # would be the length of the tuple, not the number of bytes.
+        total_compressed_size = sum(len(image) for image, _, _ in compressed_pages)
+
+        if _is_compression_successful(
+            total_compressed_size, original_size, image_quality
+        ):
+            return compressed_pages
+
+        image_quality_loop -= round(lerp(1, 10, image_quality_loop / 100))
+
+    return None
+
+
+def add_text_to_pdf_page(  # type: ignore
+    page: pdfium.PdfPage,
+    page_id: int,
+    extracted_text: Optional[List[List[PDFCharData]]],
+) -> None:
+    """
+    Adds text to a PDF page based on the extracted text data.
+
+    :param page: The PdfPage object to write the text onto.
+    :param page_id: The ID of the page.
+    :param extracted_text: Per-page lists of PDFCharData objects containing text and positioning information.
+    """
+    if not extracted_text or not extracted_text[page_id]:
+        return
+
+    height = page.get_height()
+    pdfium_lock = RLock()
+
+    with pdfium_lock:
+        for char_data in extracted_text[page_id]:
+            font_name = c_char_p(char_data.font_name.encode("utf-8"))
+            text_handler = pdfium_c.FPDFPageObj_NewTextObj(
+                page.pdf.raw, font_name, char_data.font_size
+            )
+            # FPDFText_SetText reads a null-terminated UTF-16LE string: encode the
+            # character explicitly so that the terminator is always present, and
+            # so that characters outside the BMP become a surrogate pair instead
+            # of being truncated to 16 bits.
+            encoded_char = (char_data.char + "\x00").encode("utf-16-le")
+            char_ptr = cast(encoded_char, POINTER(c_ushort))
+            pdfium_c.FPDFText_SetText(text_handler, char_ptr)
+            pdfium_c.FPDFPageObj_Transform(
+                text_handler, 1, 0, 0, 1, char_data.left, height - char_data.top
+            )
+            pdfium_c.FPDFPage_InsertObject(page.raw, text_handler)
+        pdfium_c.FPDFPage_GenerateContent(page.raw)
+
+
+def _compress_pages_with_quality(
+    pdf_data: bytes,
+    image_quality: int,
+) -> List[Tuple[bytes, int, int]]:
+    """
+    Compresses pages with a specific quality.
+
+    :param pdf_data: The input PDF as bytes.
+    :param image_quality: Compression quality.
+    :return: List of (image bytes, width, height) tuples, one per page.
+    """
+    pdf_document = pdfium.PdfDocument(pdf_data)
+    compressed_pages = []
+    for page in pdf_document:
+        rasterized_page = _rasterize_page(page, image_quality)
+        compressed_image = compress_image(rasterized_page, image_quality)
+        with Image.open(io.BytesIO(compressed_image)) as image:
+            width, height = image.size
+        compressed_pages.append((compressed_image, width, height))
+
+    return compressed_pages
+
+
+def _is_compression_successful(
+    total_compressed_size: int, original_size: int, image_quality: int
+) -> bool:
+    """
+    Checks if the compression was successful based on the compressed size and original size.
+
+    :param total_compressed_size: Total size of compressed pages.
+    :param original_size: Original PDF size.
+    :param image_quality: Compression quality.
+    :return: True if compression was successful, false otherwise.
+ """ + overhead = lerp(0.54, 0.18, image_quality / 100) + return total_compressed_size + total_compressed_size * overhead < original_size + + +def _rasterize_page( # type: ignore + page: pdfium.PdfPage, + quality: int = 85, +) -> bytes: + """ + Rasterizes a PDF page. + + :param page: PdfPage object to rasterize. + :param quality: Quality to apply during rasterization. + :return: Rasterized page as bytes. + """ + image = page.render().to_pil() + buffer = io.BytesIO() + image.save(buffer, format="JPEG", quality=quality) + return buffer.getvalue() + + +def _collect_images_as_pdf(image_list: List[bytes]) -> pdfium.PdfDocument: # type: ignore + """ + Converts a list of JPEG images into pages in a PdfDocument. + + :param image_list: A list of bytes representing JPEG images. + :return: A PdfDocument handle containing the images as pages. + """ + out_pdf = pdfium.PdfDocument.new() + + for image_bytes in image_list: + pdf_image = pdfium.PdfImage.new(out_pdf) + pdf_image.load_jpeg(io.BytesIO(image_bytes)) + + width, height = pdf_image.get_size() + page = out_pdf.new_page(width, height) + page.insert_obj(pdf_image) + page.gen_content() + + return out_pdf diff --git a/mindee/pdf/pdf_utils.py b/mindee/pdf/pdf_utils.py new file mode 100644 index 00000000..70b7d984 --- /dev/null +++ b/mindee/pdf/pdf_utils.py @@ -0,0 +1,277 @@ +import ctypes +from ctypes import byref, c_double, c_int, create_string_buffer +from threading import RLock +from typing import List, Tuple + +import pypdfium2 as pdfium +import pypdfium2.raw as pdfium_c + +from mindee.pdf.pdf_char_data import PDFCharData + +FALLBACK_FONT = "Helvetica" + + +def has_source_text(pdf_bytes: bytes) -> bool: + """ + Checks if the provided PDF bytes contain source text. 
+ + :param pdf_bytes: Raw bytes representation of a PDF file + :return: + """ + pdf = pdfium.PdfDocument(pdf_bytes) + for page in pdf: + if len(page.get_textpage().get_text_bounded().strip()) > 0: + return True + return False + + +def extract_text_from_pdf(pdf_bytes: bytes) -> List[List[PDFCharData]]: + """ + Extracts the raw text from a given PDF's bytes along with font data. + + :param pdf_bytes: Raw bytes representation of a PDF file. + :return: A list of info regarding each read character. + """ + pdfium_lock = RLock() + pdf = pdfium.PdfDocument(pdf_bytes) + char_data_list: List[List[PDFCharData]] = [] + + for i, page in enumerate(pdf): + char_data_list.append(_process_page(page, i, pdfium_lock)) + + return char_data_list + + +def _process_page(page, page_id: int, pdfium_lock: RLock) -> List[PDFCharData]: + """ + Processes a single page of the PDF. + + :param page: The PDF page to process. + :param page_id: ID of the page. + :param pdfium_lock: Lock for thread-safe operations. + """ + char_data_list: List[PDFCharData] = [] + internal_height = page.get_height() + internal_width = page.get_width() + + with pdfium_lock: + text_handler = pdfium_c.FPDFText_LoadPage(page.raw) + count_chars = pdfium_c.FPDFText_CountChars(text_handler) + + for i in range(count_chars): + concatenated_chars = _process_char( + i, text_handler, page, pdfium_lock, internal_height, internal_width, page_id + ) + for concatenated_char in concatenated_chars: + char_data_list.append(concatenated_char) + + with pdfium_lock: + pdfium_c.FPDFText_ClosePage(text_handler) + return char_data_list + + +def _process_char( + i: int, + text_handler, + page, + pdfium_lock: RLock, + internal_height: float, + internal_width: float, + page_id: int, +) -> List[PDFCharData]: + """ + Processes a single character from the PDF. + + :param i: The index of the character. + :param text_handler: The text handler for the current page. + :param page: The current page being processed. 
+    :param pdfium_lock: Lock for thread-safe operations.
+    :param internal_height: The height of the page.
+    :param internal_width: The width of the page.
+    :param page_id: ID of the page the character was found on.
+    :return: List of character data for a page.
+    """
+    char_info = _get_char_info(i, text_handler, pdfium_lock)
+    if not char_info:
+        return []
+    char_box = _get_char_box(i, text_handler, pdfium_lock)
+    rotation = _get_page_rotation(page, pdfium_lock)
+
+    adjusted_box = _adjust_char_box(char_box, rotation, internal_height, internal_width)
+    char_data_list: List[PDFCharData] = []
+    for c in char_info["char"] or " ":
+        if c in (
+            "\n",
+            "\r",
+        ):  # Removes duplicated carriage returns in the PDF due to weird extraction.
+            # A line break is only kept when the next character is not a line break itself.
+            next_char_info = _get_char_info(i + 1, text_handler, pdfium_lock)
+            if not next_char_info or next_char_info["char"] in ("\n", "\r"):
+                continue
+
+        char_data_list.append(
+            PDFCharData(
+                char=c,
+                left=int(adjusted_box[0]),
+                right=int(adjusted_box[1]),
+                top=int(adjusted_box[2]),
+                bottom=int(adjusted_box[3]),
+                font_name=char_info["font_name"],
+                font_size=char_info["font_size"],
+                font_weight=char_info["font_weight"],
+                font_stroke_color=char_info["font_stroke_color"],
+                font_fill_color=char_info["font_fill_color"],
+                font_flags=char_info["font_flags"],
+                page_id=page_id,
+            )
+        )
+    return char_data_list
+
+
+def _get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict:
+    """
+    Retrieves information about a specific character.
+
+    :param i: The index of the character.
+    :param text_handler: The text handler for the current page.
+    :param pdfium_lock: Lock for thread-safe operations.
+    :return: A dictionary of character information, with colors as tuples of plain integers.
+    """
+    stroke = (ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint())
+    fill = (ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint())
+
+    with pdfium_lock:
+        unicode_char = pdfium_c.FPDFText_GetUnicode(text_handler, i)
+        if unicode_char == 0xFF:
+            return {}
+        char = chr(unicode_char)
+        font_name = _get_font_name(text_handler, i)
+        font_flags = _get_font_flags(text_handler, i)
+        font_size = pdfium_c.FPDFText_GetFontSize(text_handler, i)
+        font_weight = pdfium_c.FPDFText_GetFontWeight(text_handler, i)
+        _ = pdfium_c.FPDFText_GetStrokeColor(
+            text_handler, i, stroke[0], stroke[1], stroke[2], stroke[3]
+        )
+        _ = pdfium_c.FPDFText_GetFillColor(
+            text_handler, i, fill[0], fill[1], fill[2], fill[3]
+        )
+
+    return {
+        "char": char,
+        "font_name": font_name,
+        "font_flags": font_flags,
+        "font_size": font_size,
+        "font_weight": font_weight,
+        "font_stroke_color": tuple(channel.value for channel in stroke),
+        "font_fill_color": tuple(channel.value for channel in fill),
+    }
+
+
+def _get_font_name(text_handler, i: int) -> str:
+    """
+    Retrieves the font name for a specific character.
+
+    :param text_handler: The text handler for the current page.
+    :param i: The index of the character.
+    :return: The font name, or FALLBACK_FONT if it is missing or does not fit the buffer.
+    """
+    buffer_length = 128
+    font_name_buffer = create_string_buffer(buffer_length)
+    flags = c_int(0)
+    actual_length = pdfium_c.FPDFText_GetFontInfo(
+        text_handler, i, font_name_buffer, buffer_length, byref(flags)
+    )
+    if 0 < actual_length <= buffer_length:
+        return font_name_buffer.value.decode("utf-8")
+    return FALLBACK_FONT
+
+
+def _get_font_flags(text_handler, i: int) -> int:
+    """
+    Retrieves the font flags for a specific character.
+
+    :param text_handler: The text handler for the current page.
+    :param i: The index of the character.
+    :return: The font flags as an integer.
+ """ + flags = c_int(0) + pdfium_c.FPDFText_GetFontInfo(text_handler, i, None, 0, byref(flags)) + return flags.value + + +def _get_char_box( + i: int, text_handler, pdfium_lock: RLock +) -> Tuple[float, float, float, float]: + """ + Retrieves the bounding box for a specific character. + + :param i: The index of the character. + :param text_handler: The text handler for the current page. + :param pdfium_lock: Lock for thread-safe operations. + :return: A tuple containing left, right, bottom, and top coordinates. + """ + left, right, bottom, top = (c_double(0), c_double(0), c_double(0), c_double(0)) + with pdfium_lock: + pdfium_c.FPDFText_GetCharBox( + text_handler, i, byref(left), byref(right), byref(bottom), byref(top) + ) + return left.value, right.value, bottom.value, top.value + + +def _get_page_rotation(page, pdfium_lock: RLock) -> int: + """ + Retrieves the rotation value for a specific page. + + :param page: The page to get the rotation for. + :param pdfium_lock: Lock for thread-safe operations. + :return: The rotation value in degrees. + """ + with pdfium_lock: + return {0: 0, 1: 90, 2: 180, 3: 270}.get( + pdfium_c.FPDFPage_GetRotation(page.raw), 0 + ) + + +def _adjust_char_box( + char_box: Tuple[float, float, float, float], + rotation: int, + internal_height: float, + internal_width: float, +) -> Tuple[float, float, float, float]: + """ + Adjusts the character bounding box based on page rotation. + + :param char_box: The original character bounding box. + :param rotation: The page rotation in degrees. + :param internal_height: The height of the page. + :param internal_width: The width of the page. + :return: The adjusted character bounding box. 
+ """ + left, right, bottom, top = char_box + if rotation == 0: + top, bottom = internal_height - top, internal_height - bottom + elif rotation == 90: + left, right, top, bottom = bottom, top, left, right + elif rotation == 180: + left, right = internal_width - right, internal_width - left + top, bottom = bottom, top + elif rotation == 270: + left, right, top, bottom = ( + internal_width - top, + internal_width - bottom, + internal_height - right, + internal_height - left, + ) + return left, right, top, bottom + + +def lerp(start: float, end: float, t: float) -> float: + """ + Performs linear interpolation between two numbers. + + :param start: The starting value. + :param end: The ending value. + :param t: The interpolation factor (0 to 1). + :return: The interpolated value. + """ + return start * (1 - t) + end * t diff --git a/tests/api/test_async_response.py b/tests/api/test_async_response.py index 31319095..e8163d0c 100644 --- a/tests/api/test_async_response.py +++ b/tests/api/test_async_response.py @@ -5,7 +5,7 @@ import requests from mindee.client import Client -from mindee.input.sources import PathInput +from mindee.input.sources.path_input import PathInput from mindee.mindee_http.response_validation import is_valid_async_response from mindee.parsing.common.api_request import RequestStatus from mindee.parsing.common.async_predict_response import AsyncPredictResponse diff --git a/tests/extraction/test_image_extractor.py b/tests/extraction/test_image_extractor.py index f41dc4a4..7f6d5db2 100644 --- a/tests/extraction/test_image_extractor.py +++ b/tests/extraction/test_image_extractor.py @@ -4,7 +4,7 @@ from PIL import Image from mindee.extraction.common.image_extractor import extract_multiple_images_from_source -from mindee.input.sources import PathInput +from mindee.input.sources.path_input import PathInput from mindee.product.barcode_reader.barcode_reader_v1 import BarcodeReaderV1 from tests.test_inputs import PRODUCT_DATA_DIR diff --git 
a/tests/extraction/test_invoice_splitter_auto_extraction.py b/tests/extraction/test_invoice_splitter_auto_extraction.py index 716628e7..3abc2d2a 100644 --- a/tests/extraction/test_invoice_splitter_auto_extraction.py +++ b/tests/extraction/test_invoice_splitter_auto_extraction.py @@ -4,7 +4,7 @@ from mindee import Client from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor -from mindee.input.sources import PathInput +from mindee.input.sources.path_input import PathInput from mindee.parsing.common.document import Document from mindee.product.invoice.invoice_v4 import InvoiceV4 from mindee.product.invoice_splitter.invoice_splitter_v1 import InvoiceSplitterV1 diff --git a/tests/extraction/test_multi_receipts_extractor.py b/tests/extraction/test_multi_receipts_extractor.py index 0f71d1fe..00e22f12 100644 --- a/tests/extraction/test_multi_receipts_extractor.py +++ b/tests/extraction/test_multi_receipts_extractor.py @@ -6,7 +6,7 @@ from mindee.extraction.multi_receipts_extractor.multi_receipts_extractor import ( extract_receipts, ) -from mindee.input.sources import PathInput +from mindee.input.sources.path_input import PathInput from mindee.product.multi_receipts_detector.multi_receipts_detector_v1 import ( MultiReceiptsDetectorV1, ) diff --git a/tests/extraction/test_pdf_extractor.py b/tests/extraction/test_pdf_extractor.py index e323cd2c..a236d9c2 100644 --- a/tests/extraction/test_pdf_extractor.py +++ b/tests/extraction/test_pdf_extractor.py @@ -3,7 +3,7 @@ from mindee import Client from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor from mindee.input.local_response import LocalResponse -from mindee.input.sources import PathInput +from mindee.input.sources.path_input import PathInput from mindee.product.invoice_splitter.invoice_splitter_v1 import InvoiceSplitterV1 from mindee.product.invoice_splitter.invoice_splitter_v1_document import ( InvoiceSplitterV1Document, diff --git a/tests/input/test_compression.py 
import operator
import os
from functools import reduce
from pathlib import Path

import pytest
from PIL import Image

from mindee.image_operations.image_compressor import compress_image
from mindee.input.sources.path_input import PathInput
from mindee.pdf.pdf_compressor import compress_pdf
from mindee.pdf.pdf_utils import extract_text_from_pdf

DATA_DIR = Path("./tests/data")
OUTPUT_DIR = DATA_DIR / "output"


def test_image_quality_compress_from_input_source():
    """Compressing an image via its input source produces a smaller file."""
    receipt_input = PathInput(DATA_DIR / "file_types/receipt.jpg")
    receipt_input.compress(40)

    # Rewind first: an empty read would make the size assertion pass vacuously.
    receipt_input.file_object.seek(0)
    with open(OUTPUT_DIR / "compress_indirect.jpg", "wb") as f:
        f.write(receipt_input.file_object.read())
    receipt_input.file_object.seek(0)

    initial_file_stats = os.stat(DATA_DIR / "file_types/receipt.jpg")
    rendered_file_stats = os.stat(OUTPUT_DIR / "compress_indirect.jpg")
    assert rendered_file_stats.st_size < initial_file_stats.st_size


def test_image_quality_compresses_from_compressor():
    """Output size follows the requested quality when calling the compressor."""
    receipt_input = PathInput(DATA_DIR / "file_types/receipt.jpg")
    compresses = [
        compress_image(receipt_input.file_object, 100),
        compress_image(receipt_input.file_object),
        compress_image(receipt_input.file_object, 50),
        compress_image(receipt_input.file_object, 10),
        compress_image(receipt_input.file_object, 1),
    ]

    file_names = [
        "compress100.jpg",
        "compress75.jpg",
        "compress50.jpg",
        "compress10.jpg",
        "compress1.jpg",
    ]
    for file_name, compressed in zip(file_names, compresses):
        with open(OUTPUT_DIR / file_name, "wb") as f:
            f.write(compressed)

    initial_file_stats = os.stat(DATA_DIR / "file_types/receipt.jpg")
    rendered_file_stats = [os.stat(OUTPUT_DIR / file_name) for file_name in file_names]

    # The two highest-quality outputs are expected to exceed the source size.
    assert initial_file_stats.st_size < rendered_file_stats[0].st_size
    assert initial_file_stats.st_size < rendered_file_stats[1].st_size
    assert rendered_file_stats[1].st_size > rendered_file_stats[2].st_size
    assert rendered_file_stats[2].st_size > rendered_file_stats[3].st_size
    assert rendered_file_stats[3].st_size > rendered_file_stats[4].st_size


def test_image_resize_from_input_source():
    """Resizing via the input source shrinks the file and keeps the aspect ratio."""
    image_resize_input = PathInput(DATA_DIR / "file_types/receipt.jpg")

    image_resize_input.compress(75, 250, 1000)
    # Rewind first: an empty read would make the size assertion pass vacuously.
    image_resize_input.file_object.seek(0)
    with open(OUTPUT_DIR / "resize_indirect.jpg", "wb") as f:
        f.write(image_resize_input.file_object.read())
    image_resize_input.file_object.seek(0)

    initial_file_stats = os.stat(DATA_DIR / "file_types/receipt.jpg")
    rendered_file_stats = os.stat(OUTPUT_DIR / "resize_indirect.jpg")
    assert rendered_file_stats.st_size < initial_file_stats.st_size

    image = Image.open(image_resize_input.file_object)
    assert image.width == 250
    assert image.height == 333


def test_image_resize_from_compressor():
    """Output size follows the requested dimensions when calling the compressor."""
    image_resize_input = PathInput(DATA_DIR / "file_types/receipt.jpg")

    resizes = [
        compress_image(image_resize_input.file_object, 75, 500),
        compress_image(image_resize_input.file_object, 75, 250, 500),
        compress_image(image_resize_input.file_object, 75, 500, 250),
        compress_image(image_resize_input.file_object, 75, None, 250),
    ]

    file_names = [
        "resize500xnull.jpg",
        "resize250x500.jpg",
        "resize500x250.jpg",
        "resizenullx250.jpg",
    ]
    for file_name, resized in zip(file_names, resizes):
        with open(OUTPUT_DIR / file_name, "wb") as f:
            f.write(resized)

    initial_file_stats = os.stat(DATA_DIR / "file_types/receipt.jpg")
    rendered_file_stats = [os.stat(OUTPUT_DIR / file_name) for file_name in file_names]

    assert initial_file_stats.st_size > rendered_file_stats[0].st_size
    assert rendered_file_stats[0].st_size > rendered_file_stats[1].st_size
    assert rendered_file_stats[1].st_size > rendered_file_stats[2].st_size
    assert rendered_file_stats[2].st_size == rendered_file_stats[3].st_size


def test_pdf_input_has_text():
    """Only the PDF that contains source text reports ``has_source_text()``."""
    has_source_text_path = DATA_DIR / "file_types/pdf/multipage.pdf"
    has_no_source_text_path = DATA_DIR / "file_types/pdf/blank_1.pdf"
    has_no_source_text_since_its_image_path = DATA_DIR / "file_types/receipt.jpg"

    has_source_text_input = PathInput(has_source_text_path)
    has_no_source_text_input = PathInput(has_no_source_text_path)
    has_no_source_text_since_its_image_input = PathInput(
        has_no_source_text_since_its_image_path
    )

    assert has_source_text_input.has_source_text()
    assert not has_no_source_text_input.has_source_text()
    assert not has_no_source_text_since_its_image_input.has_source_text()


def test_pdf_compress_from_input_source():
    """Compressing a PDF with forced text compression produces a smaller file."""
    pdf_resize_input = PathInput(
        DATA_DIR / "products/invoice_splitter/default_sample.pdf"
    )

    compressed_pdf = compress_pdf(pdf_resize_input.file_object, 75, True)
    with open(OUTPUT_DIR / "resize_indirect.pdf", "wb") as f:
        f.write(compressed_pdf)

    initial_file_stats = os.stat(
        DATA_DIR / "products/invoice_splitter/default_sample.pdf"
    )
    rendered_file_stats = os.stat(OUTPUT_DIR / "resize_indirect.pdf")

    assert rendered_file_stats.st_size < initial_file_stats.st_size


def test_pdf_compress_from_compressor():
    """PDF output size decreases as the requested quality decreases."""
    pdf_resize_input = PathInput(
        DATA_DIR / "products/invoice_splitter/default_sample.pdf"
    )
    resizes = []
    qualities = [85, 75, 50, 10]
    for quality in qualities:
        resizes.append(compress_pdf(pdf_resize_input.file_object, quality))
        # Rewind so the next pass reads the document from its beginning.
        pdf_resize_input.file_object.seek(0)

    file_names = [
        "compress85.pdf",
        "compress75.pdf",
        "compress50.pdf",
        "compress10.pdf",
    ]
    for file_name, resized in zip(file_names, resizes):
        with open(OUTPUT_DIR / file_name, "wb") as f:
            f.write(resized)

    initial_file_stats = os.stat(
        DATA_DIR / "products/invoice_splitter/default_sample.pdf"
    )
    rendered_file_stats = [os.stat(OUTPUT_DIR / file_name) for file_name in file_names]

    assert initial_file_stats.st_size > rendered_file_stats[0].st_size
    assert rendered_file_stats[0].st_size > rendered_file_stats[1].st_size
    assert rendered_file_stats[1].st_size > rendered_file_stats[2].st_size
    assert rendered_file_stats[2].st_size > rendered_file_stats[3].st_size


def test_pdf_compress_with_text_keeps_text():
    """The extracted text is identical before and after compression."""
    initial_with_text = PathInput(DATA_DIR / "file_types/pdf/multipage.pdf")

    compressed_with_text = compress_pdf(initial_with_text.file_object, 100, True, False)

    # Do not rely on compress_pdf leaving the stream positioned at the start.
    initial_with_text.file_object.seek(0)
    original_text = "".join(
        "".join(text_info.char for text_info in page_text)
        for page_text in extract_text_from_pdf(initial_with_text.file_object.read())
    )
    initial_with_text.file_object.seek(0)
    compressed_text = "".join(
        [
            text_info.char
            for text_info in reduce(
                operator.concat, extract_text_from_pdf(compressed_with_text)
            )
        ]
    )

    assert compressed_text == original_text


def test_pdf_compress_with_text_does_not_compress():
    """A PDF with source text is returned unchanged without forced compression."""
    initial_with_text = PathInput(DATA_DIR / "file_types/pdf/multipage.pdf")

    compressed_with_text = compress_pdf(initial_with_text.file_object, 50)

    # Compare against the full original content, whatever the stream position is.
    initial_with_text.file_object.seek(0)
    assert compressed_with_text == initial_with_text.file_object.read()


@pytest.fixture(scope="module", autouse=True)
def cleanup():
    """Ensure the output directory exists, then remove generated files afterwards."""
    # The tests open files in OUTPUT_DIR for writing, which fails if it is missing.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    yield
    created_files = [
        "compress10.pdf",
        "compress50.pdf",
        "compress75.pdf",
        "compress85.pdf",
        "resize_indirect.pdf",
        "compress1.jpg",
        "compress10.jpg",
        "compress50.jpg",
        "compress75.jpg",
        "compress100.jpg",
        "compress_indirect.jpg",
        "resize250x500.jpg",
        "resize500x250.jpg",
        "resize500xnull.jpg",
        "resize_indirect.jpg",
        "resizenullx250.jpg",
    ]

    for file_name in created_files:
        full_path = OUTPUT_DIR / file_name
        if full_path.exists():
            os.remove(full_path)
PathInput +from mindee.input.sources.path_input import PathInput from tests.test_inputs import FILE_TYPES_DIR from tests.utils import clear_envvars, dummy_envvars diff --git a/tests/test_client.py b/tests/test_client.py index 574ad51b..599e244c 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -6,7 +6,7 @@ from mindee.error.mindee_error import MindeeClientError, MindeeError from mindee.error.mindee_http_error import MindeeHTTPError from mindee.input.local_response import LocalResponse -from mindee.input.sources import LocalInputSource +from mindee.input.sources.local_input_source import LocalInputSource from mindee.product.international_id.international_id_v2 import InternationalIdV2 from mindee.product.invoice.invoice_v4 import InvoiceV4 from mindee.product.invoice_splitter.invoice_splitter_v1 import InvoiceSplitterV1 diff --git a/tests/test_inputs.py b/tests/test_inputs.py index 1c67e8ef..9eaa84c9 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -7,13 +7,11 @@ from mindee.error.mimetype_error import MimeTypeError from mindee.error.mindee_error import MindeeError, MindeeSourceError from mindee.input.page_options import KEEP_ONLY, REMOVE -from mindee.input.sources import ( - Base64Input, - BytesInput, - FileInput, - PathInput, - UrlInputSource, -) +from mindee.input.sources.base_64_input import Base64Input +from mindee.input.sources.bytes_input import BytesInput +from mindee.input.sources.file_input import FileInput +from mindee.input.sources.path_input import PathInput +from mindee.input.sources.url_input_source import UrlInputSource from tests.product import PRODUCT_DATA_DIR FILE_TYPES_DIR = Path("./tests/data/file_types")