temp save, completely untested code

sebastianMindee · sebastianMindee · commit 56ca78334d60 · 2025-01-15T14:49:41.000+01:00
diff --git a/mindee/error/__init__.py b/mindee/error/__init__.py
@@ -7,3 +7,5 @@
     MindeeHTTPServerError,
     handle_error,
 )
+from mindee.error.mindee_image_error import MindeeImageError
+from mindee.error.mindee_pdf_error import MindeePDFError
diff --git a/mindee/error/mindee_image_error.py b/mindee/error/mindee_image_error.py
@@ -0,0 +1,2 @@
+class MindeeImageError(RuntimeError):
+    """Error type for failures that occur during image operations."""
diff --git a/mindee/error/mindee_pdf_error.py b/mindee/error/mindee_pdf_error.py
@@ -0,0 +1,2 @@
+class MindeePDFError(RuntimeError):
+    """Error type for failures that occur during PDF operations."""
diff --git a/mindee/extraction/__init__.py b/mindee/extraction/__init__.py
@@ -1,6 +1,6 @@
 from mindee.extraction.common.extracted_image import ExtractedImage
 from mindee.extraction.common.image_extractor import (
-    attach_image_as_new_file,
+    attach_images_as_new_file,
     extract_multiple_images_from_source,
 )
 from mindee.extraction.multi_receipts_extractor import multi_receipts_extractor
diff --git a/mindee/extraction/common/__init__.py b/mindee/extraction/common/__init__.py
@@ -1,5 +1,5 @@
 from mindee.extraction.common.extracted_image import ExtractedImage
 from mindee.extraction.common.image_extractor import (
-    attach_image_as_new_file,
+    attach_images_as_new_file,
     extract_multiple_images_from_source,
 )
diff --git a/mindee/extraction/common/image_extractor.py b/mindee/extraction/common/image_extractor.py
@@ -11,35 +11,34 @@
 from mindee.input.sources import BytesInput, LocalInputSource
 
 
-def attach_image_as_new_file(  # type: ignore
-    input_buffer: BinaryIO,
+def attach_images_as_new_file(  # type: ignore
+    input_buffer_list: List[BinaryIO],
 ) -> pdfium.PdfDocument:
     """
-    Attaches an image as a new page in a PdfDocument object.
+    Attaches a list of images as new pages in a PdfDocument object.
+
+    Each image is re-encoded as an RGB JPEG and inserted on its own page, in the
+    order given; the page takes the dimensions reported for the inserted image.
 
-    :param input_buffer: Input buffer.
+    :param input_buffer_list: List of images, represented as buffers.
     :return: A PdfDocument handle.
     """
-    # Create a new page in the PdfDocument
-    input_buffer.seek(0)
-    image = Image.open(input_buffer)
-    image.convert("RGB")
-    image_buffer = io.BytesIO()
-    image.save(image_buffer, format="JPEG")
-
     pdf = pdfium.PdfDocument.new()
-
-    image_pdf = pdfium.PdfImage.new(pdf)
-    image_pdf.load_jpeg(image_buffer)
-    width, height = image_pdf.get_size()
-
-    matrix = pdfium.PdfMatrix().scale(width, height)
-    image_pdf.set_matrix(matrix)
-
-    page = pdf.new_page(width, height)
-    page.insert_obj(image_pdf)
-    page.gen_content()
-    image.close()
+    for input_buffer in input_buffer_list:
+        input_buffer.seek(0)
+        # The context manager closes the source image even if encoding fails.
+        with Image.open(input_buffer) as image:
+            image_buffer = io.BytesIO()
+            # convert() returns a new image rather than converting in place, so
+            # the converted copy is the one that must be saved as JPEG.
+            image.convert("RGB").save(image_buffer, format="JPEG")
+
+        image_pdf = pdfium.PdfImage.new(pdf)
+        image_pdf.load_jpeg(image_buffer)
+        width, height = image_pdf.get_size()
+
+        # Scale the image object so that it fills the whole page.
+        matrix = pdfium.PdfMatrix().scale(width, height)
+        image_pdf.set_matrix(matrix)
+
+        page = pdf.new_page(width, height)
+        page.insert_obj(image_pdf)
+        page.gen_content()
     return pdf
 
 
@@ -160,4 +159,4 @@ def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument:  # type: i
         input_file.file_object.seek(0)
         return pdfium.PdfDocument(input_file.file_object)
 
-    return attach_image_as_new_file(input_file.file_object)
+    return attach_images_as_new_file([input_file.file_object])
diff --git a/mindee/image_operations/__init__.py b/mindee/image_operations/__init__.py
diff --git a/mindee/image_operations/image_compressor.py b/mindee/image_operations/image_compressor.py
@@ -0,0 +1,33 @@
+import io
+from typing import Union
+
+from PIL import Image
+
+
+def compress_image(
+    image_buffer: bytes,
+    quality: int = 85,
+    max_width: Union[int, float, None] = None,
+    max_height: Union[int, float, None] = None,
+) -> bytes:
+    """
+    Compresses an image with the given parameters.
+
+    The image is re-encoded as an optimized JPEG. When at least one bound is
+    given, the image is first shrunk in place with ``Image.thumbnail``; a bound
+    left unset (or equal to 0) falls back to the image's own size on that axis.
+
+    :param image_buffer: Buffer representation of an image.
+    :param quality: Quality to apply to the image (JPEG compression).
+    :param max_width: Maximum bound for the width.
+    :param max_height: Maximum bound for the height.
+    :return: The compressed image, as JPEG-encoded bytes.
+    """
+    with Image.open(io.BytesIO(image_buffer)) as img:
+        # Only resize when the caller actually asked for a bound.
+        if max_width or max_height:
+            original_width, original_height = img.size
+            bounds = (
+                int(max_width or original_width),
+                int(max_height or original_height),
+            )
+            img.thumbnail(bounds, Image.Resampling.LANCZOS)
+
+        # JPEG cannot store alpha or palette data: saving e.g. an RGBA or P
+        # image directly raises, so such modes are converted to RGB first.
+        if img.mode in ("RGB", "L", "CMYK"):
+            jpeg_ready = img
+        else:
+            jpeg_ready = img.convert("RGB")
+
+        output_buffer = io.BytesIO()
+        jpeg_ready.save(output_buffer, format="JPEG", quality=quality, optimize=True)
+
+    return output_buffer.getvalue()
diff --git a/mindee/pdf/__init__.py b/mindee/pdf/__init__.py
diff --git a/mindee/pdf/pdf_char_data.py b/mindee/pdf/pdf_char_data.py
@@ -0,0 +1,30 @@
+from dataclasses import dataclass
+from typing import Tuple
+
+
+@dataclass
+class PDFCharData:
+    """Record describing one character: its text, bounds, font and colors."""
+
+    #: The character.
+    char: str
+    #: Left bound.
+    left: int
+    #: Right bound.
+    right: int
+    #: Top bound.
+    top: int
+    #: Bottom bound.
+    bottom: int
+    #: The font name.
+    font_name: str
+    #: The font size in pt.
+    font_size: int
+    #: The font weight.
+    font_weight: int
+    #: The font flags.
+    font_flags: int
+    #: RGBA representation of the font's stroke color.
+    font_stroke_color: Tuple[int, int, int, int]
+    #: RGBA representation of the font's fill color.
+    font_fill_color: Tuple[int, int, int, int]
diff --git a/mindee/pdf/pdf_compressor.py b/mindee/pdf/pdf_compressor.py
@@ -0,0 +1,215 @@
+import logging
+from io import BytesIO
+from threading import RLock
+from typing import List, Optional
+
+import pypdfium2 as pdfium
+import pypdfium2.raw as pdfium_c
+
+from mindee.extraction import attach_images_as_new_file
+from mindee.image_operations.image_compressor import compress_image
+from mindee.pdf.pdf_char_data import PDFCharData
+from mindee.pdf.pdf_utils import extract_text_from_pdf, has_source_text
+
+logger = logging.getLogger(__name__)
+MIN_QUALITY = 1
+
+
+def compress_pdf(
+    pdf_data: bytes,
+    image_quality: int = 85,
+    force_source_text_compression: bool = False,
+    disable_source_text: bool = True,
+) -> bytes:
+    """
+    Compresses each page of a provided PDF buffer.
+
+    The compressed pages are assembled into a new PDF made of one image per page.
+    The input is returned unchanged when it contains source text and
+    ``force_source_text_compression`` is not set, or when no attempted quality
+    produces a result considered smaller than the original.
+
+    :param pdf_data: The input PDF as bytes.
+    :param image_quality: Compression quality (70-100 for most JPG images).
+    :param force_source_text_compression: If true, attempts to re-write detected text.
+    :param disable_source_text: If true, doesn't re-apply source text to the output PDF.
+    :return: Compressed PDF as bytes.
+    """
+    if has_source_text(pdf_data):
+        if not force_source_text_compression:
+            logger.warning(
+                "Found text inside of the provided PDF file. Compression operation "
+                "aborted since force_source_text_compression is set to false."
+            )
+            return pdf_data
+        if disable_source_text:
+            logger.warning(
+                "Source file contains text, but disable_source_text flag "
+                "is set to true. Resulting file will not contain any embedded text."
+            )
+        else:
+            logger.warning("Re-writing PDF source-text is an EXPERIMENTAL feature.")
+
+    # Text is only extracted when it is meant to be re-applied afterwards.
+    extracted_text = (
+        extract_text_from_pdf(pdf_data) if not disable_source_text else None
+    )
+
+    compressed_pages = compress_pdf_pages(
+        pdf_data, extracted_text, image_quality, disable_source_text
+    )
+
+    if not compressed_pages:
+        logger.warning(
+            "Could not compress PDF to a smaller size. Returning original PDF."
+        )
+        return pdf_data
+
+    out_pdf = attach_images_as_new_file(
+        [BytesIO(compressed_page) for compressed_page in compressed_pages]
+    )
+    out_bytes = BytesIO()
+    out_pdf.save(out_bytes)
+
+    # save() leaves the stream position at the end of the written data, so read()
+    # would return b""; getvalue() returns the whole buffer regardless of position.
+    return out_bytes.getvalue()
+
+
+def compress_pdf_pages(
+    pdf_data: bytes,
+    extracted_text: Optional[List[PDFCharData]],
+    image_quality: int,
+    disable_source_text: bool,
+) -> Optional[List[bytes]]:
+    """
+    Searches for a quality at which the compressed pages beat the original size.
+
+    Starting from ``image_quality``, pages are compressed at progressively lower
+    qualities until the result is judged successful or the quality drops below
+    ``MIN_QUALITY``.
+
+    :param pdf_data: The input PDF as bytes.
+    :param extracted_text: Extracted text from the PDF.
+    :param image_quality: Initial compression quality.
+    :param disable_source_text: If true, doesn't re-apply source text to the output PDF.
+    :return: List of compressed page buffers, or None if compression fails.
+    """
+    source_size = len(pdf_data)
+    current_quality = image_quality
+
+    while current_quality >= MIN_QUALITY:
+        candidate_pages = compress_pages_with_quality(
+            pdf_data, extracted_text, current_quality, disable_source_text
+        )
+        candidate_size = sum(map(len, candidate_pages))
+
+        # The success check is given the initially requested quality.
+        if is_compression_successful(candidate_size, source_size, image_quality):
+            return candidate_pages
+
+        # The step shrinks as the quality gets lower (between 1 and 10 for 0-100).
+        current_quality -= round(lerp(1, 10, current_quality / 100))
+
+    return None
+
+
+def add_text_to_pdf_page(  # type: ignore
+    page: pdfium.PdfPage,
+    extracted_text: Optional[List[PDFCharData]],
+) -> None:
+    """
+    Adds text to a PDF page based on the extracted text data.
+
+    One text object is created per character, positioned from the character's
+    left and bottom bounds and filled with its fill color. Nothing is done when
+    ``extracted_text`` is empty or None.
+
+    :param page: The PdfPage object to add text to.
+    :param extracted_text: List of PDFCharData objects containing text and positioning information.
+    """
+    if not extracted_text:
+        return
+
+    height = page.get_height()
+    document = page.pdf
+    # NOTE(review): this lock is created on every call, so it is private to the
+    # call - confirm whether a shared, module-level lock was intended.
+    pdfium_lock = RLock()
+
+    with pdfium_lock:
+        # NOTE(review): FPDFText_LoadPage presumably yields a text-page handle;
+        # confirm that FPDFPage_InsertObject / FPDFPage_GenerateContent below
+        # accept it rather than requiring the page handle itself.
+        text_handler = pdfium_c.FPDFText_LoadPage(page.raw)
+        for char_data in extracted_text:
+            # A font is loaded for every single character.
+            # NOTE(review): confirm load_font / create_text_object exist on the
+            # document object with these signatures.
+            font = document.load_font(
+                char_data.font_name, pdfium_c.FPDF_FONT_TRUETYPE, True
+            )
+            text_object = document.create_text_object(font, char_data.font_size)
+            text_object.set_text(char_data.char)
+            x = char_data.left
+            # The vertical position is the page height minus the bottom bound.
+            y = height - char_data.bottom
+            text_object.set_position(x, y)
+            r, g, b, a = char_data.font_fill_color
+            text_object.set_fill_color(r, g, b, a)
+            pdfium_c.FPDFPage_InsertObject(text_handler, text_object)
+        pdfium_c.FPDFPage_GenerateContent(text_handler)
+
+    with pdfium_lock:
+        pdfium_c.FPDFText_ClosePage(text_handler)
+
+
+def compress_pages_with_quality(
+    pdf_data: bytes,
+    extracted_text: Optional[List[PDFCharData]],
+    image_quality: int,
+    disable_source_text: bool,
+) -> List[bytes]:
+    """
+    Compresses pages with a specific quality.
+
+    Each page of the document is rasterized, then the resulting image is
+    compressed; both steps receive the same ``image_quality``.
+
+    :param pdf_data: The input PDF as bytes.
+    :param extracted_text: Extracted text from the PDF.
+    :param image_quality: Compression quality.
+    :param disable_source_text: If true, doesn't re-apply source text to the output PDF.
+    :return: List of compressed page buffers.
+    """
+    pdf_document = pdfium.PdfDocument(pdf_data)
+    compressed_pages = []
+
+    # Iterate over the pages themselves: enumerate() yields (index, page) tuples,
+    # which cannot be used to index the document.
+    for page in pdf_document:
+        rasterized_page = rasterize_page(page, image_quality)
+        compressed_image = compress_image(rasterized_page, image_quality)
+
+        if not disable_source_text:
+            # NOTE(review): the text is added after the page has been rasterized,
+            # so it is not part of the image returned here - confirm intent.
+            add_text_to_pdf_page(page, extracted_text)
+
+        compressed_pages.append(compressed_image)
+
+    return compressed_pages
+
+
+def is_compression_successful(
+    total_compressed_size: int, original_size: int, image_quality: int
+) -> bool:
+    """
+    Tells whether the compressed pages, plus an estimated overhead, beat the original size.
+
+    The overhead ratio is interpolated from 0.54 (quality 0) to 0.18 (quality 100).
+
+    :param total_compressed_size: Total size of compressed pages.
+    :param original_size: Original PDF size.
+    :param image_quality: Compression quality.
+    :return: True if compression was successful, false otherwise.
+    """
+    overhead_ratio = lerp(0.54, 0.18, image_quality / 100)
+    estimated_size = total_compressed_size + total_compressed_size * overhead_ratio
+    return estimated_size < original_size
+
+
+def rasterize_page(  # type: ignore
+    page: pdfium.PdfPage,
+    quality: int = 85,
+) -> bytes:
+    """
+    Renders a PDF page and encodes the result as a JPEG.
+
+    :param page: PdfPage object to rasterize.
+    :param quality: Quality to apply during rasterization.
+    :return: Rasterized page as bytes.
+    """
+    jpeg_buffer = BytesIO()
+    # Render with the default settings, then hand the bitmap over to PIL for encoding.
+    page.render().to_pil().save(jpeg_buffer, format="JPEG", quality=quality)
+    return jpeg_buffer.getvalue()
+
+
+def lerp(start: float, end: float, t: float) -> float:
+    """
+    Linearly interpolates between ``start`` and ``end``.
+
+    :param start: The value returned for ``t == 0``.
+    :param end: The value returned for ``t == 1``.
+    :param t: The interpolation factor (0 to 1).
+    :return: The interpolated value.
+    """
+    start_weight = 1 - t
+    return start_weight * start + t * end
diff --git a/mindee/pdf/pdf_utils.py b/mindee/pdf/pdf_utils.py

Original file line number	Diff line number	Diff line change
`@@ -7,3 +7,5 @@`
`7`	`7`	`MindeeHTTPServerError,`
`8`	`8`	`handle_error,`
`9`	`9`	`)`
	`10`	`+from mindee.error.mindee_image_error import MindeeImageError`
	`11`	`+from mindee.error.mindee_pdf_error import MindeePDFError`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+class MindeeImageError(RuntimeError):`
	`2`	`+ """An exception relating to errors during image operations."""`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+class MindeePDFError(RuntimeError):`
	`2`	`+ """An exception relating to errors during PDF operations."""`
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`from mindee.extraction.common.extracted_image import ExtractedImage`
`2`	`2`	`from mindee.extraction.common.image_extractor import (`
`3`		`- attach_image_as_new_file,`
	`3`	`+ attach_images_as_new_file,`
`4`	`4`	`extract_multiple_images_from_source,`
`5`	`5`	`)`
`6`	`6`	`from mindee.extraction.multi_receipts_extractor import multi_receipts_extractor`
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`from mindee.extraction.common.extracted_image import ExtractedImage`
`2`	`2`	`from mindee.extraction.common.image_extractor import (`
`3`		`- attach_image_as_new_file,`
	`3`	`+ attach_images_as_new_file,`
`4`	`4`	`extract_multiple_images_from_source,`
`5`	`5`	`)`