From ff2f62c0ce1bcd07bda0e817a39b5e5b915cbe39 Mon Sep 17 00:00:00 2001
From: sebastianMindee <sebastian.oliverasilvera@mindee.co>
Date: Wed, 15 Jan 2025 14:49:41 +0100
Subject: [PATCH 1/7] temp save, completely untested code

---
 mindee/error/__init__.py                    |   2 +
 mindee/error/mindee_image_error.py          |   2 +
 mindee/error/mindee_pdf_error.py            |   2 +
 mindee/extraction/__init__.py               |   2 +-
 mindee/extraction/common/__init__.py        |   2 +-
 mindee/extraction/common/image_extractor.py |  47 ++--
 mindee/image_operations/__init__.py         |   0
 mindee/image_operations/image_compressor.py |  33 +++
 mindee/pdf/__init__.py                      |   0
 mindee/pdf/pdf_char_data.py                 |  30 +++
 mindee/pdf/pdf_compressor.py                | 215 +++++++++++++++++
 mindee/pdf/pdf_utils.py                     | 241 ++++++++++++++++++++
 12 files changed, 550 insertions(+), 26 deletions(-)
 create mode 100644 mindee/error/mindee_image_error.py
 create mode 100644 mindee/error/mindee_pdf_error.py
 create mode 100644 mindee/image_operations/__init__.py
 create mode 100644 mindee/image_operations/image_compressor.py
 create mode 100644 mindee/pdf/__init__.py
 create mode 100644 mindee/pdf/pdf_char_data.py
 create mode 100644 mindee/pdf/pdf_compressor.py
 create mode 100644 mindee/pdf/pdf_utils.py

diff --git a/mindee/error/__init__.py b/mindee/error/__init__.py
index e8075e37..c49c3cf3 100644
--- a/mindee/error/__init__.py
+++ b/mindee/error/__init__.py
@@ -7,3 +7,5 @@
     MindeeHTTPServerError,
     handle_error,
 )
+from mindee.error.mindee_image_error import MindeeImageError
+from mindee.error.mindee_pdf_error import MindeePDFError
diff --git a/mindee/error/mindee_image_error.py b/mindee/error/mindee_image_error.py
new file mode 100644
index 00000000..1da0abec
--- /dev/null
+++ b/mindee/error/mindee_image_error.py
@@ -0,0 +1,2 @@
+class MindeeImageError(RuntimeError):
+    """An exception relating to errors during image operations."""
diff --git a/mindee/error/mindee_pdf_error.py b/mindee/error/mindee_pdf_error.py
new file mode 100644
index 00000000..52f0b32f
--- /dev/null
+++ b/mindee/error/mindee_pdf_error.py
@@ -0,0 +1,2 @@
+class MindeePDFError(RuntimeError):
+    """An exception relating to errors during PDF operations."""
diff --git a/mindee/extraction/__init__.py b/mindee/extraction/__init__.py
index 05629a5d..2e190d7b 100644
--- a/mindee/extraction/__init__.py
+++ b/mindee/extraction/__init__.py
@@ -1,6 +1,6 @@
 from mindee.extraction.common.extracted_image import ExtractedImage
 from mindee.extraction.common.image_extractor import (
-    attach_image_as_new_file,
+    attach_images_as_new_file,
     extract_multiple_images_from_source,
 )
 from mindee.extraction.multi_receipts_extractor import multi_receipts_extractor
diff --git a/mindee/extraction/common/__init__.py b/mindee/extraction/common/__init__.py
index c0301c90..009009d9 100644
--- a/mindee/extraction/common/__init__.py
+++ b/mindee/extraction/common/__init__.py
@@ -1,5 +1,5 @@
 from mindee.extraction.common.extracted_image import ExtractedImage
 from mindee.extraction.common.image_extractor import (
-    attach_image_as_new_file,
+    attach_images_as_new_file,
     extract_multiple_images_from_source,
 )
diff --git a/mindee/extraction/common/image_extractor.py b/mindee/extraction/common/image_extractor.py
index 046b312c..3277feaf 100644
--- a/mindee/extraction/common/image_extractor.py
+++ b/mindee/extraction/common/image_extractor.py
@@ -11,35 +11,34 @@
 from mindee.input.sources import BytesInput, LocalInputSource
 
 
-def attach_image_as_new_file(  # type: ignore
-    input_buffer: BinaryIO,
+def attach_images_as_new_file(  # type: ignore
+    input_buffer_list: List[BinaryIO],
 ) -> pdfium.PdfDocument:
     """
-    Attaches an image as a new page in a PdfDocument object.
+    Attaches a list of images as new pages in a PdfDocument object.
 
-    :param input_buffer: Input buffer.
+    :param input_buffer_list: List of images, represented as buffers.
     :return: A PdfDocument handle.
     """
-    # Create a new page in the PdfDocument
-    input_buffer.seek(0)
-    image = Image.open(input_buffer)
-    image.convert("RGB")
-    image_buffer = io.BytesIO()
-    image.save(image_buffer, format="JPEG")
-
     pdf = pdfium.PdfDocument.new()
-
-    image_pdf = pdfium.PdfImage.new(pdf)
-    image_pdf.load_jpeg(image_buffer)
-    width, height = image_pdf.get_size()
-
-    matrix = pdfium.PdfMatrix().scale(width, height)
-    image_pdf.set_matrix(matrix)
-
-    page = pdf.new_page(width, height)
-    page.insert_obj(image_pdf)
-    page.gen_content()
-    image.close()
+    for input_buffer in input_buffer_list:
+        input_buffer.seek(0)
+        image = Image.open(input_buffer)
+        image.convert("RGB")
+        image_buffer = io.BytesIO()
+        image.save(image_buffer, format="JPEG")
+
+        image_pdf = pdfium.PdfImage.new(pdf)
+        image_pdf.load_jpeg(image_buffer)
+        width, height = image_pdf.get_size()
+
+        matrix = pdfium.PdfMatrix().scale(width, height)
+        image_pdf.set_matrix(matrix)
+
+        page = pdf.new_page(width, height)
+        page.insert_obj(image_pdf)
+        page.gen_content()
+        image.close()
     return pdf
 
 
@@ -160,4 +159,4 @@ def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument:  # type: i
         input_file.file_object.seek(0)
         return pdfium.PdfDocument(input_file.file_object)
 
-    return attach_image_as_new_file(input_file.file_object)
+    return attach_images_as_new_file([input_file.file_object])
diff --git a/mindee/image_operations/__init__.py b/mindee/image_operations/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/mindee/image_operations/image_compressor.py b/mindee/image_operations/image_compressor.py
new file mode 100644
index 00000000..4b600938
--- /dev/null
+++ b/mindee/image_operations/image_compressor.py
@@ -0,0 +1,33 @@
+import io
+from typing import Union
+
+from PIL import Image
+
+
+def compress_image(
+    image_buffer: bytes,
+    quality: int = 85,
+    max_width: Union[int, float, None] = None,
+    max_height: Union[int, float, None] = None,
+) -> bytes:
+    """
+    Compresses an image with the given parameters.
+
+    :param image_buffer: Buffer representation of an image.
+    :param quality: Quality to apply to the image (JPEG compression).
+    :param max_width: Maximum bound for the width.
+    :param max_height: Maximum bound for the height.
+    :return:
+    """
+    with Image.open(io.BytesIO(image_buffer)) as img:
+        original_width, original_height = img.size
+        max_width = max_width or original_width
+        max_height = max_height or original_height
+        # Both bounds are always set here; thumbnail() never enlarges an image.
+        img.thumbnail((int(max_width), int(max_height)), Image.Resampling.LANCZOS)
+        # JPEG has no alpha/palette support: other modes are converted to RGB first.
+        jpeg_ready = img.mode in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
+        output_buffer = io.BytesIO()
+        to_save = img if jpeg_ready else img.convert("RGB")
+        to_save.save(output_buffer, format="JPEG", quality=quality, optimize=True)
+    return output_buffer.getvalue()
diff --git a/mindee/pdf/__init__.py b/mindee/pdf/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/mindee/pdf/pdf_char_data.py b/mindee/pdf/pdf_char_data.py
new file mode 100644
index 00000000..9f516655
--- /dev/null
+++ b/mindee/pdf/pdf_char_data.py
@@ -0,0 +1,30 @@
+from dataclasses import dataclass
+from typing import Tuple
+
+
+@dataclass
+class PDFCharData:
+    """Font and bounding-box data for one character extracted from a PDF page."""
+
+    char: str
+    """The character."""
+    left: int
+    """Left bound."""
+    right: int
+    """Right bound."""
+    top: int
+    """Top bound."""
+    bottom: int
+    """Bottom bound."""
+    font_name: str
+    """The font name."""
+    font_size: int
+    """The font size in pt."""
+    font_weight: int
+    """The font weight."""
+    font_flags: int
+    """The font flags."""
+    font_stroke_color: Tuple[int, int, int, int]
+    """RGBA representation of the font's stroke color."""
+    font_fill_color: Tuple[int, int, int, int]
+    """RGBA representation of the font's fill color."""
diff --git a/mindee/pdf/pdf_compressor.py b/mindee/pdf/pdf_compressor.py
new file mode 100644
index 00000000..ef7de3da
--- /dev/null
+++ b/mindee/pdf/pdf_compressor.py
@@ -0,0 +1,215 @@
+import logging
+from io import BytesIO
+from threading import RLock
+from typing import List, Optional
+
+import pypdfium2 as pdfium
+import pypdfium2.raw as pdfium_c
+
+from mindee.extraction import attach_images_as_new_file
+from mindee.image_operations.image_compressor import compress_image
+from mindee.pdf.pdf_char_data import PDFCharData
+from mindee.pdf.pdf_utils import extract_text_from_pdf, has_source_text
+
+logger = logging.getLogger(__name__)
+MIN_QUALITY = 1
+
+
+def compress_pdf(
+    pdf_data: bytes,
+    image_quality: int = 85,
+    force_source_text_compression: bool = False,
+    disable_source_text: bool = True,
+) -> bytes:
+    """
+    Compresses each page of a provided PDF buffer by rasterizing it to JPEG.
+
+    :param pdf_data: The input PDF as bytes.
+    :param image_quality: Compression quality (70-100 for most JPG images).
+    :param force_source_text_compression: If true, compresses even when the PDF contains text.
+    :param disable_source_text: If true, doesn't re-apply source text to the output PDF.
+    :return: Compressed PDF as bytes, or the input unchanged when compression is skipped or fails.
+    """
+    if has_source_text(pdf_data):
+        if force_source_text_compression:
+            if not disable_source_text:
+                logger.warning("Re-writing PDF source-text is an EXPERIMENTAL feature.")
+            else:
+                logger.warning(
+                    "Source file contains text, but disable_source_text flag "
+                    "is set to true. Resulting file will not contain any embedded text."
+                )
+        else:
+            logger.warning(
+                "Found text inside of the provided PDF file. Compression operation aborted since "
+                "force_source_text_compression is set to false."
+            )
+            return pdf_data
+
+    extracted_text = (
+        extract_text_from_pdf(pdf_data) if not disable_source_text else None
+    )
+
+    compressed_pages = compress_pdf_pages(
+        pdf_data, extracted_text, image_quality, disable_source_text
+    )
+
+    if not compressed_pages:
+        logger.warning(
+            "Could not compress PDF to a smaller size. Returning original PDF."
+        )
+        return pdf_data
+
+    out_pdf = attach_images_as_new_file(
+        [BytesIO(compressed_page) for compressed_page in compressed_pages]
+    )
+    out_bytes = BytesIO()
+    out_pdf.save(out_bytes)
+    # save() leaves the cursor at the end of the buffer, so read() would return b"".
+    return out_bytes.getvalue()
+
+
+def compress_pdf_pages(
+    pdf_data: bytes,
+    extracted_text: Optional[List[PDFCharData]],
+    image_quality: int,
+    disable_source_text: bool,
+) -> Optional[List[bytes]]:
+    """
+    Re-compresses the pages at decreasing quality until the result is small enough.
+
+    :param pdf_data: The input PDF as bytes.
+    :param extracted_text: Extracted text from the PDF.
+    :param image_quality: Starting quality; lowered each round down to MIN_QUALITY.
+    :param disable_source_text: If true, doesn't re-apply source text to the output PDF.
+    :return: Compressed page buffers, or None if no tried quality was small enough.
+    """
+    original_size = len(pdf_data)
+    image_quality_loop = image_quality
+
+    while image_quality_loop >= MIN_QUALITY:
+        compressed_pages = compress_pages_with_quality(
+            pdf_data, extracted_text, image_quality_loop, disable_source_text
+        )
+        total_compressed_size = sum(len(page) for page in compressed_pages)
+
+        if is_compression_successful(
+            total_compressed_size, original_size, image_quality
+        ):  # NOTE(review): checked against the initial quality, not the loop value - confirm.
+            return compressed_pages
+
+        image_quality_loop -= round(lerp(1, 10, image_quality_loop / 100))  # smaller steps at lower quality
+
+    return None
+
+
+def add_text_to_pdf_page(  # type: ignore
+    page: pdfium.PdfPage,
+    extracted_text: Optional[List[PDFCharData]],
+) -> None:
+    """
+    Adds text to a PDF page based on the extracted text data.
+
+    :param page: The PdfPage object to add text to.
+    :param extracted_text: Characters to write; nothing is done when empty or None.
+    """
+    if not extracted_text:
+        return
+
+    height = page.get_height()
+    document = page.pdf  # NOTE(review): confirm load_font/create_text_object exist on this object.
+    pdfium_lock = RLock()  # created per call, so it is not shared with any other caller
+
+    with pdfium_lock:
+        text_handler = pdfium_c.FPDFText_LoadPage(page.raw)
+        for char_data in extracted_text:
+            font = document.load_font(
+                char_data.font_name, pdfium_c.FPDF_FONT_TRUETYPE, True
+            )
+            text_object = document.create_text_object(font, char_data.font_size)
+            text_object.set_text(char_data.char)
+            x = char_data.left
+            y = height - char_data.bottom  # vertical position measured from the page height
+            text_object.set_position(x, y)
+            r, g, b, a = char_data.font_fill_color
+            text_object.set_fill_color(r, g, b, a)
+            pdfium_c.FPDFPage_InsertObject(text_handler, text_object)  # NOTE(review): receives the text-page handle, not page.raw - confirm.
+        pdfium_c.FPDFPage_GenerateContent(text_handler)  # NOTE(review): same handle question as above - confirm.
+
+    with pdfium_lock:
+        pdfium_c.FPDFText_ClosePage(text_handler)
+
+
+def compress_pages_with_quality(
+    pdf_data: bytes,
+    extracted_text: Optional[List[PDFCharData]],
+    image_quality: int,
+    disable_source_text: bool,
+) -> List[bytes]:
+    """
+    Rasterizes and compresses every page with a specific quality.
+
+    :param pdf_data: The input PDF as bytes.
+    :param extracted_text: Extracted text from the PDF.
+    :param image_quality: Compression quality.
+    :param disable_source_text: If true, doesn't re-apply source text to the output PDF.
+    :return: One compressed page buffer per page, in page order.
+    """
+    pdf_document = pdfium.PdfDocument(pdf_data)
+    compressed_pages = []
+
+    # Iterate the pages directly: enumerate() yields (index, page) tuples, not indices.
+    for page in pdf_document:
+        rasterized_page = rasterize_page(page, image_quality)
+        compressed_image = compress_image(rasterized_page, image_quality)
+
+        if not disable_source_text:
+            add_text_to_pdf_page(page, extracted_text)
+
+        compressed_pages.append(compressed_image)
+
+    return compressed_pages
+
+
+def is_compression_successful(
+    total_compressed_size: int, original_size: int, image_quality: int
+) -> bool:
+    """
+    Checks that the compressed pages, plus an estimated overhead, beat the original size.
+
+    :param total_compressed_size: Total size of compressed pages.
+    :param original_size: Original PDF size.
+    :param image_quality: Quality used to scale the overhead: 54% at 0 down to 18% at 100.
+    :return: True if compression was successful, false otherwise.
+    """
+    overhead = lerp(0.54, 0.18, image_quality / 100)
+    return total_compressed_size + total_compressed_size * overhead < original_size
+
+
+def rasterize_page(  # type: ignore
+    page: pdfium.PdfPage,
+    quality: int = 85,
+) -> bytes:
+    """
+    Renders a PDF page to an image and encodes it as JPEG.
+
+    :param page: PdfPage object to rasterize.
+    :param quality: JPEG quality handed to the image encoder.
+    :return: JPEG-encoded page as bytes.
+    """
+    image = page.render().to_pil()
+    buffer = BytesIO()
+    image.save(buffer, format="JPEG", quality=quality)
+    return buffer.getvalue()
+
+
+def lerp(start: float, end: float, t: float) -> float:
+    """
+    Performs linear interpolation between two numbers.
+
+    :param start: The value returned for t = 0.
+    :param end: The value returned for t = 1.
+    :param t: The interpolation factor; values outside 0-1 are not clamped.
+    :return: The interpolated value.
+    """
+    return start * (1 - t) + end * t
diff --git a/mindee/pdf/pdf_utils.py b/mindee/pdf/pdf_utils.py
new file mode 100644
index 00000000..2eb8635d
--- /dev/null
+++ b/mindee/pdf/pdf_utils.py
@@ -0,0 +1,241 @@
+from ctypes import byref, c_double, c_int, create_string_buffer
+from threading import RLock
+from typing import List, Tuple
+
+import pypdfium2 as pdfium
+import pypdfium2.raw as pdfium_c
+
+from mindee.pdf.pdf_char_data import PDFCharData
+
+FALLBACK_FONT = "Helvetica"
+
+
+def has_source_text(pdf_bytes: bytes) -> bool:
+    """
+    Checks if the provided PDF bytes contain source text.
+
+    :param pdf_bytes: Raw bytes representation of a PDF file
+    :return: True as soon as one page yields non-whitespace text, False otherwise.
+    """
+    pdf = pdfium.PdfDocument(pdf_bytes)
+    for page in pdf:
+        if len(page.get_textpage().get_text_bounded().strip()) > 0:
+            return True
+    return False
+
+
+def extract_text_from_pdf(pdf_bytes: bytes) -> List[PDFCharData]:
+    """
+    Extracts the raw text from a given PDF's bytes along with font data.
+
+    :param pdf_bytes: Raw bytes representation of a PDF file.
+    :return: One entry per character, with all pages appended to a single list in page order.
+    """
+    char_data_list: List[PDFCharData] = []
+    pdfium_lock = RLock()  # one lock per call, handed down to the per-page helpers
+    pdf = pdfium.PdfDocument(pdf_bytes)
+
+    for page in pdf:
+        process_page(page, pdfium_lock, char_data_list)
+
+    return char_data_list
+
+
+def process_page(page, pdfium_lock: RLock, char_data_list: List[PDFCharData]):
+    """
+    Appends the data of every character found on one page to ``char_data_list``.
+
+    :param page: The PDF page to process.
+    :param pdfium_lock: Lock for thread-safe operations.
+    :param char_data_list: List to append character data to.
+    """
+    internal_height = page.get_height()
+    internal_width = page.get_width()
+
+    with pdfium_lock:
+        text_handler = pdfium_c.FPDFText_LoadPage(page.raw)
+        count_chars = pdfium_c.FPDFText_CountChars(text_handler)
+
+    for i in range(count_chars):
+        process_char(
+            i,
+            text_handler,
+            page,
+            pdfium_lock,
+            internal_height,
+            internal_width,
+            char_data_list,
+        )
+
+    with pdfium_lock:
+        pdfium_c.FPDFText_ClosePage(text_handler)  # closes the handle loaded above
+
+
+def process_char(
+    i: int,
+    text_handler,
+    page,
+    pdfium_lock: RLock,
+    internal_height: float,
+    internal_width: float,
+    char_data_list: List[PDFCharData],
+):
+    """
+    Builds the data for one character and appends it to ``char_data_list``.
+
+    :param i: The index of the character.
+    :param text_handler: The text handler for the current page.
+    :param page: The current page being processed.
+    :param pdfium_lock: Lock for thread-safe operations.
+    :param internal_height: The height of the page.
+    :param internal_width: The width of the page.
+    :param char_data_list: List to append character data to.
+    """
+    char_info = get_char_info(i, text_handler, pdfium_lock)
+    char_box = get_char_box(i, text_handler, pdfium_lock)
+    rotation = get_page_rotation(page, pdfium_lock)  # looked up again for every character
+
+    adjusted_box = adjust_char_box(char_box, rotation, internal_height, internal_width)
+
+    for c in char_info["char"] or " ":  # an empty string falls back to a single space
+        char_data = PDFCharData(
+            char=c,
+            left=int(adjusted_box[0]),  # int() drops the fractional part of each bound
+            right=int(adjusted_box[1]),
+            top=int(adjusted_box[2]),
+            bottom=int(adjusted_box[3]),
+            font_name=char_info["font_name"],
+            font_size=char_info["font_size"],
+            font_weight=char_info["font_weight"],
+            font_stroke_color=char_info["font_stroke_color"],
+            font_fill_color=char_info["font_fill_color"],
+            font_flags=char_info["font_flags"],
+        )
+        char_data_list.append(char_data)
+
+
+def get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict:
+    """
+    Retrieves information about a specific character.
+
+    :param i: The index of the character.
+    :param text_handler: The text handler for the current page.
+    :param pdfium_lock: Lock for thread-safe operations.
+    :return: A dictionary of character information; colors are (R, G, B, A) tuples.
+    """
+    from ctypes import c_uint  # local import: not part of the module-level ctypes import
+
+    stroke = [c_uint(0) for _ in range(4)]
+    fill = [c_uint(0) for _ in range(4)]
+    with pdfium_lock:
+        info = {
+            "char": chr(pdfium_c.FPDFText_GetUnicode(text_handler, i)),
+            "font_name": get_font_name(text_handler, i),
+            "font_flags": get_font_flags(text_handler, i),
+            "font_size": pdfium_c.FPDFText_GetFontSize(text_handler, i),
+            "font_weight": pdfium_c.FPDFText_GetFontWeight(text_handler, i),
+        }
+        # PDFium hands colors back through four unsigned-int out-parameters (R, G, B, A).
+        pdfium_c.FPDFText_GetStrokeColor(text_handler, i, *map(byref, stroke))
+        pdfium_c.FPDFText_GetFillColor(text_handler, i, *map(byref, fill))
+    info["font_stroke_color"] = tuple(c.value for c in stroke)
+    info["font_fill_color"] = tuple(c.value for c in fill)
+    return info
+
+
+def get_font_name(text_handler, i: int) -> str:
+    """
+    Retrieves the font name for a specific character.
+
+    :param text_handler: The text handler for the current page.
+    :param i: The index of the character.
+    :return: The font name, or FALLBACK_FONT when the reported length is not positive.
+    """
+    buffer_length = 128  # NOTE(review): presumably enough for real font names - confirm.
+    font_name_buffer = create_string_buffer(buffer_length)
+    flags = c_int(0)  # out-parameter whose value is not used here
+    actual_length = pdfium_c.FPDFText_GetFontInfo(
+        text_handler, i, font_name_buffer, buffer_length, byref(flags)
+    )
+    return (
+        font_name_buffer.value.decode("utf-8") if actual_length > 0 else FALLBACK_FONT
+    )
+
+
+def get_font_flags(text_handler, i: int) -> int:
+    """
+    Retrieves the font flags for a specific character.
+
+    :param text_handler: The text handler for the current page.
+    :param i: The index of the character.
+    :return: The font flags as an integer (0 if the call leaves them untouched).
+    """
+    flags = c_int(0)
+    pdfium_c.FPDFText_GetFontInfo(text_handler, i, None, 0, byref(flags))  # no name buffer passed
+    return flags.value
+
+
+def get_char_box(
+    i: int, text_handler, pdfium_lock: RLock
+) -> Tuple[float, float, float, float]:
+    """
+    Retrieves the bounding box for a specific character.
+
+    :param i: The index of the character.
+    :param text_handler: The text handler for the current page.
+    :param pdfium_lock: Lock for thread-safe operations.
+    :return: A (left, right, bottom, top) tuple; all zeros if the call fills nothing in.
+    """
+    left, right, bottom, top = (c_double(0), c_double(0), c_double(0), c_double(0))
+    with pdfium_lock:
+        pdfium_c.FPDFText_GetCharBox(  # the four bounds are passed by reference
+            text_handler, i, byref(left), byref(right), byref(bottom), byref(top)
+        )
+    return left.value, right.value, bottom.value, top.value
+
+
+def get_page_rotation(page, pdfium_lock: RLock) -> int:
+    """
+    Retrieves the rotation value for a specific page.
+
+    :param page: The page to get the rotation for.
+    :param pdfium_lock: Lock for thread-safe operations.
+    :return: The rotation in degrees (0, 90, 180 or 270); any other code maps to 0.
+    """
+    with pdfium_lock:
+        return {0: 0, 1: 90, 2: 180, 3: 270}.get(
+            pdfium_c.FPDFPage_GetRotation(page.raw), 0
+        )
+
+
+def adjust_char_box(
+    char_box: Tuple[float, float, float, float],
+    rotation: int,
+    internal_height: float,
+    internal_width: float,
+) -> Tuple[float, float, float, float]:
+    """
+    Adjusts the character bounding box based on page rotation.
+
+    :param char_box: The original box, unpacked as (left, right, bottom, top).
+    :param rotation: The page rotation in degrees; only 0, 90, 180 and 270 change the values.
+    :param internal_height: The height of the page.
+    :param internal_width: The width of the page.
+    :return: The adjusted box as (left, right, top, bottom) - not the input order.
+    """
+    left, right, bottom, top = char_box
+    if rotation == 0:
+        top, bottom = internal_height - top, internal_height - bottom
+    elif rotation == 90:
+        left, right, top, bottom = bottom, top, left, right
+    elif rotation == 180:
+        left, right = internal_width - right, internal_width - left
+        top, bottom = bottom, top
+    elif rotation == 270:
+        left, right, top, bottom = (
+            internal_width - top,
+            internal_width - bottom,
+            internal_height - right,
+            internal_height - left,
+        )
+    return left, right, top, bottom

From 071b95fba4bcfc4b898d5b2e733a60271c7a0624 Mon Sep 17 00:00:00 2001
From: sebastianMindee <sebastian.oliverasilvera@mindee.co>
Date: Thu, 16 Jan 2025 17:45:51 +0100
Subject: [PATCH 2/7] temporary (not working) version

---
 ...uto_invoice_splitter_extraction_example.py |   2 +-
 mindee/client.py                              |  14 +-
 mindee/extraction/__init__.py                 |   1 -
 mindee/extraction/common/__init__.py          |   1 -
 mindee/extraction/common/extracted_image.py   |   3 +-
 mindee/extraction/common/image_extractor.py   |  39 +--
 .../multi_receipts_extractor.py               |   2 +-
 .../extraction/pdf_extractor/extracted_pdf.py |   2 +-
 .../extraction/pdf_extractor/pdf_extractor.py |   2 +-
 mindee/image_operations/image_compressor.py   |  10 +-
 mindee/input/__init__.py                      |   3 +-
 mindee/input/sources/__init__.py              |   3 +-
 mindee/input/sources/base_64_input.py         |   3 +-
 mindee/input/sources/bytes_input.py           |   3 +-
 mindee/input/sources/file_input.py            |   3 +-
 mindee/input/sources/input_type.py            |  11 +
 mindee/input/sources/local_input_source.py    |  55 ++++-
 mindee/input/sources/path_input.py            |   3 +-
 mindee/input/sources/url_input_source.py      |   2 +-
 mindee/mindee_http/endpoint.py                |   3 +-
 mindee/pdf/__init__.py                        |   7 +
 mindee/pdf/pdf_compressor.py                  |  25 +-
 mindee/pdf/pdf_utils.py                       |  35 ++-
 tests/api/test_async_response.py              |   2 +-
 tests/extraction/test_image_extractor.py      |   2 +-
 .../test_invoice_splitter_auto_extraction.py  |   2 +-
 .../test_multi_receipts_extractor.py          |   2 +-
 tests/extraction/test_pdf_extractor.py        |   2 +-
 tests/input/test_compression.py               | 224 ++++++++++++++++++
 tests/mindee_http/test_error.py               |   2 +-
 tests/test_client.py                          |   2 +-
 tests/test_inputs.py                          |  12 +-
 32 files changed, 385 insertions(+), 97 deletions(-)
 create mode 100644 mindee/input/sources/input_type.py
 create mode 100644 tests/input/test_compression.py

diff --git a/examples/auto_invoice_splitter_extraction_example.py b/examples/auto_invoice_splitter_extraction_example.py
index 161075d0..a9a2bb5a 100644
--- a/examples/auto_invoice_splitter_extraction_example.py
+++ b/examples/auto_invoice_splitter_extraction_example.py
@@ -1,6 +1,6 @@
 from mindee import Client
 from mindee.extraction.pdf_extractor import PdfExtractor
-from mindee.input.sources import PathInput
+from mindee.input.sources.path_input import PathInput
 from mindee.product.invoice.invoice_v4 import InvoiceV4
 from mindee.product.invoice_splitter.invoice_splitter_v1 import InvoiceSplitterV1
 
diff --git a/mindee/client.py b/mindee/client.py
index bebd8560..901b68cb 100644
--- a/mindee/client.py
+++ b/mindee/client.py
@@ -7,14 +7,12 @@
 from mindee.input import WorkflowOptions
 from mindee.input.local_response import LocalResponse
 from mindee.input.page_options import PageOptions
-from mindee.input.sources import (
-    Base64Input,
-    BytesInput,
-    FileInput,
-    LocalInputSource,
-    PathInput,
-    UrlInputSource,
-)
+from mindee.input.sources.base_64_input import Base64Input
+from mindee.input.sources.bytes_input import BytesInput
+from mindee.input.sources.file_input import FileInput
+from mindee.input.sources.local_input_source import LocalInputSource
+from mindee.input.sources.path_input import PathInput
+from mindee.input.sources.url_input_source import UrlInputSource
 from mindee.logger import logger
 from mindee.mindee_http.endpoint import CustomEndpoint, Endpoint
 from mindee.mindee_http.mindee_api import MindeeApi
diff --git a/mindee/extraction/__init__.py b/mindee/extraction/__init__.py
index 2e190d7b..9b86d0ee 100644
--- a/mindee/extraction/__init__.py
+++ b/mindee/extraction/__init__.py
@@ -1,6 +1,5 @@
 from mindee.extraction.common.extracted_image import ExtractedImage
 from mindee.extraction.common.image_extractor import (
-    attach_images_as_new_file,
     extract_multiple_images_from_source,
 )
 from mindee.extraction.multi_receipts_extractor import multi_receipts_extractor
diff --git a/mindee/extraction/common/__init__.py b/mindee/extraction/common/__init__.py
index 009009d9..1acb7bb9 100644
--- a/mindee/extraction/common/__init__.py
+++ b/mindee/extraction/common/__init__.py
@@ -1,5 +1,4 @@
 from mindee.extraction.common.extracted_image import ExtractedImage
 from mindee.extraction.common.image_extractor import (
-    attach_images_as_new_file,
     extract_multiple_images_from_source,
 )
diff --git a/mindee/extraction/common/extracted_image.py b/mindee/extraction/common/extracted_image.py
index 3d6b6f22..e4013246 100644
--- a/mindee/extraction/common/extracted_image.py
+++ b/mindee/extraction/common/extracted_image.py
@@ -5,7 +5,8 @@
 from PIL import Image
 
 from mindee.error.mindee_error import MindeeError
-from mindee.input.sources import FileInput, LocalInputSource
+from mindee.input.sources.file_input import FileInput
+from mindee.input.sources.local_input_source import LocalInputSource
 from mindee.logger import logger
 
 
diff --git a/mindee/extraction/common/image_extractor.py b/mindee/extraction/common/image_extractor.py
index 3277feaf..95609e5a 100644
--- a/mindee/extraction/common/image_extractor.py
+++ b/mindee/extraction/common/image_extractor.py
@@ -1,5 +1,5 @@
 import io
-from typing import BinaryIO, List
+from typing import List
 
 import pypdfium2 as pdfium
 from PIL import Image
@@ -8,38 +8,9 @@
 from mindee.extraction.common.extracted_image import ExtractedImage
 from mindee.geometry.point import Point
 from mindee.geometry.polygon import get_min_max_x, get_min_max_y
-from mindee.input.sources import BytesInput, LocalInputSource
-
-
-def attach_images_as_new_file(  # type: ignore
-    input_buffer_list: List[BinaryIO],
-) -> pdfium.PdfDocument:
-    """
-    Attaches a list of images as new pages in a PdfDocument object.
-
-    :param input_buffer_list: List of images, represented as buffers.
-    :return: A PdfDocument handle.
-    """
-    pdf = pdfium.PdfDocument.new()
-    for input_buffer in input_buffer_list:
-        input_buffer.seek(0)
-        image = Image.open(input_buffer)
-        image.convert("RGB")
-        image_buffer = io.BytesIO()
-        image.save(image_buffer, format="JPEG")
-
-        image_pdf = pdfium.PdfImage.new(pdf)
-        image_pdf.load_jpeg(image_buffer)
-        width, height = image_pdf.get_size()
-
-        matrix = pdfium.PdfMatrix().scale(width, height)
-        image_pdf.set_matrix(matrix)
-
-        page = pdf.new_page(width, height)
-        page.insert_obj(image_pdf)
-        page.gen_content()
-        image.close()
-    return pdf
+from mindee.input.sources.bytes_input import BytesInput
+from mindee.input.sources.local_input_source import LocalInputSource
+from mindee.pdf.pdf_utils import attach_images_as_new_file
 
 
 def extract_image_from_polygon(
@@ -157,6 +128,6 @@ def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument:  # type: i
     """
     if input_file.is_pdf():
         input_file.file_object.seek(0)
-        return pdfium.PdfDocument(input_file.file_object)
+        return pdfium.PdfDocument(input_file.file_object.read())
 
     return attach_images_as_new_file([input_file.file_object])
diff --git a/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.py b/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.py
index 89ad63b9..7c31ca93 100644
--- a/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.py
+++ b/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.py
@@ -5,7 +5,7 @@
 from mindee.extraction.common.image_extractor import (
     extract_multiple_images_from_source,
 )
-from mindee.input.sources import LocalInputSource
+from mindee.input.sources.local_input_source import LocalInputSource
 from mindee.parsing.common.inference import Inference
 
 
diff --git a/mindee/extraction/pdf_extractor/extracted_pdf.py b/mindee/extraction/pdf_extractor/extracted_pdf.py
index fd02ce90..0e3dcb8d 100644
--- a/mindee/extraction/pdf_extractor/extracted_pdf.py
+++ b/mindee/extraction/pdf_extractor/extracted_pdf.py
@@ -4,7 +4,7 @@
 import pypdfium2 as pdfium
 
 from mindee.error.mindee_error import MindeeError
-from mindee.input.sources import BytesInput
+from mindee.input.sources.bytes_input import BytesInput
 
 
 class ExtractedPdf:
diff --git a/mindee/extraction/pdf_extractor/pdf_extractor.py b/mindee/extraction/pdf_extractor/pdf_extractor.py
index 1a2023ca..5d5f2e19 100644
--- a/mindee/extraction/pdf_extractor/pdf_extractor.py
+++ b/mindee/extraction/pdf_extractor/pdf_extractor.py
@@ -7,7 +7,7 @@
 
 from mindee.error.mindee_error import MindeeError
 from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf
-from mindee.input.sources import LocalInputSource
+from mindee.input.sources.local_input_source import LocalInputSource
 from mindee.product.invoice_splitter.invoice_splitter_v1_page_group import (
     InvoiceSplitterV1PageGroup,
 )
diff --git a/mindee/image_operations/image_compressor.py b/mindee/image_operations/image_compressor.py
index 4b600938..82b0bf87 100644
--- a/mindee/image_operations/image_compressor.py
+++ b/mindee/image_operations/image_compressor.py
@@ -1,11 +1,11 @@
 import io
-from typing import Union
+from typing import BinaryIO, Union
 
 from PIL import Image
 
 
 def compress_image(
-    image_buffer: bytes,
+    image_buffer: Union[BinaryIO, bytes],
     quality: int = 85,
     max_width: Union[int, float, None] = None,
     max_height: Union[int, float, None] = None,
@@ -13,13 +13,15 @@ def compress_image(
     """
     Compresses an image with the given parameters.
 
-    :param image_buffer: Buffer representation of an image.
+    :param image_buffer: Buffer representation of an image, also accepts BinaryIO.
     :param quality: Quality to apply to the image (JPEG compression).
     :param max_width: Maximum bound for the width.
     :param max_height: Maximum bound for the height.
     :return:
     """
-    with Image.open(io.BytesIO(image_buffer)) as img:
+    if isinstance(image_buffer, bytes):
+        image_buffer = io.BytesIO(image_buffer)
+    with Image.open(image_buffer) as img:
         original_width, original_height = img.size
         max_width = max_width or original_width
         max_height = max_height or original_height
diff --git a/mindee/input/__init__.py b/mindee/input/__init__.py
index 3c75c072..82624650 100644
--- a/mindee/input/__init__.py
+++ b/mindee/input/__init__.py
@@ -3,7 +3,8 @@
 from mindee.input.sources.base_64_input import Base64Input
 from mindee.input.sources.bytes_input import BytesInput
 from mindee.input.sources.file_input import FileInput
-from mindee.input.sources.local_input_source import InputType, LocalInputSource
+from mindee.input.sources.input_type import InputType
+from mindee.input.sources.local_input_source import LocalInputSource
 from mindee.input.sources.path_input import PathInput
 from mindee.input.sources.url_input_source import UrlInputSource
 from mindee.input.workflow_options import WorkflowOptions
diff --git a/mindee/input/sources/__init__.py b/mindee/input/sources/__init__.py
index 6f8a51e3..c7d9c22a 100644
--- a/mindee/input/sources/__init__.py
+++ b/mindee/input/sources/__init__.py
@@ -1,6 +1,7 @@
 from mindee.input.sources.base_64_input import Base64Input
 from mindee.input.sources.bytes_input import BytesInput
 from mindee.input.sources.file_input import FileInput
-from mindee.input.sources.local_input_source import InputType, LocalInputSource
+from mindee.input.sources.input_type import InputType
+from mindee.input.sources.local_input_source import LocalInputSource
 from mindee.input.sources.path_input import PathInput
 from mindee.input.sources.url_input_source import UrlInputSource
diff --git a/mindee/input/sources/base_64_input.py b/mindee/input/sources/base_64_input.py
index b651bd23..b656255b 100644
--- a/mindee/input/sources/base_64_input.py
+++ b/mindee/input/sources/base_64_input.py
@@ -1,7 +1,8 @@
 import base64
 import io
 
-from mindee.input.sources.local_input_source import InputType, LocalInputSource
+from mindee.input.sources.input_type import InputType
+from mindee.input.sources.local_input_source import LocalInputSource
 
 
 class Base64Input(LocalInputSource):
diff --git a/mindee/input/sources/bytes_input.py b/mindee/input/sources/bytes_input.py
index 13fbf41d..1f2b63fd 100644
--- a/mindee/input/sources/bytes_input.py
+++ b/mindee/input/sources/bytes_input.py
@@ -1,6 +1,7 @@
 import io
 
-from mindee.input.sources.local_input_source import InputType, LocalInputSource
+from mindee.input.sources.input_type import InputType
+from mindee.input.sources.local_input_source import LocalInputSource
 
 
 class BytesInput(LocalInputSource):
diff --git a/mindee/input/sources/file_input.py b/mindee/input/sources/file_input.py
index 561fd754..2623a4f3 100644
--- a/mindee/input/sources/file_input.py
+++ b/mindee/input/sources/file_input.py
@@ -1,7 +1,8 @@
 import os
 from typing import BinaryIO
 
-from mindee.input.sources.local_input_source import InputType, LocalInputSource
+from mindee.input.sources.input_type import InputType
+from mindee.input.sources.local_input_source import LocalInputSource
 
 
 class FileInput(LocalInputSource):
diff --git a/mindee/input/sources/input_type.py b/mindee/input/sources/input_type.py
new file mode 100644
index 00000000..6daf1131
--- /dev/null
+++ b/mindee/input/sources/input_type.py
@@ -0,0 +1,11 @@
+from enum import Enum
+
+
+class InputType(Enum):
+    """The kind of input (file, base64, bytes, path or URL), for internal use."""
+
+    FILE = "file"
+    BASE64 = "base64"
+    BYTES = "bytes"
+    PATH = "path"
+    URL = "url"
diff --git a/mindee/input/sources/local_input_source.py b/mindee/input/sources/local_input_source.py
index ef5bcaf5..9f6f5cc6 100644
--- a/mindee/input/sources/local_input_source.py
+++ b/mindee/input/sources/local_input_source.py
@@ -1,15 +1,18 @@
 import io
 import mimetypes
 import tempfile
-from enum import Enum
 from typing import BinaryIO, Optional, Sequence, Tuple
 
 import pypdfium2 as pdfium
 
 from mindee.error.mimetype_error import MimeTypeError
 from mindee.error.mindee_error import MindeeError, MindeeSourceError
+from mindee.image_operations.image_compressor import compress_image
 from mindee.input.page_options import KEEP_ONLY, REMOVE
+from mindee.input.sources.input_type import InputType
 from mindee.logger import logger
+from mindee.pdf.pdf_compressor import compress_pdf
+from mindee.pdf.pdf_utils import has_source_text
 
 mimetypes.add_type("image/heic", ".heic")
 mimetypes.add_type("image/heic", ".heif")
@@ -25,16 +28,6 @@
 ]
 
 
-class InputType(Enum):
-    """The input type, for internal use."""
-
-    FILE = "file"
-    BASE64 = "base64"
-    BYTES = "bytes"
-    PATH = "path"
-    URL = "url"
-
-
 class LocalInputSource:
     """Base class for all input sources coming from the local machine."""
 
@@ -202,3 +195,43 @@ def read_contents(self, close_file: bool) -> Tuple[str, bytes]:
     def close(self) -> None:
         """Close the file object."""
         self.file_object.close()
+
+    def has_source_text(self) -> bool:
+        """If the file is a PDF, checks if it has source text.
+
+        :return: True if the file is a PDF and has source text. False otherwise.
+        """
+        if not self.is_pdf():
+            return False
+        self.file_object.seek(0)  # A previous read may have left the cursor at EOF.
+        return has_source_text(self.file_object.read())
+
+    def compress(
+        self,
+        quality: int = 85,
+        max_width: Optional[int] = None,
+        max_height: Optional[int] = None,
+        force_source_text: bool = False,
+        disable_source_text: bool = True,
+    ) -> None:
+        """
+        Compresses the file object in place, either as a PDF or an image.
+
+        :param quality: Quality of the compression. For images, this is the JPEG quality.
+            For PDFs, this affects image quality within the PDF.
+        :param max_width: Maximum width for image resizing. Ignored for PDFs.
+        :param max_height: Maximum height for image resizing. Ignored for PDFs.
+        :param force_source_text: For PDFs, whether to force compression even if source text is present.
+        :param disable_source_text: For PDFs, if True, source text is not re-applied to the output.
+        """
+        self.file_object.seek(0)  # compress_pdf reads from the current position.
+        if self.is_pdf():
+            new_file_bytes = compress_pdf(
+                self.file_object, quality, force_source_text, disable_source_text
+            )
+        else:
+            new_file_bytes = compress_image(
+                self.file_object, quality, max_width, max_height
+            )
+
+        self.file_object = io.BytesIO(new_file_bytes)
diff --git a/mindee/input/sources/path_input.py b/mindee/input/sources/path_input.py
index 3f9698b4..2e7fc736 100644
--- a/mindee/input/sources/path_input.py
+++ b/mindee/input/sources/path_input.py
@@ -2,7 +2,8 @@
 from pathlib import Path
 from typing import Union
 
-from mindee.input.sources.local_input_source import InputType, LocalInputSource
+from mindee.input.sources.input_type import InputType
+from mindee.input.sources.local_input_source import LocalInputSource
 
 
 class PathInput(LocalInputSource):
diff --git a/mindee/input/sources/url_input_source.py b/mindee/input/sources/url_input_source.py
index 983343e5..0e62573a 100644
--- a/mindee/input/sources/url_input_source.py
+++ b/mindee/input/sources/url_input_source.py
@@ -10,7 +10,7 @@
 
 from mindee.error.mindee_error import MindeeSourceError
 from mindee.input.sources.bytes_input import BytesInput
-from mindee.input.sources.local_input_source import InputType
+from mindee.input.sources.input_type import InputType
 from mindee.logger import logger
 
 
diff --git a/mindee/mindee_http/endpoint.py b/mindee/mindee_http/endpoint.py
index fdbd2ae7..227c1e2f 100644
--- a/mindee/mindee_http/endpoint.py
+++ b/mindee/mindee_http/endpoint.py
@@ -4,7 +4,8 @@
 import requests
 from requests import Response
 
-from mindee.input.sources import LocalInputSource, UrlInputSource
+from mindee.input.sources.local_input_source import LocalInputSource
+from mindee.input.sources.url_input_source import UrlInputSource
 from mindee.mindee_http.base_endpoint import BaseEndpoint
 from mindee.mindee_http.mindee_api import MindeeApi
 from mindee.parsing.common.string_dict import StringDict
diff --git a/mindee/pdf/__init__.py b/mindee/pdf/__init__.py
index e69de29b..5afcb672 100644
--- a/mindee/pdf/__init__.py
+++ b/mindee/pdf/__init__.py
@@ -0,0 +1,7 @@
+from mindee.pdf.pdf_char_data import PDFCharData
+from mindee.pdf.pdf_compressor import compress_pdf
+from mindee.pdf.pdf_utils import (
+    attach_images_as_new_file,
+    extract_text_from_pdf,
+    has_source_text,
+)
diff --git a/mindee/pdf/pdf_compressor.py b/mindee/pdf/pdf_compressor.py
index ef7de3da..12bce09a 100644
--- a/mindee/pdf/pdf_compressor.py
+++ b/mindee/pdf/pdf_compressor.py
@@ -1,22 +1,25 @@
+import io
 import logging
-from io import BytesIO
 from threading import RLock
-from typing import List, Optional
+from typing import BinaryIO, List, Optional, Union
 
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
 
-from mindee.extraction import attach_images_as_new_file
 from mindee.image_operations.image_compressor import compress_image
 from mindee.pdf.pdf_char_data import PDFCharData
-from mindee.pdf.pdf_utils import extract_text_from_pdf, has_source_text
+from mindee.pdf.pdf_utils import (
+    attach_images_as_new_file,
+    extract_text_from_pdf,
+    has_source_text,
+)
 
 logger = logging.getLogger(__name__)
 MIN_QUALITY = 1
 
 
 def compress_pdf(
-    pdf_data: bytes,
+    pdf_data: Union[BinaryIO, bytes],
     image_quality: int = 85,
     force_source_text_compression: bool = False,
     disable_source_text: bool = True,
@@ -30,6 +33,9 @@ def compress_pdf(
     :param disable_source_text: If true, doesn't re-apply source text to the output PDF.
     :return: Compressed PDF as bytes.
     """
+    if not isinstance(pdf_data, bytes):
+        pdf_data = pdf_data.read()
+
     if has_source_text(pdf_data):
         if force_source_text_compression:
             if not disable_source_text:
@@ -61,9 +67,9 @@ def compress_pdf(
         return pdf_data
 
     out_pdf = attach_images_as_new_file(
-        [BytesIO(compressed_page) for compressed_page in compressed_pages]
+        [io.BytesIO(compressed_page) for compressed_page in compressed_pages]
     )
-    out_bytes = BytesIO()
+    out_bytes = io.BytesIO()
     out_pdf.save(out_bytes)
 
     return out_bytes.read()
@@ -158,8 +164,7 @@ def compress_pages_with_quality(
     pdf_document = pdfium.PdfDocument(pdf_data)
     compressed_pages = []
 
-    for i in enumerate(pdf_document):
-        page = pdf_document[i]
+    for [_, page] in enumerate(pdf_document):
         rasterized_page = rasterize_page(page, image_quality)
         compressed_image = compress_image(rasterized_page, image_quality)
 
@@ -198,7 +203,7 @@ def rasterize_page(  # type: ignore
     :return: Rasterized page as bytes.
     """
     image = page.render().to_pil()
-    buffer = BytesIO()
+    buffer = io.BytesIO()
     image.save(buffer, format="JPEG", quality=quality)
     return buffer.getvalue()
 
diff --git a/mindee/pdf/pdf_utils.py b/mindee/pdf/pdf_utils.py
index 2eb8635d..747133dd 100644
--- a/mindee/pdf/pdf_utils.py
+++ b/mindee/pdf/pdf_utils.py
@@ -1,9 +1,11 @@
+import io
 from ctypes import byref, c_double, c_int, create_string_buffer
 from threading import RLock
-from typing import List, Tuple
+from typing import BinaryIO, List, Tuple
 
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
+from PIL import Image
 
 from mindee.pdf.pdf_char_data import PDFCharData
 
@@ -239,3 +241,34 @@ def adjust_char_box(
             internal_height - left,
         )
     return left, right, top, bottom
+
+
+def attach_images_as_new_file(  # type: ignore
+    input_buffer_list: List[BinaryIO],
+) -> pdfium.PdfDocument:
+    """
+    Attaches a list of images as new pages in a PdfDocument object.
+    Each image is re-encoded as an RGB JPEG and gets a page of its own size.
+
+    :param input_buffer_list: List of images, represented as buffers.
+    :return: A PdfDocument handle.
+    """
+    pdf = pdfium.PdfDocument.new()
+    for input_buffer in input_buffer_list:
+        input_buffer.seek(0)
+        image_buffer = io.BytesIO()
+        with Image.open(input_buffer) as image:
+            # convert() returns a new image, so chain it before saving as JPEG.
+            image.convert("RGB").save(image_buffer, format="JPEG")
+
+        image_pdf = pdfium.PdfImage.new(pdf)
+        image_pdf.load_jpeg(image_buffer)
+        width, height = image_pdf.get_size()
+
+        matrix = pdfium.PdfMatrix().scale(width, height)
+        image_pdf.set_matrix(matrix)
+
+        page = pdf.new_page(width, height)
+        page.insert_obj(image_pdf)
+        page.gen_content()
+    return pdf
diff --git a/tests/api/test_async_response.py b/tests/api/test_async_response.py
index 31319095..e8163d0c 100644
--- a/tests/api/test_async_response.py
+++ b/tests/api/test_async_response.py
@@ -5,7 +5,7 @@
 import requests
 
 from mindee.client import Client
-from mindee.input.sources import PathInput
+from mindee.input.sources.path_input import PathInput
 from mindee.mindee_http.response_validation import is_valid_async_response
 from mindee.parsing.common.api_request import RequestStatus
 from mindee.parsing.common.async_predict_response import AsyncPredictResponse
diff --git a/tests/extraction/test_image_extractor.py b/tests/extraction/test_image_extractor.py
index f41dc4a4..7f6d5db2 100644
--- a/tests/extraction/test_image_extractor.py
+++ b/tests/extraction/test_image_extractor.py
@@ -4,7 +4,7 @@
 from PIL import Image
 
 from mindee.extraction.common.image_extractor import extract_multiple_images_from_source
-from mindee.input.sources import PathInput
+from mindee.input.sources.path_input import PathInput
 from mindee.product.barcode_reader.barcode_reader_v1 import BarcodeReaderV1
 from tests.test_inputs import PRODUCT_DATA_DIR
 
diff --git a/tests/extraction/test_invoice_splitter_auto_extraction.py b/tests/extraction/test_invoice_splitter_auto_extraction.py
index 716628e7..3abc2d2a 100644
--- a/tests/extraction/test_invoice_splitter_auto_extraction.py
+++ b/tests/extraction/test_invoice_splitter_auto_extraction.py
@@ -4,7 +4,7 @@
 
 from mindee import Client
 from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor
-from mindee.input.sources import PathInput
+from mindee.input.sources.path_input import PathInput
 from mindee.parsing.common.document import Document
 from mindee.product.invoice.invoice_v4 import InvoiceV4
 from mindee.product.invoice_splitter.invoice_splitter_v1 import InvoiceSplitterV1
diff --git a/tests/extraction/test_multi_receipts_extractor.py b/tests/extraction/test_multi_receipts_extractor.py
index 0f71d1fe..00e22f12 100644
--- a/tests/extraction/test_multi_receipts_extractor.py
+++ b/tests/extraction/test_multi_receipts_extractor.py
@@ -6,7 +6,7 @@
 from mindee.extraction.multi_receipts_extractor.multi_receipts_extractor import (
     extract_receipts,
 )
-from mindee.input.sources import PathInput
+from mindee.input.sources.path_input import PathInput
 from mindee.product.multi_receipts_detector.multi_receipts_detector_v1 import (
     MultiReceiptsDetectorV1,
 )
diff --git a/tests/extraction/test_pdf_extractor.py b/tests/extraction/test_pdf_extractor.py
index e323cd2c..a236d9c2 100644
--- a/tests/extraction/test_pdf_extractor.py
+++ b/tests/extraction/test_pdf_extractor.py
@@ -3,7 +3,7 @@
 from mindee import Client
 from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor
 from mindee.input.local_response import LocalResponse
-from mindee.input.sources import PathInput
+from mindee.input.sources.path_input import PathInput
 from mindee.product.invoice_splitter.invoice_splitter_v1 import InvoiceSplitterV1
 from mindee.product.invoice_splitter.invoice_splitter_v1_document import (
     InvoiceSplitterV1Document,
diff --git a/tests/input/test_compression.py b/tests/input/test_compression.py
new file mode 100644
index 00000000..900c0d71
--- /dev/null
+++ b/tests/input/test_compression.py
@@ -0,0 +1,224 @@
+import os
+from pathlib import Path
+
+import pytest
+from PIL import Image
+
+from mindee.image_operations.image_compressor import compress_image
+from mindee.input.sources.path_input import PathInput
+from mindee.pdf.pdf_compressor import compress_pdf
+from mindee.pdf.pdf_utils import extract_text_from_pdf
+
+DATA_DIR = Path("./tests/data")
+OUTPUT_DIR = DATA_DIR / "output"
+
+
+def test_image_quality_compress_from_input_source():
+    """Compressing an image input source in place must shrink the written file."""
+    source_path = DATA_DIR / "file_types/receipt.jpg"
+    output_path = OUTPUT_DIR / "compress_indirect.jpg"
+    source = PathInput(source_path)
+    source.compress(40)
+    with open(output_path, "wb") as out_file:
+        out_file.write(source.file_object.read())
+
+    assert os.stat(output_path).st_size < os.stat(source_path).st_size
+
+
+def test_image_quality_compresses_from_compressor():
+    """Lower JPEG quality settings must yield progressively smaller files."""
+    source_path = DATA_DIR / "file_types/receipt.jpg"
+    receipt_input = PathInput(source_path)
+
+    file_names = [
+        "compress100.jpg",
+        "compress75.jpg",
+        "compress50.jpg",
+        "compress10.jpg",
+        "compress1.jpg",
+    ]
+    compresses = [
+        compress_image(receipt_input.file_object, 100),
+        compress_image(receipt_input.file_object),
+        compress_image(receipt_input.file_object, 50),
+        compress_image(receipt_input.file_object, 10),
+        compress_image(receipt_input.file_object, 1),
+    ]
+    for file_name, compressed in zip(file_names, compresses):
+        with open(OUTPUT_DIR / file_name, "wb") as out_file:
+            out_file.write(compressed)
+
+    initial_size = os.stat(source_path).st_size
+    sizes = [os.stat(OUTPUT_DIR / file_name).st_size for file_name in file_names]
+
+    assert initial_size < sizes[0]
+    assert initial_size < sizes[1]
+    assert sizes[1] > sizes[2] > sizes[3] > sizes[4]
+
+
+def test_image_resize_from_input_source():
+    """Compressing with bounds must resize the image and shrink the written file."""
+    source_path = DATA_DIR / "file_types/receipt.jpg"
+    output_path = OUTPUT_DIR / "resize_indirect.jpg"
+    resized_input = PathInput(source_path)
+    resized_input.compress(75, 250, 1000)
+
+    with open(output_path, "wb") as out_file:
+        out_file.write(resized_input.file_object.read())
+    assert os.stat(output_path).st_size < os.stat(source_path).st_size
+
+    resized = Image.open(resized_input.file_object)
+    assert resized.width == 250
+    assert resized.height == 333
+
+
+def test_image_resize_from_compressor():
+    """Resizing through compress_image must shrink the output as bounds tighten."""
+    source_path = DATA_DIR / "file_types/receipt.jpg"
+    image_resize_input = PathInput(source_path)
+
+    file_names = [
+        "resize500xnull.jpg",
+        "resize250x500.jpg",
+        "resize500x250.jpg",
+        "resizenullx250.jpg",
+    ]
+    resizes = [
+        compress_image(image_resize_input.file_object, 75, 500),
+        compress_image(image_resize_input.file_object, 75, 250, 500),
+        compress_image(image_resize_input.file_object, 75, 500, 250),
+        compress_image(image_resize_input.file_object, 75, None, 250),
+    ]
+    for file_name, resized in zip(file_names, resizes):
+        with open(OUTPUT_DIR / file_name, "wb") as out_file:
+            out_file.write(resized)
+
+    initial_size = os.stat(source_path).st_size
+    sizes = [os.stat(OUTPUT_DIR / file_name).st_size for file_name in file_names]
+
+    assert initial_size > sizes[0]
+    assert sizes[0] > sizes[1] > sizes[2]
+    assert sizes[2] == sizes[3]
+
+
+def test_pdf_input_has_text():
+    """
+    Check that only a PDF with embedded text reports having source text.
+    """
+    pdf_dir = DATA_DIR / "file_types/pdf"
+    image_path = os.path.join(DATA_DIR, "file_types/receipt.jpg")
+
+    # Build every input first: a text PDF, a blank PDF and a plain image.
+    text_pdf_input = PathInput(pdf_dir / "multipage.pdf")
+    blank_pdf_input = PathInput(pdf_dir / "blank_1.pdf")
+    image_input = PathInput(image_path)
+
+    assert text_pdf_input.has_source_text()
+    assert not blank_pdf_input.has_source_text()
+    # An image is never a PDF, so it cannot have source text either.
+    assert not image_input.has_source_text()
+
+
+def test_pdf_compress_from_input_source():
+    """Forcing compression of a PDF must produce a file smaller than the original."""
+    source_path = DATA_DIR / "products/invoice_splitter/default_sample.pdf"
+    output_path = OUTPUT_DIR / "resize_indirect.pdf"
+
+    pdf_resize_input = PathInput(source_path)
+    # The third positional argument forces compression even if source text exists.
+    compressed_pdf = compress_pdf(pdf_resize_input.file_object, 75, True)
+    with open(output_path, "wb") as out_file:
+        out_file.write(compressed_pdf)
+
+    initial_size = os.stat(source_path).st_size
+    rendered_size = os.stat(output_path).st_size
+
+    assert rendered_size < initial_size
+
+
+def test_pdf_compress_from_compressor():
+    """Lower image quality must yield progressively smaller compressed PDFs."""
+    source_path = DATA_DIR / "products/invoice_splitter/default_sample.pdf"
+    pdf_resize_input = PathInput(source_path)
+
+    outputs = {
+        "compress85.pdf": 85,
+        "compress75.pdf": 75,
+        "compress50.pdf": 50,
+        "compress10.pdf": 10,
+    }
+
+    resizes = []
+    for quality in outputs.values():
+        # Rewind: compress_pdf reads the stream it is handed.
+        pdf_resize_input.file_object.seek(0)
+        resizes.append(compress_pdf(pdf_resize_input.file_object, quality))
+
+    for file_name, resized in zip(outputs, resizes):
+        with open(OUTPUT_DIR / file_name, "wb") as out_file:
+            out_file.write(resized)
+
+    initial_size = os.stat(source_path).st_size
+    sizes = [os.stat(OUTPUT_DIR / file_name).st_size for file_name in outputs]
+
+    assert initial_size > sizes[0]
+    assert sizes[0] > sizes[1]
+    assert sizes[1] > sizes[2]
+    assert sizes[2] > sizes[3]
+
+
+def test_pdf_compress_with_text_keeps_text():
+    """Recompressing a text PDF with source text enabled must preserve its text."""
+    initial_with_text = PathInput(DATA_DIR / "file_types/pdf/multipage.pdf")
+
+    compressed_with_text = compress_pdf(initial_with_text.file_object, 100, True, False)
+    # compress_pdf consumed the stream: rewind before reading the original again.
+    initial_with_text.file_object.seek(0)
+    original_pdf = initial_with_text.file_object.read()
+
+    original_text = "".join(
+        text_info.char for text_info in extract_text_from_pdf(original_pdf)
+    )
+    compressed_text = "".join(
+        text_info.char for text_info in extract_text_from_pdf(compressed_with_text)
+    )
+    assert compressed_text == original_text
+
+
+def test_pdf_compress_with_text_does_not_compress():
+    """Without forcing, a PDF holding source text must come back unchanged."""
+    initial_with_text = PathInput(DATA_DIR / "file_types/pdf/multipage.pdf")
+    compressed_with_text = compress_pdf(initial_with_text.file_object, 50)
+    initial_with_text.file_object.seek(0)  # compare bytes to bytes, from the start
+    assert compressed_with_text == initial_with_text.file_object.read()
+
+
+@pytest.fixture(scope="module", autouse=True)
+def cleanup():
+    """Delete the files written by this module's tests once they have all run."""
+    yield
+    created_files = (
+        "compress10.pdf",
+        "compress50.pdf",
+        "compress75.pdf",
+        "compress85.pdf",
+        "resize_indirect.pdf",
+        "compress1.jpg",
+        "compress10.jpg",
+        "compress50.jpg",
+        "compress75.jpg",
+        "compress100.jpg",
+        "compress_indirect.jpg",
+        "resize250x500.jpg",
+        "resize500x250.jpg",
+        "resize500xnull.jpg",
+        "resize_indirect.jpg",
+        "resizenullx250.jpg",
+    )
+    for file_name in created_files:
+        full_path = OUTPUT_DIR / file_name
+        if full_path.exists():
+            try:
+                os.remove(full_path)
+            except OSError as error:
+                print(f"Could not delete file '{file_name}': {error.strerror}")
diff --git a/tests/mindee_http/test_error.py b/tests/mindee_http/test_error.py
index f9ac9776..5e2f879e 100644
--- a/tests/mindee_http/test_error.py
+++ b/tests/mindee_http/test_error.py
@@ -9,7 +9,7 @@
     MindeeHTTPServerError,
     handle_error,
 )
-from mindee.input.sources import PathInput
+from mindee.input.sources.path_input import PathInput
 from tests.test_inputs import FILE_TYPES_DIR
 from tests.utils import clear_envvars, dummy_envvars
 
diff --git a/tests/test_client.py b/tests/test_client.py
index 574ad51b..599e244c 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -6,7 +6,7 @@
 from mindee.error.mindee_error import MindeeClientError, MindeeError
 from mindee.error.mindee_http_error import MindeeHTTPError
 from mindee.input.local_response import LocalResponse
-from mindee.input.sources import LocalInputSource
+from mindee.input.sources.local_input_source import LocalInputSource
 from mindee.product.international_id.international_id_v2 import InternationalIdV2
 from mindee.product.invoice.invoice_v4 import InvoiceV4
 from mindee.product.invoice_splitter.invoice_splitter_v1 import InvoiceSplitterV1
diff --git a/tests/test_inputs.py b/tests/test_inputs.py
index 1c67e8ef..9eaa84c9 100644
--- a/tests/test_inputs.py
+++ b/tests/test_inputs.py
@@ -7,13 +7,11 @@
 from mindee.error.mimetype_error import MimeTypeError
 from mindee.error.mindee_error import MindeeError, MindeeSourceError
 from mindee.input.page_options import KEEP_ONLY, REMOVE
-from mindee.input.sources import (
-    Base64Input,
-    BytesInput,
-    FileInput,
-    PathInput,
-    UrlInputSource,
-)
+from mindee.input.sources.base_64_input import Base64Input
+from mindee.input.sources.bytes_input import BytesInput
+from mindee.input.sources.file_input import FileInput
+from mindee.input.sources.path_input import PathInput
+from mindee.input.sources.url_input_source import UrlInputSource
 from tests.product import PRODUCT_DATA_DIR
 
 FILE_TYPES_DIR = Path("./tests/data/file_types")

From a72c46514a0c508270b210c0e3a21423d70929aa Mon Sep 17 00:00:00 2001
From: sebastianMindee <sebastian.oliverasilvera@mindee.co>
Date: Fri, 17 Jan 2025 18:04:45 +0100
Subject: [PATCH 3/7] fix a few issues... but segfault :D

---
 mindee/pdf/pdf_char_data.py     |  2 +-
 mindee/pdf/pdf_compressor.py    | 67 ++++++++++++++++--------------
 mindee/pdf/pdf_utils.py         | 16 ++++++--
 tests/input/test_compression.py | 73 +++++++++++++++++----------------
 4 files changed, 86 insertions(+), 72 deletions(-)

diff --git a/mindee/pdf/pdf_char_data.py b/mindee/pdf/pdf_char_data.py
index 9f516655..75637a52 100644
--- a/mindee/pdf/pdf_char_data.py
+++ b/mindee/pdf/pdf_char_data.py
@@ -18,7 +18,7 @@ class PDFCharData:
     """Bottom bound."""
     font_name: str
     """The font name."""
-    font_size: int
+    font_size: float
     """The font size in pt."""
     font_weight: int
     """The font weight."""
diff --git a/mindee/pdf/pdf_compressor.py b/mindee/pdf/pdf_compressor.py
index 12bce09a..9f859d92 100644
--- a/mindee/pdf/pdf_compressor.py
+++ b/mindee/pdf/pdf_compressor.py
@@ -1,10 +1,12 @@
 import io
 import logging
+from ctypes import c_char_p, c_ushort
 from threading import RLock
 from typing import BinaryIO, List, Optional, Union
 
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
+from _ctypes import POINTER
 
 from mindee.image_operations.image_compressor import compress_image
 from mindee.pdf.pdf_char_data import PDFCharData
@@ -34,9 +36,12 @@ def compress_pdf(
     :return: Compressed PDF as bytes.
     """
     if not isinstance(pdf_data, bytes):
-        pdf_data = pdf_data.read()
+        pdf_bytes = pdf_data.read()
+        pdf_data.seek(0)
+    else:
+        pdf_bytes = pdf_data
 
-    if has_source_text(pdf_data):
+    if has_source_text(pdf_bytes):
         if force_source_text_compression:
             if not disable_source_text:
                 logger.warning("Re-writing PDF source-text is an EXPERIMENTAL feature.")
@@ -50,29 +55,29 @@ def compress_pdf(
                 "Found text inside of the provided PDF file. Compression operation aborted since disableSourceText "
                 "is set to 'true'."
             )
-            return pdf_data
+            return pdf_bytes
 
     extracted_text = (
-        extract_text_from_pdf(pdf_data) if not disable_source_text else None
+        extract_text_from_pdf(pdf_bytes) if not disable_source_text else None
     )
 
     compressed_pages = compress_pdf_pages(
-        pdf_data, extracted_text, image_quality, disable_source_text
+        pdf_bytes, extracted_text, image_quality, disable_source_text
     )
 
     if not compressed_pages:
         logger.warning(
             "Could not compress PDF to a smaller size. Returning original PDF."
         )
-        return pdf_data
+        return pdf_bytes
 
     out_pdf = attach_images_as_new_file(
         [io.BytesIO(compressed_page) for compressed_page in compressed_pages]
     )
-    out_bytes = io.BytesIO()
-    out_pdf.save(out_bytes)
-
-    return out_bytes.read()
+    out_buffer = io.BytesIO()
+    out_pdf.save(out_buffer)
+    out_buffer.seek(0)
+    return out_buffer.read()
 
 
 def compress_pdf_pages(
@@ -110,40 +115,40 @@ def compress_pdf_pages(
 
 
 def add_text_to_pdf_page(  # type: ignore
-    page: pdfium.PdfPage,
+    document: pdfium.PdfDocument,
+    page_id: int,
     extracted_text: Optional[List[PDFCharData]],
 ) -> None:
     """
     Adds text to a PDF page based on the extracted text data.
 
-    :param page: The PdfPage object to add text to.
+    :param document: The PDFDocument object.
+    :param page_id: ID of the current page.
     :param extracted_text: List of PDFCharData objects containing text and positioning information.
     """
     if not extracted_text:
         return
 
-    height = page.get_height()
-    document = page.pdf
+    height = document[page_id].get_height()
     pdfium_lock = RLock()
 
     with pdfium_lock:
-        text_handler = pdfium_c.FPDFText_LoadPage(page.raw)
         for char_data in extracted_text:
-            font = document.load_font(
-                char_data.font_name, pdfium_c.FPDF_FONT_TRUETYPE, True
+            font_name = c_char_p(char_data.font_name.encode("utf-8"))
+            text_handler = pdfium_c.FPDFPageObj_NewTextObj(
+                document.raw, font_name, char_data.font_size
             )
-            text_object = document.create_text_object(font, char_data.font_size)
-            text_object.set_text(char_data.char)
-            x = char_data.left
-            y = height - char_data.bottom
-            text_object.set_position(x, y)
-            r, g, b, a = char_data.font_fill_color
-            text_object.set_fill_color(r, g, b, a)
-            pdfium_c.FPDFPage_InsertObject(text_handler, text_object)
-        pdfium_c.FPDFPage_GenerateContent(text_handler)
-
-    with pdfium_lock:
-        pdfium_c.FPDFText_ClosePage(text_handler)
+            char_code = ord(char_data.char)
+            char_code_c_char = c_ushort(char_code)
+            char_ptr = POINTER(c_ushort)(char_code_c_char)
+            pdfium_c.FPDFText_SetText(text_handler, char_ptr)
+            pdfium_c.FPDFPageObj_Transform(
+                text_handler, 1, 0, 0, 1, char_data.left, height - char_data.top
+            )
+            pdfium_c.FPDFPage_InsertObject(document[page_id].raw, text_handler)
+            pdfium_c.FPDFPageObj_Destroy(text_handler)
+        pdfium_c.FPDFPage_GenerateContent(document[page_id].raw)
+        pdfium_c.FPDF_ClosePage(document[page_id].raw)
 
 
 def compress_pages_with_quality(
@@ -164,12 +169,12 @@ def compress_pages_with_quality(
     pdf_document = pdfium.PdfDocument(pdf_data)
     compressed_pages = []
 
-    for [_, page] in enumerate(pdf_document):
+    for [i, page] in enumerate(pdf_document):
         rasterized_page = rasterize_page(page, image_quality)
         compressed_image = compress_image(rasterized_page, image_quality)
 
         if not disable_source_text:
-            add_text_to_pdf_page(page, extracted_text)
+            add_text_to_pdf_page(pdf_document, i, extracted_text)
 
         compressed_pages.append(compressed_image)
 
diff --git a/mindee/pdf/pdf_utils.py b/mindee/pdf/pdf_utils.py
index 747133dd..699c39c6 100644
--- a/mindee/pdf/pdf_utils.py
+++ b/mindee/pdf/pdf_utils.py
@@ -1,3 +1,4 @@
+import ctypes
 import io
 from ctypes import byref, c_double, c_int, create_string_buffer
 from threading import RLock
@@ -125,14 +126,21 @@ def get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict:
     :param pdfium_lock: Lock for thread-safe operations.
     :return: A dictionary containing character information.
     """
+    stroke = (ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint())
+    fill = (ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint())
+
     with pdfium_lock:
         char = chr(pdfium_c.FPDFText_GetUnicode(text_handler, i))
         font_name = get_font_name(text_handler, i)
         font_flags = get_font_flags(text_handler, i)
         font_size = pdfium_c.FPDFText_GetFontSize(text_handler, i)
         font_weight = pdfium_c.FPDFText_GetFontWeight(text_handler, i)
-        font_stroke_color = pdfium_c.FPDFText_GetStrokeColor(text_handler, i)
-        font_fill_color = pdfium_c.FPDFText_GetFillColor(text_handler, i)
+        _ = pdfium_c.FPDFText_GetStrokeColor(
+            text_handler, i, stroke[0], stroke[1], stroke[2], stroke[3]
+        )
+        _ = pdfium_c.FPDFText_GetFillColor(
+            text_handler, i, fill[0], fill[1], fill[2], fill[3]
+        )
 
     return {
         "char": char,
@@ -140,8 +148,8 @@ def get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict:
         "font_flags": font_flags,
         "font_size": font_size,
         "font_weight": font_weight,
-        "font_stroke_color": font_stroke_color,
-        "font_fill_color": font_fill_color,
+        "font_stroke_color": stroke,
+        "font_fill_color": fill,
     }
 
 
diff --git a/tests/input/test_compression.py b/tests/input/test_compression.py
index 900c0d71..ccc402eb 100644
--- a/tests/input/test_compression.py
+++ b/tests/input/test_compression.py
@@ -19,6 +19,7 @@ def test_image_quality_compress_from_input_source():
 
     with open(OUTPUT_DIR / "compress_indirect.jpg", "wb") as f:
         f.write(receipt_input.file_object.read())
+        receipt_input.file_object.seek(0)
 
     initial_file_stats = os.stat(DATA_DIR / "file_types/receipt.jpg")
     rendered_file_stats = os.stat(OUTPUT_DIR / "compress_indirect.jpg")
@@ -62,6 +63,7 @@ def test_image_resize_from_input_source():
     image_resize_input.compress(75, 250, 1000)
     with open(OUTPUT_DIR / "resize_indirect.jpg", "wb") as f:
         f.write(image_resize_input.file_object.read())
+        image_resize_input.file_object.seek(0)
 
     initial_file_stats = os.stat(DATA_DIR / "file_types/receipt.jpg")
     rendered_file_stats = os.stat(OUTPUT_DIR / "resize_indirect.jpg")
@@ -143,8 +145,8 @@ def test_pdf_compress_from_compressor():
     resizes = []
     qualities = [85, 75, 50, 10]
     for quality in qualities:
-        pdf_resize_input.file_object.seek(0)
         resizes.append(compress_pdf(pdf_resize_input.file_object, quality))
+        pdf_resize_input.file_object.seek(0)
 
     file_names = [
         "compress85.pdf",
@@ -172,12 +174,11 @@ def test_pdf_compress_with_text_keeps_text():
 
     compressed_with_text = compress_pdf(initial_with_text.file_object, 100, True, False)
 
-    original_text = "".join(
-        [
-            text_info.char
-            for text_info in extract_text_from_pdf(initial_with_text.file_object.read())
-        ]
-    )
+    text_chars = []
+    for text_info in extract_text_from_pdf(initial_with_text.file_object.read()):
+        text_chars.append(text_info.char)
+    initial_with_text.file_object.seek(0)
+    original_text = "".join(text_chars)
     compressed_text = "".join(
         [text_info.char for text_info in extract_text_from_pdf(compressed_with_text)]
     )
@@ -193,32 +194,32 @@ def test_pdf_compress_with_text_does_not_compress():
     assert compressed_with_text == initial_with_text.file_object
 
 
-@pytest.fixture(scope="module", autouse=True)
-def cleanup():
-    yield
-    created_files = [
-        "compress10.pdf",
-        "compress50.pdf",
-        "compress75.pdf",
-        "compress85.pdf",
-        "resize_indirect.pdf",
-        "compress1.jpg",
-        "compress10.jpg",
-        "compress50.jpg",
-        "compress75.jpg",
-        "compress100.jpg",
-        "compress_indirect.jpg",
-        "resize250x500.jpg",
-        "resize500x250.jpg",
-        "resize500xnull.jpg",
-        "resize_indirect.jpg",
-        "resizenullx250.jpg",
-    ]
-
-    for file_path in created_files:
-        full_path = DATA_DIR / "output" / file_path
-        if full_path.exists():
-            try:
-                os.remove(full_path)
-            except OSError as e:
-                print(f"Could not delete file '{file_path}': {e.strerror}")
+# @pytest.fixture(scope="module", autouse=True)
+# def cleanup():
+#     yield
+#     created_files = [
+#         "compress10.pdf",
+#         "compress50.pdf",
+#         "compress75.pdf",
+#         "compress85.pdf",
+#         "resize_indirect.pdf",
+#         "compress1.jpg",
+#         "compress10.jpg",
+#         "compress50.jpg",
+#         "compress75.jpg",
+#         "compress100.jpg",
+#         "compress_indirect.jpg",
+#         "resize250x500.jpg",
+#         "resize500x250.jpg",
+#         "resize500xnull.jpg",
+#         "resize_indirect.jpg",
+#         "resizenullx250.jpg",
+#     ]
+#
+#     for file_path in created_files:
+#         full_path = DATA_DIR / "output" / file_path
+#         if full_path.exists():
+#             try:
+#                 os.remove(full_path)
+#             except OSError as e:
+#                 print(f"Could not delete file '{file_path}': {e.strerror}")

From e53bba33ef53555cbeb8334cb5519a10354005e3 Mon Sep 17 00:00:00 2001
From: sebastianMindee <sebastian.oliverasilvera@mindee.co>
Date: Mon, 20 Jan 2025 16:59:13 +0100
Subject: [PATCH 4/7] it works

---
 mindee/extraction/common/image_extractor.py |  34 +++++-
 mindee/pdf/__init__.py                      |   1 -
 mindee/pdf/pdf_char_data.py                 |   2 +
 mindee/pdf/pdf_compressor.py                |  95 +++++++++-------
 mindee/pdf/pdf_utils.py                     | 115 +++++++++-----------
 tests/input/test_compression.py             |  75 +++++++------
 6 files changed, 180 insertions(+), 142 deletions(-)

diff --git a/mindee/extraction/common/image_extractor.py b/mindee/extraction/common/image_extractor.py
index 95609e5a..0cfdcba7 100644
--- a/mindee/extraction/common/image_extractor.py
+++ b/mindee/extraction/common/image_extractor.py
@@ -1,5 +1,5 @@
 import io
-from typing import List
+from typing import BinaryIO, List
 
 import pypdfium2 as pdfium
 from PIL import Image
@@ -10,7 +10,6 @@
 from mindee.geometry.polygon import get_min_max_x, get_min_max_y
 from mindee.input.sources.bytes_input import BytesInput
 from mindee.input.sources.local_input_source import LocalInputSource
-from mindee.pdf.pdf_utils import attach_images_as_new_file
 
 
 def extract_image_from_polygon(
@@ -131,3 +130,34 @@ def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument:  # type: i
         return pdfium.PdfDocument(input_file.file_object.read())
 
     return attach_images_as_new_file([input_file.file_object])
+
+
+def attach_images_as_new_file(  # type: ignore
+    input_buffer_list: List[BinaryIO],
+) -> pdfium.PdfDocument:
+    """
+    Attaches a list of images as new pages in a PdfDocument object.
+
+    :param input_buffer_list: List of images, represented as buffers.
+    :return: A PdfDocument handle.
+    """
+    pdf = pdfium.PdfDocument.new()
+    for input_buffer in input_buffer_list:
+        input_buffer.seek(0)
+        image = Image.open(input_buffer)
+        image.convert("RGB")
+        image_buffer = io.BytesIO()
+        image.save(image_buffer, format="JPEG")
+
+        image_pdf = pdfium.PdfImage.new(pdf)
+        image_pdf.load_jpeg(image_buffer)
+        width, height = image_pdf.get_size()
+
+        matrix = pdfium.PdfMatrix().scale(width, height)
+        image_pdf.set_matrix(matrix)
+
+        page = pdf.new_page(width, height)
+        page.insert_obj(image_pdf)
+        page.gen_content()
+        image.close()
+    return pdf
diff --git a/mindee/pdf/__init__.py b/mindee/pdf/__init__.py
index 5afcb672..6864d138 100644
--- a/mindee/pdf/__init__.py
+++ b/mindee/pdf/__init__.py
@@ -1,7 +1,6 @@
 from mindee.pdf.pdf_char_data import PDFCharData
 from mindee.pdf.pdf_compressor import compress_pdf
 from mindee.pdf.pdf_utils import (
-    attach_images_as_new_file,
     extract_text_from_pdf,
     has_source_text,
 )
diff --git a/mindee/pdf/pdf_char_data.py b/mindee/pdf/pdf_char_data.py
index 75637a52..58d46db9 100644
--- a/mindee/pdf/pdf_char_data.py
+++ b/mindee/pdf/pdf_char_data.py
@@ -28,3 +28,5 @@ class PDFCharData:
     """RGBA representation of the font's stroke color."""
     font_fill_color: Tuple[int, int, int, int]
     """RGBA representation of the font's fill color."""
+    page_id: int
+    """ID of the page the character was found on."""
diff --git a/mindee/pdf/pdf_compressor.py b/mindee/pdf/pdf_compressor.py
index 9f859d92..037c58d2 100644
--- a/mindee/pdf/pdf_compressor.py
+++ b/mindee/pdf/pdf_compressor.py
@@ -2,16 +2,16 @@
 import logging
 from ctypes import c_char_p, c_ushort
 from threading import RLock
-from typing import BinaryIO, List, Optional, Union
+from typing import BinaryIO, List, Optional, Tuple, Union
 
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
 from _ctypes import POINTER
+from PIL import Image
 
 from mindee.image_operations.image_compressor import compress_image
 from mindee.pdf.pdf_char_data import PDFCharData
 from mindee.pdf.pdf_utils import (
-    attach_images_as_new_file,
     extract_text_from_pdf,
     has_source_text,
 )
@@ -61,9 +61,7 @@ def compress_pdf(
         extract_text_from_pdf(pdf_bytes) if not disable_source_text else None
     )
 
-    compressed_pages = compress_pdf_pages(
-        pdf_bytes, extracted_text, image_quality, disable_source_text
-    )
+    compressed_pages = compress_pdf_pages(pdf_bytes, image_quality)
 
     if not compressed_pages:
         logger.warning(
@@ -71,9 +69,14 @@ def compress_pdf(
         )
         return pdf_bytes
 
-    out_pdf = attach_images_as_new_file(
-        [io.BytesIO(compressed_page) for compressed_page in compressed_pages]
+    out_pdf = collect_images_as_pdf(
+        [compressed_page_image[0] for compressed_page_image in compressed_pages]
     )
+
+    if not disable_source_text:
+        for i, page in enumerate(out_pdf):
+            add_text_to_pdf_page(page, i, extracted_text)
+
     out_buffer = io.BytesIO()
     out_pdf.save(out_buffer)
     out_buffer.seek(0)
@@ -82,26 +85,20 @@ def compress_pdf(
 
 def compress_pdf_pages(
     pdf_data: bytes,
-    extracted_text: Optional[List[PDFCharData]],
     image_quality: int,
-    disable_source_text: bool,
-) -> Optional[List[bytes]]:
+) -> Optional[List[Tuple[bytes, int, int]]]:
     """
     Compresses PDF pages and returns an array of compressed page buffers.
 
     :param pdf_data: The input PDF as bytes.
-    :param extracted_text: Extracted text from the PDF.
     :param image_quality: Initial compression quality.
-    :param disable_source_text: If true, doesn't re-apply source text to the output PDF.
     :return: List of compressed page buffers, or None if compression fails.
     """
     original_size = len(pdf_data)
     image_quality_loop = image_quality
 
     while image_quality_loop >= MIN_QUALITY:
-        compressed_pages = compress_pages_with_quality(
-            pdf_data, extracted_text, image_quality_loop, disable_source_text
-        )
+        compressed_pages = compress_pages_with_quality(pdf_data, image_quality_loop)
         total_compressed_size = sum(len(page) for page in compressed_pages)
 
         if is_compression_successful(
@@ -115,28 +112,28 @@ def compress_pdf_pages(
 
 
 def add_text_to_pdf_page(  # type: ignore
-    document: pdfium.PdfDocument,
+    page: pdfium.PdfPage,
     page_id: int,
-    extracted_text: Optional[List[PDFCharData]],
+    extracted_text: Optional[List[List[PDFCharData]]],
 ) -> None:
     """
     Adds text to a PDF page based on the extracted text data.
 
-    :param document: The PDFDocument object.
-    :param page_id: ID of the current page.
+    :param page: The PDFDocument object.
+    :param page_id: The ID of the page.
     :param extracted_text: List of PDFCharData objects containing text and positioning information.
     """
-    if not extracted_text:
+    if not extracted_text or not extracted_text[page_id]:
         return
 
-    height = document[page_id].get_height()
+    height = page.get_height()
     pdfium_lock = RLock()
 
     with pdfium_lock:
-        for char_data in extracted_text:
+        for char_data in extracted_text[page_id]:
             font_name = c_char_p(char_data.font_name.encode("utf-8"))
             text_handler = pdfium_c.FPDFPageObj_NewTextObj(
-                document.raw, font_name, char_data.font_size
+                page.pdf.raw, font_name, char_data.font_size
             )
             char_code = ord(char_data.char)
             char_code_c_char = c_ushort(char_code)
@@ -145,38 +142,28 @@ def add_text_to_pdf_page(  # type: ignore
             pdfium_c.FPDFPageObj_Transform(
                 text_handler, 1, 0, 0, 1, char_data.left, height - char_data.top
             )
-            pdfium_c.FPDFPage_InsertObject(document[page_id].raw, text_handler)
-            pdfium_c.FPDFPageObj_Destroy(text_handler)
-        pdfium_c.FPDFPage_GenerateContent(document[page_id].raw)
-        pdfium_c.FPDF_ClosePage(document[page_id].raw)
+            pdfium_c.FPDFPage_InsertObject(page.raw, text_handler)
+        pdfium_c.FPDFPage_GenerateContent(page.raw)
 
 
 def compress_pages_with_quality(
     pdf_data: bytes,
-    extracted_text: Optional[list[PDFCharData]],
     image_quality: int,
-    disable_source_text: bool,
-) -> List[bytes]:
+) -> List[Tuple[bytes, int, int]]:
     """
     Compresses pages with a specific quality.
 
     :param pdf_data: The input PDF as bytes.
-    :param extracted_text: Extracted text from the PDF.
     :param image_quality: Compression quality.
-    :param disable_source_text: If true, doesn't re-apply source text to the output PDF.
     :return: List of compressed page buffers.
     """
     pdf_document = pdfium.PdfDocument(pdf_data)
     compressed_pages = []
-
-    for [i, page] in enumerate(pdf_document):
+    for page in pdf_document:
         rasterized_page = rasterize_page(page, image_quality)
         compressed_image = compress_image(rasterized_page, image_quality)
-
-        if not disable_source_text:
-            add_text_to_pdf_page(pdf_document, i, extracted_text)
-
-        compressed_pages.append(compressed_image)
+        image = Image.open(io.BytesIO(compressed_image))
+        compressed_pages.append((compressed_image, image.size[0], image.size[1]))
 
     return compressed_pages
 
@@ -223,3 +210,33 @@ def lerp(start: float, end: float, t: float) -> float:
     :return: The interpolated value.
     """
     return start * (1 - t) + end * t
+
+
+def collect_images_as_pdf(image_list: List[bytes]) -> pdfium.PdfDocument:  # type: ignore
+    """
+    Converts a list of JPEG images into pages in a PdfDocument.
+
+    :param image_list: A list of bytes representing JPEG images.
+    :return: A PdfDocument handle containing the images as pages.
+    """
+    # Create a new, empty PdfDocument
+    out_pdf = pdfium.PdfDocument.new()
+
+    for image_bytes in image_list:
+        # Load the JPEG image into a PdfImage object
+        pdf_image = pdfium.PdfImage.new(out_pdf)
+        pdf_image.load_jpeg(io.BytesIO(image_bytes))
+
+        # Get the dimensions of the image
+        width, height = pdf_image.get_size()
+
+        # Create a new page in the PDF with the same dimensions as the image
+        page = out_pdf.new_page(width, height)
+
+        # Place the image on the page
+        page.insert_obj(pdf_image)
+
+        # Generate content for the page to finalize it
+        page.gen_content()
+
+    return out_pdf
diff --git a/mindee/pdf/pdf_utils.py b/mindee/pdf/pdf_utils.py
index 699c39c6..e5ab71c8 100644
--- a/mindee/pdf/pdf_utils.py
+++ b/mindee/pdf/pdf_utils.py
@@ -1,12 +1,10 @@
 import ctypes
-import io
 from ctypes import byref, c_double, c_int, create_string_buffer
 from threading import RLock
-from typing import BinaryIO, List, Tuple
+from typing import List, Tuple
 
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
-from PIL import Image
 
 from mindee.pdf.pdf_char_data import PDFCharData
 
@@ -27,31 +25,32 @@ def has_source_text(pdf_bytes: bytes) -> bool:
     return False
 
 
-def extract_text_from_pdf(pdf_bytes: bytes) -> List[PDFCharData]:
+def extract_text_from_pdf(pdf_bytes: bytes) -> List[List[PDFCharData]]:
     """
     Extracts the raw text from a given PDF's bytes along with font data.
 
     :param pdf_bytes: Raw bytes representation of a PDF file.
     :return: A list of info regarding each read character.
     """
-    char_data_list: List[PDFCharData] = []
     pdfium_lock = RLock()
     pdf = pdfium.PdfDocument(pdf_bytes)
+    char_data_list: List[List[PDFCharData]] = []
 
-    for page in pdf:
-        process_page(page, pdfium_lock, char_data_list)
+    for i, page in enumerate(pdf):
+        char_data_list.append(process_page(page, i, pdfium_lock))
 
     return char_data_list
 
 
-def process_page(page, pdfium_lock: RLock, char_data_list: List[PDFCharData]):
+def process_page(page, page_id: int, pdfium_lock: RLock) -> List[PDFCharData]:
     """
     Processes a single page of the PDF.
 
     :param page: The PDF page to process.
+    :param page_id: ID of the page.
     :param pdfium_lock: Lock for thread-safe operations.
-    :param char_data_list: List to append character data to.
     """
+    char_data_list: List[PDFCharData] = []
     internal_height = page.get_height()
     internal_width = page.get_width()
 
@@ -60,18 +59,15 @@ def process_page(page, pdfium_lock: RLock, char_data_list: List[PDFCharData]):
         count_chars = pdfium_c.FPDFText_CountChars(text_handler)
 
     for i in range(count_chars):
-        process_char(
-            i,
-            text_handler,
-            page,
-            pdfium_lock,
-            internal_height,
-            internal_width,
-            char_data_list,
+        concatenated_chars = process_char(
+            i, text_handler, page, pdfium_lock, internal_height, internal_width, page_id
         )
+        for concatenated_char in concatenated_chars:
+            char_data_list.append(concatenated_char)
 
     with pdfium_lock:
         pdfium_c.FPDFText_ClosePage(text_handler)
+    return char_data_list
 
 
 def process_char(
@@ -81,8 +77,8 @@ def process_char(
     pdfium_lock: RLock,
     internal_height: float,
     internal_width: float,
-    char_data_list: List[PDFCharData],
-):
+    page_id: int,
+) -> List[PDFCharData]:
     """
     Processes a single character from the PDF.
 
@@ -92,29 +88,44 @@ def process_char(
     :param pdfium_lock: Lock for thread-safe operations.
     :param internal_height: The height of the page.
     :param internal_width: The width of the page.
-    :param char_data_list: List to append character data to.
+    :param page_id: ID of the page the character was found on.
+    :return: List of character data for a page.
     """
     char_info = get_char_info(i, text_handler, pdfium_lock)
+    if not char_info:
+        return []
     char_box = get_char_box(i, text_handler, pdfium_lock)
     rotation = get_page_rotation(page, pdfium_lock)
 
     adjusted_box = adjust_char_box(char_box, rotation, internal_height, internal_width)
-
+    char_data_list: List[PDFCharData] = []
     for c in char_info["char"] or " ":
-        char_data = PDFCharData(
-            char=c,
-            left=int(adjusted_box[0]),
-            right=int(adjusted_box[1]),
-            top=int(adjusted_box[2]),
-            bottom=int(adjusted_box[3]),
-            font_name=char_info["font_name"],
-            font_size=char_info["font_size"],
-            font_weight=char_info["font_weight"],
-            font_stroke_color=char_info["font_stroke_color"],
-            font_fill_color=char_info["font_fill_color"],
-            font_flags=char_info["font_flags"],
+        if c in (
+            "\n",
+            "\r",
+        ):  # Removes duplicated carriage returns in the PDF due to weird extraction.
+            # IDK how to make this better, and neither does Claude, GPT4 nor GPT-o1, so I'm leaving this weird check.
+            next_char_info = get_char_info(i + 1, text_handler, pdfium_lock)
+            if not next_char_info or next_char_info["char"] in ("\n", "\r"):
+                continue
+
+        char_data_list.append(
+            PDFCharData(
+                char=c,
+                left=int(adjusted_box[0]),
+                right=int(adjusted_box[1]),
+                top=int(adjusted_box[2]),
+                bottom=int(adjusted_box[3]),
+                font_name=char_info["font_name"],
+                font_size=char_info["font_size"],
+                font_weight=char_info["font_weight"],
+                font_stroke_color=char_info["font_stroke_color"],
+                font_fill_color=char_info["font_fill_color"],
+                font_flags=char_info["font_flags"],
+                page_id=page_id,
+            )
         )
-        char_data_list.append(char_data)
+    return char_data_list
 
 
 def get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict:
@@ -130,7 +141,10 @@ def get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict:
     fill = (ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint())
 
     with pdfium_lock:
-        char = chr(pdfium_c.FPDFText_GetUnicode(text_handler, i))
+        unicode_char = pdfium_c.FPDFText_GetUnicode(text_handler, i)
+        if unicode_char == 0xFF:
+            return {}
+        char = chr(unicode_char)
         font_name = get_font_name(text_handler, i)
         font_flags = get_font_flags(text_handler, i)
         font_size = pdfium_c.FPDFText_GetFontSize(text_handler, i)
@@ -249,34 +263,3 @@ def adjust_char_box(
             internal_height - left,
         )
     return left, right, top, bottom
-
-
-def attach_images_as_new_file(  # type: ignore
-    input_buffer_list: List[BinaryIO],
-) -> pdfium.PdfDocument:
-    """
-    Attaches a list of images as new pages in a PdfDocument object.
-
-    :param input_buffer_list: List of images, represented as buffers.
-    :return: A PdfDocument handle.
-    """
-    pdf = pdfium.PdfDocument.new()
-    for input_buffer in input_buffer_list:
-        input_buffer.seek(0)
-        image = Image.open(input_buffer)
-        image.convert("RGB")
-        image_buffer = io.BytesIO()
-        image.save(image_buffer, format="JPEG")
-
-        image_pdf = pdfium.PdfImage.new(pdf)
-        image_pdf.load_jpeg(image_buffer)
-        width, height = image_pdf.get_size()
-
-        matrix = pdfium.PdfMatrix().scale(width, height)
-        image_pdf.set_matrix(matrix)
-
-        page = pdf.new_page(width, height)
-        page.insert_obj(image_pdf)
-        page.gen_content()
-        image.close()
-    return pdf
diff --git a/tests/input/test_compression.py b/tests/input/test_compression.py
index ccc402eb..03af2085 100644
--- a/tests/input/test_compression.py
+++ b/tests/input/test_compression.py
@@ -1,4 +1,6 @@
+import operator
 import os
+from functools import reduce
 from pathlib import Path
 
 import pytest
@@ -176,11 +178,16 @@ def test_pdf_compress_with_text_keeps_text():
 
     text_chars = []
     for text_info in extract_text_from_pdf(initial_with_text.file_object.read()):
-        text_chars.append(text_info.char)
+        text_chars.append("".join([ti.char for ti in text_info]))
     initial_with_text.file_object.seek(0)
     original_text = "".join(text_chars)
     compressed_text = "".join(
-        [text_info.char for text_info in extract_text_from_pdf(compressed_with_text)]
+        [
+            text_info.char
+            for text_info in reduce(
+                operator.concat, extract_text_from_pdf(compressed_with_text)
+            )
+        ]
     )
 
     assert compressed_text == original_text
@@ -191,35 +198,35 @@ def test_pdf_compress_with_text_does_not_compress():
 
     compressed_with_text = compress_pdf(initial_with_text.file_object, 50)
 
-    assert compressed_with_text == initial_with_text.file_object
-
-
-# @pytest.fixture(scope="module", autouse=True)
-# def cleanup():
-#     yield
-#     created_files = [
-#         "compress10.pdf",
-#         "compress50.pdf",
-#         "compress75.pdf",
-#         "compress85.pdf",
-#         "resize_indirect.pdf",
-#         "compress1.jpg",
-#         "compress10.jpg",
-#         "compress50.jpg",
-#         "compress75.jpg",
-#         "compress100.jpg",
-#         "compress_indirect.jpg",
-#         "resize250x500.jpg",
-#         "resize500x250.jpg",
-#         "resize500xnull.jpg",
-#         "resize_indirect.jpg",
-#         "resizenullx250.jpg",
-#     ]
-#
-#     for file_path in created_files:
-#         full_path = DATA_DIR / "output" / file_path
-#         if full_path.exists():
-#             try:
-#                 os.remove(full_path)
-#             except OSError as e:
-#                 print(f"Could not delete file '{file_path}': {e.strerror}")
+    assert compressed_with_text == initial_with_text.file_object.read()
+
+
+@pytest.fixture(scope="module", autouse=True)
+def cleanup():
+    yield
+    created_files = [
+        "compress10.pdf",
+        "compress50.pdf",
+        "compress75.pdf",
+        "compress85.pdf",
+        "resize_indirect.pdf",
+        "compress1.jpg",
+        "compress10.jpg",
+        "compress50.jpg",
+        "compress75.jpg",
+        "compress100.jpg",
+        "compress_indirect.jpg",
+        "resize250x500.jpg",
+        "resize500x250.jpg",
+        "resize500xnull.jpg",
+        "resize_indirect.jpg",
+        "resizenullx250.jpg",
+    ]
+
+    for file_path in created_files:
+        full_path = DATA_DIR / "output" / file_path
+        if full_path.exists():
+            try:
+                os.remove(full_path)
+            except OSError as e:
+                print(f"Could not delete file '{file_path}': {e.strerror}")

From 1865726ac39566a249344f43341bf1fd61fd6b7b Mon Sep 17 00:00:00 2001
From: sebastianMindee <sebastian.oliverasilvera@mindee.co>
Date: Mon, 20 Jan 2025 17:03:39 +0100
Subject: [PATCH 5/7] restore function that was needlessly edited

---
 mindee/extraction/__init__.py               |  1 +
 mindee/extraction/common/__init__.py        |  1 +
 mindee/extraction/common/image_extractor.py | 65 +++++++++++----------
 mindee/pdf/pdf_compressor.py                |  9 ---
 4 files changed, 35 insertions(+), 41 deletions(-)

diff --git a/mindee/extraction/__init__.py b/mindee/extraction/__init__.py
index 9b86d0ee..05629a5d 100644
--- a/mindee/extraction/__init__.py
+++ b/mindee/extraction/__init__.py
@@ -1,5 +1,6 @@
 from mindee.extraction.common.extracted_image import ExtractedImage
 from mindee.extraction.common.image_extractor import (
+    attach_image_as_new_file,
     extract_multiple_images_from_source,
 )
 from mindee.extraction.multi_receipts_extractor import multi_receipts_extractor
diff --git a/mindee/extraction/common/__init__.py b/mindee/extraction/common/__init__.py
index 1acb7bb9..c0301c90 100644
--- a/mindee/extraction/common/__init__.py
+++ b/mindee/extraction/common/__init__.py
@@ -1,4 +1,5 @@
 from mindee.extraction.common.extracted_image import ExtractedImage
 from mindee.extraction.common.image_extractor import (
+    attach_image_as_new_file,
     extract_multiple_images_from_source,
 )
diff --git a/mindee/extraction/common/image_extractor.py b/mindee/extraction/common/image_extractor.py
index 0cfdcba7..5bae6d37 100644
--- a/mindee/extraction/common/image_extractor.py
+++ b/mindee/extraction/common/image_extractor.py
@@ -12,6 +12,38 @@
 from mindee.input.sources.local_input_source import LocalInputSource
 
 
+def attach_image_as_new_file(  # type: ignore
+    input_buffer: BinaryIO,
+) -> pdfium.PdfDocument:
+    """
+    Attaches an image as a new page in a PdfDocument object.
+
+    :param input_buffer: Input buffer.
+    :return: A PdfDocument handle.
+    """
+    # Create a new page in the PdfDocument
+    input_buffer.seek(0)
+    image = Image.open(input_buffer)
+    image.convert("RGB")
+    image_buffer = io.BytesIO()
+    image.save(image_buffer, format="JPEG")
+
+    pdf = pdfium.PdfDocument.new()
+
+    image_pdf = pdfium.PdfImage.new(pdf)
+    image_pdf.load_jpeg(image_buffer)
+    width, height = image_pdf.get_size()
+
+    matrix = pdfium.PdfMatrix().scale(width, height)
+    image_pdf.set_matrix(matrix)
+
+    page = pdf.new_page(width, height)
+    page.insert_obj(image_pdf)
+    page.gen_content()
+    image.close()
+    return pdf
+
+
 def extract_image_from_polygon(
     page_content: Image.Image,
     polygon: List[Point],
@@ -129,35 +161,4 @@ def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument:  # type: i
         input_file.file_object.seek(0)
         return pdfium.PdfDocument(input_file.file_object.read())
 
-    return attach_images_as_new_file([input_file.file_object])
-
-
-def attach_images_as_new_file(  # type: ignore
-    input_buffer_list: List[BinaryIO],
-) -> pdfium.PdfDocument:
-    """
-    Attaches a list of images as new pages in a PdfDocument object.
-
-    :param input_buffer_list: List of images, represented as buffers.
-    :return: A PdfDocument handle.
-    """
-    pdf = pdfium.PdfDocument.new()
-    for input_buffer in input_buffer_list:
-        input_buffer.seek(0)
-        image = Image.open(input_buffer)
-        image.convert("RGB")
-        image_buffer = io.BytesIO()
-        image.save(image_buffer, format="JPEG")
-
-        image_pdf = pdfium.PdfImage.new(pdf)
-        image_pdf.load_jpeg(image_buffer)
-        width, height = image_pdf.get_size()
-
-        matrix = pdfium.PdfMatrix().scale(width, height)
-        image_pdf.set_matrix(matrix)
-
-        page = pdf.new_page(width, height)
-        page.insert_obj(image_pdf)
-        page.gen_content()
-        image.close()
-    return pdf
+    return attach_image_as_new_file(input_file.file_object)
diff --git a/mindee/pdf/pdf_compressor.py b/mindee/pdf/pdf_compressor.py
index 037c58d2..133c087e 100644
--- a/mindee/pdf/pdf_compressor.py
+++ b/mindee/pdf/pdf_compressor.py
@@ -219,24 +219,15 @@ def collect_images_as_pdf(image_list: List[bytes]) -> pdfium.PdfDocument:  # typ
     :param image_list: A list of bytes representing JPEG images.
     :return: A PdfDocument handle containing the images as pages.
     """
-    # Create a new, empty PdfDocument
     out_pdf = pdfium.PdfDocument.new()
 
     for image_bytes in image_list:
-        # Load the JPEG image into a PdfImage object
         pdf_image = pdfium.PdfImage.new(out_pdf)
         pdf_image.load_jpeg(io.BytesIO(image_bytes))
 
-        # Get the dimensions of the image
         width, height = pdf_image.get_size()
-
-        # Create a new page in the PDF with the same dimensions as the image
         page = out_pdf.new_page(width, height)
-
-        # Place the image on the page
         page.insert_obj(pdf_image)
-
-        # Generate content for the page to finalize it
         page.gen_content()
 
     return out_pdf

From 3125e2e021c495c1f4ca20a1a88dd5dea065fc80 Mon Sep 17 00:00:00 2001
From: sebastianMindee <sebastian.oliverasilvera@mindee.co>
Date: Mon, 20 Jan 2025 17:43:35 +0100
Subject: [PATCH 6/7] fix function names

---
 mindee/image_operations/__init__.py |  1 +
 mindee/pdf/__init__.py              |  1 +
 mindee/pdf/pdf_compressor.py        | 33 +++++++--------------
 mindee/pdf/pdf_utils.py             | 46 ++++++++++++++++++-----------
 4 files changed, 42 insertions(+), 39 deletions(-)

diff --git a/mindee/image_operations/__init__.py b/mindee/image_operations/__init__.py
index e69de29b..f92bd401 100644
--- a/mindee/image_operations/__init__.py
+++ b/mindee/image_operations/__init__.py
@@ -0,0 +1 @@
+from mindee.image_operations.image_compressor import compress_image
diff --git a/mindee/pdf/__init__.py b/mindee/pdf/__init__.py
index 6864d138..4c55dad2 100644
--- a/mindee/pdf/__init__.py
+++ b/mindee/pdf/__init__.py
@@ -3,4 +3,5 @@
 from mindee.pdf.pdf_utils import (
     extract_text_from_pdf,
     has_source_text,
+    lerp,
 )
diff --git a/mindee/pdf/pdf_compressor.py b/mindee/pdf/pdf_compressor.py
index 133c087e..cfaf4254 100644
--- a/mindee/pdf/pdf_compressor.py
+++ b/mindee/pdf/pdf_compressor.py
@@ -14,6 +14,7 @@
 from mindee.pdf.pdf_utils import (
     extract_text_from_pdf,
     has_source_text,
+    lerp,
 )
 
 logger = logging.getLogger(__name__)
@@ -61,7 +62,7 @@ def compress_pdf(
         extract_text_from_pdf(pdf_bytes) if not disable_source_text else None
     )
 
-    compressed_pages = compress_pdf_pages(pdf_bytes, image_quality)
+    compressed_pages = _compress_pdf_pages(pdf_bytes, image_quality)
 
     if not compressed_pages:
         logger.warning(
@@ -69,7 +70,7 @@ def compress_pdf(
         )
         return pdf_bytes
 
-    out_pdf = collect_images_as_pdf(
+    out_pdf = _collect_images_as_pdf(
         [compressed_page_image[0] for compressed_page_image in compressed_pages]
     )
 
@@ -83,7 +84,7 @@ def compress_pdf(
     return out_buffer.read()
 
 
-def compress_pdf_pages(
+def _compress_pdf_pages(
     pdf_data: bytes,
     image_quality: int,
 ) -> Optional[List[Tuple[bytes, int, int]]]:
@@ -98,10 +99,10 @@ def compress_pdf_pages(
     image_quality_loop = image_quality
 
     while image_quality_loop >= MIN_QUALITY:
-        compressed_pages = compress_pages_with_quality(pdf_data, image_quality_loop)
+        compressed_pages = _compress_pages_with_quality(pdf_data, image_quality_loop)
         total_compressed_size = sum(len(page) for page in compressed_pages)
 
-        if is_compression_successful(
+        if _is_compression_successful(
             total_compressed_size, original_size, image_quality
         ):
             return compressed_pages
@@ -146,7 +147,7 @@ def add_text_to_pdf_page(  # type: ignore
         pdfium_c.FPDFPage_GenerateContent(page.raw)
 
 
-def compress_pages_with_quality(
+def _compress_pages_with_quality(
     pdf_data: bytes,
     image_quality: int,
 ) -> List[Tuple[bytes, int, int]]:
@@ -160,7 +161,7 @@ def compress_pages_with_quality(
     pdf_document = pdfium.PdfDocument(pdf_data)
     compressed_pages = []
     for page in pdf_document:
-        rasterized_page = rasterize_page(page, image_quality)
+        rasterized_page = _rasterize_page(page, image_quality)
         compressed_image = compress_image(rasterized_page, image_quality)
         image = Image.open(io.BytesIO(compressed_image))
         compressed_pages.append((compressed_image, image.size[0], image.size[1]))
@@ -168,7 +169,7 @@ def compress_pages_with_quality(
     return compressed_pages
 
 
-def is_compression_successful(
+def _is_compression_successful(
     total_compressed_size: int, original_size: int, image_quality: int
 ) -> bool:
     """
@@ -183,7 +184,7 @@ def is_compression_successful(
     return total_compressed_size + total_compressed_size * overhead < original_size
 
 
-def rasterize_page(  # type: ignore
+def _rasterize_page(  # type: ignore
     page: pdfium.PdfPage,
     quality: int = 85,
 ) -> bytes:
@@ -200,19 +201,7 @@ def rasterize_page(  # type: ignore
     return buffer.getvalue()
 
 
-def lerp(start: float, end: float, t: float) -> float:
-    """
-    Performs linear interpolation between two numbers.
-
-    :param start: The starting value.
-    :param end: The ending value.
-    :param t: The interpolation factor (0 to 1).
-    :return: The interpolated value.
-    """
-    return start * (1 - t) + end * t
-
-
-def collect_images_as_pdf(image_list: List[bytes]) -> pdfium.PdfDocument:  # type: ignore
+def _collect_images_as_pdf(image_list: List[bytes]) -> pdfium.PdfDocument:  # type: ignore
     """
     Converts a list of JPEG images into pages in a PdfDocument.
 
diff --git a/mindee/pdf/pdf_utils.py b/mindee/pdf/pdf_utils.py
index e5ab71c8..70b7d984 100644
--- a/mindee/pdf/pdf_utils.py
+++ b/mindee/pdf/pdf_utils.py
@@ -37,12 +37,12 @@ def extract_text_from_pdf(pdf_bytes: bytes) -> List[List[PDFCharData]]:
     char_data_list: List[List[PDFCharData]] = []
 
     for i, page in enumerate(pdf):
-        char_data_list.append(process_page(page, i, pdfium_lock))
+        char_data_list.append(_process_page(page, i, pdfium_lock))
 
     return char_data_list
 
 
-def process_page(page, page_id: int, pdfium_lock: RLock) -> List[PDFCharData]:
+def _process_page(page, page_id: int, pdfium_lock: RLock) -> List[PDFCharData]:
     """
     Processes a single page of the PDF.
 
@@ -59,7 +59,7 @@ def process_page(page, page_id: int, pdfium_lock: RLock) -> List[PDFCharData]:
         count_chars = pdfium_c.FPDFText_CountChars(text_handler)
 
     for i in range(count_chars):
-        concatenated_chars = process_char(
+        concatenated_chars = _process_char(
             i, text_handler, page, pdfium_lock, internal_height, internal_width, page_id
         )
         for concatenated_char in concatenated_chars:
@@ -70,7 +70,7 @@ def process_page(page, page_id: int, pdfium_lock: RLock) -> List[PDFCharData]:
     return char_data_list
 
 
-def process_char(
+def _process_char(
     i: int,
     text_handler,
     page,
@@ -91,13 +91,13 @@ def process_char(
     :param page_id: ID of the page the character was found on.
     :return: List of character data for a page.
     """
-    char_info = get_char_info(i, text_handler, pdfium_lock)
+    char_info = _get_char_info(i, text_handler, pdfium_lock)
     if not char_info:
         return []
-    char_box = get_char_box(i, text_handler, pdfium_lock)
-    rotation = get_page_rotation(page, pdfium_lock)
+    char_box = _get_char_box(i, text_handler, pdfium_lock)
+    rotation = _get_page_rotation(page, pdfium_lock)
 
-    adjusted_box = adjust_char_box(char_box, rotation, internal_height, internal_width)
+    adjusted_box = _adjust_char_box(char_box, rotation, internal_height, internal_width)
     char_data_list: List[PDFCharData] = []
     for c in char_info["char"] or " ":
         if c in (
@@ -105,7 +105,7 @@ def process_char(
             "\r",
         ):  # Removes duplicated carriage returns in the PDF due to weird extraction.
             # IDK how to make this better, and neither does Claude, GPT4 nor GPT-o1, so I'm leaving this weird check.
-            next_char_info = get_char_info(i + 1, text_handler, pdfium_lock)
+            next_char_info = _get_char_info(i + 1, text_handler, pdfium_lock)
             if not next_char_info or next_char_info["char"] in ("\n", "\r"):
                 continue
 
@@ -128,7 +128,7 @@ def process_char(
     return char_data_list
 
 
-def get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict:
+def _get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict:
     """
     Retrieves information about a specific character.
 
@@ -145,8 +145,8 @@ def get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict:
         if unicode_char == 0xFF:
             return {}
         char = chr(unicode_char)
-        font_name = get_font_name(text_handler, i)
-        font_flags = get_font_flags(text_handler, i)
+        font_name = _get_font_name(text_handler, i)
+        font_flags = _get_font_flags(text_handler, i)
         font_size = pdfium_c.FPDFText_GetFontSize(text_handler, i)
         font_weight = pdfium_c.FPDFText_GetFontWeight(text_handler, i)
         _ = pdfium_c.FPDFText_GetStrokeColor(
@@ -167,7 +167,7 @@ def get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict:
     }
 
 
-def get_font_name(text_handler, i: int) -> str:
+def _get_font_name(text_handler, i: int) -> str:
     """
     Retrieves the font name for a specific character.
 
@@ -186,7 +186,7 @@ def get_font_name(text_handler, i: int) -> str:
     )
 
 
-def get_font_flags(text_handler, i: int) -> int:
+def _get_font_flags(text_handler, i: int) -> int:
     """
     Retrieves the font flags for a specific character.
 
@@ -199,7 +199,7 @@ def get_font_flags(text_handler, i: int) -> int:
     return flags.value
 
 
-def get_char_box(
+def _get_char_box(
     i: int, text_handler, pdfium_lock: RLock
 ) -> Tuple[float, float, float, float]:
     """
@@ -218,7 +218,7 @@ def get_char_box(
     return left.value, right.value, bottom.value, top.value
 
 
-def get_page_rotation(page, pdfium_lock: RLock) -> int:
+def _get_page_rotation(page, pdfium_lock: RLock) -> int:
     """
     Retrieves the rotation value for a specific page.
 
@@ -232,7 +232,7 @@ def get_page_rotation(page, pdfium_lock: RLock) -> int:
         )
 
 
-def adjust_char_box(
+def _adjust_char_box(
     char_box: Tuple[float, float, float, float],
     rotation: int,
     internal_height: float,
@@ -263,3 +263,15 @@ def adjust_char_box(
             internal_height - left,
         )
     return left, right, top, bottom
+
+
+def lerp(start: float, end: float, t: float) -> float:
+    """
+    Linearly blends two numbers.
+
+    :param start: Value weighted by ``1 - t``.
+    :param end: Value weighted by ``t``.
+    :param t: The interpolation factor (0 to 1).
+    :return: The weighted sum of ``start`` and ``end``.
+    """
+    return end * t + start * (1 - t)

From c232dc68ead27caea2843aabce33f3369c06d392 Mon Sep 17 00:00:00 2001
From: sebastianMindee <sebastian.oliverasilvera@mindee.co>
Date: Tue, 21 Jan 2025 09:37:39 +0100
Subject: [PATCH 7/7] remove crash on file deletion

---
 tests/input/test_compression.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tests/input/test_compression.py b/tests/input/test_compression.py
index 03af2085..e09d2970 100644
--- a/tests/input/test_compression.py
+++ b/tests/input/test_compression.py
@@ -226,7 +226,4 @@ def cleanup():
     for file_path in created_files:
         full_path = DATA_DIR / "output" / file_path
         if full_path.exists():
-            try:
-                os.remove(full_path)
-            except OSError as e:
-                print(f"Could not delete file '{file_path}': {e.strerror}")
+            os.remove(full_path)