it works

sebastianMindee · sebastianMindee · commit 940a5748622d · 2025-01-20T16:59:13.000+01:00
diff --git a/mindee/extraction/common/image_extractor.py b/mindee/extraction/common/image_extractor.py
@@ -1,5 +1,5 @@
 import io
-from typing import List
+from typing import BinaryIO, List
 
 import pypdfium2 as pdfium
 from PIL import Image
@@ -10,7 +10,6 @@
 from mindee.geometry.polygon import get_min_max_x, get_min_max_y
 from mindee.input.sources.bytes_input import BytesInput
 from mindee.input.sources.local_input_source import LocalInputSource
-from mindee.pdf.pdf_utils import attach_images_as_new_file
 
 
 def extract_image_from_polygon(
@@ -131,3 +130,34 @@ def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument:  # type: i
         return pdfium.PdfDocument(input_file.file_object.read())
 
     return attach_images_as_new_file([input_file.file_object])
+
+
+def attach_images_as_new_file(  # type: ignore
+    input_buffer_list: List[BinaryIO],
+) -> pdfium.PdfDocument:
+    """
+    Attaches a list of images as new pages in a PdfDocument object.
+
+    :param input_buffer_list: List of images, represented as buffers.
+    :return: A PdfDocument handle.
+    """
+    pdf = pdfium.PdfDocument.new()
+    for input_buffer in input_buffer_list:
+        input_buffer.seek(0)
+        image = Image.open(input_buffer)
+        # convert() returns a new image; keep it so non-RGB inputs can be saved as JPEG.
+        rgb_image = image.convert("RGB")
+        image_buffer = io.BytesIO()
+        rgb_image.save(image_buffer, format="JPEG")
+        image_pdf = pdfium.PdfImage.new(pdf)
+        image_pdf.load_jpeg(image_buffer)
+        width, height = image_pdf.get_size()
+
+        matrix = pdfium.PdfMatrix().scale(width, height)
+        image_pdf.set_matrix(matrix)
+
+        page = pdf.new_page(width, height)
+        page.insert_obj(image_pdf)
+        page.gen_content()
+        image.close()
+    return pdf
diff --git a/mindee/pdf/__init__.py b/mindee/pdf/__init__.py
@@ -1,7 +1,6 @@
 from mindee.pdf.pdf_char_data import PDFCharData
 from mindee.pdf.pdf_compressor import compress_pdf
 from mindee.pdf.pdf_utils import (
-    attach_images_as_new_file,
     extract_text_from_pdf,
     has_source_text,
 )
diff --git a/mindee/pdf/pdf_char_data.py b/mindee/pdf/pdf_char_data.py
@@ -28,3 +28,5 @@ class PDFCharData:
     """RGBA representation of the font's stroke color."""
     font_fill_color: Tuple[int, int, int, int]
     """RGBA representation of the font's fill color."""
+    page_id: int
+    """ID of the page the character was found on."""
diff --git a/mindee/pdf/pdf_compressor.py b/mindee/pdf/pdf_compressor.py
@@ -2,16 +2,16 @@
 import logging
 from ctypes import c_char_p, c_ushort
 from threading import RLock
-from typing import BinaryIO, List, Optional, Union
+from typing import BinaryIO, List, Optional, Tuple, Union
 
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
 from _ctypes import POINTER
+from PIL import Image
 
 from mindee.image_operations.image_compressor import compress_image
 from mindee.pdf.pdf_char_data import PDFCharData
 from mindee.pdf.pdf_utils import (
-    attach_images_as_new_file,
     extract_text_from_pdf,
     has_source_text,
 )
@@ -61,19 +61,22 @@ def compress_pdf(
         extract_text_from_pdf(pdf_bytes) if not disable_source_text else None
     )
 
-    compressed_pages = compress_pdf_pages(
-        pdf_bytes, extracted_text, image_quality, disable_source_text
-    )
+    compressed_pages = compress_pdf_pages(pdf_bytes, image_quality)
 
     if not compressed_pages:
         logger.warning(
             "Could not compress PDF to a smaller size. Returning original PDF."
         )
         return pdf_bytes
 
-    out_pdf = attach_images_as_new_file(
-        [io.BytesIO(compressed_page) for compressed_page in compressed_pages]
+    out_pdf = collect_images_as_pdf(
+        [compressed_page_image[0] for compressed_page_image in compressed_pages]
     )
+
+    if not disable_source_text:
+        for i, page in enumerate(out_pdf):
+            add_text_to_pdf_page(page, i, extracted_text)
+
     out_buffer = io.BytesIO()
     out_pdf.save(out_buffer)
     out_buffer.seek(0)
@@ -82,26 +85,20 @@ def compress_pdf(
 
 def compress_pdf_pages(
     pdf_data: bytes,
-    extracted_text: Optional[List[PDFCharData]],
     image_quality: int,
-    disable_source_text: bool,
-) -> Optional[List[bytes]]:
+) -> Optional[List[Tuple[bytes, int, int]]]:
     """
     Compresses PDF pages and returns an array of compressed page buffers.
 
     :param pdf_data: The input PDF as bytes.
-    :param extracted_text: Extracted text from the PDF.
     :param image_quality: Initial compression quality.
-    :param disable_source_text: If true, doesn't re-apply source text to the output PDF.
     :return: List of compressed page buffers, or None if compression fails.
     """
     original_size = len(pdf_data)
     image_quality_loop = image_quality
 
     while image_quality_loop >= MIN_QUALITY:
-        compressed_pages = compress_pages_with_quality(
-            pdf_data, extracted_text, image_quality_loop, disable_source_text
-        )
+        compressed_pages = compress_pages_with_quality(pdf_data, image_quality_loop)
         total_compressed_size = sum(len(page) for page in compressed_pages)
 
         if is_compression_successful(
@@ -115,28 +112,28 @@ def compress_pdf_pages(
 
 
 def add_text_to_pdf_page(  # type: ignore
-    document: pdfium.PdfDocument,
+    page: pdfium.PdfPage,
     page_id: int,
-    extracted_text: Optional[List[PDFCharData]],
+    extracted_text: Optional[List[List[PDFCharData]]],
 ) -> None:
     """
     Adds text to a PDF page based on the extracted text data.
 
-    :param document: The PDFDocument object.
-    :param page_id: ID of the current page.
+    :param page: The PdfPage object to add the text to.
+    :param page_id: The ID of the page.
     :param extracted_text: List of PDFCharData objects containing text and positioning information.
     """
-    if not extracted_text:
+    if not extracted_text or not extracted_text[page_id]:
         return
 
-    height = document[page_id].get_height()
+    height = page.get_height()
     pdfium_lock = RLock()
 
     with pdfium_lock:
-        for char_data in extracted_text:
+        for char_data in extracted_text[page_id]:
             font_name = c_char_p(char_data.font_name.encode("utf-8"))
             text_handler = pdfium_c.FPDFPageObj_NewTextObj(
-                document.raw, font_name, char_data.font_size
+                page.pdf.raw, font_name, char_data.font_size
             )
             char_code = ord(char_data.char)
             char_code_c_char = c_ushort(char_code)
@@ -145,38 +142,28 @@ def add_text_to_pdf_page(  # type: ignore
             pdfium_c.FPDFPageObj_Transform(
                 text_handler, 1, 0, 0, 1, char_data.left, height - char_data.top
             )
-            pdfium_c.FPDFPage_InsertObject(document[page_id].raw, text_handler)
-            pdfium_c.FPDFPageObj_Destroy(text_handler)
-        pdfium_c.FPDFPage_GenerateContent(document[page_id].raw)
-        pdfium_c.FPDF_ClosePage(document[page_id].raw)
+            pdfium_c.FPDFPage_InsertObject(page.raw, text_handler)
+        pdfium_c.FPDFPage_GenerateContent(page.raw)
 
 
 def compress_pages_with_quality(
     pdf_data: bytes,
-    extracted_text: Optional[list[PDFCharData]],
     image_quality: int,
-    disable_source_text: bool,
-) -> List[bytes]:
+) -> List[Tuple[bytes, int, int]]:
     """
     Compresses pages with a specific quality.
 
     :param pdf_data: The input PDF as bytes.
-    :param extracted_text: Extracted text from the PDF.
     :param image_quality: Compression quality.
-    :param disable_source_text: If true, doesn't re-apply source text to the output PDF.
     :return: List of compressed page buffers.
     """
     pdf_document = pdfium.PdfDocument(pdf_data)
     compressed_pages = []
-
-    for [i, page] in enumerate(pdf_document):
+    for page in pdf_document:
         rasterized_page = rasterize_page(page, image_quality)
         compressed_image = compress_image(rasterized_page, image_quality)
-
-        if not disable_source_text:
-            add_text_to_pdf_page(pdf_document, i, extracted_text)
-
-        compressed_pages.append(compressed_image)
+        image = Image.open(io.BytesIO(compressed_image))
+        compressed_pages.append((compressed_image, image.size[0], image.size[1]))
 
     return compressed_pages
 
@@ -223,3 +210,33 @@ def lerp(start: float, end: float, t: float) -> float:
     :return: The interpolated value.
     """
     return start * (1 - t) + end * t
+
+
+def collect_images_as_pdf(image_list: List[bytes]) -> pdfium.PdfDocument:  # type: ignore
+    """
+    Converts a list of JPEG images into pages in a PdfDocument.
+
+    :param image_list: A list of bytes representing JPEG images.
+    :return: A PdfDocument handle containing the images as pages.
+    """
+    # Create a new, empty PdfDocument
+    out_pdf = pdfium.PdfDocument.new()
+
+    for image_bytes in image_list:
+        # Load the JPEG image into a PdfImage object
+        pdf_image = pdfium.PdfImage.new(out_pdf)
+        pdf_image.load_jpeg(io.BytesIO(image_bytes))
+
+        # Scale the image object to its own dimensions so that it fills the
+        # page created below (pypdfium2 image objects need an explicit matrix)
+        width, height = pdf_image.get_size()
+        pdf_image.set_matrix(pdfium.PdfMatrix().scale(width, height))
+
+        # Create a page matching the image dimensions and place the image on it
+        page = out_pdf.new_page(width, height)
+        page.insert_obj(pdf_image)
+
+        # Generate content for the page to finalize it
+        page.gen_content()
+
+    return out_pdf
diff --git a/mindee/pdf/pdf_utils.py b/mindee/pdf/pdf_utils.py
diff --git a/tests/input/test_compression.py b/tests/input/test_compression.py

Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,6 @@`
`1`	`1`	`from mindee.pdf.pdf_char_data import PDFCharData`
`2`	`2`	`from mindee.pdf.pdf_compressor import compress_pdf`
`3`	`3`	`from mindee.pdf.pdf_utils import (`
`4`		`- attach_images_as_new_file,`
`5`	`4`	`extract_text_from_pdf,`
`6`	`5`	`has_source_text,`
`7`	`6`	`)`