From ff2f62c0ce1bcd07bda0e817a39b5e5b915cbe39 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Wed, 15 Jan 2025 14:49:41 +0100 Subject: [PATCH 1/7] temp save, completely untested code --- mindee/error/__init__.py | 2 + mindee/error/mindee_image_error.py | 2 + mindee/error/mindee_pdf_error.py | 2 + mindee/extraction/__init__.py | 2 +- mindee/extraction/common/__init__.py | 2 +- mindee/extraction/common/image_extractor.py | 47 ++-- mindee/image_operations/__init__.py | 0 mindee/image_operations/image_compressor.py | 33 +++ mindee/pdf/__init__.py | 0 mindee/pdf/pdf_char_data.py | 30 +++ mindee/pdf/pdf_compressor.py | 215 +++++++++++++++++ mindee/pdf/pdf_utils.py | 241 ++++++++++++++++++++ 12 files changed, 550 insertions(+), 26 deletions(-) create mode 100644 mindee/error/mindee_image_error.py create mode 100644 mindee/error/mindee_pdf_error.py create mode 100644 mindee/image_operations/__init__.py create mode 100644 mindee/image_operations/image_compressor.py create mode 100644 mindee/pdf/__init__.py create mode 100644 mindee/pdf/pdf_char_data.py create mode 100644 mindee/pdf/pdf_compressor.py create mode 100644 mindee/pdf/pdf_utils.py diff --git a/mindee/error/__init__.py b/mindee/error/__init__.py index e8075e37..c49c3cf3 100644 --- a/mindee/error/__init__.py +++ b/mindee/error/__init__.py @@ -7,3 +7,5 @@ MindeeHTTPServerError, handle_error, ) +from mindee.error.mindee_image_error import MindeeImageError +from mindee.error.mindee_pdf_error import MindeePDFError diff --git a/mindee/error/mindee_image_error.py b/mindee/error/mindee_image_error.py new file mode 100644 index 00000000..1da0abec --- /dev/null +++ b/mindee/error/mindee_image_error.py @@ -0,0 +1,2 @@ +class MindeeImageError(RuntimeError): + """An exception relating to errors during image operations.""" diff --git a/mindee/error/mindee_pdf_error.py b/mindee/error/mindee_pdf_error.py new file mode 100644 index 00000000..52f0b32f --- /dev/null +++ b/mindee/error/mindee_pdf_error.py @@ -0,0 +1,2 @@ +class MindeePDFError(RuntimeError): + """An exception relating to errors during PDF operations.""" diff --git a/mindee/extraction/__init__.py b/mindee/extraction/__init__.py index 05629a5d..2e190d7b 100644 --- a/mindee/extraction/__init__.py +++ b/mindee/extraction/__init__.py @@ -1,6 +1,6 @@ from mindee.extraction.common.extracted_image import ExtractedImage from mindee.extraction.common.image_extractor import ( - attach_image_as_new_file, + attach_images_as_new_file, extract_multiple_images_from_source, ) from mindee.extraction.multi_receipts_extractor import multi_receipts_extractor diff --git a/mindee/extraction/common/__init__.py b/mindee/extraction/common/__init__.py index c0301c90..009009d9 100644 --- a/mindee/extraction/common/__init__.py +++ b/mindee/extraction/common/__init__.py @@ -1,5 +1,5 @@ from mindee.extraction.common.extracted_image import ExtractedImage from mindee.extraction.common.image_extractor import ( - attach_image_as_new_file, + attach_images_as_new_file, extract_multiple_images_from_source, ) diff --git a/mindee/extraction/common/image_extractor.py b/mindee/extraction/common/image_extractor.py index 046b312c..3277feaf 100644 --- a/mindee/extraction/common/image_extractor.py +++ b/mindee/extraction/common/image_extractor.py @@ -11,35 +11,34 @@ from mindee.input.sources import BytesInput, LocalInputSource -def attach_image_as_new_file( # type: ignore - input_buffer: BinaryIO, +def attach_images_as_new_file( # type: ignore + input_buffer_list: List[BinaryIO], ) -> pdfium.PdfDocument: """ - Attaches an image as a new page in a PdfDocument object. + Attaches a list of images as new pages in a PdfDocument object. - :param input_buffer: Input buffer. + :param input_buffer_list: List of images, represented as buffers. :return: A PdfDocument handle. """ - # Create a new page in the PdfDocument - input_buffer.seek(0) - image = Image.open(input_buffer) - image.convert("RGB") - image_buffer = io.BytesIO() - image.save(image_buffer, format="JPEG") - pdf = pdfium.PdfDocument.new() - - image_pdf = pdfium.PdfImage.new(pdf) - image_pdf.load_jpeg(image_buffer) - width, height = image_pdf.get_size() - - matrix = pdfium.PdfMatrix().scale(width, height) - image_pdf.set_matrix(matrix) - - page = pdf.new_page(width, height) - page.insert_obj(image_pdf) - page.gen_content() - image.close() + for input_buffer in input_buffer_list: + input_buffer.seek(0) + image = Image.open(input_buffer) + image.convert("RGB") + image_buffer = io.BytesIO() + image.save(image_buffer, format="JPEG") + + image_pdf = pdfium.PdfImage.new(pdf) + image_pdf.load_jpeg(image_buffer) + width, height = image_pdf.get_size() + + matrix = pdfium.PdfMatrix().scale(width, height) + image_pdf.set_matrix(matrix) + + page = pdf.new_page(width, height) + page.insert_obj(image_pdf) + page.gen_content() + image.close() return pdf @@ -160,4 +159,4 @@ def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: i input_file.file_object.seek(0) return pdfium.PdfDocument(input_file.file_object) - return attach_image_as_new_file(input_file.file_object) + return attach_images_as_new_file([input_file.file_object]) diff --git a/mindee/image_operations/__init__.py b/mindee/image_operations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mindee/image_operations/image_compressor.py b/mindee/image_operations/image_compressor.py new file mode 100644 index 00000000..4b600938 --- /dev/null +++ b/mindee/image_operations/image_compressor.py @@ -0,0 +1,33 @@ +import io +from typing import Union + +from PIL import Image + + +def compress_image( + image_buffer: bytes, + quality: int = 85, + max_width: Union[int, float, None] = None, + max_height: Union[int, float, None] = None, +) -> bytes: + """ + Compresses an image with the given parameters. + + :param image_buffer: Buffer representation of an image. + :param quality: Quality to apply to the image (JPEG compression). + :param max_width: Maximum bound for the width. + :param max_height: Maximum bound for the height. + :return: + """ + with Image.open(io.BytesIO(image_buffer)) as img: + original_width, original_height = img.size + max_width = max_width or original_width + max_height = max_height or original_height + if max_width or max_height: + img.thumbnail((int(max_width), int(max_height)), Image.Resampling.LANCZOS) + + output_buffer = io.BytesIO() + img.save(output_buffer, format="JPEG", quality=quality, optimize=True) + + compressed_image = output_buffer.getvalue() + return compressed_image diff --git a/mindee/pdf/__init__.py b/mindee/pdf/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mindee/pdf/pdf_char_data.py b/mindee/pdf/pdf_char_data.py new file mode 100644 index 00000000..9f516655 --- /dev/null +++ b/mindee/pdf/pdf_char_data.py @@ -0,0 +1,30 @@ +from dataclasses import dataclass +from typing import Tuple + + +@dataclass +class PDFCharData: + """Data class representing character data.""" + + char: str + """The character.""" + left: int + """Left bound.""" + right: int + """Right bound.""" + top: int + """Top bound.""" + bottom: int + """Bottom bound.""" + font_name: str + """The font name.""" + font_size: int + """The font size in pt.""" + font_weight: int + """The font weight.""" + font_flags: int + """The font flags.""" + font_stroke_color: Tuple[int, int, int, int] + """RGBA representation of the font's stroke color.""" + font_fill_color: Tuple[int, int, int, int] + """RGBA representation of the font's fill color.""" diff --git a/mindee/pdf/pdf_compressor.py b/mindee/pdf/pdf_compressor.py new file mode 100644 index 00000000..ef7de3da --- /dev/null +++ b/mindee/pdf/pdf_compressor.py @@ -0,0 +1,215 @@ +import logging +from io import BytesIO +from threading import RLock +from typing import List, Optional + +import pypdfium2 as pdfium +import pypdfium2.raw as pdfium_c + +from mindee.extraction import attach_images_as_new_file +from mindee.image_operations.image_compressor import compress_image +from mindee.pdf.pdf_char_data import PDFCharData +from mindee.pdf.pdf_utils import extract_text_from_pdf, has_source_text + +logger = logging.getLogger(__name__) +MIN_QUALITY = 1 + + +def compress_pdf( + pdf_data: bytes, + image_quality: int = 85, + force_source_text_compression: bool = False, + disable_source_text: bool = True, +) -> bytes: + """ + Compresses each page of a provided PDF buffer. + + :param pdf_data: The input PDF as bytes. + :param image_quality: Compression quality (70-100 for most JPG images). + :param force_source_text_compression: If true, attempts to re-write detected text. + :param disable_source_text: If true, doesn't re-apply source text to the output PDF. + :return: Compressed PDF as bytes. + """ + if has_source_text(pdf_data): + if force_source_text_compression: + if not disable_source_text: + logger.warning("Re-writing PDF source-text is an EXPERIMENTAL feature.") + else: + logger.warning( + "Source file contains text, but disable_source_text flag " + "is set to false. Resulting file will not contain any embedded text." + ) + else: + logger.warning( + "Found text inside of the provided PDF file. Compression operation aborted since disableSourceText " + "is set to 'true'." + ) + return pdf_data + + extracted_text = ( + extract_text_from_pdf(pdf_data) if not disable_source_text else None + ) + + compressed_pages = compress_pdf_pages( + pdf_data, extracted_text, image_quality, disable_source_text + ) + + if not compressed_pages: + logger.warning( + "Could not compress PDF to a smaller size. Returning original PDF." + ) + return pdf_data + + out_pdf = attach_images_as_new_file( + [BytesIO(compressed_page) for compressed_page in compressed_pages] + ) + out_bytes = BytesIO() + out_pdf.save(out_bytes) + + return out_bytes.read() + + +def compress_pdf_pages( + pdf_data: bytes, + extracted_text: Optional[List[PDFCharData]], + image_quality: int, + disable_source_text: bool, +) -> Optional[List[bytes]]: + """ + Compresses PDF pages and returns an array of compressed page buffers. + + :param pdf_data: The input PDF as bytes. + :param extracted_text: Extracted text from the PDF. + :param image_quality: Initial compression quality. + :param disable_source_text: If true, doesn't re-apply source text to the output PDF. + :return: List of compressed page buffers, or None if compression fails. + """ + original_size = len(pdf_data) + image_quality_loop = image_quality + + while image_quality_loop >= MIN_QUALITY: + compressed_pages = compress_pages_with_quality( + pdf_data, extracted_text, image_quality_loop, disable_source_text + ) + total_compressed_size = sum(len(page) for page in compressed_pages) + + if is_compression_successful( + total_compressed_size, original_size, image_quality + ): + return compressed_pages + + image_quality_loop -= round(lerp(1, 10, image_quality_loop / 100)) + + return None + + +def add_text_to_pdf_page( # type: ignore + page: pdfium.PdfPage, + extracted_text: Optional[List[PDFCharData]], +) -> None: + """ + Adds text to a PDF page based on the extracted text data. + + :param page: The PdfPage object to add text to. + :param extracted_text: List of PDFCharData objects containing text and positioning information. + """ + if not extracted_text: + return + + height = page.get_height() + document = page.pdf + pdfium_lock = RLock() + + with pdfium_lock: + text_handler = pdfium_c.FPDFText_LoadPage(page.raw) + for char_data in extracted_text: + font = document.load_font( + char_data.font_name, pdfium_c.FPDF_FONT_TRUETYPE, True + ) + text_object = document.create_text_object(font, char_data.font_size) + text_object.set_text(char_data.char) + x = char_data.left + y = height - char_data.bottom + text_object.set_position(x, y) + r, g, b, a = char_data.font_fill_color + text_object.set_fill_color(r, g, b, a) + pdfium_c.FPDFPage_InsertObject(text_handler, text_object) + pdfium_c.FPDFPage_GenerateContent(text_handler) + + with pdfium_lock: + pdfium_c.FPDFText_ClosePage(text_handler) + + +def compress_pages_with_quality( + pdf_data: bytes, + extracted_text: Optional[list[PDFCharData]], + image_quality: int, + disable_source_text: bool, +) -> List[bytes]: + """ + Compresses pages with a specific quality. + + :param pdf_data: The input PDF as bytes. + :param extracted_text: Extracted text from the PDF. + :param image_quality: Compression quality. + :param disable_source_text: If true, doesn't re-apply source text to the output PDF. + :return: List of compressed page buffers. + """ + pdf_document = pdfium.PdfDocument(pdf_data) + compressed_pages = [] + + for i in enumerate(pdf_document): + page = pdf_document[i] + rasterized_page = rasterize_page(page, image_quality) + compressed_image = compress_image(rasterized_page, image_quality) + + if not disable_source_text: + add_text_to_pdf_page(page, extracted_text) + + compressed_pages.append(compressed_image) + + return compressed_pages + + +def is_compression_successful( + total_compressed_size: int, original_size: int, image_quality: int +) -> bool: + """ + Checks if the compression was successful based on the compressed size and original size. + + :param total_compressed_size: Total size of compressed pages. + :param original_size: Original PDF size. + :param image_quality: Compression quality. + :return: True if compression was successful, false otherwise. + """ + overhead = lerp(0.54, 0.18, image_quality / 100) + return total_compressed_size + total_compressed_size * overhead < original_size + + +def rasterize_page( # type: ignore + page: pdfium.PdfPage, + quality: int = 85, +) -> bytes: + """ + Rasterizes a PDF page. + + :param page: PdfPage object to rasterize. + :param quality: Quality to apply during rasterization. + :return: Rasterized page as bytes. + """ + image = page.render().to_pil() + buffer = BytesIO() + image.save(buffer, format="JPEG", quality=quality) + return buffer.getvalue() + + +def lerp(start: float, end: float, t: float) -> float: + """ + Performs linear interpolation between two numbers. + + :param start: The starting value. + :param end: The ending value. + :param t: The interpolation factor (0 to 1). + :return: The interpolated value. + """ + return start * (1 - t) + end * t diff --git a/mindee/pdf/pdf_utils.py b/mindee/pdf/pdf_utils.py new file mode 100644 index 00000000..2eb8635d --- /dev/null +++ b/mindee/pdf/pdf_utils.py @@ -0,0 +1,241 @@ +from ctypes import byref, c_double, c_int, create_string_buffer +from threading import RLock +from typing import List, Tuple + +import pypdfium2 as pdfium +import pypdfium2.raw as pdfium_c + +from mindee.pdf.pdf_char_data import PDFCharData + +FALLBACK_FONT = "Helvetica" + + +def has_source_text(pdf_bytes: bytes) -> bool: + """ + Checks if the provided PDF bytes contain source text. + + :param pdf_bytes: Raw bytes representation of a PDF file + :return: + """ + pdf = pdfium.PdfDocument(pdf_bytes) + for page in pdf: + if len(page.get_textpage().get_text_bounded().strip()) > 0: + return True + return False + + +def extract_text_from_pdf(pdf_bytes: bytes) -> List[PDFCharData]: + """ + Extracts the raw text from a given PDF's bytes along with font data. + + :param pdf_bytes: Raw bytes representation of a PDF file. + :return: A list of info regarding each read character. + """ + char_data_list: List[PDFCharData] = [] + pdfium_lock = RLock() + pdf = pdfium.PdfDocument(pdf_bytes) + + for page in pdf: + process_page(page, pdfium_lock, char_data_list) + + return char_data_list + + +def process_page(page, pdfium_lock: RLock, char_data_list: List[PDFCharData]): + """ + Processes a single page of the PDF. + + :param page: The PDF page to process. + :param pdfium_lock: Lock for thread-safe operations. + :param char_data_list: List to append character data to. + """ + internal_height = page.get_height() + internal_width = page.get_width() + + with pdfium_lock: + text_handler = pdfium_c.FPDFText_LoadPage(page.raw) + count_chars = pdfium_c.FPDFText_CountChars(text_handler) + + for i in range(count_chars): + process_char( + i, + text_handler, + page, + pdfium_lock, + internal_height, + internal_width, + char_data_list, + ) + + with pdfium_lock: + pdfium_c.FPDFText_ClosePage(text_handler) + + +def process_char( + i: int, + text_handler, + page, + pdfium_lock: RLock, + internal_height: float, + internal_width: float, + char_data_list: List[PDFCharData], +): + """ + Processes a single character from the PDF. + + :param i: The index of the character. + :param text_handler: The text handler for the current page. + :param page: The current page being processed. + :param pdfium_lock: Lock for thread-safe operations. + :param internal_height: The height of the page. + :param internal_width: The width of the page. + :param char_data_list: List to append character data to. + """ + char_info = get_char_info(i, text_handler, pdfium_lock) + char_box = get_char_box(i, text_handler, pdfium_lock) + rotation = get_page_rotation(page, pdfium_lock) + + adjusted_box = adjust_char_box(char_box, rotation, internal_height, internal_width) + + for c in char_info["char"] or " ": + char_data = PDFCharData( + char=c, + left=int(adjusted_box[0]), + right=int(adjusted_box[1]), + top=int(adjusted_box[2]), + bottom=int(adjusted_box[3]), + font_name=char_info["font_name"], + font_size=char_info["font_size"], + font_weight=char_info["font_weight"], + font_stroke_color=char_info["font_stroke_color"], + font_fill_color=char_info["font_fill_color"], + font_flags=char_info["font_flags"], + ) + char_data_list.append(char_data) + + +def get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict: + """ + Retrieves information about a specific character. + + :param i: The index of the character. + :param text_handler: The text handler for the current page. + :param pdfium_lock: Lock for thread-safe operations. + :return: A dictionary containing character information. + """ + with pdfium_lock: + char = chr(pdfium_c.FPDFText_GetUnicode(text_handler, i)) + font_name = get_font_name(text_handler, i) + font_flags = get_font_flags(text_handler, i) + font_size = pdfium_c.FPDFText_GetFontSize(text_handler, i) + font_weight = pdfium_c.FPDFText_GetFontWeight(text_handler, i) + font_stroke_color = pdfium_c.FPDFText_GetStrokeColor(text_handler, i) + font_fill_color = pdfium_c.FPDFText_GetFillColor(text_handler, i) + + return { + "char": char, + "font_name": font_name, + "font_flags": font_flags, + "font_size": font_size, + "font_weight": font_weight, + "font_stroke_color": font_stroke_color, + "font_fill_color": font_fill_color, + } + + +def get_font_name(text_handler, i: int) -> str: + """ + Retrieves the font name for a specific character. + + :param text_handler: The text handler for the current page. + :param i: The index of the character. + :return: The font name as a string. + """ + buffer_length = 128 + font_name_buffer = create_string_buffer(buffer_length) + flags = c_int(0) + actual_length = pdfium_c.FPDFText_GetFontInfo( + text_handler, i, font_name_buffer, buffer_length, byref(flags) + ) + return ( + font_name_buffer.value.decode("utf-8") if actual_length > 0 else FALLBACK_FONT + ) + + +def get_font_flags(text_handler, i: int) -> int: + """ + Retrieves the font flags for a specific character. + + :param text_handler: The text handler for the current page. + :param i: The index of the character. + :return: The font flags as an integer. + """ + flags = c_int(0) + pdfium_c.FPDFText_GetFontInfo(text_handler, i, None, 0, byref(flags)) + return flags.value + + +def get_char_box( + i: int, text_handler, pdfium_lock: RLock +) -> Tuple[float, float, float, float]: + """ + Retrieves the bounding box for a specific character. + + :param i: The index of the character. + :param text_handler: The text handler for the current page. + :param pdfium_lock: Lock for thread-safe operations. + :return: A tuple containing left, right, bottom, and top coordinates. + """ + left, right, bottom, top = (c_double(0), c_double(0), c_double(0), c_double(0)) + with pdfium_lock: + pdfium_c.FPDFText_GetCharBox( + text_handler, i, byref(left), byref(right), byref(bottom), byref(top) + ) + return left.value, right.value, bottom.value, top.value + + +def get_page_rotation(page, pdfium_lock: RLock) -> int: + """ + Retrieves the rotation value for a specific page. + + :param page: The page to get the rotation for. + :param pdfium_lock: Lock for thread-safe operations. + :return: The rotation value in degrees. + """ + with pdfium_lock: + return {0: 0, 1: 90, 2: 180, 3: 270}.get( + pdfium_c.FPDFPage_GetRotation(page.raw), 0 + ) + + +def adjust_char_box( + char_box: Tuple[float, float, float, float], + rotation: int, + internal_height: float, + internal_width: float, +) -> Tuple[float, float, float, float]: + """ + Adjusts the character bounding box based on page rotation. + + :param char_box: The original character bounding box. + :param rotation: The page rotation in degrees. + :param internal_height: The height of the page. + :param internal_width: The width of the page. + :return: The adjusted character bounding box. + """ + left, right, bottom, top = char_box + if rotation == 0: + top, bottom = internal_height - top, internal_height - bottom + elif rotation == 90: + left, right, top, bottom = bottom, top, left, right + elif rotation == 180: + left, right = internal_width - right, internal_width - left + top, bottom = bottom, top + elif rotation == 270: + left, right, top, bottom = ( + internal_width - top, + internal_width - bottom, + internal_height - right, + internal_height - left, + ) + return left, right, top, bottom From 071b95fba4bcfc4b898d5b2e733a60271c7a0624 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Thu, 16 Jan 2025 17:45:51 +0100 Subject: [PATCH 2/7] temporary (not working) version --- ...uto_invoice_splitter_extraction_example.py | 2 +- mindee/client.py | 14 +- mindee/extraction/__init__.py | 1 - mindee/extraction/common/__init__.py | 1 - mindee/extraction/common/extracted_image.py | 3 +- mindee/extraction/common/image_extractor.py | 39 +-- .../multi_receipts_extractor.py | 2 +- .../extraction/pdf_extractor/extracted_pdf.py | 2 +- .../extraction/pdf_extractor/pdf_extractor.py | 2 +- mindee/image_operations/image_compressor.py | 10 +- mindee/input/__init__.py | 3 +- mindee/input/sources/__init__.py | 3 +- mindee/input/sources/base_64_input.py | 3 +- mindee/input/sources/bytes_input.py | 3 +- mindee/input/sources/file_input.py | 3 +- mindee/input/sources/input_type.py | 11 + mindee/input/sources/local_input_source.py | 55 ++++- mindee/input/sources/path_input.py | 3 +- mindee/input/sources/url_input_source.py | 2 +- mindee/mindee_http/endpoint.py | 3 +- mindee/pdf/__init__.py | 7 + mindee/pdf/pdf_compressor.py | 25 +- mindee/pdf/pdf_utils.py | 35 ++- tests/api/test_async_response.py | 2 +- tests/extraction/test_image_extractor.py | 2 +- .../test_invoice_splitter_auto_extraction.py | 2 +- .../test_multi_receipts_extractor.py | 2 +- tests/extraction/test_pdf_extractor.py | 2 +- tests/input/test_compression.py | 224 ++++++++++++++++++ tests/mindee_http/test_error.py | 2 +- tests/test_client.py | 2 +- tests/test_inputs.py | 12 +- 32 files changed, 385 insertions(+), 97 deletions(-) create mode 100644 mindee/input/sources/input_type.py create mode 100644 tests/input/test_compression.py diff --git a/examples/auto_invoice_splitter_extraction_example.py b/examples/auto_invoice_splitter_extraction_example.py index 161075d0..a9a2bb5a 100644 --- a/examples/auto_invoice_splitter_extraction_example.py +++ b/examples/auto_invoice_splitter_extraction_example.py @@ -1,6 +1,6 @@ from mindee import Client from mindee.extraction.pdf_extractor import PdfExtractor -from mindee.input.sources import PathInput +from mindee.input.sources.path_input import PathInput from mindee.product.invoice.invoice_v4 import InvoiceV4 from mindee.product.invoice_splitter.invoice_splitter_v1 import InvoiceSplitterV1 diff --git a/mindee/client.py b/mindee/client.py index bebd8560..901b68cb 100644 --- a/mindee/client.py +++ b/mindee/client.py @@ -7,14 +7,12 @@ from mindee.input import WorkflowOptions from mindee.input.local_response import LocalResponse from mindee.input.page_options import PageOptions -from mindee.input.sources import ( - Base64Input, - BytesInput, - FileInput, - LocalInputSource, - PathInput, - UrlInputSource, -) +from mindee.input.sources.base_64_input import Base64Input +from mindee.input.sources.bytes_input import BytesInput +from mindee.input.sources.file_input import FileInput +from mindee.input.sources.local_input_source import LocalInputSource +from mindee.input.sources.path_input import PathInput +from mindee.input.sources.url_input_source import UrlInputSource from mindee.logger import logger from mindee.mindee_http.endpoint import CustomEndpoint, Endpoint from mindee.mindee_http.mindee_api import MindeeApi diff --git a/mindee/extraction/__init__.py b/mindee/extraction/__init__.py index 2e190d7b..9b86d0ee 100644 --- a/mindee/extraction/__init__.py +++ b/mindee/extraction/__init__.py @@ -1,6 +1,5 @@ from mindee.extraction.common.extracted_image import ExtractedImage from mindee.extraction.common.image_extractor import ( - attach_images_as_new_file, extract_multiple_images_from_source, ) from mindee.extraction.multi_receipts_extractor import multi_receipts_extractor diff --git a/mindee/extraction/common/__init__.py b/mindee/extraction/common/__init__.py index 009009d9..1acb7bb9 100644 --- a/mindee/extraction/common/__init__.py +++ b/mindee/extraction/common/__init__.py @@ -1,5 +1,4 @@ from mindee.extraction.common.extracted_image import ExtractedImage from mindee.extraction.common.image_extractor import ( - attach_images_as_new_file, extract_multiple_images_from_source, ) diff --git a/mindee/extraction/common/extracted_image.py b/mindee/extraction/common/extracted_image.py index 3d6b6f22..e4013246 100644 --- a/mindee/extraction/common/extracted_image.py +++ b/mindee/extraction/common/extracted_image.py @@ -5,7 +5,8 @@ from PIL import Image from mindee.error.mindee_error import MindeeError -from mindee.input.sources import FileInput, LocalInputSource +from mindee.input.sources.file_input import FileInput +from mindee.input.sources.local_input_source import LocalInputSource from mindee.logger import logger diff --git a/mindee/extraction/common/image_extractor.py b/mindee/extraction/common/image_extractor.py index 3277feaf..95609e5a 100644 --- a/mindee/extraction/common/image_extractor.py +++ b/mindee/extraction/common/image_extractor.py @@ -1,5 +1,5 @@ import io -from typing import BinaryIO, List +from typing import List import pypdfium2 as pdfium from PIL import Image @@ -8,38 +8,9 @@ from mindee.extraction.common.extracted_image import ExtractedImage from mindee.geometry.point import Point from mindee.geometry.polygon import get_min_max_x, get_min_max_y -from mindee.input.sources import BytesInput, LocalInputSource - - -def attach_images_as_new_file( # type: ignore - input_buffer_list: List[BinaryIO], -) -> pdfium.PdfDocument: - """ - Attaches a list of images as new pages in a PdfDocument object. - - :param input_buffer_list: List of images, represented as buffers. - :return: A PdfDocument handle. - """ - pdf = pdfium.PdfDocument.new() - for input_buffer in input_buffer_list: - input_buffer.seek(0) - image = Image.open(input_buffer) - image.convert("RGB") - image_buffer = io.BytesIO() - image.save(image_buffer, format="JPEG") - - image_pdf = pdfium.PdfImage.new(pdf) - image_pdf.load_jpeg(image_buffer) - width, height = image_pdf.get_size() - - matrix = pdfium.PdfMatrix().scale(width, height) - image_pdf.set_matrix(matrix) - - page = pdf.new_page(width, height) - page.insert_obj(image_pdf) - page.gen_content() - image.close() - return pdf +from mindee.input.sources.bytes_input import BytesInput +from mindee.input.sources.local_input_source import LocalInputSource +from mindee.pdf.pdf_utils import attach_images_as_new_file def extract_image_from_polygon( @@ -157,6 +128,6 @@ def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: i """ if input_file.is_pdf(): input_file.file_object.seek(0) - return pdfium.PdfDocument(input_file.file_object) + return pdfium.PdfDocument(input_file.file_object.read()) return attach_images_as_new_file([input_file.file_object]) diff --git a/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.py b/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.py index 89ad63b9..7c31ca93 100644 --- a/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.py +++ b/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.py @@ -5,7 +5,7 @@ from mindee.extraction.common.image_extractor import ( extract_multiple_images_from_source, ) -from mindee.input.sources import LocalInputSource +from mindee.input.sources.local_input_source import LocalInputSource from mindee.parsing.common.inference import Inference diff --git a/mindee/extraction/pdf_extractor/extracted_pdf.py b/mindee/extraction/pdf_extractor/extracted_pdf.py index fd02ce90..0e3dcb8d 100644 --- a/mindee/extraction/pdf_extractor/extracted_pdf.py +++ b/mindee/extraction/pdf_extractor/extracted_pdf.py @@ -4,7 +4,7 @@ import pypdfium2 as pdfium from mindee.error.mindee_error import MindeeError -from mindee.input.sources import BytesInput +from mindee.input.sources.bytes_input import BytesInput class ExtractedPdf: diff --git a/mindee/extraction/pdf_extractor/pdf_extractor.py b/mindee/extraction/pdf_extractor/pdf_extractor.py index 1a2023ca..5d5f2e19 100644 --- a/mindee/extraction/pdf_extractor/pdf_extractor.py +++ b/mindee/extraction/pdf_extractor/pdf_extractor.py @@ -7,7 +7,7 @@ from mindee.error.mindee_error import MindeeError from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf -from mindee.input.sources import LocalInputSource +from mindee.input.sources.local_input_source import LocalInputSource from mindee.product.invoice_splitter.invoice_splitter_v1_page_group import ( InvoiceSplitterV1PageGroup, ) diff --git a/mindee/image_operations/image_compressor.py b/mindee/image_operations/image_compressor.py index 4b600938..82b0bf87 100644 --- a/mindee/image_operations/image_compressor.py +++ b/mindee/image_operations/image_compressor.py @@ -1,11 +1,11 @@ import io -from typing import Union +from typing import BinaryIO, Union from PIL import Image def compress_image( - image_buffer: bytes, + image_buffer: Union[BinaryIO, bytes], quality: int = 85, max_width: Union[int, float, None] = None, max_height: Union[int, float, None] = None, @@ -13,13 +13,15 @@ def compress_image( """ Compresses an image with the given parameters. - :param image_buffer: Buffer representation of an image. + :param image_buffer: Buffer representation of an image, also accepts BinaryIO. :param quality: Quality to apply to the image (JPEG compression). :param max_width: Maximum bound for the width. :param max_height: Maximum bound for the height. :return: """ - with Image.open(io.BytesIO(image_buffer)) as img: + if isinstance(image_buffer, bytes): + image_buffer = io.BytesIO(image_buffer) + with Image.open(image_buffer) as img: original_width, original_height = img.size max_width = max_width or original_width max_height = max_height or original_height diff --git a/mindee/input/__init__.py b/mindee/input/__init__.py index 3c75c072..82624650 100644 --- a/mindee/input/__init__.py +++ b/mindee/input/__init__.py @@ -3,7 +3,8 @@ from mindee.input.sources.base_64_input import Base64Input from mindee.input.sources.bytes_input import BytesInput from mindee.input.sources.file_input import FileInput -from mindee.input.sources.local_input_source import InputType, LocalInputSource +from mindee.input.sources.input_type import InputType +from mindee.input.sources.local_input_source import LocalInputSource from mindee.input.sources.path_input import PathInput from mindee.input.sources.url_input_source import UrlInputSource from mindee.input.workflow_options import WorkflowOptions diff --git a/mindee/input/sources/__init__.py b/mindee/input/sources/__init__.py index 6f8a51e3..c7d9c22a 100644 --- a/mindee/input/sources/__init__.py +++ b/mindee/input/sources/__init__.py @@ -1,6 +1,7 @@ from mindee.input.sources.base_64_input import Base64Input from mindee.input.sources.bytes_input import BytesInput from mindee.input.sources.file_input import FileInput -from mindee.input.sources.local_input_source import InputType, LocalInputSource +from mindee.input.sources.input_type import InputType +from mindee.input.sources.local_input_source import LocalInputSource from mindee.input.sources.path_input import PathInput from mindee.input.sources.url_input_source import UrlInputSource diff --git a/mindee/input/sources/base_64_input.py b/mindee/input/sources/base_64_input.py index b651bd23..b656255b 100644 --- a/mindee/input/sources/base_64_input.py +++ b/mindee/input/sources/base_64_input.py @@ -1,7 +1,8 @@ import base64 import io -from mindee.input.sources.local_input_source import InputType, LocalInputSource +from mindee.input.sources.input_type import InputType +from mindee.input.sources.local_input_source import LocalInputSource class Base64Input(LocalInputSource): diff --git a/mindee/input/sources/bytes_input.py b/mindee/input/sources/bytes_input.py index 13fbf41d..1f2b63fd 100644 --- a/mindee/input/sources/bytes_input.py +++ b/mindee/input/sources/bytes_input.py @@ -1,6 +1,7 @@ import io -from mindee.input.sources.local_input_source import InputType, LocalInputSource +from mindee.input.sources.input_type import InputType +from mindee.input.sources.local_input_source import LocalInputSource class BytesInput(LocalInputSource): diff --git a/mindee/input/sources/file_input.py b/mindee/input/sources/file_input.py index 561fd754..2623a4f3 100644 --- a/mindee/input/sources/file_input.py +++ b/mindee/input/sources/file_input.py @@ -1,7 +1,8 @@ import os from typing import BinaryIO -from mindee.input.sources.local_input_source import InputType, LocalInputSource +from mindee.input.sources.input_type import InputType +from mindee.input.sources.local_input_source import LocalInputSource class FileInput(LocalInputSource): diff --git a/mindee/input/sources/input_type.py b/mindee/input/sources/input_type.py new file mode 100644 index 00000000..6daf1131 --- /dev/null +++ b/mindee/input/sources/input_type.py @@ -0,0 +1,11 @@ +from enum import Enum + + +class InputType(Enum): + """The input type, for internal use.""" + + FILE = "file" + BASE64 = "base64" + BYTES = "bytes" + PATH = "path" + URL = "url" diff --git a/mindee/input/sources/local_input_source.py b/mindee/input/sources/local_input_source.py index ef5bcaf5..9f6f5cc6 100644 --- a/mindee/input/sources/local_input_source.py +++ b/mindee/input/sources/local_input_source.py @@ -1,15 +1,18 @@ import io import mimetypes import tempfile -from enum import Enum from typing import BinaryIO, Optional, Sequence, Tuple import pypdfium2 as pdfium from mindee.error.mimetype_error import MimeTypeError from mindee.error.mindee_error import MindeeError, MindeeSourceError +from mindee.image_operations.image_compressor import compress_image from mindee.input.page_options import KEEP_ONLY, REMOVE +from mindee.input.sources.input_type import InputType from mindee.logger import logger +from mindee.pdf.pdf_compressor import compress_pdf +from mindee.pdf.pdf_utils import has_source_text mimetypes.add_type("image/heic", ".heic") mimetypes.add_type("image/heic", ".heif") @@ -25,16 +28,6 @@ ] -class InputType(Enum): - """The input type, for internal use.""" - - FILE = "file" - BASE64 = "base64" - BYTES = "bytes" - PATH = "path" - URL = "url" - - class LocalInputSource: """Base class for all input sources coming from the local machine.""" @@ -202,3 +195,43 @@ def read_contents(self, close_file: bool) -> Tuple[str, bytes]: def close(self) -> None: """Close the file object.""" self.file_object.close() + + def has_source_text(self) -> bool: + """ + If the file is a PDF, checks if it has source text. + + :return: True if the file is a PDF and has source text. False otherwise. + """ + if not self.is_pdf(): + return False + return has_source_text(self.file_object.read()) + + def compress( + self, + quality: int = 85, + max_width: Optional[int] = None, + max_height: Optional[int] = None, + force_source_text: bool = False, + disable_source_text: bool = True, + ) -> None: + """ + Compresses the file object, either as a PDF or an image. + + :param quality: Quality of the compression. For images, this is the JPEG quality. + For PDFs, this affects image quality within the PDF. + :param max_width: Maximum width for image resizing. Ignored for PDFs. + :param max_height: Maximum height for image resizing. Ignored for PDFs. + :param force_source_text: For PDFs, whether to force compression even if source text is present. + :param disable_source_text: For PDFs, whether to disable source text during compression. + """ + new_file_bytes: bytes + if self.is_pdf(): + new_file_bytes = compress_pdf( + self.file_object, quality, force_source_text, disable_source_text + ) + else: + new_file_bytes = compress_image( + self.file_object, quality, max_width, max_height + ) + + self.file_object = io.BytesIO(new_file_bytes) diff --git a/mindee/input/sources/path_input.py b/mindee/input/sources/path_input.py index 3f9698b4..2e7fc736 100644 --- a/mindee/input/sources/path_input.py +++ b/mindee/input/sources/path_input.py @@ -2,7 +2,8 @@ from pathlib import Path from typing import Union -from mindee.input.sources.local_input_source import InputType, LocalInputSource +from mindee.input.sources.input_type import InputType +from mindee.input.sources.local_input_source import LocalInputSource class PathInput(LocalInputSource): diff --git a/mindee/input/sources/url_input_source.py b/mindee/input/sources/url_input_source.py index 983343e5..0e62573a 100644 --- a/mindee/input/sources/url_input_source.py +++ b/mindee/input/sources/url_input_source.py @@ -10,7 +10,7 @@ from mindee.error.mindee_error import MindeeSourceError from mindee.input.sources.bytes_input import BytesInput -from mindee.input.sources.local_input_source import InputType +from mindee.input.sources.input_type import InputType from mindee.logger import logger diff --git a/mindee/mindee_http/endpoint.py b/mindee/mindee_http/endpoint.py index fdbd2ae7..227c1e2f 100644 --- a/mindee/mindee_http/endpoint.py +++ b/mindee/mindee_http/endpoint.py @@ -4,7 +4,8 @@ import requests from requests import Response -from mindee.input.sources import LocalInputSource, UrlInputSource +from mindee.input.sources.local_input_source import LocalInputSource +from mindee.input.sources.url_input_source import UrlInputSource from mindee.mindee_http.base_endpoint import BaseEndpoint from mindee.mindee_http.mindee_api import MindeeApi from mindee.parsing.common.string_dict import StringDict diff --git a/mindee/pdf/__init__.py b/mindee/pdf/__init__.py index e69de29b..5afcb672 100644 --- a/mindee/pdf/__init__.py +++ b/mindee/pdf/__init__.py @@ -0,0 +1,7 @@ +from mindee.pdf.pdf_char_data import PDFCharData +from mindee.pdf.pdf_compressor import compress_pdf +from mindee.pdf.pdf_utils import ( + attach_images_as_new_file, + extract_text_from_pdf, + has_source_text, +) diff --git a/mindee/pdf/pdf_compressor.py b/mindee/pdf/pdf_compressor.py index ef7de3da..12bce09a 100644 --- a/mindee/pdf/pdf_compressor.py +++ b/mindee/pdf/pdf_compressor.py @@ -1,22 +1,25 @@ +import io import logging -from io import BytesIO from threading import RLock -from typing import List, Optional +from typing import BinaryIO, List, Optional, Union import pypdfium2 as pdfium import pypdfium2.raw as pdfium_c -from mindee.extraction import attach_images_as_new_file from mindee.image_operations.image_compressor import compress_image from mindee.pdf.pdf_char_data import PDFCharData -from mindee.pdf.pdf_utils import extract_text_from_pdf, has_source_text +from mindee.pdf.pdf_utils import ( + attach_images_as_new_file, + extract_text_from_pdf, + has_source_text, +) logger = logging.getLogger(__name__) MIN_QUALITY = 1 def compress_pdf( - pdf_data: bytes, + pdf_data: Union[BinaryIO, bytes], image_quality: int = 85, force_source_text_compression: bool = False, disable_source_text: bool = True, @@ -30,6 +33,9 @@ def compress_pdf( :param disable_source_text: If true, doesn't re-apply source text to the output PDF. :return: Compressed PDF as bytes. """ + if not isinstance(pdf_data, bytes): + pdf_data = pdf_data.read() + if has_source_text(pdf_data): if force_source_text_compression: if not disable_source_text: @@ -61,9 +67,9 @@ def compress_pdf( return pdf_data out_pdf = attach_images_as_new_file( - [BytesIO(compressed_page) for compressed_page in compressed_pages] + [io.BytesIO(compressed_page) for compressed_page in compressed_pages] ) - out_bytes = BytesIO() + out_bytes = io.BytesIO() out_pdf.save(out_bytes) return out_bytes.read() @@ -158,8 +164,7 @@ def compress_pages_with_quality( pdf_document = pdfium.PdfDocument(pdf_data) compressed_pages = [] - for i in enumerate(pdf_document): - page = pdf_document[i] + for [_, page] in enumerate(pdf_document): rasterized_page = rasterize_page(page, image_quality) compressed_image = compress_image(rasterized_page, image_quality) @@ -198,7 +203,7 @@ def rasterize_page( # type: ignore :return: Rasterized page as bytes. """ image = page.render().to_pil() - buffer = BytesIO() + buffer = io.BytesIO() image.save(buffer, format="JPEG", quality=quality) return buffer.getvalue() diff --git a/mindee/pdf/pdf_utils.py b/mindee/pdf/pdf_utils.py index 2eb8635d..747133dd 100644 --- a/mindee/pdf/pdf_utils.py +++ b/mindee/pdf/pdf_utils.py @@ -1,9 +1,11 @@ +import io from ctypes import byref, c_double, c_int, create_string_buffer from threading import RLock -from typing import List, Tuple +from typing import BinaryIO, List, Tuple import pypdfium2 as pdfium import pypdfium2.raw as pdfium_c +from PIL import Image from mindee.pdf.pdf_char_data import PDFCharData @@ -239,3 +241,34 @@ def adjust_char_box( internal_height - left, ) return left, right, top, bottom + + +def attach_images_as_new_file( # type: ignore + input_buffer_list: List[BinaryIO], +) -> pdfium.PdfDocument: + """ + Attaches a list of images as new pages in a PdfDocument object. + + :param input_buffer_list: List of images, represented as buffers. + :return: A PdfDocument handle. + """ + pdf = pdfium.PdfDocument.new() + for input_buffer in input_buffer_list: + input_buffer.seek(0) + image = Image.open(input_buffer) + image.convert("RGB") + image_buffer = io.BytesIO() + image.save(image_buffer, format="JPEG") + + image_pdf = pdfium.PdfImage.new(pdf) + image_pdf.load_jpeg(image_buffer) + width, height = image_pdf.get_size() + + matrix = pdfium.PdfMatrix().scale(width, height) + image_pdf.set_matrix(matrix) + + page = pdf.new_page(width, height) + page.insert_obj(image_pdf) + page.gen_content() + image.close() + return pdf diff --git a/tests/api/test_async_response.py b/tests/api/test_async_response.py index 31319095..e8163d0c 100644 --- a/tests/api/test_async_response.py +++ b/tests/api/test_async_response.py @@ -5,7 +5,7 @@ import requests from mindee.client import Client -from mindee.input.sources import PathInput +from mindee.input.sources.path_input import PathInput from mindee.mindee_http.response_validation import is_valid_async_response from mindee.parsing.common.api_request import RequestStatus from mindee.parsing.common.async_predict_response import AsyncPredictResponse diff --git a/tests/extraction/test_image_extractor.py b/tests/extraction/test_image_extractor.py index f41dc4a4..7f6d5db2 100644 --- a/tests/extraction/test_image_extractor.py +++ b/tests/extraction/test_image_extractor.py @@ -4,7 +4,7 @@ from PIL import Image from mindee.extraction.common.image_extractor import extract_multiple_images_from_source -from mindee.input.sources import PathInput +from mindee.input.sources.path_input import PathInput from mindee.product.barcode_reader.barcode_reader_v1 import BarcodeReaderV1 from tests.test_inputs import PRODUCT_DATA_DIR diff --git a/tests/extraction/test_invoice_splitter_auto_extraction.py b/tests/extraction/test_invoice_splitter_auto_extraction.py index 716628e7..3abc2d2a 100644 --- a/tests/extraction/test_invoice_splitter_auto_extraction.py +++ b/tests/extraction/test_invoice_splitter_auto_extraction.py @@ -4,7 +4,7 @@ from mindee import Client from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor -from mindee.input.sources import PathInput +from mindee.input.sources.path_input import PathInput from mindee.parsing.common.document import Document from mindee.product.invoice.invoice_v4 import InvoiceV4 from mindee.product.invoice_splitter.invoice_splitter_v1 import InvoiceSplitterV1 diff --git a/tests/extraction/test_multi_receipts_extractor.py b/tests/extraction/test_multi_receipts_extractor.py index 0f71d1fe..00e22f12 100644 --- a/tests/extraction/test_multi_receipts_extractor.py +++ b/tests/extraction/test_multi_receipts_extractor.py @@ -6,7 +6,7 @@ from mindee.extraction.multi_receipts_extractor.multi_receipts_extractor import ( extract_receipts, ) -from mindee.input.sources import PathInput +from mindee.input.sources.path_input import PathInput from mindee.product.multi_receipts_detector.multi_receipts_detector_v1 import ( MultiReceiptsDetectorV1, ) diff --git a/tests/extraction/test_pdf_extractor.py b/tests/extraction/test_pdf_extractor.py index e323cd2c..a236d9c2 100644 --- a/tests/extraction/test_pdf_extractor.py +++ b/tests/extraction/test_pdf_extractor.py @@ -3,7 +3,7 @@ from mindee import Client from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor from mindee.input.local_response import LocalResponse -from mindee.input.sources import PathInput +from mindee.input.sources.path_input import PathInput from mindee.product.invoice_splitter.invoice_splitter_v1 import InvoiceSplitterV1 from mindee.product.invoice_splitter.invoice_splitter_v1_document import ( InvoiceSplitterV1Document, diff --git a/tests/input/test_compression.py b/tests/input/test_compression.py new file mode 100644 index 00000000..900c0d71 --- /dev/null +++ b/tests/input/test_compression.py @@ -0,0 +1,224 @@ +import os +from pathlib import Path + +import pytest +from PIL import Image + +from mindee.image_operations.image_compressor import compress_image +from mindee.input.sources.path_input import PathInput +from mindee.pdf.pdf_compressor import compress_pdf +from mindee.pdf.pdf_utils import extract_text_from_pdf + +DATA_DIR = Path("./tests/data") +OUTPUT_DIR = DATA_DIR / "output" + + +def test_image_quality_compress_from_input_source(): + receipt_input = PathInput(DATA_DIR / "file_types/receipt.jpg") + receipt_input.compress(40) + + with open(OUTPUT_DIR / "compress_indirect.jpg", "wb") as f: + f.write(receipt_input.file_object.read()) + + initial_file_stats = os.stat(DATA_DIR / "file_types/receipt.jpg") + rendered_file_stats = os.stat(OUTPUT_DIR / "compress_indirect.jpg") + assert rendered_file_stats.st_size < initial_file_stats.st_size + + +def test_image_quality_compresses_from_compressor(): + receipt_input = PathInput(DATA_DIR / "file_types/receipt.jpg") + compresses = [ + compress_image(receipt_input.file_object, 100), + compress_image(receipt_input.file_object), + compress_image(receipt_input.file_object, 50), + compress_image(receipt_input.file_object, 10), + compress_image(receipt_input.file_object, 1), + ] + + file_names = [ + "compress100.jpg", + "compress75.jpg", + "compress50.jpg", + "compress10.jpg", + "compress1.jpg", + ] + for i, compressed in enumerate(compresses): + with open(OUTPUT_DIR / file_names[i], "wb") as f: + f.write(compressed) + + initial_file_stats = os.stat(DATA_DIR / "file_types/receipt.jpg") + rendered_file_stats = [os.stat(OUTPUT_DIR / file_name) for file_name in file_names] + + assert initial_file_stats.st_size < rendered_file_stats[0].st_size + assert initial_file_stats.st_size < rendered_file_stats[1].st_size + assert rendered_file_stats[1].st_size > rendered_file_stats[2].st_size + assert rendered_file_stats[2].st_size > rendered_file_stats[3].st_size + assert rendered_file_stats[3].st_size > rendered_file_stats[4].st_size + + +def test_image_resize_from_input_source(): + image_resize_input = PathInput(DATA_DIR / "file_types/receipt.jpg") + + image_resize_input.compress(75, 250, 1000) + with open(OUTPUT_DIR / "resize_indirect.jpg", "wb") as f: + f.write(image_resize_input.file_object.read()) + + initial_file_stats = os.stat(DATA_DIR / "file_types/receipt.jpg") + rendered_file_stats = os.stat(OUTPUT_DIR / "resize_indirect.jpg") + assert rendered_file_stats.st_size < initial_file_stats.st_size + + image = Image.open(image_resize_input.file_object) + assert image.width == 250 + assert image.height == 333 + + +def test_image_resize_from_compressor(): + image_resize_input = PathInput(DATA_DIR / "file_types/receipt.jpg") + + resizes = [ + compress_image(image_resize_input.file_object, 75, 500), + compress_image(image_resize_input.file_object, 75, 250, 500), + compress_image(image_resize_input.file_object, 75, 500, 250), + compress_image(image_resize_input.file_object, 75, None, 250), + ] + + file_names = [ + "resize500xnull.jpg", + "resize250x500.jpg", + "resize500x250.jpg", + "resizenullx250.jpg", + ] + for i, resized in enumerate(resizes): + with open(OUTPUT_DIR / file_names[i], "wb") as f: + f.write(resized) + + initial_file_stats = os.stat(DATA_DIR / "file_types/receipt.jpg") + rendered_file_stats = [os.stat(OUTPUT_DIR / file_name) for file_name in file_names] + + assert initial_file_stats.st_size > rendered_file_stats[0].st_size + assert rendered_file_stats[0].st_size > rendered_file_stats[1].st_size + assert rendered_file_stats[1].st_size > rendered_file_stats[2].st_size + assert rendered_file_stats[2].st_size == rendered_file_stats[3].st_size + + +def test_pdf_input_has_text(): + has_source_text_path = DATA_DIR / "file_types/pdf/multipage.pdf" + has_no_source_text_path = DATA_DIR / "file_types/pdf/blank_1.pdf" + has_no_source_text_since_its_image_path = os.path.join( + DATA_DIR, "file_types/receipt.jpg" + ) + + has_source_text_input = PathInput(has_source_text_path) + has_no_source_text_input = PathInput(has_no_source_text_path) + has_no_source_text_since_its_image_input = PathInput( + has_no_source_text_since_its_image_path + ) + + assert has_source_text_input.has_source_text() + assert not has_no_source_text_input.has_source_text() + assert not has_no_source_text_since_its_image_input.has_source_text() + + +def test_pdf_compress_from_input_source(): + pdf_resize_input = PathInput( + DATA_DIR / "products/invoice_splitter/default_sample.pdf" + ) + + compressed_pdf = compress_pdf(pdf_resize_input.file_object, 75, True) + with open(OUTPUT_DIR / "resize_indirect.pdf", "wb") as f: + f.write(compressed_pdf) + + initial_file_stats = os.stat( + DATA_DIR / "products/invoice_splitter/default_sample.pdf" + ) + rendered_file_stats = os.stat(OUTPUT_DIR / "resize_indirect.pdf") + + assert rendered_file_stats.st_size < initial_file_stats.st_size + + +def test_pdf_compress_from_compressor(): + pdf_resize_input = PathInput( + DATA_DIR / "products/invoice_splitter/default_sample.pdf" + ) + resizes = [] + qualities = [85, 75, 50, 10] + for quality in qualities: + pdf_resize_input.file_object.seek(0) + resizes.append(compress_pdf(pdf_resize_input.file_object, quality)) + + file_names = [ + "compress85.pdf", + "compress75.pdf", + "compress50.pdf", + "compress10.pdf", + ] + for [i, resized] in enumerate(resizes): + with open(OUTPUT_DIR / file_names[i], "wb") as f: + f.write(resized) + + initial_file_stats = os.stat( + DATA_DIR / "products/invoice_splitter/default_sample.pdf" + ) + rendered_file_stats = [os.stat(OUTPUT_DIR / file_name) for file_name in file_names] + + assert initial_file_stats.st_size > rendered_file_stats[0].st_size + assert rendered_file_stats[0].st_size > rendered_file_stats[1].st_size + assert rendered_file_stats[1].st_size > rendered_file_stats[2].st_size + assert rendered_file_stats[2].st_size > rendered_file_stats[3].st_size + + +def test_pdf_compress_with_text_keeps_text(): + initial_with_text = PathInput(DATA_DIR / "file_types/pdf/multipage.pdf") + + compressed_with_text = compress_pdf(initial_with_text.file_object, 100, True, False) + + original_text = "".join( + [ + text_info.char + for text_info in extract_text_from_pdf(initial_with_text.file_object.read()) + ] + ) + compressed_text = "".join( + [text_info.char for text_info in extract_text_from_pdf(compressed_with_text)] + ) + + assert compressed_text == original_text + + +def test_pdf_compress_with_text_does_not_compress(): + initial_with_text = PathInput(DATA_DIR / "file_types/pdf/multipage.pdf") + + compressed_with_text = compress_pdf(initial_with_text.file_object, 50) + + assert compressed_with_text == initial_with_text.file_object + + +@pytest.fixture(scope="module", autouse=True) +def cleanup(): + yield + created_files = [ + "compress10.pdf", + "compress50.pdf", + "compress75.pdf", + "compress85.pdf", + "resize_indirect.pdf", + "compress1.jpg", + "compress10.jpg", + "compress50.jpg", + "compress75.jpg", + "compress100.jpg", + "compress_indirect.jpg", + "resize250x500.jpg", + "resize500x250.jpg", + "resize500xnull.jpg", + "resize_indirect.jpg", + "resizenullx250.jpg", + ] + + for file_path in created_files: + full_path = DATA_DIR / "output" / file_path + if full_path.exists(): + try: + os.remove(full_path) + except OSError as e: + print(f"Could not delete file '{file_path}': {e.strerror}") diff --git a/tests/mindee_http/test_error.py b/tests/mindee_http/test_error.py index f9ac9776..5e2f879e 100644 --- a/tests/mindee_http/test_error.py +++ b/tests/mindee_http/test_error.py @@ -9,7 +9,7 @@ MindeeHTTPServerError, handle_error, ) -from mindee.input.sources import PathInput +from mindee.input.sources.path_input import PathInput from tests.test_inputs import FILE_TYPES_DIR from tests.utils import clear_envvars, dummy_envvars diff --git a/tests/test_client.py b/tests/test_client.py index 574ad51b..599e244c 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -6,7 +6,7 @@ from mindee.error.mindee_error import MindeeClientError, MindeeError from mindee.error.mindee_http_error import MindeeHTTPError from mindee.input.local_response import LocalResponse -from mindee.input.sources import LocalInputSource +from mindee.input.sources.local_input_source import LocalInputSource from mindee.product.international_id.international_id_v2 import InternationalIdV2 from mindee.product.invoice.invoice_v4 import InvoiceV4 from mindee.product.invoice_splitter.invoice_splitter_v1 import InvoiceSplitterV1 diff --git a/tests/test_inputs.py b/tests/test_inputs.py index 1c67e8ef..9eaa84c9 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -7,13 +7,11 @@ from mindee.error.mimetype_error import MimeTypeError from mindee.error.mindee_error import MindeeError, MindeeSourceError from mindee.input.page_options import KEEP_ONLY, REMOVE -from mindee.input.sources import ( - Base64Input, - BytesInput, - FileInput, - PathInput, - UrlInputSource, -) +from mindee.input.sources.base_64_input import Base64Input +from mindee.input.sources.bytes_input import BytesInput +from mindee.input.sources.file_input import FileInput +from mindee.input.sources.path_input import PathInput +from mindee.input.sources.url_input_source import UrlInputSource from tests.product import PRODUCT_DATA_DIR FILE_TYPES_DIR = Path("./tests/data/file_types") From a72c46514a0c508270b210c0e3a21423d70929aa Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Fri, 17 Jan 2025 18:04:45 +0100 Subject: [PATCH 3/7] fix a few issues... but segfault :D --- mindee/pdf/pdf_char_data.py | 2 +- mindee/pdf/pdf_compressor.py | 67 ++++++++++++++++-------------- mindee/pdf/pdf_utils.py | 16 ++++++-- tests/input/test_compression.py | 73 +++++++++++++++++---------------- 4 files changed, 86 insertions(+), 72 deletions(-) diff --git a/mindee/pdf/pdf_char_data.py b/mindee/pdf/pdf_char_data.py index 9f516655..75637a52 100644 --- a/mindee/pdf/pdf_char_data.py +++ b/mindee/pdf/pdf_char_data.py @@ -18,7 +18,7 @@ class PDFCharData: """Bottom bound.""" font_name: str """The font name.""" - font_size: int + font_size: float """The font size in pt.""" font_weight: int """The font weight.""" diff --git a/mindee/pdf/pdf_compressor.py b/mindee/pdf/pdf_compressor.py index 12bce09a..9f859d92 100644 --- a/mindee/pdf/pdf_compressor.py +++ b/mindee/pdf/pdf_compressor.py @@ -1,10 +1,12 @@ import io import logging +from ctypes import c_char_p, c_ushort from threading import RLock from typing import BinaryIO, List, Optional, Union import pypdfium2 as pdfium import pypdfium2.raw as pdfium_c +from _ctypes import POINTER from mindee.image_operations.image_compressor import compress_image from mindee.pdf.pdf_char_data import PDFCharData @@ -34,9 +36,12 @@ def compress_pdf( :return: Compressed PDF as bytes. """ if not isinstance(pdf_data, bytes): - pdf_data = pdf_data.read() + pdf_bytes = pdf_data.read() + pdf_data.seek(0) + else: + pdf_bytes = pdf_data - if has_source_text(pdf_data): + if has_source_text(pdf_bytes): if force_source_text_compression: if not disable_source_text: logger.warning("Re-writing PDF source-text is an EXPERIMENTAL feature.") @@ -50,29 +55,29 @@ def compress_pdf( "Found text inside of the provided PDF file. Compression operation aborted since disableSourceText " "is set to 'true'." ) - return pdf_data + return pdf_bytes extracted_text = ( - extract_text_from_pdf(pdf_data) if not disable_source_text else None + extract_text_from_pdf(pdf_bytes) if not disable_source_text else None ) compressed_pages = compress_pdf_pages( - pdf_data, extracted_text, image_quality, disable_source_text + pdf_bytes, extracted_text, image_quality, disable_source_text ) if not compressed_pages: logger.warning( "Could not compress PDF to a smaller size. Returning original PDF." ) - return pdf_data + return pdf_bytes out_pdf = attach_images_as_new_file( [io.BytesIO(compressed_page) for compressed_page in compressed_pages] ) - out_bytes = io.BytesIO() - out_pdf.save(out_bytes) - - return out_bytes.read() + out_buffer = io.BytesIO() + out_pdf.save(out_buffer) + out_buffer.seek(0) + return out_buffer.read() def compress_pdf_pages( @@ -110,40 +115,40 @@ def compress_pdf_pages( def add_text_to_pdf_page( # type: ignore - page: pdfium.PdfPage, + document: pdfium.PdfDocument, + page_id: int, extracted_text: Optional[List[PDFCharData]], ) -> None: """ Adds text to a PDF page based on the extracted text data. - :param page: The PdfPage object to add text to. + :param document: The PDFDocument object. + :param page_id: ID of the current page. :param extracted_text: List of PDFCharData objects containing text and positioning information. """ if not extracted_text: return - height = page.get_height() - document = page.pdf + height = document[page_id].get_height() pdfium_lock = RLock() with pdfium_lock: - text_handler = pdfium_c.FPDFText_LoadPage(page.raw) for char_data in extracted_text: - font = document.load_font( - char_data.font_name, pdfium_c.FPDF_FONT_TRUETYPE, True + font_name = c_char_p(char_data.font_name.encode("utf-8")) + text_handler = pdfium_c.FPDFPageObj_NewTextObj( + document.raw, font_name, char_data.font_size ) - text_object = document.create_text_object(font, char_data.font_size) - text_object.set_text(char_data.char) - x = char_data.left - y = height - char_data.bottom - text_object.set_position(x, y) - r, g, b, a = char_data.font_fill_color - text_object.set_fill_color(r, g, b, a) - pdfium_c.FPDFPage_InsertObject(text_handler, text_object) - pdfium_c.FPDFPage_GenerateContent(text_handler) - - with pdfium_lock: - pdfium_c.FPDFText_ClosePage(text_handler) + char_code = ord(char_data.char) + char_code_c_char = c_ushort(char_code) + char_ptr = POINTER(c_ushort)(char_code_c_char) + pdfium_c.FPDFText_SetText(text_handler, char_ptr) + pdfium_c.FPDFPageObj_Transform( + text_handler, 1, 0, 0, 1, char_data.left, height - char_data.top + ) + pdfium_c.FPDFPage_InsertObject(document[page_id].raw, text_handler) + pdfium_c.FPDFPageObj_Destroy(text_handler) + pdfium_c.FPDFPage_GenerateContent(document[page_id].raw) + pdfium_c.FPDF_ClosePage(document[page_id].raw) def compress_pages_with_quality( @@ -164,12 +169,12 @@ def compress_pages_with_quality( pdf_document = pdfium.PdfDocument(pdf_data) compressed_pages = [] - for [_, page] in enumerate(pdf_document): + for [i, page] in enumerate(pdf_document): rasterized_page = rasterize_page(page, image_quality) compressed_image = compress_image(rasterized_page, image_quality) if not disable_source_text: - add_text_to_pdf_page(page, extracted_text) + add_text_to_pdf_page(pdf_document, i, extracted_text) compressed_pages.append(compressed_image) diff --git a/mindee/pdf/pdf_utils.py b/mindee/pdf/pdf_utils.py index 747133dd..699c39c6 100644 --- a/mindee/pdf/pdf_utils.py +++ b/mindee/pdf/pdf_utils.py @@ -1,3 +1,4 @@ +import ctypes import io from ctypes import byref, c_double, c_int, create_string_buffer from threading import RLock @@ -125,14 +126,21 @@ def get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict: :param pdfium_lock: Lock for thread-safe operations. :return: A dictionary containing character information. """ + stroke = (ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()) + fill = (ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()) + with pdfium_lock: char = chr(pdfium_c.FPDFText_GetUnicode(text_handler, i)) font_name = get_font_name(text_handler, i) font_flags = get_font_flags(text_handler, i) font_size = pdfium_c.FPDFText_GetFontSize(text_handler, i) font_weight = pdfium_c.FPDFText_GetFontWeight(text_handler, i) - font_stroke_color = pdfium_c.FPDFText_GetStrokeColor(text_handler, i) - font_fill_color = pdfium_c.FPDFText_GetFillColor(text_handler, i) + _ = pdfium_c.FPDFText_GetStrokeColor( + text_handler, i, stroke[0], stroke[1], stroke[2], stroke[3] + ) + _ = pdfium_c.FPDFText_GetFillColor( + text_handler, i, fill[0], fill[1], fill[2], fill[3] + ) return { "char": char, @@ -140,8 +148,8 @@ def get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict: "font_flags": font_flags, "font_size": font_size, "font_weight": font_weight, - "font_stroke_color": font_stroke_color, - "font_fill_color": font_fill_color, + "font_stroke_color": stroke, + "font_fill_color": fill, } diff --git a/tests/input/test_compression.py b/tests/input/test_compression.py index 900c0d71..ccc402eb 100644 --- a/tests/input/test_compression.py +++ b/tests/input/test_compression.py @@ -19,6 +19,7 @@ def test_image_quality_compress_from_input_source(): with open(OUTPUT_DIR / "compress_indirect.jpg", "wb") as f: f.write(receipt_input.file_object.read()) + receipt_input.file_object.seek(0) initial_file_stats = os.stat(DATA_DIR / "file_types/receipt.jpg") rendered_file_stats = os.stat(OUTPUT_DIR / "compress_indirect.jpg") @@ -62,6 +63,7 @@ def test_image_resize_from_input_source(): image_resize_input.compress(75, 250, 1000) with open(OUTPUT_DIR / "resize_indirect.jpg", "wb") as f: f.write(image_resize_input.file_object.read()) + image_resize_input.file_object.seek(0) initial_file_stats = os.stat(DATA_DIR / "file_types/receipt.jpg") rendered_file_stats = os.stat(OUTPUT_DIR / "resize_indirect.jpg") @@ -143,8 +145,8 @@ def test_pdf_compress_from_compressor(): resizes = [] qualities = [85, 75, 50, 10] for quality in qualities: - pdf_resize_input.file_object.seek(0) resizes.append(compress_pdf(pdf_resize_input.file_object, quality)) + pdf_resize_input.file_object.seek(0) file_names = [ "compress85.pdf", @@ -172,12 +174,11 @@ def test_pdf_compress_with_text_keeps_text(): compressed_with_text = compress_pdf(initial_with_text.file_object, 100, True, False) - original_text = "".join( - [ - text_info.char - for text_info in extract_text_from_pdf(initial_with_text.file_object.read()) - ] - ) + text_chars = [] + for text_info in extract_text_from_pdf(initial_with_text.file_object.read()): + text_chars.append(text_info.char) + initial_with_text.file_object.seek(0) + original_text = "".join(text_chars) compressed_text = "".join( [text_info.char for text_info in extract_text_from_pdf(compressed_with_text)] ) @@ -193,32 +194,32 @@ def test_pdf_compress_with_text_does_not_compress(): assert compressed_with_text == initial_with_text.file_object -@pytest.fixture(scope="module", autouse=True) -def cleanup(): - yield - created_files = [ - "compress10.pdf", - "compress50.pdf", - "compress75.pdf", - "compress85.pdf", - "resize_indirect.pdf", - "compress1.jpg", - "compress10.jpg", - "compress50.jpg", - "compress75.jpg", - "compress100.jpg", - "compress_indirect.jpg", - "resize250x500.jpg", - "resize500x250.jpg", - "resize500xnull.jpg", - "resize_indirect.jpg", - "resizenullx250.jpg", - ] - - for file_path in created_files: - full_path = DATA_DIR / "output" / file_path - if full_path.exists(): - try: - os.remove(full_path) - except OSError as e: - print(f"Could not delete file '{file_path}': {e.strerror}") +# @pytest.fixture(scope="module", autouse=True) +# def cleanup(): +# yield +# created_files = [ +# "compress10.pdf", +# "compress50.pdf", +# "compress75.pdf", +# "compress85.pdf", +# "resize_indirect.pdf", +# "compress1.jpg", +# "compress10.jpg", +# "compress50.jpg", +# "compress75.jpg", +# "compress100.jpg", +# "compress_indirect.jpg", +# "resize250x500.jpg", +# "resize500x250.jpg", +# "resize500xnull.jpg", +# "resize_indirect.jpg", +# "resizenullx250.jpg", +# ] +# +# for file_path in created_files: +# full_path = DATA_DIR / "output" / file_path +# if full_path.exists(): +# try: +# os.remove(full_path) +# except OSError as e: +# print(f"Could not delete file '{file_path}': {e.strerror}") From e53bba33ef53555cbeb8334cb5519a10354005e3 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Mon, 20 Jan 2025 16:59:13 +0100 Subject: [PATCH 4/7] it works --- mindee/extraction/common/image_extractor.py | 34 +++++- mindee/pdf/__init__.py | 1 - mindee/pdf/pdf_char_data.py | 2 + mindee/pdf/pdf_compressor.py | 95 +++++++++------- mindee/pdf/pdf_utils.py | 115 +++++++++----------- tests/input/test_compression.py | 75 +++++++------ 6 files changed, 180 insertions(+), 142 deletions(-) diff --git a/mindee/extraction/common/image_extractor.py b/mindee/extraction/common/image_extractor.py index 95609e5a..0cfdcba7 100644 --- a/mindee/extraction/common/image_extractor.py +++ b/mindee/extraction/common/image_extractor.py @@ -1,5 +1,5 @@ import io -from typing import List +from typing import BinaryIO, List import pypdfium2 as pdfium from PIL import Image @@ -10,7 +10,6 @@ from mindee.geometry.polygon import get_min_max_x, get_min_max_y from mindee.input.sources.bytes_input import BytesInput from mindee.input.sources.local_input_source import LocalInputSource -from mindee.pdf.pdf_utils import attach_images_as_new_file def extract_image_from_polygon( @@ -131,3 +130,34 @@ def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: i return pdfium.PdfDocument(input_file.file_object.read()) return attach_images_as_new_file([input_file.file_object]) + + +def attach_images_as_new_file( # type: ignore + input_buffer_list: List[BinaryIO], +) -> pdfium.PdfDocument: + """ + Attaches a list of images as new pages in a PdfDocument object. + + :param input_buffer_list: List of images, represented as buffers. + :return: A PdfDocument handle. + """ + pdf = pdfium.PdfDocument.new() + for input_buffer in input_buffer_list: + input_buffer.seek(0) + image = Image.open(input_buffer) + image.convert("RGB") + image_buffer = io.BytesIO() + image.save(image_buffer, format="JPEG") + + image_pdf = pdfium.PdfImage.new(pdf) + image_pdf.load_jpeg(image_buffer) + width, height = image_pdf.get_size() + + matrix = pdfium.PdfMatrix().scale(width, height) + image_pdf.set_matrix(matrix) + + page = pdf.new_page(width, height) + page.insert_obj(image_pdf) + page.gen_content() + image.close() + return pdf diff --git a/mindee/pdf/__init__.py b/mindee/pdf/__init__.py index 5afcb672..6864d138 100644 --- a/mindee/pdf/__init__.py +++ b/mindee/pdf/__init__.py @@ -1,7 +1,6 @@ from mindee.pdf.pdf_char_data import PDFCharData from mindee.pdf.pdf_compressor import compress_pdf from mindee.pdf.pdf_utils import ( - attach_images_as_new_file, extract_text_from_pdf, has_source_text, ) diff --git a/mindee/pdf/pdf_char_data.py b/mindee/pdf/pdf_char_data.py index 75637a52..58d46db9 100644 --- a/mindee/pdf/pdf_char_data.py +++ b/mindee/pdf/pdf_char_data.py @@ -28,3 +28,5 @@ class PDFCharData: """RGBA representation of the font's stroke color.""" font_fill_color: Tuple[int, int, int, int] """RGBA representation of the font's fill color.""" + page_id: int + """ID of the page the character was found on.""" diff --git a/mindee/pdf/pdf_compressor.py b/mindee/pdf/pdf_compressor.py index 9f859d92..037c58d2 100644 --- a/mindee/pdf/pdf_compressor.py +++ b/mindee/pdf/pdf_compressor.py @@ -2,16 +2,16 @@ import logging from ctypes import c_char_p, c_ushort from threading import RLock -from typing import BinaryIO, List, Optional, Union +from typing import BinaryIO, List, Optional, Tuple, Union import pypdfium2 as pdfium import pypdfium2.raw as pdfium_c from _ctypes import POINTER +from PIL import Image from mindee.image_operations.image_compressor import compress_image from mindee.pdf.pdf_char_data import PDFCharData from mindee.pdf.pdf_utils import ( - attach_images_as_new_file, extract_text_from_pdf, has_source_text, ) @@ -61,9 +61,7 @@ def compress_pdf( extract_text_from_pdf(pdf_bytes) if not disable_source_text else None ) - compressed_pages = compress_pdf_pages( - pdf_bytes, extracted_text, image_quality, disable_source_text - ) + compressed_pages = compress_pdf_pages(pdf_bytes, image_quality) if not compressed_pages: logger.warning( @@ -71,9 +69,14 @@ def compress_pdf( ) return pdf_bytes - out_pdf = attach_images_as_new_file( - [io.BytesIO(compressed_page) for compressed_page in compressed_pages] + out_pdf = collect_images_as_pdf( + [compressed_page_image[0] for compressed_page_image in compressed_pages] ) + + if not disable_source_text: + for i, page in enumerate(out_pdf): + add_text_to_pdf_page(page, i, extracted_text) + out_buffer = io.BytesIO() out_pdf.save(out_buffer) out_buffer.seek(0) @@ -82,26 +85,20 @@ def compress_pdf( def compress_pdf_pages( pdf_data: bytes, - extracted_text: Optional[List[PDFCharData]], image_quality: int, - disable_source_text: bool, -) -> Optional[List[bytes]]: +) -> Optional[List[Tuple[bytes, int, int]]]: """ Compresses PDF pages and returns an array of compressed page buffers. :param pdf_data: The input PDF as bytes. - :param extracted_text: Extracted text from the PDF. :param image_quality: Initial compression quality. - :param disable_source_text: If true, doesn't re-apply source text to the output PDF. :return: List of compressed page buffers, or None if compression fails. """ original_size = len(pdf_data) image_quality_loop = image_quality while image_quality_loop >= MIN_QUALITY: - compressed_pages = compress_pages_with_quality( - pdf_data, extracted_text, image_quality_loop, disable_source_text - ) + compressed_pages = compress_pages_with_quality(pdf_data, image_quality_loop) total_compressed_size = sum(len(page) for page in compressed_pages) if is_compression_successful( @@ -115,28 +112,28 @@ def compress_pdf_pages( def add_text_to_pdf_page( # type: ignore - document: pdfium.PdfDocument, + page: pdfium.PdfPage, page_id: int, - extracted_text: Optional[List[PDFCharData]], + extracted_text: Optional[List[List[PDFCharData]]], ) -> None: """ Adds text to a PDF page based on the extracted text data. - :param document: The PDFDocument object. - :param page_id: ID of the current page. + :param page: The PDFDocument object. + :param page_id: The ID of the page. :param extracted_text: List of PDFCharData objects containing text and positioning information. """ - if not extracted_text: + if not extracted_text or not extracted_text[page_id]: return - height = document[page_id].get_height() + height = page.get_height() pdfium_lock = RLock() with pdfium_lock: - for char_data in extracted_text: + for char_data in extracted_text[page_id]: font_name = c_char_p(char_data.font_name.encode("utf-8")) text_handler = pdfium_c.FPDFPageObj_NewTextObj( - document.raw, font_name, char_data.font_size + page.pdf.raw, font_name, char_data.font_size ) char_code = ord(char_data.char) char_code_c_char = c_ushort(char_code) @@ -145,38 +142,28 @@ def add_text_to_pdf_page( # type: ignore pdfium_c.FPDFPageObj_Transform( text_handler, 1, 0, 0, 1, char_data.left, height - char_data.top ) - pdfium_c.FPDFPage_InsertObject(document[page_id].raw, text_handler) - pdfium_c.FPDFPageObj_Destroy(text_handler) - pdfium_c.FPDFPage_GenerateContent(document[page_id].raw) - pdfium_c.FPDF_ClosePage(document[page_id].raw) + pdfium_c.FPDFPage_InsertObject(page.raw, text_handler) + pdfium_c.FPDFPage_GenerateContent(page.raw) def compress_pages_with_quality( pdf_data: bytes, - extracted_text: Optional[list[PDFCharData]], image_quality: int, - disable_source_text: bool, -) -> List[bytes]: +) -> List[Tuple[bytes, int, int]]: """ Compresses pages with a specific quality. :param pdf_data: The input PDF as bytes. - :param extracted_text: Extracted text from the PDF. :param image_quality: Compression quality. - :param disable_source_text: If true, doesn't re-apply source text to the output PDF. :return: List of compressed page buffers. """ pdf_document = pdfium.PdfDocument(pdf_data) compressed_pages = [] - - for [i, page] in enumerate(pdf_document): + for page in pdf_document: rasterized_page = rasterize_page(page, image_quality) compressed_image = compress_image(rasterized_page, image_quality) - - if not disable_source_text: - add_text_to_pdf_page(pdf_document, i, extracted_text) - - compressed_pages.append(compressed_image) + image = Image.open(io.BytesIO(compressed_image)) + compressed_pages.append((compressed_image, image.size[0], image.size[1])) return compressed_pages @@ -223,3 +210,33 @@ def lerp(start: float, end: float, t: float) -> float: :return: The interpolated value. """ return start * (1 - t) + end * t + + +def collect_images_as_pdf(image_list: List[bytes]) -> pdfium.PdfDocument: # type: ignore + """ + Converts a list of JPEG images into pages in a PdfDocument. + + :param image_list: A list of bytes representing JPEG images. + :return: A PdfDocument handle containing the images as pages. + """ + # Create a new, empty PdfDocument + out_pdf = pdfium.PdfDocument.new() + + for image_bytes in image_list: + # Load the JPEG image into a PdfImage object + pdf_image = pdfium.PdfImage.new(out_pdf) + pdf_image.load_jpeg(io.BytesIO(image_bytes)) + + # Get the dimensions of the image + width, height = pdf_image.get_size() + + # Create a new page in the PDF with the same dimensions as the image + page = out_pdf.new_page(width, height) + + # Place the image on the page + page.insert_obj(pdf_image) + + # Generate content for the page to finalize it + page.gen_content() + + return out_pdf diff --git a/mindee/pdf/pdf_utils.py b/mindee/pdf/pdf_utils.py index 699c39c6..e5ab71c8 100644 --- a/mindee/pdf/pdf_utils.py +++ b/mindee/pdf/pdf_utils.py @@ -1,12 +1,10 @@ import ctypes -import io from ctypes import byref, c_double, c_int, create_string_buffer from threading import RLock -from typing import BinaryIO, List, Tuple +from typing import List, Tuple import pypdfium2 as pdfium import pypdfium2.raw as pdfium_c -from PIL import Image from mindee.pdf.pdf_char_data import PDFCharData @@ -27,31 +25,32 @@ def has_source_text(pdf_bytes: bytes) -> bool: return False -def extract_text_from_pdf(pdf_bytes: bytes) -> List[PDFCharData]: +def extract_text_from_pdf(pdf_bytes: bytes) -> List[List[PDFCharData]]: """ Extracts the raw text from a given PDF's bytes along with font data. :param pdf_bytes: Raw bytes representation of a PDF file. :return: A list of info regarding each read character. """ - char_data_list: List[PDFCharData] = [] pdfium_lock = RLock() pdf = pdfium.PdfDocument(pdf_bytes) + char_data_list: List[List[PDFCharData]] = [] - for page in pdf: - process_page(page, pdfium_lock, char_data_list) + for i, page in enumerate(pdf): + char_data_list.append(process_page(page, i, pdfium_lock)) return char_data_list -def process_page(page, pdfium_lock: RLock, char_data_list: List[PDFCharData]): +def process_page(page, page_id: int, pdfium_lock: RLock) -> List[PDFCharData]: """ Processes a single page of the PDF. :param page: The PDF page to process. + :param page_id: ID of the page. :param pdfium_lock: Lock for thread-safe operations. - :param char_data_list: List to append character data to. """ + char_data_list: List[PDFCharData] = [] internal_height = page.get_height() internal_width = page.get_width() @@ -60,18 +59,15 @@ def process_page(page, pdfium_lock: RLock, char_data_list: List[PDFCharData]): count_chars = pdfium_c.FPDFText_CountChars(text_handler) for i in range(count_chars): - process_char( - i, - text_handler, - page, - pdfium_lock, - internal_height, - internal_width, - char_data_list, + concatenated_chars = process_char( + i, text_handler, page, pdfium_lock, internal_height, internal_width, page_id ) + for concatenated_char in concatenated_chars: + char_data_list.append(concatenated_char) with pdfium_lock: pdfium_c.FPDFText_ClosePage(text_handler) + return char_data_list def process_char( @@ -81,8 +77,8 @@ def process_char( pdfium_lock: RLock, internal_height: float, internal_width: float, - char_data_list: List[PDFCharData], -): + page_id: int, +) -> List[PDFCharData]: """ Processes a single character from the PDF. @@ -92,29 +88,44 @@ def process_char( :param pdfium_lock: Lock for thread-safe operations. :param internal_height: The height of the page. :param internal_width: The width of the page. - :param char_data_list: List to append character data to. + :param page_id: ID of the page the character was found on. + :return: List of character data for a page. """ char_info = get_char_info(i, text_handler, pdfium_lock) + if not char_info: + return [] char_box = get_char_box(i, text_handler, pdfium_lock) rotation = get_page_rotation(page, pdfium_lock) adjusted_box = adjust_char_box(char_box, rotation, internal_height, internal_width) - + char_data_list: List[PDFCharData] = [] for c in char_info["char"] or " ": - char_data = PDFCharData( - char=c, - left=int(adjusted_box[0]), - right=int(adjusted_box[1]), - top=int(adjusted_box[2]), - bottom=int(adjusted_box[3]), - font_name=char_info["font_name"], - font_size=char_info["font_size"], - font_weight=char_info["font_weight"], - font_stroke_color=char_info["font_stroke_color"], - font_fill_color=char_info["font_fill_color"], - font_flags=char_info["font_flags"], + if c in ( + "\n", + "\r", + ): # Removes duplicated carriage returns in the PDF due to weird extraction. + # IDK how to make this better, and neither does Claude, GPT4 nor GPT-o1, so I'm leaving this weird check. + next_char_info = get_char_info(i + 1, text_handler, pdfium_lock) + if not next_char_info or next_char_info["char"] in ("\n", "\r"): + continue + + char_data_list.append( + PDFCharData( + char=c, + left=int(adjusted_box[0]), + right=int(adjusted_box[1]), + top=int(adjusted_box[2]), + bottom=int(adjusted_box[3]), + font_name=char_info["font_name"], + font_size=char_info["font_size"], + font_weight=char_info["font_weight"], + font_stroke_color=char_info["font_stroke_color"], + font_fill_color=char_info["font_fill_color"], + font_flags=char_info["font_flags"], + page_id=page_id, + ) ) - char_data_list.append(char_data) + return char_data_list def get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict: @@ -130,7 +141,10 @@ def get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict: fill = (ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()) with pdfium_lock: - char = chr(pdfium_c.FPDFText_GetUnicode(text_handler, i)) + unicode_char = pdfium_c.FPDFText_GetUnicode(text_handler, i) + if unicode_char == 0xFF: + return {} + char = chr(unicode_char) font_name = get_font_name(text_handler, i) font_flags = get_font_flags(text_handler, i) font_size = pdfium_c.FPDFText_GetFontSize(text_handler, i) @@ -249,34 +263,3 @@ def adjust_char_box( internal_height - left, ) return left, right, top, bottom - - -def attach_images_as_new_file( # type: ignore - input_buffer_list: List[BinaryIO], -) -> pdfium.PdfDocument: - """ - Attaches a list of images as new pages in a PdfDocument object. - - :param input_buffer_list: List of images, represented as buffers. - :return: A PdfDocument handle. - """ - pdf = pdfium.PdfDocument.new() - for input_buffer in input_buffer_list: - input_buffer.seek(0) - image = Image.open(input_buffer) - image.convert("RGB") - image_buffer = io.BytesIO() - image.save(image_buffer, format="JPEG") - - image_pdf = pdfium.PdfImage.new(pdf) - image_pdf.load_jpeg(image_buffer) - width, height = image_pdf.get_size() - - matrix = pdfium.PdfMatrix().scale(width, height) - image_pdf.set_matrix(matrix) - - page = pdf.new_page(width, height) - page.insert_obj(image_pdf) - page.gen_content() - image.close() - return pdf diff --git a/tests/input/test_compression.py b/tests/input/test_compression.py index ccc402eb..03af2085 100644 --- a/tests/input/test_compression.py +++ b/tests/input/test_compression.py @@ -1,4 +1,6 @@ +import operator import os +from functools import reduce from pathlib import Path import pytest @@ -176,11 +178,16 @@ def test_pdf_compress_with_text_keeps_text(): text_chars = [] for text_info in extract_text_from_pdf(initial_with_text.file_object.read()): - text_chars.append(text_info.char) + text_chars.append("".join([ti.char for ti in text_info])) initial_with_text.file_object.seek(0) original_text = "".join(text_chars) compressed_text = "".join( - [text_info.char for text_info in extract_text_from_pdf(compressed_with_text)] + [ + text_info.char + for text_info in reduce( + operator.concat, extract_text_from_pdf(compressed_with_text) + ) + ] ) assert compressed_text == original_text @@ -191,35 +198,35 @@ def test_pdf_compress_with_text_does_not_compress(): compressed_with_text = compress_pdf(initial_with_text.file_object, 50) - assert compressed_with_text == initial_with_text.file_object - - -# @pytest.fixture(scope="module", autouse=True) -# def cleanup(): -# yield -# created_files = [ -# "compress10.pdf", -# "compress50.pdf", -# "compress75.pdf", -# "compress85.pdf", -# "resize_indirect.pdf", -# "compress1.jpg", -# "compress10.jpg", -# "compress50.jpg", -# "compress75.jpg", -# "compress100.jpg", -# "compress_indirect.jpg", -# "resize250x500.jpg", -# "resize500x250.jpg", -# "resize500xnull.jpg", -# "resize_indirect.jpg", -# "resizenullx250.jpg", -# ] -# -# for file_path in created_files: -# full_path = DATA_DIR / "output" / file_path -# if full_path.exists(): -# try: -# os.remove(full_path) -# except OSError as e: -# print(f"Could not delete file '{file_path}': {e.strerror}") + assert compressed_with_text == initial_with_text.file_object.read() + + +@pytest.fixture(scope="module", autouse=True) +def cleanup(): + yield + created_files = [ + "compress10.pdf", + "compress50.pdf", + "compress75.pdf", + "compress85.pdf", + "resize_indirect.pdf", + "compress1.jpg", + "compress10.jpg", + "compress50.jpg", + "compress75.jpg", + "compress100.jpg", + "compress_indirect.jpg", + "resize250x500.jpg", + "resize500x250.jpg", + "resize500xnull.jpg", + "resize_indirect.jpg", + "resizenullx250.jpg", + ] + + for file_path in created_files: + full_path = DATA_DIR / "output" / file_path + if full_path.exists(): + try: + os.remove(full_path) + except OSError as e: + print(f"Could not delete file '{file_path}': {e.strerror}") From 1865726ac39566a249344f43341bf1fd61fd6b7b Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Mon, 20 Jan 2025 17:03:39 +0100 Subject: [PATCH 5/7] restore function that was needlessly edited --- mindee/extraction/__init__.py | 1 + mindee/extraction/common/__init__.py | 1 + mindee/extraction/common/image_extractor.py | 65 +++++++++++---------- mindee/pdf/pdf_compressor.py | 9 --- 4 files changed, 35 insertions(+), 41 deletions(-) diff --git a/mindee/extraction/__init__.py b/mindee/extraction/__init__.py index 9b86d0ee..05629a5d 100644 --- a/mindee/extraction/__init__.py +++ b/mindee/extraction/__init__.py @@ -1,5 +1,6 @@ from mindee.extraction.common.extracted_image import ExtractedImage from mindee.extraction.common.image_extractor import ( + attach_image_as_new_file, extract_multiple_images_from_source, ) from mindee.extraction.multi_receipts_extractor import multi_receipts_extractor diff --git a/mindee/extraction/common/__init__.py b/mindee/extraction/common/__init__.py index 1acb7bb9..c0301c90 100644 --- a/mindee/extraction/common/__init__.py +++ b/mindee/extraction/common/__init__.py @@ -1,4 +1,5 @@ from mindee.extraction.common.extracted_image import ExtractedImage from mindee.extraction.common.image_extractor import ( + attach_image_as_new_file, extract_multiple_images_from_source, ) diff --git a/mindee/extraction/common/image_extractor.py b/mindee/extraction/common/image_extractor.py index 0cfdcba7..5bae6d37 100644 --- a/mindee/extraction/common/image_extractor.py +++ b/mindee/extraction/common/image_extractor.py @@ -12,6 +12,38 @@ from mindee.input.sources.local_input_source import LocalInputSource +def attach_image_as_new_file( # type: ignore + input_buffer: BinaryIO, +) -> pdfium.PdfDocument: + """ + Attaches an image as a new page in a PdfDocument object. + + :param input_buffer: Input buffer. + :return: A PdfDocument handle. + """ + # Create a new page in the PdfDocument + input_buffer.seek(0) + image = Image.open(input_buffer) + image.convert("RGB") + image_buffer = io.BytesIO() + image.save(image_buffer, format="JPEG") + + pdf = pdfium.PdfDocument.new() + + image_pdf = pdfium.PdfImage.new(pdf) + image_pdf.load_jpeg(image_buffer) + width, height = image_pdf.get_size() + + matrix = pdfium.PdfMatrix().scale(width, height) + image_pdf.set_matrix(matrix) + + page = pdf.new_page(width, height) + page.insert_obj(image_pdf) + page.gen_content() + image.close() + return pdf + + def extract_image_from_polygon( page_content: Image.Image, polygon: List[Point], @@ -129,35 +161,4 @@ def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: i input_file.file_object.seek(0) return pdfium.PdfDocument(input_file.file_object.read()) - return attach_images_as_new_file([input_file.file_object]) - - -def attach_images_as_new_file( # type: ignore - input_buffer_list: List[BinaryIO], -) -> pdfium.PdfDocument: - """ - Attaches a list of images as new pages in a PdfDocument object. - - :param input_buffer_list: List of images, represented as buffers. - :return: A PdfDocument handle. - """ - pdf = pdfium.PdfDocument.new() - for input_buffer in input_buffer_list: - input_buffer.seek(0) - image = Image.open(input_buffer) - image.convert("RGB") - image_buffer = io.BytesIO() - image.save(image_buffer, format="JPEG") - - image_pdf = pdfium.PdfImage.new(pdf) - image_pdf.load_jpeg(image_buffer) - width, height = image_pdf.get_size() - - matrix = pdfium.PdfMatrix().scale(width, height) - image_pdf.set_matrix(matrix) - - page = pdf.new_page(width, height) - page.insert_obj(image_pdf) - page.gen_content() - image.close() - return pdf + return attach_image_as_new_file(input_file.file_object) diff --git a/mindee/pdf/pdf_compressor.py b/mindee/pdf/pdf_compressor.py index 037c58d2..133c087e 100644 --- a/mindee/pdf/pdf_compressor.py +++ b/mindee/pdf/pdf_compressor.py @@ -219,24 +219,15 @@ def collect_images_as_pdf(image_list: List[bytes]) -> pdfium.PdfDocument: # typ :param image_list: A list of bytes representing JPEG images. :return: A PdfDocument handle containing the images as pages. """ - # Create a new, empty PdfDocument out_pdf = pdfium.PdfDocument.new() for image_bytes in image_list: - # Load the JPEG image into a PdfImage object pdf_image = pdfium.PdfImage.new(out_pdf) pdf_image.load_jpeg(io.BytesIO(image_bytes)) - # Get the dimensions of the image width, height = pdf_image.get_size() - - # Create a new page in the PDF with the same dimensions as the image page = out_pdf.new_page(width, height) - - # Place the image on the page page.insert_obj(pdf_image) - - # Generate content for the page to finalize it page.gen_content() return out_pdf From 3125e2e021c495c1f4ca20a1a88dd5dea065fc80 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Mon, 20 Jan 2025 17:43:35 +0100 Subject: [PATCH 6/7] fix function names --- mindee/image_operations/__init__.py | 1 + mindee/pdf/__init__.py | 1 + mindee/pdf/pdf_compressor.py | 33 +++++++-------------- mindee/pdf/pdf_utils.py | 46 ++++++++++++++++++----------- 4 files changed, 42 insertions(+), 39 deletions(-) diff --git a/mindee/image_operations/__init__.py b/mindee/image_operations/__init__.py index e69de29b..f92bd401 100644 --- a/mindee/image_operations/__init__.py +++ b/mindee/image_operations/__init__.py @@ -0,0 +1 @@ +from mindee.image_operations.image_compressor import compress_image diff --git a/mindee/pdf/__init__.py b/mindee/pdf/__init__.py index 6864d138..4c55dad2 100644 --- a/mindee/pdf/__init__.py +++ b/mindee/pdf/__init__.py @@ -3,4 +3,5 @@ from mindee.pdf.pdf_utils import ( extract_text_from_pdf, has_source_text, + lerp, ) diff --git a/mindee/pdf/pdf_compressor.py b/mindee/pdf/pdf_compressor.py index 133c087e..cfaf4254 100644 --- a/mindee/pdf/pdf_compressor.py +++ b/mindee/pdf/pdf_compressor.py @@ -14,6 +14,7 @@ from mindee.pdf.pdf_utils import ( extract_text_from_pdf, has_source_text, + lerp, ) logger = logging.getLogger(__name__) @@ -61,7 +62,7 @@ def compress_pdf( extract_text_from_pdf(pdf_bytes) if not disable_source_text else None ) - compressed_pages = compress_pdf_pages(pdf_bytes, image_quality) + compressed_pages = _compress_pdf_pages(pdf_bytes, image_quality) if not compressed_pages: logger.warning( @@ -69,7 +70,7 @@ def compress_pdf( ) return pdf_bytes - out_pdf = collect_images_as_pdf( + out_pdf = _collect_images_as_pdf( [compressed_page_image[0] for compressed_page_image in compressed_pages] ) @@ -83,7 +84,7 @@ def compress_pdf( return out_buffer.read() -def compress_pdf_pages( +def _compress_pdf_pages( pdf_data: bytes, image_quality: int, ) -> Optional[List[Tuple[bytes, int, int]]]: @@ -98,10 +99,10 @@ def compress_pdf_pages( image_quality_loop = image_quality while image_quality_loop >= MIN_QUALITY: - compressed_pages = compress_pages_with_quality(pdf_data, image_quality_loop) + compressed_pages = _compress_pages_with_quality(pdf_data, image_quality_loop) total_compressed_size = sum(len(page) for page in compressed_pages) - if is_compression_successful( + if _is_compression_successful( total_compressed_size, original_size, image_quality ): return compressed_pages @@ -146,7 +147,7 @@ def add_text_to_pdf_page( # type: ignore pdfium_c.FPDFPage_GenerateContent(page.raw) -def compress_pages_with_quality( +def _compress_pages_with_quality( pdf_data: bytes, image_quality: int, ) -> List[Tuple[bytes, int, int]]: @@ -160,7 +161,7 @@ def compress_pages_with_quality( pdf_document = pdfium.PdfDocument(pdf_data) compressed_pages = [] for page in pdf_document: - rasterized_page = rasterize_page(page, image_quality) + rasterized_page = _rasterize_page(page, image_quality) compressed_image = compress_image(rasterized_page, image_quality) image = Image.open(io.BytesIO(compressed_image)) compressed_pages.append((compressed_image, image.size[0], image.size[1])) @@ -168,7 +169,7 @@ def compress_pages_with_quality( return compressed_pages -def is_compression_successful( +def _is_compression_successful( total_compressed_size: int, original_size: int, image_quality: int ) -> bool: """ @@ -183,7 +184,7 @@ def is_compression_successful( return total_compressed_size + total_compressed_size * overhead < original_size -def rasterize_page( # type: ignore +def _rasterize_page( # type: ignore page: pdfium.PdfPage, quality: int = 85, ) -> bytes: @@ -200,19 +201,7 @@ def rasterize_page( # type: ignore return buffer.getvalue() -def lerp(start: float, end: float, t: float) -> float: - """ - Performs linear interpolation between two numbers. - - :param start: The starting value. - :param end: The ending value. - :param t: The interpolation factor (0 to 1). - :return: The interpolated value. - """ - return start * (1 - t) + end * t - - -def collect_images_as_pdf(image_list: List[bytes]) -> pdfium.PdfDocument: # type: ignore +def _collect_images_as_pdf(image_list: List[bytes]) -> pdfium.PdfDocument: # type: ignore """ Converts a list of JPEG images into pages in a PdfDocument. diff --git a/mindee/pdf/pdf_utils.py b/mindee/pdf/pdf_utils.py index e5ab71c8..70b7d984 100644 --- a/mindee/pdf/pdf_utils.py +++ b/mindee/pdf/pdf_utils.py @@ -37,12 +37,12 @@ def extract_text_from_pdf(pdf_bytes: bytes) -> List[List[PDFCharData]]: char_data_list: List[List[PDFCharData]] = [] for i, page in enumerate(pdf): - char_data_list.append(process_page(page, i, pdfium_lock)) + char_data_list.append(_process_page(page, i, pdfium_lock)) return char_data_list -def process_page(page, page_id: int, pdfium_lock: RLock) -> List[PDFCharData]: +def _process_page(page, page_id: int, pdfium_lock: RLock) -> List[PDFCharData]: """ Processes a single page of the PDF. @@ -59,7 +59,7 @@ def process_page(page, page_id: int, pdfium_lock: RLock) -> List[PDFCharData]: count_chars = pdfium_c.FPDFText_CountChars(text_handler) for i in range(count_chars): - concatenated_chars = process_char( + concatenated_chars = _process_char( i, text_handler, page, pdfium_lock, internal_height, internal_width, page_id ) for concatenated_char in concatenated_chars: @@ -70,7 +70,7 @@ def process_page(page, page_id: int, pdfium_lock: RLock) -> List[PDFCharData]: return char_data_list -def process_char( +def _process_char( i: int, text_handler, page, @@ -91,13 +91,13 @@ def process_char( :param page_id: ID of the page the character was found on. :return: List of character data for a page. """ - char_info = get_char_info(i, text_handler, pdfium_lock) + char_info = _get_char_info(i, text_handler, pdfium_lock) if not char_info: return [] - char_box = get_char_box(i, text_handler, pdfium_lock) - rotation = get_page_rotation(page, pdfium_lock) + char_box = _get_char_box(i, text_handler, pdfium_lock) + rotation = _get_page_rotation(page, pdfium_lock) - adjusted_box = adjust_char_box(char_box, rotation, internal_height, internal_width) + adjusted_box = _adjust_char_box(char_box, rotation, internal_height, internal_width) char_data_list: List[PDFCharData] = [] for c in char_info["char"] or " ": if c in ( @@ -105,7 +105,7 @@ def process_char( "\r", ): # Removes duplicated carriage returns in the PDF due to weird extraction. # IDK how to make this better, and neither does Claude, GPT4 nor GPT-o1, so I'm leaving this weird check. - next_char_info = get_char_info(i + 1, text_handler, pdfium_lock) + next_char_info = _get_char_info(i + 1, text_handler, pdfium_lock) if not next_char_info or next_char_info["char"] in ("\n", "\r"): continue @@ -128,7 +128,7 @@ def process_char( return char_data_list -def get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict: +def _get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict: """ Retrieves information about a specific character. @@ -145,8 +145,8 @@ def get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict: if unicode_char == 0xFF: return {} char = chr(unicode_char) - font_name = get_font_name(text_handler, i) - font_flags = get_font_flags(text_handler, i) + font_name = _get_font_name(text_handler, i) + font_flags = _get_font_flags(text_handler, i) font_size = pdfium_c.FPDFText_GetFontSize(text_handler, i) font_weight = pdfium_c.FPDFText_GetFontWeight(text_handler, i) _ = pdfium_c.FPDFText_GetStrokeColor( @@ -167,7 +167,7 @@ def get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict: } -def get_font_name(text_handler, i: int) -> str: +def _get_font_name(text_handler, i: int) -> str: """ Retrieves the font name for a specific character. @@ -186,7 +186,7 @@ def get_font_name(text_handler, i: int) -> str: ) -def get_font_flags(text_handler, i: int) -> int: +def _get_font_flags(text_handler, i: int) -> int: """ Retrieves the font flags for a specific character. @@ -199,7 +199,7 @@ def get_font_flags(text_handler, i: int) -> int: return flags.value -def get_char_box( +def _get_char_box( i: int, text_handler, pdfium_lock: RLock ) -> Tuple[float, float, float, float]: """ @@ -218,7 +218,7 @@ def get_char_box( return left.value, right.value, bottom.value, top.value -def get_page_rotation(page, pdfium_lock: RLock) -> int: +def _get_page_rotation(page, pdfium_lock: RLock) -> int: """ Retrieves the rotation value for a specific page. @@ -232,7 +232,7 @@ def get_page_rotation(page, pdfium_lock: RLock) -> int: ) -def adjust_char_box( +def _adjust_char_box( char_box: Tuple[float, float, float, float], rotation: int, internal_height: float, @@ -263,3 +263,15 @@ def adjust_char_box( internal_height - left, ) return left, right, top, bottom + + +def lerp(start: float, end: float, t: float) -> float: + """ + Performs linear interpolation between two numbers. + + :param start: The starting value. + :param end: The ending value. + :param t: The interpolation factor (0 to 1). + :return: The interpolated value. + """ + return start * (1 - t) + end * t From c232dc68ead27caea2843aabce33f3369c06d392 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Tue, 21 Jan 2025 09:37:39 +0100 Subject: [PATCH 7/7] remove crash on file deletion --- tests/input/test_compression.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/input/test_compression.py b/tests/input/test_compression.py index 03af2085..e09d2970 100644 --- a/tests/input/test_compression.py +++ b/tests/input/test_compression.py @@ -226,7 +226,4 @@ def cleanup(): for file_path in created_files: full_path = DATA_DIR / "output" / file_path if full_path.exists(): - try: - os.remove(full_path) - except OSError as e: - print(f"Could not delete file '{file_path}': {e.strerror}") + os.remove(full_path)