Skip to content

Commit 56ca783

Browse files
temp save, completely untested code
1 parent bfb73cf commit 56ca783

File tree

12 files changed

+550
-26
lines changed

12 files changed

+550
-26
lines changed

mindee/error/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,5 @@
77
MindeeHTTPServerError,
88
handle_error,
99
)
10+
from mindee.error.mindee_image_error import MindeeImageError
11+
from mindee.error.mindee_pdf_error import MindeePDFError

mindee/error/mindee_image_error.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
class MindeeImageError(RuntimeError):
    """Raised when an image operation fails."""

mindee/error/mindee_pdf_error.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
class MindeePDFError(RuntimeError):
    """Raised when a PDF operation fails."""

mindee/extraction/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from mindee.extraction.common.extracted_image import ExtractedImage
22
from mindee.extraction.common.image_extractor import (
3-
attach_image_as_new_file,
3+
attach_images_as_new_file,
44
extract_multiple_images_from_source,
55
)
66
from mindee.extraction.multi_receipts_extractor import multi_receipts_extractor

mindee/extraction/common/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from mindee.extraction.common.extracted_image import ExtractedImage
22
from mindee.extraction.common.image_extractor import (
3-
attach_image_as_new_file,
3+
attach_images_as_new_file,
44
extract_multiple_images_from_source,
55
)

mindee/extraction/common/image_extractor.py

Lines changed: 23 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -11,35 +11,34 @@
1111
from mindee.input.sources import BytesInput, LocalInputSource
1212

1313

14-
def attach_images_as_new_file(  # type: ignore
    input_buffer_list: List[BinaryIO],
) -> pdfium.PdfDocument:
    """
    Attaches a list of images as new pages in a PdfDocument object.

    Each image is re-encoded as an RGB JPEG and inserted on its own page,
    sized to the dimensions reported for the embedded image.

    :param input_buffer_list: List of images, represented as buffers.
    :return: A PdfDocument handle.
    """
    pdf = pdfium.PdfDocument.new()
    for input_buffer in input_buffer_list:
        input_buffer.seek(0)
        # The context manager releases the image even if encoding raises.
        with Image.open(input_buffer) as image:
            image_buffer = io.BytesIO()
            # convert() returns a new image rather than converting in place, so
            # its result is what must be saved; otherwise alpha or palette
            # images reach the JPEG encoder unconverted.
            image.convert("RGB").save(image_buffer, format="JPEG")

        image_pdf = pdfium.PdfImage.new(pdf)
        image_pdf.load_jpeg(image_buffer)
        width, height = image_pdf.get_size()

        # Scale the image object so it fills the whole page.
        matrix = pdfium.PdfMatrix().scale(width, height)
        image_pdf.set_matrix(matrix)

        page = pdf.new_page(width, height)
        page.insert_obj(image_pdf)
        page.gen_content()
    return pdf
4443

4544

@@ -160,4 +159,4 @@ def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: i
160159
input_file.file_object.seek(0)
161160
return pdfium.PdfDocument(input_file.file_object)
162161

163-
return attach_image_as_new_file(input_file.file_object)
162+
return attach_images_as_new_file([input_file.file_object])

mindee/image_operations/__init__.py

Whitespace-only changes.
mindee/image_operations/image_compressor.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import io
2+
from typing import Union
3+
4+
from PIL import Image
5+
6+
7+
def compress_image(
    image_buffer: bytes,
    quality: int = 85,
    max_width: Union[int, float, None] = None,
    max_height: Union[int, float, None] = None,
) -> bytes:
    """
    Compresses an image with the given parameters.

    The image is re-encoded as a JPEG after being fitted inside the
    ``max_width`` x ``max_height`` box.

    :param image_buffer: Buffer representation of an image.
    :param quality: Quality to apply to the image (JPEG compression).
    :param max_width: Maximum bound for the width. Defaults to the image's width.
    :param max_height: Maximum bound for the height. Defaults to the image's height.
    :return: The compressed image, as JPEG-encoded bytes.
    """
    with Image.open(io.BytesIO(image_buffer)) as img:
        original_width, original_height = img.size
        max_width = max_width or original_width
        max_height = max_height or original_height

        # JPEG cannot store alpha or palette data, so saving such an image
        # directly raises. Convert those modes to RGB first.
        working = img if img.mode in ("L", "RGB", "CMYK") else img.convert("RGB")

        # Both bounds always have a value at this point, so no guard is needed.
        working.thumbnail(
            (int(max_width), int(max_height)), Image.Resampling.LANCZOS
        )

        output_buffer = io.BytesIO()
        working.save(output_buffer, format="JPEG", quality=quality, optimize=True)

        return output_buffer.getvalue()

mindee/pdf/__init__.py

Whitespace-only changes.

mindee/pdf/pdf_char_data.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from dataclasses import dataclass
2+
from typing import Tuple
3+
4+
5+
@dataclass
class PDFCharData:
    """Describes a single character of a PDF: its bounds, font and colors."""

    char: str
    """Character itself."""
    left: int
    """Left edge of the character."""
    right: int
    """Right edge of the character."""
    top: int
    """Top edge of the character."""
    bottom: int
    """Bottom edge of the character."""
    font_name: str
    """Name of the font."""
    font_size: int
    """Size of the font, in pt."""
    font_weight: int
    """Weight of the font."""
    font_flags: int
    """Flags of the font."""
    font_stroke_color: Tuple[int, int, int, int]
    """Stroke color of the font, as an RGBA tuple."""
    font_fill_color: Tuple[int, int, int, int]
    """Fill color of the font, as an RGBA tuple."""

mindee/pdf/pdf_compressor.py

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
import logging
2+
from io import BytesIO
3+
from threading import RLock
4+
from typing import List, Optional
5+
6+
import pypdfium2 as pdfium
7+
import pypdfium2.raw as pdfium_c
8+
9+
from mindee.extraction import attach_images_as_new_file
10+
from mindee.image_operations.image_compressor import compress_image
11+
from mindee.pdf.pdf_char_data import PDFCharData
12+
from mindee.pdf.pdf_utils import extract_text_from_pdf, has_source_text
13+
14+
logger = logging.getLogger(__name__)
15+
MIN_QUALITY = 1
16+
17+
18+
def compress_pdf(
    pdf_data: bytes,
    image_quality: int = 85,
    force_source_text_compression: bool = False,
    disable_source_text: bool = True,
) -> bytes:
    """
    Compresses each page of a provided PDF buffer.

    A PDF that contains source text is returned untouched unless
    ``force_source_text_compression`` is set. The original bytes are also
    returned when no compressed version could be produced.

    :param pdf_data: The input PDF as bytes.
    :param image_quality: Compression quality (70-100 for most JPG images).
    :param force_source_text_compression: If true, attempts to re-write detected text.
    :param disable_source_text: If true, doesn't re-apply source text to the output PDF.
    :return: Compressed PDF as bytes.
    """
    if has_source_text(pdf_data):
        if not force_source_text_compression:
            # The abort is caused by this flag, so the message names it.
            logger.warning(
                "Found text inside of the provided PDF file. Compression operation aborted since "
                "force_source_text_compression is set to 'false'."
            )
            return pdf_data
        if disable_source_text:
            # This branch is only reached when the flag is true.
            logger.warning(
                "Source file contains text, but disable_source_text flag "
                "is set to true. Resulting file will not contain any embedded text."
            )
        else:
            logger.warning("Re-writing PDF source-text is an EXPERIMENTAL feature.")

    extracted_text = (
        extract_text_from_pdf(pdf_data) if not disable_source_text else None
    )

    compressed_pages = compress_pdf_pages(
        pdf_data, extracted_text, image_quality, disable_source_text
    )

    if not compressed_pages:
        logger.warning(
            "Could not compress PDF to a smaller size. Returning original PDF."
        )
        return pdf_data

    out_pdf = attach_images_as_new_file(
        [BytesIO(compressed_page) for compressed_page in compressed_pages]
    )
    out_bytes = BytesIO()
    out_pdf.save(out_bytes)

    # save() leaves the cursor at the end of the buffer, where read() would
    # return b"". getvalue() returns the whole content whatever the position.
    return out_bytes.getvalue()
70+
71+
72+
def compress_pdf_pages(
    pdf_data: bytes,
    extracted_text: Optional[List[PDFCharData]],
    image_quality: int,
    disable_source_text: bool,
) -> Optional[List[bytes]]:
    """
    Compresses the pages of a PDF, lowering the quality until the result is small enough.

    Starting at ``image_quality``, every page is compressed and the total size is
    checked against the size of the input. On failure the quality is lowered and
    the attempt repeated, until the quality drops below ``MIN_QUALITY``.

    :param pdf_data: The input PDF as bytes.
    :param extracted_text: Extracted text from the PDF.
    :param image_quality: Initial compression quality.
    :param disable_source_text: If true, doesn't re-apply source text to the output PDF.
    :return: List of compressed page buffers, or None if compression fails.
    """
    source_size = len(pdf_data)
    current_quality = image_quality

    while current_quality >= MIN_QUALITY:
        candidate_pages = compress_pages_with_quality(
            pdf_data, extracted_text, current_quality, disable_source_text
        )
        candidate_size = sum(map(len, candidate_pages))

        # NOTE(review): the check receives the initial quality, not the quality
        # used for this attempt - confirm this is intended.
        if is_compression_successful(candidate_size, source_size, image_quality):
            return candidate_pages

        # The step shrinks as the quality gets lower.
        quality_step = round(lerp(1, 10, current_quality / 100))
        current_quality -= quality_step

    return None
104+
105+
106+
def add_text_to_pdf_page(  # type: ignore
    page: pdfium.PdfPage,
    extracted_text: Optional[List[PDFCharData]],
) -> None:
    """
    Adds text to a PDF page based on the extracted text data.

    One text object is created per character. It is positioned at the
    character's left bound and at the page height minus its bottom bound, then
    filled with the character's fill color. Does nothing when there is no text.

    :param page: The PdfPage object to add text to.
    :param extracted_text: List of PDFCharData objects containing text and positioning information.
    """
    if not extracted_text:
        return

    height = page.get_height()
    document = page.pdf
    # NOTE(review): this lock is created on every call, so it cannot serialize
    # access between callers - confirm whether a shared lock was intended.
    pdfium_lock = RLock()

    with pdfium_lock:
        text_handler = pdfium_c.FPDFText_LoadPage(page.raw)
        try:
            for char_data in extracted_text:
                # NOTE(review): confirm that load_font / create_text_object are
                # available on the document object in the pypdfium2 version in use.
                font = document.load_font(
                    char_data.font_name, pdfium_c.FPDF_FONT_TRUETYPE, True
                )
                text_object = document.create_text_object(font, char_data.font_size)
                text_object.set_text(char_data.char)
                x = char_data.left
                y = height - char_data.bottom
                text_object.set_position(x, y)
                r, g, b, a = char_data.font_fill_color
                text_object.set_fill_color(r, g, b, a)
                # NOTE(review): the handle passed here comes from
                # FPDFText_LoadPage - confirm FPDFPage_InsertObject and
                # FPDFPage_GenerateContent accept it rather than the page handle.
                pdfium_c.FPDFPage_InsertObject(text_handler, text_object)
            pdfium_c.FPDFPage_GenerateContent(text_handler)
        finally:
            # Close the handle even when a call above raises, so it never leaks.
            pdfium_c.FPDFText_ClosePage(text_handler)
141+
142+
143+
def compress_pages_with_quality(
    pdf_data: bytes,
    extracted_text: Optional[List[PDFCharData]],
    image_quality: int,
    disable_source_text: bool,
) -> List[bytes]:
    """
    Compresses pages with a specific quality.

    Every page of the document is rasterized, then the resulting image is
    compressed, both using ``image_quality``.

    :param pdf_data: The input PDF as bytes.
    :param extracted_text: Extracted text from the PDF.
    :param image_quality: Compression quality.
    :param disable_source_text: If true, doesn't re-apply source text to the output PDF.
    :return: List of compressed page buffers.
    """
    pdf_document = pdfium.PdfDocument(pdf_data)
    compressed_pages = []

    # Iterate over the pages themselves: enumerate() yields (index, page)
    # tuples, which cannot be used to index the document.
    for page in pdf_document:
        rasterized_page = rasterize_page(page, image_quality)
        compressed_image = compress_image(rasterized_page, image_quality)

        if not disable_source_text:
            # NOTE(review): the text is added to the source page after it was
            # rasterized, so it is not part of compressed_image - confirm intent.
            add_text_to_pdf_page(page, extracted_text)

        compressed_pages.append(compressed_image)

    return compressed_pages
172+
173+
174+
def is_compression_successful(
    total_compressed_size: int, original_size: int, image_quality: int
) -> bool:
    """
    Tells whether the compressed pages, plus an estimated overhead, are smaller than the original.

    The overhead is a fraction of the compressed size, interpolated from 0.54
    at quality 0 down to 0.18 at quality 100.

    :param total_compressed_size: Total size of compressed pages.
    :param original_size: Original PDF size.
    :param image_quality: Compression quality.
    :return: True if compression was successful, false otherwise.
    """
    overhead_ratio = lerp(0.54, 0.18, image_quality / 100)
    estimated_size = total_compressed_size + total_compressed_size * overhead_ratio
    return estimated_size < original_size
187+
188+
189+
def rasterize_page(  # type: ignore
    page: pdfium.PdfPage,
    quality: int = 85,
) -> bytes:
    """
    Renders a PDF page and encodes the result as a JPEG.

    :param page: PdfPage object to rasterize.
    :param quality: Quality to apply during rasterization.
    :return: Rasterized page as bytes.
    """
    bitmap = page.render()
    pil_image = bitmap.to_pil()
    jpeg_stream = BytesIO()
    pil_image.save(jpeg_stream, format="JPEG", quality=quality)
    return jpeg_stream.getvalue()
204+
205+
206+
def lerp(start: float, end: float, t: float) -> float:
    """
    Linearly interpolates between two numbers.

    :param start: Value returned when ``t`` is 0.
    :param end: Value returned when ``t`` is 1.
    :param t: The interpolation factor (0 to 1).
    :return: The interpolated value.
    """
    start_weight = 1 - t
    return start * start_weight + end * t

0 commit comments

Comments
 (0)