2
2
import logging
3
3
from ctypes import c_char_p , c_ushort
4
4
from threading import RLock
5
- from typing import BinaryIO , List , Optional , Union
5
+ from typing import BinaryIO , List , Optional , Tuple , Union
6
6
7
7
import pypdfium2 as pdfium
8
8
import pypdfium2 .raw as pdfium_c
9
9
from _ctypes import POINTER
10
+ from PIL import Image
10
11
11
12
from mindee .image_operations .image_compressor import compress_image
12
13
from mindee .pdf .pdf_char_data import PDFCharData
13
14
from mindee .pdf .pdf_utils import (
14
- attach_images_as_new_file ,
15
15
extract_text_from_pdf ,
16
16
has_source_text ,
17
17
)
@@ -61,19 +61,22 @@ def compress_pdf(
61
61
extract_text_from_pdf (pdf_bytes ) if not disable_source_text else None
62
62
)
63
63
64
- compressed_pages = compress_pdf_pages (
65
- pdf_bytes , extracted_text , image_quality , disable_source_text
66
- )
64
+ compressed_pages = compress_pdf_pages (pdf_bytes , image_quality )
67
65
68
66
if not compressed_pages :
69
67
logger .warning (
70
68
"Could not compress PDF to a smaller size. Returning original PDF."
71
69
)
72
70
return pdf_bytes
73
71
74
- out_pdf = attach_images_as_new_file (
75
- [io . BytesIO ( compressed_page ) for compressed_page in compressed_pages ]
72
+ out_pdf = collect_images_as_pdf (
73
+ [compressed_page_image [ 0 ] for compressed_page_image in compressed_pages ]
76
74
)
75
+
76
+ if not disable_source_text :
77
+ for i , page in enumerate (out_pdf ):
78
+ add_text_to_pdf_page (page , i , extracted_text )
79
+
77
80
out_buffer = io .BytesIO ()
78
81
out_pdf .save (out_buffer )
79
82
out_buffer .seek (0 )
@@ -82,26 +85,20 @@ def compress_pdf(
82
85
83
86
def compress_pdf_pages (
84
87
pdf_data : bytes ,
85
- extracted_text : Optional [List [PDFCharData ]],
86
88
image_quality : int ,
87
- disable_source_text : bool ,
88
- ) -> Optional [List [bytes ]]:
89
+ ) -> Optional [List [Tuple [bytes , int , int ]]]:
89
90
"""
90
91
Compresses PDF pages and returns an array of compressed page buffers.
91
92
92
93
:param pdf_data: The input PDF as bytes.
93
- :param extracted_text: Extracted text from the PDF.
94
94
:param image_quality: Initial compression quality.
95
- :param disable_source_text: If true, doesn't re-apply source text to the output PDF.
96
95
:return: List of (compressed page bytes, width, height) tuples, or None if compression fails.
97
96
"""
98
97
original_size = len (pdf_data )
99
98
image_quality_loop = image_quality
100
99
101
100
while image_quality_loop >= MIN_QUALITY :
102
- compressed_pages = compress_pages_with_quality (
103
- pdf_data , extracted_text , image_quality_loop , disable_source_text
104
- )
101
+ compressed_pages = compress_pages_with_quality (pdf_data , image_quality_loop )
105
102
total_compressed_size = sum (len (page ) for page in compressed_pages )
106
103
107
104
if is_compression_successful (
@@ -115,28 +112,28 @@ def compress_pdf_pages(
115
112
116
113
117
114
def add_text_to_pdf_page ( # type: ignore
118
- document : pdfium .PdfDocument ,
115
+ page : pdfium .PdfPage ,
119
116
page_id : int ,
120
- extracted_text : Optional [List [PDFCharData ]],
117
+ extracted_text : Optional [List [List [ PDFCharData ] ]],
121
118
) -> None :
122
119
"""
123
120
Adds text to a PDF page based on the extracted text data.
124
121
125
- :param document : The PDFDocument object.
126
- :param page_id: ID of the current page.
122
+ :param page : The PdfPage object to add the text to.
123
+ :param page_id: The ID of the page.
127
124
:param extracted_text: List of PDFCharData objects containing text and positioning information.
128
125
"""
129
- if not extracted_text :
126
+ if not extracted_text or not extracted_text [ page_id ] :
130
127
return
131
128
132
- height = document [ page_id ] .get_height ()
129
+ height = page .get_height ()
133
130
pdfium_lock = RLock ()
134
131
135
132
with pdfium_lock :
136
- for char_data in extracted_text :
133
+ for char_data in extracted_text [ page_id ] :
137
134
font_name = c_char_p (char_data .font_name .encode ("utf-8" ))
138
135
text_handler = pdfium_c .FPDFPageObj_NewTextObj (
139
- document .raw , font_name , char_data .font_size
136
+ page . pdf .raw , font_name , char_data .font_size
140
137
)
141
138
char_code = ord (char_data .char )
142
139
char_code_c_char = c_ushort (char_code )
@@ -145,38 +142,28 @@ def add_text_to_pdf_page( # type: ignore
145
142
pdfium_c .FPDFPageObj_Transform (
146
143
text_handler , 1 , 0 , 0 , 1 , char_data .left , height - char_data .top
147
144
)
148
- pdfium_c .FPDFPage_InsertObject (document [page_id ].raw , text_handler )
149
- pdfium_c .FPDFPageObj_Destroy (text_handler )
150
- pdfium_c .FPDFPage_GenerateContent (document [page_id ].raw )
151
- pdfium_c .FPDF_ClosePage (document [page_id ].raw )
145
+ pdfium_c .FPDFPage_InsertObject (page .raw , text_handler )
146
+ pdfium_c .FPDFPage_GenerateContent (page .raw )
152
147
153
148
154
149
def compress_pages_with_quality(
    pdf_data: bytes,
    image_quality: int,
) -> List[Tuple[bytes, int, int]]:
    """
    Compresses pages with a specific quality.

    Every page of the input document is rasterized, then compressed as an image.
    The dimensions of each compressed image are read back and returned with it.

    :param pdf_data: The input PDF as bytes.
    :param image_quality: Compression quality.
    :return: List of (compressed image bytes, width, height) tuples, one per page.
    """
    pdf_document = pdfium.PdfDocument(pdf_data)
    compressed_pages = []
    for page in pdf_document:
        rasterized_page = rasterize_page(page, image_quality)
        compressed_image = compress_image(rasterized_page, image_quality)
        # Only the dimensions are needed: open the image in a context manager
        # so its handle is released as soon as the size has been read.
        with Image.open(io.BytesIO(compressed_image)) as image:
            width, height = image.size
        compressed_pages.append((compressed_image, width, height))

    return compressed_pages
182
169
@@ -223,3 +210,33 @@ def lerp(start: float, end: float, t: float) -> float:
223
210
:return: The interpolated value.
224
211
"""
225
212
return start * (1 - t ) + end * t
213
+
214
+
215
def collect_images_as_pdf(image_list: List[bytes]) -> pdfium.PdfDocument:  # type: ignore
    """
    Converts a list of JPEG images into pages in a PdfDocument.

    Each image is placed on its own page. The page is created with the size
    reported by the image, and the image is scaled to cover the whole page.

    :param image_list: A list of bytes representing JPEG images.
    :return: A PdfDocument handle containing the images as pages.
    """
    # Create a new, empty PdfDocument
    out_pdf = pdfium.PdfDocument.new()

    for image_bytes in image_list:
        # Load the JPEG image into a PdfImage object
        pdf_image = pdfium.PdfImage.new(out_pdf)
        pdf_image.load_jpeg(io.BytesIO(image_bytes))

        # Get the dimensions of the image
        width, height = pdf_image.get_size()

        # A PDF image object is drawn in a 1x1 unit square unless it is given a
        # transform matrix; scale it so that it fills the page created below.
        pdf_image.set_matrix(pdfium.PdfMatrix().scale(width, height))

        # Create a new page in the PDF with the same dimensions as the image
        page = out_pdf.new_page(width, height)

        # Place the image on the page
        page.insert_obj(pdf_image)

        # Generate content for the page to finalize it
        page.gen_content()

    return out_pdf
0 commit comments