1
1
import io
2
2
import logging
3
+ from ctypes import c_char_p , c_ushort
3
4
from threading import RLock
4
5
from typing import BinaryIO , List , Optional , Union
5
6
6
7
import pypdfium2 as pdfium
7
8
import pypdfium2 .raw as pdfium_c
9
+ from _ctypes import POINTER
8
10
9
11
from mindee .image_operations .image_compressor import compress_image
10
12
from mindee .pdf .pdf_char_data import PDFCharData
@@ -34,9 +36,12 @@ def compress_pdf(
34
36
:return: Compressed PDF as bytes.
35
37
"""
36
38
if not isinstance (pdf_data , bytes ):
37
- pdf_data = pdf_data .read ()
39
+ pdf_bytes = pdf_data .read ()
40
+ pdf_data .seek (0 )
41
+ else :
42
+ pdf_bytes = pdf_data
38
43
39
- if has_source_text (pdf_data ):
44
+ if has_source_text (pdf_bytes ):
40
45
if force_source_text_compression :
41
46
if not disable_source_text :
42
47
logger .warning ("Re-writing PDF source-text is an EXPERIMENTAL feature." )
@@ -50,29 +55,29 @@ def compress_pdf(
50
55
"Found text inside of the provided PDF file. Compression operation aborted since disableSourceText "
51
56
"is set to 'true'."
52
57
)
53
- return pdf_data
58
+ return pdf_bytes
54
59
55
60
extracted_text = (
56
- extract_text_from_pdf (pdf_data ) if not disable_source_text else None
61
+ extract_text_from_pdf (pdf_bytes ) if not disable_source_text else None
57
62
)
58
63
59
64
compressed_pages = compress_pdf_pages (
60
- pdf_data , extracted_text , image_quality , disable_source_text
65
+ pdf_bytes , extracted_text , image_quality , disable_source_text
61
66
)
62
67
63
68
if not compressed_pages :
64
69
logger .warning (
65
70
"Could not compress PDF to a smaller size. Returning original PDF."
66
71
)
67
- return pdf_data
72
+ return pdf_bytes
68
73
69
74
out_pdf = attach_images_as_new_file (
70
75
[io .BytesIO (compressed_page ) for compressed_page in compressed_pages ]
71
76
)
72
- out_bytes = io .BytesIO ()
73
- out_pdf .save (out_bytes )
74
-
75
- return out_bytes .read ()
77
+ out_buffer = io .BytesIO ()
78
+ out_pdf .save (out_buffer )
79
+ out_buffer . seek ( 0 )
80
+ return out_buffer .read ()
76
81
77
82
78
83
def compress_pdf_pages (
@@ -110,40 +115,40 @@ def compress_pdf_pages(
110
115
111
116
112
117
def add_text_to_pdf_page(  # type: ignore
    document: pdfium.PdfDocument,
    page_id: int,
    extracted_text: Optional[List[PDFCharData]],
) -> None:
    """
    Adds text to a PDF page based on the extracted text data.

    Every entry of ``extracted_text`` is written as its own text object,
    positioned at ``(left, page_height - top)``, and the page content stream
    is regenerated once after all objects have been inserted.

    :param document: The PDFDocument object.
    :param page_id: ID of the current page.
    :param extracted_text: List of PDFCharData objects containing text and positioning information.
    """
    if not extracted_text:
        return

    # Resolve the page exactly once so that every call below operates on the
    # same page handle. NOTE(review): indexing the document is assumed to load
    # a fresh handle on each access -- confirm against the pypdfium2 version in use.
    page = document[page_id]
    # NOTE(review): this lock is created per call, so it cannot serialise
    # concurrent callers; kept only to preserve the existing structure.
    pdfium_lock = RLock()

    with pdfium_lock:
        try:
            height = page.get_height()
            for char_data in extracted_text:
                font_name = c_char_p(char_data.font_name.encode("utf-8"))
                text_object = pdfium_c.FPDFPageObj_NewTextObj(
                    document.raw, font_name, char_data.font_size
                )
                if not text_object:
                    # PDFium returned a NULL handle (e.g. unknown font name):
                    # there is nothing to fill in or insert for this entry.
                    continue

                # FPDFText_SetText expects a NUL-terminated UTF-16LE string.
                # Encoding the text (instead of taking ord()) also copes with
                # multi-character strings and characters outside the BMP.
                encoded = (char_data.char + "\x00").encode("utf-16-le")
                wide_text = (c_ushort * (len(encoded) // 2)).from_buffer_copy(
                    encoded
                )
                pdfium_c.FPDFText_SetText(text_object, wide_text)
                pdfium_c.FPDFPageObj_Transform(
                    text_object, 1, 0, 0, 1, char_data.left, height - char_data.top
                )
                # After insertion the page owns the object, so it must NOT be
                # passed to FPDFPageObj_Destroy afterwards.
                pdfium_c.FPDFPage_InsertObject(page.raw, text_object)

            pdfium_c.FPDFPage_GenerateContent(page.raw)
        finally:
            # Close through the wrapper rather than FPDF_ClosePage on the raw
            # handle, so the handle is not released a second time later on.
            page.close()
147
152
148
153
149
154
def compress_pages_with_quality (
@@ -164,12 +169,12 @@ def compress_pages_with_quality(
164
169
pdf_document = pdfium .PdfDocument (pdf_data )
165
170
compressed_pages = []
166
171
167
- for [_ , page ] in enumerate (pdf_document ):
172
+ for [i , page ] in enumerate (pdf_document ):
168
173
rasterized_page = rasterize_page (page , image_quality )
169
174
compressed_image = compress_image (rasterized_page , image_quality )
170
175
171
176
if not disable_source_text :
172
- add_text_to_pdf_page (page , extracted_text )
177
+ add_text_to_pdf_page (pdf_document , i , extracted_text )
173
178
174
179
compressed_pages .append (compressed_image )
175
180
0 commit comments