Skip to content

Commit 69228ca

Browse files
Fix a few issues... but it segfaults :D
1 parent e866e84 commit 69228ca

File tree

4 files changed

+86
-72
lines changed

4 files changed

+86
-72
lines changed

mindee/pdf/pdf_char_data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ class PDFCharData:
1818
"""Bottom bound."""
1919
font_name: str
2020
"""The font name."""
21-
font_size: int
21+
font_size: float
2222
"""The font size in pt."""
2323
font_weight: int
2424
"""The font weight."""

mindee/pdf/pdf_compressor.py

Lines changed: 36 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
import io
22
import logging
3+
from ctypes import c_char_p, c_ushort
34
from threading import RLock
45
from typing import BinaryIO, List, Optional, Union
56

67
import pypdfium2 as pdfium
78
import pypdfium2.raw as pdfium_c
9+
from _ctypes import POINTER
810

911
from mindee.image_operations.image_compressor import compress_image
1012
from mindee.pdf.pdf_char_data import PDFCharData
@@ -34,9 +36,12 @@ def compress_pdf(
3436
:return: Compressed PDF as bytes.
3537
"""
3638
if not isinstance(pdf_data, bytes):
37-
pdf_data = pdf_data.read()
39+
pdf_bytes = pdf_data.read()
40+
pdf_data.seek(0)
41+
else:
42+
pdf_bytes = pdf_data
3843

39-
if has_source_text(pdf_data):
44+
if has_source_text(pdf_bytes):
4045
if force_source_text_compression:
4146
if not disable_source_text:
4247
logger.warning("Re-writing PDF source-text is an EXPERIMENTAL feature.")
@@ -50,29 +55,29 @@ def compress_pdf(
5055
"Found text inside of the provided PDF file. Compression operation aborted since disableSourceText "
5156
"is set to 'true'."
5257
)
53-
return pdf_data
58+
return pdf_bytes
5459

5560
extracted_text = (
56-
extract_text_from_pdf(pdf_data) if not disable_source_text else None
61+
extract_text_from_pdf(pdf_bytes) if not disable_source_text else None
5762
)
5863

5964
compressed_pages = compress_pdf_pages(
60-
pdf_data, extracted_text, image_quality, disable_source_text
65+
pdf_bytes, extracted_text, image_quality, disable_source_text
6166
)
6267

6368
if not compressed_pages:
6469
logger.warning(
6570
"Could not compress PDF to a smaller size. Returning original PDF."
6671
)
67-
return pdf_data
72+
return pdf_bytes
6873

6974
out_pdf = attach_images_as_new_file(
7075
[io.BytesIO(compressed_page) for compressed_page in compressed_pages]
7176
)
72-
out_bytes = io.BytesIO()
73-
out_pdf.save(out_bytes)
74-
75-
return out_bytes.read()
77+
out_buffer = io.BytesIO()
78+
out_pdf.save(out_buffer)
79+
out_buffer.seek(0)
80+
return out_buffer.read()
7681

7782

7883
def compress_pdf_pages(
@@ -110,40 +115,40 @@ def compress_pdf_pages(
110115

111116

112117
def add_text_to_pdf_page( # type: ignore
113-
page: pdfium.PdfPage,
118+
document: pdfium.PdfDocument,
119+
page_id: int,
114120
extracted_text: Optional[List[PDFCharData]],
115121
) -> None:
116122
"""
117123
Adds text to a PDF page based on the extracted text data.
118124
119-
:param page: The PdfPage object to add text to.
125+
:param document: The PDFDocument object.
126+
:param page_id: ID of the current page.
120127
:param extracted_text: List of PDFCharData objects containing text and positioning information.
121128
"""
122129
if not extracted_text:
123130
return
124131

125-
height = page.get_height()
126-
document = page.pdf
132+
height = document[page_id].get_height()
127133
pdfium_lock = RLock()
128134

129135
with pdfium_lock:
130-
text_handler = pdfium_c.FPDFText_LoadPage(page.raw)
131136
for char_data in extracted_text:
132-
font = document.load_font(
133-
char_data.font_name, pdfium_c.FPDF_FONT_TRUETYPE, True
137+
font_name = c_char_p(char_data.font_name.encode("utf-8"))
138+
text_handler = pdfium_c.FPDFPageObj_NewTextObj(
139+
document.raw, font_name, char_data.font_size
134140
)
135-
text_object = document.create_text_object(font, char_data.font_size)
136-
text_object.set_text(char_data.char)
137-
x = char_data.left
138-
y = height - char_data.bottom
139-
text_object.set_position(x, y)
140-
r, g, b, a = char_data.font_fill_color
141-
text_object.set_fill_color(r, g, b, a)
142-
pdfium_c.FPDFPage_InsertObject(text_handler, text_object)
143-
pdfium_c.FPDFPage_GenerateContent(text_handler)
144-
145-
with pdfium_lock:
146-
pdfium_c.FPDFText_ClosePage(text_handler)
141+
char_code = ord(char_data.char)
142+
char_code_c_char = c_ushort(char_code)
143+
char_ptr = POINTER(c_ushort)(char_code_c_char)
144+
pdfium_c.FPDFText_SetText(text_handler, char_ptr)
145+
pdfium_c.FPDFPageObj_Transform(
146+
text_handler, 1, 0, 0, 1, char_data.left, height - char_data.top
147+
)
148+
pdfium_c.FPDFPage_InsertObject(document[page_id].raw, text_handler)
149+
pdfium_c.FPDFPageObj_Destroy(text_handler)
150+
pdfium_c.FPDFPage_GenerateContent(document[page_id].raw)
151+
pdfium_c.FPDF_ClosePage(document[page_id].raw)
147152

148153

149154
def compress_pages_with_quality(
@@ -164,12 +169,12 @@ def compress_pages_with_quality(
164169
pdf_document = pdfium.PdfDocument(pdf_data)
165170
compressed_pages = []
166171

167-
for [_, page] in enumerate(pdf_document):
172+
for [i, page] in enumerate(pdf_document):
168173
rasterized_page = rasterize_page(page, image_quality)
169174
compressed_image = compress_image(rasterized_page, image_quality)
170175

171176
if not disable_source_text:
172-
add_text_to_pdf_page(page, extracted_text)
177+
add_text_to_pdf_page(pdf_document, i, extracted_text)
173178

174179
compressed_pages.append(compressed_image)
175180

mindee/pdf/pdf_utils.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import ctypes
12
import io
23
from ctypes import byref, c_double, c_int, create_string_buffer
34
from threading import RLock
@@ -125,23 +126,30 @@ def get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict:
125126
:param pdfium_lock: Lock for thread-safe operations.
126127
:return: A dictionary containing character information.
127128
"""
129+
stroke = (ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint())
130+
fill = (ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint())
131+
128132
with pdfium_lock:
129133
char = chr(pdfium_c.FPDFText_GetUnicode(text_handler, i))
130134
font_name = get_font_name(text_handler, i)
131135
font_flags = get_font_flags(text_handler, i)
132136
font_size = pdfium_c.FPDFText_GetFontSize(text_handler, i)
133137
font_weight = pdfium_c.FPDFText_GetFontWeight(text_handler, i)
134-
font_stroke_color = pdfium_c.FPDFText_GetStrokeColor(text_handler, i)
135-
font_fill_color = pdfium_c.FPDFText_GetFillColor(text_handler, i)
138+
_ = pdfium_c.FPDFText_GetStrokeColor(
139+
text_handler, i, stroke[0], stroke[1], stroke[2], stroke[3]
140+
)
141+
_ = pdfium_c.FPDFText_GetFillColor(
142+
text_handler, i, fill[0], fill[1], fill[2], fill[3]
143+
)
136144

137145
return {
138146
"char": char,
139147
"font_name": font_name,
140148
"font_flags": font_flags,
141149
"font_size": font_size,
142150
"font_weight": font_weight,
143-
"font_stroke_color": font_stroke_color,
144-
"font_fill_color": font_fill_color,
151+
"font_stroke_color": stroke,
152+
"font_fill_color": fill,
145153
}
146154

147155

tests/input/test_compression.py

Lines changed: 37 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ def test_image_quality_compress_from_input_source():
1919

2020
with open(OUTPUT_DIR / "compress_indirect.jpg", "wb") as f:
2121
f.write(receipt_input.file_object.read())
22+
receipt_input.file_object.seek(0)
2223

2324
initial_file_stats = os.stat(DATA_DIR / "file_types/receipt.jpg")
2425
rendered_file_stats = os.stat(OUTPUT_DIR / "compress_indirect.jpg")
@@ -62,6 +63,7 @@ def test_image_resize_from_input_source():
6263
image_resize_input.compress(75, 250, 1000)
6364
with open(OUTPUT_DIR / "resize_indirect.jpg", "wb") as f:
6465
f.write(image_resize_input.file_object.read())
66+
image_resize_input.file_object.seek(0)
6567

6668
initial_file_stats = os.stat(DATA_DIR / "file_types/receipt.jpg")
6769
rendered_file_stats = os.stat(OUTPUT_DIR / "resize_indirect.jpg")
@@ -143,8 +145,8 @@ def test_pdf_compress_from_compressor():
143145
resizes = []
144146
qualities = [85, 75, 50, 10]
145147
for quality in qualities:
146-
pdf_resize_input.file_object.seek(0)
147148
resizes.append(compress_pdf(pdf_resize_input.file_object, quality))
149+
pdf_resize_input.file_object.seek(0)
148150

149151
file_names = [
150152
"compress85.pdf",
@@ -172,12 +174,11 @@ def test_pdf_compress_with_text_keeps_text():
172174

173175
compressed_with_text = compress_pdf(initial_with_text.file_object, 100, True, False)
174176

175-
original_text = "".join(
176-
[
177-
text_info.char
178-
for text_info in extract_text_from_pdf(initial_with_text.file_object.read())
179-
]
180-
)
177+
text_chars = []
178+
for text_info in extract_text_from_pdf(initial_with_text.file_object.read()):
179+
text_chars.append(text_info.char)
180+
initial_with_text.file_object.seek(0)
181+
original_text = "".join(text_chars)
181182
compressed_text = "".join(
182183
[text_info.char for text_info in extract_text_from_pdf(compressed_with_text)]
183184
)
@@ -193,32 +194,32 @@ def test_pdf_compress_with_text_does_not_compress():
193194
assert compressed_with_text == initial_with_text.file_object
194195

195196

196-
@pytest.fixture(scope="module", autouse=True)
197-
def cleanup():
198-
yield
199-
created_files = [
200-
"compress10.pdf",
201-
"compress50.pdf",
202-
"compress75.pdf",
203-
"compress85.pdf",
204-
"resize_indirect.pdf",
205-
"compress1.jpg",
206-
"compress10.jpg",
207-
"compress50.jpg",
208-
"compress75.jpg",
209-
"compress100.jpg",
210-
"compress_indirect.jpg",
211-
"resize250x500.jpg",
212-
"resize500x250.jpg",
213-
"resize500xnull.jpg",
214-
"resize_indirect.jpg",
215-
"resizenullx250.jpg",
216-
]
217-
218-
for file_path in created_files:
219-
full_path = DATA_DIR / "output" / file_path
220-
if full_path.exists():
221-
try:
222-
os.remove(full_path)
223-
except OSError as e:
224-
print(f"Could not delete file '{file_path}': {e.strerror}")
197+
# @pytest.fixture(scope="module", autouse=True)
198+
# def cleanup():
199+
# yield
200+
# created_files = [
201+
# "compress10.pdf",
202+
# "compress50.pdf",
203+
# "compress75.pdf",
204+
# "compress85.pdf",
205+
# "resize_indirect.pdf",
206+
# "compress1.jpg",
207+
# "compress10.jpg",
208+
# "compress50.jpg",
209+
# "compress75.jpg",
210+
# "compress100.jpg",
211+
# "compress_indirect.jpg",
212+
# "resize250x500.jpg",
213+
# "resize500x250.jpg",
214+
# "resize500xnull.jpg",
215+
# "resize_indirect.jpg",
216+
# "resizenullx250.jpg",
217+
# ]
218+
#
219+
# for file_path in created_files:
220+
# full_path = DATA_DIR / "output" / file_path
221+
# if full_path.exists():
222+
# try:
223+
# os.remove(full_path)
224+
# except OSError as e:
225+
# print(f"Could not delete file '{file_path}': {e.strerror}")

0 commit comments

Comments
 (0)