Skip to content

Commit 940a574

Browse files
it works
1 parent 69228ca commit 940a574

File tree

6 files changed

+180
-142
lines changed

6 files changed

+180
-142
lines changed

mindee/extraction/common/image_extractor.py

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import io
2-
from typing import List
2+
from typing import BinaryIO, List
33

44
import pypdfium2 as pdfium
55
from PIL import Image
@@ -10,7 +10,6 @@
1010
from mindee.geometry.polygon import get_min_max_x, get_min_max_y
1111
from mindee.input.sources.bytes_input import BytesInput
1212
from mindee.input.sources.local_input_source import LocalInputSource
13-
from mindee.pdf.pdf_utils import attach_images_as_new_file
1413

1514

1615
def extract_image_from_polygon(
@@ -131,3 +130,34 @@ def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: i
131130
return pdfium.PdfDocument(input_file.file_object.read())
132131

133132
return attach_images_as_new_file([input_file.file_object])
133+
134+
135+
def attach_images_as_new_file(  # type: ignore
    input_buffer_list: List[BinaryIO],
) -> pdfium.PdfDocument:
    """
    Attaches a list of images as new pages in a PdfDocument object.

    Each input image is re-encoded as an RGB JPEG and inserted on its own page,
    whose dimensions are the size reported for the embedded image.

    :param input_buffer_list: List of images, represented as buffers.
    :return: A PdfDocument handle.
    """
    pdf = pdfium.PdfDocument.new()
    for input_buffer in input_buffer_list:
        input_buffer.seek(0)
        image = Image.open(input_buffer)
        try:
            # convert() returns a new image rather than converting in place, so
            # the converted copy is the one that has to be written out as JPEG.
            rgb_image = image.convert("RGB")
            image_buffer = io.BytesIO()
            rgb_image.save(image_buffer, format="JPEG")
        finally:
            # Release the source image even when decoding or encoding fails.
            image.close()

        image_pdf = pdfium.PdfImage.new(pdf)
        image_pdf.load_jpeg(image_buffer)
        width, height = image_pdf.get_size()

        # Scale the image object so it covers the whole page.
        matrix = pdfium.PdfMatrix().scale(width, height)
        image_pdf.set_matrix(matrix)

        page = pdf.new_page(width, height)
        page.insert_obj(image_pdf)
        page.gen_content()
    return pdf

mindee/pdf/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from mindee.pdf.pdf_char_data import PDFCharData
22
from mindee.pdf.pdf_compressor import compress_pdf
33
from mindee.pdf.pdf_utils import (
4-
attach_images_as_new_file,
54
extract_text_from_pdf,
65
has_source_text,
76
)

mindee/pdf/pdf_char_data.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,5 @@ class PDFCharData:
2828
"""RGBA representation of the font's stroke color."""
2929
font_fill_color: Tuple[int, int, int, int]
3030
"""RGBA representation of the font's fill color."""
31+
page_id: int
32+
"""ID of the page the character was found on."""

mindee/pdf/pdf_compressor.py

Lines changed: 56 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,16 @@
22
import logging
33
from ctypes import c_char_p, c_ushort
44
from threading import RLock
5-
from typing import BinaryIO, List, Optional, Union
5+
from typing import BinaryIO, List, Optional, Tuple, Union
66

77
import pypdfium2 as pdfium
88
import pypdfium2.raw as pdfium_c
99
from _ctypes import POINTER
10+
from PIL import Image
1011

1112
from mindee.image_operations.image_compressor import compress_image
1213
from mindee.pdf.pdf_char_data import PDFCharData
1314
from mindee.pdf.pdf_utils import (
14-
attach_images_as_new_file,
1515
extract_text_from_pdf,
1616
has_source_text,
1717
)
@@ -61,19 +61,22 @@ def compress_pdf(
6161
extract_text_from_pdf(pdf_bytes) if not disable_source_text else None
6262
)
6363

64-
compressed_pages = compress_pdf_pages(
65-
pdf_bytes, extracted_text, image_quality, disable_source_text
66-
)
64+
compressed_pages = compress_pdf_pages(pdf_bytes, image_quality)
6765

6866
if not compressed_pages:
6967
logger.warning(
7068
"Could not compress PDF to a smaller size. Returning original PDF."
7169
)
7270
return pdf_bytes
7371

74-
out_pdf = attach_images_as_new_file(
75-
[io.BytesIO(compressed_page) for compressed_page in compressed_pages]
72+
out_pdf = collect_images_as_pdf(
73+
[compressed_page_image[0] for compressed_page_image in compressed_pages]
7674
)
75+
76+
if not disable_source_text:
77+
for i, page in enumerate(out_pdf):
78+
add_text_to_pdf_page(page, i, extracted_text)
79+
7780
out_buffer = io.BytesIO()
7881
out_pdf.save(out_buffer)
7982
out_buffer.seek(0)
@@ -82,26 +85,20 @@ def compress_pdf(
8285

8386
def compress_pdf_pages(
8487
pdf_data: bytes,
85-
extracted_text: Optional[List[PDFCharData]],
8688
image_quality: int,
87-
disable_source_text: bool,
88-
) -> Optional[List[bytes]]:
89+
) -> Optional[List[Tuple[bytes, int, int]]]:
8990
"""
9091
Compresses PDF pages and returns an array of compressed page buffers.
9192
9293
:param pdf_data: The input PDF as bytes.
93-
:param extracted_text: Extracted text from the PDF.
9494
:param image_quality: Initial compression quality.
95-
:param disable_source_text: If true, doesn't re-apply source text to the output PDF.
9695
:return: List of compressed page buffers, or None if compression fails.
9796
"""
9897
original_size = len(pdf_data)
9998
image_quality_loop = image_quality
10099

101100
while image_quality_loop >= MIN_QUALITY:
102-
compressed_pages = compress_pages_with_quality(
103-
pdf_data, extracted_text, image_quality_loop, disable_source_text
104-
)
101+
compressed_pages = compress_pages_with_quality(pdf_data, image_quality_loop)
105102
total_compressed_size = sum(len(page) for page in compressed_pages)
106103

107104
if is_compression_successful(
@@ -115,28 +112,28 @@ def compress_pdf_pages(
115112

116113

117114
def add_text_to_pdf_page( # type: ignore
118-
document: pdfium.PdfDocument,
115+
page: pdfium.PdfPage,
119116
page_id: int,
120-
extracted_text: Optional[List[PDFCharData]],
117+
extracted_text: Optional[List[List[PDFCharData]]],
121118
) -> None:
122119
"""
123120
Adds text to a PDF page based on the extracted text data.
124121
125-
:param document: The PDFDocument object.
126-
:param page_id: ID of the current page.
122+
:param page: The PdfPage object to add the text to.
123+
:param page_id: The ID of the page.
127124
:param extracted_text: List of PDFCharData objects containing text and positioning information.
128125
"""
129-
if not extracted_text:
126+
if not extracted_text or not extracted_text[page_id]:
130127
return
131128

132-
height = document[page_id].get_height()
129+
height = page.get_height()
133130
pdfium_lock = RLock()
134131

135132
with pdfium_lock:
136-
for char_data in extracted_text:
133+
for char_data in extracted_text[page_id]:
137134
font_name = c_char_p(char_data.font_name.encode("utf-8"))
138135
text_handler = pdfium_c.FPDFPageObj_NewTextObj(
139-
document.raw, font_name, char_data.font_size
136+
page.pdf.raw, font_name, char_data.font_size
140137
)
141138
char_code = ord(char_data.char)
142139
char_code_c_char = c_ushort(char_code)
@@ -145,38 +142,28 @@ def add_text_to_pdf_page( # type: ignore
145142
pdfium_c.FPDFPageObj_Transform(
146143
text_handler, 1, 0, 0, 1, char_data.left, height - char_data.top
147144
)
148-
pdfium_c.FPDFPage_InsertObject(document[page_id].raw, text_handler)
149-
pdfium_c.FPDFPageObj_Destroy(text_handler)
150-
pdfium_c.FPDFPage_GenerateContent(document[page_id].raw)
151-
pdfium_c.FPDF_ClosePage(document[page_id].raw)
145+
pdfium_c.FPDFPage_InsertObject(page.raw, text_handler)
146+
pdfium_c.FPDFPage_GenerateContent(page.raw)
152147

153148

154149
def compress_pages_with_quality(
155150
pdf_data: bytes,
156-
extracted_text: Optional[list[PDFCharData]],
157151
image_quality: int,
158-
disable_source_text: bool,
159-
) -> List[bytes]:
152+
) -> List[Tuple[bytes, int, int]]:
160153
"""
161154
Compresses pages with a specific quality.
162155
163156
:param pdf_data: The input PDF as bytes.
164-
:param extracted_text: Extracted text from the PDF.
165157
:param image_quality: Compression quality.
166-
:param disable_source_text: If true, doesn't re-apply source text to the output PDF.
167158
:return: List of compressed page buffers.
168159
"""
169160
pdf_document = pdfium.PdfDocument(pdf_data)
170161
compressed_pages = []
171-
172-
for [i, page] in enumerate(pdf_document):
162+
for page in pdf_document:
173163
rasterized_page = rasterize_page(page, image_quality)
174164
compressed_image = compress_image(rasterized_page, image_quality)
175-
176-
if not disable_source_text:
177-
add_text_to_pdf_page(pdf_document, i, extracted_text)
178-
179-
compressed_pages.append(compressed_image)
165+
image = Image.open(io.BytesIO(compressed_image))
166+
compressed_pages.append((compressed_image, image.size[0], image.size[1]))
180167

181168
return compressed_pages
182169

@@ -223,3 +210,33 @@ def lerp(start: float, end: float, t: float) -> float:
223210
:return: The interpolated value.
224211
"""
225212
return start * (1 - t) + end * t
213+
214+
215+
def collect_images_as_pdf(image_list: List[bytes]) -> pdfium.PdfDocument:  # type: ignore
    """
    Converts a list of JPEG images into pages in a PdfDocument.

    Each image gets its own page, created with the width and height reported
    for the loaded image.

    :param image_list: A list of bytes representing JPEG images.
    :return: A PdfDocument handle containing the images as pages.
    """
    # Create a new, empty PdfDocument
    out_pdf = pdfium.PdfDocument.new()

    for image_bytes in image_list:
        # Load the JPEG image into a PdfImage object
        pdf_image = pdfium.PdfImage.new(out_pdf)
        pdf_image.load_jpeg(io.BytesIO(image_bytes))

        # Get the dimensions of the image
        width, height = pdf_image.get_size()

        # Scale the image object to its own dimensions so that it fills the
        # page instead of being drawn with the default (unscaled) matrix.
        pdf_image.set_matrix(pdfium.PdfMatrix().scale(width, height))

        # Create a new page in the PDF with the same dimensions as the image
        page = out_pdf.new_page(width, height)

        # Place the image on the page
        page.insert_obj(pdf_image)

        # Generate content for the page to finalize it
        page.gen_content()

    return out_pdf

0 commit comments

Comments
 (0)