4
4
import pypdfium2 as pdfium
5
5
from PIL import Image
6
6
7
+ from mindee .error import MindeeError
7
8
from mindee .geometry import Point , get_min_max_x , get_min_max_y
8
9
from mindee .image_extraction .common import ExtractedImage
9
10
from mindee .input import BytesInput , LocalInputSource
@@ -15,7 +16,7 @@ def attach_image_as_new_file( # type: ignore
15
16
"""
16
17
Attaches an image as a new page in a PdfDocument object.
17
18
18
- :param input_buffer: Input buffer. Only supports JPEG.
19
+ :param input_buffer: Input buffer.
19
20
:return: A PdfDocument handle.
20
21
"""
21
22
# Create a new page in the PdfDocument
@@ -41,6 +42,75 @@ def attach_image_as_new_file( # type: ignore
41
42
return pdf
42
43
43
44
45
def extract_image_from_polygon(
    page_content: Image.Image,
    polygon: List[Point],
    width: float,
    height: float,
    file_format: str,
) -> bytes:
    """
    Cut the bounding box of a polygon out of a page and encode it.

    The smallest and largest x/y values of the polygon are multiplied by
    ``width`` and ``height`` respectively, then truncated to integers, to
    build the crop box.

    :param page_content: Contents of the page as a Pillow image.
    :param polygon: Points outlining the area to extract.
    :param width: Factor applied to the polygon's x coordinates.
    :param height: Factor applied to the polygon's y coordinates.
    :param file_format: Format used to encode the cropped image.
    :return: The encoded cropped image as bytes.
    """
    x_bounds = get_min_max_x(polygon)
    y_bounds = get_min_max_y(polygon)

    # Pillow expects the crop box as (left, upper, right, lower).
    left = int(x_bounds.min * width)
    upper = int(y_bounds.min * height)
    right = int(x_bounds.max * width)
    lower = int(y_bounds.max * height)

    region = page_content.crop((left, upper, right, lower))
    return save_image_to_buffer(region, file_format)
73
+
74
+
75
def save_image_to_buffer(image: Image.Image, file_format: str) -> bytes:
    """
    Encode an image in the requested format and return the raw bytes.

    :param image: Pillow wrapper for the image.
    :param file_format: Format to save the image as.
    :return: The encoded image as bytes.
    """
    # The context manager releases the in-memory buffer once the bytes are
    # copied out; getvalue() returns the full contents regardless of the
    # current stream position, so no rewind is needed.
    with io.BytesIO() as buffer:
        image.save(buffer, format=file_format)
        return buffer.getvalue()
87
+
88
+
89
def determine_file_format(input_source: LocalInputSource) -> str:
    """
    Retrieve the file format from an input source.

    PDF sources always yield ``"JPEG"``. For any other source, the format is
    the one Pillow detects when opening the source's file object.

    :param input_source: Local input source to retrieve the format from.
    :return: A Pillow file format name.
    :raises MindeeError: If Pillow opens the file but reports no format.
    """
    if input_source.is_pdf():
        return "JPEG"

    stream = input_source.file_object
    # Image.open reads from the stream to identify the format, which moves
    # its position. Remember where it was so later readers of the same
    # input source are not affected.
    try:
        initial_position = stream.tell()
    except (AttributeError, OSError, ValueError):
        # The position cannot be queried, so there is nothing to restore.
        initial_position = None

    try:
        detected_format = Image.open(stream).format
    finally:
        if initial_position is not None:
            stream.seek(initial_position)

    if detected_format is None:
        raise MindeeError("Image format was not found.")
    return detected_format
102
+
103
+
104
def get_file_extension(file_format: str) -> str:
    """
    Extract the correct file extension for a file format.

    The format name is lower-cased; JPEG, in any letter case, maps to the
    conventional ``jpg`` extension.

    :param file_format: Format of the file.
    :return: A lower-case file extension, without a leading dot.
    """
    extension = file_format.lower()
    # Compare after lower-casing so "jpeg" and "JPEG" give the same result.
    return "jpg" if extension == "jpeg" else extension
112
+
113
+
44
114
def extract_multiple_images_from_source (
45
115
input_source : LocalInputSource , page_id : int , polygons : List [List [Point ]]
46
116
) -> List [ExtractedImage ]:
@@ -56,27 +126,19 @@ def extract_multiple_images_from_source(
56
126
page_content = page .render ().to_pil ()
57
127
width , height = page .get_size ()
58
128
129
+ file_format = determine_file_format (input_source )
130
+ file_extension = get_file_extension (file_format )
131
+
59
132
extracted_elements = []
60
133
for element_id , polygon in enumerate (polygons ):
61
- min_max_x = get_min_max_x (polygon )
62
- min_max_y = get_min_max_y (polygon )
63
-
64
- pillow_page = page_content .crop (
65
- (
66
- int (min_max_x .min * width ),
67
- int (min_max_y .min * height ),
68
- int (min_max_x .max * width ),
69
- int (min_max_y .max * height ),
70
- )
134
+ image_data = extract_image_from_polygon (
135
+ page_content , polygon , width , height , file_format
71
136
)
72
- buffer = io .BytesIO ()
73
- pillow_page .save (buffer , format = "JPEG" )
74
- buffer .seek (0 )
75
137
extracted_elements .append (
76
138
ExtractedImage (
77
139
BytesInput (
78
- buffer . read () ,
79
- f"{ input_source .filename } _p { page_id } _e { element_id } .jpg " ,
140
+ image_data ,
141
+ f"{ input_source .filename } _page { page_id + 1 } - { element_id } .{ file_extension } " ,
80
142
),
81
143
page_id ,
82
144
element_id ,
0 commit comments