Skip to content

Commit 071b95f

Browse files
temporary (not working) version
1 parent ff2f62c commit 071b95f

32 files changed

+385
-97
lines changed

examples/auto_invoice_splitter_extraction_example.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
from mindee import Client
22
from mindee.extraction.pdf_extractor import PdfExtractor
3-
from mindee.input.sources import PathInput
3+
from mindee.input.sources.path_input import PathInput
44
from mindee.product.invoice.invoice_v4 import InvoiceV4
55
from mindee.product.invoice_splitter.invoice_splitter_v1 import InvoiceSplitterV1
66

mindee/client.py

Lines changed: 6 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -7,14 +7,12 @@
77
from mindee.input import WorkflowOptions
88
from mindee.input.local_response import LocalResponse
99
from mindee.input.page_options import PageOptions
10-
from mindee.input.sources import (
11-
Base64Input,
12-
BytesInput,
13-
FileInput,
14-
LocalInputSource,
15-
PathInput,
16-
UrlInputSource,
17-
)
10+
from mindee.input.sources.base_64_input import Base64Input
11+
from mindee.input.sources.bytes_input import BytesInput
12+
from mindee.input.sources.file_input import FileInput
13+
from mindee.input.sources.local_input_source import LocalInputSource
14+
from mindee.input.sources.path_input import PathInput
15+
from mindee.input.sources.url_input_source import UrlInputSource
1816
from mindee.logger import logger
1917
from mindee.mindee_http.endpoint import CustomEndpoint, Endpoint
2018
from mindee.mindee_http.mindee_api import MindeeApi

mindee/extraction/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
from mindee.extraction.common.extracted_image import ExtractedImage
22
from mindee.extraction.common.image_extractor import (
3-
attach_images_as_new_file,
43
extract_multiple_images_from_source,
54
)
65
from mindee.extraction.multi_receipts_extractor import multi_receipts_extractor

mindee/extraction/common/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,4 @@
11
from mindee.extraction.common.extracted_image import ExtractedImage
22
from mindee.extraction.common.image_extractor import (
3-
attach_images_as_new_file,
43
extract_multiple_images_from_source,
54
)

mindee/extraction/common/extracted_image.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,8 @@
55
from PIL import Image
66

77
from mindee.error.mindee_error import MindeeError
8-
from mindee.input.sources import FileInput, LocalInputSource
8+
from mindee.input.sources.file_input import FileInput
9+
from mindee.input.sources.local_input_source import LocalInputSource
910
from mindee.logger import logger
1011

1112

mindee/extraction/common/image_extractor.py

Lines changed: 5 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
import io
2-
from typing import BinaryIO, List
2+
from typing import List
33

44
import pypdfium2 as pdfium
55
from PIL import Image
@@ -8,38 +8,9 @@
88
from mindee.extraction.common.extracted_image import ExtractedImage
99
from mindee.geometry.point import Point
1010
from mindee.geometry.polygon import get_min_max_x, get_min_max_y
11-
from mindee.input.sources import BytesInput, LocalInputSource
12-
13-
14-
def attach_images_as_new_file( # type: ignore
15-
input_buffer_list: List[BinaryIO],
16-
) -> pdfium.PdfDocument:
17-
"""
18-
Attaches a list of images as new pages in a PdfDocument object.
19-
20-
:param input_buffer_list: List of images, represented as buffers.
21-
:return: A PdfDocument handle.
22-
"""
23-
pdf = pdfium.PdfDocument.new()
24-
for input_buffer in input_buffer_list:
25-
input_buffer.seek(0)
26-
image = Image.open(input_buffer)
27-
image.convert("RGB")
28-
image_buffer = io.BytesIO()
29-
image.save(image_buffer, format="JPEG")
30-
31-
image_pdf = pdfium.PdfImage.new(pdf)
32-
image_pdf.load_jpeg(image_buffer)
33-
width, height = image_pdf.get_size()
34-
35-
matrix = pdfium.PdfMatrix().scale(width, height)
36-
image_pdf.set_matrix(matrix)
37-
38-
page = pdf.new_page(width, height)
39-
page.insert_obj(image_pdf)
40-
page.gen_content()
41-
image.close()
42-
return pdf
11+
from mindee.input.sources.bytes_input import BytesInput
12+
from mindee.input.sources.local_input_source import LocalInputSource
13+
from mindee.pdf.pdf_utils import attach_images_as_new_file
4314

4415

4516
def extract_image_from_polygon(
@@ -157,6 +128,6 @@ def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: i
157128
"""
158129
if input_file.is_pdf():
159130
input_file.file_object.seek(0)
160-
return pdfium.PdfDocument(input_file.file_object)
131+
return pdfium.PdfDocument(input_file.file_object.read())
161132

162133
return attach_images_as_new_file([input_file.file_object])

mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
from mindee.extraction.common.image_extractor import (
66
extract_multiple_images_from_source,
77
)
8-
from mindee.input.sources import LocalInputSource
8+
from mindee.input.sources.local_input_source import LocalInputSource
99
from mindee.parsing.common.inference import Inference
1010

1111

mindee/extraction/pdf_extractor/extracted_pdf.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
import pypdfium2 as pdfium
55

66
from mindee.error.mindee_error import MindeeError
7-
from mindee.input.sources import BytesInput
7+
from mindee.input.sources.bytes_input import BytesInput
88

99

1010
class ExtractedPdf:

mindee/extraction/pdf_extractor/pdf_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77

88
from mindee.error.mindee_error import MindeeError
99
from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf
10-
from mindee.input.sources import LocalInputSource
10+
from mindee.input.sources.local_input_source import LocalInputSource
1111
from mindee.product.invoice_splitter.invoice_splitter_v1_page_group import (
1212
InvoiceSplitterV1PageGroup,
1313
)

mindee/image_operations/image_compressor.py

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,25 +1,27 @@
11
import io
2-
from typing import Union
2+
from typing import BinaryIO, Union
33

44
from PIL import Image
55

66

77
def compress_image(
8-
image_buffer: bytes,
8+
image_buffer: Union[BinaryIO, bytes],
99
quality: int = 85,
1010
max_width: Union[int, float, None] = None,
1111
max_height: Union[int, float, None] = None,
1212
) -> bytes:
1313
"""
1414
Compresses an image with the given parameters.
1515
16-
:param image_buffer: Buffer representation of an image.
16+
:param image_buffer: Buffer representation of an image, also accepts BinaryIO.
1717
:param quality: Quality to apply to the image (JPEG compression).
1818
:param max_width: Maximum bound for the width.
1919
:param max_height: Maximum bound for the height.
2020
:return:
2121
"""
22-
with Image.open(io.BytesIO(image_buffer)) as img:
22+
if isinstance(image_buffer, bytes):
23+
image_buffer = io.BytesIO(image_buffer)
24+
with Image.open(image_buffer) as img:
2325
original_width, original_height = img.size
2426
max_width = max_width or original_width
2527
max_height = max_height or original_height

mindee/input/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,8 @@
33
from mindee.input.sources.base_64_input import Base64Input
44
from mindee.input.sources.bytes_input import BytesInput
55
from mindee.input.sources.file_input import FileInput
6-
from mindee.input.sources.local_input_source import InputType, LocalInputSource
6+
from mindee.input.sources.input_type import InputType
7+
from mindee.input.sources.local_input_source import LocalInputSource
78
from mindee.input.sources.path_input import PathInput
89
from mindee.input.sources.url_input_source import UrlInputSource
910
from mindee.input.workflow_options import WorkflowOptions

mindee/input/sources/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
from mindee.input.sources.base_64_input import Base64Input
22
from mindee.input.sources.bytes_input import BytesInput
33
from mindee.input.sources.file_input import FileInput
4-
from mindee.input.sources.local_input_source import InputType, LocalInputSource
4+
from mindee.input.sources.input_type import InputType
5+
from mindee.input.sources.local_input_source import LocalInputSource
56
from mindee.input.sources.path_input import PathInput
67
from mindee.input.sources.url_input_source import UrlInputSource

mindee/input/sources/base_64_input.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,8 @@
11
import base64
22
import io
33

4-
from mindee.input.sources.local_input_source import InputType, LocalInputSource
4+
from mindee.input.sources.input_type import InputType
5+
from mindee.input.sources.local_input_source import LocalInputSource
56

67

78
class Base64Input(LocalInputSource):

mindee/input/sources/bytes_input.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
import io
22

3-
from mindee.input.sources.local_input_source import InputType, LocalInputSource
3+
from mindee.input.sources.input_type import InputType
4+
from mindee.input.sources.local_input_source import LocalInputSource
45

56

67
class BytesInput(LocalInputSource):

mindee/input/sources/file_input.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,8 @@
11
import os
22
from typing import BinaryIO
33

4-
from mindee.input.sources.local_input_source import InputType, LocalInputSource
4+
from mindee.input.sources.input_type import InputType
5+
from mindee.input.sources.local_input_source import LocalInputSource
56

67

78
class FileInput(LocalInputSource):

mindee/input/sources/input_type.py

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,11 @@
1+
from enum import Enum
2+
3+
4+
class InputType(Enum):
5+
"""The input type, for internal use."""
6+
7+
FILE = "file"
8+
BASE64 = "base64"
9+
BYTES = "bytes"
10+
PATH = "path"
11+
URL = "url"

mindee/input/sources/local_input_source.py

Lines changed: 44 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,18 @@
11
import io
22
import mimetypes
33
import tempfile
4-
from enum import Enum
54
from typing import BinaryIO, Optional, Sequence, Tuple
65

76
import pypdfium2 as pdfium
87

98
from mindee.error.mimetype_error import MimeTypeError
109
from mindee.error.mindee_error import MindeeError, MindeeSourceError
10+
from mindee.image_operations.image_compressor import compress_image
1111
from mindee.input.page_options import KEEP_ONLY, REMOVE
12+
from mindee.input.sources.input_type import InputType
1213
from mindee.logger import logger
14+
from mindee.pdf.pdf_compressor import compress_pdf
15+
from mindee.pdf.pdf_utils import has_source_text
1316

1417
mimetypes.add_type("image/heic", ".heic")
1518
mimetypes.add_type("image/heic", ".heif")
@@ -25,16 +28,6 @@
2528
]
2629

2730

28-
class InputType(Enum):
29-
"""The input type, for internal use."""
30-
31-
FILE = "file"
32-
BASE64 = "base64"
33-
BYTES = "bytes"
34-
PATH = "path"
35-
URL = "url"
36-
37-
3831
class LocalInputSource:
3932
"""Base class for all input sources coming from the local machine."""
4033

@@ -202,3 +195,43 @@ def read_contents(self, close_file: bool) -> Tuple[str, bytes]:
202195
def close(self) -> None:
203196
"""Close the file object."""
204197
self.file_object.close()
198+
199+
def has_source_text(self) -> bool:
200+
"""
201+
If the file is a PDF, checks if it has source text.
202+
203+
:return: True if the file is a PDF and has source text. False otherwise.
204+
"""
205+
if not self.is_pdf():
206+
return False
207+
return has_source_text(self.file_object.read())
208+
209+
def compress(
210+
self,
211+
quality: int = 85,
212+
max_width: Optional[int] = None,
213+
max_height: Optional[int] = None,
214+
force_source_text: bool = False,
215+
disable_source_text: bool = True,
216+
) -> None:
217+
"""
218+
Compresses the file object, either as a PDF or an image.
219+
220+
:param quality: Quality of the compression. For images, this is the JPEG quality.
221+
For PDFs, this affects image quality within the PDF.
222+
:param max_width: Maximum width for image resizing. Ignored for PDFs.
223+
:param max_height: Maximum height for image resizing. Ignored for PDFs.
224+
:param force_source_text: For PDFs, whether to force compression even if source text is present.
225+
:param disable_source_text: For PDFs, whether to disable source text during compression.
226+
"""
227+
new_file_bytes: bytes
228+
if self.is_pdf():
229+
new_file_bytes = compress_pdf(
230+
self.file_object, quality, force_source_text, disable_source_text
231+
)
232+
else:
233+
new_file_bytes = compress_image(
234+
self.file_object, quality, max_width, max_height
235+
)
236+
237+
self.file_object = io.BytesIO(new_file_bytes)

mindee/input/sources/path_input.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,8 @@
22
from pathlib import Path
33
from typing import Union
44

5-
from mindee.input.sources.local_input_source import InputType, LocalInputSource
5+
from mindee.input.sources.input_type import InputType
6+
from mindee.input.sources.local_input_source import LocalInputSource
67

78

89
class PathInput(LocalInputSource):

mindee/input/sources/url_input_source.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010

1111
from mindee.error.mindee_error import MindeeSourceError
1212
from mindee.input.sources.bytes_input import BytesInput
13-
from mindee.input.sources.local_input_source import InputType
13+
from mindee.input.sources.input_type import InputType
1414
from mindee.logger import logger
1515

1616

mindee/mindee_http/endpoint.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,8 @@
44
import requests
55
from requests import Response
66

7-
from mindee.input.sources import LocalInputSource, UrlInputSource
7+
from mindee.input.sources.local_input_source import LocalInputSource
8+
from mindee.input.sources.url_input_source import UrlInputSource
89
from mindee.mindee_http.base_endpoint import BaseEndpoint
910
from mindee.mindee_http.mindee_api import MindeeApi
1011
from mindee.parsing.common.string_dict import StringDict

mindee/pdf/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,7 @@
1+
from mindee.pdf.pdf_char_data import PDFCharData
2+
from mindee.pdf.pdf_compressor import compress_pdf
3+
from mindee.pdf.pdf_utils import (
4+
attach_images_as_new_file,
5+
extract_text_from_pdf,
6+
has_source_text,
7+
)

0 commit comments

Comments
 (0)