Skip to content

Commit bcc2189

Browse files
🧪 ✨ add **experimental** pdf-fixing utility (#190)
1 parent 9f04f00 commit bcc2189

File tree

3 files changed

+86
-29
lines changed

3 files changed

+86
-29
lines changed

mindee/client.py

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -447,60 +447,58 @@ def create_endpoint(
447447
return self._build_endpoint(endpoint_name, account_name, version)
448448

449449
def source_from_path(
    self, input_path: Union[Path, str], fix_pdf: bool = False
) -> PathInput:
    """
    Build a local input source from a file located on disk.

    :param input_path: Path of file to open
    :param fix_pdf: When true, run the experimental PDF repair on the loaded source
    :return: The loaded source, repaired beforehand if ``fix_pdf`` was requested
    """
    source = PathInput(input_path)
    if not fix_pdf:
        return source
    source.fix_pdf()
    return source
459461

460462
def source_from_file(
    self, input_file: BinaryIO, fix_pdf: bool = False
) -> FileInput:
    """
    Build a local input source from an already opened binary file handle.

    :param input_file: Input file handle
    :param fix_pdf: When true, run the experimental PDF repair on the loaded source
    :return: The loaded source, repaired beforehand if ``fix_pdf`` was requested
    """
    source = FileInput(input_file)
    if not fix_pdf:
        return source
    source.fix_pdf()
    return source
472474

473475
def source_from_b64string(
    self, input_string: str, filename: str, fix_pdf: bool = False
) -> Base64Input:
    """
    Build a local input source from a base64 encoded string.

    :param input_string: Input to parse as base64 string
    :param filename: The name of the file (without the path)
    :param fix_pdf: When true, run the experimental PDF repair on the loaded source
    :return: The loaded source, repaired beforehand if ``fix_pdf`` was requested
    """
    source = Base64Input(input_string, filename)
    if not fix_pdf:
        return source
    source.fix_pdf()
    return source
488488

489489
def source_from_bytes(
    self, input_bytes: bytes, filename: str, fix_pdf: bool = False
) -> BytesInput:
    """
    Build a local input source from raw bytes held in memory.

    :param input_bytes: Raw byte input
    :param filename: The name of the file (without the path)
    :param fix_pdf: When true, run the experimental PDF repair on the loaded source
    :return: The loaded source, repaired beforehand if ``fix_pdf`` was requested
    """
    source = BytesInput(input_bytes, filename)
    if not fix_pdf:
        return source
    source.fix_pdf()
    return source
504502

505503
def source_from_url(
506504
self,

mindee/input/sources.py

Lines changed: 45 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,15 @@
22
import io
33
import mimetypes
44
import os
5+
import tempfile
56
from enum import Enum
67
from pathlib import Path
78
from typing import BinaryIO, Optional, Sequence, Tuple, Union
89

910
import pikepdf
1011

1112
from mindee.error.mimetype_error import MimeTypeError
12-
from mindee.error.mindee_error import MindeeSourceError
13+
from mindee.error.mindee_error import MindeeError, MindeeSourceError
1314
from mindee.input.page_options import KEEP_ONLY, REMOVE
1415
from mindee.logger import logger
1516

@@ -46,10 +47,7 @@ class LocalInputSource:
4647
input_type: InputType
4748
filepath: Optional[str] = None
4849

49-
def __init__(self, input_type: InputType):
    """
    Store the kind of input, then run the MIME type check.

    :param input_type: Kind of input this source was created from
    """
    self.input_type = input_type
    self._check_mimetype()
5553

@@ -60,11 +58,51 @@ def _check_mimetype(self) -> None:
6058
if file_mimetype:
6159
self.file_mimetype = file_mimetype
6260
else:
63-
raise MimeTypeError(f"Could not determine MIME type of '{self.filename}'")
61+
raise MimeTypeError(f"Could not determine MIME type of '{self.filename}'.")
6462

6563
if self.file_mimetype not in ALLOWED_MIME_TYPES:
6664
file_types = ", ".join(ALLOWED_MIME_TYPES)
67-
raise MimeTypeError(f"File type not allowed, must be one of {file_types}")
65+
raise MimeTypeError(f"File type not allowed, must be one of {file_types}.")
66+
67+
def fix_pdf(self, maximum_offset: int = 500) -> None:
    """
    Fix a potentially broken PDF file.

    WARNING: this feature alters the data of the enqueued file by removing unnecessary headers.

    Searches the content of the file for the first ``%PDF-`` tag. If the tag is found at or before
    ``maximum_offset``, every byte preceding it is dropped, ``file_object`` is replaced by an in-memory
    stream starting at the tag, and ``file_mimetype`` is set to ``application/pdf``.

    :param maximum_offset: Highest byte offset (inclusive) at which the PDF tag may be located for the
        leading bytes to be removed. Cannot be less than 0.
    :raises MindeeError: if ``maximum_offset`` is negative.
    :raises MimeTypeError: if no PDF tag is found, or if it is located past ``maximum_offset``.
    """
    if maximum_offset < 0:
        raise MindeeError("Can't set maximum offset for pdf-fixing to less than 0.")

    try:
        # Always scan from the very beginning, whatever the current stream position is.
        self.file_object.seek(0)
        buf = self.file_object.read()
        # Leave the stream rewound so it is still usable if the fix is rejected below.
        self.file_object.seek(0)
    except Exception as exc:
        logger.error("Attempt to fix pdf raised exception %s.", exc)
        raise

    pos: int = buf.find(b"%PDF-")
    if pos < 0:
        raise MimeTypeError("Provided stream isn't a valid PDF-like object.")
    # Inclusive bound: with ``maximum_offset=0`` an already valid PDF (tag at 0) must be accepted.
    if pos > maximum_offset:
        raise MimeTypeError(
            f"PDF couldn't be fixed. PDF tag was found at position {pos}."
        )

    # The content is already in memory: slice it instead of going through a temporary file.
    self.file_object = io.BytesIO(buf[pos:])
    self.file_mimetype = "application/pdf"
68106

69107
def is_pdf(self) -> bool:
70108
""":return: True if the file is a PDF."""

tests/test_inputs.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,27 @@ def test_pdf_blank_check():
170170
assert input_not_blank.count_doc_pages() == 1
171171

172172

173+
#
174+
# Fixing broken PDFs
175+
#
176+
177+
178+
def test_broken_unfixable_pdf():
    """fix_pdf() must reject the unfixable fixture with a MimeTypeError."""
    # Build the source outside the ``raises`` block so that only fix_pdf() can satisfy the expectation;
    # a MimeTypeError raised by the constructor would otherwise make the test pass for the wrong reason.
    input_doc = PathInput(FILE_TYPES_DIR / "pdf" / "broken_unfixable.pdf")
    with pytest.raises(MimeTypeError):
        input_doc.fix_pdf()
182+
183+
184+
def test_broken_fixable_pdf():
    """Fixing the repairable fixture leaves a stream that starts at the ``%PDF-`` tag."""
    input_doc = PathInput(FILE_TYPES_DIR / "pdf" / "broken_fixable.pdf")
    input_doc.fix_pdf()
    assert input_doc.file_mimetype == "application/pdf"
    assert input_doc.file_object.read(5) == b"%PDF-"
    # Rewind so the peek above leaves no trace on the source.
    input_doc.file_object.seek(0)
187+
188+
189+
def test_broken_fixable_invoice_pdf():
    """Fixing the broken invoice fixture leaves a stream that starts at the ``%PDF-`` tag."""
    input_doc = PathInput(FILE_TYPES_DIR / "pdf" / "broken_invoice.pdf")
    input_doc.fix_pdf()
    assert input_doc.file_mimetype == "application/pdf"
    assert input_doc.file_object.read(5) == b"%PDF-"
    # Rewind so the peek above leaves no trace on the source.
    input_doc.file_object.seek(0)
192+
193+
173194
#
174195
# Images
175196
#

0 commit comments

Comments
 (0)