Skip to content

Commit c227b0c

Browse files
authored
BUG: Reading large compressed images takes huge time to process (#2644)
Added buffered reading for zlib decompression
1 parent 6226d66 commit c227b0c

File tree

3 files changed

+33
-8
lines changed

3 files changed

+33
-8
lines changed

pypdf/filters.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -80,14 +80,19 @@ def decompress(data: bytes) -> bytes:
8080
try:
8181
return zlib.decompress(data)
8282
except zlib.error:
83-
d = zlib.decompressobj(zlib.MAX_WBITS | 32)
84-
result_str = b""
85-
for b in [data[i : i + 1] for i in range(len(data))]:
86-
try:
87-
result_str += d.decompress(b)
88-
except zlib.error:
89-
pass
90-
return result_str
83+
try:
84+
# For larger files, use Decompress object to enable buffered reading
85+
return zlib.decompressobj().decompress(data)
86+
except zlib.error:
87+
# If that still fails, retry byte by byte with automatic zlib/gzip header detection (MAX_WBITS | 32), ignoring bytes that raise errors
88+
d = zlib.decompressobj(zlib.MAX_WBITS | 32)
89+
result_str = b""
90+
for b in [data[i : i + 1] for i in range(len(data))]:
91+
try:
92+
result_str += d.decompress(b)
93+
except zlib.error:
94+
pass
95+
return result_str
9196

9297

9398
class FlateDecode:

tests/bench.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,3 +227,15 @@ def test_image_new_property_performance(benchmark):
227227
data = BytesIO(get_data_from_url(url, name=name))
228228

229229
benchmark(image_new_property, data)
230+
231+
232+
def image_extraction(data):
233+
reader = PdfReader(data)
234+
list(reader.pages[0].images)
235+
236+
237+
@pytest.mark.enable_socket()
238+
def test_large_compressed_image_performance(benchmark):
239+
url = "https://github.com/py-pdf/pypdf/files/15306199/file_with_large_compressed_image.pdf"
240+
data = BytesIO(get_data_from_url(url, name="file_with_large_compressed_image.pdf"))
241+
benchmark(image_extraction, data)

tests/test_images.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,3 +346,11 @@ def test_corrupted_jpeg_iss2266(pdf, pdf_name, images, images_name, filtr):
346346
print(fn) # noqa: T201
347347
img = Image.open(BytesIO(zf.read(fn)))
348348
assert image_similarity(reader.pages[p].images[i].image, img) >= 0.99
349+
350+
351+
@pytest.mark.enable_socket()
352+
@pytest.mark.timeout(30)
353+
def test_large_compressed_image():
354+
url = "https://github.com/py-pdf/pypdf/files/15306199/file_with_large_compressed_image.pdf"
355+
reader = PdfReader(BytesIO(get_data_from_url(url, name="file_with_large_compressed_image.pdf")))
356+
list(reader.pages[0].images)

0 commit comments

Comments
 (0)