BUG: Reading large compressed images takes a very long time to process (#2644)

snanda85 · web-flow · commit c227b0c725af · 2024-05-14T12:46:11.000+02:00
Added buffered reading for zlib decompression
diff --git a/pypdf/filters.py b/pypdf/filters.py
@@ -80,14 +80,19 @@ def decompress(data: bytes) -> bytes:
     try:
         return zlib.decompress(data)
     except zlib.error:
-        d = zlib.decompressobj(zlib.MAX_WBITS | 32)
-        result_str = b""
-        for b in [data[i : i + 1] for i in range(len(data))]:
-            try:
-                result_str += d.decompress(b)
-            except zlib.error:
-                pass
-        return result_str
+        try:
+            # For larger files, use Decompress object to enable buffered reading
+            return zlib.decompressobj().decompress(data)
+        except zlib.error:
+            # If that still fails, feed the data byte by byte with automatic zlib/gzip header detection, ignoring per-byte errors
+            d = zlib.decompressobj(zlib.MAX_WBITS | 32)
+            result_str = b""
+            for b in [data[i : i + 1] for i in range(len(data))]:
+                try:
+                    result_str += d.decompress(b)
+                except zlib.error:
+                    pass
+            return result_str
 
 
 class FlateDecode:
diff --git a/tests/bench.py b/tests/bench.py
@@ -227,3 +227,15 @@ def test_image_new_property_performance(benchmark):
     data = BytesIO(get_data_from_url(url, name=name))
 
     benchmark(image_new_property, data)
+
+
+def image_extraction(data):
+    reader = PdfReader(data)
+    list(reader.pages[0].images)
+
+
+@pytest.mark.enable_socket()
+def test_large_compressed_image_performance(benchmark):
+    url = "https://github.com/py-pdf/pypdf/files/15306199/file_with_large_compressed_image.pdf"
+    data = BytesIO(get_data_from_url(url, name="file_with_large_compressed_image.pdf"))
+    benchmark(image_extraction, data)
diff --git a/tests/test_images.py b/tests/test_images.py
@@ -346,3 +346,11 @@ def test_corrupted_jpeg_iss2266(pdf, pdf_name, images, images_name, filtr):
             print(fn)  # noqa: T201
             img = Image.open(BytesIO(zf.read(fn)))
             assert image_similarity(reader.pages[p].images[i].image, img) >= 0.99
+
+
+@pytest.mark.enable_socket()
+@pytest.mark.timeout(30)
+def test_large_compressed_image():
+    url = "https://github.com/py-pdf/pypdf/files/15306199/file_with_large_compressed_image.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name="file_with_large_compressed_image.pdf")))
+    list(reader.pages[0].images)