ROB: Flate decoding for streams with faulty tail bytes (#3332)

henningkoertelgmg · web-flow · commit ed645ca0a6b3 · 2025-06-27T13:12:25.000+02:00
Some FLATE-encoded streams from early Adobe Distiller / Pitstop versions are written to the PDF with additional CR bytes appended, and these faulty tail bytes are counted in the Length value of the stream dict. Decoding then fails later on. This is solved by removing tail bytes step by step until decoding is successful.
diff --git a/pypdf/filters.py b/pypdf/filters.py
@@ -95,6 +95,23 @@ def decompress(data: bytes) -> bytes:
             # For larger files, use decompression object to enable buffered reading
             return zlib.decompressobj().decompress(data)
         except zlib.error:
+            # First, a quick approach for a known issue with faulty bytes added to the
+            # tail of the encoded stream by early Adobe Distiller or Pitstop versions
+            # with CR char as the default line separator (assumed by reverse engineering)
+            # that breaks the decoding process at the end.
+            #
+            # Try first to cut off some of the tail byte by byte, but limit the number
+            # of iterations so that performance is not killed for large streams, and
+            # then allow the final fallback to run. This intermediate attempt was added
+            # because starting from the head of the stream byte by byte completely kills
+            # the performance for large streams (e.g. 6 MB) with the tail-byte issue
+            # and takes ages. This solution is really fast:
+            max_tail_cut_off_bytes: int = 8
+            for i in range(1, min(max_tail_cut_off_bytes + 1, len(data))):
+                try:
+                    return zlib.decompressobj().decompress(data[:-i])
+                except zlib.error:
+                    pass
             # If still failing, then try with increased window size
             d = zlib.decompressobj(zlib.MAX_WBITS | 32)
             result_str = b""
diff --git a/tests/test_filters.py b/tests/test_filters.py
@@ -6,6 +6,7 @@
 from io import BytesIO
 from itertools import product as cartesian_product
 from pathlib import Path
+from typing import cast
 from unittest import mock
 
 import pytest
@@ -30,6 +31,7 @@
     NameObject,
     NullObject,
     NumberObject,
+    StreamObject,
 )
 
 from . import PILContext, get_data_from_url
@@ -790,3 +792,27 @@ def test_jbig2decode__edge_cases(caplog):
         "jbig2dec FATAL ERROR page has no image, cannot be completed",
         "jbig2dec WARNING unable to complete page"
     ]
+
+
+@pytest.mark.timeout(timeout=30, method="thread")
+@pytest.mark.enable_socket
+def test_flate_decode_stream_with_faulty_tail_bytes():
+    """
+    Test for #3332: FLATE decoding of a stream with faulty tail bytes.
+
+    The test ensures two things:
+        1. the stream can be decoded at all
+        2. decoding does not fall through to the last fallback in the
+           try-except blocks, which is too slow and takes ages for this stream
+    """
+    data = get_data_from_url(
+        url="https://github.com/user-attachments/files/20901522/faulty_stream_tail_example.1.pdf",
+        name="faulty_stream_tail_example.1.pdf"
+    )
+    expected = get_data_from_url(
+        url="https://github.com/user-attachments/files/20941717/decoded.dat.txt",
+        name="faulty_stream_tail_example.1.decoded.dat"
+    )
+    reader = PdfReader(BytesIO(data))
+    obj = reader.get_object(IndirectObject(182, 0, reader))
+    assert cast(StreamObject, obj).get_data() == expected