Skip to content

Commit ed645ca

Browse files
ROB: Flate decoding for streams with faulty tail bytes (#3332)
Some FLATE-encoded streams written by early Adobe Distiller / Pitstop versions have additional CR bytes appended in the PDF, and these faulty tail bytes are counted in the Length value of the stream dictionary. As a result, decoding fails later on. This is solved by removing tail bytes one at a time until decoding succeeds.
1 parent 7c3db03 commit ed645ca

File tree

2 files changed

+43
-0
lines changed

2 files changed

+43
-0
lines changed

pypdf/filters.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,23 @@ def decompress(data: bytes) -> bytes:
9595
# For larger files, use decompression object to enable buffered reading
9696
return zlib.decompressobj().decompress(data)
9797
except zlib.error:
98+
# First quick approach for known issue with faulty added bytes to the
99+
# tail of the encoded stream from early Adobe Distiller or Pitstop versions
100+
# with CR char as the default line separator (assumed by reverse engineering)
101+
# that breaks the decoding process in the end.
102+
#
103+
# Try first to cut off some of the tail byte by byte, however limited to not
104+
# iterate through too many loops and kill the performance for large streams,
105+
# to then allow the final fallback to run. Added this intermediate attempt,
106+
# because starting from the head of the stream byte by byte kills completely
107+
# the performance for large streams (e.g. 6 MB) with the tail-byte-issue
108+
# and takes ages. This solution is really fast:
109+
max_tail_cut_off_bytes: int = 8
110+
for i in range(1, min(max_tail_cut_off_bytes + 1, len(data))):
111+
try:
112+
return zlib.decompressobj().decompress(data[:-i])
113+
except zlib.error:
114+
pass
98115
# If still failing, then try with increased window size
99116
d = zlib.decompressobj(zlib.MAX_WBITS | 32)
100117
result_str = b""

tests/test_filters.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from io import BytesIO
77
from itertools import product as cartesian_product
88
from pathlib import Path
9+
from typing import cast
910
from unittest import mock
1011

1112
import pytest
@@ -30,6 +31,7 @@
3031
NameObject,
3132
NullObject,
3233
NumberObject,
34+
StreamObject,
3335
)
3436

3537
from . import PILContext, get_data_from_url
@@ -790,3 +792,27 @@ def test_jbig2decode__edge_cases(caplog):
790792
"jbig2dec FATAL ERROR page has no image, cannot be completed",
791793
"jbig2dec WARNING unable to complete page"
792794
]
795+
796+
797+
@pytest.mark.timeout(timeout=30, method="thread")
798+
@pytest.mark.enable_socket
799+
def test_flate_decode_stream_with_faulty_tail_bytes():
800+
"""
801+
Test for #3332
802+
803+
The test ensures two things:
804+
1. stream can be decoded at all
805+
2. decoding doesn't fall through to the last fallback in the try-except blocks
806+
that is too slow and takes ages for this stream
807+
"""
808+
data = get_data_from_url(
809+
url="https://github.com/user-attachments/files/20901522/faulty_stream_tail_example.1.pdf",
810+
name="faulty_stream_tail_example.1.pdf"
811+
)
812+
expected = get_data_from_url(
813+
url="https://github.com/user-attachments/files/20941717/decoded.dat.txt",
814+
name="faulty_stream_tail_example.1.decoded.dat"
815+
)
816+
reader = PdfReader(BytesIO(data))
817+
obj = reader.get_object(IndirectObject(182, 0, reader))
818+
assert cast(StreamObject, obj).get_data() == expected

0 commit comments

Comments
 (0)