Skip to content

Commit 442e8d5

Browse files
ROB: ignore faulty trailing newline during RLE decoding (#3355)
Found PDFs from Dalim software with multi-encoded streams: the inner stream is RLE-encoded and the outer stream is FLATE-encoded. The inner stream contains a trailing newline character that breaks RLE decoding. It seems that some Dalim versions had a systematic error: the bytes of the inner stream were copied straight from the raw PDF bytes, including the trailing newline before "endstream". This change fixes the problem by ignoring the trailing newline and logging a warning instead of raising an exception.
1 parent da7f0af commit 442e8d5

File tree

2 files changed

+46
-1
lines changed

2 files changed

+46
-1
lines changed

pypdf/filters.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -385,7 +385,16 @@ def decode(
385385
length = data[index]
386386
index += 1
387387
if length == 128:
388-
if index < len(data):
388+
data_length = len(data)
389+
if index < data_length:
390+
# We should first check, if we have an inner stream from a multi-encoded
391+
# stream with a faulty trailing newline that we can decode properly.
392+
# We will just ignore the last byte and raise a warning ...
393+
if (index == data_length - 1) and (data[index : index+1] == b"\n"):
394+
logger_warning(
395+
"Found trailing newline in stream data, check if output is OK", __name__
396+
)
397+
break
389398
raise PdfStreamError("Early EOD in RunLengthDecode")
390399
break
391400
if length < 128:

tests/test_filters.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
CCITTParameters,
2323
FlateDecode,
2424
JBIG2Decode,
25+
RunLengthDecode,
2526
)
2627
from pypdf.generic import (
2728
ArrayObject,
@@ -816,3 +817,38 @@ def test_flate_decode_stream_with_faulty_tail_bytes():
816817
reader = PdfReader(BytesIO(data))
817818
obj = reader.get_object(IndirectObject(182, 0, reader))
818819
assert cast(StreamObject, obj).get_data() == expected
820+
821+
822+
@pytest.mark.enable_socket
823+
def test_rle_decode_with_faulty_tail_byte_in_multi_encoded_stream(caplog):
824+
"""
825+
Test for #3355
826+
827+
The test ensures that the inner RLE encoded stream can be decoded,
828+
because this stream contains an extra faulty newline byte in the
829+
end that can be ignored during decoding.
830+
"""
831+
data = get_data_from_url(
832+
url="https://github.com/user-attachments/files/21038398/test_data_rle.txt",
833+
name="multi_decoding_example_with_faulty_tail_byte.pdf"
834+
)
835+
reader = PdfReader(BytesIO(data))
836+
obj = reader.get_object(IndirectObject(60, 0, reader))
837+
cast(StreamObject, obj).get_data()
838+
assert "Found trailing newline in stream data, check if output is OK" in caplog.messages
839+
840+
841+
@pytest.mark.enable_socket
842+
def test_rle_decode_exception_with_corrupted_stream():
843+
"""
844+
Additional Test to #3355
845+
846+
This test must raise the EOD exception during RLE decoding and ensures
847+
that we do not fail during code coverage analyses in the git PR pipeline.
848+
"""
849+
data = get_data_from_url(
850+
url="https://github.com/user-attachments/files/21052626/rle_stream_with_error.txt",
851+
name="rle_stream_with_error.txt"
852+
)
853+
with pytest.raises(PdfStreamError, match="Early EOD in RunLengthDecode"):
854+
RunLengthDecode.decode(data)

0 commit comments

Comments
 (0)