ROB: ignore faulty trailing newline during RLE decoding (#3355)

henningkoertelgmg · web-flow · commit 442e8d501db4 · 2025-07-04T11:05:48.000+02:00
Found PDFs from Dalim software with multi-encoded streams: the inner stream is RLE-encoded, the outer stream is FLATE-encoded. The inner stream contains a trailing newline character that breaks RLE decoding. It seems that some Dalim version had a systematic error: it took the bytes of the inner stream straight from the raw PDF bytes, including the trailing newline before "endstream".
This change fixes the problem by ignoring a single trailing newline that follows the RLE end-of-data marker and logging a warning instead of raising an exception.
diff --git a/pypdf/filters.py b/pypdf/filters.py
@@ -385,7 +385,16 @@ def decode(
             length = data[index]
             index += 1
             if length == 128:
-                if index < len(data):
+                data_length = len(data)
+                if index < data_length:
+                    # We should first check, if we have an inner stream from a multi-encoded
+                    # stream with a faulty trailing newline that we can decode properly.
+                    # We will just ignore the last byte and raise a warning ...
+                    if (index == data_length - 1) and (data[index : index+1] == b"\n"):
+                        logger_warning(
+                            "Found trailing newline in stream data, check if output is OK", __name__
+                        )
+                        break
                     raise PdfStreamError("Early EOD in RunLengthDecode")
                 break
             if length < 128:
diff --git a/tests/test_filters.py b/tests/test_filters.py
@@ -22,6 +22,7 @@
     CCITTParameters,
     FlateDecode,
     JBIG2Decode,
+    RunLengthDecode,
 )
 from pypdf.generic import (
     ArrayObject,
@@ -816,3 +817,38 @@ def test_flate_decode_stream_with_faulty_tail_bytes():
     reader = PdfReader(BytesIO(data))
     obj = reader.get_object(IndirectObject(182, 0, reader))
     assert cast(StreamObject, obj).get_data() == expected
+
+
+@pytest.mark.enable_socket
+def test_rle_decode_with_faulty_tail_byte_in_multi_encoded_stream(caplog):
+    """
+    Test for #3355
+
+    The test ensures that the inner RLE encoded stream can be decoded,
+    because this stream contains an extra faulty newline byte in the
+    end that can be ignored during decoding.
+    """
+    data = get_data_from_url(
+        url="https://github.com/user-attachments/files/21038398/test_data_rle.txt",
+        name="multi_decoding_example_with_faulty_tail_byte.pdf"
+    )
+    reader = PdfReader(BytesIO(data))
+    obj = reader.get_object(IndirectObject(60, 0, reader))
+    cast(StreamObject, obj).get_data()
+    assert "Found trailing newline in stream data, check if output is OK" in caplog.messages
+
+
+@pytest.mark.enable_socket
+def test_rle_decode_exception_with_corrupted_stream():
+    """
+    Additional Test to #3355
+
+    This test must raise the EOD exception during RLE decoding and ensures
+    that we do not fail during code coverage analyses in the git PR pipeline.
+    """
+    data = get_data_from_url(
+        url="https://github.com/user-attachments/files/21052626/rle_stream_with_error.txt",
+        name="rle_stream_with_error.txt"
+    )
+    with pytest.raises(PdfStreamError, match="Early EOD in RunLengthDecode"):
+        RunLengthDecode.decode(data)