py-pdf · ash01ish · Apr 21, 2025 · Apr 15, 2025 · Apr 15, 2025 · Apr 17, 2025
diff --git a/pypdf/constants.py b/pypdf/constants.py
@@ -245,6 +245,7 @@ class FilterTypes(StrEnum):
     CCITT_FAX_DECODE = "/CCITTFaxDecode"  # abbreviation: CCF
     DCT_DECODE = "/DCTDecode"  # abbreviation: DCT
     JPX_DECODE = "/JPXDecode"
+    BROTLI_DECODE = "/BrotliDecode"   # abbreviation: Br, PDF 2.0
     JBIG2_DECODE = "/JBIG2Decode"
 
 
@@ -258,6 +259,7 @@ class FilterTypeAbbreviations:
     RL = "/RL"
     CCF = "/CCF"
     DCT = "/DCT"
+    BR = "/Br"
 
 
 class LzwFilterParameters:

diff --git a/pypdf/filters.py b/pypdf/filters.py
@@ -72,6 +72,11 @@
     is_null_or_none,
 )
 
+try:
+    import brotli
+except ImportError:
+    brotli = None
+
 
 def decompress(data: bytes) -> bytes:
     """
@@ -513,6 +518,68 @@
         return data
 
 
+class BrotliDecode:
+    """
+    Codec for PDF streams that use the ``/BrotliDecode`` filter.
+
+    Brotli is a general-purpose lossless compression algorithm that combines
+    LZ77 and Huffman coding. It typically achieves better compression ratios
+    than Flate encoding, though with slightly slower compression speeds.
+
+    Both methods are static and delegate to the optional third-party
+    ``brotli`` package; they raise ``ImportError`` when it is not installed.
+
+    See ISO 32000-2:2020, Section 7.4.11.
+    NOTE(review): this section reference could not be confirmed from the
+    code here; verify it against the specification before relying on it.
+    """
+
+    @staticmethod
+    def decode(
+        data: bytes,
+        decode_parms: Optional[DictionaryObject] = None,
+        **kwargs: Any,
+    ) -> bytes:
+        """
+        Decode Brotli-compressed data.
+
+        Args:
+            data: Brotli-compressed data.
+            decode_parms: A dictionary of parameter values (unused).
+            **kwargs: Additional keyword arguments (unused).
+
+        Returns:
+            The decompressed data.
+
+        Raises:
+            ImportError: If the 'brotli' library is not installed.
+        """
+        # ``brotli`` is None when the optional import at module level failed.
+        if brotli is None:
+            raise ImportError("Brotli library not installed. Required for BrotliDecode filter.")
+        return brotli.decompress(data)
+
+    @staticmethod
+    def encode(data: bytes, **kwargs: Any) -> bytes:
+        """
+        Encode data using Brotli compression with the library's defaults.
+
+        Args:
+            data: The data to be compressed.
+            **kwargs: Additional keyword arguments (unused).
+
+        Returns:
+            The compressed data.
+
+        Raises:
+            ImportError: If the 'brotli' library is not installed.
+        """
+        # ``brotli`` is None when the optional import at module level failed.
+        if brotli is None:
+            raise ImportError("Brotli library not installed. Required for BrotliDecode filter.")
+        return brotli.compress(data)
+
+
 @dataclass
 class CCITTParameters:
     """§7.4.6, optional parameters for the CCITTFaxDecode filter."""
@@ -759,6 +826,8 @@
             data = DCTDecode.decode(data)
         elif filter_name == FT.JPX_DECODE:
             data = JPXDecode.decode(data)
+        elif filter_name == FT.BROTLI_DECODE:
+            data = BrotliDecode.decode(data)
         elif filter_name == FT.JBIG2_DECODE:
             data = JBIG2Decode.decode(data, params)
         elif filter_name == "/Crypt":

diff --git a/pyproject.toml b/pyproject.toml
@@ -42,9 +42,11 @@ Source = "https://github.com/py-pdf/pypdf"
 crypto = ["cryptography"]
 cryptodome = ["PyCryptodome"]
 image = ["Pillow>=8.0.0"]
+brotli = ["brotli"]
 full = [
     "cryptography",
-    "Pillow>=8.0.0"
+    "Pillow>=8.0.0",
+    "brotli",
 ]
 dev = [
     "black",

diff --git a/requirements/ci-3.11.txt b/requirements/ci-3.11.txt
@@ -4,6 +4,8 @@
 #
 #    pip-compile --output-file=requirements/ci-3.11.txt requirements/ci.in
 #
+brotli==1.1.0
+    # via -r requirements/ci.in
 cffi==1.17.1
     # via cryptography
 coverage[toml]==7.6.4

diff --git a/requirements/ci.in b/requirements/ci.in
@@ -13,3 +13,4 @@ pytest-cov
 typeguard
 types-Pillow
 pyyaml
+brotli
diff --git a/requirements/ci.txt b/requirements/ci.txt
@@ -4,6 +4,8 @@
 #
 #    pip-compile requirements/ci.in
 #
+brotli==1.1.0
+    # via -r requirements/ci.in
 cffi==1.17.1
     # via cryptography
 coverage[toml]==7.6.1

diff --git a/requirements/dev.in b/requirements/dev.in
@@ -4,3 +4,4 @@ pre-commit
 pytest-cov
 flit
 wheel
+brotli
diff --git a/requirements/dev.txt b/requirements/dev.txt
@@ -4,6 +4,8 @@
 #
 #    pip-compile requirements/dev.in
 #
+brotli==1.1.0
+    # via -r requirements/dev.in
 build==1.2.2.post1
     # via pip-tools
 certifi==2024.8.30

diff --git a/resources/brotli-test-pdfs/minimal-brotli-compressed.pdf b/resources/brotli-test-pdfs/minimal-brotli-compressed.pdf
diff --git a/resources/create_brotli_test_pdf.py b/resources/create_brotli_test_pdf.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+"""
+Create a minimal PDF with Brotli compression for testing purposes.
+
+The page's content stream is compressed with Brotli and declared with
+/Filter /BrotliDecode, allowing for testing of the BrotliDecode filter in
+pypdf. Output: brotli-test-pdfs/minimal-brotli-compressed.pdf next to here.
+
+Note: /BrotliDecode is not a standard PDF filter. This file is specifically
+for testing PDF library support for this filter (e.g., in pypdf).
+Standard PDF viewers will likely not render this file correctly.
+"""
+
+import logging
+from pathlib import Path
+
+import brotli
+
+logging.basicConfig(level=logging.INFO, format="%(name)s: %(levelname)s: %(message)s")
+logger = logging.getLogger(__name__)
+
+content_stream = b"BT /F1 24 Tf 100 700 Td (Hello, Brotli!) Tj ET"
+compressed_content = brotli.compress(content_stream, quality=5)
+
+xref_offsets = [0] * 6  # xref_offsets[i]: byte offset where object i starts (index 0 unused)
+current_offset = 0  # total length of the parts appended so far
+pdf_parts = []  # byte chunks, written to the file in this order
+
+part = b"%PDF-1.7\n%\xc2\xa5\xc2\xb1\xc3\xab\xc3\xbf\n"  # Binary marker
+pdf_parts.append(part)
+current_offset += len(part)
+xref_offsets[1] = current_offset  # start of object 1 (catalog), right after the header
+
+part = b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n"
+pdf_parts.append(part)
+current_offset += len(part)
+xref_offsets[2] = current_offset  # start of object 2 (page tree)
+
+part = b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n"
+pdf_parts.append(part)
+current_offset += len(part)
+xref_offsets[3] = current_offset  # start of object 3 (the single page)
+
+part = (
+    b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] "
+    b"/Contents 4 0 R /Resources << /Font << /F1 5 0 R >> >> >>\nendobj\n"
+)
+pdf_parts.append(part)
+current_offset += len(part)
+xref_offsets[4] = current_offset  # start of object 4 (Brotli-compressed content stream)
+
+part_header = (
+    f"4 0 obj\n<< /Length {len(compressed_content)} /Filter /BrotliDecode >>\nstream\n"
+).encode("ascii")  # /Length is the size of the compressed payload
+part_footer = b"\nendstream\nendobj\n"
+pdf_parts.append(part_header)
+pdf_parts.append(compressed_content)
+pdf_parts.append(part_footer)
+current_offset += len(part_header) + len(compressed_content) + len(part_footer)
+xref_offsets[5] = current_offset  # start of object 5 (font)
+
+part = b"5 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>\nendobj\n"
+pdf_parts.append(part)
+current_offset += len(part)
+xref_table_start_offset = current_offset  # xref table follows object 5; written after startxref
+
+xref_lines = [b"xref\n0 6\n", b"0000000000 65535 f \n"]  # section header + "f" entry for object 0
+xref_lines.extend(
+    f"{xref_offsets[i]:010d} 00000 n \n".encode("ascii") for i in range(1, 6)  # 20 bytes per entry
+)
+pdf_parts.extend(xref_lines)
+
+trailer = (
+    f"trailer\n<< /Size 6 /Root 1 0 R >>\nstartxref\n{xref_table_start_offset}\n%%EOF"
+).encode("ascii")
+pdf_parts.append(trailer)
+
+script_path = Path(__file__).resolve()
+output_dir = script_path.parent / "brotli-test-pdfs"
+output_path = output_dir / "minimal-brotli-compressed.pdf"
+
+output_dir.mkdir(parents=True, exist_ok=True)
+
+try:
+    with open(output_path, "wb") as f:
+        for part in pdf_parts:
+            f.write(part)
+    logger.info(f"Created test PDF with Brotli compression at: {output_path}")
+except OSError:  # logged with traceback, not re-raised
+    logger.exception("Error writing PDF file")
diff --git a/tests/test_filters.py b/tests/test_filters.py
@@ -1,4 +1,5 @@
 """Test the pypdf.filters module."""
+
 import os
 import shutil
 import string
@@ -7,6 +8,7 @@
 from itertools import product as cartesian_product
 from pathlib import Path
 from unittest import mock
+from unittest.mock import patch
 
 import pytest
 from PIL import Image, ImageOps
@@ -16,6 +18,7 @@
 from pypdf.filters import (
     ASCII85Decode,
     ASCIIHexDecode,
+    BrotliDecode,
     CCITParameters,
     CCITTFaxDecode,
     CCITTParameters,
@@ -34,6 +37,12 @@
 
 from . import PILContext, get_data_from_url
 from .test_encryption import HAS_AES
+
+try:
+    import brotli  # noqa: F401
+    HAS_BROTLI = True
+except ImportError:
+    HAS_BROTLI = False
 from .test_images import image_similarity
 
 filter_inputs = (
@@ -62,6 +71,52 @@ def test_flate_decode_encode(predictor, s):
     assert codec.decode(encoded, DictionaryObject({"/Predictor": predictor})) == s
 
 
+@pytest.mark.parametrize("s", filter_inputs)
+@pytest.mark.skipif(not HAS_BROTLI, reason="brotli library not installed")
+def test_brotli_decode_encode(s):
+    """Round-trip each ``filter_inputs`` sample through BrotliDecode encode/decode."""
+    codec = BrotliDecode()
+    s_bytes = s.encode()
+    encoded = codec.encode(s_bytes)
+    assert encoded != s_bytes  # Ensure encoding actually happened
+    decoded = codec.decode(encoded)
+    assert decoded == s_bytes
+
+
+@patch("pypdf.filters.brotli", None)
+def test_brotli_missing_installation():
+    """With ``pypdf.filters.brotli`` patched to None, all entry points raise ImportError."""
+    from pypdf.filters import BrotliDecode, decode_stream_data  # noqa: PLC0415
+
+    # Direct decode call
+    codec = BrotliDecode()
+    with pytest.raises(ImportError) as exc_info_decode:
+        codec.decode(b"test data")
+    assert "Brotli library not installed" in str(exc_info_decode.value)
+    # Direct encode call, expected to fail with the same message
+    with pytest.raises(ImportError) as exc_info_encode:
+        codec.encode(b"test data")
+    assert "Brotli library not installed" in str(exc_info_encode.value)
+
+    # Call via decode_stream_data, using a dictionary that names /BrotliDecode as its filter
+    stream = DictionaryObject()
+    stream[NameObject("/Filter")] = NameObject("/BrotliDecode")
+    stream._data = b"dummy compressed data"
+    with pytest.raises(ImportError) as exc_info_stream:
+        decode_stream_data(stream)
+    assert "Brotli library not installed" in str(exc_info_stream.value)
+
+
+@pytest.mark.skipif(not HAS_BROTLI, reason="brotli library not installed")
+def test_brotli_decode_encode_with_real_module():
+    """Round-trip one fixed byte string through encode/decode with brotli installed."""
+    s = b"Hello, Brotli!"
+    codec = BrotliDecode()
+    encoded = codec.encode(s)
+    assert encoded != s  # Ensure encoding actually happened
+    assert codec.decode(encoded) == s
+
+
 def test_flatedecode_unsupported_predictor():
     """
     FlateDecode raises PdfReadError for unsupported predictors.
@@ -383,7 +438,9 @@ def test_iss1787():
     obj = data.indirect_reference.get_object()
     obj["/DecodeParms"][NameObject("/Columns")] = NumberObject(1000)
     obj.decoded_self = None
-    with pytest.raises(expected_exception=PdfReadError, match="^Unsupported PNG filter 244$"):
+    with pytest.raises(
+        expected_exception=PdfReadError, match="^Unsupported PNG filter 244$"
+    ):
         _ = reader.pages[0].images[0]
 
 
@@ -700,7 +757,9 @@ def test_flate_decode__not_rectangular(caplog):
     decode_parms[NameObject("/Columns")] = NumberObject(4881)
     actual = FlateDecode.decode(data=data, decode_parms=decode_parms)
     actual_image = BytesIO()
-    Image.frombytes(mode="1", size=(4881, 81), data=actual).save(actual_image, format="png")
+    Image.frombytes(mode="1", size=(4881, 81), data=actual).save(
+        actual_image, format="png"
+    )
 
     url = "https://github.com/user-attachments/assets/c5695850-c076-4255-ab72-7c86851a4a04"
     name = "issue3241.png"
@@ -709,6 +768,23 @@ def test_flate_decode__not_rectangular(caplog):
     assert caplog.messages == ["Image data is not rectangular. Adding padding."]
 
 
+@pytest.mark.skipif(not HAS_BROTLI, reason="brotli library not installed")
+def test_main_decode_brotli_installed():
+    """Extract the page text from the bundled Brotli-compressed sample PDF."""
+    pdf_path = RESOURCE_ROOT / "brotli-test-pdfs" / "minimal-brotli-compressed.pdf"
+
+    reader = PdfReader(pdf_path)
+    page = reader.pages[0]
+
+    # Intended to exercise the BrotliDecode branch of decode_stream_data
+    # on a real PDF rather than on a hand-built stream object.
+    extracted_text = page.extract_text()
+
+    assert extracted_text.strip() == "Hello, Brotli!"
+
+
+def test_brotli_module_importability():
+    assert BrotliDecode is not None  # smoke check: the module-level import resolved the class
 def test_jbig2decode__binary_errors():
     with mock.patch("pypdf.filters.JBIG2DEC_BINARY", None), \
             pytest.raises(DependencyError, match="jbig2dec binary is not available."):
-Original file line number
+Diff line change
@@ Expand Up / @@ -13,3 +13,4 @@ pytest-cov @@
     typeguard
     types-Pillow
     pyyaml
+    brotli