Skip to content

ENH: Add support for BrotliDecode filter (PDF 2.0) #3223 #3254

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 24 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
025226a
ENH: Add support for BrotliDecode filter (PDF 2.0) #3223
ash01ish Apr 21, 2025
97fc4a4
STY: Slightly increase readability (#3098)
j-t-1 Apr 15, 2025
7da38f1
BUG: Get font information more reliably when removing text (#3252)
samuelbradshaw Apr 15, 2025
5b7ac7e
MAINT: Modify some comments (#3256)
j-t-1 Apr 17, 2025
50c230d
STY: Remove variable check_crlf_space (#3096)
j-t-1 Apr 17, 2025
42b9efe
MAINT: Remove an unused variable and zero from the start of slices (#…
j-t-1 Apr 17, 2025
2d03292
MAINT: Tweak comments (#3257)
j-t-1 Apr 17, 2025
9c064d3
BUG: T* 2D Translation consistent with PDF 1.7 Spec (#3250)
hackowitz-af Apr 14, 2025
cbbea23
STY: Remove variable check_crlf_space (#3096)
j-t-1 Apr 17, 2025
3ba2235
update changes as suggested
ash01ish May 9, 2025
7dcd2e9
Merge branch 'main' into feat/add-brotli-decode
ash01ish May 9, 2025
3935ea2
Update test_text_extraction.py
ash01ish May 9, 2025
b4ac3d9
Update test_filters.py
ash01ish May 9, 2025
666b871
Update test_filters.py
ash01ish May 9, 2025
48cee95
remove unused imports
ash01ish May 9, 2025
94b6485
change import error test case
ash01ish May 9, 2025
9b8b80a
fix code fmt
ash01ish May 9, 2025
e099ead
improve as per suggestions.
ash01ish May 17, 2025
ab1c492
fix code fmt
ash01ish May 17, 2025
dc2b4db
remove whitespace
ash01ish May 17, 2025
fd842b3
Merge branch 'main' into feat/add-brotli-decode
ash01ish Jun 5, 2025
1016c29
Update test_filters.py
ash01ish Jun 5, 2025
b0bf326
Merge branch 'main' into feat/add-brotli-decode
ash01ish Jun 24, 2025
f143805
fix code fmt issues
ash01ish Jun 24, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pypdf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,7 @@ class FilterTypes(StrEnum):
CCITT_FAX_DECODE = "/CCITTFaxDecode" # abbreviation: CCF
DCT_DECODE = "/DCTDecode" # abbreviation: DCT
JPX_DECODE = "/JPXDecode"
BROTLI_DECODE = "/BrotliDecode" # abbreviation: Br, PDF 2.0
JBIG2_DECODE = "/JBIG2Decode"


Expand All @@ -258,6 +259,7 @@ class FilterTypeAbbreviations:
RL = "/RL"
CCF = "/CCF"
DCT = "/DCT"
BR = "/Br"


class LzwFilterParameters:
Expand Down
69 changes: 69 additions & 0 deletions pypdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,11 @@
is_null_or_none,
)

# brotli is an optional dependency. When it is missing, the module-level name
# is bound to None so that BrotliDecode can detect this at call time and raise
# a descriptive ImportError instead of failing at import of this module.
try:
    import brotli
except ImportError:
    brotli = None

Check warning on line 78 in pypdf/filters.py

View check run for this annotation

Codecov / codecov/patch

pypdf/filters.py#L77-L78

Added lines #L77 - L78 were not covered by tests


def decompress(data: bytes) -> bytes:
"""
Expand Down Expand Up @@ -513,6 +518,68 @@
return data


class BrotliDecode:
    """
    Codec for streams compressed with Brotli.

    Brotli is a general-purpose lossless compression algorithm that combines
    LZ77 and Huffman coding. It typically achieves better compression ratios
    than Flate encoding, though with slightly slower compression speeds.

    Both entry points need the optional ``brotli`` package and raise
    ``ImportError`` when it is not installed.

    See ISO 32000-2:2020, Section 7.4.11.
    """

    @staticmethod
    def _brotli_module() -> Any:
        """
        Return the ``brotli`` module.

        The module-level name is looked up on every call, so the check
        reflects the current state of the optional import.

        Raises:
            ImportError: If the 'brotli' library is not installed.
        """
        if brotli is None:
            raise ImportError("Brotli library not installed. Required for BrotliDecode filter.")
        return brotli

    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Decompress Brotli-compressed data.

        Args:
            data: Brotli-compressed data.
            decode_parms: A dictionary of parameter values (unused).
            **kwargs: Additional keyword arguments (unused).

        Returns:
            The decompressed data.

        Raises:
            ImportError: If the 'brotli' library is not installed.
        """
        return BrotliDecode._brotli_module().decompress(data)

    @staticmethod
    def encode(data: bytes, **kwargs: Any) -> bytes:
        """
        Compress data with Brotli.

        Args:
            data: The data to be compressed.
            **kwargs: Additional keyword arguments (unused).

        Returns:
            The compressed data.

        Raises:
            ImportError: If the 'brotli' library is not installed.
        """
        return BrotliDecode._brotli_module().compress(data)


@dataclass
class CCITTParameters:
"""§7.4.6, optional parameters for the CCITTFaxDecode filter."""
Expand Down Expand Up @@ -759,6 +826,8 @@
data = DCTDecode.decode(data)
elif filter_name == FT.JPX_DECODE:
data = JPXDecode.decode(data)
elif filter_name == FT.BROTLI_DECODE:
data = BrotliDecode.decode(data)
elif filter_name == FT.JBIG2_DECODE:
data = JBIG2Decode.decode(data, params)
elif filter_name == "/Crypt":
Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,11 @@ Source = "https://github.com/py-pdf/pypdf"
crypto = ["cryptography"]
cryptodome = ["PyCryptodome"]
image = ["Pillow>=8.0.0"]
brotli = ["brotli"]
full = [
"cryptography",
"Pillow>=8.0.0"
"Pillow>=8.0.0",
"brotli",
]
dev = [
"black",
Expand Down
2 changes: 2 additions & 0 deletions requirements/ci-3.11.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
#
# pip-compile --output-file=requirements/ci-3.11.txt requirements/ci.in
#
brotli==1.1.0
# via -r requirements/ci.in
cffi==1.17.1
# via cryptography
coverage[toml]==7.6.4
Expand Down
1 change: 1 addition & 0 deletions requirements/ci.in
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ pytest-cov
typeguard
types-Pillow
pyyaml
brotli
2 changes: 2 additions & 0 deletions requirements/ci.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
#
# pip-compile requirements/ci.in
#
brotli==1.1.0
# via -r requirements/ci.in
cffi==1.17.1
# via cryptography
coverage[toml]==7.6.1
Expand Down
1 change: 1 addition & 0 deletions requirements/dev.in
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ pre-commit
pytest-cov
flit
wheel
brotli
2 changes: 2 additions & 0 deletions requirements/dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
#
# pip-compile requirements/dev.in
#
brotli==1.1.0
# via -r requirements/dev.in
build==1.2.2.post1
# via pip-tools
certifi==2024.8.30
Expand Down
Binary file not shown.
90 changes: 90 additions & 0 deletions resources/create_brotli_test_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#!/usr/bin/env python
"""
Generate a minimal single-page PDF whose content stream is Brotli-compressed.

The file is a fixture for testing the BrotliDecode filter in pypdf. It is
written to ``brotli-test-pdfs/minimal-brotli-compressed.pdf`` next to this
script.

Note: /BrotliDecode is not a standard PDF filter. This file is specifically
for testing PDF library support for this filter (e.g., in pypdf).
Standard PDF viewers will likely not render this file correctly.
"""

import logging
from pathlib import Path

import brotli

logging.basicConfig(level=logging.INFO, format="%(name)s: %(levelname)s: %(message)s")
logger = logging.getLogger(__name__)

# Page content stream showing the text "Hello, Brotli!".
content_stream = b"BT /F1 24 Tf 100 700 Td (Hello, Brotli!) Tj ET"
compressed_content = brotli.compress(content_stream, quality=5)

file_header = b"%PDF-1.7\n%\xc2\xa5\xc2\xb1\xc3\xab\xc3\xbf\n"  # Binary marker

# Indirect objects 1..5 in the order they appear in the file.
indirect_objects = [
    b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n",
    b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n",
    (
        b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] "
        b"/Contents 4 0 R /Resources << /Font << /F1 5 0 R >> >> >>\nendobj\n"
    ),
    b"".join(
        (
            (
                f"4 0 obj\n<< /Length {len(compressed_content)} /Filter /BrotliDecode >>\nstream\n"
            ).encode("ascii"),
            compressed_content,
            b"\nendstream\nendobj\n",
        )
    ),
    b"5 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>\nendobj\n",
]

# Lay the objects out after the header, recording the byte offset at which
# each one starts. Index 0 is a placeholder for the free entry of the table.
pdf_parts = [file_header]
xref_offsets = [0]
current_offset = len(file_header)
for object_bytes in indirect_objects:
    xref_offsets.append(current_offset)
    pdf_parts.append(object_bytes)
    current_offset += len(object_bytes)

# The cross-reference table starts right after the last object.
xref_table_start_offset = current_offset

pdf_parts.append(b"xref\n0 6\n")
pdf_parts.append(b"0000000000 65535 f \n")
pdf_parts.extend(
    f"{offset:010d} 00000 n \n".encode("ascii") for offset in xref_offsets[1:]
)

pdf_parts.append(
    (
        f"trailer\n<< /Size 6 /Root 1 0 R >>\nstartxref\n{xref_table_start_offset}\n%%EOF"
    ).encode("ascii")
)

output_dir = Path(__file__).resolve().parent / "brotli-test-pdfs"
output_path = output_dir / "minimal-brotli-compressed.pdf"
output_dir.mkdir(parents=True, exist_ok=True)

try:
    output_path.write_bytes(b"".join(pdf_parts))
except OSError:
    logger.exception("Error writing PDF file")
else:
    logger.info(f"Created test PDF with Brotli compression at: {output_path}")
80 changes: 78 additions & 2 deletions tests/test_filters.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Test the pypdf.filters module."""

import os
import shutil
import string
Expand All @@ -7,6 +8,7 @@
from itertools import product as cartesian_product
from pathlib import Path
from unittest import mock
from unittest.mock import patch

import pytest
from PIL import Image, ImageOps
Expand All @@ -16,6 +18,7 @@
from pypdf.filters import (
ASCII85Decode,
ASCIIHexDecode,
BrotliDecode,
CCITParameters,
CCITTFaxDecode,
CCITTParameters,
Expand All @@ -34,6 +37,12 @@

from . import PILContext, get_data_from_url
from .test_encryption import HAS_AES
from .test_images import image_similarity

# brotli is an optional dependency: record whether it is importable so that
# tests needing the real library can be skipped when it is absent. The probe
# is placed after all regular imports so no import follows executable code.
try:
    import brotli  # noqa: F401
except ImportError:
    HAS_BROTLI = False
else:
    HAS_BROTLI = True

filter_inputs = (
Expand Down Expand Up @@ -62,6 +71,52 @@ def test_flate_decode_encode(predictor, s):
assert codec.decode(encoded, DictionaryObject({"/Predictor": predictor})) == s


@pytest.mark.parametrize("s", filter_inputs)
@pytest.mark.skipif(not HAS_BROTLI, reason="brotli library not installed")
def test_brotli_decode_encode(s):
    """BrotliDecode round-trips every sample input: decode(encode(x)) == x."""
    raw = s.encode()
    compressed = BrotliDecode.encode(raw)
    # The compressed form must differ from the input, otherwise nothing was encoded.
    assert compressed != raw
    assert BrotliDecode.decode(compressed) == raw


@patch("pypdf.filters.brotli", None)
def test_brotli_missing_installation():
    """Each Brotli entry point raises ImportError while the brotli module is patched out."""
    from pypdf.filters import BrotliDecode, decode_stream_data  # noqa: PLC0415

    expected_message = "Brotli library not installed"

    # Direct calls on the codec.
    with pytest.raises(ImportError, match=expected_message):
        BrotliDecode.decode(b"test data")
    with pytest.raises(ImportError, match=expected_message):
        BrotliDecode.encode(b"test data")

    # Indirect call through decode_stream_data.
    stream = DictionaryObject()
    stream[NameObject("/Filter")] = NameObject("/BrotliDecode")
    stream._data = b"dummy compressed data"
    with pytest.raises(ImportError, match=expected_message):
        decode_stream_data(stream)


@pytest.mark.skipif(not HAS_BROTLI, reason="brotli library not installed")
def test_brotli_decode_encode_with_real_module():
    """A fixed byte string survives a Brotli encode/decode round trip."""
    original = b"Hello, Brotli!"
    compressed = BrotliDecode.encode(original)
    # Encoding must actually transform the data.
    assert compressed != original
    assert BrotliDecode.decode(compressed) == original


def test_flatedecode_unsupported_predictor():
"""
FlateDecode raises PdfReadError for unsupported predictors.
Expand Down Expand Up @@ -383,7 +438,9 @@ def test_iss1787():
obj = data.indirect_reference.get_object()
obj["/DecodeParms"][NameObject("/Columns")] = NumberObject(1000)
obj.decoded_self = None
with pytest.raises(expected_exception=PdfReadError, match="^Unsupported PNG filter 244$"):
with pytest.raises(
expected_exception=PdfReadError, match="^Unsupported PNG filter 244$"
):
_ = reader.pages[0].images[0]


Expand Down Expand Up @@ -700,7 +757,9 @@ def test_flate_decode__not_rectangular(caplog):
decode_parms[NameObject("/Columns")] = NumberObject(4881)
actual = FlateDecode.decode(data=data, decode_parms=decode_parms)
actual_image = BytesIO()
Image.frombytes(mode="1", size=(4881, 81), data=actual).save(actual_image, format="png")
Image.frombytes(mode="1", size=(4881, 81), data=actual).save(
actual_image, format="png"
)

url = "https://github.com/user-attachments/assets/c5695850-c076-4255-ab72-7c86851a4a04"
name = "issue3241.png"
Expand All @@ -709,6 +768,23 @@ def test_flate_decode__not_rectangular(caplog):
assert caplog.messages == ["Image data is not rectangular. Adding padding."]


@pytest.mark.skipif(not HAS_BROTLI, reason="brotli library not installed")
def test_main_decode_brotli_installed():
    """Text can be extracted from a real PDF whose content stream uses /BrotliDecode."""
    reader = PdfReader(
        RESOURCE_ROOT / "brotli-test-pdfs" / "minimal-brotli-compressed.pdf"
    )

    # Extracting text from a real PDF is meant to exercise the BrotliDecode
    # branch of decode_stream_data, not just the codec class in isolation.
    text = reader.pages[0].extract_text()

    assert text.strip() == "Hello, Brotli!"


def test_brotli_module_importability():
    """BrotliDecode is importable and exposes its codec entry points."""
    # An imported class can never be None, so check the API surface instead:
    # both entry points must exist and be callable.
    assert callable(BrotliDecode.decode)
    assert callable(BrotliDecode.encode)
def test_jbig2decode__binary_errors():
with mock.patch("pypdf.filters.JBIG2DEC_BINARY", None), \
pytest.raises(DependencyError, match="jbig2dec binary is not available."):
Expand Down
Loading