Skip to content

Commit 24c095b

Browse files
ROB: Resolve some image extraction edge cases (#3371)
Relates to #3369.
1 parent a8016c9 commit 24c095b

File tree

4 files changed

+70
-6
lines changed

4 files changed

+70
-6
lines changed

pypdf/_page.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -601,12 +601,14 @@ def _get_ids_image(
601601
if ancest is None:
602602
ancest = []
603603
lst: List[Union[str, List[str]]] = []
604-
if PG.RESOURCES not in obj or RES.XOBJECT not in cast(
605-
DictionaryObject, obj[PG.RESOURCES]
604+
if (
605+
PG.RESOURCES not in obj or
606+
is_null_or_none(resources := obj[PG.RESOURCES]) or
607+
RES.XOBJECT not in cast(DictionaryObject, resources)
606608
):
607609
return [] if self.inline_images is None else list(self.inline_images.keys())
608610

609-
x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore
611+
x_object = resources[RES.XOBJECT].get_object() # type: ignore
610612
for o in x_object:
611613
if not isinstance(x_object[o], StreamObject):
612614
continue

pypdf/_xobj_image_helpers.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
EncodedStreamObject,
1616
NullObject,
1717
TextStringObject,
18+
is_null_or_none,
1819
)
1920

2021
if sys.version_info[:2] >= (3, 10):
@@ -54,7 +55,7 @@ def _get_imagemode(
5455
raise PdfReadError(
5556
"Color spaces nested too deeply. If required, consider increasing MAX_IMAGE_MODE_NESTING_DEPTH."
5657
)
57-
if isinstance(color_space, NullObject):
58+
if is_null_or_none(color_space):
5859
return "", False
5960
color_space_str: str = ""
6061
if isinstance(color_space, str):
@@ -156,6 +157,24 @@ def _extended_image_frombytes(
156157
return img
157158

158159

160+
def __handle_flate__indexed(color_space: ArrayObject) -> Tuple[Any, Any, Any, Any]:
    """
    Split an ``/Indexed`` color space array into its four components.

    Args:
        color_space: The color space array; the caller has already checked
            that its first entry is ``/Indexed``.

    Returns:
        The tuple ``(color_space, base, hival, lookup)``. For the regular
        four-element form every entry is resolved with ``get_object()``.
        For the three-element AutoDesk variant, ``base`` and ``hival`` are
        recovered from the NUL-separated second entry.

    Raises:
        PdfReadError: If the array is neither the regular four-element form
            nor a well-formed three-element AutoDesk variant.
    """
    count = len(color_space)
    if count == 4:
        color_space, base, hival, lookup = (value.get_object() for value in color_space)
        return color_space, base, hival, lookup

    # Deal with strange AutoDesk files where `base` and `hival` look like this:
    #     /DeviceRGB\x00255
    # Only look at the second entry once we know it exists, so that short
    # arrays end in the PdfReadError below instead of an IndexError.
    if count == 3:
        element1 = color_space[1]
        element1 = element1 if isinstance(element1, str) else element1.get_object()
        # The resolved entry may be a non-string object; `in` would fail on it.
        if isinstance(element1, str) and "\x00" in element1:
            # Split on the first NUL only; anything unparsable after it is
            # reported as a malformed color space rather than a bare ValueError.
            base, _, hival_text = element1.partition("\x00")
            try:
                hival = int(hival_text)
            except ValueError:
                hival = None
            if hival is not None:
                return color_space[0].get_object(), base, hival, color_space[2].get_object()
    raise PdfReadError(f"Expected color space with 4 values, got {count}: {color_space}")
176+
177+
159178
def _handle_flate(
160179
size: Tuple[int, int],
161180
data: bytes,
@@ -174,7 +193,7 @@ def _handle_flate(
174193
base: Any
175194
hival: Any
176195
if isinstance(color_space, ArrayObject) and color_space[0] == "/Indexed":
177-
color_space, base, hival, lookup = (value.get_object() for value in color_space)
196+
color_space, base, hival, lookup = __handle_flate__indexed(color_space)
178197
if mode == "2bits":
179198
mode = "P"
180199
data = bits2byte(data, size, 2)
@@ -365,7 +384,7 @@ def _get_mode_and_invert_color(
365384
if (
366385
colors == 1
367386
and (
368-
not isinstance(color_space, NullObject)
387+
not is_null_or_none(color_space)
369388
and "Gray" not in color_space
370389
)
371390
)

tests/test_images.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -572,3 +572,12 @@ def test_jbig2decode__jbig2globals():
572572

573573
# Wrong image: 0.9618265964800714
574574
assert image_similarity(image.image, img) >= 0.999
575+
576+
577+
@pytest.mark.enable_socket
def test_get_ids_image__resources_is_none():
    """The third page of the downloaded sample file must report no images."""
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/18381726/tika-957721.pdf",
        name="tika-957721.pdf",
    )
    third_page = PdfReader(BytesIO(pdf_bytes)).pages[2]
    # Materialize the items so that image discovery actually runs for the page.
    extracted = list(third_page.images.items())
    assert extracted == []

tests/test_xobject_image_helpers.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Test the pypdf._xobj_image_helpers module."""
22
from io import BytesIO
3+
from pathlib import Path
34

45
import pytest
56

@@ -10,6 +11,10 @@
1011

1112
from . import get_data_from_url
1213

14+
# Directory holding this test module, its parent directory, and the
# "resources" directory below that parent.
TESTS_ROOT = Path(__file__).parent.resolve()
PROJECT_ROOT = TESTS_ROOT.parents[0]
RESOURCE_ROOT = PROJECT_ROOT.joinpath("resources")
17+
1318

1419
@pytest.mark.enable_socket
1520
def test_get_imagemode_recursion_depth():
@@ -126,3 +131,32 @@ def test_extended_image_frombytes_zero_data():
126131

127132
with pytest.raises(EmptyImageDataError, match="Data is 0 bytes, cannot process an image from empty data."):
128133
_extended_image_frombytes(mode, size, data)
134+
135+
136+
def test_handle_flate__autodesk_indexed():
    """Exercise indexed color spaces in AutoCad_Diagram.pdf, intact and tampered."""
    pdf_path = RESOURCE_ROOT / "AutoCad_Diagram.pdf"

    # Unmodified file: every image on the first page has a "/"-prefixed name
    # and can be loaded.
    first_page = PdfReader(pdf_path).pages[0]
    for name, image in first_page.images.items():
        assert name.startswith("/")
        image.image.load()

    # Strip the "\x00255" suffix from the base entry; iterating the images of
    # the patched file must then fail with the error matched below.
    tampered = pdf_path.read_bytes().replace(b"/DeviceRGB\x00255", b"/DeviceRGB")
    first_page = PdfReader(BytesIO(tampered)).pages[0]
    expected_message = r"^Expected color space with 4 values, got 3: \['/Indexed', '/DeviceRGB', '\\x00\\x80\\x00\\x80\\x80耀"  # noqa: E501
    with pytest.raises(PdfReadError, match=expected_message):
        for name, _image in first_page.images.items():  # noqa: PERF102
            assert name.startswith("/")
154+
155+
@pytest.mark.enable_socket
def test_get_mode_and_invert_color():
    """Load every image on the thirteenth page of the downloaded sample file."""
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/18381726/tika-957721.pdf",
        name="tika-957721.pdf",
    )
    thirteenth_page = PdfReader(BytesIO(pdf_bytes)).pages[12]
    # No explicit assertion: the test passes when loading raises nothing.
    for _name, image in thirteenth_page.images.items():  # noqa: PERF102
        image.image.load()

0 commit comments

Comments
 (0)