Skip to content

Commit eb6b294

Browse files
author
Nathanaël Renaud
committed
STY: consider images inside page patterns.
Added code to detect patterns in "_get_ids_image". To avoid any conflicts with images that could be located directly in a page, or with images using the same ID in different patterns, image IDs under patterns are returned in this form: "/Pattern/patternNameHere/imageNameHere". Added code to deal with Pattern images in "_get_image".
1 parent 32f826b commit eb6b294

File tree

2 files changed

+113
-26
lines changed

2 files changed

+113
-26
lines changed

pypdf/_page.py

Lines changed: 70 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -457,19 +457,35 @@ def _get_ids_image(
457457
if ancest is None:
458458
ancest = []
459459
lst: List[Union[str, List[str]]] = []
460-
if PG.RESOURCES not in obj or RES.XOBJECT not in cast(
461-
DictionaryObject, obj[PG.RESOURCES]
462-
):
463-
return self.inline_images_keys
464-
465-
x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore
466-
for o in x_object:
467-
if not isinstance(x_object[o], StreamObject):
468-
continue
469-
if x_object[o][IA.SUBTYPE] == "/Image":
470-
lst.append(o if len(ancest) == 0 else ancest + [o])
471-
else: # is a form with possible images inside
472-
lst.extend(self._get_ids_image(x_object[o], ancest + [o], call_stack))
460+
461+
if PG.RESOURCES in obj:
462+
if RES.PATTERN in cast(DictionaryObject, obj[PG.RESOURCES]):
463+
for patternName, pattern in obj[PG.RESOURCES][RES.PATTERN].items():
464+
if PG.RESOURCES in pattern.get_object():
465+
if RES.XOBJECT in cast(DictionaryObject, pattern[PG.RESOURCES]):
466+
x_object = pattern[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore
467+
for o in x_object:
468+
if not isinstance(x_object[o], StreamObject):
469+
continue
470+
if x_object[o][IA.SUBTYPE] == "/Image":
471+
lst.append(
472+
f"{RES.PATTERN}{patternName}{o}"
473+
if len(ancest) == 0
474+
else ancest + [o]
475+
)
476+
477+
if RES.XOBJECT in cast(DictionaryObject, obj[PG.RESOURCES]):
478+
x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore
479+
for o in x_object:
480+
if not isinstance(x_object[o], StreamObject):
481+
continue
482+
if x_object[o][IA.SUBTYPE] == "/Image":
483+
lst.append(o if len(ancest) == 0 else ancest + [o])
484+
else: # is a form with possible images inside
485+
lst.extend(
486+
self._get_ids_image(x_object[o], ancest + [o], call_stack)
487+
)
488+
473489
return lst + self.inline_images_keys
474490

475491
def _get_image(
@@ -484,9 +500,27 @@ def _get_image(
484500
if isinstance(id, List) and len(id) == 1:
485501
id = id[0]
486502
try:
487-
xobjs = cast(
488-
DictionaryObject, cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT]
489-
)
503+
if isinstance(id, str) and id.find(RES.PATTERN) == 0:
504+
pattern_name = id[len(RES.PATTERN) : id.find("/", len(RES.PATTERN) + 1)]
505+
image_name = id[id.rfind("/") :]
506+
507+
patterns = cast(
508+
DictionaryObject,
509+
cast(DictionaryObject, obj[PG.RESOURCES])[RES.PATTERN],
510+
)
511+
512+
xobjs = cast(
513+
DictionaryObject,
514+
cast(DictionaryObject, patterns[pattern_name][PG.RESOURCES])[
515+
RES.XOBJECT
516+
],
517+
)
518+
519+
else:
520+
xobjs = cast(
521+
DictionaryObject,
522+
cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT],
523+
)
490524
except KeyError:
491525
if not (id[0] == "~" and id[-1] == "~"):
492526
raise
@@ -497,16 +531,26 @@ def _get_image(
497531
if self.inline_images is None: # pragma: no cover
498532
raise KeyError("no inline image can be found")
499533
return self.inline_images[id]
500-
501-
imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id]))
502-
extension, byte_stream = imgd[:2]
503-
f = ImageFile(
504-
name=f"{id[1:]}{extension}",
505-
data=byte_stream,
506-
image=imgd[2],
507-
indirect_reference=xobjs[id].indirect_reference,
508-
)
509-
return f
534+
elif id.find("/Pattern") == 0:
535+
imgd = _xobj_to_image(cast(DictionaryObject, xobjs[image_name]))
536+
extension, byte_stream = imgd[:2]
537+
f = ImageFile(
538+
name=f"{pattern_name[1:]}_{image_name[1:]}{extension}",
539+
data=byte_stream,
540+
image=imgd[2],
541+
indirect_reference=xobjs[image_name].indirect_reference,
542+
)
543+
return f
544+
else:
545+
imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id]))
546+
extension, byte_stream = imgd[:2]
547+
f = ImageFile(
548+
name=f"{id[1:]}{extension}",
549+
data=byte_stream,
550+
image=imgd[2],
551+
indirect_reference=xobjs[id].indirect_reference,
552+
)
553+
return f
510554
else: # in a sub object
511555
ids = id[1:]
512556
return self._get_image(ids, cast(DictionaryObject, xobjs[id[0]]))

tests/test_images.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,49 @@ def test_image_extraction(src, page_index, image_key, expected):
214214
assert image_similarity(BytesIO(actual_image.data), expected) >= 0.99
215215

216216

217+
@pytest.mark.parametrize(
218+
("src", "page_index", "image_key", "expected"),
219+
[
220+
(
221+
SAMPLE_ROOT / "027-onlyoffice-image/Patterns.pdf",
222+
0,
223+
"/Pattern/P1/X1",
224+
SAMPLE_ROOT / "027-onlyoffice-image/P1_X1.jpg",
225+
),
226+
(
227+
SAMPLE_ROOT / "027-onlyoffice-image/Patterns.pdf",
228+
0,
229+
"/Pattern/P2/X1",
230+
SAMPLE_ROOT / "027-onlyoffice-image/P2_X1.jpg",
231+
),
232+
(
233+
SAMPLE_ROOT / "027-onlyoffice-image/Patterns.pdf",
234+
0,
235+
"/Pattern/P3/X1",
236+
SAMPLE_ROOT / "027-onlyoffice-image/P3_X1.jpg",
237+
),
238+
],
239+
ids=[
240+
"027-onlyoffice-image/P1_X1.jpg",
241+
"027-onlyoffice-image/P2_X1.jpg",
242+
"027-onlyoffice-image/P3_X1.jpg",
243+
],
244+
)
245+
@pytest.mark.samples()
246+
def test_patterns_image_extraction(src, page_index, image_key, expected):
247+
reader = PdfReader(src)
248+
extreactedIDs = reader.pages[page_index].images
249+
250+
assert (
251+
str(extreactedIDs)
252+
== "[Image_0=/Pattern/P1/X1, Image_1=/Pattern/P2/X1, Image_2=/Pattern/P3/X1]"
253+
)
254+
255+
actual_image = reader.pages[page_index].images[image_key]
256+
257+
assert image_similarity(BytesIO(actual_image.data), expected) >= 0.99
258+
259+
217260
@pytest.mark.enable_socket()
218261
@pytest.mark.timeout(30)
219262
def test_loop_in_image_keys():

0 commit comments

Comments
 (0)