diff --git a/pypdf/_page.py b/pypdf/_page.py index 63038d9d0..fe91acb82 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -444,19 +444,78 @@ def _get_ids_image( if ancest is None: ancest = [] lst: List[Union[str, List[str]]] = [] - if PG.RESOURCES not in obj or RES.XOBJECT not in cast( - DictionaryObject, obj[PG.RESOURCES] - ): - return [] if self.inline_images is None else list(self.inline_images.keys()) - - x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore - for o in x_object: - if not isinstance(x_object[o], StreamObject): - continue - if x_object[o][IA.SUBTYPE] == "/Image": - lst.append(o if len(ancest) == 0 else ancest + [o]) - else: # is a form with possible images inside - lst.extend(self._get_ids_image(x_object[o], ancest + [o], call_stack)) + + if PG.ANNOTS in obj: + for annot in cast(DictionaryObject, obj[PG.ANNOTS]): + if ( + "/AP" in cast(DictionaryObject, annot.keys()) + and "/N" in cast(DictionaryObject, annot["/AP"].keys()) + and PG.RESOURCES in annot["/AP"]["/N"].get_object() + and RES.XOBJECT + in cast(DictionaryObject, annot["/AP"]["/N"][PG.RESOURCES]) + and "/FRM" + in cast( + DictionaryObject, annot["/AP"]["/N"][PG.RESOURCES][RES.XOBJECT] + ) + ): + frame = annot["/AP"]["/N"][PG.RESOURCES][RES.XOBJECT]["/FRM"] + + if PG.RESOURCES in frame.get_object() and RES.XOBJECT in cast( + DictionaryObject, frame[PG.RESOURCES] + ): + x_object = frame[PG.RESOURCES][RES.XOBJECT] + for o in x_object: + if ( + isinstance(x_object[o], StreamObject) + and x_object[o][IA.SUBTYPE] == "/Image" + ): + lst.extend( + [ + ( + f"{PG.ANNOTS}/{annot['/T']}{o}" + if len(ancest) == 0 + else ancest + [o] + ) + ] + ) + + if PG.RESOURCES in obj: + if RES.PATTERN in cast(DictionaryObject, obj[PG.RESOURCES]): + for pattern_name, pattern in cast( + DictionaryObject, + cast(DictionaryObject, obj[PG.RESOURCES])[RES.PATTERN], + ).items(): + if PG.RESOURCES in pattern.get_object() and RES.XOBJECT in cast( + DictionaryObject, pattern[PG.RESOURCES] + ): + x_object = pattern[PG.RESOURCES][RES.XOBJECT].get_object() + for o in x_object: + if ( + isinstance(x_object[o], StreamObject) + and x_object[o][IA.SUBTYPE] == "/Image" + ): + lst.extend( + [ + ( + f"{RES.PATTERN}{pattern_name}{o}" + if len(ancest) == 0 + else ancest + [o] + ) + ] + ) + + if RES.XOBJECT in cast(DictionaryObject, obj[PG.RESOURCES]): + x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore + for o in x_object: + if not isinstance(x_object[o], StreamObject): + continue + if x_object[o][IA.SUBTYPE] == "/Image": + lst.append(o if len(ancest) == 0 else ancest + [o]) + else: # is a form with possible images inside + lst.extend( + self._get_ids_image(x_object[o], ancest + [o], call_stack) + ) + assert self.inline_images is not None lst.extend(list(self.inline_images.keys())) return lst @@ -473,9 +532,50 @@ def _get_image( if isinstance(id, List) and len(id) == 1: id = id[0] try: - xobjs = cast( - DictionaryObject, cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT] - ) + if isinstance(id, str) and id.find(RES.PATTERN) == 0: + pattern_name = id[len(RES.PATTERN) : id.find("/", len(RES.PATTERN) + 1)] + + patterns = cast( + DictionaryObject, + cast(DictionaryObject, obj[PG.RESOURCES])[RES.PATTERN], + ) + + xobjs = cast( + DictionaryObject, + cast( + DictionaryObject, + cast(DictionaryObject, patterns[pattern_name])[PG.RESOURCES], + )[RES.XOBJECT], + ) + elif isinstance(id, str) and id.find(PG.ANNOTS) == 0: + annot_name = id[len(RES.PATTERN) : id.find("/", len(RES.PATTERN) + 1)] + annots = cast(DictionaryObject, obj[PG.ANNOTS]) + + for temp_annot in annots: + if temp_annot["/T"] == annot_name: + annot = temp_annot + break + + frame_xobjs = cast( + DictionaryObject, + cast( + DictionaryObject, + cast(DictionaryObject, annot["/AP"]["/N"])[PG.RESOURCES], + )[RES.XOBJECT], + ) + + xobjs = cast( + DictionaryObject, + cast( + DictionaryObject, + cast(DictionaryObject, frame_xobjs["/FRM"])[PG.RESOURCES], + )[RES.XOBJECT], + ) + else: + xobjs = cast( + DictionaryObject, + cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT], + ) except KeyError: if not (id[0] == "~" and id[-1] == "~"): raise @@ -487,15 +587,25 @@ def _get_image( raise KeyError("no inline image can be found") return self.inline_images[id] - imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id])) - extension, byte_stream = imgd[:2] - f = ImageFile( - name=f"{id[1:]}{extension}", + if id.find("/Pattern") == 0: + image_identifier = id[id.rfind("/") :] + image_name = pattern_name[1:] + "_" + image_identifier[1:] + elif id.find("/Annot") == 0: + image_identifier = id[id.rfind("/") :] + image_name = annot_name + "_" + image_identifier[1:] + else: + image_identifier = str(id) + image_name = id[1:] + + imgd = _xobj_to_image(cast(DictionaryObject, xobjs[image_identifier])) + image_extension, byte_stream = imgd[:2] + + return ImageFile( + name=image_name + str(image_extension), data=byte_stream, image=imgd[2], - indirect_reference=xobjs[id].indirect_reference, + indirect_reference=xobjs[image_identifier].indirect_reference, ) - return f else: # in a sub object ids = id[1:] return self._get_image(ids, cast(DictionaryObject, xobjs[id[0]])) diff --git a/sample-files b/sample-files index 8c405ece5..91311ce97 160000 --- a/sample-files +++ b/sample-files @@ -1 +1 @@ -Subproject commit 8c405ece5eff12396a34a1fae3276132002e1753 +Subproject commit 91311ce97033ea9669dba2a7b6c591c05ed74c76 diff --git a/tests/example_files.yaml b/tests/example_files.yaml index 049855d2f..475159ffb 100644 --- a/tests/example_files.yaml +++ b/tests/example_files.yaml @@ -112,5 +112,7 @@ url: https://github.com/py-pdf/pypdf/files/12050253/tt.pdf - local_filename: Pesquisa-de-Precos-Combustiveis-novembro-2023.pdf url: https://www.joinville.sc.gov.br/wp-content/uploads/2023/11/Pesquisa-de-Precos-Combustiveis-novembro-2023.pdf -- local_filename: iss2138.pdf - url: https://github.com/py-pdf/pypdf/files/12483807/AEO.1172.pdf +- local_filename: iss2613-onlyoffice-standardImages.pdf + url: https://github.com/py-pdf/pypdf/files/15355445/iss2613-onlyoffice-standardImages.pdf +- local_filename: iss2613-onlyoffice-form.pdf + url: https://github.com/py-pdf/pypdf/files/15355444/iss2613-onlyoffice-form.pdf diff --git a/tests/test_images.py b/tests/test_images.py index 5955bf47c..ae6b8f943 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -214,6 +214,66 @@ def test_image_extraction(src, page_index, image_key, expected): assert image_similarity(BytesIO(actual_image.data), expected) >= 0.99 +@pytest.mark.enable_socket() +def test_onlyoffice_standard_images_extraction(): + reader = PdfReader( + BytesIO(get_data_from_url(name="iss2613-onlyoffice-standardImages.pdf")) + ) + + assert ( + str(reader.pages[0].images) + == "[Image_0=/Pattern/P1/X1, Image_1=/Pattern/P2/X1, Image_2=/Pattern/P3/X1]" + ) + + url = "https://github.com/py-pdf/pypdf/assets/67143274/cc28b39b-2e96-4bd3-b33c-c545c5cec2d9" + name = "iss2613-P1_X1.jpg" + P1_X1 = Image.open(BytesIO(get_data_from_url(url, name=name))) + + assert image_similarity(reader.pages[0].images[0].image, P1_X1) >= 0.99 + + url = "https://github.com/py-pdf/pypdf/assets/67143274/827c9066-546a-4502-a613-579ec25c598e" + name = "iss2613-P2_X1.jpg" + P2_X1 = Image.open(BytesIO(get_data_from_url(url, name=name))) + + assert image_similarity(reader.pages[0].images[1].image, P2_X1) >= 0.99 + + url = "https://github.com/py-pdf/pypdf/assets/67143274/df9cb9e9-e589-4d2e-a537-ae0fe3240bbd" + name = "iss2613-P3_X1.jpg" + P3_X1 = Image.open(BytesIO(get_data_from_url(url, name=name))) + + assert image_similarity(reader.pages[0].images[2].image, P3_X1) >= 0.99 + + +@pytest.mark.enable_socket() +def test_onlyoffice_form_images_extraction(): + reader = PdfReader(BytesIO(get_data_from_url(name="iss2613-onlyoffice-form.pdf"))) + + assert ( + str(reader.pages[0].images) + == "[Image_0=/Annots/Image2_af_image/Img, Image_1=/Annots/Image3_af_image/Img]" + ) + + assert str(reader.pages[1].images) == "[Image_0=/Annots/Image4_af_image/Img]" + + url = "https://github.com/py-pdf/pypdf/assets/67143274/cc28b39b-2e96-4bd3-b33c-c545c5cec2d9" + name = "iss2613-P1_X1.jpg" + P1_X1 = Image.open(BytesIO(get_data_from_url(url, name=name))) + + assert image_similarity(reader.pages[0].images[0].image, P1_X1) >= 0.99 + + url = "https://github.com/py-pdf/pypdf/assets/67143274/827c9066-546a-4502-a613-579ec25c598e" + name = "iss2613-P2_X1.jpg" + P2_X1 = Image.open(BytesIO(get_data_from_url(url, name=name))) + + assert image_similarity(reader.pages[0].images[1].image, P2_X1) >= 0.99 + + url = "https://github.com/py-pdf/pypdf/assets/67143274/df9cb9e9-e589-4d2e-a537-ae0fe3240bbd" + name = "iss2613-P3_X1.jpg" + P3_X1 = Image.open(BytesIO(get_data_from_url(url, name=name))) + + assert image_similarity(reader.pages[1].images[0].image, P3_X1) >= 0.99 + + @pytest.mark.enable_socket() @pytest.mark.timeout(30) def test_loop_in_image_keys():