From 10f3b85e5c847322b6e77a1dd50ce0d1cd7facb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathana=C3=ABl=20Renaud?= Date: Sat, 11 May 2024 23:47:13 +0200 Subject: [PATCH 01/15] STY: consider images inside pages patterns. Added code to detect patterns in "_get_ids_image". To avoid any conflicts with images that could be located directly in a page or images using the same ID in differents patterns, images ids under patterns are returned in this form : "/Pattern/patternNameHere/imageNameHere" Added code to deal with Pattern images in "_get_image". --- pypdf/_page.py | 96 ++++++++++++++++++++++++++++++++------------ tests/test_images.py | 43 ++++++++++++++++++++ 2 files changed, 113 insertions(+), 26 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index ea8e6f5a9..a009e0dce 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -457,19 +457,35 @@ def _get_ids_image( if ancest is None: ancest = [] lst: List[Union[str, List[str]]] = [] - if PG.RESOURCES not in obj or RES.XOBJECT not in cast( - DictionaryObject, obj[PG.RESOURCES] - ): - return self.inline_images_keys - - x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore - for o in x_object: - if not isinstance(x_object[o], StreamObject): - continue - if x_object[o][IA.SUBTYPE] == "/Image": - lst.append(o if len(ancest) == 0 else ancest + [o]) - else: # is a form with possible images inside - lst.extend(self._get_ids_image(x_object[o], ancest + [o], call_stack)) + + if PG.RESOURCES in obj: + if RES.PATTERN in cast(DictionaryObject, obj[PG.RESOURCES]): + for patternName, pattern in obj[PG.RESOURCES][RES.PATTERN].items(): + if PG.RESOURCES in pattern.get_object(): + if RES.XOBJECT in cast(DictionaryObject, pattern[PG.RESOURCES]): + x_object = pattern[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore + for o in x_object: + if not isinstance(x_object[o], StreamObject): + continue + if x_object[o][IA.SUBTYPE] == "/Image": + lst.append( + f"{RES.PATTERN}{patternName}{o}" + if len(ancest) == 0 + else 
ancest + [o] + ) + + if RES.XOBJECT in cast(DictionaryObject, obj[PG.RESOURCES]): + x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore + for o in x_object: + if not isinstance(x_object[o], StreamObject): + continue + if x_object[o][IA.SUBTYPE] == "/Image": + lst.append(o if len(ancest) == 0 else ancest + [o]) + else: # is a form with possible images inside + lst.extend( + self._get_ids_image(x_object[o], ancest + [o], call_stack) + ) + return lst + self.inline_images_keys def _get_image( @@ -484,9 +500,27 @@ def _get_image( if isinstance(id, List) and len(id) == 1: id = id[0] try: - xobjs = cast( - DictionaryObject, cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT] - ) + if isinstance(id, str) and id.find(RES.PATTERN) == 0: + pattern_name = id[len(RES.PATTERN) : id.find("/", len(RES.PATTERN) + 1)] + image_name = id[id.rfind("/") :] + + patterns = cast( + DictionaryObject, + cast(DictionaryObject, obj[PG.RESOURCES])[RES.PATTERN], + ) + + xobjs = cast( + DictionaryObject, + cast(DictionaryObject, patterns[pattern_name][PG.RESOURCES])[ + RES.XOBJECT + ], + ) + + else: + xobjs = cast( + DictionaryObject, + cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT], + ) except KeyError: if not (id[0] == "~" and id[-1] == "~"): raise @@ -497,16 +531,26 @@ def _get_image( if self.inline_images is None: # pragma: no cover raise KeyError("no inline image can be found") return self.inline_images[id] - - imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id])) - extension, byte_stream = imgd[:2] - f = ImageFile( - name=f"{id[1:]}{extension}", - data=byte_stream, - image=imgd[2], - indirect_reference=xobjs[id].indirect_reference, - ) - return f + elif id.find("/Pattern") == 0: + imgd = _xobj_to_image(cast(DictionaryObject, xobjs[image_name])) + extension, byte_stream = imgd[:2] + f = ImageFile( + name=f"{pattern_name[1:]}_{image_name[1:]}{extension}", + data=byte_stream, + image=imgd[2], + indirect_reference=xobjs[image_name].indirect_reference, + ) + 
return f + else: + imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id])) + extension, byte_stream = imgd[:2] + f = ImageFile( + name=f"{id[1:]}{extension}", + data=byte_stream, + image=imgd[2], + indirect_reference=xobjs[id].indirect_reference, + ) + return f else: # in a sub object ids = id[1:] return self._get_image(ids, cast(DictionaryObject, xobjs[id[0]])) diff --git a/tests/test_images.py b/tests/test_images.py index e77090171..08d525957 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -214,6 +214,49 @@ def test_image_extraction(src, page_index, image_key, expected): assert image_similarity(BytesIO(actual_image.data), expected) >= 0.99 +@pytest.mark.parametrize( + ("src", "page_index", "image_key", "expected"), + [ + ( + SAMPLE_ROOT / "027-onlyoffice-image/Patterns.pdf", + 0, + "/Pattern/P1/X1", + SAMPLE_ROOT / "027-onlyoffice-image/P1_X1.jpg", + ), + ( + SAMPLE_ROOT / "027-onlyoffice-image/Patterns.pdf", + 0, + "/Pattern/P2/X1", + SAMPLE_ROOT / "027-onlyoffice-image/P2_X1.jpg", + ), + ( + SAMPLE_ROOT / "027-onlyoffice-image/Patterns.pdf", + 0, + "/Pattern/P3/X1", + SAMPLE_ROOT / "027-onlyoffice-image/P3_X1.jpg", + ), + ], + ids=[ + "027-onlyoffice-image/P1_X1.jpg", + "027-onlyoffice-image/P2_X1.jpg", + "027-onlyoffice-image/P3_X1.jpg", + ], +) +@pytest.mark.samples() +def test_patterns_image_extraction(src, page_index, image_key, expected): + reader = PdfReader(src) + extreactedIDs = reader.pages[page_index].images + + assert ( + str(extreactedIDs) + == "[Image_0=/Pattern/P1/X1, Image_1=/Pattern/P2/X1, Image_2=/Pattern/P3/X1]" + ) + + actual_image = reader.pages[page_index].images[image_key] + + assert image_similarity(BytesIO(actual_image.data), expected) >= 0.99 + + @pytest.mark.enable_socket() @pytest.mark.timeout(30) def test_loop_in_image_keys(): From 557d77da071ad898482ed18f8806705d3828095e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathana=C3=ABl=20Renaud?= <67143274+0xNath@users.noreply.github.com> Date: Sun, 12 May 2024 10:13:42 +0200 
Subject: [PATCH 02/15] Update _page.py fixed code style --- pypdf/_page.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index a009e0dce..15dd62ab2 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -461,8 +461,7 @@ def _get_ids_image( if PG.RESOURCES in obj: if RES.PATTERN in cast(DictionaryObject, obj[PG.RESOURCES]): for patternName, pattern in obj[PG.RESOURCES][RES.PATTERN].items(): - if PG.RESOURCES in pattern.get_object(): - if RES.XOBJECT in cast(DictionaryObject, pattern[PG.RESOURCES]): + if PG.RESOURCES in pattern.get_object() and RES.XOBJECT in cast(DictionaryObject, pattern[PG.RESOURCES]): x_object = pattern[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore for o in x_object: if not isinstance(x_object[o], StreamObject): From 524ffc36a6c1a88cfc59c61c2986e2bd19dc61de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathana=C3=ABl=20Renaud?= <67143274+0xNath@users.noreply.github.com> Date: Sun, 12 May 2024 10:14:37 +0200 Subject: [PATCH 03/15] Update _page.py fixed code style --- pypdf/_page.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 15dd62ab2..19a324344 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -462,16 +462,16 @@ def _get_ids_image( if RES.PATTERN in cast(DictionaryObject, obj[PG.RESOURCES]): for patternName, pattern in obj[PG.RESOURCES][RES.PATTERN].items(): if PG.RESOURCES in pattern.get_object() and RES.XOBJECT in cast(DictionaryObject, pattern[PG.RESOURCES]): - x_object = pattern[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore - for o in x_object: - if not isinstance(x_object[o], StreamObject): - continue - if x_object[o][IA.SUBTYPE] == "/Image": - lst.append( - f"{RES.PATTERN}{patternName}{o}" - if len(ancest) == 0 - else ancest + [o] - ) + x_object = pattern[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore + for o in x_object: + if not isinstance(x_object[o], StreamObject): + continue + if 
x_object[o][IA.SUBTYPE] == "/Image": + lst.append( + f"{RES.PATTERN}{patternName}{o}" + if len(ancest) == 0 + else ancest + [o] + ) if RES.XOBJECT in cast(DictionaryObject, obj[PG.RESOURCES]): x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore From 46adbe18b103d3e662e95806607769b7b03422cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathana=C3=ABl=20Renaud?= Date: Sun, 12 May 2024 10:18:47 +0200 Subject: [PATCH 04/15] fix code style... --- pypdf/_page.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 19a324344..f4d1357d2 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -461,7 +461,9 @@ def _get_ids_image( if PG.RESOURCES in obj: if RES.PATTERN in cast(DictionaryObject, obj[PG.RESOURCES]): for patternName, pattern in obj[PG.RESOURCES][RES.PATTERN].items(): - if PG.RESOURCES in pattern.get_object() and RES.XOBJECT in cast(DictionaryObject, pattern[PG.RESOURCES]): + if PG.RESOURCES in pattern.get_object() and RES.XOBJECT in cast( + DictionaryObject, pattern[PG.RESOURCES] + ): x_object = pattern[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore for o in x_object: if not isinstance(x_object[o], StreamObject): From 76b545fd56cefc843acea3be50e4b507eed91762 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathana=C3=ABl=20Renaud?= Date: Sun, 12 May 2024 10:43:51 +0200 Subject: [PATCH 05/15] fix typing --- pypdf/_page.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index f4d1357d2..cd121bccf 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -460,11 +460,14 @@ def _get_ids_image( if PG.RESOURCES in obj: if RES.PATTERN in cast(DictionaryObject, obj[PG.RESOURCES]): - for patternName, pattern in obj[PG.RESOURCES][RES.PATTERN].items(): + for patternName, pattern in cast( + DictionaryObject, + cast(DictionaryObject, obj[PG.RESOURCES])[RES.PATTERN], + ).items(): if PG.RESOURCES in pattern.get_object() and RES.XOBJECT in cast( 
DictionaryObject, pattern[PG.RESOURCES] ): - x_object = pattern[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore + x_object = pattern[PG.RESOURCES][RES.XOBJECT].get_object() for o in x_object: if not isinstance(x_object[o], StreamObject): continue @@ -476,7 +479,10 @@ def _get_ids_image( ) if RES.XOBJECT in cast(DictionaryObject, obj[PG.RESOURCES]): - x_object = obj[PG.RESOURCES][RES.XOBJECT].get_object() # type: ignore + x_object = cast( + DictionaryObject, + cast(DictionaryObject, obj[PG.RESOURCES])[RES.XOBJECT], + ).get_object() for o in x_object: if not isinstance(x_object[o], StreamObject): continue @@ -512,11 +518,11 @@ def _get_image( xobjs = cast( DictionaryObject, - cast(DictionaryObject, patterns[pattern_name][PG.RESOURCES])[ - RES.XOBJECT - ], + cast( + DictionaryObject, + cast(DictionaryObject, patterns[pattern_name])[PG.RESOURCES], + )[RES.XOBJECT], ) - else: xobjs = cast( DictionaryObject, From 6edfa2d7c7eecb62dfb4e0144f9472a15fcce564 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathana=C3=ABl=20Renaud?= Date: Fri, 17 May 2024 18:09:57 +0200 Subject: [PATCH 06/15] fixed typo --- tests/test_images.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_images.py b/tests/test_images.py index 08d525957..893eda33b 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -245,10 +245,10 @@ def test_image_extraction(src, page_index, image_key, expected): @pytest.mark.samples() def test_patterns_image_extraction(src, page_index, image_key, expected): reader = PdfReader(src) - extreactedIDs = reader.pages[page_index].images + extractedIDs = reader.pages[page_index].images assert ( - str(extreactedIDs) + str(extractedIDs) == "[Image_0=/Pattern/P1/X1, Image_1=/Pattern/P2/X1, Image_2=/Pattern/P3/X1]" ) From 0e81eeeb0b9bcd4120030d2623f9fe667fbab59e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathana=C3=ABl=20Renaud?= Date: Fri, 17 May 2024 18:10:32 +0200 Subject: [PATCH 07/15] refactored code --- pypdf/_page.py | 37 
+++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index cd121bccf..6de845727 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -509,7 +509,6 @@ def _get_image( try: if isinstance(id, str) and id.find(RES.PATTERN) == 0: pattern_name = id[len(RES.PATTERN) : id.find("/", len(RES.PATTERN) + 1)] - image_name = id[id.rfind("/") :] patterns = cast( DictionaryObject, @@ -538,26 +537,24 @@ def _get_image( if self.inline_images is None: # pragma: no cover raise KeyError("no inline image can be found") return self.inline_images[id] - elif id.find("/Pattern") == 0: - imgd = _xobj_to_image(cast(DictionaryObject, xobjs[image_name])) - extension, byte_stream = imgd[:2] - f = ImageFile( - name=f"{pattern_name[1:]}_{image_name[1:]}{extension}", - data=byte_stream, - image=imgd[2], - indirect_reference=xobjs[image_name].indirect_reference, - ) - return f + + if id.find("/Pattern") == 0: + image_identifier = id[id.rfind("/") :] + + image_name = pattern_name[1:] + "_" + image_identifier[1:] else: - imgd = _xobj_to_image(cast(DictionaryObject, xobjs[id])) - extension, byte_stream = imgd[:2] - f = ImageFile( - name=f"{id[1:]}{extension}", - data=byte_stream, - image=imgd[2], - indirect_reference=xobjs[id].indirect_reference, - ) - return f + image_identifier = str(id) + image_name = id[1:] + + imgd = _xobj_to_image(cast(DictionaryObject, xobjs[image_identifier])) + image_extension, byte_stream = imgd[:2] + + return ImageFile( + name=image_name + str(image_extension), + data=byte_stream, + image=imgd[2], + indirect_reference=xobjs[image_identifier].indirect_reference, + ) else: # in a sub object ids = id[1:] return self._get_image(ids, cast(DictionaryObject, xobjs[id[0]])) From b6ba82027c18660ad9e34d7f4a25d1176c6719a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathana=C3=ABl=20Renaud?= Date: Fri, 17 May 2024 18:13:43 +0200 Subject: [PATCH 08/15] fixed typo --- pypdf/_page.py | 1 - 1 file changed, 1 
deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 6de845727..7a92041ec 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -540,7 +540,6 @@ def _get_image( if id.find("/Pattern") == 0: image_identifier = id[id.rfind("/") :] - image_name = pattern_name[1:] + "_" + image_identifier[1:] else: image_identifier = str(id) From 07752a1ae83ed7c5284fade0a59596a9cb94b58b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathana=C3=ABl=20Renaud?= Date: Fri, 17 May 2024 20:53:43 +0200 Subject: [PATCH 09/15] Add test for forms and standard onlyoffice pages --- tests/example_files.yaml | 6 ++- tests/test_images.py | 87 ++++++++++++++++++++++++---------------- 2 files changed, 56 insertions(+), 37 deletions(-) diff --git a/tests/example_files.yaml b/tests/example_files.yaml index 049855d2f..475159ffb 100644 --- a/tests/example_files.yaml +++ b/tests/example_files.yaml @@ -112,5 +112,7 @@ url: https://github.com/py-pdf/pypdf/files/12050253/tt.pdf - local_filename: Pesquisa-de-Precos-Combustiveis-novembro-2023.pdf url: https://www.joinville.sc.gov.br/wp-content/uploads/2023/11/Pesquisa-de-Precos-Combustiveis-novembro-2023.pdf -- local_filename: iss2138.pdf - url: https://github.com/py-pdf/pypdf/files/12483807/AEO.1172.pdf +- local_filename: iss2613-onlyoffice-standardImages.pdf + url: https://github.com/py-pdf/pypdf/files/15355445/iss2613-onlyoffice-standardImages.pdf +- local_filename: iss2613-onlyoffice-form.pdf + url: https://github.com/py-pdf/pypdf/files/15355444/iss2613-onlyoffice-form.pdf diff --git a/tests/test_images.py b/tests/test_images.py index 893eda33b..85e0e605d 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -214,47 +214,64 @@ def test_image_extraction(src, page_index, image_key, expected): assert image_similarity(BytesIO(actual_image.data), expected) >= 0.99 -@pytest.mark.parametrize( - ("src", "page_index", "image_key", "expected"), - [ - ( - SAMPLE_ROOT / "027-onlyoffice-image/Patterns.pdf", - 0, - "/Pattern/P1/X1", - SAMPLE_ROOT / 
"027-onlyoffice-image/P1_X1.jpg", - ), - ( - SAMPLE_ROOT / "027-onlyoffice-image/Patterns.pdf", - 0, - "/Pattern/P2/X1", - SAMPLE_ROOT / "027-onlyoffice-image/P2_X1.jpg", - ), - ( - SAMPLE_ROOT / "027-onlyoffice-image/Patterns.pdf", - 0, - "/Pattern/P3/X1", - SAMPLE_ROOT / "027-onlyoffice-image/P3_X1.jpg", - ), - ], - ids=[ - "027-onlyoffice-image/P1_X1.jpg", - "027-onlyoffice-image/P2_X1.jpg", - "027-onlyoffice-image/P3_X1.jpg", - ], -) -@pytest.mark.samples() -def test_patterns_image_extraction(src, page_index, image_key, expected): - reader = PdfReader(src) - extractedIDs = reader.pages[page_index].images +@pytest.mark.enable_socket() +def test_onlyoffice_standard_images_extraction(): + reader = PdfReader( + BytesIO(get_data_from_url(name="iss2613-onlyoffice-standardImages.pdf")) + ) assert ( - str(extractedIDs) + str(reader.pages[0].images) == "[Image_0=/Pattern/P1/X1, Image_1=/Pattern/P2/X1, Image_2=/Pattern/P3/X1]" ) - actual_image = reader.pages[page_index].images[image_key] + url = "https://github.com/py-pdf/pypdf/assets/67143274/cc28b39b-2e96-4bd3-b33c-c545c5cec2d9" + name = "iss2613-P1_X1.jpg" + P1_X1 = Image.open(BytesIO(get_data_from_url(url, name=name))) - assert image_similarity(BytesIO(actual_image.data), expected) >= 0.99 + assert image_similarity(reader.pages[0].images[0].image, P1_X1) >= 0.99 + + url = "https://github.com/py-pdf/pypdf/assets/67143274/827c9066-546a-4502-a613-579ec25c598e" + name = "iss2613-P2_X1.jpg" + P2_X1 = Image.open(BytesIO(get_data_from_url(url, name=name))) + + assert image_similarity(reader.pages[0].images[1].image, P2_X1) >= 0.99 + + url = "https://github.com/py-pdf/pypdf/assets/67143274/df9cb9e9-e589-4d2e-a537-ae0fe3240bbd" + name = "iss2613-P3_X1.jpg" + P3_X1 = Image.open(BytesIO(get_data_from_url(url, name=name))) + + assert image_similarity(reader.pages[0].images[2].image, P3_X1) >= 0.99 + + +@pytest.mark.samples() +def test_onlyoffice_form_images_extraction(): + reader = 
PdfReader(BytesIO(get_data_from_url(name="iss2613-onlyoffice-form.pdf"))) + + assert ( + str(reader.pages[0].images) + == "[Image_0=/Pattern/P1/X1, Image_1=/Pattern/P2/X1]" + ) + + assert str(reader.pages[1].images) == "[Image_0=/Pattern/P1/X1]" + + url = "https://github.com/py-pdf/pypdf/assets/67143274/cc28b39b-2e96-4bd3-b33c-c545c5cec2d9" + name = "iss2613-P1_X1.jpg" + P1_X1 = Image.open(BytesIO(get_data_from_url(url, name=name))) + + assert image_similarity(reader.pages[0].images[0].image, P1_X1) >= 0.99 + + url = "https://github.com/py-pdf/pypdf/assets/67143274/827c9066-546a-4502-a613-579ec25c598e" + name = "iss2613-P2_X1.jpg" + P2_X1 = Image.open(BytesIO(get_data_from_url(url, name=name))) + + assert image_similarity(reader.pages[0].images[1].image, P2_X1) >= 0.99 + + url = "https://github.com/py-pdf/pypdf/assets/67143274/df9cb9e9-e589-4d2e-a537-ae0fe3240bbd" + name = "iss2613-P3_X1.jpg" + P3_X1 = Image.open(BytesIO(get_data_from_url(url, name=name))) + + assert image_similarity(reader.pages[1].images[0].image, P3_X1) >= 0.99 @pytest.mark.enable_socket() From a5b980b65e50d999a2852ac3f39fa7969eddcddd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathana=C3=ABl=20Renaud?= Date: Fri, 17 May 2024 20:56:11 +0200 Subject: [PATCH 10/15] Fix decorator for form test --- tests/test_images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_images.py b/tests/test_images.py index 85e0e605d..7890cfa42 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -244,7 +244,7 @@ def test_onlyoffice_standard_images_extraction(): assert image_similarity(reader.pages[0].images[2].image, P3_X1) >= 0.99 -@pytest.mark.samples() +@pytest.mark.enable_socket() def test_onlyoffice_form_images_extraction(): reader = PdfReader(BytesIO(get_data_from_url(name="iss2613-onlyoffice-form.pdf"))) From a23b27290f9031a0fcb69ba21e22585346442a6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathana=C3=ABl=20Renaud?= Date: Fri, 17 May 2024 22:35:08 +0200 Subject: [PATCH 11/15] 
Fix test for form --- tests/test_images.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_images.py b/tests/test_images.py index 7890cfa42..723a81a24 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -250,10 +250,10 @@ def test_onlyoffice_form_images_extraction(): assert ( str(reader.pages[0].images) - == "[Image_0=/Pattern/P1/X1, Image_1=/Pattern/P2/X1]" + == "[Image_0=/Annots/Image2_af_image/Img, Image_1=/Annots/Image3_af_image/Img]" ) - assert str(reader.pages[1].images) == "[Image_0=/Pattern/P1/X1]" + assert str(reader.pages[1].images) == "[Image_0=/Annots/Image4_af_image/Img]" url = "https://github.com/py-pdf/pypdf/assets/67143274/cc28b39b-2e96-4bd3-b33c-c545c5cec2d9" name = "iss2613-P1_X1.jpg" From d524672c2bfeac0056c8156fabe6aafdf9fee596 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathana=C3=ABl=20Renaud?= Date: Fri, 17 May 2024 22:49:14 +0200 Subject: [PATCH 12/15] Add Annot parsing for onlyoffice forms Fix typo --- pypdf/_page.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 7a92041ec..a3146bc4a 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -458,9 +458,38 @@ def _get_ids_image( ancest = [] lst: List[Union[str, List[str]]] = [] + if PG.ANNOTS in obj: + for annot in cast(DictionaryObject, obj[PG.ANNOTS]): + if ( + "/AP" in cast(DictionaryObject, annot.keys()) + and "/N" in cast(DictionaryObject, annot["/AP"].keys()) + and PG.RESOURCES in annot["/AP"]["/N"].get_object() + and RES.XOBJECT + in cast(DictionaryObject, annot["/AP"]["/N"][PG.RESOURCES]) + and "/FRM" + in cast( + DictionaryObject, annot["/AP"]["/N"][PG.RESOURCES][RES.XOBJECT] + ) + ): + frame = annot["/AP"]["/N"][PG.RESOURCES][RES.XOBJECT]["/FRM"] + + if PG.RESOURCES in frame.get_object() and RES.XOBJECT in cast( + DictionaryObject, frame[PG.RESOURCES] + ): + x_object = frame[PG.RESOURCES][RES.XOBJECT] + for o in x_object: + if 
x_object[o][IA.SUBTYPE] == "/Image": + if not isinstance(x_object[o], StreamObject): + continue + lst.append( + f"{PG.ANNOTS}/{annot['/T']}{o}" + if len(ancest) == 0 + else ancest + [o] + ) + if PG.RESOURCES in obj: if RES.PATTERN in cast(DictionaryObject, obj[PG.RESOURCES]): - for patternName, pattern in cast( + for pattern_name, pattern in cast( DictionaryObject, cast(DictionaryObject, obj[PG.RESOURCES])[RES.PATTERN], ).items(): @@ -473,7 +502,7 @@ def _get_ids_image( continue if x_object[o][IA.SUBTYPE] == "/Image": lst.append( - f"{RES.PATTERN}{patternName}{o}" + f"{RES.PATTERN}{pattern_name}{o}" if len(ancest) == 0 else ancest + [o] ) @@ -522,6 +551,30 @@ def _get_image( cast(DictionaryObject, patterns[pattern_name])[PG.RESOURCES], )[RES.XOBJECT], ) + elif isinstance(id, str) and id.find(PG.ANNOTS) == 0: + annot_name = id[len(RES.PATTERN) : id.find("/", len(RES.PATTERN) + 1)] + annots = cast(DictionaryObject, obj[PG.ANNOTS]) + + for temp_annot in annots: + if temp_annot["/T"] == annot_name: + annot = temp_annot + break + + frame_xobjs = cast( + DictionaryObject, + cast( + DictionaryObject, + cast(DictionaryObject, annot["/AP"]["/N"])[PG.RESOURCES], + )[RES.XOBJECT], + ) + + xobjs = cast( + DictionaryObject, + cast( + DictionaryObject, + cast(DictionaryObject, frame_xobjs["/FRM"])[PG.RESOURCES], + )[RES.XOBJECT], + ) else: xobjs = cast( DictionaryObject, @@ -541,6 +594,9 @@ def _get_image( if id.find("/Pattern") == 0: image_identifier = id[id.rfind("/") :] image_name = pattern_name[1:] + "_" + image_identifier[1:] + elif id.find("/Annot") == 0: + image_identifier = id[id.rfind("/") :] + image_name = annot_name + "_" + image_identifier[1:] else: image_identifier = str(id) image_name = id[1:] From 24218c81bd46cca1dc058b2abd2a4b0dddab8b01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathana=C3=ABl=20Renaud?= Date: Fri, 17 May 2024 23:29:29 +0200 Subject: [PATCH 13/15] Squashed commit of the following: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit commit d0493ae20bfb243e6fddc85d03d97611e040c38e Author: Nathanaël Renaud Date: Fri May 17 23:25:13 2024 +0200 Modified _get_ids_image and _get_image so they work with onlyoffice images commit 53a3781dd60a5b3f30af73fb422c195a34401225 Author: Nathanaël Renaud Date: Fri May 17 23:22:27 2024 +0200 Added unit tests for image extraction from PDFs generated using onlyoffice --- sample-files | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sample-files b/sample-files index 8c405ece5..91311ce97 160000 --- a/sample-files +++ b/sample-files @@ -1 +1 @@ -Subproject commit 8c405ece5eff12396a34a1fae3276132002e1753 +Subproject commit 91311ce97033ea9669dba2a7b6c591c05ed74c76 From e5585d74e3177a362813ad6ed37f8710ce730882 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathana=C3=ABl=20Renaud?= Date: Sat, 3 Aug 2024 15:43:17 +0200 Subject: [PATCH 14/15] Try fix issue with codecov and PERF401 ruff warning --- pypdf/_page.py | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index a3146bc4a..515d615bc 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -478,13 +478,18 @@ def _get_ids_image( ): x_object = frame[PG.RESOURCES][RES.XOBJECT] for o in x_object: - if x_object[o][IA.SUBTYPE] == "/Image": - if not isinstance(x_object[o], StreamObject): - continue - lst.append( - f"{PG.ANNOTS}/{annot['/T']}{o}" - if len(ancest) == 0 - else ancest + [o] + if ( + isinstance(x_object[o], StreamObject) + and x_object[o][IA.SUBTYPE] == "/Image" + ): + lst.extend( + [ + ( + f"{PG.ANNOTS}/{annot['/T']}{o}" + if len(ancest) == 0 + else ancest + [o] + ) + ] ) if PG.RESOURCES in obj: @@ -498,13 +503,18 @@ def _get_ids_image( ): x_object = pattern[PG.RESOURCES][RES.XOBJECT].get_object() for o in x_object: - if not isinstance(x_object[o], StreamObject): - continue - if x_object[o][IA.SUBTYPE] == "/Image": - lst.append( - f"{RES.PATTERN}{pattern_name}{o}" - if len(ancest) == 
0 - else ancest + [o] + if ( + isinstance(x_object[o], StreamObject) + and x_object[o][IA.SUBTYPE] == "/Image" + ): + lst.extend( + [ + ( + f"{RES.PATTERN}{pattern_name}{o}" + if len(ancest) == 0 + else ancest + [o] + ) + ] ) if RES.XOBJECT in cast(DictionaryObject, obj[PG.RESOURCES]): From cc1600d06f1970f157eb223eab6adebaacd44fb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathana=C3=ABl=20Renaud?= Date: Sat, 3 Aug 2024 16:19:19 +0200 Subject: [PATCH 15/15] Ruff not happy about whitespaces --- pypdf/_page.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 7f590f252..fe91acb82 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -515,7 +515,7 @@ def _get_ids_image( lst.extend( self._get_ids_image(x_object[o], ancest + [o], call_stack) ) - + assert self.inline_images is not None lst.extend(list(self.inline_images.keys())) return lst