From 1a5a7b610cc63a73648e0796f1539ba71be04b77 Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Tue, 10 Jun 2025 18:39:26 +0100 Subject: [PATCH 1/4] BUG: Use remove_orphans in compress_identical_objects Issue #3306: PdfWriter.compress_identical_objects ignored remove_orphans. Correct for this. Also deprecate remove_orphans in favor of remove_unreferenced, using deprecate_with_replacement. --- pypdf/_writer.py | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 069a36deb..ab2c4b6fc 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -64,6 +64,7 @@ _get_max_pdf_version_header, deprecate, deprecate_no_replacement, + deprecate_with_replacement, deprecation_with_replacement, logger_warning, ) @@ -1613,17 +1614,25 @@ def compress_identical_objects( self, remove_identicals: bool = True, remove_orphans: bool = True, + remove_unreferenced: bool = True ) -> None: """ - Parse the PDF file and merge objects that have the same hash. + Parse the PDF file and merge objects that have the same hash. This will make objects common to multiple pages. Recommended to be used just before writing output. Args: remove_identicals: Remove identical objects. - remove_orphans: Remove unreferenced objects. + remove_orphans: Remove unreferenced objects. Deprecated; use remove_unreferenced instead. + remove_unreferenced: Remove unreferenced objects.
""" + deprecate_with_replacement( + old_name="remove_orphans", + new_name="remove_unreferenced", + removed_in="6.0.0", + ) + if not remove_orphans: remove_unreferenced = remove_orphans def replace_in_obj( obj: PdfObject, crossref: Dict[IndirectObject, IndirectObject] @@ -1637,7 +1646,7 @@ def replace_in_obj( assert isinstance(obj, (DictionaryObject, ArrayObject)) for k, v in key_val: if isinstance(v, IndirectObject): - orphans[v.idnum - 1] = False + unreferenced[v.idnum - 1] = False if v in crossref: obj[k] = crossref[v] else: @@ -1645,9 +1654,9 @@ def replace_in_obj( will be performed within replace_in_obj""" replace_in_obj(v, crossref) - # _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...]) + # _idnum_hash: dict[hash] = (1st_ind_obj, [2nd_ind_obj,...]) self._idnum_hash = {} - orphans = [True] * len(self._objects) + unreferenced = [True] * len(self._objects) # look for similar objects for idx, obj in enumerate(self._objects): if is_null_or_none(obj): @@ -1672,17 +1681,15 @@ def replace_in_obj( if isinstance(obj, (DictionaryObject, ArrayObject)): replace_in_obj(obj, cnv_rev) - # remove orphans (if applicable) - orphans[self.root_object.indirect_reference.idnum - 1] = False # type: ignore - - orphans[self._info.indirect_reference.idnum - 1] = False # type: ignore - - try: - orphans[self._ID.indirect_reference.idnum - 1] = False # type: ignore - except AttributeError: - pass - for i in compress(range(len(self._objects)), orphans): - self._objects[i] = None + if remove_unreferenced: + unreferenced[self.root_object.indirect_reference.idnum - 1] = False # type: ignore + unreferenced[self._info.indirect_reference.idnum - 1] = False # type: ignore + try: + unreferenced[self._ID.indirect_reference.idnum - 1] = False # type: ignore + except AttributeError: + pass + for i in compress(range(len(self._objects)), unreferenced): + self._objects[i] = None def _sweep_indirect_references( self, From 77829b919da95ee9cbed649e62afc71e8165a2db Mon Sep 17 00:00:00 2001 From: 
j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Tue, 10 Jun 2025 18:43:04 +0100 Subject: [PATCH 2/4] Update _writer.py --- pypdf/_writer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index ab2c4b6fc..ece537d71 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -1632,7 +1632,8 @@ def compress_identical_objects( new_name="remove_unreferenced", removed_in="6.0.0", ) - if not remove_orphans: remove_unreferenced = remove_orphans + if not remove_orphans: + remove_unreferenced = remove_orphans def replace_in_obj( obj: PdfObject, crossref: Dict[IndirectObject, IndirectObject] From 12a27b544bc3c9504ea37cffdeb88b8b31aa2047 Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Tue, 10 Jun 2025 19:11:05 +0100 Subject: [PATCH 3/4] Update test_workflows.py --- tests/test_workflows.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_workflows.py b/tests/test_workflows.py index f40858fe7..f1355383d 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -273,7 +273,11 @@ def test_transform_compress_identical_objects(): op = Transformation().scale(sx=0.8, sy=0.8) page.add_transformation(op) writer.add_page(page) - writer.compress_identical_objects() + with pytest.warns( + DeprecationWarning, + match="remove_orphans is deprecated and will be removed in pypdf 6.0.0. 
Use remove_unreferenced instead.", + ): + writer.compress_identical_objects() bytes_out = BytesIO() writer.write(bytes_out) result_reader = PdfReader(bytes_out) From ba65fd946ef64b38df3b1825f43ad5caf297895d Mon Sep 17 00:00:00 2001 From: j-t-1 <120829237+j-t-1@users.noreply.github.com> Date: Wed, 11 Jun 2025 19:02:20 +0100 Subject: [PATCH 4/4] Update test_writer.py --- tests/test_writer.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tests/test_writer.py b/tests/test_writer.py index 90ff4f2de..3e26a6ad1 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -2399,7 +2399,11 @@ def test_compress_identical_objects(): name = "iss2794.pdf" in_bytes = BytesIO(get_data_from_url(url, name=name)) writer = PdfWriter(in_bytes) - writer.compress_identical_objects(remove_orphans=False) + with pytest.warns( + DeprecationWarning, + match="remove_orphans is deprecated and will be removed in pypdf 6.0.0. Use remove_unreferenced instead.", + ): + writer.compress_identical_objects(remove_orphans=False) out1 = BytesIO() writer.write(out1) assert 0.5 * len(in_bytes.getvalue()) > len(out1.getvalue()) @@ -2409,7 +2413,11 @@ def test_compress_identical_objects(): out2 = BytesIO() writer.write(out2) assert len(out1.getvalue()) - 100 < len(out2.getvalue()) - writer.compress_identical_objects(remove_identicals=False) + with pytest.warns( + DeprecationWarning, + match="remove_orphans is deprecated and will be removed in pypdf 6.0.0. 
Use remove_unreferenced instead.", + ): + writer.compress_identical_objects(remove_identicals=False) out3 = BytesIO() writer.write(out3) assert len(out2.getvalue()) > len(out3.getvalue()) @@ -2694,7 +2702,11 @@ def test_compress_identical_objects__after_remove_images(): """Test for #3237""" writer = PdfWriter(clone_from=RESOURCE_ROOT / "AutoCad_Diagram.pdf") writer.remove_images() - writer.compress_identical_objects(remove_identicals=True, remove_orphans=True) + with pytest.warns( + DeprecationWarning, + match="remove_orphans is deprecated and will be removed in pypdf 6.0.0. Use remove_unreferenced instead.", + ): + writer.compress_identical_objects(remove_identicals=True, remove_orphans=True) def test_merge__process_named_dests__no_dests_in_source_file():