diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 069a36deb..ece537d71 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -64,6 +64,7 @@ _get_max_pdf_version_header, deprecate, deprecate_no_replacement, + deprecate_with_replacement, deprecation_with_replacement, logger_warning, ) @@ -1613,17 +1614,26 @@ def compress_identical_objects( self, remove_identicals: bool = True, remove_orphans: bool = True, + remove_unreferenced: bool = True ) -> None: """ - Parse the PDF file and merge objects that have the same hash. + Parse the PDF file and merge objects that share the same hash. This will make objects common to multiple pages. Recommended to be used just before writing output. Args: remove_identicals: Remove identical objects. - remove_orphans: Remove unreferenced objects. + remove_orphans: Remove unreferenced objects. Deprecated, use remove_unreferenced instead. + remove_unreferenced: Remove unreferenced objects. """ + deprecate_with_replacement( + old_name="remove_orphans", + new_name="remove_unreferenced", + removed_in="6.0.0", + ) + if not remove_orphans: + remove_unreferenced = remove_orphans def replace_in_obj( obj: PdfObject, crossref: Dict[IndirectObject, IndirectObject] @@ -1637,7 +1647,7 @@ def replace_in_obj( assert isinstance(obj, (DictionaryObject, ArrayObject)) for k, v in key_val: if isinstance(v, IndirectObject): - orphans[v.idnum - 1] = False + unreferenced[v.idnum - 1] = False if v in crossref: obj[k] = crossref[v] else: @@ -1645,9 +1655,9 @@ def replace_in_obj( will be performed within replace_in_obj""" replace_in_obj(v, crossref) - # _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...]) + # _idnum_hash: dict[hash] = (1st_ind_obj, [2nd_ind_obj,...]) self._idnum_hash = {} - orphans = [True] * len(self._objects) + unreferenced = [True] * len(self._objects) # look for similar objects for idx, obj in enumerate(self._objects): if is_null_or_none(obj): @@ -1672,17 +1682,15 @@ def replace_in_obj( if isinstance(obj, (DictionaryObject, ArrayObject)): 
replace_in_obj(obj, cnv_rev) - # remove orphans (if applicable) - orphans[self.root_object.indirect_reference.idnum - 1] = False # type: ignore - - orphans[self._info.indirect_reference.idnum - 1] = False # type: ignore - - try: - orphans[self._ID.indirect_reference.idnum - 1] = False # type: ignore - except AttributeError: - pass - for i in compress(range(len(self._objects)), orphans): - self._objects[i] = None + if remove_unreferenced: + unreferenced[self.root_object.indirect_reference.idnum - 1] = False # type: ignore + unreferenced[self._info.indirect_reference.idnum - 1] = False # type: ignore + try: + unreferenced[self._ID.indirect_reference.idnum - 1] = False # type: ignore + except AttributeError: + pass + for i in compress(range(len(self._objects)), unreferenced): + self._objects[i] = None def _sweep_indirect_references( self, diff --git a/tests/test_workflows.py b/tests/test_workflows.py index f40858fe7..f1355383d 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -273,7 +273,11 @@ def test_transform_compress_identical_objects(): op = Transformation().scale(sx=0.8, sy=0.8) page.add_transformation(op) writer.add_page(page) - writer.compress_identical_objects() + with pytest.warns( + DeprecationWarning, + match="remove_orphans is deprecated and will be removed in pypdf 6.0.0. Use remove_unreferenced instead.", + ): + writer.compress_identical_objects() bytes_out = BytesIO() writer.write(bytes_out) result_reader = PdfReader(bytes_out) diff --git a/tests/test_writer.py b/tests/test_writer.py index 90ff4f2de..3e26a6ad1 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -2399,7 +2399,11 @@ def test_compress_identical_objects(): name = "iss2794.pdf" in_bytes = BytesIO(get_data_from_url(url, name=name)) writer = PdfWriter(in_bytes) - writer.compress_identical_objects(remove_orphans=False) + with pytest.warns( + DeprecationWarning, + match="remove_orphans is deprecated and will be removed in pypdf 6.0.0. 
Use remove_unreferenced instead.", + ): + writer.compress_identical_objects(remove_orphans=False) out1 = BytesIO() writer.write(out1) assert 0.5 * len(in_bytes.getvalue()) > len(out1.getvalue()) @@ -2409,7 +2413,11 @@ def test_compress_identical_objects(): out2 = BytesIO() writer.write(out2) assert len(out1.getvalue()) - 100 < len(out2.getvalue()) - writer.compress_identical_objects(remove_identicals=False) + with pytest.warns( + DeprecationWarning, + match="remove_orphans is deprecated and will be removed in pypdf 6.0.0. Use remove_unreferenced instead.", + ): + writer.compress_identical_objects(remove_identicals=False) out3 = BytesIO() writer.write(out3) assert len(out2.getvalue()) > len(out3.getvalue()) @@ -2694,7 +2702,11 @@ def test_compress_identical_objects__after_remove_images(): """Test for #3237""" writer = PdfWriter(clone_from=RESOURCE_ROOT / "AutoCad_Diagram.pdf") writer.remove_images() - writer.compress_identical_objects(remove_identicals=True, remove_orphans=True) + with pytest.warns( + DeprecationWarning, + match="remove_orphans is deprecated and will be removed in pypdf 6.0.0. Use remove_unreferenced instead.", + ): + writer.compress_identical_objects(remove_identicals=True, remove_orphans=True) def test_merge__process_named_dests__no_dests_in_source_file():