Skip to content

BUG: Use remove_orphans in compress_identical_objects #3310

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 24 additions & 16 deletions pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
_get_max_pdf_version_header,
deprecate,
deprecate_no_replacement,
deprecate_with_replacement,
deprecation_with_replacement,
logger_warning,
)
Expand Down Expand Up @@ -1613,17 +1614,26 @@ def compress_identical_objects(
self,
remove_identicals: bool = True,
remove_orphans: bool = True,
remove_unreferenced: bool = True
) -> None:
"""
Parse the PDF file and merge objects that have the same hash.
Parse the PDF file objects that have the same hash.
This will make objects common to multiple pages.
Recommended to be used just before writing output.

Args:
remove_identicals: Remove identical objects.
remove_orphans: Remove unreferenced objects.
remove_orphans: Remove unreferenced objects, deprecated use remove_unreferenced.
remove_unreferenced: Remove unreferenced objects.

"""
deprecate_with_replacement(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This deprecation does not work and will always trigger - even without using the old parameter.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should rename_kwargs be used instead?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this works with positional arguments as well, yes. Otherwise, we would have to think of another approach, such as using an object() sentinel constant as the default for the deprecated parameter, so that we can reliably detect whether it was actually passed.

old_name="remove_orphans",
new_name="remove_unreferenced",
removed_in="6.0.0",
)
if not remove_orphans:
remove_unreferenced = remove_orphans

def replace_in_obj(
obj: PdfObject, crossref: Dict[IndirectObject, IndirectObject]
Expand All @@ -1637,17 +1647,17 @@ def replace_in_obj(
assert isinstance(obj, (DictionaryObject, ArrayObject))
for k, v in key_val:
if isinstance(v, IndirectObject):
orphans[v.idnum - 1] = False
unreferenced[v.idnum - 1] = False
if v in crossref:
obj[k] = crossref[v]
else:
"""the filtering on DictionaryObject and ArrayObject only
will be performed within replace_in_obj"""
replace_in_obj(v, crossref)

# _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...])
# _idnum_hash: dict[hash] = (1st_ind_obj, [2nd_ind_obj,...])
self._idnum_hash = {}
orphans = [True] * len(self._objects)
unreferenced = [True] * len(self._objects)
# look for similar objects
for idx, obj in enumerate(self._objects):
if is_null_or_none(obj):
Expand All @@ -1672,17 +1682,15 @@ def replace_in_obj(
if isinstance(obj, (DictionaryObject, ArrayObject)):
replace_in_obj(obj, cnv_rev)

# remove orphans (if applicable)
orphans[self.root_object.indirect_reference.idnum - 1] = False # type: ignore

orphans[self._info.indirect_reference.idnum - 1] = False # type: ignore

try:
orphans[self._ID.indirect_reference.idnum - 1] = False # type: ignore
except AttributeError:
pass
for i in compress(range(len(self._objects)), orphans):
self._objects[i] = None
if remove_unreferenced:
unreferenced[self.root_object.indirect_reference.idnum - 1] = False # type: ignore
unreferenced[self._info.indirect_reference.idnum - 1] = False # type: ignore
try:
unreferenced[self._ID.indirect_reference.idnum - 1] = False # type: ignore
except AttributeError:
pass
for i in compress(range(len(self._objects)), unreferenced):
self._objects[i] = None

def _sweep_indirect_references(
self,
Expand Down
6 changes: 5 additions & 1 deletion tests/test_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,11 @@ def test_transform_compress_identical_objects():
op = Transformation().scale(sx=0.8, sy=0.8)
page.add_transformation(op)
writer.add_page(page)
writer.compress_identical_objects()
with pytest.warns(
DeprecationWarning,
match="remove_orphans is deprecated and will be removed in pypdf 6.0.0. Use remove_unreferenced instead.",
):
writer.compress_identical_objects()
bytes_out = BytesIO()
writer.write(bytes_out)
result_reader = PdfReader(bytes_out)
Expand Down
18 changes: 15 additions & 3 deletions tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2399,7 +2399,11 @@ def test_compress_identical_objects():
name = "iss2794.pdf"
in_bytes = BytesIO(get_data_from_url(url, name=name))
writer = PdfWriter(in_bytes)
writer.compress_identical_objects(remove_orphans=False)
with pytest.warns(
DeprecationWarning,
match="remove_orphans is deprecated and will be removed in pypdf 6.0.0. Use remove_unreferenced instead.",
):
writer.compress_identical_objects(remove_orphans=False)
out1 = BytesIO()
writer.write(out1)
assert 0.5 * len(in_bytes.getvalue()) > len(out1.getvalue())
Expand All @@ -2409,7 +2413,11 @@ def test_compress_identical_objects():
out2 = BytesIO()
writer.write(out2)
assert len(out1.getvalue()) - 100 < len(out2.getvalue())
writer.compress_identical_objects(remove_identicals=False)
with pytest.warns(
DeprecationWarning,
match="remove_orphans is deprecated and will be removed in pypdf 6.0.0. Use remove_unreferenced instead.",
):
writer.compress_identical_objects(remove_identicals=False)
out3 = BytesIO()
writer.write(out3)
assert len(out2.getvalue()) > len(out3.getvalue())
Expand Down Expand Up @@ -2694,7 +2702,11 @@ def test_compress_identical_objects__after_remove_images():
"""Test for #3237"""
writer = PdfWriter(clone_from=RESOURCE_ROOT / "AutoCad_Diagram.pdf")
writer.remove_images()
writer.compress_identical_objects(remove_identicals=True, remove_orphans=True)
with pytest.warns(
DeprecationWarning,
match="remove_orphans is deprecated and will be removed in pypdf 6.0.0. Use remove_unreferenced instead.",
):
writer.compress_identical_objects(remove_identicals=True, remove_orphans=True)


def test_merge__process_named_dests__no_dests_in_source_file():
Expand Down
Loading