Skip to content

Commit c17f03a

Browse files
authored
ENH: Automatically preserve links in added pages (#3298)
Related to #3290.
1 parent bfe7178 commit c17f03a

File tree

6 files changed

+263
-2
lines changed

6 files changed

+263
-2
lines changed

pypdf/_writer.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,11 +103,13 @@
103103
NumberObject,
104104
PdfObject,
105105
RectangleObject,
106+
ReferenceLink,
106107
StreamObject,
107108
TextStringObject,
108109
TreeObject,
109110
ViewerPreferences,
110111
create_string_object,
112+
extract_links,
111113
hex_to_rgb,
112114
is_null_or_none,
113115
)
@@ -209,6 +211,11 @@ def __init__(
209211
"""The PDF file identifier,
210212
defined by the ID in the PDF file's trailer dictionary."""
211213

214+
self._unresolved_links: list[tuple[ReferenceLink, ReferenceLink]] = []
215+
"Tracks links in pages added to the writer for resolving later."
216+
self._merged_in_pages: Dict[Optional[IndirectObject], Optional[IndirectObject]] = {}
217+
"Tracks pages added to the writer and what page they turned into."
218+
212219
if self.incremental:
213220
if isinstance(fileobj, (str, Path)):
214221
with open(fileobj, "rb") as f:
@@ -479,12 +486,14 @@ def _add_page(
479486
]
480487
except Exception:
481488
pass
489+
482490
page = cast(
483491
"PageObject", page_org.clone(self, False, excluded_keys).get_object()
484492
)
485493
if page_org.pdf is not None:
486494
other = page_org.pdf.pdf_header
487495
self.pdf_header = _get_max_pdf_version_header(self.pdf_header, other)
496+
488497
node, idx = self._get_page_in_node(index)
489498
page[NameObject(PA.PARENT)] = node.indirect_reference
490499

@@ -502,6 +511,15 @@ def _add_page(
502511
recurse += 1
503512
if recurse > 1000:
504513
raise PyPdfError("Too many recursive calls!")
514+
515+
if page_org.pdf is not None:
516+
# the page may contain links to other pages, and those other
517+
# pages may or may not already be added. we store the
518+
# information we need, so that we can resolve the references
519+
# later.
520+
self._unresolved_links.extend(extract_links(page, page_org))
521+
self._merged_in_pages[page_org.indirect_reference] = page.indirect_reference
522+
505523
return page
506524

507525
def set_need_appearances_writer(self, state: bool = True) -> None:
@@ -1379,6 +1397,19 @@ def encrypt(
13791397
self._add_object(entry)
13801398
self._encrypt_entry = entry
13811399

1400+
def _resolve_links(self) -> None:
    """Re-point recorded links at the pages their targets were cloned into.

    Iterates over the ``(new_link, old_link)`` pairs kept in
    ``self._unresolved_links``. The old link reports which page it
    referenced; when ``self._merged_in_pages`` holds an entry for that
    page, the new link is patched to reference the mapped page. Pairs whose
    old target cannot be determined, or whose target has no mapping, are
    left untouched.
    """
    for patched, original in self._unresolved_links:
        source_page = original.find_referenced_page()
        # Only consult the mapping when the old link actually named a page.
        target_page = self._merged_in_pages.get(source_page) if source_page else None
        if target_page is not None:
            patched.patch_reference(self, target_page)
1412+
13821413
def write_stream(self, stream: StreamType) -> None:
13831414
if hasattr(stream, "mode") and "b" not in stream.mode:
13841415
logger_warning(
@@ -1390,6 +1421,7 @@ def write_stream(self, stream: StreamType) -> None:
13901421
# if not self._root:
13911422
# self._root = self._add_object(self._root_object)
13921423
# self._sweep_indirect_references(self._root)
1424+
self._resolve_links()
13931425

13941426
if self.incremental:
13951427
self._reader.stream.seek(0)

pypdf/generic/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@
6262
)
6363
from ._files import EmbeddedFile
6464
from ._fit import Fit
65+
from ._link import DirectReferenceLink, NamedReferenceLink, ReferenceLink, extract_links
6566
from ._outline import OutlineItem
6667
from ._rectangle import RectangleObject
6768
from ._utils import (
@@ -208,19 +209,22 @@ def link(
208209
"DecodedStreamObject",
209210
"Destination",
210211
"DictionaryObject",
212+
"DirectReferenceLink",
211213
"EmbeddedFile",
212214
"EncodedStreamObject",
213215
"Field",
214216
"Fit",
215217
"FloatObject",
216218
"IndirectObject",
217219
"NameObject",
220+
"NamedReferenceLink",
218221
"NullObject",
219222
"NumberObject",
220223
"OutlineFontFlag",
221224
"OutlineItem",
222225
"PdfObject",
223226
"RectangleObject",
227+
"ReferenceLink",
224228
"StreamObject",
225229
"TextStringObject",
226230
"TreeObject",
@@ -229,6 +233,7 @@ def link(
229233
"create_string_object",
230234
"decode_pdfdocencoding",
231235
"encode_pdfdocencoding",
236+
"extract_links",
232237
"hex_to_rgb",
233238
"is_null_or_none",
234239
"read_hex_string_from_stream",

pypdf/generic/_data_structures.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1707,8 +1707,8 @@ def title(self) -> Optional[str]:
17071707
return self.get("/Title")
17081708

17091709
@property
1710-
def page(self) -> Optional[int]:
1711-
"""Read-only property accessing the destination page number."""
1710+
def page(self) -> Optional[IndirectObject]:
1711+
"""Read-only property accessing the IndirectObject of the destination page."""
17121712
return self.get("/Page")
17131713

17141714
@property

pypdf/generic/_link.py

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
# All rights reserved.
2+
#
3+
# Redistribution and use in source and binary forms, with or without
4+
# modification, are permitted provided that the following conditions are
5+
# met:
6+
#
7+
# * Redistributions of source code must retain the above copyright notice,
8+
# this list of conditions and the following disclaimer.
9+
# * Redistributions in binary form must reproduce the above copyright notice,
10+
# this list of conditions and the following disclaimer in the documentation
11+
# and/or other materials provided with the distribution.
12+
# * The name of the author may not be used to endorse or promote products
13+
# derived from this software without specific prior written permission.
14+
#
15+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18+
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19+
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20+
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21+
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22+
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23+
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24+
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25+
# POSSIBILITY OF SUCH DAMAGE.
26+
27+
28+
# This module contains code used by _writer.py to track links in pages
29+
# being added to the writer until the links can be resolved.
30+
31+
from typing import TYPE_CHECKING, List, Optional, Tuple, Union, cast
32+
33+
from . import ArrayObject, DictionaryObject, IndirectObject, PdfObject, TextStringObject
34+
35+
if TYPE_CHECKING:
36+
from .._page import PageObject
37+
from .._reader import PdfReader
38+
from .._writer import PdfWriter
39+
40+
41+
class NamedReferenceLink:
    """Link that reaches its page through a named destination.

    Keeps the destination name together with the document it was read from,
    so the referenced page can be looked up later and the name registered
    in the target document.
    """

    def __init__(self, reference: TextStringObject, source_pdf: "PdfReader") -> None:
        """Remember the destination name and the document it belongs to.

        Args:
            reference: TextStringObject holding the destination name.
            source_pdf: document whose ``named_destinations`` is consulted
                when looking up the referenced page.
        """
        self._source_pdf = source_pdf
        self._reference = reference

    def find_referenced_page(self) -> Union[IndirectObject, None]:
        """Return the ``page`` of the matching named destination, if any.

        Returns ``None`` when the source document has no (truthy) entry for
        the name.
        """
        name = str(self._reference)
        destination = self._source_pdf.named_destinations.get(name)
        if not destination:
            return None
        return destination.page

    def patch_reference(self, target_pdf: "PdfWriter", new_page: IndirectObject) -> None:
        """Register the destination name in ``target_pdf`` unless already there.

        Args:
            target_pdf: PdfWriter which the new link went into.
            new_page: page the name should point at; its ``page_number`` is
                handed to ``add_named_destination``.
        """
        name = str(self._reference)
        if name in target_pdf.named_destinations:
            # An existing destination of that name is left as it is.
            return
        target_pdf.add_named_destination(name, new_page.page_number)
59+
60+
class DirectReferenceLink:
61+
"""Direct reference link being preserved until we can resolve it correctly."""
62+
63+
def __init__(self, reference: ArrayObject) -> None:
64+
"""reference: an ArrayObject whose first element is the Page indirect object"""
65+
self._reference = reference
66+
67+
def find_referenced_page(self) -> IndirectObject:
68+
return self._reference[0]
69+
70+
def patch_reference(self, target_pdf: "PdfWriter", new_page: IndirectObject) -> None:
71+
"""target_pdf: PdfWriter which the new link went into"""
72+
self._reference[0] = new_page
73+
74+
75+
ReferenceLink = Union[NamedReferenceLink, DirectReferenceLink]
76+
77+
78+
def extract_links(new_page: "PageObject", old_page: "PageObject") -> List[Tuple[ReferenceLink, ReferenceLink]]:
    """Pair up the trackable links of a page and of the page it mirrors.

    Both pages are assumed to be the same page, so their ``/Annots`` entries
    are matched by position. An annotation contributes a pair only when a
    link object could be built for it on both sides.

    Args:
        new_page: the page whose links will later be patched.
        old_page: the page the links originally belonged to.

    Returns:
        A list of ``(new link, old link)`` tuples.
    """
    links_in_new = [_build_link(annotation, new_page) for annotation in new_page.get("/Annots", [])]
    links_in_old = [_build_link(annotation, old_page) for annotation in old_page.get("/Annots", [])]

    pairs = []
    for link_in_new, link_in_old in zip(links_in_new, links_in_old):
        if link_in_new and link_in_old:
            pairs.append((link_in_new, link_in_old))
    return pairs
90+
91+
92+
def _build_link(indirect_object: IndirectObject, page: "PageObject") -> Optional[ReferenceLink]:
    """Build a trackable link for one annotation, if it is an internal link.

    Only annotations with ``/Subtype`` ``/Link`` are considered. The
    destination is taken from a ``/GoTo`` action under ``/A`` when ``/A`` is
    present, otherwise from ``/Dest``.

    Args:
        indirect_object: the annotation as found in the page's ``/Annots``.
        page: page the annotation belongs to; ``page.pdf`` is passed on as
            the source document of the link.

    Returns:
        The link object produced by ``_create_link``, or ``None`` when the
        annotation is not a link, its action is not a ``/GoTo`` action, or
        no destination is present.
    """
    src = cast("PdfReader", page.pdf)
    link = cast(DictionaryObject, indirect_object.get_object())
    if (not isinstance(link, DictionaryObject)) or link.get("/Subtype") != "/Link":
        return None

    if "/A" in link:
        action = link["/A"]
        # A malformed file may store something other than a dictionary
        # here; treat it like any other action we do not track.
        if not isinstance(action, DictionaryObject) or action.get("/S") != "/GoTo":
            return None
        # A /GoTo action without /D gives us nothing to follow; skip it
        # rather than fail with KeyError.
        if "/D" not in action:
            return None
        return _create_link(action["/D"], src)

    if "/Dest" in link:
        return _create_link(link["/Dest"], src)

    return None  # Nothing to do here
109+
110+
111+
def _create_link(reference: PdfObject, source_pdf: "PdfReader") -> Optional[ReferenceLink]:
    """Wrap a destination value in the matching link class.

    Args:
        reference: the destination value taken from an annotation.
        source_pdf: document the value came from; only needed for named
            references.

    Returns:
        A ``DirectReferenceLink`` for an ArrayObject, a
        ``NamedReferenceLink`` for a TextStringObject, otherwise ``None``.
    """
    if isinstance(reference, ArrayObject):
        return DirectReferenceLink(reference)
    if isinstance(reference, TextStringObject):
        return NamedReferenceLink(reference, source_pdf)
    return None

tests/example_files.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,5 +110,9 @@
110110
url: https://github.com/py-pdf/pypdf/files/12483807/AEO.1172.pdf
111111
- local_filename: iss3268.pdf
112112
url: https://github.com/user-attachments/files/20060394/broken.pdf
113+
- local_filename: direct-link.pdf
114+
url: https://github.com/user-attachments/files/20348304/tst.pdf
115+
- local_filename: named-reference.pdf
116+
url: https://github.com/user-attachments/files/20455804/MinimalJob.pdf
113117
- local_filename: large_lzw_example_encoded.dat
114118
url: https://github.com/user-attachments/files/20923310/large_lzw_example_encoded.dat.txt

tests/test_merger.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -409,3 +409,107 @@ def test_deprecate_pdfmerger():
409409
def test_get_reference():
410410
writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf")
411411
assert writer.get_reference(writer.pages[0]) == writer.pages[0].indirect_reference
412+
413+
414+
@pytest.mark.enable_socket
def test_direct_link_preserved(pdf_file_path):
    """A direct link between two appended pages still targets the right page."""
    # The base document is arbitrary; it only provides existing pages.
    base = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf")))
    writer = PdfWriter(clone_from=base)

    # The appended document carries a direct link from its first page to
    # its second page.
    linked = PdfReader(BytesIO(get_data_from_url(name="direct-link.pdf")))
    for page in linked.pages:
        writer.add_page(page)

    writer.write(pdf_file_path)

    result = PdfReader(pdf_file_path)
    linking_page = result.pages[2]
    annotation = linking_page["/Annots"][0].get_object()
    assert annotation["/Subtype"] == "/Link"
    # First element of /Dest is the indirect reference of the target page.
    target = annotation["/Dest"][0]

    expected_page = result.flattened_pages[3]
    assert target == expected_page.indirect_reference, "Link from page 3 to page 4 is broken"
435+
436+
437+
@pytest.mark.enable_socket
def test_direct_link_preserved_reordering(pdf_file_path):
    """A direct link follows its target page when a later insert shifts it."""
    # The base document is arbitrary; it only provides existing pages.
    base = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf")))
    writer = PdfWriter(clone_from=base)

    # The appended document carries a direct link from its first page to
    # its second page.
    linked = PdfReader(BytesIO(get_data_from_url(name="direct-link.pdf")))
    for page in linked.pages:
        writer.add_page(page)

    # Insert another page at index 3 so the link target moves one position.
    writer.insert_page(base.pages[0], 3)

    writer.write(pdf_file_path)

    result = PdfReader(pdf_file_path)
    linking_page = result.pages[2]
    annotation = linking_page["/Annots"][0].get_object()
    assert annotation["/Subtype"] == "/Link"
    # First element of /Dest is the indirect reference of the target page.
    target = annotation["/Dest"][0]

    expected_page = result.flattened_pages[4]  # shifted by the insert
    assert target == expected_page.indirect_reference, "Link from page 3 to page 5 is broken"
461+
462+
463+
@pytest.mark.enable_socket
def test_direct_link_page_missing(pdf_file_path):
    """Writing succeeds when the page a direct link targets was not added."""
    # The base document is arbitrary; it only provides existing pages.
    base = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf")))
    writer = PdfWriter(clone_from=base)

    # Only the linking page is added; the page its link points to is
    # deliberately left out.
    linked = PdfReader(BytesIO(get_data_from_url(name="direct-link.pdf")))
    writer.add_page(linked.pages[0])

    # The test passes as long as writing does not raise.
    writer.write(pdf_file_path)
475+
476+
477+
@pytest.mark.enable_socket
def test_named_reference_preserved(pdf_file_path):
    """Named-destination links on an appended page resolve to the right page."""
    # this could be any PDF -- we don't care which
    reader = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf")))
    writer = PdfWriter(clone_from=reader)

    # this PDF has a named reference from p3 to p5
    merger = PdfReader(BytesIO(get_data_from_url(name="named-reference.pdf")))
    for p in merger.pages:
        writer.add_page(p)

    writer.write(pdf_file_path)

    check = PdfReader(pdf_file_path)
    page5 = check.pages[4]
    page7 = check.flattened_pages[6]

    annotations = page5["/Annots"]
    # Without this guard the loop below could run zero times and the test
    # would pass without checking anything.
    assert annotations, "Page 5 has no annotations to check"
    for link in annotations:
        action = link["/A"]
        assert action.get("/S") == "/GoTo"
        dest = str(action["/D"])
        assert dest in check.named_destinations
        pref = check.named_destinations[dest].page

        assert pref == page7.indirect_reference, "Link from page 5 to page 7 is broken"
501+
502+
503+
@pytest.mark.enable_socket
def test_named_ref_to_page_that_is_gone(pdf_file_path):
    """Writing succeeds when a named reference targets a page that is absent."""
    original = PdfReader(BytesIO(get_data_from_url(name="named-reference.pdf")))

    # Build an intermediate document holding only the page that carries
    # the named reference.
    intermediate = BytesIO()
    extractor = PdfWriter()
    extractor.add_page(original.pages[2])
    extractor.write(intermediate)

    reduced = PdfReader(intermediate)

    # The page's reference now points at a page that does not exist; the
    # test passes as long as writing does not raise.
    writer = PdfWriter()
    writer.add_page(reduced.pages[0])
    writer.write(pdf_file_path)

0 commit comments

Comments
 (0)