Skip to content

Commit 7274240

Browse files
committed
ENH: Automatically preserve links in added pages
1 parent ae7a064 commit 7274240

File tree

5 files changed

+258
-0
lines changed

5 files changed

+258
-0
lines changed

pypdf/_writer.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,14 +95,17 @@
9595
DecodedStreamObject,
9696
Destination,
9797
DictionaryObject,
98+
DirectRefLink,
9899
Fit,
99100
FloatObject,
100101
IndirectObject,
102+
NamedRefLink,
101103
NameObject,
102104
NullObject,
103105
NumberObject,
104106
PdfObject,
105107
RectangleObject,
108+
RefLink,
106109
StreamObject,
107110
TextStringObject,
108111
TreeObject,
@@ -209,6 +212,11 @@ def __init__(
209212
"""The PDF file identifier,
210213
defined by the ID in the PDF file's trailer dictionary."""
211214

215+
self._unresolved_links: list[tuple[RefLink,RefLink]] = []
216+
"Tracks links in pages added to the writer for resolving later."
217+
self._merged_in_pages: List[Tuple[Optional[IndirectObject],Optional[IndirectObject]]] = []
218+
"Tracks pages added to the writer and what page they turned into."
219+
212220
if self.incremental:
213221
if isinstance(fileobj, (str, Path)):
214222
with open(fileobj, "rb") as f:
@@ -482,12 +490,47 @@ def _add_page(
482490
]
483491
except Exception:
484492
pass
493+
494+
def _extract_links(new_page: PageObject, old_page: PageObject) -> List[Tuple[RefLink, RefLink]]:
    """Pair the trackable links of a cloned page with those of its source page.

    Every entry of each page's ``/Annots`` array is passed through
    ``_build_link``. The two resulting lists are matched position by
    position, so this relies on both pages listing their annotations in the
    same order.

    Args:
        new_page: The page as it now exists in the writer.
        old_page: The page it was cloned from.

    Returns:
        ``(new link, old link)`` tuples for the positions where both pages
        produced a link; all other positions are dropped.
    """
    cloned_links = [
        _build_link(annotation, new_page)
        for annotation in new_page.get("/Annots", [])
    ]
    source_links = [
        _build_link(annotation, old_page)
        for annotation in old_page.get("/Annots", [])
    ]

    pairs = []
    for cloned, source in zip(cloned_links, source_links):
        # Only keep positions where both sides yielded something to track.
        if cloned and source:
            pairs.append((cloned, source))
    return pairs
501+
502+
def _build_link(indir_obj: IndirectObject, page: PageObject) -> Optional[RefLink]:
    """Build a trackable link for one annotation of ``page``.

    Only ``/Link`` annotations are considered, and only when they carry
    either a ``/GoTo`` action (``/A``) or a direct ``/Dest`` entry.

    Args:
        indir_obj: One entry of the page's ``/Annots`` array.
        page: The page owning the annotation; ``page.pdf`` is handed on as
            the source document for resolving named destinations.

    Returns:
        Whatever ``_create_link`` builds for the destination, or ``None``
        when the annotation is not a link we need to track or is malformed.
    """
    link = indir_obj.get_object()
    # A dangling or broken /Annots entry may resolve to something that is not
    # a dictionary at all; skip it instead of failing on ``.get``.
    if not isinstance(link, DictionaryObject) or link.get("/Subtype") != "/Link":
        return None

    src = cast(PdfReader, page.pdf)

    if "/A" in link:
        action = link["/A"]
        if not isinstance(action, DictionaryObject) or action.get("/S") != "/GoTo":
            return None

        # Malformed files may omit the destination of a go-to action.
        destination = action.get("/D")
        if destination is None:
            return None
        return _create_link(destination, src)

    if "/Dest" in link:
        return _create_link(link["/Dest"], src)

    return None  # nothing we need to do
519+
520+
def _create_link(ref: PdfObject, src: PdfReader) -> Optional[RefLink]:
    """Wrap a link destination in the matching tracking class.

    Args:
        ref: The destination taken from a link annotation or its action.
        src: The document the link was read from.

    Returns:
        A ``NamedRefLink`` for a text-string destination, a ``DirectRefLink``
        for an array destination, and ``None`` for any other object.
    """
    tracked: Optional[RefLink] = None
    if isinstance(ref, TextStringObject):
        tracked = NamedRefLink(ref, src)
    elif isinstance(ref, ArrayObject):
        tracked = DirectRefLink(ref)
    return tracked
526+
485527
page = cast(
486528
"PageObject", page_org.clone(self, False, excluded_keys).get_object()
487529
)
488530
if page_org.pdf is not None:
489531
other = page_org.pdf.pdf_header
490532
self.pdf_header = _get_max_pdf_version_header(self.pdf_header, other)
533+
491534
node, idx = self._get_page_in_node(index)
492535
page[NameObject(PA.PARENT)] = node.indirect_reference
493536

@@ -505,6 +548,16 @@ def _add_page(
505548
recurse += 1
506549
if recurse > 1000:
507550
raise PyPdfError("Too many recursive calls!")
551+
552+
if page_org.pdf is not None:
553+
# the page may contain links to other pages, and those other
554+
# pages may or may not already be added. We store the
555+
# information we need, so that we can resolve the references
556+
# later.
557+
self._unresolved_links.extend(_extract_links(page, page_org))
558+
self._merged_in_pages.append( (page_org.indirect_reference,
559+
page.indirect_reference) )
560+
508561
return page
509562

510563
def set_need_appearances_writer(self, state: bool = True) -> None:
@@ -1349,6 +1402,22 @@ def encrypt(
13491402
self._add_object(entry)
13501403
self._encrypt_entry = entry
13511404

1405+
def _resolve_links(self) -> None:
1406+
"""Patch up links that were added to the document earlier, to
1407+
make sure they still point to the same pages.
1408+
"""
1409+
for (new_link, old_link) in self._unresolved_links:
1410+
old_page = old_link.find_referenced_page()
1411+
if not old_page:
1412+
continue
1413+
new_page = None
1414+
for (page_org, page_created) in self._merged_in_pages:
1415+
if page_org == old_page:
1416+
new_page = page_created
1417+
if new_page is None:
1418+
continue
1419+
new_link.patch_reference(self, new_page)
1420+
13521421
def write_stream(self, stream: StreamType) -> None:
13531422
if hasattr(stream, "mode") and "b" not in stream.mode:
13541423
logger_warning(
@@ -1360,6 +1429,7 @@ def write_stream(self, stream: StreamType) -> None:
13601429
# if not self._root:
13611430
# self._root = self._add_object(self._root_object)
13621431
# self._sweep_indirect_references(self._root)
1432+
self._resolve_links()
13631433

13641434
if self.incremental:
13651435
self._reader.stream.seek(0)

pypdf/generic/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@
6262
)
6363
from ._files import EmbeddedFile
6464
from ._fit import Fit
65+
from ._link import DirectRefLink, NamedRefLink, RefLink
6566
from ._outline import OutlineItem
6667
from ._rectangle import RectangleObject
6768
from ._utils import (
@@ -208,19 +209,22 @@ def link(
208209
"DecodedStreamObject",
209210
"Destination",
210211
"DictionaryObject",
212+
"DirectRefLink",
211213
"EmbeddedFile",
212214
"EncodedStreamObject",
213215
"Field",
214216
"Fit",
215217
"FloatObject",
216218
"IndirectObject",
217219
"NameObject",
220+
"NamedRefLink",
218221
"NullObject",
219222
"NumberObject",
220223
"OutlineFontFlag",
221224
"OutlineItem",
222225
"PdfObject",
223226
"RectangleObject",
227+
"RefLink",
224228
"StreamObject",
225229
"TextStringObject",
226230
"TreeObject",

pypdf/generic/_link.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# All rights reserved.
2+
#
3+
# Redistribution and use in source and binary forms, with or without
4+
# modification, are permitted provided that the following conditions are
5+
# met:
6+
#
7+
# * Redistributions of source code must retain the above copyright notice,
8+
# this list of conditions and the following disclaimer.
9+
# * Redistributions in binary form must reproduce the above copyright notice,
10+
# this list of conditions and the following disclaimer in the documentation
11+
# and/or other materials provided with the distribution.
12+
# * The name of the author may not be used to endorse or promote products
13+
# derived from this software without specific prior written permission.
14+
#
15+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18+
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19+
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20+
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21+
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22+
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23+
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24+
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25+
# POSSIBILITY OF SUCH DAMAGE.
26+
27+
28+
# This module contains classes used by _writer.py to track links in
29+
# pages being added to the writer until the links can be resolved.
30+
31+
from typing import TYPE_CHECKING, Union
32+
33+
from . import ArrayObject, IndirectObject, TextStringObject
34+
35+
if TYPE_CHECKING:
36+
from .._reader import PdfReader
37+
from .._writer import PdfWriter
38+
39+
class NamedRefLink:
    """Named reference link being preserved until we can resolve it correctly.

    Holds the destination name used by a link together with the document it
    came from, so the name can later be looked up in that document and
    re-created in the target writer.
    """

    def __init__(self, ref: TextStringObject, source_pdf: "PdfReader") -> None:
        """
        Args:
            ref: TextStringObject with the named reference.
            source_pdf: The document whose named destinations define ``ref``.
        """
        self._ref = ref
        self._source_pdf = source_pdf

    def find_referenced_page(self) -> Union[IndirectObject, None]:
        """Return the page the name points to in the source document.

        Returns:
            The ``page`` attribute of the matching named destination, or
            ``None`` when the source document has no destination with that
            name.
        """
        dest = self._source_pdf.named_destinations.get(str(self._ref))
        return dest.page if dest else None

    def patch_reference(self, target_pdf: "PdfWriter", new_page: IndirectObject) -> None:
        """Make the named destination exist in ``target_pdf``.

        Args:
            target_pdf: PdfWriter which the new link went into.
            new_page: Reference of the page the destination should point to.

        Nothing is done when the target already has a destination with this
        name, or when ``new_page`` is not among the target's pages.
        """
        name = str(self._ref)
        if name in target_pdf.named_destinations:
            return

        page_refs = [
            p.indirect_reference for p in (target_pdf.flattened_pages or [])
        ]
        try:
            new_page_ix = page_refs.index(new_page)
        except ValueError:
            # The page is not (or no longer) part of the target's page tree,
            # so there is nothing the destination could point to.
            return
        # Point the named destination in the new PDF to the new page.
        target_pdf.add_named_destination(name, new_page_ix)
59+
60+
61+
class DirectRefLink:
    """Direct reference link being preserved until we can resolve it correctly.

    Wraps the destination array of a link. The array itself is kept, not a
    copy, so patching it changes the link it was taken from.
    """

    def __init__(self, ref: ArrayObject) -> None:
        """
        Args:
            ref: An ArrayObject whose first element is the page indirect
                object.
        """
        self._ref = ref

    def find_referenced_page(self) -> Union[IndirectObject, None]:
        """Return the first element of the destination array.

        Returns:
            The referenced page, or ``None`` when the array is empty (a
            malformed destination) so callers can skip the link.
        """
        if not self._ref:
            return None
        return self._ref[0]

    def patch_reference(self, target_pdf: "PdfWriter", new_page: IndirectObject) -> None:
        """Replace the page element of the destination array in place.

        Args:
            target_pdf: PdfWriter which the new link went into (unused here).
            new_page: Reference of the page the link should now point to.
        """
        if not self._ref:
            # An empty destination has no page slot to overwrite.
            return
        self._ref[0] = new_page
74+
75+
76+
RefLink = Union[NamedRefLink,DirectRefLink]

tests/example_files.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,3 +110,7 @@
110110
url: https://github.com/py-pdf/pypdf/files/12483807/AEO.1172.pdf
111111
- local_filename: iss3268.pdf
112112
url: https://github.com/user-attachments/files/20060394/broken.pdf
113+
- local_filename: direct-link.pdf
114+
url: https://github.com/user-attachments/files/20348304/tst.pdf
115+
- local_filename: named-reference.pdf
116+
url: https://github.com/user-attachments/files/20455804/MinimalJob.pdf

tests/test_merger.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -409,3 +409,107 @@ def test_deprecate_pdfmerger():
409409
def test_get_reference():
    """get_reference() on a writer page yields that page's indirect reference."""
    source_path = RESOURCE_ROOT / "crazyones.pdf"
    writer = PdfWriter(source_path)
    expected = writer.pages[0].indirect_reference
    assert writer.get_reference(writer.pages[0]) == expected
412+
413+
414+
@pytest.mark.enable_socket
def test_direct_link_preserved(pdf_file_path):
    """A direct link between two appended pages still targets the right page."""
    # The base document is arbitrary; any PDF would do.
    base = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf")))
    writer = PdfWriter(clone_from=base)

    # direct-link.pdf contains a direct link from its page 1 to its page 2.
    linked = PdfReader(BytesIO(get_data_from_url(name="direct-link.pdf")))
    for source_page in linked.pages:
        writer.add_page(source_page)

    writer.write(pdf_file_path)

    # Re-read the output and follow the link of the third page.
    result = PdfReader(pdf_file_path)
    annotation = result.pages[2]["/Annots"][0].get_object()
    assert annotation["/Subtype"] == "/Link"
    target = annotation["/Dest"][0]  # indirect ref of page referred to

    following_page = result.flattened_pages[3]
    assert target == following_page.indirect_reference, "Link from page 3 to page 4 is broken"
435+
436+
437+
@pytest.mark.enable_socket
def test_direct_link_preserved_reordering(pdf_file_path):
    """A direct link follows its target page when a later insert shifts it."""
    # The base document is arbitrary; any PDF would do.
    base = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf")))
    writer = PdfWriter(clone_from=base)

    # direct-link.pdf contains a direct link from its page 1 to its page 2.
    linked = PdfReader(BytesIO(get_data_from_url(name="direct-link.pdf")))
    for source_page in linked.pages:
        writer.add_page(source_page)

    # Insert one more page at index 3 to disturb the page order.
    extra_page = base.pages[0]
    writer.insert_page(extra_page, 3)

    writer.write(pdf_file_path)

    # Re-read the output and follow the link of the third page.
    result = PdfReader(pdf_file_path)
    annotation = result.pages[2]["/Annots"][0].get_object()
    assert annotation["/Subtype"] == "/Link"
    target = annotation["/Dest"][0]  # indirect ref of page referred to

    shifted_page = result.flattened_pages[4]  # moved one position further out
    assert target == shifted_page.indirect_reference, "Link from page 3 to page 5 is broken"
461+
462+
463+
@pytest.mark.enable_socket
def test_direct_link_page_missing(pdf_file_path):
    """Writing succeeds when the page a direct link targets was never added."""
    # The base document is arbitrary; any PDF would do.
    base = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf")))
    writer = PdfWriter(clone_from=base)

    # direct-link.pdf contains a direct link from its page 1 to its page 2;
    # only the linking page is added, its target page is deliberately left out.
    linked = PdfReader(BytesIO(get_data_from_url(name="direct-link.pdf")))
    linking_page = linked.pages[0]
    writer.add_page(linking_page)

    # The only expectation is that writing does not raise.
    writer.write(pdf_file_path)
475+
476+
477+
# Checks that links using a named destination still resolve to the intended
# page after all pages of the linking document are appended to a writer.
@pytest.mark.enable_socket
478+
def test_named_reference_preserved(pdf_file_path):
479+
# The base document is arbitrary; any PDF would do.
480+
reader = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf")))
481+
writer = PdfWriter(clone_from = reader)
482+
483+
# This PDF has a named reference from p3 to p5.
484+
merger = PdfReader(BytesIO(get_data_from_url(name="named-reference.pdf")))
485+
for p in merger.pages:
486+
writer.add_page(p)
487+
488+
writer.write(pdf_file_path)
489+
490+
# Re-read the written file; the link is expected on the fifth page and its
# target on the seventh.
# NOTE(review): presumably the base PDF contributes two pages, which shifts
# p3/p5 of the merged file to these indices -- confirm.
check = PdfReader(pdf_file_path)
491+
page5 = check.pages[4]
492+
page7 = check.flattened_pages[6]
493+
for link in page5["/Annots"]:
494+
action = link["/A"]
495+
assert action.get("/S") == "/GoTo"
496+
# The destination is a name that must exist in the output's named
# destinations.
dest = str(action["/D"])
497+
assert dest in check.named_destinations
498+
pref = check.named_destinations[dest].page
499+
500+
assert pref == page7.indirect_reference, "Link from page 5 to page 7 is broken"
501+
502+
503+
@pytest.mark.enable_socket
def test_named_ref_to_page_thats_gone(pdf_file_path):
    """Writing succeeds when a named reference points at a page that is gone."""
    original = PdfReader(BytesIO(get_data_from_url(name="named-reference.pdf")))

    # First pass: keep only the page that carries the reference.
    intermediate = BytesIO()
    first_pass = PdfWriter()
    first_pass.add_page(original.pages[2])
    first_pass.write(intermediate)

    reloaded = PdfReader(intermediate)

    # Second pass: the reference now points to a non-existent page, and
    # writing must not crash.
    second_pass = PdfWriter()
    second_pass.add_page(reloaded.pages[0])
    second_pass.write(pdf_file_path)

0 commit comments

Comments
 (0)