Skip to content

Commit ae7c333

Browse files
committed
Changes to comply with review comments
1 parent 235c784 commit ae7c333

File tree

4 files changed

+65
-63
lines changed

4 files changed

+65
-63
lines changed

pypdf/_writer.py

Lines changed: 5 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -95,22 +95,21 @@
9595
DecodedStreamObject,
9696
Destination,
9797
DictionaryObject,
98-
DirectRefLink,
9998
Fit,
10099
FloatObject,
101100
IndirectObject,
102-
NamedRefLink,
103101
NameObject,
104102
NullObject,
105103
NumberObject,
106104
PdfObject,
107105
RectangleObject,
108-
RefLink,
106+
ReferenceLink,
109107
StreamObject,
110108
TextStringObject,
111109
TreeObject,
112110
ViewerPreferences,
113111
create_string_object,
112+
extract_links,
114113
hex_to_rgb,
115114
is_null_or_none,
116115
)
@@ -212,9 +211,9 @@ def __init__(
212211
"""The PDF file identifier,
213212
defined by the ID in the PDF file's trailer dictionary."""
214213

215-
self._unresolved_links: list[tuple[RefLink,RefLink]] = []
214+
self._unresolved_links: list[tuple[ReferenceLink, ReferenceLink]] = []
216215
"Tracks links in pages added to the writer for resolving later."
217-
self._merged_in_pages: Dict[Optional[IndirectObject],Optional[IndirectObject]] = {}
216+
self._merged_in_pages: Dict[Optional[IndirectObject], Optional[IndirectObject]] = {}
218217
"Tracks pages added to the writer and what page they turned into."
219218

220219
if self.incremental:
@@ -491,39 +490,6 @@ def _add_page(
491490
except Exception:
492491
pass
493492

494-
def _extract_links(new_page: PageObject, old_page: PageObject) -> List[Tuple[RefLink,RefLink]]:
495-
new_links = [_build_link(link, new_page) for link in new_page.get("/Annots", [])]
496-
old_links = [_build_link(link, old_page) for link in old_page.get("/Annots", [])]
497-
498-
return [(new_link, old_link) for (new_link, old_link)
499-
in zip(new_links, old_links)
500-
if new_link and old_link]
501-
502-
def _build_link(indir_obj: IndirectObject, page: PageObject) -> Optional[RefLink]:
503-
src = cast(PdfReader, page.pdf)
504-
link = cast(DictionaryObject, indir_obj.get_object())
505-
if link.get("/Subtype") != "/Link":
506-
return None
507-
508-
if "/A" in link:
509-
action = cast(DictionaryObject, link["/A"])
510-
if action.get("/S") != "/GoTo":
511-
return None
512-
513-
return _create_link(action["/D"], src)
514-
515-
if "/Dest" in link:
516-
return _create_link(link["/Dest"], src)
517-
518-
return None # nothing we need to do
519-
520-
def _create_link(ref: PdfObject, src: PdfReader)-> Optional[RefLink]:
521-
if isinstance(ref, TextStringObject):
522-
return NamedRefLink(ref, src)
523-
if isinstance(ref, ArrayObject):
524-
return DirectRefLink(ref)
525-
return None
526-
527493
page = cast(
528494
"PageObject", page_org.clone(self, False, excluded_keys).get_object()
529495
)
@@ -554,7 +520,7 @@ def _create_link(ref: PdfObject, src: PdfReader)-> Optional[RefLink]:
554520
# pages may or may not already be added. we store the
555521
# information we need, so that we can resolve the references
556522
# later.
557-
self._unresolved_links.extend(_extract_links(page, page_org))
523+
self._unresolved_links.extend(extract_links(page, page_org))
558524
self._merged_in_pages[page_org.indirect_reference] = page.indirect_reference
559525

560526
return page

pypdf/generic/__init__.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@
6262
)
6363
from ._files import EmbeddedFile
6464
from ._fit import Fit
65-
from ._link import DirectRefLink, NamedRefLink, RefLink
65+
from ._link import ReferenceLink, extract_links
6666
from ._outline import OutlineItem
6767
from ._rectangle import RectangleObject
6868
from ._utils import (
@@ -209,22 +209,20 @@ def link(
209209
"DecodedStreamObject",
210210
"Destination",
211211
"DictionaryObject",
212-
"DirectRefLink",
213212
"EmbeddedFile",
214213
"EncodedStreamObject",
215214
"Field",
216215
"Fit",
217216
"FloatObject",
218217
"IndirectObject",
219218
"NameObject",
220-
"NamedRefLink",
221219
"NullObject",
222220
"NumberObject",
223221
"OutlineFontFlag",
224222
"OutlineItem",
225223
"PdfObject",
226224
"RectangleObject",
227-
"RefLink",
225+
"ReferenceLink",
228226
"StreamObject",
229227
"TextStringObject",
230228
"TreeObject",
@@ -233,6 +231,7 @@ def link(
233231
"create_string_object",
234232
"decode_pdfdocencoding",
235233
"encode_pdfdocencoding",
234+
"extract_links",
236235
"hex_to_rgb",
237236
"is_null_or_none",
238237
"read_hex_string_from_stream",

pypdf/generic/_link.py

Lines changed: 56 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -25,50 +25,87 @@
2525
# POSSIBILITY OF SUCH DAMAGE.
2626

2727

28-
# This module contains classes used by _writer.py to track links in
29-
# pages being added to the writer until the links can be resolved.
28+
# This module contains code used by _writer.py to track links in pages
29+
# being added to the writer until the links can be resolved.
3030

31-
from typing import TYPE_CHECKING, Union
31+
from typing import TYPE_CHECKING, List, Optional, Tuple, Union, cast
3232

33-
from . import ArrayObject, IndirectObject, TextStringObject
33+
from . import ArrayObject, DictionaryObject, IndirectObject, PdfObject, TextStringObject
3434

3535
if TYPE_CHECKING:
36+
from .._page import PageObject
3637
from .._reader import PdfReader
3738
from .._writer import PdfWriter
3839

3940

class NamedReferenceLink:
    """Named reference link being preserved until we can resolve it correctly.

    Pairs the destination name used by a link with the source PDF whose
    named destinations define that name.
    """

    def __init__(self, reference: TextStringObject, source_pdf: "PdfReader") -> None:
        """
        Args:
            reference: TextStringObject with the named reference.
            source_pdf: Reader whose ``named_destinations`` is searched for
                the name.
        """
        self._reference = reference
        self._source_pdf = source_pdf

    def find_referenced_page(self) -> Union[IndirectObject, None]:
        """Return the page the name points to in the source PDF.

        Returns:
            The ``page`` attribute of the matching named destination, or
            ``None`` when the source PDF has no destination with this name.
        """
        dest = self._source_pdf.named_destinations.get(str(self._reference))
        # Compare against None explicitly: the looked-up destination is a
        # mapping-like object, and a truthiness test would also discard a
        # destination that merely evaluates as empty.
        return dest.page if dest is not None else None

    def patch_reference(self, target_pdf: "PdfWriter", new_page: IndirectObject) -> None:
        """Make the name resolvable in the target PDF.

        Args:
            target_pdf: PdfWriter which the new link went into.
            new_page: The page in ``target_pdf`` that the name should lead to.
        """
        name = str(self._reference)
        # Point the named destination in the new PDF to the new page, but
        # leave an already existing destination of the same name untouched.
        if name not in target_pdf.named_destinations:
            target_pdf.add_named_destination(name, new_page.page_number)

5859

59-
class DirectRefLink:
60+
class DirectReferenceLink:
6061
"""Direct reference link being preserved until we can resolve it correctly."""
6162

62-
def __init__(self, ref: ArrayObject) -> None:
63-
"""ref: an ArrayObject whose first element is the Page indir obj"""
64-
self._ref = ref
63+
def __init__(self, reference: ArrayObject) -> None:
64+
"""reference: an ArrayObject whose first element is the Page indir obj"""
65+
self._reference = reference
6566

6667
def find_referenced_page(self) -> IndirectObject:
67-
return self._ref[0]
68+
return self._reference[0]
6869

6970
def patch_reference(self, target_pdf: "PdfWriter", new_page: IndirectObject) -> None:
7071
"""target_pdf: PdfWriter which the new link went into"""
71-
self._ref[0] = new_page
72+
self._reference[0] = new_page
7273

7374

74-
RefLink = Union[NamedRefLink,DirectRefLink]
75+
ReferenceLink = Union[NamedReferenceLink, DirectReferenceLink]
76+
77+
def extract_links(new_page: "PageObject", old_page: "PageObject") -> List[Tuple[ReferenceLink, ReferenceLink]]:
    """Pair up the resolvable links of a page and of the page it replaces.

    The "/Annots" entries of both pages are converted with ``_build_link``
    and matched by position. A pair is kept only when both sides produced a
    link.

    Args:
        new_page: The page whose links will later be patched.
        old_page: The page the new one corresponds to.

    Returns:
        A list of ``(new link, old link)`` tuples.
    """
    # Convert all annotations of the new page first, then those of the old
    # page, before any pairing takes place.
    converted_new = [_build_link(annotation, new_page) for annotation in new_page.get("/Annots", [])]
    converted_old = [_build_link(annotation, old_page) for annotation in old_page.get("/Annots", [])]

    pairs = []
    for fresh, previous in zip(converted_new, converted_old):
        # Drop the position when either side is not a link we track.
        if fresh and previous:
            pairs.append((fresh, previous))
    return pairs
85+
86+
def _build_link(indir_obj: IndirectObject, page: "PageObject") -> Optional[ReferenceLink]:
    """Turn one annotation of ``page`` into a reference link, if it is one.

    Only "/Link" annotations are considered. The destination is taken from
    the "/D" entry of a "/GoTo" action ("/A") or, failing that, from the
    annotation's "/Dest" entry.

    Args:
        indir_obj: Entry of the page's "/Annots" array.
        page: The page the annotation belongs to; its ``pdf`` attribute is
            passed on as the source document.

    Returns:
        The link built by ``_create_link``, or ``None`` when the annotation
        is not a dictionary, is not a link, uses an action other than
        "/GoTo", or carries no destination.
    """
    src = cast("PdfReader", page.pdf)
    link = cast(DictionaryObject, indir_obj.get_object())
    # An annotation entry may resolve to something that is not a dictionary
    # (e.g. a null object); such entries have no "/Subtype" to inspect.
    if not isinstance(link, DictionaryObject) or link.get("/Subtype") != "/Link":
        return None

    if "/A" in link:
        action = cast(DictionaryObject, link["/A"])
        if action.get("/S") != "/GoTo":
            return None

        # A "/GoTo" action without "/D" is malformed; skip it instead of
        # failing with KeyError.
        if "/D" not in action:
            return None
        return _create_link(action["/D"], src)

    if "/Dest" in link:
        return _create_link(link["/Dest"], src)

    return None  # nothing we need to do
104+
105+
def _create_link(ref: PdfObject, src: "PdfReader") -> Optional[ReferenceLink]:
    """Wrap a destination value in the matching reference-link class.

    Args:
        ref: The destination value taken from a link annotation.
        src: The source document, handed to named links.

    Returns:
        A ``NamedReferenceLink`` for a text string, a ``DirectReferenceLink``
        for an array, and ``None`` for any other kind of object.
    """
    result: Optional[ReferenceLink]
    if isinstance(ref, TextStringObject):
        result = NamedReferenceLink(ref, src)
    elif isinstance(ref, ArrayObject):
        result = DirectReferenceLink(ref)
    else:
        result = None
    return result

tests/test_merger.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -501,7 +501,7 @@ def test_named_reference_preserved(pdf_file_path):
501501

502502

503503
@pytest.mark.enable_socket
504-
def test_named_ref_to_page_thats_gone(pdf_file_path):
504+
def test_named_ref_to_page_that_is_gone(pdf_file_path):
505505
source = PdfReader(BytesIO(get_data_from_url(name="named-reference.pdf")))
506506
buf = BytesIO()
507507
tmp = PdfWriter()

0 commit comments

Comments
 (0)