From 09ea9b0eead502a59e3cbccc3903bfe5bdb652b7 Mon Sep 17 00:00:00 2001 From: Lars Marius Garshol Date: Tue, 27 May 2025 15:42:55 +0200 Subject: [PATCH 1/7] ENH: Automatically preserve links in added pages --- pypdf/_writer.py | 66 +++++++++++++++++++ pypdf/generic/__init__.py | 4 ++ pypdf/generic/_data_structures.py | 4 +- pypdf/generic/_link.py | 74 +++++++++++++++++++++ tests/example_files.yaml | 4 ++ tests/test_merger.py | 104 ++++++++++++++++++++++++++++++ 6 files changed, 254 insertions(+), 2 deletions(-) create mode 100644 pypdf/generic/_link.py diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 7a3649e1f..963a38b4c 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -95,14 +95,17 @@ DecodedStreamObject, Destination, DictionaryObject, + DirectRefLink, Fit, FloatObject, IndirectObject, + NamedRefLink, NameObject, NullObject, NumberObject, PdfObject, RectangleObject, + RefLink, StreamObject, TextStringObject, TreeObject, @@ -209,6 +212,11 @@ def __init__( """The PDF file identifier, defined by the ID in the PDF file's trailer dictionary.""" + self._unresolved_links: list[tuple[RefLink,RefLink]] = [] + "Tracks links in pages added to the writer for resolving later." + self._merged_in_pages: Dict[Optional[IndirectObject],Optional[IndirectObject]] = {} + "Tracks pages added to the writer and what page they turned into." 
+ if self.incremental: if isinstance(fileobj, (str, Path)): with open(fileobj, "rb") as f: @@ -482,12 +490,47 @@ def _add_page( ] except Exception: pass + + def _extract_links(new_page: PageObject, old_page: PageObject) -> List[Tuple[RefLink,RefLink]]: + new_links = [_build_link(link, new_page) for link in new_page.get("/Annots", [])] + old_links = [_build_link(link, old_page) for link in old_page.get("/Annots", [])] + + return [(new_link, old_link) for (new_link, old_link) + in zip(new_links, old_links) + if new_link and old_link] + + def _build_link(indir_obj: IndirectObject, page: PageObject) -> Optional[RefLink]: + src = cast(PdfReader, page.pdf) + link = cast(DictionaryObject, indir_obj.get_object()) + if link.get("/Subtype") != "/Link": + return None + + if "/A" in link: + action = cast(DictionaryObject, link["/A"]) + if action.get("/S") != "/GoTo": + return None + + return _create_link(action["/D"], src) + + if "/Dest" in link: + return _create_link(link["/Dest"], src) + + return None # nothing we need to do + + def _create_link(ref: PdfObject, src: PdfReader)-> Optional[RefLink]: + if isinstance(ref, TextStringObject): + return NamedRefLink(ref, src) + if isinstance(ref, ArrayObject): + return DirectRefLink(ref) + return None + page = cast( "PageObject", page_org.clone(self, False, excluded_keys).get_object() ) if page_org.pdf is not None: other = page_org.pdf.pdf_header self.pdf_header = _get_max_pdf_version_header(self.pdf_header, other) + node, idx = self._get_page_in_node(index) page[NameObject(PA.PARENT)] = node.indirect_reference @@ -505,6 +548,15 @@ def _add_page( recurse += 1 if recurse > 1000: raise PyPdfError("Too many recursive calls!") + + if page_org.pdf is not None: + # the page may contain links to other pages, and those other + # pages may or may not already be added. we store the + # information we need, so that we can resolve the references + # later. 
+ self._unresolved_links.extend(_extract_links(page, page_org)) + self._merged_in_pages[page_org.indirect_reference] = page.indirect_reference + return page def set_need_appearances_writer(self, state: bool = True) -> None: @@ -1349,6 +1401,19 @@ def encrypt( self._add_object(entry) self._encrypt_entry = entry + def _resolve_links(self) -> None: + """Patch up links that were added to the document earlier, to + make sure they still point to the same pages. + """ + for (new_link, old_link) in self._unresolved_links: + old_page = old_link.find_referenced_page() + if not old_page: + continue + new_page = self._merged_in_pages.get(old_page) + if new_page is None: + continue + new_link.patch_reference(self, new_page) + def write_stream(self, stream: StreamType) -> None: if hasattr(stream, "mode") and "b" not in stream.mode: logger_warning( @@ -1360,6 +1425,7 @@ def write_stream(self, stream: StreamType) -> None: # if not self._root: # self._root = self._add_object(self._root_object) # self._sweep_indirect_references(self._root) + self._resolve_links() if self.incremental: self._reader.stream.seek(0) diff --git a/pypdf/generic/__init__.py b/pypdf/generic/__init__.py index dc4545993..e83fd85ca 100644 --- a/pypdf/generic/__init__.py +++ b/pypdf/generic/__init__.py @@ -62,6 +62,7 @@ ) from ._files import EmbeddedFile from ._fit import Fit +from ._link import DirectRefLink, NamedRefLink, RefLink from ._outline import OutlineItem from ._rectangle import RectangleObject from ._utils import ( @@ -208,6 +209,7 @@ def link( "DecodedStreamObject", "Destination", "DictionaryObject", + "DirectRefLink", "EmbeddedFile", "EncodedStreamObject", "Field", @@ -215,12 +217,14 @@ def link( "FloatObject", "IndirectObject", "NameObject", + "NamedRefLink", "NullObject", "NumberObject", "OutlineFontFlag", "OutlineItem", "PdfObject", "RectangleObject", + "RefLink", "StreamObject", "TextStringObject", "TreeObject", diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py 
index 5675fa7c6..88f87351c 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -1706,8 +1706,8 @@ def title(self) -> Optional[str]: return self.get("/Title") @property - def page(self) -> Optional[int]: - """Read-only property accessing the destination page number.""" + def page(self) -> Optional[IndirectObject]: + """Read-only property accessing the IndirectObject of the destination page.""" return self.get("/Page") @property diff --git a/pypdf/generic/_link.py b/pypdf/generic/_link.py new file mode 100644 index 000000000..b5b0ad1a5 --- /dev/null +++ b/pypdf/generic/_link.py @@ -0,0 +1,74 @@ +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This module contains classes used by _writer.py to track links in +# pages being added to the writer until the links can be resolved. + +from typing import TYPE_CHECKING, Union + +from . import ArrayObject, IndirectObject, TextStringObject + +if TYPE_CHECKING: + from .._reader import PdfReader + from .._writer import PdfWriter + + +class NamedRefLink: + """Named reference link being preserved until we can resolve it correctly.""" + + def __init__(self, ref: TextStringObject, source_pdf: "PdfReader") -> None: + """ref: TextStringObject with named reference""" + self._ref = ref + self._source_pdf = source_pdf + + def find_referenced_page(self) -> Union[IndirectObject,None]: + dest = self._source_pdf.named_destinations.get(str(self._ref)) + return dest.page if dest else None + + def patch_reference(self, target_pdf: "PdfWriter", new_page: IndirectObject) -> None: + """target_pdf: PdfWriter which the new link went into""" + # point named destination in new PDF to the new page + if str(self._ref) not in target_pdf.named_destinations: + target_pdf.add_named_destination(str(self._ref), new_page.page_number) + + +class DirectRefLink: + """Direct reference link being preserved until we can resolve it correctly.""" + + def __init__(self, ref: ArrayObject) -> None: + """ref: an ArrayObject whose first element is the Page indir obj""" + self._ref = ref + + def find_referenced_page(self) -> IndirectObject: + return self._ref[0] + + def 
patch_reference(self, target_pdf: "PdfWriter", new_page: IndirectObject) -> None: + """target_pdf: PdfWriter which the new link went into""" + self._ref[0] = new_page + + +RefLink = Union[NamedRefLink,DirectRefLink] diff --git a/tests/example_files.yaml b/tests/example_files.yaml index 4ea82a0d5..249c6a265 100644 --- a/tests/example_files.yaml +++ b/tests/example_files.yaml @@ -110,3 +110,7 @@ url: https://github.com/py-pdf/pypdf/files/12483807/AEO.1172.pdf - local_filename: iss3268.pdf url: https://github.com/user-attachments/files/20060394/broken.pdf +- local_filename: direct-link.pdf + url: https://github.com/user-attachments/files/20348304/tst.pdf +- local_filename: named-reference.pdf + url: https://github.com/user-attachments/files/20455804/MinimalJob.pdf diff --git a/tests/test_merger.py b/tests/test_merger.py index e5680d647..8eec54195 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -409,3 +409,107 @@ def test_deprecate_pdfmerger(): def test_get_reference(): writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf") assert writer.get_reference(writer.pages[0]) == writer.pages[0].indirect_reference + + +@pytest.mark.enable_socket +def test_direct_link_preserved(pdf_file_path): + # this could be any PDF -- we don't care which + reader = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf"))) + writer = PdfWriter(clone_from = reader) + + # this PDF has a direct link from p1 to p2 + merger = PdfReader(BytesIO(get_data_from_url(name="direct-link.pdf"))) + for p in merger.pages: + writer.add_page(p) + + writer.write(pdf_file_path) + + check = PdfReader(pdf_file_path) + page3 = check.pages[2] + link = page3["/Annots"][0].get_object() + assert link["/Subtype"] == "/Link" + dest = link["/Dest"][0] # indirect ref of page referred to + + page4 = check.flattened_pages[3] + assert dest == page4.indirect_reference, "Link from page 3 to page 4 is broken" + + +@pytest.mark.enable_socket +def test_direct_link_preserved_reordering(pdf_file_path): + # this could be 
any PDF -- we don't care which + reader = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf"))) + writer = PdfWriter(clone_from = reader) + + # this PDF has a direct link from p1 to p2 + merger = PdfReader(BytesIO(get_data_from_url(name="direct-link.pdf"))) + for p in merger.pages: + writer.add_page(p) + + # let's insert a page to mess up the page order + writer.insert_page(reader.pages[0], 3) + + writer.write(pdf_file_path) + + check = PdfReader(pdf_file_path) + page3 = check.pages[2] + link = page3["/Annots"][0].get_object() + assert link["/Subtype"] == "/Link" + dest = link["/Dest"][0] # indirect ref of page referred to + + page5 = check.flattened_pages[4] # it moved one out + assert dest == page5.indirect_reference, "Link from page 3 to page 5 is broken" + + +@pytest.mark.enable_socket +def test_direct_link_page_missing(pdf_file_path): + # this could be any PDF -- we don't care which + reader = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf"))) + writer = PdfWriter(clone_from = reader) + + # this PDF has a direct link from p1 to p2 + merger = PdfReader(BytesIO(get_data_from_url(name="direct-link.pdf"))) + writer.add_page(merger.pages[0]) + # but we're not adding page 2 + + writer.write(pdf_file_path) # verify nothing crashes + + +@pytest.mark.enable_socket +def test_named_reference_preserved(pdf_file_path): + # this could be any PDF -- we don't care which + reader = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf"))) + writer = PdfWriter(clone_from = reader) + + # this PDF has a named reference from p3 to p5 + merger = PdfReader(BytesIO(get_data_from_url(name="named-reference.pdf"))) + for p in merger.pages: + writer.add_page(p) + + writer.write(pdf_file_path) + + check = PdfReader(pdf_file_path) + page5 = check.pages[4] + page7 = check.flattened_pages[6] + for link in page5["/Annots"]: + action = link["/A"] + assert action.get("/S") == "/GoTo" + dest = str(action["/D"]) + assert dest in check.named_destinations + pref = 
check.named_destinations[dest].page + + assert pref == page7.indirect_reference, "Link from page 5 to page 7 is broken" + + +@pytest.mark.enable_socket +def test_named_ref_to_page_thats_gone(pdf_file_path): + source = PdfReader(BytesIO(get_data_from_url(name="named-reference.pdf"))) + buf = BytesIO() + tmp = PdfWriter() + tmp.add_page(source.pages[2]) # we add only the page with the reference + tmp.write(buf) + + source = PdfReader(buf) + + writer = PdfWriter() + writer.add_page(source.pages[0]) # now references to non-existent page + writer.write(pdf_file_path) # don't crash From c4486ddb58e27e5d6a9aa77f06516bbd9d935f1e Mon Sep 17 00:00:00 2001 From: Lars Marius Garshol Date: Wed, 18 Jun 2025 10:36:03 +0200 Subject: [PATCH 2/7] Changes to comply with review comments --- pypdf/_writer.py | 44 +++------------------- pypdf/generic/__init__.py | 9 +++-- pypdf/generic/_link.py | 78 +++++++++++++++++++++++++++++---------- tests/test_merger.py | 2 +- 4 files changed, 70 insertions(+), 63 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 73c44f41b..09f17591d 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -95,22 +95,21 @@ DecodedStreamObject, Destination, DictionaryObject, - DirectRefLink, Fit, FloatObject, IndirectObject, - NamedRefLink, NameObject, NullObject, NumberObject, PdfObject, RectangleObject, - RefLink, + ReferenceLink, StreamObject, TextStringObject, TreeObject, ViewerPreferences, create_string_object, + extract_links, hex_to_rgb, is_null_or_none, ) @@ -212,9 +211,9 @@ def __init__( """The PDF file identifier, defined by the ID in the PDF file's trailer dictionary.""" - self._unresolved_links: list[tuple[RefLink,RefLink]] = [] + self._unresolved_links: list[tuple[ReferenceLink, ReferenceLink]] = [] "Tracks links in pages added to the writer for resolving later." 
- self._merged_in_pages: Dict[Optional[IndirectObject],Optional[IndirectObject]] = {} + self._merged_in_pages: Dict[Optional[IndirectObject], Optional[IndirectObject]] = {} "Tracks pages added to the writer and what page they turned into." if self.incremental: @@ -491,39 +490,6 @@ def _add_page( except Exception: pass - def _extract_links(new_page: PageObject, old_page: PageObject) -> List[Tuple[RefLink,RefLink]]: - new_links = [_build_link(link, new_page) for link in new_page.get("/Annots", [])] - old_links = [_build_link(link, old_page) for link in old_page.get("/Annots", [])] - - return [(new_link, old_link) for (new_link, old_link) - in zip(new_links, old_links) - if new_link and old_link] - - def _build_link(indir_obj: IndirectObject, page: PageObject) -> Optional[RefLink]: - src = cast(PdfReader, page.pdf) - link = cast(DictionaryObject, indir_obj.get_object()) - if link.get("/Subtype") != "/Link": - return None - - if "/A" in link: - action = cast(DictionaryObject, link["/A"]) - if action.get("/S") != "/GoTo": - return None - - return _create_link(action["/D"], src) - - if "/Dest" in link: - return _create_link(link["/Dest"], src) - - return None # nothing we need to do - - def _create_link(ref: PdfObject, src: PdfReader)-> Optional[RefLink]: - if isinstance(ref, TextStringObject): - return NamedRefLink(ref, src) - if isinstance(ref, ArrayObject): - return DirectRefLink(ref) - return None - page = cast( "PageObject", page_org.clone(self, False, excluded_keys).get_object() ) @@ -554,7 +520,7 @@ def _create_link(ref: PdfObject, src: PdfReader)-> Optional[RefLink]: # pages may or may not already be added. we store the # information we need, so that we can resolve the references # later. 
- self._unresolved_links.extend(_extract_links(page, page_org)) + self._unresolved_links.extend(extract_links(page, page_org)) self._merged_in_pages[page_org.indirect_reference] = page.indirect_reference return page diff --git a/pypdf/generic/__init__.py b/pypdf/generic/__init__.py index e83fd85ca..4df7852db 100644 --- a/pypdf/generic/__init__.py +++ b/pypdf/generic/__init__.py @@ -62,7 +62,7 @@ ) from ._files import EmbeddedFile from ._fit import Fit -from ._link import DirectRefLink, NamedRefLink, RefLink +from ._link import DirectReferenceLink, NamedReferenceLink, ReferenceLink, extract_links from ._outline import OutlineItem from ._rectangle import RectangleObject from ._utils import ( @@ -209,7 +209,7 @@ def link( "DecodedStreamObject", "Destination", "DictionaryObject", - "DirectRefLink", + "DirectReferenceLink", "EmbeddedFile", "EncodedStreamObject", "Field", @@ -217,14 +217,14 @@ def link( "FloatObject", "IndirectObject", "NameObject", - "NamedRefLink", + "NamedReferenceLink", "NullObject", "NumberObject", "OutlineFontFlag", "OutlineItem", "PdfObject", "RectangleObject", - "RefLink", + "ReferenceLink", "StreamObject", "TextStringObject", "TreeObject", @@ -233,6 +233,7 @@ def link( "create_string_object", "decode_pdfdocencoding", "encode_pdfdocencoding", + "extract_links", "hex_to_rgb", "is_null_or_none", "read_hex_string_from_stream", diff --git a/pypdf/generic/_link.py b/pypdf/generic/_link.py index b5b0ad1a5..23feb6927 100644 --- a/pypdf/generic/_link.py +++ b/pypdf/generic/_link.py @@ -25,50 +25,90 @@ # POSSIBILITY OF SUCH DAMAGE. -# This module contains classes used by _writer.py to track links in -# pages being added to the writer until the links can be resolved. +# This module contains code used by _writer.py to track links in pages +# being added to the writer until the links can be resolved. -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, Union, cast -from . 
import ArrayObject, IndirectObject, TextStringObject +from . import ArrayObject, DictionaryObject, IndirectObject, PdfObject, TextStringObject if TYPE_CHECKING: + from .._page import PageObject from .._reader import PdfReader from .._writer import PdfWriter -class NamedRefLink: +class NamedReferenceLink: """Named reference link being preserved until we can resolve it correctly.""" - def __init__(self, ref: TextStringObject, source_pdf: "PdfReader") -> None: - """ref: TextStringObject with named reference""" - self._ref = ref + def __init__(self, reference: TextStringObject, source_pdf: "PdfReader") -> None: + """reference: TextStringObject with named reference""" + self._reference = reference self._source_pdf = source_pdf - def find_referenced_page(self) -> Union[IndirectObject,None]: - dest = self._source_pdf.named_destinations.get(str(self._ref)) + def find_referenced_page(self) -> Union[IndirectObject, None]: + dest = self._source_pdf.named_destinations.get(str(self._reference)) return dest.page if dest else None def patch_reference(self, target_pdf: "PdfWriter", new_page: IndirectObject) -> None: """target_pdf: PdfWriter which the new link went into""" # point named destination in new PDF to the new page - if str(self._ref) not in target_pdf.named_destinations: - target_pdf.add_named_destination(str(self._ref), new_page.page_number) + if str(self._reference) not in target_pdf.named_destinations: + target_pdf.add_named_destination(str(self._reference), new_page.page_number) -class DirectRefLink: +class DirectReferenceLink: """Direct reference link being preserved until we can resolve it correctly.""" - def __init__(self, ref: ArrayObject) -> None: - """ref: an ArrayObject whose first element is the Page indir obj""" - self._ref = ref + def __init__(self, reference: ArrayObject) -> None: + """reference: an ArrayObject whose first element is the Page indir obj""" + self._reference = reference def find_referenced_page(self) -> IndirectObject: - return self._ref[0] 
+ return self._reference[0] def patch_reference(self, target_pdf: "PdfWriter", new_page: IndirectObject) -> None: """target_pdf: PdfWriter which the new link went into""" - self._ref[0] = new_page + self._reference[0] = new_page -RefLink = Union[NamedRefLink,DirectRefLink] +ReferenceLink = Union[NamedReferenceLink, DirectReferenceLink] + + +def extract_links(new_page: "PageObject", old_page: "PageObject") -> List[Tuple[ReferenceLink, ReferenceLink]]: + """Extracts links from two pages on the assumption that the two pages are + the same. Produces one list of (new link, old link) tuples. + """ + new_links = [_build_link(link, new_page) for link in new_page.get("/Annots", [])] + old_links = [_build_link(link, old_page) for link in old_page.get("/Annots", [])] + + return [(new_link, old_link) for (new_link, old_link) + in zip(new_links, old_links) + if new_link and old_link] + + +def _build_link(indir_obj: IndirectObject, page: "PageObject") -> Optional[ReferenceLink]: + src = cast("PdfReader", page.pdf) + link = cast(DictionaryObject, indir_obj.get_object()) + if link.get("/Subtype") != "/Link": + return None + + if "/A" in link: + action = cast(DictionaryObject, link["/A"]) + if action.get("/S") != "/GoTo": + return None + + return _create_link(action["/D"], src) + + if "/Dest" in link: + return _create_link(link["/Dest"], src) + + return None # nothing we need to do + + +def _create_link(ref: PdfObject, src: "PdfReader")-> Optional[ReferenceLink]: + if isinstance(ref, TextStringObject): + return NamedReferenceLink(ref, src) + if isinstance(ref, ArrayObject): + return DirectReferenceLink(ref) + return None diff --git a/tests/test_merger.py b/tests/test_merger.py index 8eec54195..ca105f475 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -501,7 +501,7 @@ def test_named_reference_preserved(pdf_file_path): @pytest.mark.enable_socket -def test_named_ref_to_page_thats_gone(pdf_file_path): +def test_named_ref_to_page_that_is_gone(pdf_file_path): source = 
PdfReader(BytesIO(get_data_from_url(name="named-reference.pdf"))) buf = BytesIO() tmp = PdfWriter() From 0098f59429ef5bb02d6666172000e321504811a6 Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Tue, 1 Jul 2025 11:58:51 +0200 Subject: [PATCH 3/7] improve wording --- pypdf/generic/_link.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pypdf/generic/_link.py b/pypdf/generic/_link.py index 23feb6927..11c4a918c 100644 --- a/pypdf/generic/_link.py +++ b/pypdf/generic/_link.py @@ -47,8 +47,8 @@ def __init__(self, reference: TextStringObject, source_pdf: "PdfReader") -> None self._source_pdf = source_pdf def find_referenced_page(self) -> Union[IndirectObject, None]: - dest = self._source_pdf.named_destinations.get(str(self._reference)) - return dest.page if dest else None + destination = self._source_pdf.named_destinations.get(str(self._reference)) + return destination.page if destination else None def patch_reference(self, target_pdf: "PdfWriter", new_page: IndirectObject) -> None: """target_pdf: PdfWriter which the new link went into""" @@ -61,7 +61,7 @@ class DirectReferenceLink: """Direct reference link being preserved until we can resolve it correctly.""" def __init__(self, reference: ArrayObject) -> None: - """reference: an ArrayObject whose first element is the Page indir obj""" + """reference: an ArrayObject whose first element is the Page indirect object""" self._reference = reference def find_referenced_page(self) -> IndirectObject: From d9f50403fb8fb28151c01338c62055d280f36151 Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Tue, 1 Jul 2025 12:01:10 +0200 Subject: [PATCH 4/7] improve naming --- pypdf/generic/_link.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/pypdf/generic/_link.py b/pypdf/generic/_link.py index 11c4a918c..1750e2b46 100644 --- a/pypdf/generic/_link.py +++ b/pypdf/generic/_link.py 
@@ -82,14 +82,16 @@ def extract_links(new_page: "PageObject", old_page: "PageObject") -> List[Tuple[ new_links = [_build_link(link, new_page) for link in new_page.get("/Annots", [])] old_links = [_build_link(link, old_page) for link in old_page.get("/Annots", [])] - return [(new_link, old_link) for (new_link, old_link) - in zip(new_links, old_links) - if new_link and old_link] + return [ + (new_link, old_link) for (new_link, old_link) + in zip(new_links, old_links) + if new_link and old_link + ] -def _build_link(indir_obj: IndirectObject, page: "PageObject") -> Optional[ReferenceLink]: +def _build_link(indirect_object: IndirectObject, page: "PageObject") -> Optional[ReferenceLink]: src = cast("PdfReader", page.pdf) - link = cast(DictionaryObject, indir_obj.get_object()) + link = cast(DictionaryObject, indirect_object.get_object()) if link.get("/Subtype") != "/Link": return None @@ -103,12 +105,12 @@ def _build_link(indir_obj: IndirectObject, page: "PageObject") -> Optional[Refer if "/Dest" in link: return _create_link(link["/Dest"], src) - return None # nothing we need to do + return None # Nothing to do here -def _create_link(ref: PdfObject, src: "PdfReader")-> Optional[ReferenceLink]: - if isinstance(ref, TextStringObject): - return NamedReferenceLink(ref, src) - if isinstance(ref, ArrayObject): - return DirectReferenceLink(ref) +def _create_link(reference: PdfObject, source_pdf: "PdfReader")-> Optional[ReferenceLink]: + if isinstance(reference, TextStringObject): + return NamedReferenceLink(reference, source_pdf) + if isinstance(reference, ArrayObject): + return DirectReferenceLink(reference) return None From 89d72fabc00b4145c8cbd706dbad7eb16be7b696 Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Tue, 1 Jul 2025 12:03:26 +0200 Subject: [PATCH 5/7] improve formatting --- tests/test_merger.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_merger.py b/tests/test_merger.py index 
ca105f475..74c2415f4 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -428,7 +428,7 @@ def test_direct_link_preserved(pdf_file_path): page3 = check.pages[2] link = page3["/Annots"][0].get_object() assert link["/Subtype"] == "/Link" - dest = link["/Dest"][0] # indirect ref of page referred to + dest = link["/Dest"][0] # indirect reference of page referred to page4 = check.flattened_pages[3] assert dest == page4.indirect_reference, "Link from page 3 to page 4 is broken" @@ -454,7 +454,7 @@ def test_direct_link_preserved_reordering(pdf_file_path): page3 = check.pages[2] link = page3["/Annots"][0].get_object() assert link["/Subtype"] == "/Link" - dest = link["/Dest"][0] # indirect ref of page referred to + dest = link["/Dest"][0] # indirect reference of page referred to page5 = check.flattened_pages[4] # it moved one out assert dest == page5.indirect_reference, "Link from page 3 to page 5 is broken" @@ -505,11 +505,11 @@ def test_named_ref_to_page_that_is_gone(pdf_file_path): source = PdfReader(BytesIO(get_data_from_url(name="named-reference.pdf"))) buf = BytesIO() tmp = PdfWriter() - tmp.add_page(source.pages[2]) # we add only the page with the reference + tmp.add_page(source.pages[2]) # we add only the page with the reference tmp.write(buf) source = PdfReader(buf) writer = PdfWriter() - writer.add_page(source.pages[0]) # now references to non-existent page - writer.write(pdf_file_path) # don't crash + writer.add_page(source.pages[0]) # now references to non-existent page + writer.write(pdf_file_path) # don't crash From 1c05e70a3080315463f96e020575017fe3be3a4f Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Tue, 1 Jul 2025 12:04:06 +0200 Subject: [PATCH 6/7] improve formatting --- tests/test_merger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_merger.py b/tests/test_merger.py index 74c2415f4..e5c835fab 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -456,7 
+456,7 @@ def test_direct_link_preserved_reordering(pdf_file_path): assert link["/Subtype"] == "/Link" dest = link["/Dest"][0] # indirect reference of page referred to - page5 = check.flattened_pages[4] # it moved one out + page5 = check.flattened_pages[4] # it moved one out assert dest == page5.indirect_reference, "Link from page 3 to page 5 is broken" From d4c29628050c336ac8dd0b7cb2f454f71035ef1e Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Tue, 1 Jul 2025 12:04:23 +0200 Subject: [PATCH 7/7] improve formatting --- tests/test_merger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_merger.py b/tests/test_merger.py index e5c835fab..2683ba5e5 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -471,7 +471,7 @@ def test_direct_link_page_missing(pdf_file_path): writer.add_page(merger.pages[0]) # but we're not adding page 2 - writer.write(pdf_file_path) # verify nothing crashes + writer.write(pdf_file_path) # verify nothing crashes @pytest.mark.enable_socket