From e64fd47a65f693b3fbf8a6effd2393bed09d039c Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 12 Sep 2023 23:10:23 +0200 Subject: [PATCH 01/13] BUG : Accepts Kids in EmbeddedFiles closes #2087 closes #2090 --- pypdf/_protocols.py | 3 + pypdf/_reader.py | 141 +- pypdf/_writer.py | 2 + pypdf/generic/__init__.py | 4 + pypdf/generic/_base.py | 11 + pypdf/generic/_data_structures.py | 220 ++ tests/test_writer.py | 3617 +++++++++++++++-------------- 7 files changed, 2124 insertions(+), 1874 deletions(-) diff --git a/pypdf/_protocols.py b/pypdf/_protocols.py index c6f2bbebd..69d21ba8a 100644 --- a/pypdf/_protocols.py +++ b/pypdf/_protocols.py @@ -76,6 +76,9 @@ def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO]: def _add_object(self, obj: Any) -> Any: ... + def _replace_object(self, indirect_reference: Any, obj: Any) -> Any: + ... + @property def pages(self) -> List[Any]: ... diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 75738fd17..01529ec47 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -39,7 +39,6 @@ Callable, Dict, Iterable, - Iterator, List, Mapping, Optional, @@ -98,6 +97,7 @@ FloatObject, IndirectObject, NameObject, + NameTree, NullObject, NumberObject, PdfObject, @@ -2206,14 +2206,56 @@ def rename_form_topname(self, name: str) -> Optional[DictionaryObject]: interim[NameObject("/T")] = TextStringObject(name) return interim + def _get_embedded_files_root(self) -> Optional[NameTree]: + """ + Returns the EmbeddedFiles root as a NameTree Object + if the root does not exists, return None + """ + catalog = cast(DictionaryObject, self.trailer["/Root"]) + if "/Names" not in catalog: + return None + ef = cast(DictionaryObject, catalog["/Names"]).get("/EmbeddedFiles", None) + if ef is None: + return None + efo = ef.get_object() + # not for reader + """ + if not isinstance(efo,NameTree): + if isinstance(ef,IndirectObject): + ef.replace_object(efo) + else: + 
cast(DictionaryObject,catalog["/Names"])[ + NameObject("/EmbeddedFiles")] = NameTree(efo) + """ + return NameTree(efo) + + @property + def detailed_embedded_files(self) -> Optional[Mapping[str, PdfObject]]: + ef = self._get_embedded_files_root() + if ef: + return ef.list_items() + else: + return None + + @property + def embedded_files(self) -> Optional[Mapping[str, List[bytes]]]: + ef = self._get_embedded_files_root() + if ef: + return {k: v["/EF"]["/F"].get_data() for k, v in ef.list_items().items()} # type: ignore + else: + return None + @property def attachments(self) -> Mapping[str, List[bytes]]: - return LazyDict( - { - name: (self._get_attachment_list, name) - for name in self._list_attachments() - } - ) + ef = self._get_embedded_files_root() + if ef: + d = {} + for k, v in ef.list_items().items(): + if isinstance(v, list): + d[k] = [e["/EF"]["/F"].get_data() for e in v] + return d + else: + return {} def _list_attachments(self) -> List[str]: """ @@ -2222,20 +2264,20 @@ def _list_attachments(self) -> List[str]: Returns: list of filenames """ - catalog = cast(DictionaryObject, self.trailer["/Root"]) - # From the catalog get the embedded file names - try: - filenames = cast( - ArrayObject, - cast( - DictionaryObject, - cast(DictionaryObject, catalog["/Names"])["/EmbeddedFiles"], - )["/Names"], - ) - except KeyError: - return [] - attachments_names = [f for f in filenames if isinstance(f, str)] - return attachments_names + ef = self._get_embedded_files_root() + if ef: + lst = ef.list_keys() + else: + lst = [] + """ + for ip, p in enumerate(self.pages): + for a in [_a.get_object() + for _a in p.get("/Annots",[])]: + if _a.get_object().get("/Subtype","") != "/FileAttachements": + continue + lst.append(f"$page_{ip}.{get_name_from_file_specification(_a)}") + """ + return lst def _get_attachment_list(self, name: str) -> List[bytes]: out = self._get_attachments(name)[name] @@ -2260,53 +2302,18 @@ def _get_attachments( dictionary of filename -> Union[bytestring or 
List[ByteString]] if the filename exists multiple times a List of the different version will be provided """ - catalog = cast(DictionaryObject, self.trailer["/Root"]) - # From the catalog get the embedded file names - try: - filenames = cast( - ArrayObject, - cast( - DictionaryObject, - cast(DictionaryObject, catalog["/Names"])["/EmbeddedFiles"], - )["/Names"], - ) - except KeyError: + ef = self._get_embedded_files_root() + if ef is None: return {} - attachments: Dict[str, Union[bytes, List[bytes]]] = {} - # Loop through attachments - for i in range(len(filenames)): - f = filenames[i] - if isinstance(f, str): - if filename is not None and f != filename: - continue - name = f - f_dict = filenames[i + 1].get_object() - f_data = f_dict["/EF"]["/F"].get_data() - if name in attachments: - if not isinstance(attachments[name], list): - attachments[name] = [attachments[name]] # type:ignore - attachments[name].append(f_data) # type:ignore - else: - attachments[name] = f_data - return attachments - - -class LazyDict(Mapping): - def __init__(self, *args: Any, **kw: Any) -> None: - self._raw_dict = dict(*args, **kw) - - def __getitem__(self, key: str) -> Any: - func, arg = self._raw_dict.__getitem__(key) - return func(arg) - - def __iter__(self) -> Iterator[Any]: - return iter(self._raw_dict) - - def __len__(self) -> int: - return len(self._raw_dict) - - def __str__(self) -> str: - return f"LazyDict(keys={list(self.keys())})" + if filename is None: + return {k: v if len(v) > 1 else v[0] for k, v in self.attachments.items()} + else: + lst = ef.list_get(filename) + return { + filename: [x["/EF"]["/F"].get_data() for x in lst] # type: ignore + if isinstance(lst, list) + else lst["/EF"]["/F"].get_data() # type: ignore + } class PdfFileReader(PdfReader): # deprecated diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 25a6444d3..1d70bba50 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -297,10 +297,12 @@ def _replace_object( if isinstance(indirect_reference, 
IndirectObject): assert indirect_reference.pdf == self indirect_reference = indirect_reference.idnum + gen = self._objects[indirect_reference - 1].indirect_reference.generation # type: ignore self._objects[indirect_reference - 1] = obj return self._objects[indirect_reference - 1] if indirect_reference.pdf != self: raise ValueError("pdf must be self") + obj.indirect_reference = IndirectObject(indirect_reference, gen, self) return self._objects[indirect_reference.idnum - 1] # type: ignore def _add_page( diff --git a/pypdf/generic/__init__.py b/pypdf/generic/__init__.py index 778a9339e..bed5eb601 100644 --- a/pypdf/generic/__init__.py +++ b/pypdf/generic/__init__.py @@ -53,8 +53,10 @@ DictionaryObject, EncodedStreamObject, Field, + NameTree, StreamObject, TreeObject, + get_name_from_file_specification, read_object, ) from ._fit import Fit @@ -444,6 +446,8 @@ def link( "RectangleObject", "Field", "Destination", + "NameTree", + "get_name_from_file_specification", "ViewerPreferences", # --- More specific stuff # Outline diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index 6c3e41647..bd1d15f05 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -314,6 +314,17 @@ def get_object(self) -> Optional["PdfObject"]: return None return obj.get_object() + def replace_object(self, obj: "PdfObject") -> None: + """ + Replace the pointed object with obj + Only applies to IndirectObjects within a PdfWriter + """ + pdf = self.pdf + if not hasattr(pdf, "_replace_object"): + raise TypeError("Trying to replace Object in a non PdfWriter") + pdf._replace_object(self.idnum, obj) + obj.indirect_reference = self + def __repr__(self) -> str: return f"IndirectObject({self.idnum!r}, {self.generation!r}, {id(self.pdf)})" diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 9ad98c240..59e28250a 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -1442,6 +1442,226 @@ def additionalActions(self) -> 
Optional[DictionaryObject]: # deprecated return self.additional_actions +class NameTree(DictionaryObject): + """ + Name Tree Structure + Allow to list, get and set objects In a Name Tree + """ + + def __init__(self, obj: Optional[PdfObject] = None) -> None: + if not isinstance(obj, DictionaryObject) or all( + x not in obj for x in ("/Names", "/Kids") + ): + raise ValueError("source object is not a valid source object") + DictionaryObject.__init__(self) + obj = cast(DictionaryObject, obj) + if obj is not None: + self.update(obj) + else: # building a new Name Tree + self[NameObject("/Names")] = ArrayObject() + if hasattr(obj, "indirect_reference"): + self.indirect_reference = obj.indirect_reference + + def list_keys(self) -> List[str]: + """ + Provides the list of keys of the items in the Name Tree + + Returns: + List of str keys + """ + + def _list(o: Optional[PdfObject]) -> List[str]: + if o is None: + return [] + o = cast(DictionaryObject, o) + _l = o.get("/Names", None) + a = o.get("/Kids", None) + _l = _l.get_object() if _l else [] + a = a.get_object() if a else [] + ll = [v for v in _l if isinstance(v, str)] # and v not in ll: + for x in a: + ll.extend(_list(x.get_object())) + # for v in _list(x.get_object()): + # if v not in ll: + # ll.append(v) + return ll + + _l = _list(self) + _l.sort() + return _l + + def list_items(self) -> dict[str, PdfObject]: + """ + Provides the Name Tree Entries as a dictionary + + Returns: + dictionary of objects + """ + + def _list( + o: Optional[PdfObject], lout: List[Tuple[str, PdfObject]] + ) -> List[Tuple[str, PdfObject]]: + def _append_with_dup( + ll: List[Tuple[str, Any]], _l: List[Tuple[str, Any]] + ) -> None: + for k, v in _l: + try: + i = tuple(x[0] for x in ll).index(k) + ll[i][1].append(v) + except ValueError: + ll.append((k, [v])) + + if o is None: + return lout + o = cast(DictionaryObject, o) + _l = o.get("/Names", None) + a = o.get("/Kids", None) + _l = _l.get_object() if _l else [] + a = a.get_object() if a else [] + 
_l = [ + (v, None if isinstance(_l[i + 1], str) else _l[i + 1]) + for i, v in enumerate(_l) + if isinstance(v, str) + ] + # to handle duplicates + _append_with_dup(lout, _l) + for x in a: + # _append_with_dup(lout, _list(x.get_object(),lout)) + _list(x.get_object(), lout) + return lout + + _l: List[Tuple[str, PdfObject]] = [] + _list(self, _l) + return dict(_l) + + def list_get(self, key: str) -> List[PdfObject]: + """ + Get the entry from the Name Tree + + Args: + key: searched entry + + Returns: + matching PdfObject; None i + attributeEntries as a dictionary + """ + + def _get(key: str, o: Optional[PdfObject]) -> List[PdfObject]: + if o is None: + return [] + rst = [] + o = cast(DictionaryObject, o) + _l = o.get("/Names", None) + a = o.get("/Kids", None) + _l = _l.get_object() if _l else [] + a = a.get_object() if a else [] + for i, x in enumerate(_l): + if x == key: + rst.append(_l[i + 1]) + for x in a: + rst.extend(_get(key, x)) + return rst + + return _get(key, self) + + def list_set( + self, key: str, data: PdfObject, overwrite: bool = False + ) -> Optional[IndirectObject]: + """ + Add the data entry from the Name Tree + + Args: + key: entry + data: PdfObject (it will be added to the list of objects + overwrite: allow to overwrite existing key + + Returns: + matching PdfObject; None i + attributeEntries as a dictionary + """ + try: + if self.indirect_reference is None: + raise TypeError + writer = self.indirect_reference.pdf + if not hasattr(writer, "_add_object"): + raise TypeError + except (TypeError, AttributeError): + raise TypeError("Object does not belong to a PdfWriter") + + def _update_limits( + obj: DictionaryObject, lo: Optional[str], hi: Optional[str] + ) -> bool: + if "/Limits" not in obj: + return False + a = cast("ArrayObject", obj["/Limits"]) + if lo is not None and lo < a[0]: + a[0] = TextStringObject(lo) + return True + if hi is not None and hi > a[0]: + a[1] = TextStringObject(lo) + return True + return False + + def _set_in(o: 
Optional[PdfObject], app: bool = True) -> Optional[PdfObject]: + nonlocal overwrite, writer, key, data + if o is None: + return None + o = cast(DictionaryObject, o) + if "/Names" in o: + _l = cast(ArrayObject, o["/Names"]) + li = o.get("/Limits", [_l[0], _l[-2]]) + if key < li[0]: + return None + if not app and _l > li[1]: + return None + i = 0 + while i < len(_l): + if _l[i] == key: + if not overwrite: + continue + d = _l[i + 1] + if isinstance(d, IndirectObject): + d.replace_object(data) + else: # pragma: no cover + # should not occur iaw pdf spec + _l[i + 1] = data + return _l[i + 1] + elif key < _l[i]: + _l.insert(i, key) + _l.insert(i + 1, writer._add_object(data)) + _update_limits(o, key, None) + return _l[i + 1] + i += 1 + if app: + _l.append(key) + _l.append(writer._add_object(data)) + _update_limits(o, key, None) + return _l[-1] + return None + else: # kids + ar = cast(ArrayObject, o["/Kids"]) + for x in ar: + r = _set_in(x, x == ar[-1]) + if r: + _update_limits(o, key, key) + return r + return None + + o = _set_in(self, True) + return o.indirect_reference if o is not None else None + + +def get_name_from_file_specification(_a: DictionaryObject) -> str: + return cast( + str, + _a.get("/UF") + or _a.get("/F") + or _a.get("/DOS") + or _a.get("/Unix") + or _a.get("/Mac"), + ) + + class Destination(TreeObject): """ A class representing a destination within a PDF file. 
diff --git a/tests/test_writer.py b/tests/test_writer.py index c9766f979..81ed09440 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1,1608 +1,1611 @@ -"""Test the pypdf._writer module.""" -import re -import shutil -import subprocess -from io import BytesIO -from pathlib import Path - -import pytest - -from pypdf import ( - ObjectDeletionFlag, - PageObject, - PdfMerger, - PdfReader, - PdfWriter, - Transformation, -) -from pypdf.errors import DeprecationError, PageSizeNotDefinedError, PyPdfError -from pypdf.generic import ( - ArrayObject, - ContentStream, - DictionaryObject, - Fit, - IndirectObject, - NameObject, - NullObject, - NumberObject, - RectangleObject, - StreamObject, - TextStringObject, -) - -from . import get_data_from_url, is_sublist -from .test_images import image_similarity - -TESTS_ROOT = Path(__file__).parent.resolve() -PROJECT_ROOT = TESTS_ROOT.parent -RESOURCE_ROOT = PROJECT_ROOT / "resources" -SAMPLE_ROOT = Path(PROJECT_ROOT) / "sample-files" -GHOSTSCRIPT_BINARY = shutil.which("gs") - - -def test_writer_exception_non_binary(tmp_path, caplog): - src = RESOURCE_ROOT / "pdflatex-outline.pdf" - - reader = PdfReader(src) - writer = PdfWriter() - writer.add_page(reader.pages[0]) - - with open(tmp_path / "out.txt", "w") as fp, pytest.raises(TypeError): - writer.write_stream(fp) - ending = "to write to is not in binary mode. 
It may not be written to correctly.\n" - assert caplog.text.endswith(ending) - - -def test_writer_clone(): - src = RESOURCE_ROOT / "pdflatex-outline.pdf" - - reader = PdfReader(src) - writer = PdfWriter(clone_from=reader) - assert len(writer.pages) == 4 - assert "PageObject" in str(type(writer.pages[0])) - - writer = PdfWriter(clone_from=src) - assert len(writer.pages) == 4 - assert "PageObject" in str(type(writer.pages[0])) - - -def test_writer_clone_bookmarks(): - # Arrange - src = RESOURCE_ROOT / "Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf" - reader = PdfReader(src) - writer = PdfWriter() - - # Act + test cat - cat = "" - - def cat1(p) -> None: - nonlocal cat - cat += p.__repr__() - - writer.clone_document_from_reader(reader, cat1) - assert "/Page" in cat - assert writer.pages[0].raw_get("/Parent") == writer._pages - writer.add_outline_item("Page 1", 0) - writer.add_outline_item("Page 2", 1) - - # Assert - bytes_stream = BytesIO() - writer.write(bytes_stream) - bytes_stream.seek(0) - reader2 = PdfReader(bytes_stream) - assert len(reader2.pages) == len(reader.pages) - assert len(reader2.outline) == 2 - - # test with append - writer = PdfWriter() - writer.append(reader) - writer.add_outline_item("Page 1", 0) - writer.add_outline_item("Page 2", 1) - - # Assert - bytes_stream = BytesIO() - writer.write(bytes_stream) - bytes_stream.seek(0) - reader2 = PdfReader(bytes_stream) - assert len(reader2.pages) == len(reader.pages) - assert len(reader2.outline) == 2 - - -def writer_operate(writer: PdfWriter) -> None: - """ - To test the writer that initialized by each of the four usages. 
- - Args: - writer: A PdfWriter object - """ - pdf_path = RESOURCE_ROOT / "crazyones.pdf" - pdf_outline_path = RESOURCE_ROOT / "pdflatex-outline.pdf" - - reader = PdfReader(pdf_path) - reader_outline = PdfReader(pdf_outline_path) - - page = reader.pages[0] - with pytest.raises(PageSizeNotDefinedError) as exc: - writer.add_blank_page() - assert exc.value.args == () - writer.insert_page(page, 1) - writer.insert_page(reader_outline.pages[0], 0) - writer.add_outline_item_destination(page) - writer.remove_links() - writer.add_outline_item_destination(page) - oi = writer.add_outline_item( - "An outline item", 0, None, (255, 0, 15), True, True, Fit.fit_box_vertically(10) - ) - writer.add_outline_item( - "The XYZ fit", 0, oi, (255, 0, 15), True, True, Fit.xyz(left=10, top=20, zoom=3) - ) - writer.add_outline_item( - "The XYZ fit no args", 0, oi, (255, 0, 15), True, True, Fit.xyz() - ) - writer.add_outline_item( - "The FitH fit", 0, oi, (255, 0, 15), True, True, Fit.fit_horizontally(top=10) - ) - writer.add_outline_item( - "The FitV fit", 0, oi, (255, 0, 15), True, True, Fit.fit_vertically(left=10) - ) - writer.add_outline_item( - "The FitR fit", - 0, - oi, - (255, 0, 15), - True, - True, - Fit.fit_rectangle(left=10, bottom=20, right=30, top=40), - ) - writer.add_outline_item( - "The FitB fit", 0, oi, (255, 0, 15), True, True, Fit.fit_box() - ) - writer.add_outline_item( - "The FitBH fit", - 0, - oi, - (255, 0, 15), - True, - True, - Fit.fit_box_horizontally(top=10), - ) - writer.add_outline_item( - "The FitBV fit", - 0, - oi, - (255, 0, 15), - True, - True, - Fit.fit_box_vertically(left=10), - ) - writer.add_blank_page() - writer.add_uri(2, "https://example.com", RectangleObject([0, 0, 100, 100])) - with pytest.warns( - DeprecationWarning, match="'pagenum' argument of add_uri is deprecated" - ): - writer.add_uri( - 2, "https://example.com", RectangleObject([0, 0, 100, 100]), pagenum=2 - ) - with pytest.raises(DeprecationError): - writer.add_link(2, 1, RectangleObject([0, 
0, 100, 100])) - assert writer._get_page_layout() is None - writer.page_layout = "broken" - assert writer.page_layout == "broken" - writer.page_layout = NameObject("/SinglePage") - assert writer._get_page_layout() == "/SinglePage" - assert writer._get_page_mode() is None - writer.set_page_mode("/UseNone") - assert writer._get_page_mode() == "/UseNone" - writer.set_page_mode(NameObject("/UseOC")) - assert writer._get_page_mode() == "/UseOC" - writer.insert_blank_page(width=100, height=100) - writer.insert_blank_page() # without parameters - - writer.remove_images() - - writer.add_metadata(reader.metadata) - writer.add_metadata({"/Author": "Martin Thoma"}) - writer.add_metadata({"/MyCustom": 1234}) - - writer.add_attachment("foobar.gif", b"foobarcontent") - - # Check that every key in _idnum_hash is correct - objects_hash = [o.hash_value() for o in writer._objects] - for k, v in writer._idnum_hash.items(): - assert v.pdf == writer - assert k in objects_hash, f"Missing {v}" - - -tmp_path = "dont_commit_writer.pdf" - - -@pytest.mark.parametrize( - ("write_data_here", "needs_cleanup"), - [ - ("dont_commit_writer.pdf", True), - (Path("dont_commit_writer.pdf"), True), - (BytesIO(), False), - ], -) -def test_writer_operations_by_traditional_usage(write_data_here, needs_cleanup): - writer = PdfWriter() - - writer_operate(writer) - - # finally, write "output" to pypdf-output.pdf - if needs_cleanup: - with open(write_data_here, "wb") as output_stream: - writer.write(output_stream) - else: - output_stream = write_data_here - writer.write(output_stream) - - if needs_cleanup: - Path(write_data_here).unlink() - - -@pytest.mark.parametrize( - ("write_data_here", "needs_cleanup"), - [ - ("dont_commit_writer.pdf", True), - (Path("dont_commit_writer.pdf"), True), - (BytesIO(), False), - ], -) -def test_writer_operations_by_semi_traditional_usage(write_data_here, needs_cleanup): - with PdfWriter() as writer: - writer_operate(writer) - - # finally, write "output" to pypdf-output.pdf - 
if needs_cleanup: - with open(write_data_here, "wb") as output_stream: - writer.write(output_stream) - else: - output_stream = write_data_here - writer.write(output_stream) - - if needs_cleanup: - Path(write_data_here).unlink() - - -@pytest.mark.parametrize( - ("write_data_here", "needs_cleanup"), - [ - ("dont_commit_writer.pdf", True), - (Path("dont_commit_writer.pdf"), True), - (BytesIO(), False), - ], -) -def test_writer_operations_by_semi_new_traditional_usage( - write_data_here, needs_cleanup -): - with PdfWriter() as writer: - writer_operate(writer) - - # finally, write "output" to pypdf-output.pdf - writer.write(write_data_here) - - if needs_cleanup: - Path(write_data_here).unlink() - - -@pytest.mark.parametrize( - ("write_data_here", "needs_cleanup"), - [ - ("dont_commit_writer.pdf", True), - (Path("dont_commit_writer.pdf"), True), - (BytesIO(), False), - ], -) -def test_writer_operation_by_new_usage(write_data_here, needs_cleanup): - # This includes write "output" to pypdf-output.pdf - with PdfWriter(write_data_here) as writer: - writer_operate(writer) - - if needs_cleanup: - Path(write_data_here).unlink() - - -@pytest.mark.parametrize( - "input_path", - [ - "side-by-side-subfig.pdf", - "reportlab-inline-image.pdf", - ], -) -def test_remove_images(pdf_file_path, input_path): - pdf_path = RESOURCE_ROOT / input_path - - reader = PdfReader(pdf_path) - writer = PdfWriter() - - page = reader.pages[0] - writer.insert_page(page, 0) - writer.remove_images() - page_contents_stream = writer.pages[0]["/Contents"]._data - assert len(page_contents_stream.strip()) - - # finally, write "output" to pypdf-output.pdf - with open(pdf_file_path, "wb") as output_stream: - writer.write(output_stream) - - with open(pdf_file_path, "rb") as input_stream: - reader = PdfReader(input_stream) - if input_path == "side-by-side-subfig.pdf": - extracted_text = reader.pages[0].extract_text() - assert extracted_text - assert "Lorem ipsum dolor sit amet" in extracted_text - - 
-@pytest.mark.enable_socket() -def test_remove_images_sub_level(): - """Cf #2035""" - url = "https://github.com/py-pdf/pypdf/files/12394781/2210.03142-1.pdf" - name = "iss2103.pdf" - writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name))) - writer.remove_images() - assert ( - len( - [ - o.get_object() - for o in writer.pages[0]["/Resources"]["/XObject"]["/Fm1"][ - "/Resources" - ]["/XObject"]["/Im1"]["/Resources"]["/XObject"].values() - if not isinstance(o.get_object(), NullObject) - ] - ) - == 0 - ) - - -@pytest.mark.parametrize( - "input_path", - [ - "side-by-side-subfig.pdf", - "reportlab-inline-image.pdf", - ], -) -def test_remove_text(input_path, pdf_file_path): - pdf_path = RESOURCE_ROOT / input_path - - reader = PdfReader(pdf_path) - writer = PdfWriter() - - page = reader.pages[0] - writer.insert_page(page, 0) - writer.remove_text() - - # finally, write "output" to pypdf-output.pdf - with open(pdf_file_path, "wb") as output_stream: - writer.write(output_stream) - - -def test_remove_text_all_operators(pdf_file_path): - stream = ( - b"BT " - b"/F0 36 Tf " - b"50 706 Td " - b"36 TL " - b"(The Tj operator) Tj " - b'1 2 (The double quote operator) " ' - b"(The single quote operator) ' " - b"ET" - ) - pdf_data = ( - b"%%PDF-1.7\n" - b"1 0 obj << /Count 1 /Kids [5 0 R] /Type /Pages >> endobj\n" - b"2 0 obj << >> endobj\n" - b"3 0 obj << >> endobj\n" - b"4 0 obj << /Length %d >>\n" - b"stream\n" + (b"%s\n" % stream) + b"endstream\n" - b"endobj\n" - b"5 0 obj << /Contents 4 0 R /CropBox [0.0 0.0 2550.0 3508.0]\n" - b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R" - b" /Resources << /Font << >> >>" - b" /Rotate 0 /Type /Page >> endobj\n" - b"6 0 obj << /Pages 1 0 R /Type /Catalog >> endobj\n" - b"xref 1 6\n" - b"%010d 00000 n\n" - b"%010d 00000 n\n" - b"%010d 00000 n\n" - b"%010d 00000 n\n" - b"%010d 00000 n\n" - b"%010d 00000 n\n" - b"trailer << /Root 6 0 R /Size 6 >>\n" - b"startxref\n%d\n" - b"%%%%EOF" - ) - startx_correction = -1 - pdf_data = 
pdf_data % ( - len(stream), - pdf_data.find(b"1 0 obj") + startx_correction, - pdf_data.find(b"2 0 obj") + startx_correction, - pdf_data.find(b"3 0 obj") + startx_correction, - pdf_data.find(b"4 0 obj") + startx_correction, - pdf_data.find(b"5 0 obj") + startx_correction, - pdf_data.find(b"6 0 obj") + startx_correction, - # startx_correction should be -1 due to double % at the beginning - # inducing an error on startxref computation - pdf_data.find(b"xref"), - ) - pdf_stream = BytesIO(pdf_data) - - reader = PdfReader(pdf_stream, strict=False) - writer = PdfWriter() - - page = reader.pages[0] - writer.insert_page(page, 0) - writer.remove_text() - - # finally, write "output" to pypdf-output.pdf - with open(pdf_file_path, "wb") as output_stream: - writer.write(output_stream) - - -def test_write_metadata(pdf_file_path): - pdf_path = RESOURCE_ROOT / "crazyones.pdf" - - reader = PdfReader(pdf_path) - writer = PdfWriter() - - writer.add_page(reader.pages[0]) - for page in reader.pages: - writer.add_page(page) - - metadata = reader.metadata - writer.add_metadata(metadata) - - writer.add_metadata({"/Title": "The Crazy Ones"}) - - # finally, write data to pypdf-output.pdf - with open(pdf_file_path, "wb") as output_stream: - writer.write(output_stream) - - # Check if the title was set - reader = PdfReader(pdf_file_path) - metadata = reader.metadata - assert metadata.get("/Title") == "The Crazy Ones" - - -def test_fill_form(pdf_file_path): - reader = PdfReader(RESOURCE_ROOT / "form.pdf") - writer = PdfWriter() - - writer.append(reader, [0]) - writer.append(RESOURCE_ROOT / "crazyones.pdf", [0]) - - writer.update_page_form_field_values( - writer.pages[0], {"foo": "some filled in text"}, flags=1 - ) - - # check if no fields to fill in the page - writer.update_page_form_field_values( - writer.pages[1], {"foo": "some filled in text"}, flags=1 - ) - - writer.update_page_form_field_values( - writer.pages[0], {"foo": "some filled in text"} - ) - - # write "output" to pypdf-output.pdf 
- with open(pdf_file_path, "wb") as output_stream: - writer.write(output_stream) - - -def test_fill_form_with_qualified(): - reader = PdfReader(RESOURCE_ROOT / "form.pdf") - reader.add_form_topname("top") - - writer = PdfWriter() - writer.clone_document_from_reader(reader) - writer.add_page(reader.pages[0]) - writer.update_page_form_field_values( - writer.pages[0], {"top.foo": "filling"}, flags=1 - ) - b = BytesIO() - writer.write(b) - - reader2 = PdfReader(b) - fields = reader2.get_fields() - assert fields["top.foo"]["/V"] == "filling" - - -@pytest.mark.parametrize( - ("use_128bit", "user_password", "owner_password"), - [(True, "userpwd", "ownerpwd"), (False, "userpwd", "ownerpwd")], -) -def test_encrypt(use_128bit, user_password, owner_password, pdf_file_path): - reader = PdfReader(RESOURCE_ROOT / "form.pdf") - writer = PdfWriter() - - page = reader.pages[0] - orig_text = page.extract_text() - - writer.add_page(page) - - with pytest.raises(ValueError, match="owner_pwd of encrypt is deprecated."): - writer.encrypt( - owner_pwd=user_password, - owner_password=owner_password, - user_password=user_password, - use_128bit=use_128bit, - ) - with pytest.raises(ValueError, match="'user_pwd' argument is deprecated"): - writer.encrypt( - owner_password=owner_password, - user_password=user_password, - user_pwd=user_password, - use_128bit=use_128bit, - ) - writer.encrypt( - user_password=user_password, - owner_password=owner_password, - use_128bit=use_128bit, - ) - - # write "output" to pypdf-output.pdf - with open(pdf_file_path, "wb") as output_stream: - writer.write(output_stream) - - # Test that the data is not there in clear text - with open(pdf_file_path, "rb") as input_stream: - data = input_stream.read() - assert b"foo" not in data - - # Test the user password (str): - reader = PdfReader(pdf_file_path, password="userpwd") - new_text = reader.pages[0].extract_text() - assert reader.metadata.get("/Producer") == "pypdf" - assert new_text == orig_text - - # Test the owner 
password (str): - reader = PdfReader(pdf_file_path, password="ownerpwd") - new_text = reader.pages[0].extract_text() - assert reader.metadata.get("/Producer") == "pypdf" - assert new_text == orig_text - - # Test the user password (bytes): - reader = PdfReader(pdf_file_path, password=b"userpwd") - new_text = reader.pages[0].extract_text() - assert reader.metadata.get("/Producer") == "pypdf" - assert new_text == orig_text - - # Test the owner password (stbytesr): - reader = PdfReader(pdf_file_path, password=b"ownerpwd") - new_text = reader.pages[0].extract_text() - assert reader.metadata.get("/Producer") == "pypdf" - assert new_text == orig_text - - -def test_add_outline_item(pdf_file_path): - reader = PdfReader(RESOURCE_ROOT / "pdflatex-outline.pdf") - writer = PdfWriter() - - for page in reader.pages: - writer.add_page(page) - - outline_item = writer.add_outline_item( - "An outline item", - 1, - None, - (255, 0, 15), - True, - True, - Fit.fit(), - is_open=False, - ) - _o2a = writer.add_outline_item( - "Another", 2, outline_item, None, False, False, Fit.fit() - ) - _o2b = writer.add_outline_item( - "Another bis", 2, outline_item, None, False, False, Fit.fit() - ) - outline_item2 = writer.add_outline_item( - "An outline item 2", - 1, - None, - (255, 0, 15), - True, - True, - Fit.fit(), - is_open=True, - ) - _o3a = writer.add_outline_item( - "Another 2", 2, outline_item2, None, False, False, Fit.fit() - ) - _o3b = writer.add_outline_item( - "Another 2bis", 2, outline_item2, None, False, False, Fit.fit() - ) - - # write "output" to pypdf-output.pdf - with open(pdf_file_path, "w+b") as output_stream: - writer.write(output_stream) - output_stream.seek(0) - reader = PdfReader(output_stream) - assert reader.trailer["/Root"]["/Outlines"]["/Count"] == 3 - assert reader.outline[0]["/Count"] == -2 - assert reader.outline[0]["/%is_open%"] == False # noqa - assert reader.outline[2]["/Count"] == 2 - assert reader.outline[2]["/%is_open%"] == True # noqa - assert 
reader.outline[1][0]["/Count"] == 0 - - -def test_add_named_destination(pdf_file_path): - reader = PdfReader(RESOURCE_ROOT / "pdflatex-outline.pdf") - writer = PdfWriter() - assert writer.get_named_dest_root() == [] - - for page in reader.pages: - writer.add_page(page) - - assert writer.get_named_dest_root() == [] - - writer.add_named_destination(TextStringObject("A named dest"), 2) - writer.add_named_destination(TextStringObject("A named dest2"), 2) - - with pytest.warns(DeprecationWarning, match="pagenum is deprecated as an argument"): - writer.add_named_destination(TextStringObject("A named dest3"), pagenum=2) - - with pytest.raises(ValueError): - writer.add_named_destination( - TextStringObject("A named dest3"), pagenum=2, page_number=2 - ) - - root = writer.get_named_dest_root() - assert root[0] == "A named dest" - assert root[1].pdf == writer - assert root[1].get_object()["/S"] == NameObject("/GoTo") - assert root[1].get_object()["/D"][0] == writer.pages[2].indirect_reference - assert root[2] == "A named dest2" - assert root[3].pdf == writer - assert root[3].get_object()["/S"] == NameObject("/GoTo") - assert root[3].get_object()["/D"][0] == writer.pages[2].indirect_reference - assert root[4] == "A named dest3" - - # test get_object - - assert writer.get_object(root[1].idnum) == writer.get_object(root[1]) - with pytest.raises(ValueError) as exc: - writer.get_object(reader.pages[0].indirect_reference) - assert exc.value.args[0] == "pdf must be self" - - # write "output" to pypdf-output.pdf - with open(pdf_file_path, "wb") as output_stream: - writer.write(output_stream) - - -def test_add_named_destination_sort_order(pdf_file_path): - """ - Issue #1927 does not appear. 
- - add_named_destination() maintains the named destination list sort order - """ - writer = PdfWriter() - - assert writer.get_named_dest_root() == [] - - writer.add_blank_page(200, 200) - writer.add_named_destination("b", 0) - # "a" should be moved before "b" on insert - writer.add_named_destination("a", 0) - - root = writer.get_named_dest_root() - - assert len(root) == 4 - assert ( - root[0] == "a" - ), '"a" was not inserted before "b" in the named destination root' - assert root[2] == "b" - - # write "output" to pypdf-output.pdf - with open(pdf_file_path, "wb") as output_stream: - writer.write(output_stream) - - -def test_add_uri(pdf_file_path): - reader = PdfReader(RESOURCE_ROOT / "pdflatex-outline.pdf") - writer = PdfWriter() - - for page in reader.pages: - writer.add_page(page) - - writer.add_uri( - 1, - "http://www.example.com", - RectangleObject([0, 0, 100, 100]), - border=[1, 2, 3, [4]], - ) - writer.add_uri( - 2, - "https://pypdf.readthedocs.io/en/latest/", - RectangleObject([20, 30, 50, 80]), - border=[1, 2, 3], - ) - writer.add_uri( - 3, - "https://pypdf.readthedocs.io/en/latest/user/adding-pdf-annotations.html", - "[ 200 300 250 350 ]", - border=[0, 0, 0], - ) - writer.add_uri( - 3, - "https://pypdf.readthedocs.io/en/latest/user/adding-pdf-annotations.html", - [100, 200, 150, 250], - border=[0, 0, 0], - ) - - # write "output" to pypdf-output.pdf - with open(pdf_file_path, "wb") as output_stream: - writer.write(output_stream) - - -def test_add_link(pdf_file_path): - reader = PdfReader(RESOURCE_ROOT / "pdflatex-outline.pdf") - writer = PdfWriter() - - for page in reader.pages: - writer.add_page(page) - - with pytest.raises( - DeprecationError, - match=( - re.escape( - "add_link is deprecated and was removed in pypdf 3.0.0. " - "Use add_annotation(pypdf.annotations.Link(...)) instead." 
- ) - ), - ): - writer.add_link( - 1, - 2, - RectangleObject([0, 0, 100, 100]), - border=[1, 2, 3, [4]], - fit="/Fit", - ) - writer.add_link( - 2, 3, RectangleObject([20, 30, 50, 80]), [1, 2, 3], "/FitH", None - ) - writer.add_link( - 3, - 0, - "[ 200 300 250 350 ]", - [0, 0, 0], - "/XYZ", - 0, - 0, - 2, - ) - writer.add_link( - 3, - 0, - [100, 200, 150, 250], - border=[0, 0, 0], - ) - - # write "output" to pypdf-output.pdf - with open(pdf_file_path, "wb") as output_stream: - writer.write(output_stream) - - -def test_io_streams(): - """This is the example from the docs ("Streaming data").""" - filepath = RESOURCE_ROOT / "pdflatex-outline.pdf" - with open(filepath, "rb") as fh: - bytes_stream = BytesIO(fh.read()) - - # Read from bytes stream - reader = PdfReader(bytes_stream) - assert len(reader.pages) == 4 - - # Write to bytes stream - writer = PdfWriter() - with BytesIO() as output_stream: - writer.write(output_stream) - - -def test_regression_issue670(pdf_file_path): - filepath = RESOURCE_ROOT / "crazyones.pdf" - reader = PdfReader(filepath, strict=False) - for _ in range(2): - writer = PdfWriter() - writer.add_page(reader.pages[0]) - with open(pdf_file_path, "wb") as f_pdf: - writer.write(f_pdf) - - -def test_issue301(): - """Test with invalid stream length object.""" - with open(RESOURCE_ROOT / "issue-301.pdf", "rb") as f: - reader = PdfReader(f) - writer = PdfWriter() - writer.append_pages_from_reader(reader) - b = BytesIO() - writer.write(b) - - -def test_append_pages_from_reader_append(): - """Use append_pages_from_reader with a callable.""" - with open(RESOURCE_ROOT / "issue-301.pdf", "rb") as f: - reader = PdfReader(f) - writer = PdfWriter() - writer.append_pages_from_reader(reader, callable) - b = BytesIO() - writer.write(b) - - -@pytest.mark.enable_socket() -@pytest.mark.slow() -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_sweep_indirect_references_nullobject_exception(pdf_file_path): - # TODO: Check this more closely... 
this looks weird - url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" - name = "tika-924666.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - merger = PdfMerger() - merger.append(reader) - merger.write(pdf_file_path) - - -@pytest.mark.enable_socket() -@pytest.mark.slow() -@pytest.mark.parametrize( - ("url", "name"), - [ - ( - "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf", - "test_sweep_indirect_references_nullobject_exception.pdf", - ), - ( - "https://corpora.tika.apache.org/base/docs/govdocs1/922/922840.pdf", - "test_write_outline_item_on_page_fitv.pdf", - ), - ("https://github.com/py-pdf/pypdf/files/10715624/test.pdf", "iss1627.pdf"), - ], -) -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_some_appends(pdf_file_path, url, name): - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - # PdfMerger - merger = PdfMerger() - merger.append(reader) - merger.write(pdf_file_path) - # PdfWriter - merger = PdfWriter() - merger.append(reader) - merger.write(pdf_file_path) - - -def test_pdf_header(): - writer = PdfWriter() - assert writer.pdf_header == b"%PDF-1.3" - - reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") - writer.add_page(reader.pages[0]) - assert writer.pdf_header == b"%PDF-1.5" - - writer.pdf_header = b"%PDF-1.6" - assert writer.pdf_header == b"%PDF-1.6" - - -def test_write_dict_stream_object(pdf_file_path): - stream = ( - b"BT " - b"/F0 36 Tf " - b"50 706 Td " - b"36 TL " - b"(The Tj operator) Tj " - b'1 2 (The double quote operator) " ' - b"(The single quote operator) ' " - b"ET" - ) - - stream_object = StreamObject() - stream_object[NameObject("/Type")] = NameObject("/Text") - stream_object._data = stream - - writer = PdfWriter() - - page_object = PageObject.create_blank_page(writer, 1000, 1000) - # Construct dictionary object (PageObject) with stream object - # Writer will replace this stream object with indirect object - page_object[NameObject("/Test")] = 
stream_object - - page_object = writer.add_page(page_object) - with open(pdf_file_path, "wb") as fp: - writer.write(fp) - - for k, v in page_object.items(): - if k == "/Test": - assert str(v) != str(stream_object) - assert isinstance(v, IndirectObject) - assert str(v.get_object()) == str(stream_object) - break - else: - pytest.fail("/Test not found") - - # Check that every key in _idnum_hash is correct - objects_hash = [o.hash_value() for o in writer._objects] - for k, v in writer._idnum_hash.items(): - assert v.pdf == writer - assert k in objects_hash, "Missing %s" % v - - -def test_add_single_annotation(pdf_file_path): - pdf_path = RESOURCE_ROOT / "crazyones.pdf" - reader = PdfReader(pdf_path) - page = reader.pages[0] - writer = PdfWriter() - writer.add_page(page) - - annot_dict = { - "/Type": "/Annot", - "/Subtype": "/Text", - "/Rect": [270.75, 596.25, 294.75, 620.25], - "/Contents": "Note in second paragraph", - "/C": [1, 1, 0], - "/M": "D:20220406191858+02'00", - "/Popup": { - "/Type": "/Annot", - "/Subtype": "/Popup", - "/Rect": [294.75, 446.25, 494.75, 596.25], - "/M": "D:20220406191847+02'00", - }, - "/T": "moose", - } - writer.add_annotation(0, annot_dict) - - # Inspect manually by adding 'assert False' and viewing the PDF - with open(pdf_file_path, "wb") as fp: - writer.write(fp) - - -def test_deprecation_bookmark_decorator(): - reader = PdfReader(RESOURCE_ROOT / "outlines-with-invalid-destinations.pdf") - page = reader.pages[0] - outline_item = reader.outline[0] - writer = PdfWriter() - writer.add_page(page) - with pytest.raises( - DeprecationError, - match="bookmark is deprecated as an argument. 
Use outline_item instead", - ): - writer.add_outline_item_dict(bookmark=outline_item) - - -@pytest.mark.samples() -def test_colors_in_outline_item(pdf_file_path): - reader = PdfReader(SAMPLE_ROOT / "004-pdflatex-4-pages/pdflatex-4-pages.pdf") - writer = PdfWriter() - writer.clone_document_from_reader(reader) - purple_rgb = (0.5019607843137255, 0.0, 0.5019607843137255) - writer.add_outline_item("First Outline Item", page_number=2, color="800080") - writer.add_outline_item("Second Outline Item", page_number=3, color="#800080") - writer.add_outline_item("Third Outline Item", page_number=4, color=purple_rgb) - - with open(pdf_file_path, "wb") as f: - writer.write(f) - - reader2 = PdfReader(pdf_file_path) - for outline_item in reader2.outline: - # convert float to string because of mutability - assert [str(c) for c in outline_item.color] == [str(p) for p in purple_rgb] - - -@pytest.mark.samples() -def test_write_empty_stream(): - reader = PdfReader(SAMPLE_ROOT / "004-pdflatex-4-pages/pdflatex-4-pages.pdf") - writer = PdfWriter() - writer.clone_document_from_reader(reader) - - with pytest.raises(ValueError) as exc: - writer.write("") - assert exc.value.args[0] == "Output(stream=) is empty." 
- - -def test_startup_dest(): - pdf_file_writer = PdfWriter() - pdf_file_writer.append_pages_from_reader(PdfReader(RESOURCE_ROOT / "issue-604.pdf")) - - assert pdf_file_writer.open_destination is None - pdf_file_writer.open_destination = pdf_file_writer.pages[9] - # checked also using Acrobrat to verify the good page is opened - op = pdf_file_writer._root_object["/OpenAction"] - assert op[0] == pdf_file_writer.pages[9].indirect_reference - assert op[1] == "/Fit" - op = pdf_file_writer.open_destination - assert op.raw_get("/Page") == pdf_file_writer.pages[9].indirect_reference - assert op["/Type"] == "/Fit" - pdf_file_writer.open_destination = op - assert pdf_file_writer.open_destination == op - - # irrelevant, just for coverage - pdf_file_writer._root_object[NameObject("/OpenAction")][0] = NumberObject(0) - pdf_file_writer.open_destination - with pytest.raises(Exception) as exc: - del pdf_file_writer._root_object[NameObject("/OpenAction")][0] - pdf_file_writer.open_destination - assert "Invalid Destination" in str(exc.value) - - pdf_file_writer.open_destination = "Test" - # checked also using Acrobrat to verify open_destination - op = pdf_file_writer._root_object["/OpenAction"] - assert isinstance(op, TextStringObject) - assert op == "Test" - op = pdf_file_writer.open_destination - assert isinstance(op, TextStringObject) - assert op == "Test" - - # irrelevant, this is just for coverage - pdf_file_writer._root_object[NameObject("/OpenAction")] = NumberObject(0) - assert pdf_file_writer.open_destination is None - pdf_file_writer.open_destination = None - assert "/OpenAction" not in pdf_file_writer._root_object - pdf_file_writer.open_destination = None - - -@pytest.mark.enable_socket() -def test_iss471(): - url = "https://github.com/py-pdf/pypdf/files/9139245/book.pdf" - name = "book_471.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - - writer = PdfWriter() - writer.append(reader, excluded_fields=[]) - assert isinstance( - 
writer.pages[0]["/Annots"][0].get_object()["/Dest"], TextStringObject - ) - - -@pytest.mark.enable_socket() -def test_reset_translation(): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" - name = "tika-924666.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer = PdfWriter() - writer.append(reader, (0, 10)) - nb = len(writer._objects) - writer.append(reader, (0, 10)) - assert ( - len(writer._objects) == nb + 11 - ) # +10 (pages) +1 because of the added outline - nb += 1 - writer.reset_translation(reader) - writer.append(reader, (0, 10)) - assert len(writer._objects) >= nb + 200 - nb = len(writer._objects) - writer.reset_translation(reader.pages[0].indirect_reference) - writer.append(reader, (0, 10)) - assert len(writer._objects) >= nb + 200 - nb = len(writer._objects) - writer.reset_translation() - writer.append(reader, (0, 10)) - assert len(writer._objects) >= nb + 200 - nb = len(writer.pages) - writer.append(reader, [reader.pages[0], reader.pages[0]]) - assert len(writer.pages) == nb + 2 - - -def test_threads_empty(): - writer = PdfWriter() - thr = writer.threads - assert isinstance(thr, ArrayObject) - assert len(thr) == 0 - thr2 = writer.threads - assert thr == thr2 - - -@pytest.mark.enable_socket() -def test_append_without_annots_and_articles(): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" - name = "tika-924666.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer = PdfWriter() - writer.append(reader, None, (0, 10), True, ["/B"]) - writer.reset_translation() - writer.append(reader, (0, 10), True, ["/B"]) - assert writer.threads == [] - writer = PdfWriter() - writer.append(reader, None, (0, 10), True, ["/Annots"]) - assert "/Annots" not in writer.pages[5] - writer = PdfWriter() - writer.append(reader, None, (0, 10), True, []) - assert "/Annots" in writer.pages[5] - assert len(writer.threads) >= 1 - - -@pytest.mark.enable_socket() -def 
test_append_multiple(): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" - name = "tika-924666.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer = PdfWriter() - writer.append( - reader, [0, 0, 0] - ) # to demonstre multiple insertion of same page at once - writer.append(reader, [0, 0, 0]) # second pack - pages = writer._root_object["/Pages"]["/Kids"] - assert pages[0] not in pages[1:] # page not repeated - assert pages[-1] not in pages[0:-1] # page not repeated - - -@pytest.mark.samples() -def test_set_page_label(pdf_file_path): - src = RESOURCE_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf" # File without labels - reader = PdfReader(src) - - expected = [ - "i", - "ii", - "1", - "2", - "A", - "B", - "1", - "2", - "3", - "4", - "A", - "i", - "I", - "II", - "1", - "2", - "3", - "I", - "II", - ] - - # Tests full lenght with labels assigned at first and last elements - # Tests different labels assigned to consecutive ranges - writer = PdfWriter() - writer.clone_document_from_reader(reader) - writer.set_page_label(0, 1, "/r") - writer.set_page_label(4, 5, "/A") - writer.set_page_label(10, 10, "/A") - writer.set_page_label(11, 11, "/r") - writer.set_page_label(12, 13, "/R") - writer.set_page_label(17, 18, "/R") - writer.write(pdf_file_path) - assert PdfReader(pdf_file_path).page_labels == expected - - writer = PdfWriter() # Same labels, different set order - writer.clone_document_from_reader(reader) - writer.set_page_label(17, 18, "/R") - writer.set_page_label(4, 5, "/A") - writer.set_page_label(10, 10, "/A") - writer.set_page_label(0, 1, "/r") - writer.set_page_label(12, 13, "/R") - writer.set_page_label(11, 11, "/r") - writer.write(pdf_file_path) - assert PdfReader(pdf_file_path).page_labels == expected - - # Tests labels assigned only in the middle - # Tests label assigned to a range already containing labled ranges - expected = ["1", "2", "i", "ii", "iii", "iv", "v", "1"] - writer = PdfWriter() - 
writer.clone_document_from_reader(reader) - writer.set_page_label(3, 4, "/a") - writer.set_page_label(5, 5, "/A") - writer.set_page_label(2, 6, "/r") - writer.write(pdf_file_path) - assert PdfReader(pdf_file_path).page_labels[: len(expected)] == expected - - # Tests labels assigned inside a previously existing range - expected = ["1", "2", "i", "a", "b", "A", "1", "1", "2"] - # Ones repeat because user didnt cover the entire original range - writer = PdfWriter() - writer.clone_document_from_reader(reader) - writer.set_page_label(2, 6, "/r") - writer.set_page_label(3, 4, "/a") - writer.set_page_label(5, 5, "/A") - writer.write(pdf_file_path) - assert PdfReader(pdf_file_path).page_labels[: len(expected)] == expected - - # Tests invalid user input - writer = PdfWriter() - writer.clone_document_from_reader(reader) - with pytest.raises( - ValueError, match="at least one between style and prefix must be given" - ): - writer.set_page_label(0, 5, start=2) - with pytest.raises( - ValueError, match="page_index_from must be equal or greater then 0" - ): - writer.set_page_label(-1, 5, "/r") - with pytest.raises( - ValueError, match="page_index_to must be equal or greater then page_index_from" - ): - writer.set_page_label(5, 0, "/r") - with pytest.raises(ValueError, match="page_index_to exceeds number of pages"): - writer.set_page_label(0, 19, "/r") - with pytest.raises( - ValueError, match="if given, start must be equal or greater than one" - ): - writer.set_page_label(0, 5, "/r", start=-1) - - pdf_file_path.unlink() - - src = ( - SAMPLE_ROOT / "009-pdflatex-geotopo/GeoTopo.pdf" - ) # File with pre existing labels - reader = PdfReader(src) - - # Tests adding labels to existing ones - expected = ["i", "ii", "A", "B", "1"] - writer = PdfWriter() - writer.clone_document_from_reader(reader) - writer.set_page_label(2, 3, "/A") - writer.write(pdf_file_path) - assert PdfReader(pdf_file_path).page_labels[: len(expected)] == expected - - # Tests replacing existing lables - expected = 
["A", "B", "1", "1", "2"] - writer = PdfWriter() - writer.clone_document_from_reader(reader) - writer.set_page_label(0, 1, "/A") - writer.write(pdf_file_path) - assert PdfReader(pdf_file_path).page_labels[: len(expected)] == expected - - pdf_file_path.unlink() - - # Tests prefix and start. - src = RESOURCE_ROOT / "issue-604.pdf" # File without page labels - reader = PdfReader(src) - writer = PdfWriter() - writer.clone_document_from_reader(reader) - - writer.set_page_label(0, 0, prefix="FRONT") - writer.set_page_label(1, 2, "/D", start=2) - writer.set_page_label(3, 6, prefix="UPDATES") - writer.set_page_label(7, 10, "/D", prefix="THYR-") - writer.set_page_label(11, 21, "/D", prefix="PAP-") - writer.set_page_label(22, 30, "/D", prefix="FOLL-") - writer.set_page_label(31, 39, "/D", prefix="HURT-") - writer.write(pdf_file_path) - - -@pytest.mark.enable_socket() -def test_iss1601(): - url = "https://github.com/py-pdf/pypdf/files/10579503/badges-38.pdf" - name = "badge-38.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - original_cs_operations = ContentStream( - reader.pages[0].get_contents(), reader - ).operations - writer = PdfWriter() - page_1 = writer.add_blank_page( - reader.pages[0].mediabox[2], reader.pages[0].mediabox[3] - ) - page_1.merge_transformed_page(reader.pages[0], Transformation()) - page_1_cs_operations = page_1.get_contents().operations - assert is_sublist(original_cs_operations, page_1_cs_operations) - page_1 = writer.add_blank_page( - reader.pages[0].mediabox[2], reader.pages[0].mediabox[3] - ) - page_1.merge_page(reader.pages[0]) - page_1_cs_operations = page_1.get_contents().operations - assert is_sublist(original_cs_operations, page_1_cs_operations) - - -def test_attachments(): - writer = PdfWriter() - writer.add_blank_page(100, 100) - b = BytesIO() - writer.write(b) - b.seek(0) - reader = PdfReader(b) - b = None - assert reader.attachments == {} - assert reader._list_attachments() == [] - assert reader._get_attachments() == 
{} - to_add = [ - ("foobar.txt", b"foobarcontent"), - ("foobar2.txt", b"foobarcontent2"), - ("foobar2.txt", b"2nd_foobarcontent"), - ] - for name, content in to_add: - writer.add_attachment(name, content) - - b = BytesIO() - writer.write(b) - b.seek(0) - reader = PdfReader(b) - b = None - assert sorted(reader.attachments.keys()) == sorted({name for name, _ in to_add}) - assert str(reader.attachments) == "LazyDict(keys=['foobar.txt', 'foobar2.txt'])" - assert reader._list_attachments() == [name for name, _ in to_add] - - # We've added the same key twice - hence only 2 and not 3: - att = reader._get_attachments() - assert len(att) == 2 # we have 2 keys, but 3 attachments! - - # The content for foobar.txt is clear and just a single value: - assert att["foobar.txt"] == b"foobarcontent" - - # The content for foobar2.txt is a list! - att = reader._get_attachments("foobar2.txt") - assert len(att) == 1 - assert att["foobar2.txt"] == [b"foobarcontent2", b"2nd_foobarcontent"] - - # Let's do both cases with the public interface: - assert reader.attachments["foobar.txt"][0] == b"foobarcontent" - assert reader.attachments["foobar2.txt"][0] == b"foobarcontent2" - assert reader.attachments["foobar2.txt"][1] == b"2nd_foobarcontent" - - -@pytest.mark.enable_socket() -def test_iss1614(): - # test of an annotation(link) directly stored in the /Annots in the page - url = "https://github.com/py-pdf/pypdf/files/10669995/broke.pdf" - name = "iss1614.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer = PdfWriter() - writer.append(reader) - # test for 2nd error case reported in #1614 - url = "https://github.com/py-pdf/pypdf/files/10696390/broken.pdf" - name = "iss1614.2.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer.append(reader) - - -@pytest.mark.enable_socket() -def test_new_removes(): - # test of an annotation(link) directly stored in the /Annots in the page - url = "https://github.com/py-pdf/pypdf/files/10807951/tt.pdf" - 
name = "iss1650.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - - writer = PdfWriter() - writer.clone_document_from_reader(reader) - writer.remove_images() - b = BytesIO() - writer.write(b) - bb = bytes(b.getbuffer()) - assert b"/Im0 Do" not in bb - assert b"/Fm0 Do" in bb - assert b" TJ" in bb - - writer = PdfWriter() - writer.clone_document_from_reader(reader) - writer.remove_text() - b = BytesIO() - writer.write(b) - bb = bytes(b.getbuffer()) - assert b"/Im0" in bb - assert b"Chap" not in bb - assert b" TJ" not in bb - - url = "https://github.com/py-pdf/pypdf/files/10832029/tt2.pdf" - name = "GeoBaseWithComments.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer.append(reader) - writer.remove_objects_from_page(writer.pages[0], [ObjectDeletionFlag.LINKS]) - assert "/Links" not in [ - a.get_object()["/Subtype"] for a in writer.pages[0]["/Annots"] - ] - writer.remove_objects_from_page(writer.pages[0], ObjectDeletionFlag.ATTACHMENTS) - assert "/FileAttachment" not in [ - a.get_object()["/Subtype"] for a in writer.pages[0]["/Annots"] - ] - - writer.pages[0]["/Annots"].append( - DictionaryObject({NameObject("/Subtype"): TextStringObject("/3D")}) - ) - assert "/3D" in [a.get_object()["/Subtype"] for a in writer.pages[0]["/Annots"]] - writer.remove_objects_from_page(writer.pages[0], ObjectDeletionFlag.OBJECTS_3D) - assert "/3D" not in [a.get_object()["/Subtype"] for a in writer.pages[0]["/Annots"]] - - writer.remove_links() - assert len(writer.pages[0]["/Annots"]) == 0 - assert len(writer.pages[3]["/Annots"]) == 0 - - writer.remove_annotations("/Text") - - -@pytest.mark.enable_socket() -def test_late_iss1654(): - url = "https://github.com/py-pdf/pypdf/files/10935632/bid1.pdf" - name = "bid1.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer = PdfWriter() - writer.clone_document_from_reader(reader) - for p in writer.pages: - p.compress_content_streams() - b = BytesIO() - writer.write(b) - - 
-@pytest.mark.enable_socket() -def test_iss1723(): - # test of an annotation(link) directly stored in the /Annots in the page - url = "https://github.com/py-pdf/pypdf/files/11015242/inputFile.pdf" - name = "iss1723.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer = PdfWriter() - writer.append(reader, (3, 5)) - - -@pytest.mark.enable_socket() -def test_iss1767(): - # test with a pdf which is buggy because the object 389,0 exists 3 times: - # twice to define catalog and one as an XObject inducing a loop when - # cloning - url = "https://github.com/py-pdf/pypdf/files/11138472/test.pdf" - name = "iss1723.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - PdfWriter(clone_from=reader) - - -@pytest.mark.enable_socket() -def test_named_dest_page_number(): - """ - Closes iss471 - tests appending with named destinations as integers - """ - url = "https://github.com/py-pdf/pypdf/files/10704333/central.pdf" - name = "central.pdf" - writer = PdfWriter() - writer.add_blank_page(100, 100) - writer.append(BytesIO(get_data_from_url(url, name=name)), pages=[0, 1, 2]) - assert len(writer._root_object["/Names"]["/Dests"]["/Names"]) == 2 - assert writer._root_object["/Names"]["/Dests"]["/Names"][-1][0] == (1 + 1) - writer.append(BytesIO(get_data_from_url(url, name=name))) - assert len(writer._root_object["/Names"]["/Dests"]["/Names"]) == 6 - writer2 = PdfWriter() - writer2.add_blank_page(100, 100) - dest = writer2.add_named_destination("toto", 0) - dest.get_object()[NameObject("/D")][0] = NullObject() - b = BytesIO() - writer2.write(b) - b.seek(0) - writer.append(b) - assert len(writer._root_object["/Names"]["/Dests"]["/Names"]) == 6 - - -@pytest.mark.parametrize( - ("write_data_here", "needs_cleanup"), - [ - ( - "dont_commit_writer.pdf", - True, - ) - ], -) -def test_update_form_fields(write_data_here, needs_cleanup): - writer = PdfWriter(clone_from=RESOURCE_ROOT / "FormTestFromOo.pdf") - writer.update_page_form_field_values( - 
writer.pages[0], - { - "CheckBox1": "/Yes", - "Text1": "mon Text1", - "Text2": "ligne1\nligne2", - "RadioGroup1": "/2", - "RdoS1": "/", - "Combo1": "!!monCombo!!", - "Liste1": "Liste2", - "Liste2": ["Lst1", "Lst3"], - "DropList1": "DropListe3", - }, - auto_regenerate=False, - ) - del writer.pages[0]["/Annots"][1].get_object()["/AP"]["/N"] - writer.update_page_form_field_values( - writer.pages[0], - {"Text1": "my Text1", "Text2": "ligne1\nligne2\nligne3"}, - auto_regenerate=False, - ) - - writer.write("dont_commit_writer.pdf") - reader = PdfReader("dont_commit_writer.pdf") - flds = reader.get_fields() - assert flds["CheckBox1"]["/V"] == "/Yes" - assert flds["CheckBox1"].indirect_reference.get_object()["/AS"] == "/Yes" - assert ( - b"(my Text1)" - in flds["Text1"].indirect_reference.get_object()["/AP"]["/N"].get_data() - ) - assert flds["Text2"]["/V"] == "ligne1\nligne2\nligne3" - assert ( - b"(ligne3)" - in flds["Text2"].indirect_reference.get_object()["/AP"]["/N"].get_data() - ) - assert flds["RadioGroup1"]["/V"] == "/2" - assert flds["RadioGroup1"]["/Kids"][0].get_object()["/AS"] == "/Off" - assert flds["RadioGroup1"]["/Kids"][1].get_object()["/AS"] == "/2" - assert all(x in flds["Liste2"]["/V"] for x in ["Lst1", "Lst3"]) - - assert all(x in flds["CheckBox1"]["/_States_"] for x in ["/Off", "/Yes"]) - assert all(x in flds["RadioGroup1"]["/_States_"] for x in ["/1", "/2", "/3"]) - assert all(x in flds["Liste1"]["/_States_"] for x in ["Liste1", "Liste2", "Liste3"]) - - if needs_cleanup: - Path(write_data_here).unlink() - - -@pytest.mark.enable_socket() -def test_iss1862(): - # The file here has "/B" entry to define the font in a object below the page - # The excluded field shall be considered only at first level (page) and not - # below - url = "https://github.com/py-pdf/pypdf/files/11708801/intro.pdf" - name = "iss1862.pdf" - writer = PdfWriter() - writer.append(BytesIO(get_data_from_url(url, name=name))) - # check that "/B" is in the font - 
writer.pages[0]["/Resources"]["/Font"]["/F1"]["/CharProcs"]["/B"].get_data() - - -def test_empty_objects_before_cloning(): - pdf_path = RESOURCE_ROOT / "crazyones.pdf" - reader = PdfReader(pdf_path) - writer = PdfWriter(clone_from=reader) - nb_obj_reader = len(reader.xref_objStm) + sum( - len(reader.xref[i]) for i in reader.xref - ) - nb_obj_reader -= 1 # for trailer - nb_obj_reader -= len( - {x: 1 for x, y in reader.xref_objStm.values()} - ) # to remove object streams - assert len(writer._objects) == nb_obj_reader - - -@pytest.mark.enable_socket() -def test_watermark(): - url = "https://github.com/py-pdf/pypdf/files/11985889/bg.pdf" - name = "bgwatermark.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - url = "https://github.com/py-pdf/pypdf/files/11985888/source.pdf" - name = "srcwatermark.pdf" - writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name))) - for p in writer.pages: - p.merge_page(reader.pages[0], over=False) - - assert isinstance(p["/Contents"], ArrayObject) - assert isinstance(p["/Contents"][0], IndirectObject) - - b = BytesIO() - writer.write(b) - assert len(b.getvalue()) < 2.1 * 1024 * 1024 - - -@pytest.mark.enable_socket() -@pytest.mark.timeout(4) # this was a lot slower before PR #2086 -def test_watermarking_speed(): - url = "https://github.com/py-pdf/pypdf/files/11985889/bg.pdf" - name = "bgwatermark.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - url = "https://arxiv.org/pdf/2201.00214.pdf" - name = "2201.00214.pdf" - writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name))) - for p in writer.pages: - p.merge_page(reader.pages[0], over=False) - out_pdf_bytesio = BytesIO() - writer.write(out_pdf_bytesio) - pdf_size_in_mib = len(out_pdf_bytesio.getvalue()) / 1024 / 1024 - assert pdf_size_in_mib < 20 - - -@pytest.mark.enable_socket() -@pytest.mark.skipif(GHOSTSCRIPT_BINARY is None, reason="Requires Ghostscript") -def test_watermark_rendering(tmp_path): - """Ensure the 
visual appearance of watermarking stays correct.""" - url = "https://github.com/py-pdf/pypdf/files/11985889/bg.pdf" - name = "bgwatermark.pdf" - watermark = PdfReader(BytesIO(get_data_from_url(url, name=name))).pages[0] - url = "https://github.com/py-pdf/pypdf/files/11985888/source.pdf" - name = "srcwatermark.pdf" - page = PdfReader(BytesIO(get_data_from_url(url, name=name))).pages[0] - writer = PdfWriter() - page.merge_page(watermark, over=False) - writer.add_page(page) - - target_png_path = tmp_path / "target.png" - url = "https://github.com/py-pdf/pypdf/assets/96178532/d5c72d0e-7047-4504-bbf6-bc591c80d7c0" - name = "dstwatermark.png" - target_png_path.write_bytes(get_data_from_url(url, name=name)) - - pdf_path = tmp_path / "out.pdf" - png_path = tmp_path / "out.png" +"""Test the pypdf._writer module.""" +import re +import shutil +import subprocess +from io import BytesIO +from pathlib import Path + +import pytest + +from pypdf import ( + ObjectDeletionFlag, + PageObject, + PdfMerger, + PdfReader, + PdfWriter, + Transformation, +) +from pypdf.errors import DeprecationError, PageSizeNotDefinedError, PyPdfError +from pypdf.generic import ( + ArrayObject, + ContentStream, + DictionaryObject, + Fit, + IndirectObject, + NameObject, + NullObject, + NumberObject, + RectangleObject, + StreamObject, + TextStringObject, +) + +from . import get_data_from_url, is_sublist +from .test_images import image_similarity + +TESTS_ROOT = Path(__file__).parent.resolve() +PROJECT_ROOT = TESTS_ROOT.parent +RESOURCE_ROOT = PROJECT_ROOT / "resources" +SAMPLE_ROOT = Path(PROJECT_ROOT) / "sample-files" +GHOSTSCRIPT_BINARY = shutil.which("gs") + + +def test_writer_exception_non_binary(tmp_path, caplog): + src = RESOURCE_ROOT / "pdflatex-outline.pdf" + + reader = PdfReader(src) + writer = PdfWriter() + writer.add_page(reader.pages[0]) + + with open(tmp_path / "out.txt", "w") as fp, pytest.raises(TypeError): + writer.write_stream(fp) + ending = "to write to is not in binary mode. 
It may not be written to correctly.\n" + assert caplog.text.endswith(ending) + + +def test_writer_clone(): + src = RESOURCE_ROOT / "pdflatex-outline.pdf" + + reader = PdfReader(src) + writer = PdfWriter(clone_from=reader) + assert len(writer.pages) == 4 + assert "PageObject" in str(type(writer.pages[0])) + + writer = PdfWriter(clone_from=src) + assert len(writer.pages) == 4 + assert "PageObject" in str(type(writer.pages[0])) + + +def test_writer_clone_bookmarks(): + # Arrange + src = RESOURCE_ROOT / "Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf" + reader = PdfReader(src) + writer = PdfWriter() + + # Act + test cat + cat = "" + + def cat1(p) -> None: + nonlocal cat + cat += p.__repr__() + + writer.clone_document_from_reader(reader, cat1) + assert "/Page" in cat + assert writer.pages[0].raw_get("/Parent") == writer._pages + writer.add_outline_item("Page 1", 0) + writer.add_outline_item("Page 2", 1) + + # Assert + bytes_stream = BytesIO() + writer.write(bytes_stream) + bytes_stream.seek(0) + reader2 = PdfReader(bytes_stream) + assert len(reader2.pages) == len(reader.pages) + assert len(reader2.outline) == 2 + + # test with append + writer = PdfWriter() + writer.append(reader) + writer.add_outline_item("Page 1", 0) + writer.add_outline_item("Page 2", 1) + + # Assert + bytes_stream = BytesIO() + writer.write(bytes_stream) + bytes_stream.seek(0) + reader2 = PdfReader(bytes_stream) + assert len(reader2.pages) == len(reader.pages) + assert len(reader2.outline) == 2 + + +def writer_operate(writer: PdfWriter) -> None: + """ + To test the writer that initialized by each of the four usages. 
+ + Args: + writer: A PdfWriter object + """ + pdf_path = RESOURCE_ROOT / "crazyones.pdf" + pdf_outline_path = RESOURCE_ROOT / "pdflatex-outline.pdf" + + reader = PdfReader(pdf_path) + reader_outline = PdfReader(pdf_outline_path) + + page = reader.pages[0] + with pytest.raises(PageSizeNotDefinedError) as exc: + writer.add_blank_page() + assert exc.value.args == () + writer.insert_page(page, 1) + writer.insert_page(reader_outline.pages[0], 0) + writer.add_outline_item_destination(page) + writer.remove_links() + writer.add_outline_item_destination(page) + oi = writer.add_outline_item( + "An outline item", 0, None, (255, 0, 15), True, True, Fit.fit_box_vertically(10) + ) + writer.add_outline_item( + "The XYZ fit", 0, oi, (255, 0, 15), True, True, Fit.xyz(left=10, top=20, zoom=3) + ) + writer.add_outline_item( + "The XYZ fit no args", 0, oi, (255, 0, 15), True, True, Fit.xyz() + ) + writer.add_outline_item( + "The FitH fit", 0, oi, (255, 0, 15), True, True, Fit.fit_horizontally(top=10) + ) + writer.add_outline_item( + "The FitV fit", 0, oi, (255, 0, 15), True, True, Fit.fit_vertically(left=10) + ) + writer.add_outline_item( + "The FitR fit", + 0, + oi, + (255, 0, 15), + True, + True, + Fit.fit_rectangle(left=10, bottom=20, right=30, top=40), + ) + writer.add_outline_item( + "The FitB fit", 0, oi, (255, 0, 15), True, True, Fit.fit_box() + ) + writer.add_outline_item( + "The FitBH fit", + 0, + oi, + (255, 0, 15), + True, + True, + Fit.fit_box_horizontally(top=10), + ) + writer.add_outline_item( + "The FitBV fit", + 0, + oi, + (255, 0, 15), + True, + True, + Fit.fit_box_vertically(left=10), + ) + writer.add_blank_page() + writer.add_uri(2, "https://example.com", RectangleObject([0, 0, 100, 100])) + with pytest.warns( + DeprecationWarning, match="'pagenum' argument of add_uri is deprecated" + ): + writer.add_uri( + 2, "https://example.com", RectangleObject([0, 0, 100, 100]), pagenum=2 + ) + with pytest.raises(DeprecationError): + writer.add_link(2, 1, RectangleObject([0, 
0, 100, 100])) + assert writer._get_page_layout() is None + writer.page_layout = "broken" + assert writer.page_layout == "broken" + writer.page_layout = NameObject("/SinglePage") + assert writer._get_page_layout() == "/SinglePage" + assert writer._get_page_mode() is None + writer.set_page_mode("/UseNone") + assert writer._get_page_mode() == "/UseNone" + writer.set_page_mode(NameObject("/UseOC")) + assert writer._get_page_mode() == "/UseOC" + writer.insert_blank_page(width=100, height=100) + writer.insert_blank_page() # without parameters + + writer.remove_images() + + writer.add_metadata(reader.metadata) + writer.add_metadata({"/Author": "Martin Thoma"}) + writer.add_metadata({"/MyCustom": 1234}) + + writer.add_attachment("foobar.gif", b"foobarcontent") + + # Check that every key in _idnum_hash is correct + objects_hash = [o.hash_value() for o in writer._objects] + for k, v in writer._idnum_hash.items(): + assert v.pdf == writer + assert k in objects_hash, f"Missing {v}" + + +tmp_path = "dont_commit_writer.pdf" + + +@pytest.mark.parametrize( + ("write_data_here", "needs_cleanup"), + [ + ("dont_commit_writer.pdf", True), + (Path("dont_commit_writer.pdf"), True), + (BytesIO(), False), + ], +) +def test_writer_operations_by_traditional_usage(write_data_here, needs_cleanup): + writer = PdfWriter() + + writer_operate(writer) + + # finally, write "output" to pypdf-output.pdf + if needs_cleanup: + with open(write_data_here, "wb") as output_stream: + writer.write(output_stream) + else: + output_stream = write_data_here + writer.write(output_stream) + + if needs_cleanup: + Path(write_data_here).unlink() + + +@pytest.mark.parametrize( + ("write_data_here", "needs_cleanup"), + [ + ("dont_commit_writer.pdf", True), + (Path("dont_commit_writer.pdf"), True), + (BytesIO(), False), + ], +) +def test_writer_operations_by_semi_traditional_usage(write_data_here, needs_cleanup): + with PdfWriter() as writer: + writer_operate(writer) + + # finally, write "output" to pypdf-output.pdf + 
if needs_cleanup: + with open(write_data_here, "wb") as output_stream: + writer.write(output_stream) + else: + output_stream = write_data_here + writer.write(output_stream) + + if needs_cleanup: + Path(write_data_here).unlink() + + +@pytest.mark.parametrize( + ("write_data_here", "needs_cleanup"), + [ + ("dont_commit_writer.pdf", True), + (Path("dont_commit_writer.pdf"), True), + (BytesIO(), False), + ], +) +def test_writer_operations_by_semi_new_traditional_usage( + write_data_here, needs_cleanup +): + with PdfWriter() as writer: + writer_operate(writer) + + # finally, write "output" to pypdf-output.pdf + writer.write(write_data_here) + + if needs_cleanup: + Path(write_data_here).unlink() + + +@pytest.mark.parametrize( + ("write_data_here", "needs_cleanup"), + [ + ("dont_commit_writer.pdf", True), + (Path("dont_commit_writer.pdf"), True), + (BytesIO(), False), + ], +) +def test_writer_operation_by_new_usage(write_data_here, needs_cleanup): + # This includes write "output" to pypdf-output.pdf + with PdfWriter(write_data_here) as writer: + writer_operate(writer) + + if needs_cleanup: + Path(write_data_here).unlink() + + +@pytest.mark.parametrize( + "input_path", + [ + "side-by-side-subfig.pdf", + "reportlab-inline-image.pdf", + ], +) +def test_remove_images(pdf_file_path, input_path): + pdf_path = RESOURCE_ROOT / input_path + + reader = PdfReader(pdf_path) + writer = PdfWriter() + + page = reader.pages[0] + writer.insert_page(page, 0) + writer.remove_images() + page_contents_stream = writer.pages[0]["/Contents"]._data + assert len(page_contents_stream.strip()) + + # finally, write "output" to pypdf-output.pdf + with open(pdf_file_path, "wb") as output_stream: + writer.write(output_stream) + + with open(pdf_file_path, "rb") as input_stream: + reader = PdfReader(input_stream) + if input_path == "side-by-side-subfig.pdf": + extracted_text = reader.pages[0].extract_text() + assert extracted_text + assert "Lorem ipsum dolor sit amet" in extracted_text + + 
+@pytest.mark.enable_socket() +def test_remove_images_sub_level(): + """Cf #2035""" + url = "https://github.com/py-pdf/pypdf/files/12394781/2210.03142-1.pdf" + name = "iss2103.pdf" + writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name))) + writer.remove_images() + assert ( + len( + [ + o.get_object() + for o in writer.pages[0]["/Resources"]["/XObject"]["/Fm1"][ + "/Resources" + ]["/XObject"]["/Im1"]["/Resources"]["/XObject"].values() + if not isinstance(o.get_object(), NullObject) + ] + ) + == 0 + ) + + +@pytest.mark.parametrize( + "input_path", + [ + "side-by-side-subfig.pdf", + "reportlab-inline-image.pdf", + ], +) +def test_remove_text(input_path, pdf_file_path): + pdf_path = RESOURCE_ROOT / input_path + + reader = PdfReader(pdf_path) + writer = PdfWriter() + + page = reader.pages[0] + writer.insert_page(page, 0) + writer.remove_text() + + # finally, write "output" to pypdf-output.pdf + with open(pdf_file_path, "wb") as output_stream: + writer.write(output_stream) + + +def test_remove_text_all_operators(pdf_file_path): + stream = ( + b"BT " + b"/F0 36 Tf " + b"50 706 Td " + b"36 TL " + b"(The Tj operator) Tj " + b'1 2 (The double quote operator) " ' + b"(The single quote operator) ' " + b"ET" + ) + pdf_data = ( + b"%%PDF-1.7\n" + b"1 0 obj << /Count 1 /Kids [5 0 R] /Type /Pages >> endobj\n" + b"2 0 obj << >> endobj\n" + b"3 0 obj << >> endobj\n" + b"4 0 obj << /Length %d >>\n" + b"stream\n" + (b"%s\n" % stream) + b"endstream\n" + b"endobj\n" + b"5 0 obj << /Contents 4 0 R /CropBox [0.0 0.0 2550.0 3508.0]\n" + b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R" + b" /Resources << /Font << >> >>" + b" /Rotate 0 /Type /Page >> endobj\n" + b"6 0 obj << /Pages 1 0 R /Type /Catalog >> endobj\n" + b"xref 1 6\n" + b"%010d 00000 n\n" + b"%010d 00000 n\n" + b"%010d 00000 n\n" + b"%010d 00000 n\n" + b"%010d 00000 n\n" + b"%010d 00000 n\n" + b"trailer << /Root 6 0 R /Size 6 >>\n" + b"startxref\n%d\n" + b"%%%%EOF" + ) + startx_correction = -1 + pdf_data = 
pdf_data % ( + len(stream), + pdf_data.find(b"1 0 obj") + startx_correction, + pdf_data.find(b"2 0 obj") + startx_correction, + pdf_data.find(b"3 0 obj") + startx_correction, + pdf_data.find(b"4 0 obj") + startx_correction, + pdf_data.find(b"5 0 obj") + startx_correction, + pdf_data.find(b"6 0 obj") + startx_correction, + # startx_correction should be -1 due to double % at the beginning + # inducing an error on startxref computation + pdf_data.find(b"xref"), + ) + pdf_stream = BytesIO(pdf_data) + + reader = PdfReader(pdf_stream, strict=False) + writer = PdfWriter() + + page = reader.pages[0] + writer.insert_page(page, 0) + writer.remove_text() + + # finally, write "output" to pypdf-output.pdf + with open(pdf_file_path, "wb") as output_stream: + writer.write(output_stream) + + +def test_write_metadata(pdf_file_path): + pdf_path = RESOURCE_ROOT / "crazyones.pdf" + + reader = PdfReader(pdf_path) + writer = PdfWriter() + + writer.add_page(reader.pages[0]) + for page in reader.pages: + writer.add_page(page) + + metadata = reader.metadata + writer.add_metadata(metadata) + + writer.add_metadata({"/Title": "The Crazy Ones"}) + + # finally, write data to pypdf-output.pdf + with open(pdf_file_path, "wb") as output_stream: + writer.write(output_stream) + + # Check if the title was set + reader = PdfReader(pdf_file_path) + metadata = reader.metadata + assert metadata.get("/Title") == "The Crazy Ones" + + +def test_fill_form(pdf_file_path): + reader = PdfReader(RESOURCE_ROOT / "form.pdf") + writer = PdfWriter() + + writer.append(reader, [0]) + writer.append(RESOURCE_ROOT / "crazyones.pdf", [0]) + + writer.update_page_form_field_values( + writer.pages[0], {"foo": "some filled in text"}, flags=1 + ) + + # check if no fields to fill in the page + writer.update_page_form_field_values( + writer.pages[1], {"foo": "some filled in text"}, flags=1 + ) + + writer.update_page_form_field_values( + writer.pages[0], {"foo": "some filled in text"} + ) + + # write "output" to pypdf-output.pdf 
+ with open(pdf_file_path, "wb") as output_stream: + writer.write(output_stream) + + +def test_fill_form_with_qualified(): + reader = PdfReader(RESOURCE_ROOT / "form.pdf") + reader.add_form_topname("top") + + writer = PdfWriter() + writer.clone_document_from_reader(reader) + writer.add_page(reader.pages[0]) + writer.update_page_form_field_values( + writer.pages[0], {"top.foo": "filling"}, flags=1 + ) + b = BytesIO() + writer.write(b) + + reader2 = PdfReader(b) + fields = reader2.get_fields() + assert fields["top.foo"]["/V"] == "filling" + + +@pytest.mark.parametrize( + ("use_128bit", "user_password", "owner_password"), + [(True, "userpwd", "ownerpwd"), (False, "userpwd", "ownerpwd")], +) +def test_encrypt(use_128bit, user_password, owner_password, pdf_file_path): + reader = PdfReader(RESOURCE_ROOT / "form.pdf") + writer = PdfWriter() + + page = reader.pages[0] + orig_text = page.extract_text() + + writer.add_page(page) + + with pytest.raises(ValueError, match="owner_pwd of encrypt is deprecated."): + writer.encrypt( + owner_pwd=user_password, + owner_password=owner_password, + user_password=user_password, + use_128bit=use_128bit, + ) + with pytest.raises(ValueError, match="'user_pwd' argument is deprecated"): + writer.encrypt( + owner_password=owner_password, + user_password=user_password, + user_pwd=user_password, + use_128bit=use_128bit, + ) + writer.encrypt( + user_password=user_password, + owner_password=owner_password, + use_128bit=use_128bit, + ) + + # write "output" to pypdf-output.pdf + with open(pdf_file_path, "wb") as output_stream: + writer.write(output_stream) + + # Test that the data is not there in clear text + with open(pdf_file_path, "rb") as input_stream: + data = input_stream.read() + assert b"foo" not in data + + # Test the user password (str): + reader = PdfReader(pdf_file_path, password="userpwd") + new_text = reader.pages[0].extract_text() + assert reader.metadata.get("/Producer") == "pypdf" + assert new_text == orig_text + + # Test the owner 
password (str): + reader = PdfReader(pdf_file_path, password="ownerpwd") + new_text = reader.pages[0].extract_text() + assert reader.metadata.get("/Producer") == "pypdf" + assert new_text == orig_text + + # Test the user password (bytes): + reader = PdfReader(pdf_file_path, password=b"userpwd") + new_text = reader.pages[0].extract_text() + assert reader.metadata.get("/Producer") == "pypdf" + assert new_text == orig_text + + # Test the owner password (stbytesr): + reader = PdfReader(pdf_file_path, password=b"ownerpwd") + new_text = reader.pages[0].extract_text() + assert reader.metadata.get("/Producer") == "pypdf" + assert new_text == orig_text + + +def test_add_outline_item(pdf_file_path): + reader = PdfReader(RESOURCE_ROOT / "pdflatex-outline.pdf") + writer = PdfWriter() + + for page in reader.pages: + writer.add_page(page) + + outline_item = writer.add_outline_item( + "An outline item", + 1, + None, + (255, 0, 15), + True, + True, + Fit.fit(), + is_open=False, + ) + _o2a = writer.add_outline_item( + "Another", 2, outline_item, None, False, False, Fit.fit() + ) + _o2b = writer.add_outline_item( + "Another bis", 2, outline_item, None, False, False, Fit.fit() + ) + outline_item2 = writer.add_outline_item( + "An outline item 2", + 1, + None, + (255, 0, 15), + True, + True, + Fit.fit(), + is_open=True, + ) + _o3a = writer.add_outline_item( + "Another 2", 2, outline_item2, None, False, False, Fit.fit() + ) + _o3b = writer.add_outline_item( + "Another 2bis", 2, outline_item2, None, False, False, Fit.fit() + ) + + # write "output" to pypdf-output.pdf + with open(pdf_file_path, "w+b") as output_stream: + writer.write(output_stream) + output_stream.seek(0) + reader = PdfReader(output_stream) + assert reader.trailer["/Root"]["/Outlines"]["/Count"] == 3 + assert reader.outline[0]["/Count"] == -2 + assert reader.outline[0]["/%is_open%"] == False # noqa + assert reader.outline[2]["/Count"] == 2 + assert reader.outline[2]["/%is_open%"] == True # noqa + assert 
reader.outline[1][0]["/Count"] == 0 + + +def test_add_named_destination(pdf_file_path): + reader = PdfReader(RESOURCE_ROOT / "pdflatex-outline.pdf") + writer = PdfWriter() + assert writer.get_named_dest_root() == [] + + for page in reader.pages: + writer.add_page(page) + + assert writer.get_named_dest_root() == [] + + writer.add_named_destination(TextStringObject("A named dest"), 2) + writer.add_named_destination(TextStringObject("A named dest2"), 2) + + with pytest.warns(DeprecationWarning, match="pagenum is deprecated as an argument"): + writer.add_named_destination(TextStringObject("A named dest3"), pagenum=2) + + with pytest.raises(ValueError): + writer.add_named_destination( + TextStringObject("A named dest3"), pagenum=2, page_number=2 + ) + + root = writer.get_named_dest_root() + assert root[0] == "A named dest" + assert root[1].pdf == writer + assert root[1].get_object()["/S"] == NameObject("/GoTo") + assert root[1].get_object()["/D"][0] == writer.pages[2].indirect_reference + assert root[2] == "A named dest2" + assert root[3].pdf == writer + assert root[3].get_object()["/S"] == NameObject("/GoTo") + assert root[3].get_object()["/D"][0] == writer.pages[2].indirect_reference + assert root[4] == "A named dest3" + + # test get_object + + assert writer.get_object(root[1].idnum) == writer.get_object(root[1]) + with pytest.raises(ValueError) as exc: + writer.get_object(reader.pages[0].indirect_reference) + assert exc.value.args[0] == "pdf must be self" + + # write "output" to pypdf-output.pdf + with open(pdf_file_path, "wb") as output_stream: + writer.write(output_stream) + + +def test_add_named_destination_sort_order(pdf_file_path): + """ + Issue #1927 does not appear. 
+ + add_named_destination() maintains the named destination list sort order + """ + writer = PdfWriter() + + assert writer.get_named_dest_root() == [] + + writer.add_blank_page(200, 200) + writer.add_named_destination("b", 0) + # "a" should be moved before "b" on insert + writer.add_named_destination("a", 0) + + root = writer.get_named_dest_root() + + assert len(root) == 4 + assert ( + root[0] == "a" + ), '"a" was not inserted before "b" in the named destination root' + assert root[2] == "b" + + # write "output" to pypdf-output.pdf + with open(pdf_file_path, "wb") as output_stream: + writer.write(output_stream) + + +def test_add_uri(pdf_file_path): + reader = PdfReader(RESOURCE_ROOT / "pdflatex-outline.pdf") + writer = PdfWriter() + + for page in reader.pages: + writer.add_page(page) + + writer.add_uri( + 1, + "http://www.example.com", + RectangleObject([0, 0, 100, 100]), + border=[1, 2, 3, [4]], + ) + writer.add_uri( + 2, + "https://pypdf.readthedocs.io/en/latest/", + RectangleObject([20, 30, 50, 80]), + border=[1, 2, 3], + ) + writer.add_uri( + 3, + "https://pypdf.readthedocs.io/en/latest/user/adding-pdf-annotations.html", + "[ 200 300 250 350 ]", + border=[0, 0, 0], + ) + writer.add_uri( + 3, + "https://pypdf.readthedocs.io/en/latest/user/adding-pdf-annotations.html", + [100, 200, 150, 250], + border=[0, 0, 0], + ) + + # write "output" to pypdf-output.pdf + with open(pdf_file_path, "wb") as output_stream: + writer.write(output_stream) + + +def test_add_link(pdf_file_path): + reader = PdfReader(RESOURCE_ROOT / "pdflatex-outline.pdf") + writer = PdfWriter() + + for page in reader.pages: + writer.add_page(page) + + with pytest.raises( + DeprecationError, + match=( + re.escape( + "add_link is deprecated and was removed in pypdf 3.0.0. " + "Use add_annotation(pypdf.annotations.Link(...)) instead." 
+ ) + ), + ): + writer.add_link( + 1, + 2, + RectangleObject([0, 0, 100, 100]), + border=[1, 2, 3, [4]], + fit="/Fit", + ) + writer.add_link( + 2, 3, RectangleObject([20, 30, 50, 80]), [1, 2, 3], "/FitH", None + ) + writer.add_link( + 3, + 0, + "[ 200 300 250 350 ]", + [0, 0, 0], + "/XYZ", + 0, + 0, + 2, + ) + writer.add_link( + 3, + 0, + [100, 200, 150, 250], + border=[0, 0, 0], + ) + + # write "output" to pypdf-output.pdf + with open(pdf_file_path, "wb") as output_stream: + writer.write(output_stream) + + +def test_io_streams(): + """This is the example from the docs ("Streaming data").""" + filepath = RESOURCE_ROOT / "pdflatex-outline.pdf" + with open(filepath, "rb") as fh: + bytes_stream = BytesIO(fh.read()) + + # Read from bytes stream + reader = PdfReader(bytes_stream) + assert len(reader.pages) == 4 + + # Write to bytes stream + writer = PdfWriter() + with BytesIO() as output_stream: + writer.write(output_stream) + + +def test_regression_issue670(pdf_file_path): + filepath = RESOURCE_ROOT / "crazyones.pdf" + reader = PdfReader(filepath, strict=False) + for _ in range(2): + writer = PdfWriter() + writer.add_page(reader.pages[0]) + with open(pdf_file_path, "wb") as f_pdf: + writer.write(f_pdf) + + +def test_issue301(): + """Test with invalid stream length object.""" + with open(RESOURCE_ROOT / "issue-301.pdf", "rb") as f: + reader = PdfReader(f) + writer = PdfWriter() + writer.append_pages_from_reader(reader) + b = BytesIO() + writer.write(b) + + +def test_append_pages_from_reader_append(): + """Use append_pages_from_reader with a callable.""" + with open(RESOURCE_ROOT / "issue-301.pdf", "rb") as f: + reader = PdfReader(f) + writer = PdfWriter() + writer.append_pages_from_reader(reader, callable) + b = BytesIO() + writer.write(b) + + +@pytest.mark.enable_socket() +@pytest.mark.slow() +@pytest.mark.filterwarnings("ignore::DeprecationWarning") +def test_sweep_indirect_references_nullobject_exception(pdf_file_path): + # TODO: Check this more closely... 
this looks weird + url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" + name = "tika-924666.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + merger = PdfMerger() + merger.append(reader) + merger.write(pdf_file_path) + + +@pytest.mark.enable_socket() +@pytest.mark.slow() +@pytest.mark.parametrize( + ("url", "name"), + [ + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf", + "test_sweep_indirect_references_nullobject_exception.pdf", + ), + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/922/922840.pdf", + "test_write_outline_item_on_page_fitv.pdf", + ), + ("https://github.com/py-pdf/pypdf/files/10715624/test.pdf", "iss1627.pdf"), + ], +) +@pytest.mark.filterwarnings("ignore::DeprecationWarning") +def test_some_appends(pdf_file_path, url, name): + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + # PdfMerger + merger = PdfMerger() + merger.append(reader) + merger.write(pdf_file_path) + # PdfWriter + merger = PdfWriter() + merger.append(reader) + merger.write(pdf_file_path) + + +def test_pdf_header(): + writer = PdfWriter() + assert writer.pdf_header == b"%PDF-1.3" + + reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") + writer.add_page(reader.pages[0]) + assert writer.pdf_header == b"%PDF-1.5" + + writer.pdf_header = b"%PDF-1.6" + assert writer.pdf_header == b"%PDF-1.6" + + +def test_write_dict_stream_object(pdf_file_path): + stream = ( + b"BT " + b"/F0 36 Tf " + b"50 706 Td " + b"36 TL " + b"(The Tj operator) Tj " + b'1 2 (The double quote operator) " ' + b"(The single quote operator) ' " + b"ET" + ) + + stream_object = StreamObject() + stream_object[NameObject("/Type")] = NameObject("/Text") + stream_object._data = stream + + writer = PdfWriter() + + page_object = PageObject.create_blank_page(writer, 1000, 1000) + # Construct dictionary object (PageObject) with stream object + # Writer will replace this stream object with indirect object + page_object[NameObject("/Test")] = 
stream_object + + page_object = writer.add_page(page_object) + with open(pdf_file_path, "wb") as fp: + writer.write(fp) + + for k, v in page_object.items(): + if k == "/Test": + assert str(v) != str(stream_object) + assert isinstance(v, IndirectObject) + assert str(v.get_object()) == str(stream_object) + break + else: + pytest.fail("/Test not found") + + # Check that every key in _idnum_hash is correct + objects_hash = [o.hash_value() for o in writer._objects] + for k, v in writer._idnum_hash.items(): + assert v.pdf == writer + assert k in objects_hash, "Missing %s" % v + + +def test_add_single_annotation(pdf_file_path): + pdf_path = RESOURCE_ROOT / "crazyones.pdf" + reader = PdfReader(pdf_path) + page = reader.pages[0] + writer = PdfWriter() + writer.add_page(page) + + annot_dict = { + "/Type": "/Annot", + "/Subtype": "/Text", + "/Rect": [270.75, 596.25, 294.75, 620.25], + "/Contents": "Note in second paragraph", + "/C": [1, 1, 0], + "/M": "D:20220406191858+02'00", + "/Popup": { + "/Type": "/Annot", + "/Subtype": "/Popup", + "/Rect": [294.75, 446.25, 494.75, 596.25], + "/M": "D:20220406191847+02'00", + }, + "/T": "moose", + } + writer.add_annotation(0, annot_dict) + + # Inspect manually by adding 'assert False' and viewing the PDF + with open(pdf_file_path, "wb") as fp: + writer.write(fp) + + +def test_deprecation_bookmark_decorator(): + reader = PdfReader(RESOURCE_ROOT / "outlines-with-invalid-destinations.pdf") + page = reader.pages[0] + outline_item = reader.outline[0] + writer = PdfWriter() + writer.add_page(page) + with pytest.raises( + DeprecationError, + match="bookmark is deprecated as an argument. 
Use outline_item instead", + ): + writer.add_outline_item_dict(bookmark=outline_item) + + +@pytest.mark.samples() +def test_colors_in_outline_item(pdf_file_path): + reader = PdfReader(SAMPLE_ROOT / "004-pdflatex-4-pages/pdflatex-4-pages.pdf") + writer = PdfWriter() + writer.clone_document_from_reader(reader) + purple_rgb = (0.5019607843137255, 0.0, 0.5019607843137255) + writer.add_outline_item("First Outline Item", page_number=2, color="800080") + writer.add_outline_item("Second Outline Item", page_number=3, color="#800080") + writer.add_outline_item("Third Outline Item", page_number=4, color=purple_rgb) + + with open(pdf_file_path, "wb") as f: + writer.write(f) + + reader2 = PdfReader(pdf_file_path) + for outline_item in reader2.outline: + # convert float to string because of mutability + assert [str(c) for c in outline_item.color] == [str(p) for p in purple_rgb] + + +@pytest.mark.samples() +def test_write_empty_stream(): + reader = PdfReader(SAMPLE_ROOT / "004-pdflatex-4-pages/pdflatex-4-pages.pdf") + writer = PdfWriter() + writer.clone_document_from_reader(reader) + + with pytest.raises(ValueError) as exc: + writer.write("") + assert exc.value.args[0] == "Output(stream=) is empty." 
+ + +def test_startup_dest(): + pdf_file_writer = PdfWriter() + pdf_file_writer.append_pages_from_reader(PdfReader(RESOURCE_ROOT / "issue-604.pdf")) + + assert pdf_file_writer.open_destination is None + pdf_file_writer.open_destination = pdf_file_writer.pages[9] + # checked also using Acrobrat to verify the good page is opened + op = pdf_file_writer._root_object["/OpenAction"] + assert op[0] == pdf_file_writer.pages[9].indirect_reference + assert op[1] == "/Fit" + op = pdf_file_writer.open_destination + assert op.raw_get("/Page") == pdf_file_writer.pages[9].indirect_reference + assert op["/Type"] == "/Fit" + pdf_file_writer.open_destination = op + assert pdf_file_writer.open_destination == op + + # irrelevant, just for coverage + pdf_file_writer._root_object[NameObject("/OpenAction")][0] = NumberObject(0) + pdf_file_writer.open_destination + with pytest.raises(Exception) as exc: + del pdf_file_writer._root_object[NameObject("/OpenAction")][0] + pdf_file_writer.open_destination + assert "Invalid Destination" in str(exc.value) + + pdf_file_writer.open_destination = "Test" + # checked also using Acrobrat to verify open_destination + op = pdf_file_writer._root_object["/OpenAction"] + assert isinstance(op, TextStringObject) + assert op == "Test" + op = pdf_file_writer.open_destination + assert isinstance(op, TextStringObject) + assert op == "Test" + + # irrelevant, this is just for coverage + pdf_file_writer._root_object[NameObject("/OpenAction")] = NumberObject(0) + assert pdf_file_writer.open_destination is None + pdf_file_writer.open_destination = None + assert "/OpenAction" not in pdf_file_writer._root_object + pdf_file_writer.open_destination = None + + +@pytest.mark.enable_socket() +def test_iss471(): + url = "https://github.com/py-pdf/pypdf/files/9139245/book.pdf" + name = "book_471.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + + writer = PdfWriter() + writer.append(reader, excluded_fields=[]) + assert isinstance( + 
writer.pages[0]["/Annots"][0].get_object()["/Dest"], TextStringObject + ) + + +@pytest.mark.enable_socket() +def test_reset_translation(): + url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" + name = "tika-924666.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter() + writer.append(reader, (0, 10)) + nb = len(writer._objects) + writer.append(reader, (0, 10)) + assert ( + len(writer._objects) == nb + 11 + ) # +10 (pages) +1 because of the added outline + nb += 1 + writer.reset_translation(reader) + writer.append(reader, (0, 10)) + assert len(writer._objects) >= nb + 200 + nb = len(writer._objects) + writer.reset_translation(reader.pages[0].indirect_reference) + writer.append(reader, (0, 10)) + assert len(writer._objects) >= nb + 200 + nb = len(writer._objects) + writer.reset_translation() + writer.append(reader, (0, 10)) + assert len(writer._objects) >= nb + 200 + nb = len(writer.pages) + writer.append(reader, [reader.pages[0], reader.pages[0]]) + assert len(writer.pages) == nb + 2 + + +def test_threads_empty(): + writer = PdfWriter() + thr = writer.threads + assert isinstance(thr, ArrayObject) + assert len(thr) == 0 + thr2 = writer.threads + assert thr == thr2 + + +@pytest.mark.enable_socket() +def test_append_without_annots_and_articles(): + url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" + name = "tika-924666.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter() + writer.append(reader, None, (0, 10), True, ["/B"]) + writer.reset_translation() + writer.append(reader, (0, 10), True, ["/B"]) + assert writer.threads == [] + writer = PdfWriter() + writer.append(reader, None, (0, 10), True, ["/Annots"]) + assert "/Annots" not in writer.pages[5] + writer = PdfWriter() + writer.append(reader, None, (0, 10), True, []) + assert "/Annots" in writer.pages[5] + assert len(writer.threads) >= 1 + + +@pytest.mark.enable_socket() +def 
test_append_multiple(): + url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" + name = "tika-924666.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter() + writer.append( + reader, [0, 0, 0] + ) # to demonstre multiple insertion of same page at once + writer.append(reader, [0, 0, 0]) # second pack + pages = writer._root_object["/Pages"]["/Kids"] + assert pages[0] not in pages[1:] # page not repeated + assert pages[-1] not in pages[0:-1] # page not repeated + + +@pytest.mark.samples() +def test_set_page_label(pdf_file_path): + src = RESOURCE_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf" # File without labels + reader = PdfReader(src) + + expected = [ + "i", + "ii", + "1", + "2", + "A", + "B", + "1", + "2", + "3", + "4", + "A", + "i", + "I", + "II", + "1", + "2", + "3", + "I", + "II", + ] + + # Tests full lenght with labels assigned at first and last elements + # Tests different labels assigned to consecutive ranges + writer = PdfWriter() + writer.clone_document_from_reader(reader) + writer.set_page_label(0, 1, "/r") + writer.set_page_label(4, 5, "/A") + writer.set_page_label(10, 10, "/A") + writer.set_page_label(11, 11, "/r") + writer.set_page_label(12, 13, "/R") + writer.set_page_label(17, 18, "/R") + writer.write(pdf_file_path) + assert PdfReader(pdf_file_path).page_labels == expected + + writer = PdfWriter() # Same labels, different set order + writer.clone_document_from_reader(reader) + writer.set_page_label(17, 18, "/R") + writer.set_page_label(4, 5, "/A") + writer.set_page_label(10, 10, "/A") + writer.set_page_label(0, 1, "/r") + writer.set_page_label(12, 13, "/R") + writer.set_page_label(11, 11, "/r") + writer.write(pdf_file_path) + assert PdfReader(pdf_file_path).page_labels == expected + + # Tests labels assigned only in the middle + # Tests label assigned to a range already containing labled ranges + expected = ["1", "2", "i", "ii", "iii", "iv", "v", "1"] + writer = PdfWriter() + 
writer.clone_document_from_reader(reader) + writer.set_page_label(3, 4, "/a") + writer.set_page_label(5, 5, "/A") + writer.set_page_label(2, 6, "/r") + writer.write(pdf_file_path) + assert PdfReader(pdf_file_path).page_labels[: len(expected)] == expected + + # Tests labels assigned inside a previously existing range + expected = ["1", "2", "i", "a", "b", "A", "1", "1", "2"] + # Ones repeat because user didnt cover the entire original range + writer = PdfWriter() + writer.clone_document_from_reader(reader) + writer.set_page_label(2, 6, "/r") + writer.set_page_label(3, 4, "/a") + writer.set_page_label(5, 5, "/A") + writer.write(pdf_file_path) + assert PdfReader(pdf_file_path).page_labels[: len(expected)] == expected + + # Tests invalid user input + writer = PdfWriter() + writer.clone_document_from_reader(reader) + with pytest.raises( + ValueError, match="at least one between style and prefix must be given" + ): + writer.set_page_label(0, 5, start=2) + with pytest.raises( + ValueError, match="page_index_from must be equal or greater then 0" + ): + writer.set_page_label(-1, 5, "/r") + with pytest.raises( + ValueError, match="page_index_to must be equal or greater then page_index_from" + ): + writer.set_page_label(5, 0, "/r") + with pytest.raises(ValueError, match="page_index_to exceeds number of pages"): + writer.set_page_label(0, 19, "/r") + with pytest.raises( + ValueError, match="if given, start must be equal or greater than one" + ): + writer.set_page_label(0, 5, "/r", start=-1) + + pdf_file_path.unlink() + + src = ( + SAMPLE_ROOT / "009-pdflatex-geotopo/GeoTopo.pdf" + ) # File with pre existing labels + reader = PdfReader(src) + + # Tests adding labels to existing ones + expected = ["i", "ii", "A", "B", "1"] + writer = PdfWriter() + writer.clone_document_from_reader(reader) + writer.set_page_label(2, 3, "/A") + writer.write(pdf_file_path) + assert PdfReader(pdf_file_path).page_labels[: len(expected)] == expected + + # Tests replacing existing lables + expected = 
["A", "B", "1", "1", "2"] + writer = PdfWriter() + writer.clone_document_from_reader(reader) + writer.set_page_label(0, 1, "/A") + writer.write(pdf_file_path) + assert PdfReader(pdf_file_path).page_labels[: len(expected)] == expected + + pdf_file_path.unlink() + + # Tests prefix and start. + src = RESOURCE_ROOT / "issue-604.pdf" # File without page labels + reader = PdfReader(src) + writer = PdfWriter() + writer.clone_document_from_reader(reader) + + writer.set_page_label(0, 0, prefix="FRONT") + writer.set_page_label(1, 2, "/D", start=2) + writer.set_page_label(3, 6, prefix="UPDATES") + writer.set_page_label(7, 10, "/D", prefix="THYR-") + writer.set_page_label(11, 21, "/D", prefix="PAP-") + writer.set_page_label(22, 30, "/D", prefix="FOLL-") + writer.set_page_label(31, 39, "/D", prefix="HURT-") + writer.write(pdf_file_path) + + +@pytest.mark.enable_socket() +def test_iss1601(): + url = "https://github.com/py-pdf/pypdf/files/10579503/badges-38.pdf" + name = "badge-38.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + original_cs_operations = ContentStream( + reader.pages[0].get_contents(), reader + ).operations + writer = PdfWriter() + page_1 = writer.add_blank_page( + reader.pages[0].mediabox[2], reader.pages[0].mediabox[3] + ) + page_1.merge_transformed_page(reader.pages[0], Transformation()) + page_1_cs_operations = page_1.get_contents().operations + assert is_sublist(original_cs_operations, page_1_cs_operations) + page_1 = writer.add_blank_page( + reader.pages[0].mediabox[2], reader.pages[0].mediabox[3] + ) + page_1.merge_page(reader.pages[0]) + page_1_cs_operations = page_1.get_contents().operations + assert is_sublist(original_cs_operations, page_1_cs_operations) + + +def test_attachments(): + writer = PdfWriter() + writer.add_blank_page(100, 100) + b = BytesIO() + writer.write(b) + b.seek(0) + reader = PdfReader(b) + b = None + assert reader.attachments == {} + assert reader._list_attachments() == [] + assert reader._get_attachments() == 
{} + to_add = [ + ("foobar.txt", b"foobarcontent"), + ("foobar2.txt", b"foobarcontent2"), + ("foobar2.txt", b"2nd_foobarcontent"), + ] + for name, content in to_add: + writer.add_attachment(name, content) + + b = BytesIO() + writer.write(b) + b.seek(0) + reader = PdfReader(b) + b = None + assert sorted(reader.attachments.keys()) == sorted({name for name, _ in to_add}) + assert reader.attachments == { + "foobar.txt": [b"foobarcontent"], + "foobar2.txt": [b"foobarcontent2", b"2nd_foobarcontent"], + } + assert reader._list_attachments() == [name for name, _ in to_add] + + # We've added the same key twice - hence only 2 and not 3: + att = reader._get_attachments() + assert len(att) == 2 # we have 2 keys, but 3 attachments! + + # The content for foobar.txt is clear and just a single value: + assert att["foobar.txt"] == b"foobarcontent" + + # The content for foobar2.txt is a list! + att = reader._get_attachments("foobar2.txt") + assert len(att) == 1 + assert att["foobar2.txt"] == [b"foobarcontent2", b"2nd_foobarcontent"] + + # Let's do both cases with the public interface: + assert reader.attachments["foobar.txt"][0] == b"foobarcontent" + assert reader.attachments["foobar2.txt"][0] == b"foobarcontent2" + assert reader.attachments["foobar2.txt"][1] == b"2nd_foobarcontent" + + +@pytest.mark.enable_socket() +def test_iss1614(): + # test of an annotation(link) directly stored in the /Annots in the page + url = "https://github.com/py-pdf/pypdf/files/10669995/broke.pdf" + name = "iss1614.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter() + writer.append(reader) + # test for 2nd error case reported in #1614 + url = "https://github.com/py-pdf/pypdf/files/10696390/broken.pdf" + name = "iss1614.2.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer.append(reader) + + +@pytest.mark.enable_socket() +def test_new_removes(): + # test of an annotation(link) directly stored in the /Annots in the page + url = 
"https://github.com/py-pdf/pypdf/files/10807951/tt.pdf" + name = "iss1650.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + + writer = PdfWriter() + writer.clone_document_from_reader(reader) + writer.remove_images() + b = BytesIO() + writer.write(b) + bb = bytes(b.getbuffer()) + assert b"/Im0 Do" not in bb + assert b"/Fm0 Do" in bb + assert b" TJ" in bb + + writer = PdfWriter() + writer.clone_document_from_reader(reader) + writer.remove_text() + b = BytesIO() + writer.write(b) + bb = bytes(b.getbuffer()) + assert b"/Im0" in bb + assert b"Chap" not in bb + assert b" TJ" not in bb + + url = "https://github.com/py-pdf/pypdf/files/10832029/tt2.pdf" + name = "GeoBaseWithComments.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer.append(reader) + writer.remove_objects_from_page(writer.pages[0], [ObjectDeletionFlag.LINKS]) + assert "/Links" not in [ + a.get_object()["/Subtype"] for a in writer.pages[0]["/Annots"] + ] + writer.remove_objects_from_page(writer.pages[0], ObjectDeletionFlag.ATTACHMENTS) + assert "/FileAttachment" not in [ + a.get_object()["/Subtype"] for a in writer.pages[0]["/Annots"] + ] + + writer.pages[0]["/Annots"].append( + DictionaryObject({NameObject("/Subtype"): TextStringObject("/3D")}) + ) + assert "/3D" in [a.get_object()["/Subtype"] for a in writer.pages[0]["/Annots"]] + writer.remove_objects_from_page(writer.pages[0], ObjectDeletionFlag.OBJECTS_3D) + assert "/3D" not in [a.get_object()["/Subtype"] for a in writer.pages[0]["/Annots"]] + + writer.remove_links() + assert len(writer.pages[0]["/Annots"]) == 0 + assert len(writer.pages[3]["/Annots"]) == 0 + + writer.remove_annotations("/Text") + + +@pytest.mark.enable_socket() +def test_late_iss1654(): + url = "https://github.com/py-pdf/pypdf/files/10935632/bid1.pdf" + name = "bid1.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter() + writer.clone_document_from_reader(reader) + for p in writer.pages: + 
p.compress_content_streams() + b = BytesIO() + writer.write(b) + + +@pytest.mark.enable_socket() +def test_iss1723(): + # test of an annotation(link) directly stored in the /Annots in the page + url = "https://github.com/py-pdf/pypdf/files/11015242/inputFile.pdf" + name = "iss1723.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter() + writer.append(reader, (3, 5)) + + +@pytest.mark.enable_socket() +def test_iss1767(): + # test with a pdf which is buggy because the object 389,0 exists 3 times: + # twice to define catalog and one as an XObject inducing a loop when + # cloning + url = "https://github.com/py-pdf/pypdf/files/11138472/test.pdf" + name = "iss1723.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + PdfWriter(clone_from=reader) + + +@pytest.mark.enable_socket() +def test_named_dest_page_number(): + """ + Closes iss471 + tests appending with named destinations as integers + """ + url = "https://github.com/py-pdf/pypdf/files/10704333/central.pdf" + name = "central.pdf" + writer = PdfWriter() + writer.add_blank_page(100, 100) + writer.append(BytesIO(get_data_from_url(url, name=name)), pages=[0, 1, 2]) + assert len(writer._root_object["/Names"]["/Dests"]["/Names"]) == 2 + assert writer._root_object["/Names"]["/Dests"]["/Names"][-1][0] == (1 + 1) + writer.append(BytesIO(get_data_from_url(url, name=name))) + assert len(writer._root_object["/Names"]["/Dests"]["/Names"]) == 6 + writer2 = PdfWriter() + writer2.add_blank_page(100, 100) + dest = writer2.add_named_destination("toto", 0) + dest.get_object()[NameObject("/D")][0] = NullObject() + b = BytesIO() + writer2.write(b) + b.seek(0) + writer.append(b) + assert len(writer._root_object["/Names"]["/Dests"]["/Names"]) == 6 + + +@pytest.mark.parametrize( + ("write_data_here", "needs_cleanup"), + [ + ( + "dont_commit_writer.pdf", + True, + ) + ], +) +def test_update_form_fields(write_data_here, needs_cleanup): + writer = PdfWriter(clone_from=RESOURCE_ROOT / 
"FormTestFromOo.pdf") + writer.update_page_form_field_values( + writer.pages[0], + { + "CheckBox1": "/Yes", + "Text1": "mon Text1", + "Text2": "ligne1\nligne2", + "RadioGroup1": "/2", + "RdoS1": "/", + "Combo1": "!!monCombo!!", + "Liste1": "Liste2", + "Liste2": ["Lst1", "Lst3"], + "DropList1": "DropListe3", + }, + auto_regenerate=False, + ) + del writer.pages[0]["/Annots"][1].get_object()["/AP"]["/N"] + writer.update_page_form_field_values( + writer.pages[0], + {"Text1": "my Text1", "Text2": "ligne1\nligne2\nligne3"}, + auto_regenerate=False, + ) + + writer.write("dont_commit_writer.pdf") + reader = PdfReader("dont_commit_writer.pdf") + flds = reader.get_fields() + assert flds["CheckBox1"]["/V"] == "/Yes" + assert flds["CheckBox1"].indirect_reference.get_object()["/AS"] == "/Yes" + assert ( + b"(my Text1)" + in flds["Text1"].indirect_reference.get_object()["/AP"]["/N"].get_data() + ) + assert flds["Text2"]["/V"] == "ligne1\nligne2\nligne3" + assert ( + b"(ligne3)" + in flds["Text2"].indirect_reference.get_object()["/AP"]["/N"].get_data() + ) + assert flds["RadioGroup1"]["/V"] == "/2" + assert flds["RadioGroup1"]["/Kids"][0].get_object()["/AS"] == "/Off" + assert flds["RadioGroup1"]["/Kids"][1].get_object()["/AS"] == "/2" + assert all(x in flds["Liste2"]["/V"] for x in ["Lst1", "Lst3"]) + + assert all(x in flds["CheckBox1"]["/_States_"] for x in ["/Off", "/Yes"]) + assert all(x in flds["RadioGroup1"]["/_States_"] for x in ["/1", "/2", "/3"]) + assert all(x in flds["Liste1"]["/_States_"] for x in ["Liste1", "Liste2", "Liste3"]) + + if needs_cleanup: + Path(write_data_here).unlink() + + +@pytest.mark.enable_socket() +def test_iss1862(): + # The file here has "/B" entry to define the font in a object below the page + # The excluded field shall be considered only at first level (page) and not + # below + url = "https://github.com/py-pdf/pypdf/files/11708801/intro.pdf" + name = "iss1862.pdf" + writer = PdfWriter() + writer.append(BytesIO(get_data_from_url(url, 
name=name))) + # check that "/B" is in the font + writer.pages[0]["/Resources"]["/Font"]["/F1"]["/CharProcs"]["/B"].get_data() + + +def test_empty_objects_before_cloning(): + pdf_path = RESOURCE_ROOT / "crazyones.pdf" + reader = PdfReader(pdf_path) + writer = PdfWriter(clone_from=reader) + nb_obj_reader = len(reader.xref_objStm) + sum( + len(reader.xref[i]) for i in reader.xref + ) + nb_obj_reader -= 1 # for trailer + nb_obj_reader -= len( + {x: 1 for x, y in reader.xref_objStm.values()} + ) # to remove object streams + assert len(writer._objects) == nb_obj_reader + + +@pytest.mark.enable_socket() +def test_watermark(): + url = "https://github.com/py-pdf/pypdf/files/11985889/bg.pdf" + name = "bgwatermark.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + url = "https://github.com/py-pdf/pypdf/files/11985888/source.pdf" + name = "srcwatermark.pdf" + writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name))) + for p in writer.pages: + p.merge_page(reader.pages[0], over=False) + + assert isinstance(p["/Contents"], ArrayObject) + assert isinstance(p["/Contents"][0], IndirectObject) + + b = BytesIO() + writer.write(b) + assert len(b.getvalue()) < 2.1 * 1024 * 1024 + + +@pytest.mark.enable_socket() +@pytest.mark.timeout(4) # this was a lot slower before PR #2086 +def test_watermarking_speed(): + url = "https://github.com/py-pdf/pypdf/files/11985889/bg.pdf" + name = "bgwatermark.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + url = "https://arxiv.org/pdf/2201.00214.pdf" + name = "2201.00214.pdf" + writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name))) + for p in writer.pages: + p.merge_page(reader.pages[0], over=False) + out_pdf_bytesio = BytesIO() + writer.write(out_pdf_bytesio) + pdf_size_in_mib = len(out_pdf_bytesio.getvalue()) / 1024 / 1024 + assert pdf_size_in_mib < 20 + + +@pytest.mark.enable_socket() +@pytest.mark.skipif(GHOSTSCRIPT_BINARY is None, reason="Requires Ghostscript") +def 
test_watermark_rendering(tmp_path): + """Ensure the visual appearance of watermarking stays correct.""" + url = "https://github.com/py-pdf/pypdf/files/11985889/bg.pdf" + name = "bgwatermark.pdf" + watermark = PdfReader(BytesIO(get_data_from_url(url, name=name))).pages[0] + url = "https://github.com/py-pdf/pypdf/files/11985888/source.pdf" + name = "srcwatermark.pdf" + page = PdfReader(BytesIO(get_data_from_url(url, name=name))).pages[0] + writer = PdfWriter() + page.merge_page(watermark, over=False) + writer.add_page(page) + + target_png_path = tmp_path / "target.png" + url = "https://github.com/py-pdf/pypdf/assets/96178532/d5c72d0e-7047-4504-bbf6-bc591c80d7c0" + name = "dstwatermark.png" + target_png_path.write_bytes(get_data_from_url(url, name=name)) + + pdf_path = tmp_path / "out.pdf" + png_path = tmp_path / "out.png" writer.write(pdf_path) # False positive: https://github.com/PyCQA/bandit/issues/333 @@ -1610,205 +1613,205 @@ def test_watermark_rendering(tmp_path): assert png_path.is_file() assert image_similarity(png_path, target_png_path) >= 0.95 - -@pytest.mark.enable_socket() -def test_da_missing_in_annot(): - url = "https://github.com/py-pdf/pypdf/files/12136285/Building.Division.Permit.Application.pdf" - name = "BuildingDivisionPermitApplication.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer = PdfWriter(clone_from=reader) - writer.update_page_form_field_values( - writer.pages[0], {"PCN-1": "0"}, auto_regenerate=False - ) - b = BytesIO() - writer.write(b) - reader = PdfReader(BytesIO(b.getvalue())) - ff = reader.get_fields() - # check for autosize processing - assert ( - b"0 Tf" - not in ff["PCN-1"].indirect_reference.get_object()["/AP"]["/N"].get_data() - ) - f2 = writer.get_object(ff["PCN-2"].indirect_reference.idnum) - f2[NameObject("/Parent")] = writer.get_object( - ff["PCN-1"].indirect_reference.idnum - ).indirect_reference - writer.update_page_form_field_values( - writer.pages[0], {"PCN-2": "1"}, auto_regenerate=False - 
) - - -def test_missing_fields(pdf_file_path): - reader = PdfReader(RESOURCE_ROOT / "form.pdf") - - writer = PdfWriter() - writer.add_page(reader.pages[0]) - - with pytest.raises(PyPdfError) as exc: - writer.update_page_form_field_values( - writer.pages[0], {"foo": "some filled in text"}, flags=1 - ) - assert exc.value.args[0] == "No /AcroForm dictionary in PdfWriter Object" - - writer = PdfWriter() - writer.append(reader, [0]) - del writer._root_object["/AcroForm"]["/Fields"] - with pytest.raises(PyPdfError) as exc: - writer.update_page_form_field_values( - writer.pages[0], {"foo": "some filled in text"}, flags=1 - ) - assert exc.value.args[0] == "No /Fields dictionary in Pdf in PdfWriter Object" - - -def test_missing_info(): - reader = PdfReader(RESOURCE_ROOT / "missing_info.pdf") - - writer = PdfWriter(clone_from=reader) - assert len(writer.pages) == len(reader.pages) - - -@pytest.mark.enable_socket() -def test_germanfields(): - """Cf #2035""" - url = "https://github.com/py-pdf/pypdf/files/12194195/test.pdf" - name = "germanfields.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer = PdfWriter(clone_from=reader) - form_fields = {"Text Box 1": "test æ ø å"} - writer.update_page_form_field_values( - writer.pages[0], form_fields, auto_regenerate=False - ) - bytes_stream = BytesIO() - writer.write(bytes_stream) - bytes_stream.seek(0) - reader2 = PdfReader(bytes_stream) - assert ( - b"test \xe6 \xf8 \xe5" - in reader2.get_fields()["Text Box 1"] - .indirect_reference.get_object()["/AP"]["/N"] - .get_data() - ) - - -@pytest.mark.enable_socket() -def test_no_t_in_articles(): - """Cf #2078""" - url = "https://github.com/py-pdf/pypdf/files/12311735/bad.pdf" - name = "iss2078.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer = PdfWriter() - writer.append(reader) - - -@pytest.mark.enable_socket() -def test_no_i_in_articles(): - """Cf #2089""" - url = "https://github.com/py-pdf/pypdf/files/12352793/kim2002.pdf" - name = 
"iss2089.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer = PdfWriter() - writer.append(reader) - - -@pytest.mark.enable_socket() -def test_damaged_pdf_length_returning_none(): - """ - Cf #140 - https://github.com/py-pdf/pypdf/issues/140#issuecomment-1685380549 - """ - url = "https://github.com/py-pdf/pypdf/files/12168578/bad_pdf_example.pdf" - name = "iss140_bad_pdf.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer = PdfWriter() - writer.append(reader) - - -@pytest.mark.enable_socket() -def test_viewerpreferences(): - """ - Add Tests for ViewerPreferences - https://github.com/py-pdf/pypdf/issues/140#issuecomment-1685380549 - """ - url = "https://github.com/py-pdf/pypdf/files/9175966/2015._pb_decode_pg0.pdf" - name = "2015._pb_decode_pg0.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - v = reader.viewer_preferences - assert v.center_window == True # noqa: E712 - writer = PdfWriter(clone_from=reader) - v = writer.viewer_preferences - assert v.center_window == True # noqa: E712 - v.center_window = False - assert ( - writer._root_object["/ViewerPreferences"]["/CenterWindow"] - == False # noqa: E712 - ) - assert v.print_area == "/CropBox" - with pytest.raises(ValueError): - v.non_fullscreen_pagemode = "toto" - with pytest.raises(ValueError): - v.non_fullscreen_pagemode = "/toto" - v.non_fullscreen_pagemode = "/UseOutlines" - assert ( - writer._root_object["/ViewerPreferences"]["/NonFullScreenPageMode"] - == "/UseOutlines" - ) - writer = PdfWriter(clone_from=reader) - v = writer.viewer_preferences - assert v.center_window == True # noqa: E712 - v.center_window = False - assert ( - writer._root_object["/ViewerPreferences"]["/CenterWindow"] - == False # noqa: E712 - ) - - writer = PdfWriter(clone_from=reader) - writer._root_object[NameObject("/ViewerPreferences")] = writer._add_object( - writer._root_object["/ViewerPreferences"] - ) - v = writer.viewer_preferences - v.center_window = False - 
assert ( - writer._root_object["/ViewerPreferences"]["/CenterWindow"] - == False # noqa: E712 - ) - v.num_copies = 1 - assert v.num_copies == 1 - assert v.print_pagerange is None - with pytest.raises(ValueError): - v.print_pagerange = "toto" - v.print_pagerange = ArrayObject() - assert len(v.print_pagerange) == 0 - - writer.create_viewer_preference() - assert len(writer._root_object["/ViewerPreferences"]) == 0 - - del reader.trailer["/Root"]["/ViewerPreferences"] - assert reader.viewer_preferences is None - writer = PdfWriter(clone_from=reader) - assert writer.viewer_preferences is None - - -def test_extra_spaces_in_da_text(caplog): - writer = PdfWriter(clone_from=RESOURCE_ROOT / "form.pdf") - t = writer.pages[0]["/Annots"][0].get_object()["/DA"] - t = t.replace("/Helv", "/Helv ") - writer.pages[0]["/Annots"][0].get_object()[NameObject("/DA")] = TextStringObject(t) - writer.update_page_form_field_values( - writer.pages[0], {"foo": "abcd"}, auto_regenerate=False - ) - t = writer.pages[0]["/Annots"][0].get_object()["/AP"]["/N"].get_data() - assert "Font dictionary for not found." 
not in caplog.text - assert b"/Helv" in t - assert b"(abcd)" in t - - -@pytest.mark.enable_socket() -def test_object_contains_indirect_reference_to_self(): - url = "https://github.com/py-pdf/pypdf/files/12389243/testbook.pdf" - name = "iss2102.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer = PdfWriter() - width, height = 595, 841 - outpage = writer.add_blank_page(width, height) - outpage.merge_page(reader.pages[6]) - writer.append(reader) + +@pytest.mark.enable_socket() +def test_da_missing_in_annot(): + url = "https://github.com/py-pdf/pypdf/files/12136285/Building.Division.Permit.Application.pdf" + name = "BuildingDivisionPermitApplication.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter(clone_from=reader) + writer.update_page_form_field_values( + writer.pages[0], {"PCN-1": "0"}, auto_regenerate=False + ) + b = BytesIO() + writer.write(b) + reader = PdfReader(BytesIO(b.getvalue())) + ff = reader.get_fields() + # check for autosize processing + assert ( + b"0 Tf" + not in ff["PCN-1"].indirect_reference.get_object()["/AP"]["/N"].get_data() + ) + f2 = writer.get_object(ff["PCN-2"].indirect_reference.idnum) + f2[NameObject("/Parent")] = writer.get_object( + ff["PCN-1"].indirect_reference.idnum + ).indirect_reference + writer.update_page_form_field_values( + writer.pages[0], {"PCN-2": "1"}, auto_regenerate=False + ) + + +def test_missing_fields(pdf_file_path): + reader = PdfReader(RESOURCE_ROOT / "form.pdf") + + writer = PdfWriter() + writer.add_page(reader.pages[0]) + + with pytest.raises(PyPdfError) as exc: + writer.update_page_form_field_values( + writer.pages[0], {"foo": "some filled in text"}, flags=1 + ) + assert exc.value.args[0] == "No /AcroForm dictionary in PdfWriter Object" + + writer = PdfWriter() + writer.append(reader, [0]) + del writer._root_object["/AcroForm"]["/Fields"] + with pytest.raises(PyPdfError) as exc: + writer.update_page_form_field_values( + writer.pages[0], {"foo": 
"some filled in text"}, flags=1 + ) + assert exc.value.args[0] == "No /Fields dictionary in Pdf in PdfWriter Object" + + +def test_missing_info(): + reader = PdfReader(RESOURCE_ROOT / "missing_info.pdf") + + writer = PdfWriter(clone_from=reader) + assert len(writer.pages) == len(reader.pages) + + +@pytest.mark.enable_socket() +def test_germanfields(): + """Cf #2035""" + url = "https://github.com/py-pdf/pypdf/files/12194195/test.pdf" + name = "germanfields.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter(clone_from=reader) + form_fields = {"Text Box 1": "test æ ø å"} + writer.update_page_form_field_values( + writer.pages[0], form_fields, auto_regenerate=False + ) + bytes_stream = BytesIO() + writer.write(bytes_stream) + bytes_stream.seek(0) + reader2 = PdfReader(bytes_stream) + assert ( + b"test \xe6 \xf8 \xe5" + in reader2.get_fields()["Text Box 1"] + .indirect_reference.get_object()["/AP"]["/N"] + .get_data() + ) + + +@pytest.mark.enable_socket() +def test_no_t_in_articles(): + """Cf #2078""" + url = "https://github.com/py-pdf/pypdf/files/12311735/bad.pdf" + name = "iss2078.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter() + writer.append(reader) + + +@pytest.mark.enable_socket() +def test_no_i_in_articles(): + """Cf #2089""" + url = "https://github.com/py-pdf/pypdf/files/12352793/kim2002.pdf" + name = "iss2089.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter() + writer.append(reader) + + +@pytest.mark.enable_socket() +def test_damaged_pdf_length_returning_none(): + """ + Cf #140 + https://github.com/py-pdf/pypdf/issues/140#issuecomment-1685380549 + """ + url = "https://github.com/py-pdf/pypdf/files/12168578/bad_pdf_example.pdf" + name = "iss140_bad_pdf.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter() + writer.append(reader) + + +@pytest.mark.enable_socket() +def test_viewerpreferences(): + """ + 
Add Tests for ViewerPreferences + https://github.com/py-pdf/pypdf/issues/140#issuecomment-1685380549 + """ + url = "https://github.com/py-pdf/pypdf/files/9175966/2015._pb_decode_pg0.pdf" + name = "2015._pb_decode_pg0.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + v = reader.viewer_preferences + assert v.center_window == True # noqa: E712 + writer = PdfWriter(clone_from=reader) + v = writer.viewer_preferences + assert v.center_window == True # noqa: E712 + v.center_window = False + assert ( + writer._root_object["/ViewerPreferences"]["/CenterWindow"] + == False # noqa: E712 + ) + assert v.print_area == "/CropBox" + with pytest.raises(ValueError): + v.non_fullscreen_pagemode = "toto" + with pytest.raises(ValueError): + v.non_fullscreen_pagemode = "/toto" + v.non_fullscreen_pagemode = "/UseOutlines" + assert ( + writer._root_object["/ViewerPreferences"]["/NonFullScreenPageMode"] + == "/UseOutlines" + ) + writer = PdfWriter(clone_from=reader) + v = writer.viewer_preferences + assert v.center_window == True # noqa: E712 + v.center_window = False + assert ( + writer._root_object["/ViewerPreferences"]["/CenterWindow"] + == False # noqa: E712 + ) + + writer = PdfWriter(clone_from=reader) + writer._root_object[NameObject("/ViewerPreferences")] = writer._add_object( + writer._root_object["/ViewerPreferences"] + ) + v = writer.viewer_preferences + v.center_window = False + assert ( + writer._root_object["/ViewerPreferences"]["/CenterWindow"] + == False # noqa: E712 + ) + v.num_copies = 1 + assert v.num_copies == 1 + assert v.print_pagerange is None + with pytest.raises(ValueError): + v.print_pagerange = "toto" + v.print_pagerange = ArrayObject() + assert len(v.print_pagerange) == 0 + + writer.create_viewer_preference() + assert len(writer._root_object["/ViewerPreferences"]) == 0 + + del reader.trailer["/Root"]["/ViewerPreferences"] + assert reader.viewer_preferences is None + writer = PdfWriter(clone_from=reader) + assert writer.viewer_preferences is 
None + + +def test_extra_spaces_in_da_text(caplog): + writer = PdfWriter(clone_from=RESOURCE_ROOT / "form.pdf") + t = writer.pages[0]["/Annots"][0].get_object()["/DA"] + t = t.replace("/Helv", "/Helv ") + writer.pages[0]["/Annots"][0].get_object()[NameObject("/DA")] = TextStringObject(t) + writer.update_page_form_field_values( + writer.pages[0], {"foo": "abcd"}, auto_regenerate=False + ) + t = writer.pages[0]["/Annots"][0].get_object()["/AP"]["/N"].get_data() + assert "Font dictionary for not found." not in caplog.text + assert b"/Helv" in t + assert b"(abcd)" in t + + +@pytest.mark.enable_socket() +def test_object_contains_indirect_reference_to_self(): + url = "https://github.com/py-pdf/pypdf/files/12389243/testbook.pdf" + name = "iss2102.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter() + width, height = 595, 841 + outpage = writer.add_blank_page(width, height) + outpage.merge_page(reader.pages[6]) + writer.append(reader) From 97026c560c27b93001daf188b1ad9249735a553d Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 16 Sep 2023 13:26:52 +0200 Subject: [PATCH 02/13] fix --- pypdf/_reader.py | 12 ++------- pypdf/_writer.py | 45 +++++++++++++++++++++++++++++++ pypdf/generic/_data_structures.py | 17 ++++++------ 3 files changed, 56 insertions(+), 18 deletions(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 01529ec47..dcc324ed5 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -2230,21 +2230,13 @@ def _get_embedded_files_root(self) -> Optional[NameTree]: return NameTree(efo) @property - def detailed_embedded_files(self) -> Optional[Mapping[str, PdfObject]]: + def embedded_files(self) -> Optional[Mapping[str, List[PdfObject]]]: ef = self._get_embedded_files_root() if ef: return ef.list_items() else: return None - @property - def embedded_files(self) -> Optional[Mapping[str, List[bytes]]]: - ef = self._get_embedded_files_root() - if ef: - return {k: 
v["/EF"]["/F"].get_data() for k, v in ef.list_items().items()} # type: ignore - else: - return None - @property def attachments(self) -> Mapping[str, List[bytes]]: ef = self._get_embedded_files_root() @@ -2252,7 +2244,7 @@ def attachments(self) -> Mapping[str, List[bytes]]: d = {} for k, v in ef.list_items().items(): if isinstance(v, list): - d[k] = [e["/EF"]["/F"].get_data() for e in v] + d[k] = [e["/EF"]["/F"].get_data() for e in v] # type: ignore return d else: return {} diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 1d70bba50..befe617d0 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -46,6 +46,7 @@ Dict, Iterable, List, + Mapping, Optional, Pattern, Tuple, @@ -104,6 +105,7 @@ FloatObject, IndirectObject, NameObject, + NameTree, NullObject, NumberObject, PdfObject, @@ -699,6 +701,49 @@ def addJS(self, javascript: str) -> None: # deprecated deprecation_with_replacement("addJS", "add_js", "3.0.0") return self.add_js(javascript) + def _get_embedded_files_root(self) -> Optional[NameTree]: + """ + Returns the EmbeddedFiles root as a NameTree Object + if the root does not exists, return None + """ + catalog = self._root_object + if "/Names" not in catalog: + return None + ef = cast(DictionaryObject, catalog["/Names"]).get("/EmbeddedFiles", None) + if ef is None: + return None + efo = ef.get_object() + # not for reader + """ + if not isinstance(efo,NameTree): + if isinstance(ef,IndirectObject): + ef.replace_object(efo) + else: + cast(DictionaryObject,catalog["/Names"])[ + NameObject("/EmbeddedFiles")] = NameTree(efo) + """ + return NameTree(efo) + + @property + def embedded_files(self) -> Optional[Mapping[str, List[PdfObject]]]: + ef = self._get_embedded_files_root() + if ef: + return ef.list_items() + else: + return None + + @property + def attachments(self) -> Mapping[str, List[bytes]]: + ef = self._get_embedded_files_root() + if ef: + d = {} + for k, v in ef.list_items().items(): + if isinstance(v, list): + d[k] = [e["/EF"]["/F"].get_data() for e in 
v] # type: ignore + return d + else: + return {} + def add_attachment(self, filename: str, data: Union[str, bytes]) -> None: """ Embed a file inside the PDF. diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 59e28250a..dd14945fa 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -38,6 +38,7 @@ Dict, Iterable, List, + Mapping, Optional, Sequence, Tuple, @@ -1490,7 +1491,7 @@ def _list(o: Optional[PdfObject]) -> List[str]: _l.sort() return _l - def list_items(self) -> dict[str, PdfObject]: + def list_items(self) -> Mapping[str, List[PdfObject]]: """ Provides the Name Tree Entries as a dictionary @@ -1499,8 +1500,8 @@ def list_items(self) -> dict[str, PdfObject]: """ def _list( - o: Optional[PdfObject], lout: List[Tuple[str, PdfObject]] - ) -> List[Tuple[str, PdfObject]]: + o: Optional[PdfObject], lout: List[Tuple[str, List[PdfObject]]] + ) -> List[Tuple[str, List[PdfObject]]]: def _append_with_dup( ll: List[Tuple[str, Any]], _l: List[Tuple[str, Any]] ) -> None: @@ -1530,7 +1531,7 @@ def _append_with_dup( _list(x.get_object(), lout) return lout - _l: List[Tuple[str, PdfObject]] = [] + _l: List[Tuple[str, List[PdfObject]]] = [] _list(self, _l) return dict(_l) @@ -1564,7 +1565,7 @@ def _get(key: str, o: Optional[PdfObject]) -> List[PdfObject]: return _get(key, self) - def list_set( + def list_add( self, key: str, data: PdfObject, overwrite: bool = False ) -> Optional[IndirectObject]: """ @@ -1602,7 +1603,7 @@ def _update_limits( return True return False - def _set_in(o: Optional[PdfObject], app: bool = True) -> Optional[PdfObject]: + def _add_in(o: Optional[PdfObject], app: bool = True) -> Optional[PdfObject]: nonlocal overwrite, writer, key, data if o is None: return None @@ -1641,13 +1642,13 @@ def _set_in(o: Optional[PdfObject], app: bool = True) -> Optional[PdfObject]: else: # kids ar = cast(ArrayObject, o["/Kids"]) for x in ar: - r = _set_in(x, x == ar[-1]) + r = _add_in(x, x == ar[-1]) 
if r: _update_limits(o, key, key) return r return None - o = _set_in(self, True) + o = _add_in(self, True) return o.indirect_reference if o is not None else None From 1124824b547dbc6d09a3d4d4bf1297b7840cae60 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 17 Sep 2023 23:03:35 +0200 Subject: [PATCH 03/13] add_attachements and rf --- pypdf/_reader.py | 4647 +++++++++++++++-------------- pypdf/_writer.py | 137 +- pypdf/constants.py | 5 +- pypdf/generic/_data_structures.py | 29 +- 4 files changed, 2439 insertions(+), 2379 deletions(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index dcc324ed5..8bd9e2454 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -1,2316 +1,2331 @@ -# Copyright (c) 2006, Mathieu Fenniak -# Copyright (c) 2007, Ashish Kulkarni -# -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# * The name of the author may not be used to endorse or promote products -# derived from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. - -import os -import re -import struct -import zlib -from datetime import datetime -from io import BytesIO, UnsupportedOperation -from pathlib import Path -from typing import ( - Any, - Callable, - Dict, - Iterable, - List, - Mapping, - Optional, - Tuple, - Union, - cast, -) - -from ._encryption import Encryption, PasswordType -from ._page import PageObject, _VirtualList -from ._page_labels import index2label as page_index2page_label -from ._utils import ( - StrByteType, - StreamType, - b_, - deprecate_no_replacement, - deprecation_no_replacement, - deprecation_with_replacement, - logger_warning, - parse_iso8824_date, - read_non_whitespace, - read_previous_line, - read_until_whitespace, - skip_over_comment, - skip_over_whitespace, -) -from .constants import CatalogAttributes as CA -from .constants import CatalogDictionary as CD -from .constants import ( - CheckboxRadioButtonAttributes, - GoToActionArguments, -) -from .constants import Core as CO -from .constants import DocumentInformationAttributes as DI -from .constants import FieldDictionaryAttributes as FA -from .constants import PageAttributes as PG -from .constants import PagesAttributes as PA -from .constants import TrailerKeys as TK -from .errors import ( - EmptyFileError, - FileNotDecryptedError, - PdfReadError, - PdfStreamError, - WrongPasswordError, -) -from .generic import ( - ArrayObject, - BooleanObject, - ContentStream, - DecodedStreamObject, - Destination, - DictionaryObject, 
- EncodedStreamObject, - Field, - Fit, - FloatObject, - IndirectObject, - NameObject, - NameTree, - NullObject, - NumberObject, - PdfObject, - TextStringObject, - TreeObject, - ViewerPreferences, - read_object, -) -from .types import OutlineType, PagemodeType -from .xmp import XmpInformation - - -def convert_to_int(d: bytes, size: int) -> Union[int, Tuple[Any, ...]]: - if size > 8: - raise PdfReadError("invalid size in convert_to_int") - d = b"\x00\x00\x00\x00\x00\x00\x00\x00" + d - d = d[-8:] - return struct.unpack(">q", d)[0] - - -def convertToInt(d: bytes, size: int) -> Union[int, Tuple[Any, ...]]: # deprecated - deprecation_with_replacement("convertToInt", "convert_to_int") - return convert_to_int(d, size) - - -class DocumentInformation(DictionaryObject): - """ - A class representing the basic document metadata provided in a PDF File. - This class is accessible through - :py:class:`PdfReader.metadata`. - - All text properties of the document metadata have - *two* properties, eg. author and author_raw. The non-raw property will - always return a ``TextStringObject``, making it ideal for a case where - the metadata is being displayed. The raw property can sometimes return - a ``ByteStringObject``, if pypdf was unable to decode the string's - text encoding; this requires additional safety in the caller and - therefore is not as commonly accessed. - """ - - def __init__(self) -> None: - DictionaryObject.__init__(self) - - def _get_text(self, key: str) -> Optional[str]: - retval = self.get(key, None) - if isinstance(retval, TextStringObject): - return retval - return None - - def getText(self, key: str) -> Optional[str]: # deprecated - """ - Use the attributes (e.g. :py:attr:`title` / :py:attr:`author`). - - .. deprecated:: 1.28.0 - """ - deprecation_no_replacement("getText", "3.0.0") - return self._get_text(key) - - @property - def title(self) -> Optional[str]: - """ - Read-only property accessing the document's title. 
- - Returns a ``TextStringObject`` or ``None`` if the title is not - specified. - """ - return ( - self._get_text(DI.TITLE) or self.get(DI.TITLE).get_object() # type: ignore - if self.get(DI.TITLE) - else None - ) - - @property - def title_raw(self) -> Optional[str]: - """The "raw" version of title; can return a ``ByteStringObject``.""" - return self.get(DI.TITLE) - - @property - def author(self) -> Optional[str]: - """ - Read-only property accessing the document's author. - - Returns a ``TextStringObject`` or ``None`` if the author is not - specified. - """ - return self._get_text(DI.AUTHOR) - - @property - def author_raw(self) -> Optional[str]: - """The "raw" version of author; can return a ``ByteStringObject``.""" - return self.get(DI.AUTHOR) - - @property - def subject(self) -> Optional[str]: - """ - Read-only property accessing the document's subject. - - Returns a ``TextStringObject`` or ``None`` if the subject is not - specified. - """ - return self._get_text(DI.SUBJECT) - - @property - def subject_raw(self) -> Optional[str]: - """The "raw" version of subject; can return a ``ByteStringObject``.""" - return self.get(DI.SUBJECT) - - @property - def creator(self) -> Optional[str]: - """ - Read-only property accessing the document's creator. - - If the document was converted to PDF from another format, this is the - name of the application (e.g. OpenOffice) that created the original - document from which it was converted. Returns a ``TextStringObject`` or - ``None`` if the creator is not specified. - """ - return self._get_text(DI.CREATOR) - - @property - def creator_raw(self) -> Optional[str]: - """The "raw" version of creator; can return a ``ByteStringObject``.""" - return self.get(DI.CREATOR) - - @property - def producer(self) -> Optional[str]: - """ - Read-only property accessing the document's producer. - - If the document was converted to PDF from another format, this is the - name of the application (for example, OSX Quartz) that converted it to - PDF. 
Returns a ``TextStringObject`` or ``None`` if the producer is not - specified. - """ - return self._get_text(DI.PRODUCER) - - @property - def producer_raw(self) -> Optional[str]: - """The "raw" version of producer; can return a ``ByteStringObject``.""" - return self.get(DI.PRODUCER) - - @property - def creation_date(self) -> Optional[datetime]: - """Read-only property accessing the document's creation date.""" - return parse_iso8824_date(self._get_text(DI.CREATION_DATE)) - - @property - def creation_date_raw(self) -> Optional[str]: - """ - The "raw" version of creation date; can return a ``ByteStringObject``. - - Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix - is the offset from UTC. - """ - return self.get(DI.CREATION_DATE) - - @property - def modification_date(self) -> Optional[datetime]: - """ - Read-only property accessing the document's modification date. - - The date and time the document was most recently modified. - """ - return parse_iso8824_date(self._get_text(DI.MOD_DATE)) - - @property - def modification_date_raw(self) -> Optional[str]: - """ - The "raw" version of modification date; can return a - ``ByteStringObject``. - - Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix - is the offset from UTC. - """ - return self.get(DI.MOD_DATE) - - -class PdfReader: - """ - Initialize a PdfReader object. - - This operation can take some time, as the PDF stream's cross-reference - tables are read into memory. - - Args: - stream: A File object or an object that supports the standard read - and seek methods similar to a File object. Could also be a - string representing a path to a PDF file. - strict: Determines whether user should be warned of all - problems and also causes some correctable problems to be fatal. - Defaults to ``False``. - password: Decrypt PDF file at initialization. If the - password is None, the file will not be decrypted. 
- Defaults to ``None`` - """ - - @property - def viewer_preferences(self) -> Optional[ViewerPreferences]: - """Returns the existing ViewerPreferences as an overloaded dictionary.""" - o = cast(DictionaryObject, self.trailer["/Root"]).get( - CD.VIEWER_PREFERENCES, None - ) - if o is None: - return None - o = o.get_object() - if not isinstance(o, ViewerPreferences): - o = ViewerPreferences(o) - return o - - def __init__( - self, - stream: Union[StrByteType, Path], - strict: bool = False, - password: Union[None, str, bytes] = None, - ) -> None: - self.strict = strict - self.flattened_pages: Optional[List[PageObject]] = None - self.resolved_objects: Dict[Tuple[Any, Any], Optional[PdfObject]] = {} - self.xref_index = 0 - self._page_id2num: Optional[ - Dict[Any, Any] - ] = None # map page indirect_reference number to Page Number - if hasattr(stream, "mode") and "b" not in stream.mode: # type: ignore - logger_warning( - "PdfReader stream/file object is not in binary mode. " - "It may not be read correctly.", - __name__, - ) - if isinstance(stream, (str, Path)): - with open(stream, "rb") as fh: - stream = BytesIO(fh.read()) - self.read(stream) - self.stream = stream - - self._override_encryption = False - self._encryption: Optional[Encryption] = None - if self.is_encrypted: - self._override_encryption = True - # Some documents may not have a /ID, use two empty - # byte strings instead. 
Solves - # https://github.com/py-pdf/pypdf/issues/608 - id_entry = self.trailer.get(TK.ID) - id1_entry = id_entry[0].get_object().original_bytes if id_entry else b"" - encrypt_entry = cast( - DictionaryObject, self.trailer[TK.ENCRYPT].get_object() - ) - self._encryption = Encryption.read(encrypt_entry, id1_entry) - - # try empty password if no password provided - pwd = password if password is not None else b"" - if ( - self._encryption.verify(pwd) == PasswordType.NOT_DECRYPTED - and password is not None - ): - # raise if password provided - raise WrongPasswordError("Wrong password") - self._override_encryption = False - elif password is not None: - raise PdfReadError("Not encrypted file") - - @property - def pdf_header(self) -> str: - """ - The first 8 bytes of the file. - - This is typically something like ``'%PDF-1.6'`` and can be used to - detect if the file is actually a PDF file and which version it is. - """ - # TODO: Make this return a bytes object for consistency - # but that needs a deprecation - loc = self.stream.tell() - self.stream.seek(0, 0) - pdf_file_version = self.stream.read(8).decode("utf-8", "backslashreplace") - self.stream.seek(loc, 0) # return to where it was - return pdf_file_version - - @property - def metadata(self) -> Optional[DocumentInformation]: - """ - Retrieve the PDF file's document information dictionary, if it exists. - - Note that some PDF files use metadata streams instead of docinfo - dictionaries, and these metadata streams will not be accessed by this - function. - """ - if TK.INFO not in self.trailer: - return None - obj = self.trailer[TK.INFO] - retval = DocumentInformation() - if isinstance(obj, type(None)): - raise PdfReadError( - "trailer not found or does not point to document information directory" - ) - retval.update(obj) # type: ignore - return retval - - def getDocumentInfo(self) -> Optional[DocumentInformation]: # deprecated - """ - Use the attribute :py:attr:`metadata` instead. - - .. 
deprecated:: 1.28.0 - """ - deprecation_with_replacement("getDocumentInfo", "metadata", "3.0.0") - return self.metadata - - @property - def documentInfo(self) -> Optional[DocumentInformation]: # deprecated - """ - Use the attribute :py:attr:`metadata` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement("documentInfo", "metadata", "3.0.0") - return self.metadata - - @property - def xmp_metadata(self) -> Optional[XmpInformation]: - """XMP (Extensible Metadata Platform) data.""" - try: - self._override_encryption = True - return self.trailer[TK.ROOT].xmp_metadata # type: ignore - finally: - self._override_encryption = False - - def getXmpMetadata(self) -> Optional[XmpInformation]: # deprecated - """ - Use the attribute :py:attr:`metadata` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement("getXmpMetadata", "xmp_metadata", "3.0.0") - return self.xmp_metadata - - @property - def xmpMetadata(self) -> Optional[XmpInformation]: # deprecated - """ - Use the attribute :py:attr:`xmp_metadata` instead. - - .. deprecated:: 1.28.0. - """ - deprecation_with_replacement("xmpMetadata", "xmp_metadata", "3.0.0") - return self.xmp_metadata - - def _get_num_pages(self) -> int: - """ - Calculate the number of pages in this PDF file. - - Returns: - The number of pages of the parsed PDF file - - Raises: - PdfReadError: if file is encrypted and restrictions prevent - this action. - """ - # Flattened pages will not work on an Encrypted PDF; - # the PDF file's page count is used in this case. Otherwise, - # the original method (flattened page count) is used. - if self.is_encrypted: - return self.trailer[TK.ROOT]["/Pages"]["/Count"] # type: ignore - else: - if self.flattened_pages is None: - self._flatten() - return len(self.flattened_pages) # type: ignore - - def getNumPages(self) -> int: # deprecated - """ - Use :code:`len(reader.pages)` instead. - - .. 
deprecated:: 1.28.0 - """ - deprecation_with_replacement("reader.getNumPages", "len(reader.pages)", "3.0.0") - return self._get_num_pages() - - @property - def numPages(self) -> int: # deprecated - """ - Use :code:`len(reader.pages)` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement("reader.numPages", "len(reader.pages)", "3.0.0") - return self._get_num_pages() - - def getPage(self, pageNumber: int) -> PageObject: # deprecated - """ - Use :code:`reader.pages[page_number]` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement( - "reader.getPage(pageNumber)", "reader.pages[page_number]", "3.0.0" - ) - return self._get_page(pageNumber) - - def _get_page(self, page_number: int) -> PageObject: - """ - Retrieve a page by number from this PDF file. - - Args: - page_number: The page number to retrieve - (pages begin at zero) - - Returns: - A :class:`PageObject` instance. - """ - if self.flattened_pages is None: - self._flatten() - assert self.flattened_pages is not None, "hint for mypy" - return self.flattened_pages[page_number] - - @property - def namedDestinations(self) -> Dict[str, Any]: # deprecated - """ - Use :py:attr:`named_destinations` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement("namedDestinations", "named_destinations", "3.0.0") - return self.named_destinations - - @property - def named_destinations(self) -> Dict[str, Any]: - """ - A read-only dictionary which maps names to - :class:`Destinations` - """ - return self._get_named_destinations() - - # A select group of relevant field attributes. For the complete list, - # see section 8.6.2 of the PDF 1.7 reference. - - def get_fields( - self, - tree: Optional[TreeObject] = None, - retval: Optional[Dict[Any, Any]] = None, - fileobj: Optional[Any] = None, - ) -> Optional[Dict[str, Any]]: - """ - Extract field data if this PDF contains interactive form fields. - - The *tree* and *retval* parameters are for recursive use. 
- - Args: - tree: - retval: - fileobj: A file object (usually a text file) to write - a report to on all interactive form fields found. - - Returns: - A dictionary where each key is a field name, and each - value is a :class:`Field` object. By - default, the mapping name is used for keys. - ``None`` if form data could not be located. - """ - field_attributes = FA.attributes_dict() - field_attributes.update(CheckboxRadioButtonAttributes.attributes_dict()) - if retval is None: - retval = {} - catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) - # get the AcroForm tree - if CD.ACRO_FORM in catalog: - tree = cast(Optional[TreeObject], catalog[CD.ACRO_FORM]) - else: - return None - if tree is None: - return retval - self._check_kids(tree, retval, fileobj) - for attr in field_attributes: - if attr in tree: - # Tree is a field - self._build_field(tree, retval, fileobj, field_attributes) - break - - if "/Fields" in tree: - fields = cast(ArrayObject, tree["/Fields"]) - for f in fields: - field = f.get_object() - self._build_field(field, retval, fileobj, field_attributes) - - return retval - - def getFields( - self, - tree: Optional[TreeObject] = None, - retval: Optional[Dict[Any, Any]] = None, - fileobj: Optional[Any] = None, - ) -> Optional[Dict[str, Any]]: # deprecated - """ - Use :meth:`get_fields` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement("getFields", "get_fields", "3.0.0") - return self.get_fields(tree, retval, fileobj) - - def _get_qualified_field_name(self, parent: DictionaryObject) -> str: - if "/TM" in parent: - return cast(str, parent["/TM"]) - elif "/Parent" in parent: - return ( - self._get_qualified_field_name( - cast(DictionaryObject, parent["/Parent"]) - ) - + "." 
- + cast(str, parent["/T"]) - ) - else: - return cast(str, parent["/T"]) - - def _build_field( - self, - field: Union[TreeObject, DictionaryObject], - retval: Dict[Any, Any], - fileobj: Any, - field_attributes: Any, - ) -> None: - self._check_kids(field, retval, fileobj) - try: - key = cast(str, field["/TM"]) - except KeyError: - try: - if "/Parent" in field: - key = ( - self._get_qualified_field_name( - cast(DictionaryObject, field["/Parent"]) - ) - + "." - ) - else: - key = "" - key += cast(str, field["/T"]) - except KeyError: - # Ignore no-name field for now - return - if fileobj: - self._write_field(fileobj, field, field_attributes) - fileobj.write("\n") - retval[key] = Field(field) - obj = retval[key].indirect_reference.get_object() # to get the full object - if obj.get(FA.FT, "") == "/Ch": - retval[key][NameObject("/_States_")] = obj[NameObject(FA.Opt)] - if obj.get(FA.FT, "") == "/Btn" and "/AP" in obj: - # Checkbox - retval[key][NameObject("/_States_")] = ArrayObject( - list(obj["/AP"]["/N"].keys()) - ) - if "/Off" not in retval[key]["/_States_"]: - retval[key][NameObject("/_States_")].append(NameObject("/Off")) - elif obj.get(FA.FT, "") == "/Btn" and obj.get(FA.Ff, 0) & FA.FfBits.Radio != 0: - states = [] - for k in obj.get(FA.Kids, {}): - k = k.get_object() - for s in list(k["/AP"]["/N"].keys()): - if s not in states: - states.append(s) - retval[key][NameObject("/_States_")] = ArrayObject(states) - if ( - obj.get(FA.Ff, 0) & FA.FfBits.NoToggleToOff != 0 - and "/Off" in retval[key]["/_States_"] - ): - del retval[key]["/_States_"][retval[key]["/_States_"].index("/Off")] - - def _check_kids( - self, tree: Union[TreeObject, DictionaryObject], retval: Any, fileobj: Any - ) -> None: - if PA.KIDS in tree: - # recurse down the tree - for kid in tree[PA.KIDS]: # type: ignore - self.get_fields(kid.get_object(), retval, fileobj) - - def _write_field(self, fileobj: Any, field: Any, field_attributes: Any) -> None: - field_attributes_tuple = FA.attributes() - 
field_attributes_tuple = ( - field_attributes_tuple + CheckboxRadioButtonAttributes.attributes() - ) - - for attr in field_attributes_tuple: - if attr in ( - FA.Kids, - FA.AA, - ): - continue - attr_name = field_attributes[attr] - try: - if attr == FA.FT: - # Make the field type value more clear - types = { - "/Btn": "Button", - "/Tx": "Text", - "/Ch": "Choice", - "/Sig": "Signature", - } - if field[attr] in types: - fileobj.write(f"{attr_name}: {types[field[attr]]}\n") - elif attr == FA.Parent: - # Let's just write the name of the parent - try: - name = field[attr][FA.TM] - except KeyError: - name = field[attr][FA.T] - fileobj.write(f"{attr_name}: {name}\n") - else: - fileobj.write(f"{attr_name}: {field[attr]}\n") - except KeyError: - # Field attribute is N/A or unknown, so don't write anything - pass - - def get_form_text_fields(self, full_qualified_name: bool = False) -> Dict[str, Any]: - """ - Retrieve form fields from the document with textual data. - - Args: - full_qualified_name: to get full name - - Returns: - A dictionary. The key is the name of the form field, - the value is the content of the field. - - If the document contains multiple form fields with the same name, the - second and following will get the suffix .2, .3, ... - """ - - def indexed_key(k: str, fields: dict) -> str: - if k not in fields: - return k - else: - return ( - k - + "." - + str(sum([1 for kk in fields if kk.startswith(k + ".")]) + 2) - ) - - # Retrieve document form fields - formfields = self.get_fields() - if formfields is None: - return {} - ff = {} - for field, value in formfields.items(): - if value.get("/FT") == "/Tx": - if full_qualified_name: - ff[field] = value.get("/V") - else: - ff[indexed_key(cast(str, value["/T"]), ff)] = value.get("/V") - return ff - - def getFormTextFields(self) -> Dict[str, Any]: # deprecated - """ - Use :meth:`get_form_text_fields` instead. - - .. 
deprecated:: 1.28.0 - """ - deprecation_with_replacement( - "getFormTextFields", "get_form_text_fields", "3.0.0" - ) - return self.get_form_text_fields() - - def _get_named_destinations( - self, - tree: Union[TreeObject, None] = None, - retval: Optional[Any] = None, - ) -> Dict[str, Any]: - """ - Retrieve the named destinations present in the document. - - Args: - tree: - retval: - - Returns: - A dictionary which maps names to - :class:`Destinations`. - """ - if retval is None: - retval = {} - catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) - - # get the name tree - if CA.DESTS in catalog: - tree = cast(TreeObject, catalog[CA.DESTS]) - elif CA.NAMES in catalog: - names = cast(DictionaryObject, catalog[CA.NAMES]) - if CA.DESTS in names: - tree = cast(TreeObject, names[CA.DESTS]) - - if tree is None: - return retval - - if PA.KIDS in tree: - # recurse down the tree - for kid in cast(ArrayObject, tree[PA.KIDS]): - self._get_named_destinations(kid.get_object(), retval) - # TABLE 3.33 Entries in a name tree node dictionary (PDF 1.7 specs) - elif CA.NAMES in tree: # KIDS and NAMES are exclusives (PDF 1.7 specs p 162) - names = cast(DictionaryObject, tree[CA.NAMES]) - i = 0 - while i < len(names): - key = cast(str, names[i].get_object()) - i += 1 - if not isinstance(key, str): - continue - try: - value = names[i].get_object() - except IndexError: - break - i += 1 - if isinstance(value, DictionaryObject) and "/D" in value: - value = value["/D"] - dest = self._build_destination(key, value) # type: ignore - if dest is not None: - retval[key] = dest - else: # case where Dests is in root catalog (PDF 1.7 specs, §2 about PDF1.1 - for k__, v__ in tree.items(): - val = v__.get_object() - if isinstance(val, DictionaryObject): - val = val["/D"].get_object() - dest = self._build_destination(k__, val) - if dest is not None: - retval[k__] = dest - return retval - - def getNamedDestinations( - self, - tree: Union[TreeObject, None] = None, - retval: Optional[Any] = None, - ) -> 
Dict[str, Any]: # deprecated - """ - Use :py:attr:`named_destinations` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement( - "getNamedDestinations", "named_destinations", "3.0.0" - ) - return self._get_named_destinations(tree, retval) - - @property - def outline(self) -> OutlineType: - """ - Read-only property for the outline present in the document. - - (i.e., a collection of 'outline items' which are also known as - 'bookmarks') - """ - return self._get_outline() - - @property - def outlines(self) -> OutlineType: # deprecated - """ - Use :py:attr:`outline` instead. - - .. deprecated:: 2.9.0 - """ - deprecation_with_replacement("outlines", "outline", "3.0.0") - return self.outline - - def _get_outline( - self, node: Optional[DictionaryObject] = None, outline: Optional[Any] = None - ) -> OutlineType: - if outline is None: - outline = [] - catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) - - # get the outline dictionary and named destinations - if CO.OUTLINES in catalog: - lines = cast(DictionaryObject, catalog[CO.OUTLINES]) - - if isinstance(lines, NullObject): - return outline - - # TABLE 8.3 Entries in the outline dictionary - if lines is not None and "/First" in lines: - node = cast(DictionaryObject, lines["/First"]) - self._namedDests = self._get_named_destinations() - - if node is None: - return outline - - # see if there are any more outline items - while True: - outline_obj = self._build_outline_item(node) - if outline_obj: - outline.append(outline_obj) - - # check for sub-outline - if "/First" in node: - sub_outline: List[Any] = [] - self._get_outline(cast(DictionaryObject, node["/First"]), sub_outline) - if sub_outline: - outline.append(sub_outline) - - if "/Next" not in node: - break - node = cast(DictionaryObject, node["/Next"]) - - return outline - - def getOutlines( - self, node: Optional[DictionaryObject] = None, outline: Optional[Any] = None - ) -> OutlineType: # deprecated - """ - Use :py:attr:`outline` instead. - - .. 
deprecated:: 1.28.0 - """ - deprecation_with_replacement("getOutlines", "outline", "3.0.0") - return self._get_outline(node, outline) - - @property - def threads(self) -> Optional[ArrayObject]: - """ - Read-only property for the list of threads. - - See §8.3.2 from PDF 1.7 spec. - - It's an array of dictionaries with "/F" and "/I" properties or - None if there are no articles. - """ - catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) - if CO.THREADS in catalog: - return cast("ArrayObject", catalog[CO.THREADS]) - else: - return None - - def _get_page_number_by_indirect( - self, indirect_reference: Union[None, int, NullObject, IndirectObject] - ) -> int: - """ - Generate _page_id2num. - - Args: - indirect_reference: - - Returns: - The page number. - """ - if self._page_id2num is None: - self._page_id2num = { - x.indirect_reference.idnum: i for i, x in enumerate(self.pages) # type: ignore - } - - if indirect_reference is None or isinstance(indirect_reference, NullObject): - return -1 - if isinstance(indirect_reference, int): - idnum = indirect_reference - else: - idnum = indirect_reference.idnum - assert self._page_id2num is not None, "hint for mypy" - ret = self._page_id2num.get(idnum, -1) - return ret - - def get_page_number(self, page: PageObject) -> int: - """ - Retrieve page number of a given PageObject. - - Args: - page: The page to get page number. Should be - an instance of :class:`PageObject` - - Returns: - The page number or -1 if page is not found - """ - return self._get_page_number_by_indirect(page.indirect_reference) - - def getPageNumber(self, page: PageObject) -> int: # deprecated - """ - Use :meth:`get_page_number` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement("getPageNumber", "get_page_number", "3.0.0") - return self.get_page_number(page) - - def get_destination_page_number(self, destination: Destination) -> int: - """ - Retrieve page number of a given Destination object. 
- - Args: - destination: The destination to get page number. - - Returns: - The page number or -1 if page is not found - """ - return self._get_page_number_by_indirect(destination.page) - - def getDestinationPageNumber(self, destination: Destination) -> int: # deprecated - """ - Use :meth:`get_destination_page_number` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement( - "getDestinationPageNumber", "get_destination_page_number", "3.0.0" - ) - return self.get_destination_page_number(destination) - - def _build_destination( - self, - title: str, - array: Optional[ - List[ - Union[NumberObject, IndirectObject, None, NullObject, DictionaryObject] - ] - ], - ) -> Destination: - page, typ = None, None - # handle outline items with missing or invalid destination - if ( - isinstance(array, (NullObject, str)) - or (isinstance(array, ArrayObject) and len(array) == 0) - or array is None - ): - page = NullObject() - return Destination(title, page, Fit.fit()) - else: - page, typ = array[0:2] # type: ignore - array = array[2:] - try: - return Destination(title, page, Fit(fit_type=typ, fit_args=array)) # type: ignore - except PdfReadError: - logger_warning(f"Unknown destination: {title} {array}", __name__) - if self.strict: - raise - # create a link to first Page - tmp = self.pages[0].indirect_reference - indirect_reference = NullObject() if tmp is None else tmp - return Destination(title, indirect_reference, Fit.fit()) # type: ignore - - def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]: - dest, title, outline_item = None, None, None - - # title required for valid outline - # PDF Reference 1.7: TABLE 8.4 Entries in an outline item dictionary - try: - title = cast("str", node["/Title"]) - except KeyError: - if self.strict: - raise PdfReadError(f"Outline Entry Missing /Title attribute: {node!r}") - title = "" # type: ignore - - if "/A" in node: - # Action, PDFv1.7 Section 12.6 (only type GoTo supported) - action = 
cast(DictionaryObject, node["/A"]) - action_type = cast(NameObject, action[GoToActionArguments.S]) - if action_type == "/GoTo": - dest = action[GoToActionArguments.D] - elif "/Dest" in node: - # Destination, PDFv1.7 Section 12.3.2 - dest = node["/Dest"] - # if array was referenced in another object, will be a dict w/ key "/D" - if isinstance(dest, DictionaryObject) and "/D" in dest: - dest = dest["/D"] - - if isinstance(dest, ArrayObject): - outline_item = self._build_destination(title, dest) - elif isinstance(dest, str): - # named destination, addresses NameObject Issue #193 - # TODO : keep named destination instead of replacing it ? - try: - outline_item = self._build_destination( - title, self._namedDests[dest].dest_array - ) - except KeyError: - # named destination not found in Name Dict - outline_item = self._build_destination(title, None) - elif dest is None: - # outline item not required to have destination or action - # PDFv1.7 Table 153 - outline_item = self._build_destination(title, dest) - else: - if self.strict: - raise PdfReadError(f"Unexpected destination {dest!r}") - else: - logger_warning( - f"Removed unexpected destination {dest!r} from destination", - __name__, - ) - outline_item = self._build_destination(title, None) # type: ignore - - # if outline item created, add color, format, and child count if present - if outline_item: - if "/C" in node: - # Color of outline item font in (R, G, B) with values ranging 0.0-1.0 - outline_item[NameObject("/C")] = ArrayObject(FloatObject(c) for c in node["/C"]) # type: ignore - if "/F" in node: - # specifies style characteristics bold and/or italic - # with 1=italic, 2=bold, 3=both - outline_item[NameObject("/F")] = node["/F"] - if "/Count" in node: - # absolute value = num. 
visible children - # with positive = open/unfolded, negative = closed/folded - outline_item[NameObject("/Count")] = node["/Count"] - # if count is 0 we will consider it as open ( in order to have always an is_open to simplify - outline_item[NameObject("/%is_open%")] = BooleanObject( - node.get("/Count", 0) >= 0 - ) - outline_item.node = node - try: - outline_item.indirect_reference = node.indirect_reference - except AttributeError: - pass - return outline_item - - @property - def pages(self) -> List[PageObject]: - """Read-only property that emulates a list of :py:class:`Page` objects.""" - return _VirtualList(self._get_num_pages, self._get_page) # type: ignore - - @property - def page_labels(self) -> List[str]: - """ - A list of labels for the pages in this document. - - This property is read-only. The labels are in the order that the pages - appear in the document. - """ - return [page_index2page_label(self, i) for i in range(len(self.pages))] - - @property - def page_layout(self) -> Optional[str]: - """ - Get the page layout currently being used. - - .. list-table:: Valid ``layout`` values - :widths: 50 200 - - * - /NoLayout - - Layout explicitly not specified - * - /SinglePage - - Show one page at a time - * - /OneColumn - - Show one column at a time - * - /TwoColumnLeft - - Show pages in two columns, odd-numbered pages on the left - * - /TwoColumnRight - - Show pages in two columns, odd-numbered pages on the right - * - /TwoPageLeft - - Show two pages at a time, odd-numbered pages on the left - * - /TwoPageRight - - Show two pages at a time, odd-numbered pages on the right - """ - trailer = cast(DictionaryObject, self.trailer[TK.ROOT]) - if CD.PAGE_LAYOUT in trailer: - return cast(NameObject, trailer[CD.PAGE_LAYOUT]) - return None - - def getPageLayout(self) -> Optional[str]: # deprecated - """ - Use :py:attr:`page_layout` instead. - - .. 
deprecated:: 1.28.0 - """ - deprecation_with_replacement("getPageLayout", "page_layout", "3.0.0") - return self.page_layout - - @property - def pageLayout(self) -> Optional[str]: # deprecated - """ - Use :py:attr:`page_layout` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement("pageLayout", "page_layout", "3.0.0") - return self.page_layout - - @property - def page_mode(self) -> Optional[PagemodeType]: - """ - Get the page mode currently being used. - - .. list-table:: Valid ``mode`` values - :widths: 50 200 - - * - /UseNone - - Do not show outline or thumbnails panels - * - /UseOutlines - - Show outline (aka bookmarks) panel - * - /UseThumbs - - Show page thumbnails panel - * - /FullScreen - - Fullscreen view - * - /UseOC - - Show Optional Content Group (OCG) panel - * - /UseAttachments - - Show attachments panel - """ - try: - return self.trailer[TK.ROOT]["/PageMode"] # type: ignore - except KeyError: - return None - - def getPageMode(self) -> Optional[PagemodeType]: # deprecated - """ - Use :py:attr:`page_mode` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement("getPageMode", "page_mode", "3.0.0") - return self.page_mode - - @property - def pageMode(self) -> Optional[PagemodeType]: # deprecated - """ - Use :py:attr:`page_mode` instead. - - .. 
deprecated:: 1.28.0 - """ - deprecation_with_replacement("pageMode", "page_mode", "3.0.0") - return self.page_mode - - def _flatten( - self, - pages: Union[None, DictionaryObject, PageObject] = None, - inherit: Optional[Dict[str, Any]] = None, - indirect_reference: Optional[IndirectObject] = None, - ) -> None: - inheritable_page_attributes = ( - NameObject(PG.RESOURCES), - NameObject(PG.MEDIABOX), - NameObject(PG.CROPBOX), - NameObject(PG.ROTATE), - ) - if inherit is None: - inherit = {} - if pages is None: - # Fix issue 327: set flattened_pages attribute only for - # decrypted file - catalog = self.trailer[TK.ROOT].get_object() - pages = catalog["/Pages"].get_object() # type: ignore - self.flattened_pages = [] - - if PA.TYPE in pages: - t = pages[PA.TYPE] # type: ignore - # if pdf has no type, considered as a page if /Kids is missing - elif PA.KIDS not in pages: - t = "/Page" - else: - t = "/Pages" - - if t == "/Pages": - for attr in inheritable_page_attributes: - if attr in pages: - inherit[attr] = pages[attr] - for page in pages[PA.KIDS]: # type: ignore - addt = {} - if isinstance(page, IndirectObject): - addt["indirect_reference"] = page - obj = page.get_object() - if obj: - # damaged file may have invalid child in /Pages - self._flatten(obj, inherit, **addt) - elif t == "/Page": - for attr_in, value in list(inherit.items()): - # if the page has it's own value, it does not inherit the - # parent's value: - if attr_in not in pages: - pages[attr_in] = value - page_obj = PageObject(self, indirect_reference) - page_obj.update(pages) - - # TODO: Could flattened_pages be None at this point? 
- self.flattened_pages.append(page_obj) # type: ignore - - def _get_object_from_stream( - self, indirect_reference: IndirectObject - ) -> Union[int, PdfObject, str]: - # indirect reference to object in object stream - # read the entire object stream into memory - stmnum, idx = self.xref_objStm[indirect_reference.idnum] - obj_stm: EncodedStreamObject = IndirectObject(stmnum, 0, self).get_object() # type: ignore - # This is an xref to a stream, so its type better be a stream - assert cast(str, obj_stm["/Type"]) == "/ObjStm" - # /N is the number of indirect objects in the stream - assert idx < obj_stm["/N"] - stream_data = BytesIO(b_(obj_stm.get_data())) - for i in range(obj_stm["/N"]): # type: ignore - read_non_whitespace(stream_data) - stream_data.seek(-1, 1) - objnum = NumberObject.read_from_stream(stream_data) - read_non_whitespace(stream_data) - stream_data.seek(-1, 1) - offset = NumberObject.read_from_stream(stream_data) - read_non_whitespace(stream_data) - stream_data.seek(-1, 1) - if objnum != indirect_reference.idnum: - # We're only interested in one object - continue - if self.strict and idx != i: - raise PdfReadError("Object is in wrong index.") - stream_data.seek(int(obj_stm["/First"] + offset), 0) # type: ignore - - # to cope with some case where the 'pointer' is on a white space - read_non_whitespace(stream_data) - stream_data.seek(-1, 1) - - try: - obj = read_object(stream_data, self) - except PdfStreamError as exc: - # Stream object cannot be read. Normally, a critical error, but - # Adobe Reader doesn't complain, so continue (in strict mode?) - logger_warning( - f"Invalid stream (index {i}) within object " - f"{indirect_reference.idnum} {indirect_reference.generation}: " - f"{exc}", - __name__, - ) - - if self.strict: - raise PdfReadError(f"Can't read object stream: {exc}") - # Replace with null. Hopefully it's nothing important. 
- obj = NullObject() - return obj - - if self.strict: - raise PdfReadError("This is a fatal error in strict mode.") - return NullObject() - - def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]: - """ - Used to ease development. - - This is equivalent to generic.IndirectObject(num,gen,self).get_object() - - Args: - num: The object number of the indirect object. - gen: The generation number of the indirect object. - - Returns: - A PdfObject - """ - return IndirectObject(num, gen, self).get_object() - - def get_object( - self, indirect_reference: Union[int, IndirectObject] - ) -> Optional[PdfObject]: - if isinstance(indirect_reference, int): - indirect_reference = IndirectObject(indirect_reference, 0, self) - retval = self.cache_get_indirect_object( - indirect_reference.generation, indirect_reference.idnum - ) - if retval is not None: - return retval - if ( - indirect_reference.generation == 0 - and indirect_reference.idnum in self.xref_objStm - ): - retval = self._get_object_from_stream(indirect_reference) # type: ignore - elif ( - indirect_reference.generation in self.xref - and indirect_reference.idnum in self.xref[indirect_reference.generation] - ): - if self.xref_free_entry.get(indirect_reference.generation, {}).get( - indirect_reference.idnum, False - ): - return NullObject() - start = self.xref[indirect_reference.generation][indirect_reference.idnum] - self.stream.seek(start, 0) - try: - idnum, generation = self.read_object_header(self.stream) - except Exception: - if hasattr(self.stream, "getbuffer"): - buf = bytes(self.stream.getbuffer()) # type: ignore - else: - p = self.stream.tell() - self.stream.seek(0, 0) - buf = self.stream.read(-1) - self.stream.seek(p, 0) - m = re.search( - rf"\s{indirect_reference.idnum}\s+{indirect_reference.generation}\s+obj".encode(), - buf, - ) - if m is not None: - logger_warning( - f"Object ID {indirect_reference.idnum},{indirect_reference.generation} ref repaired", - __name__, - ) - 
self.xref[indirect_reference.generation][ - indirect_reference.idnum - ] = (m.start(0) + 1) - self.stream.seek(m.start(0) + 1) - idnum, generation = self.read_object_header(self.stream) - else: - idnum = -1 # exception will be raised below - if idnum != indirect_reference.idnum and self.xref_index: - # Xref table probably had bad indexes due to not being zero-indexed - if self.strict: - raise PdfReadError( - f"Expected object ID ({indirect_reference.idnum} {indirect_reference.generation}) " - f"does not match actual ({idnum} {generation}); " - "xref table not zero-indexed." - ) - # xref table is corrected in non-strict mode - elif idnum != indirect_reference.idnum and self.strict: - # some other problem - raise PdfReadError( - f"Expected object ID ({indirect_reference.idnum} " - f"{indirect_reference.generation}) does not match actual " - f"({idnum} {generation})." - ) - if self.strict: - assert generation == indirect_reference.generation - retval = read_object(self.stream, self) # type: ignore - - # override encryption is used for the /Encrypt dictionary - if not self._override_encryption and self._encryption is not None: - # if we don't have the encryption key: - if not self._encryption.is_decrypted(): - raise FileNotDecryptedError("File has not been decrypted") - # otherwise, decrypt here... 
- retval = cast(PdfObject, retval) - retval = self._encryption.decrypt_object( - retval, indirect_reference.idnum, indirect_reference.generation - ) - else: - if hasattr(self.stream, "getbuffer"): - buf = bytes(self.stream.getbuffer()) # type: ignore - else: - p = self.stream.tell() - self.stream.seek(0, 0) - buf = self.stream.read(-1) - self.stream.seek(p, 0) - m = re.search( - rf"\s{indirect_reference.idnum}\s+{indirect_reference.generation}\s+obj".encode(), - buf, - ) - if m is not None: - logger_warning( - f"Object {indirect_reference.idnum} {indirect_reference.generation} found", - __name__, - ) - if indirect_reference.generation not in self.xref: - self.xref[indirect_reference.generation] = {} - self.xref[indirect_reference.generation][indirect_reference.idnum] = ( - m.start(0) + 1 - ) - self.stream.seek(m.end(0) + 1) - skip_over_whitespace(self.stream) - self.stream.seek(-1, 1) - retval = read_object(self.stream, self) # type: ignore - - # override encryption is used for the /Encrypt dictionary - if not self._override_encryption and self._encryption is not None: - # if we don't have the encryption key: - if not self._encryption.is_decrypted(): - raise FileNotDecryptedError("File has not been decrypted") - # otherwise, decrypt here... - retval = cast(PdfObject, retval) - retval = self._encryption.decrypt_object( - retval, indirect_reference.idnum, indirect_reference.generation - ) - else: - logger_warning( - f"Object {indirect_reference.idnum} {indirect_reference.generation} not defined.", - __name__, - ) - if self.strict: - raise PdfReadError("Could not find object.") - self.cache_indirect_object( - indirect_reference.generation, indirect_reference.idnum, retval - ) - return retval - - def getObject( - self, indirectReference: IndirectObject - ) -> Optional[PdfObject]: # deprecated - """ - Use :meth:`get_object` instead. - - .. 
deprecated:: 1.28.0 - """ - deprecation_with_replacement("getObject", "get_object", "3.0.0") - return self.get_object(indirectReference) - - def read_object_header(self, stream: StreamType) -> Tuple[int, int]: - # Should never be necessary to read out whitespace, since the - # cross-reference table should put us in the right spot to read the - # object header. In reality... some files have stupid cross reference - # tables that are off by whitespace bytes. - extra = False - skip_over_comment(stream) - extra |= skip_over_whitespace(stream) - stream.seek(-1, 1) - idnum = read_until_whitespace(stream) - extra |= skip_over_whitespace(stream) - stream.seek(-1, 1) - generation = read_until_whitespace(stream) - extra |= skip_over_whitespace(stream) - stream.seek(-1, 1) - - # although it's not used, it might still be necessary to read - _obj = stream.read(3) - - read_non_whitespace(stream) - stream.seek(-1, 1) - if extra and self.strict: - logger_warning( - f"Superfluous whitespace found in object header {idnum} {generation}", # type: ignore - __name__, - ) - return int(idnum), int(generation) - - def readObjectHeader(self, stream: StreamType) -> Tuple[int, int]: # deprecated - """ - Use :meth:`read_object_header` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement("readObjectHeader", "read_object_header", "3.0.0") - return self.read_object_header(stream) - - def cache_get_indirect_object( - self, generation: int, idnum: int - ) -> Optional[PdfObject]: - return self.resolved_objects.get((generation, idnum)) - - def cacheGetIndirectObject( - self, generation: int, idnum: int - ) -> Optional[PdfObject]: # deprecated - """ - Use :meth:`cache_get_indirect_object` instead. - - .. 
deprecated:: 1.28.0 - """ - deprecation_with_replacement( - "cacheGetIndirectObject", "cache_get_indirect_object", "3.0.0" - ) - return self.cache_get_indirect_object(generation, idnum) - - def cache_indirect_object( - self, generation: int, idnum: int, obj: Optional[PdfObject] - ) -> Optional[PdfObject]: - if (generation, idnum) in self.resolved_objects: - msg = f"Overwriting cache for {generation} {idnum}" - if self.strict: - raise PdfReadError(msg) - logger_warning(msg, __name__) - self.resolved_objects[(generation, idnum)] = obj - if obj is not None: - obj.indirect_reference = IndirectObject(idnum, generation, self) - return obj - - def cacheIndirectObject( - self, generation: int, idnum: int, obj: Optional[PdfObject] - ) -> Optional[PdfObject]: # deprecated - """ - Use :meth:`cache_indirect_object` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement("cacheIndirectObject", "cache_indirect_object") - return self.cache_indirect_object(generation, idnum, obj) - - def read(self, stream: StreamType) -> None: - self._basic_validation(stream) - self._find_eof_marker(stream) - startxref = self._find_startxref_pos(stream) - - # check and eventually correct the startxref only in not strict - xref_issue_nr = self._get_xref_issues(stream, startxref) - if xref_issue_nr != 0: - if self.strict and xref_issue_nr: - raise PdfReadError("Broken xref table") - logger_warning(f"incorrect startxref pointer({xref_issue_nr})", __name__) - - # read all cross reference tables and their trailers - self._read_xref_tables_and_trailers(stream, startxref, xref_issue_nr) - - # if not zero-indexed, verify that the table is correct; change it if necessary - if self.xref_index and not self.strict: - loc = stream.tell() - for gen, xref_entry in self.xref.items(): - if gen == 65535: - continue - xref_k = sorted( - xref_entry.keys() - ) # must ensure ascendant to prevent damage - for id in xref_k: - stream.seek(xref_entry[id], 0) - try: - pid, _pgen = 
self.read_object_header(stream) - except ValueError: - break - if pid == id - self.xref_index: - # fixing index item per item is required for revised PDF. - self.xref[gen][pid] = self.xref[gen][id] - del self.xref[gen][id] - # if not, then either it's just plain wrong, or the - # non-zero-index is actually correct - stream.seek(loc, 0) # return to where it was - - def _basic_validation(self, stream: StreamType) -> None: - """Ensure file is not empty. Read at most 5 bytes.""" - stream.seek(0, os.SEEK_SET) - try: - header_byte = stream.read(5) - except UnicodeDecodeError: - raise UnsupportedOperation("cannot read header") - if header_byte == b"": - raise EmptyFileError("Cannot read an empty file") - elif header_byte != b"%PDF-": - if self.strict: - raise PdfReadError( - f"PDF starts with '{header_byte.decode('utf8')}', " - "but '%PDF-' expected" - ) - else: - logger_warning(f"invalid pdf header: {header_byte}", __name__) - stream.seek(0, os.SEEK_END) - - def _find_eof_marker(self, stream: StreamType) -> None: - """ - Jump to the %%EOF marker. - - According to the specs, the %%EOF marker should be at the very end of - the file. Hence for standard-compliant PDF documents this function will - read only the last part (DEFAULT_BUFFER_SIZE). - """ - HEADER_SIZE = 8 # to parse whole file, Header is e.g. '%PDF-1.6' - line = b"" - while line[:5] != b"%%EOF": - if stream.tell() < HEADER_SIZE: - if self.strict: - raise PdfReadError("EOF marker not found") - else: - logger_warning("EOF marker not found", __name__) - line = read_previous_line(stream) - - def _find_startxref_pos(self, stream: StreamType) -> int: - """ - Find startxref entry - the location of the xref table. 
- - Args: - stream: - - Returns: - The bytes offset - """ - line = read_previous_line(stream) - try: - startxref = int(line) - except ValueError: - # 'startxref' may be on the same line as the location - if not line.startswith(b"startxref"): - raise PdfReadError("startxref not found") - startxref = int(line[9:].strip()) - logger_warning("startxref on same line as offset", __name__) - else: - line = read_previous_line(stream) - if line[:9] != b"startxref": - raise PdfReadError("startxref not found") - return startxref - - def _read_standard_xref_table(self, stream: StreamType) -> None: - # standard cross-reference table - ref = stream.read(3) - if ref != b"ref": - raise PdfReadError("xref table read error") - read_non_whitespace(stream) - stream.seek(-1, 1) - first_time = True # check if the first time looking at the xref table - while True: - num = cast(int, read_object(stream, self)) - if first_time and num != 0: - self.xref_index = num - if self.strict: - logger_warning( - "Xref table not zero-indexed. ID numbers for objects will be corrected.", - __name__, - ) - # if table not zero indexed, could be due to error from when PDF was created - # which will lead to mismatched indices later on, only warned and corrected if self.strict==True - first_time = False - read_non_whitespace(stream) - stream.seek(-1, 1) - size = cast(int, read_object(stream, self)) - read_non_whitespace(stream) - stream.seek(-1, 1) - cnt = 0 - while cnt < size: - line = stream.read(20) - - # It's very clear in section 3.4.3 of the PDF spec - # that all cross-reference table lines are a fixed - # 20 bytes (as of PDF 1.7). However, some files have - # 21-byte entries (or more) due to the use of \r\n - # (CRLF) EOL's. Detect that case, and adjust the line - # until it does not begin with a \r (CR) or \n (LF). 
- while line[0] in b"\x0D\x0A": - stream.seek(-20 + 1, 1) - line = stream.read(20) - - # On the other hand, some malformed PDF files - # use a single character EOL without a preceding - # space. Detect that case, and seek the stream - # back one character. (0-9 means we've bled into - # the next xref entry, t means we've bled into the - # text "trailer"): - if line[-1] in b"0123456789t": - stream.seek(-1, 1) - - try: - offset_b, generation_b = line[:16].split(b" ") - entry_type_b = line[17:18] - - offset, generation = int(offset_b), int(generation_b) - except Exception: - # if something wrong occurred - if hasattr(stream, "getbuffer"): - buf = bytes(stream.getbuffer()) # type: ignore - else: - p = stream.tell() - stream.seek(0, 0) - buf = stream.read(-1) - stream.seek(p) - - f = re.search(f"{num}\\s+(\\d+)\\s+obj".encode(), buf) - if f is None: - logger_warning( - f"entry {num} in Xref table invalid; object not found", - __name__, - ) - generation = 65535 - offset = -1 - else: - logger_warning( - f"entry {num} in Xref table invalid but object found", - __name__, - ) - generation = int(f.group(1)) - offset = f.start() - - if generation not in self.xref: - self.xref[generation] = {} - self.xref_free_entry[generation] = {} - if num in self.xref[generation]: - # It really seems like we should allow the last - # xref table in the file to override previous - # ones. Since we read the file backwards, assume - # any existing key is already set correctly. - pass - else: - self.xref[generation][num] = offset - try: - self.xref_free_entry[generation][num] = entry_type_b == b"f" - except Exception: - pass - try: - self.xref_free_entry[65535][num] = entry_type_b == b"f" - except Exception: - pass - cnt += 1 - num += 1 - read_non_whitespace(stream) - stream.seek(-1, 1) - trailer_tag = stream.read(7) - if trailer_tag != b"trailer": - # more xrefs! 
- stream.seek(-7, 1) - else: - break - - def _read_xref_tables_and_trailers( - self, stream: StreamType, startxref: Optional[int], xref_issue_nr: int - ) -> None: - self.xref: Dict[int, Dict[Any, Any]] = {} - self.xref_free_entry: Dict[int, Dict[Any, Any]] = {} - self.xref_objStm: Dict[int, Tuple[Any, Any]] = {} - self.trailer = DictionaryObject() - while startxref is not None: - # load the xref table - stream.seek(startxref, 0) - x = stream.read(1) - if x in b"\r\n": - x = stream.read(1) - if x == b"x": - startxref = self._read_xref(stream) - elif xref_issue_nr: - try: - self._rebuild_xref_table(stream) - break - except Exception: - xref_issue_nr = 0 - elif x.isdigit(): - try: - xrefstream = self._read_pdf15_xref_stream(stream) - except Exception as e: - if TK.ROOT in self.trailer: - logger_warning( - f"Previous trailer can not be read {e.args}", - __name__, - ) - break - else: - raise PdfReadError(f"trailer can not be read {e.args}") - trailer_keys = TK.ROOT, TK.ENCRYPT, TK.INFO, TK.ID, TK.SIZE - for key in trailer_keys: - if key in xrefstream and key not in self.trailer: - self.trailer[NameObject(key)] = xrefstream.raw_get(key) - if "/XRefStm" in xrefstream: - p = stream.tell() - stream.seek(cast(int, xrefstream["/XRefStm"]) + 1, 0) - self._read_pdf15_xref_stream(stream) - stream.seek(p, 0) - if "/Prev" in xrefstream: - startxref = cast(int, xrefstream["/Prev"]) - else: - break - else: - startxref = self._read_xref_other_error(stream, startxref) - - def _read_xref(self, stream: StreamType) -> Optional[int]: - self._read_standard_xref_table(stream) - read_non_whitespace(stream) - stream.seek(-1, 1) - new_trailer = cast(Dict[str, Any], read_object(stream, self)) - for key, value in new_trailer.items(): - if key not in self.trailer: - self.trailer[key] = value - if "/XRefStm" in new_trailer: - p = stream.tell() - stream.seek(cast(int, new_trailer["/XRefStm"]) + 1, 0) - try: - self._read_pdf15_xref_stream(stream) - except Exception: - logger_warning( - f"XRef object 
at {new_trailer['/XRefStm']} can not be read, some object may be missing", - __name__, - ) - stream.seek(p, 0) - if "/Prev" in new_trailer: - startxref = new_trailer["/Prev"] - return startxref - else: - return None - - def _read_xref_other_error( - self, stream: StreamType, startxref: int - ) -> Optional[int]: - # some PDFs have /Prev=0 in the trailer, instead of no /Prev - if startxref == 0: - if self.strict: - raise PdfReadError( - "/Prev=0 in the trailer (try opening with strict=False)" - ) - logger_warning( - "/Prev=0 in the trailer - assuming there is no previous xref table", - __name__, - ) - return None - # bad xref character at startxref. Let's see if we can find - # the xref table nearby, as we've observed this error with an - # off-by-one before. - stream.seek(-11, 1) - tmp = stream.read(20) - xref_loc = tmp.find(b"xref") - if xref_loc != -1: - startxref -= 10 - xref_loc - return startxref - # No explicit xref table, try finding a cross-reference stream. - stream.seek(startxref, 0) - for look in range(25): # value extended to cope with more linearized files - if stream.read(1).isdigit(): - # This is not a standard PDF, consider adding a warning - startxref += look - return startxref - # no xref table found at specified location - if "/Root" in self.trailer and not self.strict: - # if Root has been already found, just raise warning - logger_warning("Invalid parent xref., rebuild xref", __name__) - try: - self._rebuild_xref_table(stream) - return None - except Exception: - raise PdfReadError("can not rebuild xref") - raise PdfReadError("Could not find xref table at specified location") - - def _read_pdf15_xref_stream( - self, stream: StreamType - ) -> Union[ContentStream, EncodedStreamObject, DecodedStreamObject]: - # PDF 1.5+ Cross-Reference Stream - stream.seek(-1, 1) - idnum, generation = self.read_object_header(stream) - xrefstream = cast(ContentStream, read_object(stream, self)) - assert cast(str, xrefstream["/Type"]) == "/XRef" - 
self.cache_indirect_object(generation, idnum, xrefstream) - stream_data = BytesIO(b_(xrefstream.get_data())) - # Index pairs specify the subsections in the dictionary. If - # none create one subsection that spans everything. - idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")]) - entry_sizes = cast(Dict[Any, Any], xrefstream.get("/W")) - assert len(entry_sizes) >= 3 - if self.strict and len(entry_sizes) > 3: - raise PdfReadError(f"Too many entry sizes: {entry_sizes}") - - def get_entry(i: int) -> Union[int, Tuple[int, ...]]: - # Reads the correct number of bytes for each entry. See the - # discussion of the W parameter in PDF spec table 17. - if entry_sizes[i] > 0: - d = stream_data.read(entry_sizes[i]) - return convert_to_int(d, entry_sizes[i]) - - # PDF Spec Table 17: A value of zero for an element in the - # W array indicates...the default value shall be used - if i == 0: - return 1 # First value defaults to 1 - else: - return 0 - - def used_before(num: int, generation: Union[int, Tuple[int, ...]]) -> bool: - # We move backwards through the xrefs, don't replace any. - return num in self.xref.get(generation, []) or num in self.xref_objStm # type: ignore - - # Iterate through each subsection - self._read_xref_subsections(idx_pairs, get_entry, used_before) - return xrefstream - - @staticmethod - def _get_xref_issues(stream: StreamType, startxref: int) -> int: - """ - Return an int which indicates an issue. 0 means there is no issue. - - Args: - stream: - startxref: - - Returns: - 0 means no issue, other values represent specific issues. 
- """ - stream.seek(startxref - 1, 0) # -1 to check character before - line = stream.read(1) - if line == b"j": - line = stream.read(1) - if line not in b"\r\n \t": - return 1 - line = stream.read(4) - if line != b"xref": - # not an xref so check if it is an XREF object - line = b"" - while line in b"0123456789 \t": - line = stream.read(1) - if line == b"": - return 2 - line += stream.read(2) # 1 char already read, +2 to check "obj" - if line.lower() != b"obj": - return 3 - return 0 - - def _rebuild_xref_table(self, stream: StreamType) -> None: - self.xref = {} - stream.seek(0, 0) - f_ = stream.read(-1) - - for m in re.finditer(rb"[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj", f_): - idnum = int(m.group(1)) - generation = int(m.group(2)) - if generation not in self.xref: - self.xref[generation] = {} - self.xref[generation][idnum] = m.start(1) - stream.seek(0, 0) - for m in re.finditer(rb"[\r\n \t][ \t]*trailer[\r\n \t]*(<<)", f_): - stream.seek(m.start(1), 0) - new_trailer = cast(Dict[Any, Any], read_object(stream, self)) - # Here, we are parsing the file from start to end, the new data have to erase the existing. 
- for key, value in list(new_trailer.items()): - self.trailer[key] = value - - def _read_xref_subsections( - self, - idx_pairs: List[int], - get_entry: Callable[[int], Union[int, Tuple[int, ...]]], - used_before: Callable[[int, Union[int, Tuple[int, ...]]], bool], - ) -> None: - for start, size in self._pairs(idx_pairs): - # The subsections must increase - for num in range(start, start + size): - # The first entry is the type - xref_type = get_entry(0) - # The rest of the elements depend on the xref_type - if xref_type == 0: - # linked list of free objects - next_free_object = get_entry(1) # noqa: F841 - next_generation = get_entry(2) # noqa: F841 - elif xref_type == 1: - # objects that are in use but are not compressed - byte_offset = get_entry(1) - generation = get_entry(2) - if generation not in self.xref: - self.xref[generation] = {} # type: ignore - if not used_before(num, generation): - self.xref[generation][num] = byte_offset # type: ignore - elif xref_type == 2: - # compressed objects - objstr_num = get_entry(1) - obstr_idx = get_entry(2) - generation = 0 # PDF spec table 18, generation is 0 - if not used_before(num, generation): - self.xref_objStm[num] = (objstr_num, obstr_idx) - elif self.strict: - raise PdfReadError(f"Unknown xref type: {xref_type}") - - def _pairs(self, array: List[int]) -> Iterable[Tuple[int, int]]: - i = 0 - while True: - yield array[i], array[i + 1] - i += 2 - if (i + 1) >= len(array): - break - - def read_next_end_line( - self, stream: StreamType, limit_offset: int = 0 - ) -> bytes: # deprecated - """.. 
deprecated:: 2.1.0""" - deprecate_no_replacement("read_next_end_line", removed_in="4.0.0") - line_parts = [] - while True: - # Prevent infinite loops in malformed PDFs - if stream.tell() == 0 or stream.tell() == limit_offset: - raise PdfReadError("Could not read malformed PDF file") - x = stream.read(1) - if stream.tell() < 2: - raise PdfReadError("EOL marker not found") - stream.seek(-2, 1) - if x in (b"\n", b"\r"): # \n = LF; \r = CR - crlf = False - while x in (b"\n", b"\r"): - x = stream.read(1) - if x in (b"\n", b"\r"): # account for CR+LF - stream.seek(-1, 1) - crlf = True - if stream.tell() < 2: - raise PdfReadError("EOL marker not found") - stream.seek(-2, 1) - stream.seek( - 2 if crlf else 1, 1 - ) # if using CR+LF, go back 2 bytes, else 1 - break - else: - line_parts.append(x) - line_parts.reverse() - return b"".join(line_parts) - - def readNextEndLine( - self, stream: StreamType, limit_offset: int = 0 - ) -> bytes: # deprecated - """.. deprecated:: 1.28.0""" - deprecation_no_replacement("readNextEndLine", "3.0.0") - return self.read_next_end_line(stream, limit_offset) - - def decrypt(self, password: Union[str, bytes]) -> PasswordType: - """ - When using an encrypted / secured PDF file with the PDF Standard - encryption handler, this function will allow the file to be decrypted. - It checks the given password against the document's user password and - owner password, and then stores the resulting decryption key if either - password is correct. - - It does not matter which password was matched. Both passwords provide - the correct decryption key that will allow the document to be used with - this library. - - Args: - password: The password to match. - - Returns: - An indicator if the document was decrypted and weather it was the - owner password or the user password. 
- """ - if not self._encryption: - raise PdfReadError("Not encrypted file") - # TODO: raise Exception for wrong password - return self._encryption.verify(password) - - def decode_permissions(self, permissions_code: int) -> Dict[str, bool]: - # Takes the permissions as an integer, returns the allowed access - permissions = {} - permissions["print"] = permissions_code & (1 << 3 - 1) != 0 # bit 3 - permissions["modify"] = permissions_code & (1 << 4 - 1) != 0 # bit 4 - permissions["copy"] = permissions_code & (1 << 5 - 1) != 0 # bit 5 - permissions["annotations"] = permissions_code & (1 << 6 - 1) != 0 # bit 6 - permissions["forms"] = permissions_code & (1 << 9 - 1) != 0 # bit 9 - permissions["accessability"] = permissions_code & (1 << 10 - 1) != 0 # bit 10 - permissions["assemble"] = permissions_code & (1 << 11 - 1) != 0 # bit 11 - permissions["print_high_quality"] = ( - permissions_code & (1 << 12 - 1) != 0 - ) # bit 12 - return permissions - - @property - def is_encrypted(self) -> bool: - """ - Read-only boolean property showing whether this PDF file is encrypted. - - Note that this property, if true, will remain true even after the - :meth:`decrypt()` method is called. - """ - return TK.ENCRYPT in self.trailer - - def getIsEncrypted(self) -> bool: # deprecated - """ - Use :py:attr:`is_encrypted` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement("getIsEncrypted", "is_encrypted", "3.0.0") - return self.is_encrypted - - @property - def isEncrypted(self) -> bool: # deprecated - """ - Use :py:attr:`is_encrypted` instead. - - .. 
deprecated:: 1.28.0 - """ - deprecation_with_replacement("isEncrypted", "is_encrypted", "3.0.0") - return self.is_encrypted - - @property - def xfa(self) -> Optional[Dict[str, Any]]: - tree: Optional[TreeObject] = None - retval: Dict[str, Any] = {} - catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) - - if "/AcroForm" not in catalog or not catalog["/AcroForm"]: - return None - - tree = cast(TreeObject, catalog["/AcroForm"]) - - if "/XFA" in tree: - fields = cast(ArrayObject, tree["/XFA"]) - i = iter(fields) - for f in i: - tag = f - f = next(i) - if isinstance(f, IndirectObject): - field = cast(Optional[EncodedStreamObject], f.get_object()) - if field: - es = zlib.decompress(b_(field._data)) - retval[tag] = es - return retval - - def add_form_topname(self, name: str) -> Optional[DictionaryObject]: - """ - Add a top level form that groups all form fields below it. - - Args: - name: text string of the "/T" Attribute of the created object - - Returns: - The created object. ``None`` means no object was created. 
- """ - catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) - - if "/AcroForm" not in catalog or not isinstance( - catalog["/AcroForm"], DictionaryObject - ): - return None - acroform = cast(DictionaryObject, catalog[NameObject("/AcroForm")]) - if "/Fields" not in acroform: - # TODO: :No error returns but may be extended for XFA Forms - return None - - interim = DictionaryObject() - interim[NameObject("/T")] = TextStringObject(name) - interim[NameObject("/Kids")] = acroform[NameObject("/Fields")] - self.cache_indirect_object( - 0, - max([i for (g, i) in self.resolved_objects if g == 0]) + 1, - interim, - ) - arr = ArrayObject() - arr.append(interim.indirect_reference) - acroform[NameObject("/Fields")] = arr - for o in cast(ArrayObject, interim["/Kids"]): - obj = o.get_object() - if "/Parent" in obj: - logger_warning( - f"Top Level Form Field {obj.indirect_reference} have a non-expected parent", - __name__, - ) - obj[NameObject("/Parent")] = interim.indirect_reference - return interim - - def rename_form_topname(self, name: str) -> Optional[DictionaryObject]: - """ - Rename top level form field that all form fields below it. - - Args: - name: text string of the "/T" field of the created object - - Returns: - The modified object. ``None`` means no object was modified. 
- """ - catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) - - if "/AcroForm" not in catalog or not isinstance( - catalog["/AcroForm"], DictionaryObject - ): - return None - acroform = cast(DictionaryObject, catalog[NameObject("/AcroForm")]) - if "/Fields" not in acroform: - return None - - interim = cast( - DictionaryObject, - cast(ArrayObject, acroform[NameObject("/Fields")])[0].get_object(), - ) - interim[NameObject("/T")] = TextStringObject(name) - return interim - - def _get_embedded_files_root(self) -> Optional[NameTree]: - """ - Returns the EmbeddedFiles root as a NameTree Object - if the root does not exists, return None - """ - catalog = cast(DictionaryObject, self.trailer["/Root"]) - if "/Names" not in catalog: - return None - ef = cast(DictionaryObject, catalog["/Names"]).get("/EmbeddedFiles", None) - if ef is None: - return None - efo = ef.get_object() - # not for reader - """ - if not isinstance(efo,NameTree): - if isinstance(ef,IndirectObject): - ef.replace_object(efo) - else: - cast(DictionaryObject,catalog["/Names"])[ - NameObject("/EmbeddedFiles")] = NameTree(efo) - """ - return NameTree(efo) - - @property - def embedded_files(self) -> Optional[Mapping[str, List[PdfObject]]]: - ef = self._get_embedded_files_root() - if ef: - return ef.list_items() - else: - return None - - @property - def attachments(self) -> Mapping[str, List[bytes]]: - ef = self._get_embedded_files_root() - if ef: - d = {} - for k, v in ef.list_items().items(): - if isinstance(v, list): - d[k] = [e["/EF"]["/F"].get_data() for e in v] # type: ignore - return d - else: - return {} - - def _list_attachments(self) -> List[str]: - """ - Retrieves the list of filenames of file attachments. 
- - Returns: - list of filenames - """ - ef = self._get_embedded_files_root() - if ef: - lst = ef.list_keys() - else: - lst = [] - """ - for ip, p in enumerate(self.pages): - for a in [_a.get_object() - for _a in p.get("/Annots",[])]: - if _a.get_object().get("/Subtype","") != "/FileAttachements": - continue - lst.append(f"$page_{ip}.{get_name_from_file_specification(_a)}") - """ - return lst - - def _get_attachment_list(self, name: str) -> List[bytes]: - out = self._get_attachments(name)[name] - if isinstance(out, list): - return out - return [out] - - def _get_attachments( - self, filename: Optional[str] = None - ) -> Dict[str, Union[bytes, List[bytes]]]: - """ - Retrieves all or selected file attachments of the PDF as a dictionary of file names - and the file data as a bytestring. - - Args: - filename: If filename is None, then a dictionary of all attachments - will be returned, where the key is the filename and the value - is the content. Otherwise, a dictionary with just a single key - - the filename - and its content will be returned. 
- - Returns: - dictionary of filename -> Union[bytestring or List[ByteString]] - if the filename exists multiple times a List of the different version will be provided - """ - ef = self._get_embedded_files_root() - if ef is None: - return {} - if filename is None: - return {k: v if len(v) > 1 else v[0] for k, v in self.attachments.items()} - else: - lst = ef.list_get(filename) - return { - filename: [x["/EF"]["/F"].get_data() for x in lst] # type: ignore - if isinstance(lst, list) - else lst["/EF"]["/F"].get_data() # type: ignore - } - - -class PdfFileReader(PdfReader): # deprecated - def __init__(self, *args: Any, **kwargs: Any) -> None: - deprecation_with_replacement("PdfFileReader", "PdfReader", "3.0.0") - if "strict" not in kwargs and len(args) < 2: - kwargs["strict"] = True # maintain the default - super().__init__(*args, **kwargs) +# Copyright (c) 2006, Mathieu Fenniak +# Copyright (c) 2007, Ashish Kulkarni +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +import os +import re +import struct +import zlib +from datetime import datetime +from io import BytesIO, UnsupportedOperation +from pathlib import Path +from typing import ( + Any, + Callable, + Dict, + Iterable, + List, + Mapping, + Optional, + Tuple, + Union, + cast, +) + +from ._encryption import Encryption, PasswordType +from ._page import PageObject, _VirtualList +from ._page_labels import index2label as page_index2page_label +from ._utils import ( + StrByteType, + StreamType, + b_, + deprecate_no_replacement, + deprecation_no_replacement, + deprecation_with_replacement, + logger_warning, + parse_iso8824_date, + read_non_whitespace, + read_previous_line, + read_until_whitespace, + skip_over_comment, + skip_over_whitespace, +) +from .constants import CatalogAttributes as CA +from .constants import CatalogDictionary as CD +from .constants import ( + CheckboxRadioButtonAttributes, + GoToActionArguments, +) +from .constants import Core as CO +from .constants import DocumentInformationAttributes as DI +from .constants import FieldDictionaryAttributes as FA +from .constants import PageAttributes as PG +from .constants import PagesAttributes as PA +from .constants import TrailerKeys as TK +from .errors import ( + EmptyFileError, + FileNotDecryptedError, + PdfReadError, + PdfStreamError, + WrongPasswordError, +) +from .generic import ( + ArrayObject, + BooleanObject, + ContentStream, + DecodedStreamObject, + Destination, + DictionaryObject, 
+ EncodedStreamObject, + Field, + Fit, + FloatObject, + IndirectObject, + NameObject, + NameTree, + NullObject, + NumberObject, + PdfObject, + TextStringObject, + TreeObject, + ViewerPreferences, + read_object, +) +from .types import OutlineType, PagemodeType +from .xmp import XmpInformation + + +def convert_to_int(d: bytes, size: int) -> Union[int, Tuple[Any, ...]]: + if size > 8: + raise PdfReadError("invalid size in convert_to_int") + d = b"\x00\x00\x00\x00\x00\x00\x00\x00" + d + d = d[-8:] + return struct.unpack(">q", d)[0] + + +def convertToInt(d: bytes, size: int) -> Union[int, Tuple[Any, ...]]: # deprecated + deprecation_with_replacement("convertToInt", "convert_to_int") + return convert_to_int(d, size) + + +class DocumentInformation(DictionaryObject): + """ + A class representing the basic document metadata provided in a PDF File. + This class is accessible through + :py:class:`PdfReader.metadata`. + + All text properties of the document metadata have + *two* properties, eg. author and author_raw. The non-raw property will + always return a ``TextStringObject``, making it ideal for a case where + the metadata is being displayed. The raw property can sometimes return + a ``ByteStringObject``, if pypdf was unable to decode the string's + text encoding; this requires additional safety in the caller and + therefore is not as commonly accessed. + """ + + def __init__(self) -> None: + DictionaryObject.__init__(self) + + def _get_text(self, key: str) -> Optional[str]: + retval = self.get(key, None) + if isinstance(retval, TextStringObject): + return retval + return None + + def getText(self, key: str) -> Optional[str]: # deprecated + """ + Use the attributes (e.g. :py:attr:`title` / :py:attr:`author`). + + .. deprecated:: 1.28.0 + """ + deprecation_no_replacement("getText", "3.0.0") + return self._get_text(key) + + @property + def title(self) -> Optional[str]: + """ + Read-only property accessing the document's title. 
+ + Returns a ``TextStringObject`` or ``None`` if the title is not + specified. + """ + return ( + self._get_text(DI.TITLE) or self.get(DI.TITLE).get_object() # type: ignore + if self.get(DI.TITLE) + else None + ) + + @property + def title_raw(self) -> Optional[str]: + """The "raw" version of title; can return a ``ByteStringObject``.""" + return self.get(DI.TITLE) + + @property + def author(self) -> Optional[str]: + """ + Read-only property accessing the document's author. + + Returns a ``TextStringObject`` or ``None`` if the author is not + specified. + """ + return self._get_text(DI.AUTHOR) + + @property + def author_raw(self) -> Optional[str]: + """The "raw" version of author; can return a ``ByteStringObject``.""" + return self.get(DI.AUTHOR) + + @property + def subject(self) -> Optional[str]: + """ + Read-only property accessing the document's subject. + + Returns a ``TextStringObject`` or ``None`` if the subject is not + specified. + """ + return self._get_text(DI.SUBJECT) + + @property + def subject_raw(self) -> Optional[str]: + """The "raw" version of subject; can return a ``ByteStringObject``.""" + return self.get(DI.SUBJECT) + + @property + def creator(self) -> Optional[str]: + """ + Read-only property accessing the document's creator. + + If the document was converted to PDF from another format, this is the + name of the application (e.g. OpenOffice) that created the original + document from which it was converted. Returns a ``TextStringObject`` or + ``None`` if the creator is not specified. + """ + return self._get_text(DI.CREATOR) + + @property + def creator_raw(self) -> Optional[str]: + """The "raw" version of creator; can return a ``ByteStringObject``.""" + return self.get(DI.CREATOR) + + @property + def producer(self) -> Optional[str]: + """ + Read-only property accessing the document's producer. + + If the document was converted to PDF from another format, this is the + name of the application (for example, OSX Quartz) that converted it to + PDF. 
Returns a ``TextStringObject`` or ``None`` if the producer is not + specified. + """ + return self._get_text(DI.PRODUCER) + + @property + def producer_raw(self) -> Optional[str]: + """The "raw" version of producer; can return a ``ByteStringObject``.""" + return self.get(DI.PRODUCER) + + @property + def creation_date(self) -> Optional[datetime]: + """Read-only property accessing the document's creation date.""" + return parse_iso8824_date(self._get_text(DI.CREATION_DATE)) + + @property + def creation_date_raw(self) -> Optional[str]: + """ + The "raw" version of creation date; can return a ``ByteStringObject``. + + Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix + is the offset from UTC. + """ + return self.get(DI.CREATION_DATE) + + @property + def modification_date(self) -> Optional[datetime]: + """ + Read-only property accessing the document's modification date. + + The date and time the document was most recently modified. + """ + return parse_iso8824_date(self._get_text(DI.MOD_DATE)) + + @property + def modification_date_raw(self) -> Optional[str]: + """ + The "raw" version of modification date; can return a + ``ByteStringObject``. + + Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix + is the offset from UTC. + """ + return self.get(DI.MOD_DATE) + + +class PdfReader: + """ + Initialize a PdfReader object. + + This operation can take some time, as the PDF stream's cross-reference + tables are read into memory. + + Args: + stream: A File object or an object that supports the standard read + and seek methods similar to a File object. Could also be a + string representing a path to a PDF file. + strict: Determines whether user should be warned of all + problems and also causes some correctable problems to be fatal. + Defaults to ``False``. + password: Decrypt PDF file at initialization. If the + password is None, the file will not be decrypted. 
+ Defaults to ``None`` + """ + + @property + def viewer_preferences(self) -> Optional[ViewerPreferences]: + """Returns the existing ViewerPreferences as an overloaded dictionary.""" + o = cast(DictionaryObject, self.trailer["/Root"]).get( + CD.VIEWER_PREFERENCES, None + ) + if o is None: + return None + o = o.get_object() + if not isinstance(o, ViewerPreferences): + o = ViewerPreferences(o) + return o + + def __init__( + self, + stream: Union[StrByteType, Path], + strict: bool = False, + password: Union[None, str, bytes] = None, + ) -> None: + self.strict = strict + self.flattened_pages: Optional[List[PageObject]] = None + self.resolved_objects: Dict[Tuple[Any, Any], Optional[PdfObject]] = {} + self.xref_index = 0 + self._page_id2num: Optional[ + Dict[Any, Any] + ] = None # map page indirect_reference number to Page Number + if hasattr(stream, "mode") and "b" not in stream.mode: # type: ignore + logger_warning( + "PdfReader stream/file object is not in binary mode. " + "It may not be read correctly.", + __name__, + ) + if isinstance(stream, (str, Path)): + with open(stream, "rb") as fh: + stream = BytesIO(fh.read()) + self.read(stream) + self.stream = stream + + self._override_encryption = False + self._encryption: Optional[Encryption] = None + if self.is_encrypted: + self._override_encryption = True + # Some documents may not have a /ID, use two empty + # byte strings instead. 
Solves + # https://github.com/py-pdf/pypdf/issues/608 + id_entry = self.trailer.get(TK.ID) + id1_entry = id_entry[0].get_object().original_bytes if id_entry else b"" + encrypt_entry = cast( + DictionaryObject, self.trailer[TK.ENCRYPT].get_object() + ) + self._encryption = Encryption.read(encrypt_entry, id1_entry) + + # try empty password if no password provided + pwd = password if password is not None else b"" + if ( + self._encryption.verify(pwd) == PasswordType.NOT_DECRYPTED + and password is not None + ): + # raise if password provided + raise WrongPasswordError("Wrong password") + self._override_encryption = False + elif password is not None: + raise PdfReadError("Not encrypted file") + + @property + def pdf_header(self) -> str: + """ + The first 8 bytes of the file. + + This is typically something like ``'%PDF-1.6'`` and can be used to + detect if the file is actually a PDF file and which version it is. + """ + # TODO: Make this return a bytes object for consistency + # but that needs a deprecation + loc = self.stream.tell() + self.stream.seek(0, 0) + pdf_file_version = self.stream.read(8).decode("utf-8", "backslashreplace") + self.stream.seek(loc, 0) # return to where it was + return pdf_file_version + + @property + def metadata(self) -> Optional[DocumentInformation]: + """ + Retrieve the PDF file's document information dictionary, if it exists. + + Note that some PDF files use metadata streams instead of docinfo + dictionaries, and these metadata streams will not be accessed by this + function. + """ + if TK.INFO not in self.trailer: + return None + obj = self.trailer[TK.INFO] + retval = DocumentInformation() + if isinstance(obj, type(None)): + raise PdfReadError( + "trailer not found or does not point to document information directory" + ) + retval.update(obj) # type: ignore + return retval + + def getDocumentInfo(self) -> Optional[DocumentInformation]: # deprecated + """ + Use the attribute :py:attr:`metadata` instead. + + .. 
deprecated:: 1.28.0 + """ + deprecation_with_replacement("getDocumentInfo", "metadata", "3.0.0") + return self.metadata + + @property + def documentInfo(self) -> Optional[DocumentInformation]: # deprecated + """ + Use the attribute :py:attr:`metadata` instead. + + .. deprecated:: 1.28.0 + """ + deprecation_with_replacement("documentInfo", "metadata", "3.0.0") + return self.metadata + + @property + def xmp_metadata(self) -> Optional[XmpInformation]: + """XMP (Extensible Metadata Platform) data.""" + try: + self._override_encryption = True + return self.trailer[TK.ROOT].xmp_metadata # type: ignore + finally: + self._override_encryption = False + + def getXmpMetadata(self) -> Optional[XmpInformation]: # deprecated + """ + Use the attribute :py:attr:`metadata` instead. + + .. deprecated:: 1.28.0 + """ + deprecation_with_replacement("getXmpMetadata", "xmp_metadata", "3.0.0") + return self.xmp_metadata + + @property + def xmpMetadata(self) -> Optional[XmpInformation]: # deprecated + """ + Use the attribute :py:attr:`xmp_metadata` instead. + + .. deprecated:: 1.28.0. + """ + deprecation_with_replacement("xmpMetadata", "xmp_metadata", "3.0.0") + return self.xmp_metadata + + def _get_num_pages(self) -> int: + """ + Calculate the number of pages in this PDF file. + + Returns: + The number of pages of the parsed PDF file + + Raises: + PdfReadError: if file is encrypted and restrictions prevent + this action. + """ + # Flattened pages will not work on an Encrypted PDF; + # the PDF file's page count is used in this case. Otherwise, + # the original method (flattened page count) is used. + if self.is_encrypted: + return self.trailer[TK.ROOT]["/Pages"]["/Count"] # type: ignore + else: + if self.flattened_pages is None: + self._flatten() + return len(self.flattened_pages) # type: ignore + + def getNumPages(self) -> int: # deprecated + """ + Use :code:`len(reader.pages)` instead. + + .. 
deprecated:: 1.28.0 + """ + deprecation_with_replacement("reader.getNumPages", "len(reader.pages)", "3.0.0") + return self._get_num_pages() + + @property + def numPages(self) -> int: # deprecated + """ + Use :code:`len(reader.pages)` instead. + + .. deprecated:: 1.28.0 + """ + deprecation_with_replacement("reader.numPages", "len(reader.pages)", "3.0.0") + return self._get_num_pages() + + def getPage(self, pageNumber: int) -> PageObject: # deprecated + """ + Use :code:`reader.pages[page_number]` instead. + + .. deprecated:: 1.28.0 + """ + deprecation_with_replacement( + "reader.getPage(pageNumber)", "reader.pages[page_number]", "3.0.0" + ) + return self._get_page(pageNumber) + + def _get_page(self, page_number: int) -> PageObject: + """ + Retrieve a page by number from this PDF file. + + Args: + page_number: The page number to retrieve + (pages begin at zero) + + Returns: + A :class:`PageObject` instance. + """ + if self.flattened_pages is None: + self._flatten() + assert self.flattened_pages is not None, "hint for mypy" + return self.flattened_pages[page_number] + + @property + def namedDestinations(self) -> Dict[str, Any]: # deprecated + """ + Use :py:attr:`named_destinations` instead. + + .. deprecated:: 1.28.0 + """ + deprecation_with_replacement("namedDestinations", "named_destinations", "3.0.0") + return self.named_destinations + + @property + def named_destinations(self) -> Dict[str, Any]: + """ + A read-only dictionary which maps names to + :class:`Destinations` + """ + return self._get_named_destinations() + + # A select group of relevant field attributes. For the complete list, + # see section 8.6.2 of the PDF 1.7 reference. + + def get_fields( + self, + tree: Optional[TreeObject] = None, + retval: Optional[Dict[Any, Any]] = None, + fileobj: Optional[Any] = None, + ) -> Optional[Dict[str, Any]]: + """ + Extract field data if this PDF contains interactive form fields. + + The *tree* and *retval* parameters are for recursive use. 
+ + Args: + tree: + retval: + fileobj: A file object (usually a text file) to write + a report to on all interactive form fields found. + + Returns: + A dictionary where each key is a field name, and each + value is a :class:`Field` object. By + default, the mapping name is used for keys. + ``None`` if form data could not be located. + """ + field_attributes = FA.attributes_dict() + field_attributes.update(CheckboxRadioButtonAttributes.attributes_dict()) + if retval is None: + retval = {} + catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) + # get the AcroForm tree + if CD.ACRO_FORM in catalog: + tree = cast(Optional[TreeObject], catalog[CD.ACRO_FORM]) + else: + return None + if tree is None: + return retval + self._check_kids(tree, retval, fileobj) + for attr in field_attributes: + if attr in tree: + # Tree is a field + self._build_field(tree, retval, fileobj, field_attributes) + break + + if "/Fields" in tree: + fields = cast(ArrayObject, tree["/Fields"]) + for f in fields: + field = f.get_object() + self._build_field(field, retval, fileobj, field_attributes) + + return retval + + def getFields( + self, + tree: Optional[TreeObject] = None, + retval: Optional[Dict[Any, Any]] = None, + fileobj: Optional[Any] = None, + ) -> Optional[Dict[str, Any]]: # deprecated + """ + Use :meth:`get_fields` instead. + + .. deprecated:: 1.28.0 + """ + deprecation_with_replacement("getFields", "get_fields", "3.0.0") + return self.get_fields(tree, retval, fileobj) + + def _get_qualified_field_name(self, parent: DictionaryObject) -> str: + if "/TM" in parent: + return cast(str, parent["/TM"]) + elif "/Parent" in parent: + return ( + self._get_qualified_field_name( + cast(DictionaryObject, parent["/Parent"]) + ) + + "." 
+ + cast(str, parent["/T"]) + ) + else: + return cast(str, parent["/T"]) + + def _build_field( + self, + field: Union[TreeObject, DictionaryObject], + retval: Dict[Any, Any], + fileobj: Any, + field_attributes: Any, + ) -> None: + self._check_kids(field, retval, fileobj) + try: + key = cast(str, field["/TM"]) + except KeyError: + try: + if "/Parent" in field: + key = ( + self._get_qualified_field_name( + cast(DictionaryObject, field["/Parent"]) + ) + + "." + ) + else: + key = "" + key += cast(str, field["/T"]) + except KeyError: + # Ignore no-name field for now + return + if fileobj: + self._write_field(fileobj, field, field_attributes) + fileobj.write("\n") + retval[key] = Field(field) + obj = retval[key].indirect_reference.get_object() # to get the full object + if obj.get(FA.FT, "") == "/Ch": + retval[key][NameObject("/_States_")] = obj[NameObject(FA.Opt)] + if obj.get(FA.FT, "") == "/Btn" and "/AP" in obj: + # Checkbox + retval[key][NameObject("/_States_")] = ArrayObject( + list(obj["/AP"]["/N"].keys()) + ) + if "/Off" not in retval[key]["/_States_"]: + retval[key][NameObject("/_States_")].append(NameObject("/Off")) + elif obj.get(FA.FT, "") == "/Btn" and obj.get(FA.Ff, 0) & FA.FfBits.Radio != 0: + states = [] + for k in obj.get(FA.Kids, {}): + k = k.get_object() + for s in list(k["/AP"]["/N"].keys()): + if s not in states: + states.append(s) + retval[key][NameObject("/_States_")] = ArrayObject(states) + if ( + obj.get(FA.Ff, 0) & FA.FfBits.NoToggleToOff != 0 + and "/Off" in retval[key]["/_States_"] + ): + del retval[key]["/_States_"][retval[key]["/_States_"].index("/Off")] + + def _check_kids( + self, tree: Union[TreeObject, DictionaryObject], retval: Any, fileobj: Any + ) -> None: + if PA.KIDS in tree: + # recurse down the tree + for kid in tree[PA.KIDS]: # type: ignore + self.get_fields(kid.get_object(), retval, fileobj) + + def _write_field(self, fileobj: Any, field: Any, field_attributes: Any) -> None: + field_attributes_tuple = FA.attributes() + 
field_attributes_tuple = ( + field_attributes_tuple + CheckboxRadioButtonAttributes.attributes() + ) + + for attr in field_attributes_tuple: + if attr in ( + FA.Kids, + FA.AA, + ): + continue + attr_name = field_attributes[attr] + try: + if attr == FA.FT: + # Make the field type value more clear + types = { + "/Btn": "Button", + "/Tx": "Text", + "/Ch": "Choice", + "/Sig": "Signature", + } + if field[attr] in types: + fileobj.write(f"{attr_name}: {types[field[attr]]}\n") + elif attr == FA.Parent: + # Let's just write the name of the parent + try: + name = field[attr][FA.TM] + except KeyError: + name = field[attr][FA.T] + fileobj.write(f"{attr_name}: {name}\n") + else: + fileobj.write(f"{attr_name}: {field[attr]}\n") + except KeyError: + # Field attribute is N/A or unknown, so don't write anything + pass + + def get_form_text_fields(self, full_qualified_name: bool = False) -> Dict[str, Any]: + """ + Retrieve form fields from the document with textual data. + + Args: + full_qualified_name: to get full name + + Returns: + A dictionary. The key is the name of the form field, + the value is the content of the field. + + If the document contains multiple form fields with the same name, the + second and following will get the suffix .2, .3, ... + """ + + def indexed_key(k: str, fields: dict) -> str: + if k not in fields: + return k + else: + return ( + k + + "." + + str(sum([1 for kk in fields if kk.startswith(k + ".")]) + 2) + ) + + # Retrieve document form fields + formfields = self.get_fields() + if formfields is None: + return {} + ff = {} + for field, value in formfields.items(): + if value.get("/FT") == "/Tx": + if full_qualified_name: + ff[field] = value.get("/V") + else: + ff[indexed_key(cast(str, value["/T"]), ff)] = value.get("/V") + return ff + + def getFormTextFields(self) -> Dict[str, Any]: # deprecated + """ + Use :meth:`get_form_text_fields` instead. + + .. 
deprecated:: 1.28.0 + """ + deprecation_with_replacement( + "getFormTextFields", "get_form_text_fields", "3.0.0" + ) + return self.get_form_text_fields() + + def _get_named_destinations( + self, + tree: Union[TreeObject, None] = None, + retval: Optional[Any] = None, + ) -> Dict[str, Any]: + """ + Retrieve the named destinations present in the document. + + Args: + tree: + retval: + + Returns: + A dictionary which maps names to + :class:`Destinations`. + """ + if retval is None: + retval = {} + catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) + + # get the name tree + if CA.DESTS in catalog: + tree = cast(TreeObject, catalog[CA.DESTS]) + elif CA.NAMES in catalog: + names = cast(DictionaryObject, catalog[CA.NAMES]) + if CA.DESTS in names: + tree = cast(TreeObject, names[CA.DESTS]) + + if tree is None: + return retval + + if PA.KIDS in tree: + # recurse down the tree + for kid in cast(ArrayObject, tree[PA.KIDS]): + self._get_named_destinations(kid.get_object(), retval) + # TABLE 3.33 Entries in a name tree node dictionary (PDF 1.7 specs) + elif CA.NAMES in tree: # KIDS and NAMES are exclusives (PDF 1.7 specs p 162) + names = cast(DictionaryObject, tree[CA.NAMES]) + i = 0 + while i < len(names): + key = cast(str, names[i].get_object()) + i += 1 + if not isinstance(key, str): + continue + try: + value = names[i].get_object() + except IndexError: + break + i += 1 + if isinstance(value, DictionaryObject) and "/D" in value: + value = value["/D"] + dest = self._build_destination(key, value) # type: ignore + if dest is not None: + retval[key] = dest + else: # case where Dests is in root catalog (PDF 1.7 specs, §2 about PDF1.1 + for k__, v__ in tree.items(): + val = v__.get_object() + if isinstance(val, DictionaryObject): + val = val["/D"].get_object() + dest = self._build_destination(k__, val) + if dest is not None: + retval[k__] = dest + return retval + + def getNamedDestinations( + self, + tree: Union[TreeObject, None] = None, + retval: Optional[Any] = None, + ) -> 
Dict[str, Any]: # deprecated + """ + Use :py:attr:`named_destinations` instead. + + .. deprecated:: 1.28.0 + """ + deprecation_with_replacement( + "getNamedDestinations", "named_destinations", "3.0.0" + ) + return self._get_named_destinations(tree, retval) + + @property + def outline(self) -> OutlineType: + """ + Read-only property for the outline present in the document. + + (i.e., a collection of 'outline items' which are also known as + 'bookmarks') + """ + return self._get_outline() + + @property + def outlines(self) -> OutlineType: # deprecated + """ + Use :py:attr:`outline` instead. + + .. deprecated:: 2.9.0 + """ + deprecation_with_replacement("outlines", "outline", "3.0.0") + return self.outline + + def _get_outline( + self, node: Optional[DictionaryObject] = None, outline: Optional[Any] = None + ) -> OutlineType: + if outline is None: + outline = [] + catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) + + # get the outline dictionary and named destinations + if CO.OUTLINES in catalog: + lines = cast(DictionaryObject, catalog[CO.OUTLINES]) + + if isinstance(lines, NullObject): + return outline + + # TABLE 8.3 Entries in the outline dictionary + if lines is not None and "/First" in lines: + node = cast(DictionaryObject, lines["/First"]) + self._namedDests = self._get_named_destinations() + + if node is None: + return outline + + # see if there are any more outline items + while True: + outline_obj = self._build_outline_item(node) + if outline_obj: + outline.append(outline_obj) + + # check for sub-outline + if "/First" in node: + sub_outline: List[Any] = [] + self._get_outline(cast(DictionaryObject, node["/First"]), sub_outline) + if sub_outline: + outline.append(sub_outline) + + if "/Next" not in node: + break + node = cast(DictionaryObject, node["/Next"]) + + return outline + + def getOutlines( + self, node: Optional[DictionaryObject] = None, outline: Optional[Any] = None + ) -> OutlineType: # deprecated + """ + Use :py:attr:`outline` instead. + + .. 
deprecated:: 1.28.0 + """ + deprecation_with_replacement("getOutlines", "outline", "3.0.0") + return self._get_outline(node, outline) + + @property + def threads(self) -> Optional[ArrayObject]: + """ + Read-only property for the list of threads. + + See §8.3.2 from PDF 1.7 spec. + + It's an array of dictionaries with "/F" and "/I" properties or + None if there are no articles. + """ + catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) + if CO.THREADS in catalog: + return cast("ArrayObject", catalog[CO.THREADS]) + else: + return None + + def _get_page_number_by_indirect( + self, indirect_reference: Union[None, int, NullObject, IndirectObject] + ) -> int: + """ + Generate _page_id2num. + + Args: + indirect_reference: + + Returns: + The page number. + """ + if self._page_id2num is None: + self._page_id2num = { + x.indirect_reference.idnum: i for i, x in enumerate(self.pages) # type: ignore + } + + if indirect_reference is None or isinstance(indirect_reference, NullObject): + return -1 + if isinstance(indirect_reference, int): + idnum = indirect_reference + else: + idnum = indirect_reference.idnum + assert self._page_id2num is not None, "hint for mypy" + ret = self._page_id2num.get(idnum, -1) + return ret + + def get_page_number(self, page: PageObject) -> int: + """ + Retrieve page number of a given PageObject. + + Args: + page: The page to get page number. Should be + an instance of :class:`PageObject` + + Returns: + The page number or -1 if page is not found + """ + return self._get_page_number_by_indirect(page.indirect_reference) + + def getPageNumber(self, page: PageObject) -> int: # deprecated + """ + Use :meth:`get_page_number` instead. + + .. deprecated:: 1.28.0 + """ + deprecation_with_replacement("getPageNumber", "get_page_number", "3.0.0") + return self.get_page_number(page) + + def get_destination_page_number(self, destination: Destination) -> int: + """ + Retrieve page number of a given Destination object. 
+ + Args: + destination: The destination to get page number. + + Returns: + The page number or -1 if page is not found + """ + return self._get_page_number_by_indirect(destination.page) + + def getDestinationPageNumber(self, destination: Destination) -> int: # deprecated + """ + Use :meth:`get_destination_page_number` instead. + + .. deprecated:: 1.28.0 + """ + deprecation_with_replacement( + "getDestinationPageNumber", "get_destination_page_number", "3.0.0" + ) + return self.get_destination_page_number(destination) + + def _build_destination( + self, + title: str, + array: Optional[ + List[ + Union[NumberObject, IndirectObject, None, NullObject, DictionaryObject] + ] + ], + ) -> Destination: + page, typ = None, None + # handle outline items with missing or invalid destination + if ( + isinstance(array, (NullObject, str)) + or (isinstance(array, ArrayObject) and len(array) == 0) + or array is None + ): + page = NullObject() + return Destination(title, page, Fit.fit()) + else: + page, typ = array[0:2] # type: ignore + array = array[2:] + try: + return Destination(title, page, Fit(fit_type=typ, fit_args=array)) # type: ignore + except PdfReadError: + logger_warning(f"Unknown destination: {title} {array}", __name__) + if self.strict: + raise + # create a link to first Page + tmp = self.pages[0].indirect_reference + indirect_reference = NullObject() if tmp is None else tmp + return Destination(title, indirect_reference, Fit.fit()) # type: ignore + + def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]: + dest, title, outline_item = None, None, None + + # title required for valid outline + # PDF Reference 1.7: TABLE 8.4 Entries in an outline item dictionary + try: + title = cast("str", node["/Title"]) + except KeyError: + if self.strict: + raise PdfReadError(f"Outline Entry Missing /Title attribute: {node!r}") + title = "" # type: ignore + + if "/A" in node: + # Action, PDFv1.7 Section 12.6 (only type GoTo supported) + action = 
cast(DictionaryObject, node["/A"]) + action_type = cast(NameObject, action[GoToActionArguments.S]) + if action_type == "/GoTo": + dest = action[GoToActionArguments.D] + elif "/Dest" in node: + # Destination, PDFv1.7 Section 12.3.2 + dest = node["/Dest"] + # if array was referenced in another object, will be a dict w/ key "/D" + if isinstance(dest, DictionaryObject) and "/D" in dest: + dest = dest["/D"] + + if isinstance(dest, ArrayObject): + outline_item = self._build_destination(title, dest) + elif isinstance(dest, str): + # named destination, addresses NameObject Issue #193 + # TODO : keep named destination instead of replacing it ? + try: + outline_item = self._build_destination( + title, self._namedDests[dest].dest_array + ) + except KeyError: + # named destination not found in Name Dict + outline_item = self._build_destination(title, None) + elif dest is None: + # outline item not required to have destination or action + # PDFv1.7 Table 153 + outline_item = self._build_destination(title, dest) + else: + if self.strict: + raise PdfReadError(f"Unexpected destination {dest!r}") + else: + logger_warning( + f"Removed unexpected destination {dest!r} from destination", + __name__, + ) + outline_item = self._build_destination(title, None) # type: ignore + + # if outline item created, add color, format, and child count if present + if outline_item: + if "/C" in node: + # Color of outline item font in (R, G, B) with values ranging 0.0-1.0 + outline_item[NameObject("/C")] = ArrayObject(FloatObject(c) for c in node["/C"]) # type: ignore + if "/F" in node: + # specifies style characteristics bold and/or italic + # with 1=italic, 2=bold, 3=both + outline_item[NameObject("/F")] = node["/F"] + if "/Count" in node: + # absolute value = num. 
visible children + # with positive = open/unfolded, negative = closed/folded + outline_item[NameObject("/Count")] = node["/Count"] + # if count is 0 we will consider it as open ( in order to have always an is_open to simplify + outline_item[NameObject("/%is_open%")] = BooleanObject( + node.get("/Count", 0) >= 0 + ) + outline_item.node = node + try: + outline_item.indirect_reference = node.indirect_reference + except AttributeError: + pass + return outline_item + + @property + def pages(self) -> List[PageObject]: + """Read-only property that emulates a list of :py:class:`Page` objects.""" + return _VirtualList(self._get_num_pages, self._get_page) # type: ignore + + @property + def page_labels(self) -> List[str]: + """ + A list of labels for the pages in this document. + + This property is read-only. The labels are in the order that the pages + appear in the document. + """ + return [page_index2page_label(self, i) for i in range(len(self.pages))] + + @property + def page_layout(self) -> Optional[str]: + """ + Get the page layout currently being used. + + .. list-table:: Valid ``layout`` values + :widths: 50 200 + + * - /NoLayout + - Layout explicitly not specified + * - /SinglePage + - Show one page at a time + * - /OneColumn + - Show one column at a time + * - /TwoColumnLeft + - Show pages in two columns, odd-numbered pages on the left + * - /TwoColumnRight + - Show pages in two columns, odd-numbered pages on the right + * - /TwoPageLeft + - Show two pages at a time, odd-numbered pages on the left + * - /TwoPageRight + - Show two pages at a time, odd-numbered pages on the right + """ + trailer = cast(DictionaryObject, self.trailer[TK.ROOT]) + if CD.PAGE_LAYOUT in trailer: + return cast(NameObject, trailer[CD.PAGE_LAYOUT]) + return None + + def getPageLayout(self) -> Optional[str]: # deprecated + """ + Use :py:attr:`page_layout` instead. + + .. 
deprecated:: 1.28.0 + """ + deprecation_with_replacement("getPageLayout", "page_layout", "3.0.0") + return self.page_layout + + @property + def pageLayout(self) -> Optional[str]: # deprecated + """ + Use :py:attr:`page_layout` instead. + + .. deprecated:: 1.28.0 + """ + deprecation_with_replacement("pageLayout", "page_layout", "3.0.0") + return self.page_layout + + @property + def page_mode(self) -> Optional[PagemodeType]: + """ + Get the page mode currently being used. + + .. list-table:: Valid ``mode`` values + :widths: 50 200 + + * - /UseNone + - Do not show outline or thumbnails panels + * - /UseOutlines + - Show outline (aka bookmarks) panel + * - /UseThumbs + - Show page thumbnails panel + * - /FullScreen + - Fullscreen view + * - /UseOC + - Show Optional Content Group (OCG) panel + * - /UseAttachments + - Show attachments panel + """ + try: + return self.trailer[TK.ROOT]["/PageMode"] # type: ignore + except KeyError: + return None + + def getPageMode(self) -> Optional[PagemodeType]: # deprecated + """ + Use :py:attr:`page_mode` instead. + + .. deprecated:: 1.28.0 + """ + deprecation_with_replacement("getPageMode", "page_mode", "3.0.0") + return self.page_mode + + @property + def pageMode(self) -> Optional[PagemodeType]: # deprecated + """ + Use :py:attr:`page_mode` instead. + + .. 
deprecated:: 1.28.0 + """ + deprecation_with_replacement("pageMode", "page_mode", "3.0.0") + return self.page_mode + + def _flatten( + self, + pages: Union[None, DictionaryObject, PageObject] = None, + inherit: Optional[Dict[str, Any]] = None, + indirect_reference: Optional[IndirectObject] = None, + ) -> None: + inheritable_page_attributes = ( + NameObject(PG.RESOURCES), + NameObject(PG.MEDIABOX), + NameObject(PG.CROPBOX), + NameObject(PG.ROTATE), + ) + if inherit is None: + inherit = {} + if pages is None: + # Fix issue 327: set flattened_pages attribute only for + # decrypted file + catalog = self.trailer[TK.ROOT].get_object() + pages = catalog["/Pages"].get_object() # type: ignore + self.flattened_pages = [] + + if PA.TYPE in pages: + t = pages[PA.TYPE] # type: ignore + # if pdf has no type, considered as a page if /Kids is missing + elif PA.KIDS not in pages: + t = "/Page" + else: + t = "/Pages" + + if t == "/Pages": + for attr in inheritable_page_attributes: + if attr in pages: + inherit[attr] = pages[attr] + for page in pages[PA.KIDS]: # type: ignore + addt = {} + if isinstance(page, IndirectObject): + addt["indirect_reference"] = page + obj = page.get_object() + if obj: + # damaged file may have invalid child in /Pages + self._flatten(obj, inherit, **addt) + elif t == "/Page": + for attr_in, value in list(inherit.items()): + # if the page has it's own value, it does not inherit the + # parent's value: + if attr_in not in pages: + pages[attr_in] = value + page_obj = PageObject(self, indirect_reference) + page_obj.update(pages) + + # TODO: Could flattened_pages be None at this point? 
+ self.flattened_pages.append(page_obj) # type: ignore + + def _get_object_from_stream( + self, indirect_reference: IndirectObject + ) -> Union[int, PdfObject, str]: + # indirect reference to object in object stream + # read the entire object stream into memory + stmnum, idx = self.xref_objStm[indirect_reference.idnum] + obj_stm: EncodedStreamObject = IndirectObject(stmnum, 0, self).get_object() # type: ignore + # This is an xref to a stream, so its type better be a stream + assert cast(str, obj_stm["/Type"]) == "/ObjStm" + # /N is the number of indirect objects in the stream + assert idx < obj_stm["/N"] + stream_data = BytesIO(b_(obj_stm.get_data())) + for i in range(obj_stm["/N"]): # type: ignore + read_non_whitespace(stream_data) + stream_data.seek(-1, 1) + objnum = NumberObject.read_from_stream(stream_data) + read_non_whitespace(stream_data) + stream_data.seek(-1, 1) + offset = NumberObject.read_from_stream(stream_data) + read_non_whitespace(stream_data) + stream_data.seek(-1, 1) + if objnum != indirect_reference.idnum: + # We're only interested in one object + continue + if self.strict and idx != i: + raise PdfReadError("Object is in wrong index.") + stream_data.seek(int(obj_stm["/First"] + offset), 0) # type: ignore + + # to cope with some case where the 'pointer' is on a white space + read_non_whitespace(stream_data) + stream_data.seek(-1, 1) + + try: + obj = read_object(stream_data, self) + except PdfStreamError as exc: + # Stream object cannot be read. Normally, a critical error, but + # Adobe Reader doesn't complain, so continue (in strict mode?) + logger_warning( + f"Invalid stream (index {i}) within object " + f"{indirect_reference.idnum} {indirect_reference.generation}: " + f"{exc}", + __name__, + ) + + if self.strict: + raise PdfReadError(f"Can't read object stream: {exc}") + # Replace with null. Hopefully it's nothing important. 
+ obj = NullObject() + return obj + + if self.strict: + raise PdfReadError("This is a fatal error in strict mode.") + return NullObject() + + def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]: + """ + Used to ease development. + + This is equivalent to generic.IndirectObject(num,gen,self).get_object() + + Args: + num: The object number of the indirect object. + gen: The generation number of the indirect object. + + Returns: + A PdfObject + """ + return IndirectObject(num, gen, self).get_object() + + def get_object( + self, indirect_reference: Union[int, IndirectObject] + ) -> Optional[PdfObject]: + if isinstance(indirect_reference, int): + indirect_reference = IndirectObject(indirect_reference, 0, self) + retval = self.cache_get_indirect_object( + indirect_reference.generation, indirect_reference.idnum + ) + if retval is not None: + return retval + if ( + indirect_reference.generation == 0 + and indirect_reference.idnum in self.xref_objStm + ): + retval = self._get_object_from_stream(indirect_reference) # type: ignore + elif ( + indirect_reference.generation in self.xref + and indirect_reference.idnum in self.xref[indirect_reference.generation] + ): + if self.xref_free_entry.get(indirect_reference.generation, {}).get( + indirect_reference.idnum, False + ): + return NullObject() + start = self.xref[indirect_reference.generation][indirect_reference.idnum] + self.stream.seek(start, 0) + try: + idnum, generation = self.read_object_header(self.stream) + except Exception: + if hasattr(self.stream, "getbuffer"): + buf = bytes(self.stream.getbuffer()) # type: ignore + else: + p = self.stream.tell() + self.stream.seek(0, 0) + buf = self.stream.read(-1) + self.stream.seek(p, 0) + m = re.search( + rf"\s{indirect_reference.idnum}\s+{indirect_reference.generation}\s+obj".encode(), + buf, + ) + if m is not None: + logger_warning( + f"Object ID {indirect_reference.idnum},{indirect_reference.generation} ref repaired", + __name__, + ) + 
self.xref[indirect_reference.generation][ + indirect_reference.idnum + ] = (m.start(0) + 1) + self.stream.seek(m.start(0) + 1) + idnum, generation = self.read_object_header(self.stream) + else: + idnum = -1 # exception will be raised below + if idnum != indirect_reference.idnum and self.xref_index: + # Xref table probably had bad indexes due to not being zero-indexed + if self.strict: + raise PdfReadError( + f"Expected object ID ({indirect_reference.idnum} {indirect_reference.generation}) " + f"does not match actual ({idnum} {generation}); " + "xref table not zero-indexed." + ) + # xref table is corrected in non-strict mode + elif idnum != indirect_reference.idnum and self.strict: + # some other problem + raise PdfReadError( + f"Expected object ID ({indirect_reference.idnum} " + f"{indirect_reference.generation}) does not match actual " + f"({idnum} {generation})." + ) + if self.strict: + assert generation == indirect_reference.generation + retval = read_object(self.stream, self) # type: ignore + + # override encryption is used for the /Encrypt dictionary + if not self._override_encryption and self._encryption is not None: + # if we don't have the encryption key: + if not self._encryption.is_decrypted(): + raise FileNotDecryptedError("File has not been decrypted") + # otherwise, decrypt here... 
+ retval = cast(PdfObject, retval) + retval = self._encryption.decrypt_object( + retval, indirect_reference.idnum, indirect_reference.generation + ) + else: + if hasattr(self.stream, "getbuffer"): + buf = bytes(self.stream.getbuffer()) # type: ignore + else: + p = self.stream.tell() + self.stream.seek(0, 0) + buf = self.stream.read(-1) + self.stream.seek(p, 0) + m = re.search( + rf"\s{indirect_reference.idnum}\s+{indirect_reference.generation}\s+obj".encode(), + buf, + ) + if m is not None: + logger_warning( + f"Object {indirect_reference.idnum} {indirect_reference.generation} found", + __name__, + ) + if indirect_reference.generation not in self.xref: + self.xref[indirect_reference.generation] = {} + self.xref[indirect_reference.generation][indirect_reference.idnum] = ( + m.start(0) + 1 + ) + self.stream.seek(m.end(0) + 1) + skip_over_whitespace(self.stream) + self.stream.seek(-1, 1) + retval = read_object(self.stream, self) # type: ignore + + # override encryption is used for the /Encrypt dictionary + if not self._override_encryption and self._encryption is not None: + # if we don't have the encryption key: + if not self._encryption.is_decrypted(): + raise FileNotDecryptedError("File has not been decrypted") + # otherwise, decrypt here... + retval = cast(PdfObject, retval) + retval = self._encryption.decrypt_object( + retval, indirect_reference.idnum, indirect_reference.generation + ) + else: + logger_warning( + f"Object {indirect_reference.idnum} {indirect_reference.generation} not defined.", + __name__, + ) + if self.strict: + raise PdfReadError("Could not find object.") + self.cache_indirect_object( + indirect_reference.generation, indirect_reference.idnum, retval + ) + return retval + + def getObject( + self, indirectReference: IndirectObject + ) -> Optional[PdfObject]: # deprecated + """ + Use :meth:`get_object` instead. + + .. 
deprecated:: 1.28.0 + """ + deprecation_with_replacement("getObject", "get_object", "3.0.0") + return self.get_object(indirectReference) + + def read_object_header(self, stream: StreamType) -> Tuple[int, int]: + # Should never be necessary to read out whitespace, since the + # cross-reference table should put us in the right spot to read the + # object header. In reality... some files have stupid cross reference + # tables that are off by whitespace bytes. + extra = False + skip_over_comment(stream) + extra |= skip_over_whitespace(stream) + stream.seek(-1, 1) + idnum = read_until_whitespace(stream) + extra |= skip_over_whitespace(stream) + stream.seek(-1, 1) + generation = read_until_whitespace(stream) + extra |= skip_over_whitespace(stream) + stream.seek(-1, 1) + + # although it's not used, it might still be necessary to read + _obj = stream.read(3) + + read_non_whitespace(stream) + stream.seek(-1, 1) + if extra and self.strict: + logger_warning( + f"Superfluous whitespace found in object header {idnum} {generation}", # type: ignore + __name__, + ) + return int(idnum), int(generation) + + def readObjectHeader(self, stream: StreamType) -> Tuple[int, int]: # deprecated + """ + Use :meth:`read_object_header` instead. + + .. deprecated:: 1.28.0 + """ + deprecation_with_replacement("readObjectHeader", "read_object_header", "3.0.0") + return self.read_object_header(stream) + + def cache_get_indirect_object( + self, generation: int, idnum: int + ) -> Optional[PdfObject]: + return self.resolved_objects.get((generation, idnum)) + + def cacheGetIndirectObject( + self, generation: int, idnum: int + ) -> Optional[PdfObject]: # deprecated + """ + Use :meth:`cache_get_indirect_object` instead. + + .. 
deprecated:: 1.28.0 + """ + deprecation_with_replacement( + "cacheGetIndirectObject", "cache_get_indirect_object", "3.0.0" + ) + return self.cache_get_indirect_object(generation, idnum) + + def cache_indirect_object( + self, generation: int, idnum: int, obj: Optional[PdfObject] + ) -> Optional[PdfObject]: + if (generation, idnum) in self.resolved_objects: + msg = f"Overwriting cache for {generation} {idnum}" + if self.strict: + raise PdfReadError(msg) + logger_warning(msg, __name__) + self.resolved_objects[(generation, idnum)] = obj + if obj is not None: + obj.indirect_reference = IndirectObject(idnum, generation, self) + return obj + + def cacheIndirectObject( + self, generation: int, idnum: int, obj: Optional[PdfObject] + ) -> Optional[PdfObject]: # deprecated + """ + Use :meth:`cache_indirect_object` instead. + + .. deprecated:: 1.28.0 + """ + deprecation_with_replacement("cacheIndirectObject", "cache_indirect_object") + return self.cache_indirect_object(generation, idnum, obj) + + def read(self, stream: StreamType) -> None: + self._basic_validation(stream) + self._find_eof_marker(stream) + startxref = self._find_startxref_pos(stream) + + # check and eventually correct the startxref only in not strict + xref_issue_nr = self._get_xref_issues(stream, startxref) + if xref_issue_nr != 0: + if self.strict and xref_issue_nr: + raise PdfReadError("Broken xref table") + logger_warning(f"incorrect startxref pointer({xref_issue_nr})", __name__) + + # read all cross reference tables and their trailers + self._read_xref_tables_and_trailers(stream, startxref, xref_issue_nr) + + # if not zero-indexed, verify that the table is correct; change it if necessary + if self.xref_index and not self.strict: + loc = stream.tell() + for gen, xref_entry in self.xref.items(): + if gen == 65535: + continue + xref_k = sorted( + xref_entry.keys() + ) # must ensure ascendant to prevent damage + for id in xref_k: + stream.seek(xref_entry[id], 0) + try: + pid, _pgen = 
self.read_object_header(stream) + except ValueError: + break + if pid == id - self.xref_index: + # fixing index item per item is required for revised PDF. + self.xref[gen][pid] = self.xref[gen][id] + del self.xref[gen][id] + # if not, then either it's just plain wrong, or the + # non-zero-index is actually correct + stream.seek(loc, 0) # return to where it was + + def _basic_validation(self, stream: StreamType) -> None: + """Ensure file is not empty. Read at most 5 bytes.""" + stream.seek(0, os.SEEK_SET) + try: + header_byte = stream.read(5) + except UnicodeDecodeError: + raise UnsupportedOperation("cannot read header") + if header_byte == b"": + raise EmptyFileError("Cannot read an empty file") + elif header_byte != b"%PDF-": + if self.strict: + raise PdfReadError( + f"PDF starts with '{header_byte.decode('utf8')}', " + "but '%PDF-' expected" + ) + else: + logger_warning(f"invalid pdf header: {header_byte}", __name__) + stream.seek(0, os.SEEK_END) + + def _find_eof_marker(self, stream: StreamType) -> None: + """ + Jump to the %%EOF marker. + + According to the specs, the %%EOF marker should be at the very end of + the file. Hence for standard-compliant PDF documents this function will + read only the last part (DEFAULT_BUFFER_SIZE). + """ + HEADER_SIZE = 8 # to parse whole file, Header is e.g. '%PDF-1.6' + line = b"" + while line[:5] != b"%%EOF": + if stream.tell() < HEADER_SIZE: + if self.strict: + raise PdfReadError("EOF marker not found") + else: + logger_warning("EOF marker not found", __name__) + line = read_previous_line(stream) + + def _find_startxref_pos(self, stream: StreamType) -> int: + """ + Find startxref entry - the location of the xref table. 
+ + Args: + stream: + + Returns: + The bytes offset + """ + line = read_previous_line(stream) + try: + startxref = int(line) + except ValueError: + # 'startxref' may be on the same line as the location + if not line.startswith(b"startxref"): + raise PdfReadError("startxref not found") + startxref = int(line[9:].strip()) + logger_warning("startxref on same line as offset", __name__) + else: + line = read_previous_line(stream) + if line[:9] != b"startxref": + raise PdfReadError("startxref not found") + return startxref + + def _read_standard_xref_table(self, stream: StreamType) -> None: + # standard cross-reference table + ref = stream.read(3) + if ref != b"ref": + raise PdfReadError("xref table read error") + read_non_whitespace(stream) + stream.seek(-1, 1) + first_time = True # check if the first time looking at the xref table + while True: + num = cast(int, read_object(stream, self)) + if first_time and num != 0: + self.xref_index = num + if self.strict: + logger_warning( + "Xref table not zero-indexed. ID numbers for objects will be corrected.", + __name__, + ) + # if table not zero indexed, could be due to error from when PDF was created + # which will lead to mismatched indices later on, only warned and corrected if self.strict==True + first_time = False + read_non_whitespace(stream) + stream.seek(-1, 1) + size = cast(int, read_object(stream, self)) + read_non_whitespace(stream) + stream.seek(-1, 1) + cnt = 0 + while cnt < size: + line = stream.read(20) + + # It's very clear in section 3.4.3 of the PDF spec + # that all cross-reference table lines are a fixed + # 20 bytes (as of PDF 1.7). However, some files have + # 21-byte entries (or more) due to the use of \r\n + # (CRLF) EOL's. Detect that case, and adjust the line + # until it does not begin with a \r (CR) or \n (LF). 
+ while line[0] in b"\x0D\x0A": + stream.seek(-20 + 1, 1) + line = stream.read(20) + + # On the other hand, some malformed PDF files + # use a single character EOL without a preceding + # space. Detect that case, and seek the stream + # back one character. (0-9 means we've bled into + # the next xref entry, t means we've bled into the + # text "trailer"): + if line[-1] in b"0123456789t": + stream.seek(-1, 1) + + try: + offset_b, generation_b = line[:16].split(b" ") + entry_type_b = line[17:18] + + offset, generation = int(offset_b), int(generation_b) + except Exception: + # if something wrong occurred + if hasattr(stream, "getbuffer"): + buf = bytes(stream.getbuffer()) # type: ignore + else: + p = stream.tell() + stream.seek(0, 0) + buf = stream.read(-1) + stream.seek(p) + + f = re.search(f"{num}\\s+(\\d+)\\s+obj".encode(), buf) + if f is None: + logger_warning( + f"entry {num} in Xref table invalid; object not found", + __name__, + ) + generation = 65535 + offset = -1 + else: + logger_warning( + f"entry {num} in Xref table invalid but object found", + __name__, + ) + generation = int(f.group(1)) + offset = f.start() + + if generation not in self.xref: + self.xref[generation] = {} + self.xref_free_entry[generation] = {} + if num in self.xref[generation]: + # It really seems like we should allow the last + # xref table in the file to override previous + # ones. Since we read the file backwards, assume + # any existing key is already set correctly. + pass + else: + self.xref[generation][num] = offset + try: + self.xref_free_entry[generation][num] = entry_type_b == b"f" + except Exception: + pass + try: + self.xref_free_entry[65535][num] = entry_type_b == b"f" + except Exception: + pass + cnt += 1 + num += 1 + read_non_whitespace(stream) + stream.seek(-1, 1) + trailer_tag = stream.read(7) + if trailer_tag != b"trailer": + # more xrefs! 
+ stream.seek(-7, 1) + else: + break + + def _read_xref_tables_and_trailers( + self, stream: StreamType, startxref: Optional[int], xref_issue_nr: int + ) -> None: + self.xref: Dict[int, Dict[Any, Any]] = {} + self.xref_free_entry: Dict[int, Dict[Any, Any]] = {} + self.xref_objStm: Dict[int, Tuple[Any, Any]] = {} + self.trailer = DictionaryObject() + while startxref is not None: + # load the xref table + stream.seek(startxref, 0) + x = stream.read(1) + if x in b"\r\n": + x = stream.read(1) + if x == b"x": + startxref = self._read_xref(stream) + elif xref_issue_nr: + try: + self._rebuild_xref_table(stream) + break + except Exception: + xref_issue_nr = 0 + elif x.isdigit(): + try: + xrefstream = self._read_pdf15_xref_stream(stream) + except Exception as e: + if TK.ROOT in self.trailer: + logger_warning( + f"Previous trailer can not be read {e.args}", + __name__, + ) + break + else: + raise PdfReadError(f"trailer can not be read {e.args}") + trailer_keys = TK.ROOT, TK.ENCRYPT, TK.INFO, TK.ID, TK.SIZE + for key in trailer_keys: + if key in xrefstream and key not in self.trailer: + self.trailer[NameObject(key)] = xrefstream.raw_get(key) + if "/XRefStm" in xrefstream: + p = stream.tell() + stream.seek(cast(int, xrefstream["/XRefStm"]) + 1, 0) + self._read_pdf15_xref_stream(stream) + stream.seek(p, 0) + if "/Prev" in xrefstream: + startxref = cast(int, xrefstream["/Prev"]) + else: + break + else: + startxref = self._read_xref_other_error(stream, startxref) + + def _read_xref(self, stream: StreamType) -> Optional[int]: + self._read_standard_xref_table(stream) + read_non_whitespace(stream) + stream.seek(-1, 1) + new_trailer = cast(Dict[str, Any], read_object(stream, self)) + for key, value in new_trailer.items(): + if key not in self.trailer: + self.trailer[key] = value + if "/XRefStm" in new_trailer: + p = stream.tell() + stream.seek(cast(int, new_trailer["/XRefStm"]) + 1, 0) + try: + self._read_pdf15_xref_stream(stream) + except Exception: + logger_warning( + f"XRef object 
at {new_trailer['/XRefStm']} can not be read, some object may be missing", + __name__, + ) + stream.seek(p, 0) + if "/Prev" in new_trailer: + startxref = new_trailer["/Prev"] + return startxref + else: + return None + + def _read_xref_other_error( + self, stream: StreamType, startxref: int + ) -> Optional[int]: + # some PDFs have /Prev=0 in the trailer, instead of no /Prev + if startxref == 0: + if self.strict: + raise PdfReadError( + "/Prev=0 in the trailer (try opening with strict=False)" + ) + logger_warning( + "/Prev=0 in the trailer - assuming there is no previous xref table", + __name__, + ) + return None + # bad xref character at startxref. Let's see if we can find + # the xref table nearby, as we've observed this error with an + # off-by-one before. + stream.seek(-11, 1) + tmp = stream.read(20) + xref_loc = tmp.find(b"xref") + if xref_loc != -1: + startxref -= 10 - xref_loc + return startxref + # No explicit xref table, try finding a cross-reference stream. + stream.seek(startxref, 0) + for look in range(25): # value extended to cope with more linearized files + if stream.read(1).isdigit(): + # This is not a standard PDF, consider adding a warning + startxref += look + return startxref + # no xref table found at specified location + if "/Root" in self.trailer and not self.strict: + # if Root has been already found, just raise warning + logger_warning("Invalid parent xref., rebuild xref", __name__) + try: + self._rebuild_xref_table(stream) + return None + except Exception: + raise PdfReadError("can not rebuild xref") + raise PdfReadError("Could not find xref table at specified location") + + def _read_pdf15_xref_stream( + self, stream: StreamType + ) -> Union[ContentStream, EncodedStreamObject, DecodedStreamObject]: + # PDF 1.5+ Cross-Reference Stream + stream.seek(-1, 1) + idnum, generation = self.read_object_header(stream) + xrefstream = cast(ContentStream, read_object(stream, self)) + assert cast(str, xrefstream["/Type"]) == "/XRef" + 
self.cache_indirect_object(generation, idnum, xrefstream) + stream_data = BytesIO(b_(xrefstream.get_data())) + # Index pairs specify the subsections in the dictionary. If + # none create one subsection that spans everything. + idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")]) + entry_sizes = cast(Dict[Any, Any], xrefstream.get("/W")) + assert len(entry_sizes) >= 3 + if self.strict and len(entry_sizes) > 3: + raise PdfReadError(f"Too many entry sizes: {entry_sizes}") + + def get_entry(i: int) -> Union[int, Tuple[int, ...]]: + # Reads the correct number of bytes for each entry. See the + # discussion of the W parameter in PDF spec table 17. + if entry_sizes[i] > 0: + d = stream_data.read(entry_sizes[i]) + return convert_to_int(d, entry_sizes[i]) + + # PDF Spec Table 17: A value of zero for an element in the + # W array indicates...the default value shall be used + if i == 0: + return 1 # First value defaults to 1 + else: + return 0 + + def used_before(num: int, generation: Union[int, Tuple[int, ...]]) -> bool: + # We move backwards through the xrefs, don't replace any. + return num in self.xref.get(generation, []) or num in self.xref_objStm # type: ignore + + # Iterate through each subsection + self._read_xref_subsections(idx_pairs, get_entry, used_before) + return xrefstream + + @staticmethod + def _get_xref_issues(stream: StreamType, startxref: int) -> int: + """ + Return an int which indicates an issue. 0 means there is no issue. + + Args: + stream: + startxref: + + Returns: + 0 means no issue, other values represent specific issues. 
+ """ + stream.seek(startxref - 1, 0) # -1 to check character before + line = stream.read(1) + if line == b"j": + line = stream.read(1) + if line not in b"\r\n \t": + return 1 + line = stream.read(4) + if line != b"xref": + # not an xref so check if it is an XREF object + line = b"" + while line in b"0123456789 \t": + line = stream.read(1) + if line == b"": + return 2 + line += stream.read(2) # 1 char already read, +2 to check "obj" + if line.lower() != b"obj": + return 3 + return 0 + + def _rebuild_xref_table(self, stream: StreamType) -> None: + self.xref = {} + stream.seek(0, 0) + f_ = stream.read(-1) + + for m in re.finditer(rb"[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj", f_): + idnum = int(m.group(1)) + generation = int(m.group(2)) + if generation not in self.xref: + self.xref[generation] = {} + self.xref[generation][idnum] = m.start(1) + stream.seek(0, 0) + for m in re.finditer(rb"[\r\n \t][ \t]*trailer[\r\n \t]*(<<)", f_): + stream.seek(m.start(1), 0) + new_trailer = cast(Dict[Any, Any], read_object(stream, self)) + # Here, we are parsing the file from start to end, the new data have to erase the existing. 
+ for key, value in list(new_trailer.items()): + self.trailer[key] = value + + def _read_xref_subsections( + self, + idx_pairs: List[int], + get_entry: Callable[[int], Union[int, Tuple[int, ...]]], + used_before: Callable[[int, Union[int, Tuple[int, ...]]], bool], + ) -> None: + for start, size in self._pairs(idx_pairs): + # The subsections must increase + for num in range(start, start + size): + # The first entry is the type + xref_type = get_entry(0) + # The rest of the elements depend on the xref_type + if xref_type == 0: + # linked list of free objects + next_free_object = get_entry(1) # noqa: F841 + next_generation = get_entry(2) # noqa: F841 + elif xref_type == 1: + # objects that are in use but are not compressed + byte_offset = get_entry(1) + generation = get_entry(2) + if generation not in self.xref: + self.xref[generation] = {} # type: ignore + if not used_before(num, generation): + self.xref[generation][num] = byte_offset # type: ignore + elif xref_type == 2: + # compressed objects + objstr_num = get_entry(1) + obstr_idx = get_entry(2) + generation = 0 # PDF spec table 18, generation is 0 + if not used_before(num, generation): + self.xref_objStm[num] = (objstr_num, obstr_idx) + elif self.strict: + raise PdfReadError(f"Unknown xref type: {xref_type}") + + def _pairs(self, array: List[int]) -> Iterable[Tuple[int, int]]: + i = 0 + while True: + yield array[i], array[i + 1] + i += 2 + if (i + 1) >= len(array): + break + + def read_next_end_line( + self, stream: StreamType, limit_offset: int = 0 + ) -> bytes: # deprecated + """.. 
deprecated:: 2.1.0""" + deprecate_no_replacement("read_next_end_line", removed_in="4.0.0") + line_parts = [] + while True: + # Prevent infinite loops in malformed PDFs + if stream.tell() == 0 or stream.tell() == limit_offset: + raise PdfReadError("Could not read malformed PDF file") + x = stream.read(1) + if stream.tell() < 2: + raise PdfReadError("EOL marker not found") + stream.seek(-2, 1) + if x in (b"\n", b"\r"): # \n = LF; \r = CR + crlf = False + while x in (b"\n", b"\r"): + x = stream.read(1) + if x in (b"\n", b"\r"): # account for CR+LF + stream.seek(-1, 1) + crlf = True + if stream.tell() < 2: + raise PdfReadError("EOL marker not found") + stream.seek(-2, 1) + stream.seek( + 2 if crlf else 1, 1 + ) # if using CR+LF, go back 2 bytes, else 1 + break + else: + line_parts.append(x) + line_parts.reverse() + return b"".join(line_parts) + + def readNextEndLine( + self, stream: StreamType, limit_offset: int = 0 + ) -> bytes: # deprecated + """.. deprecated:: 1.28.0""" + deprecation_no_replacement("readNextEndLine", "3.0.0") + return self.read_next_end_line(stream, limit_offset) + + def decrypt(self, password: Union[str, bytes]) -> PasswordType: + """ + When using an encrypted / secured PDF file with the PDF Standard + encryption handler, this function will allow the file to be decrypted. + It checks the given password against the document's user password and + owner password, and then stores the resulting decryption key if either + password is correct. + + It does not matter which password was matched. Both passwords provide + the correct decryption key that will allow the document to be used with + this library. + + Args: + password: The password to match. + + Returns: + An indicator if the document was decrypted and weather it was the + owner password or the user password. 
+ """ + if not self._encryption: + raise PdfReadError("Not encrypted file") + # TODO: raise Exception for wrong password + return self._encryption.verify(password) + + def decode_permissions(self, permissions_code: int) -> Dict[str, bool]: + # Takes the permissions as an integer, returns the allowed access + permissions = {} + permissions["print"] = permissions_code & (1 << 3 - 1) != 0 # bit 3 + permissions["modify"] = permissions_code & (1 << 4 - 1) != 0 # bit 4 + permissions["copy"] = permissions_code & (1 << 5 - 1) != 0 # bit 5 + permissions["annotations"] = permissions_code & (1 << 6 - 1) != 0 # bit 6 + permissions["forms"] = permissions_code & (1 << 9 - 1) != 0 # bit 9 + permissions["accessability"] = permissions_code & (1 << 10 - 1) != 0 # bit 10 + permissions["assemble"] = permissions_code & (1 << 11 - 1) != 0 # bit 11 + permissions["print_high_quality"] = ( + permissions_code & (1 << 12 - 1) != 0 + ) # bit 12 + return permissions + + @property + def is_encrypted(self) -> bool: + """ + Read-only boolean property showing whether this PDF file is encrypted. + + Note that this property, if true, will remain true even after the + :meth:`decrypt()` method is called. + """ + return TK.ENCRYPT in self.trailer + + def getIsEncrypted(self) -> bool: # deprecated + """ + Use :py:attr:`is_encrypted` instead. + + .. deprecated:: 1.28.0 + """ + deprecation_with_replacement("getIsEncrypted", "is_encrypted", "3.0.0") + return self.is_encrypted + + @property + def isEncrypted(self) -> bool: # deprecated + """ + Use :py:attr:`is_encrypted` instead. + + .. 
deprecated:: 1.28.0 + """ + deprecation_with_replacement("isEncrypted", "is_encrypted", "3.0.0") + return self.is_encrypted + + @property + def xfa(self) -> Optional[Dict[str, Any]]: + tree: Optional[TreeObject] = None + retval: Dict[str, Any] = {} + catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) + + if "/AcroForm" not in catalog or not catalog["/AcroForm"]: + return None + + tree = cast(TreeObject, catalog["/AcroForm"]) + + if "/XFA" in tree: + fields = cast(ArrayObject, tree["/XFA"]) + i = iter(fields) + for f in i: + tag = f + f = next(i) + if isinstance(f, IndirectObject): + field = cast(Optional[EncodedStreamObject], f.get_object()) + if field: + es = zlib.decompress(b_(field._data)) + retval[tag] = es + return retval + + def add_form_topname(self, name: str) -> Optional[DictionaryObject]: + """ + Add a top level form that groups all form fields below it. + + Args: + name: text string of the "/T" Attribute of the created object + + Returns: + The created object. ``None`` means no object was created. 
+ """ + catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) + + if "/AcroForm" not in catalog or not isinstance( + catalog["/AcroForm"], DictionaryObject + ): + return None + acroform = cast(DictionaryObject, catalog[NameObject("/AcroForm")]) + if "/Fields" not in acroform: + # TODO: :No error returns but may be extended for XFA Forms + return None + + interim = DictionaryObject() + interim[NameObject("/T")] = TextStringObject(name) + interim[NameObject("/Kids")] = acroform[NameObject("/Fields")] + self.cache_indirect_object( + 0, + max([i for (g, i) in self.resolved_objects if g == 0]) + 1, + interim, + ) + arr = ArrayObject() + arr.append(interim.indirect_reference) + acroform[NameObject("/Fields")] = arr + for o in cast(ArrayObject, interim["/Kids"]): + obj = o.get_object() + if "/Parent" in obj: + logger_warning( + f"Top Level Form Field {obj.indirect_reference} have a non-expected parent", + __name__, + ) + obj[NameObject("/Parent")] = interim.indirect_reference + return interim + + def rename_form_topname(self, name: str) -> Optional[DictionaryObject]: + """ + Rename top level form field that all form fields below it. + + Args: + name: text string of the "/T" field of the created object + + Returns: + The modified object. ``None`` means no object was modified. 
+ """ + catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) + + if "/AcroForm" not in catalog or not isinstance( + catalog["/AcroForm"], DictionaryObject + ): + return None + acroform = cast(DictionaryObject, catalog[NameObject("/AcroForm")]) + if "/Fields" not in acroform: + return None + + interim = cast( + DictionaryObject, + cast(ArrayObject, acroform[NameObject("/Fields")])[0].get_object(), + ) + interim[NameObject("/T")] = TextStringObject(name) + return interim + + def _get_embedded_files_root(self) -> Optional[NameTree]: + """ + Returns the EmbeddedFiles root as a NameTree Object + if the root does not exists, return None + """ + catalog = cast(DictionaryObject, self.trailer["/Root"]) + if "/Names" not in catalog: + return None + ef = cast(DictionaryObject, catalog["/Names"]).get("/EmbeddedFiles", None) + if ef is None: + return None + efo = ef.get_object() + # not for reader + """ + if not isinstance(efo,NameTree): + if isinstance(ef,IndirectObject): + ef.replace_object(efo) + else: + cast(DictionaryObject,catalog["/Names"])[ + NameObject("/EmbeddedFiles")] = NameTree(efo) + """ + return NameTree(efo) + + @property + def embedded_files(self) -> Optional[Mapping[str, List[PdfObject]]]: + ef = self._get_embedded_files_root() + if ef: + return ef.list_items() + else: + return None + + @property + def attachments(self) -> Mapping[str, Union[List[bytes], List[Dict[str, bytes]]]]: + ef = self._get_embedded_files_root() + if ef: + d: Dict[str, Union[List[bytes], List[Dict[str, bytes]]]] = {} + for k, v in ef.list_items().items(): + if isinstance(v, list): + if k not in d: + d[k] = [] # type: ignore + for e in v: + e = cast(DictionaryObject, e.get_object()) + if "/EF" in e: + d[k].append(e["/EF"]["/F"].get_data()) # type: ignore + elif "/RF" in e: + r = cast( + ArrayObject, cast(DictionaryObject, e["/RF"])["/F"] + ) + di: Dict[str, bytes] = {} + i = 0 + while i < len(r): + di[cast(str, r[i])] = r[i + 1].get_object().get_data() + i += 2 + d[k].append(di) + 
return d + else: + return {} + + def _list_attachments(self) -> List[str]: + """ + Retrieves the list of filenames of file attachments. + + Returns: + list of filenames + """ + ef = self._get_embedded_files_root() + if ef: + lst = ef.list_keys() + else: + lst = [] + """ + for ip, p in enumerate(self.pages): + for a in [_a.get_object() + for _a in p.get("/Annots",[])]: + if _a.get_object().get("/Subtype","") != "/FileAttachements": + continue + lst.append(f"$page_{ip}.{get_name_from_file_specification(_a)}") + """ + return lst + + def _get_attachment_list(self, name: str) -> List[bytes]: + out = self._get_attachments(name)[name] + if isinstance(out, list): + return out + return [out] + + def _get_attachments( + self, filename: Optional[str] = None + ) -> Dict[str, Union[bytes, List[bytes], Dict[str, bytes]]]: + """ + Retrieves all or selected file attachments of the PDF as a dictionary of file names + and the file data as a bytestring. + + Args: + filename: If filename is None, then a dictionary of all attachments + will be returned, where the key is the filename and the value + is the content. Otherwise, a dictionary with just a single key + - the filename - and its content will be returned. 
+ + Returns: + dictionary of filename -> Union[bytestring or List[ByteString]] + if the filename exists multiple times a List of the different version will be provided + """ + ef = self._get_embedded_files_root() + if ef is None: + return {} + if filename is None: + return {k: v if len(v) > 1 else v[0] for k, v in self.attachments.items()} # type: ignore + else: + lst = ef.list_get(filename) + return { + filename: [x["/EF"]["/F"].get_data() for x in lst] # type: ignore + if isinstance(lst, list) + else lst["/EF"]["/F"].get_data() # type: ignore + } + + +class PdfFileReader(PdfReader): # deprecated + def __init__(self, *args: Any, **kwargs: Any) -> None: + deprecation_with_replacement("PdfFileReader", "PdfReader", "3.0.0") + if "strict" not in kwargs and len(args) < 2: + kwargs["strict"] = True # maintain the default + super().__init__(*args, **kwargs) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index befe617d0..4b29d3a9e 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -713,16 +713,28 @@ def _get_embedded_files_root(self) -> Optional[NameTree]: if ef is None: return None efo = ef.get_object() - # not for reader - """ - if not isinstance(efo,NameTree): - if isinstance(ef,IndirectObject): + if not isinstance(efo, NameTree): + efo = NameTree(efo) + if isinstance(ef, IndirectObject): ef.replace_object(efo) else: - cast(DictionaryObject,catalog["/Names"])[ - NameObject("/EmbeddedFiles")] = NameTree(efo) - """ - return NameTree(efo) + cast(DictionaryObject, catalog["/Names"])[ + NameObject("/EmbeddedFiles") + ] = efo + return efo + + def _create_attachment_root(self) -> NameTree: + if "/Names" not in self._root_object: + self._root_object[NameObject("/Names")] = self._add_object( + DictionaryObject() + ) + node = cast(DictionaryObject, self._root_object["/Names"]) + if "/EmbeddedFiles" not in node: + node[NameObject("/EmbeddedFiles")] = self._add_object(NameTree()) + node = cast(NameTree, node["/EmbeddedFiles"]) + if "/Kids" not in node and "/Names" not in 
node: + node[NameObject("/Names")] = ArrayObject() + return node @property def embedded_files(self) -> Optional[Mapping[str, List[PdfObject]]]: @@ -733,18 +745,37 @@ def embedded_files(self) -> Optional[Mapping[str, List[PdfObject]]]: return None @property - def attachments(self) -> Mapping[str, List[bytes]]: + def attachments(self) -> Mapping[str, Union[List[bytes], List[Dict[str, bytes]]]]: ef = self._get_embedded_files_root() if ef: d = {} for k, v in ef.list_items().items(): if isinstance(v, list): - d[k] = [e["/EF"]["/F"].get_data() for e in v] # type: ignore + if k not in d: + d[k] = [] + for e in v: + e = e.get_object() + if "/EF" in e: + d[k].append(e["/EF"]["/F"].get_data()) # type: ignore + elif "/RF" in e: + r = cast(ArrayObject, e["/RF"]["/F"]) + di = {} + i = 0 + while i < len(r): + di[r[i]] = r[i + 1].get_object().get_data() + i += 2 + d[k].append(di) return d else: return {} - def add_attachment(self, filename: str, data: Union[str, bytes]) -> None: + def add_attachment( + self, + filename: str, + data: Union[str, bytes, List[Tuple[str, bytes]]], + fname: Optional[str] = None, + desc: str = "", + ) -> DictionaryObject: """ Embed a file inside the PDF. @@ -753,9 +784,20 @@ def add_attachment(self, filename: str, data: Union[str, bytes]) -> None: Section 7.11.3 Args: - filename: The filename to display. + filename: The filename to display (in UTF-16). data: The data in the file. + if data is an array, it will feed + fname: an old style name for "/F" entry (should be ansi). 
if None will be automatically proposed + desc: a description string + + Returns: + The filespec DictionaryObject """ + if fname is None: + st = filename.replace("/", "\\/").replace("\\\\/", "\\/") + fname = st.encode().decode("ansi", errors="xmlcharreplace") + fname = f"{fname}" # to escape string + # We need three entries: # * The file's data # * The /Filespec entry @@ -773,9 +815,22 @@ def add_attachment(self, filename: str, data: Union[str, bytes]) -> None: # endstream # endobj - file_entry = DecodedStreamObject() - file_entry.set_data(b_(data)) - file_entry.update({NameObject(PA.TYPE): NameObject("/EmbeddedFile")}) + if isinstance(data, list): + ef_entry = DictionaryObject() + a = ArrayObject() + ef_entry.update({NameObject("/F"): self._add_object(a)}) + for fn, da in data: + a.append(TextStringObject(fn)) + file_entry = DecodedStreamObject() + file_entry.set_data(b_(da)) + file_entry.update({NameObject(PA.TYPE): NameObject("/EmbeddedFile")}) + a.append(self._add_object(file_entry)) + else: + file_entry = DecodedStreamObject() + file_entry.set_data(b_(data)) + file_entry.update({NameObject(PA.TYPE): NameObject("/EmbeddedFile")}) + ef_entry = DictionaryObject() + ef_entry.update({NameObject("/F"): self._add_object(file_entry)}) # The Filespec entry # Sample: @@ -786,51 +841,29 @@ def add_attachment(self, filename: str, data: Union[str, bytes]) -> None: # /EF << /F 8 0 R >> # >> - ef_entry = DictionaryObject() - ef_entry.update({NameObject("/F"): self._add_object(file_entry)}) - filespec = DictionaryObject() filespec.update( { NameObject(PA.TYPE): NameObject("/Filespec"), - NameObject(FileSpecificationDictionaryEntries.F): create_string_object( + NameObject(FileSpecificationDictionaryEntries.UF): TextStringObject( filename - ), # Perhaps also try TextStringObject - NameObject(FileSpecificationDictionaryEntries.EF): ef_entry, + ), + NameObject(FileSpecificationDictionaryEntries.F): TextStringObject( + fname + ), + 
NameObject(FileSpecificationDictionaryEntries.DESC): TextStringObject( + desc + ), } ) - - # Then create the entry for the root, as it needs - # a reference to the Filespec - # Sample: - # 1 0 obj - # << - # /Type /Catalog - # /Outlines 2 0 R - # /Pages 3 0 R - # /Names << /EmbeddedFiles << /Names [(hello.txt) 7 0 R] >> >> - # >> - # endobj - - if CA.NAMES not in self._root_object: - self._root_object[NameObject(CA.NAMES)] = self._add_object( - DictionaryObject() - ) - if "/EmbeddedFiles" not in cast(DictionaryObject, self._root_object[CA.NAMES]): - embedded_files_names_dictionary = DictionaryObject( - {NameObject(CA.NAMES): ArrayObject()} - ) - cast(DictionaryObject, self._root_object[CA.NAMES])[ - NameObject("/EmbeddedFiles") - ] = self._add_object(embedded_files_names_dictionary) + if isinstance(data, list): + filespec[NameObject(FileSpecificationDictionaryEntries.RF)] = ef_entry else: - embedded_files_names_dictionary = cast( - DictionaryObject, - cast(DictionaryObject, self._root_object[CA.NAMES])["/EmbeddedFiles"], - ) - cast(ArrayObject, embedded_files_names_dictionary[CA.NAMES]).extend( - [create_string_object(filename), filespec] - ) + filespec[NameObject(FileSpecificationDictionaryEntries.EF)] = ef_entry + + nm = self._get_embedded_files_root() or self._create_attachment_root() + nm.list_add(filename, self._add_object(filespec)) + return filespec def addAttachment(self, fname: str, fdata: Union[str, bytes]) -> None: # deprecated """ diff --git a/pypdf/constants.py b/pypdf/constants.py index bde9ff22d..7f282d48e 100644 --- a/pypdf/constants.py +++ b/pypdf/constants.py @@ -149,8 +149,11 @@ class FileSpecificationDictionaryEntries: Type = "/Type" FS = "/FS" # The name of the file system to be used to interpret this file specification - F = "/F" # A file specification string of the form described in Section 3.10.1 + F = "/F" # A file specification string of the file as described in Section 3.10.1 + UF = "/UF" # A unicode string of the file as described in 
Section 3.10.1 EF = "/EF" # dictionary, containing a subset of the keys F , UF , DOS , Mac , and Unix + RF = "/RF" # dictionary, containing arrays of /EmbeddedFile + DESC = "/Desc" # description of the file as de class StreamAttributes: diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index dd14945fa..b86a494e3 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -1566,7 +1566,10 @@ def _get(key: str, o: Optional[PdfObject]) -> List[PdfObject]: return _get(key, self) def list_add( - self, key: str, data: PdfObject, overwrite: bool = False + self, + key: Union[str, TextStringObject], + data: PdfObject, + overwrite: bool = False, ) -> Optional[IndirectObject]: """ Add the data entry from the Name Tree @@ -1588,22 +1591,28 @@ def list_add( raise TypeError except (TypeError, AttributeError): raise TypeError("Object does not belong to a PdfWriter") + if not isinstance(key, TextStringObject): + key = TextStringObject(key) def _update_limits( - obj: DictionaryObject, lo: Optional[str], hi: Optional[str] + obj: DictionaryObject, + lo: Optional[TextStringObject], + hi: Optional[TextStringObject], ) -> bool: if "/Limits" not in obj: return False a = cast("ArrayObject", obj["/Limits"]) if lo is not None and lo < a[0]: - a[0] = TextStringObject(lo) + a[0] = lo return True if hi is not None and hi > a[0]: - a[1] = TextStringObject(lo) + a[1] = hi return True return False - def _add_in(o: Optional[PdfObject], app: bool = True) -> Optional[PdfObject]: + def _add_in( + o: Optional[PdfObject], appb: bool = True, app: bool = True + ) -> Optional[PdfObject]: nonlocal overwrite, writer, key, data if o is None: return None @@ -1611,9 +1620,9 @@ def _add_in(o: Optional[PdfObject], app: bool = True) -> Optional[PdfObject]: if "/Names" in o: _l = cast(ArrayObject, o["/Names"]) li = o.get("/Limits", [_l[0], _l[-2]]) - if key < li[0]: + if not appb and key < li[0]: return None - if not app and _l > li[1]: + if not app 
and key > li[1]: return None i = 0 while i < len(_l): @@ -1632,7 +1641,7 @@ def _add_in(o: Optional[PdfObject], app: bool = True) -> Optional[PdfObject]: _l.insert(i + 1, writer._add_object(data)) _update_limits(o, key, None) return _l[i + 1] - i += 1 + i += 2 if app: _l.append(key) _l.append(writer._add_object(data)) @@ -1642,13 +1651,13 @@ def _add_in(o: Optional[PdfObject], app: bool = True) -> Optional[PdfObject]: else: # kids ar = cast(ArrayObject, o["/Kids"]) for x in ar: - r = _add_in(x, x == ar[-1]) + r = _add_in(x, x == ar[0], x == ar[-1]) if r: _update_limits(o, key, key) return r return None - o = _add_in(self, True) + o = _add_in(self, True, True) return o.indirect_reference if o is not None else None From 8b99ea02c6bd7a20a6574df98879a2e5bbeb2bb6 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 5 Oct 2023 23:36:47 +0200 Subject: [PATCH 04/13] new commit --- pypdf/_reader.py | 4663 +++++++++++++++-------------- pypdf/_writer.py | 40 +- pypdf/generic/_data_structures.py | 28 +- tests/test_writer.py | 3735 +++++++++++------------ 4 files changed, 4247 insertions(+), 4219 deletions(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 8bd9e2454..f5d0c5ada 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -1,2331 +1,2332 @@ -# Copyright (c) 2006, Mathieu Fenniak -# Copyright (c) 2007, Ashish Kulkarni -# -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. 
-# * The name of the author may not be used to endorse or promote products -# derived from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. - -import os -import re -import struct -import zlib -from datetime import datetime -from io import BytesIO, UnsupportedOperation -from pathlib import Path -from typing import ( - Any, - Callable, - Dict, - Iterable, - List, - Mapping, - Optional, - Tuple, - Union, - cast, -) - -from ._encryption import Encryption, PasswordType -from ._page import PageObject, _VirtualList -from ._page_labels import index2label as page_index2page_label -from ._utils import ( - StrByteType, - StreamType, - b_, - deprecate_no_replacement, - deprecation_no_replacement, - deprecation_with_replacement, - logger_warning, - parse_iso8824_date, - read_non_whitespace, - read_previous_line, - read_until_whitespace, - skip_over_comment, - skip_over_whitespace, -) -from .constants import CatalogAttributes as CA -from .constants import CatalogDictionary as CD -from .constants import ( - CheckboxRadioButtonAttributes, - GoToActionArguments, -) -from .constants import Core as CO -from .constants import DocumentInformationAttributes as DI -from .constants import 
FieldDictionaryAttributes as FA -from .constants import PageAttributes as PG -from .constants import PagesAttributes as PA -from .constants import TrailerKeys as TK -from .errors import ( - EmptyFileError, - FileNotDecryptedError, - PdfReadError, - PdfStreamError, - WrongPasswordError, -) -from .generic import ( - ArrayObject, - BooleanObject, - ContentStream, - DecodedStreamObject, - Destination, - DictionaryObject, - EncodedStreamObject, - Field, - Fit, - FloatObject, - IndirectObject, - NameObject, - NameTree, - NullObject, - NumberObject, - PdfObject, - TextStringObject, - TreeObject, - ViewerPreferences, - read_object, -) -from .types import OutlineType, PagemodeType -from .xmp import XmpInformation - - -def convert_to_int(d: bytes, size: int) -> Union[int, Tuple[Any, ...]]: - if size > 8: - raise PdfReadError("invalid size in convert_to_int") - d = b"\x00\x00\x00\x00\x00\x00\x00\x00" + d - d = d[-8:] - return struct.unpack(">q", d)[0] - - -def convertToInt(d: bytes, size: int) -> Union[int, Tuple[Any, ...]]: # deprecated - deprecation_with_replacement("convertToInt", "convert_to_int") - return convert_to_int(d, size) - - -class DocumentInformation(DictionaryObject): - """ - A class representing the basic document metadata provided in a PDF File. - This class is accessible through - :py:class:`PdfReader.metadata`. - - All text properties of the document metadata have - *two* properties, eg. author and author_raw. The non-raw property will - always return a ``TextStringObject``, making it ideal for a case where - the metadata is being displayed. The raw property can sometimes return - a ``ByteStringObject``, if pypdf was unable to decode the string's - text encoding; this requires additional safety in the caller and - therefore is not as commonly accessed. 
- """ - - def __init__(self) -> None: - DictionaryObject.__init__(self) - - def _get_text(self, key: str) -> Optional[str]: - retval = self.get(key, None) - if isinstance(retval, TextStringObject): - return retval - return None - - def getText(self, key: str) -> Optional[str]: # deprecated - """ - Use the attributes (e.g. :py:attr:`title` / :py:attr:`author`). - - .. deprecated:: 1.28.0 - """ - deprecation_no_replacement("getText", "3.0.0") - return self._get_text(key) - - @property - def title(self) -> Optional[str]: - """ - Read-only property accessing the document's title. - - Returns a ``TextStringObject`` or ``None`` if the title is not - specified. - """ - return ( - self._get_text(DI.TITLE) or self.get(DI.TITLE).get_object() # type: ignore - if self.get(DI.TITLE) - else None - ) - - @property - def title_raw(self) -> Optional[str]: - """The "raw" version of title; can return a ``ByteStringObject``.""" - return self.get(DI.TITLE) - - @property - def author(self) -> Optional[str]: - """ - Read-only property accessing the document's author. - - Returns a ``TextStringObject`` or ``None`` if the author is not - specified. - """ - return self._get_text(DI.AUTHOR) - - @property - def author_raw(self) -> Optional[str]: - """The "raw" version of author; can return a ``ByteStringObject``.""" - return self.get(DI.AUTHOR) - - @property - def subject(self) -> Optional[str]: - """ - Read-only property accessing the document's subject. - - Returns a ``TextStringObject`` or ``None`` if the subject is not - specified. - """ - return self._get_text(DI.SUBJECT) - - @property - def subject_raw(self) -> Optional[str]: - """The "raw" version of subject; can return a ``ByteStringObject``.""" - return self.get(DI.SUBJECT) - - @property - def creator(self) -> Optional[str]: - """ - Read-only property accessing the document's creator. - - If the document was converted to PDF from another format, this is the - name of the application (e.g. 
OpenOffice) that created the original - document from which it was converted. Returns a ``TextStringObject`` or - ``None`` if the creator is not specified. - """ - return self._get_text(DI.CREATOR) - - @property - def creator_raw(self) -> Optional[str]: - """The "raw" version of creator; can return a ``ByteStringObject``.""" - return self.get(DI.CREATOR) - - @property - def producer(self) -> Optional[str]: - """ - Read-only property accessing the document's producer. - - If the document was converted to PDF from another format, this is the - name of the application (for example, OSX Quartz) that converted it to - PDF. Returns a ``TextStringObject`` or ``None`` if the producer is not - specified. - """ - return self._get_text(DI.PRODUCER) - - @property - def producer_raw(self) -> Optional[str]: - """The "raw" version of producer; can return a ``ByteStringObject``.""" - return self.get(DI.PRODUCER) - - @property - def creation_date(self) -> Optional[datetime]: - """Read-only property accessing the document's creation date.""" - return parse_iso8824_date(self._get_text(DI.CREATION_DATE)) - - @property - def creation_date_raw(self) -> Optional[str]: - """ - The "raw" version of creation date; can return a ``ByteStringObject``. - - Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix - is the offset from UTC. - """ - return self.get(DI.CREATION_DATE) - - @property - def modification_date(self) -> Optional[datetime]: - """ - Read-only property accessing the document's modification date. - - The date and time the document was most recently modified. - """ - return parse_iso8824_date(self._get_text(DI.MOD_DATE)) - - @property - def modification_date_raw(self) -> Optional[str]: - """ - The "raw" version of modification date; can return a - ``ByteStringObject``. - - Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix - is the offset from UTC. 
- """ - return self.get(DI.MOD_DATE) - - -class PdfReader: - """ - Initialize a PdfReader object. - - This operation can take some time, as the PDF stream's cross-reference - tables are read into memory. - - Args: - stream: A File object or an object that supports the standard read - and seek methods similar to a File object. Could also be a - string representing a path to a PDF file. - strict: Determines whether user should be warned of all - problems and also causes some correctable problems to be fatal. - Defaults to ``False``. - password: Decrypt PDF file at initialization. If the - password is None, the file will not be decrypted. - Defaults to ``None`` - """ - - @property - def viewer_preferences(self) -> Optional[ViewerPreferences]: - """Returns the existing ViewerPreferences as an overloaded dictionary.""" - o = cast(DictionaryObject, self.trailer["/Root"]).get( - CD.VIEWER_PREFERENCES, None - ) - if o is None: - return None - o = o.get_object() - if not isinstance(o, ViewerPreferences): - o = ViewerPreferences(o) - return o - - def __init__( - self, - stream: Union[StrByteType, Path], - strict: bool = False, - password: Union[None, str, bytes] = None, - ) -> None: - self.strict = strict - self.flattened_pages: Optional[List[PageObject]] = None - self.resolved_objects: Dict[Tuple[Any, Any], Optional[PdfObject]] = {} - self.xref_index = 0 - self._page_id2num: Optional[ - Dict[Any, Any] - ] = None # map page indirect_reference number to Page Number - if hasattr(stream, "mode") and "b" not in stream.mode: # type: ignore - logger_warning( - "PdfReader stream/file object is not in binary mode. 
" - "It may not be read correctly.", - __name__, - ) - if isinstance(stream, (str, Path)): - with open(stream, "rb") as fh: - stream = BytesIO(fh.read()) - self.read(stream) - self.stream = stream - - self._override_encryption = False - self._encryption: Optional[Encryption] = None - if self.is_encrypted: - self._override_encryption = True - # Some documents may not have a /ID, use two empty - # byte strings instead. Solves - # https://github.com/py-pdf/pypdf/issues/608 - id_entry = self.trailer.get(TK.ID) - id1_entry = id_entry[0].get_object().original_bytes if id_entry else b"" - encrypt_entry = cast( - DictionaryObject, self.trailer[TK.ENCRYPT].get_object() - ) - self._encryption = Encryption.read(encrypt_entry, id1_entry) - - # try empty password if no password provided - pwd = password if password is not None else b"" - if ( - self._encryption.verify(pwd) == PasswordType.NOT_DECRYPTED - and password is not None - ): - # raise if password provided - raise WrongPasswordError("Wrong password") - self._override_encryption = False - elif password is not None: - raise PdfReadError("Not encrypted file") - - @property - def pdf_header(self) -> str: - """ - The first 8 bytes of the file. - - This is typically something like ``'%PDF-1.6'`` and can be used to - detect if the file is actually a PDF file and which version it is. - """ - # TODO: Make this return a bytes object for consistency - # but that needs a deprecation - loc = self.stream.tell() - self.stream.seek(0, 0) - pdf_file_version = self.stream.read(8).decode("utf-8", "backslashreplace") - self.stream.seek(loc, 0) # return to where it was - return pdf_file_version - - @property - def metadata(self) -> Optional[DocumentInformation]: - """ - Retrieve the PDF file's document information dictionary, if it exists. - - Note that some PDF files use metadata streams instead of docinfo - dictionaries, and these metadata streams will not be accessed by this - function. 
- """ - if TK.INFO not in self.trailer: - return None - obj = self.trailer[TK.INFO] - retval = DocumentInformation() - if isinstance(obj, type(None)): - raise PdfReadError( - "trailer not found or does not point to document information directory" - ) - retval.update(obj) # type: ignore - return retval - - def getDocumentInfo(self) -> Optional[DocumentInformation]: # deprecated - """ - Use the attribute :py:attr:`metadata` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement("getDocumentInfo", "metadata", "3.0.0") - return self.metadata - - @property - def documentInfo(self) -> Optional[DocumentInformation]: # deprecated - """ - Use the attribute :py:attr:`metadata` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement("documentInfo", "metadata", "3.0.0") - return self.metadata - - @property - def xmp_metadata(self) -> Optional[XmpInformation]: - """XMP (Extensible Metadata Platform) data.""" - try: - self._override_encryption = True - return self.trailer[TK.ROOT].xmp_metadata # type: ignore - finally: - self._override_encryption = False - - def getXmpMetadata(self) -> Optional[XmpInformation]: # deprecated - """ - Use the attribute :py:attr:`metadata` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement("getXmpMetadata", "xmp_metadata", "3.0.0") - return self.xmp_metadata - - @property - def xmpMetadata(self) -> Optional[XmpInformation]: # deprecated - """ - Use the attribute :py:attr:`xmp_metadata` instead. - - .. deprecated:: 1.28.0. - """ - deprecation_with_replacement("xmpMetadata", "xmp_metadata", "3.0.0") - return self.xmp_metadata - - def _get_num_pages(self) -> int: - """ - Calculate the number of pages in this PDF file. - - Returns: - The number of pages of the parsed PDF file - - Raises: - PdfReadError: if file is encrypted and restrictions prevent - this action. - """ - # Flattened pages will not work on an Encrypted PDF; - # the PDF file's page count is used in this case. 
Otherwise, - # the original method (flattened page count) is used. - if self.is_encrypted: - return self.trailer[TK.ROOT]["/Pages"]["/Count"] # type: ignore - else: - if self.flattened_pages is None: - self._flatten() - return len(self.flattened_pages) # type: ignore - - def getNumPages(self) -> int: # deprecated - """ - Use :code:`len(reader.pages)` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement("reader.getNumPages", "len(reader.pages)", "3.0.0") - return self._get_num_pages() - - @property - def numPages(self) -> int: # deprecated - """ - Use :code:`len(reader.pages)` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement("reader.numPages", "len(reader.pages)", "3.0.0") - return self._get_num_pages() - - def getPage(self, pageNumber: int) -> PageObject: # deprecated - """ - Use :code:`reader.pages[page_number]` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement( - "reader.getPage(pageNumber)", "reader.pages[page_number]", "3.0.0" - ) - return self._get_page(pageNumber) - - def _get_page(self, page_number: int) -> PageObject: - """ - Retrieve a page by number from this PDF file. - - Args: - page_number: The page number to retrieve - (pages begin at zero) - - Returns: - A :class:`PageObject` instance. - """ - if self.flattened_pages is None: - self._flatten() - assert self.flattened_pages is not None, "hint for mypy" - return self.flattened_pages[page_number] - - @property - def namedDestinations(self) -> Dict[str, Any]: # deprecated - """ - Use :py:attr:`named_destinations` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement("namedDestinations", "named_destinations", "3.0.0") - return self.named_destinations - - @property - def named_destinations(self) -> Dict[str, Any]: - """ - A read-only dictionary which maps names to - :class:`Destinations` - """ - return self._get_named_destinations() - - # A select group of relevant field attributes. 
For the complete list, - # see section 8.6.2 of the PDF 1.7 reference. - - def get_fields( - self, - tree: Optional[TreeObject] = None, - retval: Optional[Dict[Any, Any]] = None, - fileobj: Optional[Any] = None, - ) -> Optional[Dict[str, Any]]: - """ - Extract field data if this PDF contains interactive form fields. - - The *tree* and *retval* parameters are for recursive use. - - Args: - tree: - retval: - fileobj: A file object (usually a text file) to write - a report to on all interactive form fields found. - - Returns: - A dictionary where each key is a field name, and each - value is a :class:`Field` object. By - default, the mapping name is used for keys. - ``None`` if form data could not be located. - """ - field_attributes = FA.attributes_dict() - field_attributes.update(CheckboxRadioButtonAttributes.attributes_dict()) - if retval is None: - retval = {} - catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) - # get the AcroForm tree - if CD.ACRO_FORM in catalog: - tree = cast(Optional[TreeObject], catalog[CD.ACRO_FORM]) - else: - return None - if tree is None: - return retval - self._check_kids(tree, retval, fileobj) - for attr in field_attributes: - if attr in tree: - # Tree is a field - self._build_field(tree, retval, fileobj, field_attributes) - break - - if "/Fields" in tree: - fields = cast(ArrayObject, tree["/Fields"]) - for f in fields: - field = f.get_object() - self._build_field(field, retval, fileobj, field_attributes) - - return retval - - def getFields( - self, - tree: Optional[TreeObject] = None, - retval: Optional[Dict[Any, Any]] = None, - fileobj: Optional[Any] = None, - ) -> Optional[Dict[str, Any]]: # deprecated - """ - Use :meth:`get_fields` instead. - - .. 
deprecated:: 1.28.0 - """ - deprecation_with_replacement("getFields", "get_fields", "3.0.0") - return self.get_fields(tree, retval, fileobj) - - def _get_qualified_field_name(self, parent: DictionaryObject) -> str: - if "/TM" in parent: - return cast(str, parent["/TM"]) - elif "/Parent" in parent: - return ( - self._get_qualified_field_name( - cast(DictionaryObject, parent["/Parent"]) - ) - + "." - + cast(str, parent["/T"]) - ) - else: - return cast(str, parent["/T"]) - - def _build_field( - self, - field: Union[TreeObject, DictionaryObject], - retval: Dict[Any, Any], - fileobj: Any, - field_attributes: Any, - ) -> None: - self._check_kids(field, retval, fileobj) - try: - key = cast(str, field["/TM"]) - except KeyError: - try: - if "/Parent" in field: - key = ( - self._get_qualified_field_name( - cast(DictionaryObject, field["/Parent"]) - ) - + "." - ) - else: - key = "" - key += cast(str, field["/T"]) - except KeyError: - # Ignore no-name field for now - return - if fileobj: - self._write_field(fileobj, field, field_attributes) - fileobj.write("\n") - retval[key] = Field(field) - obj = retval[key].indirect_reference.get_object() # to get the full object - if obj.get(FA.FT, "") == "/Ch": - retval[key][NameObject("/_States_")] = obj[NameObject(FA.Opt)] - if obj.get(FA.FT, "") == "/Btn" and "/AP" in obj: - # Checkbox - retval[key][NameObject("/_States_")] = ArrayObject( - list(obj["/AP"]["/N"].keys()) - ) - if "/Off" not in retval[key]["/_States_"]: - retval[key][NameObject("/_States_")].append(NameObject("/Off")) - elif obj.get(FA.FT, "") == "/Btn" and obj.get(FA.Ff, 0) & FA.FfBits.Radio != 0: - states = [] - for k in obj.get(FA.Kids, {}): - k = k.get_object() - for s in list(k["/AP"]["/N"].keys()): - if s not in states: - states.append(s) - retval[key][NameObject("/_States_")] = ArrayObject(states) - if ( - obj.get(FA.Ff, 0) & FA.FfBits.NoToggleToOff != 0 - and "/Off" in retval[key]["/_States_"] - ): - del 
retval[key]["/_States_"][retval[key]["/_States_"].index("/Off")] - - def _check_kids( - self, tree: Union[TreeObject, DictionaryObject], retval: Any, fileobj: Any - ) -> None: - if PA.KIDS in tree: - # recurse down the tree - for kid in tree[PA.KIDS]: # type: ignore - self.get_fields(kid.get_object(), retval, fileobj) - - def _write_field(self, fileobj: Any, field: Any, field_attributes: Any) -> None: - field_attributes_tuple = FA.attributes() - field_attributes_tuple = ( - field_attributes_tuple + CheckboxRadioButtonAttributes.attributes() - ) - - for attr in field_attributes_tuple: - if attr in ( - FA.Kids, - FA.AA, - ): - continue - attr_name = field_attributes[attr] - try: - if attr == FA.FT: - # Make the field type value more clear - types = { - "/Btn": "Button", - "/Tx": "Text", - "/Ch": "Choice", - "/Sig": "Signature", - } - if field[attr] in types: - fileobj.write(f"{attr_name}: {types[field[attr]]}\n") - elif attr == FA.Parent: - # Let's just write the name of the parent - try: - name = field[attr][FA.TM] - except KeyError: - name = field[attr][FA.T] - fileobj.write(f"{attr_name}: {name}\n") - else: - fileobj.write(f"{attr_name}: {field[attr]}\n") - except KeyError: - # Field attribute is N/A or unknown, so don't write anything - pass - - def get_form_text_fields(self, full_qualified_name: bool = False) -> Dict[str, Any]: - """ - Retrieve form fields from the document with textual data. - - Args: - full_qualified_name: to get full name - - Returns: - A dictionary. The key is the name of the form field, - the value is the content of the field. - - If the document contains multiple form fields with the same name, the - second and following will get the suffix .2, .3, ... - """ - - def indexed_key(k: str, fields: dict) -> str: - if k not in fields: - return k - else: - return ( - k - + "." 
- + str(sum([1 for kk in fields if kk.startswith(k + ".")]) + 2) - ) - - # Retrieve document form fields - formfields = self.get_fields() - if formfields is None: - return {} - ff = {} - for field, value in formfields.items(): - if value.get("/FT") == "/Tx": - if full_qualified_name: - ff[field] = value.get("/V") - else: - ff[indexed_key(cast(str, value["/T"]), ff)] = value.get("/V") - return ff - - def getFormTextFields(self) -> Dict[str, Any]: # deprecated - """ - Use :meth:`get_form_text_fields` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement( - "getFormTextFields", "get_form_text_fields", "3.0.0" - ) - return self.get_form_text_fields() - - def _get_named_destinations( - self, - tree: Union[TreeObject, None] = None, - retval: Optional[Any] = None, - ) -> Dict[str, Any]: - """ - Retrieve the named destinations present in the document. - - Args: - tree: - retval: - - Returns: - A dictionary which maps names to - :class:`Destinations`. - """ - if retval is None: - retval = {} - catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) - - # get the name tree - if CA.DESTS in catalog: - tree = cast(TreeObject, catalog[CA.DESTS]) - elif CA.NAMES in catalog: - names = cast(DictionaryObject, catalog[CA.NAMES]) - if CA.DESTS in names: - tree = cast(TreeObject, names[CA.DESTS]) - - if tree is None: - return retval - - if PA.KIDS in tree: - # recurse down the tree - for kid in cast(ArrayObject, tree[PA.KIDS]): - self._get_named_destinations(kid.get_object(), retval) - # TABLE 3.33 Entries in a name tree node dictionary (PDF 1.7 specs) - elif CA.NAMES in tree: # KIDS and NAMES are exclusives (PDF 1.7 specs p 162) - names = cast(DictionaryObject, tree[CA.NAMES]) - i = 0 - while i < len(names): - key = cast(str, names[i].get_object()) - i += 1 - if not isinstance(key, str): - continue - try: - value = names[i].get_object() - except IndexError: - break - i += 1 - if isinstance(value, DictionaryObject) and "/D" in value: - value = value["/D"] - dest = 
self._build_destination(key, value) # type: ignore - if dest is not None: - retval[key] = dest - else: # case where Dests is in root catalog (PDF 1.7 specs, §2 about PDF1.1 - for k__, v__ in tree.items(): - val = v__.get_object() - if isinstance(val, DictionaryObject): - val = val["/D"].get_object() - dest = self._build_destination(k__, val) - if dest is not None: - retval[k__] = dest - return retval - - def getNamedDestinations( - self, - tree: Union[TreeObject, None] = None, - retval: Optional[Any] = None, - ) -> Dict[str, Any]: # deprecated - """ - Use :py:attr:`named_destinations` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement( - "getNamedDestinations", "named_destinations", "3.0.0" - ) - return self._get_named_destinations(tree, retval) - - @property - def outline(self) -> OutlineType: - """ - Read-only property for the outline present in the document. - - (i.e., a collection of 'outline items' which are also known as - 'bookmarks') - """ - return self._get_outline() - - @property - def outlines(self) -> OutlineType: # deprecated - """ - Use :py:attr:`outline` instead. - - .. 
deprecated:: 2.9.0 - """ - deprecation_with_replacement("outlines", "outline", "3.0.0") - return self.outline - - def _get_outline( - self, node: Optional[DictionaryObject] = None, outline: Optional[Any] = None - ) -> OutlineType: - if outline is None: - outline = [] - catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) - - # get the outline dictionary and named destinations - if CO.OUTLINES in catalog: - lines = cast(DictionaryObject, catalog[CO.OUTLINES]) - - if isinstance(lines, NullObject): - return outline - - # TABLE 8.3 Entries in the outline dictionary - if lines is not None and "/First" in lines: - node = cast(DictionaryObject, lines["/First"]) - self._namedDests = self._get_named_destinations() - - if node is None: - return outline - - # see if there are any more outline items - while True: - outline_obj = self._build_outline_item(node) - if outline_obj: - outline.append(outline_obj) - - # check for sub-outline - if "/First" in node: - sub_outline: List[Any] = [] - self._get_outline(cast(DictionaryObject, node["/First"]), sub_outline) - if sub_outline: - outline.append(sub_outline) - - if "/Next" not in node: - break - node = cast(DictionaryObject, node["/Next"]) - - return outline - - def getOutlines( - self, node: Optional[DictionaryObject] = None, outline: Optional[Any] = None - ) -> OutlineType: # deprecated - """ - Use :py:attr:`outline` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement("getOutlines", "outline", "3.0.0") - return self._get_outline(node, outline) - - @property - def threads(self) -> Optional[ArrayObject]: - """ - Read-only property for the list of threads. - - See §8.3.2 from PDF 1.7 spec. - - It's an array of dictionaries with "/F" and "/I" properties or - None if there are no articles. 
- """ - catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) - if CO.THREADS in catalog: - return cast("ArrayObject", catalog[CO.THREADS]) - else: - return None - - def _get_page_number_by_indirect( - self, indirect_reference: Union[None, int, NullObject, IndirectObject] - ) -> int: - """ - Generate _page_id2num. - - Args: - indirect_reference: - - Returns: - The page number. - """ - if self._page_id2num is None: - self._page_id2num = { - x.indirect_reference.idnum: i for i, x in enumerate(self.pages) # type: ignore - } - - if indirect_reference is None or isinstance(indirect_reference, NullObject): - return -1 - if isinstance(indirect_reference, int): - idnum = indirect_reference - else: - idnum = indirect_reference.idnum - assert self._page_id2num is not None, "hint for mypy" - ret = self._page_id2num.get(idnum, -1) - return ret - - def get_page_number(self, page: PageObject) -> int: - """ - Retrieve page number of a given PageObject. - - Args: - page: The page to get page number. Should be - an instance of :class:`PageObject` - - Returns: - The page number or -1 if page is not found - """ - return self._get_page_number_by_indirect(page.indirect_reference) - - def getPageNumber(self, page: PageObject) -> int: # deprecated - """ - Use :meth:`get_page_number` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement("getPageNumber", "get_page_number", "3.0.0") - return self.get_page_number(page) - - def get_destination_page_number(self, destination: Destination) -> int: - """ - Retrieve page number of a given Destination object. - - Args: - destination: The destination to get page number. - - Returns: - The page number or -1 if page is not found - """ - return self._get_page_number_by_indirect(destination.page) - - def getDestinationPageNumber(self, destination: Destination) -> int: # deprecated - """ - Use :meth:`get_destination_page_number` instead. - - .. 
deprecated:: 1.28.0 - """ - deprecation_with_replacement( - "getDestinationPageNumber", "get_destination_page_number", "3.0.0" - ) - return self.get_destination_page_number(destination) - - def _build_destination( - self, - title: str, - array: Optional[ - List[ - Union[NumberObject, IndirectObject, None, NullObject, DictionaryObject] - ] - ], - ) -> Destination: - page, typ = None, None - # handle outline items with missing or invalid destination - if ( - isinstance(array, (NullObject, str)) - or (isinstance(array, ArrayObject) and len(array) == 0) - or array is None - ): - page = NullObject() - return Destination(title, page, Fit.fit()) - else: - page, typ = array[0:2] # type: ignore - array = array[2:] - try: - return Destination(title, page, Fit(fit_type=typ, fit_args=array)) # type: ignore - except PdfReadError: - logger_warning(f"Unknown destination: {title} {array}", __name__) - if self.strict: - raise - # create a link to first Page - tmp = self.pages[0].indirect_reference - indirect_reference = NullObject() if tmp is None else tmp - return Destination(title, indirect_reference, Fit.fit()) # type: ignore - - def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]: - dest, title, outline_item = None, None, None - - # title required for valid outline - # PDF Reference 1.7: TABLE 8.4 Entries in an outline item dictionary - try: - title = cast("str", node["/Title"]) - except KeyError: - if self.strict: - raise PdfReadError(f"Outline Entry Missing /Title attribute: {node!r}") - title = "" # type: ignore - - if "/A" in node: - # Action, PDFv1.7 Section 12.6 (only type GoTo supported) - action = cast(DictionaryObject, node["/A"]) - action_type = cast(NameObject, action[GoToActionArguments.S]) - if action_type == "/GoTo": - dest = action[GoToActionArguments.D] - elif "/Dest" in node: - # Destination, PDFv1.7 Section 12.3.2 - dest = node["/Dest"] - # if array was referenced in another object, will be a dict w/ key "/D" - if isinstance(dest, 
DictionaryObject) and "/D" in dest: - dest = dest["/D"] - - if isinstance(dest, ArrayObject): - outline_item = self._build_destination(title, dest) - elif isinstance(dest, str): - # named destination, addresses NameObject Issue #193 - # TODO : keep named destination instead of replacing it ? - try: - outline_item = self._build_destination( - title, self._namedDests[dest].dest_array - ) - except KeyError: - # named destination not found in Name Dict - outline_item = self._build_destination(title, None) - elif dest is None: - # outline item not required to have destination or action - # PDFv1.7 Table 153 - outline_item = self._build_destination(title, dest) - else: - if self.strict: - raise PdfReadError(f"Unexpected destination {dest!r}") - else: - logger_warning( - f"Removed unexpected destination {dest!r} from destination", - __name__, - ) - outline_item = self._build_destination(title, None) # type: ignore - - # if outline item created, add color, format, and child count if present - if outline_item: - if "/C" in node: - # Color of outline item font in (R, G, B) with values ranging 0.0-1.0 - outline_item[NameObject("/C")] = ArrayObject(FloatObject(c) for c in node["/C"]) # type: ignore - if "/F" in node: - # specifies style characteristics bold and/or italic - # with 1=italic, 2=bold, 3=both - outline_item[NameObject("/F")] = node["/F"] - if "/Count" in node: - # absolute value = num. 
visible children - # with positive = open/unfolded, negative = closed/folded - outline_item[NameObject("/Count")] = node["/Count"] - # if count is 0 we will consider it as open ( in order to have always an is_open to simplify - outline_item[NameObject("/%is_open%")] = BooleanObject( - node.get("/Count", 0) >= 0 - ) - outline_item.node = node - try: - outline_item.indirect_reference = node.indirect_reference - except AttributeError: - pass - return outline_item - - @property - def pages(self) -> List[PageObject]: - """Read-only property that emulates a list of :py:class:`Page` objects.""" - return _VirtualList(self._get_num_pages, self._get_page) # type: ignore - - @property - def page_labels(self) -> List[str]: - """ - A list of labels for the pages in this document. - - This property is read-only. The labels are in the order that the pages - appear in the document. - """ - return [page_index2page_label(self, i) for i in range(len(self.pages))] - - @property - def page_layout(self) -> Optional[str]: - """ - Get the page layout currently being used. - - .. list-table:: Valid ``layout`` values - :widths: 50 200 - - * - /NoLayout - - Layout explicitly not specified - * - /SinglePage - - Show one page at a time - * - /OneColumn - - Show one column at a time - * - /TwoColumnLeft - - Show pages in two columns, odd-numbered pages on the left - * - /TwoColumnRight - - Show pages in two columns, odd-numbered pages on the right - * - /TwoPageLeft - - Show two pages at a time, odd-numbered pages on the left - * - /TwoPageRight - - Show two pages at a time, odd-numbered pages on the right - """ - trailer = cast(DictionaryObject, self.trailer[TK.ROOT]) - if CD.PAGE_LAYOUT in trailer: - return cast(NameObject, trailer[CD.PAGE_LAYOUT]) - return None - - def getPageLayout(self) -> Optional[str]: # deprecated - """ - Use :py:attr:`page_layout` instead. - - .. 
deprecated:: 1.28.0 - """ - deprecation_with_replacement("getPageLayout", "page_layout", "3.0.0") - return self.page_layout - - @property - def pageLayout(self) -> Optional[str]: # deprecated - """ - Use :py:attr:`page_layout` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement("pageLayout", "page_layout", "3.0.0") - return self.page_layout - - @property - def page_mode(self) -> Optional[PagemodeType]: - """ - Get the page mode currently being used. - - .. list-table:: Valid ``mode`` values - :widths: 50 200 - - * - /UseNone - - Do not show outline or thumbnails panels - * - /UseOutlines - - Show outline (aka bookmarks) panel - * - /UseThumbs - - Show page thumbnails panel - * - /FullScreen - - Fullscreen view - * - /UseOC - - Show Optional Content Group (OCG) panel - * - /UseAttachments - - Show attachments panel - """ - try: - return self.trailer[TK.ROOT]["/PageMode"] # type: ignore - except KeyError: - return None - - def getPageMode(self) -> Optional[PagemodeType]: # deprecated - """ - Use :py:attr:`page_mode` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement("getPageMode", "page_mode", "3.0.0") - return self.page_mode - - @property - def pageMode(self) -> Optional[PagemodeType]: # deprecated - """ - Use :py:attr:`page_mode` instead. - - .. 
deprecated:: 1.28.0 - """ - deprecation_with_replacement("pageMode", "page_mode", "3.0.0") - return self.page_mode - - def _flatten( - self, - pages: Union[None, DictionaryObject, PageObject] = None, - inherit: Optional[Dict[str, Any]] = None, - indirect_reference: Optional[IndirectObject] = None, - ) -> None: - inheritable_page_attributes = ( - NameObject(PG.RESOURCES), - NameObject(PG.MEDIABOX), - NameObject(PG.CROPBOX), - NameObject(PG.ROTATE), - ) - if inherit is None: - inherit = {} - if pages is None: - # Fix issue 327: set flattened_pages attribute only for - # decrypted file - catalog = self.trailer[TK.ROOT].get_object() - pages = catalog["/Pages"].get_object() # type: ignore - self.flattened_pages = [] - - if PA.TYPE in pages: - t = pages[PA.TYPE] # type: ignore - # if pdf has no type, considered as a page if /Kids is missing - elif PA.KIDS not in pages: - t = "/Page" - else: - t = "/Pages" - - if t == "/Pages": - for attr in inheritable_page_attributes: - if attr in pages: - inherit[attr] = pages[attr] - for page in pages[PA.KIDS]: # type: ignore - addt = {} - if isinstance(page, IndirectObject): - addt["indirect_reference"] = page - obj = page.get_object() - if obj: - # damaged file may have invalid child in /Pages - self._flatten(obj, inherit, **addt) - elif t == "/Page": - for attr_in, value in list(inherit.items()): - # if the page has it's own value, it does not inherit the - # parent's value: - if attr_in not in pages: - pages[attr_in] = value - page_obj = PageObject(self, indirect_reference) - page_obj.update(pages) - - # TODO: Could flattened_pages be None at this point? 
- self.flattened_pages.append(page_obj) # type: ignore - - def _get_object_from_stream( - self, indirect_reference: IndirectObject - ) -> Union[int, PdfObject, str]: - # indirect reference to object in object stream - # read the entire object stream into memory - stmnum, idx = self.xref_objStm[indirect_reference.idnum] - obj_stm: EncodedStreamObject = IndirectObject(stmnum, 0, self).get_object() # type: ignore - # This is an xref to a stream, so its type better be a stream - assert cast(str, obj_stm["/Type"]) == "/ObjStm" - # /N is the number of indirect objects in the stream - assert idx < obj_stm["/N"] - stream_data = BytesIO(b_(obj_stm.get_data())) - for i in range(obj_stm["/N"]): # type: ignore - read_non_whitespace(stream_data) - stream_data.seek(-1, 1) - objnum = NumberObject.read_from_stream(stream_data) - read_non_whitespace(stream_data) - stream_data.seek(-1, 1) - offset = NumberObject.read_from_stream(stream_data) - read_non_whitespace(stream_data) - stream_data.seek(-1, 1) - if objnum != indirect_reference.idnum: - # We're only interested in one object - continue - if self.strict and idx != i: - raise PdfReadError("Object is in wrong index.") - stream_data.seek(int(obj_stm["/First"] + offset), 0) # type: ignore - - # to cope with some case where the 'pointer' is on a white space - read_non_whitespace(stream_data) - stream_data.seek(-1, 1) - - try: - obj = read_object(stream_data, self) - except PdfStreamError as exc: - # Stream object cannot be read. Normally, a critical error, but - # Adobe Reader doesn't complain, so continue (in strict mode?) - logger_warning( - f"Invalid stream (index {i}) within object " - f"{indirect_reference.idnum} {indirect_reference.generation}: " - f"{exc}", - __name__, - ) - - if self.strict: - raise PdfReadError(f"Can't read object stream: {exc}") - # Replace with null. Hopefully it's nothing important. 
- obj = NullObject() - return obj - - if self.strict: - raise PdfReadError("This is a fatal error in strict mode.") - return NullObject() - - def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]: - """ - Used to ease development. - - This is equivalent to generic.IndirectObject(num,gen,self).get_object() - - Args: - num: The object number of the indirect object. - gen: The generation number of the indirect object. - - Returns: - A PdfObject - """ - return IndirectObject(num, gen, self).get_object() - - def get_object( - self, indirect_reference: Union[int, IndirectObject] - ) -> Optional[PdfObject]: - if isinstance(indirect_reference, int): - indirect_reference = IndirectObject(indirect_reference, 0, self) - retval = self.cache_get_indirect_object( - indirect_reference.generation, indirect_reference.idnum - ) - if retval is not None: - return retval - if ( - indirect_reference.generation == 0 - and indirect_reference.idnum in self.xref_objStm - ): - retval = self._get_object_from_stream(indirect_reference) # type: ignore - elif ( - indirect_reference.generation in self.xref - and indirect_reference.idnum in self.xref[indirect_reference.generation] - ): - if self.xref_free_entry.get(indirect_reference.generation, {}).get( - indirect_reference.idnum, False - ): - return NullObject() - start = self.xref[indirect_reference.generation][indirect_reference.idnum] - self.stream.seek(start, 0) - try: - idnum, generation = self.read_object_header(self.stream) - except Exception: - if hasattr(self.stream, "getbuffer"): - buf = bytes(self.stream.getbuffer()) # type: ignore - else: - p = self.stream.tell() - self.stream.seek(0, 0) - buf = self.stream.read(-1) - self.stream.seek(p, 0) - m = re.search( - rf"\s{indirect_reference.idnum}\s+{indirect_reference.generation}\s+obj".encode(), - buf, - ) - if m is not None: - logger_warning( - f"Object ID {indirect_reference.idnum},{indirect_reference.generation} ref repaired", - __name__, - ) - 
self.xref[indirect_reference.generation][ - indirect_reference.idnum - ] = (m.start(0) + 1) - self.stream.seek(m.start(0) + 1) - idnum, generation = self.read_object_header(self.stream) - else: - idnum = -1 # exception will be raised below - if idnum != indirect_reference.idnum and self.xref_index: - # Xref table probably had bad indexes due to not being zero-indexed - if self.strict: - raise PdfReadError( - f"Expected object ID ({indirect_reference.idnum} {indirect_reference.generation}) " - f"does not match actual ({idnum} {generation}); " - "xref table not zero-indexed." - ) - # xref table is corrected in non-strict mode - elif idnum != indirect_reference.idnum and self.strict: - # some other problem - raise PdfReadError( - f"Expected object ID ({indirect_reference.idnum} " - f"{indirect_reference.generation}) does not match actual " - f"({idnum} {generation})." - ) - if self.strict: - assert generation == indirect_reference.generation - retval = read_object(self.stream, self) # type: ignore - - # override encryption is used for the /Encrypt dictionary - if not self._override_encryption and self._encryption is not None: - # if we don't have the encryption key: - if not self._encryption.is_decrypted(): - raise FileNotDecryptedError("File has not been decrypted") - # otherwise, decrypt here... 
- retval = cast(PdfObject, retval) - retval = self._encryption.decrypt_object( - retval, indirect_reference.idnum, indirect_reference.generation - ) - else: - if hasattr(self.stream, "getbuffer"): - buf = bytes(self.stream.getbuffer()) # type: ignore - else: - p = self.stream.tell() - self.stream.seek(0, 0) - buf = self.stream.read(-1) - self.stream.seek(p, 0) - m = re.search( - rf"\s{indirect_reference.idnum}\s+{indirect_reference.generation}\s+obj".encode(), - buf, - ) - if m is not None: - logger_warning( - f"Object {indirect_reference.idnum} {indirect_reference.generation} found", - __name__, - ) - if indirect_reference.generation not in self.xref: - self.xref[indirect_reference.generation] = {} - self.xref[indirect_reference.generation][indirect_reference.idnum] = ( - m.start(0) + 1 - ) - self.stream.seek(m.end(0) + 1) - skip_over_whitespace(self.stream) - self.stream.seek(-1, 1) - retval = read_object(self.stream, self) # type: ignore - - # override encryption is used for the /Encrypt dictionary - if not self._override_encryption and self._encryption is not None: - # if we don't have the encryption key: - if not self._encryption.is_decrypted(): - raise FileNotDecryptedError("File has not been decrypted") - # otherwise, decrypt here... - retval = cast(PdfObject, retval) - retval = self._encryption.decrypt_object( - retval, indirect_reference.idnum, indirect_reference.generation - ) - else: - logger_warning( - f"Object {indirect_reference.idnum} {indirect_reference.generation} not defined.", - __name__, - ) - if self.strict: - raise PdfReadError("Could not find object.") - self.cache_indirect_object( - indirect_reference.generation, indirect_reference.idnum, retval - ) - return retval - - def getObject( - self, indirectReference: IndirectObject - ) -> Optional[PdfObject]: # deprecated - """ - Use :meth:`get_object` instead. - - .. 
deprecated:: 1.28.0 - """ - deprecation_with_replacement("getObject", "get_object", "3.0.0") - return self.get_object(indirectReference) - - def read_object_header(self, stream: StreamType) -> Tuple[int, int]: - # Should never be necessary to read out whitespace, since the - # cross-reference table should put us in the right spot to read the - # object header. In reality... some files have stupid cross reference - # tables that are off by whitespace bytes. - extra = False - skip_over_comment(stream) - extra |= skip_over_whitespace(stream) - stream.seek(-1, 1) - idnum = read_until_whitespace(stream) - extra |= skip_over_whitespace(stream) - stream.seek(-1, 1) - generation = read_until_whitespace(stream) - extra |= skip_over_whitespace(stream) - stream.seek(-1, 1) - - # although it's not used, it might still be necessary to read - _obj = stream.read(3) - - read_non_whitespace(stream) - stream.seek(-1, 1) - if extra and self.strict: - logger_warning( - f"Superfluous whitespace found in object header {idnum} {generation}", # type: ignore - __name__, - ) - return int(idnum), int(generation) - - def readObjectHeader(self, stream: StreamType) -> Tuple[int, int]: # deprecated - """ - Use :meth:`read_object_header` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement("readObjectHeader", "read_object_header", "3.0.0") - return self.read_object_header(stream) - - def cache_get_indirect_object( - self, generation: int, idnum: int - ) -> Optional[PdfObject]: - return self.resolved_objects.get((generation, idnum)) - - def cacheGetIndirectObject( - self, generation: int, idnum: int - ) -> Optional[PdfObject]: # deprecated - """ - Use :meth:`cache_get_indirect_object` instead. - - .. 
deprecated:: 1.28.0 - """ - deprecation_with_replacement( - "cacheGetIndirectObject", "cache_get_indirect_object", "3.0.0" - ) - return self.cache_get_indirect_object(generation, idnum) - - def cache_indirect_object( - self, generation: int, idnum: int, obj: Optional[PdfObject] - ) -> Optional[PdfObject]: - if (generation, idnum) in self.resolved_objects: - msg = f"Overwriting cache for {generation} {idnum}" - if self.strict: - raise PdfReadError(msg) - logger_warning(msg, __name__) - self.resolved_objects[(generation, idnum)] = obj - if obj is not None: - obj.indirect_reference = IndirectObject(idnum, generation, self) - return obj - - def cacheIndirectObject( - self, generation: int, idnum: int, obj: Optional[PdfObject] - ) -> Optional[PdfObject]: # deprecated - """ - Use :meth:`cache_indirect_object` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement("cacheIndirectObject", "cache_indirect_object") - return self.cache_indirect_object(generation, idnum, obj) - - def read(self, stream: StreamType) -> None: - self._basic_validation(stream) - self._find_eof_marker(stream) - startxref = self._find_startxref_pos(stream) - - # check and eventually correct the startxref only in not strict - xref_issue_nr = self._get_xref_issues(stream, startxref) - if xref_issue_nr != 0: - if self.strict and xref_issue_nr: - raise PdfReadError("Broken xref table") - logger_warning(f"incorrect startxref pointer({xref_issue_nr})", __name__) - - # read all cross reference tables and their trailers - self._read_xref_tables_and_trailers(stream, startxref, xref_issue_nr) - - # if not zero-indexed, verify that the table is correct; change it if necessary - if self.xref_index and not self.strict: - loc = stream.tell() - for gen, xref_entry in self.xref.items(): - if gen == 65535: - continue - xref_k = sorted( - xref_entry.keys() - ) # must ensure ascendant to prevent damage - for id in xref_k: - stream.seek(xref_entry[id], 0) - try: - pid, _pgen = 
self.read_object_header(stream) - except ValueError: - break - if pid == id - self.xref_index: - # fixing index item per item is required for revised PDF. - self.xref[gen][pid] = self.xref[gen][id] - del self.xref[gen][id] - # if not, then either it's just plain wrong, or the - # non-zero-index is actually correct - stream.seek(loc, 0) # return to where it was - - def _basic_validation(self, stream: StreamType) -> None: - """Ensure file is not empty. Read at most 5 bytes.""" - stream.seek(0, os.SEEK_SET) - try: - header_byte = stream.read(5) - except UnicodeDecodeError: - raise UnsupportedOperation("cannot read header") - if header_byte == b"": - raise EmptyFileError("Cannot read an empty file") - elif header_byte != b"%PDF-": - if self.strict: - raise PdfReadError( - f"PDF starts with '{header_byte.decode('utf8')}', " - "but '%PDF-' expected" - ) - else: - logger_warning(f"invalid pdf header: {header_byte}", __name__) - stream.seek(0, os.SEEK_END) - - def _find_eof_marker(self, stream: StreamType) -> None: - """ - Jump to the %%EOF marker. - - According to the specs, the %%EOF marker should be at the very end of - the file. Hence for standard-compliant PDF documents this function will - read only the last part (DEFAULT_BUFFER_SIZE). - """ - HEADER_SIZE = 8 # to parse whole file, Header is e.g. '%PDF-1.6' - line = b"" - while line[:5] != b"%%EOF": - if stream.tell() < HEADER_SIZE: - if self.strict: - raise PdfReadError("EOF marker not found") - else: - logger_warning("EOF marker not found", __name__) - line = read_previous_line(stream) - - def _find_startxref_pos(self, stream: StreamType) -> int: - """ - Find startxref entry - the location of the xref table. 
- - Args: - stream: - - Returns: - The bytes offset - """ - line = read_previous_line(stream) - try: - startxref = int(line) - except ValueError: - # 'startxref' may be on the same line as the location - if not line.startswith(b"startxref"): - raise PdfReadError("startxref not found") - startxref = int(line[9:].strip()) - logger_warning("startxref on same line as offset", __name__) - else: - line = read_previous_line(stream) - if line[:9] != b"startxref": - raise PdfReadError("startxref not found") - return startxref - - def _read_standard_xref_table(self, stream: StreamType) -> None: - # standard cross-reference table - ref = stream.read(3) - if ref != b"ref": - raise PdfReadError("xref table read error") - read_non_whitespace(stream) - stream.seek(-1, 1) - first_time = True # check if the first time looking at the xref table - while True: - num = cast(int, read_object(stream, self)) - if first_time and num != 0: - self.xref_index = num - if self.strict: - logger_warning( - "Xref table not zero-indexed. ID numbers for objects will be corrected.", - __name__, - ) - # if table not zero indexed, could be due to error from when PDF was created - # which will lead to mismatched indices later on, only warned and corrected if self.strict==True - first_time = False - read_non_whitespace(stream) - stream.seek(-1, 1) - size = cast(int, read_object(stream, self)) - read_non_whitespace(stream) - stream.seek(-1, 1) - cnt = 0 - while cnt < size: - line = stream.read(20) - - # It's very clear in section 3.4.3 of the PDF spec - # that all cross-reference table lines are a fixed - # 20 bytes (as of PDF 1.7). However, some files have - # 21-byte entries (or more) due to the use of \r\n - # (CRLF) EOL's. Detect that case, and adjust the line - # until it does not begin with a \r (CR) or \n (LF). 
- while line[0] in b"\x0D\x0A": - stream.seek(-20 + 1, 1) - line = stream.read(20) - - # On the other hand, some malformed PDF files - # use a single character EOL without a preceding - # space. Detect that case, and seek the stream - # back one character. (0-9 means we've bled into - # the next xref entry, t means we've bled into the - # text "trailer"): - if line[-1] in b"0123456789t": - stream.seek(-1, 1) - - try: - offset_b, generation_b = line[:16].split(b" ") - entry_type_b = line[17:18] - - offset, generation = int(offset_b), int(generation_b) - except Exception: - # if something wrong occurred - if hasattr(stream, "getbuffer"): - buf = bytes(stream.getbuffer()) # type: ignore - else: - p = stream.tell() - stream.seek(0, 0) - buf = stream.read(-1) - stream.seek(p) - - f = re.search(f"{num}\\s+(\\d+)\\s+obj".encode(), buf) - if f is None: - logger_warning( - f"entry {num} in Xref table invalid; object not found", - __name__, - ) - generation = 65535 - offset = -1 - else: - logger_warning( - f"entry {num} in Xref table invalid but object found", - __name__, - ) - generation = int(f.group(1)) - offset = f.start() - - if generation not in self.xref: - self.xref[generation] = {} - self.xref_free_entry[generation] = {} - if num in self.xref[generation]: - # It really seems like we should allow the last - # xref table in the file to override previous - # ones. Since we read the file backwards, assume - # any existing key is already set correctly. - pass - else: - self.xref[generation][num] = offset - try: - self.xref_free_entry[generation][num] = entry_type_b == b"f" - except Exception: - pass - try: - self.xref_free_entry[65535][num] = entry_type_b == b"f" - except Exception: - pass - cnt += 1 - num += 1 - read_non_whitespace(stream) - stream.seek(-1, 1) - trailer_tag = stream.read(7) - if trailer_tag != b"trailer": - # more xrefs! 
- stream.seek(-7, 1) - else: - break - - def _read_xref_tables_and_trailers( - self, stream: StreamType, startxref: Optional[int], xref_issue_nr: int - ) -> None: - self.xref: Dict[int, Dict[Any, Any]] = {} - self.xref_free_entry: Dict[int, Dict[Any, Any]] = {} - self.xref_objStm: Dict[int, Tuple[Any, Any]] = {} - self.trailer = DictionaryObject() - while startxref is not None: - # load the xref table - stream.seek(startxref, 0) - x = stream.read(1) - if x in b"\r\n": - x = stream.read(1) - if x == b"x": - startxref = self._read_xref(stream) - elif xref_issue_nr: - try: - self._rebuild_xref_table(stream) - break - except Exception: - xref_issue_nr = 0 - elif x.isdigit(): - try: - xrefstream = self._read_pdf15_xref_stream(stream) - except Exception as e: - if TK.ROOT in self.trailer: - logger_warning( - f"Previous trailer can not be read {e.args}", - __name__, - ) - break - else: - raise PdfReadError(f"trailer can not be read {e.args}") - trailer_keys = TK.ROOT, TK.ENCRYPT, TK.INFO, TK.ID, TK.SIZE - for key in trailer_keys: - if key in xrefstream and key not in self.trailer: - self.trailer[NameObject(key)] = xrefstream.raw_get(key) - if "/XRefStm" in xrefstream: - p = stream.tell() - stream.seek(cast(int, xrefstream["/XRefStm"]) + 1, 0) - self._read_pdf15_xref_stream(stream) - stream.seek(p, 0) - if "/Prev" in xrefstream: - startxref = cast(int, xrefstream["/Prev"]) - else: - break - else: - startxref = self._read_xref_other_error(stream, startxref) - - def _read_xref(self, stream: StreamType) -> Optional[int]: - self._read_standard_xref_table(stream) - read_non_whitespace(stream) - stream.seek(-1, 1) - new_trailer = cast(Dict[str, Any], read_object(stream, self)) - for key, value in new_trailer.items(): - if key not in self.trailer: - self.trailer[key] = value - if "/XRefStm" in new_trailer: - p = stream.tell() - stream.seek(cast(int, new_trailer["/XRefStm"]) + 1, 0) - try: - self._read_pdf15_xref_stream(stream) - except Exception: - logger_warning( - f"XRef object 
at {new_trailer['/XRefStm']} can not be read, some object may be missing", - __name__, - ) - stream.seek(p, 0) - if "/Prev" in new_trailer: - startxref = new_trailer["/Prev"] - return startxref - else: - return None - - def _read_xref_other_error( - self, stream: StreamType, startxref: int - ) -> Optional[int]: - # some PDFs have /Prev=0 in the trailer, instead of no /Prev - if startxref == 0: - if self.strict: - raise PdfReadError( - "/Prev=0 in the trailer (try opening with strict=False)" - ) - logger_warning( - "/Prev=0 in the trailer - assuming there is no previous xref table", - __name__, - ) - return None - # bad xref character at startxref. Let's see if we can find - # the xref table nearby, as we've observed this error with an - # off-by-one before. - stream.seek(-11, 1) - tmp = stream.read(20) - xref_loc = tmp.find(b"xref") - if xref_loc != -1: - startxref -= 10 - xref_loc - return startxref - # No explicit xref table, try finding a cross-reference stream. - stream.seek(startxref, 0) - for look in range(25): # value extended to cope with more linearized files - if stream.read(1).isdigit(): - # This is not a standard PDF, consider adding a warning - startxref += look - return startxref - # no xref table found at specified location - if "/Root" in self.trailer and not self.strict: - # if Root has been already found, just raise warning - logger_warning("Invalid parent xref., rebuild xref", __name__) - try: - self._rebuild_xref_table(stream) - return None - except Exception: - raise PdfReadError("can not rebuild xref") - raise PdfReadError("Could not find xref table at specified location") - - def _read_pdf15_xref_stream( - self, stream: StreamType - ) -> Union[ContentStream, EncodedStreamObject, DecodedStreamObject]: - # PDF 1.5+ Cross-Reference Stream - stream.seek(-1, 1) - idnum, generation = self.read_object_header(stream) - xrefstream = cast(ContentStream, read_object(stream, self)) - assert cast(str, xrefstream["/Type"]) == "/XRef" - 
self.cache_indirect_object(generation, idnum, xrefstream) - stream_data = BytesIO(b_(xrefstream.get_data())) - # Index pairs specify the subsections in the dictionary. If - # none create one subsection that spans everything. - idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")]) - entry_sizes = cast(Dict[Any, Any], xrefstream.get("/W")) - assert len(entry_sizes) >= 3 - if self.strict and len(entry_sizes) > 3: - raise PdfReadError(f"Too many entry sizes: {entry_sizes}") - - def get_entry(i: int) -> Union[int, Tuple[int, ...]]: - # Reads the correct number of bytes for each entry. See the - # discussion of the W parameter in PDF spec table 17. - if entry_sizes[i] > 0: - d = stream_data.read(entry_sizes[i]) - return convert_to_int(d, entry_sizes[i]) - - # PDF Spec Table 17: A value of zero for an element in the - # W array indicates...the default value shall be used - if i == 0: - return 1 # First value defaults to 1 - else: - return 0 - - def used_before(num: int, generation: Union[int, Tuple[int, ...]]) -> bool: - # We move backwards through the xrefs, don't replace any. - return num in self.xref.get(generation, []) or num in self.xref_objStm # type: ignore - - # Iterate through each subsection - self._read_xref_subsections(idx_pairs, get_entry, used_before) - return xrefstream - - @staticmethod - def _get_xref_issues(stream: StreamType, startxref: int) -> int: - """ - Return an int which indicates an issue. 0 means there is no issue. - - Args: - stream: - startxref: - - Returns: - 0 means no issue, other values represent specific issues. 
- """ - stream.seek(startxref - 1, 0) # -1 to check character before - line = stream.read(1) - if line == b"j": - line = stream.read(1) - if line not in b"\r\n \t": - return 1 - line = stream.read(4) - if line != b"xref": - # not an xref so check if it is an XREF object - line = b"" - while line in b"0123456789 \t": - line = stream.read(1) - if line == b"": - return 2 - line += stream.read(2) # 1 char already read, +2 to check "obj" - if line.lower() != b"obj": - return 3 - return 0 - - def _rebuild_xref_table(self, stream: StreamType) -> None: - self.xref = {} - stream.seek(0, 0) - f_ = stream.read(-1) - - for m in re.finditer(rb"[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj", f_): - idnum = int(m.group(1)) - generation = int(m.group(2)) - if generation not in self.xref: - self.xref[generation] = {} - self.xref[generation][idnum] = m.start(1) - stream.seek(0, 0) - for m in re.finditer(rb"[\r\n \t][ \t]*trailer[\r\n \t]*(<<)", f_): - stream.seek(m.start(1), 0) - new_trailer = cast(Dict[Any, Any], read_object(stream, self)) - # Here, we are parsing the file from start to end, the new data have to erase the existing. 
- for key, value in list(new_trailer.items()): - self.trailer[key] = value - - def _read_xref_subsections( - self, - idx_pairs: List[int], - get_entry: Callable[[int], Union[int, Tuple[int, ...]]], - used_before: Callable[[int, Union[int, Tuple[int, ...]]], bool], - ) -> None: - for start, size in self._pairs(idx_pairs): - # The subsections must increase - for num in range(start, start + size): - # The first entry is the type - xref_type = get_entry(0) - # The rest of the elements depend on the xref_type - if xref_type == 0: - # linked list of free objects - next_free_object = get_entry(1) # noqa: F841 - next_generation = get_entry(2) # noqa: F841 - elif xref_type == 1: - # objects that are in use but are not compressed - byte_offset = get_entry(1) - generation = get_entry(2) - if generation not in self.xref: - self.xref[generation] = {} # type: ignore - if not used_before(num, generation): - self.xref[generation][num] = byte_offset # type: ignore - elif xref_type == 2: - # compressed objects - objstr_num = get_entry(1) - obstr_idx = get_entry(2) - generation = 0 # PDF spec table 18, generation is 0 - if not used_before(num, generation): - self.xref_objStm[num] = (objstr_num, obstr_idx) - elif self.strict: - raise PdfReadError(f"Unknown xref type: {xref_type}") - - def _pairs(self, array: List[int]) -> Iterable[Tuple[int, int]]: - i = 0 - while True: - yield array[i], array[i + 1] - i += 2 - if (i + 1) >= len(array): - break - - def read_next_end_line( - self, stream: StreamType, limit_offset: int = 0 - ) -> bytes: # deprecated - """.. 
deprecated:: 2.1.0""" - deprecate_no_replacement("read_next_end_line", removed_in="4.0.0") - line_parts = [] - while True: - # Prevent infinite loops in malformed PDFs - if stream.tell() == 0 or stream.tell() == limit_offset: - raise PdfReadError("Could not read malformed PDF file") - x = stream.read(1) - if stream.tell() < 2: - raise PdfReadError("EOL marker not found") - stream.seek(-2, 1) - if x in (b"\n", b"\r"): # \n = LF; \r = CR - crlf = False - while x in (b"\n", b"\r"): - x = stream.read(1) - if x in (b"\n", b"\r"): # account for CR+LF - stream.seek(-1, 1) - crlf = True - if stream.tell() < 2: - raise PdfReadError("EOL marker not found") - stream.seek(-2, 1) - stream.seek( - 2 if crlf else 1, 1 - ) # if using CR+LF, go back 2 bytes, else 1 - break - else: - line_parts.append(x) - line_parts.reverse() - return b"".join(line_parts) - - def readNextEndLine( - self, stream: StreamType, limit_offset: int = 0 - ) -> bytes: # deprecated - """.. deprecated:: 1.28.0""" - deprecation_no_replacement("readNextEndLine", "3.0.0") - return self.read_next_end_line(stream, limit_offset) - - def decrypt(self, password: Union[str, bytes]) -> PasswordType: - """ - When using an encrypted / secured PDF file with the PDF Standard - encryption handler, this function will allow the file to be decrypted. - It checks the given password against the document's user password and - owner password, and then stores the resulting decryption key if either - password is correct. - - It does not matter which password was matched. Both passwords provide - the correct decryption key that will allow the document to be used with - this library. - - Args: - password: The password to match. - - Returns: - An indicator if the document was decrypted and weather it was the - owner password or the user password. 
- """ - if not self._encryption: - raise PdfReadError("Not encrypted file") - # TODO: raise Exception for wrong password - return self._encryption.verify(password) - - def decode_permissions(self, permissions_code: int) -> Dict[str, bool]: - # Takes the permissions as an integer, returns the allowed access - permissions = {} - permissions["print"] = permissions_code & (1 << 3 - 1) != 0 # bit 3 - permissions["modify"] = permissions_code & (1 << 4 - 1) != 0 # bit 4 - permissions["copy"] = permissions_code & (1 << 5 - 1) != 0 # bit 5 - permissions["annotations"] = permissions_code & (1 << 6 - 1) != 0 # bit 6 - permissions["forms"] = permissions_code & (1 << 9 - 1) != 0 # bit 9 - permissions["accessability"] = permissions_code & (1 << 10 - 1) != 0 # bit 10 - permissions["assemble"] = permissions_code & (1 << 11 - 1) != 0 # bit 11 - permissions["print_high_quality"] = ( - permissions_code & (1 << 12 - 1) != 0 - ) # bit 12 - return permissions - - @property - def is_encrypted(self) -> bool: - """ - Read-only boolean property showing whether this PDF file is encrypted. - - Note that this property, if true, will remain true even after the - :meth:`decrypt()` method is called. - """ - return TK.ENCRYPT in self.trailer - - def getIsEncrypted(self) -> bool: # deprecated - """ - Use :py:attr:`is_encrypted` instead. - - .. deprecated:: 1.28.0 - """ - deprecation_with_replacement("getIsEncrypted", "is_encrypted", "3.0.0") - return self.is_encrypted - - @property - def isEncrypted(self) -> bool: # deprecated - """ - Use :py:attr:`is_encrypted` instead. - - .. 
deprecated:: 1.28.0 - """ - deprecation_with_replacement("isEncrypted", "is_encrypted", "3.0.0") - return self.is_encrypted - - @property - def xfa(self) -> Optional[Dict[str, Any]]: - tree: Optional[TreeObject] = None - retval: Dict[str, Any] = {} - catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) - - if "/AcroForm" not in catalog or not catalog["/AcroForm"]: - return None - - tree = cast(TreeObject, catalog["/AcroForm"]) - - if "/XFA" in tree: - fields = cast(ArrayObject, tree["/XFA"]) - i = iter(fields) - for f in i: - tag = f - f = next(i) - if isinstance(f, IndirectObject): - field = cast(Optional[EncodedStreamObject], f.get_object()) - if field: - es = zlib.decompress(b_(field._data)) - retval[tag] = es - return retval - - def add_form_topname(self, name: str) -> Optional[DictionaryObject]: - """ - Add a top level form that groups all form fields below it. - - Args: - name: text string of the "/T" Attribute of the created object - - Returns: - The created object. ``None`` means no object was created. 
- """ - catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) - - if "/AcroForm" not in catalog or not isinstance( - catalog["/AcroForm"], DictionaryObject - ): - return None - acroform = cast(DictionaryObject, catalog[NameObject("/AcroForm")]) - if "/Fields" not in acroform: - # TODO: :No error returns but may be extended for XFA Forms - return None - - interim = DictionaryObject() - interim[NameObject("/T")] = TextStringObject(name) - interim[NameObject("/Kids")] = acroform[NameObject("/Fields")] - self.cache_indirect_object( - 0, - max([i for (g, i) in self.resolved_objects if g == 0]) + 1, - interim, - ) - arr = ArrayObject() - arr.append(interim.indirect_reference) - acroform[NameObject("/Fields")] = arr - for o in cast(ArrayObject, interim["/Kids"]): - obj = o.get_object() - if "/Parent" in obj: - logger_warning( - f"Top Level Form Field {obj.indirect_reference} have a non-expected parent", - __name__, - ) - obj[NameObject("/Parent")] = interim.indirect_reference - return interim - - def rename_form_topname(self, name: str) -> Optional[DictionaryObject]: - """ - Rename top level form field that all form fields below it. - - Args: - name: text string of the "/T" field of the created object - - Returns: - The modified object. ``None`` means no object was modified. 
- """ - catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) - - if "/AcroForm" not in catalog or not isinstance( - catalog["/AcroForm"], DictionaryObject - ): - return None - acroform = cast(DictionaryObject, catalog[NameObject("/AcroForm")]) - if "/Fields" not in acroform: - return None - - interim = cast( - DictionaryObject, - cast(ArrayObject, acroform[NameObject("/Fields")])[0].get_object(), - ) - interim[NameObject("/T")] = TextStringObject(name) - return interim - - def _get_embedded_files_root(self) -> Optional[NameTree]: - """ - Returns the EmbeddedFiles root as a NameTree Object - if the root does not exists, return None - """ - catalog = cast(DictionaryObject, self.trailer["/Root"]) - if "/Names" not in catalog: - return None - ef = cast(DictionaryObject, catalog["/Names"]).get("/EmbeddedFiles", None) - if ef is None: - return None - efo = ef.get_object() - # not for reader - """ - if not isinstance(efo,NameTree): - if isinstance(ef,IndirectObject): - ef.replace_object(efo) - else: - cast(DictionaryObject,catalog["/Names"])[ - NameObject("/EmbeddedFiles")] = NameTree(efo) - """ - return NameTree(efo) - - @property - def embedded_files(self) -> Optional[Mapping[str, List[PdfObject]]]: - ef = self._get_embedded_files_root() - if ef: - return ef.list_items() - else: - return None - - @property - def attachments(self) -> Mapping[str, Union[List[bytes], List[Dict[str, bytes]]]]: - ef = self._get_embedded_files_root() - if ef: - d: Dict[str, Union[List[bytes], List[Dict[str, bytes]]]] = {} - for k, v in ef.list_items().items(): - if isinstance(v, list): - if k not in d: - d[k] = [] # type: ignore - for e in v: - e = cast(DictionaryObject, e.get_object()) - if "/EF" in e: - d[k].append(e["/EF"]["/F"].get_data()) # type: ignore - elif "/RF" in e: - r = cast( - ArrayObject, cast(DictionaryObject, e["/RF"])["/F"] - ) - di: Dict[str, bytes] = {} - i = 0 - while i < len(r): - di[cast(str, r[i])] = r[i + 1].get_object().get_data() - i += 2 - d[k].append(di) - 
return d - else: - return {} - - def _list_attachments(self) -> List[str]: - """ - Retrieves the list of filenames of file attachments. - - Returns: - list of filenames - """ - ef = self._get_embedded_files_root() - if ef: - lst = ef.list_keys() - else: - lst = [] - """ - for ip, p in enumerate(self.pages): - for a in [_a.get_object() - for _a in p.get("/Annots",[])]: - if _a.get_object().get("/Subtype","") != "/FileAttachements": - continue - lst.append(f"$page_{ip}.{get_name_from_file_specification(_a)}") - """ - return lst - - def _get_attachment_list(self, name: str) -> List[bytes]: - out = self._get_attachments(name)[name] - if isinstance(out, list): - return out - return [out] - - def _get_attachments( - self, filename: Optional[str] = None - ) -> Dict[str, Union[bytes, List[bytes], Dict[str, bytes]]]: - """ - Retrieves all or selected file attachments of the PDF as a dictionary of file names - and the file data as a bytestring. - - Args: - filename: If filename is None, then a dictionary of all attachments - will be returned, where the key is the filename and the value - is the content. Otherwise, a dictionary with just a single key - - the filename - and its content will be returned. 
- - Returns: - dictionary of filename -> Union[bytestring or List[ByteString]] - if the filename exists multiple times a List of the different version will be provided - """ - ef = self._get_embedded_files_root() - if ef is None: - return {} - if filename is None: - return {k: v if len(v) > 1 else v[0] for k, v in self.attachments.items()} # type: ignore - else: - lst = ef.list_get(filename) - return { - filename: [x["/EF"]["/F"].get_data() for x in lst] # type: ignore - if isinstance(lst, list) - else lst["/EF"]["/F"].get_data() # type: ignore - } - - -class PdfFileReader(PdfReader): # deprecated - def __init__(self, *args: Any, **kwargs: Any) -> None: - deprecation_with_replacement("PdfFileReader", "PdfReader", "3.0.0") - if "strict" not in kwargs and len(args) < 2: - kwargs["strict"] = True # maintain the default - super().__init__(*args, **kwargs) +# Copyright (c) 2006, Mathieu Fenniak +# Copyright (c) 2007, Ashish Kulkarni +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# * The name of the author may not be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +import os +import re +import struct +import zlib +from datetime import datetime +from io import BytesIO, UnsupportedOperation +from pathlib import Path +from typing import ( + Any, + Callable, + Dict, + Iterable, + List, + Mapping, + Optional, + Tuple, + Union, + cast, +) + +from ._encryption import Encryption, PasswordType +from ._page import PageObject, _VirtualList +from ._page_labels import index2label as page_index2page_label +from ._utils import ( + StrByteType, + StreamType, + b_, + deprecate_no_replacement, + deprecation_no_replacement, + deprecation_with_replacement, + logger_warning, + parse_iso8824_date, + read_non_whitespace, + read_previous_line, + read_until_whitespace, + skip_over_comment, + skip_over_whitespace, +) +from .constants import CatalogAttributes as CA +from .constants import CatalogDictionary as CD +from .constants import ( + CheckboxRadioButtonAttributes, + GoToActionArguments, +) +from .constants import Core as CO +from .constants import DocumentInformationAttributes as DI +from .constants import FieldDictionaryAttributes as FA +from .constants import PageAttributes as PG +from .constants import PagesAttributes as PA +from .constants import TrailerKeys as TK +from .errors import ( + EmptyFileError, + FileNotDecryptedError, + PdfReadError, + PdfStreamError, + WrongPasswordError, +) +from .generic import ( + ArrayObject, + BooleanObject, + ContentStream, + DecodedStreamObject, + Destination, + DictionaryObject, 
+ EncodedStreamObject, + Field, + Fit, + FloatObject, + IndirectObject, + NameObject, + NameTree, + NullObject, + NumberObject, + PdfObject, + TextStringObject, + TreeObject, + ViewerPreferences, + read_object, +) +from .types import OutlineType, PagemodeType +from .xmp import XmpInformation + + +def convert_to_int(d: bytes, size: int) -> Union[int, Tuple[Any, ...]]: + if size > 8: + raise PdfReadError("invalid size in convert_to_int") + d = b"\x00\x00\x00\x00\x00\x00\x00\x00" + d + d = d[-8:] + return struct.unpack(">q", d)[0] + + +def convertToInt(d: bytes, size: int) -> Union[int, Tuple[Any, ...]]: # deprecated + deprecation_with_replacement("convertToInt", "convert_to_int") + return convert_to_int(d, size) + + +class DocumentInformation(DictionaryObject): + """ + A class representing the basic document metadata provided in a PDF File. + This class is accessible through + :py:class:`PdfReader.metadata`. + + All text properties of the document metadata have + *two* properties, eg. author and author_raw. The non-raw property will + always return a ``TextStringObject``, making it ideal for a case where + the metadata is being displayed. The raw property can sometimes return + a ``ByteStringObject``, if pypdf was unable to decode the string's + text encoding; this requires additional safety in the caller and + therefore is not as commonly accessed. + """ + + def __init__(self) -> None: + DictionaryObject.__init__(self) + + def _get_text(self, key: str) -> Optional[str]: + retval = self.get(key, None) + if isinstance(retval, TextStringObject): + return retval + return None + + def getText(self, key: str) -> Optional[str]: # deprecated + """ + Use the attributes (e.g. :py:attr:`title` / :py:attr:`author`). + + .. deprecated:: 1.28.0 + """ + deprecation_no_replacement("getText", "3.0.0") + return self._get_text(key) + + @property + def title(self) -> Optional[str]: + """ + Read-only property accessing the document's title. 
+ + Returns a ``TextStringObject`` or ``None`` if the title is not + specified. + """ + return ( + self._get_text(DI.TITLE) or self.get(DI.TITLE).get_object() # type: ignore + if self.get(DI.TITLE) + else None + ) + + @property + def title_raw(self) -> Optional[str]: + """The "raw" version of title; can return a ``ByteStringObject``.""" + return self.get(DI.TITLE) + + @property + def author(self) -> Optional[str]: + """ + Read-only property accessing the document's author. + + Returns a ``TextStringObject`` or ``None`` if the author is not + specified. + """ + return self._get_text(DI.AUTHOR) + + @property + def author_raw(self) -> Optional[str]: + """The "raw" version of author; can return a ``ByteStringObject``.""" + return self.get(DI.AUTHOR) + + @property + def subject(self) -> Optional[str]: + """ + Read-only property accessing the document's subject. + + Returns a ``TextStringObject`` or ``None`` if the subject is not + specified. + """ + return self._get_text(DI.SUBJECT) + + @property + def subject_raw(self) -> Optional[str]: + """The "raw" version of subject; can return a ``ByteStringObject``.""" + return self.get(DI.SUBJECT) + + @property + def creator(self) -> Optional[str]: + """ + Read-only property accessing the document's creator. + + If the document was converted to PDF from another format, this is the + name of the application (e.g. OpenOffice) that created the original + document from which it was converted. Returns a ``TextStringObject`` or + ``None`` if the creator is not specified. + """ + return self._get_text(DI.CREATOR) + + @property + def creator_raw(self) -> Optional[str]: + """The "raw" version of creator; can return a ``ByteStringObject``.""" + return self.get(DI.CREATOR) + + @property + def producer(self) -> Optional[str]: + """ + Read-only property accessing the document's producer. + + If the document was converted to PDF from another format, this is the + name of the application (for example, OSX Quartz) that converted it to + PDF. 
Returns a ``TextStringObject`` or ``None`` if the producer is not + specified. + """ + return self._get_text(DI.PRODUCER) + + @property + def producer_raw(self) -> Optional[str]: + """The "raw" version of producer; can return a ``ByteStringObject``.""" + return self.get(DI.PRODUCER) + + @property + def creation_date(self) -> Optional[datetime]: + """Read-only property accessing the document's creation date.""" + return parse_iso8824_date(self._get_text(DI.CREATION_DATE)) + + @property + def creation_date_raw(self) -> Optional[str]: + """ + The "raw" version of creation date; can return a ``ByteStringObject``. + + Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix + is the offset from UTC. + """ + return self.get(DI.CREATION_DATE) + + @property + def modification_date(self) -> Optional[datetime]: + """ + Read-only property accessing the document's modification date. + + The date and time the document was most recently modified. + """ + return parse_iso8824_date(self._get_text(DI.MOD_DATE)) + + @property + def modification_date_raw(self) -> Optional[str]: + """ + The "raw" version of modification date; can return a + ``ByteStringObject``. + + Typically in the format ``D:YYYYMMDDhhmmss[+Z-]hh'mm`` where the suffix + is the offset from UTC. + """ + return self.get(DI.MOD_DATE) + + +class PdfReader: + """ + Initialize a PdfReader object. + + This operation can take some time, as the PDF stream's cross-reference + tables are read into memory. + + Args: + stream: A File object or an object that supports the standard read + and seek methods similar to a File object. Could also be a + string representing a path to a PDF file. + strict: Determines whether user should be warned of all + problems and also causes some correctable problems to be fatal. + Defaults to ``False``. + password: Decrypt PDF file at initialization. If the + password is None, the file will not be decrypted. 
+ Defaults to ``None`` + """ + + @property + def viewer_preferences(self) -> Optional[ViewerPreferences]: + """Returns the existing ViewerPreferences as an overloaded dictionary.""" + o = cast(DictionaryObject, self.trailer["/Root"]).get( + CD.VIEWER_PREFERENCES, None + ) + if o is None: + return None + o = o.get_object() + if not isinstance(o, ViewerPreferences): + o = ViewerPreferences(o) + return o + + def __init__( + self, + stream: Union[StrByteType, Path], + strict: bool = False, + password: Union[None, str, bytes] = None, + ) -> None: + self.strict = strict + self.flattened_pages: Optional[List[PageObject]] = None + self.resolved_objects: Dict[Tuple[Any, Any], Optional[PdfObject]] = {} + self.xref_index = 0 + self._page_id2num: Optional[ + Dict[Any, Any] + ] = None # map page indirect_reference number to Page Number + if hasattr(stream, "mode") and "b" not in stream.mode: # type: ignore + logger_warning( + "PdfReader stream/file object is not in binary mode. " + "It may not be read correctly.", + __name__, + ) + if isinstance(stream, (str, Path)): + with open(stream, "rb") as fh: + stream = BytesIO(fh.read()) + self.read(stream) + self.stream = stream + + self._override_encryption = False + self._encryption: Optional[Encryption] = None + if self.is_encrypted: + self._override_encryption = True + # Some documents may not have a /ID, use two empty + # byte strings instead. 
Solves + # https://github.com/py-pdf/pypdf/issues/608 + id_entry = self.trailer.get(TK.ID) + id1_entry = id_entry[0].get_object().original_bytes if id_entry else b"" + encrypt_entry = cast( + DictionaryObject, self.trailer[TK.ENCRYPT].get_object() + ) + self._encryption = Encryption.read(encrypt_entry, id1_entry) + + # try empty password if no password provided + pwd = password if password is not None else b"" + if ( + self._encryption.verify(pwd) == PasswordType.NOT_DECRYPTED + and password is not None + ): + # raise if password provided + raise WrongPasswordError("Wrong password") + self._override_encryption = False + elif password is not None: + raise PdfReadError("Not encrypted file") + + @property + def pdf_header(self) -> str: + """ + The first 8 bytes of the file. + + This is typically something like ``'%PDF-1.6'`` and can be used to + detect if the file is actually a PDF file and which version it is. + """ + # TODO: Make this return a bytes object for consistency + # but that needs a deprecation + loc = self.stream.tell() + self.stream.seek(0, 0) + pdf_file_version = self.stream.read(8).decode("utf-8", "backslashreplace") + self.stream.seek(loc, 0) # return to where it was + return pdf_file_version + + @property + def metadata(self) -> Optional[DocumentInformation]: + """ + Retrieve the PDF file's document information dictionary, if it exists. + + Note that some PDF files use metadata streams instead of docinfo + dictionaries, and these metadata streams will not be accessed by this + function. + """ + if TK.INFO not in self.trailer: + return None + obj = self.trailer[TK.INFO] + retval = DocumentInformation() + if isinstance(obj, type(None)): + raise PdfReadError( + "trailer not found or does not point to document information directory" + ) + retval.update(obj) # type: ignore + return retval + + def getDocumentInfo(self) -> Optional[DocumentInformation]: # deprecated + """ + Use the attribute :py:attr:`metadata` instead. + + .. 
def getXmpMetadata(self) -> Optional[XmpInformation]:  # deprecated
    """
    Use the attribute :py:attr:`xmp_metadata` instead.

    .. deprecated:: 1.28.0
    """
    # The replacement named in the deprecation message is ``xmp_metadata``,
    # which is also what this wrapper returns.
    deprecation_with_replacement("getXmpMetadata", "xmp_metadata", "3.0.0")
    return self.xmp_metadata
deprecated:: 1.28.0 + """ + deprecation_with_replacement("reader.getNumPages", "len(reader.pages)", "3.0.0") + return self._get_num_pages() + + @property + def numPages(self) -> int: # deprecated + """ + Use :code:`len(reader.pages)` instead. + + .. deprecated:: 1.28.0 + """ + deprecation_with_replacement("reader.numPages", "len(reader.pages)", "3.0.0") + return self._get_num_pages() + + def getPage(self, pageNumber: int) -> PageObject: # deprecated + """ + Use :code:`reader.pages[page_number]` instead. + + .. deprecated:: 1.28.0 + """ + deprecation_with_replacement( + "reader.getPage(pageNumber)", "reader.pages[page_number]", "3.0.0" + ) + return self._get_page(pageNumber) + + def _get_page(self, page_number: int) -> PageObject: + """ + Retrieve a page by number from this PDF file. + + Args: + page_number: The page number to retrieve + (pages begin at zero) + + Returns: + A :class:`PageObject` instance. + """ + if self.flattened_pages is None: + self._flatten() + assert self.flattened_pages is not None, "hint for mypy" + return self.flattened_pages[page_number] + + @property + def namedDestinations(self) -> Dict[str, Any]: # deprecated + """ + Use :py:attr:`named_destinations` instead. + + .. deprecated:: 1.28.0 + """ + deprecation_with_replacement("namedDestinations", "named_destinations", "3.0.0") + return self.named_destinations + + @property + def named_destinations(self) -> Dict[str, Any]: + """ + A read-only dictionary which maps names to + :class:`Destinations` + """ + return self._get_named_destinations() + + # A select group of relevant field attributes. For the complete list, + # see section 8.6.2 of the PDF 1.7 reference. + + def get_fields( + self, + tree: Optional[TreeObject] = None, + retval: Optional[Dict[Any, Any]] = None, + fileobj: Optional[Any] = None, + ) -> Optional[Dict[str, Any]]: + """ + Extract field data if this PDF contains interactive form fields. + + The *tree* and *retval* parameters are for recursive use. 
+ + Args: + tree: + retval: + fileobj: A file object (usually a text file) to write + a report to on all interactive form fields found. + + Returns: + A dictionary where each key is a field name, and each + value is a :class:`Field` object. By + default, the mapping name is used for keys. + ``None`` if form data could not be located. + """ + field_attributes = FA.attributes_dict() + field_attributes.update(CheckboxRadioButtonAttributes.attributes_dict()) + if retval is None: + retval = {} + catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) + # get the AcroForm tree + if CD.ACRO_FORM in catalog: + tree = cast(Optional[TreeObject], catalog[CD.ACRO_FORM]) + else: + return None + if tree is None: + return retval + self._check_kids(tree, retval, fileobj) + for attr in field_attributes: + if attr in tree: + # Tree is a field + self._build_field(tree, retval, fileobj, field_attributes) + break + + if "/Fields" in tree: + fields = cast(ArrayObject, tree["/Fields"]) + for f in fields: + field = f.get_object() + self._build_field(field, retval, fileobj, field_attributes) + + return retval + + def getFields( + self, + tree: Optional[TreeObject] = None, + retval: Optional[Dict[Any, Any]] = None, + fileobj: Optional[Any] = None, + ) -> Optional[Dict[str, Any]]: # deprecated + """ + Use :meth:`get_fields` instead. + + .. deprecated:: 1.28.0 + """ + deprecation_with_replacement("getFields", "get_fields", "3.0.0") + return self.get_fields(tree, retval, fileobj) + + def _get_qualified_field_name(self, parent: DictionaryObject) -> str: + if "/TM" in parent: + return cast(str, parent["/TM"]) + elif "/Parent" in parent: + return ( + self._get_qualified_field_name( + cast(DictionaryObject, parent["/Parent"]) + ) + + "." 
+ + cast(str, parent["/T"]) + ) + else: + return cast(str, parent["/T"]) + + def _build_field( + self, + field: Union[TreeObject, DictionaryObject], + retval: Dict[Any, Any], + fileobj: Any, + field_attributes: Any, + ) -> None: + self._check_kids(field, retval, fileobj) + try: + key = cast(str, field["/TM"]) + except KeyError: + try: + if "/Parent" in field: + key = ( + self._get_qualified_field_name( + cast(DictionaryObject, field["/Parent"]) + ) + + "." + ) + else: + key = "" + key += cast(str, field["/T"]) + except KeyError: + # Ignore no-name field for now + return + if fileobj: + self._write_field(fileobj, field, field_attributes) + fileobj.write("\n") + retval[key] = Field(field) + obj = retval[key].indirect_reference.get_object() # to get the full object + if obj.get(FA.FT, "") == "/Ch": + retval[key][NameObject("/_States_")] = obj[NameObject(FA.Opt)] + if obj.get(FA.FT, "") == "/Btn" and "/AP" in obj: + # Checkbox + retval[key][NameObject("/_States_")] = ArrayObject( + list(obj["/AP"]["/N"].keys()) + ) + if "/Off" not in retval[key]["/_States_"]: + retval[key][NameObject("/_States_")].append(NameObject("/Off")) + elif obj.get(FA.FT, "") == "/Btn" and obj.get(FA.Ff, 0) & FA.FfBits.Radio != 0: + states = [] + for k in obj.get(FA.Kids, {}): + k = k.get_object() + for s in list(k["/AP"]["/N"].keys()): + if s not in states: + states.append(s) + retval[key][NameObject("/_States_")] = ArrayObject(states) + if ( + obj.get(FA.Ff, 0) & FA.FfBits.NoToggleToOff != 0 + and "/Off" in retval[key]["/_States_"] + ): + del retval[key]["/_States_"][retval[key]["/_States_"].index("/Off")] + + def _check_kids( + self, tree: Union[TreeObject, DictionaryObject], retval: Any, fileobj: Any + ) -> None: + if PA.KIDS in tree: + # recurse down the tree + for kid in tree[PA.KIDS]: # type: ignore + self.get_fields(kid.get_object(), retval, fileobj) + + def _write_field(self, fileobj: Any, field: Any, field_attributes: Any) -> None: + field_attributes_tuple = FA.attributes() + 
def get_form_text_fields(self, full_qualified_name: bool = False) -> Dict[str, Any]:
    """
    Collect the values of the document's text form fields.

    Args:
        full_qualified_name: If true, key the result by the names
            returned by :meth:`get_fields`; otherwise by each field's
            ``/T`` entry.

    Returns:
        A dictionary mapping field name to the field's ``/V`` entry
        (``None`` when absent). Empty if the document has no form fields.

    When keyed by ``/T`` and several fields share a name, the second and
    following get the suffix .2, .3, ...
    """

    def indexed_key(k: str, fields: dict) -> str:
        # first occurrence keeps the plain name
        if k not in fields:
            return k
        prefix = k + "."
        already_suffixed = sum(1 for existing in fields if existing.startswith(prefix))
        return prefix + str(already_suffixed + 2)

    # Retrieve document form fields
    all_fields = self.get_fields()
    if all_fields is None:
        return {}

    text_fields: Dict[str, Any] = {}
    for qualified_name, field in all_fields.items():
        if field.get("/FT") != "/Tx":
            continue  # not a text field
        if full_qualified_name:
            out_key = qualified_name
        else:
            out_key = indexed_key(cast(str, field["/T"]), text_fields)
        text_fields[out_key] = field.get("/V")
    return text_fields
def _get_named_destinations(
    self,
    tree: Union[TreeObject, None] = None,
    retval: Optional[Any] = None,
) -> Dict[str, Any]:
    """
    Retrieve the named destinations present in the document.

    Args:
        tree: The name-tree node to walk. On the initial call
            (``retval`` is ``None``) it is replaced by the catalog's
            ``/Dests`` entry, or by ``/Names`` -> ``/Dests``, when the
            catalog has one.
        retval: Accumulator filled in place; passed on recursive calls.

    Returns:
        A dictionary which maps names to
        :class:`Destinations`.
    """
    if retval is None:
        retval = {}
        catalog = cast(DictionaryObject, self.trailer[TK.ROOT])

        # get the name tree
        if CA.DESTS in catalog:
            tree = cast(TreeObject, catalog[CA.DESTS])
        elif CA.NAMES in catalog:
            names = cast(DictionaryObject, catalog[CA.NAMES])
            if CA.DESTS in names:
                tree = cast(TreeObject, names[CA.DESTS])

    if tree is None:
        return retval

    if PA.KIDS in tree:
        # recurse down the tree
        for kid in cast(ArrayObject, tree[PA.KIDS]):
            self._get_named_destinations(kid.get_object(), retval)
    # TABLE 3.33 Entries in a name tree node dictionary (PDF 1.7 specs)
    elif CA.NAMES in tree:  # KIDS and NAMES are exclusive (PDF 1.7 specs p 162)
        # /Names is a flat array: [key1, value1, key2, value2, ...]
        name_pairs = cast(ArrayObject, tree[CA.NAMES])
        i = 0
        while i < len(name_pairs):
            key = cast(str, name_pairs[i].get_object())
            i += 1
            if not isinstance(key, str):
                # not a key: skip this single slot and resynchronise
                continue
            try:
                value = name_pairs[i].get_object()
            except IndexError:
                # trailing key without a value
                break
            i += 1
            if isinstance(value, DictionaryObject) and "/D" in value:
                value = value["/D"]
            dest = self._build_destination(key, value)  # type: ignore
            if dest is not None:
                retval[key] = dest
    else:  # case where Dests is in root catalog (PDF 1.7 specs, §2 about PDF1.1)
        for k__, v__ in tree.items():
            val = v__.get_object()
            if isinstance(val, DictionaryObject):
                if "/D" not in val:
                    # A dictionary without /D carries no destination.
                    # Skip it instead of letting a KeyError abort the
                    # enumeration of every other named destination.
                    logger_warning(
                        f"Named destination {k__} has no /D entry; ignored",
                        __name__,
                    )
                    continue
                val = val["/D"].get_object()
            dest = self._build_destination(k__, val)
            if dest is not None:
                retval[k__] = dest
    return retval
def _get_outline(
    self, node: Optional[DictionaryObject] = None, outline: Optional[Any] = None
) -> OutlineType:
    """
    Build the nested outline (bookmark) list of the document.

    Args:
        node: First outline item of the level to walk. On the initial call
            (``outline`` is ``None``) it is replaced by the ``/First`` entry
            of the catalog's outline dictionary, when there is one.
        outline: List the items are appended to; passed on recursive calls.

    Returns:
        The list passed as ``outline`` (or a new one). Each item is followed
        by a nested list holding its children when it has any.
    """
    if outline is None:
        # initial (non-recursive) call: locate the root of the outline
        outline = []
        catalog = cast(DictionaryObject, self.trailer[TK.ROOT])

        # get the outline dictionary and named destinations
        if CO.OUTLINES in catalog:
            lines = cast(DictionaryObject, catalog[CO.OUTLINES])

            if isinstance(lines, NullObject):
                return outline

            # TABLE 8.3 Entries in the outline dictionary
            if lines is not None and "/First" in lines:
                node = cast(DictionaryObject, lines["/First"])
        # refreshed once per top-level call; _build_outline_item reads it
        # to resolve destinations given by name
        self._namedDests = self._get_named_destinations()

    if node is None:
        return outline

    # see if there are any more outline items
    while True:
        outline_obj = self._build_outline_item(node)
        if outline_obj:
            outline.append(outline_obj)

        # check for sub-outline
        if "/First" in node:
            sub_outline: List[Any] = []
            # a non-None list is passed, so the recursive call skips the
            # catalog lookup above
            self._get_outline(cast(DictionaryObject, node["/First"]), sub_outline)
            if sub_outline:
                outline.append(sub_outline)

        # siblings are chained through /Next; the last one has none
        if "/Next" not in node:
            break
        node = cast(DictionaryObject, node["/Next"])

    return outline
deprecated:: 1.28.0 + """ + deprecation_with_replacement("getOutlines", "outline", "3.0.0") + return self._get_outline(node, outline) + + @property + def threads(self) -> Optional[ArrayObject]: + """ + Read-only property for the list of threads. + + See §8.3.2 from PDF 1.7 spec. + + It's an array of dictionaries with "/F" and "/I" properties or + None if there are no articles. + """ + catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) + if CO.THREADS in catalog: + return cast("ArrayObject", catalog[CO.THREADS]) + else: + return None + + def _get_page_number_by_indirect( + self, indirect_reference: Union[None, int, NullObject, IndirectObject] + ) -> int: + """ + Generate _page_id2num. + + Args: + indirect_reference: + + Returns: + The page number. + """ + if self._page_id2num is None: + self._page_id2num = { + x.indirect_reference.idnum: i for i, x in enumerate(self.pages) # type: ignore + } + + if indirect_reference is None or isinstance(indirect_reference, NullObject): + return -1 + if isinstance(indirect_reference, int): + idnum = indirect_reference + else: + idnum = indirect_reference.idnum + assert self._page_id2num is not None, "hint for mypy" + ret = self._page_id2num.get(idnum, -1) + return ret + + def get_page_number(self, page: PageObject) -> int: + """ + Retrieve page number of a given PageObject. + + Args: + page: The page to get page number. Should be + an instance of :class:`PageObject` + + Returns: + The page number or -1 if page is not found + """ + return self._get_page_number_by_indirect(page.indirect_reference) + + def getPageNumber(self, page: PageObject) -> int: # deprecated + """ + Use :meth:`get_page_number` instead. + + .. deprecated:: 1.28.0 + """ + deprecation_with_replacement("getPageNumber", "get_page_number", "3.0.0") + return self.get_page_number(page) + + def get_destination_page_number(self, destination: Destination) -> int: + """ + Retrieve page number of a given Destination object. 
def _build_destination(
    self,
    title: str,
    array: Optional[
        List[
            Union[NumberObject, IndirectObject, None, NullObject, DictionaryObject]
        ]
    ],
) -> Destination:
    """
    Create a :class:`Destination` from a raw destination array.

    Args:
        title: Title given to the destination.
        array: ``[page, fit_type, *fit_args]``. ``None``, a ``NullObject``,
            a string or an empty array yields a destination whose page is
            a ``NullObject``.

    Returns:
        The destination. If the fit cannot be built (``PdfReadError``)
        and the reader is not strict, a plain "fit" destination pointing
        at the first page is returned instead.

    Raises:
        PdfReadError: Re-raised in strict mode when the fit is invalid.
    """
    # handle outline items with missing or invalid destination
    destination_missing = (
        array is None
        or isinstance(array, (NullObject, str))
        or (isinstance(array, ArrayObject) and len(array) == 0)
    )
    if destination_missing:
        return Destination(title, NullObject(), Fit.fit())

    page, typ = array[0:2]  # type: ignore
    array = array[2:]
    try:
        return Destination(title, page, Fit(fit_type=typ, fit_args=array))  # type: ignore
    except PdfReadError:
        logger_warning(f"Unknown destination: {title} {array}", __name__)
        if self.strict:
            raise
        # create a link to first Page
        first_page_ref = self.pages[0].indirect_reference
        target = NullObject() if first_page_ref is None else first_page_ref
        return Destination(title, target, Fit.fit())  # type: ignore
cast(DictionaryObject, node["/A"]) + action_type = cast(NameObject, action[GoToActionArguments.S]) + if action_type == "/GoTo": + dest = action[GoToActionArguments.D] + elif "/Dest" in node: + # Destination, PDFv1.7 Section 12.3.2 + dest = node["/Dest"] + # if array was referenced in another object, will be a dict w/ key "/D" + if isinstance(dest, DictionaryObject) and "/D" in dest: + dest = dest["/D"] + + if isinstance(dest, ArrayObject): + outline_item = self._build_destination(title, dest) + elif isinstance(dest, str): + # named destination, addresses NameObject Issue #193 + # TODO : keep named destination instead of replacing it ? + try: + outline_item = self._build_destination( + title, self._namedDests[dest].dest_array + ) + except KeyError: + # named destination not found in Name Dict + outline_item = self._build_destination(title, None) + elif dest is None: + # outline item not required to have destination or action + # PDFv1.7 Table 153 + outline_item = self._build_destination(title, dest) + else: + if self.strict: + raise PdfReadError(f"Unexpected destination {dest!r}") + else: + logger_warning( + f"Removed unexpected destination {dest!r} from destination", + __name__, + ) + outline_item = self._build_destination(title, None) # type: ignore + + # if outline item created, add color, format, and child count if present + if outline_item: + if "/C" in node: + # Color of outline item font in (R, G, B) with values ranging 0.0-1.0 + outline_item[NameObject("/C")] = ArrayObject(FloatObject(c) for c in node["/C"]) # type: ignore + if "/F" in node: + # specifies style characteristics bold and/or italic + # with 1=italic, 2=bold, 3=both + outline_item[NameObject("/F")] = node["/F"] + if "/Count" in node: + # absolute value = num. 
@property
def page_layout(self) -> Optional[str]:
    """
    Return the page layout declared in the document catalog.

    ``None`` is returned when the catalog has no page-layout entry.

    .. list-table:: Valid ``layout`` values
       :widths: 50 200

       * - /NoLayout
         - Layout explicitly not specified
       * - /SinglePage
         - Show one page at a time
       * - /OneColumn
         - Show one column at a time
       * - /TwoColumnLeft
         - Show pages in two columns, odd-numbered pages on the left
       * - /TwoColumnRight
         - Show pages in two columns, odd-numbered pages on the right
       * - /TwoPageLeft
         - Show two pages at a time, odd-numbered pages on the left
       * - /TwoPageRight
         - Show two pages at a time, odd-numbered pages on the right
    """
    catalog = cast(DictionaryObject, self.trailer[TK.ROOT])
    if CD.PAGE_LAYOUT not in catalog:
        return None
    return cast(NameObject, catalog[CD.PAGE_LAYOUT])
@property
def page_mode(self) -> Optional[PagemodeType]:
    """
    Return the page mode declared in the document catalog.

    ``None`` is returned when the lookup raises ``KeyError``.

    .. list-table:: Valid ``mode`` values
       :widths: 50 200

       * - /UseNone
         - Do not show outline or thumbnails panels
       * - /UseOutlines
         - Show outline (aka bookmarks) panel
       * - /UseThumbs
         - Show page thumbnails panel
       * - /FullScreen
         - Fullscreen view
       * - /UseOC
         - Show Optional Content Group (OCG) panel
       * - /UseAttachments
         - Show attachments panel
    """
    try:
        mode = self.trailer[TK.ROOT]["/PageMode"]  # type: ignore
    except KeyError:
        # raised by either subscript: no catalog entry or no /PageMode
        return None
    return mode
def _flatten(
    self,
    pages: Union[None, DictionaryObject, PageObject] = None,
    inherit: Optional[Dict[str, Any]] = None,
    indirect_reference: Optional[IndirectObject] = None,
) -> None:
    """
    Walk the page tree and append every page to ``self.flattened_pages``.

    Args:
        pages: The node to process. ``None`` starts from the catalog's
            ``/Pages`` entry and resets ``self.flattened_pages``.
        inherit: Inheritable attributes collected from the ancestors of
            ``pages``. It is never modified: each ``/Pages`` node extends
            its own copy.
        indirect_reference: Reference of ``pages``, handed to the
            :class:`PageObject` created for a leaf.
    """
    inheritable_page_attributes = (
        NameObject(PG.RESOURCES),
        NameObject(PG.MEDIABOX),
        NameObject(PG.CROPBOX),
        NameObject(PG.ROTATE),
    )
    if inherit is None:
        inherit = {}
    if pages is None:
        # Fix issue 327: set flattened_pages attribute only for
        # decrypted file
        catalog = self.trailer[TK.ROOT].get_object()
        pages = catalog["/Pages"].get_object()  # type: ignore
        self.flattened_pages = []

    if PA.TYPE in pages:
        t = pages[PA.TYPE]  # type: ignore
    # if pdf has no type, considered as a page if /Kids is missing
    elif PA.KIDS not in pages:
        t = "/Page"
    else:
        t = "/Pages"

    if t == "/Pages":
        # Attributes are inherited by descendants only. Extend a copy so
        # that values set by this node do not leak, through the shared
        # dict, into sibling subtrees processed later by the caller.
        inherit = dict(inherit)
        for attr in inheritable_page_attributes:
            if attr in pages:
                inherit[attr] = pages[attr]
        for page in pages[PA.KIDS]:  # type: ignore
            addt = {}
            if isinstance(page, IndirectObject):
                addt["indirect_reference"] = page
            obj = page.get_object()
            if obj:
                # damaged file may have invalid child in /Pages
                self._flatten(obj, inherit, **addt)
    elif t == "/Page":
        for attr_in, value in list(inherit.items()):
            # if the page has it's own value, it does not inherit the
            # parent's value:
            if attr_in not in pages:
                pages[attr_in] = value
        page_obj = PageObject(self, indirect_reference)
        page_obj.update(pages)

        # TODO: Could flattened_pages be None at this point?
        self.flattened_pages.append(page_obj)  # type: ignore
def get_object(
    self, indirect_reference: Union[int, IndirectObject]
) -> Optional[PdfObject]:
    """
    Resolve an indirect reference to the object it designates.

    The resolved-object cache is consulted first, then the object streams
    (generation 0 only), then the cross-reference table. A reference found
    in neither is searched for in the raw file by its ``N G obj`` header.
    The result is stored in the cache before it is returned.

    Args:
        indirect_reference: The reference to resolve. A plain ``int`` is
            taken as an object number with generation 0.

    Returns:
        The resolved object; a ``NullObject`` for an entry flagged in
        ``xref_free_entry``; ``None`` when the object cannot be located
        and the reader is not strict.

    Raises:
        PdfReadError: In strict mode, when the header found in the file
            does not match the reference or the object cannot be found.
        FileNotDecryptedError: When the file is encrypted and has not been
            decrypted.
    """
    if isinstance(indirect_reference, int):
        indirect_reference = IndirectObject(indirect_reference, 0, self)
    retval = self.cache_get_indirect_object(
        indirect_reference.generation, indirect_reference.idnum
    )
    if retval is not None:
        return retval
    if (
        indirect_reference.generation == 0
        and indirect_reference.idnum in self.xref_objStm
    ):
        retval = self._get_object_from_stream(indirect_reference)  # type: ignore
    elif (
        indirect_reference.generation in self.xref
        and indirect_reference.idnum in self.xref[indirect_reference.generation]
    ):
        if self.xref_free_entry.get(indirect_reference.generation, {}).get(
            indirect_reference.idnum, False
        ):
            return NullObject()
        start = self.xref[indirect_reference.generation][indirect_reference.idnum]
        self.stream.seek(start, 0)
        try:
            idnum, generation = self.read_object_header(self.stream)
        except Exception:
            # The xref offset does not point at a readable header:
            # look for the header anywhere in the file instead.
            if hasattr(self.stream, "getbuffer"):
                buf = bytes(self.stream.getbuffer())  # type: ignore
            else:
                p = self.stream.tell()
                self.stream.seek(0, 0)
                buf = self.stream.read(-1)
                self.stream.seek(p, 0)
            m = re.search(
                rf"\s{indirect_reference.idnum}\s+{indirect_reference.generation}\s+obj".encode(),
                buf,
            )
            if m is not None:
                logger_warning(
                    f"Object ID {indirect_reference.idnum},{indirect_reference.generation} ref repaired",
                    __name__,
                )
                # + 1 skips the whitespace byte matched before the number
                self.xref[indirect_reference.generation][
                    indirect_reference.idnum
                ] = (m.start(0) + 1)
                self.stream.seek(m.start(0) + 1)
                idnum, generation = self.read_object_header(self.stream)
            else:
                idnum = -1  # exception will be raised below
                # Keep ``generation`` bound too: the strict-mode error
                # messages below format it, and would otherwise fail with
                # UnboundLocalError instead of raising PdfReadError.
                generation = -1
        if idnum != indirect_reference.idnum and self.xref_index:
            # Xref table probably had bad indexes due to not being zero-indexed
            if self.strict:
                raise PdfReadError(
                    f"Expected object ID ({indirect_reference.idnum} {indirect_reference.generation}) "
                    f"does not match actual ({idnum} {generation}); "
                    "xref table not zero-indexed."
                )
            # xref table is corrected in non-strict mode
        elif idnum != indirect_reference.idnum and self.strict:
            # some other problem
            raise PdfReadError(
                f"Expected object ID ({indirect_reference.idnum} "
                f"{indirect_reference.generation}) does not match actual "
                f"({idnum} {generation})."
            )
        if self.strict:
            assert generation == indirect_reference.generation
        retval = read_object(self.stream, self)  # type: ignore

        # override encryption is used for the /Encrypt dictionary
        if not self._override_encryption and self._encryption is not None:
            # if we don't have the encryption key:
            if not self._encryption.is_decrypted():
                raise FileNotDecryptedError("File has not been decrypted")
            # otherwise, decrypt here...
            retval = cast(PdfObject, retval)
            retval = self._encryption.decrypt_object(
                retval, indirect_reference.idnum, indirect_reference.generation
            )
    else:
        # Not in any xref table: search the raw file for the header.
        if hasattr(self.stream, "getbuffer"):
            buf = bytes(self.stream.getbuffer())  # type: ignore
        else:
            p = self.stream.tell()
            self.stream.seek(0, 0)
            buf = self.stream.read(-1)
            self.stream.seek(p, 0)
        m = re.search(
            rf"\s{indirect_reference.idnum}\s+{indirect_reference.generation}\s+obj".encode(),
            buf,
        )
        if m is not None:
            logger_warning(
                f"Object {indirect_reference.idnum} {indirect_reference.generation} found",
                __name__,
            )
            if indirect_reference.generation not in self.xref:
                self.xref[indirect_reference.generation] = {}
            self.xref[indirect_reference.generation][indirect_reference.idnum] = (
                m.start(0) + 1
            )
            self.stream.seek(m.end(0) + 1)
            skip_over_whitespace(self.stream)
            self.stream.seek(-1, 1)
            retval = read_object(self.stream, self)  # type: ignore

            # override encryption is used for the /Encrypt dictionary
            if not self._override_encryption and self._encryption is not None:
                # if we don't have the encryption key:
                if not self._encryption.is_decrypted():
                    raise FileNotDecryptedError("File has not been decrypted")
                # otherwise, decrypt here...
                retval = cast(PdfObject, retval)
                retval = self._encryption.decrypt_object(
                    retval, indirect_reference.idnum, indirect_reference.generation
                )
        else:
            logger_warning(
                f"Object {indirect_reference.idnum} {indirect_reference.generation} not defined.",
                __name__,
            )
            if self.strict:
                raise PdfReadError("Could not find object.")
    self.cache_indirect_object(
        indirect_reference.generation, indirect_reference.idnum, retval
    )
    return retval
def read_object_header(self, stream: StreamType) -> Tuple[int, int]:
    """
    Read an ``<idnum> <generation> obj`` header at the current position.

    Args:
        stream: Stream positioned at, or shortly before, the header.

    Returns:
        The object number and the generation number, as integers.

    Raises:
        ValueError: From ``int()`` when the tokens read are not integers.
    """
    # Should never be necessary to read out whitespace, since the
    # cross-reference table should put us in the right spot to read the
    # object header. In reality... some files have stupid cross reference
    # tables that are off by whitespace bytes.
    extra = False
    skip_over_comment(stream)
    # NOTE(review): skip_over_whitespace presumably reports whether more
    # whitespace than expected was consumed -- confirm against the helper.
    extra |= skip_over_whitespace(stream)
    stream.seek(-1, 1)  # step back one byte after each skip
    idnum = read_until_whitespace(stream)
    extra |= skip_over_whitespace(stream)
    stream.seek(-1, 1)
    generation = read_until_whitespace(stream)
    extra |= skip_over_whitespace(stream)
    stream.seek(-1, 1)

    # although it's not used, it might still be necessary to read
    _obj = stream.read(3)

    read_non_whitespace(stream)
    stream.seek(-1, 1)
    # the warning is only emitted by strict readers
    if extra and self.strict:
        logger_warning(
            f"Superfluous whitespace found in object header {idnum} {generation}",  # type: ignore
            __name__,
        )
    return int(idnum), int(generation)
deprecated:: 1.28.0 + """ + deprecation_with_replacement( + "cacheGetIndirectObject", "cache_get_indirect_object", "3.0.0" + ) + return self.cache_get_indirect_object(generation, idnum) + + def cache_indirect_object( + self, generation: int, idnum: int, obj: Optional[PdfObject] + ) -> Optional[PdfObject]: + if (generation, idnum) in self.resolved_objects: + msg = f"Overwriting cache for {generation} {idnum}" + if self.strict: + raise PdfReadError(msg) + logger_warning(msg, __name__) + self.resolved_objects[(generation, idnum)] = obj + if obj is not None: + obj.indirect_reference = IndirectObject(idnum, generation, self) + return obj + + def cacheIndirectObject( + self, generation: int, idnum: int, obj: Optional[PdfObject] + ) -> Optional[PdfObject]: # deprecated + """ + Use :meth:`cache_indirect_object` instead. + + .. deprecated:: 1.28.0 + """ + deprecation_with_replacement("cacheIndirectObject", "cache_indirect_object") + return self.cache_indirect_object(generation, idnum, obj) + + def read(self, stream: StreamType) -> None: + self._basic_validation(stream) + self._find_eof_marker(stream) + startxref = self._find_startxref_pos(stream) + + # check and eventually correct the startxref only in not strict + xref_issue_nr = self._get_xref_issues(stream, startxref) + if xref_issue_nr != 0: + if self.strict and xref_issue_nr: + raise PdfReadError("Broken xref table") + logger_warning(f"incorrect startxref pointer({xref_issue_nr})", __name__) + + # read all cross reference tables and their trailers + self._read_xref_tables_and_trailers(stream, startxref, xref_issue_nr) + + # if not zero-indexed, verify that the table is correct; change it if necessary + if self.xref_index and not self.strict: + loc = stream.tell() + for gen, xref_entry in self.xref.items(): + if gen == 65535: + continue + xref_k = sorted( + xref_entry.keys() + ) # must ensure ascendant to prevent damage + for id in xref_k: + stream.seek(xref_entry[id], 0) + try: + pid, _pgen = 
self.read_object_header(stream) + except ValueError: + break + if pid == id - self.xref_index: + # fixing index item per item is required for revised PDF. + self.xref[gen][pid] = self.xref[gen][id] + del self.xref[gen][id] + # if not, then either it's just plain wrong, or the + # non-zero-index is actually correct + stream.seek(loc, 0) # return to where it was + + def _basic_validation(self, stream: StreamType) -> None: + """Ensure file is not empty. Read at most 5 bytes.""" + stream.seek(0, os.SEEK_SET) + try: + header_byte = stream.read(5) + except UnicodeDecodeError: + raise UnsupportedOperation("cannot read header") + if header_byte == b"": + raise EmptyFileError("Cannot read an empty file") + elif header_byte != b"%PDF-": + if self.strict: + raise PdfReadError( + f"PDF starts with '{header_byte.decode('utf8')}', " + "but '%PDF-' expected" + ) + else: + logger_warning(f"invalid pdf header: {header_byte}", __name__) + stream.seek(0, os.SEEK_END) + + def _find_eof_marker(self, stream: StreamType) -> None: + """ + Jump to the %%EOF marker. + + According to the specs, the %%EOF marker should be at the very end of + the file. Hence for standard-compliant PDF documents this function will + read only the last part (DEFAULT_BUFFER_SIZE). + """ + HEADER_SIZE = 8 # to parse whole file, Header is e.g. '%PDF-1.6' + line = b"" + while line[:5] != b"%%EOF": + if stream.tell() < HEADER_SIZE: + if self.strict: + raise PdfReadError("EOF marker not found") + else: + logger_warning("EOF marker not found", __name__) + line = read_previous_line(stream) + + def _find_startxref_pos(self, stream: StreamType) -> int: + """ + Find startxref entry - the location of the xref table. 
+ + Args: + stream: + + Returns: + The bytes offset + """ + line = read_previous_line(stream) + try: + startxref = int(line) + except ValueError: + # 'startxref' may be on the same line as the location + if not line.startswith(b"startxref"): + raise PdfReadError("startxref not found") + startxref = int(line[9:].strip()) + logger_warning("startxref on same line as offset", __name__) + else: + line = read_previous_line(stream) + if line[:9] != b"startxref": + raise PdfReadError("startxref not found") + return startxref + + def _read_standard_xref_table(self, stream: StreamType) -> None: + # standard cross-reference table + ref = stream.read(3) + if ref != b"ref": + raise PdfReadError("xref table read error") + read_non_whitespace(stream) + stream.seek(-1, 1) + first_time = True # check if the first time looking at the xref table + while True: + num = cast(int, read_object(stream, self)) + if first_time and num != 0: + self.xref_index = num + if self.strict: + logger_warning( + "Xref table not zero-indexed. ID numbers for objects will be corrected.", + __name__, + ) + # if table not zero indexed, could be due to error from when PDF was created + # which will lead to mismatched indices later on, only warned and corrected if self.strict==True + first_time = False + read_non_whitespace(stream) + stream.seek(-1, 1) + size = cast(int, read_object(stream, self)) + read_non_whitespace(stream) + stream.seek(-1, 1) + cnt = 0 + while cnt < size: + line = stream.read(20) + + # It's very clear in section 3.4.3 of the PDF spec + # that all cross-reference table lines are a fixed + # 20 bytes (as of PDF 1.7). However, some files have + # 21-byte entries (or more) due to the use of \r\n + # (CRLF) EOL's. Detect that case, and adjust the line + # until it does not begin with a \r (CR) or \n (LF). 
+ while line[0] in b"\x0D\x0A": + stream.seek(-20 + 1, 1) + line = stream.read(20) + + # On the other hand, some malformed PDF files + # use a single character EOL without a preceding + # space. Detect that case, and seek the stream + # back one character. (0-9 means we've bled into + # the next xref entry, t means we've bled into the + # text "trailer"): + if line[-1] in b"0123456789t": + stream.seek(-1, 1) + + try: + offset_b, generation_b = line[:16].split(b" ") + entry_type_b = line[17:18] + + offset, generation = int(offset_b), int(generation_b) + except Exception: + # if something wrong occurred + if hasattr(stream, "getbuffer"): + buf = bytes(stream.getbuffer()) # type: ignore + else: + p = stream.tell() + stream.seek(0, 0) + buf = stream.read(-1) + stream.seek(p) + + f = re.search(f"{num}\\s+(\\d+)\\s+obj".encode(), buf) + if f is None: + logger_warning( + f"entry {num} in Xref table invalid; object not found", + __name__, + ) + generation = 65535 + offset = -1 + else: + logger_warning( + f"entry {num} in Xref table invalid but object found", + __name__, + ) + generation = int(f.group(1)) + offset = f.start() + + if generation not in self.xref: + self.xref[generation] = {} + self.xref_free_entry[generation] = {} + if num in self.xref[generation]: + # It really seems like we should allow the last + # xref table in the file to override previous + # ones. Since we read the file backwards, assume + # any existing key is already set correctly. + pass + else: + self.xref[generation][num] = offset + try: + self.xref_free_entry[generation][num] = entry_type_b == b"f" + except Exception: + pass + try: + self.xref_free_entry[65535][num] = entry_type_b == b"f" + except Exception: + pass + cnt += 1 + num += 1 + read_non_whitespace(stream) + stream.seek(-1, 1) + trailer_tag = stream.read(7) + if trailer_tag != b"trailer": + # more xrefs! 
+ stream.seek(-7, 1) + else: + break + + def _read_xref_tables_and_trailers( + self, stream: StreamType, startxref: Optional[int], xref_issue_nr: int + ) -> None: + self.xref: Dict[int, Dict[Any, Any]] = {} + self.xref_free_entry: Dict[int, Dict[Any, Any]] = {} + self.xref_objStm: Dict[int, Tuple[Any, Any]] = {} + self.trailer = DictionaryObject() + while startxref is not None: + # load the xref table + stream.seek(startxref, 0) + x = stream.read(1) + if x in b"\r\n": + x = stream.read(1) + if x == b"x": + startxref = self._read_xref(stream) + elif xref_issue_nr: + try: + self._rebuild_xref_table(stream) + break + except Exception: + xref_issue_nr = 0 + elif x.isdigit(): + try: + xrefstream = self._read_pdf15_xref_stream(stream) + except Exception as e: + if TK.ROOT in self.trailer: + logger_warning( + f"Previous trailer can not be read {e.args}", + __name__, + ) + break + else: + raise PdfReadError(f"trailer can not be read {e.args}") + trailer_keys = TK.ROOT, TK.ENCRYPT, TK.INFO, TK.ID, TK.SIZE + for key in trailer_keys: + if key in xrefstream and key not in self.trailer: + self.trailer[NameObject(key)] = xrefstream.raw_get(key) + if "/XRefStm" in xrefstream: + p = stream.tell() + stream.seek(cast(int, xrefstream["/XRefStm"]) + 1, 0) + self._read_pdf15_xref_stream(stream) + stream.seek(p, 0) + if "/Prev" in xrefstream: + startxref = cast(int, xrefstream["/Prev"]) + else: + break + else: + startxref = self._read_xref_other_error(stream, startxref) + + def _read_xref(self, stream: StreamType) -> Optional[int]: + self._read_standard_xref_table(stream) + read_non_whitespace(stream) + stream.seek(-1, 1) + new_trailer = cast(Dict[str, Any], read_object(stream, self)) + for key, value in new_trailer.items(): + if key not in self.trailer: + self.trailer[key] = value + if "/XRefStm" in new_trailer: + p = stream.tell() + stream.seek(cast(int, new_trailer["/XRefStm"]) + 1, 0) + try: + self._read_pdf15_xref_stream(stream) + except Exception: + logger_warning( + f"XRef object 
at {new_trailer['/XRefStm']} can not be read, some object may be missing", + __name__, + ) + stream.seek(p, 0) + if "/Prev" in new_trailer: + startxref = new_trailer["/Prev"] + return startxref + else: + return None + + def _read_xref_other_error( + self, stream: StreamType, startxref: int + ) -> Optional[int]: + # some PDFs have /Prev=0 in the trailer, instead of no /Prev + if startxref == 0: + if self.strict: + raise PdfReadError( + "/Prev=0 in the trailer (try opening with strict=False)" + ) + logger_warning( + "/Prev=0 in the trailer - assuming there is no previous xref table", + __name__, + ) + return None + # bad xref character at startxref. Let's see if we can find + # the xref table nearby, as we've observed this error with an + # off-by-one before. + stream.seek(-11, 1) + tmp = stream.read(20) + xref_loc = tmp.find(b"xref") + if xref_loc != -1: + startxref -= 10 - xref_loc + return startxref + # No explicit xref table, try finding a cross-reference stream. + stream.seek(startxref, 0) + for look in range(25): # value extended to cope with more linearized files + if stream.read(1).isdigit(): + # This is not a standard PDF, consider adding a warning + startxref += look + return startxref + # no xref table found at specified location + if "/Root" in self.trailer and not self.strict: + # if Root has been already found, just raise warning + logger_warning("Invalid parent xref., rebuild xref", __name__) + try: + self._rebuild_xref_table(stream) + return None + except Exception: + raise PdfReadError("can not rebuild xref") + raise PdfReadError("Could not find xref table at specified location") + + def _read_pdf15_xref_stream( + self, stream: StreamType + ) -> Union[ContentStream, EncodedStreamObject, DecodedStreamObject]: + # PDF 1.5+ Cross-Reference Stream + stream.seek(-1, 1) + idnum, generation = self.read_object_header(stream) + xrefstream = cast(ContentStream, read_object(stream, self)) + assert cast(str, xrefstream["/Type"]) == "/XRef" + 
self.cache_indirect_object(generation, idnum, xrefstream) + stream_data = BytesIO(b_(xrefstream.get_data())) + # Index pairs specify the subsections in the dictionary. If + # none create one subsection that spans everything. + idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")]) + entry_sizes = cast(Dict[Any, Any], xrefstream.get("/W")) + assert len(entry_sizes) >= 3 + if self.strict and len(entry_sizes) > 3: + raise PdfReadError(f"Too many entry sizes: {entry_sizes}") + + def get_entry(i: int) -> Union[int, Tuple[int, ...]]: + # Reads the correct number of bytes for each entry. See the + # discussion of the W parameter in PDF spec table 17. + if entry_sizes[i] > 0: + d = stream_data.read(entry_sizes[i]) + return convert_to_int(d, entry_sizes[i]) + + # PDF Spec Table 17: A value of zero for an element in the + # W array indicates...the default value shall be used + if i == 0: + return 1 # First value defaults to 1 + else: + return 0 + + def used_before(num: int, generation: Union[int, Tuple[int, ...]]) -> bool: + # We move backwards through the xrefs, don't replace any. + return num in self.xref.get(generation, []) or num in self.xref_objStm # type: ignore + + # Iterate through each subsection + self._read_xref_subsections(idx_pairs, get_entry, used_before) + return xrefstream + + @staticmethod + def _get_xref_issues(stream: StreamType, startxref: int) -> int: + """ + Return an int which indicates an issue. 0 means there is no issue. + + Args: + stream: + startxref: + + Returns: + 0 means no issue, other values represent specific issues. 
+ """ + stream.seek(startxref - 1, 0) # -1 to check character before + line = stream.read(1) + if line == b"j": + line = stream.read(1) + if line not in b"\r\n \t": + return 1 + line = stream.read(4) + if line != b"xref": + # not an xref so check if it is an XREF object + line = b"" + while line in b"0123456789 \t": + line = stream.read(1) + if line == b"": + return 2 + line += stream.read(2) # 1 char already read, +2 to check "obj" + if line.lower() != b"obj": + return 3 + return 0 + + def _rebuild_xref_table(self, stream: StreamType) -> None: + self.xref = {} + stream.seek(0, 0) + f_ = stream.read(-1) + + for m in re.finditer(rb"[\r\n \t][ \t]*(\d+)[ \t]+(\d+)[ \t]+obj", f_): + idnum = int(m.group(1)) + generation = int(m.group(2)) + if generation not in self.xref: + self.xref[generation] = {} + self.xref[generation][idnum] = m.start(1) + stream.seek(0, 0) + for m in re.finditer(rb"[\r\n \t][ \t]*trailer[\r\n \t]*(<<)", f_): + stream.seek(m.start(1), 0) + new_trailer = cast(Dict[Any, Any], read_object(stream, self)) + # Here, we are parsing the file from start to end, the new data have to erase the existing. 
+ for key, value in list(new_trailer.items()): + self.trailer[key] = value + + def _read_xref_subsections( + self, + idx_pairs: List[int], + get_entry: Callable[[int], Union[int, Tuple[int, ...]]], + used_before: Callable[[int, Union[int, Tuple[int, ...]]], bool], + ) -> None: + for start, size in self._pairs(idx_pairs): + # The subsections must increase + for num in range(start, start + size): + # The first entry is the type + xref_type = get_entry(0) + # The rest of the elements depend on the xref_type + if xref_type == 0: + # linked list of free objects + next_free_object = get_entry(1) # noqa: F841 + next_generation = get_entry(2) # noqa: F841 + elif xref_type == 1: + # objects that are in use but are not compressed + byte_offset = get_entry(1) + generation = get_entry(2) + if generation not in self.xref: + self.xref[generation] = {} # type: ignore + if not used_before(num, generation): + self.xref[generation][num] = byte_offset # type: ignore + elif xref_type == 2: + # compressed objects + objstr_num = get_entry(1) + obstr_idx = get_entry(2) + generation = 0 # PDF spec table 18, generation is 0 + if not used_before(num, generation): + self.xref_objStm[num] = (objstr_num, obstr_idx) + elif self.strict: + raise PdfReadError(f"Unknown xref type: {xref_type}") + + def _pairs(self, array: List[int]) -> Iterable[Tuple[int, int]]: + i = 0 + while True: + yield array[i], array[i + 1] + i += 2 + if (i + 1) >= len(array): + break + + def read_next_end_line( + self, stream: StreamType, limit_offset: int = 0 + ) -> bytes: # deprecated + """.. 
deprecated:: 2.1.0""" + deprecate_no_replacement("read_next_end_line", removed_in="4.0.0") + line_parts = [] + while True: + # Prevent infinite loops in malformed PDFs + if stream.tell() == 0 or stream.tell() == limit_offset: + raise PdfReadError("Could not read malformed PDF file") + x = stream.read(1) + if stream.tell() < 2: + raise PdfReadError("EOL marker not found") + stream.seek(-2, 1) + if x in (b"\n", b"\r"): # \n = LF; \r = CR + crlf = False + while x in (b"\n", b"\r"): + x = stream.read(1) + if x in (b"\n", b"\r"): # account for CR+LF + stream.seek(-1, 1) + crlf = True + if stream.tell() < 2: + raise PdfReadError("EOL marker not found") + stream.seek(-2, 1) + stream.seek( + 2 if crlf else 1, 1 + ) # if using CR+LF, go back 2 bytes, else 1 + break + else: + line_parts.append(x) + line_parts.reverse() + return b"".join(line_parts) + + def readNextEndLine( + self, stream: StreamType, limit_offset: int = 0 + ) -> bytes: # deprecated + """.. deprecated:: 1.28.0""" + deprecation_no_replacement("readNextEndLine", "3.0.0") + return self.read_next_end_line(stream, limit_offset) + + def decrypt(self, password: Union[str, bytes]) -> PasswordType: + """ + When using an encrypted / secured PDF file with the PDF Standard + encryption handler, this function will allow the file to be decrypted. + It checks the given password against the document's user password and + owner password, and then stores the resulting decryption key if either + password is correct. + + It does not matter which password was matched. Both passwords provide + the correct decryption key that will allow the document to be used with + this library. + + Args: + password: The password to match. + + Returns: + An indicator if the document was decrypted and weather it was the + owner password or the user password. 
+ """ + if not self._encryption: + raise PdfReadError("Not encrypted file") + # TODO: raise Exception for wrong password + return self._encryption.verify(password) + + def decode_permissions(self, permissions_code: int) -> Dict[str, bool]: + # Takes the permissions as an integer, returns the allowed access + permissions = {} + permissions["print"] = permissions_code & (1 << 3 - 1) != 0 # bit 3 + permissions["modify"] = permissions_code & (1 << 4 - 1) != 0 # bit 4 + permissions["copy"] = permissions_code & (1 << 5 - 1) != 0 # bit 5 + permissions["annotations"] = permissions_code & (1 << 6 - 1) != 0 # bit 6 + permissions["forms"] = permissions_code & (1 << 9 - 1) != 0 # bit 9 + permissions["accessability"] = permissions_code & (1 << 10 - 1) != 0 # bit 10 + permissions["assemble"] = permissions_code & (1 << 11 - 1) != 0 # bit 11 + permissions["print_high_quality"] = ( + permissions_code & (1 << 12 - 1) != 0 + ) # bit 12 + return permissions + + @property + def is_encrypted(self) -> bool: + """ + Read-only boolean property showing whether this PDF file is encrypted. + + Note that this property, if true, will remain true even after the + :meth:`decrypt()` method is called. + """ + return TK.ENCRYPT in self.trailer + + def getIsEncrypted(self) -> bool: # deprecated + """ + Use :py:attr:`is_encrypted` instead. + + .. deprecated:: 1.28.0 + """ + deprecation_with_replacement("getIsEncrypted", "is_encrypted", "3.0.0") + return self.is_encrypted + + @property + def isEncrypted(self) -> bool: # deprecated + """ + Use :py:attr:`is_encrypted` instead. + + .. 
deprecated:: 1.28.0 + """ + deprecation_with_replacement("isEncrypted", "is_encrypted", "3.0.0") + return self.is_encrypted + + @property + def xfa(self) -> Optional[Dict[str, Any]]: + tree: Optional[TreeObject] = None + retval: Dict[str, Any] = {} + catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) + + if "/AcroForm" not in catalog or not catalog["/AcroForm"]: + return None + + tree = cast(TreeObject, catalog["/AcroForm"]) + + if "/XFA" in tree: + fields = cast(ArrayObject, tree["/XFA"]) + i = iter(fields) + for f in i: + tag = f + f = next(i) + if isinstance(f, IndirectObject): + field = cast(Optional[EncodedStreamObject], f.get_object()) + if field: + es = zlib.decompress(b_(field._data)) + retval[tag] = es + return retval + + def add_form_topname(self, name: str) -> Optional[DictionaryObject]: + """ + Add a top level form that groups all form fields below it. + + Args: + name: text string of the "/T" Attribute of the created object + + Returns: + The created object. ``None`` means no object was created. 
+ """ + catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) + + if "/AcroForm" not in catalog or not isinstance( + catalog["/AcroForm"], DictionaryObject + ): + return None + acroform = cast(DictionaryObject, catalog[NameObject("/AcroForm")]) + if "/Fields" not in acroform: + # TODO: :No error returns but may be extended for XFA Forms + return None + + interim = DictionaryObject() + interim[NameObject("/T")] = TextStringObject(name) + interim[NameObject("/Kids")] = acroform[NameObject("/Fields")] + self.cache_indirect_object( + 0, + max([i for (g, i) in self.resolved_objects if g == 0]) + 1, + interim, + ) + arr = ArrayObject() + arr.append(interim.indirect_reference) + acroform[NameObject("/Fields")] = arr + for o in cast(ArrayObject, interim["/Kids"]): + obj = o.get_object() + if "/Parent" in obj: + logger_warning( + f"Top Level Form Field {obj.indirect_reference} have a non-expected parent", + __name__, + ) + obj[NameObject("/Parent")] = interim.indirect_reference + return interim + + def rename_form_topname(self, name: str) -> Optional[DictionaryObject]: + """ + Rename top level form field that all form fields below it. + + Args: + name: text string of the "/T" field of the created object + + Returns: + The modified object. ``None`` means no object was modified. 
+ """ + catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) + + if "/AcroForm" not in catalog or not isinstance( + catalog["/AcroForm"], DictionaryObject + ): + return None + acroform = cast(DictionaryObject, catalog[NameObject("/AcroForm")]) + if "/Fields" not in acroform: + return None + + interim = cast( + DictionaryObject, + cast(ArrayObject, acroform[NameObject("/Fields")])[0].get_object(), + ) + interim[NameObject("/T")] = TextStringObject(name) + return interim + + def _get_embedded_files_root(self) -> Optional[NameTree]: + """ + Returns the EmbeddedFiles root as a NameTree Object + if the root does not exists, return None + """ + catalog = cast(DictionaryObject, self.trailer["/Root"]) + if "/Names" not in catalog: + return None + ef = cast(DictionaryObject, catalog["/Names"]).get("/EmbeddedFiles", None) + if ef is None: + return None + efo = ef.get_object() + # not for reader + """ + if not isinstance(efo,NameTree): + if isinstance(ef,IndirectObject): + ef.replace_object(efo) + else: + cast(DictionaryObject,catalog["/Names"])[ + NameObject("/EmbeddedFiles")] = NameTree(efo) + """ + return NameTree(efo) + + @property + def embedded_files(self) -> Optional[Mapping[str, List[PdfObject]]]: + ef = self._get_embedded_files_root() + if ef: + return ef.list_items() + else: + return None + + @property + def attachments(self) -> Mapping[str, List[Union[bytes, Dict[str, bytes]]]]: + ef = self._get_embedded_files_root() + if ef: + d: Dict[str, List[Union[bytes, Dict[str, bytes]]]] = {} + for k, v in ef.list_items().items(): + if isinstance(v, list): + if k not in d: + d[k] = [] # type: ignore + for e in v: + e = cast(DictionaryObject, e.get_object()) + if "/EF" in e: + d[k].append(e["/EF"]["/F"].get_data()) # type: ignore + elif "/RF" in e: + r = cast( + ArrayObject, cast(DictionaryObject, e["/RF"])["/F"] + ) + di: Dict[str, bytes] = {} + i = 0 + while i < len(r): + di[cast(str, r[i])] = r[i + 1].get_object().get_data() + i += 2 + d[k].append(di) + return d + else: 
+ return {} + + def _list_attachments(self) -> List[str]: + """ + Retrieves the list of filenames of file attachments. + + Returns: + list of filenames + """ + ef = self._get_embedded_files_root() + if ef: + lst = ef.list_keys() + else: + lst = [] + """ + for ip, p in enumerate(self.pages): + for a in [_a.get_object() + for _a in p.get("/Annots",[])]: + if _a.get_object().get("/Subtype","") != "/FileAttachements": + continue + lst.append(f"$page_{ip}.{get_name_from_file_specification(_a)}") + """ + return lst + + def _get_attachment_list(self, name: str) -> List[Union[bytes, Dict[str, bytes]]]: + out = self._get_attachments(name)[name] + if isinstance(out, list): + return out + return [out] + + def _get_attachments( + self, filename: Optional[str] = None + ) -> Dict[str, List[Union[bytes, Dict[str, bytes]]]]: + """ + Retrieves all or selected file attachments of the PDF as a dictionary of file names + and the file data as a bytestring. + + Args: + filename: If filename is None, then a dictionary of all attachments + will be returned, where the key is the filename and the value + is the content. Otherwise, a dictionary with just a single key + - the filename - and its content will be returned. 
+ + Returns: + dictionary of filename -> Union[bytestring or List[ByteString]] + if the filename exists multiple times a List of the different version will be provided + """ + ef = self._get_embedded_files_root() + if ef is None: + return {} + if filename is None: + return {k: v if len(v) > 1 else v[0] for k, v in self.attachments.items()} # type: ignore + else: + lst = ef.list_get(filename) + return { + filename: [(x.get_object())["/EF"].get_object( # type: ignore + )["/F"].get_object().get_data() for x in lst] # type: ignore + if isinstance(lst, list) + else (lst.get_object())["/EF"].get_object()["/F"].get_object().get_data() # type: ignore + } + + +class PdfFileReader(PdfReader): # deprecated + def __init__(self, *args: Any, **kwargs: Any) -> None: + deprecation_with_replacement("PdfFileReader", "PdfReader", "3.0.0") + if "strict" not in kwargs and len(args) < 2: + kwargs["strict"] = True # maintain the default + super().__init__(*args, **kwargs) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 0ea864305..bea4c11ab 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -297,15 +297,13 @@ def _replace_object( obj: PdfObject, ) -> PdfObject: if isinstance(indirect_reference, IndirectObject): - assert indirect_reference.pdf == self + if indirect_reference.pdf != self: + raise ValueError("pdf must be self") indirect_reference = indirect_reference.idnum gen = self._objects[indirect_reference - 1].indirect_reference.generation # type: ignore self._objects[indirect_reference - 1] = obj - return self._objects[indirect_reference - 1] - if indirect_reference.pdf != self: - raise ValueError("pdf must be self") obj.indirect_reference = IndirectObject(indirect_reference, gen, self) - return self._objects[indirect_reference.idnum - 1] # type: ignore + return self._objects[indirect_reference - 1] # type: ignore def _add_page( self, @@ -744,25 +742,36 @@ def embedded_files(self) -> Optional[Mapping[str, List[PdfObject]]]: else: return None + def _list_attachments(self) 
-> List[str]: + ef = self._get_embedded_files_root() + if ef: + return ef.list_keys() + else: + return [] + @property - def attachments(self) -> Mapping[str, Union[List[bytes], List[Dict[str, bytes]]]]: + def attachments(self) -> Mapping[str, List[Union[bytes, Dict[str, bytes]]]]: ef = self._get_embedded_files_root() if ef: - d = {} + d: Dict[str, List[Union[bytes, Dict[str, bytes]]]] = {} for k, v in ef.list_items().items(): if isinstance(v, list): if k not in d: - d[k] = [] + d[k] = [] # type: ignore for e in v: - e = e.get_object() + e = cast(DictionaryObject, e.get_object()) if "/EF" in e: d[k].append(e["/EF"]["/F"].get_data()) # type: ignore elif "/RF" in e: - r = cast(ArrayObject, e["/RF"]["/F"]) + r = cast( + ArrayObject, cast(DictionaryObject, e["/RF"])["/F"] + ) di = {} i = 0 while i < len(r): - di[r[i]] = r[i + 1].get_object().get_data() + di[cast(str, r[i])] = cast( + bytes, r[i + 1].get_object().get_data() + ) i += 2 d[k].append(di) return d @@ -773,9 +782,10 @@ def add_attachment( self, filename: str, data: Union[str, bytes, List[Tuple[str, bytes]]], + overwrite: bool = True, fname: Optional[str] = None, desc: str = "", - ) -> DictionaryObject: + ) -> Optional[DictionaryObject]: """ Embed a file inside the PDF. 
@@ -793,6 +803,8 @@ def add_attachment( Returns: The filespec DictionaryObject """ + if not overwrite and filename in self._list_attachments(): + return None if fname is None: st = filename.replace("/", "\\/").replace("\\\\/", "\\/") fname = st.encode().decode("ansi", errors="xmlcharreplace") @@ -862,7 +874,7 @@ def add_attachment( filespec[NameObject(FileSpecificationDictionaryEntries.EF)] = ef_entry nm = self._get_embedded_files_root() or self._create_attachment_root() - nm.list_add(filename, self._add_object(filespec)) + nm.list_add(filename, filespec, overwrite=True) return filespec def addAttachment(self, fname: str, fdata: Union[str, bytes]) -> None: # deprecated @@ -872,7 +884,7 @@ def addAttachment(self, fname: str, fdata: Union[str, bytes]) -> None: # deprec .. deprecated:: 1.28.0 """ deprecation_with_replacement("addAttachment", "add_attachment", "3.0.0") - return self.add_attachment(fname, fdata) + self.add_attachment(fname, fdata) def append_pages_from_reader( self, diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 6e78f2543..a0c56c69f 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -1457,11 +1457,14 @@ class NameTree(DictionaryObject): """ def __init__(self, obj: Optional[PdfObject] = None) -> None: + DictionaryObject.__init__(self) + if obj is None: + self[NameObject("/Names")] = ArrayObject() + return if not isinstance(obj, DictionaryObject) or all( x not in obj for x in ("/Names", "/Kids") ): raise ValueError("source object is not a valid source object") - DictionaryObject.__init__(self) obj = cast(DictionaryObject, obj) if obj is not None: self.update(obj) @@ -1603,16 +1606,20 @@ def list_add( def _update_limits( obj: DictionaryObject, - lo: Optional[TextStringObject], - hi: Optional[TextStringObject], + lo: Optional[Union[str, TextStringObject]], + hi: Optional[Union[str, TextStringObject]], ) -> bool: if "/Limits" not in obj: return False a = cast("ArrayObject", 
obj["/Limits"]) if lo is not None and lo < a[0]: + if not isinstance(lo, TextStringObject): + lo = TextStringObject(lo) a[0] = lo return True if hi is not None and hi > a[0]: + if not isinstance(hi, TextStringObject): + lo = TextStringObject(hi) a[1] = hi return True return False @@ -1626,17 +1633,18 @@ def _add_in( o = cast(DictionaryObject, o) if "/Names" in o: _l = cast(ArrayObject, o["/Names"]) - li = o.get("/Limits", [_l[0], _l[-2]]) - if not appb and key < li[0]: - return None - if not app and key > li[1]: - return None + if len(_l) > 0: + li = o.get("/Limits", [_l[0], _l[-2]]) + if not appb and key < li[0]: + return None + if not app and key > li[1]: + return None i = 0 while i < len(_l): if _l[i] == key: - if not overwrite: - continue d = _l[i + 1] + if not overwrite: + return d if isinstance(d, IndirectObject): d.replace_object(data) else: # pragma: no cover diff --git a/tests/test_writer.py b/tests/test_writer.py index cab469903..cc0574459 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1,1864 +1,1871 @@ -"""Test the pypdf._writer module.""" -import re -import shutil -import subprocess -from io import BytesIO -from pathlib import Path - -import pytest - -from pypdf import ( - ObjectDeletionFlag, - PageObject, - PdfMerger, - PdfReader, - PdfWriter, - Transformation, -) -from pypdf.errors import DeprecationError, PageSizeNotDefinedError, PyPdfError -from pypdf.generic import ( - ArrayObject, - ContentStream, - DictionaryObject, - Fit, - IndirectObject, - NameObject, - NullObject, - NumberObject, - RectangleObject, - StreamObject, - TextStringObject, -) - -from . 
import get_data_from_url, is_sublist -from .test_images import image_similarity - -TESTS_ROOT = Path(__file__).parent.resolve() -PROJECT_ROOT = TESTS_ROOT.parent -RESOURCE_ROOT = PROJECT_ROOT / "resources" -SAMPLE_ROOT = Path(PROJECT_ROOT) / "sample-files" -GHOSTSCRIPT_BINARY = shutil.which("gs") - - -def test_writer_exception_non_binary(tmp_path, caplog): - src = RESOURCE_ROOT / "pdflatex-outline.pdf" - - reader = PdfReader(src) - writer = PdfWriter() - writer.add_page(reader.pages[0]) - - with open(tmp_path / "out.txt", "w") as fp, pytest.raises(TypeError): - writer.write_stream(fp) - ending = "to write to is not in binary mode. It may not be written to correctly.\n" - assert caplog.text.endswith(ending) - - -def test_writer_clone(): - src = RESOURCE_ROOT / "pdflatex-outline.pdf" - - reader = PdfReader(src) - writer = PdfWriter(clone_from=reader) - assert len(writer.pages) == 4 - assert "PageObject" in str(type(writer.pages[0])) - - writer = PdfWriter(clone_from=src) - assert len(writer.pages) == 4 - assert "PageObject" in str(type(writer.pages[0])) - - -def test_writer_clone_bookmarks(): - # Arrange - src = RESOURCE_ROOT / "Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf" - reader = PdfReader(src) - writer = PdfWriter() - - # Act + test cat - cat = "" - - def cat1(p) -> None: - nonlocal cat - cat += p.__repr__() - - writer.clone_document_from_reader(reader, cat1) - assert "/Page" in cat - assert writer.pages[0].raw_get("/Parent") == writer._pages - writer.add_outline_item("Page 1", 0) - writer.add_outline_item("Page 2", 1) - - # Assert - bytes_stream = BytesIO() - writer.write(bytes_stream) - bytes_stream.seek(0) - reader2 = PdfReader(bytes_stream) - assert len(reader2.pages) == len(reader.pages) - assert len(reader2.outline) == 2 - - # test with append - writer = PdfWriter() - writer.append(reader) - writer.add_outline_item("Page 1", 0) - writer.add_outline_item("Page 2", 1) - - # Assert - bytes_stream = BytesIO() - writer.write(bytes_stream) - 
bytes_stream.seek(0) - reader2 = PdfReader(bytes_stream) - assert len(reader2.pages) == len(reader.pages) - assert len(reader2.outline) == 2 - - -def writer_operate(writer: PdfWriter) -> None: - """ - To test the writer that initialized by each of the four usages. - - Args: - writer: A PdfWriter object - """ - pdf_path = RESOURCE_ROOT / "crazyones.pdf" - pdf_outline_path = RESOURCE_ROOT / "pdflatex-outline.pdf" - - reader = PdfReader(pdf_path) - reader_outline = PdfReader(pdf_outline_path) - - page = reader.pages[0] - with pytest.raises(PageSizeNotDefinedError) as exc: - writer.add_blank_page() - assert exc.value.args == () - writer.insert_page(page, 1) - writer.insert_page(reader_outline.pages[0], 0) - writer.add_outline_item_destination(page) - writer.remove_links() - writer.add_outline_item_destination(page) - oi = writer.add_outline_item( - "An outline item", 0, None, (255, 0, 15), True, True, Fit.fit_box_vertically(10) - ) - writer.add_outline_item( - "The XYZ fit", 0, oi, (255, 0, 15), True, True, Fit.xyz(left=10, top=20, zoom=3) - ) - writer.add_outline_item( - "The XYZ fit no args", 0, oi, (255, 0, 15), True, True, Fit.xyz() - ) - writer.add_outline_item( - "The FitH fit", 0, oi, (255, 0, 15), True, True, Fit.fit_horizontally(top=10) - ) - writer.add_outline_item( - "The FitV fit", 0, oi, (255, 0, 15), True, True, Fit.fit_vertically(left=10) - ) - writer.add_outline_item( - "The FitR fit", - 0, - oi, - (255, 0, 15), - True, - True, - Fit.fit_rectangle(left=10, bottom=20, right=30, top=40), - ) - writer.add_outline_item( - "The FitB fit", 0, oi, (255, 0, 15), True, True, Fit.fit_box() - ) - writer.add_outline_item( - "The FitBH fit", - 0, - oi, - (255, 0, 15), - True, - True, - Fit.fit_box_horizontally(top=10), - ) - writer.add_outline_item( - "The FitBV fit", - 0, - oi, - (255, 0, 15), - True, - True, - Fit.fit_box_vertically(left=10), - ) - writer.add_blank_page() - writer.add_uri(2, "https://example.com", RectangleObject([0, 0, 100, 100])) - with 
pytest.warns( - DeprecationWarning, match="'pagenum' argument of add_uri is deprecated" - ): - writer.add_uri( - 2, "https://example.com", RectangleObject([0, 0, 100, 100]), pagenum=2 - ) - with pytest.raises(DeprecationError): - writer.add_link(2, 1, RectangleObject([0, 0, 100, 100])) - assert writer._get_page_layout() is None - writer.page_layout = "broken" - assert writer.page_layout == "broken" - writer.page_layout = NameObject("/SinglePage") - assert writer._get_page_layout() == "/SinglePage" - assert writer._get_page_mode() is None - writer.set_page_mode("/UseNone") - assert writer._get_page_mode() == "/UseNone" - writer.set_page_mode(NameObject("/UseOC")) - assert writer._get_page_mode() == "/UseOC" - writer.insert_blank_page(width=100, height=100) - writer.insert_blank_page() # without parameters - - writer.remove_images() - - writer.add_metadata(reader.metadata) - writer.add_metadata({"/Author": "Martin Thoma"}) - writer.add_metadata({"/MyCustom": 1234}) - - writer.add_attachment("foobar.gif", b"foobarcontent") - - # Check that every key in _idnum_hash is correct - objects_hash = [o.hash_value() for o in writer._objects] - for k, v in writer._idnum_hash.items(): - assert v.pdf == writer - assert k in objects_hash, f"Missing {v}" - - -tmp_path = "dont_commit_writer.pdf" - - -@pytest.mark.parametrize( - ("write_data_here", "needs_cleanup"), - [ - ("dont_commit_writer.pdf", True), - (Path("dont_commit_writer.pdf"), True), - (BytesIO(), False), - ], -) -def test_writer_operations_by_traditional_usage(write_data_here, needs_cleanup): - writer = PdfWriter() - - writer_operate(writer) - - # finally, write "output" to pypdf-output.pdf - if needs_cleanup: - with open(write_data_here, "wb") as output_stream: - writer.write(output_stream) - else: - output_stream = write_data_here - writer.write(output_stream) - - if needs_cleanup: - Path(write_data_here).unlink() - - -@pytest.mark.parametrize( - ("write_data_here", "needs_cleanup"), - [ - ("dont_commit_writer.pdf", 
True), - (Path("dont_commit_writer.pdf"), True), - (BytesIO(), False), - ], -) -def test_writer_operations_by_semi_traditional_usage(write_data_here, needs_cleanup): - with PdfWriter() as writer: - writer_operate(writer) - - # finally, write "output" to pypdf-output.pdf - if needs_cleanup: - with open(write_data_here, "wb") as output_stream: - writer.write(output_stream) - else: - output_stream = write_data_here - writer.write(output_stream) - - if needs_cleanup: - Path(write_data_here).unlink() - - -@pytest.mark.parametrize( - ("write_data_here", "needs_cleanup"), - [ - ("dont_commit_writer.pdf", True), - (Path("dont_commit_writer.pdf"), True), - (BytesIO(), False), - ], -) -def test_writer_operations_by_semi_new_traditional_usage( - write_data_here, needs_cleanup -): - with PdfWriter() as writer: - writer_operate(writer) - - # finally, write "output" to pypdf-output.pdf - writer.write(write_data_here) - - if needs_cleanup: - Path(write_data_here).unlink() - - -@pytest.mark.parametrize( - ("write_data_here", "needs_cleanup"), - [ - ("dont_commit_writer.pdf", True), - (Path("dont_commit_writer.pdf"), True), - (BytesIO(), False), - ], -) -def test_writer_operation_by_new_usage(write_data_here, needs_cleanup): - # This includes write "output" to pypdf-output.pdf - with PdfWriter(write_data_here) as writer: - writer_operate(writer) - - if needs_cleanup: - Path(write_data_here).unlink() - - -@pytest.mark.parametrize( - "input_path", - [ - "side-by-side-subfig.pdf", - "reportlab-inline-image.pdf", - ], -) -def test_remove_images(pdf_file_path, input_path): - pdf_path = RESOURCE_ROOT / input_path - - reader = PdfReader(pdf_path) - writer = PdfWriter() - - page = reader.pages[0] - writer.insert_page(page, 0) - writer.remove_images() - page_contents_stream = writer.pages[0]["/Contents"]._data - assert len(page_contents_stream.strip()) - - # finally, write "output" to pypdf-output.pdf - with open(pdf_file_path, "wb") as output_stream: - writer.write(output_stream) - - with 
open(pdf_file_path, "rb") as input_stream: - reader = PdfReader(input_stream) - if input_path == "side-by-side-subfig.pdf": - extracted_text = reader.pages[0].extract_text() - assert extracted_text - assert "Lorem ipsum dolor sit amet" in extracted_text - - -@pytest.mark.enable_socket() -def test_remove_images_sub_level(): - """Cf #2035""" - url = "https://github.com/py-pdf/pypdf/files/12394781/2210.03142-1.pdf" - name = "iss2103.pdf" - writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name))) - writer.remove_images() - assert ( - len( - [ - o.get_object() - for o in writer.pages[0]["/Resources"]["/XObject"]["/Fm1"][ - "/Resources" - ]["/XObject"]["/Im1"]["/Resources"]["/XObject"].values() - if not isinstance(o.get_object(), NullObject) - ] - ) - == 0 - ) - - -@pytest.mark.parametrize( - "input_path", - [ - "side-by-side-subfig.pdf", - "reportlab-inline-image.pdf", - ], -) -def test_remove_text(input_path, pdf_file_path): - pdf_path = RESOURCE_ROOT / input_path - - reader = PdfReader(pdf_path) - writer = PdfWriter() - - page = reader.pages[0] - writer.insert_page(page, 0) - writer.remove_text() - - # finally, write "output" to pypdf-output.pdf - with open(pdf_file_path, "wb") as output_stream: - writer.write(output_stream) - - -def test_remove_text_all_operators(pdf_file_path): - stream = ( - b"BT " - b"/F0 36 Tf " - b"50 706 Td " - b"36 TL " - b"(The Tj operator) Tj " - b'1 2 (The double quote operator) " ' - b"(The single quote operator) ' " - b"ET" - ) - pdf_data = ( - b"%%PDF-1.7\n" - b"1 0 obj << /Count 1 /Kids [5 0 R] /Type /Pages >> endobj\n" - b"2 0 obj << >> endobj\n" - b"3 0 obj << >> endobj\n" - b"4 0 obj << /Length %d >>\n" - b"stream\n" + (b"%s\n" % stream) + b"endstream\n" - b"endobj\n" - b"5 0 obj << /Contents 4 0 R /CropBox [0.0 0.0 2550.0 3508.0]\n" - b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R" - b" /Resources << /Font << >> >>" - b" /Rotate 0 /Type /Page >> endobj\n" - b"6 0 obj << /Pages 1 0 R /Type /Catalog >> endobj\n" - 
b"xref 1 6\n" - b"%010d 00000 n\n" - b"%010d 00000 n\n" - b"%010d 00000 n\n" - b"%010d 00000 n\n" - b"%010d 00000 n\n" - b"%010d 00000 n\n" - b"trailer << /Root 6 0 R /Size 6 >>\n" - b"startxref\n%d\n" - b"%%%%EOF" - ) - startx_correction = -1 - pdf_data = pdf_data % ( - len(stream), - pdf_data.find(b"1 0 obj") + startx_correction, - pdf_data.find(b"2 0 obj") + startx_correction, - pdf_data.find(b"3 0 obj") + startx_correction, - pdf_data.find(b"4 0 obj") + startx_correction, - pdf_data.find(b"5 0 obj") + startx_correction, - pdf_data.find(b"6 0 obj") + startx_correction, - # startx_correction should be -1 due to double % at the beginning - # inducing an error on startxref computation - pdf_data.find(b"xref"), - ) - pdf_stream = BytesIO(pdf_data) - - reader = PdfReader(pdf_stream, strict=False) - writer = PdfWriter() - - page = reader.pages[0] - writer.insert_page(page, 0) - writer.remove_text() - - # finally, write "output" to pypdf-output.pdf - with open(pdf_file_path, "wb") as output_stream: - writer.write(output_stream) - - -def test_write_metadata(pdf_file_path): - pdf_path = RESOURCE_ROOT / "crazyones.pdf" - - reader = PdfReader(pdf_path) - writer = PdfWriter() - - writer.add_page(reader.pages[0]) - for page in reader.pages: - writer.add_page(page) - - metadata = reader.metadata - writer.add_metadata(metadata) - - writer.add_metadata({"/Title": "The Crazy Ones"}) - - # finally, write data to pypdf-output.pdf - with open(pdf_file_path, "wb") as output_stream: - writer.write(output_stream) - - # Check if the title was set - reader = PdfReader(pdf_file_path) - metadata = reader.metadata - assert metadata.get("/Title") == "The Crazy Ones" - - -def test_fill_form(pdf_file_path): - reader = PdfReader(RESOURCE_ROOT / "form.pdf") - writer = PdfWriter() - - writer.append(reader, [0]) - writer.append(RESOURCE_ROOT / "crazyones.pdf", [0]) - - writer.update_page_form_field_values( - writer.pages[0], {"foo": "some filled in text"}, flags=1 - ) - - # check if no fields to 
fill in the page - writer.update_page_form_field_values( - writer.pages[1], {"foo": "some filled in text"}, flags=1 - ) - - writer.update_page_form_field_values( - writer.pages[0], {"foo": "some filled in text"} - ) - - # write "output" to pypdf-output.pdf - with open(pdf_file_path, "wb") as output_stream: - writer.write(output_stream) - - -def test_fill_form_with_qualified(): - reader = PdfReader(RESOURCE_ROOT / "form.pdf") - reader.add_form_topname("top") - - writer = PdfWriter() - writer.clone_document_from_reader(reader) - writer.add_page(reader.pages[0]) - writer.update_page_form_field_values( - writer.pages[0], {"top.foo": "filling"}, flags=1 - ) - b = BytesIO() - writer.write(b) - - reader2 = PdfReader(b) - fields = reader2.get_fields() - assert fields["top.foo"]["/V"] == "filling" - - -@pytest.mark.parametrize( - ("use_128bit", "user_password", "owner_password"), - [(True, "userpwd", "ownerpwd"), (False, "userpwd", "ownerpwd")], -) -def test_encrypt(use_128bit, user_password, owner_password, pdf_file_path): - reader = PdfReader(RESOURCE_ROOT / "form.pdf") - writer = PdfWriter() - - page = reader.pages[0] - orig_text = page.extract_text() - - writer.add_page(page) - - with pytest.raises(ValueError, match="owner_pwd of encrypt is deprecated."): - writer.encrypt( - owner_pwd=user_password, - owner_password=owner_password, - user_password=user_password, - use_128bit=use_128bit, - ) - with pytest.raises(ValueError, match="'user_pwd' argument is deprecated"): - writer.encrypt( - owner_password=owner_password, - user_password=user_password, - user_pwd=user_password, - use_128bit=use_128bit, - ) - writer.encrypt( - user_password=user_password, - owner_password=owner_password, - use_128bit=use_128bit, - ) - - # write "output" to pypdf-output.pdf - with open(pdf_file_path, "wb") as output_stream: - writer.write(output_stream) - - # Test that the data is not there in clear text - with open(pdf_file_path, "rb") as input_stream: - data = input_stream.read() - assert 
b"foo" not in data - - # Test the user password (str): - reader = PdfReader(pdf_file_path, password="userpwd") - new_text = reader.pages[0].extract_text() - assert reader.metadata.get("/Producer") == "pypdf" - assert new_text == orig_text - - # Test the owner password (str): - reader = PdfReader(pdf_file_path, password="ownerpwd") - new_text = reader.pages[0].extract_text() - assert reader.metadata.get("/Producer") == "pypdf" - assert new_text == orig_text - - # Test the user password (bytes): - reader = PdfReader(pdf_file_path, password=b"userpwd") - new_text = reader.pages[0].extract_text() - assert reader.metadata.get("/Producer") == "pypdf" - assert new_text == orig_text - - # Test the owner password (stbytesr): - reader = PdfReader(pdf_file_path, password=b"ownerpwd") - new_text = reader.pages[0].extract_text() - assert reader.metadata.get("/Producer") == "pypdf" - assert new_text == orig_text - - -def test_add_outline_item(pdf_file_path): - reader = PdfReader(RESOURCE_ROOT / "pdflatex-outline.pdf") - writer = PdfWriter() - - for page in reader.pages: - writer.add_page(page) - - outline_item = writer.add_outline_item( - "An outline item", - 1, - None, - (255, 0, 15), - True, - True, - Fit.fit(), - is_open=False, - ) - _o2a = writer.add_outline_item( - "Another", 2, outline_item, None, False, False, Fit.fit() - ) - _o2b = writer.add_outline_item( - "Another bis", 2, outline_item, None, False, False, Fit.fit() - ) - outline_item2 = writer.add_outline_item( - "An outline item 2", - 1, - None, - (255, 0, 15), - True, - True, - Fit.fit(), - is_open=True, - ) - _o3a = writer.add_outline_item( - "Another 2", 2, outline_item2, None, False, False, Fit.fit() - ) - _o3b = writer.add_outline_item( - "Another 2bis", 2, outline_item2, None, False, False, Fit.fit() - ) - - # write "output" to pypdf-output.pdf - with open(pdf_file_path, "w+b") as output_stream: - writer.write(output_stream) - output_stream.seek(0) - reader = PdfReader(output_stream) - assert 
reader.trailer["/Root"]["/Outlines"]["/Count"] == 3 - assert reader.outline[0]["/Count"] == -2 - assert reader.outline[0]["/%is_open%"] == False # noqa - assert reader.outline[2]["/Count"] == 2 - assert reader.outline[2]["/%is_open%"] == True # noqa - assert reader.outline[1][0]["/Count"] == 0 - - -def test_add_named_destination(pdf_file_path): - reader = PdfReader(RESOURCE_ROOT / "pdflatex-outline.pdf") - writer = PdfWriter() - assert writer.get_named_dest_root() == [] - - for page in reader.pages: - writer.add_page(page) - - assert writer.get_named_dest_root() == [] - - writer.add_named_destination(TextStringObject("A named dest"), 2) - writer.add_named_destination(TextStringObject("A named dest2"), 2) - - with pytest.warns(DeprecationWarning, match="pagenum is deprecated as an argument"): - writer.add_named_destination(TextStringObject("A named dest3"), pagenum=2) - - with pytest.raises(ValueError): - writer.add_named_destination( - TextStringObject("A named dest3"), pagenum=2, page_number=2 - ) - - root = writer.get_named_dest_root() - assert root[0] == "A named dest" - assert root[1].pdf == writer - assert root[1].get_object()["/S"] == NameObject("/GoTo") - assert root[1].get_object()["/D"][0] == writer.pages[2].indirect_reference - assert root[2] == "A named dest2" - assert root[3].pdf == writer - assert root[3].get_object()["/S"] == NameObject("/GoTo") - assert root[3].get_object()["/D"][0] == writer.pages[2].indirect_reference - assert root[4] == "A named dest3" - - # test get_object - - assert writer.get_object(root[1].idnum) == writer.get_object(root[1]) - with pytest.raises(ValueError) as exc: - writer.get_object(reader.pages[0].indirect_reference) - assert exc.value.args[0] == "pdf must be self" - - # write "output" to pypdf-output.pdf - with open(pdf_file_path, "wb") as output_stream: - writer.write(output_stream) - - -def test_add_named_destination_sort_order(pdf_file_path): - """ - Issue #1927 does not appear. 
- - add_named_destination() maintains the named destination list sort order - """ - writer = PdfWriter() - - assert writer.get_named_dest_root() == [] - - writer.add_blank_page(200, 200) - writer.add_named_destination("b", 0) - # "a" should be moved before "b" on insert - writer.add_named_destination("a", 0) - - root = writer.get_named_dest_root() - - assert len(root) == 4 - assert ( - root[0] == "a" - ), '"a" was not inserted before "b" in the named destination root' - assert root[2] == "b" - - # write "output" to pypdf-output.pdf - with open(pdf_file_path, "wb") as output_stream: - writer.write(output_stream) - - -def test_add_uri(pdf_file_path): - reader = PdfReader(RESOURCE_ROOT / "pdflatex-outline.pdf") - writer = PdfWriter() - - for page in reader.pages: - writer.add_page(page) - - writer.add_uri( - 1, - "http://www.example.com", - RectangleObject([0, 0, 100, 100]), - border=[1, 2, 3, [4]], - ) - writer.add_uri( - 2, - "https://pypdf.readthedocs.io/en/latest/", - RectangleObject([20, 30, 50, 80]), - border=[1, 2, 3], - ) - writer.add_uri( - 3, - "https://pypdf.readthedocs.io/en/latest/user/adding-pdf-annotations.html", - "[ 200 300 250 350 ]", - border=[0, 0, 0], - ) - writer.add_uri( - 3, - "https://pypdf.readthedocs.io/en/latest/user/adding-pdf-annotations.html", - [100, 200, 150, 250], - border=[0, 0, 0], - ) - - # write "output" to pypdf-output.pdf - with open(pdf_file_path, "wb") as output_stream: - writer.write(output_stream) - - -def test_add_link(pdf_file_path): - reader = PdfReader(RESOURCE_ROOT / "pdflatex-outline.pdf") - writer = PdfWriter() - - for page in reader.pages: - writer.add_page(page) - - with pytest.raises( - DeprecationError, - match=( - re.escape( - "add_link is deprecated and was removed in pypdf 3.0.0. " - "Use add_annotation(pypdf.annotations.Link(...)) instead." 
- ) - ), - ): - writer.add_link( - 1, - 2, - RectangleObject([0, 0, 100, 100]), - border=[1, 2, 3, [4]], - fit="/Fit", - ) - writer.add_link( - 2, 3, RectangleObject([20, 30, 50, 80]), [1, 2, 3], "/FitH", None - ) - writer.add_link( - 3, - 0, - "[ 200 300 250 350 ]", - [0, 0, 0], - "/XYZ", - 0, - 0, - 2, - ) - writer.add_link( - 3, - 0, - [100, 200, 150, 250], - border=[0, 0, 0], - ) - - # write "output" to pypdf-output.pdf - with open(pdf_file_path, "wb") as output_stream: - writer.write(output_stream) - - -def test_io_streams(): - """This is the example from the docs ("Streaming data").""" - filepath = RESOURCE_ROOT / "pdflatex-outline.pdf" - with open(filepath, "rb") as fh: - bytes_stream = BytesIO(fh.read()) - - # Read from bytes stream - reader = PdfReader(bytes_stream) - assert len(reader.pages) == 4 - - # Write to bytes stream - writer = PdfWriter() - with BytesIO() as output_stream: - writer.write(output_stream) - - -def test_regression_issue670(pdf_file_path): - filepath = RESOURCE_ROOT / "crazyones.pdf" - reader = PdfReader(filepath, strict=False) - for _ in range(2): - writer = PdfWriter() - writer.add_page(reader.pages[0]) - with open(pdf_file_path, "wb") as f_pdf: - writer.write(f_pdf) - - -def test_issue301(): - """Test with invalid stream length object.""" - with open(RESOURCE_ROOT / "issue-301.pdf", "rb") as f: - reader = PdfReader(f) - writer = PdfWriter() - writer.append_pages_from_reader(reader) - b = BytesIO() - writer.write(b) - - -def test_append_pages_from_reader_append(): - """Use append_pages_from_reader with a callable.""" - with open(RESOURCE_ROOT / "issue-301.pdf", "rb") as f: - reader = PdfReader(f) - writer = PdfWriter() - writer.append_pages_from_reader(reader, callable) - b = BytesIO() - writer.write(b) - - -@pytest.mark.enable_socket() -@pytest.mark.slow() -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_sweep_indirect_references_nullobject_exception(pdf_file_path): - # TODO: Check this more closely... 
this looks weird - url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" - name = "tika-924666.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - merger = PdfMerger() - merger.append(reader) - merger.write(pdf_file_path) - - -@pytest.mark.enable_socket() -@pytest.mark.slow() -@pytest.mark.parametrize( - ("url", "name"), - [ - ( - "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf", - "test_sweep_indirect_references_nullobject_exception.pdf", - ), - ( - "https://corpora.tika.apache.org/base/docs/govdocs1/922/922840.pdf", - "test_write_outline_item_on_page_fitv.pdf", - ), - ("https://github.com/py-pdf/pypdf/files/10715624/test.pdf", "iss1627.pdf"), - ], -) -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_some_appends(pdf_file_path, url, name): - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - # PdfMerger - merger = PdfMerger() - merger.append(reader) - merger.write(pdf_file_path) - # PdfWriter - merger = PdfWriter() - merger.append(reader) - merger.write(pdf_file_path) - - -def test_pdf_header(): - writer = PdfWriter() - assert writer.pdf_header == b"%PDF-1.3" - - reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") - writer.add_page(reader.pages[0]) - assert writer.pdf_header == b"%PDF-1.5" - - writer.pdf_header = b"%PDF-1.6" - assert writer.pdf_header == b"%PDF-1.6" - - -def test_write_dict_stream_object(pdf_file_path): - stream = ( - b"BT " - b"/F0 36 Tf " - b"50 706 Td " - b"36 TL " - b"(The Tj operator) Tj " - b'1 2 (The double quote operator) " ' - b"(The single quote operator) ' " - b"ET" - ) - - stream_object = StreamObject() - stream_object[NameObject("/Type")] = NameObject("/Text") - stream_object._data = stream - - writer = PdfWriter() - - page_object = PageObject.create_blank_page(writer, 1000, 1000) - # Construct dictionary object (PageObject) with stream object - # Writer will replace this stream object with indirect object - page_object[NameObject("/Test")] = 
stream_object - - page_object = writer.add_page(page_object) - with open(pdf_file_path, "wb") as fp: - writer.write(fp) - - for k, v in page_object.items(): - if k == "/Test": - assert str(v) != str(stream_object) - assert isinstance(v, IndirectObject) - assert str(v.get_object()) == str(stream_object) - break - else: - pytest.fail("/Test not found") - - # Check that every key in _idnum_hash is correct - objects_hash = [o.hash_value() for o in writer._objects] - for k, v in writer._idnum_hash.items(): - assert v.pdf == writer - assert k in objects_hash, "Missing %s" % v - - -def test_add_single_annotation(pdf_file_path): - pdf_path = RESOURCE_ROOT / "crazyones.pdf" - reader = PdfReader(pdf_path) - page = reader.pages[0] - writer = PdfWriter() - writer.add_page(page) - - annot_dict = { - "/Type": "/Annot", - "/Subtype": "/Text", - "/Rect": [270.75, 596.25, 294.75, 620.25], - "/Contents": "Note in second paragraph", - "/C": [1, 1, 0], - "/M": "D:20220406191858+02'00", - "/Popup": { - "/Type": "/Annot", - "/Subtype": "/Popup", - "/Rect": [294.75, 446.25, 494.75, 596.25], - "/M": "D:20220406191847+02'00", - }, - "/T": "moose", - } - writer.add_annotation(0, annot_dict) - - # Inspect manually by adding 'assert False' and viewing the PDF - with open(pdf_file_path, "wb") as fp: - writer.write(fp) - - -def test_deprecation_bookmark_decorator(): - reader = PdfReader(RESOURCE_ROOT / "outlines-with-invalid-destinations.pdf") - page = reader.pages[0] - outline_item = reader.outline[0] - writer = PdfWriter() - writer.add_page(page) - with pytest.raises( - DeprecationError, - match="bookmark is deprecated as an argument. 
Use outline_item instead", - ): - writer.add_outline_item_dict(bookmark=outline_item) - - -@pytest.mark.samples() -def test_colors_in_outline_item(pdf_file_path): - reader = PdfReader(SAMPLE_ROOT / "004-pdflatex-4-pages/pdflatex-4-pages.pdf") - writer = PdfWriter() - writer.clone_document_from_reader(reader) - purple_rgb = (0.5019607843137255, 0.0, 0.5019607843137255) - writer.add_outline_item("First Outline Item", page_number=2, color="800080") - writer.add_outline_item("Second Outline Item", page_number=3, color="#800080") - writer.add_outline_item("Third Outline Item", page_number=4, color=purple_rgb) - - with open(pdf_file_path, "wb") as f: - writer.write(f) - - reader2 = PdfReader(pdf_file_path) - for outline_item in reader2.outline: - # convert float to string because of mutability - assert ["%.5f" % c for c in outline_item.color] == [ - "%.5f" % p for p in purple_rgb - ] - - -@pytest.mark.samples() -def test_write_empty_stream(): - reader = PdfReader(SAMPLE_ROOT / "004-pdflatex-4-pages/pdflatex-4-pages.pdf") - writer = PdfWriter() - writer.clone_document_from_reader(reader) - - with pytest.raises(ValueError) as exc: - writer.write("") - assert exc.value.args[0] == "Output(stream=) is empty." 
- - -def test_startup_dest(): - pdf_file_writer = PdfWriter() - pdf_file_writer.append_pages_from_reader(PdfReader(RESOURCE_ROOT / "issue-604.pdf")) - - assert pdf_file_writer.open_destination is None - pdf_file_writer.open_destination = pdf_file_writer.pages[9] - # checked also using Acrobrat to verify the good page is opened - op = pdf_file_writer._root_object["/OpenAction"] - assert op[0] == pdf_file_writer.pages[9].indirect_reference - assert op[1] == "/Fit" - op = pdf_file_writer.open_destination - assert op.raw_get("/Page") == pdf_file_writer.pages[9].indirect_reference - assert op["/Type"] == "/Fit" - pdf_file_writer.open_destination = op - assert pdf_file_writer.open_destination == op - - # irrelevant, just for coverage - pdf_file_writer._root_object[NameObject("/OpenAction")][0] = NumberObject(0) - pdf_file_writer.open_destination - with pytest.raises(Exception) as exc: - del pdf_file_writer._root_object[NameObject("/OpenAction")][0] - pdf_file_writer.open_destination - assert "Invalid Destination" in str(exc.value) - - pdf_file_writer.open_destination = "Test" - # checked also using Acrobrat to verify open_destination - op = pdf_file_writer._root_object["/OpenAction"] - assert isinstance(op, TextStringObject) - assert op == "Test" - op = pdf_file_writer.open_destination - assert isinstance(op, TextStringObject) - assert op == "Test" - - # irrelevant, this is just for coverage - pdf_file_writer._root_object[NameObject("/OpenAction")] = NumberObject(0) - assert pdf_file_writer.open_destination is None - pdf_file_writer.open_destination = None - assert "/OpenAction" not in pdf_file_writer._root_object - pdf_file_writer.open_destination = None - - -@pytest.mark.enable_socket() -def test_iss471(): - url = "https://github.com/py-pdf/pypdf/files/9139245/book.pdf" - name = "book_471.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - - writer = PdfWriter() - writer.append(reader, excluded_fields=[]) - assert isinstance( - 
writer.pages[0]["/Annots"][0].get_object()["/Dest"], TextStringObject - ) - - -@pytest.mark.enable_socket() -def test_reset_translation(): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" - name = "tika-924666.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer = PdfWriter() - writer.append(reader, (0, 10)) - nb = len(writer._objects) - writer.append(reader, (0, 10)) - assert ( - len(writer._objects) == nb + 11 - ) # +10 (pages) +1 because of the added outline - nb += 1 - writer.reset_translation(reader) - writer.append(reader, (0, 10)) - assert len(writer._objects) >= nb + 200 - nb = len(writer._objects) - writer.reset_translation(reader.pages[0].indirect_reference) - writer.append(reader, (0, 10)) - assert len(writer._objects) >= nb + 200 - nb = len(writer._objects) - writer.reset_translation() - writer.append(reader, (0, 10)) - assert len(writer._objects) >= nb + 200 - nb = len(writer.pages) - writer.append(reader, [reader.pages[0], reader.pages[0]]) - assert len(writer.pages) == nb + 2 - - -def test_threads_empty(): - writer = PdfWriter() - thr = writer.threads - assert isinstance(thr, ArrayObject) - assert len(thr) == 0 - thr2 = writer.threads - assert thr == thr2 - - -@pytest.mark.enable_socket() -def test_append_without_annots_and_articles(): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" - name = "tika-924666.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer = PdfWriter() - writer.append(reader, None, (0, 10), True, ["/B"]) - writer.reset_translation() - writer.append(reader, (0, 10), True, ["/B"]) - assert writer.threads == [] - writer = PdfWriter() - writer.append(reader, None, (0, 10), True, ["/Annots"]) - assert "/Annots" not in writer.pages[5] - writer = PdfWriter() - writer.append(reader, None, (0, 10), True, []) - assert "/Annots" in writer.pages[5] - assert len(writer.threads) >= 1 - - -@pytest.mark.enable_socket() -def 
test_append_multiple(): - url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" - name = "tika-924666.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer = PdfWriter() - writer.append( - reader, [0, 0, 0] - ) # to demonstre multiple insertion of same page at once - writer.append(reader, [0, 0, 0]) # second pack - pages = writer._root_object["/Pages"]["/Kids"] - assert pages[0] not in pages[1:] # page not repeated - assert pages[-1] not in pages[0:-1] # page not repeated - - -@pytest.mark.samples() -def test_set_page_label(pdf_file_path): - src = RESOURCE_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf" # File without labels - reader = PdfReader(src) - - expected = [ - "i", - "ii", - "1", - "2", - "A", - "B", - "1", - "2", - "3", - "4", - "A", - "i", - "I", - "II", - "1", - "2", - "3", - "I", - "II", - ] - - # Tests full lenght with labels assigned at first and last elements - # Tests different labels assigned to consecutive ranges - writer = PdfWriter() - writer.clone_document_from_reader(reader) - writer.set_page_label(0, 1, "/r") - writer.set_page_label(4, 5, "/A") - writer.set_page_label(10, 10, "/A") - writer.set_page_label(11, 11, "/r") - writer.set_page_label(12, 13, "/R") - writer.set_page_label(17, 18, "/R") - writer.write(pdf_file_path) - assert PdfReader(pdf_file_path).page_labels == expected - - writer = PdfWriter() # Same labels, different set order - writer.clone_document_from_reader(reader) - writer.set_page_label(17, 18, "/R") - writer.set_page_label(4, 5, "/A") - writer.set_page_label(10, 10, "/A") - writer.set_page_label(0, 1, "/r") - writer.set_page_label(12, 13, "/R") - writer.set_page_label(11, 11, "/r") - writer.write(pdf_file_path) - assert PdfReader(pdf_file_path).page_labels == expected - - # Tests labels assigned only in the middle - # Tests label assigned to a range already containing labled ranges - expected = ["1", "2", "i", "ii", "iii", "iv", "v", "1"] - writer = PdfWriter() - 
writer.clone_document_from_reader(reader) - writer.set_page_label(3, 4, "/a") - writer.set_page_label(5, 5, "/A") - writer.set_page_label(2, 6, "/r") - writer.write(pdf_file_path) - assert PdfReader(pdf_file_path).page_labels[: len(expected)] == expected - - # Tests labels assigned inside a previously existing range - expected = ["1", "2", "i", "a", "b", "A", "1", "1", "2"] - # Ones repeat because user didnt cover the entire original range - writer = PdfWriter() - writer.clone_document_from_reader(reader) - writer.set_page_label(2, 6, "/r") - writer.set_page_label(3, 4, "/a") - writer.set_page_label(5, 5, "/A") - writer.write(pdf_file_path) - assert PdfReader(pdf_file_path).page_labels[: len(expected)] == expected - - # Tests invalid user input - writer = PdfWriter() - writer.clone_document_from_reader(reader) - with pytest.raises( - ValueError, match="at least one between style and prefix must be given" - ): - writer.set_page_label(0, 5, start=2) - with pytest.raises( - ValueError, match="page_index_from must be equal or greater then 0" - ): - writer.set_page_label(-1, 5, "/r") - with pytest.raises( - ValueError, match="page_index_to must be equal or greater then page_index_from" - ): - writer.set_page_label(5, 0, "/r") - with pytest.raises(ValueError, match="page_index_to exceeds number of pages"): - writer.set_page_label(0, 19, "/r") - with pytest.raises( - ValueError, match="if given, start must be equal or greater than one" - ): - writer.set_page_label(0, 5, "/r", start=-1) - - pdf_file_path.unlink() - - src = ( - SAMPLE_ROOT / "009-pdflatex-geotopo/GeoTopo.pdf" - ) # File with pre existing labels - reader = PdfReader(src) - - # Tests adding labels to existing ones - expected = ["i", "ii", "A", "B", "1"] - writer = PdfWriter() - writer.clone_document_from_reader(reader) - writer.set_page_label(2, 3, "/A") - writer.write(pdf_file_path) - assert PdfReader(pdf_file_path).page_labels[: len(expected)] == expected - - # Tests replacing existing lables - expected = 
["A", "B", "1", "1", "2"] - writer = PdfWriter() - writer.clone_document_from_reader(reader) - writer.set_page_label(0, 1, "/A") - writer.write(pdf_file_path) - assert PdfReader(pdf_file_path).page_labels[: len(expected)] == expected - - pdf_file_path.unlink() - - # Tests prefix and start. - src = RESOURCE_ROOT / "issue-604.pdf" # File without page labels - reader = PdfReader(src) - writer = PdfWriter() - writer.clone_document_from_reader(reader) - - writer.set_page_label(0, 0, prefix="FRONT") - writer.set_page_label(1, 2, "/D", start=2) - writer.set_page_label(3, 6, prefix="UPDATES") - writer.set_page_label(7, 10, "/D", prefix="THYR-") - writer.set_page_label(11, 21, "/D", prefix="PAP-") - writer.set_page_label(22, 30, "/D", prefix="FOLL-") - writer.set_page_label(31, 39, "/D", prefix="HURT-") - writer.write(pdf_file_path) - - -@pytest.mark.enable_socket() -def test_iss1601(): - url = "https://github.com/py-pdf/pypdf/files/10579503/badges-38.pdf" - name = "badge-38.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - original_cs_operations = ContentStream( - reader.pages[0].get_contents(), reader - ).operations - writer = PdfWriter() - page_1 = writer.add_blank_page( - reader.pages[0].mediabox[2], reader.pages[0].mediabox[3] - ) - page_1.merge_transformed_page(reader.pages[0], Transformation()) - page_1_cs_operations = page_1.get_contents().operations - assert is_sublist(original_cs_operations, page_1_cs_operations) - page_1 = writer.add_blank_page( - reader.pages[0].mediabox[2], reader.pages[0].mediabox[3] - ) - page_1.merge_page(reader.pages[0]) - page_1_cs_operations = page_1.get_contents().operations - assert is_sublist(original_cs_operations, page_1_cs_operations) - - -def test_attachments(): - writer = PdfWriter() - writer.add_blank_page(100, 100) - b = BytesIO() - writer.write(b) - b.seek(0) - reader = PdfReader(b) - b = None - assert reader.attachments == {} - assert reader._list_attachments() == [] - assert reader._get_attachments() == 
{} - to_add = [ - ("foobar.txt", b"foobarcontent"), - ("foobar2.txt", b"foobarcontent2"), - ("foobar2.txt", b"2nd_foobarcontent"), - ] - for name, content in to_add: - writer.add_attachment(name, content) - - b = BytesIO() - writer.write(b) - b.seek(0) - reader = PdfReader(b) - b = None - assert sorted(reader.attachments.keys()) == sorted({name for name, _ in to_add}) - assert reader.attachments == { - "foobar.txt": [b"foobarcontent"], - "foobar2.txt": [b"foobarcontent2", b"2nd_foobarcontent"], - } - assert reader._list_attachments() == [name for name, _ in to_add] - - # We've added the same key twice - hence only 2 and not 3: - att = reader._get_attachments() - assert len(att) == 2 # we have 2 keys, but 3 attachments! - - # The content for foobar.txt is clear and just a single value: - assert att["foobar.txt"] == b"foobarcontent" - - # The content for foobar2.txt is a list! - att = reader._get_attachments("foobar2.txt") - assert len(att) == 1 - assert att["foobar2.txt"] == [b"foobarcontent2", b"2nd_foobarcontent"] - - # Let's do both cases with the public interface: - assert reader.attachments["foobar.txt"][0] == b"foobarcontent" - assert reader.attachments["foobar2.txt"][0] == b"foobarcontent2" - assert reader.attachments["foobar2.txt"][1] == b"2nd_foobarcontent" - - -@pytest.mark.enable_socket() -def test_iss1614(): - # test of an annotation(link) directly stored in the /Annots in the page - url = "https://github.com/py-pdf/pypdf/files/10669995/broke.pdf" - name = "iss1614.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer = PdfWriter() - writer.append(reader) - # test for 2nd error case reported in #1614 - url = "https://github.com/py-pdf/pypdf/files/10696390/broken.pdf" - name = "iss1614.2.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer.append(reader) - - -@pytest.mark.enable_socket() -def test_new_removes(): - # test of an annotation(link) directly stored in the /Annots in the page - url = 
"https://github.com/py-pdf/pypdf/files/10807951/tt.pdf" - name = "iss1650.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - - writer = PdfWriter() - writer.clone_document_from_reader(reader) - writer.remove_images() - b = BytesIO() - writer.write(b) - bb = bytes(b.getbuffer()) - assert b"/Im0 Do" not in bb - assert b"/Fm0 Do" in bb - assert b" TJ" in bb - - writer = PdfWriter() - writer.clone_document_from_reader(reader) - writer.remove_text() - b = BytesIO() - writer.write(b) - bb = bytes(b.getbuffer()) - assert b"/Im0" in bb - assert b"Chap" not in bb - assert b" TJ" not in bb - - url = "https://github.com/py-pdf/pypdf/files/10832029/tt2.pdf" - name = "GeoBaseWithComments.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer.append(reader) - writer.remove_objects_from_page(writer.pages[0], [ObjectDeletionFlag.LINKS]) - assert "/Links" not in [ - a.get_object()["/Subtype"] for a in writer.pages[0]["/Annots"] - ] - writer.remove_objects_from_page(writer.pages[0], ObjectDeletionFlag.ATTACHMENTS) - assert "/FileAttachment" not in [ - a.get_object()["/Subtype"] for a in writer.pages[0]["/Annots"] - ] - - writer.pages[0]["/Annots"].append( - DictionaryObject({NameObject("/Subtype"): TextStringObject("/3D")}) - ) - assert "/3D" in [a.get_object()["/Subtype"] for a in writer.pages[0]["/Annots"]] - writer.remove_objects_from_page(writer.pages[0], ObjectDeletionFlag.OBJECTS_3D) - assert "/3D" not in [a.get_object()["/Subtype"] for a in writer.pages[0]["/Annots"]] - - writer.remove_links() - assert len(writer.pages[0]["/Annots"]) == 0 - assert len(writer.pages[3]["/Annots"]) == 0 - - writer.remove_annotations("/Text") - - -@pytest.mark.enable_socket() -def test_late_iss1654(): - url = "https://github.com/py-pdf/pypdf/files/10935632/bid1.pdf" - name = "bid1.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer = PdfWriter() - writer.clone_document_from_reader(reader) - for p in writer.pages: - 
p.compress_content_streams() - b = BytesIO() - writer.write(b) - - -@pytest.mark.enable_socket() -def test_iss1723(): - # test of an annotation(link) directly stored in the /Annots in the page - url = "https://github.com/py-pdf/pypdf/files/11015242/inputFile.pdf" - name = "iss1723.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer = PdfWriter() - writer.append(reader, (3, 5)) - - -@pytest.mark.enable_socket() -def test_iss1767(): - # test with a pdf which is buggy because the object 389,0 exists 3 times: - # twice to define catalog and one as an XObject inducing a loop when - # cloning - url = "https://github.com/py-pdf/pypdf/files/11138472/test.pdf" - name = "iss1723.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - PdfWriter(clone_from=reader) - - -@pytest.mark.enable_socket() -def test_named_dest_page_number(): - """ - Closes iss471 - tests appending with named destinations as integers - """ - url = "https://github.com/py-pdf/pypdf/files/10704333/central.pdf" - name = "central.pdf" - writer = PdfWriter() - writer.add_blank_page(100, 100) - writer.append(BytesIO(get_data_from_url(url, name=name)), pages=[0, 1, 2]) - assert len(writer._root_object["/Names"]["/Dests"]["/Names"]) == 2 - assert writer._root_object["/Names"]["/Dests"]["/Names"][-1][0] == (1 + 1) - writer.append(BytesIO(get_data_from_url(url, name=name))) - assert len(writer._root_object["/Names"]["/Dests"]["/Names"]) == 6 - writer2 = PdfWriter() - writer2.add_blank_page(100, 100) - dest = writer2.add_named_destination("toto", 0) - dest.get_object()[NameObject("/D")][0] = NullObject() - b = BytesIO() - writer2.write(b) - b.seek(0) - writer.append(b) - assert len(writer._root_object["/Names"]["/Dests"]["/Names"]) == 6 - - -@pytest.mark.parametrize( - ("write_data_here", "needs_cleanup"), - [ - ( - "dont_commit_writer.pdf", - True, - ) - ], -) -def test_update_form_fields(write_data_here, needs_cleanup): - writer = PdfWriter(clone_from=RESOURCE_ROOT / 
"FormTestFromOo.pdf") - writer.update_page_form_field_values( - writer.pages[0], - { - "CheckBox1": "/Yes", - "Text1": "mon Text1", - "Text2": "ligne1\nligne2", - "RadioGroup1": "/2", - "RdoS1": "/", - "Combo1": "!!monCombo!!", - "Liste1": "Liste2", - "Liste2": ["Lst1", "Lst3"], - "DropList1": "DropListe3", - }, - auto_regenerate=False, - ) - del writer.pages[0]["/Annots"][1].get_object()["/AP"]["/N"] - writer.update_page_form_field_values( - writer.pages[0], - {"Text1": "my Text1", "Text2": "ligne1\nligne2\nligne3"}, - auto_regenerate=False, - ) - - writer.write("dont_commit_writer.pdf") - reader = PdfReader("dont_commit_writer.pdf") - flds = reader.get_fields() - assert flds["CheckBox1"]["/V"] == "/Yes" - assert flds["CheckBox1"].indirect_reference.get_object()["/AS"] == "/Yes" - assert ( - b"(my Text1)" - in flds["Text1"].indirect_reference.get_object()["/AP"]["/N"].get_data() - ) - assert flds["Text2"]["/V"] == "ligne1\nligne2\nligne3" - assert ( - b"(ligne3)" - in flds["Text2"].indirect_reference.get_object()["/AP"]["/N"].get_data() - ) - assert flds["RadioGroup1"]["/V"] == "/2" - assert flds["RadioGroup1"]["/Kids"][0].get_object()["/AS"] == "/Off" - assert flds["RadioGroup1"]["/Kids"][1].get_object()["/AS"] == "/2" - assert all(x in flds["Liste2"]["/V"] for x in ["Lst1", "Lst3"]) - - assert all(x in flds["CheckBox1"]["/_States_"] for x in ["/Off", "/Yes"]) - assert all(x in flds["RadioGroup1"]["/_States_"] for x in ["/1", "/2", "/3"]) - assert all(x in flds["Liste1"]["/_States_"] for x in ["Liste1", "Liste2", "Liste3"]) - - if needs_cleanup: - Path(write_data_here).unlink() - - -@pytest.mark.enable_socket() -def test_iss1862(): - # The file here has "/B" entry to define the font in a object below the page - # The excluded field shall be considered only at first level (page) and not - # below - url = "https://github.com/py-pdf/pypdf/files/11708801/intro.pdf" - name = "iss1862.pdf" - writer = PdfWriter() - writer.append(BytesIO(get_data_from_url(url, 
name=name))) - # check that "/B" is in the font - writer.pages[0]["/Resources"]["/Font"]["/F1"]["/CharProcs"]["/B"].get_data() - - -def test_empty_objects_before_cloning(): - pdf_path = RESOURCE_ROOT / "crazyones.pdf" - reader = PdfReader(pdf_path) - writer = PdfWriter(clone_from=reader) - nb_obj_reader = len(reader.xref_objStm) + sum( - len(reader.xref[i]) for i in reader.xref - ) - nb_obj_reader -= 1 # for trailer - nb_obj_reader -= len( - {x: 1 for x, y in reader.xref_objStm.values()} - ) # to remove object streams - assert len(writer._objects) == nb_obj_reader - - -@pytest.mark.enable_socket() -def test_watermark(): - url = "https://github.com/py-pdf/pypdf/files/11985889/bg.pdf" - name = "bgwatermark.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - url = "https://github.com/py-pdf/pypdf/files/11985888/source.pdf" - name = "srcwatermark.pdf" - writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name))) - for p in writer.pages: - p.merge_page(reader.pages[0], over=False) - - assert isinstance(p["/Contents"], ArrayObject) - assert isinstance(p["/Contents"][0], IndirectObject) - - b = BytesIO() - writer.write(b) - assert len(b.getvalue()) < 2.1 * 1024 * 1024 - - -@pytest.mark.enable_socket() -@pytest.mark.timeout(4) -def test_watermarking_speed(): - url = "https://github.com/py-pdf/pypdf/files/11985889/bg.pdf" - name = "bgwatermark.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - url = "https://arxiv.org/pdf/2201.00214.pdf" - name = "2201.00214.pdf" - writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name))) - for p in writer.pages: - p.merge_page(reader.pages[0], over=False) - out_pdf_bytesio = BytesIO() - writer.write(out_pdf_bytesio) - pdf_size_in_mib = len(out_pdf_bytesio.getvalue()) / 1024 / 1024 - assert pdf_size_in_mib < 20 - - -@pytest.mark.enable_socket() -@pytest.mark.skipif(GHOSTSCRIPT_BINARY is None, reason="Requires Ghostscript") -def test_watermark_rendering(tmp_path): - 
"""Ensure the visual appearance of watermarking stays correct.""" - url = "https://github.com/py-pdf/pypdf/files/11985889/bg.pdf" - name = "bgwatermark.pdf" - watermark = PdfReader(BytesIO(get_data_from_url(url, name=name))).pages[0] - url = "https://github.com/py-pdf/pypdf/files/11985888/source.pdf" - name = "srcwatermark.pdf" - page = PdfReader(BytesIO(get_data_from_url(url, name=name))).pages[0] - writer = PdfWriter() - page.merge_page(watermark, over=False) - writer.add_page(page) - - target_png_path = tmp_path / "target.png" - url = "https://github.com/py-pdf/pypdf/assets/96178532/d5c72d0e-7047-4504-bbf6-bc591c80d7c0" - name = "dstwatermark.png" - target_png_path.write_bytes(get_data_from_url(url, name=name)) - - pdf_path = tmp_path / "out.pdf" - png_path = tmp_path / "out.png" - writer.write(pdf_path) - - # False positive: https://github.com/PyCQA/bandit/issues/333 - subprocess.run( - [ # noqa: S603 - GHOSTSCRIPT_BINARY, - "-sDEVICE=pngalpha", - "-o", - png_path, - pdf_path, - ] - ) - assert png_path.is_file() - assert image_similarity(png_path, target_png_path) >= 0.95 - - -@pytest.mark.skipif(GHOSTSCRIPT_BINARY is None, reason="Requires Ghostscript") -def test_watermarking_reportlab_rendering(tmp_path): - """ - This test is showing a rotated+mirrored watermark in pypdf==3.15.4. - - Replacing the generate_base with e.g. the crazyones did not show the issue. 
- """ - base_path = SAMPLE_ROOT / "022-pdfkit/pdfkit.pdf" - watermark_path = SAMPLE_ROOT / "013-reportlab-overlay/reportlab-overlay.pdf" - - reader = PdfReader(base_path) - base_page = reader.pages[0] - watermark = PdfReader(watermark_path).pages[0] - - writer = PdfWriter() - base_page.merge_page(watermark) - writer.add_page(base_page) - - target_png_path = RESOURCE_ROOT / "test_watermarking_reportlab_rendering.png" - pdf_path = tmp_path / "out.pdf" - png_path = tmp_path / "test_watermarking_reportlab_rendering.png" - - writer.write(pdf_path) - # False positive: https://github.com/PyCQA/bandit/issues/333 - subprocess.run( - [ # noqa: S603 - GHOSTSCRIPT_BINARY, - "-r120", - "-sDEVICE=pngalpha", - "-o", - png_path, - pdf_path, - ] - ) - assert png_path.is_file() - assert image_similarity(png_path, target_png_path) >= 0.999 - - -@pytest.mark.enable_socket() -def test_da_missing_in_annot(): - url = "https://github.com/py-pdf/pypdf/files/12136285/Building.Division.Permit.Application.pdf" - name = "BuildingDivisionPermitApplication.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer = PdfWriter(clone_from=reader) - writer.update_page_form_field_values( - writer.pages[0], {"PCN-1": "0"}, auto_regenerate=False - ) - b = BytesIO() - writer.write(b) - reader = PdfReader(BytesIO(b.getvalue())) - ff = reader.get_fields() - # check for autosize processing - assert ( - b"0 Tf" - not in ff["PCN-1"].indirect_reference.get_object()["/AP"]["/N"].get_data() - ) - f2 = writer.get_object(ff["PCN-2"].indirect_reference.idnum) - f2[NameObject("/Parent")] = writer.get_object( - ff["PCN-1"].indirect_reference.idnum - ).indirect_reference - writer.update_page_form_field_values( - writer.pages[0], {"PCN-2": "1"}, auto_regenerate=False - ) - - -def test_missing_fields(pdf_file_path): - reader = PdfReader(RESOURCE_ROOT / "form.pdf") - - writer = PdfWriter() - writer.add_page(reader.pages[0]) - - with pytest.raises(PyPdfError) as exc: - 
writer.update_page_form_field_values( - writer.pages[0], {"foo": "some filled in text"}, flags=1 - ) - assert exc.value.args[0] == "No /AcroForm dictionary in PdfWriter Object" - - writer = PdfWriter() - writer.append(reader, [0]) - del writer._root_object["/AcroForm"]["/Fields"] - with pytest.raises(PyPdfError) as exc: - writer.update_page_form_field_values( - writer.pages[0], {"foo": "some filled in text"}, flags=1 - ) - assert exc.value.args[0] == "No /Fields dictionary in Pdf in PdfWriter Object" - - -def test_missing_info(): - reader = PdfReader(RESOURCE_ROOT / "missing_info.pdf") - - writer = PdfWriter(clone_from=reader) - assert len(writer.pages) == len(reader.pages) - - -@pytest.mark.enable_socket() -def test_germanfields(): - """Cf #2035""" - url = "https://github.com/py-pdf/pypdf/files/12194195/test.pdf" - name = "germanfields.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer = PdfWriter(clone_from=reader) - form_fields = {"Text Box 1": "test æ ø å"} - writer.update_page_form_field_values( - writer.pages[0], form_fields, auto_regenerate=False - ) - bytes_stream = BytesIO() - writer.write(bytes_stream) - bytes_stream.seek(0) - reader2 = PdfReader(bytes_stream) - assert ( - b"test \xe6 \xf8 \xe5" - in reader2.get_fields()["Text Box 1"] - .indirect_reference.get_object()["/AP"]["/N"] - .get_data() - ) - - -@pytest.mark.enable_socket() -def test_no_t_in_articles(): - """Cf #2078""" - url = "https://github.com/py-pdf/pypdf/files/12311735/bad.pdf" - name = "iss2078.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer = PdfWriter() - writer.append(reader) - - -@pytest.mark.enable_socket() -def test_no_i_in_articles(): - """Cf #2089""" - url = "https://github.com/py-pdf/pypdf/files/12352793/kim2002.pdf" - name = "iss2089.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer = PdfWriter() - writer.append(reader) - - -@pytest.mark.enable_socket() -def 
test_damaged_pdf_length_returning_none(): - """ - Cf #140 - https://github.com/py-pdf/pypdf/issues/140#issuecomment-1685380549 - """ - url = "https://github.com/py-pdf/pypdf/files/12168578/bad_pdf_example.pdf" - name = "iss140_bad_pdf.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer = PdfWriter() - writer.append(reader) - - -@pytest.mark.enable_socket() -def test_viewerpreferences(): - """Add Tests for ViewerPreferences""" - url = "https://github.com/py-pdf/pypdf/files/9175966/2015._pb_decode_pg0.pdf" - name = "2015._pb_decode_pg0.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - v = reader.viewer_preferences - assert v.center_window == True # noqa: E712 - writer = PdfWriter(clone_from=reader) - v = writer.viewer_preferences - assert v.center_window == True # noqa: E712 - v.center_window = False - assert ( - writer._root_object["/ViewerPreferences"]["/CenterWindow"] - == False # noqa: E712 - ) - assert v.print_area == "/CropBox" - with pytest.raises(ValueError): - v.non_fullscreen_pagemode = "toto" - with pytest.raises(ValueError): - v.non_fullscreen_pagemode = "/toto" - v.non_fullscreen_pagemode = "/UseOutlines" - assert ( - writer._root_object["/ViewerPreferences"]["/NonFullScreenPageMode"] - == "/UseOutlines" - ) - writer = PdfWriter(clone_from=reader) - v = writer.viewer_preferences - assert v.center_window == True # noqa: E712 - v.center_window = False - assert ( - writer._root_object["/ViewerPreferences"]["/CenterWindow"] - == False # noqa: E712 - ) - - writer = PdfWriter(clone_from=reader) - writer._root_object[NameObject("/ViewerPreferences")] = writer._add_object( - writer._root_object["/ViewerPreferences"] - ) - v = writer.viewer_preferences - v.center_window = False - assert ( - writer._root_object["/ViewerPreferences"]["/CenterWindow"] - == False # noqa: E712 - ) - v.num_copies = 1 - assert v.num_copies == 1 - assert v.print_pagerange is None - with pytest.raises(ValueError): - v.print_pagerange = 
"toto" - v.print_pagerange = ArrayObject() - assert len(v.print_pagerange) == 0 - - writer.create_viewer_preferences() - assert len(writer._root_object["/ViewerPreferences"]) == 0 - writer.viewer_preferences.direction = "/R2L" - assert len(writer._root_object["/ViewerPreferences"]) == 1 - - del reader.trailer["/Root"]["/ViewerPreferences"] - assert reader.viewer_preferences is None - writer = PdfWriter(clone_from=reader) - assert writer.viewer_preferences is None - - -def test_extra_spaces_in_da_text(caplog): - writer = PdfWriter(clone_from=RESOURCE_ROOT / "form.pdf") - t = writer.pages[0]["/Annots"][0].get_object()["/DA"] - t = t.replace("/Helv", "/Helv ") - writer.pages[0]["/Annots"][0].get_object()[NameObject("/DA")] = TextStringObject(t) - writer.update_page_form_field_values( - writer.pages[0], {"foo": "abcd"}, auto_regenerate=False - ) - t = writer.pages[0]["/Annots"][0].get_object()["/AP"]["/N"].get_data() - assert "Font dictionary for not found." not in caplog.text - assert b"/Helv" in t - assert b"(abcd)" in t - - -@pytest.mark.enable_socket() -def test_object_contains_indirect_reference_to_self(): - url = "https://github.com/py-pdf/pypdf/files/12389243/testbook.pdf" - name = "iss2102.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) - writer = PdfWriter() - width, height = 595, 841 - outpage = writer.add_blank_page(width, height) - outpage.merge_page(reader.pages[6]) - writer.append(reader) +"""Test the pypdf._writer module.""" +import re +import shutil +import subprocess +from io import BytesIO +from pathlib import Path + +import pytest + +from pypdf import ( + ObjectDeletionFlag, + PageObject, + PdfMerger, + PdfReader, + PdfWriter, + Transformation, +) +from pypdf.errors import DeprecationError, PageSizeNotDefinedError, PyPdfError +from pypdf.generic import ( + ArrayObject, + ContentStream, + DictionaryObject, + Fit, + IndirectObject, + NameObject, + NullObject, + NumberObject, + RectangleObject, + StreamObject, + TextStringObject, 
+) + +from . import get_data_from_url, is_sublist +from .test_images import image_similarity + +TESTS_ROOT = Path(__file__).parent.resolve() +PROJECT_ROOT = TESTS_ROOT.parent +RESOURCE_ROOT = PROJECT_ROOT / "resources" +SAMPLE_ROOT = Path(PROJECT_ROOT) / "sample-files" +GHOSTSCRIPT_BINARY = shutil.which("gs") + + +def test_writer_exception_non_binary(tmp_path, caplog): + src = RESOURCE_ROOT / "pdflatex-outline.pdf" + + reader = PdfReader(src) + writer = PdfWriter() + writer.add_page(reader.pages[0]) + + with open(tmp_path / "out.txt", "w") as fp, pytest.raises(TypeError): + writer.write_stream(fp) + ending = "to write to is not in binary mode. It may not be written to correctly.\n" + assert caplog.text.endswith(ending) + + +def test_writer_clone(): + src = RESOURCE_ROOT / "pdflatex-outline.pdf" + + reader = PdfReader(src) + writer = PdfWriter(clone_from=reader) + assert len(writer.pages) == 4 + assert "PageObject" in str(type(writer.pages[0])) + + writer = PdfWriter(clone_from=src) + assert len(writer.pages) == 4 + assert "PageObject" in str(type(writer.pages[0])) + + +def test_writer_clone_bookmarks(): + # Arrange + src = RESOURCE_ROOT / "Seige_of_Vicksburg_Sample_OCR-crazyones-merged.pdf" + reader = PdfReader(src) + writer = PdfWriter() + + # Act + test cat + cat = "" + + def cat1(p) -> None: + nonlocal cat + cat += p.__repr__() + + writer.clone_document_from_reader(reader, cat1) + assert "/Page" in cat + assert writer.pages[0].raw_get("/Parent") == writer._pages + writer.add_outline_item("Page 1", 0) + writer.add_outline_item("Page 2", 1) + + # Assert + bytes_stream = BytesIO() + writer.write(bytes_stream) + bytes_stream.seek(0) + reader2 = PdfReader(bytes_stream) + assert len(reader2.pages) == len(reader.pages) + assert len(reader2.outline) == 2 + + # test with append + writer = PdfWriter() + writer.append(reader) + writer.add_outline_item("Page 1", 0) + writer.add_outline_item("Page 2", 1) + + # Assert + bytes_stream = BytesIO() + writer.write(bytes_stream) + 
bytes_stream.seek(0) + reader2 = PdfReader(bytes_stream) + assert len(reader2.pages) == len(reader.pages) + assert len(reader2.outline) == 2 + + +def writer_operate(writer: PdfWriter) -> None: + """ + To test the writer that initialized by each of the four usages. + + Args: + writer: A PdfWriter object + """ + pdf_path = RESOURCE_ROOT / "crazyones.pdf" + pdf_outline_path = RESOURCE_ROOT / "pdflatex-outline.pdf" + + reader = PdfReader(pdf_path) + reader_outline = PdfReader(pdf_outline_path) + + page = reader.pages[0] + with pytest.raises(PageSizeNotDefinedError) as exc: + writer.add_blank_page() + assert exc.value.args == () + writer.insert_page(page, 1) + writer.insert_page(reader_outline.pages[0], 0) + writer.add_outline_item_destination(page) + writer.remove_links() + writer.add_outline_item_destination(page) + oi = writer.add_outline_item( + "An outline item", 0, None, (255, 0, 15), True, True, Fit.fit_box_vertically(10) + ) + writer.add_outline_item( + "The XYZ fit", 0, oi, (255, 0, 15), True, True, Fit.xyz(left=10, top=20, zoom=3) + ) + writer.add_outline_item( + "The XYZ fit no args", 0, oi, (255, 0, 15), True, True, Fit.xyz() + ) + writer.add_outline_item( + "The FitH fit", 0, oi, (255, 0, 15), True, True, Fit.fit_horizontally(top=10) + ) + writer.add_outline_item( + "The FitV fit", 0, oi, (255, 0, 15), True, True, Fit.fit_vertically(left=10) + ) + writer.add_outline_item( + "The FitR fit", + 0, + oi, + (255, 0, 15), + True, + True, + Fit.fit_rectangle(left=10, bottom=20, right=30, top=40), + ) + writer.add_outline_item( + "The FitB fit", 0, oi, (255, 0, 15), True, True, Fit.fit_box() + ) + writer.add_outline_item( + "The FitBH fit", + 0, + oi, + (255, 0, 15), + True, + True, + Fit.fit_box_horizontally(top=10), + ) + writer.add_outline_item( + "The FitBV fit", + 0, + oi, + (255, 0, 15), + True, + True, + Fit.fit_box_vertically(left=10), + ) + writer.add_blank_page() + writer.add_uri(2, "https://example.com", RectangleObject([0, 0, 100, 100])) + with 
pytest.warns( + DeprecationWarning, match="'pagenum' argument of add_uri is deprecated" + ): + writer.add_uri( + 2, "https://example.com", RectangleObject([0, 0, 100, 100]), pagenum=2 + ) + with pytest.raises(DeprecationError): + writer.add_link(2, 1, RectangleObject([0, 0, 100, 100])) + assert writer._get_page_layout() is None + writer.page_layout = "broken" + assert writer.page_layout == "broken" + writer.page_layout = NameObject("/SinglePage") + assert writer._get_page_layout() == "/SinglePage" + assert writer._get_page_mode() is None + writer.set_page_mode("/UseNone") + assert writer._get_page_mode() == "/UseNone" + writer.set_page_mode(NameObject("/UseOC")) + assert writer._get_page_mode() == "/UseOC" + writer.insert_blank_page(width=100, height=100) + writer.insert_blank_page() # without parameters + + writer.remove_images() + + writer.add_metadata(reader.metadata) + writer.add_metadata({"/Author": "Martin Thoma"}) + writer.add_metadata({"/MyCustom": 1234}) + + writer.add_attachment("foobar.gif", b"foobarcontent") + + # Check that every key in _idnum_hash is correct + objects_hash = [o.hash_value() for o in writer._objects] + for k, v in writer._idnum_hash.items(): + assert v.pdf == writer + assert k in objects_hash, f"Missing {v}" + + +tmp_path = "dont_commit_writer.pdf" + + +@pytest.mark.parametrize( + ("write_data_here", "needs_cleanup"), + [ + ("dont_commit_writer.pdf", True), + (Path("dont_commit_writer.pdf"), True), + (BytesIO(), False), + ], +) +def test_writer_operations_by_traditional_usage(write_data_here, needs_cleanup): + writer = PdfWriter() + + writer_operate(writer) + + # finally, write "output" to pypdf-output.pdf + if needs_cleanup: + with open(write_data_here, "wb") as output_stream: + writer.write(output_stream) + else: + output_stream = write_data_here + writer.write(output_stream) + + if needs_cleanup: + Path(write_data_here).unlink() + + +@pytest.mark.parametrize( + ("write_data_here", "needs_cleanup"), + [ + ("dont_commit_writer.pdf", 
True), + (Path("dont_commit_writer.pdf"), True), + (BytesIO(), False), + ], +) +def test_writer_operations_by_semi_traditional_usage(write_data_here, needs_cleanup): + with PdfWriter() as writer: + writer_operate(writer) + + # finally, write "output" to pypdf-output.pdf + if needs_cleanup: + with open(write_data_here, "wb") as output_stream: + writer.write(output_stream) + else: + output_stream = write_data_here + writer.write(output_stream) + + if needs_cleanup: + Path(write_data_here).unlink() + + +@pytest.mark.parametrize( + ("write_data_here", "needs_cleanup"), + [ + ("dont_commit_writer.pdf", True), + (Path("dont_commit_writer.pdf"), True), + (BytesIO(), False), + ], +) +def test_writer_operations_by_semi_new_traditional_usage( + write_data_here, needs_cleanup +): + with PdfWriter() as writer: + writer_operate(writer) + + # finally, write "output" to pypdf-output.pdf + writer.write(write_data_here) + + if needs_cleanup: + Path(write_data_here).unlink() + + +@pytest.mark.parametrize( + ("write_data_here", "needs_cleanup"), + [ + ("dont_commit_writer.pdf", True), + (Path("dont_commit_writer.pdf"), True), + (BytesIO(), False), + ], +) +def test_writer_operation_by_new_usage(write_data_here, needs_cleanup): + # This includes write "output" to pypdf-output.pdf + with PdfWriter(write_data_here) as writer: + writer_operate(writer) + + if needs_cleanup: + Path(write_data_here).unlink() + + +@pytest.mark.parametrize( + "input_path", + [ + "side-by-side-subfig.pdf", + "reportlab-inline-image.pdf", + ], +) +def test_remove_images(pdf_file_path, input_path): + pdf_path = RESOURCE_ROOT / input_path + + reader = PdfReader(pdf_path) + writer = PdfWriter() + + page = reader.pages[0] + writer.insert_page(page, 0) + writer.remove_images() + page_contents_stream = writer.pages[0]["/Contents"]._data + assert len(page_contents_stream.strip()) + + # finally, write "output" to pypdf-output.pdf + with open(pdf_file_path, "wb") as output_stream: + writer.write(output_stream) + + with 
open(pdf_file_path, "rb") as input_stream: + reader = PdfReader(input_stream) + if input_path == "side-by-side-subfig.pdf": + extracted_text = reader.pages[0].extract_text() + assert extracted_text + assert "Lorem ipsum dolor sit amet" in extracted_text + + +@pytest.mark.enable_socket() +def test_remove_images_sub_level(): + """Cf #2035""" + url = "https://github.com/py-pdf/pypdf/files/12394781/2210.03142-1.pdf" + name = "iss2103.pdf" + writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name))) + writer.remove_images() + assert ( + len( + [ + o.get_object() + for o in writer.pages[0]["/Resources"]["/XObject"]["/Fm1"][ + "/Resources" + ]["/XObject"]["/Im1"]["/Resources"]["/XObject"].values() + if not isinstance(o.get_object(), NullObject) + ] + ) + == 0 + ) + + +@pytest.mark.parametrize( + "input_path", + [ + "side-by-side-subfig.pdf", + "reportlab-inline-image.pdf", + ], +) +def test_remove_text(input_path, pdf_file_path): + pdf_path = RESOURCE_ROOT / input_path + + reader = PdfReader(pdf_path) + writer = PdfWriter() + + page = reader.pages[0] + writer.insert_page(page, 0) + writer.remove_text() + + # finally, write "output" to pypdf-output.pdf + with open(pdf_file_path, "wb") as output_stream: + writer.write(output_stream) + + +def test_remove_text_all_operators(pdf_file_path): + stream = ( + b"BT " + b"/F0 36 Tf " + b"50 706 Td " + b"36 TL " + b"(The Tj operator) Tj " + b'1 2 (The double quote operator) " ' + b"(The single quote operator) ' " + b"ET" + ) + pdf_data = ( + b"%%PDF-1.7\n" + b"1 0 obj << /Count 1 /Kids [5 0 R] /Type /Pages >> endobj\n" + b"2 0 obj << >> endobj\n" + b"3 0 obj << >> endobj\n" + b"4 0 obj << /Length %d >>\n" + b"stream\n" + (b"%s\n" % stream) + b"endstream\n" + b"endobj\n" + b"5 0 obj << /Contents 4 0 R /CropBox [0.0 0.0 2550.0 3508.0]\n" + b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R" + b" /Resources << /Font << >> >>" + b" /Rotate 0 /Type /Page >> endobj\n" + b"6 0 obj << /Pages 1 0 R /Type /Catalog >> endobj\n" + 
b"xref 1 6\n" + b"%010d 00000 n\n" + b"%010d 00000 n\n" + b"%010d 00000 n\n" + b"%010d 00000 n\n" + b"%010d 00000 n\n" + b"%010d 00000 n\n" + b"trailer << /Root 6 0 R /Size 6 >>\n" + b"startxref\n%d\n" + b"%%%%EOF" + ) + startx_correction = -1 + pdf_data = pdf_data % ( + len(stream), + pdf_data.find(b"1 0 obj") + startx_correction, + pdf_data.find(b"2 0 obj") + startx_correction, + pdf_data.find(b"3 0 obj") + startx_correction, + pdf_data.find(b"4 0 obj") + startx_correction, + pdf_data.find(b"5 0 obj") + startx_correction, + pdf_data.find(b"6 0 obj") + startx_correction, + # startx_correction should be -1 due to double % at the beginning + # inducing an error on startxref computation + pdf_data.find(b"xref"), + ) + pdf_stream = BytesIO(pdf_data) + + reader = PdfReader(pdf_stream, strict=False) + writer = PdfWriter() + + page = reader.pages[0] + writer.insert_page(page, 0) + writer.remove_text() + + # finally, write "output" to pypdf-output.pdf + with open(pdf_file_path, "wb") as output_stream: + writer.write(output_stream) + + +def test_write_metadata(pdf_file_path): + pdf_path = RESOURCE_ROOT / "crazyones.pdf" + + reader = PdfReader(pdf_path) + writer = PdfWriter() + + writer.add_page(reader.pages[0]) + for page in reader.pages: + writer.add_page(page) + + metadata = reader.metadata + writer.add_metadata(metadata) + + writer.add_metadata({"/Title": "The Crazy Ones"}) + + # finally, write data to pypdf-output.pdf + with open(pdf_file_path, "wb") as output_stream: + writer.write(output_stream) + + # Check if the title was set + reader = PdfReader(pdf_file_path) + metadata = reader.metadata + assert metadata.get("/Title") == "The Crazy Ones" + + +def test_fill_form(pdf_file_path): + reader = PdfReader(RESOURCE_ROOT / "form.pdf") + writer = PdfWriter() + + writer.append(reader, [0]) + writer.append(RESOURCE_ROOT / "crazyones.pdf", [0]) + + writer.update_page_form_field_values( + writer.pages[0], {"foo": "some filled in text"}, flags=1 + ) + + # check if no fields to 
fill in the page + writer.update_page_form_field_values( + writer.pages[1], {"foo": "some filled in text"}, flags=1 + ) + + writer.update_page_form_field_values( + writer.pages[0], {"foo": "some filled in text"} + ) + + # write "output" to pypdf-output.pdf + with open(pdf_file_path, "wb") as output_stream: + writer.write(output_stream) + + +def test_fill_form_with_qualified(): + reader = PdfReader(RESOURCE_ROOT / "form.pdf") + reader.add_form_topname("top") + + writer = PdfWriter() + writer.clone_document_from_reader(reader) + writer.add_page(reader.pages[0]) + writer.update_page_form_field_values( + writer.pages[0], {"top.foo": "filling"}, flags=1 + ) + b = BytesIO() + writer.write(b) + + reader2 = PdfReader(b) + fields = reader2.get_fields() + assert fields["top.foo"]["/V"] == "filling" + + +@pytest.mark.parametrize( + ("use_128bit", "user_password", "owner_password"), + [(True, "userpwd", "ownerpwd"), (False, "userpwd", "ownerpwd")], +) +def test_encrypt(use_128bit, user_password, owner_password, pdf_file_path): + reader = PdfReader(RESOURCE_ROOT / "form.pdf") + writer = PdfWriter() + + page = reader.pages[0] + orig_text = page.extract_text() + + writer.add_page(page) + + with pytest.raises(ValueError, match="owner_pwd of encrypt is deprecated."): + writer.encrypt( + owner_pwd=user_password, + owner_password=owner_password, + user_password=user_password, + use_128bit=use_128bit, + ) + with pytest.raises(ValueError, match="'user_pwd' argument is deprecated"): + writer.encrypt( + owner_password=owner_password, + user_password=user_password, + user_pwd=user_password, + use_128bit=use_128bit, + ) + writer.encrypt( + user_password=user_password, + owner_password=owner_password, + use_128bit=use_128bit, + ) + + # write "output" to pypdf-output.pdf + with open(pdf_file_path, "wb") as output_stream: + writer.write(output_stream) + + # Test that the data is not there in clear text + with open(pdf_file_path, "rb") as input_stream: + data = input_stream.read() + assert 
b"foo" not in data + + # Test the user password (str): + reader = PdfReader(pdf_file_path, password="userpwd") + new_text = reader.pages[0].extract_text() + assert reader.metadata.get("/Producer") == "pypdf" + assert new_text == orig_text + + # Test the owner password (str): + reader = PdfReader(pdf_file_path, password="ownerpwd") + new_text = reader.pages[0].extract_text() + assert reader.metadata.get("/Producer") == "pypdf" + assert new_text == orig_text + + # Test the user password (bytes): + reader = PdfReader(pdf_file_path, password=b"userpwd") + new_text = reader.pages[0].extract_text() + assert reader.metadata.get("/Producer") == "pypdf" + assert new_text == orig_text + + # Test the owner password (stbytesr): + reader = PdfReader(pdf_file_path, password=b"ownerpwd") + new_text = reader.pages[0].extract_text() + assert reader.metadata.get("/Producer") == "pypdf" + assert new_text == orig_text + + +def test_add_outline_item(pdf_file_path): + reader = PdfReader(RESOURCE_ROOT / "pdflatex-outline.pdf") + writer = PdfWriter() + + for page in reader.pages: + writer.add_page(page) + + outline_item = writer.add_outline_item( + "An outline item", + 1, + None, + (255, 0, 15), + True, + True, + Fit.fit(), + is_open=False, + ) + _o2a = writer.add_outline_item( + "Another", 2, outline_item, None, False, False, Fit.fit() + ) + _o2b = writer.add_outline_item( + "Another bis", 2, outline_item, None, False, False, Fit.fit() + ) + outline_item2 = writer.add_outline_item( + "An outline item 2", + 1, + None, + (255, 0, 15), + True, + True, + Fit.fit(), + is_open=True, + ) + _o3a = writer.add_outline_item( + "Another 2", 2, outline_item2, None, False, False, Fit.fit() + ) + _o3b = writer.add_outline_item( + "Another 2bis", 2, outline_item2, None, False, False, Fit.fit() + ) + + # write "output" to pypdf-output.pdf + with open(pdf_file_path, "w+b") as output_stream: + writer.write(output_stream) + output_stream.seek(0) + reader = PdfReader(output_stream) + assert 
reader.trailer["/Root"]["/Outlines"]["/Count"] == 3 + assert reader.outline[0]["/Count"] == -2 + assert reader.outline[0]["/%is_open%"] == False # noqa + assert reader.outline[2]["/Count"] == 2 + assert reader.outline[2]["/%is_open%"] == True # noqa + assert reader.outline[1][0]["/Count"] == 0 + + +def test_add_named_destination(pdf_file_path): + reader = PdfReader(RESOURCE_ROOT / "pdflatex-outline.pdf") + writer = PdfWriter() + assert writer.get_named_dest_root() == [] + + for page in reader.pages: + writer.add_page(page) + + assert writer.get_named_dest_root() == [] + + writer.add_named_destination(TextStringObject("A named dest"), 2) + writer.add_named_destination(TextStringObject("A named dest2"), 2) + + with pytest.warns(DeprecationWarning, match="pagenum is deprecated as an argument"): + writer.add_named_destination(TextStringObject("A named dest3"), pagenum=2) + + with pytest.raises(ValueError): + writer.add_named_destination( + TextStringObject("A named dest3"), pagenum=2, page_number=2 + ) + + root = writer.get_named_dest_root() + assert root[0] == "A named dest" + assert root[1].pdf == writer + assert root[1].get_object()["/S"] == NameObject("/GoTo") + assert root[1].get_object()["/D"][0] == writer.pages[2].indirect_reference + assert root[2] == "A named dest2" + assert root[3].pdf == writer + assert root[3].get_object()["/S"] == NameObject("/GoTo") + assert root[3].get_object()["/D"][0] == writer.pages[2].indirect_reference + assert root[4] == "A named dest3" + + # test get_object + + assert writer.get_object(root[1].idnum) == writer.get_object(root[1]) + with pytest.raises(ValueError) as exc: + writer.get_object(reader.pages[0].indirect_reference) + assert exc.value.args[0] == "pdf must be self" + + # write "output" to pypdf-output.pdf + with open(pdf_file_path, "wb") as output_stream: + writer.write(output_stream) + + +def test_add_named_destination_sort_order(pdf_file_path): + """ + Issue #1927 does not appear. 
+ + add_named_destination() maintains the named destination list sort order + """ + writer = PdfWriter() + + assert writer.get_named_dest_root() == [] + + writer.add_blank_page(200, 200) + writer.add_named_destination("b", 0) + # "a" should be moved before "b" on insert + writer.add_named_destination("a", 0) + + root = writer.get_named_dest_root() + + assert len(root) == 4 + assert ( + root[0] == "a" + ), '"a" was not inserted before "b" in the named destination root' + assert root[2] == "b" + + # write "output" to pypdf-output.pdf + with open(pdf_file_path, "wb") as output_stream: + writer.write(output_stream) + + +def test_add_uri(pdf_file_path): + reader = PdfReader(RESOURCE_ROOT / "pdflatex-outline.pdf") + writer = PdfWriter() + + for page in reader.pages: + writer.add_page(page) + + writer.add_uri( + 1, + "http://www.example.com", + RectangleObject([0, 0, 100, 100]), + border=[1, 2, 3, [4]], + ) + writer.add_uri( + 2, + "https://pypdf.readthedocs.io/en/latest/", + RectangleObject([20, 30, 50, 80]), + border=[1, 2, 3], + ) + writer.add_uri( + 3, + "https://pypdf.readthedocs.io/en/latest/user/adding-pdf-annotations.html", + "[ 200 300 250 350 ]", + border=[0, 0, 0], + ) + writer.add_uri( + 3, + "https://pypdf.readthedocs.io/en/latest/user/adding-pdf-annotations.html", + [100, 200, 150, 250], + border=[0, 0, 0], + ) + + # write "output" to pypdf-output.pdf + with open(pdf_file_path, "wb") as output_stream: + writer.write(output_stream) + + +def test_add_link(pdf_file_path): + reader = PdfReader(RESOURCE_ROOT / "pdflatex-outline.pdf") + writer = PdfWriter() + + for page in reader.pages: + writer.add_page(page) + + with pytest.raises( + DeprecationError, + match=( + re.escape( + "add_link is deprecated and was removed in pypdf 3.0.0. " + "Use add_annotation(pypdf.annotations.Link(...)) instead." 
+ ) + ), + ): + writer.add_link( + 1, + 2, + RectangleObject([0, 0, 100, 100]), + border=[1, 2, 3, [4]], + fit="/Fit", + ) + writer.add_link( + 2, 3, RectangleObject([20, 30, 50, 80]), [1, 2, 3], "/FitH", None + ) + writer.add_link( + 3, + 0, + "[ 200 300 250 350 ]", + [0, 0, 0], + "/XYZ", + 0, + 0, + 2, + ) + writer.add_link( + 3, + 0, + [100, 200, 150, 250], + border=[0, 0, 0], + ) + + # write "output" to pypdf-output.pdf + with open(pdf_file_path, "wb") as output_stream: + writer.write(output_stream) + + +def test_io_streams(): + """This is the example from the docs ("Streaming data").""" + filepath = RESOURCE_ROOT / "pdflatex-outline.pdf" + with open(filepath, "rb") as fh: + bytes_stream = BytesIO(fh.read()) + + # Read from bytes stream + reader = PdfReader(bytes_stream) + assert len(reader.pages) == 4 + + # Write to bytes stream + writer = PdfWriter() + with BytesIO() as output_stream: + writer.write(output_stream) + + +def test_regression_issue670(pdf_file_path): + filepath = RESOURCE_ROOT / "crazyones.pdf" + reader = PdfReader(filepath, strict=False) + for _ in range(2): + writer = PdfWriter() + writer.add_page(reader.pages[0]) + with open(pdf_file_path, "wb") as f_pdf: + writer.write(f_pdf) + + +def test_issue301(): + """Test with invalid stream length object.""" + with open(RESOURCE_ROOT / "issue-301.pdf", "rb") as f: + reader = PdfReader(f) + writer = PdfWriter() + writer.append_pages_from_reader(reader) + b = BytesIO() + writer.write(b) + + +def test_append_pages_from_reader_append(): + """Use append_pages_from_reader with a callable.""" + with open(RESOURCE_ROOT / "issue-301.pdf", "rb") as f: + reader = PdfReader(f) + writer = PdfWriter() + writer.append_pages_from_reader(reader, callable) + b = BytesIO() + writer.write(b) + + +@pytest.mark.enable_socket() +@pytest.mark.slow() +@pytest.mark.filterwarnings("ignore::DeprecationWarning") +def test_sweep_indirect_references_nullobject_exception(pdf_file_path): + # TODO: Check this more closely... 
this looks weird + url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" + name = "tika-924666.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + merger = PdfMerger() + merger.append(reader) + merger.write(pdf_file_path) + + +@pytest.mark.enable_socket() +@pytest.mark.slow() +@pytest.mark.parametrize( + ("url", "name"), + [ + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf", + "test_sweep_indirect_references_nullobject_exception.pdf", + ), + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/922/922840.pdf", + "test_write_outline_item_on_page_fitv.pdf", + ), + ("https://github.com/py-pdf/pypdf/files/10715624/test.pdf", "iss1627.pdf"), + ], +) +@pytest.mark.filterwarnings("ignore::DeprecationWarning") +def test_some_appends(pdf_file_path, url, name): + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + # PdfMerger + merger = PdfMerger() + merger.append(reader) + merger.write(pdf_file_path) + # PdfWriter + merger = PdfWriter() + merger.append(reader) + merger.write(pdf_file_path) + + +def test_pdf_header(): + writer = PdfWriter() + assert writer.pdf_header == b"%PDF-1.3" + + reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") + writer.add_page(reader.pages[0]) + assert writer.pdf_header == b"%PDF-1.5" + + writer.pdf_header = b"%PDF-1.6" + assert writer.pdf_header == b"%PDF-1.6" + + +def test_write_dict_stream_object(pdf_file_path): + stream = ( + b"BT " + b"/F0 36 Tf " + b"50 706 Td " + b"36 TL " + b"(The Tj operator) Tj " + b'1 2 (The double quote operator) " ' + b"(The single quote operator) ' " + b"ET" + ) + + stream_object = StreamObject() + stream_object[NameObject("/Type")] = NameObject("/Text") + stream_object._data = stream + + writer = PdfWriter() + + page_object = PageObject.create_blank_page(writer, 1000, 1000) + # Construct dictionary object (PageObject) with stream object + # Writer will replace this stream object with indirect object + page_object[NameObject("/Test")] = 
stream_object + + page_object = writer.add_page(page_object) + with open(pdf_file_path, "wb") as fp: + writer.write(fp) + + for k, v in page_object.items(): + if k == "/Test": + assert str(v) != str(stream_object) + assert isinstance(v, IndirectObject) + assert str(v.get_object()) == str(stream_object) + break + else: + pytest.fail("/Test not found") + + # Check that every key in _idnum_hash is correct + objects_hash = [o.hash_value() for o in writer._objects] + for k, v in writer._idnum_hash.items(): + assert v.pdf == writer + assert k in objects_hash, "Missing %s" % v + + +def test_add_single_annotation(pdf_file_path): + pdf_path = RESOURCE_ROOT / "crazyones.pdf" + reader = PdfReader(pdf_path) + page = reader.pages[0] + writer = PdfWriter() + writer.add_page(page) + + annot_dict = { + "/Type": "/Annot", + "/Subtype": "/Text", + "/Rect": [270.75, 596.25, 294.75, 620.25], + "/Contents": "Note in second paragraph", + "/C": [1, 1, 0], + "/M": "D:20220406191858+02'00", + "/Popup": { + "/Type": "/Annot", + "/Subtype": "/Popup", + "/Rect": [294.75, 446.25, 494.75, 596.25], + "/M": "D:20220406191847+02'00", + }, + "/T": "moose", + } + writer.add_annotation(0, annot_dict) + + # Inspect manually by adding 'assert False' and viewing the PDF + with open(pdf_file_path, "wb") as fp: + writer.write(fp) + + +def test_deprecation_bookmark_decorator(): + reader = PdfReader(RESOURCE_ROOT / "outlines-with-invalid-destinations.pdf") + page = reader.pages[0] + outline_item = reader.outline[0] + writer = PdfWriter() + writer.add_page(page) + with pytest.raises( + DeprecationError, + match="bookmark is deprecated as an argument. 
Use outline_item instead", + ): + writer.add_outline_item_dict(bookmark=outline_item) + + +@pytest.mark.samples() +def test_colors_in_outline_item(pdf_file_path): + reader = PdfReader(SAMPLE_ROOT / "004-pdflatex-4-pages/pdflatex-4-pages.pdf") + writer = PdfWriter() + writer.clone_document_from_reader(reader) + purple_rgb = (0.5019607843137255, 0.0, 0.5019607843137255) + writer.add_outline_item("First Outline Item", page_number=2, color="800080") + writer.add_outline_item("Second Outline Item", page_number=3, color="#800080") + writer.add_outline_item("Third Outline Item", page_number=4, color=purple_rgb) + + with open(pdf_file_path, "wb") as f: + writer.write(f) + + reader2 = PdfReader(pdf_file_path) + for outline_item in reader2.outline: + # convert float to string because of mutability + assert ["%.5f" % c for c in outline_item.color] == [ + "%.5f" % p for p in purple_rgb + ] + + +@pytest.mark.samples() +def test_write_empty_stream(): + reader = PdfReader(SAMPLE_ROOT / "004-pdflatex-4-pages/pdflatex-4-pages.pdf") + writer = PdfWriter() + writer.clone_document_from_reader(reader) + + with pytest.raises(ValueError) as exc: + writer.write("") + assert exc.value.args[0] == "Output(stream=) is empty." 
+ + +def test_startup_dest(): + pdf_file_writer = PdfWriter() + pdf_file_writer.append_pages_from_reader(PdfReader(RESOURCE_ROOT / "issue-604.pdf")) + + assert pdf_file_writer.open_destination is None + pdf_file_writer.open_destination = pdf_file_writer.pages[9] + # checked also using Acrobrat to verify the good page is opened + op = pdf_file_writer._root_object["/OpenAction"] + assert op[0] == pdf_file_writer.pages[9].indirect_reference + assert op[1] == "/Fit" + op = pdf_file_writer.open_destination + assert op.raw_get("/Page") == pdf_file_writer.pages[9].indirect_reference + assert op["/Type"] == "/Fit" + pdf_file_writer.open_destination = op + assert pdf_file_writer.open_destination == op + + # irrelevant, just for coverage + pdf_file_writer._root_object[NameObject("/OpenAction")][0] = NumberObject(0) + pdf_file_writer.open_destination + with pytest.raises(Exception) as exc: + del pdf_file_writer._root_object[NameObject("/OpenAction")][0] + pdf_file_writer.open_destination + assert "Invalid Destination" in str(exc.value) + + pdf_file_writer.open_destination = "Test" + # checked also using Acrobrat to verify open_destination + op = pdf_file_writer._root_object["/OpenAction"] + assert isinstance(op, TextStringObject) + assert op == "Test" + op = pdf_file_writer.open_destination + assert isinstance(op, TextStringObject) + assert op == "Test" + + # irrelevant, this is just for coverage + pdf_file_writer._root_object[NameObject("/OpenAction")] = NumberObject(0) + assert pdf_file_writer.open_destination is None + pdf_file_writer.open_destination = None + assert "/OpenAction" not in pdf_file_writer._root_object + pdf_file_writer.open_destination = None + + +@pytest.mark.enable_socket() +def test_iss471(): + url = "https://github.com/py-pdf/pypdf/files/9139245/book.pdf" + name = "book_471.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + + writer = PdfWriter() + writer.append(reader, excluded_fields=[]) + assert isinstance( + 
writer.pages[0]["/Annots"][0].get_object()["/Dest"], TextStringObject + ) + + +@pytest.mark.enable_socket() +def test_reset_translation(): + url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" + name = "tika-924666.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter() + writer.append(reader, (0, 10)) + nb = len(writer._objects) + writer.append(reader, (0, 10)) + assert ( + len(writer._objects) == nb + 11 + ) # +10 (pages) +1 because of the added outline + nb += 1 + writer.reset_translation(reader) + writer.append(reader, (0, 10)) + assert len(writer._objects) >= nb + 200 + nb = len(writer._objects) + writer.reset_translation(reader.pages[0].indirect_reference) + writer.append(reader, (0, 10)) + assert len(writer._objects) >= nb + 200 + nb = len(writer._objects) + writer.reset_translation() + writer.append(reader, (0, 10)) + assert len(writer._objects) >= nb + 200 + nb = len(writer.pages) + writer.append(reader, [reader.pages[0], reader.pages[0]]) + assert len(writer.pages) == nb + 2 + + +def test_threads_empty(): + writer = PdfWriter() + thr = writer.threads + assert isinstance(thr, ArrayObject) + assert len(thr) == 0 + thr2 = writer.threads + assert thr == thr2 + + +@pytest.mark.enable_socket() +def test_append_without_annots_and_articles(): + url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" + name = "tika-924666.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter() + writer.append(reader, None, (0, 10), True, ["/B"]) + writer.reset_translation() + writer.append(reader, (0, 10), True, ["/B"]) + assert writer.threads == [] + writer = PdfWriter() + writer.append(reader, None, (0, 10), True, ["/Annots"]) + assert "/Annots" not in writer.pages[5] + writer = PdfWriter() + writer.append(reader, None, (0, 10), True, []) + assert "/Annots" in writer.pages[5] + assert len(writer.threads) >= 1 + + +@pytest.mark.enable_socket() +def 
test_append_multiple(): + url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" + name = "tika-924666.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter() + writer.append( + reader, [0, 0, 0] + ) # to demonstre multiple insertion of same page at once + writer.append(reader, [0, 0, 0]) # second pack + pages = writer._root_object["/Pages"]["/Kids"] + assert pages[0] not in pages[1:] # page not repeated + assert pages[-1] not in pages[0:-1] # page not repeated + + +@pytest.mark.samples() +def test_set_page_label(pdf_file_path): + src = RESOURCE_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf" # File without labels + reader = PdfReader(src) + + expected = [ + "i", + "ii", + "1", + "2", + "A", + "B", + "1", + "2", + "3", + "4", + "A", + "i", + "I", + "II", + "1", + "2", + "3", + "I", + "II", + ] + + # Tests full lenght with labels assigned at first and last elements + # Tests different labels assigned to consecutive ranges + writer = PdfWriter() + writer.clone_document_from_reader(reader) + writer.set_page_label(0, 1, "/r") + writer.set_page_label(4, 5, "/A") + writer.set_page_label(10, 10, "/A") + writer.set_page_label(11, 11, "/r") + writer.set_page_label(12, 13, "/R") + writer.set_page_label(17, 18, "/R") + writer.write(pdf_file_path) + assert PdfReader(pdf_file_path).page_labels == expected + + writer = PdfWriter() # Same labels, different set order + writer.clone_document_from_reader(reader) + writer.set_page_label(17, 18, "/R") + writer.set_page_label(4, 5, "/A") + writer.set_page_label(10, 10, "/A") + writer.set_page_label(0, 1, "/r") + writer.set_page_label(12, 13, "/R") + writer.set_page_label(11, 11, "/r") + writer.write(pdf_file_path) + assert PdfReader(pdf_file_path).page_labels == expected + + # Tests labels assigned only in the middle + # Tests label assigned to a range already containing labled ranges + expected = ["1", "2", "i", "ii", "iii", "iv", "v", "1"] + writer = PdfWriter() + 
writer.clone_document_from_reader(reader) + writer.set_page_label(3, 4, "/a") + writer.set_page_label(5, 5, "/A") + writer.set_page_label(2, 6, "/r") + writer.write(pdf_file_path) + assert PdfReader(pdf_file_path).page_labels[: len(expected)] == expected + + # Tests labels assigned inside a previously existing range + expected = ["1", "2", "i", "a", "b", "A", "1", "1", "2"] + # Ones repeat because user didnt cover the entire original range + writer = PdfWriter() + writer.clone_document_from_reader(reader) + writer.set_page_label(2, 6, "/r") + writer.set_page_label(3, 4, "/a") + writer.set_page_label(5, 5, "/A") + writer.write(pdf_file_path) + assert PdfReader(pdf_file_path).page_labels[: len(expected)] == expected + + # Tests invalid user input + writer = PdfWriter() + writer.clone_document_from_reader(reader) + with pytest.raises( + ValueError, match="at least one between style and prefix must be given" + ): + writer.set_page_label(0, 5, start=2) + with pytest.raises( + ValueError, match="page_index_from must be equal or greater then 0" + ): + writer.set_page_label(-1, 5, "/r") + with pytest.raises( + ValueError, match="page_index_to must be equal or greater then page_index_from" + ): + writer.set_page_label(5, 0, "/r") + with pytest.raises(ValueError, match="page_index_to exceeds number of pages"): + writer.set_page_label(0, 19, "/r") + with pytest.raises( + ValueError, match="if given, start must be equal or greater than one" + ): + writer.set_page_label(0, 5, "/r", start=-1) + + pdf_file_path.unlink() + + src = ( + SAMPLE_ROOT / "009-pdflatex-geotopo/GeoTopo.pdf" + ) # File with pre existing labels + reader = PdfReader(src) + + # Tests adding labels to existing ones + expected = ["i", "ii", "A", "B", "1"] + writer = PdfWriter() + writer.clone_document_from_reader(reader) + writer.set_page_label(2, 3, "/A") + writer.write(pdf_file_path) + assert PdfReader(pdf_file_path).page_labels[: len(expected)] == expected + + # Tests replacing existing lables + expected = 
["A", "B", "1", "1", "2"] + writer = PdfWriter() + writer.clone_document_from_reader(reader) + writer.set_page_label(0, 1, "/A") + writer.write(pdf_file_path) + assert PdfReader(pdf_file_path).page_labels[: len(expected)] == expected + + pdf_file_path.unlink() + + # Tests prefix and start. + src = RESOURCE_ROOT / "issue-604.pdf" # File without page labels + reader = PdfReader(src) + writer = PdfWriter() + writer.clone_document_from_reader(reader) + + writer.set_page_label(0, 0, prefix="FRONT") + writer.set_page_label(1, 2, "/D", start=2) + writer.set_page_label(3, 6, prefix="UPDATES") + writer.set_page_label(7, 10, "/D", prefix="THYR-") + writer.set_page_label(11, 21, "/D", prefix="PAP-") + writer.set_page_label(22, 30, "/D", prefix="FOLL-") + writer.set_page_label(31, 39, "/D", prefix="HURT-") + writer.write(pdf_file_path) + + +@pytest.mark.enable_socket() +def test_iss1601(): + url = "https://github.com/py-pdf/pypdf/files/10579503/badges-38.pdf" + name = "badge-38.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + original_cs_operations = ContentStream( + reader.pages[0].get_contents(), reader + ).operations + writer = PdfWriter() + page_1 = writer.add_blank_page( + reader.pages[0].mediabox[2], reader.pages[0].mediabox[3] + ) + page_1.merge_transformed_page(reader.pages[0], Transformation()) + page_1_cs_operations = page_1.get_contents().operations + assert is_sublist(original_cs_operations, page_1_cs_operations) + page_1 = writer.add_blank_page( + reader.pages[0].mediabox[2], reader.pages[0].mediabox[3] + ) + page_1.merge_page(reader.pages[0]) + page_1_cs_operations = page_1.get_contents().operations + assert is_sublist(original_cs_operations, page_1_cs_operations) + + +def test_attachments(): + writer = PdfWriter() + writer.add_blank_page(100, 100) + b = BytesIO() + writer.write(b) + b.seek(0) + reader = PdfReader(b) + b = None + assert reader.attachments == {} + assert reader._list_attachments() == [] + assert reader._get_attachments() == 
{} + to_add = [ + ("foobar.txt", b"foobarcontent"), + ("foobar2.txt", b"foobarcontent2"), + ("foobar2.txt", b"2nd_foobarcontent"), + ] + for name, content in to_add: + writer.add_attachment(name, content) + + b = BytesIO() + writer.write(b) + b.seek(0) + reader = PdfReader(b) + b = None + assert sorted(reader.attachments.keys()) == sorted({name for name, _ in to_add}) + assert reader.attachments == { + "foobar.txt": [b"foobarcontent"], + "foobar2.txt": [b"2nd_foobarcontent"], + } + writer.add_attachment("foobar2.txt", b"overwrite_ignored", overwrite=False) + assert reader.attachments == { + "foobar.txt": [b"foobarcontent"], + "foobar2.txt": [b"2nd_foobarcontent"], + } + _l = list({name for name, _ in to_add}) + _l.sort() + assert reader._list_attachments() == _l + + # We've added the same key twice - hence only 2 and not 3: + att = reader._get_attachments() + assert len(att) == 2 # we have 2 keys, but 3 attachments! + + # The content for foobar.txt is clear and just a single value: + assert att["foobar.txt"] == b"foobarcontent" + + # The content for foobar2.txt is a list! 
+ att = reader._get_attachments("foobar2.txt") + assert len(att) == 1 + assert att["foobar2.txt"] == [b"2nd_foobarcontent"] + + # Let's do both cases with the public interface: + assert reader.attachments["foobar.txt"][0] == b"foobarcontent" + assert reader.attachments["foobar2.txt"][0] == b"2nd_foobarcontent" + assert len(reader.attachments["foobar2.txt"]) == 1 + + +@pytest.mark.enable_socket() +def test_iss1614(): + # test of an annotation(link) directly stored in the /Annots in the page + url = "https://github.com/py-pdf/pypdf/files/10669995/broke.pdf" + name = "iss1614.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter() + writer.append(reader) + # test for 2nd error case reported in #1614 + url = "https://github.com/py-pdf/pypdf/files/10696390/broken.pdf" + name = "iss1614.2.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer.append(reader) + + +@pytest.mark.enable_socket() +def test_new_removes(): + # test of an annotation(link) directly stored in the /Annots in the page + url = "https://github.com/py-pdf/pypdf/files/10807951/tt.pdf" + name = "iss1650.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + + writer = PdfWriter() + writer.clone_document_from_reader(reader) + writer.remove_images() + b = BytesIO() + writer.write(b) + bb = bytes(b.getbuffer()) + assert b"/Im0 Do" not in bb + assert b"/Fm0 Do" in bb + assert b" TJ" in bb + + writer = PdfWriter() + writer.clone_document_from_reader(reader) + writer.remove_text() + b = BytesIO() + writer.write(b) + bb = bytes(b.getbuffer()) + assert b"/Im0" in bb + assert b"Chap" not in bb + assert b" TJ" not in bb + + url = "https://github.com/py-pdf/pypdf/files/10832029/tt2.pdf" + name = "GeoBaseWithComments.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer.append(reader) + writer.remove_objects_from_page(writer.pages[0], [ObjectDeletionFlag.LINKS]) + assert "/Links" not in [ + 
a.get_object()["/Subtype"] for a in writer.pages[0]["/Annots"] + ] + writer.remove_objects_from_page(writer.pages[0], ObjectDeletionFlag.ATTACHMENTS) + assert "/FileAttachment" not in [ + a.get_object()["/Subtype"] for a in writer.pages[0]["/Annots"] + ] + + writer.pages[0]["/Annots"].append( + DictionaryObject({NameObject("/Subtype"): TextStringObject("/3D")}) + ) + assert "/3D" in [a.get_object()["/Subtype"] for a in writer.pages[0]["/Annots"]] + writer.remove_objects_from_page(writer.pages[0], ObjectDeletionFlag.OBJECTS_3D) + assert "/3D" not in [a.get_object()["/Subtype"] for a in writer.pages[0]["/Annots"]] + + writer.remove_links() + assert len(writer.pages[0]["/Annots"]) == 0 + assert len(writer.pages[3]["/Annots"]) == 0 + + writer.remove_annotations("/Text") + + +@pytest.mark.enable_socket() +def test_late_iss1654(): + url = "https://github.com/py-pdf/pypdf/files/10935632/bid1.pdf" + name = "bid1.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter() + writer.clone_document_from_reader(reader) + for p in writer.pages: + p.compress_content_streams() + b = BytesIO() + writer.write(b) + + +@pytest.mark.enable_socket() +def test_iss1723(): + # test of an annotation(link) directly stored in the /Annots in the page + url = "https://github.com/py-pdf/pypdf/files/11015242/inputFile.pdf" + name = "iss1723.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter() + writer.append(reader, (3, 5)) + + +@pytest.mark.enable_socket() +def test_iss1767(): + # test with a pdf which is buggy because the object 389,0 exists 3 times: + # twice to define catalog and one as an XObject inducing a loop when + # cloning + url = "https://github.com/py-pdf/pypdf/files/11138472/test.pdf" + name = "iss1723.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + PdfWriter(clone_from=reader) + + +@pytest.mark.enable_socket() +def test_named_dest_page_number(): + """ + Closes iss471 + tests appending 
with named destinations as integers + """ + url = "https://github.com/py-pdf/pypdf/files/10704333/central.pdf" + name = "central.pdf" + writer = PdfWriter() + writer.add_blank_page(100, 100) + writer.append(BytesIO(get_data_from_url(url, name=name)), pages=[0, 1, 2]) + assert len(writer._root_object["/Names"]["/Dests"]["/Names"]) == 2 + assert writer._root_object["/Names"]["/Dests"]["/Names"][-1][0] == (1 + 1) + writer.append(BytesIO(get_data_from_url(url, name=name))) + assert len(writer._root_object["/Names"]["/Dests"]["/Names"]) == 6 + writer2 = PdfWriter() + writer2.add_blank_page(100, 100) + dest = writer2.add_named_destination("toto", 0) + dest.get_object()[NameObject("/D")][0] = NullObject() + b = BytesIO() + writer2.write(b) + b.seek(0) + writer.append(b) + assert len(writer._root_object["/Names"]["/Dests"]["/Names"]) == 6 + + +@pytest.mark.parametrize( + ("write_data_here", "needs_cleanup"), + [ + ( + "dont_commit_writer.pdf", + True, + ) + ], +) +def test_update_form_fields(write_data_here, needs_cleanup): + writer = PdfWriter(clone_from=RESOURCE_ROOT / "FormTestFromOo.pdf") + writer.update_page_form_field_values( + writer.pages[0], + { + "CheckBox1": "/Yes", + "Text1": "mon Text1", + "Text2": "ligne1\nligne2", + "RadioGroup1": "/2", + "RdoS1": "/", + "Combo1": "!!monCombo!!", + "Liste1": "Liste2", + "Liste2": ["Lst1", "Lst3"], + "DropList1": "DropListe3", + }, + auto_regenerate=False, + ) + del writer.pages[0]["/Annots"][1].get_object()["/AP"]["/N"] + writer.update_page_form_field_values( + writer.pages[0], + {"Text1": "my Text1", "Text2": "ligne1\nligne2\nligne3"}, + auto_regenerate=False, + ) + + writer.write("dont_commit_writer.pdf") + reader = PdfReader("dont_commit_writer.pdf") + flds = reader.get_fields() + assert flds["CheckBox1"]["/V"] == "/Yes" + assert flds["CheckBox1"].indirect_reference.get_object()["/AS"] == "/Yes" + assert ( + b"(my Text1)" + in flds["Text1"].indirect_reference.get_object()["/AP"]["/N"].get_data() + ) + assert 
flds["Text2"]["/V"] == "ligne1\nligne2\nligne3" + assert ( + b"(ligne3)" + in flds["Text2"].indirect_reference.get_object()["/AP"]["/N"].get_data() + ) + assert flds["RadioGroup1"]["/V"] == "/2" + assert flds["RadioGroup1"]["/Kids"][0].get_object()["/AS"] == "/Off" + assert flds["RadioGroup1"]["/Kids"][1].get_object()["/AS"] == "/2" + assert all(x in flds["Liste2"]["/V"] for x in ["Lst1", "Lst3"]) + + assert all(x in flds["CheckBox1"]["/_States_"] for x in ["/Off", "/Yes"]) + assert all(x in flds["RadioGroup1"]["/_States_"] for x in ["/1", "/2", "/3"]) + assert all(x in flds["Liste1"]["/_States_"] for x in ["Liste1", "Liste2", "Liste3"]) + + if needs_cleanup: + Path(write_data_here).unlink() + + +@pytest.mark.enable_socket() +def test_iss1862(): + # The file here has "/B" entry to define the font in a object below the page + # The excluded field shall be considered only at first level (page) and not + # below + url = "https://github.com/py-pdf/pypdf/files/11708801/intro.pdf" + name = "iss1862.pdf" + writer = PdfWriter() + writer.append(BytesIO(get_data_from_url(url, name=name))) + # check that "/B" is in the font + writer.pages[0]["/Resources"]["/Font"]["/F1"]["/CharProcs"]["/B"].get_data() + + +def test_empty_objects_before_cloning(): + pdf_path = RESOURCE_ROOT / "crazyones.pdf" + reader = PdfReader(pdf_path) + writer = PdfWriter(clone_from=reader) + nb_obj_reader = len(reader.xref_objStm) + sum( + len(reader.xref[i]) for i in reader.xref + ) + nb_obj_reader -= 1 # for trailer + nb_obj_reader -= len( + {x: 1 for x, y in reader.xref_objStm.values()} + ) # to remove object streams + assert len(writer._objects) == nb_obj_reader + + +@pytest.mark.enable_socket() +def test_watermark(): + url = "https://github.com/py-pdf/pypdf/files/11985889/bg.pdf" + name = "bgwatermark.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + url = "https://github.com/py-pdf/pypdf/files/11985888/source.pdf" + name = "srcwatermark.pdf" + writer = 
PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name))) + for p in writer.pages: + p.merge_page(reader.pages[0], over=False) + + assert isinstance(p["/Contents"], ArrayObject) + assert isinstance(p["/Contents"][0], IndirectObject) + + b = BytesIO() + writer.write(b) + assert len(b.getvalue()) < 2.1 * 1024 * 1024 + + +@pytest.mark.enable_socket() +@pytest.mark.timeout(4) +def test_watermarking_speed(): + url = "https://github.com/py-pdf/pypdf/files/11985889/bg.pdf" + name = "bgwatermark.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + url = "https://arxiv.org/pdf/2201.00214.pdf" + name = "2201.00214.pdf" + writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name))) + for p in writer.pages: + p.merge_page(reader.pages[0], over=False) + out_pdf_bytesio = BytesIO() + writer.write(out_pdf_bytesio) + pdf_size_in_mib = len(out_pdf_bytesio.getvalue()) / 1024 / 1024 + assert pdf_size_in_mib < 20 + + +@pytest.mark.enable_socket() +@pytest.mark.skipif(GHOSTSCRIPT_BINARY is None, reason="Requires Ghostscript") +def test_watermark_rendering(tmp_path): + """Ensure the visual appearance of watermarking stays correct.""" + url = "https://github.com/py-pdf/pypdf/files/11985889/bg.pdf" + name = "bgwatermark.pdf" + watermark = PdfReader(BytesIO(get_data_from_url(url, name=name))).pages[0] + url = "https://github.com/py-pdf/pypdf/files/11985888/source.pdf" + name = "srcwatermark.pdf" + page = PdfReader(BytesIO(get_data_from_url(url, name=name))).pages[0] + writer = PdfWriter() + page.merge_page(watermark, over=False) + writer.add_page(page) + + target_png_path = tmp_path / "target.png" + url = "https://github.com/py-pdf/pypdf/assets/96178532/d5c72d0e-7047-4504-bbf6-bc591c80d7c0" + name = "dstwatermark.png" + target_png_path.write_bytes(get_data_from_url(url, name=name)) + + pdf_path = tmp_path / "out.pdf" + png_path = tmp_path / "out.png" + writer.write(pdf_path) + + # False positive: https://github.com/PyCQA/bandit/issues/333 + 
subprocess.run( + [ # noqa: S603 + GHOSTSCRIPT_BINARY, + "-sDEVICE=pngalpha", + "-o", + png_path, + pdf_path, + ] + ) + assert png_path.is_file() + assert image_similarity(png_path, target_png_path) >= 0.95 + + +@pytest.mark.skipif(GHOSTSCRIPT_BINARY is None, reason="Requires Ghostscript") +def test_watermarking_reportlab_rendering(tmp_path): + """ + This test is showing a rotated+mirrored watermark in pypdf==3.15.4. + + Replacing the generate_base with e.g. the crazyones did not show the issue. + """ + base_path = SAMPLE_ROOT / "022-pdfkit/pdfkit.pdf" + watermark_path = SAMPLE_ROOT / "013-reportlab-overlay/reportlab-overlay.pdf" + + reader = PdfReader(base_path) + base_page = reader.pages[0] + watermark = PdfReader(watermark_path).pages[0] + + writer = PdfWriter() + base_page.merge_page(watermark) + writer.add_page(base_page) + + target_png_path = RESOURCE_ROOT / "test_watermarking_reportlab_rendering.png" + pdf_path = tmp_path / "out.pdf" + png_path = tmp_path / "test_watermarking_reportlab_rendering.png" + + writer.write(pdf_path) + # False positive: https://github.com/PyCQA/bandit/issues/333 + subprocess.run( + [ # noqa: S603 + GHOSTSCRIPT_BINARY, + "-r120", + "-sDEVICE=pngalpha", + "-o", + png_path, + pdf_path, + ] + ) + assert png_path.is_file() + assert image_similarity(png_path, target_png_path) >= 0.999 + + +@pytest.mark.enable_socket() +def test_da_missing_in_annot(): + url = "https://github.com/py-pdf/pypdf/files/12136285/Building.Division.Permit.Application.pdf" + name = "BuildingDivisionPermitApplication.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter(clone_from=reader) + writer.update_page_form_field_values( + writer.pages[0], {"PCN-1": "0"}, auto_regenerate=False + ) + b = BytesIO() + writer.write(b) + reader = PdfReader(BytesIO(b.getvalue())) + ff = reader.get_fields() + # check for autosize processing + assert ( + b"0 Tf" + not in ff["PCN-1"].indirect_reference.get_object()["/AP"]["/N"].get_data() + ) + f2 
= writer.get_object(ff["PCN-2"].indirect_reference.idnum) + f2[NameObject("/Parent")] = writer.get_object( + ff["PCN-1"].indirect_reference.idnum + ).indirect_reference + writer.update_page_form_field_values( + writer.pages[0], {"PCN-2": "1"}, auto_regenerate=False + ) + + +def test_missing_fields(pdf_file_path): + reader = PdfReader(RESOURCE_ROOT / "form.pdf") + + writer = PdfWriter() + writer.add_page(reader.pages[0]) + + with pytest.raises(PyPdfError) as exc: + writer.update_page_form_field_values( + writer.pages[0], {"foo": "some filled in text"}, flags=1 + ) + assert exc.value.args[0] == "No /AcroForm dictionary in PdfWriter Object" + + writer = PdfWriter() + writer.append(reader, [0]) + del writer._root_object["/AcroForm"]["/Fields"] + with pytest.raises(PyPdfError) as exc: + writer.update_page_form_field_values( + writer.pages[0], {"foo": "some filled in text"}, flags=1 + ) + assert exc.value.args[0] == "No /Fields dictionary in Pdf in PdfWriter Object" + + +def test_missing_info(): + reader = PdfReader(RESOURCE_ROOT / "missing_info.pdf") + + writer = PdfWriter(clone_from=reader) + assert len(writer.pages) == len(reader.pages) + + +@pytest.mark.enable_socket() +def test_germanfields(): + """Cf #2035""" + url = "https://github.com/py-pdf/pypdf/files/12194195/test.pdf" + name = "germanfields.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter(clone_from=reader) + form_fields = {"Text Box 1": "test æ ø å"} + writer.update_page_form_field_values( + writer.pages[0], form_fields, auto_regenerate=False + ) + bytes_stream = BytesIO() + writer.write(bytes_stream) + bytes_stream.seek(0) + reader2 = PdfReader(bytes_stream) + assert ( + b"test \xe6 \xf8 \xe5" + in reader2.get_fields()["Text Box 1"] + .indirect_reference.get_object()["/AP"]["/N"] + .get_data() + ) + + +@pytest.mark.enable_socket() +def test_no_t_in_articles(): + """Cf #2078""" + url = "https://github.com/py-pdf/pypdf/files/12311735/bad.pdf" + name = "iss2078.pdf" + 
reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter() + writer.append(reader) + + +@pytest.mark.enable_socket() +def test_no_i_in_articles(): + """Cf #2089""" + url = "https://github.com/py-pdf/pypdf/files/12352793/kim2002.pdf" + name = "iss2089.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter() + writer.append(reader) + + +@pytest.mark.enable_socket() +def test_damaged_pdf_length_returning_none(): + """ + Cf #140 + https://github.com/py-pdf/pypdf/issues/140#issuecomment-1685380549 + """ + url = "https://github.com/py-pdf/pypdf/files/12168578/bad_pdf_example.pdf" + name = "iss140_bad_pdf.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter() + writer.append(reader) + + +@pytest.mark.enable_socket() +def test_viewerpreferences(): + """Add Tests for ViewerPreferences""" + url = "https://github.com/py-pdf/pypdf/files/9175966/2015._pb_decode_pg0.pdf" + name = "2015._pb_decode_pg0.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + v = reader.viewer_preferences + assert v.center_window == True # noqa: E712 + writer = PdfWriter(clone_from=reader) + v = writer.viewer_preferences + assert v.center_window == True # noqa: E712 + v.center_window = False + assert ( + writer._root_object["/ViewerPreferences"]["/CenterWindow"] + == False # noqa: E712 + ) + assert v.print_area == "/CropBox" + with pytest.raises(ValueError): + v.non_fullscreen_pagemode = "toto" + with pytest.raises(ValueError): + v.non_fullscreen_pagemode = "/toto" + v.non_fullscreen_pagemode = "/UseOutlines" + assert ( + writer._root_object["/ViewerPreferences"]["/NonFullScreenPageMode"] + == "/UseOutlines" + ) + writer = PdfWriter(clone_from=reader) + v = writer.viewer_preferences + assert v.center_window == True # noqa: E712 + v.center_window = False + assert ( + writer._root_object["/ViewerPreferences"]["/CenterWindow"] + == False # noqa: E712 + ) + + writer = 
PdfWriter(clone_from=reader) + writer._root_object[NameObject("/ViewerPreferences")] = writer._add_object( + writer._root_object["/ViewerPreferences"] + ) + v = writer.viewer_preferences + v.center_window = False + assert ( + writer._root_object["/ViewerPreferences"]["/CenterWindow"] + == False # noqa: E712 + ) + v.num_copies = 1 + assert v.num_copies == 1 + assert v.print_pagerange is None + with pytest.raises(ValueError): + v.print_pagerange = "toto" + v.print_pagerange = ArrayObject() + assert len(v.print_pagerange) == 0 + + writer.create_viewer_preferences() + assert len(writer._root_object["/ViewerPreferences"]) == 0 + writer.viewer_preferences.direction = "/R2L" + assert len(writer._root_object["/ViewerPreferences"]) == 1 + + del reader.trailer["/Root"]["/ViewerPreferences"] + assert reader.viewer_preferences is None + writer = PdfWriter(clone_from=reader) + assert writer.viewer_preferences is None + + +def test_extra_spaces_in_da_text(caplog): + writer = PdfWriter(clone_from=RESOURCE_ROOT / "form.pdf") + t = writer.pages[0]["/Annots"][0].get_object()["/DA"] + t = t.replace("/Helv", "/Helv ") + writer.pages[0]["/Annots"][0].get_object()[NameObject("/DA")] = TextStringObject(t) + writer.update_page_form_field_values( + writer.pages[0], {"foo": "abcd"}, auto_regenerate=False + ) + t = writer.pages[0]["/Annots"][0].get_object()["/AP"]["/N"].get_data() + assert "Font dictionary for not found." 
not in caplog.text + assert b"/Helv" in t + assert b"(abcd)" in t + + +@pytest.mark.enable_socket() +def test_object_contains_indirect_reference_to_self(): + url = "https://github.com/py-pdf/pypdf/files/12389243/testbook.pdf" + name = "iss2102.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + writer = PdfWriter() + width, height = 595, 841 + outpage = writer.add_blank_page(width, height) + outpage.merge_page(reader.pages[6]) + writer.append(reader) From be002732ea5bc3d9034eaf24694164fc57c5595b Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Fri, 6 Oct 2023 07:09:32 +0200 Subject: [PATCH 05/13] fix --- pypdf/_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index bea4c11ab..e77e4e9d9 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -807,7 +807,7 @@ def add_attachment( return None if fname is None: st = filename.replace("/", "\\/").replace("\\\\/", "\\/") - fname = st.encode().decode("ansi", errors="xmlcharreplace") + fname = st.encode().decode("ascii", errors="xmlcharreplace") fname = f"{fname}" # to escape string # We need three entries: From 6e16e47bb7ce4d4fd97613429881948ef44acacf Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Fri, 13 Oct 2023 21:55:42 +0200 Subject: [PATCH 06/13] coverage --- pypdf/_reader.py | 34 +++++++++++++++++++++++++------ pypdf/generic/__init__.py | 4 ++-- pypdf/generic/_base.py | 1 + pypdf/generic/_data_structures.py | 23 +++++++++++---------- tests/test_generic.py | 8 ++++++++ tests/test_reader.py | 5 +++++ 6 files changed, 56 insertions(+), 19 deletions(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index f5d0c5ada..f6ce71ae1 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -101,9 +101,11 @@ NullObject, NumberObject, PdfObject, + StreamObject, TextStringObject, TreeObject, ViewerPreferences, + get_from_file_specification, read_object, ) from 
.types import OutlineType, PagemodeType @@ -2316,12 +2318,32 @@ def _get_attachments( return {k: v if len(v) > 1 else v[0] for k, v in self.attachments.items()} # type: ignore else: lst = ef.list_get(filename) - return { - filename: [(x.get_object())["/EF"].get_object( # type: ignore - )["/F"].get_object().get_data() for x in lst] # type: ignore - if isinstance(lst, list) - else (lst.get_object())["/EF"].get_object()["/F"].get_object().get_data() # type: ignore - } + if lst is None: + return {} + lst = cast(DictionaryObject, lst.get_object()) + efo = cast(DictionaryObject, lst["/EF"].get_object()) + rst = cast( + StreamObject, + get_from_file_specification(efo).get_object(), + ).get_data() + if isinstance(rst, str): + rst = rst.encode() + if "/RF" not in lst: + return {filename: [rst]} + else: + rst2 = {"": rst} # /EF will be returned by empty key + lst = cast( + ArrayObject, + get_from_file_specification( + cast(DictionaryObject, lst["/RF"].get_object()) + ), + ) + for i in range(0, len(lst), 2): + t = cast(StreamObject, lst[i + 1].get_object()).get_data() + if isinstance(t, str): + t = t.encode() + rst2[lst[i]] = t + return {filename: [rst2]} class PdfFileReader(PdfReader): # deprecated diff --git a/pypdf/generic/__init__.py b/pypdf/generic/__init__.py index bed5eb601..9288bf050 100644 --- a/pypdf/generic/__init__.py +++ b/pypdf/generic/__init__.py @@ -56,7 +56,7 @@ NameTree, StreamObject, TreeObject, - get_name_from_file_specification, + get_from_file_specification, read_object, ) from ._fit import Fit @@ -447,7 +447,7 @@ def link( "Field", "Destination", "NameTree", - "get_name_from_file_specification", + "get_from_file_specification", "ViewerPreferences", # --- More specific stuff # Outline diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index c17dcbea6..d429f2724 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -319,6 +319,7 @@ def replace_object(self, obj: "PdfObject") -> None: Replace the pointed object with obj Only applies 
to IndirectObjects within a PdfWriter """ + obj = cast("PdfObject", obj.get_object()) pdf = self.pdf if not hasattr(pdf, "_replace_object"): raise TypeError("Trying to replace Object in a non PdfWriter") diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index a0c56c69f..236ed8fe0 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -1545,7 +1545,7 @@ def _append_with_dup( _list(self, _l) return dict(_l) - def list_get(self, key: str) -> List[PdfObject]: + def list_get(self, key: str) -> Optional[PdfObject]: """ Get the entry from the Name Tree @@ -1557,10 +1557,9 @@ def list_get(self, key: str) -> List[PdfObject]: attributeEntries as a dictionary """ - def _get(key: str, o: Optional[PdfObject]) -> List[PdfObject]: + def _get(key: str, o: Optional[PdfObject]) -> Optional[PdfObject]: if o is None: - return [] - rst = [] + return None o = cast(DictionaryObject, o) _l = o.get("/Names", None) a = o.get("/Kids", None) @@ -1568,10 +1567,12 @@ def _get(key: str, o: Optional[PdfObject]) -> List[PdfObject]: a = a.get_object() if a else [] for i, x in enumerate(_l): if x == key: - rst.append(_l[i + 1]) + return _l[i + 1] for x in a: - rst.extend(_get(key, x)) - return rst + v = _get(key, x) + if v is not None: + return v + return None # if we arrive here, it means nothing matched return _get(key, self) @@ -1676,14 +1677,14 @@ def _add_in( return o.indirect_reference if o is not None else None -def get_name_from_file_specification(_a: DictionaryObject) -> str: - return cast( - str, +def get_from_file_specification(_a: DictionaryObject) -> PdfObject: + return ( _a.get("/UF") or _a.get("/F") or _a.get("/DOS") or _a.get("/Unix") - or _a.get("/Mac"), + or _a.get("/Mac") + or DictionaryObject() ) diff --git a/tests/test_generic.py b/tests/test_generic.py index 0e0fff677..0dd724686 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -1235,3 +1235,11 @@ def test_encodedstream_set_data(): assert 
cc["/Filter"] == ["/FlateDecode", "/FlateDecode", "/FlateDecode"] assert str(cc["/DecodeParms"]) == "[NullObject, NullObject, NullObject]" assert cc[NameObject("/Test")] == "/MyTest" + + +def test_replace_object(): + writer = PdfWriter(clone_from=RESOURCE_ROOT / "crazyones.pdf") + i = writer.pages[0]["/Contents"][0].idnum + writer.pages[0]["/Contents"][0].replace_object(NullObject()) + assert writer.pages[0]["/Contents"][0].idnum == i + assert isinstance(writer.pages[0]["/Contents"][0].get_object(), NullObject) diff --git a/tests/test_reader.py b/tests/test_reader.py index 8afb45737..cae355103 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1454,3 +1454,8 @@ def test_issue_140(): b = get_data_from_url(url, name=name) reader = PdfReader(BytesIO(b)) assert len(reader.pages) == 54 + + +def test_embedded_files_no_ef(): + reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") + assert reader.embedded_files is None From cf997de562aa7cf2a424f09cf33d41e60b303e75 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 16 Oct 2023 21:48:31 +0200 Subject: [PATCH 07/13] coverage --- tests/test_reader.py | 5 +++-- tests/test_writer.py | 21 ++++++++++++++++++++- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/tests/test_reader.py b/tests/test_reader.py index 067d1e66a..37e7fdfbc 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1455,7 +1455,7 @@ def test_issue_140(): reader = PdfReader(BytesIO(b)) assert len(reader.pages) == 54 - + @pytest.mark.enable_socket() def test_xyz_with_missing_param(): """Cf #2236""" @@ -1470,4 +1470,5 @@ def test_xyz_with_missing_param(): def test_embedded_files_no_ef(): reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") - assert reader.embedded_files is None \ No newline at end of file + reader[NameObject("/Names")] = DictionaryObject() + assert reader.embedded_files is None diff --git a/tests/test_writer.py b/tests/test_writer.py index 667c5d4fc..8c26124e0 100644 
--- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1321,27 +1321,46 @@ def test_attachments(): "foobar.txt": [b"foobarcontent"], "foobar2.txt": [b"2nd_foobarcontent"], } + assert writer.attachments == { + "foobar.txt": [b"foobarcontent"], + "foobar2.txt": [b"2nd_foobarcontent"], + } _l = list({name for name, _ in to_add}) _l.sort() assert reader._list_attachments() == _l + assert writer._list_attachments() == _l # We've added the same key twice - hence only 2 and not 3: att = reader._get_attachments() - assert len(att) == 2 # we have 2 keys, but 3 attachments! + assert len(att) == 2 # The content for foobar.txt is clear and just a single value: assert att["foobar.txt"] == b"foobarcontent" + # Not applicable for writer + # att = writer._get_attachments() + # assert len(att) == 2 # we have 2 keys only + # assert att["foobar.txt"] == b"foobarcontent" + # The content for foobar2.txt is a list! att = reader._get_attachments("foobar2.txt") assert len(att) == 1 assert att["foobar2.txt"] == [b"2nd_foobarcontent"] + # The content for foobar2.txt is a list! 
+ # att = writer._get_attachments("foobar2.txt") + # assert len(att) == 1 + # assert att["foobar2.txt"] == [b"2nd_foobarcontent"] + # Let's do both cases with the public interface: assert reader.attachments["foobar.txt"][0] == b"foobarcontent" assert reader.attachments["foobar2.txt"][0] == b"2nd_foobarcontent" assert len(reader.attachments["foobar2.txt"]) == 1 + assert writer.attachments["foobar.txt"][0] == b"foobarcontent" + assert writer.attachments["foobar2.txt"][0] == b"2nd_foobarcontent" + assert len(writer.attachments["foobar2.txt"]) == 1 + @pytest.mark.enable_socket() def test_iss1614(): From ba983a8dca45fb893b0958a064f6bda5aec302e0 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 16 Oct 2023 22:04:43 +0200 Subject: [PATCH 08/13] oups --- tests/test_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_reader.py b/tests/test_reader.py index 37e7fdfbc..71d137236 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1470,5 +1470,5 @@ def test_xyz_with_missing_param(): def test_embedded_files_no_ef(): reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") - reader[NameObject("/Names")] = DictionaryObject() + reader.trailer["/Root"][NameObject("/Names")] = DictionaryObject() assert reader.embedded_files is None From fcc1353b976d654be29168f77e4bbcbcbed56985 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 17 Oct 2023 19:41:44 +0200 Subject: [PATCH 09/13] coverage --- pypdf/generic/_data_structures.py | 5 +---- tests/test_generic.py | 8 ++++++++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index ed940e50a..9cdf6e2d9 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -1467,10 +1467,7 @@ def __init__(self, obj: Optional[PdfObject] = None) -> None: x not in obj for x in ("/Names", "/Kids") ): raise 
ValueError("source object is not a valid source object") - if obj is not None: - self.update(obj) - else: # building a new Name Tree - self[NameObject("/Names")] = ArrayObject() + self.update(obj) if hasattr(obj, "indirect_reference"): self.indirect_reference = obj.indirect_reference diff --git a/tests/test_generic.py b/tests/test_generic.py index 0dd724686..5f776d5ed 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -19,6 +19,7 @@ FloatObject, IndirectObject, NameObject, + NameTree, NullObject, NumberObject, OutlineItem, @@ -1243,3 +1244,10 @@ def test_replace_object(): writer.pages[0]["/Contents"][0].replace_object(NullObject()) assert writer.pages[0]["/Contents"][0].idnum == i assert isinstance(writer.pages[0]["/Contents"][0].get_object(), NullObject) + + +def test_nametree(): + writer = PdfWriter(clone_from=RESOURCE_ROOT / "crazyones.pdf") + with pytest.raises(ValueError): + NameTree(writer._root_object) + writer._root_object[NameObject("/Names")] = DictionaryObject() From 90a3408dfc9c17d7c4d11290cb1e4d6ce64401c2 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 17 Oct 2023 19:50:10 +0200 Subject: [PATCH 10/13] TST: Fix test_image_without_pillow in windows environment fixes test failure in windows environment --- tests/test_filters.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/test_filters.py b/tests/test_filters.py index 12819c43b..9cfea57b5 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -265,9 +265,11 @@ def test_image_without_pillow(tmp_path): name = "tika-914102.pdf" _ = get_data_from_url(url, name=name) pdf_path = Path(__file__).parent / "pdf_cache" / name + pdf_path_str = str(pdf_path.resolve()).replace("\\", "/") source_file = tmp_path / "script.py" - source_file.write_text(f""" + source_file.write_text( + f""" import sys from pypdf import PdfReader @@ -275,7 +277,7 @@ def test_image_without_pillow(tmp_path): 
sys.modules["PIL"] = None -reader = PdfReader("{pdf_path.resolve()}", strict=True) +reader = PdfReader("{pdf_path_str}", strict=True) for page in reader.pages: with pytest.raises(ImportError) as exc: @@ -284,13 +286,20 @@ def test_image_without_pillow(tmp_path): "pillow is required to do image extraction. " "It can be installed via 'pip install pypdf[image]'" ), exc.value.args[0] -""") +""" + ) result = subprocess.run( # noqa: UP022 - [shutil.which("python"), source_file], stdout=subprocess.PIPE, stderr=subprocess.PIPE # noqa: S603 + [shutil.which("python"), source_file], # noqa: S603 + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, ) assert result.returncode == 0 assert result.stdout == b"" - assert result.stderr == b"Superfluous whitespace found in object header b'4' b'0'\n" + assert ( + result.stderr.replace(b"\r", b"") + == b"Superfluous whitespace found in object header b'4' b'0'\n" + ) + @pytest.mark.enable_socket() def test_issue_1737(): From 1abcd0bd7828e888f33037d0f8aaf40caf54d8c0 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 1 Nov 2023 11:32:20 +0100 Subject: [PATCH 11/13] progressing --- pypdf/_reader.py | 59 +++++------ pypdf/generic/__init__.py | 4 + pypdf/generic/_data_structures.py | 161 ++++++++++++++++++++++++++++++ 3 files changed, 196 insertions(+), 28 deletions(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index 739a262fa..a90e98afe 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -86,6 +86,7 @@ ) from .generic import ( ArrayObject, + AttachmentBytes, BooleanObject, ContentStream, DecodedStreamObject, @@ -2232,39 +2233,41 @@ def _get_embedded_files_root(self) -> Optional[NameTree]: return NameTree(efo) @property - def embedded_files(self) -> Optional[Mapping[str, List[PdfObject]]]: + def attachments_names(self) -> List[str]: + """ + Returns: + List of names + """ ef = self._get_embedded_files_root() - if ef: - return ef.list_items() - else: - return None + if ef is None: + return 
[] + return ef.list_keys() @property - def attachments(self) -> Mapping[str, List[Union[bytes, Dict[str, bytes]]]]: + def attachments(self) -> Mapping[str, AttachmentBytes]: + """ + extracts the /EF entries as bytes from the embedded files + Returns: + Dictionary with the filenames as keys and the file content as bytes, + extra data cah be accessed with Attachmentbytes extra properties(.name, + .list_rf_names(), .get_embeddedfile(), .all_files) + + Note: + If you want to access /RF + """ ef = self._get_embedded_files_root() - if ef: - d: Dict[str, List[Union[bytes, Dict[str, bytes]]]] = {} - for k, v in ef.list_items().items(): - if isinstance(v, list): - if k not in d: - d[k] = [] - for e in v: - e = cast(DictionaryObject, e.get_object()) - if "/EF" in e: - d[k].append(e["/EF"]["/F"].get_data()) # type: ignore - elif "/RF" in e: - r = cast( - ArrayObject, cast(DictionaryObject, e["/RF"])["/F"] - ) - di: Dict[str, bytes] = {} - i = 0 - while i < len(r): - di[cast(str, r[i])] = r[i + 1].get_object().get_data() - i += 2 - d[k].append(di) - return d - else: + if ef is None: return {} + d: Dict[str, AttachmentBytes] = {} + for k, v in ef.list_items().items(): + if len(v) > 1: + logger_warning( + "Unexpected amout of entries in attachments, please report" + "and share the file for analysis with pypdf dev team", + __name__, + ) + d[k] = AttachmentBytes(cast(DictionaryObject, v[0].get_object())) + return d def _list_attachments(self) -> List[str]: """ diff --git a/pypdf/generic/__init__.py b/pypdf/generic/__init__.py index 9288bf050..7bbf362fe 100644 --- a/pypdf/generic/__init__.py +++ b/pypdf/generic/__init__.py @@ -46,7 +46,9 @@ encode_pdfdocencoding, ) from ._data_structures import ( + PREFERED_ATTACHMENT, ArrayObject, + AttachmentBytes, ContentStream, DecodedStreamObject, Destination, @@ -437,6 +439,7 @@ def link( "PAGE_FIT", # Data structures "ArrayObject", + "AttachmentBytes", "DictionaryObject", "TreeObject", "StreamObject", @@ -447,6 +450,7 @@ def link( "Field", 
"Destination", "NameTree", + "PREFERED_ATTACHMENT", "get_from_file_specification", "ViewerPreferences", # --- More specific stuff diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 9cdf6e2d9..70818ea40 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -1452,6 +1452,163 @@ def additionalActions(self) -> Optional[DictionaryObject]: # deprecated return self.additional_actions +class AttachmentBytes(bytes): + """Extension of bytes class, adding File Spefication dedicated properties""" + + source_object: Optional[IndirectObject] = None + """ + Pointer to the File Specification entry associated ; + None, if created from a bytes or StreamObject + """ + within_page: Optional[IndirectObject] = None + """ + Page where the File Spefication is referenced, else None + This is relevant only for file attachement annotations + note : this property should be initialized manually out of the constructor + """ + + def __new__( + cls, + src: Optional[ + Union[bytes, IndirectObject, StreamObject, DictionaryObject] + ] = None, + ) -> "AttachmentBytes": + """ + Object Constructor. 
+ + Args: + src [DictionaryObject] : FileSpecification Object to populate the new object + src [bytes/StreamObject] : bytes/StreamObject(EmbeddedFile) to extract the stream + to initialize (partially the object) + src [IndirectObject] : Pointer to the DictionaryObject/StreamObject for init + src [None] : similar to src = b"" + """ + inp: Optional[IndirectObject] = None + obj: Any = src + v: Union[str, bytes] + if isinstance(obj, IndirectObject): + obj = obj.get_object() + if isinstance(obj, bytes): + v = obj + elif isinstance(obj, StreamObject): + v = obj.get_data() + elif isinstance(obj, DictionaryObject) and "/EF" in obj: + inp = obj.indirect_reference + o = cast(DictionaryObject, obj["/EF"]) + o = cast(StreamObject, get_from_file_specification(o).get_object()) + v = o.get_data() + else: + v = b"" + if isinstance(v, str): + v = v.encode() + out = bytes.__new__(cls, v) + if inp is None: + out.source_object = None + else: + out.source_object = inp.indirect_reference + out.within_page = None # has to be set by program + return out + + @property + def name(self) -> Optional[str]: + """Returns the (best) name from the File Specification Object else None""" + o: Any = self.source_object + if o is None: + return None + o = cast(DictionaryObject, o.get_object()) + return cast(str, get_from_file_specification(o)) + + def list_rf_names(self) -> List[str]: + """ + Returns: + List of filenames store in /RF fields; + Empty list if no /RF field exists + + Note: + does not contains "" entry (for EF) + """ + o: Any = self.source_object + if o is None: + return [] + o = cast(DictionaryObject, o.get_object()) + if "/RF" in o: + o = cast(DictionaryObject, o["/RF"]) + o = cast(DictionaryObject, get_from_file_specification(o)) + try: + lst = [o[i] for i in range(0, len(o), 2)] + return lst + except ValueError: + return [] + else: + return [] + + def get_embeddedfile(self, subfile: str = "") -> Optional[StreamObject]: + """ + Returns the EmbeddedFile(Stream Object) containing the data 
bytes + Args: + subfile: filename of the EmbeddedFile to be returned; + "" returns the EmbeddedFile from the /EF field + Returns: + StreamObject + + Note: + o == o.get_embeddedfile("").get_data() + """ + o: Any = self.source_object + if o is None: + return None + o = cast(DictionaryObject, o.get_object()) + if subfile == "": + o = cast(DictionaryObject, o["/EF"]) + return cast(StreamObject, get_from_file_specification(o).get_object()) + elif "/RF" in o: + o = cast(DictionaryObject, o["/RF"]) + o = cast(DictionaryObject, get_from_file_specification(o)) + try: + i = o.index(subfile) + return cast(StreamObject, o[i + 1].get_object()) + except ValueError: + return None + else: + return None + + @property + def all_files(self) -> Dict[str, bytes]: + """ + Returns: + a dictionary filename/data bytes; + {} if the object is not assocatied with a File Spefication. + + Note: + the results contains also the /EF stored behin "" key + """ + o: Any = self.source_object + if o is None: + return {} + o = cast(DictionaryObject, o.get_object()) + out: Dict[str, bytes] = {} + o = cast(DictionaryObject, o["/EF"]) + v = cast(StreamObject, get_from_file_specification(o)).get_data() + if isinstance(v, str): + v = v.encode() + out[""] = v + if "/RF" in o: + o = cast(DictionaryObject, o["/RF"]) + a = cast(ArrayObject, get_from_file_specification(o)) + try: + for i in range(0, len(a), 2): + v = cast(StreamObject, a[i + 1].get_object()).get_data() + if isinstance(v, str): + v = v.encode() + out[a[i]] = v + return out + except ValueError as exc: + logger_warning(exc.__repr__(), __name__) + return out + else: + return out + + class NameTree(DictionaryObject): """ Name Tree Structure @@ -1675,10 +1832,14 @@ def _add_in( return o.indirect_reference if o is not None else None +PREFERED_ATTACHMENT = "/DOS" + + def get_from_file_specification(_a: DictionaryObject) -> PdfObject: return ( _a.get("/UF") or _a.get("/F") + or _a.get(PREFERED_ATTACHMENT) or _a.get("/DOS") or _a.get("/Unix") or 
_a.get("/Mac") From a0ee1a417717f923d9972bbe7f271d892c22a175 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 1 Nov 2023 23:57:17 +0100 Subject: [PATCH 12/13] fix but still some mypy --- pypdf/_reader.py | 105 ++---------------------------- pypdf/_writer.py | 62 ++++++------------ pypdf/generic/__init__.py | 2 + pypdf/generic/_data_structures.py | 78 ++++++++++++++++------ tests/test_reader.py | 3 +- tests/test_writer.py | 52 +++++++-------- 6 files changed, 110 insertions(+), 192 deletions(-) diff --git a/pypdf/_reader.py b/pypdf/_reader.py index a90e98afe..eb3f0c35f 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -40,7 +40,6 @@ Dict, Iterable, List, - Mapping, Optional, Tuple, Union, @@ -86,7 +85,7 @@ ) from .generic import ( ArrayObject, - AttachmentBytes, + AttachmentBytesDictionary, BooleanObject, ContentStream, DecodedStreamObject, @@ -102,11 +101,9 @@ NullObject, NumberObject, PdfObject, - StreamObject, TextStringObject, TreeObject, ViewerPreferences, - get_from_file_specification, read_object, ) from .types import OutlineType, PagemodeType @@ -2238,13 +2235,10 @@ def attachments_names(self) -> List[str]: Returns: List of names """ - ef = self._get_embedded_files_root() - if ef is None: - return [] - return ef.list_keys() + return self.attachments.keys() @property - def attachments(self) -> Mapping[str, AttachmentBytes]: + def attachments(self) -> AttachmentBytesDictionary: """ extracts the /EF entries as bytes from the embedded files Returns: @@ -2255,98 +2249,7 @@ def attachments(self) -> Mapping[str, AttachmentBytes]: Note: If you want to access /RF """ - ef = self._get_embedded_files_root() - if ef is None: - return {} - d: Dict[str, AttachmentBytes] = {} - for k, v in ef.list_items().items(): - if len(v) > 1: - logger_warning( - "Unexpected amout of entries in attachments, please report" - "and share the file for analysis with pypdf dev team", - __name__, - ) - d[k] = 
AttachmentBytes(cast(DictionaryObject, v[0].get_object())) - return d - - def _list_attachments(self) -> List[str]: - """ - Retrieves the list of filenames of file attachments. - - Returns: - list of filenames - """ - ef = self._get_embedded_files_root() - if ef: - lst = ef.list_keys() - else: - lst = [] - """ - for ip, p in enumerate(self.pages): - for a in [_a.get_object() - for _a in p.get("/Annots",[])]: - if _a.get_object().get("/Subtype","") != "/FileAttachements": - continue - lst.append(f"$page_{ip}.{get_name_from_file_specification(_a)}") - """ - return lst - - def _get_attachment_list(self, name: str) -> List[Union[bytes, Dict[str, bytes]]]: - out = self._get_attachments(name)[name] - if isinstance(out, list): - return out - return [out] - - def _get_attachments( - self, filename: Optional[str] = None - ) -> Dict[str, List[Union[bytes, Dict[str, bytes]]]]: - """ - Retrieves all or selected file attachments of the PDF as a dictionary of file names - and the file data as a bytestring. - - Args: - filename: If filename is None, then a dictionary of all attachments - will be returned, where the key is the filename and the value - is the content. Otherwise, a dictionary with just a single key - - the filename - and its content will be returned. 
- - Returns: - dictionary of filename -> Union[bytestring or List[ByteString]] - if the filename exists multiple times a List of the different version will be provided - """ - ef = self._get_embedded_files_root() - if ef is None: - return {} - if filename is None: - return {k: v if len(v) > 1 else v[0] for k, v in self.attachments.items()} # type: ignore - else: - lst = ef.list_get(filename) - if lst is None: - return {} - lst = cast(DictionaryObject, lst.get_object()) - efo = cast(DictionaryObject, lst["/EF"].get_object()) - rst = cast( - StreamObject, - get_from_file_specification(efo).get_object(), - ).get_data() - if isinstance(rst, str): - rst = rst.encode() - if "/RF" not in lst: - return {filename: [rst]} - else: - rst2 = {"": rst} # /EF will be returned by empty key - lst = cast( - ArrayObject, - get_from_file_specification( - cast(DictionaryObject, lst["/RF"].get_object()) - ), - ) - for i in range(0, len(lst), 2): - t = cast(StreamObject, lst[i + 1].get_object()).get_data() - if isinstance(t, str): - t = t.encode() - rst2[lst[i]] = t - return {filename: [rst2]} + return AttachmentBytesDictionary(self._get_embedded_files_root()) class PdfFileReader(PdfReader): # deprecated diff --git a/pypdf/_writer.py b/pypdf/_writer.py index e36b6867d..a75d01bbb 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -46,7 +46,6 @@ Dict, Iterable, List, - Mapping, Optional, Pattern, Tuple, @@ -96,6 +95,7 @@ from .generic import ( PAGE_FIT, ArrayObject, + AttachmentBytesDictionary, BooleanObject, ByteStringObject, ContentStream, @@ -740,48 +740,26 @@ def _create_attachment_root(self) -> NameTree: return node @property - def embedded_files(self) -> Optional[Mapping[str, List[PdfObject]]]: - ef = self._get_embedded_files_root() - if ef: - return ef.list_items() - else: - return None - - def _list_attachments(self) -> List[str]: - ef = self._get_embedded_files_root() - if ef: - return ef.list_keys() - else: - return [] + def attachments_names(self) -> List[str]: + """ + 
Returns: + List of names + """ + return self.attachments.keys() @property - def attachments(self) -> Mapping[str, List[Union[bytes, Dict[str, bytes]]]]: - ef = self._get_embedded_files_root() - if ef: - d: Dict[str, List[Union[bytes, Dict[str, bytes]]]] = {} - for k, v in ef.list_items().items(): - if isinstance(v, list): - if k not in d: - d[k] = [] - for e in v: - e = cast(DictionaryObject, e.get_object()) - if "/EF" in e: - d[k].append(e["/EF"]["/F"].get_data()) # type: ignore - elif "/RF" in e: - r = cast( - ArrayObject, cast(DictionaryObject, e["/RF"])["/F"] - ) - di = {} - i = 0 - while i < len(r): - di[cast(str, r[i])] = cast( - bytes, r[i + 1].get_object().get_data() - ) - i += 2 - d[k].append(di) - return d - else: - return {} + def attachments(self) -> AttachmentBytesDictionary: + """ + extracts the /EF entries as bytes from the embedded files + Returns: + Dictionary with the filenames as keys and the file content as bytes, + extra data cah be accessed with Attachmentbytes extra properties(.name, + .list_rf_names(), .get_embeddedfile(), .all_files) + + Note: + If you want to access /RF + """ + return AttachmentBytesDictionary(self._get_embedded_files_root()) def add_attachment( self, @@ -808,7 +786,7 @@ def add_attachment( Returns: The filespec DictionaryObject """ - if not overwrite and filename in self._list_attachments(): + if not overwrite and filename in self.attachments_names: return None if fname is None: st = filename.replace("/", "\\/").replace("\\\\/", "\\/") diff --git a/pypdf/generic/__init__.py b/pypdf/generic/__init__.py index 7bbf362fe..f2eadf079 100644 --- a/pypdf/generic/__init__.py +++ b/pypdf/generic/__init__.py @@ -49,6 +49,7 @@ PREFERED_ATTACHMENT, ArrayObject, AttachmentBytes, + AttachmentBytesDictionary, ContentStream, DecodedStreamObject, Destination, @@ -440,6 +441,7 @@ def link( # Data structures "ArrayObject", "AttachmentBytes", + "AttachmentBytesDictionary", "DictionaryObject", "TreeObject", "StreamObject", diff --git 
a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index cb48bb93d..c3c7328ea 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -36,12 +36,12 @@ Any, Callable, Dict, + Generator, Iterable, List, Mapping, Optional, Sequence, - Set, Tuple, Union, cast, @@ -189,7 +189,6 @@ def clone( except Exception: pass - visited: Set[Tuple[int, int]] = set() # (idnum, generation) d__ = cast( "DictionaryObject", self._reference_clone(self.__class__(), pdf_dest, force_duplicate), @@ -197,7 +196,7 @@ def clone( if ignore_fields is None: ignore_fields = [] if len(d__.keys()) == 0: - d__._clone(self, pdf_dest, force_duplicate, ignore_fields, visited) + d__._clone(self, pdf_dest, force_duplicate, ignore_fields) return d__ def _clone( @@ -206,7 +205,6 @@ def _clone( pdf_dest: PdfWriterProtocol, force_duplicate: bool, ignore_fields: Optional[Sequence[Union[str, int]]], - visited: Set[Tuple[int, int]], # (idnum, generation) ) -> None: """ Update the object from src. @@ -274,14 +272,6 @@ def _clone( cur_obj.__class__(), pdf_dest, force_duplicate ), ) - # check to see if we've previously processed our item - if clon.indirect_reference is not None: - idnum = clon.indirect_reference.idnum - generation = clon.indirect_reference.generation - if (idnum, generation) in visited: - cur_obj = None - break - visited.add((idnum, generation)) objs.append((cur_obj, clon)) assert prev_obj is not None prev_obj[NameObject(k)] = clon.indirect_reference @@ -294,9 +284,7 @@ def _clone( except Exception: cur_obj = None for s, c in objs: - c._clone( - s, pdf_dest, force_duplicate, ignore_fields, visited - ) + c._clone(s, pdf_dest, force_duplicate, ignore_fields) for k, v in src.items(): if k not in ignore_fields: @@ -812,7 +800,6 @@ def _clone( pdf_dest: PdfWriterProtocol, force_duplicate: bool, ignore_fields: Optional[Sequence[Union[str, int]]], - visited: Set[Tuple[int, int]], ) -> None: """ Update the object from src. 
@@ -835,7 +822,7 @@ def _clone( ) except Exception: pass - super()._clone(src, pdf_dest, force_duplicate, ignore_fields, visited) + super()._clone(src, pdf_dest, force_duplicate, ignore_fields) def get_data(self) -> Union[bytes, str]: return self._data @@ -1063,7 +1050,6 @@ def clone( except Exception: pass - visited: Set[Tuple[int, int]] = set() d__ = cast( "ContentStream", self._reference_clone( @@ -1072,7 +1058,7 @@ def clone( ) if ignore_fields is None: ignore_fields = [] - d__._clone(self, pdf_dest, force_duplicate, ignore_fields, visited) + d__._clone(self, pdf_dest, force_duplicate, ignore_fields) return d__ def _clone( @@ -1081,7 +1067,6 @@ def _clone( pdf_dest: PdfWriterProtocol, force_duplicate: bool, ignore_fields: Optional[Sequence[Union[str, int]]], - visited: Set[Tuple[int, int]], ) -> None: """ Update the object from src. @@ -1098,7 +1083,7 @@ def _clone( self._operations = list(src_cs._operations) self.forced_encoding = src_cs.forced_encoding # no need to call DictionaryObjection or anything - # like super(DictionaryObject,self)._clone(src, pdf_dest, force_duplicate, ignore_fields, visited) + # like super(DictionaryObject,self)._clone(src, pdf_dest, force_duplicate, ignore_fields) def _parse_content_stream(self, stream: StreamType) -> None: # 7.8.2 Content Streams @@ -1863,6 +1848,57 @@ def get_from_file_specification(_a: DictionaryObject) -> PdfObject: ) +class AttachmentBytesDictionary(dict): + """ + Dict[str, AttachmentBytes] + Ease access to Dictionary of Object + """ + + root: Optional[NameTree] + names: List[str] + + def __init__( + self, root: Optional[Union[NameTree, DictionaryObject, IndirectObject]] + ): + dict.__init__(self) + if isinstance(root, IndirectObject): + root = cast(DictionaryObject, root.get_object()) + if root is not None: + self.root = ( + root if isinstance(root, NameTree) else NameTree(root) + ) + self.names = list(self.root.list_keys()) + else: + self.root = None + self.names = [] + + def keys(self) -> List[str]: + return 
self.names + + def items(self) -> Generator[str, AttachmentBytes]: + if self.root is None: + return [] + else: + for k, v in self.root.list_items().items(): + if len(v) > 1: + logger_warning( + "Unexpected amout of entries in attachments," + "please report" + "and share the file for analysis with pypdf dev team", + __name__, + ) + yield (k, AttachmentBytes(cast(DictionaryObject, v[0].get_object()))) + + def __getitem__(self, k: str) -> AttachmentBytes: + if k not in self.names: + raise KeyError("KeyError: k") + v = self.root.list_get(k) + return AttachmentBytes(cast(DictionaryObject, v.get_object())) + + def __repr__(self) -> str: + return "{ " + ", ".join(["'" + x + "': ..." for x in self.names]) + "}" + + class Destination(TreeObject): """ A class representing a destination within a PDF file. diff --git a/tests/test_reader.py b/tests/test_reader.py index 71d137236..f43f11df6 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1471,4 +1471,5 @@ def test_xyz_with_missing_param(): def test_embedded_files_no_ef(): reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf") reader.trailer["/Root"][NameObject("/Names")] = DictionaryObject() - assert reader.embedded_files is None + assert reader.attachments_names == [] + assert reader.attachments == {} diff --git a/tests/test_writer.py b/tests/test_writer.py index 7b3d98643..1a8a545f0 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1297,8 +1297,8 @@ def test_attachments(): reader = PdfReader(b) b = None assert reader.attachments == {} - assert reader._list_attachments() == [] - assert reader._get_attachments() == {} + # assert reader._list_attachments() == [] + # assert reader._get_attachments() == {} to_add = [ ("foobar.txt", b"foobarcontent"), ("foobar2.txt", b"foobarcontent2"), @@ -1313,30 +1313,30 @@ def test_attachments(): reader = PdfReader(b) b = None assert sorted(reader.attachments.keys()) == sorted({name for name, _ in to_add}) - assert reader.attachments == { - "foobar.txt": 
[b"foobarcontent"], - "foobar2.txt": [b"2nd_foobarcontent"], + assert dict(reader.attachments.items()) == { + "foobar.txt": b"foobarcontent", + "foobar2.txt": b"2nd_foobarcontent", } writer.add_attachment("foobar2.txt", b"overwrite_ignored", overwrite=False) - assert reader.attachments == { - "foobar.txt": [b"foobarcontent"], - "foobar2.txt": [b"2nd_foobarcontent"], + assert dict(reader.attachments.items()) == { + "foobar.txt": b"foobarcontent", + "foobar2.txt": b"2nd_foobarcontent", } - assert writer.attachments == { - "foobar.txt": [b"foobarcontent"], - "foobar2.txt": [b"2nd_foobarcontent"], + assert dict(writer.attachments.items()) == { + "foobar.txt": b"foobarcontent", + "foobar2.txt": b"2nd_foobarcontent", } - _l = list({name for name, _ in to_add}) - _l.sort() - assert reader._list_attachments() == _l - assert writer._list_attachments() == _l + # _l = list({name for name, _ in to_add}) + # _l.sort() + # assert reader._list_attachments() == _l + # assert writer._list_attachments() == _l # We've added the same key twice - hence only 2 and not 3: - att = reader._get_attachments() - assert len(att) == 2 + # att = reader._get_attachments() + # assert len(att) == 2 # The content for foobar.txt is clear and just a single value: - assert att["foobar.txt"] == b"foobarcontent" + # assert att["foobar.txt"] == b"foobarcontent" # Not applicable for writer # att = writer._get_attachments() @@ -1344,9 +1344,9 @@ def test_attachments(): # assert att["foobar.txt"] == b"foobarcontent" # The content for foobar2.txt is a list! - att = reader._get_attachments("foobar2.txt") - assert len(att) == 1 - assert att["foobar2.txt"] == [b"2nd_foobarcontent"] + # att = reader._get_attachments("foobar2.txt") + # assert len(att) == 1 + # assert att["foobar2.txt"] == [b"2nd_foobarcontent"] # The content for foobar2.txt is a list! 
# att = writer._get_attachments("foobar2.txt") @@ -1354,13 +1354,11 @@ def test_attachments(): # assert att["foobar2.txt"] == [b"2nd_foobarcontent"] # Let's do both cases with the public interface: - assert reader.attachments["foobar.txt"][0] == b"foobarcontent" - assert reader.attachments["foobar2.txt"][0] == b"2nd_foobarcontent" - assert len(reader.attachments["foobar2.txt"]) == 1 + assert reader.attachments["foobar.txt"] == b"foobarcontent" + assert reader.attachments["foobar2.txt"] == b"2nd_foobarcontent" - assert writer.attachments["foobar.txt"][0] == b"foobarcontent" - assert writer.attachments["foobar2.txt"][0] == b"2nd_foobarcontent" - assert len(writer.attachments["foobar2.txt"]) == 1 + assert writer.attachments["foobar.txt"] == b"foobarcontent" + assert writer.attachments["foobar2.txt"] == b"2nd_foobarcontent" @pytest.mark.enable_socket() From ab963313e2d54d5b757f1f4c6e07f12eea5311bc Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 2 Nov 2023 14:04:57 +0100 Subject: [PATCH 13/13] fix include mypy --- pypdf/generic/_data_structures.py | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index c3c7328ea..9261eafc3 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -36,8 +36,8 @@ Any, Callable, Dict, - Generator, Iterable, + Iterator, List, Mapping, Optional, @@ -1848,7 +1848,7 @@ def get_from_file_specification(_a: DictionaryObject) -> PdfObject: ) -class AttachmentBytesDictionary(dict): +class AttachmentBytesDictionary(Mapping[str, AttachmentBytes]): """ Dict[str, AttachmentBytes] Ease access to Dictionary of Object @@ -1858,24 +1858,28 @@ class AttachmentBytesDictionary(dict): names: List[str] def __init__( - self, root: Optional[Union[NameTree, DictionaryObject, IndirectObject]] - ): - dict.__init__(self) + self, root: Optional[Union[NameTree, 
DictionaryObject]] = None + ) -> None: + # super().__init__(self) if isinstance(root, IndirectObject): root = cast(DictionaryObject, root.get_object()) if root is not None: - self.root = ( - root if isinstance(root, NameTree) else NameTree(root) - ) + self.root = root if isinstance(root, NameTree) else NameTree(root) self.names = list(self.root.list_keys()) else: self.root = None self.names = [] - def keys(self) -> List[str]: + def keys(self) -> List[str]: # type: ignore[override] return self.names - def items(self) -> Generator[str, AttachmentBytes]: + def __len__(self) -> int: + return len(self.names) + + def __iter__(self) -> Iterator[str]: # type: ignore + yield from self.names + + def items(self) -> Iterable[Tuple[str, AttachmentBytes]]: # type: ignore[override] if self.root is None: return [] else: @@ -1891,8 +1895,12 @@ def items(self) -> Generator[str, AttachmentBytes]: def __getitem__(self, k: str) -> AttachmentBytes: if k not in self.names: - raise KeyError("KeyError: k") + raise KeyError(f"KeyError: {k}") + if self.root is None: + raise ValueError("Empty Object") v = self.root.list_get(k) + if v is None: + raise KeyError(f"KeyError: {k}") return AttachmentBytes(cast(DictionaryObject, v.get_object())) def __repr__(self) -> str: