BUG: Prevent updating page contents after merging page (stamping/watermarking) (#1952)

pubpub-zz · web-flow · commit abd8342083bb · 2023-07-09T16:19:04.000+02:00
ENH: Add the `over` parameter to `merge_page`; closes #1951, closes #1953
diff --git a/docs/user/add-watermark.md b/docs/user/add-watermark.md
@@ -4,80 +4,54 @@ Adding stamps or watermarks are two common ways to manipulate PDF files.
 A stamp is adding something on top of the document, a watermark is in the
 background of the document.
 
-## Stamp (Overlay)
+## Stamp (Overlay) / Watermark (Underlay)
 
-Using the ``Transformation()`` class, one can translate, rotate, scale, etc. the stamp before merging it to the content page.
+The process for stamping and watermarking is the same: set the `over` parameter to `True` for stamping and to `False` for watermarking.
 
+You can use `merge_page()` if you don't need to transform the stamp:
 ```python
-from pathlib import Path
-from typing import Union, Literal, List
-
 from pypdf import PdfWriter, PdfReader
 
+stamp = PdfReader("bg.pdf").pages[0]
+writer = PdfWriter(clone_from="source.pdf")
+for page in writer.pages:
+    page.merge_page(stamp, over=False)  # here set to False for watermarking
 
-def stamp(
-    content_pdf: Path,
-    stamp_pdf: Path,
-    pdf_result: Path,
-    page_indices: Union[Literal["ALL"], List[int]] = "ALL",
-):
-    stamp_page = PdfReader(stamp_pdf).pages[0]
-
-    writer = PdfWriter()
-
-    reader = PdfReader(content_pdf)
-    if page_indices == "ALL":
-        page_indices = list(range(0, len(reader.pages)))
-    for index in page_indices:
-        content_page = reader.pages[index]
-        content_page.merge_transformed_page(
-            stamp_page,
-            Transformation(),
-        )
-        writer.add_page(content_page)
-
-    with open(pdf_result, "wb") as fp:
-        writer.write(fp)
+writer.write("out.pdf")
 ```
 
-![stamp.png](stamp.png)
-
-## Watermark (Underlay)
-
-To merge the watermark *under* the content, use the argument ``over=False`` of the method ``merge_transformed_page()``.
-
-Once again, watermark size and position (and more) can be customized using the ``Transformation()`` class.
+Otherwise, use `merge_transformed_page()` with `Transformation()` if you need to translate, rotate, scale, etc. the stamp before merging it into the content page.
 
 ```python
 from pathlib import Path
 from typing import Union, Literal, List
 
-from pypdf import PdfWriter, PdfReader, Transformation
+from pypdf import PdfWriter, PdfReader, Transformation
 
 
-def watermark(
+def stamp(
     content_pdf: Path,
     stamp_pdf: Path,
     pdf_result: Path,
     page_indices: Union[Literal["ALL"], List[int]] = "ALL",
 ):
-    reader = PdfReader(content_pdf)
-    if page_indices == "ALL":
-        page_indices = range(len(reader.pages))
+    stamp_page = PdfReader(stamp_pdf).pages[0]
 
     writer = PdfWriter()
-    watermark_page = PdfReader(stamp_pdf).pages[0]
-    for index in page_indices:
-        content_page = reader.pages[index]
+    # page_indices can be a list of page indices; a tuple defines a range
+    writer.append(content_pdf, pages=None if page_indices == "ALL" else page_indices)
+
+    for content_page in writer.pages:
         content_page.merge_transformed_page(
-            watermark_page,
-            Transformation(),
-            over=False,
+            stamp_page,
+            Transformation().scale(0.5),
         )
-        writer.add_page(content_page)
 
-    with open(pdf_result, "wb") as fp:
-        writer.write(fp)
+    writer.write(pdf_result)
 ```
 
+Example of a stamp:
+![stamp.png](stamp.png)
+
+Example of a watermark:
 ![watermark.png](watermark.png)
diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -938,6 +938,11 @@ def replace_contents(
                     self._objects[o.indirect_reference.idnum - 1] = NullObject()  # type: ignore
                 except AttributeError:
                     pass
+
+        if isinstance(content, ArrayObject):
+            for i in range(len(content)):
+                content[i] = self.indirect_reference.pdf._add_object(content[i])
+
         if content is None:
             if PG.CONTENTS not in self:
                 return
@@ -972,7 +977,9 @@ def replace_contents(
                 # this will be fixed with the _add_object
                 self[NameObject(PG.CONTENTS)] = content
 
-    def merge_page(self, page2: "PageObject", expand: bool = False) -> None:
+    def merge_page(
+        self, page2: "PageObject", expand: bool = False,  over: bool = True
+    ) -> None:
         """
         Merge the content streams of two pages into one.
 
@@ -985,10 +992,11 @@ def merge_page(self, page2: "PageObject", expand: bool = False) -> None:
         Args:
             page2: The page to be merged into this one. Should be
                 an instance of :class:`PageObject<PageObject>`.
+            over: If True (default), place page2's content over this page; else under it.
             expand: If true, the current page dimensions will be
                 expanded to accommodate the dimensions of the page to be merged.
         """
-        self._merge_page(page2, expand=expand)
+        self._merge_page(page2, over=over, expand=expand)
 
     def mergePage(self, page2: "PageObject") -> None:  # deprecated
         """
diff --git a/tests/test_page.py b/tests/test_page.py
@@ -1237,12 +1237,15 @@ def create_stamp_pdf() -> BytesIO:
     writer.append(SAMPLE_ROOT / "009-pdflatex-geotopo/GeoTopo.pdf", [1])
     nb1 = len(writer._objects)
 
+    # 1 page only is modified
     for page in writer.pages:
         page.merge_page(template_page)
-    assert len(writer._objects) == nb1 + 1  # font is added that's all
+    # a font is added, plus 1 stream object and 1 ArrayObject
+    assert len(writer._objects) == nb1 + 1 + 2
     for page in writer.pages:
         page.compress_content_streams()
-    assert len(writer._objects) == nb1 + 1
+    # objects are recycled
+    assert len(writer._objects) == nb1 + 1 + 2
 
     contents = writer.pages[0]["/Contents"]
     writer.pages[0].replace_contents(None)
diff --git a/tests/test_writer.py b/tests/test_writer.py
@@ -624,7 +624,9 @@ def test_add_named_destination_sort_order(pdf_file_path):
     root = writer.get_named_dest_root()
 
     assert len(root) == 4
-    assert root[0] == "a", '"a" was not inserted before "b" in the named destination root'
+    assert (
+        root[0] == "a"
+    ), '"a" was not inserted before "b" in the named destination root'
     assert root[2] == "b"
 
     # write "output" to pypdf-output.pdf
@@ -1478,3 +1480,22 @@ def test_empty_objects_before_cloning():
         {x: 1 for x, y in reader.xref_objStm.values()}
     )  # to remove object streams
     assert len(writer._objects) == nb_obj_reader
+
+
+@pytest.mark.enable_socket()
+def test_watermark():
+    url = "https://github.com/py-pdf/pypdf/files/11985889/bg.pdf"
+    name = "bgwatermark.pdf"
+    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+    url = "https://github.com/py-pdf/pypdf/files/11985888/source.pdf"
+    name = "srcwatermark.pdf"
+    writer = PdfWriter(clone_from=BytesIO(get_pdf_from_url(url, name=name)))
+    for p in writer.pages:
+        p.merge_page(reader.pages[0], over=False)
+
+    assert isinstance(p["/Contents"], ArrayObject)
+    assert isinstance(p["/Contents"][0], IndirectObject)
+
+    b = BytesIO()
+    writer.write(b)
+    assert len(b.getvalue()) < 2.1 * 1024 * 1024