ENH: Implement flattening for writer (#3312)

PJBrs · stefan6419846 · web-flow · commit d76976b7d798 · 2025-07-11T11:56:39.000+02:00
Closes #232. --------- Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>
diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -196,6 +196,13 @@ def compress(matrix: TransformationMatrixType) -> CompressedTransformationMatrix
             matrix[2][1],
         )
 
+    def _to_cm(self) -> str:
+        """Return this transformation's matrix as a ``cm`` operation string."""
+        # Format the six ``self.ctm`` entries with four decimals, in index order,
+        # separated by single spaces and followed by the operator name.
+        coefficients = [f"{self.ctm[index]:.4f}" for index in range(6)]
+        return " ".join(coefficients) + " cm"
+
     def transform(self, m: "Transformation") -> "Transformation":
         """
         Apply one transformation to another.
diff --git a/pypdf/_writer.py b/pypdf/_writer.py
@@ -55,7 +55,7 @@
 from ._cmap import _default_fonts_space_width, build_char_map_from_dict
 from ._doc_common import DocumentInformation, PdfDocCommon
 from ._encryption import EncryptAlgorithm, Encryption
-from ._page import PageObject
+from ._page import PageObject, Transformation
 from ._page_labels import nums_clear_range, nums_insert, nums_next
 from ._reader import PdfReader
 from ._utils import (
@@ -865,12 +865,102 @@ def append_pages_from_reader(
             if callable(after_page_append):
                 after_page_append(writer_page)
 
+    def _merge_content_stream_to_page(
+        self,
+        page: PageObject,
+        new_content_data: bytes,
+    ) -> None:
+        """
+        Append new content (as bytes) to the content of a page, in place.
+
+        The page's ``/Contents`` entry is updated; nothing is returned.
+
+        * If ``/Contents`` resolves to an array, a new stream holding
+          ``new_content_data`` is appended to that array.
+        * If it resolves to a single stream, it is replaced by a new stream
+          holding the old data, a newline and ``new_content_data``.
+        * Otherwise (no ``/Contents`` entry, or an entry that resolves to
+          neither an array nor a stream, such as a null object), a new
+          stream holding only ``new_content_data`` becomes the page content.
+
+        Args:
+            page: The page to which the new content data will be added.
+            new_content_data: A binary-encoded new content stream, for
+                instance the commands to draw an XObject.
+        """
+        contents_key = NameObject("/Contents")
+        existing_content = None
+        if contents_key in page:
+            # The entry may be stored as a reference; resolve it before
+            # inspecting its type.
+            existing_content = page[contents_key].get_object()
+
+        if isinstance(existing_content, ArrayObject):
+            # Keep the existing streams untouched and add one more entry.
+            appended_stream = StreamObject()
+            appended_stream.set_data(new_content_data)
+            existing_content.append(self._add_object(appended_stream))
+            page[contents_key] = self._add_object(existing_content)
+            return
+
+        if isinstance(existing_content, StreamObject):
+            # The newline keeps the last operator of the old content and the
+            # first token of the new content from running together.
+            stream_data = existing_content.get_data() + b"\n" + new_content_data
+        else:
+            # Nothing usable to merge with: previously the new content was
+            # silently dropped when the entry was neither array nor stream.
+            stream_data = new_content_data
+        replacement_stream = StreamObject()
+        replacement_stream.set_data(stream_data)
+        page[contents_key] = self._add_object(replacement_stream)
+
+    def _add_apstream_object(
+        self,
+        page: PageObject,
+        appearance_stream_obj: StreamObject,
+        object_name: str,
+        x_offset: float,
+        y_offset: float,
+        font_res: Optional[DictionaryObject] = None,
+    ) -> None:
+        """
+        Draw an appearance stream on a page by registering it as an XObject
+        in the page resources and appending the drawing commands to the
+        page content.
+
+        The XObject is registered under the sanitized name
+        ``/Fm_<object_name>``. If that name is already present in the page's
+        ``/XObject`` resources, a warning is logged and the existing entry
+        is kept.
+
+        Args:
+            page: The page to which to add the appearance stream.
+            appearance_stream_obj: The appearance stream.
+            object_name: The name of the appearance stream.
+            x_offset: The horizontal offset for the appearance stream.
+            y_offset: The vertical offset for the appearance stream.
+            font_res: The appearance stream's font resource (if given).
+        """
+        resources = cast(DictionaryObject, page[PG.RESOURCES])
+
+        # Make the font used by the appearance stream available on the page.
+        if font_res is not None:
+            base_font = font_res["/BaseFont"]  # [/"Name"] often also exists, but is deprecated
+            if "/Font" not in resources:
+                resources[NameObject("/Font")] = DictionaryObject()
+            page_fonts = cast(DictionaryObject, resources[NameObject("/Font")])
+            if base_font not in page_fonts:
+                page_fonts[NameObject(base_font)] = font_res
+
+        # Register the stream with *this* writer so that the reference stored
+        # in the page resources is one the writer manages.
+        stream_ref = self._add_object(appearance_stream_obj)
+        form_name = NameObject(f"/Fm_{object_name}")._sanitize()
+        if "/XObject" not in resources:
+            resources[NameObject("/XObject")] = DictionaryObject()
+        page_xobjects = cast(DictionaryObject, resources["/XObject"])
+        if form_name in page_xobjects:
+            # NOTE(review): the drawing commands below then refer to the entry
+            # that was registered earlier, not to ``appearance_stream_obj``.
+            # Confirm that this is intended.
+            logger_warning(
+                f"XObject {form_name!r} already added to page resources. This might be an issue.",
+                __name__
+            )
+        else:
+            page_xobjects[form_name] = stream_ref
+
+        # Save the graphics state, move to the offset, draw, then restore.
+        placement = Transformation().translate(x_offset, y_offset)
+        drawing_lines = ("q", placement._to_cm(), f"{form_name} Do", "Q")
+        self._merge_content_stream_to_page(page, "\n".join(drawing_lines).encode())
+
     def _update_field_annotation(
         self,
+        page: PageObject,
         field: DictionaryObject,
         annotation: DictionaryObject,
         font_name: str = "",
         font_size: float = -1,
+        flatten: bool = False,
     ) -> None:
         # Calculate rectangle dimensions
         _rct = cast(RectangleObject, annotation[AA.Rect])
@@ -1013,6 +1103,10 @@ def _update_field_annotation(
             self._objects[n - 1] = dct
             dct.indirect_reference = IndirectObject(n, 0, self)
 
+        if flatten:
+            field_name = self._get_qualified_field_name(annotation)
+            self._add_apstream_object(page, dct, field_name, _rct[0], _rct[1], font_res)
+
     FFBITS_NUL = FA.FfBits(0)
 
     def update_page_form_field_values(
@@ -1021,6 +1115,7 @@ def update_page_form_field_values(
         fields: Dict[str, Union[str, List[str], Tuple[str, str, float]]],
         flags: FA.FfBits = FFBITS_NUL,
         auto_regenerate: Optional[bool] = True,
+        flatten: bool = False,
     ) -> None:
         """
         Update the form field values for a given page from a fields dictionary.
@@ -1047,6 +1142,10 @@ def update_page_form_field_values(
             auto_regenerate: Set/unset the need_appearances flag;
                 the flag is unchanged if auto_regenerate is None.
 
+            flatten: Whether or not to flatten the annotation. If True, this adds the annotation's
+                appearance stream to the page contents. Note that this option does not remove the
+                annotation itself.
+
         """
         if CatalogDictionary.ACRO_FORM not in self._root_object:
             raise PyPdfError("No /AcroForm dictionary in PDF of PdfWriter Object")
@@ -1061,7 +1160,7 @@ def update_page_form_field_values(
         if isinstance(page, list):
             for p in page:
                 if PG.ANNOTS in p:  # just to prevent warnings
-                    self.update_page_form_field_values(p, fields, flags, None)
+                    self.update_page_form_field_values(p, fields, flags, None, flatten=flatten)
             return
         if PG.ANNOTS not in page:
             logger_warning("No fields to update on this page", __name__)
@@ -1090,35 +1189,43 @@ def update_page_form_field_values(
                     del parent_annotation["/I"]
                 if flags:
                     annotation[NameObject(FA.Ff)] = NumberObject(flags)
-                if isinstance(value, list):
-                    lst = ArrayObject(TextStringObject(v) for v in value)
-                    parent_annotation[NameObject(FA.V)] = lst
-                elif isinstance(value, tuple):
-                    annotation[NameObject(FA.V)] = TextStringObject(
-                        value[0],
-                    )
-                else:
-                    parent_annotation[NameObject(FA.V)] = TextStringObject(value)
+                if not (value is None and flatten):  # Only change values if given by user and not flattening.
+                    if isinstance(value, list):
+                        lst = ArrayObject(TextStringObject(v) for v in value)
+                        parent_annotation[NameObject(FA.V)] = lst
+                    elif isinstance(value, tuple):
+                        annotation[NameObject(FA.V)] = TextStringObject(
+                            value[0],
+                        )
+                    else:
+                        parent_annotation[NameObject(FA.V)] = TextStringObject(value)
                 if parent_annotation.get(FA.FT) == "/Btn":
                     # Checkbox button (no /FT found in Radio widgets)
                     v = NameObject(value)
                     ap = cast(DictionaryObject, annotation[NameObject(AA.AP)])
-                    if v not in cast(ArrayObject, ap[NameObject("/N")]):
+                    normal_ap = cast(DictionaryObject, ap["/N"])
+                    if v not in normal_ap:
                         v = NameObject("/Off")
+                    appearance_stream_obj = normal_ap.get(v)
                     # other cases will be updated through the for loop
                     annotation[NameObject(AA.AS)] = v
                     annotation[NameObject(FA.V)] = v
+                    if flatten and appearance_stream_obj is not None:
+                        # We basically copy the entire appearance stream, which should be an XObject that
+                        # is already registered. No need to add font resources.
+                        rct = cast(RectangleObject, annotation[AA.Rect])
+                        self._add_apstream_object(page, appearance_stream_obj, field, rct[0], rct[1])
                 elif (
                     parent_annotation.get(FA.FT) == "/Tx"
                     or parent_annotation.get(FA.FT) == "/Ch"
                 ):
                     # textbox
                     if isinstance(value, tuple):
                         self._update_field_annotation(
-                            parent_annotation, annotation, value[1], value[2]
+                            page, parent_annotation, annotation, value[1], value[2], flatten=flatten
                         )
                     else:
-                        self._update_field_annotation(parent_annotation, annotation)
+                        self._update_field_annotation(page, parent_annotation, annotation, flatten=flatten)
                 elif (
                     annotation.get(FA.FT) == "/Sig"
                 ):  # deprecated  # not implemented yet
diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py
@@ -841,6 +841,21 @@ def renumber(self) -> bytes:
                     out += c.encode("utf-8")
         return out
 
+    def _sanitize(self) -> "NameObject":
+        """
+        Build a copy of this name that only uses a restricted character set.
+
+        The first character of the name (the leading forward slash) is
+        dropped, then every remaining character that is not an ASCII
+        letter, a digit, an underscore or a hyphen is replaced by an
+        underscore. Spaces are among the replaced characters.
+
+        Returns:
+            NameObject with sanitized name.
+        """
+        raw_name = str(self)[1:]  # Remove leading forward slash
+        # A space is outside the allowed character class, so one substitution
+        # handles it together with every other disallowed character; the
+        # former separate pass for spaces was redundant.
+        return NameObject("/" + re.sub(r"[^a-zA-Z0-9_-]", "_", raw_name))
+
     @classproperty
     def surfix(cls) -> bytes:  # noqa: N805
         deprecate_with_replacement("surfix", "prefix", "6.0.0")
diff --git a/tests/test_writer.py b/tests/test_writer.py
@@ -23,7 +23,9 @@
 from pypdf.errors import PageSizeNotDefinedError, PyPdfError
 from pypdf.generic import (
     ArrayObject,
+    ByteStringObject,
     ContentStream,
+    DecodedStreamObject,
     Destination,
     DictionaryObject,
     Fit,
@@ -513,12 +515,12 @@ def test_fill_form(pdf_file_path):
     writer.append(RESOURCE_ROOT / "crazyones.pdf", [0])
 
     writer.update_page_form_field_values(
-        writer.pages[0], {"foo": "some filled in text"}, flags=1
+        writer.pages[0], {"foo": "some filled in text"}, flags=1, flatten=True
     )
 
     # check if no fields to fill in the page
     writer.update_page_form_field_values(
-        writer.pages[1], {"foo": "some filled in text"}, flags=1
+        writer.pages[1], {"foo": "some filled in text"}, flags=1, flatten=True
     )
 
     writer.update_page_form_field_values(
@@ -1526,13 +1528,21 @@ def test_update_form_fields(tmp_path):
             "DropList1": "DropListe3",
         },
         auto_regenerate=False,
+        flatten=True,
     )
     del writer.pages[0]["/Annots"][1].get_object()["/AP"]["/N"]
+    del writer.pages[0]["/Resources"]["/Font"]
     writer.update_page_form_field_values(
         writer.pages[0],
         {"Text1": "my Text1", "Text2": "ligne1\nligne2\nligne3"},
         auto_regenerate=False,
     )
+    writer.update_page_form_field_values(
+        writer.pages[0],
+        {"Text1": None, "Text2": None},
+        auto_regenerate=False,
+        flatten=True,
+    )
 
     writer.write(write_data_here)
     reader = PdfReader(write_data_here)
@@ -1575,11 +1585,71 @@ def test_update_form_fields(tmp_path):
         None,
         {"Text1": "my Text1", "Text2": "ligne1\nligne2\nligne3"},
         auto_regenerate=False,
+        flatten=True
     )
 
     Path(write_data_here).unlink()
 
 
+def test_add_apstream_object():
+    """Check that an appearance stream is registered as an XObject and drawn on a page without content."""
+    writer = PdfWriter()
+    blank_page = writer.add_blank_page(1000, 1000)
+    contents_key = NameObject("/Contents")
+    assert contents_key not in blank_page
+
+    form_dictionary = {
+        NameObject("/Type"): NameObject("/XObject"),
+        NameObject("/Subtype"): NameObject("/Form"),
+        NameObject("/BBox"): RectangleObject([0.0, 0.0, 10.5, 10.5]),
+        "__streamdata__": ByteStringObject(b"BT /F1 12 Tf (Hello World) Tj ET"),
+    }
+    form_stream = DecodedStreamObject.initialize_from_dictionary(form_dictionary)
+    writer._add_object(form_stream)
+
+    # The object name contains characters that have to be replaced when the
+    # resource name is sanitized.
+    writer._add_apstream_object(blank_page, form_stream, "AA2342!@#$% ^^##aa:-)", 200, 200)
+
+    page_resources = blank_page[NameObject("/Resources")]
+    assert NameObject("/XObject") in page_resources
+    assert "/Fm_AA2342__________aa_-_" in page_resources[NameObject("/XObject")]
+    assert contents_key in blank_page
+    content_stream = blank_page[contents_key].get_object()
+    assert isinstance(content_stream, StreamObject)
+    expected_ops = b"q\n1.0000 0.0000 0.0000 1.0000 200.0000 200.0000 cm\n/Fm_AA2342__________aa_-_ Do\nQ"
+    assert content_stream.get_data() == expected_ops
+
+
+def test_merge_content_stream_to_page():
+    """
+    Test that new content data is correctly added to page contents
+    in the form of a StreamObject or an ArrayObject.
+
+    The test_add_apstream_object code already checks that
+    _merge_content_stream_to_page works for an empty page.
+    """
+    writer = PdfWriter()
+    page = writer.add_blank_page(100, 100)
+    contents_key = NameObject("/Contents")
+    first_chunk = b"BT /F1 12 Tf (Hello World) Tj ET"
+    second_chunk = b"BT /F1 12 Tf (Hello Again, World) Tj ET"
+
+    # Single stream: the second call merges with the stream left by the first.
+    writer._merge_content_stream_to_page(page, first_chunk)
+    writer._merge_content_stream_to_page(page, second_chunk)
+    merged_stream = page[contents_key].get_object()
+    assert isinstance(merged_stream, StreamObject)
+    assert merged_stream.get_data() == b"BT /F1 12 Tf (Hello World) Tj ET\nBT /F1 12 Tf (Hello Again, World) Tj ET"
+
+    # Array of streams: the new content becomes an additional array entry.
+    seed_stream = StreamObject()
+    seed_stream.set_data(first_chunk)
+    stream_list = ArrayObject()
+    stream_list.append(seed_stream)
+    page[contents_key] = writer._add_object(stream_list)
+    writer._merge_content_stream_to_page(page, second_chunk)
+    stream_array = page[contents_key].get_object()
+    assert isinstance(stream_array, ArrayObject)
+    assert stream_array[0].get_object().get_data() == first_chunk
+    assert stream_array[1].get_object().get_data() == second_chunk
+
+
 @pytest.mark.enable_socket
 def test_update_form_fields2():
     my_files = {