ENH: Accept utf strings for metadata (py-pdf#2802)

pubpub-zz · web-flow · commit 0c81f3cfad26 · 2024-08-16T11:52:19.000+02:00
Closes py-pdf#2754.
diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py
@@ -517,23 +517,38 @@ class TextStringObject(str, PdfObject):  # noqa: SLOT000
     autodetect_pdfdocencoding: bool
     autodetect_utf16: bool
     utf16_bom: bytes
+    _original_bytes: Optional[bytes] = None
 
     def __new__(cls, value: Any) -> "TextStringObject":
+        org = None
         if isinstance(value, bytes):
+            org = value
             value = value.decode("charmap")
         o = str.__new__(cls, value)
+        o._original_bytes = org
         o.autodetect_utf16 = False
         o.autodetect_pdfdocencoding = False
         o.utf16_bom = b""
         if value.startswith(("\xfe\xff", "\xff\xfe")):
+            assert org is not None  # for mypy
+            try:
+                o = str.__new__(cls, org.decode("utf-16"))
+            except UnicodeDecodeError as exc:
+                logger_warning(
+                    f"{exc!s}\ninitial string:{exc.object!r}",
+                    __name__,
+                )
+                o = str.__new__(cls, exc.object[: exc.start].decode("utf-16"))
+            o._original_bytes = org
             o.autodetect_utf16 = True
-            o.utf16_bom = value[:2].encode("charmap")
+            o.utf16_bom = org[:2]
         else:
             try:
                 encode_pdfdocencoding(o)
                 o.autodetect_pdfdocencoding = True
             except UnicodeEncodeError:
                 o.autodetect_utf16 = True
+                o.utf16_bom = codecs.BOM_UTF16_BE
         return o
 
     def clone(
@@ -544,6 +559,7 @@ def clone(
     ) -> "TextStringObject":
         """Clone object into pdf_dest."""
         obj = TextStringObject(self)
+        obj._original_bytes = self._original_bytes
         obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding
         obj.autodetect_utf16 = self.autodetect_utf16
         obj.utf16_bom = self.utf16_bom
@@ -559,7 +575,10 @@ def original_bytes(self) -> bytes:
         if that occurs, this "original_bytes" property can be used to
         back-calculate what the original encoded bytes were.
         """
-        return self.get_original_bytes()
+        if self._original_bytes is not None:
+            return self._original_bytes
+        else:
+            return self.get_original_bytes()
 
     def get_original_bytes(self) -> bytes:
         # We're a text string object, but the library is trying to get our raw
@@ -584,6 +603,8 @@ def get_encoded_bytes(self) -> bytes:
         # nicer to look at in the PDF file. Sadly, we take a performance hit
         # here for trying...
         try:
+            if self._original_bytes is not None:
+                return self._original_bytes
             if self.autodetect_utf16:
                 raise UnicodeEncodeError("", "forced", -1, -1, "")
             bytearr = encode_pdfdocencoding(self)
diff --git a/pypdf/generic/_utils.py b/pypdf/generic/_utils.py
@@ -148,27 +148,45 @@ def create_string_object(
                     out += forced_encoding[x]
                 except Exception:
                     out += bytes((x,)).decode("charmap")
-            return TextStringObject(out)
+            obj = TextStringObject(out)
+            obj._original_bytes = string
+            return obj
         elif isinstance(forced_encoding, str):
             if forced_encoding == "bytes":
                 return ByteStringObject(string)
-            return TextStringObject(string.decode(forced_encoding))
+            obj = TextStringObject(string.decode(forced_encoding))
+            obj._original_bytes = string
+            return obj
         else:
             try:
                 if string.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)):
                     retval = TextStringObject(string.decode("utf-16"))
+                    retval._original_bytes = string
                     retval.autodetect_utf16 = True
                     retval.utf16_bom = string[:2]
                     return retval
-                else:
-                    # This is probably a big performance hit here, but we need
-                    # to convert string objects into the text/unicode-aware
-                    # version if possible... and the only way to check if that's
-                    # possible is to try.
-                    # Some strings are strings, some are just byte arrays.
-                    retval = TextStringObject(decode_pdfdocencoding(string))
-                    retval.autodetect_pdfdocencoding = True
+                if string.startswith(b"\x00"):
+                    retval = TextStringObject(string.decode("utf-16be"))
+                    retval._original_bytes = string
+                    retval.autodetect_utf16 = True
+                    retval.utf16_bom = codecs.BOM_UTF16_BE
                     return retval
+                if string[1:2] == b"\x00":
+                    retval = TextStringObject(string.decode("utf-16le"))
+                    retval._original_bytes = string
+                    retval.autodetect_utf16 = True
+                    retval.utf16_bom = codecs.BOM_UTF16_LE
+                    return retval
+
+                # This is probably a big performance hit here, but we need
+                # to convert string objects into the text/unicode-aware
+                # version if possible... and the only way to check if that's
+                # possible is to try.
+                # Some strings are strings, some are just byte arrays.
+                retval = TextStringObject(decode_pdfdocencoding(string))
+                retval._original_bytes = string
+                retval.autodetect_pdfdocencoding = True
+                return retval
             except UnicodeDecodeError:
                 return ByteStringObject(string)
     else:
diff --git a/tests/test_generic.py b/tests/test_generic.py
@@ -494,6 +494,9 @@ def test_textstringobject_autodetect_utf16():
     tso.autodetect_utf16 = True
     tso.utf16_bom = codecs.BOM_UTF16_BE
     assert tso.get_original_bytes() == b"\xfe\xff\x00f\x00o\x00o"
+    tso.utf16_bom = codecs.BOM_UTF16_LE
+    assert tso.get_original_bytes() == b"\xff\xfef\x00o\x00o\x00"
+    assert tso.get_encoded_bytes() == b"\xff\xfef\x00o\x00o\x00"
 
 
 def test_remove_child_not_in_tree():
@@ -1131,6 +1134,16 @@ def test_create_string_object_utf16_bom():
         result.get_encoded_bytes()
         == b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00"
     )
+    result = TextStringObject(
+        b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00"
+    )
+    assert result == "PaperPort 14\x00"
+    assert result.autodetect_utf16 is True
+    assert result.utf16_bom == b"\xff\xfe"
+    assert (
+        result.get_encoded_bytes()
+        == b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00"
+    )
 
     # utf16-be without bom
     result = TextStringObject("ÿ")
diff --git a/tests/test_writer.py b/tests/test_writer.py
@@ -2333,3 +2333,24 @@ def test_set_need_appearances_writer():
     """Minimal test for coverage"""
     writer = PdfWriter()
     writer.set_need_appearances_writer()
+
+
+def test_utf16_metadata():
+    """See #2754"""
+    writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf")
+    writer.add_metadata(
+        {
+            "/Subject": "Invoice №AI_047",
+        }
+    )
+    b = BytesIO()
+    writer.write(b)
+    b.seek(0)
+    reader = PdfReader(b)
+    assert reader.metadata.subject == "Invoice №AI_047"
+    bb = b.getvalue()
+    i = bb.find(b"/Subject")
+    assert bb[i : i + 100] == (
+        b"/Subject (\\376\\377\\000I\\000n\\000v\\000o\\000i\\000c\\000e"
+        b"\\000 \\041\\026\\000A\\000I\\000\\137\\0000\\0004\\0007)"
+    )