Skip to content

Commit 0c81f3c

Browse files
authored
ENH: Accept utf strings for metadata (py-pdf#2802)
Closes py-pdf#2754.
1 parent: 454a62a · commit: 0c81f3c

File tree

4 files changed: 85 additions and 12 deletions.

pypdf/generic/_base.py

Lines changed: 23 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -517,23 +517,38 @@ class TextStringObject(str, PdfObject): # noqa: SLOT000
517517
autodetect_pdfdocencoding: bool
518518
autodetect_utf16: bool
519519
utf16_bom: bytes
520+
_original_bytes: Optional[bytes] = None
520521

521522
def __new__(cls, value: Any) -> "TextStringObject":
523+
org = None
522524
if isinstance(value, bytes):
525+
org = value
523526
value = value.decode("charmap")
524527
o = str.__new__(cls, value)
528+
o._original_bytes = org
525529
o.autodetect_utf16 = False
526530
o.autodetect_pdfdocencoding = False
527531
o.utf16_bom = b""
528532
if value.startswith(("\xfe\xff", "\xff\xfe")):
533+
assert org is not None # for mypy
534+
try:
535+
o = str.__new__(cls, org.decode("utf-16"))
536+
except UnicodeDecodeError as exc:
537+
logger_warning(
538+
f"{exc!s}\ninitial string:{exc.object!r}",
539+
__name__,
540+
)
541+
o = str.__new__(cls, exc.object[: exc.start].decode("utf-16"))
542+
o._original_bytes = org
529543
o.autodetect_utf16 = True
530-
o.utf16_bom = value[:2].encode("charmap")
544+
o.utf16_bom = org[:2]
531545
else:
532546
try:
533547
encode_pdfdocencoding(o)
534548
o.autodetect_pdfdocencoding = True
535549
except UnicodeEncodeError:
536550
o.autodetect_utf16 = True
551+
o.utf16_bom = codecs.BOM_UTF16_BE
537552
return o
538553

539554
def clone(
@@ -544,6 +559,7 @@ def clone(
544559
) -> "TextStringObject":
545560
"""Clone object into pdf_dest."""
546561
obj = TextStringObject(self)
562+
obj._original_bytes = self._original_bytes
547563
obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding
548564
obj.autodetect_utf16 = self.autodetect_utf16
549565
obj.utf16_bom = self.utf16_bom
@@ -559,7 +575,10 @@ def original_bytes(self) -> bytes:
559575
if that occurs, this "original_bytes" property can be used to
560576
back-calculate what the original encoded bytes were.
561577
"""
562-
return self.get_original_bytes()
578+
if self._original_bytes is not None:
579+
return self._original_bytes
580+
else:
581+
return self.get_original_bytes()
563582

564583
def get_original_bytes(self) -> bytes:
565584
# We're a text string object, but the library is trying to get our raw
@@ -584,6 +603,8 @@ def get_encoded_bytes(self) -> bytes:
584603
# nicer to look at in the PDF file. Sadly, we take a performance hit
585604
# here for trying...
586605
try:
606+
if self._original_bytes is not None:
607+
return self._original_bytes
587608
if self.autodetect_utf16:
588609
raise UnicodeEncodeError("", "forced", -1, -1, "")
589610
bytearr = encode_pdfdocencoding(self)

pypdf/generic/_utils.py

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -148,27 +148,45 @@ def create_string_object(
148148
out += forced_encoding[x]
149149
except Exception:
150150
out += bytes((x,)).decode("charmap")
151-
return TextStringObject(out)
151+
obj = TextStringObject(out)
152+
obj._original_bytes = string
153+
return obj
152154
elif isinstance(forced_encoding, str):
153155
if forced_encoding == "bytes":
154156
return ByteStringObject(string)
155-
return TextStringObject(string.decode(forced_encoding))
157+
obj = TextStringObject(string.decode(forced_encoding))
158+
obj._original_bytes = string
159+
return obj
156160
else:
157161
try:
158162
if string.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)):
159163
retval = TextStringObject(string.decode("utf-16"))
164+
retval._original_bytes = string
160165
retval.autodetect_utf16 = True
161166
retval.utf16_bom = string[:2]
162167
return retval
163-
else:
164-
# This is probably a big performance hit here, but we need
165-
# to convert string objects into the text/unicode-aware
166-
# version if possible... and the only way to check if that's
167-
# possible is to try.
168-
# Some strings are strings, some are just byte arrays.
169-
retval = TextStringObject(decode_pdfdocencoding(string))
170-
retval.autodetect_pdfdocencoding = True
168+
if string.startswith(b"\x00"):
169+
retval = TextStringObject(string.decode("utf-16be"))
170+
retval._original_bytes = string
171+
retval.autodetect_utf16 = True
172+
retval.utf16_bom = codecs.BOM_UTF16_BE
171173
return retval
174+
if string[1:2] == b"\x00":
175+
retval = TextStringObject(string.decode("utf-16le"))
176+
retval._original_bytes = string
177+
retval.autodetect_utf16 = True
178+
retval.utf16_bom = codecs.BOM_UTF16_LE
179+
return retval
180+
181+
# This is probably a big performance hit here, but we need
182+
# to convert string objects into the text/unicode-aware
183+
# version if possible... and the only way to check if that's
184+
# possible is to try.
185+
# Some strings are strings, some are just byte arrays.
186+
retval = TextStringObject(decode_pdfdocencoding(string))
187+
retval._original_bytes = string
188+
retval.autodetect_pdfdocencoding = True
189+
return retval
172190
except UnicodeDecodeError:
173191
return ByteStringObject(string)
174192
else:

tests/test_generic.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,9 @@ def test_textstringobject_autodetect_utf16():
494494
tso.autodetect_utf16 = True
495495
tso.utf16_bom = codecs.BOM_UTF16_BE
496496
assert tso.get_original_bytes() == b"\xfe\xff\x00f\x00o\x00o"
497+
tso.utf16_bom = codecs.BOM_UTF16_LE
498+
assert tso.get_original_bytes() == b"\xff\xfef\x00o\x00o\x00"
499+
assert tso.get_encoded_bytes() == b"\xff\xfef\x00o\x00o\x00"
497500

498501

499502
def test_remove_child_not_in_tree():
@@ -1131,6 +1134,16 @@ def test_create_string_object_utf16_bom():
11311134
result.get_encoded_bytes()
11321135
== b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00"
11331136
)
1137+
result = TextStringObject(
1138+
b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00"
1139+
)
1140+
assert result == "PaperPort 14\x00"
1141+
assert result.autodetect_utf16 is True
1142+
assert result.utf16_bom == b"\xff\xfe"
1143+
assert (
1144+
result.get_encoded_bytes()
1145+
== b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00"
1146+
)
11341147

11351148
# utf16-be without bom
11361149
result = TextStringObject("ÿ")

tests/test_writer.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2333,3 +2333,24 @@ def test_set_need_appearances_writer():
23332333
"""Minimal test for coverage"""
23342334
writer = PdfWriter()
23352335
writer.set_need_appearances_writer()
2336+
2337+
2338+
def test_utf16_metadata():
2339+
"""See #2754"""
2340+
writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf")
2341+
writer.add_metadata(
2342+
{
2343+
"/Subject": "Invoice №AI_047",
2344+
}
2345+
)
2346+
b = BytesIO()
2347+
writer.write(b)
2348+
b.seek(0)
2349+
reader = PdfReader(b)
2350+
assert reader.metadata.subject == "Invoice №AI_047"
2351+
bb = b.getvalue()
2352+
i = bb.find(b"/Subject")
2353+
assert bb[i : i + 100] == (
2354+
b"/Subject (\\376\\377\\000I\\000n\\000v\\000o\\000i\\000c\\000e"
2355+
b"\\000 \\041\\026\\000A\\000I\\000\\137\\0000\\0004\\0007)"
2356+
)

0 commit comments

Comments
 (0)