From 8df2dfa1d79f0a710168da7956722982b50858b5 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 30 Jul 2023 15:02:51 +0200 Subject: [PATCH 01/14] ENH : Process /uniHHHH for text_extract /uniHHHH glyphs seems to be generated in laTeX but is ok for other characters addressed partially in #2016 --- pypdf/_cmap.py | 15 ++++++++++++--- tests/test_cmap.py | 10 ++++++++++ tests/test_encryption.py | 22 +++++++++++++++++----- 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 14c1e229c..788a1ac19 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -461,10 +461,19 @@ def type1_alternative( continue try: i = int(words[1]) - v = adobe_glyphs[words[2].decode()] - except (ValueError, KeyError): + except ValueError: continue - if v == " ": + try: + v = adobe_glyphs[words[2].decode()] + except KeyError: + if words[2].startswith(b"/uni"): + try: + v = chr(int(words[2][4:], 16)) + except ValueError: + continue + else: + continue + if words[2].decode() == b" ": space_code = i map_dict[chr(i)] = v int_entry.append(i) diff --git a/tests/test_cmap.py b/tests/test_cmap.py index f74da326d..6e7448651 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -179,3 +179,13 @@ def test_latex(): for pat in ("α", "β", "γ", "ϕ", "φ", "ℏ", "∫", "∂", "·", "×"): assert pat in txt # actually the ϕ and φ seems to be crossed in latex + + +@pytest.mark.enable_socket() +def test_unixxx_glyphs(): + url = "https://arxiv.org/pdf/2201.00021.pdf" + name = "unixxx_glyphs.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + txt = reader.pages[0].extract_text() # no error + for pat in ("闫耀庭", "龚龑", "张江水", "1′′.2"): + assert pat in txt diff --git a/tests/test_encryption.py b/tests/test_encryption.py index ff33d2121..6641977a4 100644 --- a/tests/test_encryption.py +++ b/tests/test_encryption.py @@ -125,7 +125,10 @@ def test_encryption(name, requires_pycryptodome): ("r6-both-passwords.pdf", "foo", "bar"), ], ) -@pytest.mark.skipif(not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY, reason="No pycryptodome / cryptography") +@pytest.mark.skipif( + not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY, + reason="No pycryptodome / cryptography", +) def test_pdf_with_both_passwords(name, user_passwd, owner_passwd): """ PDFs with both user and owner passwords are handled correctly. @@ -151,7 +154,10 @@ def test_pdf_with_both_passwords(name, user_passwd, owner_passwd): ("crazyones-encrypted-256.pdf", b"password"), ], ) -@pytest.mark.skipif(not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY, reason="No pycryptodome / cryptography") +@pytest.mark.skipif( + not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY, + reason="No pycryptodome / cryptography", +) def test_read_page_from_encrypted_file_aes_256(pdffile, password): """ A page can be read from an encrypted. @@ -176,7 +182,10 @@ def test_read_page_from_encrypted_file_aes_256(pdffile, password): ), ], ) -@pytest.mark.skipif(not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY, reason="No pycryptodome / cryptography") +@pytest.mark.skipif( + not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY, + reason="No pycryptodome / cryptography", +) @pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_merge_encrypted_pdfs(names): """Encrypted PDFs can be merged after decryption.""" @@ -193,7 +202,7 @@ def test_merge_encrypted_pdfs(names): @pytest.mark.skipif( HAS_CRYPTOGRAPHY, - reason="Limitations of cryptography. see https://github.com/pyca/cryptography/issues/2494" + reason="Limitations of cryptography. see https://github.com/pyca/cryptography/issues/2494", ) @pytest.mark.parametrize( "cryptcls", @@ -346,7 +355,10 @@ def test_pdf_encrypt_multiple(pdf_file_path, count): assert text0 == text1 -@pytest.mark.skipif(not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY, reason="No pycryptodome / cryptography") +@pytest.mark.skipif( + not HAS_PYCRYPTODOME and not HAS_CRYPTOGRAPHY, + reason="No pycryptodome / cryptography", +) def test_aes_decrypt_corrupted_data(): """Just for robustness""" aes = CryptAES(secrets.token_bytes(16)) From 21af042a3fe6d57fd677af3162dccc169d68a536 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 30 Jul 2023 15:35:13 +0200 Subject: [PATCH 02/14] coverage --- pypdf/_cmap.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 788a1ac19..6400f89b1 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -461,7 +461,7 @@ def type1_alternative( continue try: i = int(words[1]) - except ValueError: + except ValueError: # pragma: no cover continue try: v = adobe_glyphs[words[2].decode()] @@ -469,7 +469,7 @@ def type1_alternative( if words[2].startswith(b"/uni"): try: v = chr(int(words[2][4:], 16)) - except ValueError: + except ValueError: # pragma: no cover continue else: continue From 1b78427de2c19c2151b38ff3e6e47cb29c6ead93 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 30 Jul 2023 21:22:56 +0200 Subject: [PATCH 03/14] BUG : writing german characters into fields closes #2035 closes #2021 --- pypdf/_cmap.py | 29 ++++++++++++++---- pypdf/_writer.py | 70 ++++++++++++++++++++++++++++++++++++-------- tests/test_writer.py | 23 +++++++++++++++ 3 files changed, 104 insertions(+), 18 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 6400f89b1..b5311c5fb 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -17,15 +17,35 @@ def build_char_map( Determine information about a font. Args: - font_name: - space_width: - obj: + font_name: font name as a string + space_width: default space with if no data found. + obj: XObject or Page where you can find a /Resource dictionary Returns: - Font sub-type, space_width/2, encoding, map character-map, font-dictionary. + Font sub-type, space_width criteria(50% of width), encoding, map character-map, font-dictionary. The font-dictionary itself is suitable for the curious. """ ft: DictionaryObject = obj["/Resources"]["/Font"][font_name] # type: ignore + font_subtype, font_halfspace, font_encoding, font_map = build_char_map_from_dict( + space_width, ft + ) + return font_subtype, font_halfspace, font_encoding, font_map, ft + + +def build_char_map_from_dict( + space_width: float, ft: DictionaryObject +) -> Tuple[str, float, Union[str, Dict[int, str]], Dict]: + """ + Determine information about a font. + + Args: + space_width: default space with if no data found (normally half width of char. + ft: Font Dictionary + + Returns: + Font sub-type, space_width criteria(50% of width), encoding, map character-map. + The font-dictionary itself is suitable for the curious. + """ font_type: str = cast(str, ft["/Subtype"]) space_code = 32 @@ -73,7 +93,6 @@ def build_char_map( encoding, # https://github.com/python/mypy/issues/4374 map_dict, - ft, ) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index ec4896894..7054fb62a 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -54,6 +54,7 @@ cast, ) +from ._cmap import build_char_map_from_dict from ._encryption import EncryptAlgorithm, Encryption from ._page import PageObject, _VirtualList from ._page_labels import nums_clear_range, nums_insert, nums_next @@ -847,6 +848,47 @@ def _update_text_field(self, field: DictionaryObject) -> None: da = " ".join(font_properties) y_offset = rct.height - 1 - font_height + # Retrieve font information from local DR ... + dr: Any = cast(dict, cast(DictionaryObject, field.get("/DR", {}))) + if isinstance(dr, IndirectObject): + dr = dr.get_object() + dr = dr.get("/Font", {}) + if isinstance(dr, IndirectObject): + dr = dr.get_object() + if font_name not in dr: + # ...or AcroForm dictionary + dr = cast( + dict, + cast(DictionaryObject, self._root_object["/AcroForm"]).get("/DR", {}), + ) + if isinstance(dr, IndirectObject): + dr = dr.get_object() + dr = dr.get("/Font", {}) + if isinstance(dr, IndirectObject): + dr = dr.get_object() + font_res = dr.get(font_name) + if font_res is not None: + font_res = cast(DictionaryObject, font_res.get_object()) + font_subtype, _, font_encoding, font_map = build_char_map_from_dict( + 200, font_res + ) + font_full_rev: dict[str, int] + if isinstance(font_encoding, str): + if font_encoding not in ("charmap", "utf-16-be"): + logger_warning( + f"unexpected {font_encoding} : please share pdf with pypdf dev team", + __name__, + ) + font_full_rev = {v: k for k, v in font_map.items()} + else: + font_full_rev = {v: k for k, v in font_encoding.items()} + font_encoding_rev = {v: k for k, v in font_encoding.items()} + for k, v in font_map.items(): + font_full_rev[v] = font_encoding_rev.get(k, ord(k)) + else: + logger_warning(f"can not find font dictionnary for {font_name}", __name__) + font_full_rev = {} + # Retrieve field text and selected values field_flags = field.get(FA.Ff, 0) if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0: @@ -872,7 +914,15 @@ def _update_text_field(self, field: DictionaryObject) -> None: else: # Td is a relative translation ap_stream += f"0 {- font_height * 1.4} Td\n".encode() - ap_stream += b"(" + str(line).encode("UTF-8") + b") Tj\n" + enc_line: list[Any] = [font_full_rev.get(c, ord(c)) for c in line] + if all(c > 255 for c in enc_line): + ap_stream += ( + b"<" + + b"".join(b"%04X" % x for x in line.encode("UTF-16-BE")) + + b"> Tj\n" + ) + else: + ap_stream += b"(" + bytes(enc_line) + b") Tj\n" ap_stream += b"ET\nQ\nEMC\nQ\n" # Create appearance dictionary @@ -886,22 +936,16 @@ def _update_text_field(self, field: DictionaryObject) -> None: } ) - # Retrieve font information from AcroForm dictionary - dr: Any = cast( - dict, cast(DictionaryObject, self._root_object["/AcroForm"]).get("/DR", {}) - ) - if isinstance(dr, IndirectObject): - dr = dr.get_object() - dr = dr.get("/Font", {}) - if isinstance(dr, IndirectObject): - dr = dr.get_object() - # Update Resources with font information if necessary - if font_name in dr: + if font_res is not None: dct[NameObject("/Resources")] = DictionaryObject( { NameObject("/Font"): DictionaryObject( - {NameObject(font_name): dr[font_name].indirect_reference} + { + NameObject(font_name): getattr( + font_res, "indirect_reference", font_res + ) + } ) } ) diff --git a/tests/test_writer.py b/tests/test_writer.py index ebeaf60e0..5c1dd51d3 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1582,3 +1582,26 @@ def test_missing_fields(pdf_file_path): writer.pages[0], {"foo": "some filled in text"}, flags=1 ) assert exc.value.args[0] == "No /Fields dictionary in Pdf in PdfWriter Object" + + +@pytest.mark.enable_socket() +def test_germanfields(): + """Cf #2035""" + url = "https://github.com/py-pdf/pypdf/files/12194195/test.pdf" + name = "germanfields.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + writer = PdfWriter(clone_from=reader) + form_fields = {"Text Box 1": "test æ ø å"} + writer.update_page_form_field_values( + writer.pages[0], form_fields, auto_regenerate=False + ) + bytes_stream = BytesIO() + writer.write(bytes_stream) + bytes_stream.seek(0) + reader2 = PdfReader(bytes_stream) + assert ( + b"test \xe6 \xf8 \xe5" + in reader2.get_fields()["Text Box 1"] + .indirect_reference.get_object()["/AP"]["/N"] + .get_data() + ) From 284da98a007d27d02b5fefdbee2c99cfd6bd0a1f Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 30 Jul 2023 21:41:37 +0200 Subject: [PATCH 04/14] mypy --- pypdf/_writer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 7054fb62a..f269ebdf3 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -872,7 +872,7 @@ def _update_text_field(self, field: DictionaryObject) -> None: font_subtype, _, font_encoding, font_map = build_char_map_from_dict( 200, font_res ) - font_full_rev: dict[str, int] + font_full_rev: Dict[str, int] if isinstance(font_encoding, str): if font_encoding not in ("charmap", "utf-16-be"): logger_warning( @@ -914,7 +914,7 @@ def _update_text_field(self, field: DictionaryObject) -> None: else: # Td is a relative translation ap_stream += f"0 {- font_height * 1.4} Td\n".encode() - enc_line: list[Any] = [font_full_rev.get(c, ord(c)) for c in line] + enc_line: List[Any] = [font_full_rev.get(c, ord(c)) for c in line] if all(c > 255 for c in enc_line): ap_stream += ( b"<" From 807212b9f2f40d79caf562d7420b80502ff56634 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 30 Jul 2023 22:40:14 +0200 Subject: [PATCH 05/14] Update pypdf/_cmap.py Co-authored-by: Martin Thoma --- pypdf/_cmap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index b5311c5fb..b9ab31941 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -18,7 +18,7 @@ def build_char_map( Args: font_name: font name as a string - space_width: default space with if no data found. + space_width: default space width if no data is found. obj: XObject or Page where you can find a /Resource dictionary Returns: From 3ef7e337f527f3e0a1c748f3c6185b6235efb6c1 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 30 Jul 2023 22:41:29 +0200 Subject: [PATCH 06/14] Update pypdf/_cmap.py Co-authored-by: Martin Thoma --- pypdf/_cmap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index b9ab31941..f7c03a100 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -22,7 +22,7 @@ def build_char_map( obj: XObject or Page where you can find a /Resource dictionary Returns: - Font sub-type, space_width criteria(50% of width), encoding, map character-map, font-dictionary. + Font sub-type, space_width criteria (50% of width), encoding, map character-map, font-dictionary. The font-dictionary itself is suitable for the curious. """ ft: DictionaryObject = obj["/Resources"]["/Font"][font_name] # type: ignore From 4fc16b6907cd0b163a1dc01c51b990089a9ebb58 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 30 Jul 2023 23:08:56 +0200 Subject: [PATCH 07/14] coverage --- pypdf/_writer.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index f269ebdf3..0e345888d 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -850,11 +850,9 @@ def _update_text_field(self, field: DictionaryObject) -> None: # Retrieve font information from local DR ... dr: Any = cast(dict, cast(DictionaryObject, field.get("/DR", {}))) - if isinstance(dr, IndirectObject): - dr = dr.get_object() - dr = dr.get("/Font", {}) - if isinstance(dr, IndirectObject): + if isinstance(dr, IndirectObject): # pragma: no cover dr = dr.get_object() + dr = dr.get("/Font", {}).get_object() if font_name not in dr: # ...or AcroForm dictionary dr = cast( @@ -863,9 +861,7 @@ def _update_text_field(self, field: DictionaryObject) -> None: ) if isinstance(dr, IndirectObject): dr = dr.get_object() - dr = dr.get("/Font", {}) - if isinstance(dr, IndirectObject): - dr = dr.get_object() + dr = dr.get("/Font", {}).get_object() font_res = dr.get(font_name) if font_res is not None: font_res = cast(DictionaryObject, font_res.get_object()) @@ -874,6 +870,7 @@ def _update_text_field(self, field: DictionaryObject) -> None: ) font_full_rev: Dict[str, int] if isinstance(font_encoding, str): + assert font_encoding in ("charmap", "utf-16-be") if font_encoding not in ("charmap", "utf-16-be"): logger_warning( f"unexpected {font_encoding} : please share pdf with pypdf dev team", @@ -886,7 +883,8 @@ def _update_text_field(self, field: DictionaryObject) -> None: for k, v in font_map.items(): font_full_rev[v] = font_encoding_rev.get(k, ord(k)) else: - logger_warning(f"can not find font dictionnary for {font_name}", __name__) + raise AssertionError("can not find font dictionary") + logger_warning(f"can not find font dictionary for {font_name}", __name__) font_full_rev = {} # Retrieve field text and selected values @@ -915,7 +913,7 @@ def _update_text_field(self, field: DictionaryObject) -> None: # Td is a relative translation ap_stream += f"0 {- font_height * 1.4} Td\n".encode() enc_line: List[Any] = [font_full_rev.get(c, ord(c)) for c in line] - if all(c > 255 for c in enc_line): + if any(c > 255 for c in enc_line): ap_stream += ( b"<" + b"".join(b"%04X" % x for x in line.encode("UTF-16-BE")) From 883f4396573f3761629a5255e0185463c02417c1 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 30 Jul 2023 23:32:38 +0200 Subject: [PATCH 08/14] fix --- pypdf/_writer.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 0e345888d..d817d1600 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -849,10 +849,11 @@ def _update_text_field(self, field: DictionaryObject) -> None: y_offset = rct.height - 1 - font_height # Retrieve font information from local DR ... - dr: Any = cast(dict, cast(DictionaryObject, field.get("/DR", {}))) - if isinstance(dr, IndirectObject): # pragma: no cover - dr = dr.get_object() - dr = dr.get("/Font", {}).get_object() + dr: Any = cast( + DictionaryObject, + cast(DictionaryObject, field.get("/DR", DictionaryObject())).get_object(), + ) + dr = dr.get("/Font", DictionaryObject()).get_object() if font_name not in dr: # ...or AcroForm dictionary dr = cast( @@ -861,7 +862,7 @@ def _update_text_field(self, field: DictionaryObject) -> None: ) if isinstance(dr, IndirectObject): dr = dr.get_object() - dr = dr.get("/Font", {}).get_object() + dr = dr.get("/Font", DictionaryObject()).get_object() font_res = dr.get(font_name) if font_res is not None: font_res = cast(DictionaryObject, font_res.get_object()) From c0fd10c634fe6dfa845d926f2cccd195d23d470e Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Fri, 4 Aug 2023 15:40:24 +0200 Subject: [PATCH 09/14] fix some cases with utf-16 --- pypdf/_writer.py | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 8958e0b6f..8e3788ee3 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -869,20 +869,20 @@ def _update_text_field(self, field: DictionaryObject) -> None: font_subtype, _, font_encoding, font_map = build_char_map_from_dict( 200, font_res ) - font_full_rev: Dict[str, int] + try: # get rid of width stored in -1 key + del font_map[-1] + except KeyError: + pass + font_full_rev: Dict[str, bytes] if isinstance(font_encoding, str): - assert font_encoding in ("charmap", "utf-16-be") - if font_encoding not in ("charmap", "utf-16-be"): - logger_warning( - f"unexpected {font_encoding} : please share pdf with pypdf dev team", - __name__, - ) - font_full_rev = {v: k for k, v in font_map.items()} + font_full_rev = { + v: k.encode(font_encoding) for k, v in font_map.items() + } else: - font_full_rev = {v: k for k, v in font_encoding.items()} - font_encoding_rev = {v: k for k, v in font_encoding.items()} - for k, v in font_map.items(): - font_full_rev[v] = font_encoding_rev.get(k, ord(k)) + font_full_rev = {v: bytes((k,)) for k, v in font_encoding.items()} + font_encoding_rev = {v: bytes((k,)) for k, v in font_encoding.items()} + for kk, v in font_map.items(): + font_full_rev[v] = font_encoding_rev.get(kk, kk) else: raise AssertionError("can not find font dictionary") logger_warning(f"can not find font dictionary for {font_name}", __name__) @@ -913,15 +913,13 @@ def _update_text_field(self, field: DictionaryObject) -> None: else: # Td is a relative translation ap_stream += f"0 {- font_height * 1.4} Td\n".encode() - enc_line: List[Any] = [font_full_rev.get(c, ord(c)) for c in line] - if any(c > 255 for c in enc_line): - ap_stream += ( - b"<" - + b"".join(b"%04X" % x for x in line.encode("UTF-16-BE")) - + b"> Tj\n" - ) + enc_line: List[bytes] = [ + font_full_rev.get(c, c.encode("utf-16-be")) for c in line + ] + if any(len(c) >= 2 for c in enc_line): + ap_stream += b"<" + (b"".join(enc_line)).hex().encode() + b"> Tj\n" else: - ap_stream += b"(" + bytes(enc_line) + b") Tj\n" + ap_stream += b"(" + b"".join(enc_line) + b") Tj\n" ap_stream += b"ET\nQ\nEMC\nQ\n" # Create appearance dictionary From 2b2b1cd2e8ea24cb29c0de01f4c509939f8c9678 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 1 Aug 2023 19:03:21 +0200 Subject: [PATCH 10/14] ENH : allow to change font name and size in fields update --- pypdf/_writer.py | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 8e3788ee3..5a73cee07 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -832,7 +832,9 @@ def _get_qualified_field_name(self, parent: DictionaryObject) -> Optional[str]: return qualified_parent + "." + cast(str, parent["/T"]) return cast(str, parent["/T"]) - def _update_text_field(self, field: DictionaryObject) -> None: + def _update_text_field( + self, field: DictionaryObject, fontname: str = "", fontsize: float = -1 + ) -> None: # Calculate rectangle dimensions _rct = cast(RectangleObject, field[AA.Rect]) rct = RectangleObject((0, 0, _rct[2] - _rct[0], _rct[3] - _rct[1])) @@ -840,10 +842,19 @@ def _update_text_field(self, field: DictionaryObject) -> None: # Extract font information da = cast(str, field[AA.DA]) font_properties = da.replace("\n", " ").replace("\r", " ").split(" ") - font_name = font_properties[font_properties.index("Tf") - 2] - font_height = float(font_properties[font_properties.index("Tf") - 1]) - if font_height == 0: - font_height = rct.height - 2 + font_name = ( + fontname if fontname else font_properties[font_properties.index("Tf") - 2] + ) + font_height = ( + fontsize + if fontsize >= 0 + else float(font_properties[font_properties.index("Tf") - 1]) + ) + if fontname or fontsize >= 0 or font_height == 0: + if fontname: + font_properties[font_properties.index("Tf") - 1] = fontname + if font_height == 0: + font_height = rct.height - 2 font_properties[font_properties.index("Tf") - 1] = str(font_height) da = " ".join(font_properties) y_offset = rct.height - 1 - font_height @@ -975,8 +986,14 @@ def update_page_form_field_values( Args: page: Page reference from PDF writer where the annotations and field data will be updated. - fields: a Python dictionary of field names (/T) and text - values (/V) + fields: a Python dictionary of : + a) field names (/T) as keys and text values (/V) as value + b) field names (/T) as keys and list of text values (/V) + for multiple choice list + c) field names (/T) as keys and tuple of : + * text values (/V) + * font name (must exist) + * font size (0 for autosize) flags: An integer (0 to 7). The first bit sets ReadOnly, the second bit sets Required, the third bit sets NoExport. See PDF Reference Table 8.70 for details. @@ -1012,6 +1029,10 @@ def update_page_form_field_values( if isinstance(value, list): lst = ArrayObject(TextStringObject(v) for v in value) writer_annot[NameObject(FA.V)] = lst + elif isinstance(value, tuple): + writer_annot[NameObject(FA.V)] = TextStringObject( + value[0], + ) else: writer_annot[NameObject(FA.V)] = TextStringObject(value) if writer_annot.get(FA.FT) in ("/Btn"): @@ -1033,7 +1054,10 @@ def update_page_form_field_values( if AA.DA in f: da = f[AA.DA] writer_annot[NameObject(AA.DA)] = da - self._update_text_field(writer_annot) + if isinstance(value, tuple): + self._update_text_field(writer_annot, value[1], value[2]) + else: + self._update_text_field(writer_annot) elif writer_annot.get(FA.FT) == "/Sig": # signature logger_warning("Signature forms not implemented yet", __name__) From 532f015395c938726877ff175dcbb43044b49444 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Fri, 4 Aug 2023 16:52:37 +0200 Subject: [PATCH 11/14] Update pypdf/_cmap.py Co-authored-by: Martin Thoma --- pypdf/_cmap.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index f7c03a100..04e6c2611 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -39,7 +39,8 @@ def build_char_map_from_dict( Determine information about a font. Args: - space_width: default space with if no data found (normally half width of char. + space_width: default space with if no data found + (normally half the width of a character). ft: Font Dictionary Returns: From 7306998c2ff2597ffeeeae487f5153241a35aae2 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Fri, 4 Aug 2023 16:59:17 +0200 Subject: [PATCH 12/14] coverage --- pypdf/_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 8e3788ee3..c20ff665c 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -860,7 +860,7 @@ def _update_text_field(self, field: DictionaryObject) -> None: dict, cast(DictionaryObject, self._root_object["/AcroForm"]).get("/DR", {}), ) - if isinstance(dr, IndirectObject): + if isinstance(dr, IndirectObject): # pragma: no cover dr = dr.get_object() dr = dr.get("/Font", DictionaryObject()).get_object() font_res = dr.get(font_name) From 6e23da5ee4ee200d3be081c36586fbcc2a013b04 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Fri, 4 Aug 2023 23:11:18 +0200 Subject: [PATCH 13/14] fix + test --- pypdf/_writer.py | 7 +++++-- tests/test_writer.py | 17 +++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 5a73cee07..3eb73cae3 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -852,7 +852,7 @@ def _update_text_field( ) if fontname or fontsize >= 0 or font_height == 0: if fontname: - font_properties[font_properties.index("Tf") - 1] = fontname + font_properties[font_properties.index("Tf") - 2] = fontname if font_height == 0: font_height = rct.height - 2 font_properties[font_properties.index("Tf") - 1] = str(font_height) @@ -930,7 +930,10 @@ def _update_text_field( if any(len(c) >= 2 for c in enc_line): ap_stream += b"<" + (b"".join(enc_line)).hex().encode() + b"> Tj\n" else: - ap_stream += b"(" + b"".join(enc_line) + b") Tj\n" + enc = b"".join(enc_line) + # for x in range(32): + # enc = enc.replace(bytes((x,)),b"\%03o"%x) + ap_stream += b"(" + enc + b") Tj\n" ap_stream += b"ET\nQ\nEMC\nQ\n" # Create appearance dictionary diff --git a/tests/test_writer.py b/tests/test_writer.py index 88f884e4a..7713d9b7c 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1612,3 +1612,20 @@ def test_germanfields(): .indirect_reference.get_object()["/AP"]["/N"] .get_data() ) + + +def test_selfont(): + writer = PdfWriter(clone_from=RESOURCE_ROOT / "FormTestFromOo.pdf") + writer.update_page_form_field_values( + writer.pages[0], + {"Text1": ("Text", "", 5), "Text2": ("Text", "/F1", 15)}, + auto_regenerate=False, + ) + assert ( + b"/F3 5 Tf" + in writer.pages[0]["/Annots"][1].get_object()["/AP"]["/N"].get_data() + ) + assert ( + b"/F1 15 Tf" + in writer.pages[0]["/Annots"][2].get_object()["/AP"]["/N"].get_data() + ) From defdcd47ddf319b3e4b0e332d3bad9114f496827 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 6 Aug 2023 12:51:51 +0200 Subject: [PATCH 14/14] simplify test --- pypdf/_writer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 6001942ea..c1dfb3aaf 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -895,7 +895,6 @@ def _update_text_field( for kk, v in font_map.items(): font_full_rev[v] = font_encoding_rev.get(kk, kk) else: - raise AssertionError("can not find font dictionary") logger_warning(f"can not find font dictionary for {font_name}", __name__) font_full_rev = {}