TST: Improve text extraction coverage (#3353)

stefan6419846 · web-flow · commit 6b52a0da34a2 · 2025-07-02T16:29:26.000+02:00
diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -1657,10 +1657,7 @@ def _debug_for_extract(self) -> str:  # pragma: no cover
             out += "No Font\n"
         return out
 
-
-
-
-    def _extract_text(
+    def _extract_text(  # noqa: C901, PLR0915  # Will be fixed soon.
         self,
         obj: Any,
         pdf: Any,
@@ -1704,9 +1701,12 @@ def _extract_text(
             # file as not damaged, no need to check for TJ or Tj
             return ""
 
-        if "/Font" in resources_dict and (font := resources_dict["/Font"]):
+        if not is_null_or_none(resources_dict) and "/Font" in resources_dict and (font := resources_dict["/Font"]):
             for f in cast(DictionaryObject, font):
-                cmaps[f] = build_char_map(f, space_width, obj)
+                try:
+                    cmaps[f] = build_char_map(f, space_width, obj)
+                except TypeError:
+                    pass
         cmap: Tuple[
             Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
         ] = (
diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
@@ -414,3 +414,31 @@ def test_rotated_layout_mode(caplog):
     assert not caplog.records, "No warnings should be issued"
     assert text, "Text matching the page rotation should be extracted"
     assert re.search(r"\r?\n +69\r?\n +UNCLASSIFIED$", text), "Contents should be in expected layout"
+
+
+@pytest.mark.enable_socket
+@pytest.mark.filterwarnings("ignore::pypdf.errors.PdfReadWarning")
+def test_extract_text__none_objects():
+    url = "https://github.com/user-attachments/files/18381726/tika-957721.pdf"
+    name = "tika-957721.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+
+    reader.pages[0].extract_text()
+    reader.pages[8].extract_text()
+
+
+@pytest.mark.enable_socket
+def test_extract_text__with_visitor_text():
+    def visitor_text(*args, **kwargs):  # noqa: ANN002, ANN003, ANN202
+        pass
+
+    url = "https://github.com/user-attachments/files/18381718/tika-952016.pdf"
+    name = "tika-952016.pdf"
+    stream = BytesIO(get_data_from_url(url, name=name))
+    reader = PdfReader(stream)
+    page = reader.pages[0]
+    page.extract_text(visitor_text=visitor_text)
+
+    reader = PdfReader(BytesIO(get_data_from_url(name="TextAttack_paper.pdf")))
+    page = reader.pages[0]
+    page.extract_text()