Skip to content

Commit 6b52a0d

Browse files
TST: Improve text extraction coverage (#3353)
1 parent f5c5747 commit 6b52a0d

File tree

2 files changed

+34 -6 lines changed

2 files changed

+34 -6 lines changed

pypdf/_page.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1657,10 +1657,7 @@ def _debug_for_extract(self) -> str: # pragma: no cover
1657 1657
out += "No Font\n"
1658 1658
return out
1659 1659

1660-
1661-
1662-
1663-
def _extract_text(
1660+
def _extract_text( # noqa: C901, PLR0915 # Will be fixed soon.
1664 1661
self,
1665 1662
obj: Any,
1666 1663
pdf: Any,
@@ -1704,9 +1701,12 @@ def _extract_text(
1704 1701
# file as not damaged, no need to check for TJ or Tj
1705 1702
return ""
1706 1703

1707-
if "/Font" in resources_dict and (font := resources_dict["/Font"]):
1704+
if not is_null_or_none(resources_dict) and "/Font" in resources_dict and (font := resources_dict["/Font"]):
1708 1705
for f in cast(DictionaryObject, font):
1709-
cmaps[f] = build_char_map(f, space_width, obj)
1706+
try:
1707+
cmaps[f] = build_char_map(f, space_width, obj)
1708+
except TypeError:
1709+
pass
1710 1710
cmap: Tuple[
1711 1711
Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
1712 1712
] = (

tests/test_text_extraction.py

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -414,3 +414,31 @@ def test_rotated_layout_mode(caplog):
414 414
assert not caplog.records, "No warnings should be issued"
415 415
assert text, "Text matching the page rotation should be extracted"
416 416
assert re.search(r"\r?\n +69\r?\n +UNCLASSIFIED$", text), "Contents should be in expected layout"
417+
418+
419+
@pytest.mark.enable_socket
420+
@pytest.mark.filterwarnings("ignore::pypdf.errors.PdfReadWarning")
421+
def test_extract_text__none_objects():
422+
url = "https://github.com/user-attachments/files/18381726/tika-957721.pdf"
423+
name = "tika-957721.pdf"
424+
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
425+
426+
reader.pages[0].extract_text()
427+
reader.pages[8].extract_text()
428+
429+
430+
@pytest.mark.enable_socket
431+
def test_extract_text__with_visitor_text():
432+
def visitor_text(*args, **kwargs): # noqa: ANN002, ANN003, ANN202
433+
pass
434+
435+
url = "https://github.com/user-attachments/files/18381718/tika-952016.pdf"
436+
name = "tika-952016.pdf"
437+
stream = BytesIO(get_data_from_url(url, name=name))
438+
reader = PdfReader(stream)
439+
page = reader.pages[0]
440+
page.extract_text(visitor_text=visitor_text)
441+
442+
reader = PdfReader(BytesIO(get_data_from_url(name="TextAttack_paper.pdf")))
443+
page = reader.pages[0]
444+
page.extract_text()

0 commit comments

Comments (0)