Skip to content

Commit 4657df5

Browse files
authored
BUG: catch the case where w[0] is an IndirectObject instead of an int (#2154)
Closes #2137
1 parent 0ca4d37 commit 4657df5

File tree

2 files changed

+13
-1
lines changed

2 files changed

+13
-1
lines changed

pypdf/_cmap.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -410,7 +410,7 @@ def compute_space_width(
410410
else:
411411
w = []
412412
while len(w) > 0:
413-
st = w[0]
413+
st = w[0] if isinstance(w[0], int) else w[0].get_object()
414414
second = w[1].get_object()
415415
if isinstance(second, int):
416416
for x in range(st, second):

tests/test_cmap.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,18 @@ def test_unixxx_glyphs():
191191
assert pat in txt
192192

193193

194+
@pytest.mark.enable_socket()
195+
def test_cmap_compute_space_width():
196+
# issue 2137
197+
# original file URL:
198+
url = "https://arxiv.org/pdf/2005.05909.pdf"
199+
# The URL from the GitHub issue is too long to pass the code style check, so the original arXiv URL is used instead.
200+
# url = "https://github.com/py-pdf/pypdf/files/12489914/Morris.et.al.-.2020.-.TextAttack.A.Framework.for.Adversarial.Attacks.Data.Augmentation.and.Adversarial.Training.in.NLP.pdf"
201+
name = "TextAttack_paper.pdf"
202+
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
203+
reader.pages[0].extract_text() # no error
204+
205+
194206
@pytest.mark.enable_socket()
195207
def test_tabs_in_cmap():
196208
"""Issue #2173"""

0 commit comments

Comments
 (0)