Skip to content

Commit 4dd3fa0

Browse files
committed
Merge branch 'feat/add-brotli-decode' of https://github.com/ash01ish/pypdf into feat/add-brotli-decode
2 parents 89d2a86 + 5454636 commit 4dd3fa0

File tree

5 files changed

+46
-29
lines changed

5 files changed

+46
-29
lines changed

docs/modules/constants.rst

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,3 @@ Constants
2626
:undoc-members:
2727
:exclude-members: FT, Parent, Kids, T, TU, TM, V, DV, AA, Opt, attributes, attributes_dict
2828
:show-inheritance:
29-
30-
.. autoclass:: pypdf.constants.FilterTypes
31-
:members:
32-
:undoc-members:
33-
34-
.. autoclass:: pypdf.constants.FilterTypeAbbreviations
35-
:members:
36-
:undoc-members:
37-
:show-inheritance:

pypdf/_doc_common.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,8 @@ class PdfDocCommon:
270270

271271
strict: bool = False # default
272272

273+
flattened_pages: Optional[List[PageObject]] = None
274+
273275
_encryption: Optional[Encryption] = None
274276

275277
_readonly: bool = False
@@ -333,8 +335,6 @@ def viewer_preferences(self) -> Optional[ViewerPreferences]:
333335
self.root_object[NameObject(CD.VIEWER_PREFERENCES)] = o
334336
return o
335337

336-
flattened_pages: Optional[List[PageObject]] = None
337-
338338
def get_num_pages(self) -> int:
339339
"""
340340
Calculate the number of pages in this PDF file.
@@ -1128,7 +1128,16 @@ def _flatten(
11281128
indirect_reference: Optional[IndirectObject] = None,
11291129
) -> None:
11301130
"""
1131-
Prepare the document pages to ease searching
1131+
Process the document pages to ease searching.
1132+
1133+
Attributes of a page may inherit from ancestor nodes
1134+
in the page tree. Flattening means moving
1135+
any inheritance data into descendant nodes,
1136+
effectively removing the inheritance dependency.
1137+
1138+
Note: This is distinct from the other common use of "flattening" applied to PDFs.
1139+
Flattening a PDF also means combining all the contents into one single layer
1140+
and making the file less editable.
11321141
11331142
Args:
11341143
list_only: Will only list the pages within flattened_pages.
@@ -1156,7 +1165,7 @@ def _flatten(
11561165

11571166
if PA.TYPE in pages:
11581167
t = cast(str, pages[PA.TYPE])
1159-
# if pdf has no type, considered as a page if /Kids is missing
1168+
# if the page tree node has no /Type, consider it a page if /Kids is also missing
11601169
elif PA.KIDS not in pages:
11611170
t = "/Page"
11621171
else:
@@ -1181,8 +1190,8 @@ def _flatten(
11811190
)
11821191
elif t == "/Page":
11831192
for attr_in, value in inherit.items():
1184-
# if the page has it's own value, it does not inherit the
1185-
# parent's value:
1193+
# if the page has its own value, it does not inherit the
1194+
# parent's value
11861195
if attr_in not in pages:
11871196
pages[attr_in] = value
11881197
page_obj = PageObject(self, indirect_reference)

pypdf/_page.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2009,8 +2009,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
20092009
# A special case is a translating only tm:
20102010
# tm = [1, 0, 0, 1, e, f]
20112011
# i.e. tm[4] += tx, tm[5] += ty.
2012-
tx = float(operands[0])
2013-
ty = float(operands[1])
2012+
tx, ty = float(operands[0]), float(operands[1])
20142013
tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2]
20152014
tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3]
20162015
str_widths = compute_str_widths(_actual_str_size["str_widths"])
@@ -2022,7 +2021,10 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
20222021
_actual_str_size["str_widths"] = 0.0
20232022
elif operator == b"T*":
20242023
check_crlf_space = True
2025-
tm_matrix[5] -= TL
2024+
tm_matrix[4] -= TL * tm_matrix[2]
2025+
tm_matrix[5] -= TL * tm_matrix[3]
2026+
str_widths = compute_str_widths(_actual_str_size["str_widths"])
2027+
_actual_str_size["str_widths"] = 0.0
20262028
elif operator == b"Tj":
20272029
check_crlf_space = True
20282030
text, rtl_dir, _actual_str_size = self._handle_tj(

pypdf/_writer.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1502,13 +1502,12 @@ def _write_increment(self, stream: StreamType) -> None:
15021502

15031503
def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]]:
15041504
object_positions = []
1505-
free_objects = [] # will contain list of all free entries
1505+
free_objects = []
15061506
stream.write(self.pdf_header.encode() + b"\n")
15071507
stream.write(b"%\xE2\xE3\xCF\xD3\n")
15081508

1509-
for i, obj in enumerate(self._objects):
1509+
for idnum, obj in enumerate(self._objects, start=1):
15101510
if obj is not None:
1511-
idnum = i + 1
15121511
object_positions.append(stream.tell())
15131512
stream.write(f"{idnum} 0 obj\n".encode())
15141513
if self._encryption and obj != self._encrypt_entry:
@@ -1517,8 +1516,8 @@ def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]
15171516
stream.write(b"\nendobj\n")
15181517
else:
15191518
object_positions.append(-1)
1520-
free_objects.append(i + 1)
1521-
free_objects.append(0) # add 0 to loop in accordance with PDF spec
1519+
free_objects.append(idnum)
1520+
free_objects.append(0) # add 0 to loop in accordance with specification
15221521
return object_positions, free_objects
15231522

15241523
def _write_xref_table(
@@ -1760,7 +1759,7 @@ def get_reference(self, obj: PdfObject) -> IndirectObject:
17601759

17611760
def get_outline_root(self) -> TreeObject:
17621761
if CO.OUTLINES in self._root_object:
1763-
# Table 3.25 Entries in the catalog dictionary
1762+
# Entries in the catalog dictionary
17641763
outline = cast(TreeObject, self._root_object[CO.OUTLINES])
17651764
if not isinstance(outline, TreeObject):
17661765
t = TreeObject(outline)
@@ -1784,12 +1783,12 @@ def get_threads_root(self) -> ArrayObject:
17841783
See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.
17851784
17861785
Returns:
1787-
An array (possibly empty) of Dictionaries with ``/F`` and
1788-
``/I`` properties.
1786+
An array (possibly empty) of Dictionaries with an ``/F`` key,
1787+
and optionally information about the thread in ``/I`` or ``/Metadata`` keys.
17891788
17901789
"""
17911790
if CO.THREADS in self._root_object:
1792-
# Table 3.25 Entries in the catalog dictionary
1791+
# Entries in the catalog dictionary
17931792
threads = cast(ArrayObject, self._root_object[CO.THREADS])
17941793
else:
17951794
threads = ArrayObject()
@@ -1801,9 +1800,10 @@ def threads(self) -> ArrayObject:
18011800
"""
18021801
Read-only property for the list of threads.
18031802
1804-
See §8.3.2 from PDF 1.7 spec.
1803+
See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.
18051804
1806-
Each element is a dictionary with ``/F`` and ``/I`` keys.
1805+
Each element is a dictionary with an ``/F`` key, and optionally
1806+
information about the thread in ``/I`` or ``/Metadata`` keys.
18071807
"""
18081808
return self.get_threads_root()
18091809

tests/test_text_extraction.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -353,3 +353,18 @@ def test_layout_mode_text_state():
353353
expected = get_data_from_url(txt_url, name=txt_name).decode("utf-8").replace("\r\n", "\n")
354354

355355
assert expected == reader.pages[0].extract_text(extraction_mode="layout")
356+
357+
358+
@pytest.mark.enable_socket
359+
def test_rotated_line_wrap():
360+
"""Ensure correct 2D translation of rotated text after a line wrap."""
361+
# Get the PDF from issue #3247
362+
url = "https://github.com/user-attachments/files/19696918/link16-line-wrap.sanitized.pdf"
363+
name = "link16-line-wrap.sanitized.pdf"
364+
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
365+
# Get the txt from issue #3247 and normalize line endings
366+
txt_url = "https://github.com/user-attachments/files/19696917/link16-line-wrap.sanitized.expected.txt"
367+
txt_name = "link16-line-wrap.sanitized.expected.txt"
368+
expected = get_data_from_url(txt_url, name=txt_name).decode("utf-8").replace("\r\n", "\n")
369+
370+
assert expected == reader.pages[0].extract_text()

0 commit comments

Comments
 (0)