Merge branch 'feat/add-brotli-decode' of https://github.com/ash01ish/pypdf into feat/add-brotli-decode

ash01ish · ash01ish · commit 4dd3fa078f84 · 2025-04-14T14:50:51.000+05:30
diff --git a/docs/modules/constants.rst b/docs/modules/constants.rst
@@ -26,12 +26,3 @@ Constants
        :undoc-members:
        :exclude-members: FT, Parent, Kids, T, TU, TM, V, DV, AA, Opt, attributes, attributes_dict
        :show-inheritance:
-
-.. autoclass:: pypdf.constants.FilterTypes
-    :members:
-    :undoc-members:
-
-.. autoclass:: pypdf.constants.FilterTypeAbbreviations
-    :members:
-    :undoc-members:
-    :show-inheritance:
diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py
@@ -270,6 +270,8 @@ class PdfDocCommon:
 
     strict: bool = False  # default
 
+    flattened_pages: Optional[List[PageObject]] = None
+
     _encryption: Optional[Encryption] = None
 
     _readonly: bool = False
@@ -333,8 +335,6 @@ def viewer_preferences(self) -> Optional[ViewerPreferences]:
                 self.root_object[NameObject(CD.VIEWER_PREFERENCES)] = o
         return o
 
-    flattened_pages: Optional[List[PageObject]] = None
-
     def get_num_pages(self) -> int:
         """
         Calculate the number of pages in this PDF file.
@@ -1128,7 +1128,16 @@ def _flatten(
         indirect_reference: Optional[IndirectObject] = None,
     ) -> None:
         """
-        Prepare the document pages to ease searching
+        Process the document pages to ease searching.
+
+        Attributes of a page may inherit from ancestor nodes
+        in the page tree. Flattening means moving
+        any inheritance data into descendant nodes,
+        effectively removing the inheritance dependency.
+
+        Note: This is distinct from the other common meaning of "flattening" a PDF,
+        which refers to combining all the contents into one single layer
+        and making the file less editable.
 
         Args:
             list_only: Will only list the pages within _flatten_pages.
@@ -1156,7 +1165,7 @@ def _flatten(
 
         if PA.TYPE in pages:
             t = cast(str, pages[PA.TYPE])
-        # if pdf has no type, considered as a page if /Kids is missing
+        # if the page tree node has no /Type, consider it a page if /Kids is also missing
         elif PA.KIDS not in pages:
             t = "/Page"
         else:
@@ -1181,8 +1190,8 @@ def _flatten(
                         )
         elif t == "/Page":
             for attr_in, value in inherit.items():
-                # if the page has it's own value, it does not inherit the
-                # parent's value:
+                # if the page has its own value, it does not inherit the
+                # parent's value
                 if attr_in not in pages:
                     pages[attr_in] = value
             page_obj = PageObject(self, indirect_reference)
diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -2009,8 +2009,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                 # A special case is a translating only tm:
                 # tm = [1, 0, 0, 1, e, f]
                 # i.e. tm[4] += tx, tm[5] += ty.
-                tx = float(operands[0])
-                ty = float(operands[1])
+                tx, ty = float(operands[0]), float(operands[1])
                 tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2]
                 tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3]
                 str_widths = compute_str_widths(_actual_str_size["str_widths"])
@@ -2022,7 +2021,10 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                 _actual_str_size["str_widths"] = 0.0
             elif operator == b"T*":
                 check_crlf_space = True
-                tm_matrix[5] -= TL
+                tm_matrix[4] -= TL * tm_matrix[2]
+                tm_matrix[5] -= TL * tm_matrix[3]
+                str_widths = compute_str_widths(_actual_str_size["str_widths"])
+                _actual_str_size["str_widths"] = 0.0
             elif operator == b"Tj":
                 check_crlf_space = True
                 text, rtl_dir, _actual_str_size = self._handle_tj(
diff --git a/pypdf/_writer.py b/pypdf/_writer.py
@@ -1502,13 +1502,12 @@ def _write_increment(self, stream: StreamType) -> None:
 
     def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]]:
         object_positions = []
-        free_objects = []  # will contain list of all free entries
+        free_objects = []
         stream.write(self.pdf_header.encode() + b"\n")
         stream.write(b"%\xE2\xE3\xCF\xD3\n")
 
-        for i, obj in enumerate(self._objects):
+        for idnum, obj in enumerate(self._objects, start=1):
             if obj is not None:
-                idnum = i + 1
                 object_positions.append(stream.tell())
                 stream.write(f"{idnum} 0 obj\n".encode())
                 if self._encryption and obj != self._encrypt_entry:
@@ -1517,8 +1516,8 @@ def _write_pdf_structure(self, stream: StreamType) -> Tuple[List[int], List[int]
                 stream.write(b"\nendobj\n")
             else:
                 object_positions.append(-1)
-                free_objects.append(i + 1)
-        free_objects.append(0)  # add 0 to loop in accordance with PDF spec
+                free_objects.append(idnum)
+        free_objects.append(0)  # add 0 to loop in accordance with specification
         return object_positions, free_objects
 
     def _write_xref_table(
@@ -1760,7 +1759,7 @@ def get_reference(self, obj: PdfObject) -> IndirectObject:
 
     def get_outline_root(self) -> TreeObject:
         if CO.OUTLINES in self._root_object:
-            # Table 3.25 Entries in the catalog dictionary
+            # Entries in the catalog dictionary
             outline = cast(TreeObject, self._root_object[CO.OUTLINES])
             if not isinstance(outline, TreeObject):
                 t = TreeObject(outline)
@@ -1784,12 +1783,12 @@ def get_threads_root(self) -> ArrayObject:
         See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.
 
         Returns:
-            An array (possibly empty) of Dictionaries with ``/F`` and
-            ``/I`` properties.
+            An array (possibly empty) of dictionaries, each with an ``/F`` key,
+            and optionally information about the thread in ``/I`` or ``/Metadata`` keys.
 
         """
         if CO.THREADS in self._root_object:
-            # Table 3.25 Entries in the catalog dictionary
+            # Entries in the catalog dictionary
             threads = cast(ArrayObject, self._root_object[CO.THREADS])
         else:
             threads = ArrayObject()
@@ -1801,9 +1800,10 @@ def threads(self) -> ArrayObject:
         """
         Read-only property for the list of threads.
 
-        See §8.3.2 from PDF 1.7 spec.
+        See §12.4.3 of the PDF 1.7 or PDF 2.0 specification.
 
-        Each element is a dictionary with ``/F`` and ``/I`` keys.
+        Each element is a dictionary with an ``/F`` key, and optionally
+        information about the thread in ``/I`` or ``/Metadata`` keys.
         """
         return self.get_threads_root()
 
diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
@@ -353,3 +353,18 @@ def test_layout_mode_text_state():
     expected = get_data_from_url(txt_url, name=txt_name).decode("utf-8").replace("\r\n", "\n")
 
     assert expected == reader.pages[0].extract_text(extraction_mode="layout")
+
+
+@pytest.mark.enable_socket
+def test_rotated_line_wrap():
+    """Ensure correct 2D translation of rotated text after a line wrap."""
+    # Get the PDF from issue #3247
+    url = "https://github.com/user-attachments/files/19696918/link16-line-wrap.sanitized.pdf"
+    name = "link16-line-wrap.sanitized.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    # Get the txt from issue #3247 and normalize line endings
+    txt_url = "https://github.com/user-attachments/files/19696917/link16-line-wrap.sanitized.expected.txt"
+    txt_name = "link16-line-wrap.sanitized.expected.txt"
+    expected = get_data_from_url(txt_url, name=txt_name).decode("utf-8").replace("\r\n", "\n")
+
+    assert expected == reader.pages[0].extract_text()