51
51
52
52
from ._cmap import (
53
53
build_char_map ,
54
- unknown_char_map ,
55
54
)
56
55
from ._protocols import PdfCommonDocProtocol
57
56
from ._text_extraction import (
58
- OrientationNotFoundError ,
59
57
_layout_mode ,
60
- crlf_space_check ,
61
- mult ,
62
58
)
63
59
from ._text_extraction ._text_extractor import TextExtraction
64
60
from ._utils import (
@@ -1657,7 +1653,7 @@ def _debug_for_extract(self) -> str: # pragma: no cover
1657
1653
out += "No Font\n "
1658
1654
return out
1659
1655
1660
- def _extract_text ( # noqa: C901, PLR0915 # Will be fixed soon.
1656
+ def _extract_text (
1661
1657
self ,
1662
1658
obj : Any ,
1663
1659
pdf : Any ,
@@ -1678,9 +1674,6 @@ def _extract_text( # noqa: C901, PLR0915 # Will be fixed soon.
1678
1674
1679
1675
"""
1680
1676
extractor = TextExtraction ()
1681
- text : str = ""
1682
- output : str = ""
1683
- rtl_dir : bool = False # right-to-left
1684
1677
cmaps : Dict [
1685
1678
str ,
1686
1679
Tuple [
@@ -1707,14 +1700,6 @@ def _extract_text( # noqa: C901, PLR0915 # Will be fixed soon.
1707
1700
cmaps [f ] = build_char_map (f , space_width , obj )
1708
1701
except TypeError :
1709
1702
pass
1710
- cmap : Tuple [
1711
- Union [str , Dict [int , str ]], Dict [str , str ], str , Optional [DictionaryObject ]
1712
- ] = (
1713
- "charmap" ,
1714
- {},
1715
- "NotInitialized" ,
1716
- None ,
1717
- ) # (encoding, CMAP, font resource name, font)
1718
1703
1719
1704
try :
1720
1705
content = (
@@ -1728,245 +1713,57 @@ def _extract_text( # noqa: C901, PLR0915 # Will be fixed soon.
1728
1713
# are strings where the byte->string encoding was unknown, so adding
1729
1714
# them to the text here would be gibberish.
1730
1715
1731
- cm_matrix : List [float ] = [1.0 , 0.0 , 0.0 , 1.0 , 0.0 , 0.0 ]
1732
- tm_matrix : List [float ] = [1.0 , 0.0 , 0.0 , 1.0 , 0.0 , 0.0 ]
1733
- cm_stack = []
1734
-
1735
- # Store the last modified matrices; can be an intermediate position
1736
- cm_prev : List [float ] = [1.0 , 0.0 , 0.0 , 1.0 , 0.0 , 0.0 ]
1737
- tm_prev : List [float ] = [1.0 , 0.0 , 0.0 , 1.0 , 0.0 , 0.0 ]
1738
-
1739
- # Store the position at the beginning of building the text
1740
- memo_cm : List [float ] = [1.0 , 0.0 , 0.0 , 1.0 , 0.0 , 0.0 ]
1741
- memo_tm : List [float ] = [1.0 , 0.0 , 0.0 , 1.0 , 0.0 , 0.0 ]
1742
-
1743
- char_scale = 1.0
1744
- space_scale = 1.0
1745
- _space_width : float = 500.0 # will be set correctly at first Tf
1746
- _actual_str_size : Dict [str , float ] = {
1747
- "str_widths" : 0.0 , "space_width" : 0.0 , "str_height" : 0.0
1748
- } # will be set to string length calculation result
1749
- TL = 0.0
1750
- font_size = 12.0 # init just in case of
1751
-
1752
- def compute_str_widths (str_widths : float ) -> float :
1753
- return str_widths / 1000
1754
-
1755
- def process_operation (operator : bytes , operands : List [Any ]) -> None :
1756
- nonlocal cm_matrix , tm_matrix , cm_stack , cm_prev , tm_prev , memo_cm , memo_tm
1757
- nonlocal char_scale , space_scale , _space_width , TL , font_size , cmap
1758
- nonlocal orientations , rtl_dir , visitor_text , output , text , _actual_str_size
1759
-
1760
- str_widths : float = 0.0
1761
-
1762
- # Table 5.4 page 405
1763
- if operator == b"BT" : # Begin Text
1764
- tm_matrix = [1.0 , 0.0 , 0.0 , 1.0 , 0.0 , 0.0 ]
1765
- # Flush text:
1766
- output += text
1767
- if visitor_text is not None :
1768
- visitor_text (text , memo_cm , memo_tm , cmap [3 ], font_size )
1769
- text = ""
1770
- memo_cm = cm_matrix .copy ()
1771
- memo_tm = tm_matrix .copy ()
1772
- return
1773
- if operator == b"ET" : # End Text
1774
- # Flush text:
1775
- output += text
1776
- if visitor_text is not None :
1777
- visitor_text (text , memo_cm , memo_tm , cmap [3 ], font_size )
1778
- text = ""
1779
- memo_cm = cm_matrix .copy ()
1780
- memo_tm = tm_matrix .copy ()
1781
-
1782
- # Table 4.7 "Graphics state operators", page 219
1783
- # cm_matrix calculation is reserved for later
1784
- elif operator == b"q" : # Save graphics state
1785
- cm_stack .append (
1786
- (
1787
- cm_matrix ,
1788
- cmap ,
1789
- font_size ,
1790
- char_scale ,
1791
- space_scale ,
1792
- _space_width ,
1793
- TL ,
1794
- )
1795
- )
1796
- elif operator == b"Q" : # Restore graphics state
1797
- try :
1798
- (
1799
- cm_matrix ,
1800
- cmap ,
1801
- font_size ,
1802
- char_scale ,
1803
- space_scale ,
1804
- _space_width ,
1805
- TL ,
1806
- ) = cm_stack .pop ()
1807
- except Exception :
1808
- cm_matrix = [1.0 , 0.0 , 0.0 , 1.0 , 0.0 , 0.0 ]
1809
- elif operator == b"cm" : # Modify current matrix
1810
- output += text
1811
- if visitor_text is not None :
1812
- visitor_text (text , memo_cm , memo_tm , cmap [3 ], font_size )
1813
- text = ""
1814
- try :
1815
- cm_matrix = mult (
1816
- [float (operand ) for operand in operands [:6 ]],
1817
- cm_matrix
1818
- )
1819
- except Exception :
1820
- cm_matrix = [1.0 , 0.0 , 0.0 , 1.0 , 0.0 , 0.0 ]
1821
- memo_cm = cm_matrix .copy ()
1822
- memo_tm = tm_matrix .copy ()
1823
-
1824
- # Table 5.2 page 398
1825
- elif operator == b"Tz" : # Set horizontal text scaling
1826
- char_scale = float (operands [0 ]) / 100 if operands else 1.0
1827
- elif operator == b"Tw" : # Set word spacing
1828
- space_scale = 1.0 + float (operands [0 ] if operands else 0.0 )
1829
- elif operator == b"TL" : # Set Text Leading
1830
- scale_x = math .sqrt (tm_matrix [0 ]** 2 + tm_matrix [2 ]** 2 )
1831
- TL = float (operands [0 ] if operands else 0.0 ) * font_size * scale_x
1832
- elif operator == b"Tf" : # Set font size
1833
- if text != "" :
1834
- output += text # .translate(cmap)
1835
- if visitor_text is not None :
1836
- visitor_text (text , memo_cm , memo_tm , cmap [3 ], font_size )
1837
- text = ""
1838
- memo_cm = cm_matrix .copy ()
1839
- memo_tm = tm_matrix .copy ()
1840
- try :
1841
- # char_map_tuple: font_type,
1842
- # float(sp_width / 2),
1843
- # encoding,
1844
- # map_dict,
1845
- # font_dict (describes the font)
1846
- char_map_tuple = cmaps [operands [0 ]]
1847
- # current cmap: encoding,
1848
- # map_dict,
1849
- # font resource name (internal name, not the real font name),
1850
- # font_dict
1851
- cmap = (
1852
- char_map_tuple [2 ],
1853
- char_map_tuple [3 ],
1854
- operands [0 ],
1855
- char_map_tuple [4 ],
1856
- )
1857
- _space_width = char_map_tuple [1 ]
1858
- except KeyError : # font not found
1859
- cmap = (
1860
- unknown_char_map [2 ],
1861
- unknown_char_map [3 ],
1862
- f"???{ operands [0 ]} " ,
1863
- None ,
1864
- )
1865
- _space_width = unknown_char_map [1 ]
1866
- try :
1867
- font_size = float (operands [1 ])
1868
- except Exception :
1869
- pass # keep previous size
1870
- # Table 5.5 page 406
1871
- elif operator == b"Td" : # Move text position
1872
- # A special case is a translating only tm:
1873
- # tm = [1, 0, 0, 1, e, f]
1874
- # i.e. tm[4] += tx, tm[5] += ty.
1875
- tx , ty = float (operands [0 ]), float (operands [1 ])
1876
- tm_matrix [4 ] += tx * tm_matrix [0 ] + ty * tm_matrix [2 ]
1877
- tm_matrix [5 ] += tx * tm_matrix [1 ] + ty * tm_matrix [3 ]
1878
- str_widths = compute_str_widths (_actual_str_size ["str_widths" ])
1879
- _actual_str_size ["str_widths" ] = 0.0
1880
- elif operator == b"Tm" : # Set text matrix
1881
- tm_matrix = [float (operand ) for operand in operands [:6 ]]
1882
- str_widths = compute_str_widths (_actual_str_size ["str_widths" ])
1883
- _actual_str_size ["str_widths" ] = 0.0
1884
- elif operator == b"T*" : # Move to next line
1885
- tm_matrix [4 ] -= TL * tm_matrix [2 ]
1886
- tm_matrix [5 ] -= TL * tm_matrix [3 ]
1887
- str_widths = compute_str_widths (_actual_str_size ["str_widths" ])
1888
- _actual_str_size ["str_widths" ] = 0.0
1889
- elif operator == b"Tj" : # Show text
1890
- text , rtl_dir , _actual_str_size = extractor ._handle_tj (
1891
- text ,
1892
- operands ,
1893
- cm_matrix ,
1894
- tm_matrix ,
1895
- cmap ,
1896
- orientations ,
1897
- font_size ,
1898
- rtl_dir ,
1899
- visitor_text ,
1900
- _space_width ,
1901
- _actual_str_size ,
1902
- )
1903
- else :
1904
- return
1905
-
1906
- if operator in {b"Td" , b"Tm" , b"T*" , b"Tj" }:
1907
- try :
1908
- text , output , cm_prev , tm_prev = crlf_space_check (
1909
- text ,
1910
- (cm_prev , tm_prev ),
1911
- (cm_matrix , tm_matrix ),
1912
- (memo_cm , memo_tm ),
1913
- cmap ,
1914
- orientations ,
1915
- output ,
1916
- font_size ,
1917
- visitor_text ,
1918
- str_widths ,
1919
- compute_str_widths (_actual_str_size ["space_width" ]),
1920
- _actual_str_size ["str_height" ]
1921
- )
1922
- if text == "" :
1923
- memo_cm = cm_matrix .copy ()
1924
- memo_tm = tm_matrix .copy ()
1925
- except OrientationNotFoundError :
1926
- return
1716
+ # Initialize the extractor with the necessary parameters
1717
+ extractor .initialize_extraction (orientations , visitor_text , cmaps )
1927
1718
1928
1719
for operands , operator in content .operations :
1929
1720
if visitor_operand_before is not None :
1930
- visitor_operand_before (operator , operands , cm_matrix , tm_matrix )
1721
+ visitor_operand_before (operator , operands , extractor . cm_matrix , extractor . tm_matrix )
1931
1722
# Multiple operators are handled here
1932
1723
if operator == b"'" :
1933
- process_operation (b"T*" , [])
1934
- process_operation (b"Tj" , operands )
1724
+ extractor . process_operation (b"T*" , [])
1725
+ extractor . process_operation (b"Tj" , operands )
1935
1726
elif operator == b'"' :
1936
- process_operation (b"Tw" , [operands [0 ]])
1937
- process_operation (b"Tc" , [operands [1 ]])
1938
- process_operation (b"T*" , [])
1939
- process_operation (b"Tj" , operands [2 :])
1727
+ extractor . process_operation (b"Tw" , [operands [0 ]])
1728
+ extractor . process_operation (b"Tc" , [operands [1 ]])
1729
+ extractor . process_operation (b"T*" , [])
1730
+ extractor . process_operation (b"Tj" , operands [2 :])
1940
1731
elif operator == b"TJ" :
1941
1732
# The space width may be smaller than the font width, so the width should be 95%.
1942
- _confirm_space_width = _space_width * 0.95
1733
+ _confirm_space_width = extractor . _space_width * 0.95
1943
1734
if operands :
1944
1735
for op in operands [0 ]:
1945
1736
if isinstance (op , (str , bytes )):
1946
- process_operation (b"Tj" , [op ])
1737
+ extractor . process_operation (b"Tj" , [op ])
1947
1738
if isinstance (op , (int , float , NumberObject , FloatObject )) and (
1948
1739
abs (float (op )) >= _confirm_space_width
1949
- and text
1950
- and text [- 1 ] != " "
1740
+ and extractor . text
1741
+ and extractor . text [- 1 ] != " "
1951
1742
):
1952
- process_operation (b"Tj" , [" " ])
1743
+ extractor . process_operation (b"Tj" , [" " ])
1953
1744
elif operator == b"TD" :
1954
- process_operation (b"TL" , [- operands [1 ]])
1955
- process_operation (b"Td" , operands )
1745
+ extractor . process_operation (b"TL" , [- operands [1 ]])
1746
+ extractor . process_operation (b"Td" , operands )
1956
1747
elif operator == b"Do" :
1957
- output += text
1748
+ extractor . output += extractor . text
1958
1749
if visitor_text is not None :
1959
- visitor_text (text , memo_cm , memo_tm , cmap [3 ], font_size )
1750
+ visitor_text (
1751
+ extractor .text ,
1752
+ extractor .memo_cm ,
1753
+ extractor .memo_tm ,
1754
+ extractor .cmap [3 ],
1755
+ extractor .font_size ,
1756
+ )
1960
1757
try :
1961
- if output [- 1 ] != "\n " :
1962
- output += "\n "
1758
+ if extractor . output [- 1 ] != "\n " :
1759
+ extractor . output += "\n "
1963
1760
if visitor_text is not None :
1964
1761
visitor_text (
1965
1762
"\n " ,
1966
- memo_cm ,
1967
- memo_tm ,
1968
- cmap [3 ],
1969
- font_size ,
1763
+ extractor . memo_cm ,
1764
+ extractor . memo_tm ,
1765
+ extractor . cmap [3 ],
1766
+ extractor . font_size ,
1970
1767
)
1971
1768
except IndexError :
1972
1769
pass
@@ -1981,32 +1778,38 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
1981
1778
visitor_operand_after ,
1982
1779
visitor_text ,
1983
1780
)
1984
- output += text
1781
+ extractor . output += text
1985
1782
if visitor_text is not None :
1986
1783
visitor_text (
1987
1784
text ,
1988
- memo_cm ,
1989
- memo_tm ,
1990
- cmap [3 ],
1991
- font_size ,
1785
+ extractor . memo_cm ,
1786
+ extractor . memo_tm ,
1787
+ extractor . cmap [3 ],
1788
+ extractor . font_size ,
1992
1789
)
1993
1790
except Exception as exception :
1994
1791
logger_warning (
1995
1792
f"Impossible to decode XFormObject { operands [0 ]} : { exception } " ,
1996
1793
__name__ ,
1997
1794
)
1998
1795
finally :
1999
- text = ""
2000
- memo_cm = cm_matrix .copy ()
2001
- memo_tm = tm_matrix .copy ()
1796
+ extractor . text = ""
1797
+ extractor . memo_cm = extractor . cm_matrix .copy ()
1798
+ extractor . memo_tm = extractor . tm_matrix .copy ()
2002
1799
else :
2003
- process_operation (operator , operands )
1800
+ extractor . process_operation (operator , operands )
2004
1801
if visitor_operand_after is not None :
2005
- visitor_operand_after (operator , operands , cm_matrix , tm_matrix )
2006
- output += text # just in case
2007
- if text != "" and visitor_text is not None :
2008
- visitor_text (text , memo_cm , memo_tm , cmap [3 ], font_size )
2009
- return output
1802
+ visitor_operand_after (operator , operands , extractor .cm_matrix , extractor .tm_matrix )
1803
+ extractor .output += extractor .text # just in case
1804
+ if extractor .text != "" and visitor_text is not None :
1805
+ visitor_text (
1806
+ extractor .text ,
1807
+ extractor .memo_cm ,
1808
+ extractor .memo_tm ,
1809
+ extractor .cmap [3 ],
1810
+ extractor .font_size ,
1811
+ )
1812
+ return extractor .output
2010
1813
2011
1814
def _layout_mode_fonts (self ) -> Dict [str , _layout_mode .Font ]:
2012
1815
"""
0 commit comments