@@ -95,10 +95,10 @@ def parse_file_type(blocks):
95
95
return FileType .UNKNOWN
96
96
97
97
98
- def parse_investor_info (page_dict ) -> InvestorInfo :
98
+ def parse_investor_info (page_dict , page_rect : fitz . Rect ) -> InvestorInfo :
99
99
"""Parse investor info."""
100
- width = max (page_dict [ " width" ] , 600 )
101
- height = max (page_dict [ " height" ] , 800 )
100
+ width = max (page_rect . width , 600 )
101
+ height = max (page_rect . height , 800 )
102
102
103
103
blocks = sorted (
104
104
[x for x in page_dict ["blocks" ] if x ["bbox" ][1 ] < height / 2 ], key = lambda x : x ["bbox" ][1 ]
@@ -190,7 +190,7 @@ def cas_pdf_to_text(filename: Union[str, io.IOBase], password) -> PartialCASData
190
190
191
191
with fp :
192
192
try :
193
- doc = fitz .open (stream = fp .read (), filetype = "pdf" )
193
+ doc = fitz .Document (stream = fp .read (), filetype = "pdf" )
194
194
except Exception as e :
195
195
raise CASParseError ("Unhandled error while opening file :: %s" % (str (e )))
196
196
@@ -210,7 +210,7 @@ def cas_pdf_to_text(filename: Union[str, io.IOBase], password) -> PartialCASData
210
210
file_type = parse_file_type (blocks )
211
211
sorted_blocks = sorted (blocks , key = itemgetter (1 , 0 ))
212
212
if investor_info is None :
213
- investor_info = parse_investor_info (page_dict )
213
+ investor_info = parse_investor_info (page_dict , page . rect )
214
214
pages .append (sorted_blocks )
215
215
lines = group_similar_rows (pages )
216
216
return PartialCASData (file_type = file_type , investor_info = investor_info , lines = lines )
0 commit comments