10
10
from bs4 import BeautifulSoup
11
11
12
12
from autoscraper .utils import get_random_str , unique_hashable , unique_stack_list , \
13
- ResultItem , FuzzyText , get_non_rec_text
13
+ ResultItem , FuzzyText , get_non_rec_text , text_match
14
14
15
15
16
16
class AutoScraper (object ):
@@ -119,18 +119,18 @@ def _get_valid_attrs(item):
119
119
return attrs
120
120
121
121
@staticmethod
122
- def _child_has_text (child , text , url ):
122
+ def _child_has_text (child , text , url , text_fuzz_ratio ):
123
123
child_text = child .getText ().strip ()
124
124
125
- if text == child_text :
125
+ if text_match ( text , child_text , text_fuzz_ratio ) :
126
126
parent_text = child .parent .getText ().strip ()
127
127
if child_text == parent_text :
128
128
return False
129
129
130
130
child .wanted_attr = None
131
131
return True
132
132
133
- if text == get_non_rec_text (child ):
133
+ if text_match ( text , get_non_rec_text (child ), text_fuzz_ratio ):
134
134
child .is_non_rec_text = True
135
135
child .wanted_attr = None
136
136
return True
@@ -140,7 +140,7 @@ def _child_has_text(child, text, url):
140
140
continue
141
141
142
142
value = value .strip ()
143
- if text == value :
143
+ if text_match ( text , value , text_fuzz_ratio ) :
144
144
child .wanted_attr = key
145
145
return True
146
146
@@ -153,13 +153,14 @@ def _child_has_text(child, text, url):
153
153
154
154
return False
155
155
156
def _get_children(self, soup, text, url, text_fuzz_ratio=1.0):
    """Find all descendant tags of ``soup`` whose content matches ``text``.

    Parameters:
    -----------
    soup: BeautifulSoup tag
        The parsed document (or sub-tree) to search within.

    text: str
        The wanted content to look for. Leading/trailing whitespace is
        stripped before matching.

    url: str
        The page URL; forwarded to ``_child_has_text`` (used there for
        resolving link-type attributes).

    text_fuzz_ratio: float in range [0, 1], optional, defaults to 1.0
        Fuzziness threshold forwarded to ``_child_has_text``. The default
        of 1.0 preserves the original exact-match behavior, so callers
        that omit the argument are unaffected.

    Returns:
    --------
    List of matching child tags. Children are scanned in reverse document
    order so that the deepest (most specific) matches come first.
    """
    text = text.strip()
    # reversed() puts deepest descendants first; _child_has_text may also
    # annotate each matching tag (wanted_attr / is_non_rec_text) as a side
    # effect, which later rule-building steps rely on.
    children = reversed(soup.findChildren())
    children = [x for x in children if self._child_has_text(x, text, url, text_fuzz_ratio)]
    return children
161
161
162
- def build (self , url = None , wanted_list = None , wanted_dict = None , html = None , request_args = None , update = False ):
162
+ def build (self , url = None , wanted_list = None , wanted_dict = None , html = None , request_args = None ,
163
+ update = False , text_fuzz_ratio = 1.0 ):
163
164
"""
164
165
Automatically constructs a set of rules to scrape the specified target[s] from a web page.
165
166
The rules are represented as stack_list.
@@ -190,9 +191,12 @@ def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request
190
191
If True, new learned rules will be added to the previous ones.
191
192
If False, all previously learned rules will be removed.
192
193
194
+ text_fuzz_ratio: float in range [0, 1], optional, defaults to 1.0
195
+ The fuzziness ratio threshold for matching the wanted contents.
196
+
193
197
Returns:
194
198
--------
195
- None
199
+ List of similar results
196
200
"""
197
201
198
202
soup = self ._get_soup (url = url , html = html , request_args = request_args )
@@ -212,7 +216,7 @@ def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request
212
216
wanted_list += wanted_items
213
217
214
218
for wanted in wanted_items :
215
- children = self ._get_children (soup , wanted , url )
219
+ children = self ._get_children (soup , wanted , url , text_fuzz_ratio )
216
220
217
221
for child in children :
218
222
result , stack = self ._get_result_for_child (child , soup , url )
@@ -223,11 +227,8 @@ def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request
223
227
result_list = [item .text for item in result_list ]
224
228
result_list = unique_hashable (result_list )
225
229
226
- if all (w in result_list for w in wanted_list ):
227
- self .stack_list = unique_stack_list (self .stack_list )
228
- return result_list
229
-
230
- return None
230
+ self .stack_list = unique_stack_list (self .stack_list )
231
+ return result_list
231
232
232
233
@classmethod
233
234
def _build_stack (cls , child , url ):
0 commit comments