Skip to content

Commit 10153e8

Browse files
committed
Add ability to set fuzziness ratio for matching the wanted items
1 parent 17ea783 commit 10153e8

File tree

2 files changed

+22
-15
lines changed

2 files changed

+22
-15
lines changed

autoscraper/auto_scraper.py

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from bs4 import BeautifulSoup
1111

1212
from autoscraper.utils import get_random_str, unique_hashable, unique_stack_list, \
13-
ResultItem, FuzzyText, get_non_rec_text
13+
ResultItem, FuzzyText, get_non_rec_text, text_match
1414

1515

1616
class AutoScraper(object):
@@ -119,18 +119,18 @@ def _get_valid_attrs(item):
119119
return attrs
120120

121121
@staticmethod
122-
def _child_has_text(child, text, url):
122+
def _child_has_text(child, text, url, text_fuzz_ratio):
123123
child_text = child.getText().strip()
124124

125-
if text == child_text:
125+
if text_match(text, child_text, text_fuzz_ratio):
126126
parent_text = child.parent.getText().strip()
127127
if child_text == parent_text:
128128
return False
129129

130130
child.wanted_attr = None
131131
return True
132132

133-
if text == get_non_rec_text(child):
133+
if text_match(text, get_non_rec_text(child), text_fuzz_ratio):
134134
child.is_non_rec_text = True
135135
child.wanted_attr = None
136136
return True
@@ -140,7 +140,7 @@ def _child_has_text(child, text, url):
140140
continue
141141

142142
value = value.strip()
143-
if text == value:
143+
if text_match(text, value, text_fuzz_ratio):
144144
child.wanted_attr = key
145145
return True
146146

@@ -153,13 +153,14 @@ def _child_has_text(child, text, url):
153153

154154
return False
155155

156-
def _get_children(self, soup, text, url):
156+
def _get_children(self, soup, text, url, text_fuzz_ratio):
157157
text = text.strip()
158158
children = reversed(soup.findChildren())
159-
children = [x for x in children if self._child_has_text(x, text, url)]
159+
children = [x for x in children if self._child_has_text(x, text, url, text_fuzz_ratio)]
160160
return children
161161

162-
def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request_args=None, update=False):
162+
def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request_args=None,
163+
update=False, text_fuzz_ratio=1.0):
163164
"""
164165
Automatically constructs a set of rules to scrape the specified target[s] from a web page.
165166
The rules are represented as stack_list.
@@ -190,9 +191,12 @@ def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request
190191
If True, new learned rules will be added to the previous ones.
191192
If False, all previously learned rules will be removed.
192193
194+
text_fuzz_ratio: float in range [0, 1], optional, defaults to 1.0
195+
The similarity ratio threshold for matching the wanted contents: 1.0 requires an exact match, lower values accept increasingly fuzzy matches.
196+
193197
Returns:
194198
--------
195-
None
199+
List of similar results
196200
"""
197201

198202
soup = self._get_soup(url=url, html=html, request_args=request_args)
@@ -212,7 +216,7 @@ def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request
212216
wanted_list += wanted_items
213217

214218
for wanted in wanted_items:
215-
children = self._get_children(soup, wanted, url)
219+
children = self._get_children(soup, wanted, url, text_fuzz_ratio)
216220

217221
for child in children:
218222
result, stack = self._get_result_for_child(child, soup, url)
@@ -223,11 +227,8 @@ def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request
223227
result_list = [item.text for item in result_list]
224228
result_list = unique_hashable(result_list)
225229

226-
if all(w in result_list for w in wanted_list):
227-
self.stack_list = unique_stack_list(self.stack_list)
228-
return result_list
229-
230-
return None
230+
self.stack_list = unique_stack_list(self.stack_list)
231+
return result_list
231232

232233
@classmethod
233234
def _build_stack(cls, child, url):

autoscraper/utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,12 @@ def get_non_rec_text(element):
3232
return ''.join(element.find_all(text=True, recursive=False)).strip()
3333

3434

35+
def text_match(t1, t2, ratio_limit):
    """Return whether two texts match under the given similarity threshold.

    Parameters:
    ----------
    t1: str
        First text to compare.

    t2: str
        Second text to compare.

    ratio_limit: float
        Minimum similarity ratio for a match. Any value >= 1 disables fuzzy
        matching and requires the texts to be exactly equal; lower values
        are compared against ``SequenceMatcher(None, t1, t2).ratio()``.

    Returns:
    --------
    bool
        True if the texts are considered a match, False otherwise.
    """
    if ratio_limit >= 1:
        return t1 == t2

    # Identical texts always have ratio 1.0, so skip the matcher entirely.
    if t1 == t2:
        return True

    matcher = SequenceMatcher(None, t1, t2)
    # real_quick_ratio() and quick_ratio() are cheap upper bounds on ratio();
    # checking them first rejects most non-matching candidates without
    # paying for the expensive full comparison.
    return (
        matcher.real_quick_ratio() >= ratio_limit
        and matcher.quick_ratio() >= ratio_limit
        and matcher.ratio() >= ratio_limit
    )
39+
40+
3541
class ResultItem():
3642
def __init__(self, text, index):
3743
self.text = text

0 commit comments

Comments
 (0)