Commit 5d5f390

Add support for regular expressions for wanted items

1 parent b2e3391 commit 5d5f390

File tree

3 files changed: +17 additions, -9 deletions
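
With this change, wanted items may be compiled regular expressions as well as plain strings. A minimal usage sketch, based on the diff below (the URL and pattern are hypothetical, for illustration only):

    import re
    from autoscraper import AutoScraper

    scraper = AutoScraper()
    # Compiled patterns are passed through normalize() unchanged and are
    # matched with pattern.fullmatch() inside text_match().
    result = scraper.build(
        url='https://example.com/products',          # hypothetical page
        wanted_list=[re.compile(r'\$\d+\.\d{2}')],   # any price-like text
    )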

autoscraper/auto_scraper.py

Lines changed: 7 additions & 8 deletions
@@ -1,6 +1,5 @@
 import hashlib
 import json
-import unicodedata
 
 from collections import defaultdict
 from html import unescape
@@ -10,7 +9,7 @@
 from bs4 import BeautifulSoup
 
 from autoscraper.utils import get_random_str, unique_hashable, unique_stack_list, \
-    ResultItem, FuzzyText, get_non_rec_text, text_match
+    ResultItem, FuzzyText, get_non_rec_text, text_match, normalize
 
 
 class AutoScraper(object):
@@ -92,7 +91,7 @@ def _get_soup(cls, url=None, html=None, request_args=None):
         request_args = request_args or {}
 
         if html:
-            html = unicodedata.normalize("NFKD", unescape(html))
+            html = normalize(unescape(html))
             return BeautifulSoup(html, 'lxml')
 
         headers = dict(cls.request_headers)
@@ -102,7 +101,7 @@ def _get_soup(cls, url=None, html=None, request_args=None):
         user_headers = request_args.pop('headers', {})
         headers.update(user_headers)
         html = requests.get(url, headers=headers, **request_args).text
-        html = unicodedata.normalize("NFKD", unescape(html))
+        html = normalize(unescape(html))
 
         return BeautifulSoup(html, 'lxml')
 
@@ -154,7 +153,6 @@ def _child_has_text(child, text, url, text_fuzz_ratio):
         return False
 
     def _get_children(self, soup, text, url, text_fuzz_ratio):
-        text = text.strip()
         children = reversed(soup.findChildren())
         children = [x for x in children if self._child_has_text(x, text, url, text_fuzz_ratio)]
         return children
@@ -170,13 +168,14 @@ def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request
         url: str, optional
             URL of the target web page. You should either pass url or html or both.
 
-        wanted_list: list, optional
+        wanted_list: list of strings or compiled regular expressions, optional
             A list of needed contents to be scraped.
             AutoScraper learns a set of rules to scrape these targets. If specified,
             wanted_dict will be ignored.
 
         wanted_dict: dict, optional
-            A dict of needed contents to be scraped. Keys are aliases and values are list of target texts.
+            A dict of needed contents to be scraped. Keys are aliases and values are list of target texts
+                or compiled regular expressions.
             AutoScraper learns a set of rules to scrape these targets and sets its aliases.
 
         html: str, optional
@@ -212,7 +211,7 @@ def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request
             wanted_list = []
 
         for alias, wanted_items in wanted_dict.items():
-            wanted_items = [unicodedata.normalize("NFKD", w) for w in wanted_items]
+            wanted_items = [normalize(w) for w in wanted_items]
             wanted_list += wanted_items
 
             for wanted in wanted_items:
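
Per the updated docstring, wanted_dict values can now mix literal target texts and compiled patterns. A hedged sketch of that usage (the URL, aliases, and targets are made up; get_result_similar with group_by_alias is assumed from the library's existing API, not part of this diff):

    import re
    from autoscraper import AutoScraper

    scraper = AutoScraper()
    result = scraper.build(
        url='https://example.com/item/1',            # hypothetical page
        wanted_dict={
            'title': ['Example Product Name'],       # literal target text
            'price': [re.compile(r'\$\d+\.\d{2}')],  # regex target
        },
    )
    # Scrape a similar page, grouping results by the aliases above.
    similar = scraper.get_result_similar(url='https://example.com/item/2',
                                         group_by_alias=True)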

autoscraper/utils.py

Lines changed: 9 additions & 0 deletions
@@ -2,6 +2,7 @@
 
 import random
 import string
+import unicodedata
 
 from difflib import SequenceMatcher
 
@@ -32,7 +33,15 @@ def get_non_rec_text(element):
     return ''.join(element.find_all(text=True, recursive=False)).strip()
 
 
+def normalize(item):
+    if not isinstance(item, str):
+        return item
+    return unicodedata.normalize("NFKD", item.strip())
+
+
 def text_match(t1, t2, ratio_limit):
+    if hasattr(t1, 'fullmatch'):
+        return bool(t1.fullmatch(t2))
     if ratio_limit >= 1:
         return t1 == t2
     return SequenceMatcher(None, t1, t2).ratio() >= ratio_limit
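
The two new/changed helpers are easy to check in isolation. A quick sketch of the intended behavior, following the code above:

    import re
    import unicodedata
    from autoscraper.utils import normalize, text_match

    # Strings are stripped and NFKD-normalized; anything else (e.g. a
    # compiled pattern) is returned unchanged.
    assert normalize('  café  ') == unicodedata.normalize('NFKD', 'café')
    pattern = re.compile(r'\d+ comments')
    assert normalize(pattern) is pattern

    # Objects with a fullmatch() method (compiled patterns) must match
    # the whole candidate text; ratio_limit is bypassed for them.
    assert text_match(pattern, '12 comments', ratio_limit=1.0)
    assert not text_match(pattern, '12 comments so far', ratio_limit=1.0)

    # Plain strings keep the old behavior: exact equality at ratio 1.
    assert text_match('python', 'python', ratio_limit=1.0)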

setup.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 setup(
     name='autoscraper',
 
-    version='1.1.9',
+    version='1.1.10',
 
     description='A Smart, Automatic, Fast and Lightweight Web Scraper for Python',
     long_description_content_type="text/markdown",
