1
1
import hashlib
2
2
import json
3
- import unicodedata
4
3
5
4
from collections import defaultdict
6
5
from html import unescape
10
9
from bs4 import BeautifulSoup
11
10
12
11
from autoscraper .utils import get_random_str , unique_hashable , unique_stack_list , \
13
- ResultItem , FuzzyText , get_non_rec_text , text_match
12
+ ResultItem , FuzzyText , get_non_rec_text , text_match , normalize
14
13
15
14
16
15
class AutoScraper (object ):
@@ -92,7 +91,7 @@ def _get_soup(cls, url=None, html=None, request_args=None):
92
91
request_args = request_args or {}
93
92
94
93
if html :
95
- html = unicodedata . normalize ("NFKD" , unescape (html ))
94
+ html = normalize (unescape (html ))
96
95
return BeautifulSoup (html , 'lxml' )
97
96
98
97
headers = dict (cls .request_headers )
@@ -102,7 +101,7 @@ def _get_soup(cls, url=None, html=None, request_args=None):
102
101
user_headers = request_args .pop ('headers' , {})
103
102
headers .update (user_headers )
104
103
html = requests .get (url , headers = headers , ** request_args ).text
105
- html = unicodedata . normalize ("NFKD" , unescape (html ))
104
+ html = normalize (unescape (html ))
106
105
107
106
return BeautifulSoup (html , 'lxml' )
108
107
@@ -154,7 +153,6 @@ def _child_has_text(child, text, url, text_fuzz_ratio):
154
153
return False
155
154
156
155
def _get_children (self , soup , text , url , text_fuzz_ratio ):
157
- text = text .strip ()
158
156
children = reversed (soup .findChildren ())
159
157
children = [x for x in children if self ._child_has_text (x , text , url , text_fuzz_ratio )]
160
158
return children
@@ -170,13 +168,14 @@ def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request
170
168
url: str, optional
171
169
URL of the target web page. You should either pass url or html or both.
172
170
173
- wanted_list: list, optional
171
+ wanted_list: list of strings or compiled regular expressions, optional
174
172
A list of needed contents to be scraped.
175
173
AutoScraper learns a set of rules to scrape these targets. If specified,
176
174
wanted_dict will be ignored.
177
175
178
176
wanted_dict: dict, optional
179
- A dict of needed contents to be scraped. Keys are aliases and values are list of target texts.
177
+ A dict of needed contents to be scraped. Keys are aliases and values are lists of target texts
178
+ or compiled regular expressions.
180
179
AutoScraper learns a set of rules to scrape these targets and sets its aliases.
181
180
182
181
html: str, optional
@@ -212,7 +211,7 @@ def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request
212
211
wanted_list = []
213
212
214
213
for alias , wanted_items in wanted_dict .items ():
215
- wanted_items = [unicodedata . normalize ("NFKD" , w ) for w in wanted_items ]
214
+ wanted_items = [normalize (w ) for w in wanted_items ]
216
215
wanted_list += wanted_items
217
216
218
217
for wanted in wanted_items :
0 commit comments