Commit 5d5f390

Add support for regular expressions for wanted items

1 parent b2e3391 commit 5d5f390

File tree

3 files changed: +17 additions, -9 deletions
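
With this change, wanted items may be compiled regular expressions as well as plain strings. A minimal usage sketch, based on the diff below (the URL and pattern are hypothetical, for illustration only):

    import re
    from autoscraper import AutoScraper

    scraper = AutoScraper()
    # Compiled patterns are passed through normalize() unchanged and are
    # matched with pattern.fullmatch() inside text_match().
    result = scraper.build(
        url='https://example.com/products',          # hypothetical page
        wanted_list=[re.compile(r'\$\d+\.\d{2}')],   # any price-like text
    )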

autoscraper/auto_scraper.py

Lines changed: 7 additions & 8 deletions
@@ -1,6 +1,5 @@
 import hashlib
 import json
-import unicodedata
 
 from collections import defaultdict
 from html import unescape
@@ -10,7 +9,7 @@
 from bs4 import BeautifulSoup
 
 from autoscraper.utils import get_random_str, unique_hashable, unique_stack_list, \
-    ResultItem, FuzzyText, get_non_rec_text, text_match
+    ResultItem, FuzzyText, get_non_rec_text, text_match, normalize
 
 
 class AutoScraper(object):
@@ -92,7 +91,7 @@ def _get_soup(cls, url=None, html=None, request_args=None):
         request_args = request_args or {}
 
         if html:
-            html = unicodedata.normalize("NFKD", unescape(html))
+            html = normalize(unescape(html))
             return BeautifulSoup(html, 'lxml')
 
         headers = dict(cls.request_headers)
@@ -102,7 +101,7 @@ def _get_soup(cls, url=None, html=None, request_args=None):
         user_headers = request_args.pop('headers', {})
         headers.update(user_headers)
         html = requests.get(url, headers=headers, **request_args).text
-        html = unicodedata.normalize("NFKD", unescape(html))
+        html = normalize(unescape(html))
 
         return BeautifulSoup(html, 'lxml')
 
@@ -154,7 +153,6 @@ def _child_has_text(child, text, url, text_fuzz_ratio):
         return False
 
     def _get_children(self, soup, text, url, text_fuzz_ratio):
-        text = text.strip()
         children = reversed(soup.findChildren())
         children = [x for x in children if self._child_has_text(x, text, url, text_fuzz_ratio)]
         return children
@@ -170,13 +168,14 @@ def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request
         url: str, optional
             URL of the target web page. You should either pass url or html or both.
 
-        wanted_list: list, optional
+        wanted_list: list of strings or compiled regular expressions, optional
             A list of needed contents to be scraped.
             AutoScraper learns a set of rules to scrape these targets. If specified,
             wanted_dict will be ignored.
 
         wanted_dict: dict, optional
-            A dict of needed contents to be scraped. Keys are aliases and values are list of target texts.
+            A dict of needed contents to be scraped. Keys are aliases and values are list of target texts
+                or compiled regular expressions.
             AutoScraper learns a set of rules to scrape these targets and sets its aliases.
 
         html: str, optional
@@ -212,7 +211,7 @@ def build(self, url=None, wanted_list=None, wanted_dict=None, html=None, request
             wanted_list = []
 
         for alias, wanted_items in wanted_dict.items():
-            wanted_items = [unicodedata.normalize("NFKD", w) for w in wanted_items]
+            wanted_items = [normalize(w) for w in wanted_items]
             wanted_list += wanted_items
 
             for wanted in wanted_items:
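
Per the updated docstring, wanted_dict values can now mix literal target texts and compiled patterns. A hedged sketch of that usage (the URL, aliases, and targets are made up; get_result_similar with group_by_alias is assumed from the library's existing API, not part of this diff):

    import re
    from autoscraper import AutoScraper

    scraper = AutoScraper()
    result = scraper.build(
        url='https://example.com/item/1',            # hypothetical page
        wanted_dict={
            'title': ['Example Product Name'],       # literal target text
            'price': [re.compile(r'\$\d+\.\d{2}')],  # regex target
        },
    )
    # Scrape a similar page, grouping results by the aliases above.
    similar = scraper.get_result_similar(url='https://example.com/item/2',
                                         group_by_alias=True)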

autoscraper/utils.py

Lines changed: 9 additions & 0 deletions
@@ -2,6 +2,7 @@
 
 import random
 import string
+import unicodedata
 
 from difflib import SequenceMatcher
 
@@ -32,7 +33,15 @@ def get_non_rec_text(element):
     return ''.join(element.find_all(text=True, recursive=False)).strip()
 
 
+def normalize(item):
+    if not isinstance(item, str):
+        return item
+    return unicodedata.normalize("NFKD", item.strip())
+
+
 def text_match(t1, t2, ratio_limit):
+    if hasattr(t1, 'fullmatch'):
+        return bool(t1.fullmatch(t2))
     if ratio_limit >= 1:
         return t1 == t2
     return SequenceMatcher(None, t1, t2).ratio() >= ratio_limit
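
The two new/changed helpers are easy to check in isolation. A quick sketch of the intended behavior, following the code above:

    import re
    import unicodedata
    from autoscraper.utils import normalize, text_match

    # Strings are stripped and NFKD-normalized; anything else (e.g. a
    # compiled pattern) is returned unchanged.
    assert normalize('  café  ') == unicodedata.normalize('NFKD', 'café')
    pattern = re.compile(r'\d+ comments')
    assert normalize(pattern) is pattern

    # Objects with a fullmatch() method (compiled patterns) must match
    # the whole candidate text; ratio_limit is bypassed for them.
    assert text_match(pattern, '12 comments', ratio_limit=1.0)
    assert not text_match(pattern, '12 comments so far', ratio_limit=1.0)

    # Plain strings keep the old behavior: exact equality at ratio 1.
    assert text_match('python', 'python', ratio_limit=1.0)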

setup.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 setup(
     name='autoscraper',
 
-    version='1.1.9',
+    version='1.1.10',
 
     description='A Smart, Automatic, Fast and Lightweight Web Scraper for Python',
     long_description_content_type="text/markdown",
