
Commit 1a8f208

new collector, complete guide, awesome tool for checking collectors
1 parent 7f775ae commit 1a8f208

File tree

10 files changed: +138 -27 lines changed


TODO

Lines changed: 0 additions & 2 deletions
@@ -1,6 +1,5 @@
 new web interface
 statistics for proxies and for collectors
-logging for collectors
 log slow sql queries
 set limit for total number of proxies
 do not update never working proxies
@@ -12,6 +11,5 @@ floating updating period for collectors
 p2p
 log memory consumption
 stop using ipinfo.io because of rate limits
-fix leak of something, again :(
 remove processor proxies queue model
 write number of asyncio tasks to DB to be able to draw more graphs (everybody loves statistics)

collectors/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+from collectors.abstract_collector import AbstractCollector
+from collectors.pages_collector import PagesCollector

collectors/abstract_collector.py

Lines changed: 4 additions & 6 deletions
@@ -5,7 +5,6 @@
 import models
 
 
-# TODO: refactor saving state
 class AbstractCollector:
     """Base class for all types of collectors"""
 
@@ -38,7 +37,7 @@ async def collect(self):
     async def _collect(self):
         """Do not use! It is called on collector's processing automatically"""
 
-        # TODO: uncomment when python 3.6 come to ubuntu lts
+        # TODO: uncomment when python 3.6 comes to ubuntu lts
         # i = 0
         # async for proxy in self.collect():
         #     if i > settings.COLLECTOR_MAXIMUM_NUMBER_OF_PROXIES_PER_REQUEST:
@@ -74,17 +73,16 @@ async def load_state(self, state):
     async def save_state(self, state: models.CollectorState):
         """
        Function for saving collector's state to database model.
-        It's called automatically, don't worry.
+        It's called automatically, don't worry about it.
         """
         state.last_processing_time = self.last_processing_time
         state.processing_period = self.processing_period
         state.last_processing_proxies_count = self.last_processing_proxies_count
 
         if self.saved_variables is not None:
+            if '_variables' not in self.data:
+                self.data['_variables'] = {}
             for var_name in self.saved_variables:
-                if '_variables' not in self.data:
-                    self.data['_variables'] = {}
-
                 self.data['_variables'][var_name] = getattr(self, var_name)
 
         state.data = json.dumps(self.data)

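For reference, a minimal sketch of a collector using this state-saving mechanism. The collector name and the `last_page` attribute are made up for illustration; only the `saved_variables` handling is taken from the code above, and the values are presumably restored by `load_state()`:

    from collectors import AbstractCollector


    class StatefulCollector(AbstractCollector):
        def __init__(self):
            super(StatefulCollector, self).__init__()
            self.last_page = 0
            # attributes listed here are copied into self.data['_variables']
            # by save_state() and serialized to the database as JSON
            self.saved_variables = {'last_page'}

        async def collect(self):
            # the restored value survives restarts; save_state() persists it
            self.last_page += 1
            return []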
collectors/web/cn/89ip/collector.py

Lines changed: 9 additions & 8 deletions
@@ -1,5 +1,5 @@
-from collectors.abstract_collector import AbstractCollector
-from parsers.regex_parser import RegexParser
+from collectors import AbstractCollector
+from parsers import RegexParser
 
 import http_client
 
@@ -9,7 +9,8 @@ class Collector(AbstractCollector):
 
     def __init__(self):
         super(Collector, self).__init__()
-        self.processing_period = 30 * 60  # 30 minutes
+        # 30 minutes
+        self.processing_period = 30 * 60
         '''
         floating period means proxy_py will be changing
         period to not make extra requests and handle
@@ -20,9 +21,9 @@ def __init__(self):
 
     async def collect(self):
        url = 'http://www.89ip.cn/tqdl.html?num=9999&address=&kill_address=&port=&kill_port=&isp='
-        return []
-        # html = await http_client.get_text(url)
-        with open('/tmp/mock') as f:
-            html = f.read()
-
+        # send a request to get the html code of the page
+        html = await http_client.get_text(url)
+        # and just parse it using the regex parser with a default rule to
+        # parse proxies like this:
+        # 8.8.8.8:8080
         return RegexParser().parse(html)

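The "default rule" mentioned in those comments is an `ip:port` pattern. As a rough stand-in only (the real pattern lives in `parsers/regex_parser.py`; this simplified version is an assumption, not the actual rule), the parsing step boils down to something like:

    import re

    # simplified stand-in for RegexParser's default rule
    PROXY_RE = re.compile(r'(\d{1,3}(?:\.\d{1,3}){3}):(\d{1,5})')


    def parse_proxies(html: str):
        # returns plain 'ip:port' strings, e.g. '8.8.8.8:8080'
        return ['{}:{}'.format(ip, port) for ip, port in PROXY_RE.findall(html)]


    print(parse_proxies('<td>8.8.8.8:8080</td>'))  # ['8.8.8.8:8080']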
docs/source/guides/how_to_create_collector.rst

Lines changed: 62 additions & 5 deletions
@@ -1,5 +1,5 @@
-proxy_py How to create collector
-================================
+proxy_py How to create a collector
+==================================
 
 Collector is a class which is used to parse proxies from a web page or another source.
 All collectors are inherited from `collectors.abstract_collector.AbstractCollector`,
@@ -45,7 +45,7 @@ per domain as you want
 
 .. code-block:: python
 
-    from collectors.abstract_collector import AbstractCollector
+    from collectors import AbstractCollector
 
 
     class Collector(AbstractCollector):
@@ -70,10 +70,67 @@ like this:
 
 
 The last step is to implement the `collect()` method.
+Import the useful things:
+
+.. code-block:: python
+
+    from parsers import RegexParser
+
+    import http_client
+
+
+and implement the method like this:
+
+.. code-block:: python
+
+    async def collect(self):
+        url = 'http://www.89ip.cn/tqdl.html?num=9999&address=&kill_address=&port=&kill_port=&isp='
+        # send a request to get the html code of the page
+        html = await http_client.get_text(url)
+        # and just parse it using the regex parser with a default rule to
+        # parse proxies like this:
+        # 8.8.8.8:8080
+        return RegexParser().parse(html)
+
+That's all!
+
+Now it's time for a little test. To be sure your collector is working,
+you can run proxy_py with the `--test-collector` option:
+
+.. code-block:: bash
+
+    python3 main.py --test-collector collectors/web/cn/89ip/collector.py:Collector
+
+which means: take the class `Collector` from the file `collectors/web/cn/89ip/collector.py`.
+
+It's gonna draw you a pattern like this:
+
+.. image:: https://i.imgur.com/fmVp3Iz.png
+
+where a red cell means a proxy that isn't working, and the other colors
+show how fast it responds:
+
+- cyan - within a second
+- green - within 5 seconds
+- yellow - within 10 seconds
+- magenta - slower than 10 seconds
+
+Note: don't forget that settings.py limits the amount of time
+a proxy has to respond.
+You can override the proxy checking timeout with the
+`--proxy-checking-timeout` option. For example:
+
+.. code-block:: bash
+
+    python3 main.py --test-collector collectors/web/cn/89ip/collector.py:Collector --proxy-checking-timeout 60
+
+With a 60 second timeout it looks better:
+
+.. image:: https://i.imgur.com/DmNuzOI.png
 
 Paginated collector
 *******************
 
-So, let's create a simple paginated collector.
-
+Alright, you're done with a simple collector and you're almost a pro.
+Let's now dive a little deeper.
 
+# TODO: complete this guide

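The paginated part of the guide is still a TODO upstream, but `collectors/__init__.py` above exports `PagesCollector`, so a paginated collector will presumably subclass it. Purely as a hypothetical sketch — the `pages_count` attribute, the `process_page()` method, and the URL are invented for illustration and not confirmed anywhere in this commit:

    from collectors import PagesCollector

    import http_client


    class Collector(PagesCollector):
        def __init__(self):
            super(Collector, self).__init__()
            self.pages_count = 10  # hypothetical: how many pages to walk

        async def process_page(self, page_index):
            # hypothetical: fetch and parse a single page of proxies
            html = await http_client.get_text(
                'http://example.com/proxies/?page={}'.format(page_index)
            )
            return []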
main.py

Lines changed: 9 additions & 0 deletions
@@ -29,13 +29,22 @@ def str_to_bool(value):
 
 cmd_parser = argparse.ArgumentParser()
 cmd_parser.add_argument("--debug", type=str_to_bool, help="override settings' debug value")
+cmd_parser.add_argument(
+    "--proxy-checking-timeout", type=float, help="override settings' proxy checking timeout"
+)
 cmd_parser.add_argument("--test-collector", help="test collector with a given path")
 
 args = cmd_parser.parse_args()
 
 if args.debug is not None:
     settings.DEBUG = args.debug
 
+if args.proxy_checking_timeout is not None:
+    if args.proxy_checking_timeout <= 0:
+        raise ValueError("--proxy-checking-timeout should be positive")
+
+    settings.PROXY_CHECKING_TIMEOUT = args.proxy_checking_timeout
+
 test_collector_path = args.test_collector
 
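A design note: the positivity check could instead live in argparse itself via a custom `type` callable, so a bad value is reported as a clean usage error rather than a ValueError traceback. A sketch of that alternative, not what this commit does:

    import argparse


    def positive_float(value: str) -> float:
        result = float(value)
        if result <= 0:
            raise argparse.ArgumentTypeError("should be a positive number")
        return result


    cmd_parser = argparse.ArgumentParser()
    cmd_parser.add_argument(
        "--proxy-checking-timeout", type=positive_float,
        help="override settings' proxy checking timeout",
    )
    print(cmd_parser.parse_args(["--proxy-checking-timeout", "60"]))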
parsers/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+from parsers.regex_parser import RegexParser

processor.py

Lines changed: 0 additions & 2 deletions
@@ -1,14 +1,12 @@
 import proxy_validator
 from checkers.base_checker import CheckerResult
-from parsers.regex_parser import PROXY_VALIDATE_REGEX
 from proxy_py import settings
 from models import Proxy, CollectorState, db
 
 import collectors_list
 import proxy_utils
 import asyncio
 import time
-import re
 import logging
 import peewee
 

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -11,3 +11,4 @@ sphinx_rtd_theme
 py_mini_racer
 pytest
 pytest-asyncio
+termcolor

tools/test_collector.py

Lines changed: 50 additions & 4 deletions
@@ -1,8 +1,16 @@
-import proxy_validator
+import time
+
+from termcolor import colored
+from models import Proxy
+from proxy_py import settings
 from collectors_list import collectors
 
 import re
 import sys
+import proxy_utils
+import asyncio
+import proxy_validator
 
 from parsers.regex_parser import RegexParser
 
@@ -24,13 +32,51 @@ async def run(path: str):
 
     result = list(await collector.collect())
 
+    print("Total number of proxies: {}".format(len(result)))
+    await asyncio.gather(*[process_proxy(proxy) for proxy in result])
 
-    for proxy in result:
+
+proxies_semaphore = asyncio.BoundedSemaphore(settings.NUMBER_OF_CONCURRENT_TASKS)
+
+
+async def process_proxy(proxy_url: str):
+    async with proxies_semaphore:
         try:
-            proxy_validator.retrieve(proxy)
+            _, auth_data, domain, port = proxy_validator.retrieve(proxy_url)
         except proxy_validator.ValidationError as ex:
             raise ValueError(
                 "Your collector returned bad proxy \"{}\". Message: \"{}\"".format(proxy_url, ex)
             )
 
-        print("Good job, your collector is awesome!")
+        is_working = False
+        for raw_protocol in range(len(Proxy.PROTOCOLS)):
+            proxy_url = "{}://".format(Proxy.PROTOCOLS[raw_protocol])
+            if auth_data:
+                proxy_url += auth_data + "@"
+
+            proxy_url += domain + ":" + str(port)
+
+            start_checking_time = time.time()
+            check_result, checker_additional_info = await proxy_utils.check_proxy(proxy_url)
+            end_checking_time = time.time()
+
+            if check_result:
+                is_working = True
+                break
+
+        response_time = end_checking_time - start_checking_time
+
+        if not is_working:
+            color = 'red'
+        elif response_time < 1:
+            color = 'cyan'
+        elif response_time < 5:
+            color = 'green'
+        elif response_time < 10:
+            color = 'yellow'
+        else:
+            color = 'magenta'
+
+        print(colored(' ', on_color='on_' + color), end='')
+        sys.stdout.flush()

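The `BoundedSemaphore` above is what keeps the tester from opening an unbounded number of connections: `asyncio.gather()` schedules a task per proxy immediately, but only `settings.NUMBER_OF_CONCURRENT_TASKS` of them can hold the semaphore at once. A self-contained sketch of the same pattern, with a sleep standing in for the real `proxy_utils.check_proxy()` call:

    import asyncio


    async def check(proxy_url: str, semaphore: asyncio.Semaphore):
        async with semaphore:  # at most `limit` checks run at the same time
            await asyncio.sleep(0.01)  # stand-in for the real network check


    async def main(limit: int = 64):
        semaphore = asyncio.BoundedSemaphore(limit)
        proxies = ['127.0.0.1:{}'.format(port) for port in range(8000, 8100)]
        # all 100 tasks start immediately; the semaphore throttles them
        await asyncio.gather(*[check(proxy, semaphore) for proxy in proxies])


    asyncio.run(main())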