
Commit 3435a72

some code
1 parent 625d64b commit 3435a72

4 files changed: +161 -4 lines changed

checkers/base_checker.py

Lines changed: 1 addition & 0 deletions
@@ -35,6 +35,7 @@ def set_attr_if_is_not_none(attribute_name, first_obj, second_obj):


 class BaseChecker:
+    # TODO: rewrite using HttpClient
     aiohttp_connector = None

     def __init__(self, url=None, request_type="GET", timeout=None):
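The TODO added here refers to the new HttpClient introduced in this commit (see http_client.py below). A minimal sketch of what that rewrite could look like, assuming the checker only needs the response body and selects the proxy per request; apart from the class name and the __init__ signature shown in the diff, everything in this sketch (notably the fetch helper) is hypothetical:

import http_client


class BaseChecker:
    def __init__(self, url=None, request_type="GET", timeout=None):
        self.url = url
        self.request_type = request_type
        self.timeout = timeout

    async def fetch(self, proxy_address):
        # hypothetical helper: route the check request through HttpClient
        # instead of keeping a class-level aiohttp connector here
        client = http_client.HttpClient()
        client.proxy_address = proxy_address
        if self.timeout is not None:
            client.timeout = self.timeout
        result = await client.request(self.request_type, self.url, None)
        return result.as_text()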

collectors/web/cn/89ip/collector.py

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
from collectors.abstract_collector import AbstractCollector
import http_client


class Collector(AbstractCollector):
    __collector__ = True

    def __init__(self):
        super(Collector, self).__init__()
        self.processing_period = 30 * 60  # 30 minutes
        '''
        floating period means proxy_py will be changing
        period to not make extra requests and handle
        new proxies in time, you don't need to change
        it in most cases
        '''
        # self.floating_processing_period = False

    async def collect(self):
        url = 'http://www.89ip.cn/tqdl.html?num=9999&address=&kill_address=&port=&kill_port=&isp='
        html = await http_client.get_text(url)

        return []
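The committed collect() fetches the page but still returns an empty list. A hedged sketch of the parsing step it is presumably building toward, assuming the endpoint embeds plain ip:port pairs in the HTML (the actual page format is not shown in this commit):

import re

from collectors.abstract_collector import AbstractCollector
import http_client

# assumption: proxies appear as bare "ip:port" strings in the page body
PROXY_RE = re.compile(r'(?:\d{1,3}\.){3}\d{1,3}:\d+')


class Collector(AbstractCollector):
    __collector__ = True

    async def collect(self):
        url = 'http://www.89ip.cn/tqdl.html?num=9999&address=&kill_address=&port=&kill_port=&isp='
        html = await http_client.get_text(url)

        # keep only well-formed ip:port matches found in the page
        return PROXY_RE.findall(html)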

docs/source/guides/how_to_create_collector.rst

Lines changed: 46 additions & 4 deletions
@@ -1,14 +1,21 @@
 proxy_py How to create collector
 ================================

-Collector is a class which is used to parse proxies from web page or another source. All collectors are inherited from `collectors.abstract_collector.AbstractCollector`, also there is `collectors.pages_collector.PagesCollector` which is used for paginated sources. It's better to understand through examples.
+A collector is a class which is used to parse proxies from a web page or another source.
+All collectors inherit from `collectors.abstract_collector.AbstractCollector`;
+there is also `collectors.pages_collector.PagesCollector`, which is used for paginated sources.
+It's easier to understand through examples.

 Simple collector
 ****************

-Let's start with the simplest collector we can imagine, it will be from the page http://www.89ip.cn/ti.html as you can see, it sends form as GET request to this url http://www.89ip.cn/tqdl.html?num=9999&address=&kill_address=&port=&kill_port=&isp=
+Let's start with the simplest collector we can imagine:
+it will collect from the page http://www.89ip.cn/ti.html which, as you can see,
+sends its form as a GET request to this url:
+http://www.89ip.cn/tqdl.html?num=9999&address=&kill_address=&port=&kill_port=&isp=

-Firstly we can try to check that these proxies are really good. Just copy and paste list of proxies to file say /tmp/proxies and run this command inside virtual environment
+First we can check that these proxies are really good.
+Just copy and paste the list of proxies to a file, say /tmp/proxies, and run this command inside the virtual environment:

 .. code-block:: bash

@@ -18,10 +25,45 @@ You're gonna get something like this:

`++++++++++++++++++++++-+++++-+++++++++++++++++++++++++++-++++++-++++-+++++++++++++++++++++++++++++++--+++++++-+++++++-++-+-+++-+++++++++-+++++++++++++++++++++--++--+-++++++++++++++++-+++--+++-+-+++++++++++++++++--++++++++++++-+++++-+++-++++++++-+++++-+-+++++++-++-+--++++-+++-++++++++++-++++--+++++++-+++++++-++--+++++-+-+++++++++++++++++++++-++-+++-+++--++++--+++-+++++++-+++++++-+++++++++++++++---+++++-+++++++++-+++++-+-++++++++++++-+--+++--+-+-+-++-+++++-+++--++++++-+++++++++++--+-+++-+-++++--+++++--+++++++++-+-+-++++-+-++++++++++++++-++-++++++--+--++++-+-++--++--+++++-++-+++-++++--++--+---------+--+--++--------+++-++-+--++++++++++++++++-+++++++++-+++++++--+--+--+-+-+++---++------------------+--+----------+-+-+--++-+----------+-------+--+------+----+-+--+--++----+--+-++++++-++-+++`

-+ means working proxy with at leat one protocol, - means not working, the result above is perfect, so many good proxies.
++ means a working proxy with at least one protocol,
+- means not working; the result above is perfect: so many good proxies.

 Note: working means the proxy responds within the timeout set in settings; if you increase it, you'll get more proxies.

+Alright, let's code it!
+
+We need to place our collector inside the `collectors/web/` directory using the reversed domain path,
+so it will be `collectors/web/cn/89ip/collector.py`
+
+To make a class a collector we need to declare the variable `__collector__`
+
+Note: the file name and the class name don't matter,
+you can declare as many files, and classes in each file, per domain as you want
+
+.. code-block:: python
+
+    from collectors.abstract_collector import AbstractCollector
+
+
+    class Collector(AbstractCollector):
+        __collector__ = True
+
+We can override the default processing period in the constructor like this:
+
+.. code-block:: python
+
+    def __init__(self):
+        super(Collector, self).__init__()
+        self.processing_period = 30 * 60  # 30 minutes
+        '''
+        floating period means proxy_py will be changing
+        the period to not make extra requests and to handle
+        new proxies in time; you don't need to change
+        it in most cases
+        '''
+        # self.floating_processing_period = False
+
+
 Paginated collector
 *******************

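The guide stops at the "Paginated collector" heading in this hunk. For orientation, a hedged sketch of what such a collector might look like; the commit only names `collectors.pages_collector.PagesCollector`, so the attribute `pages_count`, the hook `process_page`, and the example URL below are assumptions, not the library's confirmed API:

from collectors.pages_collector import PagesCollector

import http_client


class Collector(PagesCollector):
    __collector__ = True

    def __init__(self):
        super(Collector, self).__init__()
        # assumed attribute: how many pages the source exposes
        self.pages_count = 10

    async def process_page(self, page_index):
        # assumed hook: fetch a single page and return the proxies found on it
        url = 'http://example.com/proxies?page={}'.format(page_index)
        html = await http_client.get_text(url)
        return []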

http_client.py

Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
from proxy_py import settings
from fake_useragent import UserAgent
from aiosocks.connector import ProxyConnector, ProxyClientRequest

import aiohttp


# TODO: complete cookies saving
class HttpClient:
    """
    Simple class for making http requests,
    user-agent is set to a random one in the constructor
    """
    _aiohttp_connector = None

    def __init__(self):
        self.user_agent = UserAgent().random
        self.timeout = 60
        if HttpClient._aiohttp_connector is None:
            HttpClient._aiohttp_connector = ProxyConnector(
                remote_resolve=True,
                limit=settings.NUMBER_OF_SIMULTANEOUS_REQUESTS,
                limit_per_host=settings.NUMBER_OF_SIMULTANEOUS_REQUESTS_PER_HOST,
            )
        self.proxy_address = None

    async def get(self, url):
        """
        send HTTP GET request

        :param url:
        :return:
        """
        return await self.request('GET', url, None)

    async def post(self, url, data):
        """
        send HTTP POST request

        :param url:
        :param data:
        :return:
        """
        raise NotImplementedError()

    async def request(self, method, url, data) -> 'HttpClientResult':
        headers = {
            'User-Agent': self.user_agent,
        }

        async with aiohttp.ClientSession(connector=HttpClient._aiohttp_connector,
                                         connector_owner=False,
                                         request_class=ProxyClientRequest
                                         ) as session:
            async with session.request(method,
                                       url=url,
                                       proxy=self.proxy_address,
                                       timeout=self.timeout,
                                       headers=headers) as response:
                # the result has to be built while the response is still open
                return await HttpClientResult.make(response)

    @staticmethod
    async def clean():
        HttpClient._aiohttp_connector.close()


class HttpClientResult:
    text = None
    aiohttp_response = None

    @staticmethod
    async def make(aiohttp_response):
        obj = HttpClientResult()
        obj.aiohttp_response = aiohttp_response
        obj.text = await obj.aiohttp_response.text()

        return obj

    def as_text(self):
        return self.text


async def get_text(url):
    """
    fast method for getting a GET response's text without creating extra objects

    :param url:
    :return:
    """
    return (await HttpClient().get(url)).as_text()
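A minimal usage sketch of the new client, mirroring how the 89ip collector above calls it; the event-loop wrapper and the example proxy address are illustrative only and not part of the commit:

import asyncio

import http_client


async def main():
    # module-level helper: fetch a page's text through the shared connector
    html = await http_client.get_text('http://example.com/')
    print(len(html))

    # the class itself lets you route a request through a specific proxy
    client = http_client.HttpClient()
    client.proxy_address = 'http://127.0.0.1:8080'  # hypothetical proxy
    result = await client.request('GET', 'http://example.com/', None)
    print(result.as_text()[:100])


if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(main())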
