
Commit 1a8f208

new collector, complete guide, awesome tool for checking collectors
1 parent 7f775ae commit 1a8f208

File tree

10 files changed: +138 -27 lines changed


TODO

Lines changed: 0 additions & 2 deletions
@@ -1,6 +1,5 @@
 new web interface
 statistics for proxies and for collectors
-logging for collectors
 log slow sql queries
 set limit for total number of proxies
 do not update never working proxies
@@ -12,6 +11,5 @@ floating updating period for collectors
 p2p
 log memory consumption
 stop using ipinfo.io because of rate limits
-fix leak of something, again :(
 remove processor proxies queue model
 write number of asyncio tasks to DB to be able to draw more graphs (everybody loves statistics)

collectors/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+from collectors.abstract_collector import AbstractCollector
+from collectors.pages_collector import PagesCollector

collectors/abstract_collector.py

Lines changed: 4 additions & 6 deletions
@@ -5,7 +5,6 @@
 import models
 
 
-# TODO: refactor saving state
 class AbstractCollector:
     """Base class for all types of collectors"""
 
@@ -38,7 +37,7 @@ async def collect(self):
     async def _collect(self):
         """Do not use! It is called on collector's processing automatically"""
 
-        # TODO: uncomment when python 3.6 come to ubuntu lts
+        # TODO: uncomment when python 3.6 comes to ubuntu lts
         # i = 0
         # async for proxy in self.collect():
         #     if i > settings.COLLECTOR_MAXIMUM_NUMBER_OF_PROXIES_PER_REQUEST:
@@ -74,17 +73,16 @@ async def load_state(self, state):
     async def save_state(self, state: models.CollectorState):
         """
        Function for saving collector's state to database model.
-        It's called automatically, don't worry.
+        It's called automatically, don't worry about it.
         """
         state.last_processing_time = self.last_processing_time
         state.processing_period = self.processing_period
         state.last_processing_proxies_count = self.last_processing_proxies_count
 
         if self.saved_variables is not None:
+            if '_variables' not in self.data:
+                self.data['_variables'] = {}
             for var_name in self.saved_variables:
-                if '_variables' not in self.data:
-                    self.data['_variables'] = {}
-
                 self.data['_variables'][var_name] = getattr(self, var_name)
 
         state.data = json.dumps(self.data)

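For reference, a minimal sketch of a collector using this state-saving mechanism. The collector name and the `last_page` attribute are made up for illustration; only the `saved_variables` handling is taken from the code above, and the values are presumably restored by `load_state()`:

    from collectors import AbstractCollector


    class StatefulCollector(AbstractCollector):
        def __init__(self):
            super(StatefulCollector, self).__init__()
            self.last_page = 0
            # attributes listed here are copied into self.data['_variables']
            # by save_state() and serialized to the database as JSON
            self.saved_variables = {'last_page'}

        async def collect(self):
            # the restored value survives restarts; save_state() persists it
            self.last_page += 1
            return []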
collectors/web/cn/89ip/collector.py

Lines changed: 9 additions & 8 deletions
@@ -1,5 +1,5 @@
-from collectors.abstract_collector import AbstractCollector
-from parsers.regex_parser import RegexParser
+from collectors import AbstractCollector
+from parsers import RegexParser
 
 import http_client
 
@@ -9,7 +9,8 @@ class Collector(AbstractCollector):
 
     def __init__(self):
         super(Collector, self).__init__()
-        self.processing_period = 30 * 60  # 30 minutes
+        # 30 minutes
+        self.processing_period = 30 * 60
         '''
         floating period means proxy_py will be changing
         period to not make extra requests and handle
@@ -20,9 +21,9 @@ def __init__(self):
 
     async def collect(self):
        url = 'http://www.89ip.cn/tqdl.html?num=9999&address=&kill_address=&port=&kill_port=&isp='
-        return []
-        # html = await http_client.get_text(url)
-        with open('/tmp/mock') as f:
-            html = f.read()
-
+        # send a request to get the html code of the page
+        html = await http_client.get_text(url)
+        # and just parse it using the regex parser with a default rule to
+        # parse proxies like this:
+        # 8.8.8.8:8080
         return RegexParser().parse(html)

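The "default rule" mentioned in those comments is an `ip:port` pattern. As a rough stand-in only (the real pattern lives in `parsers/regex_parser.py`; this simplified version is an assumption, not the actual rule), the parsing step boils down to something like:

    import re

    # simplified stand-in for RegexParser's default rule
    PROXY_RE = re.compile(r'(\d{1,3}(?:\.\d{1,3}){3}):(\d{1,5})')


    def parse_proxies(html: str):
        # returns plain 'ip:port' strings, e.g. '8.8.8.8:8080'
        return ['{}:{}'.format(ip, port) for ip, port in PROXY_RE.findall(html)]


    print(parse_proxies('<td>8.8.8.8:8080</td>'))  # ['8.8.8.8:8080']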
docs/source/guides/how_to_create_collector.rst

Lines changed: 62 additions & 5 deletions
@@ -1,5 +1,5 @@
-proxy_py How to create collector
-================================
+proxy_py How to create a collector
+==================================
 
 Collector is a class which is used to parse proxies from a web page or another source.
 All collectors are inherited from `collectors.abstract_collector.AbstractCollector`,
@@ -45,7 +45,7 @@ per domain as you want
 
 .. code-block:: python
 
-    from collectors.abstract_collector import AbstractCollector
+    from collectors import AbstractCollector
 
 
     class Collector(AbstractCollector):
@@ -70,10 +70,67 @@ like this:
 
 
 The last step is to implement the `collect()` method.
+Import the useful things:
+
+.. code-block:: python
+
+    from parsers import RegexParser
+
+    import http_client
+
+
+and implement the method like this:
+
+.. code-block:: python
+
+    async def collect(self):
+        url = 'http://www.89ip.cn/tqdl.html?num=9999&address=&kill_address=&port=&kill_port=&isp='
+        # send a request to get the html code of the page
+        html = await http_client.get_text(url)
+        # and just parse it using the regex parser with a default rule to
+        # parse proxies like this:
+        # 8.8.8.8:8080
+        return RegexParser().parse(html)
+
+That's all!
+
+Now it's time for a little test. To be sure your collector is working,
+you can run proxy_py with the `--test-collector` option:
+
+.. code-block:: bash
+
+    python3 main.py --test-collector collectors/web/cn/89ip/collector.py:Collector
+
+which means: take the class `Collector` from the file `collectors/web/cn/89ip/collector.py`.
+
+It's gonna draw you a pattern like this:
+
+.. image:: https://i.imgur.com/fmVp3Iz.png
+
+where a red cell means a proxy that isn't working, and the other colors
+show how fast it responds:
+
+- cyan - within a second
+- green - within 5 seconds
+- yellow - within 10 seconds
+- magenta - slower than 10 seconds
+
+Note: don't forget that settings.py limits the amount of time
+a proxy has to respond.
+You can override the proxy checking timeout with the
+`--proxy-checking-timeout` option. For example:
+
+.. code-block:: bash
+
+    python3 main.py --test-collector collectors/web/cn/89ip/collector.py:Collector --proxy-checking-timeout 60
+
+With a 60 second timeout it looks better:
+
+.. image:: https://i.imgur.com/DmNuzOI.png
 
 Paginated collector
 *******************
 
-So, let's create a simple paginated collector.
-
+Alright, you're done with a simple collector and you're almost a pro.
+Let's now dive a little deeper.
 
+# TODO: complete this guide

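The paginated part of the guide is still a TODO upstream, but `collectors/__init__.py` above exports `PagesCollector`, so a paginated collector will presumably subclass it. Purely as a hypothetical sketch — the `pages_count` attribute, the `process_page()` method, and the URL are invented for illustration and not confirmed anywhere in this commit:

    from collectors import PagesCollector

    import http_client


    class Collector(PagesCollector):
        def __init__(self):
            super(Collector, self).__init__()
            self.pages_count = 10  # hypothetical: how many pages to walk

        async def process_page(self, page_index):
            # hypothetical: fetch and parse a single page of proxies
            html = await http_client.get_text(
                'http://example.com/proxies/?page={}'.format(page_index)
            )
            return []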
main.py

Lines changed: 9 additions & 0 deletions
@@ -29,13 +29,22 @@ def str_to_bool(value):
 
 cmd_parser = argparse.ArgumentParser()
 cmd_parser.add_argument("--debug", type=str_to_bool, help="override settings' debug value")
+cmd_parser.add_argument(
+    "--proxy-checking-timeout", type=float, help="override settings' proxy checking timeout"
+)
 cmd_parser.add_argument("--test-collector", help="test collector with a given path")
 
 args = cmd_parser.parse_args()
 
 if args.debug is not None:
     settings.DEBUG = args.debug
 
+if args.proxy_checking_timeout is not None:
+    if args.proxy_checking_timeout <= 0:
+        raise ValueError("--proxy-checking-timeout should be positive")
+
+    settings.PROXY_CHECKING_TIMEOUT = args.proxy_checking_timeout
+
 test_collector_path = args.test_collector
 
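A design note: the positivity check could instead live in argparse itself via a custom `type` callable, so a bad value is reported as a clean usage error rather than a ValueError traceback. A sketch of that alternative, not what this commit does:

    import argparse


    def positive_float(value: str) -> float:
        result = float(value)
        if result <= 0:
            raise argparse.ArgumentTypeError("should be a positive number")
        return result


    cmd_parser = argparse.ArgumentParser()
    cmd_parser.add_argument(
        "--proxy-checking-timeout", type=positive_float,
        help="override settings' proxy checking timeout",
    )
    print(cmd_parser.parse_args(["--proxy-checking-timeout", "60"]))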
parsers/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+from parsers.regex_parser import RegexParser

processor.py

Lines changed: 0 additions & 2 deletions
@@ -1,14 +1,12 @@
 import proxy_validator
 from checkers.base_checker import CheckerResult
-from parsers.regex_parser import PROXY_VALIDATE_REGEX
 from proxy_py import settings
 from models import Proxy, CollectorState, db
 
 import collectors_list
 import proxy_utils
 import asyncio
 import time
-import re
 import logging
 import peewee
 

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -11,3 +11,4 @@ sphinx_rtd_theme
 py_mini_racer
 pytest
 pytest-asyncio
+termcolor

tools/test_collector.py

Lines changed: 50 additions & 4 deletions
@@ -1,8 +1,16 @@
-import proxy_validator
+import time
+
+from termcolor import colored
+from models import Proxy
+from proxy_py import settings
 from collectors_list import collectors
 
 import re
 import sys
+import proxy_utils
+import asyncio
+import proxy_validator
 
 from parsers.regex_parser import RegexParser
 
@@ -24,13 +32,51 @@ async def run(path: str):
 
     result = list(await collector.collect())
 
+    print("Total number of proxies: {}".format(len(result)))
+    await asyncio.gather(*[process_proxy(proxy) for proxy in result])
 
-    for proxy in result:
+
+proxies_semaphore = asyncio.BoundedSemaphore(settings.NUMBER_OF_CONCURRENT_TASKS)
+
+
+async def process_proxy(proxy_url: str):
+    async with proxies_semaphore:
         try:
-            proxy_validator.retrieve(proxy)
+            _, auth_data, domain, port = proxy_validator.retrieve(proxy_url)
         except proxy_validator.ValidationError as ex:
             raise ValueError(
                 "Your collector returned bad proxy \"{}\". Message: \"{}\"".format(proxy_url, ex)
             )
 
-        print("Good job, your collector is awesome!")
+        is_working = False
+        for raw_protocol in range(len(Proxy.PROTOCOLS)):
+            proxy_url = "{}://".format(Proxy.PROTOCOLS[raw_protocol])
+            if auth_data:
+                proxy_url += auth_data + "@"
+
+            proxy_url += domain + ":" + str(port)
+
+            start_checking_time = time.time()
+            check_result, checker_additional_info = await proxy_utils.check_proxy(proxy_url)
+            end_checking_time = time.time()
+
+            if check_result:
+                is_working = True
+                break
+
+        response_time = end_checking_time - start_checking_time
+
+        if not is_working:
+            color = 'red'
+        elif response_time < 1:
+            color = 'cyan'
+        elif response_time < 5:
+            color = 'green'
+        elif response_time < 10:
+            color = 'yellow'
+        else:
+            color = 'magenta'
+
+        print(colored(' ', on_color='on_' + color), end='')
+        sys.stdout.flush()

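The `BoundedSemaphore` above is what keeps the tester from opening an unbounded number of connections: `asyncio.gather()` schedules a task per proxy immediately, but only `settings.NUMBER_OF_CONCURRENT_TASKS` of them can hold the semaphore at once. A self-contained sketch of the same pattern, with a sleep standing in for the real `proxy_utils.check_proxy()` call:

    import asyncio


    async def check(proxy_url: str, semaphore: asyncio.Semaphore):
        async with semaphore:  # at most `limit` checks run at the same time
            await asyncio.sleep(0.01)  # stand-in for the real network check


    async def main(limit: int = 64):
        semaphore = asyncio.BoundedSemaphore(limit)
        proxies = ['127.0.0.1:{}'.format(port) for port in range(8000, 8100)]
        # all 100 tasks start immediately; the semaphore throttles them
        await asyncio.gather(*[check(proxy, semaphore) for proxy in proxies])


    asyncio.run(main())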