Commit 7f775ae

better proxy validation, parsers, some of guide

1 parent 711d4fa commit 7f775ae

9 files changed: +266 -89 lines

collectors/web/cn/89ip/collector.py

Lines changed: 7 additions & 2 deletions
@@ -1,4 +1,6 @@
 from collectors.abstract_collector import AbstractCollector
+from parsers.regex_parser import RegexParser
+
 import http_client
 
 
@@ -18,6 +20,9 @@ def __init__(self):
 
     async def collect(self):
         url = 'http://www.89ip.cn/tqdl.html?num=9999&address=&kill_address=&port=&kill_port=&isp='
-        html = await http_client.get_text(url)
-
         return []
+        # html = await http_client.get_text(url)
+        with open('/tmp/mock') as f:
+            html = f.read()
+
+        return RegexParser().parse(html)

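As committed, the retained `return []` still precedes the newly added lines, so the mock-reading code below it appears unreachable; together with the hard-coded /tmp/mock read it is clearly temporary scaffolding for testing the new parser. Once verified, the method would presumably be restored to fetch the page again. A minimal sketch, assuming `http_client.get_text` stays the fetch helper as in the removed line:

    async def collect(self):
        url = 'http://www.89ip.cn/tqdl.html?num=9999&address=&kill_address=&port=&kill_port=&isp='
        html = await http_client.get_text(url)

        # RegexParser().parse() yields proxies as "domain:port" strings
        return RegexParser().parse(html)
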
collectors_list.py

Lines changed: 0 additions & 13 deletions
@@ -60,19 +60,6 @@ async def init():
6060
)
6161

6262

63-
# def get_collector_state(module_name: str):
64-
# try:
65-
# collector_state = asyncio.get_event_loop().run_until_complete(db.get(
66-
# CollectorState.select().where(
67-
# CollectorState.identifier == module_name
68-
# )
69-
# ))
70-
# except CollectorState.DoesNotExist:
71-
# raise CollectorNotFoundException()
72-
#
73-
# return collector_state
74-
75-
7663
def get_collector_of_module_name(module_name: str):
7764
if module_name not in collectors:
7865
raise CollectorNotFoundException(

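The deleted block was a commented-out synchronous helper that drove the event loop with run_until_complete() from inside library code. Should the lookup ever be needed again, a sketch of the same query as a coroutine, reusing the `CollectorState`, `db` and `CollectorNotFoundException` names this module already has, might look like:

    async def get_collector_state(module_name: str):
        # same query as the deleted helper, but awaitable from the running loop
        try:
            return await db.get(
                CollectorState.select().where(
                    CollectorState.identifier == module_name
                )
            )
        except CollectorState.DoesNotExist:
            raise CollectorNotFoundException()
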
docs/source/guides/how_to_create_collector.rst

Lines changed: 20 additions & 13 deletions
@@ -4,14 +4,14 @@ proxy_py How to create collector
 Collector is a class which is used to parse proxies from web page or another source.
 All collectors are inherited from `collectors.abstract_collector.AbstractCollector`,
 also there is `collectors.pages_collector.PagesCollector` which is used for paginated sources.
-It's better to understand through examples.
+It's always better to learn through the examples.
 
 Simple collector
 ****************
 
 Let's start with the simplest collector we can imagine,
-it will be from the page http://www.89ip.cn/ti.html as you can see,
-it sends form as GET request to this url
+it will be collecting from the page http://www.89ip.cn/ti.html
+as you can see, it sends form as GET request to this url
 http://www.89ip.cn/tqdl.html?num=9999&address=&kill_address=&port=&kill_port=&isp=
 
 Firstly we can try to check that these proxies are really good.
@@ -25,20 +25,23 @@ You're gonna get something like this:
 
 `++++++++++++++++++++++-+++++-+++++++++++++++++++++++++++-++++++-++++-+++++++++++++++++++++++++++++++--+++++++-+++++++-++-+-+++-+++++++++-+++++++++++++++++++++--++--+-++++++++++++++++-+++--+++-+-+++++++++++++++++--++++++++++++-+++++-+++-++++++++-+++++-+-+++++++-++-+--++++-+++-++++++++++-++++--+++++++-+++++++-++--+++++-+-+++++++++++++++++++++-++-+++-+++--++++--+++-+++++++-+++++++-+++++++++++++++---+++++-+++++++++-+++++-+-++++++++++++-+--+++--+-+-+-++-+++++-+++--++++++-+++++++++++--+-+++-+-++++--+++++--+++++++++-+-+-++++-+-++++++++++++++-++-++++++--+--++++-+-++--++--+++++-++-+++-++++--++--+---------+--+--++--------+++-++-+--++++++++++++++++-+++++++++-+++++++--+--+--+-+-+++---++------------------+--+----------+-+-+--++-+----------+-------+--+------+----+-+--+--++----+--+-++++++-++-+++`
 
-+ means working proxy with at least one protocol,
-- means not working, the result above is perfect, so many good proxies.
+"\+" means working proxy with at least one protocol, "\-" means not working, the result above is perfect, so many good proxies.
 
-Note: working means proxy respond with timeout set in settings, if you increase it you'll get more proxies.
+Note: working means proxy respond with timeout set in settings,
+if you increase it, you're likely to get more proxies.
 
-Alright, let's code it!
+Alright, let's code!
 
-We need to place our collector inside `collectors/web/` directory using reversed domain path,
+We need to place our collector inside `collectors/web/`
+directory using reversed domain path,
 it will be `collectors/web/cn/89ip/collector.py`
 
-To make class be collector we need to declare variable `__collector__`
+To make class be a collector we need to declare a variable
+`__collector__`
 
 Note: name of file and name of class don't make sense,
-you can declare as many files and classes in each file per domain as you want
+you can declare as many files and classes in each file
+per domain as you want
 
 .. code-block:: python
 
@@ -48,22 +51,26 @@ you can declare as many files and classes in each file per domain as you want
     class Collector(AbstractCollector):
         __collector__ = True
 
-We can override default processing period in constructor like this:
+We can override default processing period in constructor
+like this:
 
 .. code-block:: python
 
     def __init__(self):
        super(Collector, self).__init__()
-        self.processing_period = 30 * 60  # 30 minutes
+        # 30 minutes
+        self.processing_period = 30 * 60
        '''
        floating period means proxy_py will be changing
        period to not make extra requests and handle
-        new proxies in time, you don't need to change
+        new proxies in time, you don't need to disable
        it in most cases
        '''
        # self.floating_processing_period = False
 
 
+The last step is to implement `collect()` method.
+
 Paginated collector
 *******************
 

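The new closing line leaves collect() itself unshown in the guide; putting the guide's pieces together, a complete minimal collector consistent with this commit's 89ip example might look like:

    from collectors.abstract_collector import AbstractCollector
    from parsers.regex_parser import RegexParser

    import http_client


    class Collector(AbstractCollector):
        __collector__ = True

        def __init__(self):
            super(Collector, self).__init__()
            # 30 minutes
            self.processing_period = 30 * 60

        async def collect(self):
            url = 'http://www.89ip.cn/tqdl.html?num=9999&address=&kill_address=&port=&kill_port=&isp='
            html = await http_client.get_text(url)

            return RegexParser().parse(html)
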
main.py

Lines changed: 49 additions & 1 deletion
@@ -8,9 +8,40 @@
 
 import asyncio
 import logging
+import argparse
 
+from tools import test_collector
+
+test_collector_path = None
+main_logger = None
+
+
+def process_cmd_arguments():
+    global test_collector_path
+
+    def str_to_bool(value):
+        if value.lower() in ("yes", "true", "t", "y", "1"):
+            return True
+        elif value.lower() in ("no", "false", "f", "n", "0"):
+            return False
+        else:
+            raise argparse.ArgumentTypeError("Boolean value expected.")
+
+    cmd_parser = argparse.ArgumentParser()
+    cmd_parser.add_argument("--debug", type=str_to_bool, help="override settings' debug value")
+    cmd_parser.add_argument("--test-collector", help="test collector with a given path")
+
+    args = cmd_parser.parse_args()
+
+    if args.debug is not None:
+        settings.DEBUG = args.debug
+
+    test_collector_path = args.test_collector
+
+
+def prepare_loggers():
+    global main_logger
 
-if __name__ == "__main__":
     asyncio_logger = logging.getLogger("asyncio")
     asyncio_logger_file_handler = logging.FileHandler("logs/asyncio.log")
     asyncio_logger_file_handler.setLevel(logging.DEBUG)
@@ -43,8 +74,16 @@
 
     main_logger.addHandler(logger_file_handler)
 
+
+def main():
+    process_cmd_arguments()
+    prepare_loggers()
+
     loop = asyncio.get_event_loop()
 
+    if test_collector_path is not None:
+        return loop.run_until_complete(test_collector.run(test_collector_path))
+
     proxy_processor = Processor.get_instance()
 
     proxy_provider_server = ProxyProviderServer(
@@ -61,6 +100,15 @@
             statistics.worker(),
         ]))
         BaseChecker.clean()
+    except KeyboardInterrupt:
+        pass
     except BaseException as ex:
         main_logger.exception(ex)
         print("critical error happened, see logs/main.log")
+        return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+    exit(main())

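With these changes main.py gains a small CLI and proper exit codes. Assuming the flag semantics above, invocations would presumably look like:

    python3 main.py --debug true
    python3 main.py --test-collector <path to collector>

(the exact value expected by --test-collector depends on tools/test_collector.run(), which is not shown in this diff). The custom str_to_bool converter is needed because argparse's type=bool would treat any non-empty string, including "false", as true.
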
parsers/regex_parser.py

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
+import re
+
+
+# TODO: add ipv6 addresses, make domain checking better
+_0_TO_255_REGEX = r"([0-9]|[1-8][0-9]|9[0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])"
+DOMAIN_LETTER_REGEX = r"[a-zA-Z0-9_\-]"
+PROXY_FIND_REGEX = \
+    r"((?P<protocol>(http|socks4|socks5))://)?" \
+    r"((?P<auth_data>[a-zA-Z0-9_\.]+:[a-zA-Z0-9_\.]+)@)?" \
+    r"(?P<domain>" + \
+    r"(" + _0_TO_255_REGEX + "\.){3}" + _0_TO_255_REGEX + \
+    r"|" + DOMAIN_LETTER_REGEX + r"+(\.[a-zA-Z]" + DOMAIN_LETTER_REGEX + r"+)+):" \
+    r"(?P<port>(6553[0-5]|655[0-2][0-9]|65[0-4][0-9]{2}|6[0-4][0-9]{3}|[1-5][0-9]{4}|999[0-9]|99[0-8][0-9]|9[0-8][0-9]{2}|[1-8][0-9]{3}|99[0-9]|9[0-8][0-9]|[1-8][0-9]{2}|9[0-9]|[1-8][0-9]|[1-9]))"
+PROXY_VALIDATE_REGEX = '^' + PROXY_FIND_REGEX + '/?$'
+
+
+class RegexParser:
+    '''
+    It's used to scratch proxies from text by regular expression,
+    you can pass your own expression(see docstring for parse method)
+    '''
+
+    def __init__(self, expression=PROXY_FIND_REGEX, flags=0):
+        '''
+
+        :param expression: expression which is used to parse proxies with named groups:
+        (?P<protocol>) - for proxy protocol (socks4/5, http)
+        (?P<auth_data>) - authentication data (login and password)
+        (?P<domain>) - IP or domain
+        (?P<port>) - port
+        There is a default value which parses proxies like these ones:
+        127.0.0.1:9060
+        test.proxy.com:8080
+        socks4://8.8.8.8:65000
+        http://user:password@proxy.test.info/
+
+        :param flags: flags that are passed to re.compile
+        '''
+
+        flags |= re.MULTILINE
+        self.expression = expression
+        self.regex = re.compile(expression, flags)
+
+    def parse(self, text: str) -> list:
+        for match in self.regex.finditer(text):
+            match = match.groupdict()
+            yield match["domain"] + ':' + match["port"]

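A quick usage sketch of the new parser. Note that despite the `-> list` annotation, parse() is a generator (it yields), so callers need to iterate it or wrap it in list():

    from parsers.regex_parser import RegexParser

    text = '''
    127.0.0.1:9060
    test.proxy.com:8080
    socks4://8.8.8.8:65000
    '''

    # yields "domain:port" strings; protocol and auth data are matched but dropped
    proxies = list(RegexParser().parse(text))
    # ['127.0.0.1:9060', 'test.proxy.com:8080', '8.8.8.8:65000']
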
processor.py

Lines changed: 41 additions & 55 deletions
@@ -1,4 +1,6 @@
+import proxy_validator
 from checkers.base_checker import CheckerResult
+from parsers.regex_parser import PROXY_VALIDATE_REGEX
 from proxy_py import settings
 from models import Proxy, CollectorState, db
 
@@ -11,16 +13,6 @@
 import peewee
 
 
-# TODO: add ipv6 addresses, make domain checking better
-_0_TO_255_REGEX = r"([0-9]|[1-8][0-9]|9[0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])"
-DOMAIN_LETTER_REGEX = r"[a-zA-Z0-9_\-]"
-PROXY_VALIDATE_REGEX = \
-    r"^((?P<protocol>(http|socks4|socks5))://)?" \
-    r"((?P<auth_data>[a-zA-Z0-9_\.]+:[a-zA-Z0-9_\.]+)@)?" \
-    r"(?P<domain>" + \
-    r"(" + _0_TO_255_REGEX + "\.){3}" + _0_TO_255_REGEX + \
-    r"|" + DOMAIN_LETTER_REGEX + r"+(\.[a-zA-Z]" + DOMAIN_LETTER_REGEX + r"+)*):" \
-    r"(?P<port>([1-9]|[1-8][0-9]|9[0-9]|[1-8][0-9]{2}|9[0-8][0-9]|99[0-9]|[1-8][0-9]{3}|9[0-8][0-9]{2}|99[0-8][0-9]|999[0-9]|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5]))/?$"
 
 LOGGERS_FORMAT = "%(levelname)s ~ %(asctime)s ~ %(funcName)30s() ~ %(message)s"
 
@@ -213,59 +205,53 @@ async def process_raw_proxies(self, proxies, collector_id):
     async def process_raw_proxy(self, proxy, collector_id):
         self.logger.debug("processing raw proxy \"{}\"".format(proxy))
 
-        matches = re.match(PROXY_VALIDATE_REGEX, proxy)
-        if matches:
-            matches = matches.groupdict()
-            auth_data = matches["auth_data"]
-            domain = matches["domain"]
-            port = matches["port"]
+        try:
+            _, auth_data, domain, port = proxy_validator.retrieve(proxy)
+        except proxy_validator.ValidationError as ex:
+            self.collectors_logger.error(
+                "Collector with id \"{}\" returned bad raw proxy \"{}\". "
+                "Message: {}".format(collector_id, proxy, ex)
+            )
+            return
 
-            if auth_data is None:
-                auth_data = ""
 
-            if domain is None or port is None:
-                self.collectors_logger.error(
-                    "Bad raw proxy \"{}\" from collector \"{}\"".format(proxy, collector_id)
+        # don't care about protocol
+        try:
+            proxy = await db.get(
+                Proxy.select().where(
+                    Proxy.auth_data == auth_data,
+                    Proxy.domain == domain,
+                    Proxy.port == port,
                 )
-                return
+            )
 
-            # don't care about protocol
-            try:
-                proxy = await db.get(
-                    Proxy.select().where(
-                        Proxy.auth_data == auth_data,
-                        Proxy.domain == domain,
-                        Proxy.port == port,
-                    )
+            if proxy.last_check_time + settings.PROXY_NOT_CHECKING_PERIOD >= time.time():
+                proxy_short_address = ""
+                if auth_data:
+                    proxy_short_address += auth_data + "@"
+
+                proxy_short_address += "{}:{}".format(domain, port)
+
+                self.logger.debug(
+                    "skipping proxy \"{}\" from collector \"{}\"".format(
+                        proxy_short_address, collector_id)
                 )
+                return
+        except Proxy.DoesNotExist:
+            pass
 
-                if proxy.last_check_time + settings.PROXY_NOT_CHECKING_PERIOD >= time.time():
-                    proxy_short_address = ""
-                    if auth_data:
-                        proxy_short_address += auth_data + "@"
+        for raw_protocol in range(len(Proxy.PROTOCOLS)):
+            while not self.good_proxies_are_processed:
+                # TODO: find a better way
+                await asyncio.sleep(0.1)
 
-                    proxy_short_address += "{}:{}".format(domain, port)
+            new_proxy = Proxy()
+            new_proxy.raw_protocol = raw_protocol
+            new_proxy.auth_data = auth_data
+            new_proxy.domain = domain
+            new_proxy.port = port
 
-                    self.logger.debug(
-                        "skipping proxy \"{}\" from collector \"{}\"".format(
-                            proxy_short_address, collector_id)
-                    )
-                    return
-            except Proxy.DoesNotExist:
-                pass
-
-            for raw_protocol in range(len(Proxy.PROTOCOLS)):
-                while not self.good_proxies_are_processed:
-                    # TODO: find a better way
-                    await asyncio.sleep(0.1)
-
-                new_proxy = Proxy()
-                new_proxy.raw_protocol = raw_protocol
-                new_proxy.auth_data = auth_data
-                new_proxy.domain = domain
-                new_proxy.port = port
-
-                await self.add_proxy_to_queue(new_proxy, collector_id)
+            await self.add_proxy_to_queue(new_proxy, collector_id)
 
     async def process_proxy(self, raw_protocol: int, auth_data: str, domain: str, port: int, collector_id):
         async with self.proxies_semaphore:

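process_raw_proxy() now delegates validation to proxy_validator.retrieve(), which evidently returns a (protocol, auth_data, domain, port) tuple and raises proxy_validator.ValidationError on bad input. That module is among the nine changed files but is not shown here, so the following is only a sketch of what it presumably does, inferred from the call site and from the old inline validation (which normalized a missing auth_data to an empty string):

    import re

    from parsers.regex_parser import PROXY_VALIDATE_REGEX


    class ValidationError(Exception):
        pass


    def retrieve(proxy: str) -> tuple:
        # PROXY_VALIDATE_REGEX is anchored with ^...$, so match() validates the whole string
        matches = re.match(PROXY_VALIDATE_REGEX, proxy)
        if matches is None:
            raise ValidationError("proxy \"{}\" doesn't match the pattern".format(proxy))

        groups = matches.groupdict()
        return (
            groups["protocol"],
            groups["auth_data"] or "",  # the old code mapped a missing auth_data to ""
            groups["domain"],
            groups["port"],
        )

Incidentally, processor.py's new direct import of PROXY_VALIDATE_REGEX appears unused by the hunks shown in this diff.
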