
Commit ed4b6ed

new collector
1 parent 463e25d commit ed4b6ed

8 files changed: +472 / -3 lines changed


async_requests.py

Lines changed: 2 additions & 1 deletion
@@ -59,6 +59,7 @@ def __str__(self):
 
 
 def get_random_user_agent():
-    return 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0'
+    return 'Mozilla/5.0 (Windows NT;) Gecko/20100101 Firefox/58.0'
+    # return 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0'
     # TODO: do it
     # return UserAgent().random

collectors/collector.py

Lines changed: 2 additions & 0 deletions
@@ -1,3 +1,5 @@
+# TODO: add wrapper for doing requests and saving its cookies and UserAgent
+
 class AbstractCollector:
     # this method should return proxies in any of the following formats:
     # ip:port
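
For orientation, a minimal concrete collector against this interface could look like the sketch below. It is only an illustration: the source URL is a placeholder, and the __collector__ = True flag is an assumption based on how the other collectors in this commit are declared; the only thing the diff itself implies is an async collect() that returns proxies as "ip:port" strings.

    from collectors.collector import AbstractCollector

    import async_requests


    class Collector(AbstractCollector):
        # assumption: collectors appear to be enabled/disabled with this flag
        __collector__ = True

        async def collect(self):
            # placeholder URL; any plain-text list with one proxy per line would do
            resp = await async_requests.get("http://example.com/proxy-list.txt")
            # return entries in the "ip:port" format described above
            return [line.strip() for line in resp.text.splitlines() if line.strip()]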

collectors/freeproxylists_net/fake_data

Lines changed: 313 additions & 0 deletions
Large diffs are not rendered by default.
collectors/freeproxylists_net/freeproxylists_net.py

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+from collectors.pages_collector import PagesCollector
+
+import async_requests
+import re
+import lxml
+import lxml.html
+from urllib import parse
+
+
+BASE_URL = "http://freeproxylists.net/?page={}"
+
+
+class Collector(PagesCollector):
+    __collector__ = True
+    # pages from 1
+
+    def __init__(self):
+        self.dynamic_pages_count = True
+
+    async def process_page(self, page_index):
+        # TODO: fix it
+        result = []
+        # data = {
+        #     "recaptcha_challenge_field": "03AMPJSYUZslP6_QOT6YguO3_jhhi2HJd81u0WHPe6eWrbMIStFeXQkfFV16GGQQuHZ_Za5HKMPKYTpPfLu-K5I8G8EAzbM8OTihtneSzgolnQB47SAhg5xw5sEHaTQj_B1I8u3gnw-Ts9ng3T6UAgi4jKTteAzoqQ4_DyND2DqWOX2kAVjCJKhLEYVZdp6z3-Qi14CS2PXRegNRALrYkBeALq5S3tY51y0A",
+        #     "recaptcha_response_field": "RPETS+ATTENTION",
+        # }
+        # resp = await async_requests.post(BASE_URL.format(page_index + 1), data)
+
+        headers = {
+            "Cookie": "hl=en; pv=14; userno=20180103-012864; from=direct; visited=2018%2F01%2F03+19%3A35%3A50; __utmb=251962462.16.10.1514975753; __utmc=251962462; __utmt=1; __atuvc=15%7C1; __atuvs=5a4cb2077864569600e; __utma=251962462.2036678170.1513732296.1513732296.1514975753.2; __utmz=251962462.1513732296.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmv=251962462.Ukraine"
+        }
+
+        resp = await async_requests.get(BASE_URL.format(page_index + 1), headers=headers)
+        text = resp.text
+        # with open("/home/user/python_projects/proxy_py/collectors/freeproxylists_net/fake_data") as f:
+        #     text = f.read()
+        try:
+            tree = lxml.html.fromstring(text)
+            table_element = tree.xpath(".//table[@class='DataGrid']")[0]
+        except BaseException:
+            raise Exception("table not found: {}".format(text))
+
+        rows = table_element.xpath('.//tr')
+        for row in rows:
+            try:
+                ip = row.xpath('.//td/script')[0].text
+                port = row.xpath('.//td')[1].text
+                if ip is None or port is None:
+                    continue
+
+                ip = parse.unquote(ip)
+                ip = re.search(r">([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}).*?</a>", ip)
+                if ip is None:
+                    continue
+                ip = ip.groups()[0]
+
+                result.append("{}:{}".format(ip, port))
+            except IndexError:
+                pass
+
+        return result
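
The IP cell on this site is emitted by a script rather than as plain text, which is why the code above URL-decodes the script body and then pulls the address out of the generated anchor tag with a regex. A tiny standalone illustration of that step, using a made-up cell value (the real page markup may differ):

    import re
    from urllib import parse

    # made-up, URL-encoded script-cell text; the real page's content differs
    cell = 'IPDecode(%22...%22)%3C/script%3E%3Ca href=%22#%22%3E127.0.0.1%3C/a%3E'

    decoded = parse.unquote(cell)  # -> 'IPDecode("...")</script><a href="#">127.0.0.1</a>'
    match = re.search(r">([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}).*?</a>", decoded)
    print(match.groups()[0] if match else None)  # prints: 127.0.0.1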

collectors/nordvpn_com/nordvpn_com.py

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+from collectors.pages_collector import PagesCollector
+
+import async_requests
+import json
+
+
+URL_PATTERN = "https://nordvpn.com/wp-admin/admin-ajax.php?searchParameters[0][name]=proxy-country" \
+              "&searchParameters[0][value]=&searchParameters[1][name]=proxy-ports&searchParameters[1][value]=" \
+              "&offset={}&limit={}&action=getProxies"
+
+
+class Collector(PagesCollector):
+    # this collector gives a lot of bad proxies
+    __collector__ = False
+    processing_period = 10 * 60
+
+    def __init__(self):
+        self.pages_count = 10
+        self.limit = 100
+
+    async def process_page(self, page_index):
+        offset = page_index * self.limit
+        resp = await async_requests.get(URL_PATTERN.format(offset, self.limit))
+        json_response = json.loads(resp.text)
+
+        result = []
+
+        for item in json_response:
+            result.append("{}:{}".format(item["ip"], item["port"]))
+
+        if result:
+            self.pages_count = page_index + 2
+        else:
+            self.pages_count = page_index
+
+        return result
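
The parsing loop above assumes the admin-ajax endpoint answers with a JSON array of objects carrying at least "ip" and "port" fields; the sample below is hypothetical, but anything shaped like it would be turned into ip:port strings the same way:

    import json

    # hypothetical response body, shaped the way process_page() expects it
    sample = '[{"ip": "1.2.3.4", "port": "8080"}, {"ip": "5.6.7.8", "port": "3128"}]'

    proxies = ["{}:{}".format(item["ip"], item["port"]) for item in json.loads(sample)]
    print(proxies)  # ['1.2.3.4:8080', '5.6.7.8:3128']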

collectors/pages_collector.py

Lines changed: 6 additions & 0 deletions
@@ -5,6 +5,10 @@
 class PagesCollector(AbstractCollector):
     async def collect(self):
         proxies = await self.process_page(self.current_page)
+
+        if self.dynamic_pages_count:
+            self.pages_count = self.current_page + 2 if proxies else self.current_page + 1
+
         self.current_page += 1
         if self.current_page >= self.pages_count:
             self.current_page = 0
@@ -20,5 +24,7 @@ async def process_page(self, page_index):
     # and also you should set pages_count
     pages_count = 0
     current_page = 0
+    # or set dynamic pages count
+    dynamic_pages_count = False
 
     processing_period = 60 * 10
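
The new branch makes the page window self-adjusting: as long as the last processed page yielded proxies, pages_count is extended so the next page stays inside the range, and once a page comes back empty the window stops growing and current_page wraps back to 0. A small standalone mirror of that arithmetic (initial values are arbitrary, since they are overwritten on the first pass):

    # mirror of the logic added to PagesCollector.collect(); not the class itself
    pages_count, current_page = 0, 0

    def after_collect(proxies_found):
        global pages_count, current_page
        pages_count = current_page + 2 if proxies_found else current_page + 1
        current_page += 1
        if current_page >= pages_count:
            current_page = 0

    for found in (True, True, False):
        after_collect(found)
        print(pages_count, current_page)
    # prints: 2 1 / 3 2 / 3 0 -- the window grew to 3 pages, then wrapped around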

proxy_utils.py

Lines changed: 2 additions & 2 deletions
@@ -15,14 +15,14 @@
 
 
 # TODO: add multiple checks with several sites
-async def check_proxy(proxy_url: str, session=None):
+async def check_proxy(proxy_url: str, session=None, timeout=None):
     try:
         res = await _request(
             'get',
             'https://pikagraphs.d3d.info/OK/',
             # 'https://wtfismyip.com/text',
             proxy_url,
-            settings.PROXY_CHECKING_TIMEOUT,
+            settings.PROXY_CHECKING_TIMEOUT if timeout is None else timeout,
             session
         )
 
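
The new optional timeout falls back to settings.PROXY_CHECKING_TIMEOUT when omitted, so existing callers are unaffected; test_collector.py below passes timeout=5 explicitly. A minimal standalone call, assuming (as that script does) that a truthy return value means the proxy answered:

    import asyncio

    import proxy_utils


    async def main():
        # 5-second override of settings.PROXY_CHECKING_TIMEOUT for this one check
        ok = await proxy_utils.check_proxy("http://127.0.0.1:8080", timeout=5)
        print("works" if ok else "dead")


    asyncio.get_event_loop().run_until_complete(main())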

test_collector.py

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+from collectors.freeproxylists_net.freeproxylists_net import Collector
+
+import proxy_utils
+
+import asyncio
+import sys
+
+
+loop = asyncio.get_event_loop()
+
+
+async def check_raw_proxy(raw_proxy: str):
+    protocols = [
+        "http", "socks4", "socks5"
+    ]
+    works = False
+    for protocol in protocols:
+        # print("checking... {}".format(raw_proxy))
+        result = await proxy_utils.check_proxy("{}://{}".format(protocol, raw_proxy), timeout=5)
+        if result:
+            works = True
+            break
+
+    print("1" if works else "0", end="")
+    sys.stdout.flush()
+
+    return works
+
+
+async def main():
+    collector = Collector()
+    while True:
+        proxies = await collector.collect()
+        # print(proxies)
+        tasks = []
+        # print(proxies)
+        for proxy in proxies:
+            tasks.append(check_raw_proxy(proxy))
+
+        if tasks:
+            await asyncio.wait(tasks)
+        else:
+            print("Empty")
+        print()
+
+        await asyncio.sleep(5)
+
+
+if __name__ == '__main__':
+    loop.run_until_complete(main())
