Skip to content

Commit 3d3cd6f

Browse files
committed
some refactoring
1 parent 59209f3 commit 3d3cd6f

File tree

4 files changed

+74
-16
lines changed

4 files changed

+74
-16
lines changed

collectors/collector.py

Lines changed: 48 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,13 @@
55
import models
66

77

8+
# TODO: refactor saving state
89
class AbstractCollector:
910
"""Base class for all types of collectors"""
1011

1112
async def collect(self):
1213
"""
13-
this method should return proxies in any of the following formats:
14+
This method should return proxies in any of the following formats:
1415
1516
::
1617
@@ -19,22 +20,54 @@ async def collect(self):
1920
protocol://ip:port
2021
protocol://domain:port
2122
23+
24+
ip can be both ipv4 and ipv6
25+
26+
Yielding proxies will be supported in the future; for now, just return a list.
2227
"""
2328

2429
return []
2530

2631
async def _collect(self):
27-
"""do not use!"""
28-
return (await self.collect())[:settings.COLLECTOR_MAXIMUM_NUMBER_OF_PROXIES_PER_REQUEST]
32+
"""Do not use! It is called on collector's processing automatically"""
33+
34+
# i = 0
35+
# async for proxy in self.collect():
36+
# if i > settings.COLLECTOR_MAXIMUM_NUMBER_OF_PROXIES_PER_REQUEST:
37+
# break
38+
39+
# yield proxy
40+
# i += 1
41+
proxies = list(await self.collect())
42+
proxies = proxies[:settings.COLLECTOR_MAXIMUM_NUMBER_OF_PROXIES_PER_REQUEST]
43+
self.last_processing_proxies_count = len(proxies)
44+
return proxies
2945

3046
async def load_state(self, state: models.CollectorState):
47+
"""
48+
Function for loading collector's state from database model.
49+
It's called automatically, so you don't need to call it yourself. You may
50+
override it, but remember to call the parent's method, like this:
51+
52+
::
53+
54+
async def load_state(self, state):
55+
await super(MyCollector, self).load_state(state)
56+
# do something here
57+
"""
3158
self.last_processing_time = state.last_processing_time
3259
self.processing_period = state.processing_period
60+
self.last_processing_proxies_count = state.last_processing_proxies_count
3361
self.data = json.loads(state.data) if state.data is not None and state.data else {}
3462

35-
async def set_state(self, state: models.CollectorState):
63+
async def save_state(self, state: models.CollectorState):
64+
"""
65+
Function for saving collector's state to database model.
66+
It's called automatically, don't worry.
67+
"""
3668
state.last_processing_time = self.last_processing_time
3769
state.processing_period = self.processing_period
70+
state.last_processing_proxies_count = self.last_processing_proxies_count
3871
state.data = json.dumps(self.data)
3972

4073
last_processing_time = 0
@@ -53,16 +86,25 @@ async def set_state(self, state: models.CollectorState):
5386
override_maximum_processing_period = None
5487
"""
5588
ignore settings' maximum processing period and set
56-
it to value of this variable
89+
it to the value of this variable
5790
"""
5891

5992
override_minimum_processing_period = None
93+
"""
94+
ignore settings' minimum processing period and set
95+
it to the value of this variable, for example
96+
when a collector is limited in how often it may send requests
97+
"""
6098

6199
data = {}
100+
# TODO: consider namespacing
62101
"""
63102
here you can store some information,
64103
it will be written into and read from database
65104
by magic, don't worry about it :)
66-
Just don't use names starting with underscore
105+
If you're curious, see process_collector_of_state() function
106+
from processor.py file
107+
108+
Don't use names starting with an underscore
67109
like this one: _last_page
68110
"""

collectors/pages_collector.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,31 @@
55
# TODO: save pages to collector state
66
class PagesCollector(AbstractCollector):
77
"""
8-
Collector for paginated APIs.
9-
Here you should override ``process_page(page_index)`` method
8+
Collector for paginated APIs. Page numbering starts at 0.
9+
Here you should override ``process_page(page_index)`` method.
10+
This collector keeps track of the current page: it increments it on each processing
11+
and resets it to 0 if there are no proxies on the page or if the proxies
12+
are the same as those on the previous one. If you don't want this dynamic
13+
behavior, just set dynamic_pages_count to False and set pages_count manually.
1014
"""
1115

12-
async def collect(self):
16+
async def load_state(self, state):
17+
super(PagesCollector, self).load_state(state)
1318
if "_current_page" in self.data:
1419
self.current_page = self.data["_current_page"]
1520

16-
proxies = (await self.process_page(self.current_page)
17-
)[:settings.COLLECTOR_MAXIMUM_NUMBER_OF_PROXIES_PER_REQUEST]
21+
if "_pages_count" in self.data:
22+
self.pages_count = self.data["_pages_count"]
23+
24+
async def save_state(self, state):
25+
super(PagesCollector, self).save_state(state)
26+
self.data["_current_page"] = self.current_page
27+
self.data["_pages_count"] = self.pages_count
28+
29+
async def collect(self):
30+
proxies = list(
31+
await self.process_page(self.current_page)
32+
)[:settings.COLLECTOR_MAXIMUM_NUMBER_OF_PROXIES_PER_REQUEST]
1833

1934
if self.dynamic_pages_count:
2035
if proxies:
@@ -37,7 +52,6 @@ async def collect(self):
3752
if self.current_page >= self.pages_count:
3853
self.current_page = 0
3954

40-
self.data["_current_page"] = self.current_page
4155

4256
return proxies
4357

collectors_list.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,11 @@
88

99
collectors = {}
1010

11-
_collector_dirs = settings.COLLECTORS_DIRS if type(settings.COLLECTORS_DIRS) is list else [settings.COLLECTORS_DIRS]
12-
for collectors_dir in _collector_dirs:
11+
_collectors_dirs = settings.COLLECTORS_DIRS
12+
if type(_collectors_dirs) is not list:
13+
_collectors_dirs = [_collectors_dirs]
14+
15+
for collectors_dir in _collectors_dirs:
1316
for root, dirs, files in os.walk(collectors_dir):
1417
for file in files:
1518
if file.endswith(".py"):

processor.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,7 @@ async def process_collector_of_state(self, collector_state):
257257
self.logger.debug(
258258
"start processing collector of type \"{}\"".format(type(collector))
259259
)
260-
proxies = set(await collector._collect())
260+
proxies = await collector._collect()
261261

262262
if not proxies:
263263
self.collectors_logger.warning(
@@ -278,7 +278,6 @@ async def process_collector_of_state(self, collector_state):
278278
finally:
279279
collector.last_processing_time = int(time.time())
280280
await collector.set_state(collector_state)
281-
collector_state.last_processing_proxies_count = len(proxies)
282281
# TODO: save new proxies count
283282
await db.update(collector_state)
284283

0 commit comments

Comments
 (0)