|
17 | 17 | from urllib.parse import ParseResult, urlparse
|
18 | 18 | from weakref import WeakKeyDictionary
|
19 | 19 |
|
| 20 | +from cachetools import LRUCache |
20 | 21 | from tldextract import TLDExtract
|
21 | 22 | from typing_extensions import NotRequired, TypedDict, TypeVar, Unpack, assert_never
|
| 23 | +from yarl import URL |
22 | 24 |
|
23 | 25 | from crawlee import EnqueueStrategy, Glob, service_locator
|
24 | 26 | from crawlee._autoscaling import AutoscaledPool, Snapshotter, SystemStatus
|
|
32 | 34 | SendRequestFunction,
|
33 | 35 | )
|
34 | 36 | from crawlee._utils.docs import docs_group
|
| 37 | +from crawlee._utils.robots import RobotsTxtFile |
35 | 38 | from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
|
36 | 39 | from crawlee._utils.wait import wait_for
|
37 | 40 | from crawlee._utils.web import is_status_code_client_error, is_status_code_server_error
|
@@ -158,6 +161,10 @@ class _BasicCrawlerOptions(TypedDict):
|
158 | 161 | """A logger instance, typically provided by a subclass, for consistent logging labels. Intended for use by
|
159 | 162 | subclasses rather than direct instantiation of `BasicCrawler`."""
|
160 | 163 |
|
| 164 | + respect_robots_txt_file: NotRequired[bool] |
| 165 | + """If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain, |
| 166 | + and skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`.""" |
| 167 | + |
161 | 168 |
|
162 | 169 | class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict):
|
163 | 170 | """Generic options the `BasicCrawler` constructor."""
|
@@ -238,6 +245,7 @@ def __init__(
|
238 | 245 | keep_alive: bool = False,
|
239 | 246 | configure_logging: bool = True,
|
240 | 247 | statistics_log_format: Literal['table', 'inline'] = 'table',
|
| 248 | + respect_robots_txt_file: bool = False, |
241 | 249 | _context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
|
242 | 250 | _additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None,
|
243 | 251 | _logger: logging.Logger | None = None,
|
@@ -280,6 +288,9 @@ def __init__(
|
280 | 288 | configure_logging: If True, the crawler will set up logging infrastructure automatically.
|
281 | 289 | statistics_log_format: If 'table', displays crawler statistics as formatted tables in logs. If 'inline',
|
282 | 290 | outputs statistics as plain text log messages.
|
| 291 | + respect_robots_txt_file: If set to `True`, the crawler will automatically try to fetch the robots.txt file |
| 292 | + for each domain and skip any URLs it disallows. This also prevents disallowed URLs from being |
| 293 | + added via `EnqueueLinksFunction`. |
283 | 294 | _context_pipeline: Enables extending the request lifecycle and modifying the crawling context.
|
284 | 295 | Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
|
285 | 296 | _additional_context_managers: Additional context managers used throughout the crawler lifecycle.
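
For illustration, a minimal usage sketch of the new option. The `HttpCrawler` subclass, its import path, and the target URL are assumptions for this example, not part of the diff; any `BasicCrawler` subclass that forwards the `_BasicCrawlerOptions` keyword arguments should accept `respect_robots_txt_file` the same way.

```python
import asyncio

# Assumed import path; it may differ between crawlee versions.
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    # With the option enabled, robots.txt is fetched once per domain and any
    # disallowed URL is skipped, whether enqueued via links or added directly.
    crawler = HttpCrawler(respect_robots_txt_file=True)

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```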
|
@@ -335,6 +346,7 @@ def __init__(
|
335 | 346 | self._max_requests_per_crawl = max_requests_per_crawl
|
336 | 347 | self._max_session_rotations = max_session_rotations
|
337 | 348 | self._max_crawl_depth = max_crawl_depth
|
| 349 | + self._respect_robots_txt_file = respect_robots_txt_file |
338 | 350 |
|
339 | 351 | # Timeouts
|
340 | 352 | self._request_handler_timeout = request_handler_timeout
|
@@ -371,6 +383,8 @@ def __init__(
|
371 | 383 | self._additional_context_managers = _additional_context_managers or []
|
372 | 384 |
|
373 | 385 | # Internal, not explicitly configurable components
|
| 386 | + self._robots_txt_file_cache: LRUCache[str, RobotsTxtFile] = LRUCache(maxsize=1000) |
| 387 | + self._robots_txt_lock = asyncio.Lock() |
374 | 388 | self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name)
|
375 | 389 | self._snapshotter = Snapshotter.from_config(config)
|
376 | 390 | self._autoscaled_pool = AutoscaledPool(
|
@@ -645,10 +659,25 @@ async def add_requests(
|
645 | 659 | wait_for_all_requests_to_be_added: If True, wait for all requests to be added before returning.
|
646 | 660 | wait_for_all_requests_to_be_added_timeout: Timeout for waiting for all requests to be added.
|
647 | 661 | """
|
| 662 | + allowed_requests = [] |
| 663 | + skipped = [] |
| 664 | + |
| 665 | + for request in requests: |
| 666 | + check_url = request.url if isinstance(request, Request) else request |
| 667 | + if await self._is_allowed_based_on_robots_txt_file(check_url): |
| 668 | + allowed_requests.append(request) |
| 669 | + else: |
| 670 | + skipped.append(request) |
| 671 | + |
| 672 | + if skipped: |
| 673 | + # TODO: https://github.com/apify/crawlee-python/issues/1160 |
| 674 | + # add processing with on_skipped_request hook |
| 675 | + self._logger.warning('Some requests were skipped because they were disallowed based on the robots.txt file') |
| 676 | + |
648 | 677 | request_manager = await self.get_request_manager()
|
649 | 678 |
|
650 | 679 | await request_manager.add_requests_batched(
|
651 | | - requests=requests, |
| 680 | + requests=allowed_requests, |
652 | 681 | batch_size=batch_size,
|
653 | 682 | wait_time_between_batches=wait_time_between_batches,
|
654 | 683 | wait_for_all_requests_to_be_added=wait_for_all_requests_to_be_added,
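
A short usage sketch of the filtering behaviour above, reusing the hypothetical `crawler` and URLs from the earlier example. Both plain URL strings and `Request` objects are checked; disallowed entries never reach the request manager, and a single warning is logged for the skipped batch.

```python
# Fragment, to be awaited inside the same coroutine that created `crawler`.
await crawler.add_requests([
    'https://crawlee.dev/docs',          # kept if robots.txt allows it
    'https://crawlee.dev/private/page',  # dropped if robots.txt disallows it
])
```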
|
@@ -1080,6 +1109,22 @@ async def __run_task_function(self) -> None:
|
1080 | 1109 | if request is None:
|
1081 | 1110 | return
|
1082 | 1111 |
|
| 1112 | + if not (await self._is_allowed_based_on_robots_txt_file(request.url)): |
| 1113 | + self._logger.warning( |
| 1114 | + f'Skipping request {request.url} ({request.id}) because it is disallowed based on robots.txt' |
| 1115 | + ) |
| 1116 | + await wait_for( |
| 1117 | + lambda: request_manager.mark_request_as_handled(request), |
| 1118 | + timeout=self._internal_timeout, |
| 1119 | + timeout_message='Marking request as handled timed out after ' |
| 1120 | + f'{self._internal_timeout.total_seconds()} seconds', |
| 1121 | + logger=self._logger, |
| 1122 | + max_retries=3, |
| 1123 | + ) |
| 1124 | + # TODO: https://github.com/apify/crawlee-python/issues/1160 |
| 1125 | + # add processing with on_skipped_request hook |
| 1126 | + return |
| 1127 | + |
1083 | 1128 | if request.session_id:
|
1084 | 1129 | session = await self._get_session_by_id(request.session_id)
|
1085 | 1130 | else:
|
@@ -1263,3 +1308,38 @@ def _check_request_collision(self, request: Request, session: Session | None) ->
|
1263 | 1308 | raise RequestCollisionError(
|
1264 | 1309 | f'The Session (id: {request.session_id}) bound to the Request is no longer available in SessionPool'
|
1265 | 1310 | )
|
| 1311 | + |
| 1312 | + async def _is_allowed_based_on_robots_txt_file(self, url: str) -> bool: |
| 1313 | + """Check if the URL is allowed based on the robots.txt file. |
| 1314 | +
|
| 1315 | + Args: |
| 1316 | + url: The URL to verify against robots.txt rules. Returns True if crawling this URL is permitted. |
| 1317 | + """ |
| 1318 | + if not self._respect_robots_txt_file: |
| 1319 | + return True |
| 1320 | + robots_txt_file = await self._get_robots_txt_file_for_url(url) |
| 1321 | + return not robots_txt_file or robots_txt_file.is_allowed(url) |
| 1322 | + |
| 1323 | + async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None: |
| 1324 | + """Get the RobotsTxtFile for a given URL. |
| 1325 | + |
| 1326 | + Args: |
| 1327 | + url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file. |
| 1328 | + """ |
| 1329 | + if not self._respect_robots_txt_file: |
| 1330 | + return None |
| 1331 | + origin_url = str(URL(url).origin()) |
| 1332 | + robots_txt_file = self._robots_txt_file_cache.get(origin_url) |
| 1333 | + if robots_txt_file: |
| 1334 | + return robots_txt_file |
| 1335 | + |
| 1336 | + async with self._robots_txt_lock: |
| 1337 | + # Check again if the robots.txt file is already cached after acquiring the lock |
| 1338 | + robots_txt_file = self._robots_txt_file_cache.get(origin_url) |
| 1339 | + if robots_txt_file: |
| 1340 | + return robots_txt_file |
| 1341 | + |
| 1342 | + # If not cached, fetch the robots.txt file |
| 1343 | + robots_txt_file = await RobotsTxtFile.find(url, self._http_client) |
| 1344 | + self._robots_txt_file_cache[origin_url] = robots_txt_file |
| 1345 | + return robots_txt_file |
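
The lookup above caches one `RobotsTxtFile` per origin in an `LRUCache` and guards first-time fetches with a single `asyncio.Lock`, re-checking the cache after the lock is acquired so that concurrent tasks hitting the same origin trigger only one fetch. A self-contained sketch of that double-checked caching pattern, with a hypothetical `fetch_robots_txt` standing in for `RobotsTxtFile.find`:

```python
from __future__ import annotations

import asyncio

from cachetools import LRUCache

_cache: LRUCache[str, str] = LRUCache(maxsize=1000)
_lock = asyncio.Lock()


async def fetch_robots_txt(origin_url: str) -> str:
    """Hypothetical stand-in for the real network fetch."""
    await asyncio.sleep(0.1)  # simulates the HTTP round trip
    return f'robots.txt contents for {origin_url}'


async def get_robots_txt(origin_url: str) -> str:
    # Fast path: no locking when the origin is already cached.
    cached = _cache.get(origin_url)
    if cached is not None:
        return cached

    async with _lock:
        # Re-check after acquiring the lock: another task may have populated
        # the cache while we were waiting, so each origin is fetched only once.
        cached = _cache.get(origin_url)
        if cached is not None:
            return cached

        result = await fetch_robots_txt(origin_url)
        _cache[origin_url] = result
        return result
```

A single shared lock keeps the pattern simple at the cost of serializing first-time fetches for different origins; a per-origin lock would allow more parallelism but needs extra bookkeeping.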