
Commit c23f365

Authored by Mantisus, Copilot, janbuchar, and vdusek
feat: add respect_robots_txt_file option (#1162)
### Description

This PR implements automatic skipping of requests based on the robots.txt file. The behavior is controlled by a new boolean flag in the crawler options called `respect_robots_txt_file`.

### Issues

- Related: #1144

### Testing

- Add tests to check that `respect_robots_txt_file` works with `EnqueueLinksFunction` for crawlers
- Add tests for `RobotsTxtFile`

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Jan Buchar <Teyras@gmail.com>
Co-authored-by: Vlada Dusek <v.dusek96@gmail.com>
1 parent fbaa7c2 commit c23f365
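
The testing notes above mention new unit tests for `RobotsTxtFile`. As a hedged illustration of the kind of check such a test performs — using only the API added in this commit, with made-up robots.txt rules and URLs rather than the project's actual fixtures:

import asyncio

from crawlee._utils.robots import RobotsTxtFile


async def main() -> None:
    # Illustrative robots.txt content; the real tests may use different fixtures.
    content = 'User-agent: *\nDisallow: /login\nCrawl-delay: 10'
    robots = await RobotsTxtFile.from_content('https://example.com/robots.txt', content)

    assert robots.is_allowed('https://example.com/')           # allowed by the rules above
    assert not robots.is_allowed('https://example.com/login')  # disallowed path
    assert robots.get_crawl_delay() == 10                      # parsed crawl delay


asyncio.run(main())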

File tree

16 files changed: +393 −5 lines

Lines changed: 27 additions & 0 deletions (new example file, imported by the documentation page below as ./code_examples/respect_robots_txt_file.py)

@@ -0,0 +1,27 @@

import asyncio

from crawlee.crawlers import (
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)


async def main() -> None:
    # Initialize the crawler with robots.txt compliance enabled
    crawler = BeautifulSoupCrawler(respect_robots_txt_file=True)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    # Start the crawler with the specified URLs
    # The crawler will check the robots.txt file before making requests
    # In this example, 'https://news.ycombinator.com/login' will be skipped
    # because it's disallowed in the site's robots.txt file
    await crawler.run(
        ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login']
    )


if __name__ == '__main__':
    asyncio.run(main())
Lines changed: 21 additions & 0 deletions (new documentation page: respect-robots-txt-file)

@@ -0,0 +1,21 @@

---
id: respect-robots-txt-file
title: Respect robots.txt file
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import RespectRobotsTxt from '!!raw-loader!roa-loader!./code_examples/respect_robots_txt_file.py';

This example demonstrates how to configure your crawler to respect the rules established by websites for crawlers as described in the [robots.txt](https://www.robotstxt.org/robotstxt.html) file.

To configure `Crawlee` to follow the `robots.txt` file, set the parameter `respect_robots_txt_file=True` in <ApiLink to="class/BasicCrawlerOptions">`BasicCrawlerOptions`</ApiLink>. In this case, `Crawlee` will skip any URLs forbidden in the website's robots.txt file.

As an example, let's look at the website `https://news.ycombinator.com/` and its corresponding [robots.txt](https://news.ycombinator.com/robots.txt) file. Since the file has a rule `Disallow: /login`, the URL `https://news.ycombinator.com/login` will be automatically skipped.

The code below demonstrates this behavior using the <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>:

<RunnableCodeBlock className="language-python" language="python">
	{RespectRobotsTxt}
</RunnableCodeBlock>

pyproject.toml

Lines changed: 3 additions & 0 deletions

@@ -40,6 +40,7 @@ dependencies = [
     "eval-type-backport>=0.2.0",
     "httpx[brotli,http2,zstd]>=0.27.0",
     "more-itertools>=10.2.0",
+    "protego>=0.4.0",
     "psutil>=6.0.0",
     "pydantic-settings>=2.2.0,<2.7.0",
     "pydantic>=2.8.0,!=2.10.0,!=2.10.1,!=2.10.2",
@@ -237,7 +238,9 @@ module = [
     "jaro", # Untyped and stubs not available
     "litestar", # Example code shows deploy on Google Cloud Run.
     "loguru", # Example code shows integration of loguru and crawlee for JSON logging.
+    "protego", # Untyped and stubs not available
     "sklearn.linear_model", # Untyped and stubs not available
+    "sortedcollections", # Untyped and stubs not available
     "cookiecutter.*", # Untyped and stubs not available
     "inquirer.*", # Untyped and stubs not available
 ]
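
For context, `protego` — the dependency added above — is the robots.txt parser wrapped by the new `RobotsTxtFile` helper below. A minimal standalone sketch of the parts of its API this commit relies on (`Protego.parse`, `can_fetch`, `crawl_delay`, `sitemaps`); the rules shown are invented for illustration:

from protego import Protego

# Parse an in-memory robots.txt document (illustrative rules).
robots = Protego.parse(
    'User-agent: *\n'
    'Disallow: /private\n'
    'Crawl-delay: 5\n'
    'Sitemap: https://example.com/sitemap.xml\n'
)

print(robots.can_fetch('https://example.com/private', '*'))  # False
print(robots.can_fetch('https://example.com/blog', '*'))     # True
print(robots.crawl_delay('*'))                               # crawl delay for any user agent
print(list(robots.sitemaps))                                 # ['https://example.com/sitemap.xml']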

src/crawlee/_utils/robots.py

Lines changed: 85 additions & 0 deletions (new file)

@@ -0,0 +1,85 @@

from __future__ import annotations

from typing import TYPE_CHECKING

from protego import Protego
from yarl import URL

from crawlee._utils.web import is_status_code_client_error

if TYPE_CHECKING:
    from typing_extensions import Self

    from crawlee.http_clients import HttpClient
    from crawlee.proxy_configuration import ProxyInfo


class RobotsTxtFile:
    def __init__(self, url: str, robots: Protego) -> None:
        self._robots = robots
        self._original_url = URL(url).origin()

    @classmethod
    async def from_content(cls, url: str, content: str) -> Self:
        """Create a `RobotsTxtFile` instance from the given content.

        Args:
            url: The URL associated with the robots.txt file.
            content: The raw string content of the robots.txt file to be parsed.
        """
        robots = Protego.parse(content)
        return cls(url, robots)

    @classmethod
    async def find(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self:
        """Determine the location of a robots.txt file for a URL and fetch it.

        Args:
            url: The URL whose domain will be used to find the corresponding robots.txt file.
            http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt
                file.
            proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
        """
        robots_url = URL(url).with_path('/robots.txt')
        return await cls.load(str(robots_url), http_client, proxy_info)

    @classmethod
    async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self:
        """Load the robots.txt file for a given URL.

        Args:
            url: The direct URL of the robots.txt file to be loaded.
            http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt
                file.
            proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
        """
        response = await http_client.send_request(url, proxy_info=proxy_info)
        body = b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else response.read()

        robots = Protego.parse(body.decode('utf-8'))

        return cls(url, robots)

    def is_allowed(self, url: str, user_agent: str = '*') -> bool:
        """Check if the given URL is allowed for the given user agent.

        Args:
            url: The URL to check against the robots.txt rules.
            user_agent: The user-agent string to check permissions for. Defaults to '*' which matches any user-agent.
        """
        check_url = URL(url)
        if check_url.origin() != self._original_url:
            return True
        return bool(self._robots.can_fetch(str(check_url), user_agent))

    def get_sitemaps(self) -> list[str]:
        """Get the list of sitemap URLs from the robots.txt file."""
        return list(self._robots.sitemaps)

    def get_crawl_delay(self, user_agent: str = '*') -> int | None:
        """Get the crawl delay for the given user agent.

        Args:
            user_agent: The user-agent string to check the crawl delay for. Defaults to '*' which matches any
                user-agent.
        """
        crawl_delay = self._robots.crawl_delay(user_agent)
        return int(crawl_delay) if crawl_delay is not None else None
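
A hedged usage sketch for the helper above, fetching and querying a live robots.txt file through `RobotsTxtFile.find`. It assumes Crawlee's stock `HttpxHttpClient` (not part of this diff) as the HTTP client; inside the crawler, the same call is made with the crawler's own configured client:

import asyncio

from crawlee._utils.robots import RobotsTxtFile
from crawlee.http_clients import HttpxHttpClient  # assumption: default HTTP client shipped with Crawlee


async def main() -> None:
    http_client = HttpxHttpClient()

    # `find` resolves <origin>/robots.txt for the given URL and downloads it.
    robots = await RobotsTxtFile.find('https://news.ycombinator.com/news', http_client)

    print(robots.is_allowed('https://news.ycombinator.com/login'))  # expected: False (Disallow: /login)
    print(robots.get_sitemaps())
    print(robots.get_crawl_delay())


asyncio.run(main())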

src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 7 additions & 0 deletions

@@ -159,12 +159,19 @@ async def extract_links(
         requests = list[Request]()
         base_user_data = user_data or {}

+        robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)
+
         for link in self._parser.find_links(parsed_content, selector=selector):
             url = link
             if not is_url_absolute(url):
                 base_url = context.request.loaded_url or context.request.url
                 url = convert_to_absolute_url(base_url, url)

+            if robots_txt_file and not robots_txt_file.is_allowed(url):
+                # TODO: https://github.com/apify/crawlee-python/issues/1160
+                # add processing with on_skipped_request hook
+                continue
+
             request_options = RequestOptions(url=url, user_data={**base_user_data}, label=label)

             if transform_request_function:

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 81 additions & 1 deletion

@@ -17,8 +17,10 @@
 from urllib.parse import ParseResult, urlparse
 from weakref import WeakKeyDictionary

+from cachetools import LRUCache
 from tldextract import TLDExtract
 from typing_extensions import NotRequired, TypedDict, TypeVar, Unpack, assert_never
+from yarl import URL

 from crawlee import EnqueueStrategy, Glob, service_locator
 from crawlee._autoscaling import AutoscaledPool, Snapshotter, SystemStatus
@@ -32,6 +34,7 @@
     SendRequestFunction,
 )
 from crawlee._utils.docs import docs_group
+from crawlee._utils.robots import RobotsTxtFile
 from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
 from crawlee._utils.wait import wait_for
 from crawlee._utils.web import is_status_code_client_error, is_status_code_server_error
@@ -158,6 +161,10 @@ class _BasicCrawlerOptions(TypedDict):
     """A logger instance, typically provided by a subclass, for consistent logging labels. Intended for use by
     subclasses rather than direct instantiation of `BasicCrawler`."""

+    respect_robots_txt_file: NotRequired[bool]
+    """If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain
+    and skip requests that are not allowed. This also prevents disallowed URLs from being added via
+    `EnqueueLinksFunction`."""
+

 class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict):
     """Generic options the `BasicCrawler` constructor."""
@@ -238,6 +245,7 @@ def __init__(
         keep_alive: bool = False,
         configure_logging: bool = True,
         statistics_log_format: Literal['table', 'inline'] = 'table',
+        respect_robots_txt_file: bool = False,
         _context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
         _additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None,
         _logger: logging.Logger | None = None,
@@ -280,6 +288,9 @@ def __init__(
             configure_logging: If True, the crawler will set up logging infrastructure automatically.
             statistics_log_format: If 'table', displays crawler statistics as formatted tables in logs. If 'inline',
                 outputs statistics as plain text log messages.
+            respect_robots_txt_file: If set to `True`, the crawler will automatically try to fetch the robots.txt file
+                for each domain and skip requests that are not allowed. This also prevents disallowed URLs from being
+                added via `EnqueueLinksFunction`.
             _context_pipeline: Enables extending the request lifecycle and modifying the crawling context.
                 Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
             _additional_context_managers: Additional context managers used throughout the crawler lifecycle.
@@ -335,6 +346,7 @@ def __init__(
         self._max_requests_per_crawl = max_requests_per_crawl
         self._max_session_rotations = max_session_rotations
         self._max_crawl_depth = max_crawl_depth
+        self._respect_robots_txt_file = respect_robots_txt_file

         # Timeouts
         self._request_handler_timeout = request_handler_timeout
@@ -371,6 +383,8 @@ def __init__(
         self._additional_context_managers = _additional_context_managers or []

         # Internal, not explicitly configurable components
+        self._robots_txt_file_cache: LRUCache[str, RobotsTxtFile] = LRUCache(maxsize=1000)
+        self._robots_txt_lock = asyncio.Lock()
         self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name)
         self._snapshotter = Snapshotter.from_config(config)
         self._autoscaled_pool = AutoscaledPool(
@@ -645,10 +659,25 @@ async def add_requests(
             wait_for_all_requests_to_be_added: If True, wait for all requests to be added before returning.
             wait_for_all_requests_to_be_added_timeout: Timeout for waiting for all requests to be added.
         """
+        allowed_requests = []
+        skipped = []
+
+        for request in requests:
+            check_url = request.url if isinstance(request, Request) else request
+            if await self._is_allowed_based_on_robots_txt_file(check_url):
+                allowed_requests.append(request)
+            else:
+                skipped.append(request)
+
+        if skipped:
+            # TODO: https://github.com/apify/crawlee-python/issues/1160
+            # add processing with on_skipped_request hook
+            self._logger.warning('Some requests were skipped because they were disallowed based on the robots.txt file')
+
         request_manager = await self.get_request_manager()

         await request_manager.add_requests_batched(
-            requests=requests,
+            requests=allowed_requests,
             batch_size=batch_size,
             wait_time_between_batches=wait_time_between_batches,
             wait_for_all_requests_to_be_added=wait_for_all_requests_to_be_added,
@@ -1080,6 +1109,22 @@ async def __run_task_function(self) -> None:
         if request is None:
             return

+        if not (await self._is_allowed_based_on_robots_txt_file(request.url)):
+            self._logger.warning(
+                f'Skipping request {request.url} ({request.id}) because it is disallowed based on robots.txt'
+            )
+            await wait_for(
+                lambda: request_manager.mark_request_as_handled(request),
+                timeout=self._internal_timeout,
+                timeout_message='Marking request as handled timed out after '
+                f'{self._internal_timeout.total_seconds()} seconds',
+                logger=self._logger,
+                max_retries=3,
+            )
+            # TODO: https://github.com/apify/crawlee-python/issues/1160
+            # add processing with on_skipped_request hook
+            return
+
         if request.session_id:
             session = await self._get_session_by_id(request.session_id)
         else:
@@ -1263,3 +1308,38 @@ def _check_request_collision(self, request: Request, session: Session | None) ->
         raise RequestCollisionError(
             f'The Session (id: {request.session_id}) bound to the Request is no longer available in SessionPool'
         )
+
+    async def _is_allowed_based_on_robots_txt_file(self, url: str) -> bool:
+        """Check if the URL is allowed based on the robots.txt file.
+
+        Args:
+            url: The URL to verify against robots.txt rules. Returns True if crawling this URL is permitted.
+        """
+        if not self._respect_robots_txt_file:
+            return True
+        robots_txt_file = await self._get_robots_txt_file_for_url(url)
+        return not robots_txt_file or robots_txt_file.is_allowed(url)
+
+    async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None:
+        """Get the RobotsTxtFile for a given URL.
+
+        Args:
+            url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file.
+        """
+        if not self._respect_robots_txt_file:
+            return None
+        origin_url = str(URL(url).origin())
+        robots_txt_file = self._robots_txt_file_cache.get(origin_url)
+        if robots_txt_file:
+            return robots_txt_file
+
+        async with self._robots_txt_lock:
+            # Check again if the robots.txt file is already cached after acquiring the lock
+            robots_txt_file = self._robots_txt_file_cache.get(origin_url)
+            if robots_txt_file:
+                return robots_txt_file
+
+            # If not cached, fetch the robots.txt file
+            robots_txt_file = await RobotsTxtFile.find(url, self._http_client)
+            self._robots_txt_file_cache[origin_url] = robots_txt_file
+            return robots_txt_file

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 7 additions & 0 deletions

@@ -290,6 +290,8 @@ async def extract_links(

         elements = await context.page.query_selector_all(selector)

+        robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)
+
         for element in elements:
             url = await element.get_attribute('href')

@@ -300,6 +302,11 @@ async def extract_links(
                 base_url = context.request.loaded_url or context.request.url
                 url = convert_to_absolute_url(base_url, url)

+            if robots_txt_file and not robots_txt_file.is_allowed(url):
+                # TODO: https://github.com/apify/crawlee-python/issues/1160
+                # add processing with on_skipped_request hook
+                continue
+
             request_option = RequestOptions({'url': url, 'user_data': {**base_user_data}, 'label': label})

             if transform_request_function:

src/crawlee/storage_clients/_memory/_request_queue_client.py

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@
 from logging import getLogger
 from typing import TYPE_CHECKING

-from sortedcollections import ValueSortedDict # type: ignore[import-untyped]
+from sortedcollections import ValueSortedDict
 from typing_extensions import override

 from crawlee._types import StorageTypes
