
Commit bd16f14

Mantisus, vdusek, renovate[bot], and Apify Release Bot authored
feat: add on_skipped_request decorator, to process links skipped according to robots.txt rules (#1166)
### Description

- This PR supplements #1162 by adding an `on_skipped_request` decorator to handle links skipped according to `robots.txt` rules (a condensed usage sketch follows below).

### Issues

- Closes: #1160
- Related: #1162

---------

Co-authored-by: Vlada Dusek <v.dusek96@gmail.com>
Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>
Co-authored-by: Apify Release Bot <noreply@apify.com>
1 parent e085ace commit bd16f14
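For a quick orientation before the diff, this is what the new hook looks like in use: a condensed version of the example file added in this commit (the `BeautifulSoupCrawler` and the Hacker News URLs are simply the example's choices).

import asyncio

from crawlee import SkippedReason
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    # Robots.txt compliance must be enabled for 'robots_txt' skips to occur
    crawler = BeautifulSoupCrawler(respect_robots_txt_file=True)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    # Called once for every skipped request, with the URL and the reason
    @crawler.on_skipped_request
    async def skipped_request_handler(url: str, reason: SkippedReason) -> None:
        if reason == 'robots_txt':
            crawler.log.info(f'Skipped {url} due to robots.txt rules.')

    # The login URL is disallowed by robots.txt and ends up in the skipped handler
    await crawler.run(['https://news.ycombinator.com/', 'https://news.ycombinator.com/login'])


if __name__ == '__main__':
    asyncio.run(main())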

File tree

10 files changed: +183, -21 lines changed

docs/examples/code_examples/respect_robots_on_skipped_request.py

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
import asyncio

from crawlee import SkippedReason
from crawlee.crawlers import (
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)


async def main() -> None:
    # Initialize the crawler with robots.txt compliance enabled
    crawler = BeautifulSoupCrawler(respect_robots_txt_file=True)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    # highlight-start
    # This handler is called when a request is skipped
    @crawler.on_skipped_request
    async def skipped_request_handler(url: str, reason: SkippedReason) -> None:
        # Check if the request was skipped due to robots.txt rules
        if reason == 'robots_txt':
            crawler.log.info(f'Skipped {url} due to robots.txt rules.')

    # highlight-end

    # Start the crawler with the specified URLs
    # The login URL will be skipped and handled by the skipped_request_handler
    await crawler.run(
        ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login']
    )


if __name__ == '__main__':
    asyncio.run(main())

docs/examples/respect_robots_txt_file.mdx

Lines changed: 11 additions & 0 deletions
@@ -7,6 +7,7 @@ import ApiLink from '@site/src/components/ApiLink';
 import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
 
 import RespectRobotsTxt from '!!raw-loader!roa-loader!./code_examples/respect_robots_txt_file.py';
+import OnSkippedRequest from '!!raw-loader!roa-loader!./code_examples/respect_robots_on_skipped_request.py';
 
 This example demonstrates how to configure your crawler to respect the rules established by websites for crawlers as described in the [robots.txt](https://www.robotstxt.org/robotstxt.html) file.
 
@@ -19,3 +20,13 @@ The code below demonstrates this behavior using the <ApiLink to="class/Beautiful
 <RunnableCodeBlock className="language-python" language="python">
     {RespectRobotsTxt}
 </RunnableCodeBlock>
+
+## Handle with `on_skipped_request`
+
+If you want to process URLs skipped according to the `robots.txt` rules, for example for further analysis, you should use the `on_skipped_request` handler from <ApiLink to="class/BasicCrawler#on_skipped_request">`BasicCrawler`</ApiLink>.
+
+Let's update the code by adding the `on_skipped_request` handler:
+
+<RunnableCodeBlock className="language-python" language="python">
+    {OnSkippedRequest}
+</RunnableCodeBlock>
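The "further analysis" mentioned in the docs above can be as simple as persisting every skipped link. Below is a minimal sketch that stores skipped URLs in a crawlee `Dataset`; the named dataset, the field names, and the start URL are illustrative choices of this write-up, not something prescribed by the commit.

import asyncio

from crawlee import SkippedReason
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.storages import Dataset


async def main() -> None:
    crawler = BeautifulSoupCrawler(respect_robots_txt_file=True)
    dataset = await Dataset.open(name='skipped-urls')

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        await context.enqueue_links()

    @crawler.on_skipped_request
    async def skipped_request_handler(url: str, reason: SkippedReason) -> None:
        # Persist each skipped link so it can be inspected after the crawl
        await dataset.push_data({'url': url, 'reason': reason})

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())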

src/crawlee/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -2,7 +2,7 @@
 
 from ._request import Request, RequestOptions
 from ._service_locator import service_locator
-from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction
+from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction, SkippedReason
 from ._utils.globs import Glob
 
 __version__ = metadata.version('crawlee')
@@ -15,5 +15,6 @@
     'Request',
     'RequestOptions',
     'RequestTransformAction',
+    'SkippedReason',
     'service_locator',
 ]

src/crawlee/_types.py

Lines changed: 2 additions & 0 deletions
@@ -50,6 +50,8 @@
 EnqueueStrategy: TypeAlias = Literal['all', 'same-domain', 'same-hostname', 'same-origin']
 """Enqueue strategy to be used for determining which links to extract and enqueue."""
 
+SkippedReason: TypeAlias = Literal['robots_txt']
+
 
 def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]:
     """Convert all header keys to lowercase, strips whitespace, and returns them sorted by key."""

src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 9 additions & 2 deletions
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import asyncio
 import logging
 from abc import ABC
 from typing import TYPE_CHECKING, Any, Callable, Generic, Union
@@ -157,6 +158,7 @@ async def extract_links(
             kwargs.setdefault('strategy', 'same-hostname')
 
             requests = list[Request]()
+            skipped = list[str]()
             base_user_data = user_data or {}
 
             robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)
@@ -168,8 +170,7 @@ async def extract_links(
                 url = convert_to_absolute_url(base_url, url)
 
                 if robots_txt_file and not robots_txt_file.is_allowed(url):
-                    # TODO: https://github.com/apify/crawlee-python/issues/1160
-                    # add processing with on_skipped_request hook
+                    skipped.append(url)
                     continue
 
                 request_options = RequestOptions(url=url, user_data={**base_user_data}, label=label)
@@ -192,6 +193,12 @@ async def extract_links(
                     continue
 
                 requests.append(request)
+
+            if skipped:
+                skipped_tasks = [
+                    asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped
+                ]
+                await asyncio.gather(*skipped_tasks)
             return requests
 
         return extract_links
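The dispatch added above, one `asyncio.create_task` per skipped URL followed by `asyncio.gather`, invokes the registered callback for all skipped links concurrently instead of awaiting them one by one. Here is a standalone sketch of that fan-out pattern, with a hypothetical `notify` coroutine standing in for `_handle_skipped_request`.

import asyncio


async def notify(url: str, reason: str) -> None:
    # Stand-in for BasicCrawler._handle_skipped_request
    print(f'skipped {url}: {reason}')


async def dispatch_skipped(skipped: list[str]) -> None:
    # Fan out: schedule one task per skipped URL, then wait for all of them to finish
    tasks = [asyncio.create_task(notify(url, 'robots_txt')) for url in skipped]
    await asyncio.gather(*tasks)


if __name__ == '__main__':
    asyncio.run(dispatch_skipped(['https://example.com/private', 'https://example.com/admin']))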

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 42 additions & 13 deletions
@@ -33,6 +33,7 @@
     HttpHeaders,
     RequestHandlerRunResult,
     SendRequestFunction,
+    SkippedReason,
 )
 from crawlee._utils.docs import docs_group
 from crawlee._utils.robots import RobotsTxtFile
@@ -81,6 +82,7 @@
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Union[Request, None]]]
 FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
+SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
 
 
 class _BasicCrawlerOptions(TypedDict):
@@ -335,9 +337,10 @@ def __init__(
         self._router = None
         self.router.default_handler(request_handler)
 
-        # Error & failed request handlers
+        # Error, failed & skipped request handlers
         self._error_handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext] | None = None
         self._failed_request_handler: FailedRequestHandler[TCrawlingContext | BasicCrawlingContext] | None = None
+        self._on_skipped_request: SkippedRequestCallback | None = None
         self._abort_on_error = abort_on_error
 
         # Context of each request with matching result of request handler.
@@ -540,6 +543,14 @@ def failed_request_handler(
         self._failed_request_handler = handler
         return handler
 
+    def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
+        """Register a function to handle skipped requests.
+
+        The skipped request handler is invoked when a request is skipped due to a collision or other reasons.
+        """
+        self._on_skipped_request = callback
+        return callback
+
     async def run(
         self,
         requests: Sequence[str | Request] | None = None,
@@ -676,8 +687,10 @@ async def add_requests(
                 skipped.append(request)
 
         if skipped:
-            # TODO: https://github.com/apify/crawlee-python/issues/1160
-            # add processing with on_skipped_request hook
+            skipped_tasks = [
+                asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped
+            ]
+            await asyncio.gather(*skipped_tasks)
             self._logger.warning('Some requests were skipped because they were disallowed based on the robots.txt file')
 
         request_manager = await self.get_request_manager()
@@ -996,6 +1009,30 @@ async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawling
         except Exception as e:
             raise UserDefinedErrorHandlerError('Exception thrown in user-defined failed request handler') from e
 
+    async def _handle_skipped_request(
+        self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
+    ) -> None:
+        if need_mark and isinstance(request, Request):
+            request_manager = await self.get_request_manager()
+
+            await wait_for(
+                lambda: request_manager.mark_request_as_handled(request),
+                timeout=self._internal_timeout,
+                timeout_message='Marking request as handled timed out after '
+                f'{self._internal_timeout.total_seconds()} seconds',
+                logger=self._logger,
+                max_retries=3,
+            )
+            request.state = RequestState.SKIPPED
+
+        url = request.url if isinstance(request, Request) else request
+
+        if self._on_skipped_request:
+            try:
+                await self._on_skipped_request(url, reason)
+            except Exception as e:
+                raise UserDefinedErrorHandlerError('Exception thrown in user-defined skipped request callback') from e
+
     def _get_message_from_error(self, error: Exception) -> str:
         """Get error message summary from exception.
 
@@ -1152,16 +1189,8 @@ async def __run_task_function(self) -> None:
                 self._logger.warning(
                     f'Skipping request {request.url} ({request.id}) because it is disallowed based on robots.txt'
                 )
-                await wait_for(
-                    lambda: request_manager.mark_request_as_handled(request),
-                    timeout=self._internal_timeout,
-                    timeout_message='Marking request as handled timed out after '
-                    f'{self._internal_timeout.total_seconds()} seconds',
-                    logger=self._logger,
-                    max_retries=3,
-                )
-                # TODO: https://github.com/apify/crawlee-python/issues/1160
-                # add processing with on_skipped_request hook
+
+                await self._handle_skipped_request(request, 'robots_txt', need_mark=True)
                 return
 
             if request.session_id:
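Note the error contract in `_handle_skipped_request` above: an exception escaping the user callback is re-raised as `UserDefinedErrorHandlerError`, so a callback that does fallible work (I/O, reporting) may want to guard it itself. Below is a hedged sketch of such a defensive callback; the report file path, the start URL, and the warning message are illustrative choices, not part of this commit.

import asyncio
from pathlib import Path

from crawlee import SkippedReason
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext

REPORT_FILE = Path('skipped_urls.txt')  # illustrative location


async def main() -> None:
    crawler = BeautifulSoupCrawler(respect_robots_txt_file=True)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        await context.enqueue_links()

    @crawler.on_skipped_request
    async def skipped_request_handler(url: str, reason: SkippedReason) -> None:
        try:
            # Fallible reporting work; guard it so a reporting problem does not
            # surface as UserDefinedErrorHandlerError from the crawl
            with REPORT_FILE.open('a', encoding='utf-8') as f:
                f.write(f'{url}\t{reason}\n')
        except OSError:
            crawler.log.warning(f'Could not record skipped URL {url} ({reason})')

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())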

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 9 additions & 2 deletions
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import asyncio
 import logging
 from functools import partial
 from typing import TYPE_CHECKING, Any, Callable, Generic, Literal, Union
@@ -292,6 +293,7 @@ async def extract_links(
             kwargs.setdefault('strategy', 'same-hostname')
 
             requests = list[Request]()
+            skipped = list[str]()
             base_user_data = user_data or {}
 
             elements = await context.page.query_selector_all(selector)
@@ -309,8 +311,7 @@ async def extract_links(
                 url = convert_to_absolute_url(base_url, url)
 
                 if robots_txt_file and not robots_txt_file.is_allowed(url):
-                    # TODO: https://github.com/apify/crawlee-python/issues/1160
-                    # add processing with on_skipped_request hook
+                    skipped.append(url)
                     continue
 
                 request_option = RequestOptions({'url': url, 'user_data': {**base_user_data}, 'label': label})
@@ -334,6 +335,12 @@ async def extract_links(
 
                 requests.append(request)
 
+            if skipped:
+                skipped_tasks = [
+                    asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped
+                ]
+                await asyncio.gather(*skipped_tasks)
+
             return requests
 
         return extract_links

tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py

Lines changed: 24 additions & 1 deletion
@@ -3,7 +3,7 @@
 from typing import TYPE_CHECKING
 from unittest import mock
 
-from crawlee import ConcurrencySettings, HttpHeaders, RequestTransformAction
+from crawlee import ConcurrencySettings, HttpHeaders, RequestTransformAction, SkippedReason
 from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
 
 if TYPE_CHECKING:
@@ -160,3 +160,26 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
         str(server_url / 'start_enqueue'),
         str(server_url / 'sub_index'),
     }
+
+
+async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:
+    crawler = BeautifulSoupCrawler(http_client=http_client, respect_robots_txt_file=True)
+    skip = mock.Mock()
+
+    @crawler.router.default_handler
+    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
+        await context.enqueue_links()
+
+    @crawler.on_skipped_request
+    async def skipped_hook(url: str, _reason: SkippedReason) -> None:
+        skip(url)
+
+    await crawler.run([str(server_url / 'start_enqueue')])
+
+    skipped = {call[0][0] for call in skip.call_args_list}
+
+    assert skipped == {
+        str(server_url / 'page_1'),
+        str(server_url / 'page_2'),
+        str(server_url / 'page_3'),
+    }

tests/unit/crawlers/_parsel/test_parsel_crawler.py

Lines changed: 24 additions & 1 deletion
@@ -6,7 +6,7 @@
 
 import pytest
 
-from crawlee import ConcurrencySettings, HttpHeaders, Request, RequestTransformAction
+from crawlee import ConcurrencySettings, HttpHeaders, Request, RequestTransformAction, SkippedReason
 from crawlee.crawlers import ParselCrawler
 
 if TYPE_CHECKING:
@@ -256,3 +256,26 @@ async def request_handler(context: ParselCrawlingContext) -> None:
         str(server_url / 'start_enqueue'),
         str(server_url / 'sub_index'),
     }
+
+
+async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:
+    crawler = ParselCrawler(http_client=http_client, respect_robots_txt_file=True)
+    skip = mock.Mock()
+
+    @crawler.router.default_handler
+    async def request_handler(context: ParselCrawlingContext) -> None:
+        await context.enqueue_links()
+
+    @crawler.on_skipped_request
+    async def skipped_hook(url: str, _reason: SkippedReason) -> None:
+        skip(url)
+
+    await crawler.run([str(server_url / 'start_enqueue')])
+
+    skipped = {call[0][0] for call in skip.call_args_list}
+
+    assert skipped == {
+        str(server_url / 'page_1'),
+        str(server_url / 'page_2'),
+        str(server_url / 'page_3'),
+    }

tests/unit/crawlers/_playwright/test_playwright_crawler.py

Lines changed: 24 additions & 1 deletion
@@ -11,7 +11,7 @@
 
 import pytest
 
-from crawlee import ConcurrencySettings, HttpHeaders, Request, RequestTransformAction
+from crawlee import ConcurrencySettings, HttpHeaders, Request, RequestTransformAction, SkippedReason
 from crawlee.crawlers import PlaywrightCrawler
 from crawlee.fingerprint_suite import (
     DefaultFingerprintGenerator,
@@ -618,6 +618,29 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
     }
 
 
+async def test_on_skipped_request(server_url: URL) -> None:
+    crawler = PlaywrightCrawler(respect_robots_txt_file=True)
+    skip = mock.Mock()
+
+    @crawler.router.default_handler
+    async def request_handler(context: PlaywrightCrawlingContext) -> None:
+        await context.enqueue_links()
+
+    @crawler.on_skipped_request
+    async def skipped_hook(url: str, _reason: SkippedReason) -> None:
+        skip(url)
+
+    await crawler.run([str(server_url / 'start_enqueue')])
+
+    skipped = {call[0][0] for call in skip.call_args_list}
+
+    assert skipped == {
+        str(server_url / 'page_1'),
+        str(server_url / 'page_2'),
+        str(server_url / 'page_3'),
+    }
+
+
 async def test_send_request(server_url: URL) -> None:
     """Check that the persist context works with fingerprints."""
     check_data: dict[str, Any] = {}
