Skip to content

Commit 32b9755

Browse files
author
Lorenz Braun
committed
feat: provide Request instances in skipped request callbacks
1 parent 6a9f6f4 commit 32b9755

7 files changed

Lines changed: 99 additions & 60 deletions

File tree

docs/examples/code_examples/respect_robots_on_skipped_request.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import asyncio
22

3-
from crawlee import SkippedReason
3+
from crawlee import Request, SkippedReason
44
from crawlee.crawlers import (
55
BeautifulSoupCrawler,
66
BeautifulSoupCrawlingContext,
@@ -18,7 +18,9 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
1818
# highlight-start
1919
# This handler is called when a request is skipped
2020
@crawler.on_skipped_request
21-
async def skipped_request_handler(url: str, reason: SkippedReason) -> None:
21+
async def skipped_request_handler(request: Request, reason: SkippedReason) -> None:
22+
url = request.url
23+
2224
# Check if the request was skipped due to robots.txt rules
2325
if reason == 'robots_txt':
2426
crawler.log.info(f'Skipped {url} due to robots.txt rules.')

src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,18 @@ async def extract_links(
206206
**kwargs: Unpack[EnqueueLinksKwargs],
207207
) -> list[Request]:
208208
requests = list[Request]()
209+
skipped = list[Request]()
210+
211+
def create_request(request_options: RequestOptions) -> Request | None:
212+
try:
213+
return Request.from_url(**request_options)
214+
except ValidationError as exc:
215+
context.log.debug(
216+
f'Skipping URL "{request_options["url"]}" due to invalid format: {exc}. '
217+
'This may be caused by a malformed URL or unsupported URL scheme. '
218+
'Please ensure the URL is correct and retry.'
219+
)
220+
return None
209221

210222
base_user_data = user_data or {}
211223

@@ -226,11 +238,19 @@ async def extract_links(
226238
else context.request.loaded_url or context.request.url
227239
)
228240
links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
241+
skipped_iterator = iter([])
229242

230243
if robots_txt_file:
231-
skipped, links_iterator = partition(robots_txt_file.is_allowed, links_iterator)
232-
else:
233-
skipped = iter([])
244+
skipped_iterator, links_iterator = partition(robots_txt_file.is_allowed, links_iterator)
245+
246+
for url in skipped_iterator:
247+
request_options = RequestOptions(
248+
url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
249+
)
250+
request = create_request(request_options)
251+
252+
if request is not None:
253+
skipped.append(request)
234254

235255
for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
236256
request_options = RequestOptions(
@@ -244,17 +264,10 @@ async def extract_links(
244264
if transform_request_options != 'unchanged':
245265
request_options = transform_request_options
246266

247-
try:
248-
request = Request.from_url(**request_options)
249-
except ValidationError as exc:
250-
context.log.debug(
251-
f'Skipping URL "{url}" due to invalid format: {exc}. '
252-
'This may be caused by a malformed URL or unsupported URL scheme. '
253-
'Please ensure the URL is correct and retry.'
254-
)
255-
continue
267+
request = create_request(request_options)
256268

257-
requests.append(request)
269+
if request is not None:
270+
requests.append(request)
258271

259272
skipped_tasks = [
260273
asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@
110110

111111
ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
112112
FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
113-
SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
113+
SkippedRequestCallback = Callable[[Request, SkippedReason], Awaitable[None]]
114114

115115

116116
class _BasicCrawlerOptions(TypedDict):
@@ -1210,17 +1210,15 @@ async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawling
12101210
raise UserDefinedErrorHandlerError('Exception thrown in user-defined failed request handler') from e
12111211

12121212
async def _handle_skipped_request(
1213-
self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
1213+
self, request: Request, reason: SkippedReason, *, need_mark: bool = False
12141214
) -> None:
12151215
if need_mark and isinstance(request, Request):
12161216
request.state = RequestState.SKIPPED
12171217
await self._mark_request_as_handled(request)
12181218

1219-
url = request.url if isinstance(request, Request) else request
1220-
12211219
if self._on_skipped_request:
12221220
try:
1223-
await self._on_skipped_request(url, reason)
1221+
await self._on_skipped_request(request, reason)
12241222
except Exception as e:
12251223
raise UserDefinedErrorHandlerError('Exception thrown in user-defined skipped request callback') from e
12261224

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -459,6 +459,18 @@ async def extract_links(
459459
The `PlaywrightCrawler` implementation of the `ExtractLinksFunction` function.
460460
"""
461461
requests = list[Request]()
462+
skipped = list[Request]()
463+
464+
def create_request(request_options: RequestOptions) -> Request | None:
465+
try:
466+
return Request.from_url(**request_options)
467+
except ValidationError as exc:
468+
context.log.debug(
469+
f'Skipping URL "{request_options["url"]}" due to invalid format: {exc}. '
470+
'This may be caused by a malformed URL or unsupported URL scheme. '
471+
'Please ensure the URL is correct and retry.'
472+
)
473+
return None
462474

463475
base_user_data = user_data or {}
464476

@@ -478,10 +490,19 @@ async def extract_links(
478490

479491
links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
480492

493+
skipped_iterator = iter([])
494+
481495
if robots_txt_file:
482-
skipped, links_iterator = partition(robots_txt_file.is_allowed, links_iterator)
483-
else:
484-
skipped = iter([])
496+
skipped_iterator, links_iterator = partition(robots_txt_file.is_allowed, links_iterator)
497+
498+
for url in skipped_iterator:
499+
request_options = RequestOptions(
500+
url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
501+
)
502+
request = create_request(request_options)
503+
504+
if request is not None:
505+
skipped.append(request)
485506

486507
for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
487508
request_options = RequestOptions(
@@ -495,17 +516,10 @@ async def extract_links(
495516
if transform_request_options != 'unchanged':
496517
request_options = transform_request_options
497518

498-
try:
499-
request = Request.from_url(**request_options)
500-
except ValidationError as exc:
501-
context.log.debug(
502-
f'Skipping URL "{url}" due to invalid format: {exc}. '
503-
'This may be caused by a malformed URL or unsupported URL scheme. '
504-
'Please ensure the URL is correct and retry.'
505-
)
506-
continue
519+
request = create_request(request_options)
507520

508-
requests.append(request)
521+
if request is not None:
522+
requests.append(request)
509523

510524
skipped_tasks = [
511525
asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped

tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -246,18 +246,22 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
246246
await context.enqueue_links()
247247

248248
@crawler.on_skipped_request
249-
async def skipped_hook(url: str, _reason: SkippedReason) -> None:
250-
skip(url)
249+
async def skipped_hook(request: Request, _reason: SkippedReason) -> None:
250+
skip(request)
251251

252252
await crawler.run([str(server_url / 'start_enqueue')])
253253

254-
expected_skip_calls = [
255-
mock.call(str(server_url / 'page_1')),
256-
mock.call(str(server_url / 'page_2')),
257-
mock.call(str(server_url / 'page_3')),
258-
mock.call(str(server_url / 'page_4')),
259-
]
260-
skip.assert_has_calls(expected_skip_calls, any_order=True)
254+
expected_skip_urls = {
255+
str(server_url / 'page_1'),
256+
str(server_url / 'page_2'),
257+
str(server_url / 'page_3'),
258+
str(server_url / 'page_4'),
259+
}
260+
261+
requests = [call.args[0] for call in skip.call_args_list]
262+
263+
assert all(isinstance(request, Request) for request in requests)
264+
assert {request.url for request in requests} == expected_skip_urls
261265

262266

263267
async def test_extract_links(server_url: URL, http_client: HttpClient) -> None:

tests/unit/crawlers/_parsel/test_parsel_crawler.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -330,18 +330,22 @@ async def request_handler(context: ParselCrawlingContext) -> None:
330330
await context.enqueue_links()
331331

332332
@crawler.on_skipped_request
333-
async def skipped_hook(url: str, _reason: SkippedReason) -> None:
334-
skip(url)
333+
async def skipped_hook(request: Request, _reason: SkippedReason) -> None:
334+
skip(request)
335335

336336
await crawler.run([str(server_url / 'start_enqueue')])
337337

338-
expected_skip_calls = [
339-
mock.call(str(server_url / 'page_1')),
340-
mock.call(str(server_url / 'page_2')),
341-
mock.call(str(server_url / 'page_3')),
342-
mock.call(str(server_url / 'page_4')),
343-
]
344-
skip.assert_has_calls(expected_skip_calls, any_order=True)
338+
expected_skip_urls = {
339+
str(server_url / 'page_1'),
340+
str(server_url / 'page_2'),
341+
str(server_url / 'page_3'),
342+
str(server_url / 'page_4'),
343+
}
344+
345+
requests = [call.args[0] for call in skip.call_args_list]
346+
347+
assert all(isinstance(request, Request) for request in requests)
348+
assert {request.url for request in requests} == expected_skip_urls
345349

346350

347351
async def test_extract_links(server_url: URL, http_client: HttpClient) -> None:

tests/unit/crawlers/_playwright/test_playwright_crawler.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -765,18 +765,22 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
765765
await context.enqueue_links()
766766

767767
@crawler.on_skipped_request
768-
async def skipped_hook(url: str, _reason: SkippedReason) -> None:
769-
skip(url)
768+
async def skipped_hook(request: Request, _reason: SkippedReason) -> None:
769+
skip(request)
770770

771771
await crawler.run([str(server_url / 'start_enqueue')])
772772

773-
expected_skip_calls = [
774-
mock.call(str(server_url / 'page_1')),
775-
mock.call(str(server_url / 'page_2')),
776-
mock.call(str(server_url / 'page_3')),
777-
mock.call(str(server_url / 'page_4')),
778-
]
779-
skip.assert_has_calls(expected_skip_calls, any_order=True)
773+
expected_skip_urls = {
774+
str(server_url / 'page_1'),
775+
str(server_url / 'page_2'),
776+
str(server_url / 'page_3'),
777+
str(server_url / 'page_4'),
778+
}
779+
780+
requests = [call.args[0] for call in skip.call_args_list]
781+
782+
assert all(isinstance(request, Request) for request in requests)
783+
assert {request.url for request in requests} == expected_skip_urls
780784

781785

782786
async def test_send_request(server_url: URL) -> None:

0 commit comments

Comments
 (0)