Skip to content

Commit d6bb0b4

Browse files
authored
fix: Fix max_requests_per_crawl excluding failed requests (#1766)
### Description - `BasicCrawler` should also consider failed requests when deciding whether `max_requests_per_crawl` has been reached ### Issues - Closes: #1765 ### Testing - Added unit test ### Checklist - [ ] CI passed
1 parent 7f7d45e commit d6bb0b4

File tree

2 files changed

+10
-4
lines changed

2 files changed

+10
-4
lines changed

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -566,7 +566,7 @@ def _stop_if_max_requests_count_exceeded(self) -> None:
566566
if self._max_requests_per_crawl is None:
567567
return
568568

569-
if self._statistics.state.requests_finished >= self._max_requests_per_crawl:
569+
if self._statistics.state.requests_total >= self._max_requests_per_crawl:
570570
self.stop(
571571
reason=f'The crawler has reached its limit of {self._max_requests_per_crawl} requests per crawl. '
572572
)

tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -909,7 +909,10 @@ async def handler_three(context: BasicCrawlingContext) -> None:
909909
assert (await store.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0')) == {'hello': 'last_world'}
910910

911911

912-
async def test_max_requests_per_crawl() -> None:
912+
@pytest.mark.parametrize(
913+
'use_failed_requests', [pytest.param(True, id='failed requests'), pytest.param(False, id='finished requests')]
914+
)
915+
async def test_max_requests_per_crawl(*, use_failed_requests: bool) -> None:
913916
start_urls = [
914917
'http://test.io/1',
915918
'http://test.io/2',
@@ -927,14 +930,17 @@ async def test_max_requests_per_crawl() -> None:
927930

928931
@crawler.router.default_handler
929932
async def handler(context: BasicCrawlingContext) -> None:
933+
if use_failed_requests:
934+
raise RuntimeError('Arbitrary crash for testing purposes')
930935
processed_urls.append(context.request.url)
931936

932937
stats = await crawler.run(start_urls)
933938

934939
# Verify that only 3 out of the 5 provided URLs were processed
935-
assert len(processed_urls) == 3
940+
if not use_failed_requests:
941+
assert len(processed_urls) == 3
942+
assert stats.requests_finished == 3
936943
assert stats.requests_total == 3
937-
assert stats.requests_finished == 3
938944

939945

940946
async def test_max_crawl_depth() -> None:

0 commit comments

Comments (0)