Skip to content

Commit 71d7867

Browse files
authored
fix: Fix adding extra link for EnqueueLinksFunction with limit (#1674)
### Description - fix adding extra link for `EnqueueLinksFunction` with `limit` ### Issues - Closes: #1673 ### Testing - Add tests for `enqueue_links` with `limit` argument
1 parent 002c332 commit 71d7867

File tree

4 files changed

+79
-3
lines changed

4 files changed

+79
-3
lines changed

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1056,9 +1056,10 @@ def _enqueue_links_filter_iterator(
10561056
) and self._check_url_patterns(target_url, kwargs.get('include'), kwargs.get('exclude')):
10571057
yield request
10581058

```diff
-            limit = limit - 1 if limit is not None else None
-            if limit and limit <= 0:
-                break
+            if limit is not None:
+                limit -= 1
+                if limit <= 0:
+                    break
```
10621063

10631064
def _check_enqueue_strategy(
10641065
self,

tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -428,3 +428,28 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
428428

429429
assert handler_calls.called
430430
assert handler_calls.call_count == 1
431+
432+
433+
async def test_enqueue_links_with_limit(server_url: URL, http_client: HttpClient) -> None:
    """Check that `enqueue_links(limit=1)` enqueues exactly one link from the start page."""
    start_url = str(server_url / 'sub_index')

    crawler = BeautifulSoupCrawler(http_client=http_client)
    visit = mock.Mock()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        visit(context.request.url)
        await context.enqueue_links(limit=1)

    await crawler.run([start_url])

    # Collect the visited URLs in call order.
    call_urls = [call[0][0] for call in visit.call_args_list]

    assert call_urls[0] == start_url
    # Only one link should be enqueued from sub_index due to the limit
    assert set(call_urls) == {
        start_url,
        str(server_url / 'page_3'),
    }

tests/unit/crawlers/_parsel/test_parsel_crawler.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -445,3 +445,28 @@ async def handler(context: ParselCrawlingContext) -> None:
445445
await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias)
446446

447447
await crawler.run([str(server_url / 'start_enqueue')])
448+
449+
450+
async def test_enqueue_links_with_limit(server_url: URL, http_client: HttpClient) -> None:
    """Check that `enqueue_links(limit=1)` enqueues exactly one link from the start page."""
    start_url = str(server_url / 'sub_index')

    crawler = ParselCrawler(http_client=http_client)
    visit = mock.Mock()

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        visit(context.request.url)
        await context.enqueue_links(limit=1)

    await crawler.run([start_url])

    # Collect the visited URLs in call order.
    call_urls = [call[0][0] for call in visit.call_args_list]

    assert call_urls[0] == start_url
    # Only one link should be enqueued from sub_index due to the limit
    assert set(call_urls) == {
        start_url,
        str(server_url / 'page_3'),
    }

tests/unit/crawlers/_playwright/test_playwright_crawler.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1051,3 +1051,28 @@ async def failed_request_handler(context: BasicCrawlingContext, _error: Exceptio
10511051
}
10521052

10531053
await queue.drop()
1054+
1055+
1056+
async def test_enqueue_links_with_limit(server_url: URL) -> None:
    """Check that `enqueue_links(limit=1)` enqueues exactly one link from the start page."""
    start_url = str(server_url / 'sub_index')

    crawler = PlaywrightCrawler()
    visit = mock.Mock()

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        visit(context.request.url)
        await context.enqueue_links(limit=1)

    await crawler.run([start_url])

    # Collect the visited URLs in call order.
    call_urls = [call[0][0] for call in visit.call_args_list]

    assert call_urls[0] == start_url
    # Only one link should be enqueued from sub_index due to the limit
    assert set(call_urls) == {
        start_url,
        str(server_url / 'page_3'),
    }

0 commit comments

Comments (0)