Skip to content

Commit 0b2e3fc

Browse files
authored
fix: Handle invalid URLs in RequestList (#1803)
### Description
- Handle invalid URLs in `RequestList`.

### Issues
- Closes: #1802

### Testing
- Add new tests for `RequestList`.
1 parent 9becf12 commit 0b2e3fc

File tree

2 files changed, with 47 additions and 8 deletions.

2 files changed, with 47 additions and 8 deletions.

src/crawlee/request_loaders/_request_list.py

Lines changed: 19 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66
from logging import getLogger
77
from typing import Annotated
88

9-
from pydantic import BaseModel, ConfigDict, Field
9+
from pydantic import BaseModel, ConfigDict, Field, ValidationError
1010
from typing_extensions import override
1111

1212
from crawlee._request import Request
@@ -106,10 +106,14 @@ async def _get_state(self) -> RequestListState:
106106
if self._persist_request_data:
107107
async with self._requests_lock:
108108
if not await self._requests_data.has_persisted_state():
109-
self._requests_data.current_value.requests = [
110-
request if isinstance(request, Request) else Request.from_url(request)
111-
async for request in self._requests
112-
]
109+
self._requests_data.current_value.requests = []
110+
async for processing_request in self._requests:
111+
try:
112+
request = self._transform_request(processing_request)
113+
except ValidationError:
114+
logger.warning(f'Invalid request encountered in the request list: {processing_request}')
115+
continue
116+
self._requests_data.current_value.requests.append(request)
113117
await self._requests_data.persist_state()
114118

115119
self._requests = self._iterate_in_threadpool(
@@ -202,11 +206,18 @@ async def _ensure_next_request(self) -> None:
202206
self._next = (self._next[0], to_enqueue[0])
203207

204208
async def _dequeue_requests(self, count: int) -> AsyncGenerator[Request | None]:
205-
for _ in range(count):
209+
while count > 0:
206210
try:
207-
yield self._transform_request(await self._requests.__anext__())
208-
except StopAsyncIteration: # noqa: PERF203
211+
processing_request = await self._requests.__anext__()
212+
try:
213+
request = self._transform_request(processing_request)
214+
except ValidationError:
215+
logger.warning(f'Invalid request encountered in the request list: {processing_request}')
216+
continue
217+
yield request
218+
except StopAsyncIteration:
209219
yield None
220+
count -= 1
210221

211222
async def _iterate_in_threadpool(self, iterable: Iterable[str | Request]) -> AsyncIterator[str | Request]:
212223
"""Inspired by a function of the same name from encode/starlette."""

tests/unit/request_loaders/test_request_list.py

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -240,3 +240,31 @@ async def test_persist_requests_key_only_persists_once() -> None:
240240
fetched_request = await request_list_2.fetch_next_request()
241241
assert fetched_request is not None
242242
assert fetched_request.url == 'https://once2.placeholder.com' # From original data
243+
244+
245+
async def test_handle_invalid_url() -> None:
246+
"""Test that invalid URLs are handled gracefully."""
247+
request_list = RequestList(['invalid-url.com', 'https://valid.placeholder.com'])
248+
249+
# First request is invalid, should be skipped without crashing
250+
request = await request_list.fetch_next_request()
251+
assert request is not None
252+
assert request.url == 'https://valid.placeholder.com'
253+
await request_list.mark_request_as_handled(request)
254+
255+
256+
async def test_handle_invalid_url_with_persistence() -> None:
257+
"""Test that invalid URLs are handled gracefully even when persistence is enabled."""
258+
persist_key = 'test_invalid_url_persistence'
259+
request_list = RequestList(['invalid-url.com', 'https://valid.placeholder.com'], persist_requests_key=persist_key)
260+
261+
# First request is invalid, should be skipped without crashing
262+
request = await request_list.fetch_next_request()
263+
assert request is not None
264+
assert request.url == 'https://valid.placeholder.com'
265+
await request_list.mark_request_as_handled(request)
266+
267+
# Check that the valid URL was persisted and the invalid one was not
268+
kvs = await KeyValueStore.open()
269+
persisted_data = await kvs.get_value(persist_key)
270+
assert persisted_data is not None

0 commit comments

Comments
 (0)