Skip to content

Commit 944753a

Browse files
authored
fix: Return None from add_request when storage client fails to enqueue request (#1775)
### Description - This PR updates the signature for `RequestManager.add_request` to return `None` when the storage client fails to process the request and returns it in `AddRequestsResponse.unprocessed_requests`. Previously, accessing `processed_requests[0]` on an empty list caused an `IndexError`. ### Testing - Tests updated in accordance with the new signature
1 parent ee1501f commit 944753a

File tree

4 files changed

+30
-7
lines changed

4 files changed

+30
-7
lines changed

src/crawlee/request_loaders/_request_manager.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ async def add_request(
2828
request: str | Request,
2929
*,
3030
forefront: bool = False,
31-
) -> ProcessedRequest:
31+
) -> ProcessedRequest | None:
3232
"""Add a single request to the manager and store it in underlying resource client.
3333
3434
Args:
@@ -37,7 +37,7 @@ async def add_request(
3737
of the manager.
3838
3939
Returns:
40-
Information about the request addition to the manager.
40+
Information about the request addition to the manager or None if the request was not added.
4141
"""
4242

4343
async def add_requests(
@@ -64,7 +64,8 @@ async def add_requests(
6464
processed_requests = list[ProcessedRequest]()
6565
for request in requests:
6666
processed_request = await self.add_request(request, forefront=forefront)
67-
processed_requests.append(processed_request)
67+
if processed_request:
68+
processed_requests.append(processed_request)
6869

6970
@abstractmethod
7071
async def reclaim_request(self, request: Request, *, forefront: bool = False) -> ProcessedRequest | None:

src/crawlee/request_loaders/_request_manager_tandem.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ async def is_finished(self) -> bool:
4949
return (await self._read_only_loader.is_finished()) and (await self._read_write_manager.is_finished())
5050

5151
@override
52-
async def add_request(self, request: str | Request, *, forefront: bool = False) -> ProcessedRequest:
52+
async def add_request(self, request: str | Request, *, forefront: bool = False) -> ProcessedRequest | None:
5353
return await self._read_write_manager.add_request(request, forefront=forefront)
5454

5555
@override

src/crawlee/storages/_request_queue.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,10 +158,23 @@ async def add_request(
158158
request: str | Request,
159159
*,
160160
forefront: bool = False,
161-
) -> ProcessedRequest:
161+
) -> ProcessedRequest | None:
162162
request = self._transform_request(request)
163163
response = await self._client.add_batch_of_requests([request], forefront=forefront)
164-
return response.processed_requests[0]
164+
165+
if response.processed_requests:
166+
return response.processed_requests[0]
167+
168+
if response.unprocessed_requests:
169+
logger.warning(
170+
f'Request {request.url} was not processed by storage client "{self._client.__class__.__name__}".'
171+
)
172+
else:
173+
logger.warning(
174+
f'Request {request.url} was not processed by storage client "{self._client.__class__.__name__}" '
175+
'received empty response.'
176+
)
177+
return None
165178

166179
@override
167180
async def add_requests(

tests/unit/storages/test_request_queue.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ async def test_add_request_string_url(rq: RequestQueue) -> None:
125125
result = await rq.add_request(url)
126126

127127
# Verify request was added
128+
assert result is not None
128129
assert result.unique_key is not None
129130
assert result.was_already_present is False
130131
assert result.was_already_handled is False
@@ -142,6 +143,7 @@ async def test_add_request_object(rq: RequestQueue) -> None:
142143
result = await rq.add_request(request)
143144

144145
# Verify request was added
146+
assert result is not None
145147
assert result.unique_key is not None
146148
assert result.was_already_present is False
147149
assert result.was_already_handled is False
@@ -158,10 +160,13 @@ async def test_add_duplicate_request(rq: RequestQueue) -> None:
158160
url = 'https://example.com'
159161
first_result = await rq.add_request(url)
160162

163+
assert first_result is not None
164+
161165
# Add the same request again
162166
second_result = await rq.add_request(url)
163167

164168
# Verify the second request was detected as duplicate
169+
assert second_result is not None
165170
assert second_result.was_already_present is True
166171
assert second_result.unique_key == first_result.unique_key
167172

@@ -336,14 +341,17 @@ async def test_fetch_next_request_and_mark_handled(rq: RequestQueue) -> None:
336341
assert metadata.pending_request_count == 0
337342

338343
# Verify queue is empty
339-
empty_request = await rq.fetch_next_request()
344+
empty_request: Request | None = await rq.fetch_next_request()
340345
assert empty_request is None
341346

342347

343348
async def test_get_request_by_id(rq: RequestQueue) -> None:
344349
"""Test retrieving a request by its ID."""
345350
# Add a request
346351
added_result = await rq.add_request('https://example.com')
352+
353+
assert added_result is not None
354+
347355
unique_key = added_result.unique_key
348356

349357
# Retrieve the request by ID
@@ -791,6 +799,7 @@ async def test_alias_request_operations(
791799

792800
for url in urls:
793801
result = await rq.add_request(url)
802+
assert result is not None
794803
assert result.was_already_present is False
795804

796805
# Test queue metadata

0 commit comments

Comments
 (0)