|
11 | 11 | from crawlee._utils.docs import docs_group |
12 | 12 | from crawlee._utils.wait import wait_for_all_tasks_for_finish |
13 | 13 | from crawlee.request_loaders import RequestManager |
| 14 | +from crawlee.storage_clients.models import AddRequestsResponse |
14 | 15 |
|
15 | 16 | from ._base import Storage |
16 | 17 | from ._utils import validate_storage_name |
@@ -181,19 +182,20 @@ async def add_request( |
181 | 182 | forefront: bool = False, |
182 | 183 | ) -> ProcessedRequest | None: |
183 | 184 | request = self._transform_request(request) |
184 | | - response = await self._client.add_batch_of_requests([request], forefront=forefront) |
| 185 | + # Route the single request through the same retry mechanism as `add_requests`, so that adding one |
| 186 | + # request is just as durable as a batched add against best-effort backends that may return a request |
| 187 | + # as unprocessed. |
| 188 | + response = await self._process_batch([request], base_retry_wait=timedelta(seconds=1), forefront=forefront) |
185 | 189 |
|
186 | 190 | if response.processed_requests: |
187 | 191 | return response.processed_requests[0] |
188 | 192 |
|
189 | | - if response.unprocessed_requests: |
| 193 | + # `_process_batch` already warns when requests remain unprocessed after the retries are exhausted, so |
| 194 | + # only the empty-response case (neither processed nor unprocessed) needs a warning here. |
| 195 | + if not response.unprocessed_requests: |
190 | 196 | logger.warning( |
191 | | - f'Request {request.url} was not processed by storage client "{self._client.__class__.__name__}".' |
192 | | - ) |
193 | | - else: |
194 | | - logger.warning( |
195 | | - f'Request {request.url} was not processed by storage client "{self._client.__class__.__name__}" ' |
196 | | - 'received empty response.' |
| 197 | + f'Request {request.url} was not processed by storage client ' |
| 198 | + f'"{self._client.__class__.__name__}" (received an empty response).' |
197 | 199 | ) |
198 | 200 | return None |
199 | 201 |
|
@@ -352,33 +354,47 @@ async def _process_batch( |
352 | 354 | base_retry_wait: timedelta, |
353 | 355 | attempt: int = 1, |
354 | 356 | forefront: bool = False, |
355 | | - ) -> None: |
356 | | - """Process a batch of requests with automatic retry mechanism.""" |
| 357 | + ) -> AddRequestsResponse: |
| 358 | + """Process a batch of requests with automatic retry mechanism. |
| 359 | +
|
| 360 | + Returns: |
| 361 | + A response aggregating the requests processed across all attempts together with any that remained |
| 362 | + unprocessed after the retries were exhausted. |
| 363 | + """ |
357 | 364 | max_attempts = 5 |
358 | 365 | response = await self._client.add_batch_of_requests(batch, forefront=forefront) |
359 | 366 |
|
360 | | - if response.unprocessed_requests: |
361 | | - logger.debug(f'Following requests were not processed: {response.unprocessed_requests}.') |
362 | | - if attempt > max_attempts: |
363 | | - logger.warning( |
364 | | - f'Following requests were not processed even after {max_attempts} attempts:\n' |
365 | | - f'{response.unprocessed_requests}' |
366 | | - ) |
367 | | - else: |
368 | | - logger.debug('Retry to add requests.') |
369 | | - unprocessed_requests_unique_keys = {request.unique_key for request in response.unprocessed_requests} |
370 | | - retry_batch = [request for request in batch if request.unique_key in unprocessed_requests_unique_keys] |
371 | | - await asyncio.sleep((base_retry_wait * attempt).total_seconds()) |
372 | | - await self._process_batch( |
373 | | - retry_batch, |
374 | | - base_retry_wait=base_retry_wait, |
375 | | - attempt=attempt + 1, |
376 | | - forefront=forefront, |
377 | | - ) |
378 | | - |
379 | 367 | request_count = len(batch) - len(response.unprocessed_requests) |
380 | | - |
381 | 368 | if request_count: |
382 | 369 | logger.debug( |
383 | 370 | f'Added {request_count} requests to the queue. Processed requests: {response.processed_requests}' |
384 | 371 | ) |
| 372 | + |
| 373 | + if not response.unprocessed_requests: |
| 374 | + return response |
| 375 | + |
| 376 | + logger.debug(f'Following requests were not processed: {response.unprocessed_requests}.') |
| 377 | + if attempt > max_attempts: |
| 378 | + logger.warning( |
| 379 | + f'Following requests were not processed even after {max_attempts} attempts:\n' |
| 380 | + f'{response.unprocessed_requests}' |
| 381 | + ) |
| 382 | + return response |
| 383 | + |
| 384 | + logger.debug('Retry to add requests.') |
| 385 | + unprocessed_requests_unique_keys = {request.unique_key for request in response.unprocessed_requests} |
| 386 | + retry_batch = [request for request in batch if request.unique_key in unprocessed_requests_unique_keys] |
| 387 | + await asyncio.sleep((base_retry_wait * attempt).total_seconds()) |
| 388 | + retry_response = await self._process_batch( |
| 389 | + retry_batch, |
| 390 | + base_retry_wait=base_retry_wait, |
| 391 | + attempt=attempt + 1, |
| 392 | + forefront=forefront, |
| 393 | + ) |
| 394 | + |
| 395 | + # Fold the retry outcome back in: requests processed on retry join this attempt's processed requests, |
| 396 | + # and only those still unprocessed after the final attempt remain. |
| 397 | + return AddRequestsResponse( |
| 398 | + processed_requests=[*response.processed_requests, *retry_response.processed_requests], |
| 399 | + unprocessed_requests=retry_response.unprocessed_requests, |
| 400 | + ) |
0 commit comments