Skip to content

Commit 807edef

Browse files
vdusek and claude committed
fix: Parallelize response processing and clean up ApifyRequestList
Process all remote URL fetches and their response bodies in parallel using a single `asyncio.gather` call instead of sequential processing. Refactor `open` into a `@classmethod`, inline `_create_request_list` into `open`, and extract `_process_remote_url` as a static helper.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent cf3d6a3 commit 807edef

File tree

1 file changed

+37
-45
lines changed

1 file changed

+37
-45
lines changed

src/apify/request_loaders/_apify_request_list.py

Lines changed: 37 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import asyncio
44
import re
5+
from itertools import chain
56
from typing import Annotated, Any
67

78
from pydantic import BaseModel, Field, TypeAdapter
@@ -43,8 +44,10 @@ class ApifyRequestList(RequestList):
4344
Method open is used to create RequestList from actor's requestListSources input.
4445
"""
4546

46-
@staticmethod
47+
@classmethod
4748
async def open(
49+
cls,
50+
*,
4851
name: str | None = None,
4952
request_list_sources_input: list[dict[str, Any]] | None = None,
5053
http_client: HttpClient | None = None,
@@ -72,12 +75,7 @@ async def open(
7275
```
7376
"""
7477
request_list_sources_input = request_list_sources_input or []
75-
return await ApifyRequestList._create_request_list(name, request_list_sources_input, http_client)
7678

77-
@staticmethod
78-
async def _create_request_list(
79-
name: str | None, request_list_sources_input: list[dict[str, Any]], http_client: HttpClient | None
80-
) -> ApifyRequestList:
8179
if not http_client:
8280
http_client = ImpitHttpClient()
8381

@@ -86,15 +84,30 @@ async def _create_request_list(
8684
simple_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _SimpleUrlInput)]
8785
remote_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _RequestsFromUrlInput)]
8886

89-
simple_url_requests = ApifyRequestList._create_requests_from_input(simple_url_inputs)
90-
remote_url_requests = await ApifyRequestList._fetch_requests_from_url(
91-
remote_url_inputs, http_client=http_client
92-
)
87+
simple_url_requests = cls._create_requests_from_input(simple_url_inputs)
88+
remote_url_requests = await cls._fetch_requests_from_url(remote_url_inputs, http_client)
9389

9490
return ApifyRequestList(name=name, requests=simple_url_requests + remote_url_requests)
9591

92+
@classmethod
93+
async def _fetch_requests_from_url(
94+
cls,
95+
remote_url_requests_inputs: list[_RequestsFromUrlInput],
96+
http_client: HttpClient,
97+
) -> list[Request]:
98+
"""Create list of requests from url.
99+
100+
Send GET requests to urls defined in each requests_from_url of remote_url_requests_inputs. Extract links from
101+
each response body using URL_NO_COMMAS_REGEX regex. Create list of Requests from collected links and additional
102+
inputs stored in other attributes of each remote_url_requests_inputs.
103+
"""
104+
tasks = [cls._process_remote_url(request_input, http_client) for request_input in remote_url_requests_inputs]
105+
results = await asyncio.gather(*tasks)
106+
return list(chain.from_iterable(results))
107+
96108
@staticmethod
97109
def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> list[Request]:
110+
"""Create `Request` objects from simple URL inputs."""
98111
return [
99112
Request.from_url(
100113
method=request_input.method,
@@ -107,40 +120,19 @@ def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> lis
107120
]
108121

109122
@staticmethod
110-
async def _fetch_requests_from_url(
111-
remote_url_requests_inputs: list[_RequestsFromUrlInput],
112-
http_client: HttpClient,
113-
) -> list[Request]:
114-
"""Create list of requests from url.
123+
async def _process_remote_url(request_input: _RequestsFromUrlInput, http_client: HttpClient) -> list[Request]:
124+
"""Fetch a remote URL and extract links from the response body."""
125+
http_response = await http_client.send_request(method='GET', url=request_input.requests_from_url)
126+
response_body = await http_response.read()
127+
matches = re.finditer(URL_NO_COMMAS_REGEX, response_body.decode('utf-8'))
115128

116-
Send GET requests to urls defined in each requests_from_url of remote_url_requests_inputs. Extract links from
117-
each response body using URL_NO_COMMAS_REGEX regex. Create list of Requests from collected links and additional
118-
inputs stored in other attributes of each remote_url_requests_inputs.
119-
"""
120-
created_requests: list[Request] = []
121-
122-
# Fetch all remote URLs in parallel.
123-
responses = await asyncio.gather(
124-
*[
125-
http_client.send_request(method='GET', url=remote_url_input.requests_from_url)
126-
for remote_url_input in remote_url_requests_inputs
127-
]
128-
)
129-
130-
# Process each response and extract links.
131-
for request_input, http_response in zip(remote_url_requests_inputs, responses, strict=True):
132-
response_body = await http_response.read()
133-
matches = re.finditer(URL_NO_COMMAS_REGEX, response_body.decode('utf-8'))
134-
135-
created_requests.extend(
136-
Request.from_url(
137-
match.group(0),
138-
method=request_input.method,
139-
payload=request_input.payload.encode('utf-8'),
140-
headers=request_input.headers,
141-
user_data=request_input.user_data,
142-
)
143-
for match in matches
129+
return [
130+
Request.from_url(
131+
url=match.group(0),
132+
method=request_input.method,
133+
payload=request_input.payload.encode('utf-8'),
134+
headers=request_input.headers,
135+
user_data=request_input.user_data,
144136
)
145-
146-
return created_requests
137+
for match in matches
138+
]

0 commit comments

Comments (0)