22
33import asyncio
44import re
5+ from itertools import chain
56from typing import Annotated , Any
67
78from pydantic import BaseModel , Field , TypeAdapter
@@ -43,8 +44,10 @@ class ApifyRequestList(RequestList):
4344 Method open is used to create RequestList from actor's requestListSources input.
4445 """
4546
@classmethod
async def open(
    cls,
    *,
    name: str | None = None,
    request_list_sources_input: list[dict[str, Any]] | None = None,
    http_client: HttpClient | None = None,
) -> ApifyRequestList:
    """Create an `ApifyRequestList` from the actor's `requestListSources` input.

    Args:
        name: Name of the returned request list.
        request_list_sources_input: Entries in the actor's `requestListSources` input format; each entry is
            either a plain URL definition or a `requestsFromUrl` remote source. Defaults to an empty list.
        http_client: Client used to fetch remote sources; a new `ImpitHttpClient` is created when omitted.

    Returns:
        An `ApifyRequestList` populated from both the plain URL entries and the fetched remote sources.
    """
    request_list_sources_input = request_list_sources_input or []

    if not http_client:
        http_client = ImpitHttpClient()

    # NOTE(review): this validation step sat in a diff region omitted from the visible hunks;
    # confirm the adapter name against the module-level TypeAdapter before relying on it.
    url_inputs = url_input_adapter.validate_python(request_list_sources_input)

    # Split the validated inputs by kind: inline URLs vs. remote `requestsFromUrl` sources.
    simple_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _SimpleUrlInput)]
    remote_url_inputs = [url_input for url_input in url_inputs if isinstance(url_input, _RequestsFromUrlInput)]

    simple_url_requests = cls._create_requests_from_input(simple_url_inputs)
    remote_url_requests = await cls._fetch_requests_from_url(remote_url_inputs, http_client)

    return ApifyRequestList(name=name, requests=simple_url_requests + remote_url_requests)
9591
@classmethod
async def _fetch_requests_from_url(
    cls,
    remote_url_requests_inputs: list[_RequestsFromUrlInput],
    http_client: HttpClient,
) -> list[Request]:
    """Create a list of requests from remote URL sources.

    Sends a GET request to the URL in each input's `requests_from_url`, extracts links from each response
    body with the `URL_NO_COMMAS_REGEX` pattern, and builds `Request` objects from the collected links plus
    the additional attributes (method, payload, headers, user data) of each input.
    """
    # Fan the per-source fetch+parse work out concurrently, then flatten the per-source result lists.
    tasks = [cls._process_remote_url(request_input, http_client) for request_input in remote_url_requests_inputs]
    results = await asyncio.gather(*tasks)
    return list(chain.from_iterable(results))
107+
@staticmethod
def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> list[Request]:
    """Create `Request` objects from simple URL inputs."""
    # NOTE(review): the keyword arguments between `method=` and the closing bracket fall in a diff region
    # omitted from the visible hunks; they are reconstructed to mirror `_process_remote_url` — verify.
    return [
        Request.from_url(
            method=request_input.method,
            url=request_input.url,
            payload=request_input.payload.encode('utf-8'),
            headers=request_input.headers,
            user_data=request_input.user_data,
        )
        for request_input in simple_url_inputs
    ]
108121
@staticmethod
async def _process_remote_url(request_input: _RequestsFromUrlInput, http_client: HttpClient) -> list[Request]:
    """Fetch a remote URL and extract links from the response body.

    Issues a GET request to `request_input.requests_from_url`, decodes the body as UTF-8, scans it with
    `URL_NO_COMMAS_REGEX`, and returns one `Request` per matched link, carrying over the input's method,
    payload, headers and user data.
    """
    http_response = await http_client.send_request(method='GET', url=request_input.requests_from_url)
    response_body = await http_response.read()
    matches = re.finditer(URL_NO_COMMAS_REGEX, response_body.decode('utf-8'))

    return [
        Request.from_url(
            url=match.group(0),
            method=request_input.method,
            payload=request_input.payload.encode('utf-8'),
            headers=request_input.headers,
            user_data=request_input.user_data,
        )
        for match in matches
    ]
0 commit comments