@@ -39,9 +39,30 @@ class _SimpleUrlInput(_RequestDetails):
3939
4040@docs_group ('Request loaders' )
4141class ApifyRequestList (RequestList ):
42- """Extends crawlee RequestList .
42+ """A request list that can be constructed from the standard Apify `requestListSources` Actor input format.
4343
44- Method open is used to create RequestList from actor's requestListSources input.
44+ This extends the Crawlee `RequestList` with the ability to parse the request list sources input commonly
45+ used in Apify Actors. It supports two kinds of entries:
46+
47+ - **Direct URLs** - entries with a `url` key are converted to requests directly.
48+ - **Remote URL lists** - entries with a `requestsFromUrl` key point to a remote resource (e.g. a plain-text
49+ file). The resource is fetched and all URLs found in the response body are extracted and converted to requests.
50+
51+ Both kinds of entries can optionally specify `method`, `payload`, `headers`, and `userData` fields that will be
52+ applied to every request created from that entry.
53+
54+ ### Usage
55+
56+ ```python
57+ from apify import Actor
58+ from apify.request_loaders import ApifyRequestList
59+
60+ async with Actor:
61+ actor_input = await Actor.get_input() or {}
62+ request_list = await ApifyRequestList.open(
63+ request_list_sources_input=actor_input.get('requestListSources', []),
64+ )
65+ ```
4566 """
4667
4768 @classmethod
@@ -52,27 +73,22 @@ async def open(
5273 request_list_sources_input : list [dict [str , Any ]] | None = None ,
5374 http_client : HttpClient | None = None ,
5475 ) -> ApifyRequestList :
55- """Initialize a new instance from request list source input.
76+ """Create a new `ApifyRequestList` from the standard Apify request list sources input.
77+
78+ Each entry in `request_list_sources_input` is a dict with either a `url` key (for a direct URL) or
79+ a `requestsFromUrl` key (for a remote resource whose response body is scanned for URLs). Optional keys
80+ `method`, `payload`, `headers`, and `userData` are applied to every request produced from that entry.
5681
5782 Args:
58- name: Name of the returned RequestList.
59- request_list_sources_input: List of dicts with either url key or requestsFromUrl key.
60- http_client: Client that will be used to send get request to urls defined by value of requestsFromUrl keys.
83+ name: An optional name for the request list, used for state persistence.
84+ request_list_sources_input: A list of request source dicts in the standard Apify format. Each dict must
85+ contain either a `url` key or a `requestsFromUrl` key. If `None` or empty, an empty request list
86+ is returned.
87+ http_client: HTTP client used to fetch remote URL lists (entries with `requestsFromUrl`). Defaults to
88+ `ImpitHttpClient` if not provided.
6189
6290 Returns:
63- RequestList created from request_list_sources_input.
64-
65- ### Usage
66-
67- ```python
68- example_input = [
69- # Gather urls from response body.
70- {'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'},
71- # Directly include this url.
72- {'url': 'https://crawlee.dev', 'method': 'GET'}
73- ]
74- request_list = await RequestList.open(request_list_sources_input=example_input)
75- ```
91+ A new `ApifyRequestList` populated with the resolved requests.
7692 """
7793 request_list_sources_input = request_list_sources_input or []
7894
@@ -95,11 +111,11 @@ async def _fetch_requests_from_url(
95111 remote_url_requests_inputs : list [_RequestsFromUrlInput ],
96112 http_client : HttpClient ,
97113 ) -> list [Request ]:
98- """Create list of requests from url .
114+ """Fetch all remote URL sources concurrently and return the extracted requests.
99115
100- Send GET requests to urls defined in each requests_from_url of remote_url_requests_inputs. Extract links from
101- each response body using URL_NO_COMMAS_REGEX regex. Create list of Requests from collected links and additional
102- inputs stored in other attributes of each remote_url_requests_inputs .
116+ For each entry, a GET request is sent to the `requests_from_url` URL. All URLs matching `URL_NO_COMMAS_REGEX`
117+ are extracted from the response body and turned into `Request` objects, inheriting `method`, `payload`,
118+ `headers`, and `user_data` from the source entry.
103119 """
104120 tasks = [cls ._process_remote_url (request_input , http_client ) for request_input in remote_url_requests_inputs ]
105121 results = await asyncio .gather (* tasks )
0 commit comments