Skip to content

Commit 99b34f9

Browse files
committed
Improve docstring of ApifyRequestList
1 parent: 668ebcb · commit: 99b34f9

1 file changed

Lines changed: 39 additions & 23 deletions

File tree

src/apify/request_loaders/_apify_request_list.py

Lines changed: 39 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -39,9 +39,30 @@ class _SimpleUrlInput(_RequestDetails):
3939

4040
@docs_group('Request loaders')
4141
class ApifyRequestList(RequestList):
42-
"""Extends crawlee RequestList.
42+
"""A request list that can be constructed from the standard Apify `requestListSources` Actor input format.
4343
44-
Method open is used to create RequestList from actor's requestListSources input.
44+
This extends the Crawlee `RequestList` with the ability to parse the request list sources input commonly
45+
used in Apify Actors. It supports two kinds of entries:
46+
47+
- **Direct URLs** - entries with a `url` key are converted to requests directly.
48+
- **Remote URL lists** - entries with a `requestsFromUrl` key point to a remote resource (e.g. a plain-text
49+
file). The resource is fetched and all URLs found in the response body are extracted and converted to requests.
50+
51+
Both kinds of entries can optionally specify `method`, `payload`, `headers`, and `userData` fields that will be
52+
applied to every request created from that entry.
53+
54+
### Usage
55+
56+
```python
57+
from apify import Actor
58+
from apify.request_loaders import ApifyRequestList
59+
60+
async with Actor:
61+
actor_input = await Actor.get_input() or {}
62+
request_list = await ApifyRequestList.open(
63+
request_list_sources_input=actor_input.get('requestListSources', []),
64+
)
65+
```
4566
"""
4667

4768
@classmethod
@@ -52,27 +73,22 @@ async def open(
5273
request_list_sources_input: list[dict[str, Any]] | None = None,
5374
http_client: HttpClient | None = None,
5475
) -> ApifyRequestList:
55-
"""Initialize a new instance from request list source input.
76+
"""Create a new `ApifyRequestList` from the standard Apify request list sources input.
77+
78+
Each entry in `request_list_sources_input` is a dict with either a `url` key (for a direct URL) or
79+
a `requestsFromUrl` key (for a remote resource whose response body is scanned for URLs). Optional keys
80+
`method`, `payload`, `headers`, and `userData` are applied to every request produced from that entry.
5681
5782
Args:
58-
name: Name of the returned RequestList.
59-
request_list_sources_input: List of dicts with either url key or requestsFromUrl key.
60-
http_client: Client that will be used to send get request to urls defined by value of requestsFromUrl keys.
83+
name: An optional name for the request list, used for state persistence.
84+
request_list_sources_input: A list of request source dicts in the standard Apify format. Each dict must
85+
contain either a `url` key or a `requestsFromUrl` key. If `None` or empty, an empty request list
86+
is returned.
87+
http_client: HTTP client used to fetch remote URL lists (entries with `requestsFromUrl`). Defaults to
88+
`ImpitHttpClient` if not provided.
6189
6290
Returns:
63-
RequestList created from request_list_sources_input.
64-
65-
### Usage
66-
67-
```python
68-
example_input = [
69-
# Gather urls from response body.
70-
{'requestsFromUrl': 'https://crawlee.dev/file.txt', 'method': 'GET'},
71-
# Directly include this url.
72-
{'url': 'https://crawlee.dev', 'method': 'GET'}
73-
]
74-
request_list = await RequestList.open(request_list_sources_input=example_input)
75-
```
91+
A new `ApifyRequestList` populated with the resolved requests.
7692
"""
7793
request_list_sources_input = request_list_sources_input or []
7894

@@ -95,11 +111,11 @@ async def _fetch_requests_from_url(
95111
remote_url_requests_inputs: list[_RequestsFromUrlInput],
96112
http_client: HttpClient,
97113
) -> list[Request]:
98-
"""Create list of requests from url.
114+
"""Fetch all remote URL sources concurrently and return the extracted requests.
99115
100-
Send GET requests to urls defined in each requests_from_url of remote_url_requests_inputs. Extract links from
101-
each response body using URL_NO_COMMAS_REGEX regex. Create list of Requests from collected links and additional
102-
inputs stored in other attributes of each remote_url_requests_inputs.
116+
For each entry, a GET request is sent to the `requests_from_url` URL. All URLs matching `URL_NO_COMMAS_REGEX`
117+
are extracted from the response body and turned into `Request` objects, inheriting `method`, `payload`,
118+
`headers`, and `user_data` from the source entry.
103119
"""
104120
tasks = [cls._process_remote_url(request_input, http_client) for request_input in remote_url_requests_inputs]
105121
results = await asyncio.gather(*tasks)

0 commit comments

Comments (0)