Skip to content

Commit a603bcb

Browse files
committed
feat: Add support for NDU storages
1 parent 6e4f55d commit a603bcb

File tree

13 files changed

+528
-126
lines changed

13 files changed

+528
-126
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ keywords = [
3636
dependencies = [
3737
"apify-client>=2.0.0,<3.0.0",
3838
"apify-shared>=2.0.0,<3.0.0",
39-
"crawlee==1.0.0rc1",
39+
"crawlee==0.6.13b37",
4040
"cachetools>=5.5.0",
4141
"cryptography>=42.0.0",
4242
"impit>=0.5.3",

src/apify/_actor.py

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,7 @@ async def open_dataset(
401401
self,
402402
*,
403403
id: str | None = None,
404+
alias: str | None = None,
404405
name: str | None = None,
405406
force_cloud: bool = False,
406407
) -> Dataset:
@@ -411,10 +412,9 @@ async def open_dataset(
411412
the Apify cloud.
412413
413414
Args:
414-
id: ID of the dataset to be opened. If neither `id` nor `name` are provided, the method returns
415-
the default dataset associated with the Actor run.
416-
name: Name of the dataset to be opened. If neither `id` nor `name` are provided, the method returns
417-
the default dataset associated with the Actor run.
415+
id: The ID of the dataset to open. If provided, searches for an existing dataset by ID.
416+
name: The name of the dataset for named storages. Mutually exclusive with alias.
417+
alias: The alias of the dataset for unnamed storages. Mutually exclusive with name.
418418
force_cloud: If set to `True` then the Apify cloud storage is always used. This way it is possible
419419
to combine local and cloud storage.
420420
@@ -428,6 +428,7 @@ async def open_dataset(
428428

429429
return await Dataset.open(
430430
id=id,
431+
alias=alias,
431432
name=name,
432433
configuration=self._configuration,
433434
storage_client=storage_client,
@@ -437,6 +438,7 @@ async def open_key_value_store(
437438
self,
438439
*,
439440
id: str | None = None,
441+
alias: str | None = None,
440442
name: str | None = None,
441443
force_cloud: bool = False,
442444
) -> KeyValueStore:
@@ -446,10 +448,9 @@ async def open_key_value_store(
446448
and retrieved using a unique key. The actual data is stored either on a local filesystem or in the Apify cloud.
447449
448450
Args:
449-
id: ID of the key-value store to be opened. If neither `id` nor `name` are provided, the method returns
450-
the default key-value store associated with the Actor run.
451-
name: Name of the key-value store to be opened. If neither `id` nor `name` are provided, the method
452-
returns the default key-value store associated with the Actor run.
451+
id: The ID of the KVS to open. If provided, searches for an existing KVS by ID.
452+
name: The name of the KVS for named storages. Mutually exclusive with alias.
453+
alias: The alias of the KVS for unnamed storages. Mutually exclusive with name.
453454
force_cloud: If set to `True` then the Apify cloud storage is always used. This way it is possible
454455
to combine local and cloud storage.
455456
@@ -462,6 +463,7 @@ async def open_key_value_store(
462463

463464
return await KeyValueStore.open(
464465
id=id,
466+
alias=alias,
465467
name=name,
466468
configuration=self._configuration,
467469
storage_client=storage_client,
@@ -471,6 +473,7 @@ async def open_request_queue(
471473
self,
472474
*,
473475
id: str | None = None,
476+
alias: str | None = None,
474477
name: str | None = None,
475478
force_cloud: bool = False,
476479
) -> RequestQueue:
@@ -482,10 +485,9 @@ async def open_request_queue(
482485
crawling orders.
483486
484487
Args:
485-
id: ID of the request queue to be opened. If neither `id` nor `name` are provided, the method returns
486-
the default request queue associated with the Actor run.
487-
name: Name of the request queue to be opened. If neither `id` nor `name` are provided, the method returns
488-
the default request queue associated with the Actor run.
488+
id: The ID of the request queue to open. If provided, searches for an existing request queue by ID.
489+
name: The name of the request queue for named storages. Mutually exclusive with alias.
490+
alias: The alias of the request queue for unnamed storages. Mutually exclusive with name.
489491
force_cloud: If set to `True` then the Apify cloud storage is always used. This way it is possible
490492
to combine local and cloud storage.
491493
@@ -499,6 +501,7 @@ async def open_request_queue(
499501

500502
return await RequestQueue.open(
501503
id=id,
504+
alias=alias,
502505
name=name,
503506
configuration=self._configuration,
504507
storage_client=storage_client,

src/apify/events/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
from crawlee.events import EventManager, LocalEventManager
1+
from crawlee.events import Event, EventManager, LocalEventManager
22

33
from ._apify_event_manager import ApifyEventManager
44

5-
__all__ = ['ApifyEventManager', 'EventManager', 'LocalEventManager']
5+
__all__ = ['ApifyEventManager', 'Event', 'EventManager', 'LocalEventManager']

src/apify/storage_clients/_apify/_dataset_client.py

Lines changed: 35 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
from crawlee.storage_clients._base import DatasetClient
1313
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
1414

15+
from ._utils import resolve_alias_to_id, store_alias_mapping
16+
1517
if TYPE_CHECKING:
1618
from collections.abc import AsyncIterator
1719

@@ -66,6 +68,7 @@ async def open(
6668
*,
6769
id: str | None,
6870
name: str | None,
71+
alias: str | None,
6972
configuration: Configuration,
7073
) -> ApifyDatasetClient:
7174
"""Open an Apify dataset client.
@@ -75,21 +78,27 @@ async def open(
7578
7679
Args:
7780
id: The ID of an existing dataset to open. If provided, the client will connect to this specific storage.
78-
Cannot be used together with `name`.
81+
Cannot be used together with `name` or `alias`.
7982
name: The name of a dataset to get or create. If a storage with this name exists, it will be opened;
80-
otherwise, a new one will be created. Cannot be used together with `id`.
83+
otherwise, a new one will be created. Cannot be used together with `id` or `alias`.
84+
alias: The alias of a dataset for unnamed storages. If a storage with this alias exists, it will be
85+
opened; otherwise, a new one will be created and the alias will be saved. Cannot be used together
86+
with `id` or `name`.
8187
configuration: The configuration object containing API credentials and settings. Must include a valid
8288
`token` and `api_base_url`. May also contain a `default_dataset_id` for fallback when neither
83-
`id` nor `name` is provided.
89+
`id`, `name`, nor `alias` is provided.
8490
8591
Returns:
8692
An instance for the opened or created storage client.
8793
8894
Raises:
89-
ValueError: If the configuration is missing required fields (token, api_base_url), if both `id` and `name`
90-
are provided, or if neither `id` nor `name` is provided and no default storage ID is available in
91-
the configuration.
95+
ValueError: If the configuration is missing required fields (token, api_base_url), if more than one of
96+
`id`, `name`, or `alias` is provided, or if none are provided and no default storage ID is available
97+
in the configuration.
9298
"""
99+
if sum(1 for param in [id, name, alias] if param is not None) > 1:
100+
raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
101+
93102
token = configuration.token
94103
if not token:
95104
raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).')
@@ -115,27 +124,32 @@ async def open(
115124
)
116125
apify_datasets_client = apify_client_async.datasets()
117126

118-
# If both id and name are provided, raise an error.
119-
if id and name:
120-
raise ValueError('Only one of "id" or "name" can be specified, not both.')
121-
122-
# If id is provided, get the storage by ID.
123-
if id and name is None:
124-
apify_dataset_client = apify_client_async.dataset(dataset_id=id)
127+
# Handle alias resolution
128+
if alias:
129+
# Try to resolve alias to existing storage ID
130+
resolved_id = await resolve_alias_to_id(alias, 'dataset', configuration)
131+
if resolved_id:
132+
id = resolved_id
133+
else:
134+
# Create a new storage and store the alias mapping
135+
new_storage_metadata = DatasetMetadata.model_validate(
136+
await apify_datasets_client.get_or_create(),
137+
)
138+
id = new_storage_metadata.id
139+
await store_alias_mapping(alias, 'dataset', id, configuration)
125140

126141
# If name is provided, get or create the storage by name.
127-
if name and id is None:
142+
elif name:
128143
id = DatasetMetadata.model_validate(
129144
await apify_datasets_client.get_or_create(name=name),
130145
).id
131-
apify_dataset_client = apify_client_async.dataset(dataset_id=id)
132146

133-
# If both id and name are None, try to get the default storage ID from environment variables.
134-
# The default storage ID environment variable is set by the Apify platform. It also contains
135-
# a new storage ID after Actor's reboot or migration.
136-
if id is None and name is None:
147+
# If none are provided, try to get the default storage ID from environment variables.
148+
elif id is None:
137149
id = configuration.default_dataset_id
138-
apify_dataset_client = apify_client_async.dataset(dataset_id=id)
150+
151+
# Now create the client for the determined ID
152+
apify_dataset_client = apify_client_async.dataset(dataset_id=id)
139153

140154
# Fetch its metadata.
141155
metadata = await apify_dataset_client.get()
@@ -150,7 +164,7 @@ async def open(
150164
# Verify that the storage exists by fetching its metadata again.
151165
metadata = await apify_dataset_client.get()
152166
if metadata is None:
153-
raise ValueError(f'Opening dataset with id={id} and name={name} failed.')
167+
raise ValueError(f'Opening dataset with id={id}, name={name}, and alias={alias} failed.')
154168

155169
return cls(
156170
api_client=apify_dataset_client,

src/apify/storage_clients/_apify/_key_value_store_client.py

Lines changed: 33 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from crawlee.storage_clients.models import KeyValueStoreRecord, KeyValueStoreRecordMetadata
1313

1414
from ._models import ApifyKeyValueStoreMetadata, KeyValueStoreListKeysPage
15+
from ._utils import resolve_alias_to_id, store_alias_mapping
1516
from apify._crypto import create_hmac_signature
1617

1718
if TYPE_CHECKING:
@@ -58,6 +59,7 @@ async def open(
5859
*,
5960
id: str | None,
6061
name: str | None,
62+
alias: str | None,
6163
configuration: Configuration,
6264
) -> ApifyKeyValueStoreClient:
6365
"""Open an Apify key-value store client.
@@ -67,21 +69,27 @@ async def open(
6769
6870
Args:
6971
id: The ID of an existing key-value store to open. If provided, the client will connect to this specific
70-
storage. Cannot be used together with `name`.
72+
storage. Cannot be used together with `name` or `alias`.
7173
name: The name of a key-value store to get or create. If a storage with this name exists, it will be
72-
opened; otherwise, a new one will be created. Cannot be used together with `id`.
74+
opened; otherwise, a new one will be created. Cannot be used together with `id` or `alias`.
75+
alias: The alias of a key-value store for unnamed storages. If a storage with this alias exists, it will
76+
be opened; otherwise, a new one will be created and the alias will be saved. Cannot be used together
77+
with `id` or `name`.
7378
configuration: The configuration object containing API credentials and settings. Must include a valid
7479
`token` and `api_base_url`. May also contain a `default_key_value_store_id` for fallback when
75-
neither `id` nor `name` is provided.
80+
neither `id`, `name`, nor `alias` is provided.
7681
7782
Returns:
7883
An instance for the opened or created storage client.
7984
8085
Raises:
81-
ValueError: If the configuration is missing required fields (token, api_base_url), if both `id` and `name`
82-
are provided, or if neither `id` nor `name` is provided and no default storage ID is available
86+
ValueError: If the configuration is missing required fields (token, api_base_url), if more than one of
87+
`id`, `name`, or `alias` is provided, or if none are provided and no default storage ID is available
8388
in the configuration.
8489
"""
90+
if sum(1 for param in [id, name, alias] if param is not None) > 1:
91+
raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
92+
8593
token = configuration.token
8694
if not token:
8795
raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).')
@@ -107,27 +115,32 @@ async def open(
107115
)
108116
apify_kvss_client = apify_client_async.key_value_stores()
109117

110-
# If both id and name are provided, raise an error.
111-
if id and name:
112-
raise ValueError('Only one of "id" or "name" can be specified, not both.')
113-
114-
# If id is provided, get the storage by ID.
115-
if id and name is None:
116-
apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=id)
118+
# Handle alias resolution
119+
if alias:
120+
# Try to resolve alias to existing storage ID
121+
resolved_id = await resolve_alias_to_id(alias, 'key_value_store', configuration)
122+
if resolved_id:
123+
id = resolved_id
124+
else:
125+
# Create a new storage and store the alias mapping
126+
new_storage_metadata = ApifyKeyValueStoreMetadata.model_validate(
127+
await apify_kvss_client.get_or_create(),
128+
)
129+
id = new_storage_metadata.id
130+
await store_alias_mapping(alias, 'key_value_store', id, configuration)
117131

118132
# If name is provided, get or create the storage by name.
119-
if name and id is None:
133+
elif name:
120134
id = ApifyKeyValueStoreMetadata.model_validate(
121135
await apify_kvss_client.get_or_create(name=name),
122136
).id
123-
apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=id)
124137

125-
# If both id and name are None, try to get the default storage ID from environment variables.
126-
# The default storage ID environment variable is set by the Apify platform. It also contains
127-
# a new storage ID after Actor's reboot or migration.
128-
if id is None and name is None:
138+
# If none are provided, try to get the default storage ID from environment variables.
139+
elif id is None:
129140
id = configuration.default_key_value_store_id
130-
apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=id)
141+
142+
# Now create the client for the determined ID
143+
apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=id)
131144

132145
# Fetch its metadata.
133146
metadata = await apify_kvs_client.get()
@@ -142,7 +155,7 @@ async def open(
142155
# Verify that the storage exists by fetching its metadata again.
143156
metadata = await apify_kvs_client.get()
144157
if metadata is None:
145-
raise ValueError(f'Opening key-value store with id={id} and name={name} failed.')
158+
raise ValueError(f'Opening key-value store with id={id}, name={name}, and alias={alias} failed.')
146159

147160
return cls(
148161
api_client=apify_kvs_client,

0 commit comments

Comments
 (0)