-
Notifications
You must be signed in to change notification settings - Fork 711
Expand file tree
/
Copy path_key_value_store.py
More file actions
338 lines (268 loc) · 11.2 KB
/
_key_value_store.py
File metadata and controls
338 lines (268 loc) · 11.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
from __future__ import annotations
import asyncio
from collections.abc import AsyncIterator
from datetime import timedelta
from logging import getLogger
from typing import TYPE_CHECKING, Any, ClassVar, TypeVar, overload
from pydantic import RootModel
from typing_extensions import override
from crawlee import service_locator
from crawlee._types import JsonSerializable # noqa: TC001
from crawlee._utils.docs import docs_group
from crawlee._utils.recoverable_state import RecoverableState
from crawlee.errors import StorageWriteError
from crawlee.storage_clients.models import KeyValueStoreMetadata
from ._base import Storage
from ._utils import validate_storage_name
if TYPE_CHECKING:
from collections.abc import AsyncIterator
from crawlee.configuration import Configuration
from crawlee.storage_clients import StorageClient
from crawlee.storage_clients._base import KeyValueStoreClient
from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecordMetadata
else:
from crawlee._utils.recoverable_state import RecoverableState
# Generic type variable used by the `KeyValueStore.get_value` overloads so that the
# return type follows the type of the supplied `default_value`.
T = TypeVar('T')

# Module-level logger named after this module.
logger = getLogger(__name__)
class AutosavedValue(RootModel):
    """Pydantic root model whose root value is a dictionary of JSON-serializable values.

    It is the state type of the `RecoverableState` objects that `KeyValueStore` keeps
    in its cache of auto-saved values.
    """

    root: dict[str, JsonSerializable]
@docs_group('Storages')
class KeyValueStore(Storage):
    """Key-value store is a storage for reading and writing data records with unique key identifiers.

    The key-value store class acts as a high-level interface for storing, retrieving, and managing data records
    identified by unique string keys. It abstracts away the underlying storage implementation details,
    allowing you to work with the same API regardless of whether data is stored in memory, on disk,
    or in the cloud.

    Each data record is associated with a specific MIME content type, allowing storage of various
    data formats such as JSON, text, images, HTML snapshots or any binary data. This class is
    commonly used to store inputs, outputs, and other artifacts of crawler operations.

    You can instantiate a key-value store using the `open` class method, which will create a store
    with the specified name or id. The underlying storage implementation is determined by the configured
    storage client.

    ### Usage

    ```python
    from crawlee.storages import KeyValueStore

    # Open a named key-value store
    kvs = await KeyValueStore.open(name='my-store')

    # Store and retrieve data (the same key is used for writing and reading)
    await kvs.set_value('product-1234', [{'name': 'Smartphone', 'price': 799.99}])
    product = await kvs.get_value('product-1234')
    ```
    """

    _autosaved_values: ClassVar[
        dict[
            str,
            dict[str, RecoverableState[AutosavedValue]],
        ]
    ] = {}
    """Cache for recoverable (auto-saved) values."""

    def __init__(self, client: KeyValueStoreClient, id: str, name: str | None) -> None:
        """Initialize a new instance.

        Preferably use the `KeyValueStore.open` constructor to create a new instance.

        Args:
            client: An instance of a storage client.
            id: The unique identifier of the storage.
            name: The name of the storage, if available.
        """
        validate_storage_name(name)

        self._client = client
        self._id = id
        self._name = name

        self._autosave_lock = asyncio.Lock()
        """Lock for autosaving values to prevent concurrent modifications."""

    @property
    @override
    def id(self) -> str:
        return self._id

    @property
    @override
    def name(self) -> str | None:
        return self._name

    @override
    async def get_metadata(self) -> KeyValueStoreMetadata:
        return await self._client.get_metadata()

    @override
    @classmethod
    async def open(
        cls,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
        storage_client: StorageClient | None = None,
    ) -> KeyValueStore:
        # Fall back to the globally registered services when none are passed explicitly.
        configuration = service_locator.get_configuration() if configuration is None else configuration
        storage_client = service_locator.get_storage_client() if storage_client is None else storage_client

        # The coroutine is handed over un-awaited; the storage instance manager receives it
        # together with the cache key below.
        client_opener_coro = storage_client.create_kvs_client(
            id=id, name=name, alias=alias, configuration=configuration
        )
        additional_cache_key = storage_client.get_storage_client_cache_key(configuration=configuration)

        return await service_locator.storage_instance_manager.open_storage_instance(
            cls,
            id=id,
            name=name,
            alias=alias,
            client_opener_coro=client_opener_coro,
            storage_client_cache_key=additional_cache_key,
        )

    @override
    async def drop(self) -> None:
        storage_instance_manager = service_locator.storage_instance_manager
        storage_instance_manager.remove_from_cache(self)

        await self._clear_cache()  # Clear cache with persistent values.
        await self._client.drop()

    @override
    async def purge(self) -> None:
        await self._client.purge()

    @overload
    async def get_value(self, key: str) -> Any: ...

    @overload
    async def get_value(self, key: str, default_value: T) -> T: ...

    @overload
    async def get_value(self, key: str, default_value: T | None = None) -> T | None: ...

    async def get_value(self, key: str, default_value: T | None = None) -> T | None:
        """Get a value from the KVS.

        Args:
            key: Key of the record to retrieve.
            default_value: Default value returned in case the record does not exist.

        Returns:
            The value associated with the given key. `default_value` is used in case the record does not exist.
        """
        record = await self._client.get_value(key=key)
        return record.value if record else default_value

    async def set_value(
        self,
        key: str,
        value: Any,
        content_type: str | None = None,
        *,
        max_attempts: int = 5,
        wait_time_between_retries: timedelta = timedelta(seconds=1),
    ) -> None:
        """Set a value in the KVS.

        The write is retried when the storage client raises `StorageWriteError`. If every attempt fails,
        a warning is logged and the method returns without raising.

        Args:
            key: Key of the record to set.
            value: Value to set.
            content_type: The MIME content type string.
            max_attempts: The maximum number of attempts to set the value in case of failure.
            wait_time_between_retries: Time to wait between retries.

        Raises:
            ValueError: If `max_attempts` is lower than 1.
        """
        if max_attempts < 1:
            raise ValueError('max_attempts must be at least 1')

        wait_time_between_retries_seconds = wait_time_between_retries.total_seconds()
        last_exception: StorageWriteError | None = None

        for attempt in range(1, max_attempts + 1):
            try:
                await self._client.set_value(key=key, value=value, content_type=content_type)
            except StorageWriteError as e:
                last_exception = e
                # No point in sleeping after the final attempt.
                if attempt < max_attempts:
                    await asyncio.sleep(wait_time_between_retries_seconds)
            else:
                return

        # Reached only when every attempt raised `StorageWriteError`.
        if last_exception is not None:
            logger.warning(
                f'Failed to set value for key "{key}" after {max_attempts} attempts '
                f'with error: {last_exception.cause}'
            )

    async def delete_value(self, key: str) -> None:
        """Delete a value from the KVS.

        Args:
            key: Key of the record to delete.
        """
        await self._client.delete_value(key=key)

    async def iterate_keys(
        self,
        exclusive_start_key: str | None = None,
        limit: int | None = None,
    ) -> AsyncIterator[KeyValueStoreRecordMetadata]:
        """Iterate over the existing keys in the KVS.

        Args:
            exclusive_start_key: Key to start the iteration from.
            limit: Maximum number of keys to return. None means no limit.

        Yields:
            Information about the key.
        """
        async for item in self._client.iterate_keys(
            exclusive_start_key=exclusive_start_key,
            limit=limit,
        ):
            yield item

    async def list_keys(
        self,
        exclusive_start_key: str | None = None,
        limit: int = 1000,
    ) -> list[KeyValueStoreRecordMetadata]:
        """List all the existing keys in the KVS.

        It uses client's `iterate_keys` method to get the keys.

        Args:
            exclusive_start_key: Key to start the iteration from.
            limit: Maximum number of keys to return.

        Returns:
            A list of keys in the KVS.
        """
        return [
            key
            async for key in self._client.iterate_keys(
                exclusive_start_key=exclusive_start_key,
                limit=limit,
            )
        ]

    async def record_exists(self, key: str) -> bool:
        """Check if a record with the given key exists in the key-value store.

        Args:
            key: Key of the record to check for existence.

        Returns:
            True if a record with the given key exists, False otherwise.
        """
        return await self._client.record_exists(key=key)

    async def get_public_url(self, key: str) -> str:
        """Get the public URL for the given key.

        Args:
            key: Key of the record for which URL is required.

        Returns:
            The public URL for the given key.
        """
        return await self._client.get_public_url(key=key)

    async def get_auto_saved_value(
        self,
        key: str,
        default_value: dict[str, JsonSerializable] | None = None,
    ) -> dict[str, JsonSerializable]:
        """Get a value from KVS that will be automatically saved on changes.

        Args:
            key: Key of the record, to store the value.
            default_value: Value to be used if the record does not exist yet. Should be a dictionary.
                It is ignored when a value for `key` is already cached for this store.

        Returns:
            The dictionary held by the cached recoverable state for `key`.
        """
        default_value = {} if default_value is None else default_value

        async with self._autosave_lock:
            cache = self._autosaved_values.setdefault(self.id, {})

            if key in cache:
                return cache[key].current_value.root

            async def kvs_factory() -> KeyValueStore:
                return self

            cache[key] = recoverable_state = RecoverableState(
                default_state=AutosavedValue(default_value),
                persist_state_key=key,
                persistence_enabled=True,
                persist_state_kvs_factory=kvs_factory,
                logger=logger,
            )

            await recoverable_state.initialize()

        return recoverable_state.current_value.root

    async def persist_autosaved_values(self) -> None:
        """Force autosaved values to be saved without waiting for an event in Event Manager."""
        cache = self._autosaved_values.get(self.id)
        if not cache:
            return

        # Iterate over a snapshot: `get_auto_saved_value` may insert into the cache while we are
        # suspended in `persist_state`, and mutating a dict during iteration raises `RuntimeError`.
        for state in list(cache.values()):
            await state.persist_state()

    async def _clear_cache(self) -> None:
        """Clear cache with autosaved values."""
        # Remove the per-store entry from the class-level cache so dropped stores do not leave
        # empty dictionaries behind for the lifetime of the process.
        cache = self._autosaved_values.pop(self.id, None)
        if not cache:
            return

        # Snapshot for the same reason as in `persist_autosaved_values`: awaiting inside the loop
        # must not be affected by concurrent modifications of the dictionary.
        for state in list(cache.values()):
            await state.teardown()

        cache.clear()