Skip to content

Commit c5cf64e

Browse files
authored
Refactor initialization of storages & memory resource clients (#143)
### Description - Refactor initialization of storages & memory resource clients. - `BaseStorage` and `BaseResourceClients` are only ABC with abstract methods; all the creation-related code is moved to a separate module. - I created `_creation_management.py` helper modules in both `storages/` and `memory_storage_client/`. - I had to move `crawlee/storages/models.py` to `crawlee/models.py` because of the import loops, I merged it with `request.py`. I am open to other ideas in this regard. ### Related issues - #85 ### Testing - Tests pass
1 parent 0fa2317 commit c5cf64e

48 files changed

Lines changed: 1578 additions & 1279 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

src/crawlee/_utils/env_vars.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from __future__ import annotations
22

33
import os
4-
from enum import Enum
54
from typing import Any, Literal, get_args
65

76
from crawlee._utils.data_processing import (
@@ -12,15 +11,6 @@
1211
maybe_parse_int,
1312
)
1413

15-
16-
class CrawleeEnvVars(str, Enum):
17-
"""Enum for the environment variables used by Crawlee."""
18-
19-
LOCAL_STORAGE_DIR = 'CRAWLEE_LOCAL_STORAGE_DIR'
20-
PERSIST_STORAGE = 'CRAWLEE_PERSIST_STORAGE'
21-
PURGE_ON_START = 'CRAWLEE_PURGE_ON_START'
22-
23-
2414
INTEGER_ENV_VARS_TYPE = Literal[None]
2515

2616
INTEGER_ENV_VARS: list[INTEGER_ENV_VARS_TYPE] = list(get_args(INTEGER_ENV_VARS_TYPE))

src/crawlee/_utils/file.py

Lines changed: 1 addition & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,13 @@
55
import io
66
import json
77
import mimetypes
8-
import os
98
import re
109
from enum import Enum
1110
from typing import Any
1211

13-
import aiofiles
1412
import aioshutil
1513
from aiofiles import ospath
16-
from aiofiles.os import makedirs, remove, rename
14+
from aiofiles.os import remove, rename
1715

1816

1917
class ContentType(Enum):
@@ -108,29 +106,3 @@ async def json_dumps(obj: Any) -> str:
108106
A string containing the JSON representation of the input object.
109107
"""
110108
return await asyncio.to_thread(json.dumps, obj, ensure_ascii=False, indent=2, default=str)
111-
112-
113-
async def persist_metadata_if_enabled(*, data: dict, entity_directory: str, write_metadata: bool) -> None:
114-
"""Updates or writes metadata to a specified directory.
115-
116-
The function writes a given metadata dictionary to a JSON file within a specified directory.
117-
The writing process is skipped if `write_metadata` is False. Before writing, it ensures that
118-
the target directory exists, creating it if necessary.
119-
120-
Args:
121-
data: A dictionary containing metadata to be written.
122-
entity_directory: The directory path where the metadata file should be stored.
123-
write_metadata: A boolean flag indicating whether the metadata should be written to file.
124-
"""
125-
# Skip metadata write; ensure directory exists first
126-
if not write_metadata:
127-
return
128-
129-
# Ensure the directory for the entity exists
130-
await makedirs(entity_directory, exist_ok=True)
131-
132-
# Write the metadata to the file
133-
file_path = os.path.join(entity_directory, '__metadata__.json')
134-
async with aiofiles.open(file_path, mode='wb') as f:
135-
s = await json_dumps(data)
136-
await f.write(s.encode('utf-8'))

src/crawlee/base_storage_client/base_dataset_client.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from typing import TYPE_CHECKING, AsyncIterator
55

66
if TYPE_CHECKING:
7-
from crawlee.storages.models import DatasetItemsListPage, DatasetMetadata
7+
from crawlee.models import DatasetItemsListPage, DatasetMetadata
88
from crawlee.types import JSONSerializable
99

1010

@@ -128,16 +128,16 @@ async def get_items_as_bytes(
128128
item_format: str = 'json',
129129
offset: int | None = None,
130130
limit: int | None = None,
131-
desc: bool | None = None,
132-
clean: bool | None = None,
133-
bom: bool | None = None,
131+
desc: bool = False,
132+
clean: bool = False,
133+
bom: bool = False,
134134
delimiter: str | None = None,
135135
fields: list[str] | None = None,
136136
omit: list[str] | None = None,
137137
unwind: str | None = None,
138-
skip_empty: bool | None = None,
139-
skip_header_row: bool | None = None,
140-
skip_hidden: bool | None = None,
138+
skip_empty: bool = False,
139+
skip_header_row: bool = False,
140+
skip_hidden: bool = False,
141141
xml_root: str | None = None,
142142
xml_row: str | None = None,
143143
flatten: list[str] | None = None,
@@ -173,16 +173,16 @@ async def stream_items(
173173
item_format: str = 'json',
174174
offset: int | None = None,
175175
limit: int | None = None,
176-
desc: bool | None = None,
177-
clean: bool | None = None,
178-
bom: bool | None = None,
176+
desc: bool = False,
177+
clean: bool = False,
178+
bom: bool = False,
179179
delimiter: str | None = None,
180180
fields: list[str] | None = None,
181181
omit: list[str] | None = None,
182182
unwind: str | None = None,
183-
skip_empty: bool | None = None,
184-
skip_header_row: bool | None = None,
185-
skip_hidden: bool | None = None,
183+
skip_empty: bool = False,
184+
skip_header_row: bool = False,
185+
skip_hidden: bool = False,
186186
xml_root: str | None = None,
187187
xml_row: str | None = None,
188188
) -> AsyncIterator[dict]:

src/crawlee/base_storage_client/base_dataset_collection_client.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from typing import TYPE_CHECKING
55

66
if TYPE_CHECKING:
7-
from crawlee.storages.models import DatasetListPage, DatasetMetadata
7+
from crawlee.models import DatasetListPage, DatasetMetadata
88

99

1010
class BaseDatasetCollectionClient(ABC):
@@ -40,10 +40,10 @@ async def get_or_create(
4040
async def list(
4141
self,
4242
*,
43-
unnamed: bool | None = None,
43+
unnamed: bool = False,
4444
limit: int | None = None,
4545
offset: int | None = None,
46-
desc: bool | None = None,
46+
desc: bool = False,
4747
) -> DatasetListPage:
4848
"""List the available datasets.
4949

src/crawlee/base_storage_client/base_key_value_store_client.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from typing import TYPE_CHECKING, Any, AsyncIterator
55

66
if TYPE_CHECKING:
7-
from crawlee.storages.models import KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord
7+
from crawlee.models import KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord
88

99

1010
class BaseKeyValueStoreClient(ABC):

src/crawlee/base_storage_client/base_key_value_store_collection_client.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from typing import TYPE_CHECKING
55

66
if TYPE_CHECKING:
7-
from crawlee.storages.models import KeyValueStoreListPage, KeyValueStoreMetadata
7+
from crawlee.models import KeyValueStoreListPage, KeyValueStoreMetadata
88

99

1010
class BaseKeyValueStoreCollectionClient(ABC):
@@ -40,10 +40,10 @@ async def get_or_create(
4040
async def list(
4141
self,
4242
*,
43-
unnamed: bool | None = None,
43+
unnamed: bool = False,
4444
limit: int | None = None,
4545
offset: int | None = None,
46-
desc: bool | None = None,
46+
desc: bool = False,
4747
) -> KeyValueStoreListPage:
4848
"""List the available key-value stores.
4949

src/crawlee/base_storage_client/base_request_queue_client.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@
44
from typing import TYPE_CHECKING
55

66
if TYPE_CHECKING:
7-
from crawlee.request import Request
8-
from crawlee.storages.models import RequestQueueHead, RequestQueueMetadata, RequestQueueOperationInfo
7+
from crawlee.models import Request, RequestQueueHead, RequestQueueMetadata, RequestQueueOperationInfo
98

109

1110
class BaseRequestQueueClient(ABC):

src/crawlee/base_storage_client/base_request_queue_collection_client.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from typing import TYPE_CHECKING
55

66
if TYPE_CHECKING:
7-
from crawlee.storages.models import RequestQueueListPage, RequestQueueMetadata
7+
from crawlee.models import RequestQueueListPage, RequestQueueMetadata
88

99

1010
class BaseRequestQueueCollectionClient(ABC):
@@ -40,10 +40,10 @@ async def get_or_create(
4040
async def list(
4141
self,
4242
*,
43-
unnamed: bool | None = None,
43+
unnamed: bool = False,
4444
limit: int | None = None,
4545
offset: int | None = None,
46-
desc: bool | None = None,
46+
desc: bool = False,
4747
) -> RequestQueueListPage:
4848
"""List the available request queues.
4949

src/crawlee/base_storage_client/base_storage_client.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,12 @@ def request_queue(self, id: str) -> BaseRequestQueueClient:
4444
@abstractmethod
4545
def request_queues(self) -> BaseRequestQueueCollectionClient:
4646
"""Gets a subclient for request queue collection operations."""
47+
48+
@abstractmethod
49+
async def purge_on_start(self) -> None:
50+
"""Performs a purge of the default storages.
51+
52+
This method ensures that the purge is executed only once during the lifetime of the instance.
53+
It is primarily used to clean up residual data from previous runs to maintain a clean state.
54+
If the storage client does not support purging, leave it empty.
55+
"""

src/crawlee/basic_crawler/basic_crawler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
from crawlee.enqueue_strategy import EnqueueStrategy
3939
from crawlee.events.local_event_manager import LocalEventManager
4040
from crawlee.http_clients.httpx_client import HttpxClient
41-
from crawlee.request import BaseRequestData, Request, RequestState
41+
from crawlee.models import BaseRequestData, Request, RequestState
4242
from crawlee.sessions import SessionPool
4343
from crawlee.storages.request_queue import RequestQueue
4444

0 commit comments

Comments (0)