Skip to content

Commit a463739

Browse files
committed
Remove helper methods as suggested
1 parent 0f90efc commit a463739

7 files changed

Lines changed: 35 additions & 110 deletions

File tree

docs/upgrading/upgrading_to_v1.md

Lines changed: 4 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -214,50 +214,13 @@ service_locator.set_storage_client(MemoryStorageClient()) # Raises an error
214214

215215
## BasicCrawler changes
216216

217-
### Renamed methods for opening storages
218-
- `BasicCrawler.get_dataset` renamed to `BasicCrawler.open_dataset`
219-
- `BasicCrawler.get_key_value_store` renamed to `BasicCrawler.open_key_value_store`
217+
### Removed helper methods for opening storages
218+
- `BasicCrawler.get_dataset` was removed.
219+
- `BasicCrawler.get_key_value_store` was removed.
220220

221-
### Added method for opening RequestQueue that uses configuration and storage client of the crawler
222-
- `BasicCrawler.open_request_queue`
223221

224222
### BasicCrawler has its own instance of ServiceLocator to track its own services
225-
Explicitly passed services to the crawler can be different from the global ones accessible in `crawlee.service_locator`. `BasicCrawler` no longer causes the global services in `service_locator` to be set to the crawler's explicitly passed services.
226-
227-
**Before (v0.6):**
228-
229-
```python
230-
from crawlee import service_locator
231-
from crawlee.crawlers import BasicCrawler
232-
from crawlee.storage_clients import MemoryStorageClient
233-
from crawlee.storages import Dataset
234-
235-
236-
async def main() -> None:
237-
custom_storage_client = MemoryStorageClient()
238-
crawler = BasicCrawler(storage_client=custom_storage_client)
239-
240-
assert service_locator.get_storage_client() is custom_storage_client
241-
assert await crawler.get_dataset() is await Dataset.open()
242-
```
243-
**Now (v1.0):**
244-
245-
```python
246-
from crawlee import service_locator
247-
from crawlee.crawlers import BasicCrawler
248-
from crawlee.storage_clients import MemoryStorageClient
249-
from crawlee.storages import Dataset
250-
251-
252-
async def main() -> None:
253-
custom_storage_client = MemoryStorageClient()
254-
crawler = BasicCrawler(storage_client=custom_storage_client)
255-
256-
assert service_locator.get_storage_client() is not custom_storage_client
257-
assert await crawler.open_dataset() is not await Dataset.open()
258-
```
259-
260-
This allows two crawlers with different services at the same time.
223+
Explicitly passed services to the crawler can be different from the global ones accessible in `crawlee.service_locator`. `BasicCrawler` no longer causes the global services in `service_locator` to be set to the crawler's explicitly passed services. This allows two crawlers with different services to run at the same time.
261224

262225
**Now (v1.0):**
263226

src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,7 @@ async def get_input_state(
291291
use_state_function = context.use_state
292292

293293
# New result is created and injected to newly created context. This is done to ensure isolation of sub crawlers.
294-
result = RequestHandlerRunResult(key_value_store_getter=self.open_key_value_store)
294+
result = RequestHandlerRunResult(key_value_store_getter=self._open_key_value_store)
295295
context_linked_to_result = BasicCrawlingContext(
296296
request=deepcopy(context.request),
297297
session=deepcopy(context.session),

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 20 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -563,42 +563,13 @@ async def _get_proxy_info(self, request: Request, session: Session | None) -> Pr
563563
async def get_request_manager(self) -> RequestManager:
564564
"""Return the configured request manager. If none is configured, open and return the default request queue."""
565565
if not self._request_manager:
566-
self._request_manager = await self.open_request_queue()
566+
self._request_manager = await RequestQueue.open(
567+
storage_client=self._service_locator.get_storage_client(),
568+
configuration=self._service_locator.get_configuration(),
569+
)
567570
return self._request_manager
568571

569-
async def open_request_queue(
570-
self,
571-
*,
572-
id: str | None = None,
573-
name: str | None = None,
574-
alias: str | None = None,
575-
) -> RequestQueue:
576-
"""Return `RequestQueue` with the given ID or name or alias. If none is provided, return the default one."""
577-
return await RequestQueue.open(
578-
id=id,
579-
name=name,
580-
alias=alias,
581-
storage_client=self._service_locator.get_storage_client(),
582-
configuration=self._service_locator.get_configuration(),
583-
)
584-
585-
async def open_dataset(
586-
self,
587-
*,
588-
id: str | None = None,
589-
name: str | None = None,
590-
alias: str | None = None,
591-
) -> Dataset:
592-
"""Return `Dataset` with the given ID or name or alias. If none is provided, return the default one."""
593-
return await Dataset.open(
594-
id=id,
595-
name=name,
596-
alias=alias,
597-
storage_client=self._service_locator.get_storage_client(),
598-
configuration=self._service_locator.get_configuration(),
599-
)
600-
601-
async def open_key_value_store(
572+
async def _open_key_value_store(
602573
self,
603574
*,
604575
id: str | None = None,
@@ -671,7 +642,10 @@ async def run(
671642
request_manager = await self.get_request_manager()
672643
if purge_request_queue and isinstance(request_manager, RequestQueue):
673644
await request_manager.drop()
674-
self._request_manager = await self.open_request_queue()
645+
self._request_manager = await RequestQueue.open(
646+
storage_client=self._service_locator.get_storage_client(),
647+
configuration=self._service_locator.get_configuration(),
648+
)
675649

676650
if requests is not None:
677651
await self.add_requests(requests)
@@ -805,11 +779,11 @@ async def _use_state(
805779
self,
806780
default_value: dict[str, JsonSerializable] | None = None,
807781
) -> dict[str, JsonSerializable]:
808-
kvs = await self.open_key_value_store()
782+
kvs = await self._open_key_value_store()
809783
return await kvs.get_auto_saved_value(self._CRAWLEE_STATE_KEY, default_value)
810784

811785
async def _save_crawler_state(self) -> None:
812-
store = await self.open_key_value_store()
786+
store = await self._open_key_value_store()
813787
await store.persist_autosaved_values()
814788

815789
async def get_data(
@@ -899,7 +873,13 @@ async def _push_data(
899873
dataset_alias: The alias of the `Dataset` (run scope, unnamed storage).
900874
kwargs: Keyword arguments to be passed to the `Dataset.push_data()` method.
901875
"""
902-
dataset = await self.open_dataset(id=dataset_id, name=dataset_name, alias=dataset_alias)
876+
dataset = await Dataset.open(
877+
id=dataset_id,
878+
name=dataset_name,
879+
alias=dataset_alias,
880+
storage_client=self._service_locator.get_storage_client(),
881+
configuration=self._service_locator.get_configuration(),
882+
)
903883
await dataset.push_data(data, **kwargs)
904884

905885
def _should_retry_request(self, context: BasicCrawlingContext, error: Exception) -> bool:
@@ -1281,7 +1261,7 @@ async def _commit_request_handler_result(self, context: BasicCrawlingContext) ->
12811261
for push_data_call in result.push_data_calls:
12821262
await self._push_data(**push_data_call)
12831263

1284-
await self._commit_key_value_store_changes(result, get_kvs=self.open_key_value_store)
1264+
await self._commit_key_value_store_changes(result, get_kvs=self._open_key_value_store)
12851265

12861266
@staticmethod
12871267
async def _commit_key_value_store_changes(
@@ -1348,7 +1328,7 @@ async def __run_task_function(self) -> None:
13481328
else:
13491329
session = await self._get_session()
13501330
proxy_info = await self._get_proxy_info(request, session)
1351-
result = RequestHandlerRunResult(key_value_store_getter=self.open_key_value_store)
1331+
result = RequestHandlerRunResult(key_value_store_getter=self._open_key_value_store)
13521332

13531333
context = BasicCrawlingContext(
13541334
request=request,

tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
AdaptiveContextError,
3131
)
3232
from crawlee.statistics import Statistics
33-
from crawlee.storages import KeyValueStore
33+
from crawlee.storages import Dataset, KeyValueStore
3434

3535
if TYPE_CHECKING:
3636
from collections.abc import AsyncGenerator, Iterator
@@ -461,7 +461,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
461461

462462
await crawler.run(test_urls[:1])
463463

464-
dataset = await crawler.open_dataset()
464+
dataset = await Dataset.open()
465465
stored_results = [item async for item in dataset.iterate_items()]
466466

467467
if error_in_pw_crawler:

tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 6 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -611,12 +611,6 @@ async def test_crawler_get_storages() -> None:
611611
rp = await crawler.get_request_manager()
612612
assert isinstance(rp, RequestQueue)
613613

614-
dataset = await crawler.open_dataset()
615-
assert isinstance(dataset, Dataset)
616-
617-
kvs = await crawler.open_key_value_store()
618-
assert isinstance(kvs, KeyValueStore)
619-
620614

621615
async def test_crawler_run_requests() -> None:
622616
crawler = BasicCrawler()
@@ -725,7 +719,7 @@ async def handler(context: BasicCrawlingContext) -> None:
725719

726720
await crawler.run(['https://hello.world'])
727721

728-
store = await crawler.open_key_value_store()
722+
store = await crawler._open_key_value_store()
729723
assert (await store.get_value('foo')) == 'bar'
730724

731725

@@ -738,7 +732,7 @@ async def handler(context: BasicCrawlingContext) -> None:
738732

739733
await crawler.run(['https://hello.world'])
740734

741-
kvs = await crawler.open_key_value_store()
735+
kvs = await crawler._open_key_value_store()
742736
value = await kvs.get_value(BasicCrawler._CRAWLEE_STATE_KEY)
743737

744738
assert value == {'hello': 'world'}
@@ -781,7 +775,7 @@ async def handler_three(context: BasicCrawlingContext) -> None:
781775
# The state in handler_three must match the final state updated in previous run
782776
assert state_in_handler_three == {'hello': 'last_world'}
783777

784-
store = await crawler.open_key_value_store()
778+
store = await crawler._open_key_value_store()
785779

786780
# The state in the KVS must match with the last set state
787781
assert (await store.get_value(BasicCrawler._CRAWLEE_STATE_KEY)) == {'hello': 'last_world'}
@@ -1103,14 +1097,10 @@ async def test_crawler_uses_default_storages(tmp_path: Path) -> None:
11031097
)
11041098
service_locator.set_configuration(configuration)
11051099

1106-
dataset = await Dataset.open()
1107-
kvs = await KeyValueStore.open()
11081100
rq = await RequestQueue.open()
11091101

11101102
crawler = BasicCrawler()
11111103

1112-
assert dataset is await crawler.open_dataset()
1113-
assert kvs is await crawler.open_key_value_store()
11141104
assert rq is await crawler.get_request_manager()
11151105

11161106

@@ -1121,14 +1111,10 @@ async def test_crawler_can_use_other_storages(tmp_path: Path) -> None:
11211111
)
11221112
service_locator.set_configuration(configuration)
11231113

1124-
dataset = await Dataset.open()
1125-
kvs = await KeyValueStore.open()
11261114
rq = await RequestQueue.open()
11271115

11281116
crawler = BasicCrawler(storage_client=MemoryStorageClient())
11291117

1130-
assert dataset is not await crawler.open_dataset()
1131-
assert kvs is not await crawler.open_key_value_store()
11321118
assert rq is not await crawler.get_request_manager()
11331119

11341120

@@ -1156,16 +1142,12 @@ async def test_crawler_can_use_other_storages_of_same_type(tmp_path: Path) -> No
11561142
service_locator.set_configuration(configuration_a)
11571143
service_locator.set_storage_client(FileSystemStorageClient())
11581144
# Create storages based on the global services
1159-
dataset = await Dataset.open()
1160-
kvs = await KeyValueStore.open()
11611145
rq = await RequestQueue.open()
11621146

11631147
# Set the crawler to use different storage client
11641148
crawler = BasicCrawler(storage_client=FileSystemStorageClient(), configuration=configuration_b)
11651149

11661150
# Assert that the storages are different
1167-
assert dataset is not await crawler.open_dataset()
1168-
assert kvs is not await crawler.open_key_value_store()
11691151
assert rq is not await crawler.get_request_manager()
11701152

11711153
# Assert that all storages exists on the filesystem
@@ -1193,7 +1175,7 @@ async def handler(context: BasicCrawlingContext) -> None:
11931175
await crawler.run(['https://does-not-matter.com'])
11941176
assert spy.call_count >= 1
11951177

1196-
dataset = await crawler.open_dataset()
1178+
dataset = await Dataset.open()
11971179
data = await dataset.get_data()
11981180
assert data.items == [{'foo': 'bar'}]
11991181

@@ -1208,7 +1190,7 @@ async def test_context_use_state_race_condition_in_handlers(key_value_store: Key
12081190
from asyncio import Barrier # type:ignore[attr-defined] # noqa: PLC0415
12091191

12101192
crawler = BasicCrawler()
1211-
store = await crawler.open_key_value_store()
1193+
store = await crawler._open_key_value_store()
12121194
await store.set_value(BasicCrawler._CRAWLEE_STATE_KEY, {'counter': 0})
12131195
handler_barrier = Barrier(2)
12141196

@@ -1221,7 +1203,7 @@ async def handler(context: BasicCrawlingContext) -> None:
12211203

12221204
await crawler.run(['https://crawlee.dev/', 'https://crawlee.dev/docs/quick-start'])
12231205

1224-
store = await crawler.open_key_value_store()
1206+
store = await crawler._open_key_value_store()
12251207
# Ensure that local state is pushed back to kvs.
12261208
await store.persist_autosaved_values()
12271209
assert (await store.get_value(BasicCrawler._CRAWLEE_STATE_KEY))['counter'] == 2

tests/unit/crawlers/_http/test_http_crawler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -562,7 +562,7 @@ async def request_handler(context: HttpCrawlingContext) -> None:
562562

563563
await crawler.run([str(server_url)])
564564

565-
kvs = await crawler.open_key_value_store()
565+
kvs = await crawler._open_key_value_store()
566566
kvs_content = {}
567567
async for key_info in kvs.iterate_keys():
568568
kvs_content[key_info.key] = await kvs.get_value(key_info.key)

tests/unit/crawlers/_playwright/test_playwright_crawler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -627,7 +627,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
627627
[str(server_url), str(server_url / 'page_1'), str(server_url / 'page_2'), str(server_url / 'headers')]
628628
)
629629

630-
kvs = await crawler.open_key_value_store()
630+
kvs = await crawler._open_key_value_store()
631631
kvs_content = {}
632632

633633
async for key_info in kvs.iterate_keys():

0 commit comments

Comments
 (0)