@@ -810,36 +810,39 @@ async def handler(context: BasicCrawlingContext) -> None:
     await crawler.run(['https://hello.world'])

     kvs = await crawler.get_key_value_store()
-    value = await kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0')
+    value = await kvs.get_value('CRAWLEE_STATE_0')

     assert value == {'hello': 'world'}


-async def test_context_use_state_crawlers_share_state() -> None:
+async def test_context_use_state_crawlers_share_custom_state() -> None:
+    custom_state_dict = {}
+
+    async def custom_use_state(default_state: dict[str, JsonSerializable]) -> dict[str, JsonSerializable]:
+        if not custom_state_dict:
+            custom_state_dict.update(default_state)
+        return custom_state_dict
+
     async def handler(context: BasicCrawlingContext) -> None:
         state = await context.use_state({'urls': []})
         assert isinstance(state['urls'], list)
         state['urls'].append(context.request.url)

-    crawler_1 = BasicCrawler(crawler_id=0, request_handler=handler)
-    crawler_2 = BasicCrawler(crawler_id=0, request_handler=handler)
+    crawler_1 = BasicCrawler(use_state=custom_use_state, request_handler=handler)
+    crawler_2 = BasicCrawler(use_state=custom_use_state, request_handler=handler)

     await crawler_1.run(['https://a.com'])
     await crawler_2.run(['https://b.com'])

-    kvs = await KeyValueStore.open()
-    assert crawler_1.id == crawler_2.id == 0
-    assert await kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_{crawler_1.id}') == {
-        'urls': ['https://a.com', 'https://b.com']
-    }
+    assert custom_state_dict == {'urls': ['https://a.com', 'https://b.com']}


 async def test_crawlers_share_stats() -> None:
     async def handler(context: BasicCrawlingContext) -> None:
         await context.use_state({'urls': []})

-    crawler_1 = BasicCrawler(crawler_id=0, request_handler=handler)
-    crawler_2 = BasicCrawler(crawler_id=0, request_handler=handler, statistics=crawler_1.statistics)
+    crawler_1 = BasicCrawler(request_handler=handler)
+    crawler_2 = BasicCrawler(request_handler=handler, statistics=crawler_1.statistics)

     result1 = await crawler_1.run(['https://a.com'])
     result2 = await crawler_2.run(['https://b.com'])
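The renamed `test_context_use_state_crawlers_share_custom_state` above exercises the `use_state` constructor hook that replaces the removed `crawler_id` parameter: rather than two crawlers colliding on one KVS key, the caller supplies a coroutine that resolves the live state object. A minimal standalone sketch of that pattern, assuming the `use_state` parameter as shown in this diff (the `make_shared_use_state` factory is illustrative, not library API):

```python
from typing import Any


def make_shared_use_state():
    """Hypothetical factory: returns a use_state coroutine whose backing
    dict is shared by every crawler that receives it."""
    shared: dict[str, Any] = {}

    async def use_state(default_state: dict[str, Any]) -> dict[str, Any]:
        # Seed the shared dict once, then always return the same object,
        # so mutations made by one crawler are visible to the others.
        if not shared:
            shared.update(default_state)
        return shared

    return use_state
```

Because every handler mutates the one shared dict in place, the final assertion can check the plain Python object directly instead of reading it back from the key-value store.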
@@ -862,8 +865,8 @@ async def handler(context: BasicCrawlingContext) -> None:
     await crawler_2.run(['https://b.com'])

     kvs = await KeyValueStore.open()
-    assert await kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0') == {'urls': ['https://a.com']}
-    assert await kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_1') == {'urls': ['https://b.com']}
+    assert await kvs.get_value('CRAWLEE_STATE_0') == {'urls': ['https://a.com']}
+    assert await kvs.get_value('CRAWLEE_STATE_1') == {'urls': ['https://b.com']}


 async def test_context_handlers_use_state(key_value_store: KeyValueStore) -> None:
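The pair of literals in the hunk above also pins down the scheme the deleted f-strings computed: `BasicCrawler._CRAWLEE_STATE_KEY` evidently renders as `CRAWLEE_STATE`, and each crawler instance created within a process takes the next integer suffix. A sketch of an equivalent key generator (hypothetical helper, inferred only from the `_0`/`_1` keys in the assertions):

```python
import itertools

# Process-wide counter standing in for the crawler's internal instance numbering.
_crawler_ids = itertools.count()


def next_state_key(prefix: str = 'CRAWLEE_STATE') -> str:
    # First call yields 'CRAWLEE_STATE_0', the second 'CRAWLEE_STATE_1', etc.
    return f'{prefix}_{next(_crawler_ids)}'
```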
@@ -906,7 +909,7 @@ async def handler_three(context: BasicCrawlingContext) -> None:
     store = await crawler.get_key_value_store()

     # The state in the KVS must match with the last set state
-    assert (await store.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0')) == {'hello': 'last_world'}
+    assert (await store.get_value('CRAWLEE_STATE_0')) == {'hello': 'last_world'}


 async def test_max_requests_per_crawl() -> None:
@@ -1334,7 +1337,7 @@ async def test_context_use_state_race_condition_in_handlers(key_value_store: KeyValueStore) -> None:

     crawler = BasicCrawler()
     store = await crawler.get_key_value_store()
-    await store.set_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0', {'counter': 0})
+    await store.set_value('CRAWLEE_STATE_0', {'counter': 0})
     handler_barrier = Barrier(2)

     @crawler.router.default_handler
@@ -1349,7 +1352,7 @@ async def handler(context: BasicCrawlingContext) -> None:
     store = await crawler.get_key_value_store()
     # Ensure that local state is pushed back to kvs.
     await store.persist_autosaved_values()
-    assert (await store.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0'))['counter'] == 2
+    assert (await store.get_value('CRAWLEE_STATE_0'))['counter'] == 2


 @pytest.mark.run_alone
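For readers unfamiliar with the `Barrier(2)` setup above: the barrier parks both handlers at the same point so that a naive read-modify-write on the shared counter would drop an update; the `== 2` assertion therefore proves `use_state` serializes access to the state object. A self-contained sketch of the hazard being guarded against, using a plain dict and assuming Python 3.11+ for `asyncio.Barrier`:

```python
import asyncio


async def main() -> None:
    state = {'counter': 0}
    barrier = asyncio.Barrier(2)

    async def increment() -> None:
        value = state['counter']      # both tasks read 0 here...
        await barrier.wait()          # ...since neither has written yet
        state['counter'] = value + 1  # the second write clobbers the first

    await asyncio.gather(increment(), increment())
    assert state['counter'] == 1  # one increment was lost


asyncio.run(main())
```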
@@ -1859,7 +1862,7 @@ async def test_crawler_state_persistence(tmp_path: Path) -> None:
     ).result()[0]
     # Expected state after first crawler run
     assert first_run_state.requests_finished == 2
-    state = await state_kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0')
+    state = await state_kvs.get_value('CRAWLEE_STATE_0')
     assert state.get('urls') == ['https://a.placeholder.com', 'https://b.placeholder.com']

     # Do not reuse the executor to simulate a fresh process to avoid modified class attributes.
@@ -1875,7 +1878,7 @@ async def test_crawler_state_persistence(tmp_path: Path) -> None:
     # 2 requests from first run and 1 request from second run.
     assert second_run_state.requests_finished == 3

-    state = await state_kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0')
+    state = await state_kvs.get_value('CRAWLEE_STATE_0')
     assert state.get('urls') == [
         'https://a.placeholder.com',
         'https://b.placeholder.com',
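Both persistence tests drive each crawl through a throwaway `ProcessPoolExecutor` so that class-level state (such as the instance counter behind the `CRAWLEE_STATE_0` suffix) resets between runs, simulating a process restart or migration. The shape of that pattern, sketched with a hypothetical `worker` callable since the tests' worker functions fall outside these hunks:

```python
from concurrent.futures import ProcessPoolExecutor


def run_in_fresh_process(worker, *args):
    # A new executor per call spawns a new interpreter, so class attributes
    # start from scratch -- only what was persisted to storage survives.
    with ProcessPoolExecutor(max_workers=1) as executor:
        return executor.submit(worker, *args).result()
```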
@@ -1912,9 +1915,9 @@ async def test_crawler_state_persistence_2_crawlers_with_migration(tmp_path: Path) -> None:
     # Expected state after first crawler run
     assert first_run_states[0].requests_finished == 1
     assert first_run_states[1].requests_finished == 1
-    state_0 = await state_kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0')
+    state_0 = await state_kvs.get_value('CRAWLEE_STATE_0')
     assert state_0.get('urls') == ['https://a.placeholder.com']
-    state_1 = await state_kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_1')
+    state_1 = await state_kvs.get_value('CRAWLEE_STATE_1')
     assert state_1.get('urls') == ['https://c.placeholder.com']

     with ProcessPoolExecutor() as executor:
@@ -1930,9 +1933,9 @@ async def test_crawler_state_persistence_2_crawlers_with_migration(tmp_path: Path) -> None:
     # Expected state after first crawler run
     assert second_run_states[0].requests_finished == 2
     assert second_run_states[1].requests_finished == 2
-    state_0 = await state_kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_0')
+    state_0 = await state_kvs.get_value('CRAWLEE_STATE_0')
     assert state_0.get('urls') == ['https://a.placeholder.com', 'https://b.placeholder.com']
-    state_1 = await state_kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_1')
+    state_1 = await state_kvs.get_value('CRAWLEE_STATE_1')
    assert state_1.get('urls') == ['https://c.placeholder.com', 'https://d.placeholder.com']
