@@ -26,6 +26,14 @@ Crawlee is available as the [`crawlee`](https://pypi.org/project/crawlee/) PyPI
 pip install crawlee
 ```
 
+Additional optional dependencies that unlock more features are shipped as package extras.
+
+If you plan to use `BeautifulSoupCrawler`, install `crawlee` with the `beautifulsoup` extra:
+
+```sh
+pip install crawlee[beautifulsoup]
+```
+
 ## Features
 
 - Unified interface for **HTTP and headless browser** crawling.
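
The `beautifulsoup` extra above is what makes `BeautifulSoupCrawler` available. As a minimal sketch of what it unlocks, assuming the `crawlee.beautifulsoup_crawler` module path (mirroring the `crawlee.http_crawler` path used later in this diff) and the handler API shown in the hunks below:

```python
import asyncio

# Assumed import path, mirroring `crawlee.http_crawler` used elsewhere in this README
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        # context.soup holds the parsed page as a BeautifulSoup object
        title = context.soup.title.text if context.soup.title else ''
        print(f'{context.request.url}: {title}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```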
@@ -99,8 +107,8 @@ async def main() -> None:
     # Define a handler for processing requests
     @crawler.router.default_handler
     async def request_handler(context: HttpCrawlingContext) -> None:
-        # Crawler will provide a HttpCrawlingContext instance, from which you can access
-        # the request and response data
+        # Crawler will provide an HttpCrawlingContext instance,
+        # from which you can access the request and response data
         record = {
             'url': context.request.url,
             'status_code': context.http_response.status_code,
@@ -158,10 +166,10 @@ async def main() -> None:
     # Define a handler for processing requests
     @crawler.router.default_handler
     async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
-        # Crawler will provide a BeautifulSoupCrawlingContext instance, from which you can access
-        # the request and response data
+        # Crawler will provide a BeautifulSoupCrawlingContext instance,
+        # from which you can access the request and response data
         record = {
-            'title': context.soup.title.text if context.soup.title else '',
+            'title': context.soup.title.text,
             'url': context.request.url,
         }
         # Extract the record and push it to the dataset
@@ -179,14 +187,20 @@ if __name__ == '__main__':
 See the following example with the updated request handler:
 
 ```python
+from crawlee.enqueue_strategy import EnqueueStrategy
+
+# ...
+
 @crawler.router.default_handler
 async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     # Use the enqueue_links helper to enqueue all links from the page within the same domain
     await context.enqueue_links(strategy=EnqueueStrategy.SAME_DOMAIN)
+
     record = {
-        'title': context.soup.title.text if context.soup.title else '',
+        'title': context.soup.title.text,
         'url': context.request.url,
     }
+
     await dataset.push_data(record)
 ```
 
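`SAME_DOMAIN` is presumably only one of the scopes `EnqueueStrategy` offers. A hedged sketch of the same handler with a wider crawl scope, assuming the enum also exposes an `ALL` member (an assumption, not confirmed by this diff):

```python
from crawlee.enqueue_strategy import EnqueueStrategy

# ...

@crawler.router.default_handler
async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
    # Assumption: EnqueueStrategy.ALL follows every link found on the page,
    # regardless of its domain
    await context.enqueue_links(strategy=EnqueueStrategy.ALL)
```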
@@ -242,7 +256,7 @@ async def main() -> None:
     print(f'Dataset data: {data.items}')  # Dataset data: [{'key1': 'value1'}]
 
     # Open a named dataset
-    dataset_named = await Dataset.open('some-name')
+    dataset_named = await Dataset.open(name='some-name')
 
     # Push multiple records
     await dataset_named.push_data([{'key2': 'value2'}, {'key3': 'value3'}])
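
By symmetry with the `data.items` read-back at the top of this hunk, the named dataset's contents can presumably be read the same way; a short sketch assuming the same `get_data` accessor:

```python
    # Read back the records pushed to the named dataset
    # (assumes the same `get_data` accessor that produced `data.items` above)
    data_named = await dataset_named.get_data()
    print(f'Named dataset data: {data_named.items}')
    # Named dataset data: [{'key2': 'value2'}, {'key3': 'value3'}]
```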
@@ -289,7 +303,7 @@ async def main() -> None:
     print(f'Value of OUTPUT: {value}')  # Value of OUTPUT: {'my_result': 123}
 
     # Open a named key-value store
-    kvs_named = await KeyValueStore.open('some-name')
+    kvs_named = await KeyValueStore.open(name='some-name')
 
     # Write a record to the named key-value store
     await kvs_named.set_value('some-key', {'foo': 'bar'})
@@ -300,7 +314,6 @@ async def main() -> None:
 
 if __name__ == '__main__':
     asyncio.run(main())
-
 ```
 
 <!-- TODO: link to a real-world example -->
@@ -336,7 +349,7 @@ async def main() -> None:
     await rq.add_request('https://crawlee.dev')
 
     # Open a named request queue
-    rq_named = await RequestQueue.open('some-name')
+    rq_named = await RequestQueue.open(name='some-name')
 
     # Add multiple requests
     await rq_named.add_requests_batched(['https://apify.com', 'https://example.com'])
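
A queue like `rq` above is typically consumed by a crawler rather than read manually. A hedged sketch of that wiring, where the `crawlee.storages` import path and the `request_provider` parameter are assumptions not confirmed by this diff:

```python
import asyncio

from crawlee.http_crawler import HttpCrawler
from crawlee.storages import RequestQueue  # assumed import path


async def main() -> None:
    # Pre-populate the default request queue
    rq = await RequestQueue.open()
    await rq.add_request('https://crawlee.dev')

    # Assumption: the crawler drains requests from the queue passed in
    crawler = HttpCrawler(request_provider=rq)
    await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())
```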
@@ -376,22 +389,31 @@ crawler = HttpCrawler(use_session_pool=True)
 If you want to configure your own session pool, instantiate it and provide it directly to the crawler.
 
 ```python
+import asyncio
+from datetime import timedelta
+
 from crawlee.http_crawler import HttpCrawler
-from crawlee.sessions import SessionPool
+from crawlee.sessions import Session, SessionPool
 
-# Use dict as args for new sessions
-session_pool_v1 = SessionPool(
-    max_pool_size=10,
-    create_session_settings={'max_age': timedelta(minutes=10)},
-)
 
-# Use lambda creation function for new sessions
-session_pool_v2 = SessionPool(
-    max_pool_size=10,
-    create_session_function=lambda _: Session(max_age=timedelta(minutes=10)),
-)
+async def main() -> None:
+    # Use dict as args for new sessions
+    session_pool_v1 = SessionPool(
+        max_pool_size=10,
+        create_session_settings={'max_age': timedelta(minutes=10)},
+    )
+
+    # Use lambda creation function for new sessions
+    session_pool_v2 = SessionPool(
+        max_pool_size=10,
+        create_session_function=lambda _: Session(max_age=timedelta(minutes=10)),
+    )
 
-crawler = HttpCrawler(session_pool=session_pool_v1, use_session_pool=True)
+    crawler = HttpCrawler(session_pool=session_pool_v1, use_session_pool=True)
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
 ```
 
 ## Running on the Apify platform