
Commit 1523bde

docs: add installation of extras and fix code examples (#154)
### Description

- Mention how to install extras (beautifulsoup) in the installation section
- Fix code examples
1 parent f016199

1 file changed: README.md (44 additions & 22 deletions)
````diff
@@ -26,6 +26,14 @@ Crawlee is available as the [`crawlee`](https://pypi.org/project/crawlee/) PyPI
 pip install crawlee
 ```
 
+Additional, optional dependencies unlocking more features are shipped as package extras.
+
+If you plan to use `BeautifulSoupCrawler`, install `crawlee` with `beautifulsoup` extra:
+
+```
+pip install crawlee[beautifulsoup]
+```
+
 ## Features
 
 - Unified interface for **HTTP and headless browser** crawling.
````
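Not part of the commit, but as a quick sanity check after installing the extra, both BeautifulSoup itself (the `bs4` module) and the crawler class should import cleanly. The `crawlee.beautifulsoup_crawler` module path is an assumption inferred from the other `crawlee.*` imports visible elsewhere in this diff.

```python
# Sketch (not from the commit): verify the `beautifulsoup` extra is installed.
# The crawlee.beautifulsoup_crawler import path is assumed from the README's other imports.
import bs4  # installed by the beautifulsoup4 dependency the extra pulls in

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler

print(bs4.__version__)
print(BeautifulSoupCrawler.__name__)
```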
````diff
@@ -99,8 +107,8 @@ async def main() -> None:
     # Define a handler for processing requests
     @crawler.router.default_handler
     async def request_handler(context: HttpCrawlingContext) -> None:
-        # Crawler will provide a HttpCrawlingContext instance, from which you can access
-        # the request and response data
+        # Crawler will provide a HttpCrawlingContext instance,
+        # from which you can access the request and response data
         record = {
             'url': context.request.url,
             'status_code': context.http_response.status_code,
````
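For orientation, the hunk above only rewraps a comment inside the README's HTTP crawler example. A self-contained sketch of roughly what the full example looks like after this commit follows; everything outside the hunk (import paths, crawler and dataset setup, the `crawler.run` call) is reconstructed here as an assumption, not taken from the diff.

```python
import asyncio

# Import paths assumed to match the ones shown elsewhere in this diff.
from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext
from crawlee.storages import Dataset


async def main() -> None:
    crawler = HttpCrawler()
    dataset = await Dataset.open()

    # Define a handler for processing requests
    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        # Crawler will provide a HttpCrawlingContext instance,
        # from which you can access the request and response data
        record = {
            'url': context.request.url,
            'status_code': context.http_response.status_code,
        }
        await dataset.push_data(record)

    # Start URL and run-call pattern are illustrative only.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```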
````diff
@@ -158,10 +166,10 @@ async def main() -> None:
     # Define a handler for processing requests
     @crawler.router.default_handler
     async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
-        # Crawler will provide a BeautifulSoupCrawlingContext instance, from which you can access
-        # the request and response data
+        # Crawler will provide a BeautifulSoupCrawlingContext instance,
+        # from which you can access the request and response data
         record = {
-            'title': context.soup.title.text if context.soup.title else '',
+            'title': context.soup.title.text,
             'url': context.request.url,
         }
         # Extract the record and push it to the dataset
````
````diff
@@ -179,14 +187,20 @@ if __name__ == '__main__':
 See the following example with the updated request handler:
 
 ```python
+from crawlee.enqueue_strategy import EnqueueStrategy
+
+# ...
+
 @crawler.router.default_handler
 async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     # Use enqueue links helper to enqueue all links from the page with the same domain
     await context.enqueue_links(strategy=EnqueueStrategy.SAME_DOMAIN)
+
     record = {
-        'title': context.soup.title.text if context.soup.title else '',
+        'title': context.soup.title.text,
         'url': context.request.url,
     }
+
     await dataset.push_data(record)
 ```
 
````
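The snippet above intentionally elides its setup with `# ...`. For readers who want to run it end to end, a hedged, self-contained variant might look like the following; everything outside the handler body is an assumption layered on top of the diff, not part of the commit.

```python
import asyncio

# Import paths assumed from the ones used elsewhere in this README diff.
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.enqueue_strategy import EnqueueStrategy
from crawlee.storages import Dataset


async def main() -> None:
    crawler = BeautifulSoupCrawler()
    dataset = await Dataset.open()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        # Use enqueue links helper to enqueue all links from the page with the same domain
        await context.enqueue_links(strategy=EnqueueStrategy.SAME_DOMAIN)

        record = {
            'title': context.soup.title.text,
            'url': context.request.url,
        }

        await dataset.push_data(record)

    # Start URL and run-call pattern are illustrative only.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```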
````diff
@@ -242,7 +256,7 @@ async def main() -> None:
     print(f'Dataset data: {data.items}') # Dataset data: [{'key1': 'value1'}]
 
     # Open a named dataset
-    dataset_named = await Dataset.open('some-name')
+    dataset_named = await Dataset.open(name='some-name')
 
     # Push multiple records
     await dataset_named.push_data([{'key2': 'value2'}, {'key3': 'value3'}])
````
````diff
@@ -289,7 +303,7 @@ async def main() -> None:
     print(f'Value of OUTPUT: {value}') # Value of OUTPUT: {'my_result': 123}
 
     # Open a named key-value store
-    kvs_named = await KeyValueStore.open('some-name')
+    kvs_named = await KeyValueStore.open(name='some-name')
 
     # Write a record to the named key-value store
     await kvs_named.set_value('some-key', {'foo': 'bar'})
````
````diff
@@ -300,7 +314,6 @@ async def main() -> None:
 
 if __name__ == '__main__':
     asyncio.run(main())
-
 ```
 
 <!-- TODO: link to a real-world example -->
````
````diff
@@ -336,7 +349,7 @@ async def main() -> None:
     await rq.add_request('https://crawlee.dev')
 
     # Open a named request queue
-    rq_named = await RequestQueue.open('some-name')
+    rq_named = await RequestQueue.open(name='some-name')
 
     # Add multiple requests
     await rq_named.add_requests_batched(['https://apify.com', 'https://example.com'])
````
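The storage hunks above all make the same change: the storage name moves to an explicit `name=` keyword argument. Pulled out of the diff context, the before/after pattern is simply the following sketch; the `crawlee.storages` import path is an assumption based on the storages referenced in the diff, not something the commit shows.

```python
import asyncio

# Import path assumed; the diff only shows the Dataset / KeyValueStore / RequestQueue usages.
from crawlee.storages import Dataset, KeyValueStore, RequestQueue


async def main() -> None:
    # Default (unnamed) storages open without arguments.
    dataset = await Dataset.open()
    kvs = await KeyValueStore.open()
    rq = await RequestQueue.open()

    # Named storages now take the name as a keyword argument, as changed in this commit.
    dataset_named = await Dataset.open(name='some-name')
    kvs_named = await KeyValueStore.open(name='some-name')
    rq_named = await RequestQueue.open(name='some-name')


if __name__ == '__main__':
    asyncio.run(main())
```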
````diff
@@ -376,22 +389,31 @@ crawler = HttpCrawler(use_session_pool=True)
 If you want to configure your own session pool, instantiate it and provide it directly to the crawler.
 
 ```python
+import asyncio
+from datetime import timedelta
+
 from crawlee.http_crawler import HttpCrawler
-from crawlee.sessions import SessionPool
+from crawlee.sessions import Session, SessionPool
 
-# Use dict as args for new sessions
-session_pool_v1 = SessionPool(
-    max_pool_size=10,
-    create_session_settings = {'max_age': timedelta(minutes=10)},
-)
 
-# Use lambda creation function for new sessions
-session_pool_v2 = SessionPool(
-    max_pool_size=10,
-    create_session_function=lambda _: Session(max_age=timedelta(minutes=10)),
-)
+async def main() -> None:
+    # Use dict as args for new sessions
+    session_pool_v1 = SessionPool(
+        max_pool_size=10,
+        create_session_settings = {'max_age': timedelta(minutes=10)},
+    )
+
+    # Use lambda creation function for new sessions
+    session_pool_v2 = SessionPool(
+        max_pool_size=10,
+        create_session_function=lambda _: Session(max_age=timedelta(minutes=10)),
+    )
 
-crawler = HttpCrawler(session_pool=session_pool_v1, use_session_pool=True)
+    crawler = HttpCrawler(session_pool=session_pool_v1, use_session_pool=True)
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
 ```
 
 ## Running on the Apify platform
````
