apify-sdk-python/docs/03_guides/code/05_crawlee_beautifulsoup.py at 43425635ef1dcb139405b1833410dcd4dd1518b3 · apify/apify-sdk-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext

from apify import Actor


async def main() -> None:
    # Enter the context of the Actor.
    async with Actor:
        # Retrieve the Actor input, and use default values if not provided.
        actor_input = await Actor.get_input() or {}
        start_urls = [
            url.get('url')
            for url in actor_input.get(
                'start_urls',
                [{'url': 'https://apify.com'}],
            )
        ]

        # Exit if no start URLs are provided.
        if not start_urls:
            Actor.log.info('No start URLs specified in Actor input, exiting...')
            await Actor.exit()

        # Create a crawler.
        crawler = BeautifulSoupCrawler(
            # Limit the crawl to max requests.
            # Remove or increase it for crawling all links.
            max_requests_per_crawl=50,
        )

        # Define a request handler, which will be called for every request.
        @crawler.router.default_handler
        async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
            url = context.request.url
            Actor.log.info(f'Scraping {url}...')

            # Extract the desired data.
            data = {
                'url': context.request.url,
                'title': context.soup.title.string if context.soup.title else None,
                'h1s': [h1.text for h1 in context.soup.find_all('h1')],
                'h2s': [h2.text for h2 in context.soup.find_all('h2')],
                'h3s': [h3.text for h3 in context.soup.find_all('h3')],
            }

            # Store the extracted data to the default dataset.
            await context.push_data(data)

            # Enqueue additional links found on the current page.
            await context.enqueue_links()

        # Run the crawler with the starting requests.
        await crawler.run(start_urls)