apify-sdk-python/docs/03_guides/code/07_scrapling_browser.py at 15d446c7a0f6e7bb8054b037566a6cbc5472272f · apify/apify-sdk-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import asyncio
from typing import Any
from urllib.parse import urlsplit

from scrapling.fetchers import AsyncDynamicSession

from apify import Actor, Request
from apify.storages import RequestQueue


async def scrape_page(
    session: AsyncDynamicSession,
    url: str,
) -> tuple[dict[str, Any], list[str]]:
    """Load `url` in the shared browser session and extract its content.

    Args:
        session: Open browser session used to fetch the page.
        url: Address of the page to scrape.

    Returns:
        A pair `(data, links)`. `data` holds the page URL, its title and the
        texts of its `h1`-`h3` headings. `links` holds the resolved `http(s)`
        link targets whose network location equals that of `url`.
    """
    # With `network_idle`, the fetch waits until the page stops making
    # network requests.
    page = await session.fetch(url, network_idle=True)

    record: dict[str, Any] = {
        'url': url,
        'title': page.css('title::text').get(),
    }
    # One output field per heading level, filled in document-outline order.
    heading_selectors = (
        ('h1s', 'h1::text'),
        ('h2s', 'h2::text'),
        ('h3s', 'h3::text'),
    )
    for field, selector in heading_selectors:
        record[field] = page.css(selector).getall()

    # Resolve every anchor against the page, then keep only absolute
    # http(s) targets that live on the same host as the page itself.
    own_host = urlsplit(url).netloc
    resolved = (page.urljoin(href) for href in page.css('a::attr(href)').getall())
    same_host_links = [
        candidate
        for candidate in resolved
        if candidate.startswith(('http://', 'https://'))
        and urlsplit(candidate).netloc == own_host
    ]

    return record, same_host_links


async def enqueue_links(
    request_queue: RequestQueue,
    links: list[str],
    *,
    depth: int,
    max_depth: int,
) -> None:
    """Add `links` to the queue as children of a page crawled at `depth`.

    Nothing is enqueued once `depth` has reached `max_depth`.

    Args:
        request_queue: Queue that receives the new requests.
        links: URLs discovered on the current page.
        depth: Crawl depth of the page the links were found on.
        max_depth: Depth at which link following stops.
    """
    limit_reached = depth >= max_depth
    if not limit_reached:
        # Every discovered link sits exactly one level below its parent.
        child_depth = depth + 1
        for link_url in links:
            Actor.log.info(f'Enqueuing {link_url} ...')
            child = Request.from_url(link_url)
            child.crawl_depth = child_depth
            await request_queue.add_request(child)


async def main() -> None:
    """Run the Actor: crawl the start URLs with one shared browser session.

    Reads `startUrls` and `maxDepth` from the Actor input, seeds the default
    request queue, and then processes up to `max_requests` queued pages. For
    each page the scraped data is pushed to the default dataset and same-host
    links are enqueued one level deeper. A page that fails to scrape is
    logged and still marked as handled, so one bad page does not stop the
    crawl.
    """
    async with Actor:
        # Read the Actor input.
        actor_input = await Actor.get_input() or {}
        start_urls = actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}])
        max_depth = actor_input.get('maxDepth', 1)

        if not start_urls:
            Actor.log.info('No start URLs specified in Actor input, exiting...')
            await Actor.exit()
            # NOTE(review): `Actor.exit()` is expected to end the run; the
            # explicit return guarantees nothing below executes if it ever
            # hands control back (e.g. when process exit is disabled).
            return

        # Set up Apify Proxy and the request queue.
        proxy_configuration = await Actor.create_proxy_configuration()
        # Scrapling's browser session takes a single proxy at construction time.
        # Per-fetch proxy rotation would need a different session mode. Apify Proxy
        # rotates the exit IP itself, so one session-level proxy URL still varies the
        # IP across requests.
        proxy_url = await proxy_configuration.new_url() if proxy_configuration else None
        request_queue = await Actor.open_request_queue()

        # Enqueue the start URLs (crawl depth defaults to 0).
        for start_url in start_urls:
            url = start_url.get('url')
            if not url:
                # A malformed entry must not abort the whole run: building a
                # request from a missing URL would raise before any page is
                # crawled.
                Actor.log.warning(
                    f'Skipping start URL entry without a "url" value: {start_url!r}'
                )
                continue
            Actor.log.info(f'Enqueuing start URL: {url}')
            await request_queue.add_request(Request.from_url(url))

        # Cap the crawl. Raise or remove the limit to follow more pages.
        max_requests = 10
        handled_requests = 0

        # Open the browser once (via Apify Proxy) and reuse it for every page.
        async with AsyncDynamicSession(headless=True, proxy=proxy_url) as session:
            while handled_requests < max_requests and (
                request := await request_queue.fetch_next_request()
            ):
                handled_requests += 1
                url = request.url
                depth = request.crawl_depth
                Actor.log.info(f'Scraping {url} (depth={depth}) ...')

                try:
                    data, links = await scrape_page(session, url)
                    await Actor.push_data(data)
                    Actor.log.info(
                        f'Stored data from {url} '
                        f'(title={data["title"]!r}, {len(links)} links found).'
                    )
                    await enqueue_links(
                        request_queue, links, depth=depth, max_depth=max_depth
                    )

                except Exception:
                    # Best effort: log the failure and move on to the next page.
                    Actor.log.exception(f'Cannot extract data from {url}.')

                finally:
                    # Mark the request handled on success and on failure alike,
                    # so a broken page is not fetched again.
                    await request_queue.mark_request_as_handled(request)


# Run the Actor's `main` coroutine only when this file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())