apify-sdk-python/docs/03_guides/code/03_playwright.py at 8a854f2149b5c8e1a6f76af18683baa3f67c2418 · apify/apify-sdk-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import asyncio
from typing import Any
from urllib.parse import urljoin, urlsplit

from playwright.async_api import BrowserContext, async_playwright

from apify import Actor, Request
from apify.storages import RequestQueue

# To run locally, install the browsers first: `playwright install --with-deps`.
# On the Apify platform, browsers are already in the Actor's Docker image.


def to_playwright_proxy(proxy_url: str) -> dict[str, str]:
    """Split a proxy URL into Playwright's server/username/password settings.

    Args:
        proxy_url: Proxy URL such as ``http://user:pass@host:8000``.

    Returns:
        A dict with three keys: ``server`` (scheme, host and, when the URL
        has one, the port), ``username`` and ``password``. Credentials that
        are absent from the URL are returned as empty strings.
    """
    parts = urlsplit(proxy_url)

    host = parts.hostname or ''
    # `hostname` drops the brackets of an IPv6 literal; put them back so the
    # host/port separator stays unambiguous.
    if ':' in host:
        host = f'[{host}]'

    server = f'{parts.scheme}://{host}'
    # Append the port only when the URL carries one; otherwise the server
    # string would end in a literal ":None".
    if parts.port is not None:
        server = f'{server}:{parts.port}'

    return {
        'server': server,
        'username': parts.username or '',
        'password': parts.password or '',
    }


async def scrape_page(
    context: BrowserContext, url: str
) -> tuple[dict[str, Any], list[str]]:
    """Open the URL in a new page and return its data and same-site links.

    Args:
        context: Browser context in which the new page is opened.
        url: Address of the page to load.

    Returns:
        A ``(data, links)`` pair. ``data`` holds the URL, the page title and
        the text content of every h1/h2/h3 element. ``links`` holds the
        absolute http(s) URLs of anchors whose host matches the host of
        ``url``.
    """
    page = await context.new_page()
    try:
        await page.goto(url)

        data = {
            'url': url,
            'title': await page.title(),
            'h1s': [await h1.text_content() for h1 in await page.locator('h1').all()],
            'h2s': [await h2.text_content() for h2 in await page.locator('h2').all()],
            'h3s': [await h3.text_content() for h3 in await page.locator('h3').all()],
        }

        # Keep only absolute links on the same host.
        links: list[str] = []
        host = urlsplit(url).netloc
        for link in await page.locator('a').all():
            link_href = await link.get_attribute('href')
            # An anchor without an href (or with an empty one) would be
            # resolved by urljoin to the page's own URL; it is not a real
            # outgoing link, so skip it.
            if not link_href:
                continue
            link_url = urljoin(url, link_href)
            if not link_url.startswith(('http://', 'https://')):
                continue
            if urlsplit(link_url).netloc == host:
                links.append(link_url)

        return data, links

    finally:
        # Close the page whether or not scraping succeeded.
        await page.close()


async def enqueue_links(
    request_queue: RequestQueue,
    links: list[str],
    *,
    depth: int,
    max_depth: int,
) -> None:
    """Add each link to the queue one level below `depth`.

    Nothing is enqueued once `depth` has reached `max_depth`.
    """
    if depth < max_depth:
        # Every link found on this page sits exactly one level deeper.
        child_depth = depth + 1
        for link_url in links:
            Actor.log.info(f'Enqueuing {link_url} ...')
            child_request = Request.from_url(link_url)
            child_request.crawl_depth = child_depth
            await request_queue.add_request(child_request)


async def main() -> None:
    """Crawl the start URLs with Playwright and store each page's data.

    Reads `startUrls` and `maxDepth` from the Actor input, seeds the request
    queue, then handles at most ten requests: each one is scraped in its own
    browser context, its data is pushed to the Actor's storage, and its
    same-site links are enqueued one level deeper.
    """
    async with Actor:
        # Load the Actor input, falling back to defaults for missing keys.
        actor_input = await Actor.get_input() or {}
        start_urls = actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}])
        max_depth = actor_input.get('maxDepth', 1)

        if not start_urls:
            Actor.log.info('No start URLs specified in Actor input, exiting...')
            await Actor.exit()

        # The proxy configuration is created once; a URL is requested from it
        # for every handled request further down.
        proxy_configuration = await Actor.create_proxy_configuration()

        # Seed the request queue with the start URLs.
        request_queue = await Actor.open_request_queue()
        for entry in start_urls:
            url = entry.get('url')
            Actor.log.info(f'Enqueuing start URL: {url}')
            start_request = Request.from_url(url)
            await request_queue.add_request(start_request)

        # Upper bound on handled requests. Raise it to follow more pages.
        max_requests = 10

        Actor.log.info('Launching Playwright...')

        async with async_playwright() as playwright:
            launch_args = ['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu']
            browser = await playwright.chromium.launch(
                headless=Actor.configuration.headless,
                args=launch_args,
            )

            for _ in range(max_requests):
                request = await request_queue.fetch_next_request()
                if not request:
                    # Nothing left to fetch from the queue.
                    break

                url = request.url
                depth = request.crawl_depth
                Actor.log.info(f'Scraping {url} (depth={depth}) ...')

                # One context per request, each with its own proxy URL.
                proxy_url = None
                if proxy_configuration:
                    proxy_url = await proxy_configuration.new_url()
                proxy_settings = to_playwright_proxy(proxy_url) if proxy_url else None
                context = await browser.new_context(proxy=proxy_settings)

                try:
                    data, links = await scrape_page(context, url)
                    await Actor.push_data(data)
                    summary = (
                        f'Stored data from {url} '
                        f'(title={data["title"]!r}, {len(links)} links found).'
                    )
                    Actor.log.info(summary)
                    await enqueue_links(
                        request_queue,
                        links,
                        depth=depth,
                        max_depth=max_depth,
                    )

                except Exception:
                    # Log the failure and move on to the next request.
                    Actor.log.exception(f'Cannot extract data from {url}.')

                finally:
                    # The request is marked as handled whether or not
                    # scraping succeeded.
                    await context.close()
                    await request_queue.mark_request_as_handled(request)


# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    asyncio.run(main())