Skip to content

Commit fe22542

Browse files
committed
update doc
1 parent 55be588 commit fe22542

File tree

3 files changed

+93
-10
lines changed

3 files changed

+93
-10
lines changed
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
from __future__ import annotations
2+
3+
import asyncio
4+
import logging
5+
from typing import TYPE_CHECKING, Any
6+
7+
from crawlee.browsers import BrowserPool
8+
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
9+
from crawlee.storages import KeyValueStore
10+
11+
if TYPE_CHECKING:
12+
from crawlee.browsers._browser_controller import BrowserController
13+
from crawlee.browsers._types import CrawleePage
14+
from crawlee.proxy_configuration import ProxyInfo
15+
16+
logger = logging.getLogger(__name__)
17+
18+
19+
async def main() -> None:
    """Demonstrate all four BrowserPool page lifecycle hooks with PlaywrightCrawler."""
    async with BrowserPool() as browser_pool:

        @browser_pool.pre_page_create_hook
        async def log_page_init(
            page_id: str,
            _browser_controller: BrowserController,
            _browser_new_context_options: dict[str, Any],
            _proxy_info: ProxyInfo | None,
        ) -> None:
            """Runs right before a page is created; logs the upcoming page id."""
            logger.info(f'Creating page {page_id}...')

        @browser_pool.post_page_create_hook
        async def set_viewport(
            crawlee_page: CrawleePage, _browser_controller: BrowserController
        ) -> None:
            """Runs right after a page is created; pins its viewport size."""
            viewport = {'width': 1280, 'height': 1024}
            await crawlee_page.page.set_viewport_size(viewport)

        @browser_pool.pre_page_close_hook
        async def save_screenshot(
            crawlee_page: CrawleePage, _browser_controller: BrowserController
        ) -> None:
            """Runs right before a page is closed; persists a screenshot of it."""
            kvs = await KeyValueStore.open()
            screenshot = await crawlee_page.page.screenshot()
            await kvs.set_value(
                key=f'screenshot-{crawlee_page.id}',
                value=screenshot,
                content_type='image/png',
            )
            logger.info(f'Saved screenshot for page {crawlee_page.id}.')

        @browser_pool.post_page_close_hook
        async def log_page_closed(
            page_id: str, _browser_controller: BrowserController
        ) -> None:
            """Runs right after a page is closed; logs the closure."""
            logger.info(f'Page {page_id} closed successfully.')

        # Hand the pool (with its hooks attached) to the crawler explicitly.
        crawler = PlaywrightCrawler(
            browser_pool=browser_pool,
            max_requests_per_crawl=5,
        )

        @crawler.router.default_handler
        async def request_handler(context: PlaywrightCrawlingContext) -> None:
            """Log the visited URL and enqueue the links found on the page."""
            context.log.info(f'Processing {context.request.url} ...')
            await context.enqueue_links()

        # Run the crawler with the initial list of URLs.
        await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())

docs/guides/code_examples/playwright_crawler/pre_navigation_hook_example.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,12 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
1717
await context.enqueue_links()
1818

1919
@crawler.pre_navigation_hook
20-
async def log_navigation_url(context: PlaywrightPreNavCrawlingContext) -> None:
20+
async def configure_page(context: PlaywrightPreNavCrawlingContext) -> None:
2121
context.log.info(f'Navigating to {context.request.url} ...')
2222

23-
# will set a timeout for all navigation methods
24-
context.page.set_default_navigation_timeout(600_000)
25-
26-
# will set the page size before you go to the target URL
27-
await context.page.set_viewport_size({'width': 1280, 'height': 1024})
23+
# block stylesheets, images, fonts and other static assets
24+
# to speed up page loading
25+
await context.block_requests()
2826

2927
# Run the crawler with the initial list of URLs.
3028
await crawler.run(['https://crawlee.dev'])

docs/guides/playwright_crawler.mdx

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
1111
import MultipleLaunchExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/multiple_launch_example.py';
1212
import BrowserConfigurationExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_configuration_example.py';
1313
import PreNavigationExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/pre_navigation_hook_example.py';
14-
14+
import BrowserPoolPageHooksExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_pool_page_hooks_example.py';
1515
import PluginBrowserConfigExample from '!!raw-loader!./code_examples/playwright_crawler/plugin_browser_configuration_example.py';
1616

1717
A <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> is a browser-based crawler. In contrast to HTTP-based crawlers like <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> or <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>, it uses a real browser to render pages and extract data. It is built on top of the [Playwright](https://playwright.dev/python/) browser automation library. While browser-based crawlers are typically slower and less efficient than HTTP-based crawlers, they can handle dynamic, client-side rendered sites that standard HTTP-based crawlers cannot manage.
@@ -57,14 +57,22 @@ You can also configure each plugin used by <ApiLink to="class/BrowserPool">`Brow
5757

5858
For an example of how to implement a custom browser plugin, see the [Camoufox example](../examples/playwright-crawler-with-camoufox). [Camoufox](https://camoufox.com/) is a stealth browser plugin designed to reduce detection by anti-scraping measures and is fully compatible with <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>.
5959

60-
## Page configuration with pre-navigation hooks
60+
## Page configuration with lifecycle page hooks
61+
62+
For additional setup or event-driven actions around page creation and closure, the <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> exposes four lifecycle hooks: <ApiLink to="class/BrowserPool#pre_page_create_hook">`pre_page_create_hook`</ApiLink>, <ApiLink to="class/BrowserPool#post_page_create_hook">`post_page_create_hook`</ApiLink>, <ApiLink to="class/BrowserPool#pre_page_close_hook">`pre_page_close_hook`</ApiLink>, and <ApiLink to="class/BrowserPool#post_page_close_hook">`post_page_close_hook`</ApiLink>. To use them, create a `BrowserPool` instance and pass it to <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> via the `browser_pool` argument.
63+
64+
<RunnableCodeBlock className="language-python" language="python">
65+
{BrowserPoolPageHooksExample}
66+
</RunnableCodeBlock>
67+
68+
## Navigation hooks
6169

62-
In some use cases, you may need to configure the [page](https://playwright.dev/python/docs/api/class-page) before it navigates to the target URL. For instance, you might set navigation timeouts or manipulate other page-level settings. For such cases you can use the <ApiLink to="class/PlaywrightCrawler#pre_navigation_hook">`pre_navigation_hook`</ApiLink> method of the <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>. This method is called before the page navigates to the target URL and allows you to configure the page instance.
70+
Navigation hooks allow for additional configuration at specific points during page navigation. For example, the <ApiLink to="class/PlaywrightCrawler#pre_navigation_hook">`pre_navigation_hook`</ApiLink> is called before each navigation and provides a <ApiLink to="class/PlaywrightPreNavCrawlingContext">`PlaywrightPreNavCrawlingContext`</ApiLink>, which includes the [page](https://playwright.dev/python/docs/api/class-page) instance and a <ApiLink to="class/PlaywrightPreNavCrawlingContext#block_requests">`block_requests`</ApiLink> helper for filtering unwanted resource types and URL patterns. See the [block requests example](https://crawlee.dev/python/docs/examples/playwright-crawler-with-block-requests) for a dedicated walkthrough.
6371

6472
<RunnableCodeBlock className="language-python" language="python">
6573
{PreNavigationExample}
6674
</RunnableCodeBlock>
6775

6876
## Conclusion
6977

70-
This guide introduced the <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and explained how to configure it using <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> and <ApiLink to="class/PlaywrightBrowserPlugin">`PlaywrightBrowserPlugin`</ApiLink>. You learned how to launch multiple browsers, configure browser and context settings, and apply pre-navigation hooks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
78+
This guide introduced the <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and explained how to configure it using <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> and <ApiLink to="class/PlaywrightBrowserPlugin">`PlaywrightBrowserPlugin`</ApiLink>. You learned how to launch multiple browsers, configure browser and context settings, use <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> lifecycle page hooks, and apply navigation hooks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!

0 commit comments

Comments
 (0)