Skip to content

Commit fe22542

Browse files
committed
update doc
1 parent 55be588 commit fe22542

File tree

3 files changed

+93
-10
lines changed

3 files changed

+93
-10
lines changed
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
from __future__ import annotations
2+
3+
import asyncio
4+
import logging
5+
from typing import TYPE_CHECKING, Any
6+
7+
from crawlee.browsers import BrowserPool
8+
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
9+
from crawlee.storages import KeyValueStore
10+
11+
if TYPE_CHECKING:
12+
from crawlee.browsers._browser_controller import BrowserController
13+
from crawlee.browsers._types import CrawleePage
14+
from crawlee.proxy_configuration import ProxyInfo
15+
16+
logger = logging.getLogger(__name__)
17+
18+
19+
async def main() -> None:
    """Demonstrate all four BrowserPool page lifecycle hooks with PlaywrightCrawler."""
    async with BrowserPool() as browser_pool:

        @browser_pool.pre_page_create_hook
        async def log_page_init(
            page_id: str,
            _browser_controller: BrowserController,
            _browser_new_context_options: dict[str, Any],
            _proxy_info: ProxyInfo | None,
        ) -> None:
            """Runs right before a page is created; logs the upcoming page id."""
            logger.info(f'Creating page {page_id}...')

        @browser_pool.post_page_create_hook
        async def set_viewport(
            crawlee_page: CrawleePage, _browser_controller: BrowserController
        ) -> None:
            """Runs right after a page is created; pins its viewport size."""
            viewport = {'width': 1280, 'height': 1024}
            await crawlee_page.page.set_viewport_size(viewport)

        @browser_pool.pre_page_close_hook
        async def save_screenshot(
            crawlee_page: CrawleePage, _browser_controller: BrowserController
        ) -> None:
            """Runs right before a page is closed; persists a screenshot of it."""
            kvs = await KeyValueStore.open()
            screenshot = await crawlee_page.page.screenshot()
            await kvs.set_value(
                key=f'screenshot-{crawlee_page.id}',
                value=screenshot,
                content_type='image/png',
            )
            logger.info(f'Saved screenshot for page {crawlee_page.id}.')

        @browser_pool.post_page_close_hook
        async def log_page_closed(
            page_id: str, _browser_controller: BrowserController
        ) -> None:
            """Runs right after a page is closed; logs the closure."""
            logger.info(f'Page {page_id} closed successfully.')

        # Hand the pool (with its hooks attached) to the crawler explicitly.
        crawler = PlaywrightCrawler(
            browser_pool=browser_pool,
            max_requests_per_crawl=5,
        )

        @crawler.router.default_handler
        async def request_handler(context: PlaywrightCrawlingContext) -> None:
            """Log the visited URL and enqueue the links found on the page."""
            context.log.info(f'Processing {context.request.url} ...')
            await context.enqueue_links()

        # Run the crawler with the initial list of URLs.
        await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())

docs/guides/code_examples/playwright_crawler/pre_navigation_hook_example.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,12 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
1717
await context.enqueue_links()
1818

1919
@crawler.pre_navigation_hook
20-
async def log_navigation_url(context: PlaywrightPreNavCrawlingContext) -> None:
20+
async def configure_page(context: PlaywrightPreNavCrawlingContext) -> None:
2121
context.log.info(f'Navigating to {context.request.url} ...')
2222

23-
# will set a timeout for all navigation methods
24-
context.page.set_default_navigation_timeout(600_000)
25-
26-
# will set the page size before you go to the target URL
27-
await context.page.set_viewport_size({'width': 1280, 'height': 1024})
23+
# block stylesheets, images, fonts and other static assets
24+
# to speed up page loading
25+
await context.block_requests()
2826

2927
# Run the crawler with the initial list of URLs.
3028
await crawler.run(['https://crawlee.dev'])

docs/guides/playwright_crawler.mdx

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
1111
import MultipleLaunchExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/multiple_launch_example.py';
1212
import BrowserConfigurationExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_configuration_example.py';
1313
import PreNavigationExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/pre_navigation_hook_example.py';
14-
14+
import BrowserPoolPageHooksExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_pool_page_hooks_example.py';
1515
import PluginBrowserConfigExample from '!!raw-loader!./code_examples/playwright_crawler/plugin_browser_configuration_example.py';
1616

1717
A <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> is a browser-based crawler. In contrast to HTTP-based crawlers like <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> or <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>, it uses a real browser to render pages and extract data. It is built on top of the [Playwright](https://playwright.dev/python/) browser automation library. While browser-based crawlers are typically slower and less efficient than HTTP-based crawlers, they can handle dynamic, client-side rendered sites that standard HTTP-based crawlers cannot manage.
@@ -57,14 +57,22 @@ You can also configure each plugin used by <ApiLink to="class/BrowserPool">`Brow
5757

5858
For an example of how to implement a custom browser plugin, see the [Camoufox example](../examples/playwright-crawler-with-camoufox). [Camoufox](https://camoufox.com/) is a stealth browser plugin designed to reduce detection by anti-scraping measures and is fully compatible with <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>.
5959

60-
## Page configuration with pre-navigation hooks
60+
## Page configuration with lifecycle page hooks
61+
62+
For additional setup or event-driven actions around page creation and closure, the <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> exposes four lifecycle hooks: <ApiLink to="class/BrowserPool#pre_page_create_hook">`pre_page_create_hook`</ApiLink>, <ApiLink to="class/BrowserPool#post_page_create_hook">`post_page_create_hook`</ApiLink>, <ApiLink to="class/BrowserPool#pre_page_close_hook">`pre_page_close_hook`</ApiLink>, and <ApiLink to="class/BrowserPool#post_page_close_hook">`post_page_close_hook`</ApiLink>. To use them, create a `BrowserPool` instance and pass it to <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> via the `browser_pool` argument.
63+
64+
<RunnableCodeBlock className="language-python" language="python">
65+
{BrowserPoolPageHooksExample}
66+
</RunnableCodeBlock>
67+
68+
## Navigation hooks
6169

62-
In some use cases, you may need to configure the [page](https://playwright.dev/python/docs/api/class-page) before it navigates to the target URL. For instance, you might set navigation timeouts or manipulate other page-level settings. For such cases you can use the <ApiLink to="class/PlaywrightCrawler#pre_navigation_hook">`pre_navigation_hook`</ApiLink> method of the <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>. This method is called before the page navigates to the target URL and allows you to configure the page instance.
70+
Navigation hooks allow for additional configuration at specific points during page navigation. For example, the <ApiLink to="class/PlaywrightCrawler#pre_navigation_hook">`pre_navigation_hook`</ApiLink> is called before each navigation and provides a <ApiLink to="class/PlaywrightPreNavCrawlingContext">`PlaywrightPreNavCrawlingContext`</ApiLink>, which includes the [page](https://playwright.dev/python/docs/api/class-page) instance and a <ApiLink to="class/PlaywrightPreNavCrawlingContext#block_requests">`block_requests`</ApiLink> helper for filtering unwanted resource types and URL patterns. See the [block requests example](https://crawlee.dev/python/docs/examples/playwright-crawler-with-block-requests) for a dedicated walkthrough.
6371

6472
<RunnableCodeBlock className="language-python" language="python">
6573
{PreNavigationExample}
6674
</RunnableCodeBlock>
6775

6876
## Conclusion
6977

70-
This guide introduced the <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and explained how to configure it using <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> and <ApiLink to="class/PlaywrightBrowserPlugin">`PlaywrightBrowserPlugin`</ApiLink>. You learned how to launch multiple browsers, configure browser and context settings, and apply pre-navigation hooks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
78+
This guide introduced the <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and explained how to configure it using <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> and <ApiLink to="class/PlaywrightBrowserPlugin">`PlaywrightBrowserPlugin`</ApiLink>. You learned how to launch multiple browsers, configure browser and context settings, use <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> lifecycle page hooks, and apply navigation hooks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!

0 commit comments

Comments
 (0)