Skip to content

Commit 6f2ac13

Browse files
authored
feat: Add page lifecycle hooks to BrowserPool (#1791)
### Description Add four page lifecycle hooks to `BrowserPool` registered as decorators: - `pre_page_create_hook` — called before page creation; `browser_new_context_options` is mutable, so the hook can affect how the page context is configured. - `post_page_create_hook` — called after page creation. - `pre_page_close_hook` — called before page close. - `post_page_close_hook` — called after page close. ### Issues - Relates: #1741 ### Testing - Added new tests for `BrowserPool`.
1 parent d4ba60f commit 6f2ac13

File tree

5 files changed

+345
-12
lines changed

5 files changed

+345
-12
lines changed
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
from __future__ import annotations
2+
3+
import asyncio
4+
import logging
5+
from typing import TYPE_CHECKING, Any
6+
7+
from crawlee.browsers import BrowserPool
8+
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
9+
from crawlee.storages import KeyValueStore
10+
11+
if TYPE_CHECKING:
12+
from crawlee.browsers._browser_controller import BrowserController
13+
from crawlee.browsers._types import CrawleePage
14+
from crawlee.proxy_configuration import ProxyInfo
15+
16+
logger = logging.getLogger(__name__)
17+
18+
19+
async def main() -> None:
20+
async with BrowserPool() as browser_pool:
21+
22+
@browser_pool.pre_page_create_hook
23+
async def log_page_init(
24+
page_id: str,
25+
_browser_controller: BrowserController,
26+
_browser_new_context_options: dict[str, Any],
27+
_proxy_info: ProxyInfo | None,
28+
) -> None:
29+
"""Log when a new page is about to be created."""
30+
logger.info(f'Creating page {page_id}...')
31+
32+
@browser_pool.post_page_create_hook
33+
async def set_viewport(
34+
crawlee_page: CrawleePage, _browser_controller: BrowserController
35+
) -> None:
36+
"""Set a fixed viewport size on each newly created page."""
37+
await crawlee_page.page.set_viewport_size({'width': 1280, 'height': 1024})
38+
39+
@browser_pool.pre_page_close_hook
40+
async def save_screenshot(
41+
crawlee_page: CrawleePage, _browser_controller: BrowserController
42+
) -> None:
43+
"""Save a screenshot to KeyValueStore before each page is closed."""
44+
kvs = await KeyValueStore.open()
45+
46+
screenshot = await crawlee_page.page.screenshot()
47+
await kvs.set_value(
48+
key=f'screenshot-{crawlee_page.id}',
49+
value=screenshot,
50+
content_type='image/png',
51+
)
52+
logger.info(f'Saved screenshot for page {crawlee_page.id}.')
53+
54+
@browser_pool.post_page_close_hook
55+
async def log_page_closed(
56+
page_id: str, _browser_controller: BrowserController
57+
) -> None:
58+
"""Log after each page is closed."""
59+
logger.info(f'Page {page_id} closed successfully.')
60+
61+
crawler = PlaywrightCrawler(
62+
browser_pool=browser_pool,
63+
max_requests_per_crawl=5,
64+
)
65+
66+
@crawler.router.default_handler
67+
async def request_handler(context: PlaywrightCrawlingContext) -> None:
68+
context.log.info(f'Processing {context.request.url} ...')
69+
70+
await context.enqueue_links()
71+
72+
# Run the crawler with the initial list of URLs.
73+
await crawler.run(['https://crawlee.dev'])
74+
75+
76+
if __name__ == '__main__':
77+
asyncio.run(main())

docs/guides/code_examples/playwright_crawler/pre_navigation_hook_example.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,12 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
1717
await context.enqueue_links()
1818

1919
@crawler.pre_navigation_hook
20-
async def log_navigation_url(context: PlaywrightPreNavCrawlingContext) -> None:
20+
async def configure_page(context: PlaywrightPreNavCrawlingContext) -> None:
2121
context.log.info(f'Navigating to {context.request.url} ...')
2222

23-
# will set a timeout for all navigation methods
24-
context.page.set_default_navigation_timeout(600_000)
25-
26-
# will set the page size before you go to the target URL
27-
await context.page.set_viewport_size({'width': 1280, 'height': 1024})
23+
# block stylesheets, images, fonts and other static assets
24+
# to speed up page loading
25+
await context.block_requests()
2826

2927
# Run the crawler with the initial list of URLs.
3028
await crawler.run(['https://crawlee.dev'])

docs/guides/playwright_crawler.mdx

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
1111
import MultipleLaunchExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/multiple_launch_example.py';
1212
import BrowserConfigurationExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_configuration_example.py';
1313
import PreNavigationExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/pre_navigation_hook_example.py';
14-
14+
import BrowserPoolPageHooksExample from '!!raw-loader!roa-loader!./code_examples/playwright_crawler/browser_pool_page_hooks_example.py';
1515
import PluginBrowserConfigExample from '!!raw-loader!./code_examples/playwright_crawler/plugin_browser_configuration_example.py';
1616

1717
A <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> is a browser-based crawler. In contrast to HTTP-based crawlers like <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> or <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>, it uses a real browser to render pages and extract data. It is built on top of the [Playwright](https://playwright.dev/python/) browser automation library. While browser-based crawlers are typically slower and less efficient than HTTP-based crawlers, they can handle dynamic, client-side rendered sites that standard HTTP-based crawlers cannot manage.
@@ -57,14 +57,22 @@ You can also configure each plugin used by <ApiLink to="class/BrowserPool">`Brow
5757

5858
For an example of how to implement a custom browser plugin, see the [Camoufox example](../examples/playwright-crawler-with-camoufox). [Camoufox](https://camoufox.com/) is a stealth browser plugin designed to reduce detection by anti-scraping measures and is fully compatible with <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>.
5959

60-
## Page configuration with pre-navigation hooks
60+
## Page configuration with page lifecycle hooks
61+
62+
For additional setup or event-driven actions around page creation and closure, the <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> exposes four lifecycle hooks: <ApiLink to="class/BrowserPool#pre_page_create_hook">`pre_page_create_hook`</ApiLink>, <ApiLink to="class/BrowserPool#post_page_create_hook">`post_page_create_hook`</ApiLink>, <ApiLink to="class/BrowserPool#pre_page_close_hook">`pre_page_close_hook`</ApiLink>, and <ApiLink to="class/BrowserPool#post_page_close_hook">`post_page_close_hook`</ApiLink>. To use them, create a `BrowserPool` instance and pass it to <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> via the `browser_pool` argument.
63+
64+
<RunnableCodeBlock className="language-python" language="python">
65+
{BrowserPoolPageHooksExample}
66+
</RunnableCodeBlock>
67+
68+
## Navigation hooks
6169

62-
In some use cases, you may need to configure the [page](https://playwright.dev/python/docs/api/class-page) before it navigates to the target URL. For instance, you might set navigation timeouts or manipulate other page-level settings. For such cases you can use the <ApiLink to="class/PlaywrightCrawler#pre_navigation_hook">`pre_navigation_hook`</ApiLink> method of the <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink>. This method is called before the page navigates to the target URL and allows you to configure the page instance.
70+
Navigation hooks allow for additional configuration at specific points during page navigation. For example, the <ApiLink to="class/PlaywrightCrawler#pre_navigation_hook">`pre_navigation_hook`</ApiLink> is called before each navigation and provides <ApiLink to="class/PlaywrightPreNavCrawlingContext">`PlaywrightPreNavCrawlingContext`</ApiLink> - including the [page](https://playwright.dev/python/docs/api/class-page) instance and a <ApiLink to="class/PlaywrightPreNavCrawlingContext#block_requests">`block_requests`</ApiLink> helper for filtering unwanted resource types and URL patterns. See the [block requests example](https://crawlee.dev/python/docs/examples/playwright-crawler-with-block-requests) for a dedicated walkthrough.
6371

6472
<RunnableCodeBlock className="language-python" language="python">
6573
{PreNavigationExample}
6674
</RunnableCodeBlock>
6775

6876
## Conclusion
6977

70-
This guide introduced the <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and explained how to configure it using <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> and <ApiLink to="class/PlaywrightBrowserPlugin">`PlaywrightBrowserPlugin`</ApiLink>. You learned how to launch multiple browsers, configure browser and context settings, and apply pre-navigation hooks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
78+
This guide introduced the <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and explained how to configure it using <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> and <ApiLink to="class/PlaywrightBrowserPlugin">`PlaywrightBrowserPlugin`</ApiLink>. You learned how to launch multiple browsers, configure browser and context settings, use <ApiLink to="class/BrowserPool">`BrowserPool`</ApiLink> page lifecycle hooks, and apply navigation hooks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!

src/crawlee/browsers/_browser_pool.py

Lines changed: 90 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from crawlee.browsers._types import BrowserType, CrawleePage
2020

2121
if TYPE_CHECKING:
22-
from collections.abc import Mapping, Sequence
22+
from collections.abc import Awaitable, Callable, Mapping, Sequence
2323
from pathlib import Path
2424
from types import TracebackType
2525

@@ -99,6 +99,13 @@ def __init__(
9999
self._pages = WeakValueDictionary[str, CrawleePage]() # Track the pages in the pool
100100
self._plugins_cycle = itertools.cycle(self._plugins) # Cycle through the plugins
101101

102+
self._pre_page_create_hooks: list[
103+
Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]
104+
] = []
105+
self._post_page_create_hooks: list[Callable[[CrawleePage, BrowserController], Awaitable[None]]] = []
106+
self._pre_page_close_hooks: list[Callable[[CrawleePage, BrowserController], Awaitable[None]]] = []
107+
self._post_page_close_hooks: list[Callable[[str, BrowserController], Awaitable[None]]] = []
108+
102109
# Flag to indicate the context state.
103110
self._active = False
104111

@@ -301,9 +308,15 @@ async def _get_new_page(
301308
try:
302309
if not browser_controller:
303310
browser_controller = await asyncio.wait_for(self._launch_new_browser(plugin), timeout)
311+
browser_new_context_options = dict(plugin.browser_new_context_options)
312+
313+
await self._execute_hooks(
314+
self._pre_page_create_hooks, page_id, browser_controller, browser_new_context_options, proxy_info
315+
)
316+
304317
page = await asyncio.wait_for(
305318
browser_controller.new_page(
306-
browser_new_context_options=plugin.browser_new_context_options,
319+
browser_new_context_options=browser_new_context_options,
307320
proxy_info=proxy_info,
308321
),
309322
timeout,
@@ -319,6 +332,11 @@ async def _get_new_page(
319332
crawlee_page = CrawleePage(id=page_id, page=page, browser_type=plugin.browser_type)
320333
self._pages[page_id] = crawlee_page
321334
self._total_pages_count += 1
335+
336+
await self._execute_hooks(self._post_page_create_hooks, crawlee_page, browser_controller)
337+
338+
self._override_page_close(crawlee_page, browser_controller)
339+
322340
return crawlee_page
323341

324342
def _pick_browser_with_free_capacity(
@@ -357,3 +375,73 @@ async def _close_inactive_browsers(self) -> None:
357375
if not browser.pages:
358376
await browser.close()
359377
self._inactive_browsers.remove(browser)
378+
379+
async def _execute_hooks(self, hooks: list[Callable[..., Awaitable[None]]], *args: Any) -> None:
380+
"""Execute the provided hooks with the given arguments."""
381+
for hook in hooks:
382+
await hook(*args)
383+
384+
def _override_page_close(self, crawlee_page: CrawleePage, browser_controller: BrowserController) -> None:
385+
"""Override the page's close method to execute pre and post close hooks."""
386+
if self._pre_page_close_hooks or self._post_page_close_hooks:
387+
original_close = crawlee_page.page.close
388+
389+
async def close_with_hooks(*args: Any, **kwargs: Any) -> None:
390+
try:
391+
await self._execute_hooks(self._pre_page_close_hooks, crawlee_page, browser_controller)
392+
finally:
393+
await original_close(*args, **kwargs)
394+
await self._execute_hooks(self._post_page_close_hooks, crawlee_page.id, browser_controller)
395+
396+
crawlee_page.page.close: Callable[..., Awaitable[None]] = close_with_hooks
397+
398+
def pre_page_create_hook(
    self, hook: Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]
) -> Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]:
    """Register a coroutine that runs just before a new page is created.

    The hook is awaited with the page ID, the `BrowserController`, a mutable
    `browser_new_context_options` dict, and the `ProxyInfo` (or `None`). Mutating the options
    dict lets the hook influence how the page context is configured. Depending on the
    `BrowserController` implementation, the options may not apply to every page individually;
    for example, `PlaywrightBrowserController` with ``use_incognito_pages=False`` shares one
    context across all pages, so the options take effect only when that context is first
    created.

    Usable as a decorator; the hook is returned unchanged.
    """
    self._pre_page_create_hooks.append(hook)
    return hook
412+
413+
def post_page_create_hook(
    self, hook: Callable[[CrawleePage, BrowserController], Awaitable[None]]
) -> Callable[[CrawleePage, BrowserController], Awaitable[None]]:
    """Register a coroutine that runs right after a new page is created.

    The hook is awaited with the fresh `CrawleePage` and its `BrowserController`. Typical
    uses are per-page setup steps such as injecting scripts or configuring request
    interception.

    Usable as a decorator; the hook is returned unchanged.
    """
    self._post_page_create_hooks.append(hook)
    return hook
424+
425+
def pre_page_close_hook(
    self, hook: Callable[[CrawleePage, BrowserController], Awaitable[None]]
) -> Callable[[CrawleePage, BrowserController], Awaitable[None]]:
    """Register a coroutine that runs just before a page is closed.

    The hook is awaited with the `CrawleePage` and its `BrowserController`, giving it a last
    chance to read page state — e.g. take a screenshot or save other data — before the page
    is destroyed.

    Usable as a decorator; the hook is returned unchanged.
    """
    self._pre_page_close_hooks.append(hook)
    return hook
436+
437+
def post_page_close_hook(
    self, hook: Callable[[str, BrowserController], Awaitable[None]]
) -> Callable[[str, BrowserController], Awaitable[None]]:
    """Register a coroutine that runs right after a page is closed.

    The hook is awaited with the closed page's ID and the `BrowserController`; use it for
    cleanup or logging once the page's lifecycle has ended.

    Usable as a decorator; the hook is returned unchanged.
    """
    self._post_page_close_hooks.append(hook)
    return hook

0 commit comments

Comments
 (0)