Skip to content

Commit a07ea34

Browse files
committed
add hooks
1 parent 33b2559 commit a07ea34

2 files changed

Lines changed: 165 additions & 3 deletions

File tree

src/crawlee/browsers/_browser_pool.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ def __init__(
100100
self._plugins_cycle = itertools.cycle(self._plugins) # Cycle through the plugins
101101

102102
self._pre_page_create_hooks: list[
103-
Callable[[str, BrowserController, Mapping[str, Any], ProxyInfo | None], Awaitable[None]]
103+
Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]
104104
] = []
105105
self._post_page_create_hooks: list[Callable[[CrawleePage, BrowserController], Awaitable[None]]] = []
106106
self._pre_page_close_hooks: list[Callable[[CrawleePage, BrowserController], Awaitable[None]]] = []
@@ -308,7 +308,7 @@ async def _get_new_page(
308308
try:
309309
if not browser_controller:
310310
browser_controller = await asyncio.wait_for(self._launch_new_browser(plugin), timeout)
311-
browser_new_context_options = plugin.browser_new_context_options
311+
browser_new_context_options = dict(plugin.browser_new_context_options)
312312

313313
await self._execute_hooks(
314314
self._pre_page_create_hooks, page_id, browser_controller, browser_new_context_options, proxy_info
@@ -394,7 +394,7 @@ async def close_with_hooks(*args: Any, **kwargs: Any) -> None:
394394
crawlee_page.page.close: Callable[..., Awaitable[None]] = close_with_hooks
395395

396396
def pre_page_create_hook(
397-
self, hook: Callable[[str, BrowserController, Mapping[str, Any], ProxyInfo | None], Awaitable[None]]
397+
self, hook: Callable[[str, BrowserController, dict[str, Any], ProxyInfo | None], Awaitable[None]]
398398
) -> None:
399399
"""Register a hook to be called just before a new page is created.
400400

tests/unit/browsers/test_browser_pool.py

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,23 @@
11
from __future__ import annotations
22

33
from typing import TYPE_CHECKING
4+
from unittest.mock import AsyncMock
45

56
import pytest
67

78
from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin
9+
from crawlee.browsers._browser_controller import BrowserController
10+
from crawlee.browsers._types import CrawleePage
811
from tests.unit.utils import run_alone_on_mac
912

1013
if TYPE_CHECKING:
14+
from collections.abc import Mapping
15+
from typing import Any
16+
1117
from yarl import URL
1218

19+
from crawlee.proxy_configuration import ProxyInfo
20+
1321

1422
async def test_default_plugin_new_page_creation(server_url: URL) -> None:
1523
async with BrowserPool() as browser_pool:
@@ -192,3 +200,157 @@ async def test_browser_pool_retire_browser_after_page_count(
192200
assert first_browser is second_browser
193201
else:
194202
assert first_browser is not second_browser
203+
204+
205+
async def test_pre_page_create_hook_is_called() -> None:
    """Check that a pre-page-create hook is awaited once and can alter the new-context options.

    The hook overrides the `user_agent` option; the test then reads `navigator.userAgent`
    from the created page to confirm that the override reached the browser context.
    """
    hook_spy = AsyncMock()

    async with BrowserPool() as pool:

        @pool.pre_page_create_hook
        async def hook(
            page_id: str,
            controller: BrowserController,
            browser_new_context_options: dict[str, Any],
            proxy_info: ProxyInfo | None,
        ) -> None:
            # Record the exact arguments so they can be inspected after the page is created.
            await hook_spy(page_id, controller, browser_new_context_options, proxy_info)

            # Mutate the options in place; the created page is expected to pick this up.
            browser_new_context_options['user_agent'] = 'Modified User-Agent'

            # The hook runs before page creation, so the test expects no pages yet.
            assert len(controller.pages) == 0

        created_page = await pool.new_page()
        reported_user_agent = await created_page.page.evaluate('navigator.userAgent')

        await created_page.page.close()

        assert reported_user_agent == 'Modified User-Agent'

        hook_spy.assert_awaited_once()
        seen_page_id, seen_controller, _, seen_proxy_info = hook_spy.call_args.args

        assert isinstance(seen_page_id, str)
        assert created_page.id == seen_page_id
        assert isinstance(seen_controller, BrowserController)
        assert seen_proxy_info is None
238+
239+
async def test_post_page_create_hook_is_called() -> None:
    """Check that a post-page-create hook is awaited once with the freshly created page.

    The hook sets a JavaScript flag on the page; reading the flag back afterwards confirms
    that the hook operated on the same page that `new_page()` returned.
    """
    hook_spy = AsyncMock()

    async with BrowserPool() as pool:

        @pool.post_page_create_hook
        async def hook(crawlee_page: CrawleePage, controller: BrowserController) -> None:
            await hook_spy(crawlee_page, controller)
            # Leave a marker on the page that the test can read back later.
            await crawlee_page.page.evaluate('window.__hook_applied = true')

            assert isinstance(crawlee_page, CrawleePage)

            # The test expects the new page to be registered on the controller by now.
            assert len(controller.pages) == 1

        created_page = await pool.new_page()

        marker = await created_page.page.evaluate('window.__hook_applied')

        await created_page.page.close()

        assert marker is True

        hook_spy.assert_awaited_once()
        seen_page, seen_controller = hook_spy.call_args.args

        assert created_page is seen_page
        assert isinstance(seen_controller, BrowserController)
266+
267+
268+
async def test_pre_page_close_hook() -> None:
    """Check that a pre-page-close hook is awaited once, while the page is still open."""
    hook_spy = AsyncMock()

    async with BrowserPool() as pool:

        @pool.pre_page_close_hook
        async def hook(crawlee_page: CrawleePage, controller: BrowserController) -> None:
            await hook_spy(crawlee_page, controller)

            # At this point the close has been requested but must not have happened yet.
            assert not crawlee_page.page.is_closed()
            assert len(controller.pages) == 1

        opened_page = await pool.new_page()
        await opened_page.page.close()

        hook_spy.assert_awaited_once()
        # After the hook has run, the page itself must really be closed.
        assert opened_page.page.is_closed()
285+
286+
287+
async def test_post_page_close_hook() -> None:
    """Check that a post-page-close hook is awaited once with the closed page's ID.

    The hook asserts that the controller no longer tracks any pages, and the test then
    verifies that the reported page ID matches the page that was closed.
    """
    call_mock = AsyncMock()

    async with BrowserPool() as browser_pool:

        @browser_pool.post_page_close_hook
        async def hook(page_id: str, controller: BrowserController) -> None:
            await call_mock(page_id, controller)

            # The hook runs after the close, so the test expects no pages to remain.
            assert len(controller.pages) == 0

        test_page = await browser_pool.new_page()
        await test_page.page.close()

        # Assert the await count BEFORE unpacking `call_args`: if the hook never ran,
        # `call_args` is None and unpacking would raise an opaque TypeError instead of
        # a clear assertion failure.
        call_mock.assert_awaited_once()
        page_id, controller = call_mock.call_args[0]

        assert test_page.id == page_id
        assert isinstance(controller, BrowserController)
306+
307+
308+
async def test_page_hooks_execution_order() -> None:
    """Check that the four page lifecycle hooks fire in create/close order.

    Each hook appends its own label to a shared list; after one page is created and
    closed, the list must read pre-create, post-create, pre-close, post-close.
    """
    observed: list[str] = []

    async with BrowserPool() as pool:

        @pool.pre_page_create_hook
        async def pre_create(
            _page_id: str,
            _controller: BrowserController,
            # This hook never mutates the options, so the read-only `Mapping` view suffices.
            _browser_new_context_options: Mapping[str, Any],
            _proxy_info: ProxyInfo | None,
        ) -> None:
            observed.append('pre_create')

        @pool.post_page_create_hook
        async def post_create(_crawlee_page: CrawleePage, _controller: BrowserController) -> None:
            observed.append('post_create')

        @pool.pre_page_close_hook
        async def pre_close(_crawlee_page: CrawleePage, _controller: BrowserController) -> None:
            observed.append('pre_close')

        @pool.post_page_close_hook
        async def post_close(_page_id: str, _controller: BrowserController) -> None:
            observed.append('post_close')

        # One full page lifecycle: create, then close.
        lifecycle_page = await pool.new_page()
        await lifecycle_page.page.close()

        expected_sequence = ['pre_create', 'post_create', 'pre_close', 'post_close']
        assert observed == expected_sequence
338+
339+
340+
async def test_multiple_hooks_all_called() -> None:
    """Check that several hooks of the same kind all run, in registration order."""
    observed: list[str] = []

    async with BrowserPool() as pool:

        @pool.post_page_create_hook
        async def first(_crawlee_page: CrawleePage, _controller: BrowserController) -> None:
            observed.append('first')

        @pool.post_page_create_hook
        async def second(_crawlee_page: CrawleePage, _controller: BrowserController) -> None:
            observed.append('second')

        # Creating a single page should trigger both post-create hooks.
        created_page = await pool.new_page()
        await created_page.page.close()

        assert observed == ['first', 'second']

0 commit comments

Comments
 (0)