Skip to content

Commit 7eeb500

Browse files
authored
fix: Defer page object cleanup to make it accessible in error handlers (#1814)
- closes #1482
1 parent 9cff054 commit 7eeb500

File tree

8 files changed

+122
-73
lines changed

8 files changed

+122
-73
lines changed

src/crawlee/_types.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
import json
1515
import logging
1616
import re
17-
from collections.abc import Callable, Coroutine, Sequence
17+
from collections.abc import Awaitable, Coroutine, Sequence
1818

1919
from typing_extensions import NotRequired, Required, Self, Unpack
2020

@@ -39,6 +39,9 @@
3939

4040
HttpPayload = bytes
4141

42+
DeferredCleanupCallback = Callable[[], 'Awaitable[Any]']
43+
"""An async callback to be called after request processing completes (including error handlers)."""
44+
4245
RequestTransformAction = Literal['skip', 'unchanged']
4346

4447
EnqueueStrategy = Literal['all', 'same-domain', 'same-hostname', 'same-origin']
@@ -661,6 +664,9 @@ class BasicCrawlingContext:
661664
log: logging.Logger
662665
"""Logger instance."""
663666

667+
register_deferred_cleanup: Callable[[DeferredCleanupCallback], None]
668+
"""Register an async callback to be called after request processing completes (including error handlers)."""
669+
664670
async def get_snapshot(self) -> PageSnapshot:
665671
"""Get snapshot of crawled page."""
666672
return PageSnapshot()

src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,7 @@ async def get_input_state(
320320
get_key_value_store=result.get_key_value_store,
321321
use_state=use_state_function,
322322
log=context.log,
323+
register_deferred_cleanup=context.register_deferred_cleanup,
323324
)
324325

325326
try:

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1413,6 +1413,8 @@ async def __run_task_function(self) -> None:
14131413
proxy_info = await self._get_proxy_info(request, session)
14141414
result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store, request=request)
14151415

1416+
deferred_cleanup: list[Callable[[], Awaitable[None]]] = []
1417+
14161418
context = BasicCrawlingContext(
14171419
request=result.request,
14181420
session=session,
@@ -1423,6 +1425,7 @@ async def __run_task_function(self) -> None:
14231425
get_key_value_store=result.get_key_value_store,
14241426
use_state=self.use_state,
14251427
log=self._logger,
1428+
register_deferred_cleanup=deferred_cleanup.append,
14261429
)
14271430
self._context_result_map[context] = result
14281431

@@ -1509,6 +1512,13 @@ async def __run_task_function(self) -> None:
15091512
)
15101513
raise
15111514

1515+
finally:
1516+
for cleanup in deferred_cleanup:
1517+
try:
1518+
await cleanup()
1519+
except Exception: # noqa: PERF203
1520+
self._logger.exception('Error in deferred cleanup')
1521+
15121522
async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
15131523
context.request.state = RequestState.BEFORE_NAV
15141524
await self._context_pipeline(

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 62 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,7 @@ async def _open_page(
236236
proxy_info=context.proxy_info,
237237
get_key_value_store=context.get_key_value_store,
238238
log=context.log,
239+
register_deferred_cleanup=context.register_deferred_cleanup,
239240
page=crawlee_page.page,
240241
block_requests=partial(block_requests, page=crawlee_page.page),
241242
goto_options=GotoOptions(**self._goto_options),
@@ -296,63 +297,69 @@ async def _navigate(
296297
The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links,
297298
infinite_scroll and block_requests).
298299
"""
299-
async with context.page:
300-
if context.session:
301-
session_cookies = context.session.cookies.get_cookies_as_playwright_format()
302-
await self._update_cookies(context.page, session_cookies)
303-
304-
if context.request.headers:
305-
await context.page.set_extra_http_headers(context.request.headers.model_dump())
306-
# Navigate to the URL and get response.
307-
if context.request.method != 'GET':
308-
# Call the notification only once
309-
warnings.warn(
310-
'Using other request methods than GET or adding payloads has a high impact on performance'
311-
' in recent versions of Playwright. Use only when necessary.',
312-
category=UserWarning,
313-
stacklevel=2,
314-
)
300+
# Enter the page context manager, but defer its cleanup (page.close()) so the page stays open
301+
# during error handler execution.
302+
await context.page.__aenter__()
315303

316-
route_handler = self._prepare_request_interceptor(
317-
method=context.request.method,
318-
headers=context.request.headers,
319-
payload=context.request.payload,
320-
)
304+
context.register_deferred_cleanup(lambda: context.page.__aexit__(None, None, None))
321305

322-
# Set route_handler only for current request
323-
await context.page.route(context.request.url, route_handler)
306+
if context.session:
307+
session_cookies = context.session.cookies.get_cookies_as_playwright_format()
308+
await self._update_cookies(context.page, session_cookies)
309+
310+
if context.request.headers:
311+
await context.page.set_extra_http_headers(context.request.headers.model_dump())
312+
# Navigate to the URL and get response.
313+
if context.request.method != 'GET':
314+
# Call the notification only once
315+
warnings.warn(
316+
'Using other request methods than GET or adding payloads has a high impact on performance'
317+
' in recent versions of Playwright. Use only when necessary.',
318+
category=UserWarning,
319+
stacklevel=2,
320+
)
324321

325-
try:
326-
async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
327-
response = await context.page.goto(
328-
context.request.url, timeout=remaining_timeout.total_seconds() * 1000, **context.goto_options
329-
)
330-
context.request.state = RequestState.AFTER_NAV
331-
except playwright.async_api.TimeoutError as exc:
332-
raise asyncio.TimeoutError from exc
333-
334-
if response is None:
335-
raise SessionError(f'Failed to load the URL: {context.request.url}')
336-
337-
# Set the loaded URL to the actual URL after redirection.
338-
context.request.loaded_url = context.page.url
339-
340-
yield PlaywrightPostNavCrawlingContext(
341-
request=context.request,
342-
session=context.session,
343-
add_requests=context.add_requests,
344-
send_request=context.send_request,
345-
push_data=context.push_data,
346-
use_state=context.use_state,
347-
proxy_info=context.proxy_info,
348-
get_key_value_store=context.get_key_value_store,
349-
log=context.log,
350-
page=context.page,
351-
block_requests=context.block_requests,
352-
goto_options=context.goto_options,
353-
response=response,
322+
route_handler = self._prepare_request_interceptor(
323+
method=context.request.method,
324+
headers=context.request.headers,
325+
payload=context.request.payload,
354326
)
355327

328+
# Set route_handler only for current request
329+
await context.page.route(context.request.url, route_handler)
330+
331+
try:
332+
async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
333+
response = await context.page.goto(
334+
context.request.url, timeout=remaining_timeout.total_seconds() * 1000, **context.goto_options
335+
)
336+
context.request.state = RequestState.AFTER_NAV
337+
except playwright.async_api.TimeoutError as exc:
338+
raise asyncio.TimeoutError from exc
339+
340+
if response is None:
341+
raise SessionError(f'Failed to load the URL: {context.request.url}')
342+
343+
# Set the loaded URL to the actual URL after redirection.
344+
context.request.loaded_url = context.page.url
345+
346+
yield PlaywrightPostNavCrawlingContext(
347+
request=context.request,
348+
session=context.session,
349+
add_requests=context.add_requests,
350+
send_request=context.send_request,
351+
push_data=context.push_data,
352+
use_state=context.use_state,
353+
proxy_info=context.proxy_info,
354+
get_key_value_store=context.get_key_value_store,
355+
log=context.log,
356+
register_deferred_cleanup=context.register_deferred_cleanup,
357+
page=context.page,
358+
block_requests=context.block_requests,
359+
goto_options=context.goto_options,
360+
response=response,
361+
)
362+
356363
def _create_extract_links_function(self, context: PlaywrightPreNavCrawlingContext) -> ExtractLinksFunction:
357364
"""Create a callback function for extracting links from context.
358365
@@ -495,10 +502,10 @@ async def _execute_post_navigation_hooks(
495502

496503
async def _create_crawling_context(
497504
self, context: PlaywrightPostNavCrawlingContext
498-
) -> AsyncGenerator[PlaywrightCrawlingContext, Exception | None]:
505+
) -> AsyncGenerator[PlaywrightCrawlingContext, None]:
499506
extract_links = self._create_extract_links_function(context)
500507

501-
error = yield PlaywrightCrawlingContext(
508+
yield PlaywrightCrawlingContext(
502509
request=context.request,
503510
session=context.session,
504511
add_requests=context.add_requests,
@@ -508,6 +515,7 @@ async def _create_crawling_context(
508515
proxy_info=context.proxy_info,
509516
get_key_value_store=context.get_key_value_store,
510517
log=context.log,
518+
register_deferred_cleanup=context.register_deferred_cleanup,
511519
page=context.page,
512520
goto_options=context.goto_options,
513521
response=context.response,
@@ -521,10 +529,6 @@ async def _create_crawling_context(
521529
pw_cookies = await self._get_cookies(context.page)
522530
context.session.cookies.set_cookies_from_playwright_format(pw_cookies)
523531

524-
# Collect data in case of errors, before the page object is closed.
525-
if error:
526-
await self.statistics.error_tracker.add(error=error, context=context, early=True)
527-
528532
def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]) -> None:
529533
"""Register a hook to be called before each navigation.
530534

src/crawlee/statistics/_error_tracker.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -41,31 +41,19 @@ def __init__(
4141
raise ValueError('`show_error_message` must be `True` if `show_full_message` is set to `True`')
4242
self.show_full_message = show_full_message
4343
self._errors: ErrorFilenameGroups = defaultdict(lambda: defaultdict(Counter))
44-
self._early_reported_errors = set[int]()
4544

4645
async def add(
4746
self,
4847
error: Exception,
4948
*,
5049
context: BasicCrawlingContext | None = None,
51-
early: bool = False,
5250
) -> None:
5351
"""Add an error in the statistics.
5452
5553
Args:
5654
error: Error to be added to statistics.
5755
context: Context used to collect error snapshot.
58-
early: Flag indicating that the error is added earlier than usual to have access to resources that will be
59-
closed before normal error collection. This prevents double reporting during normal error collection.
6056
"""
61-
if id(error) in self._early_reported_errors:
62-
# Error had to be collected earlier before relevant resources are closed.
63-
self._early_reported_errors.remove(id(error))
64-
return
65-
66-
if early:
67-
self._early_reported_errors.add(id(error))
68-
6957
error_group_name = error.__class__.__name__ if self.show_error_name else None
7058
error_group_message = self._get_error_message(error)
7159
new_error_group_message = '' # In case of wildcard similarity match

tests/unit/crawlers/_basic/test_context_pipeline.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ async def test_calls_consumer_without_middleware() -> None:
4141
use_state=AsyncMock(),
4242
get_key_value_store=AsyncMock(),
4343
log=logging.getLogger(),
44+
register_deferred_cleanup=lambda _: None,
4445
)
4546

4647
await pipeline(context, consumer)
@@ -68,6 +69,7 @@ async def middleware_a(context: BasicCrawlingContext) -> AsyncGenerator[Enhanced
6869
use_state=AsyncMock(),
6970
get_key_value_store=AsyncMock(),
7071
log=logging.getLogger(),
72+
register_deferred_cleanup=context.register_deferred_cleanup,
7173
)
7274
events.append('middleware_a_out')
7375

@@ -85,6 +87,7 @@ async def middleware_b(context: EnhancedCrawlingContext) -> AsyncGenerator[MoreE
8587
use_state=AsyncMock(),
8688
get_key_value_store=AsyncMock(),
8789
log=logging.getLogger(),
90+
register_deferred_cleanup=context.register_deferred_cleanup,
8891
)
8992
events.append('middleware_b_out')
9093

@@ -100,6 +103,7 @@ async def middleware_b(context: EnhancedCrawlingContext) -> AsyncGenerator[MoreE
100103
use_state=AsyncMock(),
101104
get_key_value_store=AsyncMock(),
102105
log=logging.getLogger(),
106+
register_deferred_cleanup=lambda _: None,
103107
)
104108
await pipeline(context, consumer)
105109

@@ -126,6 +130,7 @@ async def test_wraps_consumer_errors() -> None:
126130
use_state=AsyncMock(),
127131
get_key_value_store=AsyncMock(),
128132
log=logging.getLogger(),
133+
register_deferred_cleanup=lambda _: None,
129134
)
130135

131136
with pytest.raises(RequestHandlerError):
@@ -155,6 +160,7 @@ async def step_2(context: BasicCrawlingContext) -> AsyncGenerator[BasicCrawlingC
155160
use_state=AsyncMock(),
156161
get_key_value_store=AsyncMock(),
157162
log=logging.getLogger(),
163+
register_deferred_cleanup=lambda _: None,
158164
)
159165

160166
with pytest.raises(ContextPipelineInitializationError):
@@ -187,6 +193,7 @@ async def step_2(context: BasicCrawlingContext) -> AsyncGenerator[BasicCrawlingC
187193
use_state=AsyncMock(),
188194
get_key_value_store=AsyncMock(),
189195
log=logging.getLogger(),
196+
register_deferred_cleanup=lambda _: None,
190197
)
191198

192199
with pytest.raises(ContextPipelineFinalizationError):

tests/unit/crawlers/_playwright/test_playwright_crawler.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,10 @@
2121
service_locator,
2222
)
2323
from crawlee.configuration import Configuration
24-
from crawlee.crawlers import PlaywrightCrawler
24+
from crawlee.crawlers import (
25+
PlaywrightCrawler,
26+
PlaywrightCrawlingContext,
27+
)
2528
from crawlee.fingerprint_suite import (
2629
DefaultFingerprintGenerator,
2730
FingerprintGenerator,
@@ -49,7 +52,6 @@
4952
from crawlee.browsers._types import BrowserType
5053
from crawlee.crawlers import (
5154
BasicCrawlingContext,
52-
PlaywrightCrawlingContext,
5355
PlaywrightPostNavCrawlingContext,
5456
PlaywrightPreNavCrawlingContext,
5557
)
@@ -1203,3 +1205,33 @@ async def post_nav_hook_2(_context: PlaywrightPostNavCrawlingContext) -> None:
12031205
'post-navigation-hook 2',
12041206
'final handler',
12051207
]
1208+
1209+
1210+
async def test_error_handler_can_access_page(server_url: URL) -> None:
1211+
"""Test that the error handler can access the Page object via PlaywrightCrawlingContext."""
1212+
1213+
crawler = PlaywrightCrawler(max_request_retries=2)
1214+
1215+
request_handler = mock.AsyncMock(side_effect=RuntimeError('Intentional crash'))
1216+
crawler.router.default_handler(request_handler)
1217+
1218+
error_handler_calls: list[str | None] = []
1219+
1220+
@crawler.error_handler
1221+
async def error_handler(context: BasicCrawlingContext | PlaywrightCrawlingContext, _error: Exception) -> None:
1222+
error_handler_calls.append(
1223+
await context.page.content() if isinstance(context, PlaywrightCrawlingContext) else None
1224+
)
1225+
1226+
failed_handler_calls: list[str | None] = []
1227+
1228+
@crawler.failed_request_handler
1229+
async def failed_handler(context: BasicCrawlingContext | PlaywrightCrawlingContext, _error: Exception) -> None:
1230+
failed_handler_calls.append(
1231+
await context.page.content() if isinstance(context, PlaywrightCrawlingContext) else None
1232+
)
1233+
1234+
await crawler.run([str(server_url / 'hello-world')])
1235+
1236+
assert error_handler_calls == [HELLO_WORLD.decode(), HELLO_WORLD.decode()]
1237+
assert failed_handler_calls == [HELLO_WORLD.decode()]

tests/unit/test_router.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ def __init__(self, *, label: str | None) -> None:
2323
use_state=AsyncMock(),
2424
get_key_value_store=AsyncMock(),
2525
log=logging.getLogger(),
26+
register_deferred_cleanup=lambda _: None,
2627
)
2728

2829

0 commit comments

Comments
 (0)