@@ -236,6 +236,7 @@ async def _open_page(
236236 proxy_info = context .proxy_info ,
237237 get_key_value_store = context .get_key_value_store ,
238238 log = context .log ,
239+ register_deferred_cleanup = context .register_deferred_cleanup ,
239240 page = crawlee_page .page ,
240241 block_requests = partial (block_requests , page = crawlee_page .page ),
241242 goto_options = GotoOptions (** self ._goto_options ),
@@ -296,63 +297,69 @@ async def _navigate(
296297 The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links,
297298 infinite_scroll and block_requests).
298299 """
299- async with context .page :
300- if context .session :
301- session_cookies = context .session .cookies .get_cookies_as_playwright_format ()
302- await self ._update_cookies (context .page , session_cookies )
303-
304- if context .request .headers :
305- await context .page .set_extra_http_headers (context .request .headers .model_dump ())
306- # Navigate to the URL and get response.
307- if context .request .method != 'GET' :
308- # Call the notification only once
309- warnings .warn (
310- 'Using other request methods than GET or adding payloads has a high impact on performance'
311- ' in recent versions of Playwright. Use only when necessary.' ,
312- category = UserWarning ,
313- stacklevel = 2 ,
314- )
300+ # Enter the page context manager, but defer its cleanup (page.close()) so the page stays open
301+ # during error handler execution.
302+ await context .page .__aenter__ ()
315303
316- route_handler = self ._prepare_request_interceptor (
317- method = context .request .method ,
318- headers = context .request .headers ,
319- payload = context .request .payload ,
320- )
304+ context .register_deferred_cleanup (lambda : context .page .__aexit__ (None , None , None ))
321305
322- # Set route_handler only for current request
323- await context .page .route (context .request .url , route_handler )
306+ if context .session :
307+ session_cookies = context .session .cookies .get_cookies_as_playwright_format ()
308+ await self ._update_cookies (context .page , session_cookies )
309+
310+ if context .request .headers :
311+ await context .page .set_extra_http_headers (context .request .headers .model_dump ())
312+ # Navigate to the URL and get response.
313+ if context .request .method != 'GET' :
314+ # Warn about the performance cost; Python's default warning filter reports it only once per location.
315+ warnings .warn (
316+ 'Using other request methods than GET or adding payloads has a high impact on performance'
317+ ' in recent versions of Playwright. Use only when necessary.' ,
318+ category = UserWarning ,
319+ stacklevel = 2 ,
320+ )
324321
325- try :
326- async with self ._shared_navigation_timeouts [id (context )] as remaining_timeout :
327- response = await context .page .goto (
328- context .request .url , timeout = remaining_timeout .total_seconds () * 1000 , ** context .goto_options
329- )
330- context .request .state = RequestState .AFTER_NAV
331- except playwright .async_api .TimeoutError as exc :
332- raise asyncio .TimeoutError from exc
333-
334- if response is None :
335- raise SessionError (f'Failed to load the URL: { context .request .url } ' )
336-
337- # Set the loaded URL to the actual URL after redirection.
338- context .request .loaded_url = context .page .url
339-
340- yield PlaywrightPostNavCrawlingContext (
341- request = context .request ,
342- session = context .session ,
343- add_requests = context .add_requests ,
344- send_request = context .send_request ,
345- push_data = context .push_data ,
346- use_state = context .use_state ,
347- proxy_info = context .proxy_info ,
348- get_key_value_store = context .get_key_value_store ,
349- log = context .log ,
350- page = context .page ,
351- block_requests = context .block_requests ,
352- goto_options = context .goto_options ,
353- response = response ,
322+ route_handler = self ._prepare_request_interceptor (
323+ method = context .request .method ,
324+ headers = context .request .headers ,
325+ payload = context .request .payload ,
354326 )
355327
328+ # Set route_handler only for current request
329+ await context .page .route (context .request .url , route_handler )
330+
331+ try :
332+ async with self ._shared_navigation_timeouts [id (context )] as remaining_timeout :
333+ response = await context .page .goto (
334+ context .request .url , timeout = remaining_timeout .total_seconds () * 1000 , ** context .goto_options
335+ )
336+ context .request .state = RequestState .AFTER_NAV
337+ except playwright .async_api .TimeoutError as exc :
338+ raise asyncio .TimeoutError from exc
339+
340+ if response is None :
341+ raise SessionError (f'Failed to load the URL: { context .request .url } ' )
342+
343+ # Set the loaded URL to the actual URL after redirection.
344+ context .request .loaded_url = context .page .url
345+
346+ yield PlaywrightPostNavCrawlingContext (
347+ request = context .request ,
348+ session = context .session ,
349+ add_requests = context .add_requests ,
350+ send_request = context .send_request ,
351+ push_data = context .push_data ,
352+ use_state = context .use_state ,
353+ proxy_info = context .proxy_info ,
354+ get_key_value_store = context .get_key_value_store ,
355+ log = context .log ,
356+ register_deferred_cleanup = context .register_deferred_cleanup ,
357+ page = context .page ,
358+ block_requests = context .block_requests ,
359+ goto_options = context .goto_options ,
360+ response = response ,
361+ )
362+
356363 def _create_extract_links_function (self , context : PlaywrightPreNavCrawlingContext ) -> ExtractLinksFunction :
357364 """Create a callback function for extracting links from context.
358365
@@ -495,10 +502,10 @@ async def _execute_post_navigation_hooks(
495502
496503 async def _create_crawling_context (
497504 self , context : PlaywrightPostNavCrawlingContext
498- ) -> AsyncGenerator [PlaywrightCrawlingContext , Exception | None ]:
505+ ) -> AsyncGenerator [PlaywrightCrawlingContext , None ]:
499506 extract_links = self ._create_extract_links_function (context )
500507
501- error = yield PlaywrightCrawlingContext (
508+ yield PlaywrightCrawlingContext (
502509 request = context .request ,
503510 session = context .session ,
504511 add_requests = context .add_requests ,
@@ -508,6 +515,7 @@ async def _create_crawling_context(
508515 proxy_info = context .proxy_info ,
509516 get_key_value_store = context .get_key_value_store ,
510517 log = context .log ,
518+ register_deferred_cleanup = context .register_deferred_cleanup ,
511519 page = context .page ,
512520 goto_options = context .goto_options ,
513521 response = context .response ,
@@ -521,10 +529,6 @@ async def _create_crawling_context(
521529 pw_cookies = await self ._get_cookies (context .page )
522530 context .session .cookies .set_cookies_from_playwright_format (pw_cookies )
523531
524- # Collect data in case of errors, before the page object is closed.
525- if error :
526- await self .statistics .error_tracker .add (error = error , context = context , early = True )
527-
528532 def pre_navigation_hook (self , hook : Callable [[PlaywrightPreNavCrawlingContext ], Awaitable [None ]]) -> None :
529533 """Register a hook to be called before each navigation.
530534
0 commit comments