Skip to content

Commit 38ceda6

Browse files
authored
feat: Add post_navigation_hooks to crawlers (#1795)
### Description

- Add `post_navigation_hooks` that run after navigation.

### Issues

- Relates: #1741

### Testing

- Tests for navigation hooks have been added and updated.
1 parent 76b21c4 commit 38ceda6

File tree

12 files changed

+462
-60
lines changed

12 files changed

+462
-60
lines changed

src/crawlee/crawlers/__init__.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,25 @@
1515
with _try_import(__name__, 'ParselCrawler', 'ParselCrawlingContext'):
1616
from ._parsel import ParselCrawler, ParselCrawlingContext
1717

18-
with _try_import(__name__, 'PlaywrightCrawler', 'PlaywrightCrawlingContext', 'PlaywrightPreNavCrawlingContext'):
19-
from ._playwright import PlaywrightCrawler, PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext
18+
with _try_import(
19+
__name__,
20+
'PlaywrightCrawler',
21+
'PlaywrightCrawlingContext',
22+
'PlaywrightPostNavCrawlingContext',
23+
'PlaywrightPreNavCrawlingContext',
24+
):
25+
from ._playwright import (
26+
PlaywrightCrawler,
27+
PlaywrightCrawlingContext,
28+
PlaywrightPostNavCrawlingContext,
29+
PlaywrightPreNavCrawlingContext,
30+
)
2031

2132
with _try_import(
2233
__name__,
2334
'AdaptivePlaywrightCrawler',
2435
'AdaptivePlaywrightCrawlingContext',
36+
'AdaptivePlaywrightPostNavCrawlingContext',
2537
'AdaptivePlaywrightPreNavCrawlingContext',
2638
'AdaptivePlaywrightCrawlerStatisticState',
2739
'RenderingType',
@@ -32,6 +44,7 @@
3244
AdaptivePlaywrightCrawler,
3345
AdaptivePlaywrightCrawlerStatisticState,
3446
AdaptivePlaywrightCrawlingContext,
47+
AdaptivePlaywrightPostNavCrawlingContext,
3548
AdaptivePlaywrightPreNavCrawlingContext,
3649
RenderingType,
3750
RenderingTypePrediction,
@@ -45,6 +58,7 @@
4558
'AdaptivePlaywrightCrawler',
4659
'AdaptivePlaywrightCrawlerStatisticState',
4760
'AdaptivePlaywrightCrawlingContext',
61+
'AdaptivePlaywrightPostNavCrawlingContext',
4862
'AdaptivePlaywrightPreNavCrawlingContext',
4963
'BasicCrawler',
5064
'BasicCrawlerOptions',
@@ -62,6 +76,7 @@
6276
'ParselCrawlingContext',
6377
'PlaywrightCrawler',
6478
'PlaywrightCrawlingContext',
79+
'PlaywrightPostNavCrawlingContext',
6580
'PlaywrightPreNavCrawlingContext',
6681
'RenderingType',
6782
'RenderingTypePrediction',

src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ def __init__(
7777
self._parser = parser
7878
self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
7979
self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
80+
self._post_navigation_hooks: list[Callable[[HttpCrawlingContext], Awaitable[None]]] = []
8081
self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
8182

8283
if '_context_pipeline' not in kwargs:
@@ -120,6 +121,7 @@ def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpC
120121
ContextPipeline()
121122
.compose(self._execute_pre_navigation_hooks)
122123
.compose(self._make_http_request)
124+
.compose(self._execute_post_navigation_hooks)
123125
.compose(self._handle_status_code_response)
124126
.compose(self._parse_http_response)
125127
.compose(self._handle_blocked_request_by_content)
@@ -140,6 +142,14 @@ async def _execute_pre_navigation_hooks(
140142
finally:
141143
self._shared_navigation_timeouts.pop(context_id, None)
142144

145+
async def _execute_post_navigation_hooks(
    self, context: HttpCrawlingContext
) -> AsyncGenerator[HttpCrawlingContext, None]:
    """Run the post-navigation hooks and then pass the context on.

    Every callable in `self._post_navigation_hooks` is awaited one after another, in list order, with the same
    `context`. An exception raised by a hook propagates and the context is not yielded.

    Args:
        context: The crawling context handed to each hook.

    Yields:
        The `context` that was passed in, once all hooks have completed.
    """
    registered_hooks = self._post_navigation_hooks
    for run_hook in registered_hooks:
        await run_hook(context)

    yield context
152+
143153
async def _parse_http_response(
144154
self, context: HttpCrawlingContext
145155
) -> AsyncGenerator[ParsedHttpCrawlingContext[TParseResult], None]:
@@ -311,3 +321,11 @@ def pre_navigation_hook(self, hook: Callable[[BasicCrawlingContext], Awaitable[N
311321
hook: A coroutine function to be called before each navigation.
312322
"""
313323
self._pre_navigation_hooks.append(hook)
324+
325+
def post_navigation_hook(self, hook: Callable[[HttpCrawlingContext], Awaitable[None]]) -> None:
    """Add a callback to the list of hooks called after each navigation.

    Args:
        hook: A coroutine function that accepts the `HttpCrawlingContext` and is called after each navigation.
    """
    hook_registry = self._post_navigation_hooks
    hook_registry.append(hook)

src/crawlee/crawlers/_adaptive_playwright/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
# These imports have only mandatory dependencies, so they are imported directly.
55
from ._adaptive_playwright_crawling_context import (
66
AdaptivePlaywrightCrawlingContext,
7+
AdaptivePlaywrightPostNavCrawlingContext,
78
AdaptivePlaywrightPreNavCrawlingContext,
89
)
910

@@ -22,6 +23,7 @@
2223
'AdaptivePlaywrightCrawler',
2324
'AdaptivePlaywrightCrawlerStatisticState',
2425
'AdaptivePlaywrightCrawlingContext',
26+
'AdaptivePlaywrightPostNavCrawlingContext',
2527
'AdaptivePlaywrightPreNavCrawlingContext',
2628
'RenderingType',
2729
'RenderingTypePrediction',

src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,11 @@
2020
AbstractHttpParser,
2121
BasicCrawler,
2222
BeautifulSoupParserType,
23+
HttpCrawlingContext,
2324
ParsedHttpCrawlingContext,
2425
PlaywrightCrawler,
2526
PlaywrightCrawlingContext,
27+
PlaywrightPostNavCrawlingContext,
2628
PlaywrightPreNavCrawlingContext,
2729
)
2830
from crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser
@@ -33,6 +35,7 @@
3335
from ._adaptive_playwright_crawler_statistics import AdaptivePlaywrightCrawlerStatisticState
3436
from ._adaptive_playwright_crawling_context import (
3537
AdaptivePlaywrightCrawlingContext,
38+
AdaptivePlaywrightPostNavCrawlingContext,
3639
AdaptivePlaywrightPreNavCrawlingContext,
3740
)
3841
from ._rendering_type_predictor import DefaultRenderingTypePredictor, RenderingType, RenderingTypePredictor
@@ -196,6 +199,25 @@ async def adaptive_pre_navigation_hook_pw(context: PlaywrightPreNavCrawlingConte
196199
static_crawler.pre_navigation_hook(adaptive_pre_navigation_hook_static)
197200
playwright_crawler.pre_navigation_hook(adaptive_pre_navigation_hook_pw)
198201

202+
# Register post navigation hooks on sub crawlers
203+
self._post_navigation_hooks = list[Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]]()
204+
self._post_navigation_hooks_pw_only = list[
205+
Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]
206+
]()
207+
208+
async def adaptive_post_navigation_hook_static(context: HttpCrawlingContext) -> None:
209+
adaptive_context = await AdaptivePlaywrightPostNavCrawlingContext.from_post_navigation_context(context)
210+
for hook in self._post_navigation_hooks:
211+
await hook(adaptive_context)
212+
213+
async def adaptive_post_navigation_hook_pw(context: PlaywrightPostNavCrawlingContext) -> None:
214+
adaptive_context = await AdaptivePlaywrightPostNavCrawlingContext.from_post_navigation_context(context)
215+
for hook in self._post_navigation_hooks + self._post_navigation_hooks_pw_only:
216+
await hook(adaptive_context)
217+
218+
static_crawler.post_navigation_hook(adaptive_post_navigation_hook_static)
219+
playwright_crawler.post_navigation_hook(adaptive_post_navigation_hook_pw)
220+
199221
self._additional_context_managers = [
200222
*self._additional_context_managers,
201223
self.rendering_type_predictor,
@@ -437,6 +459,32 @@ def register_hooks(hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awa
437459
# Return parametrized decorator that will be executed through decorator syntax if called with parameter.
438460
return register_hooks
439461

462+
def post_navigation_hook(
    self,
    hook: Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]] | None = None,
    *,
    playwright_only: bool = False,
) -> Callable[[Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]], None]:
    """Post navigation hooks for adaptive crawler are delegated to sub crawlers.

    Optionally parametrized decorator.
    Hooks are wrapped in context that handles possibly missing `page` and `response` objects by raising
    `AdaptiveContextError`.

    Args:
        hook: The hook to register. When given, it is registered immediately; when omitted, nothing is
            registered until the returned function is called with a hook.
        playwright_only: If True, the hook is stored in `self._post_navigation_hooks_pw_only`; otherwise it is
            stored in `self._post_navigation_hooks`.

    Returns:
        A function that registers the hook passed to it, honouring `playwright_only`.
    """

    def register_hooks(hook: Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]) -> None:
        target = self._post_navigation_hooks_pw_only if playwright_only else self._post_navigation_hooks
        target.append(hook)

    # No parameter in decorator. Execute directly.
    # Compare against None explicitly so that a callable object which happens to be falsy is still registered.
    if hook is not None:
        register_hooks(hook)

    # Return parametrized decorator that will be executed through decorator syntax if called with parameter.
    return register_hooks
487+
440488
def track_http_only_request_handler_runs(self) -> None:
441489
self.statistics.state.http_only_request_handler_runs += 1
442490

src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
from crawlee._types import BasicCrawlingContext
1010
from crawlee._utils.docs import docs_group
1111
from crawlee.crawlers import AbstractHttpParser, ParsedHttpCrawlingContext, PlaywrightCrawlingContext
12+
from crawlee.crawlers._abstract_http._http_crawling_context import HttpCrawlingContext
13+
from crawlee.crawlers._playwright._playwright_post_nav_crawling_context import PlaywrightPostNavCrawlingContext
1214
from crawlee.crawlers._playwright._types import PlaywrightHttpResponse
1315

1416
if TYPE_CHECKING:
@@ -186,7 +188,7 @@ async def from_playwright_crawling_context(
186188
context_kwargs['_page'] = context_kwargs.pop('page')
187189
context_kwargs['_infinite_scroll'] = context_kwargs.pop('infinite_scroll')
188190
# This might not be always available.
189-
protocol_guess = await context_kwargs['_page'].evaluate('() => performance.getEntries()[0].nextHopProtocol')
191+
protocol_guess = await context_kwargs['_page'].evaluate('() => performance.getEntries()[0]?.nextHopProtocol')
190192
http_response = await PlaywrightHttpResponse.from_playwright_response(
191193
response=context.response, protocol=protocol_guess or ''
192194
)
@@ -245,3 +247,58 @@ async def dummy_block_requests(
245247

246248
context_kwargs['block_requests'] = context_kwargs.pop('block_requests', dummy_block_requests)
247249
return cls(**context_kwargs)
250+
251+
252+
@dataclass(frozen=True)
@docs_group('Crawling contexts')
class AdaptivePlaywrightPostNavCrawlingContext(HttpCrawlingContext):
    """A wrapper around HttpCrawlingContext or PlaywrightPostNavCrawlingContext.

    Trying to access `page` or `response` on this context will raise AdaptiveContextError if wrapped context is
    HttpCrawlingContext.
    """

    # Playwright objects taken from the wrapped context; they stay `None` when the wrapped context has no
    # `page` / `response` field.
    _page: Page | None = None
    _response: Response | None = None

    @property
    def page(self) -> Page:
        """The Playwright `Page` object for the current page.

        Raises `AdaptiveContextError` if accessed during static crawling.
        """
        if self._page is None:
            raise AdaptiveContextError('Page was not crawled with PlaywrightCrawler.')
        return self._page

    @property
    def response(self) -> Response:
        """The Playwright `Response` object containing the response details for the current URL.

        Raises `AdaptiveContextError` if accessed during static crawling.
        """
        if self._response is None:
            raise AdaptiveContextError('Response was not crawled with PlaywrightCrawler.')
        return self._response

    @classmethod
    async def from_post_navigation_context(
        cls, context: HttpCrawlingContext | PlaywrightPostNavCrawlingContext
    ) -> Self:
        """Initialize a new instance from an existing post-navigation context.

        Args:
            context: The post-navigation context whose dataclass fields are copied into the new instance.

        Returns:
            A new instance holding the copied fields, with `page` and `response` moved to the private
            `_page` and `_response` fields.
        """
        context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)}

        # Store the Playwright objects under private names so the guarded properties control access to them.
        context_kwargs['_page'] = context_kwargs.pop('page', None)
        context_kwargs['_response'] = context_kwargs.pop('response', None)

        # block_requests and goto_options are useful only on pre-navigation contexts.
        context_kwargs.pop('block_requests', None)
        context_kwargs.pop('goto_options', None)

        if isinstance(context, PlaywrightPostNavCrawlingContext):
            # The first performance entry might not exist, hence the optional chaining and the '' fallback.
            protocol_guess = await context_kwargs['_page'].evaluate(
                '() => performance.getEntries()[0]?.nextHopProtocol'
            )
            context_kwargs['http_response'] = await PlaywrightHttpResponse.from_playwright_response(
                response=context.response, protocol=protocol_guess or ''
            )
        return cls(**context_kwargs)

src/crawlee/crawlers/_playwright/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,12 @@
1111
from ._playwright_crawling_context import PlaywrightCrawlingContext
1212
with _try_import(__name__, 'PlaywrightPreNavCrawlingContext'):
1313
from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext
14+
with _try_import(__name__, 'PlaywrightPostNavCrawlingContext'):
15+
from ._playwright_post_nav_crawling_context import PlaywrightPostNavCrawlingContext
1416

1517
__all__ = [
1618
'PlaywrightCrawler',
1719
'PlaywrightCrawlingContext',
20+
'PlaywrightPostNavCrawlingContext',
1821
'PlaywrightPreNavCrawlingContext',
1922
]

0 commit comments

Comments
 (0)