Skip to content

Commit 88bd05a

Browse files
vdusek and claude authored
fix: Apply SharedTimeout to post-navigation hooks (#1839)
## Summary - Post-navigation hooks in `AbstractHttpCrawler` and `PlaywrightCrawler` had no timeout enforcement, allowing a misbehaving hook to hang the crawler indefinitely. - Applied the existing shared navigation timeout budget to post-navigation hooks, so pre-nav hooks + navigation + post-nav hooks all share the same timeout. - Re-keyed `_shared_navigation_timeouts` by `id(context.request)` (stable across pipeline steps) instead of `id(context)` (changes type between steps). --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 415a0ce commit 88bd05a

File tree

3 files changed

+60
-28
lines changed

3 files changed

+60
-28
lines changed

src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpC
119119
"""Create static content crawler context pipeline with expected pipeline steps."""
120120
return (
121121
ContextPipeline()
122+
.compose(self._manage_shared_navigation_timeout)
122123
.compose(self._execute_pre_navigation_hooks)
123124
.compose(self._make_http_request)
124125
.compose(self._execute_post_navigation_hooks)
@@ -127,26 +128,37 @@ def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpC
127128
.compose(self._handle_blocked_request_by_content)
128129
)
129130

130-
async def _execute_pre_navigation_hooks(
131+
async def _manage_shared_navigation_timeout(
131132
self, context: BasicCrawlingContext
132133
) -> AsyncGenerator[BasicCrawlingContext, None]:
133-
context_id = id(context)
134-
self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
134+
"""Initialize and clean up the shared navigation timeout for the current request."""
135+
request_id = id(context.request)
136+
self._shared_navigation_timeouts[request_id] = SharedTimeout(self._navigation_timeout)
135137

136138
try:
137-
for hook in self._pre_navigation_hooks:
138-
async with self._shared_navigation_timeouts[context_id]:
139-
await hook(context)
140-
141139
yield context
142140
finally:
143-
self._shared_navigation_timeouts.pop(context_id, None)
141+
self._shared_navigation_timeouts.pop(request_id, None)
142+
143+
async def _execute_pre_navigation_hooks(
144+
self, context: BasicCrawlingContext
145+
) -> AsyncGenerator[BasicCrawlingContext, None]:
146+
request_id = id(context.request)
147+
148+
for hook in self._pre_navigation_hooks:
149+
async with self._shared_navigation_timeouts[request_id]:
150+
await hook(context)
151+
152+
yield context
144153

145154
async def _execute_post_navigation_hooks(
146155
self, context: HttpCrawlingContext
147156
) -> AsyncGenerator[HttpCrawlingContext, None]:
157+
request_id = id(context.request)
158+
148159
for hook in self._post_navigation_hooks:
149-
await hook(context)
160+
async with self._shared_navigation_timeouts[request_id]:
161+
await hook(context)
150162

151163
yield context
152164

@@ -262,7 +274,7 @@ async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenera
262274
Yields:
263275
The original crawling context enhanced by HTTP response.
264276
"""
265-
async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
277+
async with self._shared_navigation_timeouts[id(context.request)] as remaining_timeout:
266278
result = await self._http_client.crawl(
267279
request=context.request,
268280
session=context.session,

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 29 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ def __init__(
193193
# Compose the context pipeline with the Playwright-specific context enhancer.
194194
kwargs['_context_pipeline'] = (
195195
ContextPipeline()
196+
.compose(self._manage_shared_navigation_timeout)
196197
.compose(self._open_page)
197198
.compose(self._navigate)
198199
.compose(self._execute_post_navigation_hooks)
@@ -216,6 +217,18 @@ def __init__(
216217

217218
super().__init__(**kwargs)
218219

220+
async def _manage_shared_navigation_timeout(
221+
self, context: BasicCrawlingContext
222+
) -> AsyncGenerator[BasicCrawlingContext, None]:
223+
"""Initialize and clean up the shared navigation timeout for the current request."""
224+
request_id = id(context.request)
225+
self._shared_navigation_timeouts[request_id] = SharedTimeout(self._navigation_timeout)
226+
227+
try:
228+
yield context
229+
finally:
230+
self._shared_navigation_timeouts.pop(request_id, None)
231+
219232
async def _open_page(
220233
self,
221234
context: BasicCrawlingContext,
@@ -242,21 +255,17 @@ async def _open_page(
242255
goto_options=GotoOptions(**self._goto_options),
243256
)
244257

245-
context_id = id(pre_navigation_context)
246-
self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
258+
request_id = id(pre_navigation_context.request)
247259

248-
try:
249-
# Only use the page context manager here — it sets the current page in a context variable,
250-
# making it accessible to PlaywrightHttpClient in subsequent pipeline steps.
251-
async with browser_page_context(crawlee_page.page):
252-
for hook in self._pre_navigation_hooks:
253-
async with self._shared_navigation_timeouts[context_id]:
254-
await hook(pre_navigation_context)
255-
256-
# Yield should be inside the browser_page_context.
257-
yield pre_navigation_context
258-
finally:
259-
self._shared_navigation_timeouts.pop(context_id, None)
260+
# Only use the page context manager here — it sets the current page in a context variable,
261+
# making it accessible to PlaywrightHttpClient in subsequent pipeline steps.
262+
async with browser_page_context(crawlee_page.page):
263+
for hook in self._pre_navigation_hooks:
264+
async with self._shared_navigation_timeouts[request_id]:
265+
await hook(pre_navigation_context)
266+
267+
# Yield should be inside the browser_page_context.
268+
yield pre_navigation_context
260269

261270
def _prepare_request_interceptor(
262271
self,
@@ -329,7 +338,7 @@ async def _navigate(
329338
await context.page.route(context.request.url, route_handler)
330339

331340
try:
332-
async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
341+
async with self._shared_navigation_timeouts[id(context.request)] as remaining_timeout:
333342
response = await context.page.goto(
334343
context.request.url, timeout=remaining_timeout.total_seconds() * 1000, **context.goto_options
335344
)
@@ -496,8 +505,12 @@ async def _handle_blocked_request_by_content(
496505
async def _execute_post_navigation_hooks(
497506
self, context: PlaywrightPostNavCrawlingContext
498507
) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, None]:
508+
request_id = id(context.request)
509+
499510
for hook in self._post_navigation_hooks:
500-
await hook(context)
511+
async with self._shared_navigation_timeouts[request_id]:
512+
await hook(context)
513+
501514
yield context
502515

503516
async def _create_crawling_context(

tests/unit/otel/test_crawler_instrumentor.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,11 +82,18 @@ async def test_crawler_instrumentor_capability(server_url: URL) -> None:
8282
assert telemetry_data[2]['context']['trace_id'] != telemetry_data[1]['context']['trace_id']
8383

8484
# Crawler telemetry - all crawler spans will be in one trace as there is only one request in this test.
85-
assert telemetry_data[3]['name'] == '_execute_pre_navigation_hooks, action'
86-
assert telemetry_data[3]['attributes']['code.function.name'] == 'AbstractHttpCrawler._execute_pre_navigation_hooks'
85+
assert telemetry_data[3]['name'] == '_manage_shared_navigation_timeout, action'
86+
assert (
87+
telemetry_data[3]['attributes']['code.function.name'] == 'AbstractHttpCrawler._manage_shared_navigation_timeout'
88+
)
8789
assert telemetry_data[3]['attributes']['url.full'] == str(server_url)
8890
assert telemetry_data[3]['resource']['attributes'] == dict(resource.attributes)
8991

92+
assert telemetry_data[4]['name'] == '_execute_pre_navigation_hooks, action'
93+
assert telemetry_data[4]['attributes']['code.function.name'] == 'AbstractHttpCrawler._execute_pre_navigation_hooks'
94+
assert telemetry_data[4]['attributes']['url.full'] == str(server_url)
95+
assert telemetry_data[4]['resource']['attributes'] == dict(resource.attributes)
96+
9097
assert telemetry_data[-1]['name'] == '__run_task_function'
9198
assert telemetry_data[-1]['attributes']['code.function.name'] == 'BasicCrawler.__run_task_function'
9299
assert telemetry_data[-1]['resource']['attributes'] == dict(resource.attributes)

0 commit comments

Comments (0)