crawlee-python/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py at e2206498fe215cb357dbda9d3708dd604102e03e · apify/crawlee-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
from __future__ import annotations

import logging
from collections.abc import Awaitable, Callable, Coroutine
from copy import deepcopy
from dataclasses import dataclass
from logging import getLogger
from random import random
from typing import TYPE_CHECKING, Any, Generic, get_args

from bs4 import BeautifulSoup, Tag
from parsel import Selector
from typing_extensions import Self, TypeVar, override

from crawlee._types import BasicCrawlingContext, ConcurrencySettings, JsonSerializable, RequestHandlerRunResult
from crawlee._utils.docs import docs_group
from crawlee._utils.wait import wait_for
from crawlee.crawlers import (
    AbstractHttpCrawler,
    AbstractHttpParser,
    BasicCrawler,
    BeautifulSoupParserType,
    HttpCrawlingContext,
    ParsedHttpCrawlingContext,
    PlaywrightCrawler,
    PlaywrightCrawlingContext,
    PlaywrightPostNavCrawlingContext,
    PlaywrightPreNavCrawlingContext,
)
from crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser
from crawlee.crawlers._parsel._parsel_parser import ParselParser
from crawlee.crawlers._playwright._playwright_crawler import _PlaywrightCrawlerAdditionalOptions
from crawlee.statistics import Statistics, StatisticsState

from ._adaptive_playwright_crawler_statistics import AdaptivePlaywrightCrawlerStatisticState
from ._adaptive_playwright_crawling_context import (
    AdaptivePlaywrightCrawlingContext,
    AdaptivePlaywrightPostNavCrawlingContext,
    AdaptivePlaywrightPreNavCrawlingContext,
)
from ._rendering_type_predictor import DefaultRenderingTypePredictor, RenderingType, RenderingTypePredictor
from ._result_comparator import create_default_comparator

if TYPE_CHECKING:
    from types import TracebackType

    from typing_extensions import Unpack

    from crawlee.crawlers._basic._basic_crawler import _BasicCrawlerOptions


TStaticParseResult = TypeVar('TStaticParseResult')
TStaticSelectResult = TypeVar('TStaticSelectResult')
TStaticCrawlingContext = TypeVar('TStaticCrawlingContext', bound=ParsedHttpCrawlingContext)


class _NonPersistentStatistics(Statistics):
    """Statistics compliant object that is not supposed to do anything when entering/exiting context.

    To be used in sub crawlers.
    """

    def __init__(self) -> None:
        super().__init__(state_model=StatisticsState)

    async def __aenter__(self) -> Self:
        self._active = True
        await self._state.initialize()
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        self._active = False


@docs_group('Crawlers')
class AdaptivePlaywrightCrawler(
    BasicCrawler[AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightCrawlerStatisticState],
    Generic[TStaticCrawlingContext, TStaticParseResult, TStaticSelectResult],
):
    """An adaptive web crawler capable of using both static HTTP request based crawling and browser based crawling.

    It uses a more limited crawling context interface so that it is able to switch to HTTP-only crawling when it detects
    that it may bring a performance benefit.
    It uses specific implementation of `AbstractHttpCrawler` and `PlaywrightCrawler`.

    ### Usage
    ```python
    from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext

    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        max_requests_per_crawl=10,  # Limit the max requests per crawl.
        playwright_crawler_specific_kwargs={'browser_type': 'chromium'},
    )

    @crawler.router.default_handler
    async def request_handler_for_label(context: AdaptivePlaywrightCrawlingContext) -> None:
        # Do some processing using `parsed_content`
        context.log.info(context.parsed_content.title)

        # Locate element h2 within 5 seconds
        h2 = await context.query_selector_one('h2', timedelta(milliseconds=5000))
        # Do stuff with element found by the selector
        context.log.info(h2)

        # Find more links and enqueue them.
        await context.enqueue_links()
        # Save some data.
        await context.push_data({'Visited url': context.request.url})

    await crawler.run(['https://crawlee.dev/'])
    ```
    """

    def __init__(
        self,
        *,
        static_parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult],
        rendering_type_predictor: RenderingTypePredictor | None = None,
        result_checker: Callable[[RequestHandlerRunResult], bool] | None = None,
        result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None,
        playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None,
        statistics: Statistics[AdaptivePlaywrightCrawlerStatisticState] | None = None,
        **kwargs: Unpack[_BasicCrawlerOptions],
    ) -> None:
        """Initialize a new instance. Recommended way to create instance is to call factory methods.

        Recommended factory methods: `with_beautifulsoup_static_parser`, `with_parsel_static_parser`.

        Args:
            rendering_type_predictor: Object that implements RenderingTypePredictor and is capable of predicting which
                rendering method should be used. If None, then `DefaultRenderingTypePredictor` is used.
            result_checker: Function that evaluates whether crawling result is valid or not.
            result_comparator: Function that compares two crawling results and decides whether they are equivalent.
            static_parser: Implementation of `AbstractHttpParser`. Parser that will be used for static crawling.
            static_crawler_specific_kwargs: `AbstractHttpCrawler` only kwargs that are passed to the sub crawler.
            playwright_crawler_specific_kwargs: `PlaywrightCrawler` only kwargs that are passed to the sub crawler.
            statistics: A custom `Statistics[AdaptivePlaywrightCrawlerStatisticState]` instance, allowing the use of
                non-default configuration.
            kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
        """
        # Adaptive crawling related.
        self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor()
        self.result_checker = result_checker or (lambda _: True)
        self.result_comparator = result_comparator or create_default_comparator(result_checker)

        # Set default concurrency settings for browser crawlers if not provided
        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)

        adaptive_statistics = statistics or Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState)

        super().__init__(statistics=adaptive_statistics, **kwargs)

        # Sub crawlers related.
        playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or _PlaywrightCrawlerAdditionalOptions()

        # Each sub crawler will use custom logger .
        static_logger = getLogger('Subcrawler_static')
        static_logger.setLevel(logging.ERROR)
        basic_crawler_kwargs_for_static_crawler: _BasicCrawlerOptions = {'_logger': static_logger, **kwargs}

        pw_logger = getLogger('Subcrawler_playwright')
        pw_logger.setLevel(logging.ERROR)
        basic_crawler_kwargs_for_pw_crawler: _BasicCrawlerOptions = {'_logger': pw_logger, **kwargs}

        # Initialize sub crawlers to create their pipelines.
        static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=static_parser)

        static_crawler = static_crawler_class(
            parser=static_parser,
            statistics=_NonPersistentStatistics(),
            **basic_crawler_kwargs_for_static_crawler,
        )
        playwright_crawler = PlaywrightCrawler(
            statistics=_NonPersistentStatistics(),
            **playwright_crawler_specific_kwargs,
            **basic_crawler_kwargs_for_pw_crawler,
        )

        # Register pre navigation hooks on sub crawlers
        self._pre_navigation_hooks = list[Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]]]()
        self._pre_navigation_hooks_pw_only = list[
            Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]]
        ]()

        async def adaptive_pre_navigation_hook_static(context: BasicCrawlingContext) -> None:
            for hook in self._pre_navigation_hooks:
                await hook(AdaptivePlaywrightPreNavCrawlingContext.from_pre_navigation_context(context))

        async def adaptive_pre_navigation_hook_pw(context: PlaywrightPreNavCrawlingContext) -> None:
            for hook in self._pre_navigation_hooks + self._pre_navigation_hooks_pw_only:
                await hook(AdaptivePlaywrightPreNavCrawlingContext.from_pre_navigation_context(context))

        static_crawler.pre_navigation_hook(adaptive_pre_navigation_hook_static)
        playwright_crawler.pre_navigation_hook(adaptive_pre_navigation_hook_pw)

        # Register post navigation hooks on sub crawlers
        self._post_navigation_hooks = list[Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]]()
        self._post_navigation_hooks_pw_only = list[
            Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]
        ]()

        async def adaptive_post_navigation_hook_static(context: HttpCrawlingContext) -> None:
            adaptive_context = await AdaptivePlaywrightPostNavCrawlingContext.from_post_navigation_context(context)
            for hook in self._post_navigation_hooks:
                await hook(adaptive_context)

        async def adaptive_post_navigation_hook_pw(context: PlaywrightPostNavCrawlingContext) -> None:
            adaptive_context = await AdaptivePlaywrightPostNavCrawlingContext.from_post_navigation_context(context)
            for hook in self._post_navigation_hooks + self._post_navigation_hooks_pw_only:
                await hook(adaptive_context)

        static_crawler.post_navigation_hook(adaptive_post_navigation_hook_static)
        playwright_crawler.post_navigation_hook(adaptive_post_navigation_hook_pw)

        self._additional_context_managers = [
            *self._additional_context_managers,
            self.rendering_type_predictor,
            static_crawler.statistics,
            playwright_crawler.statistics,
            playwright_crawler._browser_pool,  # noqa: SLF001 # Intentional access to private member.
        ]

        # Sub crawler pipeline related
        self._pw_context_pipeline = playwright_crawler._context_pipeline  # noqa:SLF001  # Intentional access to private member.
        self._static_context_pipeline = static_crawler._context_pipeline  # noqa:SLF001  # Intentional access to private member.
        self._static_parser = static_parser

    @classmethod
    def with_beautifulsoup_static_parser(
        cls,
        rendering_type_predictor: RenderingTypePredictor | None = None,
        result_checker: Callable[[RequestHandlerRunResult], bool] | None = None,
        result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None,
        parser_type: BeautifulSoupParserType = 'lxml',
        playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None,
        statistics: Statistics[StatisticsState] | None = None,
        **kwargs: Unpack[_BasicCrawlerOptions],
    ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup, Tag]:
        """Create `AdaptivePlaywrightCrawler` that uses `BeautifulSoup` for parsing static content."""
        if statistics is not None:
            adaptive_statistics = statistics.replace_state_model(AdaptivePlaywrightCrawlerStatisticState)
        else:
            adaptive_statistics = Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState)
        return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[BeautifulSoup], BeautifulSoup, Tag](
            rendering_type_predictor=rendering_type_predictor,
            result_checker=result_checker,
            result_comparator=result_comparator,
            static_parser=BeautifulSoupParser(parser=parser_type),
            playwright_crawler_specific_kwargs=playwright_crawler_specific_kwargs,
            statistics=adaptive_statistics,
            **kwargs,
        )

    @classmethod
    def with_parsel_static_parser(
        cls,
        rendering_type_predictor: RenderingTypePredictor | None = None,
        result_checker: Callable[[RequestHandlerRunResult], bool] | None = None,
        result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None,
        playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None,
        statistics: Statistics[StatisticsState] | None = None,
        **kwargs: Unpack[_BasicCrawlerOptions],
    ) -> AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Selector], Selector, Selector]:
        """Create `AdaptivePlaywrightCrawler` that uses `Parcel` for parsing static content."""
        if statistics is not None:
            adaptive_statistics = statistics.replace_state_model(AdaptivePlaywrightCrawlerStatisticState)
        else:
            adaptive_statistics = Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState)
        return AdaptivePlaywrightCrawler[ParsedHttpCrawlingContext[Selector], Selector, Selector](
            rendering_type_predictor=rendering_type_predictor,
            result_checker=result_checker,
            result_comparator=result_comparator,
            static_parser=ParselParser(),
            playwright_crawler_specific_kwargs=playwright_crawler_specific_kwargs,
            statistics=adaptive_statistics,
            **kwargs,
        )

    async def _crawl_one(
        self,
        rendering_type: RenderingType,
        context: BasicCrawlingContext,
        state: dict[str, JsonSerializable] | None = None,
    ) -> SubCrawlerRun:
        """Perform a one request crawl with specific context pipeline and return `SubCrawlerRun`.

        `SubCrawlerRun` contains either result of the crawl or the exception that was thrown during the crawl.
        Sub crawler pipeline call is dynamically created based on the `rendering_type`.
        New copy-like context is created from passed `context` and `state` and is passed to sub crawler pipeline.
        """
        if state is not None:

            async def get_input_state(
                default_value: dict[str, JsonSerializable] | None = None,  # noqa:ARG001  # Intentionally unused arguments. Closure, that generates same output regardless of inputs.
            ) -> dict[str, JsonSerializable]:
                return state

            use_state_function = get_input_state
        else:
            use_state_function = context.use_state

        # New result is created and injected to newly created context. This is done to ensure isolation of sub crawlers.
        result = RequestHandlerRunResult(
            key_value_store_getter=self.get_key_value_store,
            request=context.request,
        )
        context_linked_to_result = BasicCrawlingContext(
            request=result.request,
            session=context.session,
            proxy_info=context.proxy_info,
            send_request=context.send_request,
            add_requests=result.add_requests,
            push_data=result.push_data,
            get_key_value_store=result.get_key_value_store,
            use_state=use_state_function,
            log=context.log,
            register_deferred_cleanup=context.register_deferred_cleanup,
        )

        try:
            await wait_for(
                lambda: self._pipeline_call_factory(
                    rendering_type=rendering_type, context_linked_to_result=context_linked_to_result
                ),
                timeout=self._request_handler_timeout,
                timeout_message=(
                    f'{rendering_type=!s} timed out after {self._request_handler_timeout.total_seconds()}seconds'
                ),
                logger=self._logger,
            )
            return SubCrawlerRun(result=result)
        except Exception as e:
            return SubCrawlerRun(exception=e)

    def _pipeline_call_factory(
        self, rendering_type: RenderingType, context_linked_to_result: BasicCrawlingContext
    ) -> Coroutine[Any, Any, None]:
        """Create sub crawler pipeline call."""
        if rendering_type == 'static':

            async def from_static_pipeline_to_top_router(
                context: ParsedHttpCrawlingContext[TStaticParseResult],
            ) -> None:
                adaptive_crawling_context = AdaptivePlaywrightCrawlingContext.from_parsed_http_crawling_context(
                    context=context, parser=self._static_parser
                )
                await self.router(adaptive_crawling_context)

            return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router)  # ty: ignore[invalid-argument-type]

        if rendering_type == 'client only':

            async def from_pw_pipeline_to_top_router(context: PlaywrightCrawlingContext) -> None:
                adaptive_crawling_context = await AdaptivePlaywrightCrawlingContext.from_playwright_crawling_context(
                    context=context, parser=self._static_parser
                )
                await self.router(adaptive_crawling_context)

            return self._pw_context_pipeline(context_linked_to_result, from_pw_pipeline_to_top_router)  # ty: ignore[invalid-argument-type]

        raise RuntimeError(
            f'Not a valid rendering type. Must be one of the following: {", ".join(get_args(RenderingType))}'
        )

    @override
    async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
        """Override BasicCrawler method that delegates request processing to sub crawlers.

        To decide which sub crawler should process the request it runs `rendering_type_predictor`.
        To check if results are valid it uses `result_checker`.
        To compare results of both sub crawlers it uses `result_comparator`.

        Reference implementation: https://github.com/apify/crawlee/blob/master/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts
        """
        rendering_type_prediction = self.rendering_type_predictor.predict(context.request)
        should_detect_rendering_type = random() < rendering_type_prediction.detection_probability_recommendation

        if not should_detect_rendering_type:
            self.log.debug(
                f'Predicted rendering type {rendering_type_prediction.rendering_type} for {context.request.url}'
            )
            if rendering_type_prediction.rendering_type == 'static':
                context.log.debug(f'Running static request for {context.request.url}')
                self.track_http_only_request_handler_runs()

                static_run = await self._crawl_one(rendering_type='static', context=context)
                if static_run.result and self.result_checker(static_run.result):
                    self._context_result_map[context] = static_run.result
                    return
                if static_run.exception:
                    context.log.exception(
                        msg=f'Static crawler: failed for {context.request.url}', exc_info=static_run.exception
                    )
                else:
                    context.log.warning(f'Static crawler: returned a suspicious result for {context.request.url}')
                    self.track_rendering_type_mispredictions()

        context.log.debug(f'Running browser request handler for {context.request.url}')

        old_state_copy = None

        if should_detect_rendering_type:
            # Save copy of global state from `use_state` before it can be mutated by browser crawl.
            # This copy will be used in the static crawl to make sure they both run with same conditions and to
            # avoid static crawl to modify the state.
            # (This static crawl is performed only to evaluate rendering type detection.)
            kvs = await context.get_key_value_store()
            default_value = dict[str, JsonSerializable]()
            old_state: dict[str, JsonSerializable] = await kvs.get_value(self._CRAWLEE_STATE_KEY, default_value)
            old_state_copy = deepcopy(old_state)

        pw_run = await self._crawl_one('client only', context=context)
        self.track_browser_request_handler_runs()

        if pw_run.exception is not None:
            raise pw_run.exception

        if pw_run.result:
            if should_detect_rendering_type:
                detection_result: RenderingType
                static_run = await self._crawl_one('static', context=context, state=old_state_copy)
                if static_run.result and self.result_comparator(static_run.result, pw_run.result):
                    detection_result = 'static'
                else:
                    detection_result = 'client only'

                context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
                self.rendering_type_predictor.store_result(context.request, detection_result)

            self._context_result_map[context] = pw_run.result

    def pre_navigation_hook(
        self,
        hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None = None,
        *,
        playwright_only: bool = False,
    ) -> Callable[[Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]]], None]:
        """Pre navigation hooks for adaptive crawler are delegated to sub crawlers.

        Optionally parametrized decorator.
        Hooks are wrapped in context that handles possibly missing `page` object by raising `AdaptiveContextError`.
        """

        def register_hooks(hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]]) -> None:
            if playwright_only:
                self._pre_navigation_hooks_pw_only.append(hook)
            else:
                self._pre_navigation_hooks.append(hook)

        # No parameter in decorator. Execute directly.
        if hook:
            register_hooks(hook)

        # Return parametrized decorator that will be executed through decorator syntax if called with parameter.
        return register_hooks

    def post_navigation_hook(
        self,
        hook: Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]] | None = None,
        *,
        playwright_only: bool = False,
    ) -> Callable[[Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]], None]:
        """Post navigation hooks for adaptive crawler are delegated to sub crawlers.

        Optionally parametrized decorator.
        Hooks are wrapped in context that handles possibly missing `page` and `response` objects by raising
        `AdaptiveContextError`.
        """

        def register_hooks(hook: Callable[[AdaptivePlaywrightPostNavCrawlingContext], Awaitable[None]]) -> None:
            if playwright_only:
                self._post_navigation_hooks_pw_only.append(hook)
            else:
                self._post_navigation_hooks.append(hook)

        # No parameter in decorator. Execute directly.
        if hook:
            register_hooks(hook)

        # Return parametrized decorator that will be executed through decorator syntax if called with parameter.
        return register_hooks

    def track_http_only_request_handler_runs(self) -> None:
        self.statistics.state.http_only_request_handler_runs += 1

    def track_browser_request_handler_runs(self) -> None:
        self.statistics.state.browser_request_handler_runs += 1

    def track_rendering_type_mispredictions(self) -> None:
        self.statistics.state.rendering_type_mispredictions += 1


@dataclass(frozen=True)
class SubCrawlerRun:
    result: RequestHandlerRunResult | None = None
    exception: Exception | None = None