|
| 1 | +import asyncio |
| 2 | + |
| 3 | +# CloakBrowser is an external package. Install it separately. |
| 4 | +from cloakbrowser.config import IGNORE_DEFAULT_ARGS, get_default_stealth_args |
| 5 | +from cloakbrowser.download import ensure_binary |
| 6 | +from typing_extensions import override |
| 7 | + |
| 8 | +from crawlee.browsers import ( |
| 9 | + BrowserPool, |
| 10 | + PlaywrightBrowserController, |
| 11 | + PlaywrightBrowserPlugin, |
| 12 | +) |
| 13 | +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext |
| 14 | + |
| 15 | + |
class CloakBrowserPlugin(PlaywrightBrowserPlugin):
    """Browser plugin that launches CloakBrowser's patched Chromium binary
    while otherwise keeping the behavior of ``PlaywrightBrowserPlugin``.
    """

    @override
    async def new_browser(self) -> PlaywrightBrowserController:
        """Launch a stealth Chromium instance and wrap it in a controller.

        Raises:
            RuntimeError: If the plugin has not been initialized yet.
        """
        if not self._playwright:
            raise RuntimeError('Playwright browser plugin is not initialized.')

        # Start from the user-provided launch options, dropping the keys
        # that CloakBrowser must control itself.
        options = {
            key: value
            for key, value in self._browser_launch_options.items()
            if key not in ('executable_path', 'chromium_sandbox')
        }

        # Append CloakBrowser's stealth flags after any user-supplied args.
        user_args = options.pop('args', [])
        options['args'] = [*user_args, *get_default_stealth_args()]

        browser = await self._playwright.chromium.launch(
            # ensure_binary() downloads the patched Chromium if needed and
            # returns its path.
            executable_path=ensure_binary(),
            ignore_default_args=IGNORE_DEFAULT_ARGS,
            **options,
        )

        return PlaywrightBrowserController(
            browser=browser,
            max_open_pages_per_browser=1,
            # Fingerprints are handled inside the patched binary, so no
            # header generator is needed here.
            header_generator=None,
        )
| 46 | + |
| 47 | + |
async def main() -> None:
    """Run a Playwright crawler backed by the CloakBrowser plugin."""
    crawler = PlaywrightCrawler(
        # Cap the crawl size; remove or raise this to crawl every link.
        max_requests_per_crawl=10,
        # A custom browser pool gives full control over the browsers the
        # crawler uses.
        browser_pool=BrowserPool(plugins=[CloakBrowserPlugin()]),
    )

    # Default request handler, invoked once per crawled page.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract the title of every post on the page via Playwright's API
        # and store it in the default dataset.
        for post in await context.page.query_selector_all('.athing'):
            anchor = await post.query_selector('.title a')
            title = await anchor.inner_text() if anchor else None
            await context.push_data({'title': title})

        # Queue up the next page, if one exists.
        await context.enqueue_links(selector='.morelink')

    # Kick off the crawl from the initial URL list.
    await crawler.run(['https://news.ycombinator.com/'])
| 78 | + |
| 79 | + |
# Standard script entry point: start the asyncio event loop and run the crawl.
if __name__ == '__main__':
    asyncio.run(main())