Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions docs/guides/avoid_blocking.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import PlaywrightDefaultFingerprintGenerator from '!!raw-loader!roa-loader!./code_examples/avoid_blocking/playwright_with_fingerprint_generator.py';
import PlaywrightWithCamoufox from '!!raw-loader!roa-loader!../examples/code_examples/playwright_crawler_with_camoufox.py';
import PlaywrightWithCloakBrowser from '!!raw-loader!roa-loader!./code_examples/avoid_blocking/playwright_with_cloakbrowser.py';

import PlaywrightDefaultFingerprintGeneratorWithArgs from '!!raw-loader!./code_examples/avoid_blocking/default_fingerprint_generator_with_args.py';

Expand Down Expand Up @@ -41,6 +42,14 @@ In some cases even <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</Ap
{PlaywrightWithCamoufox}
</RunnableCodeBlock>

## Using CloakBrowser

For sites with aggressive anti-bot protection, [CloakBrowser](https://github.com/CloakHQ/CloakBrowser) takes a different approach. Instead of overriding fingerprints at the JavaScript level (which anti-bot scripts can detect as tampering), CloakBrowser ships a custom Chromium binary with fingerprints modified directly in the C++ source code. It is also Chromium-based, which can matter when a target site behaves differently with Firefox than with Chrome. Install it separately with `pip install cloakbrowser` — the plugin calls `ensure_binary()`, which automatically downloads and caches the Chromium binary on the first run.

<RunnableCodeBlock className="language-python" language="python">
{PlaywrightWithCloakBrowser}
</RunnableCodeBlock>

**Related links**

- [Fingerprint Suite Docs](https://github.com/apify/fingerprint-suite)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import asyncio

# CloakBrowser is an external package. Install it separately.
from cloakbrowser.config import IGNORE_DEFAULT_ARGS, get_default_stealth_args
from cloakbrowser.download import ensure_binary
from typing_extensions import override

from crawlee.browsers import (
BrowserPool,
PlaywrightBrowserController,
PlaywrightBrowserPlugin,
)
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


class CloakBrowserPlugin(PlaywrightBrowserPlugin):
    """Browser plugin that launches CloakBrowser's patched Chromium binary.

    Behaves exactly like `PlaywrightBrowserPlugin`, except that the browser
    executable and stealth launch arguments come from CloakBrowser.
    """

    @override
    async def new_browser(self) -> PlaywrightBrowserController:
        if not self._playwright:
            raise RuntimeError('Playwright browser plugin is not initialized.')

        # Resolve the patched Chromium executable; CloakBrowser downloads
        # and caches it on first use.
        chromium_path = ensure_binary()

        # Start from the user-provided launch options, dropping the keys that
        # CloakBrowser must control itself.
        options = {
            key: value
            for key, value in self._browser_launch_options.items()
            if key not in ('executable_path', 'chromium_sandbox', 'args')
        }
        # Append CloakBrowser's stealth args after any user-provided ones.
        user_args = list(self._browser_launch_options.get('args', []))
        options['args'] = [*user_args, *get_default_stealth_args()]

        browser = await self._playwright.chromium.launch(
            executable_path=chromium_path,
            ignore_default_args=IGNORE_DEFAULT_ARGS,
            **options,
        )
        return PlaywrightBrowserController(
            browser=browser,
            max_open_pages_per_browser=1,
            # Fingerprints are baked into the patched binary, so no
            # header/fingerprint generation is needed on our side.
            header_generator=None,
        )


async def main() -> None:
    """Crawl Hacker News with a `PlaywrightCrawler` backed by CloakBrowser."""
    # A custom browser pool gives full control over the browsers the crawler uses.
    pool = BrowserPool(plugins=[CloakBrowserPlugin()])

    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
        browser_pool=pool,
    )

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        # Called for every request; extracts post titles and follows pagination.
        context.log.info(f'Processing {context.request.url} ...')

        # Collect the title of each post on the page using Playwright's API.
        for post in await context.page.query_selector_all('.athing'):
            anchor = await post.query_selector('.title a')
            title = await anchor.inner_text() if anchor else None

            # Store the extracted record in the default dataset.
            await context.push_data({'title': title})

        # Follow the "more" link to the next page, if present.
        await context.enqueue_links(selector='.morelink')

    # Start crawling from the initial URL.
    await crawler.run(['https://news.ycombinator.com/'])


# Run the example only when executed as a script (not on import).
if __name__ == '__main__':
    asyncio.run(main())
Loading