Skip to content

Commit 702b1db

Browse files
committed
feat: add AsyncPlasmateCrawlerStrategy — lightweight alternative to Playwright
Closes unclecode#1256 (memory leak in Docker from Chrome) Related to unclecode#1874 (token usage tracking) Plasmate (https://github.com/plasmate-labs/plasmate) is an open-source Rust browser engine that replaces Chrome/Playwright for static pages. No browser process, ~64MB RAM vs ~300MB, 10-100x fewer tokens per page. Changes: - crawl4ai/async_plasmate_strategy.py: AsyncPlasmateCrawlerStrategy - Implements AsyncCrawlerStrategy ABC (drop-in replacement) - Supports output_format: text (default), markdown, som, links - Supports --selector, --header, --timeout flags - Optional fallback_to_playwright=True for JS-heavy SPAs - Subprocess runs in asyncio executor — safe for concurrent use - crawl4ai/__init__.py: export AsyncPlasmateCrawlerStrategy - tests/general/test_plasmate_strategy.py: 20 unit tests Install: pip install plasmate Usage: from crawl4ai import AsyncWebCrawler from crawl4ai.async_plasmate_strategy import AsyncPlasmateCrawlerStrategy strategy = AsyncPlasmateCrawlerStrategy( output_format="markdown", fallback_to_playwright=True, # SPA safety net ) async with AsyncWebCrawler(crawler_strategy=strategy) as crawler: result = await crawler.arun("https://docs.python.org/3/")
1 parent 3d02d75 commit 702b1db

3 files changed

Lines changed: 540 additions & 0 deletions

File tree

crawl4ai/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import warnings
33

44
from .async_webcrawler import AsyncWebCrawler, CacheMode
5+
from .async_plasmate_strategy import AsyncPlasmateCrawlerStrategy
56
# MODIFIED: Add SeedingConfig and VirtualScrollConfig here
67
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig, MatchMode
78

Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
"""
2+
AsyncPlasmateCrawlerStrategy — lightweight alternative to AsyncPlaywrightCrawlerStrategy.
3+
4+
Uses Plasmate (https://github.com/plasmate-labs/plasmate) instead of Chrome/Playwright.
5+
Plasmate is an open-source Rust browser engine that outputs Structured Object Model (SOM)
6+
instead of raw HTML, using ~64MB RAM per session vs ~300MB and delivering
7+
10-100x fewer tokens per page — significantly reducing LLM costs.
8+
9+
Install: pip install plasmate
10+
Docs: https://plasmate.app
11+
"""
12+
13+
from __future__ import annotations
14+
15+
import asyncio
16+
import shutil
17+
import subprocess
18+
from typing import Dict, List, Optional
19+
20+
from .async_crawler_strategy import AsyncCrawlerStrategy
21+
from .async_logger import AsyncLogger
22+
from .models import AsyncCrawlResponse
23+
24+
_INSTALL_MSG = (
25+
"plasmate is required for AsyncPlasmateCrawlerStrategy. "
26+
"Install it with: pip install plasmate\n"
27+
"Docs: https://plasmate.app"
28+
)
29+
30+
_VALID_FORMATS = ("text", "markdown", "som", "links")
31+
32+
33+
def _find_plasmate() -> Optional[str]:
34+
"""Return the resolved path to the plasmate binary, or None."""
35+
path = shutil.which("plasmate")
36+
if path:
37+
return path
38+
try:
39+
import plasmate as _p # noqa: F401
40+
return shutil.which("plasmate")
41+
except ImportError:
42+
return None
43+
44+
45+
class AsyncPlasmateCrawlerStrategy(AsyncCrawlerStrategy):
    """Lightweight crawler strategy using Plasmate instead of Chrome/Playwright.

    Plasmate fetches pages and returns them as Structured Object Model (SOM)
    or plain text / markdown — no browser process, no GPU, no 300 MB Chrome.

    This strategy is a drop-in replacement for ``AsyncPlaywrightCrawlerStrategy``
    for static and server-rendered pages. For JavaScript-heavy SPAs that require
    a real browser, set ``fallback_to_playwright=True``.

    Attributes:
        output_format: Page output format — ``"text"`` (default), ``"markdown"``,
            ``"som"`` (full JSON), or ``"links"``.
        timeout: Per-request timeout in seconds. Defaults to 30.
        selector: Optional ARIA role or CSS id selector to scope extraction
            (e.g. ``"main"`` or ``"#article"``).
        extra_headers: Optional HTTP headers forwarded with each request.
        fallback_to_playwright: If True, retry with Playwright when Plasmate
            returns an empty response (handles SPAs automatically).
        verbose: Whether to emit log messages. Defaults to True.

    Example — drop-in replacement::

        import asyncio
        from crawl4ai import AsyncWebCrawler
        from crawl4ai.async_plasmate_strategy import AsyncPlasmateCrawlerStrategy

        async def main():
            strategy = AsyncPlasmateCrawlerStrategy(
                output_format="markdown",
                timeout=30,
                fallback_to_playwright=True,
            )
            async with AsyncWebCrawler(crawler_strategy=strategy) as crawler:
                result = await crawler.arun("https://docs.python.org/3/")
                print(result.markdown[:500])

        asyncio.run(main())

    Example — direct use::

        strategy = AsyncPlasmateCrawlerStrategy(output_format="text")
        async with strategy:
            response = await strategy.crawl("https://example.com")
            print(response.html)  # clean text output, no HTML boilerplate
    """

    def __init__(
        self,
        output_format: str = "text",
        timeout: int = 30,
        selector: Optional[str] = None,
        extra_headers: Optional[Dict[str, str]] = None,
        fallback_to_playwright: bool = False,
        verbose: bool = True,
        logger: Optional[AsyncLogger] = None,
        **kwargs,
    ):
        """Configure the strategy.

        Args:
            output_format: One of ``_VALID_FORMATS``; validated eagerly.
            timeout: Per-request timeout in seconds.
            selector: Optional selector forwarded via ``--selector``.
            extra_headers: Headers forwarded via repeated ``--header`` flags.
            fallback_to_playwright: Retry empty responses with Playwright.
            verbose: Emit log messages.
            logger: Optional externally-managed logger; one is created if omitted.
            **kwargs: Ignored (accepted for interface compatibility).

        Raises:
            ValueError: If ``output_format`` is not a supported format.
        """
        if output_format not in _VALID_FORMATS:
            raise ValueError(
                f"output_format must be one of {_VALID_FORMATS}; got {output_format!r}"
            )
        self.output_format = output_format
        self.timeout = timeout
        self.selector = selector
        # `or {}` gives each instance its own dict when no headers are passed.
        self.extra_headers = extra_headers or {}
        self.fallback_to_playwright = fallback_to_playwright
        self.verbose = verbose
        self.logger = logger or AsyncLogger(verbose=verbose)
        # Resolved in __aenter__ so constructing the strategy never raises
        # ImportError on machines without plasmate installed.
        self._plasmate_bin: Optional[str] = None

    # ------------------------------------------------------------------
    # Context manager
    # ------------------------------------------------------------------

    async def __aenter__(self) -> "AsyncPlasmateCrawlerStrategy":
        """Resolve the plasmate binary; raise ImportError if it is missing."""
        self._plasmate_bin = _find_plasmate()
        if self._plasmate_bin is None:
            raise ImportError(_INSTALL_MSG)
        if self.verbose:
            self.logger.info(
                f"AsyncPlasmateCrawlerStrategy ready (format={self.output_format}, "
                f"timeout={self.timeout}s, fallback={self.fallback_to_playwright})",
                tag="INIT",
            )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        # No persistent process to clean up — each fetch is a short-lived subprocess.
        pass

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _build_cmd(self, url: str) -> List[str]:
        """Build the plasmate CLI command for a URL."""
        cmd = [
            self._plasmate_bin,
            "fetch",
            url,
            "--format", self.output_format,
            "--timeout", str(self.timeout * 1000),  # plasmate uses ms
        ]
        if self.selector:
            cmd += ["--selector", self.selector]
        for key, value in self.extra_headers.items():
            cmd += ["--header", f"{key}: {value}"]
        return cmd

    async def _fetch(self, url: str) -> tuple[str, int]:
        """Run plasmate in a thread-pool executor; returns (content, status_code).

        Status codes are synthesized: 200 on success, 500 on a non-zero exit,
        504 on timeout.

        Raises:
            ImportError: If the plasmate binary disappeared after __aenter__
                (or __aenter__ was never awaited).
        """
        # FIX: asyncio.get_event_loop() is deprecated inside coroutines;
        # get_running_loop() is the supported way to reach the current loop.
        loop = asyncio.get_running_loop()

        def _run() -> tuple[str, int]:
            try:
                result = subprocess.run(
                    self._build_cmd(url),
                    capture_output=True,
                    text=True,
                    # FIX: text=True alone decodes with the locale encoding,
                    # which can raise UnicodeDecodeError on arbitrary web
                    # content under non-UTF-8 locales. Pin UTF-8 and replace
                    # undecodable bytes instead of crashing the fetch.
                    encoding="utf-8",
                    errors="replace",
                    # Grace period beyond plasmate's own --timeout so the CLI
                    # gets a chance to time out and report cleanly first.
                    timeout=self.timeout + 5,
                )
            except subprocess.TimeoutExpired:
                if self.verbose:
                    self.logger.warning(f"Timeout fetching {url}", tag="FETCH")
                return "", 504
            except FileNotFoundError as exc:
                # FIX: chain the cause so the traceback shows which path failed.
                raise ImportError(_INSTALL_MSG) from exc
            if result.returncode != 0:
                if self.verbose:
                    self.logger.warning(
                        f"plasmate exited {result.returncode} for {url}: "
                        f"{result.stderr[:200]}",
                        tag="FETCH",
                    )
                return "", 500
            return result.stdout.strip(), 200

        return await loop.run_in_executor(None, _run)

    async def _playwright_fallback(self, url: str) -> tuple[str, int]:
        """Delegate to AsyncPlaywrightCrawlerStrategy and return its raw HTML."""
        if self.verbose:
            self.logger.info(
                f"Plasmate returned empty — falling back to Playwright for {url}",
                tag="FALLBACK",
            )
        # Imported lazily so Playwright stays an optional dependency for
        # users who never enable the fallback.
        from .async_crawler_strategy import AsyncPlaywrightCrawlerStrategy

        strategy = AsyncPlaywrightCrawlerStrategy()
        async with strategy:
            response = await strategy.crawl(url)
            return response.html, response.status_code

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
        """Fetch *url* with Plasmate and return an :class:`AsyncCrawlResponse`.

        The ``html`` field of the response contains the Plasmate output in the
        requested format (text / markdown / SOM JSON / links) rather than raw HTML.
        Downstream Crawl4AI extraction strategies receive this pre-processed content,
        reducing token consumption before any LLM call.

        Args:
            url: The URL to fetch.
            **kwargs: Ignored (accepted for interface compatibility).

        Returns:
            :class:`AsyncCrawlResponse` with ``html`` set to Plasmate output.
        """
        if self.verbose:
            self.logger.info(f"Fetching: {url}", tag="FETCH")

        content, status_code = await self._fetch(url)

        # An empty body usually means a JS-rendered page; optionally retry
        # with a real browser.
        if not content.strip() and self.fallback_to_playwright:
            content, status_code = await self._playwright_fallback(url)

        if self.verbose and content:
            self.logger.success(
                f"Got {len(content):,} chars from {url} "
                f"(format={self.output_format})",
                tag="FETCH",
            )

        return AsyncCrawlResponse(
            html=content,
            response_headers={},
            status_code=status_code,
        )

0 commit comments

Comments
 (0)