Skip to content

Commit 1238738

Browse files
authored
Merge pull request #1062 from dbhurley/feat/plasmate-loader
feat: add PlasmateLoader as lightweight scraping backend (no Chrome needed)
2 parents 7b5733d + 9dd1fb5 commit 1238738

File tree

4 files changed

+498
-0
lines changed

4 files changed

+498
-0
lines changed

scrapegraphai/docloaders/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@
44

55
from .browser_base import browser_base_fetch
66
from .chromium import ChromiumLoader
7+
from .plasmate import PlasmateLoader
78
from .scrape_do import scrape_do_fetch
89

910
__all__ = [
1011
"browser_base_fetch",
1112
"ChromiumLoader",
13+
"PlasmateLoader",
1214
"scrape_do_fetch",
1315
]
Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
"""
2+
PlasmateLoader — lightweight page fetcher using Plasmate (https://github.com/plasmate-labs/plasmate).
3+
4+
Plasmate is an open-source Rust browser engine that outputs a Structured Object Model (SOM)
5+
instead of raw HTML. It requires no Chrome process, uses ~64MB RAM per session vs ~300MB,
6+
and delivers 10-100x fewer tokens per page — lowering LLM costs for AI-powered scraping.
7+
8+
Install: pip install plasmate
9+
Docs: https://plasmate.app
10+
"""
11+
12+
import asyncio
13+
import subprocess
14+
import shutil
15+
from typing import AsyncIterator, Iterator, List, Optional
16+
17+
from langchain_community.document_loaders.base import BaseLoader
18+
from langchain_core.documents import Document
19+
20+
from ..utils import get_logger
21+
22+
# Module-level logger shared by PlasmateLoader and the helper functions below.
logger = get_logger("plasmate-loader")

# Error text raised whenever the `plasmate` binary/package cannot be found.
# Kept as a module constant so _check_plasmate() and PlasmateLoader._fetch_url()
# report the identical installation hint.
_INSTALL_MSG = (
    "plasmate is required for PlasmateLoader. "
    "Install it with: pip install plasmate\n"
    "Docs: https://plasmate.app"
)
29+
30+
31+
def _check_plasmate() -> str:
    """Locate the ``plasmate`` executable.

    Returns:
        Absolute path to the ``plasmate`` binary.

    Raises:
        ImportError: If the binary cannot be found on ``PATH`` or in the
            current interpreter's console-scripts directory.
    """
    path = shutil.which("plasmate")
    if path is None:
        # Importing the `plasmate` package cannot change PATH, so re-running a
        # plain shutil.which() after the import (as the previous version did)
        # always returns the same None. Instead, search the directory that
        # holds the current interpreter: pip installs console-script entry
        # points there, and that directory is not always on PATH (user-site
        # or some virtualenv setups).
        import os
        import sys

        path = shutil.which("plasmate", path=os.path.dirname(sys.executable))
    if path is None:
        raise ImportError(_INSTALL_MSG)
    return path
44+
45+
46+
class PlasmateLoader(BaseLoader):
    """Fetches pages using Plasmate — a lightweight Rust browser engine that outputs
    Structured Object Model (SOM) instead of raw HTML.

    Advantages over ChromiumLoader for static / server-rendered pages:
    - No Chrome/Playwright required — single binary, installs via pip
    - ~64MB RAM per session vs ~300MB for Chromium
    - 10-100x fewer tokens per page (SOM strips nav, ads, boilerplate)
    - Drops into existing ScrapeGraphAI workflows with minimal config changes

    For SPAs or pages that require JavaScript rendering, set ``fallback_to_chrome=True``
    to automatically retry with ChromiumLoader on empty or error responses.

    Attributes:
        urls: List of URLs to fetch.
        output_format: Plasmate output format — ``"text"`` (default, most compatible),
            ``"som"`` (full JSON), ``"markdown"``, or ``"links"``.
        timeout: Per-request timeout in seconds. Defaults to 30.
        selector: Optional ARIA role or CSS id selector to scope extraction
            (e.g. ``"main"`` or ``"#content"``).
        extra_headers: Optional dict of HTTP headers to pass to each request.
        fallback_to_chrome: If True, retry with ChromiumLoader when Plasmate
            returns empty content (useful for JS-heavy SPAs). Defaults to False.
        chrome_kwargs: Extra kwargs forwarded to ChromiumLoader when fallback is used.

    Example::

        from scrapegraphai.docloaders import PlasmateLoader

        loader = PlasmateLoader(
            urls=["https://docs.python.org/3/library/json.html"],
            output_format="text",
            timeout=30,
        )
        docs = loader.load()
        print(docs[0].page_content[:500])
    """

    def __init__(
        self,
        urls: List[str],
        *,
        output_format: str = "text",
        timeout: int = 30,
        selector: Optional[str] = None,
        extra_headers: Optional[dict] = None,
        fallback_to_chrome: bool = False,
        **chrome_kwargs,
    ):
        if output_format not in ("som", "text", "markdown", "links"):
            raise ValueError(
                f"output_format must be one of 'som', 'text', 'markdown', 'links'; got {output_format!r}"
            )
        self.urls = urls
        self.output_format = output_format
        self.timeout = timeout
        self.selector = selector
        self.extra_headers = extra_headers or {}
        self.fallback_to_chrome = fallback_to_chrome
        self.chrome_kwargs = chrome_kwargs

    def _build_cmd(self, url: str) -> List[str]:
        """Build the plasmate CLI command for a given URL.

        ``cmd[0]`` is the bare name "plasmate"; the caller replaces it with the
        resolved binary path before execution.
        """
        cmd = [
            "plasmate", "fetch", url,
            "--format", self.output_format,
            "--timeout", str(self.timeout * 1000),  # plasmate uses milliseconds
        ]
        if self.selector:
            cmd += ["--selector", self.selector]
        for key, value in self.extra_headers.items():
            cmd += ["--header", f"{key}: {value}"]
        return cmd

    def _fetch_url(self, url: str) -> str:
        """Synchronously fetch a URL via the plasmate binary.

        Returns:
            The stripped stdout on success, or ``""`` on non-zero exit or
            timeout (the caller decides whether to fall back or skip).

        Raises:
            ImportError: If the plasmate binary cannot be found.
        """
        binary = _check_plasmate()
        cmd = self._build_cmd(url)
        cmd[0] = binary  # use resolved path

        logger.info(f"[PlasmateLoader] Fetching: {url}")
        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=self.timeout + 5,  # outer kill timeout slightly above plasmate's
            )
            if result.returncode != 0:
                logger.warning(
                    f"[PlasmateLoader] plasmate exited {result.returncode} for {url}: {result.stderr[:200]}"
                )
                return ""
            content = result.stdout.strip()
            logger.info(f"[PlasmateLoader] Got {len(content)} chars from {url}")
            return content
        except subprocess.TimeoutExpired:
            logger.warning(f"[PlasmateLoader] Timeout fetching {url}")
            return ""
        except FileNotFoundError as exc:
            # The binary vanished between _check_plasmate() and execution;
            # chain the original error so the real cause stays in the traceback.
            raise ImportError(_INSTALL_MSG) from exc

    def _fallback_fetch(self, url: str) -> str:
        """Fall back to ChromiumLoader when Plasmate returns empty content."""
        # Imported lazily so Plasmate-only installs never touch Playwright/Chrome.
        from .chromium import ChromiumLoader

        logger.info(f"[PlasmateLoader] Falling back to ChromiumLoader for: {url}")
        loader = ChromiumLoader([url], **self.chrome_kwargs)
        docs = loader.load()
        return docs[0].page_content if docs else ""

    def lazy_load(self) -> Iterator[Document]:
        """Yield Documents one at a time, fetching each URL synchronously.

        URLs that yield empty content (after any Chrome fallback) are logged
        and skipped rather than producing empty Documents.
        """
        for url in self.urls:
            content = self._fetch_url(url)

            if not content.strip() and self.fallback_to_chrome:
                content = self._fallback_fetch(url)

            if not content.strip():
                logger.warning(f"[PlasmateLoader] Empty content for {url} — skipping")
                continue

            yield Document(
                page_content=content,
                metadata={
                    "source": url,
                    "loader": "plasmate",
                    "format": self.output_format,
                },
            )

    async def _async_fetch_url(self, url: str) -> str:
        """Asynchronously fetch a URL by running the plasmate binary in a thread pool."""
        # get_running_loop() (not the deprecated get_event_loop()) is the
        # correct API inside a coroutine.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self._fetch_url, url)

    async def alazy_load(self) -> AsyncIterator[Document]:
        """Asynchronously yield Documents, fetching all URLs concurrently."""
        tasks = [self._async_fetch_url(url) for url in self.urls]
        results = await asyncio.gather(*tasks)

        for url, content in zip(self.urls, results):
            if not content.strip() and self.fallback_to_chrome:
                # _fallback_fetch blocks on a full Chromium render; run it in
                # the executor so the event loop keeps servicing other tasks.
                loop = asyncio.get_running_loop()
                content = await loop.run_in_executor(None, self._fallback_fetch, url)

            if not content.strip():
                logger.warning(f"[PlasmateLoader] Empty content for {url} — skipping")
                continue

            yield Document(
                page_content=content,
                metadata={
                    "source": url,
                    "loader": "plasmate",
                    "format": self.output_format,
                },
            )

scrapegraphai/nodes/fetch_node.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,10 @@ def __init__(
8383
None if node_config is None else node_config.get("scrape_do", None)
8484
)
8585

86+
self.plasmate = (
87+
None if node_config is None else node_config.get("plasmate", None)
88+
)
89+
8690
self.storage_state = (
8791
None if node_config is None else node_config.get("storage_state", None)
8892
)
@@ -351,6 +355,19 @@ def handle_web_source(self, state, source):
351355
)
352356

353357
document = [Document(page_content=data, metadata={"source": source})]
358+
elif self.plasmate is not None:
359+
from ..docloaders.plasmate import PlasmateLoader
360+
361+
plasmate_cfg = self.plasmate if isinstance(self.plasmate, dict) else {}
362+
loader = PlasmateLoader(
363+
[source],
364+
output_format=plasmate_cfg.get("output_format", "text"),
365+
timeout=plasmate_cfg.get("timeout", self.timeout or 30),
366+
selector=plasmate_cfg.get("selector"),
367+
extra_headers=plasmate_cfg.get("extra_headers", {}),
368+
fallback_to_chrome=plasmate_cfg.get("fallback_to_chrome", False),
369+
)
370+
document = loader.load()
354371
else:
355372
loader = ChromiumLoader(
356373
[source],

0 commit comments

Comments
 (0)