|
| 1 | +""" |
| 2 | +PlasmateLoader — lightweight page fetcher using Plasmate (https://github.com/plasmate-labs/plasmate). |
| 3 | +
|
| 4 | +Plasmate is an open-source Rust browser engine that outputs a Structured Object Model (SOM) |
| 5 | +instead of raw HTML. It requires no Chrome process, uses ~64MB RAM per session vs ~300MB, |
| 6 | +and delivers 10-100x fewer tokens per page — lowering LLM costs for AI-powered scraping. |
| 7 | +
|
| 8 | +Install: pip install plasmate |
| 9 | +Docs: https://plasmate.app |
| 10 | +""" |
| 11 | + |
import asyncio
import os
import shutil
import subprocess
import sys
from typing import AsyncIterator, Iterator, List, Optional
| 16 | + |
| 17 | +from langchain_community.document_loaders.base import BaseLoader |
| 18 | +from langchain_core.documents import Document |
| 19 | + |
| 20 | +from ..utils import get_logger |
| 21 | + |
# Module-level logger, tagged so Plasmate fetch activity is easy to filter in logs.
logger = get_logger("plasmate-loader")
| 23 | + |
| 24 | +_INSTALL_MSG = ( |
| 25 | + "plasmate is required for PlasmateLoader. " |
| 26 | + "Install it with: pip install plasmate\n" |
| 27 | + "Docs: https://plasmate.app" |
| 28 | +) |
| 29 | + |
| 30 | + |
| 31 | +def _check_plasmate() -> str: |
| 32 | + """Return the path to the plasmate binary, or raise ImportError.""" |
| 33 | + path = shutil.which("plasmate") |
| 34 | + if path is None: |
| 35 | + # Also check the Python-installed entry point location |
| 36 | + try: |
| 37 | + import plasmate as _p # noqa: F401 |
| 38 | + path = shutil.which("plasmate") |
| 39 | + except ImportError: |
| 40 | + pass |
| 41 | + if path is None: |
| 42 | + raise ImportError(_INSTALL_MSG) |
| 43 | + return path |
| 44 | + |
| 45 | + |
class PlasmateLoader(BaseLoader):
    """Fetch pages using Plasmate — a lightweight Rust browser engine that outputs
    a Structured Object Model (SOM) instead of raw HTML.

    Advantages over ChromiumLoader for static / server-rendered pages:
      - No Chrome/Playwright required — single binary, installs via pip
      - ~64MB RAM per session vs ~300MB for Chromium
      - 10-100x fewer tokens per page (SOM strips nav, ads, boilerplate)
      - Drops into existing ScrapeGraphAI workflows with minimal config changes

    For SPAs or pages that require JavaScript rendering, set
    ``fallback_to_chrome=True`` to automatically retry with ChromiumLoader on
    empty or error responses.

    Attributes:
        urls: List of URLs to fetch.
        output_format: Plasmate output format — ``"text"`` (default, most
            compatible), ``"som"`` (full JSON), ``"markdown"``, or ``"links"``.
        timeout: Per-request timeout in seconds. Defaults to 30.
        selector: Optional ARIA role or CSS id selector to scope extraction
            (e.g. ``"main"`` or ``"#content"``).
        extra_headers: Optional dict of HTTP headers to pass to each request.
        fallback_to_chrome: If True, retry with ChromiumLoader when Plasmate
            returns empty content (useful for JS-heavy SPAs). Defaults to False.
        chrome_kwargs: Extra kwargs forwarded to ChromiumLoader when fallback
            is used.

    Example::

        from scrapegraphai.docloaders import PlasmateLoader

        loader = PlasmateLoader(
            urls=["https://docs.python.org/3/library/json.html"],
            output_format="text",
            timeout=30,
        )
        docs = loader.load()
        print(docs[0].page_content[:500])
    """

    # Formats understood by ``plasmate fetch --format``.
    _VALID_FORMATS = ("som", "text", "markdown", "links")

    def __init__(
        self,
        urls: List[str],
        *,
        output_format: str = "text",
        timeout: int = 30,
        selector: Optional[str] = None,
        extra_headers: Optional[dict] = None,
        fallback_to_chrome: bool = False,
        **chrome_kwargs,
    ):
        """Validate options and store configuration.

        Raises:
            ValueError: If ``output_format`` is not one of the supported formats.
        """
        if output_format not in self._VALID_FORMATS:
            raise ValueError(
                f"output_format must be one of 'som', 'text', 'markdown', 'links'; got {output_format!r}"
            )
        self.urls = urls
        self.output_format = output_format
        self.timeout = timeout
        self.selector = selector
        self.extra_headers = extra_headers or {}
        self.fallback_to_chrome = fallback_to_chrome
        self.chrome_kwargs = chrome_kwargs

    def _build_cmd(self, url: str) -> List[str]:
        """Build the plasmate CLI argument vector for a single URL."""
        cmd = [
            "plasmate", "fetch", url,
            "--format", self.output_format,
            # plasmate's --timeout flag is expressed in milliseconds.
            "--timeout", str(self.timeout * 1000),
        ]
        if self.selector:
            cmd += ["--selector", self.selector]
        for key, value in self.extra_headers.items():
            cmd += ["--header", f"{key}: {value}"]
        return cmd

    def _fetch_url(self, url: str) -> str:
        """Synchronously fetch one URL via the plasmate binary.

        Returns:
            The page content, or an empty string on any per-URL failure
            (non-zero exit code, timeout) so callers can decide whether
            to fall back.

        Raises:
            ImportError: If the plasmate binary is not installed.
        """
        binary = _check_plasmate()
        cmd = self._build_cmd(url)
        cmd[0] = binary  # use resolved path

        logger.info(f"[PlasmateLoader] Fetching: {url}")
        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                # Outer kill timeout slightly above plasmate's own so the
                # engine gets a chance to time out cleanly first.
                timeout=self.timeout + 5,
            )
        except subprocess.TimeoutExpired:
            logger.warning(f"[PlasmateLoader] Timeout fetching {url}")
            return ""
        except FileNotFoundError:
            # Binary disappeared between _check_plasmate() and run().
            raise ImportError(_INSTALL_MSG) from None

        if result.returncode != 0:
            logger.warning(
                f"[PlasmateLoader] plasmate exited {result.returncode} for {url}: {result.stderr[:200]}"
            )
            return ""
        content = result.stdout.strip()
        logger.info(f"[PlasmateLoader] Got {len(content)} chars from {url}")
        return content

    def _fallback_fetch(self, url: str) -> str:
        """Fetch *url* with ChromiumLoader (used when Plasmate returns nothing)."""
        # Local import so Plasmate-only users need no Chromium dependencies.
        from .chromium import ChromiumLoader

        logger.info(f"[PlasmateLoader] Falling back to ChromiumLoader for: {url}")
        loader = ChromiumLoader([url], **self.chrome_kwargs)
        docs = loader.load()
        return docs[0].page_content if docs else ""

    def _finalize(self, url: str, content: str) -> Optional[Document]:
        """Apply the fallback / empty-content policy and wrap content in a Document.

        Shared by ``lazy_load`` and ``alazy_load`` so the skip/fallback rules
        cannot drift apart. Returns None when the page yielded no usable
        content and should be skipped.
        """
        if not content.strip() and self.fallback_to_chrome:
            content = self._fallback_fetch(url)

        if not content.strip():
            logger.warning(f"[PlasmateLoader] Empty content for {url} — skipping")
            return None

        return Document(
            page_content=content,
            metadata={
                "source": url,
                "loader": "plasmate",
                "format": self.output_format,
            },
        )

    def lazy_load(self) -> Iterator[Document]:
        """Yield Documents one at a time, fetching each URL synchronously."""
        for url in self.urls:
            doc = self._finalize(url, self._fetch_url(url))
            if doc is not None:
                yield doc

    async def _async_fetch_url(self, url: str) -> str:
        """Fetch one URL without blocking the event loop (runs in a thread).

        Uses ``get_running_loop()`` rather than the deprecated
        ``get_event_loop()`` — we are always inside a coroutine here, so a
        running loop is guaranteed to exist.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self._fetch_url, url)

    async def alazy_load(self) -> AsyncIterator[Document]:
        """Asynchronously yield Documents, fetching all URLs concurrently."""
        loop = asyncio.get_running_loop()
        results = await asyncio.gather(
            *(self._async_fetch_url(url) for url in self.urls)
        )

        for url, content in zip(self.urls, results):
            # _finalize may invoke the blocking Chromium fallback, so keep it
            # off the event loop as well.
            doc = await loop.run_in_executor(None, self._finalize, url, content)
            if doc is not None:
                yield doc
0 commit comments