1 change: 1 addition & 0 deletions packages/crawlee/package.json
@@ -62,6 +62,7 @@
"@crawlee/core": "3.16.0",
"@crawlee/http": "3.16.0",
"@crawlee/jsdom": "3.16.0",
"@crawlee/lightpanda": "3.16.0",
"@crawlee/linkedom": "3.16.0",
"@crawlee/playwright": "3.16.0",
"@crawlee/puppeteer": "3.16.0",
10 changes: 10 additions & 0 deletions packages/lightpanda-crawler/CHANGELOG.md
@@ -0,0 +1,10 @@
# Changelog

All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [3.16.0] - Initial release

- Add `LightpandaCrawler` for ultra-fast headless crawling via [Lightpanda](https://lightpanda.io) over CDP.
52 changes: 52 additions & 0 deletions packages/lightpanda-crawler/README.md
@@ -0,0 +1,52 @@
# @crawlee/lightpanda

[Lightpanda](https://lightpanda.io) browser integration for [Crawlee](https://crawlee.dev).

Lightpanda is a headless browser built from scratch for machines — no graphical rendering, instant startup, up to **10× faster** and **10× less memory** than Chrome. It is compatible with Playwright/Puppeteer via the Chrome DevTools Protocol (CDP).

## Installation

```sh
npm install @crawlee/lightpanda playwright
# Optional: let Crawlee manage the Lightpanda process automatically
npm install @lightpanda/browser
```

## Usage

```typescript
import { LightpandaCrawler } from '@crawlee/lightpanda';

const crawler = new LightpandaCrawler({
    async requestHandler({ page, request, log }) {
        log.info(`Processing ${request.url}`);
        const title = await page.title();
        log.info(`Title: ${title}`);
    },
});

await crawler.run(['https://example.com']);
```

### External Lightpanda server

If you manage the Lightpanda process yourself (e.g. in Docker):

```typescript
const crawler = new LightpandaCrawler({
    launchContext: {
        autoStart: false,
        host: '127.0.0.1',
        port: 9222,
    },
    async requestHandler({ page }) {
        const title = await page.title();
        console.log(title);
    },
});
```

## Requirements

- Lightpanda is **Linux-only** (as of March 2026).
- Either install `@lightpanda/browser` for automatic process management, or supply `lightpandaPath` pointing to a Lightpanda binary.
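
The second option can be sketched as follows. This assumes the `launchContext.lightpandaConfig` option shape used by `examples/scrape-books.ts` in this PR; the binary path is illustrative.

```typescript
import { LightpandaCrawler } from '@crawlee/lightpanda';

const crawler = new LightpandaCrawler({
    launchContext: {
        lightpandaConfig: {
            // Illustrative path — point this at your own Lightpanda binary.
            lightpandaPath: '/usr/local/bin/lightpanda',
        },
    },
    async requestHandler({ page, log }) {
        log.info(await page.title());
    },
});

await crawler.run(['https://example.com']);
```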
98 changes: 98 additions & 0 deletions packages/lightpanda-crawler/examples/README.md
@@ -0,0 +1,98 @@
# Lightpanda Crawler — Examples

This folder contains runnable example scripts for `@crawlee/lightpanda`.

## scrape-books.ts

Crawls the entire [books.toscrape.com](https://books.toscrape.com) catalogue (50 pages, 1,000 books) and saves title, price, star rating and stock status to a Crawlee dataset.

### Prerequisites

- [Node.js](https://nodejs.org) ≥ 18
- [tsx](https://github.com/privatenumber/tsx) — `npm install -g tsx`
- A running Lightpanda CDP server (see options below)

### Running Lightpanda

#### Option A — Docker (recommended, works on any OS)

```bash
docker run -d --name lightpanda -p 9222:9222 lightpanda/browser:nightly
```

Then run the example with `autoStart` disabled:

```bash
LIGHTPANDA_AUTO_START=false tsx --tsconfig tsconfig.json scrape-books.ts
```

Stop the container when you're done:

```bash
docker rm -f lightpanda
```

#### Option B — Auto-start via `@lightpanda/browser` npm package (Linux only)

This option downloads and manages the Lightpanda binary automatically. It only works on Linux because Lightpanda native binaries are not yet available for macOS or Windows.

```bash
npm install @lightpanda/browser
tsx --tsconfig tsconfig.json scrape-books.ts
```

#### Option C — Explicit binary path (Linux only)

If you already have a Lightpanda binary installed:

```bash
LIGHTPANDA_PATH=/usr/local/bin/lightpanda tsx --tsconfig tsconfig.json scrape-books.ts
```

#### Option D — Pre-running Lightpanda server (Linux only)

Start the server manually, then point the crawler at it:

```bash
# Terminal 1
lightpanda serve --port 9222

# Terminal 2
LIGHTPANDA_AUTO_START=false tsx --tsconfig tsconfig.json scrape-books.ts
```

### Environment Variables

| Variable | Default | Description |
|---|---|---|
| `LIGHTPANDA_AUTO_START` | `true` | Set to `false` to connect to an already-running Lightpanda server (required when connecting to the Docker container in Option A) |
| `LIGHTPANDA_PATH` | _(auto-detected)_ | Absolute path to the `lightpanda` binary (Option C) |

### Expected Output

The crawler logs progress as it navigates page by page:

```
INFO LightpandaCrawler: Starting the crawler.
INFO LightpandaCrawler: Scraping page 1: https://books.toscrape.com/catalogue/page-1.html
INFO LightpandaCrawler: Found 20 books on page 1
INFO LightpandaCrawler: Scraping page 2: https://books.toscrape.com/catalogue/page-2.html
...
INFO LightpandaCrawler: Scraping page 50: https://books.toscrape.com/catalogue/page-50.html
INFO LightpandaCrawler: Found 20 books on page 50
INFO LightpandaCrawler: Pagination complete. Scraped 50 pages.
✓ Crawl complete. Scraped 1000 books total.
Sample: {"url":"...","title":"A Light in the Attic","price":"£51.77","rating":"3","inStock":true}
```

Results are saved to `storage/datasets/default/` as JSON files.

### Known Lightpanda Limitations

These limitations are specific to the current state of Lightpanda's CDP implementation and are reflected in the example code:

| Limitation | Workaround applied |
|---|---|
| Lightpanda reuses the same CDP target ID (`FID-0000000001`) for every new page within a session, causing Playwright to throw `Duplicate target` when a second page is opened | All pagination is handled inside a single request handler using `page.goto()` instead of `enqueueLinks()` |
| Playwright's `waitForSelector` injects a custom selector engine that uses DOM APIs not yet supported by Lightpanda | `waitForSelector` is omitted; `waitUntil: 'domcontentloaded'` + direct `page.evaluate()` is used instead |
| Retries reconnect to Lightpanda and trigger the duplicate-target crash | `maxRequestRetries: 0` is set |
139 changes: 139 additions & 0 deletions packages/lightpanda-crawler/examples/scrape-books.ts
@@ -0,0 +1,139 @@
/**
 * Lightpanda crawler example — scrape books from books.toscrape.com
 *
 * This script crawls the catalogue at https://books.toscrape.com, extracts
 * book titles, prices and star ratings from every page, and follows the
 * "next" pagination link until all pages are visited.
 *
 * Run on Linux with Lightpanda installed:
 *
 *   # Option A — auto-start via @lightpanda/browser npm package
 *   npm install @lightpanda/browser
 *   npx tsx scrape-books.ts
 *
 *   # Option B — explicit binary path
 *   LIGHTPANDA_PATH=/usr/local/bin/lightpanda npx tsx scrape-books.ts
 *
 *   # Option C — pre-running Lightpanda server (./lightpanda serve --port 9222)
 *   LIGHTPANDA_AUTO_START=false npx tsx scrape-books.ts
 */

import { Dataset, log, LogLevel } from '@crawlee/core';
import { LightpandaCrawler } from '@crawlee/lightpanda';

log.setLevel(LogLevel.INFO);

// ── Configuration ────────────────────────────────────────────────────────────

const LIGHTPANDA_PATH = process.env.LIGHTPANDA_PATH;
const AUTO_START = process.env.LIGHTPANDA_AUTO_START !== 'false';
const START_URL = 'https://books.toscrape.com/catalogue/page-1.html';

interface BookRecord {
    url: string;
    title: string;
    price: string;
    rating: string;
    inStock: boolean;
}

// ── Crawler ──────────────────────────────────────────────────────────────────

const crawler = new LightpandaCrawler({
    launchContext: {
        lightpandaConfig: {
            host: '127.0.0.1',
            port: 9222,
            autoStart: AUTO_START,
            ...(LIGHTPANDA_PATH ? { lightpandaPath: LIGHTPANDA_PATH } : {}),
        },
    },

    // LightpandaCrawler enforces maxConcurrency: 1 and defaults maxRequestRetries: 0
    // because Lightpanda reuses the same CDP target ID for every page.
    // Pagination is handled inside a single requestHandler using page.goto().
    requestHandlerTimeoutSecs: 300,

    async requestHandler({ page, request, pushData, log: reqLog }) {
        let currentUrl: string = request.url;
        let pageNum = 1;

        while (currentUrl) {
            reqLog.info(`Scraping page ${pageNum}: ${currentUrl}`);

            if (pageNum > 1) {
                await page.goto(currentUrl, { waitUntil: 'domcontentloaded', timeout: 30_000 });
            }
            // books.toscrape.com is static HTML — all article elements are present
            // in the DOM after domcontentloaded. Playwright's waitForSelector injects
            // a custom selector engine that requires DOM APIs not yet supported by
            // Lightpanda, so we skip it and go straight to evaluate().

            // ── Extract book data ──────────────────────────────────────────────
            const books: BookRecord[] = await page.evaluate(() => {
                const ratingWords: Record<string, string> = {
                    One: '1',
                    Two: '2',
                    Three: '3',
                    Four: '4',
                    Five: '5',
                };

                return Array.from(document.querySelectorAll('article.product_pod')).map((el) => {
                    const titleEl = el.querySelector('h3 a');
                    const priceEl = el.querySelector('p.price_color');
                    const ratingEl = el.querySelector('p.star-rating');
                    const stockEl = el.querySelector('p.availability');

                    const ratingClass = ratingEl?.className.replace('star-rating', '').trim() ?? '';

                    return {
                        url: (titleEl as HTMLAnchorElement | null)?.href ?? '',
                        title: titleEl?.getAttribute('title') ?? titleEl?.textContent?.trim() ?? '',
                        price: priceEl?.textContent?.trim() ?? '',
                        rating: ratingWords[ratingClass] ?? ratingClass,
                        inStock: (stockEl?.textContent?.trim() ?? '').toLowerCase().includes('in stock'),
                    };
                });
            });

            reqLog.info(`Found ${books.length} books on page ${pageNum}`);
            await pushData(books);

            // ── Follow pagination using page.goto() to avoid multi-target issues ──
            const nextUrl: string | null = await page.evaluate(() => {
                const nextLink = document.querySelector('li.next a') as HTMLAnchorElement | null;
                if (!nextLink) return null;
                return new URL(nextLink.href, 'https://books.toscrape.com/catalogue/').href;
            });

            currentUrl = nextUrl ?? '';
            pageNum++;
        }

        reqLog.info(`Pagination complete. Scraped ${pageNum - 1} pages.`);
    },
});

// ── Run ───────────────────────────────────────────────────────────────────────

async function main() {
    await crawler.run([START_URL]);

    const dataset = await Dataset.open();
    const { items } = await dataset.getData();

    log.info(`\n✓ Crawl complete. Scraped ${items.length} books total.`);
    if (items.length > 0) {
        log.info('Sample (first 3 books):');
        for (const book of items.slice(0, 3)) {
            const b = book as BookRecord;
            log.info(`  ${b.rating}★ ${b.price} ${b.title}`);
        }
    }
}

main().catch((err) => {
    log.error(String(err));
    process.exit(1);
});
31 changes: 31 additions & 0 deletions packages/lightpanda-crawler/examples/tsconfig.json
@@ -0,0 +1,31 @@
{
"compilerOptions": {
"target": "ES2022",
"module": "CommonJS",
"moduleResolution": "Node",
"esModuleInterop": true,
"allowSyntheticDefaultImports": true,
"strict": false,
"skipLibCheck": true,
"resolveJsonModule": true,
"baseUrl": ".",
"paths": {
"@crawlee/lightpanda": ["../src/index.ts"],
"@crawlee/basic": ["../../basic-crawler/src/index.ts"],
"@crawlee/browser": ["../../browser-crawler/src/index.ts"],
"@crawlee/browser-pool": ["../../browser-pool/src/index.ts"],
"@crawlee/cheerio": ["../../cheerio-crawler/src/index.ts"],
"@crawlee/core": ["../../core/src/index.ts"],
"@crawlee/http": ["../../http-crawler/src/index.ts"],
"@crawlee/jsdom": ["../../jsdom-crawler/src/index.ts"],
"@crawlee/linkedom": ["../../linkedom-crawler/src/index.ts"],
"@crawlee/memory-storage": ["../../memory-storage/src/index.ts"],
"@crawlee/playwright": ["../../playwright-crawler/src/index.ts"],
"@crawlee/puppeteer": ["../../puppeteer-crawler/src/index.ts"],
"@crawlee/types": ["../../types/src/index.ts"],
"@crawlee/utils": ["../../utils/src/index.ts"],
"crawlee": ["../../crawlee/src/index.ts"]
}
},
"include": ["./**/*.ts"]
}