apify
diff --git a/‎docs/guides/ai_crawler.mdx‎
Lines changed: 150 additions & 0 deletions b/‎docs/guides/ai_crawler.mdx‎
Lines changed: 150 additions & 0 deletions
diff --git a/‎docs/guides/architecture_overview.mdx‎
Lines changed: 10 additions & 1 deletion b/‎docs/guides/architecture_overview.mdx‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎docs/guides/code_examples/ai_crawler/additional_instructions_example.py‎
Lines changed: 44 additions & 0 deletions b/‎docs/guides/code_examples/ai_crawler/additional_instructions_example.py‎
Lines changed: 44 additions & 0 deletions
diff --git a/‎docs/guides/code_examples/ai_crawler/basic_example.py‎
Lines changed: 41 additions & 0 deletions b/‎docs/guides/code_examples/ai_crawler/basic_example.py‎
Lines changed: 41 additions & 0 deletions
diff --git a/‎docs/guides/code_examples/ai_crawler/custom_distiller_example.py‎
Lines changed: 67 additions & 0 deletions b/‎docs/guides/code_examples/ai_crawler/custom_distiller_example.py‎
Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,150 @@
+---
+id: ai-crawler
+title: AI crawler
+description: Learn how to use AiCrawler to extract structured data from HTML pages with an LLM.
+---
+
+import ApiLink from '@site/src/components/ApiLink';
+import CodeBlock from '@theme/CodeBlock';
+
+import BasicExample from '!!raw-loader!./code_examples/ai_crawler/basic_example.py';
+import AdditionalInstructionsExample from '!!raw-loader!./code_examples/ai_crawler/additional_instructions_example.py';
+import CustomDistillerExample from '!!raw-loader!./code_examples/ai_crawler/custom_distiller_example.py';
+import SelectorExtractorExample from '!!raw-loader!./code_examples/ai_crawler/selector_extractor_example.py';
+import UsageLimitExample from '!!raw-loader!./code_examples/ai_crawler/usage_limit_example.py';
+
+An <ApiLink to="class/AiCrawler">`AiCrawler`</ApiLink> extracts structured data from a page with an LLM. It fetches each page over plain HTTP and parses it with Parsel, then exposes an <ApiLink to="class/ExtractFunction">`extract`</ApiLink> helper: pass a Pydantic model and get a validated instance back. Instead of writing CSS selectors for every field, you describe the data with a schema and the model fills it in.
+
+The model layer is [Pydantic AI](https://ai.pydantic.dev/), so any provider it supports (OpenAI, Anthropic, Gemini, Ollama, ...) works through the `model` argument. The context is an <ApiLink to="class/AiCrawlingContext">`AiCrawlingContext`</ApiLink>, which extends the <ApiLink to="class/ParselCrawlingContext">`ParselCrawlingContext`</ApiLink>, so the manual <ApiLink to="class/ParselCrawlingContext#selector">`selector`</ApiLink> and <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> stay available next to <ApiLink to="class/ExtractFunction">`extract`</ApiLink>.
+
+:::caution Experimental
+
+<ApiLink to="class/AiCrawler">`AiCrawler`</ApiLink> is experimental. Its public API may change in future releases.
+
+:::
+
+## When to use AiCrawler
+
+Use <ApiLink to="class/AiCrawler">`AiCrawler`</ApiLink> when:
+
+- Selectors are unknown or brittle. The model reads the content, so it tolerates markup that varies or changes.
+- One schema spans many layouts. A single Pydantic model fits differently structured pages, with no per-page selectors.
+- You need a quick prototype. You describe the data with a schema instead of writing selectors.
+
+For pages with a stable, known structure, a plain <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> or <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> is cheaper, since it runs no model calls.
+
+<ApiLink to="class/AiCrawler">`AiCrawler`</ApiLink> fetches pages over plain HTTP and does not render JavaScript. For pages that need a browser, or for complex multi-step interactions, use <ApiLink to="class/StagehandCrawler">`StagehandCrawler`</ApiLink>. See the [Stagehand crawler guide](./stagehand-crawler).
+
+## Installation
+
+<ApiLink to="class/AiCrawler">`AiCrawler`</ApiLink> requires the `ai` optional dependency group:
+
+```bash
+pip install 'crawlee[ai]'
+```
+
+or with uv:
+
+```bash
+uv add 'crawlee[ai]'
+```
+
+The `ai` extra installs the OpenAI integration by default. To use another provider, add the matching [pydantic-ai-slim](https://ai.pydantic.dev/install/#use-with-pydantic-ai-slim) extra. For example, for Anthropic:
+
+```bash
+pip install 'crawlee[ai]' 'pydantic-ai-slim[anthropic]'
+```
+
+## Basic usage
+
+Provide a `model` and call <ApiLink to="class/AiCrawlingContext#extract">`context.extract`</ApiLink> with a Pydantic model inside the handler. The example below extracts an article and pushes it to the dataset.
+
+<CodeBlock className="language-python">
+    {BasicExample}
+</CodeBlock>
+
+The `model` builds the crawler's default extractor, an <ApiLink to="class/AiDirectExtractor">`AiDirectExtractor`</ApiLink>. With neither `model` nor `extractor`, a default OpenAI model is used.
+
+The `model` argument accepts a provider-prefixed name or a Pydantic AI `Model` instance.
+
+```python
+# A provider-prefixed name reads credentials from the provider's environment variable (e.g. OPENAI_API_KEY).
+crawler = AiCrawler(model='openai:gpt-5.4-nano')
+
+# A Model instance takes credentials explicitly.
+from pydantic_ai.models.openai import OpenAIChatModel
+from pydantic_ai.providers.openai import OpenAIProvider
+
+model = OpenAIChatModel('gpt-5.4-nano', provider=OpenAIProvider(api_key='...'))
+crawler = AiCrawler(model=model)
+```
+
+## Extractors
+
+An extractor turns a page into your schema. Extractors implement different strategies for working with the LLM, and each one uses an <ApiLink to="class/AiHtmlDistiller">`AiHtmlDistiller`</ApiLink> to shape the model's input. Crawlee ships two.
+
+### AiDirectExtractor
+
+<ApiLink to="class/AiDirectExtractor">`AiDirectExtractor`</ApiLink> sends the distilled page to the model in one call. The schema is the model's output type. Pydantic AI validates the result; on a mismatch, it sends the error back to the model to fix, bounded by `retries`.
+
+It reads each page on its own, so extraction is accurate per page. It accepts schemas of any shape: nested models, lists, dictionaries, unions, and deep nesting. The cost is one model call per page, which scales poorly on a large site.
+
+Use `additional_instructions` to focus the model on the data you want:
+
+<CodeBlock className="language-python">
+    {AdditionalInstructionsExample}
+</CodeBlock>
+
+### AiSelectorExtractor
+
+<ApiLink to="class/AiSelectorExtractor">`AiSelectorExtractor`</ApiLink> asks the model for reusable CSS selectors on the first page of a route, caches them, and reuses them with no model call on later pages of the same layout, so it scales to large sites. When a page matches none of the cached selectors (a different markup variant), it generates and caches a new set, so one bucket can hold several variants. If selector generation fails, or the schema shape is unsupported, it degrades to the `fallback` extractor when one is set, and raises otherwise. Selectors are bucketed by `cache_tag`, which defaults to the request label, so each route keeps its own set. The cache is persisted to a <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink>, so a later run reuses selectors learned earlier.
+
+<CodeBlock className="language-python">
+    {SelectorExtractorExample}
+</CodeBlock>
+
+It supports schemas built from scalar fields, lists of scalars, lists of items, and a single nested item, one level deep. For shapes it cannot serve (such as a `dict` field), set a `fallback` or use <ApiLink to="class/AiDirectExtractor">`AiDirectExtractor`</ApiLink>.
+
+Both extractors share two more knobs. `retries` caps how many times the model may fix output that fails schema validation (default 1 for <ApiLink to="class/AiDirectExtractor">`AiDirectExtractor`</ApiLink>, 3 for <ApiLink to="class/AiSelectorExtractor">`AiSelectorExtractor`</ApiLink>). `instructions` replaces the base task instructions entirely.
+
+## Distillers
+
+A distiller reduces raw HTML to a compact representation that the model can read cheaply. Each extractor uses one. To swap in a different distiller, pass it as the extractor's `distiller` argument (the crawler itself has no `distiller` argument).
+
+<ApiLink to="class/AiDirectExtractor">`AiDirectExtractor`</ApiLink> defaults to an <ApiLink to="class/AiCleanHtmlDistiller">`AiCleanHtmlDistiller`</ApiLink>: cleaned, structure-preserving HTML that keeps the full page text. <ApiLink to="class/AiSelectorExtractor">`AiSelectorExtractor`</ApiLink> uses an <ApiLink to="class/AiSkeletonDistiller">`AiSkeletonDistiller`</ApiLink> internally to ask the model for selectors; you rarely set it yourself.
+
+### Custom distiller
+
+Subclass <ApiLink to="class/BaseAiHtmlDistiller">`BaseAiHtmlDistiller`</ApiLink> and implement <ApiLink to="class/BaseAiHtmlDistiller#distill">`distill`</ApiLink> to send a different representation. Set `prompt_notes` so the model knows the input format. The extractor appends the notes to its instructions.
+
+The example below converts the cleaned page to Markdown with [html-to-markdown](https://pypi.org/project/html-to-markdown/), an extra dependency:
+
+```bash
+pip install html-to-markdown
+```
+
+<CodeBlock className="language-python">
+    {CustomDistillerExample}
+</CodeBlock>
+
+## Extract options
+
+<ApiLink to="class/AiCrawlingContext#extract">`context.extract`</ApiLink> takes options alongside the schema:
+
+- `scope` - a CSS selector that restricts extraction to the first matching subtree (e.g. `main` or `article.post`). It saves tokens and keeps the model away from unrelated parts of the page.
+- `cache_tag` - the bucket for the selectors cached by <ApiLink to="class/AiSelectorExtractor">`AiSelectorExtractor`</ApiLink>. It defaults to the request label.
+- `additional_instructions` - extra instructions for this call, appended to the base instructions. With <ApiLink to="class/AiSelectorExtractor">`AiSelectorExtractor`</ApiLink> they steer the one-time selector generation, not each extraction, so use them to point the model at the right region.
+
+## Usage and cost
+
+Token usage accumulates on <ApiLink to="class/AiCrawlingContext#ai_usage">`context.ai_usage`</ApiLink>, and on <ApiLink to="class/AiCrawler#ai_usage">`crawler.ai_usage`</ApiLink> for the whole crawl. The accumulator is an <ApiLink to="class/AiUsageStats">`AiUsageStats`</ApiLink> with <ApiLink to="class/AiUsageStats#requests">`requests`</ApiLink>, <ApiLink to="class/AiUsageStats#input_tokens">`input_tokens`</ApiLink>, <ApiLink to="class/AiUsageStats#output_tokens">`output_tokens`</ApiLink>, and <ApiLink to="class/AiUsageStats#total_tokens">`total_tokens`</ApiLink>.
+
+To cap spend, pass `usage_limits` (a pydantic-ai `UsageLimits`) to an extractor. It applies to every model run, and <ApiLink to="class/ExtractFunction">`extract`</ApiLink> raises `UsageLimitExceeded` when a page needs more. The example below caps each extraction, logs and skips pages that exceed it, and stops the whole crawl once a token budget is spent.
+
+<CodeBlock className="language-python">
+    {UsageLimitExample}
+</CodeBlock>
+
+## Conclusion
+
+This guide introduced <ApiLink to="class/AiCrawler">`AiCrawler`</ApiLink> and its <ApiLink to="class/ExtractFunction">`extract`</ApiLink> helper, the <ApiLink to="class/AiDirectExtractor">`AiDirectExtractor`</ApiLink> and <ApiLink to="class/AiSelectorExtractor">`AiSelectorExtractor`</ApiLink> strategies, the built-in and custom distillers, the extract options, and how to track and cap token usage and cost. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
@@ -49,6 +49,8 @@ class ParselCrawler
 
 class BeautifulSoupCrawler
 
+class AiCrawler
+
 class PlaywrightCrawler
 
 class AdaptivePlaywrightCrawler
@@ -65,18 +67,20 @@ BasicCrawler --|> AdaptivePlaywrightCrawler
 AbstractHttpCrawler --|> HttpCrawler
 AbstractHttpCrawler --|> ParselCrawler
 AbstractHttpCrawler --|> BeautifulSoupCrawler
+AbstractHttpCrawler --|> AiCrawler
 PlaywrightCrawler --|> StagehandCrawler
 ```
 
 ### HTTP crawlers
 
 HTTP crawlers use HTTP clients to fetch pages and parse them with HTML parsing libraries. They are fast and efficient for sites that do not require JavaScript rendering. HTTP clients are Crawlee components that wrap around HTTP libraries like [httpx](https://www.python-httpx.org/), [curl-impersonate](https://github.com/lwthiker/curl-impersonate) or [impit](https://apify.github.io/impit) and handle HTTP communication for requests and responses. You can learn more about them in the [HTTP clients guide](./http-clients).
 
-HTTP crawlers inherit from <ApiLink to="class/AbstractHttpCrawler">`AbstractHttpCrawler`</ApiLink> and there are three crawlers that belong to this category:
+HTTP crawlers inherit from <ApiLink to="class/AbstractHttpCrawler">`AbstractHttpCrawler`</ApiLink> and there are four crawlers that belong to this category:
 
 - <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> utilizes the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) HTML parser.
 - <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> utilizes [Parsel](https://github.com/scrapy/parsel) for parsing HTML.
 - <ApiLink to="class/HttpCrawler">`HttpCrawler`</ApiLink> does not parse HTTP responses at all and is used when no content parsing is required.
+- <ApiLink to="class/AiCrawler">`AiCrawler`</ApiLink> parses HTML with Parsel and uses an LLM to extract structured data into a validated Pydantic model.
 
 You can learn more about HTTP crawlers in the [HTTP crawlers guide](./http-crawlers).
 
@@ -120,6 +124,8 @@ class ParselCrawlingContext
 
 class BeautifulSoupCrawlingContext
 
+class AiCrawlingContext
+
 class PlaywrightPreNavCrawlingContext
 
 class PlaywrightCrawlingContext
@@ -148,6 +154,8 @@ ParsedHttpCrawlingContext --|> ParselCrawlingContext
 
 ParsedHttpCrawlingContext --|> BeautifulSoupCrawlingContext
 
+ParselCrawlingContext --|> AiCrawlingContext
+
 BasicCrawlingContext --|> PlaywrightPreNavCrawlingContext
 
 PlaywrightPreNavCrawlingContext --|> PlaywrightCrawlingContext
@@ -168,6 +176,7 @@ They have a similar inheritance structure as the crawlers, with the base class b
 - <ApiLink to="class/ParsedHttpCrawlingContext">`ParsedHttpCrawlingContext`</ApiLink> for HTTP crawlers with parsed responses.
 - <ApiLink to="class/ParselCrawlingContext">`ParselCrawlingContext`</ApiLink> for HTTP crawlers that use [Parsel](https://github.com/scrapy/parsel) for parsing.
 - <ApiLink to="class/BeautifulSoupCrawlingContext">`BeautifulSoupCrawlingContext`</ApiLink> for HTTP crawlers that use [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for parsing.
+- <ApiLink to="class/AiCrawlingContext">`AiCrawlingContext`</ApiLink> for the AI crawler, extending the Parsel context with an `extract` helper.
 - <ApiLink to="class/PlaywrightPreNavCrawlingContext">`PlaywrightPreNavCrawlingContext`</ApiLink> for Playwright crawlers before the page is navigated.
 - <ApiLink to="class/PlaywrightCrawlingContext">`PlaywrightCrawlingContext`</ApiLink> for Playwright crawlers.
 - <ApiLink to="class/AdaptivePlaywrightPreNavCrawlingContext">`AdaptivePlaywrightPreNavCrawlingContext`</ApiLink> for Adaptive Playwright crawlers before the page is navigated.
 
@@ -0,0 +1,44 @@
+import asyncio
+
+from pydantic import BaseModel
+from pydantic_ai.models.openai import OpenAIChatModel
+from pydantic_ai.providers.openai import OpenAIProvider
+
+from crawlee.crawlers import AiCrawler, AiCrawlingContext
+
+
+class Post(BaseModel):
+    """Model representing a single post."""
+
+    title: str
+    url: str  # NOTE(review): plain `str`, so Pydantic does not check that this is a well-formed URL.
+
+
+class Posts(BaseModel):
+    """Model representing the extracted list of posts."""
+
+    posts: list[Post]  # Each item is validated against the `Post` model.
+
+
+async def main() -> None:
+    model = OpenAIChatModel(
+        'gpt-5.4-nano',
+        provider=OpenAIProvider(api_key='your-openai-api-key'),  # Placeholder; supply a real key.
+    )
+    crawler = AiCrawler(model=model, max_requests_per_crawl=5)
+    # Register the default request handler on the crawler's router.
+    @crawler.router.default_handler
+    async def handler(context: AiCrawlingContext) -> None:
+        # The instruction narrows what the model returns from the page.
+        posts = await context.extract(
+            Posts,
+            additional_instructions='Extract only the top five posts on the page.',
+        )
+        # `posts` is a validated `Posts` instance; push it to the dataset as a plain dict.
+        await context.push_data(posts.model_dump())
+
+    await crawler.run(['https://news.ycombinator.com'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
@@ -0,0 +1,41 @@
+import asyncio
+
+from pydantic import BaseModel
+from pydantic_ai.models.openai import OpenAIChatModel
+from pydantic_ai.providers.openai import OpenAIProvider
+
+from crawlee.crawlers import AiCrawler, AiCrawlingContext
+
+
+class Article(BaseModel):
+    """Model representing the extracted data for an article."""
+
+    title: str
+    short_text: str  # NOTE(review): presumably a brief summary or excerpt of the article - confirm intent.
+
+
+async def main() -> None:
+    model = OpenAIChatModel(
+        'gpt-5.4-nano',
+        # Set the provider with the API key explicitly.
+        provider=OpenAIProvider(api_key='your-openai-api-key'),  # Placeholder; supply a real key.
+    )
+
+    crawler = AiCrawler(model=model, max_requests_per_crawl=5)
+    # Register the default request handler on the crawler's router.
+    @crawler.router.default_handler
+    async def handler(context: AiCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url} ...')
+
+        # Pass a Pydantic model and get a validated instance back.
+        article = await context.extract(Article)
+        # Push the extracted article to the dataset as a plain dict.
+        await context.push_data(article.model_dump())
+        # Enqueue further links via the context helper so the crawl continues past the start URL.
+        await context.enqueue_links()
+
+    await crawler.run(['https://crawlee.dev/'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
@@ -0,0 +1,67 @@
+import asyncio
+
+from html_to_markdown import convert
+from lxml_html_clean import Cleaner
+from pydantic import BaseModel
+from pydantic_ai.models.openai import OpenAIChatModel
+from pydantic_ai.providers.openai import OpenAIProvider
+
+from crawlee.crawlers import (
+    AiCrawler,
+    AiCrawlingContext,
+    AiDirectExtractor,
+    BaseAiHtmlDistiller,
+    get_basic_ai_cleaner,
+)
+
+# Notes appended to the model instructions so it knows the input format.
+MARKDOWN_PROMPT_NOTES = 'The document is Markdown converted from the HTML page.'
+
+
+class MarkdownDistiller(BaseAiHtmlDistiller):
+    """Distiller that cleans the page HTML and converts it to Markdown."""
+
+    def __init__(self, cleaner: Cleaner | None = None) -> None:
+        super().__init__(prompt_notes=MARKDOWN_PROMPT_NOTES)
+
+        # Strip scripts, styles, and other noise before the conversion.
+        self._cleaner = cleaner or get_basic_ai_cleaner()  # Default cleaner when none is passed.
+
+    def distill(self, html: str) -> str:
+        return convert(self._cleaner.clean_html(html)).content or ''  # '' if the result has no content.
+
+
+class Article(BaseModel):
+    """Model representing the extracted data for an article."""
+
+    title: str
+    short_text: str  # NOTE(review): presumably a brief summary or excerpt of the article - confirm intent.
+
+
+async def main() -> None:
+    model = OpenAIChatModel(
+        'gpt-5.4-nano',
+        # Set the provider with the API key explicitly.
+        provider=OpenAIProvider(api_key='your-openai-api-key'),  # Placeholder; supply a real key.
+    )
+    crawler = AiCrawler(
+        # Pass the model to the extractor (not the crawler) and plug in the custom Markdown distiller.
+        extractor=AiDirectExtractor(model=model, distiller=MarkdownDistiller()),
+        max_requests_per_crawl=5,
+    )
+
+    @crawler.router.default_handler
+    async def handler(context: AiCrawlingContext) -> None:
+        # Pass a Pydantic model and get a validated instance back.
+        article = await context.extract(Article)
+        await context.push_data(article.model_dump())
+
+        # Enqueue links as usual; distillation and extraction don't affect
+        # the rest of the crawling logic.
+        await context.enqueue_links()
+
+    await crawler.run(['https://crawlee.dev/'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())