-
Notifications
You must be signed in to change notification settings - Fork 762
Expand file tree
/
Copy pathcustom_distiller_example.py
More file actions
67 lines (49 loc) · 2.13 KB
/
Copy pathcustom_distiller_example.py
File metadata and controls
67 lines (49 loc) · 2.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import asyncio
from html_to_markdown import convert
from lxml_html_clean import Cleaner
from pydantic import BaseModel
from pydantic_ai.models.openai import OpenAIChatModel
from pydantic_ai.providers.openai import OpenAIProvider
from crawlee.crawlers import (
BasePydanticAiHtmlDistiller,
PydanticAiCrawler,
PydanticAiCrawlingContext,
PydanticAiDirectExtractor,
get_basic_http_cleaner,
)
# Extra note for the model instructions, telling the model what format its
# input is in. MarkdownDistiller passes it to the base class as `prompt_notes`.
MARKDOWN_PROMPT_NOTES = (
    'The document is Markdown '
    'converted from the HTML page.'
)
class MarkdownDistiller(BasePydanticAiHtmlDistiller):
    """HTML distiller that hands the model Markdown instead of raw HTML.

    The page HTML is first run through a cleaner and the cleaned markup is
    then converted to Markdown.
    """

    def __init__(self, cleaner: Cleaner | None = None) -> None:
        """Set up the distiller.

        Args:
            cleaner: Cleaner applied to the HTML before conversion. When
                omitted, the one returned by `get_basic_http_cleaner()` is used.
        """
        super().__init__(prompt_notes=MARKDOWN_PROMPT_NOTES)
        # Cleaning removes scripts, styles, and similar noise ahead of the
        # Markdown conversion.
        self._cleaner = cleaner if cleaner else get_basic_http_cleaner()

    def distill(self, html: str) -> str:
        """Return the Markdown rendering of the cleaned `html`.

        An empty string is returned when the conversion yields no content.
        """
        cleaned_html = self._cleaner.clean_html(html)
        markdown = convert(cleaned_html).content
        return markdown if markdown else ''
class Article(BaseModel):
    """Schema of the data extracted for one article.

    Both fields are required strings.
    """

    title: str
    short_text: str
async def main() -> None:
    """Run the example crawl starting at https://crawlee.dev/.

    Each handled page is extracted into an `Article`, the result is pushed
    to the dataset, and the page's links are enqueued. The crawl is capped
    at five requests.
    """
    # The API key is handed to the provider explicitly.
    provider = OpenAIProvider(api_key='your-openai-api-key')
    chat_model = OpenAIChatModel('gpt-5.4-nano', provider=provider)

    # The custom distiller converts each page to Markdown before extraction.
    extractor = PydanticAiDirectExtractor(
        model=chat_model,
        distiller=MarkdownDistiller(),
    )
    crawler = PydanticAiCrawler(
        extractor=extractor,
        max_requests_per_crawl=5,
    )

    @crawler.router.default_handler
    async def handler(context: PydanticAiCrawlingContext) -> None:
        # Give `extract` a Pydantic model; a validated instance comes back.
        extracted = await context.extract(Article)
        await context.push_data(extracted.model_dump())

        # Links are enqueued as usual: distillation and extraction do not
        # affect the rest of the crawling logic.
        await context.enqueue_links()

    start_urls = ['https://crawlee.dev/']
    await crawler.run(start_urls)
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    asyncio.run(main())