55from datetime import datetime , timezone
66from typing import Any
77
8- from haystack import Document , component , default_from_dict , default_to_dict , logging
9- from haystack .utils import Secret , deserialize_secrets_inplace
8+ from haystack import Document , component , logging
9+ from haystack .utils import Secret
1010
1111from firecrawl import AsyncFirecrawl , Firecrawl # type: ignore[import-untyped]
1212
1313logger = logging .getLogger (__name__ )
1414
1515
1616@component
17- class FirecrawlFetcher :
17+ class FirecrawlCrawler :
1818 """
1919 A component that uses Firecrawl to crawl one or more URLs and return the content as Haystack Documents.
2020
21+ Crawling starts from each given URL and follows links to discover subpages, up to a configurable limit.
22+ This is useful for ingesting entire websites or documentation sites, not just single pages.
23+
2124 Firecrawl is a service that crawls websites and returns content in a structured format (e.g. Markdown)
2225 suitable for LLMs. You need a Firecrawl API key from [firecrawl.dev](https://firecrawl.dev).
2326
@@ -63,30 +66,6 @@ def __init__(
6366 self ._firecrawl_client : Firecrawl | None = None
6467 self ._async_firecrawl_client : AsyncFirecrawl | None = None
6568
66- def to_dict (self ) -> dict [str , Any ]:
67- """
68- Serializes a FirecrawlFetcher instance to a dictionary.
69-
70- :returns: Dictionary with serialized data.
71- """
72- return default_to_dict (
73- self ,
74- api_key = self .api_key .to_dict (),
75- params = self .params ,
76- )
77-
78- @classmethod
79- def from_dict (cls , data : dict [str , Any ]) -> "FirecrawlFetcher" :
80- """
81- Deserializes a FirecrawlFetcher instance from a dictionary.
82-
83- :param data: Dictionary to deserialize from.
84- :returns: Deserialized FirecrawlFetcher instance.
85- """
86- init_params = data .get ("init_parameters" , {})
87- deserialize_secrets_inplace (init_params , keys = ["api_key" ])
88- return default_from_dict (cls , data )
89-
9069 @component .output_types (documents = list [Document ])
9170 def run (
9271 self ,
@@ -100,9 +79,14 @@ def run(
10079 List of URLs to crawl.
10180 :param params:
10281 Optional override of crawl parameters for this run.
103- :returns: A dictionary with key `documents` containing a list of Haystack `Document` instances.
82+ If provided, fully replaces the init-time params.
83+ :returns: A dictionary with the following keys:
84+ - `documents`: List of documents collected from crawling all of the given URLs.
10485 """
105- current_params = dict (self ._params , ** (params or {}))
86+ if self ._firecrawl_client is None :
87+ self .warm_up ()
88+
89+ current_params = params if params is not None else self ._params
10690 documents : list [Document ] = []
10791 for url in urls :
10892 docs = self ._crawl_url (url = url , params = current_params )
@@ -123,16 +107,31 @@ async def run_async(
123107 List of URLs to crawl.
124108 :param params:
125109 Optional override of crawl parameters for this run.
126- :returns: A dictionary with key `documents` containing a list of Haystack `Document` instances.
110+ If provided, fully replaces the init-time params.
111+ :returns: A dictionary with the following keys:
112+ - `documents`: List of documents collected from crawling all of the given URLs.
127113 """
128- current_params = dict (self ._params , ** (params or {}))
114+ if self ._async_firecrawl_client is None :
115+ self .warm_up ()
116+
117+ current_params = params if params is not None else self ._params
129118 documents : list [Document ] = []
130119 for url in urls :
131120 docs = await self ._crawl_url_async (url = url , params = current_params )
132121 documents .extend (docs )
133122
134123 return {"documents" : documents }
135124
def warm_up(self) -> None:
    """
    Create the Firecrawl clients ahead of the first crawl.

    The synchronous and the asynchronous client are each built from the resolved API key,
    but only when the corresponding attribute is still ``None``. Clients that already
    exist are left untouched, so calling this method repeatedly is harmless.
    """
    sync_ready = self._firecrawl_client is not None
    async_ready = self._async_firecrawl_client is not None

    # Nothing to do when both clients were already created.
    if sync_ready and async_ready:
        return

    if not sync_ready:
        sync_key = self.api_key.resolve_value()
        self._firecrawl_client = Firecrawl(api_key=sync_key)

    if not async_ready:
        async_key = self.api_key.resolve_value()
        self._async_firecrawl_client = AsyncFirecrawl(api_key=async_key)
134+
136135 def _crawl_url (self , url : str , params : dict [str , Any ]) -> list [Document ]:
137136 """
138137 Crawl a single URL and return Documents.
@@ -141,11 +140,8 @@ def _crawl_url(self, url: str, params: dict[str, Any]) -> list[Document]:
141140 :param params: Crawl request parameters.
142141 :return: List of Documents from the crawl result.
143142 """
144- if self ._firecrawl_client is None :
145- self ._firecrawl_client = Firecrawl (api_key = self .api_key .resolve_value ())
146-
147143 try :
148- crawl_response = self ._firecrawl_client .crawl (
144+ crawl_response = self ._firecrawl_client .crawl ( # type: ignore[union-attr]
149145 url = url ,
150146 ** params ,
151147 )
@@ -163,11 +159,8 @@ async def _crawl_url_async(self, url: str, params: dict[str, Any]) -> list[Docum
163159 :param params: Crawl request parameters.
164160 :return: List of Documents from the crawl result.
165161 """
166- if self ._async_firecrawl_client is None :
167- self ._async_firecrawl_client = AsyncFirecrawl (api_key = self .api_key .resolve_value ())
168-
169162 try :
170- crawl_response = await self ._async_firecrawl_client .crawl (
163+ crawl_response = await self ._async_firecrawl_client .crawl ( # type: ignore[union-attr]
171164 url = url ,
172165 ** params ,
173166 )
0 commit comments