Skip to content

Commit 20a2ff6

Browse files
committed
Integrate review feedback
1 parent ca00df3 commit 20a2ff6

7 files changed

Lines changed: 305 additions & 212 deletions

File tree

.github/workflows/firecrawl.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ jobs:
3232
fail-fast: false
3333
max-parallel: 3
3434
matrix:
35-
os: [ubuntu-latest, windows-latest]
35+
os: [ubuntu-latest, windows-latest, macos-latest]
3636
python-version: ["3.10", "3.13"]
3737

3838
steps:

integrations/firecrawl/README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# azure-doc-intelligence-haystack
1+
# firecrawl-haystack
22

33
[![PyPI - Version](https://img.shields.io/pypi/v/firecrawl-haystack.svg)](https://pypi.org/project/firecrawl-haystack)
44
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/firecrawl-haystack.svg)](https://pypi.org/project/firecrawl-haystack)
@@ -10,3 +10,6 @@
1010
## Contributing
1111

1212
Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md).
13+
14+
15+
To run integration tests locally, you need to export the `FIRECRAWL_API_KEY` environment variable.

integrations/firecrawl/pyproject.toml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,9 +81,6 @@ non_interactive = true
8181
check_untyped_defs = true
8282
disallow_incomplete_defs = true
8383

84-
[tool.hatch.metadata]
85-
allow-direct-references = true
86-
8784
[tool.ruff]
8885
line-length = 120
8986

integrations/firecrawl/src/haystack_integrations/components/fetchers/firecrawl/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,6 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5-
from haystack_integrations.components.fetchers.firecrawl.firecrawl_fetcher import FirecrawlFetcher
5+
from haystack_integrations.components.fetchers.firecrawl.firecrawl_crawler import FirecrawlCrawler
66

7-
__all__ = ["FirecrawlFetcher"]
7+
__all__ = ["FirecrawlCrawler"]

integrations/firecrawl/src/haystack_integrations/components/fetchers/firecrawl/firecrawl_fetcher.py renamed to integrations/firecrawl/src/haystack_integrations/components/fetchers/firecrawl/firecrawl_crawler.py

Lines changed: 32 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,22 @@
55
from datetime import datetime, timezone
66
from typing import Any
77

8-
from haystack import Document, component, default_from_dict, default_to_dict, logging
9-
from haystack.utils import Secret, deserialize_secrets_inplace
8+
from haystack import Document, component, logging
9+
from haystack.utils import Secret
1010

1111
from firecrawl import AsyncFirecrawl, Firecrawl # type: ignore[import-untyped]
1212

1313
logger = logging.getLogger(__name__)
1414

1515

1616
@component
17-
class FirecrawlFetcher:
17+
class FirecrawlCrawler:
1818
"""
1919
A component that uses Firecrawl to crawl one or more URLs and return the content as Haystack Documents.
2020
21+
Crawling starts from each given URL and follows links to discover subpages, up to a configurable limit.
22+
This is useful for ingesting entire websites or documentation sites, not just single pages.
23+
2124
Firecrawl is a service that crawls websites and returns content in a structured format (e.g. Markdown)
2225
suitable for LLMs. You need a Firecrawl API key from [firecrawl.dev](https://firecrawl.dev).
2326
@@ -63,30 +66,6 @@ def __init__(
6366
self._firecrawl_client: Firecrawl | None = None
6467
self._async_firecrawl_client: AsyncFirecrawl | None = None
6568

66-
def to_dict(self) -> dict[str, Any]:
67-
"""
68-
Serializes a FirecrawlFetcher instance to a dictionary.
69-
70-
:returns: Dictionary with serialized data.
71-
"""
72-
return default_to_dict(
73-
self,
74-
api_key=self.api_key.to_dict(),
75-
params=self.params,
76-
)
77-
78-
@classmethod
79-
def from_dict(cls, data: dict[str, Any]) -> "FirecrawlFetcher":
80-
"""
81-
Deserializes a FirecrawlFetcher instance from a dictionary.
82-
83-
:param data: Dictionary to deserialize from.
84-
:returns: Deserialized FirecrawlFetcher instance.
85-
"""
86-
init_params = data.get("init_parameters", {})
87-
deserialize_secrets_inplace(init_params, keys=["api_key"])
88-
return default_from_dict(cls, data)
89-
9069
@component.output_types(documents=list[Document])
9170
def run(
9271
self,
@@ -100,9 +79,14 @@ def run(
10079
List of URLs to crawl.
10180
:param params:
10281
Optional override of crawl parameters for this run.
103-
:returns: A dictionary with key `documents` containing a list of Haystack `Document` instances.
82+
If provided, fully replaces the init-time params.
83+
:returns: A dictionary with the following keys:
84+
- `documents`: List of documents, one for each page crawled (the given URLs plus any discovered subpages).
10485
"""
105-
current_params = dict(self._params, **(params or {}))
86+
if self._firecrawl_client is None:
87+
self.warm_up()
88+
89+
current_params = params if params is not None else self._params
10690
documents: list[Document] = []
10791
for url in urls:
10892
docs = self._crawl_url(url=url, params=current_params)
@@ -123,16 +107,31 @@ async def run_async(
123107
List of URLs to crawl.
124108
:param params:
125109
Optional override of crawl parameters for this run.
126-
:returns: A dictionary with key `documents` containing a list of Haystack `Document` instances.
110+
If provided, fully replaces the init-time params.
111+
:returns: A dictionary with the following keys:
112+
- `documents`: List of documents, one for each page crawled (the given URLs plus any discovered subpages).
127113
"""
128-
current_params = dict(self._params, **(params or {}))
114+
if self._async_firecrawl_client is None:
115+
self.warm_up()
116+
117+
current_params = params if params is not None else self._params
129118
documents: list[Document] = []
130119
for url in urls:
131120
docs = await self._crawl_url_async(url=url, params=current_params)
132121
documents.extend(docs)
133122

134123
return {"documents": documents}
135124

125+
def warm_up(self) -> None:
126+
"""
127+
Warm up the component by initializing the sync and async Firecrawl clients.
128+
This is useful to avoid cold start delays when crawling many URLs.
129+
"""
130+
if self._firecrawl_client is None:
131+
self._firecrawl_client = Firecrawl(api_key=self.api_key.resolve_value())
132+
if self._async_firecrawl_client is None:
133+
self._async_firecrawl_client = AsyncFirecrawl(api_key=self.api_key.resolve_value())
134+
136135
def _crawl_url(self, url: str, params: dict[str, Any]) -> list[Document]:
137136
"""
138137
Crawl a single URL and return Documents.
@@ -141,11 +140,8 @@ def _crawl_url(self, url: str, params: dict[str, Any]) -> list[Document]:
141140
:param params: Crawl request parameters.
142141
:return: List of Documents from the crawl result.
143142
"""
144-
if self._firecrawl_client is None:
145-
self._firecrawl_client = Firecrawl(api_key=self.api_key.resolve_value())
146-
147143
try:
148-
crawl_response = self._firecrawl_client.crawl(
144+
crawl_response = self._firecrawl_client.crawl( # type: ignore[union-attr]
149145
url=url,
150146
**params,
151147
)
@@ -163,11 +159,8 @@ async def _crawl_url_async(self, url: str, params: dict[str, Any]) -> list[Docum
163159
:param params: Crawl request parameters.
164160
:return: List of Documents from the crawl result.
165161
"""
166-
if self._async_firecrawl_client is None:
167-
self._async_firecrawl_client = AsyncFirecrawl(api_key=self.api_key.resolve_value())
168-
169162
try:
170-
crawl_response = await self._async_firecrawl_client.crawl(
163+
crawl_response = await self._async_firecrawl_client.crawl( # type: ignore[union-attr]
171164
url=url,
172165
**params,
173166
)

0 commit comments

Comments
 (0)