Skip to content

Commit bce847e

Browse files
feat: Add FirecrawlWebSearch component
Add a new FirecrawlWebSearch component that enables web search queries using the Firecrawl Search API. The component follows the standard Haystack WebSearch interface (query input, documents + links output) and supports both synchronous and asynchronous execution. Closes #2870
1 parent c7bd08b commit bce847e

6 files changed

Lines changed: 527 additions & 1 deletion

File tree

integrations/firecrawl/pydoc/config_docusaurus.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
loaders:
22
- modules:
33
- haystack_integrations.components.fetchers.firecrawl.firecrawl_crawler
4+
- haystack_integrations.components.websearch.firecrawl.firecrawl_websearch
45
search_path: [../src]
56
processors:
67
- type: filter

integrations/firecrawl/pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ keywords = [
1414
"Haystack",
1515
"Web Crawler",
1616
"Web Scraping",
17+
"Web Search",
1718
]
1819
authors = [
1920
{ name = "deepset GmbH", email = "info@deepset.ai" },
@@ -73,7 +74,7 @@ unit = 'pytest -m "not integration" {args:tests}'
7374
integration = 'pytest -m "integration" {args:tests}'
7475
all = 'pytest {args:tests}'
7576
cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x {args:tests}'
76-
types = "mypy -p haystack_integrations.components.fetchers.firecrawl {args}"
77+
types = "mypy -p haystack_integrations.components.fetchers.firecrawl -p haystack_integrations.components.websearch.firecrawl {args}"
7778

7879
[tool.mypy]
7980
install_types = true
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# SPDX-FileCopyrightText: 2026-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# SPDX-FileCopyrightText: 2026-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from haystack_integrations.components.websearch.firecrawl.firecrawl_websearch import FirecrawlWebSearch
6+
7+
__all__ = ["FirecrawlWebSearch"]
Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
# SPDX-FileCopyrightText: 2026-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from typing import Any
6+
7+
from firecrawl import AsyncFirecrawl, Firecrawl # type: ignore[import-untyped]
8+
from haystack import Document, component, default_from_dict, default_to_dict, logging
9+
from haystack.utils import Secret
10+
11+
logger = logging.getLogger(__name__)
12+
13+
14+
@component
class FirecrawlWebSearch:
    """
    A component that uses Firecrawl to search the web and return results as Haystack Documents.

    This component wraps the Firecrawl Search API, enabling web search queries that return
    structured documents with content and links. It follows the standard Haystack WebSearch
    component interface (``query`` in, ``documents`` and ``links`` out) and offers both a
    synchronous (`run`) and an asynchronous (`run_async`) entry point.

    Firecrawl is a service that crawls and scrapes websites, returning content in formats suitable
    for LLMs. You need a Firecrawl API key from [firecrawl.dev](https://firecrawl.dev).

    ### Usage example

    ```python
    from haystack_integrations.components.websearch.firecrawl import FirecrawlWebSearch
    from haystack.utils import Secret

    websearch = FirecrawlWebSearch(
        api_key=Secret.from_env_var("FIRECRAWL_API_KEY"),
        top_k=5,
    )
    websearch.warm_up()

    result = websearch.run(query="What is Haystack by deepset?")
    documents = result["documents"]
    links = result["links"]
    ```
    """

    def __init__(
        self,
        api_key: Secret = Secret.from_env_var("FIRECRAWL_API_KEY"),
        top_k: int | None = 10,
        search_params: dict[str, Any] | None = None,
    ) -> None:
        """
        Initialize the FirecrawlWebSearch component.

        :param api_key:
            API key for Firecrawl.
            Defaults to the `FIRECRAWL_API_KEY` environment variable.
        :param top_k:
            Maximum number of documents to return.
            Defaults to 10. If `None`, the results are not truncated and no `limit`
            is added to the request on behalf of the component.
        :param search_params:
            Additional parameters passed to the Firecrawl search API.
            See the [Firecrawl API reference](https://docs.firecrawl.dev/api-reference/endpoint/search)
            for available parameters. Supported keys include: `tbs`, `location`,
            `scrape_options`, `sources`, `categories`, `timeout`.
        """
        self.api_key = api_key
        self.top_k = top_k
        # Kept exactly as passed in so that `to_dict` round-trips the caller's value (including `None`).
        self.search_params = search_params
        # Private working copy: later mutation of the caller's dict must not change the defaults used by `run`.
        self._search_params: dict[str, Any] = {} if search_params is None else search_params.copy()
        # Clients are created lazily in `warm_up` so constructing the component never resolves the secret.
        self._firecrawl_client: Firecrawl | None = None
        self._async_firecrawl_client: AsyncFirecrawl | None = None

    def warm_up(self) -> None:
        """
        Warm up the Firecrawl clients by initializing the sync and async clients.

        This is useful to avoid cold start delays when performing searches.
        Calling it more than once is harmless: clients that already exist are kept.
        """
        if self._firecrawl_client is None:
            self._firecrawl_client = Firecrawl(api_key=self.api_key.resolve_value())
        if self._async_firecrawl_client is None:
            self._async_firecrawl_client = AsyncFirecrawl(api_key=self.api_key.resolve_value())

    def to_dict(self) -> dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns: Dictionary with the serialized init parameters (`api_key`, `top_k`, `search_params`).
        """
        return default_to_dict(
            self,
            api_key=self.api_key.to_dict(),
            top_k=self.top_k,
            search_params=self.search_params,
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "FirecrawlWebSearch":
        """
        Deserializes the component from a dictionary.

        :param data: Dictionary produced by `to_dict`.
        :returns: The deserialized component.
        """
        # Local import: the module-level imports only bring in `Secret` from `haystack.utils`.
        from haystack.utils import deserialize_secrets_inplace

        init_params = data.get("init_parameters")
        # `to_dict` stores the API key as a plain dict; turn it back into a `Secret` before the
        # constructor sees it. The isinstance guard leaves an already-deserialized value untouched.
        if init_params and isinstance(init_params.get("api_key"), dict):
            deserialize_secrets_inplace(init_params, keys=["api_key"])
        return default_from_dict(cls, data)

    @component.output_types(documents=list[Document], links=list[str])
    def run(
        self,
        query: str,
        search_params: dict[str, Any] | None = None,
    ) -> dict[str, Any]:
        """
        Search the web using Firecrawl and return results as Documents.

        Errors raised by the Firecrawl client are logged and result in empty outputs
        rather than being propagated.

        :param query: Search query string.
        :param search_params:
            Optional override of search parameters for this run.
            If provided, fully replaces the init-time search_params.
        :returns: A dictionary with the following keys:
            - `documents`: List of documents with search result content.
            - `links`: List of URLs from the search results.
        """
        if self._firecrawl_client is None:
            self.warm_up()

        params = self._resolve_search_params(search_params)

        try:
            search_response = self._firecrawl_client.search(  # type: ignore[union-attr]
                query=query,
                **params,
            )
        except Exception as error:
            # Keyword-argument interpolation keeps user-supplied text out of the format template.
            logger.exception("Failed to search for query '{query}': {error}", query=query, error=error)
            return {"documents": [], "links": []}

        return self._build_output(search_response)

    @component.output_types(documents=list[Document], links=list[str])
    async def run_async(
        self,
        query: str,
        search_params: dict[str, Any] | None = None,
    ) -> dict[str, Any]:
        """
        Asynchronously search the web using Firecrawl and return results as Documents.

        Errors raised by the Firecrawl client are logged and result in empty outputs
        rather than being propagated.

        :param query: Search query string.
        :param search_params:
            Optional override of search parameters for this run.
            If provided, fully replaces the init-time search_params.
        :returns: A dictionary with the following keys:
            - `documents`: List of documents with search result content.
            - `links`: List of URLs from the search results.
        """
        if self._async_firecrawl_client is None:
            self.warm_up()

        params = self._resolve_search_params(search_params)

        try:
            search_response = await self._async_firecrawl_client.search(  # type: ignore[union-attr]
                query=query,
                **params,
            )
        except Exception as error:
            # Keyword-argument interpolation keeps user-supplied text out of the format template.
            logger.exception("Failed to search for query '{query}': {error}", query=query, error=error)
            return {"documents": [], "links": []}

        return self._build_output(search_response)

    def _resolve_search_params(self, search_params: dict[str, Any] | None) -> dict[str, Any]:
        """
        Build the keyword arguments for a single Firecrawl search call.

        :param search_params: Run-time override; when `None` the init-time parameters are used.
        :returns: A fresh dict, so neither the caller's dict nor the stored defaults are mutated.
        """
        base = self._search_params if search_params is None else search_params
        params = base.copy()
        # An explicit `limit` from the caller wins; otherwise request only as many results as we return.
        if "limit" not in params and self.top_k is not None:
            params["limit"] = self.top_k
        return params

    def _build_output(self, search_response: Any) -> dict[str, Any]:
        """
        Convert a Firecrawl search response into the component output, honouring `top_k`.

        :param search_response: Firecrawl search response object.
        :returns: Dictionary with `documents` and `links`.
        """
        documents, links = self._parse_search_response(search_response)

        if self.top_k is not None:
            documents = documents[: self.top_k]
            # Derive links from the documents that were kept, so a result without a URL cannot
            # cause the link of a dropped document to be returned.
            links = [doc.meta["url"] for doc in documents if doc.meta.get("url")]

        return {"documents": documents, "links": links}

    @staticmethod
    def _parse_search_response(search_response: Any) -> tuple[list[Document], list[str]]:
        """
        Convert a Firecrawl search response to Haystack Documents and links.

        Only the `web` results of the response are read. A result with non-empty `markdown`
        is treated as scraped content and its URL/title are taken from its metadata; any other
        result is treated as a plain search hit described by `url`, `title` and `description`.

        :param search_response: Firecrawl search response object.
        :returns: Tuple of (documents, links). `links` holds the non-empty URLs in result order.
        """
        documents: list[Document] = []
        links: list[str] = []

        for result in getattr(search_response, "web", None) or []:
            markdown = getattr(result, "markdown", None)

            if markdown:
                content = markdown
                raw_metadata = getattr(result, "metadata_dict", None)
                # Guard against a missing or `None` metadata mapping instead of failing on `.get`.
                metadata = raw_metadata if isinstance(raw_metadata, dict) else {}
                # `or` (rather than a `.get` default) also covers keys that are present but `None`/empty.
                url = metadata.get("url") or getattr(result, "url", "") or ""
                title = metadata.get("title") or ""
            else:
                url = getattr(result, "url", "") or ""
                title = getattr(result, "title", "") or ""
                content = getattr(result, "description", "") or ""

            documents.append(Document(content=content, meta={"title": title, "url": url}))
            if url:
                links.append(url)

        return documents, links

0 commit comments

Comments
 (0)