|
| 1 | +# SPDX-FileCopyrightText: 2026-present deepset GmbH <info@deepset.ai> |
| 2 | +# |
| 3 | +# SPDX-License-Identifier: Apache-2.0 |
| 4 | + |
| 5 | +from typing import Any |
| 6 | + |
| 7 | +from firecrawl import AsyncFirecrawl, Firecrawl # type: ignore[import-untyped] |
| 8 | +from haystack import Document, component, default_from_dict, default_to_dict, logging |
| 9 | +from haystack.utils import Secret |
| 10 | + |
| 11 | +logger = logging.getLogger(__name__) |
| 12 | + |
| 13 | + |
@component
class FirecrawlWebSearch:
    """
    A component that uses Firecrawl to search the web and return results as Haystack Documents.

    This component wraps the Firecrawl Search API, enabling web search queries that return
    structured documents with content and links. It follows the standard Haystack WebSearch
    component interface.

    Firecrawl is a service that crawls and scrapes websites, returning content in formats suitable
    for LLMs. You need a Firecrawl API key from [firecrawl.dev](https://firecrawl.dev).

    ### Usage example

    ```python
    from haystack_integrations.components.websearch.firecrawl import FirecrawlWebSearch
    from haystack.utils import Secret

    websearch = FirecrawlWebSearch(
        api_key=Secret.from_env_var("FIRECRAWL_API_KEY"),
        top_k=5,
    )
    websearch.warm_up()

    result = websearch.run(query="What is Haystack by deepset?")
    documents = result["documents"]
    links = result["links"]
    ```
    """

    def __init__(
        self,
        api_key: Secret = Secret.from_env_var("FIRECRAWL_API_KEY"),
        top_k: int | None = 10,
        search_params: dict[str, Any] | None = None,
    ) -> None:
        """
        Initialize the FirecrawlWebSearch component.

        :param api_key:
            API key for Firecrawl.
            Defaults to the `FIRECRAWL_API_KEY` environment variable.
        :param top_k:
            Maximum number of documents to return.
            Defaults to 10. If `None`, no `limit` is added to the request and the
            results are not truncated.
        :param search_params:
            Additional parameters passed to the Firecrawl search API.
            See the [Firecrawl API reference](https://docs.firecrawl.dev/api-reference/endpoint/search)
            for available parameters. Supported keys include: `tbs`, `location`,
            `scrape_options`, `sources`, `categories`, `timeout`.
        """
        self.api_key = api_key
        self.top_k = top_k
        # Kept exactly as passed in so `to_dict` round-trips the user's value (including None).
        self.search_params = search_params
        # Private shallow copy used at run time, so later changes to the caller's
        # top-level dict do not affect this component.
        self._search_params = {} if search_params is None else search_params.copy()
        self._firecrawl_client: Firecrawl | None = None
        self._async_firecrawl_client: AsyncFirecrawl | None = None

    def warm_up(self) -> None:
        """
        Warm up the Firecrawl clients by initializing the sync and async clients.

        This is useful to avoid cold start delays when performing searches.
        Clients that already exist are left untouched, so calling this repeatedly is safe.
        """
        if self._firecrawl_client is None:
            self._firecrawl_client = Firecrawl(api_key=self.api_key.resolve_value())
        if self._async_firecrawl_client is None:
            self._async_firecrawl_client = AsyncFirecrawl(api_key=self.api_key.resolve_value())

    def to_dict(self) -> dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns: Dictionary with the serialized init parameters (`api_key`, `top_k`, `search_params`).
        """
        return default_to_dict(
            self,
            api_key=self.api_key.to_dict(),
            top_k=self.top_k,
            search_params=self.search_params,
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "FirecrawlWebSearch":
        """
        Deserializes the component from a dictionary.

        :param data: Dictionary as produced by `to_dict`.
        :returns: The deserialized component.
        """
        # `to_dict` stores the API key as a plain dict. Rebuild the Secret explicitly so
        # that `warm_up` can call `resolve_value()` on it regardless of whether
        # `default_from_dict` converts serialized secrets on its own.
        init_params = data.get("init_parameters", {})
        serialized_key = init_params.get("api_key")
        if isinstance(serialized_key, dict):
            init_params["api_key"] = Secret.from_dict(serialized_key)
        return default_from_dict(cls, data)

    def _build_request_params(self, search_params: dict[str, Any] | None) -> dict[str, Any]:
        """Return the keyword arguments for one search call, adding `limit` from `top_k` if absent."""
        base_params = self._search_params if search_params is None else search_params
        # Copy so that neither the caller's dict nor the init-time params are mutated.
        request_params = base_params.copy()
        if self.top_k is not None and "limit" not in request_params:
            request_params["limit"] = self.top_k
        return request_params

    def _build_output(self, search_response: Any) -> dict[str, Any]:
        """Parse a search response and cap documents and links at `top_k` each."""
        documents, links = self._parse_search_response(search_response)
        if self.top_k is not None:
            # An explicit `limit` in the search params may exceed `top_k`; `top_k` still wins.
            documents = documents[: self.top_k]
            links = links[: self.top_k]
        return {"documents": documents, "links": links}

    @component.output_types(documents=list[Document], links=list[str])
    def run(
        self,
        query: str,
        search_params: dict[str, Any] | None = None,
    ) -> dict[str, Any]:
        """
        Search the web using Firecrawl and return results as Documents.

        If the search call raises, the error is logged and empty lists are returned.

        :param query: Search query string.
        :param search_params:
            Optional override of search parameters for this run.
            If provided, fully replaces the init-time search_params.
        :returns: A dictionary with the following keys:
            - `documents`: List of documents with search result content.
            - `links`: List of URLs from the search results.
        """
        if self._firecrawl_client is None:
            self.warm_up()

        params = self._build_request_params(search_params)

        try:
            search_response = self._firecrawl_client.search(  # type: ignore[union-attr]
                query=query,
                **params,
            )
        except Exception as error:
            # Deliberate best effort: a failed search yields empty results instead of raising.
            logger.exception(f"Failed to search for query '{query}': {error}")
            return {"documents": [], "links": []}

        return self._build_output(search_response)

    @component.output_types(documents=list[Document], links=list[str])
    async def run_async(
        self,
        query: str,
        search_params: dict[str, Any] | None = None,
    ) -> dict[str, Any]:
        """
        Asynchronously search the web using Firecrawl and return results as Documents.

        If the search call raises, the error is logged and empty lists are returned.

        :param query: Search query string.
        :param search_params:
            Optional override of search parameters for this run.
            If provided, fully replaces the init-time search_params.
        :returns: A dictionary with the following keys:
            - `documents`: List of documents with search result content.
            - `links`: List of URLs from the search results.
        """
        if self._async_firecrawl_client is None:
            self.warm_up()

        params = self._build_request_params(search_params)

        try:
            search_response = await self._async_firecrawl_client.search(  # type: ignore[union-attr]
                query=query,
                **params,
            )
        except Exception as error:
            # Deliberate best effort: a failed search yields empty results instead of raising.
            logger.exception(f"Failed to search for query '{query}': {error}")
            return {"documents": [], "links": []}

        return self._build_output(search_response)

    @staticmethod
    def _parse_search_response(search_response: Any) -> tuple[list[Document], list[str]]:
        """
        Convert a Firecrawl search response to Haystack Documents and links.

        Only the `web` attribute of the response is read. A result with non-empty
        `markdown` uses that as content and takes `url`/`title` from its `metadata_dict`;
        any other result uses its `url`, `title` and `description` attributes. Missing or
        `None` values become empty strings. One Document is created per result; only
        non-empty URLs are added to the links.

        :param search_response: Firecrawl search response object.
        :returns: Tuple of (documents, links).
        """
        documents: list[Document] = []
        links: list[str] = []

        for result in getattr(search_response, "web", None) or []:
            markdown = getattr(result, "markdown", None)
            if markdown:
                # Presumably present when scraping was requested via `scrape_options` — TODO confirm.
                content = markdown
                # `metadata_dict` may be missing or None; treat both as "no metadata".
                metadata = getattr(result, "metadata_dict", None) or {}
                url = metadata.get("url") or getattr(result, "url", "") or ""
                title = metadata.get("title") or getattr(result, "title", "") or ""
            else:
                content = getattr(result, "description", "") or ""
                url = getattr(result, "url", "") or ""
                title = getattr(result, "title", "") or ""

            documents.append(Document(content=content, meta={"title": title, "url": url}))
            if url:
                links.append(url)

        return documents, links
0 commit comments