diff --git a/README.md b/README.md index adf1e76f92..535fef84a2 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta | [elasticsearch-haystack](integrations/elasticsearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack) | [![Test / elasticsearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml) | | [faiss-haystack](integrations/faiss/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/faiss-haystack.svg)](https://pypi.org/project/faiss-haystack) | [![Test / faiss](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/faiss.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/faiss.yml) | | [fastembed-haystack](integrations/fastembed/) | Embedder, Ranker | [![PyPI - Version](https://img.shields.io/pypi/v/fastembed-haystack.svg)](https://pypi.org/project/fastembed-haystack/) | [![Test / fastembed](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/fastembed.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/fastembed.yml) | -| [firecrawl-haystack](integrations/firecrawl/) | Fetcher | [![PyPI - Version](https://img.shields.io/pypi/v/firecrawl-haystack.svg)](https://pypi.org/project/firecrawl-haystack/) | [![Test / firecrawl](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/firecrawl.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/firecrawl.yml) | +| [firecrawl-haystack](integrations/firecrawl/) | Fetcher, Web Search | [![PyPI - 
Version](https://img.shields.io/pypi/v/firecrawl-haystack.svg)](https://pypi.org/project/firecrawl-haystack/) | [![Test / firecrawl](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/firecrawl.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/firecrawl.yml) | | [github-haystack](integrations/github/) | Connector | [![PyPI - Version](https://img.shields.io/pypi/v/github-haystack.svg)](https://pypi.org/project/github-haystack) | [![Test / github](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/github.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/github.yml) | | [google-ai-haystack](integrations/google_ai/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/google-ai-haystack.svg)](https://pypi.org/project/google-ai-haystack) | **Archived** - use [google-genai-haystack](https://pypi.org/project/google-genai-haystack) instead | | [google-genai-haystack](integrations/google_genai/) | Embedder, Generator | [![PyPI - Version](https://img.shields.io/pypi/v/google-genai-haystack.svg)](https://pypi.org/project/google-genai-haystack) | [![Test / google-genai](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_genai.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_genai.yml) | diff --git a/integrations/firecrawl/pydoc/config_docusaurus.yml b/integrations/firecrawl/pydoc/config_docusaurus.yml index dec7e6e24a..a45a2cf04b 100644 --- a/integrations/firecrawl/pydoc/config_docusaurus.yml +++ b/integrations/firecrawl/pydoc/config_docusaurus.yml @@ -1,6 +1,7 @@ loaders: - modules: - haystack_integrations.components.fetchers.firecrawl.firecrawl_crawler + - haystack_integrations.components.websearch.firecrawl.firecrawl_websearch search_path: [../src] processors: - type: filter diff --git a/integrations/firecrawl/pyproject.toml 
b/integrations/firecrawl/pyproject.toml index bb980eb557..f72ef53535 100644 --- a/integrations/firecrawl/pyproject.toml +++ b/integrations/firecrawl/pyproject.toml @@ -14,6 +14,7 @@ keywords = [ "Haystack", "Web Crawler", "Web Scraping", + "Web Search", ] authors = [ { name = "deepset GmbH", email = "info@deepset.ai" }, @@ -73,7 +74,7 @@ unit = 'pytest -m "not integration" {args:tests}' integration = 'pytest -m "integration" {args:tests}' all = 'pytest {args:tests}' cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x {args:tests}' -types = "mypy -p haystack_integrations.components.fetchers.firecrawl {args}" +types = "mypy -p haystack_integrations.components.fetchers.firecrawl -p haystack_integrations.components.websearch.firecrawl {args}" [tool.mypy] install_types = true diff --git a/integrations/firecrawl/src/haystack_integrations/components/websearch/__init__.py b/integrations/firecrawl/src/haystack_integrations/components/websearch/__init__.py new file mode 100644 index 0000000000..1872f673d7 --- /dev/null +++ b/integrations/firecrawl/src/haystack_integrations/components/websearch/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2026-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/firecrawl/src/haystack_integrations/components/websearch/firecrawl/__init__.py b/integrations/firecrawl/src/haystack_integrations/components/websearch/firecrawl/__init__.py new file mode 100644 index 0000000000..f681a89ea0 --- /dev/null +++ b/integrations/firecrawl/src/haystack_integrations/components/websearch/firecrawl/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2026-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from haystack_integrations.components.websearch.firecrawl.firecrawl_websearch import FirecrawlWebSearch + +__all__ = ["FirecrawlWebSearch"] diff --git a/integrations/firecrawl/src/haystack_integrations/components/websearch/firecrawl/firecrawl_websearch.py 
# SPDX-FileCopyrightText: 2026-present deepset GmbH
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any

from firecrawl import AsyncFirecrawl, Firecrawl  # type: ignore[import-untyped]
from firecrawl.types import SearchData  # type: ignore[import-untyped]
from haystack import Document, component, logging
from haystack.utils import Secret

logger = logging.getLogger(__name__)


@component
class FirecrawlWebSearch:
    """
    A component that uses Firecrawl to search the web and return results as Haystack Documents.

    This component wraps the Firecrawl Search API, enabling web search queries that return
    structured documents with content and links. It follows the standard Haystack WebSearch
    component interface.

    Firecrawl is a service that crawls and scrapes websites, returning content in formats suitable
    for LLMs. You need a Firecrawl API key from [firecrawl.dev](https://firecrawl.dev).

    ### Usage example

    ```python
    from haystack_integrations.components.websearch.firecrawl import FirecrawlWebSearch
    from haystack.utils import Secret

    websearch = FirecrawlWebSearch(
        api_key=Secret.from_env_var("FIRECRAWL_API_KEY"),
        top_k=5,
    )
    result = websearch.run(query="What is Haystack by deepset?")
    documents = result["documents"]
    links = result["links"]
    ```
    """

    def __init__(
        self,
        api_key: Secret = Secret.from_env_var("FIRECRAWL_API_KEY"),
        top_k: int | None = 10,
        search_params: dict[str, Any] | None = None,
    ) -> None:
        """
        Initialize the FirecrawlWebSearch component.

        :param api_key:
            API key for Firecrawl.
            Defaults to the `FIRECRAWL_API_KEY` environment variable.
        :param top_k:
            Maximum number of documents to return.
            Defaults to 10. This can be overridden by the `"limit"` parameter in `search_params`.
        :param search_params:
            Additional parameters passed to the Firecrawl search API.
            See the [Firecrawl API reference](https://docs.firecrawl.dev/api-reference/endpoint/search)
            for available parameters. Supported keys include: `tbs`, `location`,
            `scrape_options`, `sources`, `categories`, `timeout`.
        """
        self.api_key = api_key
        self.top_k = top_k
        # Public attribute mirrors the init parameter so component (de)serialization
        # round-trips it; the private copy below is what run()/run_async() actually use.
        self.search_params = search_params
        # Defensive copy so later caller-side mutation of the dict cannot change behavior.
        self._search_params = {} if search_params is None else search_params.copy()
        # Clients are created lazily in warm_up() to keep __init__ side-effect free.
        self._firecrawl_client: Firecrawl | None = None
        self._async_firecrawl_client: AsyncFirecrawl | None = None

    def warm_up(self) -> None:
        """
        Warm up the Firecrawl clients by initializing the sync and async clients.
        This is useful to avoid cold start delays when performing searches.
        """
        if self._firecrawl_client is None:
            self._firecrawl_client = Firecrawl(api_key=self.api_key.resolve_value())
        if self._async_firecrawl_client is None:
            self._async_firecrawl_client = AsyncFirecrawl(api_key=self.api_key.resolve_value())

    def _prepare_search_params(self, search_params: dict[str, Any] | None) -> dict[str, Any]:
        """
        Build the keyword arguments for a Firecrawl search call.

        Run-time `search_params` fully replace the init-time ones when provided.
        `top_k` is injected as `limit` unless the chosen params already set one.

        :param search_params: Optional run-time override of the search parameters.
        :returns: A fresh dict of keyword arguments for the Firecrawl search API.
        """
        current_params = search_params if search_params is not None else self._search_params
        params = current_params.copy()
        if "limit" not in params and self.top_k is not None:
            params["limit"] = self.top_k
        return params

    @component.output_types(documents=list[Document], links=list[str])
    def run(
        self,
        query: str,
        search_params: dict[str, Any] | None = None,
    ) -> dict[str, Any]:
        """
        Search the web using Firecrawl and return results as Documents.

        :param query: Search query string.
        :param search_params:
            Optional override of search parameters for this run.
            If provided, fully replaces the init-time search_params.
        :returns: A dictionary with the following keys:
            - `documents`: List of documents with search result content.
            - `links`: List of URLs from the search results.
        """
        if self._firecrawl_client is None:
            self.warm_up()

        params = self._prepare_search_params(search_params)

        try:
            search_response = self._firecrawl_client.search(  # type: ignore[union-attr]
                query=query,
                **params,
            )
        except Exception as error:
            # Deliberate best-effort behavior: API failures yield empty results
            # instead of raising, so a pipeline can keep running.
            logger.exception(f"Failed to search for query '{query}': {error}")
            return {"documents": [], "links": []}

        documents, links = self._parse_search_response(search_response)
        return {"documents": documents, "links": links}

    @component.output_types(documents=list[Document], links=list[str])
    async def run_async(
        self,
        query: str,
        search_params: dict[str, Any] | None = None,
    ) -> dict[str, Any]:
        """
        Asynchronously search the web using Firecrawl and return results as Documents.

        :param query: Search query string.
        :param search_params:
            Optional override of search parameters for this run.
            If provided, fully replaces the init-time search_params.
        :returns: A dictionary with the following keys:
            - `documents`: List of documents with search result content.
            - `links`: List of URLs from the search results.
        """
        if self._async_firecrawl_client is None:
            self.warm_up()

        params = self._prepare_search_params(search_params)

        try:
            search_response = await self._async_firecrawl_client.search(  # type: ignore[union-attr]
                query=query,
                **params,
            )
        except Exception as error:
            # Same best-effort contract as the sync path: never raise, return empty.
            logger.exception(f"Failed to search for query '{query}': {error}")
            return {"documents": [], "links": []}

        documents, links = self._parse_search_response(search_response)
        return {"documents": documents, "links": links}

    @staticmethod
    def _parse_search_response(search_response: SearchData) -> tuple[list[Document], list[str]]:
        """
        Convert a Firecrawl search response to Haystack Documents and links.

        Two result shapes are handled: scraped results carrying full `markdown`
        content (when scrape options are used) and plain search results that
        only provide `url`/`title`/`description`.

        :param search_response: Firecrawl search response object.
        :returns: Tuple of (documents, links).
        """
        documents: list[Document] = []
        links: list[str] = []

        # `web` may be None when the API returns no web results.
        web_results = search_response.web or []
        for result in web_results:
            url = ""
            title = ""
            content = ""

            if hasattr(result, "markdown") and result.markdown:
                # Scraped result: prefer full page markdown plus its metadata.
                content = result.markdown
                metadata = result.metadata_dict if hasattr(result, "metadata_dict") else {}
                url = metadata.get("url", getattr(result, "url", ""))
                title = metadata.get("title", "")
            else:
                # Plain search hit: fall back to the summary fields.
                url = getattr(result, "url", "")
                title = getattr(result, "title", "")
                content = getattr(result, "description", "")

            doc = Document(
                content=content,
                meta={
                    "title": title,
                    "url": url,
                },
            )
            documents.append(doc)
            if url:
                links.append(url)

        return documents, links
# SPDX-FileCopyrightText: 2026-present deepset GmbH
#
# SPDX-License-Identifier: Apache-2.0

import os
from unittest.mock import AsyncMock, MagicMock, patch

import pytest
from haystack import Document
from haystack.core.serialization import component_from_dict, component_to_dict
from haystack.utils import Secret

from haystack_integrations.components.websearch.firecrawl import FirecrawlWebSearch


class TestFirecrawlWebSearch:
    """Unit and integration tests for the FirecrawlWebSearch component."""

    @pytest.fixture
    def search_result_web(self) -> MagicMock:
        """A SearchResultWeb-like object (no markdown, just search metadata)."""
        # spec limits attributes so hasattr(result, "markdown") is False in the parser.
        result = MagicMock(spec=["url", "title", "description"])
        result.url = "https://example.com"
        result.title = "Example Title"
        result.description = "Example description snippet"
        return result

    @pytest.fixture
    def search_result_document(self) -> MagicMock:
        """A Document-like object (with markdown, from scrapeOptions)."""
        result = MagicMock(spec=["markdown", "metadata_dict", "url"])
        result.markdown = "# Full page content\nSome markdown text."
        result.metadata_dict = {"url": "https://example.com/page", "title": "Page Title"}
        result.url = "https://example.com/page"
        return result

    @pytest.fixture
    def search_response(self, search_result_web) -> MagicMock:
        """Standard search response with web results."""
        response = MagicMock()
        response.web = [search_result_web]
        return response

    @pytest.fixture
    def mock_client(self, search_response) -> MagicMock:
        # Sync Firecrawl client double returning the standard response.
        client = MagicMock()
        client.search.return_value = search_response
        return client

    @pytest.fixture
    def mock_async_client(self, search_response) -> MagicMock:
        # Async client double; search must be awaitable.
        client = MagicMock()
        client.search = AsyncMock(return_value=search_response)
        return client

    def test_init_default(self, monkeypatch: pytest.MonkeyPatch) -> None:
        monkeypatch.setenv("FIRECRAWL_API_KEY", "test-key")
        ws = FirecrawlWebSearch()
        assert ws.top_k == 10
        assert ws._search_params == {}
        assert ws.api_key.resolve_value() == "test-key"

    def test_init_with_params(self) -> None:
        ws = FirecrawlWebSearch(
            api_key=Secret.from_token("custom-key"),
            top_k=5,
            search_params={"tbs": "qdr:d", "location": "US"},
        )
        assert ws.top_k == 5
        assert ws._search_params == {"tbs": "qdr:d", "location": "US"}
        assert ws.api_key.resolve_value() == "custom-key"

    def test_to_dict(self, monkeypatch: pytest.MonkeyPatch) -> None:
        monkeypatch.setenv("FIRECRAWL_API_KEY", "test-key")
        ws = FirecrawlWebSearch(top_k=5, search_params={"tbs": "qdr:d"})
        data = component_to_dict(ws, "FirecrawlWebSearch")
        assert (
            data["type"]
            == "haystack_integrations.components.websearch.firecrawl.firecrawl_websearch.FirecrawlWebSearch"
        )
        assert data["init_parameters"]["top_k"] == 5
        assert data["init_parameters"]["search_params"] == {"tbs": "qdr:d"}

    def test_from_dict(self, monkeypatch: pytest.MonkeyPatch) -> None:
        monkeypatch.setenv("FIRECRAWL_API_KEY", "test-key")
        data = {
            "type": ("haystack_integrations.components.websearch.firecrawl.firecrawl_websearch.FirecrawlWebSearch"),
            "init_parameters": {
                "top_k": 3,
                "search_params": {"location": "UK"},
                "api_key": {"env_vars": ["FIRECRAWL_API_KEY"], "strict": True, "type": "env_var"},
            },
        }
        ws = component_from_dict(FirecrawlWebSearch, data, "FirecrawlWebSearch")
        assert ws.top_k == 3
        assert ws.search_params == {"location": "UK"}
        assert ws.api_key.resolve_value() == "test-key"

    def test_run_returns_documents_and_links(self, mock_client) -> None:
        ws = FirecrawlWebSearch(api_key=Secret.from_token("test-key"), top_k=10)
        # Inject the client directly to bypass warm_up().
        ws._firecrawl_client = mock_client

        result = ws.run(query="test query")

        assert "documents" in result
        assert "links" in result
        assert len(result["documents"]) == 1
        assert isinstance(result["documents"][0], Document)
        assert result["documents"][0].content == "Example description snippet"
        assert result["documents"][0].meta["url"] == "https://example.com"
        assert result["documents"][0].meta["title"] == "Example Title"
        assert result["links"] == ["https://example.com"]
        # top_k must be forwarded to the API as `limit`.
        mock_client.search.assert_called_once_with(query="test query", limit=10)

    def test_run_with_scraped_documents(self, search_result_document, mock_client) -> None:
        """When scrapeOptions are used, results contain markdown content."""
        search_response = MagicMock()
        search_response.web = [search_result_document]
        mock_client.search.return_value = search_response

        ws = FirecrawlWebSearch(api_key=Secret.from_token("test-key"), top_k=10)
        ws._firecrawl_client = mock_client

        result = ws.run(query="test query")

        assert result["documents"][0].content == "# Full page content\nSome markdown text."
        assert result["documents"][0].meta["url"] == "https://example.com/page"
        assert result["documents"][0].meta["title"] == "Page Title"

    def test_run_overrides_init_params_with_runtime_params(self, mock_client) -> None:
        ws = FirecrawlWebSearch(
            api_key=Secret.from_token("test-key"),
            top_k=10,
            search_params={"location": "US"},
        )
        ws._firecrawl_client = mock_client

        ws.run(query="test", search_params={"location": "UK", "limit": 5})

        # Run-time params fully replace init-time ones, including `limit` over top_k.
        mock_client.search.assert_called_once_with(
            query="test",
            location="UK",
            limit=5,
        )

    @pytest.mark.asyncio
    async def test_run_async(self, mock_async_client) -> None:
        ws = FirecrawlWebSearch(api_key=Secret.from_token("test-key"), top_k=10)
        ws._async_firecrawl_client = mock_async_client

        result = await ws.run_async(query="test query")

        assert len(result["documents"]) == 1
        assert isinstance(result["documents"][0], Document)
        assert result["links"] == ["https://example.com"]
        mock_async_client.search.assert_awaited_once()

    def test_run_returns_empty_on_error(self, mock_client) -> None:
        # API failures must yield empty outputs, not raise.
        mock_client.search.side_effect = Exception("API error")

        ws = FirecrawlWebSearch(api_key=Secret.from_token("test-key"))
        ws._firecrawl_client = mock_client

        result = ws.run(query="test")
        assert result["documents"] == []
        assert result["links"] == []

    @pytest.mark.asyncio
    async def test_run_async_returns_empty_on_error(self, mock_async_client) -> None:
        mock_async_client.search = AsyncMock(side_effect=Exception("API error"))

        ws = FirecrawlWebSearch(api_key=Secret.from_token("test-key"))
        ws._async_firecrawl_client = mock_async_client

        result = await ws.run_async(query="test")
        assert result["documents"] == []
        assert result["links"] == []

    def test_run_calls_warm_up(self, search_response) -> None:
        # run() must lazily create the sync client when none was injected.
        with (
            patch(
                "haystack_integrations.components.websearch.firecrawl.firecrawl_websearch.Firecrawl"
            ) as mock_firecrawl_client,
            patch("haystack_integrations.components.websearch.firecrawl.firecrawl_websearch.AsyncFirecrawl"),
        ):
            mock_firecrawl_client.return_value.search.return_value = search_response

            ws = FirecrawlWebSearch(api_key=Secret.from_token("test-key"))
            ws.run(query="test")

            assert ws._firecrawl_client is mock_firecrawl_client.return_value
            mock_firecrawl_client.assert_called_once_with(api_key="test-key")

    @pytest.mark.asyncio
    async def test_run_async_calls_warm_up(self, search_response) -> None:
        with (
            patch("haystack_integrations.components.websearch.firecrawl.firecrawl_websearch.Firecrawl"),
            patch(
                "haystack_integrations.components.websearch.firecrawl.firecrawl_websearch.AsyncFirecrawl"
            ) as mock_async_firecrawl_client,
        ):
            mock_async_firecrawl_client.return_value.search = AsyncMock(return_value=search_response)

            ws = FirecrawlWebSearch(api_key=Secret.from_token("test-key"))
            await ws.run_async(query="test")

            assert ws._async_firecrawl_client is mock_async_firecrawl_client.return_value
            mock_async_firecrawl_client.assert_called_once_with(api_key="test-key")

    def test_warm_up_initializes_clients(self) -> None:
        ws = FirecrawlWebSearch(api_key=Secret.from_token("test-key"))
        assert ws._firecrawl_client is None
        assert ws._async_firecrawl_client is None

        ws.warm_up()

        assert ws._firecrawl_client is not None
        assert ws._async_firecrawl_client is not None

    def test_run_empty_web_results(self, mock_client) -> None:
        empty_response = MagicMock()
        empty_response.web = []
        mock_client.search.return_value = empty_response

        ws = FirecrawlWebSearch(api_key=Secret.from_token("test-key"))
        ws._firecrawl_client = mock_client
        result = ws.run(query="obscure query")
        assert result["documents"] == []
        assert result["links"] == []

    def test_run_none_web_results(self, mock_client) -> None:
        # The API may return web=None; the parser must treat it as empty.
        none_response = MagicMock()
        none_response.web = None
        mock_client.search.return_value = none_response

        ws = FirecrawlWebSearch(api_key=Secret.from_token("test-key"))
        ws._firecrawl_client = mock_client
        result = ws.run(query="obscure query")
        assert result["documents"] == []
        assert result["links"] == []

    @pytest.mark.skipif(
        not os.environ.get("FIRECRAWL_API_KEY"),
        reason="Export FIRECRAWL_API_KEY to run integration tests.",
    )
    @pytest.mark.integration
    def test_run_integration(self) -> None:
        ws = FirecrawlWebSearch(
            api_key=Secret.from_env_var("FIRECRAWL_API_KEY"),
            top_k=3,
        )
        result = ws.run(query="What is Haystack by deepset?")

        assert "documents" in result
        assert "links" in result
        assert isinstance(result["documents"], list)
        assert len(result["documents"]) > 0
        assert len(result["links"]) > 0
        assert isinstance(result["documents"][0], Document)
        assert result["documents"][0].content

    @pytest.mark.skipif(
        not os.environ.get("FIRECRAWL_API_KEY"),
        reason="Export FIRECRAWL_API_KEY to run integration tests.",
    )
    @pytest.mark.integration
    @pytest.mark.asyncio
    async def test_run_async_integration(self) -> None:
        ws = FirecrawlWebSearch(
            api_key=Secret.from_env_var("FIRECRAWL_API_KEY"),
            top_k=3,
        )
        result = await ws.run_async(query="What is Haystack by deepset?")

        assert "documents" in result
        assert "links" in result
        assert len(result["documents"]) > 0
        assert len(result["links"]) > 0