Skip to content

Commit a279d5b

Browse files
refactor: Address review feedback for FirecrawlWebSearch
- Remove warm_up() from docstring example (auto-called on first run) - Remove custom to_dict()/from_dict() in favor of default serialization - Update top_k docstring to note it can be overridden by limit in search_params - Remove client-side top_k truncation (Firecrawl respects limit server-side) - Use SearchData type instead of Any for _parse_search_response - Remove redundant or "" from getattr() calls - Use search_response.web directly instead of getattr() - Remove corresponding tests for removed functionality
1 parent 6c69a76 commit a279d5b

2 files changed

Lines changed: 8 additions & 80 deletions

File tree

integrations/firecrawl/src/haystack_integrations/components/websearch/firecrawl/firecrawl_websearch.py

Lines changed: 8 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
from typing import Any
66

77
from firecrawl import AsyncFirecrawl, Firecrawl # type: ignore[import-untyped]
8-
from haystack import Document, component, default_from_dict, default_to_dict, logging
8+
from firecrawl.types import SearchData # type: ignore[import-untyped]
9+
from haystack import Document, component, logging
910
from haystack.utils import Secret
1011

1112
logger = logging.getLogger(__name__)
@@ -33,8 +34,6 @@ class FirecrawlWebSearch:
3334
api_key=Secret.from_env_var("FIRECRAWL_API_KEY"),
3435
top_k=5,
3536
)
36-
websearch.warm_up()
37-
3837
result = websearch.run(query="What is Haystack by deepset?")
3938
documents = result["documents"]
4039
links = result["links"]
@@ -55,7 +54,7 @@ def __init__(
5554
Defaults to the `FIRECRAWL_API_KEY` environment variable.
5655
:param top_k:
5756
Maximum number of documents to return.
58-
Defaults to 10.
57+
Defaults to 10. This can be overridden by the `"limit"` parameter in `search_params`.
5958
:param search_params:
6059
Additional parameters passed to the Firecrawl search API.
6160
See the [Firecrawl API reference](https://docs.firecrawl.dev/api-reference/endpoint/search)
@@ -79,20 +78,6 @@ def warm_up(self) -> None:
7978
if self._async_firecrawl_client is None:
8079
self._async_firecrawl_client = AsyncFirecrawl(api_key=self.api_key.resolve_value())
8180

82-
def to_dict(self) -> dict[str, Any]:
83-
"""Serializes the component to a dictionary."""
84-
return default_to_dict(
85-
self,
86-
api_key=self.api_key.to_dict(),
87-
top_k=self.top_k,
88-
search_params=self.search_params,
89-
)
90-
91-
@classmethod
92-
def from_dict(cls, data: dict[str, Any]) -> "FirecrawlWebSearch":
93-
"""Deserializes the component from a dictionary."""
94-
return default_from_dict(cls, data)
95-
9681
@component.output_types(documents=list[Document], links=list[str])
9782
def run(
9883
self,
@@ -128,11 +113,6 @@ def run(
128113
return {"documents": [], "links": []}
129114

130115
documents, links = self._parse_search_response(search_response)
131-
132-
if self.top_k is not None:
133-
documents = documents[: self.top_k]
134-
links = links[: self.top_k]
135-
136116
return {"documents": documents, "links": links}
137117

138118
@component.output_types(documents=list[Document], links=list[str])
@@ -170,15 +150,10 @@ async def run_async(
170150
return {"documents": [], "links": []}
171151

172152
documents, links = self._parse_search_response(search_response)
173-
174-
if self.top_k is not None:
175-
documents = documents[: self.top_k]
176-
links = links[: self.top_k]
177-
178153
return {"documents": documents, "links": links}
179154

180155
@staticmethod
181-
def _parse_search_response(search_response: Any) -> tuple[list[Document], list[str]]:
156+
def _parse_search_response(search_response: SearchData) -> tuple[list[Document], list[str]]:
182157
"""
183158
Convert a Firecrawl search response to Haystack Documents and links.
184159
@@ -188,7 +163,7 @@ def _parse_search_response(search_response: Any) -> tuple[list[Document], list[s
188163
documents: list[Document] = []
189164
links: list[str] = []
190165

191-
web_results = getattr(search_response, "web", None) or []
166+
web_results = search_response.web or []
192167
for result in web_results:
193168
url = ""
194169
title = ""
@@ -200,9 +175,9 @@ def _parse_search_response(search_response: Any) -> tuple[list[Document], list[s
200175
url = metadata.get("url", getattr(result, "url", ""))
201176
title = metadata.get("title", "")
202177
else:
203-
url = getattr(result, "url", "") or ""
204-
title = getattr(result, "title", "") or ""
205-
content = getattr(result, "description", "") or ""
178+
url = getattr(result, "url", "")
179+
title = getattr(result, "title", "")
180+
content = getattr(result, "description", "")
206181

207182
doc = Document(
208183
content=content,

integrations/firecrawl/tests/test_firecrawl_websearch.py

Lines changed: 0 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77

88
import pytest
99
from haystack import Document
10-
from haystack.core.serialization import component_from_dict, component_to_dict
1110
from haystack.utils import Secret
1211

1312
from haystack_integrations.components.websearch.firecrawl import FirecrawlWebSearch
@@ -68,32 +67,6 @@ def test_init_with_params(self) -> None:
6867
assert ws._search_params == {"tbs": "qdr:d", "location": "US"}
6968
assert ws.api_key.resolve_value() == "custom-key"
7069

71-
def test_to_dict(self, monkeypatch: pytest.MonkeyPatch) -> None:
72-
monkeypatch.setenv("FIRECRAWL_API_KEY", "test-key")
73-
ws = FirecrawlWebSearch(top_k=5, search_params={"tbs": "qdr:d"})
74-
data = component_to_dict(ws, "FirecrawlWebSearch")
75-
assert (
76-
data["type"]
77-
== "haystack_integrations.components.websearch.firecrawl.firecrawl_websearch.FirecrawlWebSearch"
78-
)
79-
assert data["init_parameters"]["top_k"] == 5
80-
assert data["init_parameters"]["search_params"] == {"tbs": "qdr:d"}
81-
82-
def test_from_dict(self, monkeypatch: pytest.MonkeyPatch) -> None:
83-
monkeypatch.setenv("FIRECRAWL_API_KEY", "test-key")
84-
data = {
85-
"type": ("haystack_integrations.components.websearch.firecrawl.firecrawl_websearch.FirecrawlWebSearch"),
86-
"init_parameters": {
87-
"top_k": 3,
88-
"search_params": {"location": "UK"},
89-
"api_key": {"env_vars": ["FIRECRAWL_API_KEY"], "strict": True, "type": "env_var"},
90-
},
91-
}
92-
ws = component_from_dict(FirecrawlWebSearch, data, "FirecrawlWebSearch")
93-
assert ws.top_k == 3
94-
assert ws.search_params == {"location": "UK"}
95-
assert ws.api_key.resolve_value() == "test-key"
96-
9770
def test_run_returns_documents_and_links(self, mock_client) -> None:
9871
ws = FirecrawlWebSearch(api_key=Secret.from_token("test-key"), top_k=10)
9972
ws._firecrawl_client = mock_client
@@ -141,26 +114,6 @@ def test_run_overrides_init_params_with_runtime_params(self, mock_client) -> Non
141114
limit=5,
142115
)
143116

144-
def test_run_respects_top_k(self, mock_client) -> None:
145-
results = []
146-
for i in range(5):
147-
r = MagicMock(spec=["url", "title", "description"])
148-
r.url = f"https://example.com/{i}"
149-
r.title = f"Title {i}"
150-
r.description = f"Description {i}"
151-
results.append(r)
152-
response = MagicMock()
153-
response.web = results
154-
mock_client.search.return_value = response
155-
156-
ws = FirecrawlWebSearch(api_key=Secret.from_token("test-key"), top_k=3)
157-
ws._firecrawl_client = mock_client
158-
159-
result = ws.run(query="test")
160-
161-
assert len(result["documents"]) == 3
162-
assert len(result["links"]) == 3
163-
164117
@pytest.mark.asyncio
165118
async def test_run_async(self, mock_async_client) -> None:
166119
ws = FirecrawlWebSearch(api_key=Secret.from_token("test-key"), top_k=10)

0 commit comments

Comments
 (0)