Skip to content

Commit 25020f2

Browse files
authored
test: Jina - add unit tests (#3214)
1 parent e462250 commit 25020f2

4 files changed

Lines changed: 130 additions & 0 deletions

File tree

integrations/jina/tests/test_document_image_embedder.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,39 @@ def mock_response_func(*_args, **kwargs):
240240
assert len(call_args_list[1][1]["json"]["input"]) == 5 # Second batch: 5 images
241241
assert len(call_args_list[2][1]["json"]["input"]) == 2 # Third batch: 2 images
242242

243+
@patch("haystack_integrations.components.embedders.jina.document_image_embedder._extract_image_sources_info")
def test_extract_images_pdf_missing_page_number_raises_value_error(self, mock_extract_info):
    """A PDF source reported without a page number must be rejected with ``ValueError``."""
    # The patched source-info helper reports a PDF entry that carries no "page_number" key.
    mock_extract_info.return_value = [{"path": "test.pdf", "mime_type": "application/pdf"}]
    pdf_docs = [Document(content="PDF doc", meta={"file_path": "test.pdf"})]
    image_embedder = JinaDocumentImageEmbedder(api_key=Secret.from_token("fake-api-key"))

    expected_message = "Page number is required for PDF document at index 0"
    with pytest.raises(ValueError, match=expected_message):
        image_embedder._extract_images_to_embed(pdf_docs)
251+
252+
@patch("haystack_integrations.components.embedders.jina.document_image_embedder._extract_image_sources_info")
@patch("haystack_integrations.components.embedders.jina.document_image_embedder._batch_convert_pdf_pages_to_images")
def test_extract_images_pdf_conversion_silent_failure_raises_runtime_error(
    self, mock_batch_convert, mock_extract_info
):
    """An empty PDF conversion result must surface as a ``RuntimeError``."""
    source_info = {"path": "test.pdf", "mime_type": "application/pdf", "page_number": 1}
    mock_extract_info.return_value = [source_info]
    # PDF conversion returns empty dict: image stays None without raising
    mock_batch_convert.return_value = {}

    pdf_docs = [Document(content="PDF doc", meta={"file_path": "test.pdf", "page_number": 1})]
    image_embedder = JinaDocumentImageEmbedder(api_key=Secret.from_token("fake-api-key"))

    expected_message = "Conversion failed for some documents"
    with pytest.raises(RuntimeError, match=expected_message):
        image_embedder._extract_images_to_embed(pdf_docs)
265+
266+
@pytest.mark.asyncio
async def test_run_async_with_connection_error(self):
    """A failing async POST must be re-raised by ``run_async`` as a ``RuntimeError``."""
    image_docs = [Document(content="img", meta={"file_path": "test.jpg"})]
    image_embedder = JinaDocumentImageEmbedder(api_key=Secret.from_token("fake-api-key"))

    # Skip real image extraction and make every HTTP POST blow up.
    extraction_patch = patch.object(
        image_embedder, "_extract_images_to_embed", return_value=["data:image/jpeg;base64,x"]
    )
    post_patch = patch("httpx.AsyncClient.post", side_effect=Exception("Connection failed"))

    with extraction_patch, post_patch:
        with pytest.raises(RuntimeError, match="Error calling Jina API: Connection failed"):
            await image_embedder.run_async(documents=image_docs)
275+
243276
@pytest.mark.asyncio
244277
async def test_run_async_with_successful_request(self):
245278
documents = [Document(content="Test image", meta={"file_path": "test.jpg"})]

integrations/jina/tests/test_ranker.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,30 @@ def test_init_fail_wo_api_key(self, monkeypatch):
5353
with pytest.raises(ValueError):
5454
JinaRanker()
5555

56+
def test_init_fails_with_invalid_top_k(self):
    """Constructing a ranker with ``top_k=0`` is expected to raise ``ValueError``."""
    token = Secret.from_token("fake-api-key")
    expected_message = "top_k must be > 0, but got 0"
    with pytest.raises(ValueError, match=expected_message):
        JinaRanker(api_key=token, top_k=0)
59+
60+
def test_from_dict(self, monkeypatch):
    """``JinaRanker.from_dict`` restores every init parameter from its serialized form."""
    monkeypatch.setenv("JINA_API_KEY", "fake-api-key")

    init_parameters = {
        "api_key": {"env_vars": ["JINA_API_KEY"], "strict": True, "type": "env_var"},
        "model": "model",
        "base_url": "https://my.custom.url/v1/rerank",
        "top_k": 5,
        "score_threshold": 0.3,
    }
    restored = JinaRanker.from_dict(
        {
            "type": "haystack_integrations.components.rankers.jina.ranker.JinaRanker",
            "init_parameters": init_parameters,
        }
    )

    assert restored.api_key == Secret.from_env_var("JINA_API_KEY")

    # Plain-valued attributes are checked against the values passed in above.
    expected_attributes = {
        "model": "model",
        "base_url": "https://my.custom.url/v1/rerank",
        "top_k": 5,
        "score_threshold": 0.3,
    }
    for attribute, expected_value in expected_attributes.items():
        assert getattr(restored, attribute) == expected_value
79+
5680
def test_to_dict(self, monkeypatch):
5781
monkeypatch.setenv("JINA_API_KEY", "fake-api-key")
5882
component = JinaRanker()
@@ -190,6 +214,34 @@ def test_run_on_empty_docs(self):
190214
assert result["documents"] is not None
191215
assert not result["documents"] # empty list
192216

217+
def test_run_raises_runtime_error_on_api_error(self):
    """A 400 response carrying a ``detail`` message must make ``run`` raise ``RuntimeError``."""
    error_response = httpx.Response(400, json={"detail": "Bad request"})
    single_doc = [Document(content="doc")]

    with patch("httpx.Client.post", return_value=error_response):
        reranker = JinaRanker(api_key=Secret.from_token("fake-api-key"))
        with pytest.raises(RuntimeError, match="Bad request"):
            reranker.run(query="q", documents=single_doc)
223+
224+
def test_run_with_score_threshold_filters_results(self):
    """Only documents scoring at or above ``score_threshold`` are returned by ``run``."""
    threshold = 2.5
    candidates = [Document(content=f"doc {index}") for index in range(4)]

    with patch("httpx.Client.post", side_effect=mock_httpx_post_response):
        reranker = JinaRanker(api_key=Secret.from_token("fake-api-key"), score_threshold=threshold)
        output = reranker.run(query="q", documents=candidates)

    # mock assigns scores len(docs)-i, so for 4 docs scores are 4, 3, 2, 1 - only first two pass
    kept = output["documents"]
    assert len(kept) == 2
    for hit in kept:
        assert hit.score >= threshold
235+
236+
def test_run_with_top_k_truncates_results(self):
    """Passing ``top_k=2`` to ``run`` limits the output to two documents."""
    candidates = [Document(content=f"doc {index}") for index in range(5)]

    with patch("httpx.Client.post", side_effect=mock_httpx_post_response):
        reranker = JinaRanker(api_key=Secret.from_token("fake-api-key"))
        output = reranker.run(query="q", documents=candidates, top_k=2)

    returned = output["documents"]
    assert len(returned) == 2
244+
193245
@pytest.mark.skipif(not os.environ.get("JINA_API_KEY", None), reason="JINA_API_KEY env var not set")
194246
@pytest.mark.integration
195247
def test_run_integration(self):

integrations/jina/tests/test_reader_connector.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,25 @@ def test_run_with_mocked_response(self, monkeypatch):
116116
assert document.meta["title"] == "Mocked Title"
117117
assert document.meta["url"] == "https://example.com"
118118

119+
def test_run_with_raw_response(self, monkeypatch):
    """With ``json_response=False`` the plain-text body becomes the document content."""
    monkeypatch.setenv("JINA_API_KEY", "test-api-key")
    target_url = "https://example.com"
    plain_response = httpx.Response(200, text="raw page content", headers={"Content-Type": "text/plain"})

    with patch("httpx.Client.get", return_value=plain_response) as mock_get:
        connector = JinaReaderConnector(mode="read", json_response=False)
        output = connector.run(query=target_url)

    # no Accept: application/json header when raw response is requested
    sent_headers = mock_get.call_args.kwargs["headers"]
    assert "Accept" not in sent_headers

    first_document = output["documents"][0]
    assert first_document.content == "raw page content"
    assert first_document.meta == {"content_type": "text/plain", "query": "https://example.com"}
137+
119138
@pytest.mark.asyncio
120139
async def test_run_async_with_mocked_response(self, monkeypatch):
121140
monkeypatch.setenv("JINA_API_KEY", "test-api-key")

integrations/jina/tests/test_text_embedder.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,32 @@ def test_to_dict(self, monkeypatch):
5858
},
5959
}
6060

61+
def test_from_dict(self, monkeypatch):
    """``JinaTextEmbedder.from_dict`` restores every init parameter from its serialized form."""
    monkeypatch.setenv("JINA_API_KEY", "fake-api-key")

    serialized_secret = {"env_vars": ["JINA_API_KEY"], "strict": True, "type": "env_var"}
    payload = {
        "type": "haystack_integrations.components.embedders.jina.text_embedder.JinaTextEmbedder",
        "init_parameters": {
            "api_key": serialized_secret,
            "model": "model",
            "base_url": "https://my.custom.url/v1/embeddings",
            "prefix": "prefix",
            "suffix": "suffix",
            "task": "retrieval.query",
            "dimensions": 1024,
            "late_chunking": True,
        },
    }
    restored = JinaTextEmbedder.from_dict(payload)

    assert restored.api_key == Secret.from_env_var("JINA_API_KEY")

    # The "model" init parameter is read back through the ``model_name`` attribute.
    expected_attributes = {
        "model_name": "model",
        "base_url": "https://my.custom.url/v1/embeddings",
        "prefix": "prefix",
        "suffix": "suffix",
        "task": "retrieval.query",
        "dimensions": 1024,
    }
    for attribute, expected_value in expected_attributes.items():
        assert getattr(restored, attribute) == expected_value
    assert restored.late_chunking is True
86+
6187
def test_to_dict_with_custom_init_parameters(self, monkeypatch):
6288
monkeypatch.setenv("JINA_API_KEY", "fake-api-key")
6389
component = JinaTextEmbedder(

0 commit comments

Comments
 (0)