1+ import inspect
12import os
2- from unittest .mock import MagicMock
3+ from unittest .mock import AsyncMock , MagicMock
34
45import pytest
56from haystack import Document , Pipeline
@@ -47,6 +48,20 @@ async def ascore(user_input: str, response: str, retrieved_contexts: list) -> Me
4748 return metric
4849
4950
def make_metric_async(name: str, score: float = 0.8, reason: str = "test reason") -> MagicMock:
    """
    Build a mocked SimpleBaseMetric for the async tests.

    The returned mock's ``ascore`` is an ``AsyncMock`` that resolves to
    ``MetricResult(value=score, reason=reason)`` and carries an explicit
    ``__signature__`` so that ``inspect.signature`` reports the
    ``(user_input, response, retrieved_contexts)`` parameters.

    :param name: Value assigned to the mock's ``name`` attribute.
    :param score: Value placed in the returned ``MetricResult``.
    :param reason: Reason placed in the returned ``MetricResult``.
    :returns: The configured ``MagicMock``.
    """

    # Never awaited by this helper: it exists only so its signature can be copied onto the mock.
    async def signature_template(user_input: str, response: str, retrieved_contexts: list) -> MetricResult:
        return MetricResult(value=score, reason=reason)

    scorer = AsyncMock(return_value=MetricResult(value=score, reason=reason))
    scorer.__signature__ = inspect.signature(signature_template)

    fake_metric = MagicMock(spec=SimpleBaseMetric)
    fake_metric.name = name
    fake_metric.ascore = scorer
    return fake_metric
63+
64+
5065class TestInit :
5166 def test_init (self , monkeypatch ):
5267 monkeypatch .setenv ("OPENAI_API_KEY" , "test" )
@@ -67,7 +82,7 @@ def test_init_with_multiple_metrics(self, monkeypatch):
6782 assert len (evaluator .metrics ) == 2
6883
def test_invalid_metrics_raises_type_error(self):
    """A ragas_metrics list containing a non-metric item must raise TypeError."""
    # pytest.raises(match=...) is applied as a regular-expression search, so the trailing
    # period is escaped to match a literal "." instead of any character.
    with pytest.raises(TypeError, match=r"All items in ragas_metrics must be instances of SimpleBaseMetric\."):
        RagasEvaluator(ragas_metrics=["not_a_metric"])
7287
7388
@@ -167,6 +182,119 @@ def test_run_raises_on_invalid_input_types(self, invalid_input, field_name, erro
167182 assert error_message in str (exc_info .value )
168183
169184
class TestRunAsync:
    """Tests for ``RagasEvaluator.run_async`` driven by mocked async metrics."""

    @pytest.mark.asyncio
    async def test_run_async_returns_result_by_metric_name(self) -> None:
        """The output exposes the metric's MetricResult under ``result[<metric name>]``."""
        metric = make_metric_async("faithfulness", score=0.9)
        evaluator = RagasEvaluator(ragas_metrics=[metric])
        output = await evaluator.run_async(
            query="Which is the most popular global sport?",
            response="Football is the most popular sport.",
            documents=["Football is undoubtedly the world's most popular sport."],
        )
        assert "result" in output
        assert "faithfulness" in output["result"]
        result = output["result"]["faithfulness"]
        assert isinstance(result, MetricResult)
        assert result.value == 0.9

    @pytest.mark.asyncio
    async def test_run_async_scores_all_metrics(self) -> None:
        """Every configured metric contributes its own entry to the result mapping."""
        metrics = [make_metric_async("faithfulness", 0.9), make_metric_async("answer_relevancy", 0.7)]
        evaluator = RagasEvaluator(ragas_metrics=metrics)
        output = await evaluator.run_async(query="test?", response="answer", documents=["doc"])
        assert set(output["result"].keys()) == {"faithfulness", "answer_relevancy"}
        assert output["result"]["faithfulness"].value == 0.9
        assert output["result"]["answer_relevancy"].value == 0.7

    @pytest.mark.asyncio
    async def test_run_async_calls_ascore_on_each_metric(self) -> None:
        """``ascore`` is invoked exactly once per metric for a single run."""
        metric_a = make_metric_async("faithfulness")
        metric_b = make_metric_async("answer_relevancy")
        evaluator = RagasEvaluator(ragas_metrics=[metric_a, metric_b])
        await evaluator.run_async(query="test?", response="answer", documents=["doc"])
        metric_a.ascore.assert_called_once()
        metric_b.ascore.assert_called_once()

    @pytest.mark.asyncio
    async def test_score_metric_async_passes_only_matching_params(self) -> None:
        """Metric that only needs user_input + response should not receive retrieved_contexts."""
        metric = MagicMock(spec=SimpleBaseMetric)
        metric.name = "selective_metric"

        # Never awaited: it only supplies the two-parameter signature copied onto the mock below.
        async def ascore(user_input: str, response: str) -> MetricResult:
            return MetricResult(value=0.5, reason="ok")

        # An AsyncMock accepts and records whatever keyword arguments it is called with, so the
        # assertion below inspects what was really passed instead of what a fixed-signature
        # function would have been able to accept.
        mock_ascore = AsyncMock(return_value=MetricResult(value=0.5, reason="ok"))
        mock_ascore.__signature__ = inspect.signature(ascore)
        metric.ascore = mock_ascore

        evaluator = RagasEvaluator(ragas_metrics=[metric])
        await evaluator.run_async(query="test?", response="answer", documents=["doc"], reference="ref")

        # Only user_input and response should have been passed — not retrieved_contexts or reference
        mock_ascore.assert_called_once()
        _, kwargs = mock_ascore.call_args
        assert set(kwargs) == {"user_input", "response"}

    @pytest.mark.asyncio
    async def test_score_metric_async_omits_none_fields(self) -> None:
        """A field that was not supplied to ``run_async`` is not forwarded to ``ascore``."""
        metric = make_metric_async("faithfulness")
        evaluator = RagasEvaluator(ragas_metrics=[metric])
        await evaluator.run_async(query="test?", response="answer")  # no documents → retrieved_contexts=None
        _, kwargs = metric.ascore.call_args
        assert "retrieved_contexts" not in kwargs

    @pytest.mark.asyncio
    async def test_run_async_accepts_document_objects(self) -> None:
        """``Document`` inputs reach ``ascore`` as a list of their ``content`` strings."""
        metric = make_metric_async("faithfulness")
        evaluator = RagasEvaluator(ragas_metrics=[metric])
        await evaluator.run_async(
            query="test?",
            response="answer",
            documents=[Document(content="some content"), Document(content="more content")],
        )
        _, kwargs = metric.ascore.call_args
        assert kwargs["retrieved_contexts"] == ["some content", "more content"]

    @pytest.mark.asyncio
    async def test_run_async_accepts_string_documents(self) -> None:
        """Plain string documents reach ``ascore`` unchanged as ``retrieved_contexts``."""
        metric = make_metric_async("faithfulness")
        evaluator = RagasEvaluator(ragas_metrics=[metric])
        await evaluator.run_async(query="test?", response="answer", documents=["doc one", "doc two"])
        _, kwargs = metric.ascore.call_args
        assert kwargs["retrieved_contexts"] == ["doc one", "doc two"]

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "invalid_input,field_name,error_message",
        [
            (["Invalid query type"], "query", "'query' field expected"),
            ([123, ["Invalid document"]], "documents", "'documents' must be a list"),
            (["score_1"], "rubrics", "'rubrics' field expected"),
        ],
    )
    async def test_run_async_raises_on_invalid_input_types(
        self, invalid_input, field_name, error_message
    ) -> None:
        """An input of the wrong type raises ValueError whose message names the offending field."""
        evaluator = RagasEvaluator(ragas_metrics=[make_metric_async("faithfulness")])

        # Start from a fully valid call and override (or add) only the field under test.
        run_kwargs = {
            "query": "Which is the most popular global sport?",
            "documents": ["Football is the most popular sport."],
            "response": "Football is the most popular sport in the world",
        }
        run_kwargs[field_name] = invalid_input

        with pytest.raises(ValueError) as exc_info:
            await evaluator.run_async(**run_kwargs)

        assert error_message in str(exc_info.value)
296+
297+
170298class TestSerialization :
171299 def test_to_dict (self , monkeypatch ):
172300 monkeypatch .setenv ("OPENAI_API_KEY" , "test" )
0 commit comments