@@ -320,3 +320,87 @@ def test_live_run(self):
320320 assert "prompt_tokens" in result ["meta" ][0 ]["usage" ]
321321 assert "completion_tokens" in result ["meta" ][0 ]["usage" ]
322322 assert "total_tokens" in result ["meta" ][0 ]["usage" ]
323+
324+
class TestFaithfulnessEvaluatorAsync:
    """Tests for ``FaithfulnessEvaluator.run_async``.

    The first two tests replace ``OpenAIChatGenerator.run_async`` with a local
    coroutine so no network call is made; the last one is a live integration
    test that is skipped unless ``OPENAI_API_KEY`` is set.
    """

    @pytest.mark.asyncio
    async def test_run_async_calculates_mean_score(self, monkeypatch):
        """Check per-query scores, per-query results and the aggregated score."""
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = FaithfulnessEvaluator()

        # Canned JSON replies: one for the prompt mentioning "Football", one for everything else.
        football_payload = '{"statements": ["a", "b"], "statement_scores": [1, 0]}'
        fallback_payload = '{"statements": ["c", "d"], "statement_scores": [1, 1]}'

        async def fake_run_async(self, *args, **kwargs):
            prompt_text = kwargs["messages"][0].text
            payload = football_payload if "Football" in prompt_text else fallback_payload
            return {"replies": [ChatMessage.from_assistant(payload)]}

        monkeypatch.setattr(
            "haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run_async", fake_run_async
        )

        results = await component.run_async(
            questions=["Which is the most popular global sport?", "Who created the Python language?"],
            contexts=[["Football is the world's most popular sport."], ["Python was created by Guido van Rossum."]],
            predicted_answers=[
                "Football is the most popular sport.",
                "Python is a language created by George Lucas.",
            ],
        )

        expected = {
            "individual_scores": [0.5, 1.0],
            "results": [
                {"score": 0.5, "statement_scores": [1, 0], "statements": ["a", "b"]},
                {"score": 1.0, "statement_scores": [1, 1], "statements": ["c", "d"]},
            ],
            "score": 0.75,
            "meta": None,
        }
        assert results == expected

    @pytest.mark.asyncio
    async def test_run_async_returns_nan_raise_on_failure_false(self, monkeypatch, caplog):
        """Check that a failing query yields NaN and a warning when ``raise_on_failure=False``."""
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = FaithfulnessEvaluator(raise_on_failure=False)

        success_payload = '{"statements": ["c", "d"], "statement_scores": [1, 1]}'

        async def fake_run_async(self, *args, **kwargs):
            # Simulate a generator failure only for the prompt that mentions "Python".
            if "Python" in kwargs["messages"][0].text:
                raise Exception("OpenAI API request failed.")
            return {"replies": [ChatMessage.from_assistant(success_payload)]}

        monkeypatch.setattr(
            "haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run_async", fake_run_async
        )

        run_kwargs = {
            "questions": ["Which is the most popular global sport?", "Who created the Python language?"],
            "contexts": [["Football is popular."], ["Python was created by Guido."]],
            "predicted_answers": ["Football is popular.", "Guido van Rossum."],
        }

        with caplog.at_level("WARNING", logger="haystack.components.evaluators.faithfulness"):
            results = await component.run_async(**run_kwargs)

        per_query_scores = results["individual_scores"]
        assert results["score"] == 1.0
        assert per_query_scores[0] == 1.0
        assert math.isnan(per_query_scores[1])
        assert "1 query(s) failed and were excluded from the score." in caplog.text

    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not os.environ.get("OPENAI_API_KEY"),
        reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
    )
    @pytest.mark.integration
    async def test_live_run_async(self):
        """Run against the real OpenAI API and check the shape of the returned dictionary."""
        evaluator = FaithfulnessEvaluator(chat_generator=OpenAIChatGenerator(model="gpt-4.1-nano"))
        result = await evaluator.run_async(
            questions=["What is Python and who created it?"],
            contexts=[["Python is a programming language created by Guido van Rossum."]],
            predicted_answers=["Python is a programming language created by George Lucas."],
        )

        # Top-level keys of the output.
        top_level_keys = ("individual_scores", "results", "score")
        assert all(key in result for key in top_level_keys)

        # Keys of the first per-query result entry.
        per_query_keys = ("score", "statement_scores", "statements")
        first_entry = result["results"][0]
        assert all(key in first_entry for key in per_query_keys)

        # Metadata must be present and carry token usage counters.
        assert "meta" in result
        for usage_key in ("prompt_tokens", "completion_tokens", "total_tokens"):
            assert usage_key in result["meta"][0]["usage"]
0 commit comments