diff --git a/integrations/deepeval/pyproject.toml b/integrations/deepeval/pyproject.toml
index 58a9a5e470..01e3011cdc 100644
--- a/integrations/deepeval/pyproject.toml
+++ b/integrations/deepeval/pyproject.toml
@@ -82,6 +82,7 @@ line-length = 120
 [tool.ruff.lint]
 select = [
   "A",
+  "ANN",
   "ARG",
   "B",
   "C",
@@ -110,6 +111,7 @@ select = [
 ignore = [
   # Allow non-abstract empty methods in abstract base classes
   "B027",
+  "ANN401", # Allow Any - used legitimately for dynamic types and SDK boundaries
   # Allow boolean positional values in function calls, like `dict.get(... True)`
   "FBT003",
   # Ignore checks for possible passwords
@@ -139,7 +141,7 @@ ban-relative-imports = "all"
 
 [tool.ruff.lint.per-file-ignores]
 # Tests can use magic values, assertions, and relative imports
-"tests/**/*" = ["PLR2004", "S101", "TID252"]
+"tests/**/*" = ["PLR2004", "S101", "TID252", "ANN"]
 
 [tool.coverage.run]
 source = ["haystack_integrations"]
diff --git a/integrations/deepeval/src/haystack_integrations/components/evaluators/deepeval/evaluator.py b/integrations/deepeval/src/haystack_integrations/components/evaluators/deepeval/evaluator.py
index f73d4872cd..9f25d642f9 100644
--- a/integrations/deepeval/src/haystack_integrations/components/evaluators/deepeval/evaluator.py
+++ b/integrations/deepeval/src/haystack_integrations/components/evaluators/deepeval/evaluator.py
@@ -54,7 +54,7 @@ def __init__(
         self,
         metric: str | DeepEvalMetric,
         metric_params: dict[str, Any] | None = None,
-    ):
+    ) -> None:
         """
         Construct a new DeepEval evaluator.
 
@@ -144,7 +144,7 @@ def from_dict(cls, data: dict[str, Any]) -> "DeepEvalEvaluator":
     def _invoke_deepeval(test_cases: list[LLMTestCase], metric: BaseMetric) -> EvaluationResult:
         return evaluate(test_cases=test_cases, metrics=[metric])
 
-    def _init_backend(self):
+    def _init_backend(self) -> None:
         """
         Initialize the DeepEval backend.
         """
diff --git a/integrations/deepeval/src/haystack_integrations/components/evaluators/deepeval/metrics.py b/integrations/deepeval/src/haystack_integrations/components/evaluators/deepeval/metrics.py
index 8575521097..a115ab2c16 100644
--- a/integrations/deepeval/src/haystack_integrations/components/evaluators/deepeval/metrics.py
+++ b/integrations/deepeval/src/haystack_integrations/components/evaluators/deepeval/metrics.py
@@ -49,7 +49,7 @@ class DeepEvalMetric(Enum):
     #: Inputs - `questions: List[str], contexts: List[List[str]], responses: List[str]`
     CONTEXTUAL_RELEVANCE = "contextual_relevance"
 
-    def __str__(self):
+    def __str__(self) -> str:
         return self.value
 
     @classmethod
@@ -87,7 +87,13 @@ class MetricResult:
     score: float | None = None
     explanation: str | None = None
 
-    def to_dict(self):
+    def to_dict(self) -> dict[str, Any]:
+        """
+        Convert the metric result to a dictionary.
+
+        :returns:
+            A dictionary with the metric result fields.
+        """
         return dataclasses.asdict(self)
 
 
@@ -129,6 +135,23 @@ def new(
         *,
         init_parameters: Mapping[str, type] | None = None,
     ) -> "MetricDescriptor":
+        """
+        Create a new metric descriptor, inferring input parameters from the converter signature.
+
+        :param metric:
+            The metric enum value.
+        :param backend:
+            The DeepEval metric class to instantiate.
+        :param input_converter:
+            Callable that converts Haystack inputs to DeepEval test cases.
+        :param output_converter:
+            Callable that converts DeepEval results to `MetricResult` objects.
+            If ``None``, the default output converter is used.
+        :param init_parameters:
+            Optional mapping of parameter names to types accepted by the backend metric's constructor.
+        :returns:
+            A new `MetricDescriptor` instance.
+        """
         input_converter_signature = inspect.signature(input_converter)
         input_parameters = {}
         for name, param in input_converter_signature.parameters.items():
@@ -158,7 +181,7 @@ class InputConverters:
     """
 
     @staticmethod
-    def _validate_input_elements(**kwargs):
+    def _validate_input_elements(**kwargs: Any) -> None:
         for k, collection in kwargs.items():
             if not isinstance(collection, list):
                 msg = (
@@ -177,6 +200,18 @@ def _validate_input_elements(**kwargs):
 
     @staticmethod
     def validate_input_parameters(metric: DeepEvalMetric, expected: dict[str, Any], received: dict[str, Any]) -> None:
+        """
+        Validate that all expected input parameters are present in the received inputs.
+
+        :param metric:
+            The metric being evaluated, used for error messages.
+        :param expected:
+            Dictionary of expected parameter names to their types.
+        :param received:
+            Dictionary of received parameter names to their values.
+        :raises ValueError:
+            If a required parameter is missing from ``received``.
+        """
         for param, _ in expected.items():
             if param not in received:
                 msg = f"DeepEval evaluator expected input parameter '{param}' for metric '{metric}'"
@@ -186,6 +221,18 @@ def validate_input_parameters(metric: DeepEvalMetric, expected: dict[str, Any],
     def question_context_response(
         questions: list[str], contexts: list[list[str]], responses: list[str]
     ) -> Iterable[LLMTestCase]:
+        """
+        Convert question, context, and response inputs to DeepEval test cases.
+
+        :param questions:
+            List of input questions.
+        :param contexts:
+            List of retrieval context lists, one per question.
+        :param responses:
+            List of model responses, one per question.
+        :returns:
+            An iterable of `LLMTestCase` objects.
+        """
         InputConverters._validate_input_elements(questions=questions, contexts=contexts, responses=responses)
         for q, c, r in zip(questions, contexts, responses, strict=True):  # type: ignore
             test_case = LLMTestCase(input=q, actual_output=r, retrieval_context=c)
@@ -195,6 +242,20 @@ def question_context_response(
     def question_context_response_ground_truth(
         questions: list[str], contexts: list[list[str]], responses: list[str], ground_truths: list[str]
     ) -> Iterable[LLMTestCase]:
+        """
+        Convert question, context, response, and ground truth inputs to DeepEval test cases.
+
+        :param questions:
+            List of input questions.
+        :param contexts:
+            List of retrieval context lists, one per question.
+        :param responses:
+            List of model responses, one per question.
+        :param ground_truths:
+            List of expected (ground truth) responses, one per question.
+        :returns:
+            An iterable of `LLMTestCase` objects.
+        """
         InputConverters._validate_input_elements(questions=questions, contexts=contexts, responses=responses)
         for q, c, r, gt in zip(questions, contexts, responses, ground_truths, strict=True):  # type: ignore
             test_case = LLMTestCase(input=q, actual_output=r, retrieval_context=c, expected_output=gt)
@@ -212,6 +273,15 @@ class OutputConverters:
     def default(
         metric: DeepEvalMetric,
     ) -> Callable[[TestResult], list[MetricResult]]:
+        """
+        Return the default output converter for a given metric.
+
+        :param metric:
+            The metric for which to create the converter.
+        :returns:
+            A callable that converts a `TestResult` to a list of `MetricResult` objects.
+        """
+
         def inner(output: TestResult, metric: DeepEvalMetric) -> list[MetricResult]:
             metric_name = str(metric)
             assert output.metrics_data
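
Note: with "ANN" added to ruff's select list and "ANN401" ignored, the flake8-annotations
rules now require explicit parameter and return annotations everywhere outside tests/,
while `typing.Any` stays legal at dynamic SDK boundaries. A minimal sketch of what this
configuration accepts and flags (the function names are illustrative, not part of this patch):

    from typing import Any

    def to_serializable(value: Any) -> dict[str, Any]:  # accepted: ANN401 is ignored, so Any is fine
        return {"value": value}

    def run(questions):  # flagged: ANN001 (unannotated argument) and ANN201 (missing return type)
        return questions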
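
Call sites are unchanged by the annotations. A hedged usage sketch, based on the inputs the
CONTEXTUAL_RELEVANCE docstring above declares (`questions: List[str], contexts: List[List[str]],
responses: List[str]`); the `model` entry is an assumption about what `metric_params` forwards
to the backend metric's constructor:

    from haystack_integrations.components.evaluators.deepeval import DeepEvalEvaluator, DeepEvalMetric

    evaluator = DeepEvalEvaluator(
        metric=DeepEvalMetric.CONTEXTUAL_RELEVANCE,
        metric_params={"model": "gpt-4o-mini"},  # assumption: passed through to the DeepEval metric
    )
    results = evaluator.run(
        questions=["When was the Colossus of Rhodes built?"],
        contexts=[["The Colossus of Rhodes was constructed around 280 BC."]],
        responses=["Around 280 BC."],
    )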