FIRE-572 FIRE-589 | Add support for tsq detector in python sdk (#262)

yuval-qf · web-flow · commit f413b2beb3b8 · 2025-05-26T07:46:11.000+03:00
* Add support for tsq detector in python sdk

* Update pydoc

* Update pydoc and fix ToolDefinition name

* CR

* Bump version 0.8.0 -> 0.9.0

* Improve test coverage

* Fix tests

* Fix tracing base_url
diff --git a/pyproject.toml b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "qualifire"
-version = "0.8.0"
+version = "0.9.0"
 description = "Qualifire Python SDK"
 readme = "README.md"
 authors = ["qualifire-dev <dror@qualifire.ai>"]
diff --git a/qualifire/client.py b/qualifire/client.py
@@ -6,7 +6,13 @@
 
 import requests
 
-from .types import EvaluationRequest, EvaluationResponse, LLMMessage, SyntaxCheckArgs
+from .types import (
+    EvaluationRequest,
+    EvaluationResponse,
+    LLMMessage,
+    LLMToolDefinition,
+    SyntaxCheckArgs,
+)
 from .utils import get_api_key, get_base_url
 
 logger = logging.getLogger("qualifire")
@@ -27,26 +33,32 @@ def __init__(
 
     def evaluate(
         self,
-        input: str,
-        output: str,
+        input: Optional[str] = None,
+        output: Optional[str] = None,
+        messages: Optional[List[LLMMessage]] = None,
+        available_tools: Optional[List[LLMToolDefinition]] = None,
         assertions: Optional[List[str]] = None,
         dangerous_content_check: bool = False,
         grounding_check: bool = False,
         hallucinations_check: bool = False,
         harassment_check: bool = False,
         hate_speech_check: bool = False,
         instructions_following_check: bool = False,
-        messages: Optional[List[LLMMessage]] = None,
         pii_check: bool = False,
         prompt_injections: bool = False,
         sexual_content_check: bool = False,
         syntax_checks: Optional[Dict[str, SyntaxCheckArgs]] = None,
+        tool_selection_quality_check: bool = False,
     ) -> Union[EvaluationResponse, None]:
         """
         Evaluates the given input and output pairs.
 
         :param input: The primary input for the evaluation.
         :param output: The primary output (e.g., LLM response) to evaluate.
+        :param messages: List of message objects representing conversation history.
+            Must be set if tool_selection_quality_check is True.
+        :param available_tools: List of available tools.
+            Must be set if tool_selection_quality_check is True.
         :param assertions: A list of custom assertions to check against the output.
         :param dangerous_content_check: Check for dangerous content generation.
         :param grounding_check: Check if the output is grounded in the provided
@@ -56,11 +68,12 @@ def evaluate(
         :param hate_speech_check: Check for hate speech.
         :param instructions_following_check: Check if the output follows instructions
                                              in the input/messages.
-        :param messages: List of message objects representing conversation history.
         :param pii_check: Check for personally identifiable information.
         :param prompt_injections: Check for attempts at prompt injection.
         :param sexual_content_check: Check for sexually explicit content.
         :param syntax_checks: Dictionary defining syntax checks (e.g., JSON, SQL).
+        :param tool_selection_quality_check: Check for tool selection quality.
+            Requires `available_tools` and `messages`; ValueError is raised otherwise.
 
         :return: An EvaluationResponse object containing the evaluation results.
         :raises Exception: If an error occurs during the evaluation.
@@ -95,27 +108,83 @@ def evaluate(
             sexual_content_check=True,
             syntax_checks={
                 "json": SyntaxCheckArgs(args="strict") # Example syntax check
-            }
+            },
         )
         ```
-        """
 
+        Example with tools:
+        ```python
+        from qualifire import Qualifire
+        from qualifire.types import LLMMessage, LLMToolCall, LLMToolDefinition
+
+        qualifire = Qualifire(api_key="your_api_key")
+
+        evaluation_response = qualifire.evaluate(
+            messages=[
+                LLMMessage(
+                    role="user",
+                    content="What is the weather tomorrow in New York?",
+                ),
+                LLMMessage(
+                    role="assistant",
+                    content='please run the following tool',
+                    tool_calls=[
+                        LLMToolCall(
+                            id="tool_call_id",
+                            name="get_weather_forecast",
+                            arguments={
+                                "location": "New York, NY",
+                                "date": "tomorrow",
+                            },
+                        ),
+                    ],
+                ),
+            ],
+            available_tools=[
+                LLMToolDefinition(
+                    name="get_weather_forecast",
+                    description="Provides the weather forecast for a given location and date.",
+                    parameters={
+                        "type": "object",
+                        "properties": {
+                            "location": {
+                                "type": "string",
+                                "description": "The city and state, e.g., San Francisco, CA",
+                            },
+                            "date": {
+                                "type": "string",
+                                "description": "The date for the forecast, e.g., tomorrow, or YYYY-MM-DD",
+                            },
+                        },
+                        "required": [
+                            "location",
+                            "date",
+                        ],
+                    },
+                ),
+            ],
+            tool_selection_quality_check=True,
+        )
+        ```
+        """  # noqa E501
         url = f"{self._base_url}/api/evaluation/evaluate"
         request = EvaluationRequest(
             input=input,
             output=output,
+            messages=messages,
+            available_tools=available_tools,
             assertions=assertions,
             dangerous_content_check=dangerous_content_check,
             grounding_check=grounding_check,
             hallucinations_check=hallucinations_check,
             harassment_check=harassment_check,
             hate_speech_check=hate_speech_check,
             instructions_following_check=instructions_following_check,
-            messages=messages,
             pii_check=pii_check,
             prompt_injections=prompt_injections,
             sexual_content_check=sexual_content_check,
             syntax_checks=syntax_checks,
+            tool_selection_quality_check=tool_selection_quality_check,
         )
 
         # Filter out None values before dumping to JSON
diff --git a/qualifire/tracer_init.py b/qualifire/tracer_init.py
@@ -37,7 +37,7 @@ def __configure_tracer(api_key: str) -> None:
     __suppress_prints(
         Traceloop.init,
         app_name="qualifire-agent",
-        api_endpoint=f"{get_base_url()}/telemetry",  # /v1/traces is automatically added  # noqa: E501
+        api_endpoint=f"{get_base_url()}/api/telemetry",  # /v1/traces is automatically added  # noqa: E501
         headers={"X-Qualifire-API-Key": api_key},
         telemetry_enabled=False,
         traceloop_sync_enabled=False,
diff --git a/qualifire/types.py b/qualifire/types.py
@@ -1,12 +1,27 @@
-from typing import Dict, List, Optional
+from typing import Any, Dict, List, Optional
 
 from dataclasses import dataclass, field
 
 
+@dataclass
+class LLMToolDefinition:
+    name: str
+    description: str
+    parameters: Dict[str, Any]
+
+
+@dataclass
+class LLMToolCall:
+    name: str
+    arguments: Dict[str, Any]
+    id: Optional[str]
+
+
 @dataclass
 class LLMMessage:
-    content: str
     role: str
+    content: str
+    tool_calls: Optional[List[LLMToolCall]] = None
 
 
 @dataclass
@@ -16,20 +31,44 @@ class SyntaxCheckArgs:
 
 @dataclass
 class EvaluationRequest:
-    input: str
-    output: str
-    dangerous_content_check: bool
-    hallucinations_check: bool
-    harassment_check: bool
-    hate_speech_check: bool
-    pii_check: bool
-    prompt_injections: bool
-    sexual_content_check: bool
+    input: Optional[str] = None
+    output: Optional[str] = None
+    messages: Optional[List[LLMMessage]] = field(default_factory=list)
+    available_tools: Optional[List[LLMToolDefinition]] = None
+    dangerous_content_check: bool = False
+    hallucinations_check: bool = False
+    harassment_check: bool = False
+    hate_speech_check: bool = False
+    pii_check: bool = False
+    prompt_injections: bool = False
+    sexual_content_check: bool = False
     grounding_check: bool = False
     instructions_following_check: bool = False
     syntax_checks: Optional[Dict[str, SyntaxCheckArgs]] = None
-    messages: Optional[List[LLMMessage]] = field(default_factory=list)
     assertions: Optional[List[str]] = field(default_factory=list)
+    tool_selection_quality_check: bool = False
+
+    def __post_init__(self):
+        self._validate_messages_input_output()
+        self._validate_tsq_requirements()
+
+    def _validate_messages_input_output(self):
+        if not self.messages and not self.input and not self.output:
+            raise ValueError(
+                "At least one of messages, input, or output must be set",
+            )
+
+    def _validate_tsq_requirements(self):
+        if self.tool_selection_quality_check and not self.messages:
+            raise ValueError(
+                "messages must be provided in conjunction "
+                "with tool_selection_quality_check=True."
+            )
+        if self.tool_selection_quality_check and not self.available_tools:
+            raise ValueError(
+                "available_tools must be provided in conjunction "
+                "with tool_selection_quality_check=True."
+            )
 
 
 @dataclass
diff --git a/setup.py b/setup.py
@@ -4,7 +4,7 @@
 if __name__ == "__main__":
     setup(
         name="qualifire",
-        version="0.8.0",
+        version="0.9.0",
         description="Qualifire Python SDK",
         author="qualifire-dev",
         author_email="dror@qualifire.ai",
diff --git a/tests/test_types.py b/tests/test_types.py
@@ -0,0 +1,113 @@
+import contextlib
+
+import pytest
+
+from qualifire.types import EvaluationRequest, LLMMessage, LLMToolDefinition
+
+_test_llm_messages = [
+    LLMMessage(
+        role="user",
+        content="test",
+    ),
+]
+
+_test_available_tools = [
+    LLMToolDefinition(
+        name="foo",
+        description="foo tool function definition",
+        parameters={
+            "type": "object",
+            "properties": {
+                "bar": {
+                    "type": "string",
+                },
+                "baz": {
+                    "type": "integer",
+                },
+            },
+            "required": ["bar", "baz"],
+        },
+    )
+]
+
+
+class TestEvaluationRequest:
+    @pytest.mark.parametrize(
+        "messages,input_,output,expected_error",
+        [
+            (None, None, None, True),
+            ([], None, None, True),
+            (None, "", None, True),
+            (None, None, "", True),
+            (_test_llm_messages, None, None, False),
+            (_test_llm_messages, "", None, False),
+            (_test_llm_messages, None, "", False),
+            (_test_llm_messages, "", "", False),
+            (None, "input", None, False),
+            (None, "input", "", False),
+            ([], "input", None, False),
+            ([], "input", "", False),
+            (None, None, "output", False),
+            (None, "", "output", False),
+            ([], None, "output", False),
+            ([], "", "output", False),
+            (_test_llm_messages, "input", None, False),
+            (_test_llm_messages, "input", "", False),
+            (_test_llm_messages, None, "output", False),
+            (_test_llm_messages, "", "output", False),
+            (None, "input", "output", False),
+            ([], "input", "output", False),
+            (_test_llm_messages, "input", "output", False),
+        ],
+    )
+    def test_validate_messages_input_output(
+        self,
+        messages,
+        input_,
+        output,
+        expected_error,
+    ):
+        with pytest.raises(ValueError) if expected_error else contextlib.nullcontext():
+            EvaluationRequest(
+                messages=messages,
+                input=input_,
+                output=output,
+            )
+
+    @pytest.mark.parametrize(
+        "tsq_check,messages,available_tools,expected_error",
+        [
+            (True, None, None, True),
+            (True, [], None, True),
+            (True, None, [], True),
+            (True, [], [], True),
+            (True, _test_llm_messages, None, True),
+            (True, _test_llm_messages, [], True),
+            (True, None, _test_available_tools, True),
+            (True, [], _test_available_tools, True),
+            (True, _test_llm_messages, _test_available_tools, False),
+            (False, None, None, False),
+            (False, [], None, False),
+            (False, None, [], False),
+            (False, [], [], False),
+            (False, _test_llm_messages, None, False),
+            (False, _test_llm_messages, [], False),
+            (False, None, _test_available_tools, False),
+            (False, [], _test_available_tools, False),
+            (False, _test_llm_messages, _test_available_tools, False),
+        ],
+    )
+    def test_validate_tsq_requirements(
+        self,
+        tsq_check,
+        messages,
+        available_tools,
+        expected_error,
+    ):
+        with pytest.raises(ValueError) if expected_error else contextlib.nullcontext():
+            EvaluationRequest(
+                input="input",  # To pass the messages-input-output validation
+                messages=messages,
+                available_tools=available_tools,
+                tool_selection_quality_check=tsq_check,
+            )