Merge pull request #1414 from asimurka/e2e_tests_responses_tools

tisnik · web-flow · commit 18b9a10ecb3b · 2026-03-31T18:03:34.000+02:00
LCORE-1270: E2e tests responses tools
diff --git a/tests/e2e/features/responses.feature b/tests/e2e/features/responses.feature
@@ -427,3 +427,172 @@ Feature: Responses endpoint API tests
     """
      Then The status code of the response is 503
       And The body of the response contains Unable to connect to Llama Stack
+
+
+  Scenario: Responses endpoint with tool_choice none answers knowledge question without file search usage
+    Given The system is in default state
+      And I set the Authorization header to Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva
+      And I capture the current token metrics
+    When I use "responses" to ask question with authorization header
+    """
+    {
+      "input": "What is the title of the article from Paul?",
+      "model": "{PROVIDER}/{MODEL}",
+      "stream": false,
+      "instructions": "You are an assistant. You MUST use the file_search tool to answer. Answer in lowercase.",
+      "tool_choice": "none"
+    }
+    """
+    Then The status code of the response is 200
+      And The responses output should not include any tool invocation item types
+      And The token metrics should have increased
+
+  Scenario: Check if responses endpoint with tool_choice auto answers a knowledge question using file search
+    Given The system is in default state
+      And I set the Authorization header to Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva
+      And I capture the current token metrics
+    When I use "responses" to ask question with authorization header
+    """
+    {
+      "input": "What is the title of the article from Paul?",
+      "model": "{PROVIDER}/{MODEL}",
+      "stream": false,
+      "instructions": "You are an assistant. You MUST use the file_search tool to answer. Answer in lowercase.",
+      "tool_choice": "auto"
+    }
+    """
+    Then The status code of the response is 200
+      And The responses output should include an item with type "file_search_call"
+      And The responses output_text should contain following fragments
+        | Fragments in LLM response |
+        | great work                |
+      And The token metrics should have increased
+
+  Scenario: Check if responses endpoint with tool_choice required still invokes document search for a basic question
+    Given The system is in default state
+      And I set the Authorization header to Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva
+      And I capture the current token metrics
+    When I use "responses" to ask question with authorization header
+    """
+    {
+      "input": "Hello World!",
+      "model": "{PROVIDER}/{MODEL}",
+      "stream": false,
+      "tool_choice": "required"
+    }
+    """
+    Then The status code of the response is 200
+      And The responses output should include an item with type "file_search_call"
+      And The token metrics should have increased
+
+  Scenario: Check if responses endpoint with file search as the chosen tool answers using file search
+    Given The system is in default state
+      And I set the Authorization header to Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva
+      And I capture the current token metrics
+    When I use "responses" to ask question with authorization header
+    """
+    {
+      "input": "What is the title of the article from Paul?",
+      "model": "{PROVIDER}/{MODEL}",
+      "stream": false,
+      "instructions": "You are an assistant. You MUST use the file_search tool to answer. Answer in lowercase.",
+      "tool_choice": {"type": "file_search"}
+    }
+    """
+    Then The status code of the response is 200
+      And The responses output should include an item with type "file_search_call"
+      And The responses output_text should contain following fragments
+        | Fragments in LLM response |
+        | great work                |
+      And The token metrics should have increased
+
+  Scenario: Check if responses endpoint with allowed tools in automatic mode answers knowledge question using file search
+    Given The system is in default state
+      And I set the Authorization header to Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva
+      And I capture the current token metrics
+    When I use "responses" to ask question with authorization header
+    """
+    {
+      "input": "What is the title of the article from Paul?",
+      "model": "{PROVIDER}/{MODEL}",
+      "stream": false,
+      "instructions": "You are an assistant. You MUST use the file_search tool to answer. Answer in lowercase.",
+      "tool_choice": {
+        "type": "allowed_tools",
+        "mode": "auto",
+        "tools": [{"type": "file_search"}]
+      }
+    }
+    """
+    Then The status code of the response is 200
+      And The responses output should include an item with type "file_search_call"
+      And The responses output_text should contain following fragments
+        | Fragments in LLM response |
+        | great work                |
+      And The token metrics should have increased
+
+  Scenario: Check if responses endpoint with allowed tools in required mode invokes file search for a basic question
+    Given The system is in default state
+      And I set the Authorization header to Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva
+      And I capture the current token metrics
+    When I use "responses" to ask question with authorization header
+    """
+    {
+      "input": "Hello world!",
+      "model": "{PROVIDER}/{MODEL}",
+      "stream": false,
+      "tool_choice": {
+        "type": "allowed_tools",
+        "mode": "required",
+        "tools": [{"type": "file_search"}]
+      }
+    }
+    """
+    Then The status code of the response is 200
+      And The responses output should include an item with type "file_search_call"
+      And The token metrics should have increased
+
+  Scenario: Allowed tools auto mode with only MCP in allowlist does not use file search for knowledge question
+    Given The system is in default state
+      And I set the Authorization header to Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva
+      And I capture the current token metrics
+    When I use "responses" to ask question with authorization header
+    """
+    {
+      "input": "What is the title of the article from Paul?",
+      "model": "{PROVIDER}/{MODEL}",
+      "stream": false,
+      "instructions": "You are an assistant. Answer in lowercase.",
+      "tool_choice": {
+        "type": "allowed_tools",
+        "mode": "auto",
+        "tools": [{"type": "mcp"}]
+      }
+    }
+    """
+    Then The status code of the response is 200
+      And The responses output should not include an item with type "file_search_call"
+      And The token metrics should have increased
+
+  Scenario: Required allowed_tools with invalid filter returns no tool invocations on knowledge question
+    Given The system is in default state
+      And I set the Authorization header to Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva
+      And I capture the current token metrics
+    When I use "responses" to ask question with authorization header
+    """
+    {
+      "input": "What is the title of the article from Paul?",
+      "model": "{PROVIDER}/{MODEL}",
+      "stream": false,
+      "instructions": "You are an assistant. You MUST use the file_search tool to answer. Answer in lowercase.",
+      "tools": [],
+      "tool_choice": {
+        "type": "allowed_tools",
+        "mode": "required",
+        "tools": [{"non-existing": "tool"}]
+      }
+    }
+    """
+    Then The status code of the response is 200
+      And The responses output should not include any tool invocation item types
+      And The token metrics should have increased
diff --git a/tests/e2e/features/steps/llm_query_response.py b/tests/e2e/features/steps/llm_query_response.py
@@ -2,6 +2,7 @@
 
 import json
 import os
+from typing import Any, cast
 
 import requests
 from behave import step, then  # pyright: ignore[reportAttributeAccessIssue]
@@ -12,6 +13,75 @@
 # Longer timeout for Prow/OpenShift with CPU-based vLLM
 DEFAULT_LLM_TIMEOUT = 180 if os.getenv("RUNNING_PROW") else 60
 
+# Responses API ``output`` item types that indicate tool listing or invocation.
+_RESPONSE_TOOL_OUTPUT_ITEM_TYPES = frozenset(
+    {
+        "file_search_call",
+        "mcp_call",
+        "mcp_list_tools",
+        "function_call",
+        "web_search_call",
+    }
+)
+
+
+def _collect_output_item_types(response_body: dict[str, Any]) -> list[str]:
+    """Collect ``type`` from each top-level ``output`` item in a Responses API JSON body."""
+    output = cast(list[dict[str, Any]], response_body["output"])
+    return [item["type"] for item in output]
+
+
+@then("The responses output should not include any tool invocation item types")
+def responses_output_should_not_include_tool_items(context: Context) -> None:
+    """Assert no tool-related items appear in the Responses JSON ``output`` array."""
+    assert context.response is not None, "Request needs to be performed first"
+    response_json = cast(dict[str, Any], context.response.json())
+    types_found = _collect_output_item_types(response_json)
+    bad = [t for t in types_found if t in _RESPONSE_TOOL_OUTPUT_ITEM_TYPES]
+    assert not bad, (
+        "Expected no tool-related output items, but found types "
+        f"{bad!r} among all output types {types_found!r}"
+    )
+
+
+@then('The responses output should include an item with type "{item_type}"')
+def responses_output_should_include_item_type(context: Context, item_type: str) -> None:
+    """Assert at least one ``output`` item has the given ``type``."""
+    assert context.response is not None, "Request needs to be performed first"
+    response_json = cast(dict[str, Any], context.response.json())
+    types_found = _collect_output_item_types(response_json)
+    assert item_type in types_found, (
+        f"Expected output item type {item_type!r} not found; "
+        f"had types {types_found!r}"
+    )
+
+
+@then('The responses output should not include an item with type "{item_type}"')
+def responses_output_should_not_include_item_type(
+    context: Context, item_type: str
+) -> None:
+    """Assert no ``output`` item has the given ``type``."""
+    assert context.response is not None, "Request needs to be performed first"
+    response_json = cast(dict[str, Any], context.response.json())
+    types_found = _collect_output_item_types(response_json)
+    assert item_type not in types_found, (
+        f"Expected output item type {item_type!r} to be absent; "
+        f"but found types {types_found!r}"
+    )
+
+
+@then("The responses output should include an item with one of these types")
+def responses_output_should_include_one_of_types(context: Context) -> None:
+    """Assert at least one output item type matches a row in the scenario table."""
+    assert context.response is not None, "Request needs to be performed first"
+    assert context.table is not None, "Table with column 'item type' is required"
+    allowed = [row["item type"].strip() for row in context.table]
+    response_json = cast(dict[str, Any], context.response.json())
+    types_found = _collect_output_item_types(response_json)
+    assert any(
+        a in types_found for a in allowed
+    ), f"Expected at least one of {allowed!r} in output types {types_found!r}"
+
 
 @step("I wait for the response to be completed")
 def wait_for_complete_response(context: Context) -> None:
@@ -163,6 +233,28 @@ def check_referenced_documents_present(context: Context) -> None:
     ), "referenced_documents is empty — no documents were referenced"
 
 
+@then("The responses output_text should contain following fragments")
+def check_fragments_in_responses_output_text(context: Context) -> None:
+    """Check that fragments from the scenario table appear in JSON ``output_text``.
+
+    Used for POST ``/v1/responses``; the query endpoint uses the ``response`` field instead.
+    """
+    assert context.response is not None, "Request needs to be performed first"
+    response_json = context.response.json()
+    assert (
+        "output_text" in response_json
+    ), f"Expected 'output_text' in JSON body, got keys: {list(response_json.keys())}"
+    output_text = response_json["output_text"]
+
+    assert context.table is not None, "Fragments are not specified in table"
+
+    for fragment in context.table:
+        expected = fragment["Fragments in LLM response"]
+        assert (
+            expected in output_text
+        ), f"Fragment '{expected}' not found in output_text: '{output_text}'"
+
+
 @then("The response should contain following fragments")
 def check_fragments_in_response(context: Context) -> None:
     """Check that all specified fragments are present in the LLM response.