lightspeed-core
diff --git a/src/app/endpoints/responses.py b/src/app/endpoints/responses.py
Lines changed: 9 additions & 7 deletions
diff --git a/src/app/main.py b/src/app/main.py
Lines changed: 1 addition & 1 deletion
diff --git a/src/utils/responses.py b/src/utils/responses.py
Lines changed: 91 additions & 19 deletions
diff --git a/tests/e2e/features/responses.feature b/tests/e2e/features/responses.feature
Lines changed: 126 additions & 3 deletions
@@ -234,13 +234,14 @@ async def responses_endpoint_handler(
     )
 
     # Build RAG context from Inline RAG sources
-    inline_rag_context = await build_rag_context(
-        client,
-        moderation_result.decision,
-        input_text,
-        vector_store_ids,
-        responses_request.solr,
-    )
+    # inline_rag_context = await build_rag_context(
+    #     client,
+    #     moderation_result.decision,
+    #     input_text,
+    #     vector_store_ids,
+    #     responses_request.solr,
+    # )
+    inline_rag_context = RAGContext()
     if moderation_result.decision == "passed":
         responses_request.input = append_inline_rag_context_to_responses_input(
             responses_request.input, inline_rag_context.context_text
@@ -654,6 +655,7 @@ async def handle_non_streaming_response(
             )
     else:
         try:
+            print("API Params: ", api_params.model_dump(exclude_none=True))
             api_response = cast(
                 OpenAIResponseObject,
                 await client.responses.create(
 
@@ -229,5 +229,5 @@ async def send_wrapper(message: Message) -> None:
 # RestApiMetricsMiddleware (registered last) is outermost.  This ensures metrics
 # always observe a status code — including 500s synthesised by the exception
 # middleware — rather than seeing a raw exception with no response.
-app.add_middleware(GlobalExceptionMiddleware)
+app.add_middleware(GlobalExceptionMiddleware)
 app.add_middleware(RestApiMetricsMiddleware)
@@ -26,6 +26,9 @@
 from llama_stack_api.openai_responses import (
     OpenAIResponseInputToolChoice as ToolChoice,
 )
+from llama_stack_api.openai_responses import (
+    OpenAIResponseInputToolChoiceAllowedTools as AllowedTools,
+)
 from llama_stack_api.openai_responses import (
     OpenAIResponseInputToolChoiceMode as ToolChoiceMode,
 )
@@ -417,6 +420,55 @@ def extract_vector_store_ids_from_tools(
     return vector_store_ids
 
 
+def _tool_matches_allowed_entry(tool: InputTool, entry: dict[str, str]) -> bool:
+    """Return True if the tool matches every key/value pair in the entry.
+
+    Each key in ``entry`` (e.g. ``type``, ``server_label``, ``name``) is looked
+    up as an attribute on ``tool``. An empty entry matches any tool.
+
+    Parameters:
+        tool: A configured input tool.
+        entry: One allowlist entry from ``allowed_tools.tools``.
+
+    Returns:
+        False if any key is missing on the tool, is None, or differs from the
+        entry value both directly and after ``str()`` conversion; else True.
+    """
+    for key, value in entry.items():
+        if not hasattr(tool, key):  # entry constrains an attribute the tool lacks
+            return False
+        attr = getattr(tool, key)
+        if attr is None:  # an unset attribute never satisfies a constraint
+            return False
+        if attr != value and str(attr) != value:  # direct or str() equality
+            return False
+    return True
+
+
+def filter_tools_by_allowed_entries(
+    tools: list[InputTool],
+    allowed_entries: list[dict[str, str]],
+) -> list[InputTool]:
+    """Return the tools that match at least one allowlist entry.
+
+    An empty ``allowed_entries`` yields an empty list (strict allowlist).
+
+    Parameters:
+        tools: Tools to filter (typically after translation / preparation).
+        allowed_entries: Entries from ``OpenAIResponseInputToolChoiceAllowedTools.tools``.
+
+    Returns:
+        A new list with the matching tools, in their original order.
+    """
+    if not allowed_entries:
+        return []
+    return [
+        t
+        for t in tools
+        if any(_tool_matches_allowed_entry(t, e) for e in allowed_entries)
+    ]
+
+
 def resolve_vector_store_ids(
     vector_store_ids: list[str], byok_rags: list[ByokRag]
 ) -> list[str]:
@@ -1332,10 +1384,19 @@ async def resolve_tool_choice(
 ) -> tuple[Optional[list[InputTool]], Optional[ToolChoice], Optional[list[str]]]:
     """Resolve tools and tool_choice for the Responses API.
 
-    If the request includes tools, uses them as-is and derives vector_store_ids
-    from tool configs; otherwise loads tools via prepare_tools (using all
-    configured vector stores) and honors tool_choice "none" via the no_tools
-    flag. When no tools end up configured, tool_choice is cleared to None.
+    If ``tool_choice`` is ``none``, always returns ``(None, None, None)`` — no
+    tools are sent to Llama Stack, even when the request included explicit
+    ``tools`` (e.g. file_search).
+
+    If ``tool_choice`` is ``allowed_tools``, it is rewritten for downstream
+    services: tools are filtered to those matching the allowlist entries, and
+    ``tool_choice`` becomes ``auto`` or ``required`` per the allowlist ``mode``.
+
+    If the request includes tools and tool_choice is not ``none``, uses them
+    (after allowlist filtering) and derives vector_store_ids from the prepared
+    tools; otherwise loads tools via prepare_tools (using all configured vector
+    stores), then applies allowlist filtering when present. When no tools end
+    up configured, tool_choice is cleared to None.
 
     Args:
         tools: Tools from the request, or None to use LCORE-configured tools.
@@ -1349,35 +1410,46 @@ async def resolve_tool_choice(
         prepared_tools is the list of tools to use, or None if none configured;
         prepared_tool_choice is the resolved tool choice, or None when there
         are no tools; vector_store_ids is extracted from tools (in user-facing format)
-        when provided, otherwise None.
+        when provided, otherwise None (also None when tool_choice is ``none``).
     """
+    if isinstance(tool_choice, ToolChoiceMode) and tool_choice == ToolChoiceMode.none:
+        return None, None, None
+
+    allowed_filters: Optional[list[dict[str, str]]] = None
+    if isinstance(tool_choice, AllowedTools):
+        allowed_filters = tool_choice.tools
+        tool_choice = ToolChoiceMode(tool_choice.mode)
+
     prepared_tools: Optional[list[InputTool]] = None
-    client = AsyncLlamaStackClientHolder().get_client()
     if tools:  # explicitly specified in request
-        # Per-request override of vector stores (user-facing rag_ids)
-        vector_store_ids = extract_vector_store_ids_from_tools(tools)
-        # Translate user-facing rag_ids to llama-stack vector_store_ids in each file_search tool
         byok_rags = configuration.configuration.byok_rag
         prepared_tools = translate_tools_vector_store_ids(tools, byok_rags)
+        if allowed_filters is not None:
+            prepared_tools = filter_tools_by_allowed_entries(
+                prepared_tools, allowed_filters
+            )
+        if not prepared_tools:
+            return None, None, None
+        vector_store_ids_list = extract_vector_store_ids_from_tools(prepared_tools)
+        vector_store_ids = vector_store_ids_list if vector_store_ids_list else None
         prepared_tool_choice = tool_choice or ToolChoiceMode.auto
     else:
-        # Vector stores were not overwritten in request, use all configured vector stores
         vector_store_ids = None
-        # Get all tools configured in LCORE (returns None or non-empty list)
-        no_tools = (
-            isinstance(tool_choice, ToolChoiceMode)
-            and tool_choice == ToolChoiceMode.none
-        )
-        # Vector stores are prepared in llama-stack format
+        client = AsyncLlamaStackClientHolder().get_client()
         prepared_tools = await prepare_tools(
             client=client,
-            vector_store_ids=vector_store_ids,  # allow all configured vector stores
-            no_tools=no_tools,
+            vector_store_ids=vector_store_ids,
+            no_tools=False,
             token=token,
             mcp_headers=mcp_headers,
             request_headers=request_headers,
         )
-        # If there are no tools, tool_choice cannot be set at all - LLS implicit behavior
+        if allowed_filters is not None and prepared_tools:
+            prepared_tools = filter_tools_by_allowed_entries(
+                prepared_tools, allowed_filters
+            )
+        if not prepared_tools:
+            prepared_tools = None
         prepared_tool_choice = tool_choice if prepared_tools else None
 
     return prepared_tools, prepared_tool_choice, vector_store_ids
@@ -5,7 +5,7 @@ Feature: Responses endpoint API tests
     Given The service is started locally
       And REST API service prefix is /v1
 
-  Scenario: Check if responses endpoint returns 200 for minimal request
+  Scenario: Check if responses endpoint answers a minimal question
     Given The system is in default state
       And I set the Authorization header to Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva
     When I use "responses" to ask question with authorization header
@@ -14,11 +14,134 @@ Feature: Responses endpoint API tests
     """
     Then The status code of the response is 200
 
-  Scenario: Check if responses endpoint returns 200 for minimal streaming request
+  Scenario: Check if responses endpoint streams a minimal answer
     Given The system is in default state
       And I set the Authorization header to Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva
     When I use "responses" to ask question with authorization header
     """
     {"input": "Say hello", "model": "{PROVIDER}/{MODEL}", "stream": true}
     """
-    Then The status code of the response is 200
+    Then The status code of the response is 200
+
+  Scenario: Check if responses endpoint with tool_choice none answers knowledge question without file search usage
+    Given The system is in default state
+      And I set the Authorization header to Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva
+      And I capture the current token metrics
+    When I use "responses" to ask question with authorization header
+    """
+    {
+      "input": "What is the title of the article from Paul?",
+      "model": "{PROVIDER}/{MODEL}",
+      "stream": false,
+      "instructions": "You are an assistant. You MUST use the file_search tool to answer. Answer in lowercase.",
+      "tool_choice": "none"
+    }
+    """
+    Then The status code of the response is 200
+      And The responses output should not include any tool invocation item types
+      And The token metrics should have increased
+
+  Scenario: Check if responses endpoint with tool_choice auto answers a knowledge question using file search
+    Given The system is in default state
+      And I set the Authorization header to Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva
+      And I capture the current token metrics
+    When I use "responses" to ask question with authorization header
+    """
+    {
+      "input": "What is the title of the article from Paul?",
+      "model": "{PROVIDER}/{MODEL}",
+      "stream": false,
+      "instructions": "You are an assistant. You MUST use the file_search tool to answer. Answer in lowercase.",
+      "tool_choice": "auto"
+    }
+    """
+    Then The status code of the response is 200
+      And The responses output should include an item with type "file_search_call"
+      And The responses output_text should contain following fragments
+        | Fragments in LLM response |
+        | great work                |
+      And The token metrics should have increased
+
+  Scenario: Check if responses endpoint with tool_choice required still invokes document search for a basic question
+    Given The system is in default state
+      And I set the Authorization header to Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva
+      And I capture the current token metrics
+    When I use "responses" to ask question with authorization header
+    """
+    {
+      "input": "Hello World!",
+      "model": "{PROVIDER}/{MODEL}",
+      "stream": false,
+      "tool_choice": "required"
+    }
+    """
+    Then The status code of the response is 200
+      And The responses output should include an item with type "file_search_call"
+      And The token metrics should have increased
+
+  Scenario: Check if responses endpoint with file search as the chosen tool answers using file search
+    Given The system is in default state
+      And I set the Authorization header to Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva
+      And I capture the current token metrics
+    When I use "responses" to ask question with authorization header
+    """
+    {
+      "input": "What is the title of the article from Paul?",
+      "model": "{PROVIDER}/{MODEL}",
+      "stream": false,
+      "instructions": "You are an assistant. You MUST use the file_search tool to answer. Answer in lowercase.",
+      "tool_choice": {"type": "file_search"}
+    }
+    """
+    Then The status code of the response is 200
+      And The responses output should include an item with type "file_search_call"
+      And The responses output_text should contain following fragments
+        | Fragments in LLM response |
+        | great work                |
+      And The token metrics should have increased
+
+  Scenario: Check if responses endpoint with allowed tools in automatic mode answers knowledge question using file search
+    Given The system is in default state
+      And I set the Authorization header to Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva
+      And I capture the current token metrics
+    When I use "responses" to ask question with authorization header
+    """
+    {
+      "input": "What is the title of the article from Paul?",
+      "model": "{PROVIDER}/{MODEL}",
+      "stream": false,
+      "instructions": "You are an assistant. You MUST use the file_search tool to answer. Answer in lowercase.",
+      "tool_choice": {
+        "type": "allowed_tools",
+        "mode": "auto",
+        "tools": [{"type": "file_search"}]
+      }
+    }
+    """
+    Then The status code of the response is 200
+      And The responses output should include an item with type "file_search_call"
+      And The responses output_text should contain following fragments
+        | Fragments in LLM response |
+        | great work                |
+      And The token metrics should have increased
+
+  Scenario: Check if responses endpoint with allowed tools in required mode invokes file search for a basic question
+    Given The system is in default state
+      And I set the Authorization header to Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva
+      And I capture the current token metrics
+    When I use "responses" to ask question with authorization header
+    """
+    {
+      "input": "Hello world!",
+      "model": "{PROVIDER}/{MODEL}",
+      "stream": false,
+      "tool_choice": {
+        "type": "allowed_tools",
+        "mode": "required",
+        "tools": [{"type": "file_search"}]
+      }
+    }
+    """
+    Then The status code of the response is 200
+      And The responses output should include an item with type "file_search_call"
+      And The token metrics should have increased