@@ -253,6 +253,37 @@ async def retrieve_simple_response(
253253 Returns:
254254 The LLM-generated response text.
255255
256+ Raises:
257+ APIConnectionError: If the Llama Stack service is unreachable.
258+ HTTPException: 503 if no default model is configured.
259+ """
260+ resolved_model_id = model_id or await _get_default_model_id ()
261+ response = await _call_llm (question , instructions , tools , resolved_model_id )
262+ extract_token_usage (response .usage , resolved_model_id )
263+ return extract_text_from_response_items (response .output )
264+
265+
266+ async def _call_llm (
267+ question : str ,
268+ instructions : str ,
269+ tools : Optional [list [Any ]] = None ,
270+ model_id : Optional [str ] = None ,
271+ ) -> OpenAIResponseObject :
272+ """Call the LLM via the Responses API and return the full response object.
273+
274+ This is a transport-only function: it calls the LLM and returns the raw
275+ response. Callers are responsible for token usage extraction and metrics.
276+
277+ Args:
278+ question: The combined user input (question + context).
279+ instructions: System instructions for the LLM.
280+ tools: Optional list of MCP tool definitions for the LLM.
281+ model_id: Fully qualified model identifier in provider/model format.
282+ When omitted, the configured default model is used.
283+
284+ Returns:
285+ The full OpenAIResponseObject from the LLM.
286+
256287 Raises:
257288 APIConnectionError: If the Llama Stack service is unreachable.
258289 HTTPException: 503 if no default model is configured.
@@ -269,10 +300,7 @@ async def retrieve_simple_response(
269300 stream = False ,
270301 store = False ,
271302 )
272- response = cast (OpenAIResponseObject , response )
273- extract_token_usage (response .usage , resolved_model_id )
274-
275- return extract_text_from_response_items (response .output )
303+ return cast (OpenAIResponseObject , response )
276304
277305
278306def _get_cla_version (request : Request ) -> str :
@@ -355,6 +383,79 @@ def _record_inference_failure( # pylint: disable=too-many-arguments,too-many-po
355383 return inference_time
356384
357385
def _is_verbose_enabled(infer_request: RlsapiV1InferRequest) -> bool:
    """Check whether verbose metadata should be included in the response.

    Verbose mode requires dual opt-in: the server configuration must allow it
    via ``allow_verbose_infer``, and the client must request it via the
    ``include_metadata`` field.

    Args:
        infer_request: The inference request to check.

    Returns:
        True if both server config and client request enable verbose mode,
        False otherwise. The result is always a real ``bool``.
    """
    customization = configuration.customization
    # ``and`` yields its last evaluated operand rather than a bool, so the
    # chain is coerced explicitly to honor the declared return type.
    return bool(
        customization is not None
        and customization.allow_verbose_infer
        and infer_request.include_metadata
    )
404+
405+
def _build_infer_response(
    response_text: str,
    request_id: str,
    response: Optional[OpenAIResponseObject],
    model_id: str,
) -> RlsapiV1InferResponse:
    """Assemble the inference response, optionally carrying verbose metadata.

    With ``response`` set to None the result contains only the text and the
    request id; every metadata field is explicitly None. With a response
    object, ``build_turn_summary`` is run over it and its tool calls, tool
    results, RAG chunks, referenced documents and token counts are copied
    into the response data.

    Args:
        response_text: The LLM-generated response text.
        request_id: Unique identifier for the request.
        response: The full LLM response object, or None to produce the
            minimal (non-verbose) response.
        model_id: The model identifier used for inference; forwarded to
            ``build_turn_summary``.

    Returns:
        The assembled RlsapiV1InferResponse.
    """
    # Minimal path first: no response object means no metadata to extract.
    if response is None:
        minimal_data = RlsapiV1InferData(
            text=response_text,
            request_id=request_id,
            tool_calls=None,
            tool_results=None,
            rag_chunks=None,
            referenced_documents=None,
            input_tokens=None,
            output_tokens=None,
        )
        return RlsapiV1InferResponse(data=minimal_data)

    # Verbose path: summarize the raw response and surface its metadata.
    summary = build_turn_summary(
        response, model_id, vector_store_ids=None, rag_id_mapping=None
    )
    verbose_data = RlsapiV1InferData(
        text=response_text,
        request_id=request_id,
        tool_calls=summary.tool_calls,
        tool_results=summary.tool_results,
        rag_chunks=summary.rag_chunks,
        referenced_documents=summary.referenced_documents,
        input_tokens=summary.token_usage.input_tokens,
        output_tokens=summary.token_usage.output_tokens,
    )
    return RlsapiV1InferResponse(data=verbose_data)
457+
458+
358459def _map_inference_error_to_http_exception ( # pylint: disable=too-many-return-statements
359460 error : Exception , model_id : str , request_id : str
360461) -> Optional [HTTPException ]:
@@ -449,38 +550,18 @@ async def infer_endpoint( # pylint: disable=R0914
449550 )
450551
451552 start_time = time .monotonic ()
452-
453- # Check if verbose metadata should be returned
454- verbose_enabled = (
455- configuration .customization is not None
456- and configuration .customization .allow_verbose_infer
457- and infer_request .include_metadata
458- )
553+ verbose_enabled = _is_verbose_enabled (infer_request )
459554
460555 response = None
461556 try :
462557 instructions = _build_instructions (infer_request .context .systeminfo )
463-
464- # For verbose mode, retrieve the full response object instead of just text
465- if verbose_enabled :
466- client = AsyncLlamaStackClientHolder ().get_client ()
467- response = await client .responses .create (
468- input = input_source ,
469- model = model_id ,
470- instructions = instructions ,
471- tools = mcp_tools or [],
472- stream = False ,
473- store = False ,
474- )
475- response = cast (OpenAIResponseObject , response )
476- response_text = extract_text_from_response_items (response .output )
477- else :
478- response_text = await retrieve_simple_response (
479- input_source ,
480- instructions ,
481- tools = cast (list [Any ], mcp_tools ),
482- model_id = model_id ,
483- )
558+ response = await _call_llm (
559+ input_source ,
560+ instructions ,
561+ tools = cast (list [Any ], mcp_tools ),
562+ model_id = model_id ,
563+ )
564+ response_text = extract_text_from_response_items (response .output )
484565 inference_time = time .monotonic () - start_time
485566 except _INFER_HANDLED_EXCEPTIONS as error :
486567 if verbose_enabled and response is not None :
@@ -520,36 +601,9 @@ async def infer_endpoint( # pylint: disable=R0914
520601
521602 logger .info ("Completed rlsapi v1 /infer request %s" , request_id )
522603
523- # Build response with optional extended metadata
524- if verbose_enabled and response is not None :
525- # Extract metadata from full response object
526- turn_summary = build_turn_summary (
527- response , model_id , vector_store_ids = None , rag_id_mapping = None
528- )
529-
530- return RlsapiV1InferResponse (
531- data = RlsapiV1InferData (
532- text = response_text ,
533- request_id = request_id ,
534- tool_calls = turn_summary .tool_calls ,
535- tool_results = turn_summary .tool_results ,
536- rag_chunks = turn_summary .rag_chunks ,
537- referenced_documents = turn_summary .referenced_documents ,
538- input_tokens = turn_summary .token_usage .input_tokens ,
539- output_tokens = turn_summary .token_usage .output_tokens ,
540- )
541- )
542-
543- # Standard minimal response
544- return RlsapiV1InferResponse (
545- data = RlsapiV1InferData (
546- text = response_text ,
547- request_id = request_id ,
548- tool_calls = None ,
549- tool_results = None ,
550- rag_chunks = None ,
551- referenced_documents = None ,
552- input_tokens = None ,
553- output_tokens = None ,
554- )
604+ return _build_infer_response (
605+ response_text ,
606+ request_id ,
607+ response if verbose_enabled else None ,
608+ model_id ,
555609 )
0 commit comments