Skip to content

Commit 23cc2cc

Browse files
committed
refactor: reduce infer_endpoint cyclomatic complexity from C(13) to B(7)
Extract helpers from infer_endpoint to eliminate the verbose mode branching that inflated its complexity:

- _call_llm: transport-only LLM call (no metrics side effects)
- _is_verbose_enabled: the 3-way config+request check
- _build_infer_response: verbose vs minimal response construction, keyed on response object presence rather than a boolean flag

retrieve_simple_response now delegates to _call_llm internally and handles its own token usage extraction. The verbose failure path in infer_endpoint is preserved: if the LLM call succeeded but later processing fails, token usage is still recorded.

No behavior changes, pure refactor.

Signed-off-by: Major Hayden <major@redhat.com>
1 parent 4961334 commit 23cc2cc

1 file changed

Lines changed: 118 additions & 64 deletions

File tree

src/app/endpoints/rlsapi_v1.py

Lines changed: 118 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,37 @@ async def retrieve_simple_response(
253253
Returns:
254254
The LLM-generated response text.
255255
256+
Raises:
257+
APIConnectionError: If the Llama Stack service is unreachable.
258+
HTTPException: 503 if no default model is configured.
259+
"""
260+
resolved_model_id = model_id or await _get_default_model_id()
261+
response = await _call_llm(question, instructions, tools, resolved_model_id)
262+
extract_token_usage(response.usage, resolved_model_id)
263+
return extract_text_from_response_items(response.output)
264+
265+
266+
async def _call_llm(
267+
question: str,
268+
instructions: str,
269+
tools: Optional[list[Any]] = None,
270+
model_id: Optional[str] = None,
271+
) -> OpenAIResponseObject:
272+
"""Call the LLM via the Responses API and return the full response object.
273+
274+
This is a transport-only function: it calls the LLM and returns the raw
275+
response. Callers are responsible for token usage extraction and metrics.
276+
277+
Args:
278+
question: The combined user input (question + context).
279+
instructions: System instructions for the LLM.
280+
tools: Optional list of MCP tool definitions for the LLM.
281+
model_id: Fully qualified model identifier in provider/model format.
282+
When omitted, the configured default model is used.
283+
284+
Returns:
285+
The full OpenAIResponseObject from the LLM.
286+
256287
Raises:
257288
APIConnectionError: If the Llama Stack service is unreachable.
258289
HTTPException: 503 if no default model is configured.
@@ -269,10 +300,7 @@ async def retrieve_simple_response(
269300
stream=False,
270301
store=False,
271302
)
272-
response = cast(OpenAIResponseObject, response)
273-
extract_token_usage(response.usage, resolved_model_id)
274-
275-
return extract_text_from_response_items(response.output)
303+
return cast(OpenAIResponseObject, response)
276304

277305

278306
def _get_cla_version(request: Request) -> str:
@@ -355,6 +383,79 @@ def _record_inference_failure( # pylint: disable=too-many-arguments,too-many-po
355383
return inference_time
356384

357385

386+
def _is_verbose_enabled(infer_request: RlsapiV1InferRequest) -> bool:
387+
"""Check whether verbose metadata should be included in the response.
388+
389+
Verbose mode requires dual opt-in: the server configuration must allow it
390+
via ``allow_verbose_infer``, and the client must request it via the
391+
``include_metadata`` field.
392+
393+
Args:
394+
infer_request: The inference request to check.
395+
396+
Returns:
397+
True if both server config and client request enable verbose mode.
398+
"""
399+
return (
400+
configuration.customization is not None
401+
and configuration.customization.allow_verbose_infer
402+
and infer_request.include_metadata
403+
)
404+
405+
406+
def _build_infer_response(
407+
response_text: str,
408+
request_id: str,
409+
response: Optional[OpenAIResponseObject],
410+
model_id: str,
411+
) -> RlsapiV1InferResponse:
412+
"""Build the final inference response, with optional verbose metadata.
413+
414+
When ``response`` is provided, verbose metadata (tool calls, RAG chunks,
415+
token counts) is extracted via ``build_turn_summary`` and included.
416+
When ``response`` is None, a minimal response with only text is returned.
417+
418+
Args:
419+
response_text: The LLM-generated response text.
420+
request_id: Unique identifier for the request.
421+
response: The full LLM response object. Pass None for non-verbose
422+
responses; pass the object to include extended metadata.
423+
model_id: The model identifier used for inference.
424+
425+
Returns:
426+
The assembled RlsapiV1InferResponse.
427+
"""
428+
if response is not None:
429+
turn_summary = build_turn_summary(
430+
response, model_id, vector_store_ids=None, rag_id_mapping=None
431+
)
432+
return RlsapiV1InferResponse(
433+
data=RlsapiV1InferData(
434+
text=response_text,
435+
request_id=request_id,
436+
tool_calls=turn_summary.tool_calls,
437+
tool_results=turn_summary.tool_results,
438+
rag_chunks=turn_summary.rag_chunks,
439+
referenced_documents=turn_summary.referenced_documents,
440+
input_tokens=turn_summary.token_usage.input_tokens,
441+
output_tokens=turn_summary.token_usage.output_tokens,
442+
)
443+
)
444+
445+
return RlsapiV1InferResponse(
446+
data=RlsapiV1InferData(
447+
text=response_text,
448+
request_id=request_id,
449+
tool_calls=None,
450+
tool_results=None,
451+
rag_chunks=None,
452+
referenced_documents=None,
453+
input_tokens=None,
454+
output_tokens=None,
455+
)
456+
)
457+
458+
358459
def _map_inference_error_to_http_exception( # pylint: disable=too-many-return-statements
359460
error: Exception, model_id: str, request_id: str
360461
) -> Optional[HTTPException]:
@@ -449,38 +550,18 @@ async def infer_endpoint( # pylint: disable=R0914
449550
)
450551

451552
start_time = time.monotonic()
452-
453-
# Check if verbose metadata should be returned
454-
verbose_enabled = (
455-
configuration.customization is not None
456-
and configuration.customization.allow_verbose_infer
457-
and infer_request.include_metadata
458-
)
553+
verbose_enabled = _is_verbose_enabled(infer_request)
459554

460555
response = None
461556
try:
462557
instructions = _build_instructions(infer_request.context.systeminfo)
463-
464-
# For verbose mode, retrieve the full response object instead of just text
465-
if verbose_enabled:
466-
client = AsyncLlamaStackClientHolder().get_client()
467-
response = await client.responses.create(
468-
input=input_source,
469-
model=model_id,
470-
instructions=instructions,
471-
tools=mcp_tools or [],
472-
stream=False,
473-
store=False,
474-
)
475-
response = cast(OpenAIResponseObject, response)
476-
response_text = extract_text_from_response_items(response.output)
477-
else:
478-
response_text = await retrieve_simple_response(
479-
input_source,
480-
instructions,
481-
tools=cast(list[Any], mcp_tools),
482-
model_id=model_id,
483-
)
558+
response = await _call_llm(
559+
input_source,
560+
instructions,
561+
tools=cast(list[Any], mcp_tools),
562+
model_id=model_id,
563+
)
564+
response_text = extract_text_from_response_items(response.output)
484565
inference_time = time.monotonic() - start_time
485566
except _INFER_HANDLED_EXCEPTIONS as error:
486567
if verbose_enabled and response is not None:
@@ -520,36 +601,9 @@ async def infer_endpoint( # pylint: disable=R0914
520601

521602
logger.info("Completed rlsapi v1 /infer request %s", request_id)
522603

523-
# Build response with optional extended metadata
524-
if verbose_enabled and response is not None:
525-
# Extract metadata from full response object
526-
turn_summary = build_turn_summary(
527-
response, model_id, vector_store_ids=None, rag_id_mapping=None
528-
)
529-
530-
return RlsapiV1InferResponse(
531-
data=RlsapiV1InferData(
532-
text=response_text,
533-
request_id=request_id,
534-
tool_calls=turn_summary.tool_calls,
535-
tool_results=turn_summary.tool_results,
536-
rag_chunks=turn_summary.rag_chunks,
537-
referenced_documents=turn_summary.referenced_documents,
538-
input_tokens=turn_summary.token_usage.input_tokens,
539-
output_tokens=turn_summary.token_usage.output_tokens,
540-
)
541-
)
542-
543-
# Standard minimal response
544-
return RlsapiV1InferResponse(
545-
data=RlsapiV1InferData(
546-
text=response_text,
547-
request_id=request_id,
548-
tool_calls=None,
549-
tool_results=None,
550-
rag_chunks=None,
551-
referenced_documents=None,
552-
input_tokens=None,
553-
output_tokens=None,
554-
)
604+
return _build_infer_response(
605+
response_text,
606+
request_id,
607+
response if verbose_enabled else None,
608+
model_id,
555609
)

0 commit comments

Comments
 (0)