@@ -241,6 +241,7 @@ async def retrieve_simple_response(
241241 instructions : str ,
242242 tools : Optional [list [Any ]] = None ,
243243 model_id : Optional [str ] = None ,
244+ endpoint_path : str = "/v1/infer" ,
244245) -> str :
245246 """Retrieve a simple response from the LLM for a stateless query.
246247
@@ -263,7 +264,7 @@ async def retrieve_simple_response(
263264 """
264265 resolved_model_id = model_id or await _get_default_model_id ()
265266 response = await _call_llm (question , instructions , tools , resolved_model_id )
266- extract_token_usage (response .usage , resolved_model_id )
267+ extract_token_usage (response .usage , resolved_model_id , endpoint_path )
267268 return extract_text_from_response_items (response .output )
268269
269270
@@ -366,12 +367,13 @@ def _queue_splunk_event( # pylint: disable=too-many-arguments,too-many-position
366367 background_tasks .add_task (send_splunk_event , event , sourcetype )
367368
368369
369- async def _check_shield_moderation (
370+ async def _check_shield_moderation ( # pylint: disable=too-many-arguments,too-many-positional-arguments
370371 input_text : str ,
371372 request_id : str ,
372373 background_tasks : BackgroundTasks ,
373374 infer_request : RlsapiV1InferRequest ,
374375 request : Request ,
376+ endpoint_path : str ,
375377) -> Optional [RlsapiV1InferResponse ]:
376378 """Run shield moderation and return a refusal response if blocked.
377379
@@ -384,13 +386,14 @@ async def _check_shield_moderation(
384386 background_tasks: FastAPI background tasks for async Splunk event sending.
385387 infer_request: The original inference request (for Splunk event context).
386388 request: The FastAPI request object (for Splunk event context).
389+ endpoint_path: The API endpoint path for metric labeling.
387390
388391 Returns:
389392 An RlsapiV1InferResponse containing the refusal message if the input
390393 was blocked, or None if moderation passed.
391394 """
392395 client = AsyncLlamaStackClientHolder ().get_client ()
393- moderation_result = await run_shield_moderation (client , input_text )
396+ moderation_result = await run_shield_moderation (client , input_text , endpoint_path )
394397
395398 if moderation_result .decision != "blocked" :
396399 return None
@@ -432,6 +435,7 @@ def _record_inference_failure( # pylint: disable=too-many-arguments,too-many-po
432435 start_time : float ,
433436 model : str ,
434437 provider : str ,
438+ endpoint_path : str ,
435439) -> float :
436440 """Record metrics and queue Splunk event for an inference failure.
437441
@@ -442,12 +446,15 @@ def _record_inference_failure( # pylint: disable=too-many-arguments,too-many-po
442446 request_id: Unique identifier for the request.
443447 error: The exception that caused the failure.
444448 start_time: Monotonic clock time when inference started.
449+ model: The model name.
450+ provider: The provider name.
451+ endpoint_path: The API endpoint path for metric labeling.
445452
446453 Returns:
447454 The total inference time in seconds.
448455 """
449456 inference_time = time .monotonic () - start_time
450- recording .record_llm_failure (provider , model )
457+ recording .record_llm_failure (provider , model , endpoint_path )
451458 _queue_splunk_event (
452459 background_tasks ,
453460 infer_request ,
@@ -530,6 +537,7 @@ def _build_infer_response(
530537 request_id : str ,
531538 response : Optional [OpenAIResponseObject ],
532539 model_id : str ,
540+ endpoint_path : str ,
533541) -> RlsapiV1InferResponse :
534542 """Build the final inference response, with optional verbose metadata.
535543
@@ -549,7 +557,11 @@ def _build_infer_response(
549557 """
550558 if response is not None :
551559 turn_summary = build_turn_summary (
552- response , model_id , vector_store_ids = None , rag_id_mapping = None
560+ response ,
561+ model_id ,
562+ endpoint_path ,
563+ vector_store_ids = None ,
564+ rag_id_mapping = None ,
553565 )
554566 return RlsapiV1InferResponse (
555567 data = RlsapiV1InferData (
@@ -673,12 +685,19 @@ async def infer_endpoint( # pylint: disable=R0914
673685 "Request %s: Combined input source length: %d" , request_id , len (input_source )
674686 )
675687
688+ endpoint_path = "/v1/infer"
689+
676690 # Run shield moderation on user input before inference.
677691 # Uses all configured shields; no-op when no shields are registered.
678692 # Runs before model/tool discovery so blocked requests short-circuit
679693 # without incurring external I/O.
680694 blocked_response = await _check_shield_moderation (
681- input_source , request_id , background_tasks , infer_request , request
695+ input_source ,
696+ request_id ,
697+ background_tasks ,
698+ infer_request ,
699+ request ,
700+ endpoint_path ,
682701 )
683702 if blocked_response is not None :
684703 return blocked_response
@@ -700,11 +719,11 @@ async def infer_endpoint( # pylint: disable=R0914
700719 model_id = model_id ,
701720 )
702721 response_text = extract_text_from_response_items (response .output )
703- token_usage = extract_token_usage (response .usage , model_id )
722+ token_usage = extract_token_usage (response .usage , model_id , endpoint_path )
704723 inference_time = time .monotonic () - start_time
705724 except _INFER_HANDLED_EXCEPTIONS as error :
706725 if response is not None :
707- extract_token_usage (response .usage , model_id ) # type: ignore[arg-type]
726+ extract_token_usage (response .usage , model_id , endpoint_path ) # type: ignore[arg-type]
708727 _record_inference_failure (
709728 background_tasks ,
710729 infer_request ,
@@ -714,6 +733,7 @@ async def infer_endpoint( # pylint: disable=R0914
714733 start_time ,
715734 model ,
716735 provider ,
736+ endpoint_path ,
717737 )
718738 mapped_error = _map_inference_error_to_http_exception (
719739 error ,
@@ -755,4 +775,5 @@ async def infer_endpoint( # pylint: disable=R0914
755775 request_id ,
756776 response if verbose_enabled else None ,
757777 model_id ,
778+ endpoint_path ,
758779 )
0 commit comments