@@ -180,9 +180,18 @@ async def _get_default_model_id() -> str:
180180 provider_id = configuration .inference .default_provider
181181
182182 if model_id and provider_id :
183+ logger .info (
184+ "Using configured default model for rlsapi v1: %s/%s" ,
185+ provider_id ,
186+ model_id ,
187+ )
183188 return f"{ provider_id } /{ model_id } "
184189
185190 # 2. Auto-discover from Llama Stack
191+ logger .info (
192+ "No complete default model configured for rlsapi v1, "
193+ "auto-discovering LLM model"
194+ )
186195 client = AsyncLlamaStackClientHolder ().get_client ()
187196 try :
188197 models = await client .models .list ()
@@ -234,6 +243,7 @@ async def _resolve_validated_model_id() -> str:
234243 _ , model_name = extract_provider_and_model_from_model_id (model_id )
235244 error_response = NotFoundResponse (resource = "model" , resource_id = model_name )
236245 raise HTTPException (** error_response .model_dump ())
246+ logger .info ("Validated rlsapi v1 model availability: %s" , model_id )
237247 return model_id
238248
239249
@@ -366,6 +376,11 @@ def _queue_splunk_event( # pylint: disable=too-many-arguments,too-many-position
366376
367377 event = build_inference_event (event_data )
368378 background_tasks .add_task (send_splunk_event , event , sourcetype )
379+ logger .info (
380+ "Queued rlsapi v1 Splunk event for request %s with sourcetype %s" ,
381+ request_id ,
382+ sourcetype ,
383+ )
369384
370385
371386async def _check_shield_moderation ( # pylint: disable=too-many-arguments,too-many-positional-arguments
@@ -394,16 +409,14 @@ async def _check_shield_moderation( # pylint: disable=too-many-arguments,too-ma
394409 was blocked, or None if moderation passed.
395410 """
396411 client = AsyncLlamaStackClientHolder ().get_client ()
412+ logger .info ("Running shield moderation for rlsapi v1 request %s" , request_id )
397413 moderation_result = await run_shield_moderation (client , input_text , endpoint_path )
398414
399415 if moderation_result .decision != "blocked" :
416+ logger .info ("Shield moderation passed for rlsapi v1 request %s" , request_id )
400417 return None
401418
402- logger .info (
403- "Request %s blocked by shield moderation: %s" ,
404- request_id ,
405- moderation_result .message ,
406- )
419+ logger .info ("Shield moderation blocked rlsapi v1 request %s" , request_id )
407420 _queue_splunk_event (
408421 background_tasks ,
409422 infer_request ,
@@ -468,6 +481,11 @@ def _record_inference_failure( # pylint: disable=too-many-arguments,too-many-po
468481 inference_time ,
469482 "infer_error" ,
470483 )
484+ logger .info (
485+ "Recorded rlsapi v1 inference failure for request %s in %.3f seconds" ,
486+ request_id ,
487+ inference_time ,
488+ )
471489 return inference_time
472490
473491
@@ -605,22 +623,34 @@ def _map_inference_error_to_http_exception( # pylint: disable=too-many-return-s
605623 """
606624 if isinstance (error , TemplateRenderError ):
607625 logger .error (
608- "Invalid system prompt template for request %s: %s" , request_id , error
626+ "Invalid system prompt template for request %s: %s" ,
627+ request_id ,
628+ type (error ).__name__ ,
609629 )
610630 error_response = InternalServerErrorResponse .generic ()
611631 return HTTPException (** error_response .model_dump ())
612632
613633 if isinstance (error , RuntimeError ):
614634 if is_context_length_error (str (error )):
615- logger .error ("Prompt too long for request %s: %s" , request_id , error )
635+ logger .error (
636+ "Prompt too long for request %s: %s" ,
637+ request_id ,
638+ type (error ).__name__ ,
639+ )
616640 error_response = PromptTooLongResponse (model = model_id )
617641 return HTTPException (** error_response .model_dump ())
618- logger .error ("Unexpected RuntimeError for request %s: %s" , request_id , error )
642+ logger .error (
643+ "Unexpected RuntimeError for request %s: %s" ,
644+ request_id ,
645+ type (error ).__name__ ,
646+ )
619647 return None
620648
621649 if isinstance (error , APIConnectionError ):
622650 logger .error (
623- "Unable to connect to Llama Stack for request %s: %s" , request_id , error
651+ "Unable to connect to Llama Stack for request %s: %s" ,
652+ request_id ,
653+ type (error ).__name__ ,
624654 )
625655 error_response = ServiceUnavailableResponse (
626656 backend_name = "Llama Stack" ,
@@ -629,15 +659,19 @@ def _map_inference_error_to_http_exception( # pylint: disable=too-many-return-s
629659 return HTTPException (** error_response .model_dump ())
630660
631661 if isinstance (error , RateLimitError ):
632- logger .error ("Rate limit exceeded for request %s: %s" , request_id , error )
662+ logger .error (
663+ "Rate limit exceeded for request %s: %s" ,
664+ request_id ,
665+ type (error ).__name__ ,
666+ )
633667 error_response = QuotaExceededResponse (
634668 response = "The quota has been exceeded" ,
635669 cause = "Rate limit exceeded, please try again later" ,
636670 )
637671 return HTTPException (** error_response .model_dump ())
638672
639673 if isinstance (error , (APIStatusError , OpenAIAPIStatusError )):
640- logger .exception ("API error for request %s: %s" , request_id , error )
674+ logger .error ("API error for request %s: %s" , request_id , type ( error ). __name__ )
641675 error_response = handle_known_apistatus_errors (error , model_id )
642676 return HTTPException (** error_response .model_dump ())
643677
@@ -646,7 +680,7 @@ def _map_inference_error_to_http_exception( # pylint: disable=too-many-return-s
646680
647681@router .post ("/infer" , responses = infer_responses , response_model_exclude_none = True )
648682@authorize (Action .RLSAPI_V1_INFER )
649- async def infer_endpoint ( # pylint: disable=R0914
683+ async def infer_endpoint ( # pylint: disable=R0914,R0915
650684 infer_request : RlsapiV1InferRequest ,
651685 request : Request ,
652686 background_tasks : BackgroundTasks ,
@@ -673,21 +707,32 @@ async def infer_endpoint( # pylint: disable=R0914
673707 """
674708 # Authentication enforced by get_auth_dependency(), authorization by @authorize decorator.
675709 check_configuration_loaded (configuration )
710+ endpoint_path = ENDPOINT_PATH_INFER
711+ request_id = get_suid ()
712+
713+ logger .info ("Processing rlsapi v1 /infer request %s" , request_id )
714+
676715 # Quota enforcement: resolve subject and check availability before any work.
677716 # No-op when quota_subject is not configured or no quota limiters exist.
678717 quota_id = _resolve_quota_subject (request , auth )
679718 if quota_id is not None :
719+ logger .info (
720+ "Checking quota availability for rlsapi v1 request %s using subject type %s" ,
721+ request_id ,
722+ configuration .rlsapi_v1 .quota_subject ,
723+ )
680724 check_tokens_available (configuration .quota_limiters , quota_id )
681-
682- endpoint_path = ENDPOINT_PATH_INFER
683-
684- request_id = get_suid ()
685-
686- logger .info ("Processing rlsapi v1 /infer request %s" , request_id )
725+ logger .info (
726+ "Quota availability check passed for rlsapi v1 request %s" , request_id
727+ )
728+ else :
729+ logger .info ("Quota enforcement disabled for rlsapi v1 request %s" , request_id )
687730
688731 input_source = infer_request .get_input_source ()
689- logger .debug (
690- "Request %s: Combined input source length: %d" , request_id , len (input_source )
732+ logger .info (
733+ "Prepared rlsapi v1 request %s input source; metadata requested: %s" ,
734+ request_id ,
735+ infer_request .include_metadata ,
691736 )
692737
693738 # Run shield moderation on user input before inference.
@@ -707,13 +752,30 @@ async def infer_endpoint( # pylint: disable=R0914
707752
708753 model_id = await _resolve_validated_model_id ()
709754 provider , model = extract_provider_and_model_from_model_id (model_id )
755+ logger .info (
756+ "Resolved rlsapi v1 request %s model provider=%s model=%s" ,
757+ request_id ,
758+ provider ,
759+ model ,
760+ )
710761 mcp_tools : list [Any ] = await get_mcp_tools (request_headers = request .headers )
762+ logger .info (
763+ "Retrieved %d MCP tools for rlsapi v1 request %s" ,
764+ len (mcp_tools ),
765+ request_id ,
766+ )
711767
712768 start_time = time .monotonic ()
713769 verbose_enabled = _is_verbose_enabled (infer_request )
770+ logger .info (
771+ "Starting LLM call for rlsapi v1 request %s with verbose metadata enabled: %s" ,
772+ request_id ,
773+ verbose_enabled ,
774+ )
714775
715776 response = None
716777 try :
778+ logger .info ("Building instructions for rlsapi v1 request %s" , request_id )
717779 instructions = _build_instructions (infer_request .context .systeminfo )
718780 response = await _call_llm (
719781 input_source ,
@@ -727,9 +789,17 @@ async def infer_endpoint( # pylint: disable=R0914
727789 recording .record_llm_inference_duration (
728790 provider , model , endpoint_path , "success" , inference_time
729791 )
792+ logger .info (
793+ "LLM call completed for rlsapi v1 request %s in %.3f seconds "
794+ "with %d input tokens and %d output tokens" ,
795+ request_id ,
796+ inference_time ,
797+ token_usage .input_tokens ,
798+ token_usage .output_tokens ,
799+ )
730800 except _INFER_HANDLED_EXCEPTIONS as error :
731801 if response is not None :
732- extract_token_usage (response .usage , model_id , endpoint_path ) # type: ignore[arg-type]
802+ extract_token_usage (response .usage , model_id , endpoint_path )
733803 _record_inference_failure (
734804 background_tasks ,
735805 infer_request ,
@@ -756,11 +826,20 @@ async def infer_endpoint( # pylint: disable=R0914
756826
757827 # Consume quota tokens after successful inference.
758828 if quota_id is not None :
829+ logger .info (
830+ "Consuming quota tokens for rlsapi v1 request %s: input=%d output=%d" ,
831+ request_id ,
832+ token_usage .input_tokens ,
833+ token_usage .output_tokens ,
834+ )
759835 consume_query_tokens (
760836 user_id = quota_id ,
761837 model_id = model_id ,
762838 token_usage = token_usage ,
763839 )
840+ logger .info (
841+ "Quota token consumption completed for rlsapi v1 request %s" , request_id
842+ )
764843
765844 _queue_splunk_event (
766845 background_tasks ,
@@ -774,7 +853,11 @@ async def infer_endpoint( # pylint: disable=R0914
774853 output_tokens = token_usage .output_tokens ,
775854 )
776855
777- logger .info ("Completed rlsapi v1 /infer request %s" , request_id )
856+ logger .info (
857+ "Completed rlsapi v1 /infer request %s in %.3f seconds" ,
858+ request_id ,
859+ inference_time ,
860+ )
778861
779862 return _build_infer_response (
780863 response_text ,
0 commit comments