@@ -201,9 +201,18 @@ async def _get_default_model_id() -> str:
201201 provider_id = configuration .inference .default_provider
202202
203203 if model_id and provider_id :
204+ logger .info (
205+ "Using configured default model for rlsapi v1: %s/%s" ,
206+ provider_id ,
207+ model_id ,
208+ )
204209 return f"{ provider_id } /{ model_id } "
205210
206211 # 2. Auto-discover from Llama Stack
212+ logger .info (
213+ "No complete default model configured for rlsapi v1, "
214+ "auto-discovering LLM model"
215+ )
207216 client = AsyncLlamaStackClientHolder ().get_client ()
208217 try :
209218 models = await client .models .list ()
@@ -255,6 +264,7 @@ async def _resolve_validated_model_id() -> str:
255264 _ , model_name = extract_provider_and_model_from_model_id (model_id )
256265 error_response = NotFoundResponse (resource = "model" , resource_id = model_name )
257266 raise HTTPException (** error_response .model_dump ())
267+ logger .info ("Validated rlsapi v1 model availability: %s" , model_id )
258268 return model_id
259269
260270
@@ -387,6 +397,11 @@ def _queue_splunk_event( # pylint: disable=too-many-arguments,too-many-position
387397
388398 event = build_inference_event (event_data )
389399 background_tasks .add_task (send_splunk_event , event , sourcetype )
400+ logger .info (
401+ "Queued rlsapi v1 Splunk event for request %s with sourcetype %s" ,
402+ request_id ,
403+ sourcetype ,
404+ )
390405
391406
392407async def _check_shield_moderation ( # pylint: disable=too-many-arguments,too-many-positional-arguments
@@ -415,16 +430,14 @@ async def _check_shield_moderation( # pylint: disable=too-many-arguments,too-ma
415430 was blocked, or None if moderation passed.
416431 """
417432 client = AsyncLlamaStackClientHolder ().get_client ()
433+ logger .info ("Running shield moderation for rlsapi v1 request %s" , request_id )
418434 moderation_result = await run_shield_moderation (client , input_text , endpoint_path )
419435
420436 if moderation_result .decision != "blocked" :
437+ logger .info ("Shield moderation passed for rlsapi v1 request %s" , request_id )
421438 return None
422439
423- logger .info (
424- "Request %s blocked by shield moderation: %s" ,
425- request_id ,
426- moderation_result .message ,
427- )
440+ logger .info ("Shield moderation blocked rlsapi v1 request %s" , request_id )
428441 _queue_splunk_event (
429442 background_tasks ,
430443 infer_request ,
@@ -480,16 +493,20 @@ def _record_inference_failure( # pylint: disable=too-many-arguments,too-many-po
480493 recording .record_llm_inference_duration (
481494 provider , model , endpoint_path , "failure" , inference_time
482495 )
483- redacted_error = _redact_sensitive_error_text (str (error ))
484496 _queue_splunk_event (
485497 background_tasks ,
486498 infer_request ,
487499 request ,
488500 request_id ,
489- redacted_error ,
501+ type ( error ). __name__ ,
490502 inference_time ,
491503 "infer_error" ,
492504 )
505+ logger .info (
506+ "Recorded rlsapi v1 inference failure for request %s in %.3f seconds" ,
507+ request_id ,
508+ inference_time ,
509+ )
493510 return inference_time
494511
495512
@@ -627,22 +644,34 @@ def _map_inference_error_to_http_exception( # pylint: disable=too-many-return-s
627644 """
628645 if isinstance (error , TemplateRenderError ):
629646 logger .error (
630- "Invalid system prompt template for request %s: %s" , request_id , error
647+ "Invalid system prompt template for request %s: %s" ,
648+ request_id ,
649+ type (error ).__name__ ,
631650 )
632651 error_response = InternalServerErrorResponse .generic ()
633652 return HTTPException (** error_response .model_dump ())
634653
635654 if isinstance (error , RuntimeError ):
636655 if is_context_length_error (str (error )):
637- logger .error ("Prompt too long for request %s: %s" , request_id , error )
656+ logger .error (
657+ "Prompt too long for request %s: %s" ,
658+ request_id ,
659+ type (error ).__name__ ,
660+ )
638661 error_response = PromptTooLongResponse (model = model_id )
639662 return HTTPException (** error_response .model_dump ())
640- logger .error ("Unexpected RuntimeError for request %s: %s" , request_id , error )
663+ logger .error (
664+ "Unexpected RuntimeError for request %s: %s" ,
665+ request_id ,
666+ type (error ).__name__ ,
667+ )
641668 return None
642669
643670 if isinstance (error , APIConnectionError ):
644671 logger .error (
645- "Unable to connect to Llama Stack for request %s: %s" , request_id , error
672+ "Unable to connect to Llama Stack for request %s: %s" ,
673+ request_id ,
674+ type (error ).__name__ ,
646675 )
647676 error_response = ServiceUnavailableResponse (
648677 backend_name = "Llama Stack" ,
@@ -651,15 +680,19 @@ def _map_inference_error_to_http_exception( # pylint: disable=too-many-return-s
651680 return HTTPException (** error_response .model_dump ())
652681
653682 if isinstance (error , RateLimitError ):
654- logger .error ("Rate limit exceeded for request %s: %s" , request_id , error )
683+ logger .error (
684+ "Rate limit exceeded for request %s: %s" ,
685+ request_id ,
686+ type (error ).__name__ ,
687+ )
655688 error_response = QuotaExceededResponse (
656689 response = "The quota has been exceeded" ,
657690 cause = "Rate limit exceeded, please try again later" ,
658691 )
659692 return HTTPException (** error_response .model_dump ())
660693
661694 if isinstance (error , (APIStatusError , OpenAIAPIStatusError )):
662- logger .exception ("API error for request %s: %s" , request_id , error )
695+ logger .error ("API error for request %s: %s" , request_id , type ( error ). __name__ )
663696 error_response = handle_known_apistatus_errors (error , model_id )
664697 return HTTPException (** error_response .model_dump ())
665698
@@ -668,7 +701,7 @@ def _map_inference_error_to_http_exception( # pylint: disable=too-many-return-s
668701
669702@router .post ("/infer" , responses = infer_responses , response_model_exclude_none = True )
670703@authorize (Action .RLSAPI_V1_INFER )
671- async def infer_endpoint ( # pylint: disable=R0914
704+ async def infer_endpoint ( # pylint: disable=R0914,R0915
672705 infer_request : RlsapiV1InferRequest ,
673706 request : Request ,
674707 background_tasks : BackgroundTasks ,
@@ -695,21 +728,32 @@ async def infer_endpoint( # pylint: disable=R0914
695728 """
696729 # Authentication enforced by get_auth_dependency(), authorization by @authorize decorator.
697730 check_configuration_loaded (configuration )
731+ endpoint_path = ENDPOINT_PATH_INFER
732+ request_id = get_suid ()
733+
734+ logger .info ("Processing rlsapi v1 /infer request %s" , request_id )
735+
698736 # Quota enforcement: resolve subject and check availability before any work.
699737 # No-op when quota_subject is not configured or no quota limiters exist.
700738 quota_id = _resolve_quota_subject (request , auth )
701739 if quota_id is not None :
740+ logger .info (
741+ "Checking quota availability for rlsapi v1 request %s using subject type %s" ,
742+ request_id ,
743+ configuration .rlsapi_v1 .quota_subject ,
744+ )
702745 check_tokens_available (configuration .quota_limiters , quota_id )
703-
704- endpoint_path = ENDPOINT_PATH_INFER
705-
706- request_id = get_suid ()
707-
708- logger .info ("Processing rlsapi v1 /infer request %s" , request_id )
746+ logger .info (
747+ "Quota availability check passed for rlsapi v1 request %s" , request_id
748+ )
749+ else :
750+ logger .info ("Quota enforcement disabled for rlsapi v1 request %s" , request_id )
709751
710752 input_source = infer_request .get_input_source ()
711- logger .debug (
712- "Request %s: Combined input source length: %d" , request_id , len (input_source )
753+ logger .info (
754+ "Prepared rlsapi v1 request %s input source; metadata requested: %s" ,
755+ request_id ,
756+ infer_request .include_metadata ,
713757 )
714758
715759 # Run shield moderation on user input before inference.
@@ -729,13 +773,30 @@ async def infer_endpoint( # pylint: disable=R0914
729773
730774 model_id = await _resolve_validated_model_id ()
731775 provider , model = extract_provider_and_model_from_model_id (model_id )
776+ logger .info (
777+ "Resolved rlsapi v1 request %s model provider=%s model=%s" ,
778+ request_id ,
779+ provider ,
780+ model ,
781+ )
732782 mcp_tools : list [Any ] = await get_mcp_tools (request_headers = request .headers )
783+ logger .info (
784+ "Retrieved %d MCP tools for rlsapi v1 request %s" ,
785+ len (mcp_tools ),
786+ request_id ,
787+ )
733788
734789 start_time = time .monotonic ()
735790 verbose_enabled = _is_verbose_enabled (infer_request )
791+ logger .info (
792+ "Starting LLM call for rlsapi v1 request %s with verbose metadata enabled: %s" ,
793+ request_id ,
794+ verbose_enabled ,
795+ )
736796
737797 response = None
738798 try :
799+ logger .info ("Building instructions for rlsapi v1 request %s" , request_id )
739800 instructions = _build_instructions (infer_request .context .systeminfo )
740801 response = await _call_llm (
741802 input_source ,
@@ -749,9 +810,17 @@ async def infer_endpoint( # pylint: disable=R0914
749810 recording .record_llm_inference_duration (
750811 provider , model , endpoint_path , "success" , inference_time
751812 )
813+ logger .info (
814+ "LLM call completed for rlsapi v1 request %s in %.3f seconds "
815+ "with %d input tokens and %d output tokens" ,
816+ request_id ,
817+ inference_time ,
818+ token_usage .input_tokens ,
819+ token_usage .output_tokens ,
820+ )
752821 except _INFER_HANDLED_EXCEPTIONS as error :
753822 if response is not None :
754- extract_token_usage (response .usage , model_id , endpoint_path ) # type: ignore[arg-type]
823+ extract_token_usage (response .usage , model_id , endpoint_path )
755824 _record_inference_failure (
756825 background_tasks ,
757826 infer_request ,
@@ -778,11 +847,20 @@ async def infer_endpoint( # pylint: disable=R0914
778847
779848 # Consume quota tokens after successful inference.
780849 if quota_id is not None :
850+ logger .info (
851+ "Consuming quota tokens for rlsapi v1 request %s: input=%d output=%d" ,
852+ request_id ,
853+ token_usage .input_tokens ,
854+ token_usage .output_tokens ,
855+ )
781856 consume_query_tokens (
782857 user_id = quota_id ,
783858 model_id = model_id ,
784859 token_usage = token_usage ,
785860 )
861+ logger .info (
862+ "Quota token consumption completed for rlsapi v1 request %s" , request_id
863+ )
786864
787865 _queue_splunk_event (
788866 background_tasks ,
@@ -796,7 +874,11 @@ async def infer_endpoint( # pylint: disable=R0914
796874 output_tokens = token_usage .output_tokens ,
797875 )
798876
799- logger .info ("Completed rlsapi v1 /infer request %s" , request_id )
877+ logger .info (
878+ "Completed rlsapi v1 /infer request %s in %.3f seconds" ,
879+ request_id ,
880+ inference_time ,
881+ )
800882
801883 return _build_infer_response (
802884 response_text ,
0 commit comments