Skip to content

Commit 8e7ab18

Browse files
committed
fix: add privacy-safe RLSAPI observability logs
RSPEED-2974 Signed-off-by: Major Hayden <major@redhat.com>
1 parent b601f34 commit 8e7ab18

2 files changed

Lines changed: 200 additions & 23 deletions

File tree

src/app/endpoints/rlsapi_v1.py

Lines changed: 105 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -180,9 +180,18 @@ async def _get_default_model_id() -> str:
180180
provider_id = configuration.inference.default_provider
181181

182182
if model_id and provider_id:
183+
logger.info(
184+
"Using configured default model for rlsapi v1: %s/%s",
185+
provider_id,
186+
model_id,
187+
)
183188
return f"{provider_id}/{model_id}"
184189

185190
# 2. Auto-discover from Llama Stack
191+
logger.info(
192+
"No complete default model configured for rlsapi v1, "
193+
"auto-discovering LLM model"
194+
)
186195
client = AsyncLlamaStackClientHolder().get_client()
187196
try:
188197
models = await client.models.list()
@@ -234,6 +243,7 @@ async def _resolve_validated_model_id() -> str:
234243
_, model_name = extract_provider_and_model_from_model_id(model_id)
235244
error_response = NotFoundResponse(resource="model", resource_id=model_name)
236245
raise HTTPException(**error_response.model_dump())
246+
logger.info("Validated rlsapi v1 model availability: %s", model_id)
237247
return model_id
238248

239249

@@ -366,6 +376,11 @@ def _queue_splunk_event( # pylint: disable=too-many-arguments,too-many-position
366376

367377
event = build_inference_event(event_data)
368378
background_tasks.add_task(send_splunk_event, event, sourcetype)
379+
logger.info(
380+
"Queued rlsapi v1 Splunk event for request %s with sourcetype %s",
381+
request_id,
382+
sourcetype,
383+
)
369384

370385

371386
async def _check_shield_moderation( # pylint: disable=too-many-arguments,too-many-positional-arguments
@@ -394,16 +409,14 @@ async def _check_shield_moderation( # pylint: disable=too-many-arguments,too-ma
394409
was blocked, or None if moderation passed.
395410
"""
396411
client = AsyncLlamaStackClientHolder().get_client()
412+
logger.info("Running shield moderation for rlsapi v1 request %s", request_id)
397413
moderation_result = await run_shield_moderation(client, input_text, endpoint_path)
398414

399415
if moderation_result.decision != "blocked":
416+
logger.info("Shield moderation passed for rlsapi v1 request %s", request_id)
400417
return None
401418

402-
logger.info(
403-
"Request %s blocked by shield moderation: %s",
404-
request_id,
405-
moderation_result.message,
406-
)
419+
logger.info("Shield moderation blocked rlsapi v1 request %s", request_id)
407420
_queue_splunk_event(
408421
background_tasks,
409422
infer_request,
@@ -468,6 +481,11 @@ def _record_inference_failure( # pylint: disable=too-many-arguments,too-many-po
468481
inference_time,
469482
"infer_error",
470483
)
484+
logger.info(
485+
"Recorded rlsapi v1 inference failure for request %s in %.3f seconds",
486+
request_id,
487+
inference_time,
488+
)
471489
return inference_time
472490

473491

@@ -605,22 +623,34 @@ def _map_inference_error_to_http_exception( # pylint: disable=too-many-return-s
605623
"""
606624
if isinstance(error, TemplateRenderError):
607625
logger.error(
608-
"Invalid system prompt template for request %s: %s", request_id, error
626+
"Invalid system prompt template for request %s: %s",
627+
request_id,
628+
type(error).__name__,
609629
)
610630
error_response = InternalServerErrorResponse.generic()
611631
return HTTPException(**error_response.model_dump())
612632

613633
if isinstance(error, RuntimeError):
614634
if is_context_length_error(str(error)):
615-
logger.error("Prompt too long for request %s: %s", request_id, error)
635+
logger.error(
636+
"Prompt too long for request %s: %s",
637+
request_id,
638+
type(error).__name__,
639+
)
616640
error_response = PromptTooLongResponse(model=model_id)
617641
return HTTPException(**error_response.model_dump())
618-
logger.error("Unexpected RuntimeError for request %s: %s", request_id, error)
642+
logger.error(
643+
"Unexpected RuntimeError for request %s: %s",
644+
request_id,
645+
type(error).__name__,
646+
)
619647
return None
620648

621649
if isinstance(error, APIConnectionError):
622650
logger.error(
623-
"Unable to connect to Llama Stack for request %s: %s", request_id, error
651+
"Unable to connect to Llama Stack for request %s: %s",
652+
request_id,
653+
type(error).__name__,
624654
)
625655
error_response = ServiceUnavailableResponse(
626656
backend_name="Llama Stack",
@@ -629,15 +659,19 @@ def _map_inference_error_to_http_exception( # pylint: disable=too-many-return-s
629659
return HTTPException(**error_response.model_dump())
630660

631661
if isinstance(error, RateLimitError):
632-
logger.error("Rate limit exceeded for request %s: %s", request_id, error)
662+
logger.error(
663+
"Rate limit exceeded for request %s: %s",
664+
request_id,
665+
type(error).__name__,
666+
)
633667
error_response = QuotaExceededResponse(
634668
response="The quota has been exceeded",
635669
cause="Rate limit exceeded, please try again later",
636670
)
637671
return HTTPException(**error_response.model_dump())
638672

639673
if isinstance(error, (APIStatusError, OpenAIAPIStatusError)):
640-
logger.exception("API error for request %s: %s", request_id, error)
674+
logger.error("API error for request %s: %s", request_id, type(error).__name__)
641675
error_response = handle_known_apistatus_errors(error, model_id)
642676
return HTTPException(**error_response.model_dump())
643677

@@ -646,7 +680,7 @@ def _map_inference_error_to_http_exception( # pylint: disable=too-many-return-s
646680

647681
@router.post("/infer", responses=infer_responses, response_model_exclude_none=True)
648682
@authorize(Action.RLSAPI_V1_INFER)
649-
async def infer_endpoint( # pylint: disable=R0914
683+
async def infer_endpoint( # pylint: disable=R0914,R0915
650684
infer_request: RlsapiV1InferRequest,
651685
request: Request,
652686
background_tasks: BackgroundTasks,
@@ -673,21 +707,32 @@ async def infer_endpoint( # pylint: disable=R0914
673707
"""
674708
# Authentication enforced by get_auth_dependency(), authorization by @authorize decorator.
675709
check_configuration_loaded(configuration)
710+
endpoint_path = ENDPOINT_PATH_INFER
711+
request_id = get_suid()
712+
713+
logger.info("Processing rlsapi v1 /infer request %s", request_id)
714+
676715
# Quota enforcement: resolve subject and check availability before any work.
677716
# No-op when quota_subject is not configured or no quota limiters exist.
678717
quota_id = _resolve_quota_subject(request, auth)
679718
if quota_id is not None:
719+
logger.info(
720+
"Checking quota availability for rlsapi v1 request %s using subject type %s",
721+
request_id,
722+
configuration.rlsapi_v1.quota_subject,
723+
)
680724
check_tokens_available(configuration.quota_limiters, quota_id)
681-
682-
endpoint_path = ENDPOINT_PATH_INFER
683-
684-
request_id = get_suid()
685-
686-
logger.info("Processing rlsapi v1 /infer request %s", request_id)
725+
logger.info(
726+
"Quota availability check passed for rlsapi v1 request %s", request_id
727+
)
728+
else:
729+
logger.info("Quota enforcement disabled for rlsapi v1 request %s", request_id)
687730

688731
input_source = infer_request.get_input_source()
689-
logger.debug(
690-
"Request %s: Combined input source length: %d", request_id, len(input_source)
732+
logger.info(
733+
"Prepared rlsapi v1 request %s input source; metadata requested: %s",
734+
request_id,
735+
infer_request.include_metadata,
691736
)
692737

693738
# Run shield moderation on user input before inference.
@@ -707,13 +752,30 @@ async def infer_endpoint( # pylint: disable=R0914
707752

708753
model_id = await _resolve_validated_model_id()
709754
provider, model = extract_provider_and_model_from_model_id(model_id)
755+
logger.info(
756+
"Resolved rlsapi v1 request %s model provider=%s model=%s",
757+
request_id,
758+
provider,
759+
model,
760+
)
710761
mcp_tools: list[Any] = await get_mcp_tools(request_headers=request.headers)
762+
logger.info(
763+
"Retrieved %d MCP tools for rlsapi v1 request %s",
764+
len(mcp_tools),
765+
request_id,
766+
)
711767

712768
start_time = time.monotonic()
713769
verbose_enabled = _is_verbose_enabled(infer_request)
770+
logger.info(
771+
"Starting LLM call for rlsapi v1 request %s with verbose metadata enabled: %s",
772+
request_id,
773+
verbose_enabled,
774+
)
714775

715776
response = None
716777
try:
778+
logger.info("Building instructions for rlsapi v1 request %s", request_id)
717779
instructions = _build_instructions(infer_request.context.systeminfo)
718780
response = await _call_llm(
719781
input_source,
@@ -727,9 +789,17 @@ async def infer_endpoint( # pylint: disable=R0914
727789
recording.record_llm_inference_duration(
728790
provider, model, endpoint_path, "success", inference_time
729791
)
792+
logger.info(
793+
"LLM call completed for rlsapi v1 request %s in %.3f seconds "
794+
"with %d input tokens and %d output tokens",
795+
request_id,
796+
inference_time,
797+
token_usage.input_tokens,
798+
token_usage.output_tokens,
799+
)
730800
except _INFER_HANDLED_EXCEPTIONS as error:
731801
if response is not None:
732-
extract_token_usage(response.usage, model_id, endpoint_path) # type: ignore[arg-type]
802+
extract_token_usage(response.usage, model_id, endpoint_path)
733803
_record_inference_failure(
734804
background_tasks,
735805
infer_request,
@@ -756,11 +826,20 @@ async def infer_endpoint( # pylint: disable=R0914
756826

757827
# Consume quota tokens after successful inference.
758828
if quota_id is not None:
829+
logger.info(
830+
"Consuming quota tokens for rlsapi v1 request %s: input=%d output=%d",
831+
request_id,
832+
token_usage.input_tokens,
833+
token_usage.output_tokens,
834+
)
759835
consume_query_tokens(
760836
user_id=quota_id,
761837
model_id=model_id,
762838
token_usage=token_usage,
763839
)
840+
logger.info(
841+
"Quota token consumption completed for rlsapi v1 request %s", request_id
842+
)
764843

765844
_queue_splunk_event(
766845
background_tasks,
@@ -774,7 +853,11 @@ async def infer_endpoint( # pylint: disable=R0914
774853
output_tokens=token_usage.output_tokens,
775854
)
776855

777-
logger.info("Completed rlsapi v1 /infer request %s", request_id)
856+
logger.info(
857+
"Completed rlsapi v1 /infer request %s in %.3f seconds",
858+
request_id,
859+
inference_time,
860+
)
778861

779862
return _build_infer_response(
780863
response_text,

0 commit comments

Comments
 (0)