Skip to content

Commit 774964a

Browse files
authored
Merge pull request #1646 from major/rspeed-2974-rlsapi-safe-logging
RSPEED-2974: add privacy-safe RLSAPI observability logs
2 parents f5c35a8 + db59c76 commit 774964a

2 files changed

Lines changed: 201 additions & 25 deletions

File tree

src/app/endpoints/rlsapi_v1.py

Lines changed: 106 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -201,9 +201,18 @@ async def _get_default_model_id() -> str:
201201
provider_id = configuration.inference.default_provider
202202

203203
if model_id and provider_id:
204+
logger.info(
205+
"Using configured default model for rlsapi v1: %s/%s",
206+
provider_id,
207+
model_id,
208+
)
204209
return f"{provider_id}/{model_id}"
205210

206211
# 2. Auto-discover from Llama Stack
212+
logger.info(
213+
"No complete default model configured for rlsapi v1, "
214+
"auto-discovering LLM model"
215+
)
207216
client = AsyncLlamaStackClientHolder().get_client()
208217
try:
209218
models = await client.models.list()
@@ -255,6 +264,7 @@ async def _resolve_validated_model_id() -> str:
255264
_, model_name = extract_provider_and_model_from_model_id(model_id)
256265
error_response = NotFoundResponse(resource="model", resource_id=model_name)
257266
raise HTTPException(**error_response.model_dump())
267+
logger.info("Validated rlsapi v1 model availability: %s", model_id)
258268
return model_id
259269

260270

@@ -387,6 +397,11 @@ def _queue_splunk_event( # pylint: disable=too-many-arguments,too-many-position
387397

388398
event = build_inference_event(event_data)
389399
background_tasks.add_task(send_splunk_event, event, sourcetype)
400+
logger.info(
401+
"Queued rlsapi v1 Splunk event for request %s with sourcetype %s",
402+
request_id,
403+
sourcetype,
404+
)
390405

391406

392407
async def _check_shield_moderation( # pylint: disable=too-many-arguments,too-many-positional-arguments
@@ -415,16 +430,14 @@ async def _check_shield_moderation( # pylint: disable=too-many-arguments,too-ma
415430
was blocked, or None if moderation passed.
416431
"""
417432
client = AsyncLlamaStackClientHolder().get_client()
433+
logger.info("Running shield moderation for rlsapi v1 request %s", request_id)
418434
moderation_result = await run_shield_moderation(client, input_text, endpoint_path)
419435

420436
if moderation_result.decision != "blocked":
437+
logger.info("Shield moderation passed for rlsapi v1 request %s", request_id)
421438
return None
422439

423-
logger.info(
424-
"Request %s blocked by shield moderation: %s",
425-
request_id,
426-
moderation_result.message,
427-
)
440+
logger.info("Shield moderation blocked rlsapi v1 request %s", request_id)
428441
_queue_splunk_event(
429442
background_tasks,
430443
infer_request,
@@ -480,16 +493,20 @@ def _record_inference_failure( # pylint: disable=too-many-arguments,too-many-po
480493
recording.record_llm_inference_duration(
481494
provider, model, endpoint_path, "failure", inference_time
482495
)
483-
redacted_error = _redact_sensitive_error_text(str(error))
484496
_queue_splunk_event(
485497
background_tasks,
486498
infer_request,
487499
request,
488500
request_id,
489-
redacted_error,
501+
type(error).__name__,
490502
inference_time,
491503
"infer_error",
492504
)
505+
logger.info(
506+
"Recorded rlsapi v1 inference failure for request %s in %.3f seconds",
507+
request_id,
508+
inference_time,
509+
)
493510
return inference_time
494511

495512

@@ -627,22 +644,34 @@ def _map_inference_error_to_http_exception( # pylint: disable=too-many-return-s
627644
"""
628645
if isinstance(error, TemplateRenderError):
629646
logger.error(
630-
"Invalid system prompt template for request %s: %s", request_id, error
647+
"Invalid system prompt template for request %s: %s",
648+
request_id,
649+
type(error).__name__,
631650
)
632651
error_response = InternalServerErrorResponse.generic()
633652
return HTTPException(**error_response.model_dump())
634653

635654
if isinstance(error, RuntimeError):
636655
if is_context_length_error(str(error)):
637-
logger.error("Prompt too long for request %s: %s", request_id, error)
656+
logger.error(
657+
"Prompt too long for request %s: %s",
658+
request_id,
659+
type(error).__name__,
660+
)
638661
error_response = PromptTooLongResponse(model=model_id)
639662
return HTTPException(**error_response.model_dump())
640-
logger.error("Unexpected RuntimeError for request %s: %s", request_id, error)
663+
logger.error(
664+
"Unexpected RuntimeError for request %s: %s",
665+
request_id,
666+
type(error).__name__,
667+
)
641668
return None
642669

643670
if isinstance(error, APIConnectionError):
644671
logger.error(
645-
"Unable to connect to Llama Stack for request %s: %s", request_id, error
672+
"Unable to connect to Llama Stack for request %s: %s",
673+
request_id,
674+
type(error).__name__,
646675
)
647676
error_response = ServiceUnavailableResponse(
648677
backend_name="Llama Stack",
@@ -651,15 +680,19 @@ def _map_inference_error_to_http_exception( # pylint: disable=too-many-return-s
651680
return HTTPException(**error_response.model_dump())
652681

653682
if isinstance(error, RateLimitError):
654-
logger.error("Rate limit exceeded for request %s: %s", request_id, error)
683+
logger.error(
684+
"Rate limit exceeded for request %s: %s",
685+
request_id,
686+
type(error).__name__,
687+
)
655688
error_response = QuotaExceededResponse(
656689
response="The quota has been exceeded",
657690
cause="Rate limit exceeded, please try again later",
658691
)
659692
return HTTPException(**error_response.model_dump())
660693

661694
if isinstance(error, (APIStatusError, OpenAIAPIStatusError)):
662-
logger.exception("API error for request %s: %s", request_id, error)
695+
logger.error("API error for request %s: %s", request_id, type(error).__name__)
663696
error_response = handle_known_apistatus_errors(error, model_id)
664697
return HTTPException(**error_response.model_dump())
665698

@@ -668,7 +701,7 @@ def _map_inference_error_to_http_exception( # pylint: disable=too-many-return-s
668701

669702
@router.post("/infer", responses=infer_responses, response_model_exclude_none=True)
670703
@authorize(Action.RLSAPI_V1_INFER)
671-
async def infer_endpoint( # pylint: disable=R0914
704+
async def infer_endpoint( # pylint: disable=R0914,R0915
672705
infer_request: RlsapiV1InferRequest,
673706
request: Request,
674707
background_tasks: BackgroundTasks,
@@ -695,21 +728,32 @@ async def infer_endpoint( # pylint: disable=R0914
695728
"""
696729
# Authentication enforced by get_auth_dependency(), authorization by @authorize decorator.
697730
check_configuration_loaded(configuration)
731+
endpoint_path = ENDPOINT_PATH_INFER
732+
request_id = get_suid()
733+
734+
logger.info("Processing rlsapi v1 /infer request %s", request_id)
735+
698736
# Quota enforcement: resolve subject and check availability before any work.
699737
# No-op when quota_subject is not configured or no quota limiters exist.
700738
quota_id = _resolve_quota_subject(request, auth)
701739
if quota_id is not None:
740+
logger.info(
741+
"Checking quota availability for rlsapi v1 request %s using subject type %s",
742+
request_id,
743+
configuration.rlsapi_v1.quota_subject,
744+
)
702745
check_tokens_available(configuration.quota_limiters, quota_id)
703-
704-
endpoint_path = ENDPOINT_PATH_INFER
705-
706-
request_id = get_suid()
707-
708-
logger.info("Processing rlsapi v1 /infer request %s", request_id)
746+
logger.info(
747+
"Quota availability check passed for rlsapi v1 request %s", request_id
748+
)
749+
else:
750+
logger.info("Quota enforcement disabled for rlsapi v1 request %s", request_id)
709751

710752
input_source = infer_request.get_input_source()
711-
logger.debug(
712-
"Request %s: Combined input source length: %d", request_id, len(input_source)
753+
logger.info(
754+
"Prepared rlsapi v1 request %s input source; metadata requested: %s",
755+
request_id,
756+
infer_request.include_metadata,
713757
)
714758

715759
# Run shield moderation on user input before inference.
@@ -729,13 +773,30 @@ async def infer_endpoint( # pylint: disable=R0914
729773

730774
model_id = await _resolve_validated_model_id()
731775
provider, model = extract_provider_and_model_from_model_id(model_id)
776+
logger.info(
777+
"Resolved rlsapi v1 request %s model provider=%s model=%s",
778+
request_id,
779+
provider,
780+
model,
781+
)
732782
mcp_tools: list[Any] = await get_mcp_tools(request_headers=request.headers)
783+
logger.info(
784+
"Retrieved %d MCP tools for rlsapi v1 request %s",
785+
len(mcp_tools),
786+
request_id,
787+
)
733788

734789
start_time = time.monotonic()
735790
verbose_enabled = _is_verbose_enabled(infer_request)
791+
logger.info(
792+
"Starting LLM call for rlsapi v1 request %s with verbose metadata enabled: %s",
793+
request_id,
794+
verbose_enabled,
795+
)
736796

737797
response = None
738798
try:
799+
logger.info("Building instructions for rlsapi v1 request %s", request_id)
739800
instructions = _build_instructions(infer_request.context.systeminfo)
740801
response = await _call_llm(
741802
input_source,
@@ -749,9 +810,17 @@ async def infer_endpoint( # pylint: disable=R0914
749810
recording.record_llm_inference_duration(
750811
provider, model, endpoint_path, "success", inference_time
751812
)
813+
logger.info(
814+
"LLM call completed for rlsapi v1 request %s in %.3f seconds "
815+
"with %d input tokens and %d output tokens",
816+
request_id,
817+
inference_time,
818+
token_usage.input_tokens,
819+
token_usage.output_tokens,
820+
)
752821
except _INFER_HANDLED_EXCEPTIONS as error:
753822
if response is not None:
754-
extract_token_usage(response.usage, model_id, endpoint_path) # type: ignore[arg-type]
823+
extract_token_usage(response.usage, model_id, endpoint_path)
755824
_record_inference_failure(
756825
background_tasks,
757826
infer_request,
@@ -778,11 +847,20 @@ async def infer_endpoint( # pylint: disable=R0914
778847

779848
# Consume quota tokens after successful inference.
780849
if quota_id is not None:
850+
logger.info(
851+
"Consuming quota tokens for rlsapi v1 request %s: input=%d output=%d",
852+
request_id,
853+
token_usage.input_tokens,
854+
token_usage.output_tokens,
855+
)
781856
consume_query_tokens(
782857
user_id=quota_id,
783858
model_id=model_id,
784859
token_usage=token_usage,
785860
)
861+
logger.info(
862+
"Quota token consumption completed for rlsapi v1 request %s", request_id
863+
)
786864

787865
_queue_splunk_event(
788866
background_tasks,
@@ -796,7 +874,11 @@ async def infer_endpoint( # pylint: disable=R0914
796874
output_tokens=token_usage.output_tokens,
797875
)
798876

799-
logger.info("Completed rlsapi v1 /infer request %s", request_id)
877+
logger.info(
878+
"Completed rlsapi v1 /infer request %s in %.3f seconds",
879+
request_id,
880+
inference_time,
881+
)
800882

801883
return _build_infer_response(
802884
response_text,

0 commit comments

Comments
 (0)