55"""
66
77import logging
8+ import time
89from typing import Annotated , Any , cast
910
10- from fastapi import APIRouter , Depends , HTTPException
11+ from fastapi import APIRouter , BackgroundTasks , Depends , HTTPException , Request
1112from llama_stack_api .openai_responses import OpenAIResponseObject
1213from llama_stack_client import APIConnectionError , APIStatusError , RateLimitError
1314
1415import constants
1516import metrics
1617from authentication import get_auth_dependency
1718from authentication .interface import AuthTuple
19+ from authentication .rh_identity import RHIdentityData
1820from authorization .middleware import authorize
1921from client import AsyncLlamaStackClientHolder
2022from configuration import configuration
2931)
3032from models .rlsapi .requests import RlsapiV1InferRequest , RlsapiV1SystemInfo
3133from models .rlsapi .responses import RlsapiV1InferData , RlsapiV1InferResponse
34+ from observability import InferenceEventData , build_inference_event , send_splunk_event
3235from utils .responses import extract_text_from_response_output_item
3336from utils .suid import get_suid
3437
3538logger = logging .getLogger (__name__ )
3639router = APIRouter (tags = ["rlsapi-v1" ])
3740
41+ # Placeholder reported for org_id and system_id when RH Identity auth is not
41+ # configured or the identity data is unavailable
42+ AUTH_DISABLED = "auth_disabled"
43+
44+
def _get_rh_identity_context(request: Request) -> tuple[str, str]:
    """Return the (org_id, system_id) pair used to tag telemetry events.

    The values are read from ``request.state.rh_identity_data``, the slot in
    which the RH Identity auth dependency stores its ``RHIdentityData``
    object. The system_id element is taken from the identity's
    ``get_user_id()`` accessor.

    Args:
        request: The FastAPI request object.

    Returns:
        Tuple of (org_id, system_id). Both elements are ``AUTH_DISABLED``
        when no identity object is attached to the request state; an
        individual element is ``AUTH_DISABLED`` when its accessor returns a
        falsy value.
    """
    identity: RHIdentityData | None = getattr(request.state, "rh_identity_data", None)

    if identity is not None:
        # Fall back per field so a partially populated identity still
        # contributes whatever it does provide.
        return (
            identity.get_org_id() or AUTH_DISABLED,
            identity.get_user_id() or AUTH_DISABLED,
        )

    return AUTH_DISABLED, AUTH_DISABLED
68+
3869
3970infer_responses : dict [int | str , dict [str , Any ]] = {
4071 200 : RlsapiV1InferResponse .openapi_response (),
@@ -148,10 +179,52 @@ async def retrieve_simple_response(question: str, instructions: str) -> str:
148179 )
149180
150181
def _get_cla_version(request: Request) -> str:
    """Return the request's User-Agent header as the CLA version string.

    The header value is passed through as-is; no parsing is performed here.

    Args:
        request: The FastAPI request object.

    Returns:
        The User-Agent header value, or an empty string when the header is
        absent.
    """
    user_agent = request.headers.get("User-Agent", "")
    return user_agent
185+
186+
def _queue_splunk_event(  # pylint: disable=too-many-arguments,too-many-positional-arguments
    background_tasks: BackgroundTasks,
    infer_request: RlsapiV1InferRequest,
    request: Request,
    request_id: str,
    response_text: str,
    inference_time: float,
    sourcetype: str,
) -> None:
    """Assemble a Splunk inference event and schedule it for background delivery.

    The event is built synchronously from the request data; only the call to
    ``send_splunk_event`` is deferred via ``background_tasks``.

    Args:
        background_tasks: FastAPI background task queue that will run the send.
        infer_request: The inference request supplying the question and the
            system information (os, version, arch).
        request: The FastAPI request object, used for identity context and
            the User-Agent header.
        request_id: Identifier of the request being reported.
        response_text: Text recorded as the response in the event.
        inference_time: Elapsed inference time recorded in the event.
        sourcetype: Passed through to ``send_splunk_event`` along with the
            built event.
    """
    org_id, system_id = _get_rh_identity_context(request)
    system_info = infer_request.context.systeminfo

    # Report an empty model name when no inference section is configured or
    # no default model is set.
    if configuration.inference:
        model_name = configuration.inference.default_model or ""
    else:
        model_name = ""

    payload = InferenceEventData(
        question=infer_request.question,
        response=response_text,
        inference_time=inference_time,
        model=model_name,
        org_id=org_id,
        system_id=system_id,
        request_id=request_id,
        cla_version=_get_cla_version(request),
        system_os=system_info.os,
        system_version=system_info.version,
        system_arch=system_info.arch,
    )

    background_tasks.add_task(
        send_splunk_event,
        build_inference_event(payload),
        sourcetype,
    )
220+
221+
151222@router .post ("/infer" , responses = infer_responses )
152223@authorize (Action .RLSAPI_V1_INFER )
153224async def infer_endpoint (
154225 infer_request : RlsapiV1InferRequest ,
226+ request : Request ,
227+ background_tasks : BackgroundTasks ,
155228 auth : Annotated [AuthTuple , Depends (get_auth_dependency ())],
156229) -> RlsapiV1InferResponse :
157230 """Handle rlsapi v1 /infer requests for stateless inference.
@@ -163,6 +236,8 @@ async def infer_endpoint(
163236
164237 Args:
165238 infer_request: The inference request containing question and context.
239+ request: The FastAPI request object for accessing headers and state.
240+ background_tasks: FastAPI background tasks for async Splunk event sending.
166241 auth: Authentication tuple from the configured auth provider.
167242
168243 Returns:
@@ -174,7 +249,6 @@ async def infer_endpoint(
174249 # Authentication enforced by get_auth_dependency(), authorization by @authorize decorator.
175250 _ = auth
176251
177- # Generate unique request ID
178252 request_id = get_suid ()
179253
180254 logger .info ("Processing rlsapi v1 /infer request %s" , request_id )
@@ -185,35 +259,77 @@ async def infer_endpoint(
185259 "Request %s: Combined input source length: %d" , request_id , len (input_source )
186260 )
187261
262+ start_time = time .monotonic ()
188263 try :
189264 response_text = await retrieve_simple_response (input_source , instructions )
265+ inference_time = time .monotonic () - start_time
190266 except APIConnectionError as e :
267+ inference_time = time .monotonic () - start_time
191268 metrics .llm_calls_failures_total .inc ()
192269 logger .error (
193270 "Unable to connect to Llama Stack for request %s: %s" , request_id , e
194271 )
272+ _queue_splunk_event (
273+ background_tasks ,
274+ infer_request ,
275+ request ,
276+ request_id ,
277+ str (e ),
278+ inference_time ,
279+ "infer_error" ,
280+ )
195281 response = ServiceUnavailableResponse (
196282 backend_name = "Llama Stack" ,
197283 cause = str (e ),
198284 )
199285 raise HTTPException (** response .model_dump ()) from e
200286 except RateLimitError as e :
287+ inference_time = time .monotonic () - start_time
201288 metrics .llm_calls_failures_total .inc ()
202289 logger .error ("Rate limit exceeded for request %s: %s" , request_id , e )
290+ _queue_splunk_event (
291+ background_tasks ,
292+ infer_request ,
293+ request ,
294+ request_id ,
295+ str (e ),
296+ inference_time ,
297+ "infer_error" ,
298+ )
203299 response = QuotaExceededResponse (
204300 response = "The quota has been exceeded" , cause = str (e )
205301 )
206302 raise HTTPException (** response .model_dump ()) from e
207303 except APIStatusError as e :
304+ inference_time = time .monotonic () - start_time
208305 metrics .llm_calls_failures_total .inc ()
209306 logger .exception ("API error for request %s: %s" , request_id , e )
307+ _queue_splunk_event (
308+ background_tasks ,
309+ infer_request ,
310+ request ,
311+ request_id ,
312+ str (e ),
313+ inference_time ,
314+ "infer_error" ,
315+ )
210316 response = InternalServerErrorResponse .generic ()
211317 raise HTTPException (** response .model_dump ()) from e
212318
213319 if not response_text :
214320 logger .warning ("Empty response from LLM for request %s" , request_id )
215321 response_text = constants .UNABLE_TO_PROCESS_RESPONSE
216322
323+ _queue_splunk_event (
324+ background_tasks ,
325+ infer_request ,
326+ request ,
327+ request_id ,
328+ response_text ,
329+ inference_time ,
330+ "infer_with_llm" ,
331+ )
332+
217333 logger .info ("Completed rlsapi v1 /infer request %s" , request_id )
218334
219335 return RlsapiV1InferResponse (
0 commit comments