Skip to content

Commit 46669a3

Browse files
committed
fix: plumb LLM token counts into rlsapi Splunk telemetry
total_llm_tokens was hardcoded to 0 in the rlsapi Splunk event builder despite token counting being implemented via extract_token_usage(). Add input_tokens and output_tokens to InferenceEventData and pass the actual counts from the endpoint handler.

Ref: RSPEED-2857
Signed-off-by: Major Hayden <major@redhat.com>
1 parent 8ec0967 commit 46669a3

5 files changed

Lines changed: 77 additions & 3 deletions

File tree

src/app/endpoints/rlsapi_v1.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -353,8 +353,22 @@ def _queue_splunk_event( # pylint: disable=too-many-arguments,too-many-position
353353
response_text: str,
354354
inference_time: float,
355355
sourcetype: str,
356+
input_tokens: int = 0,
357+
output_tokens: int = 0,
356358
) -> None:
357-
"""Build and queue a Splunk telemetry event for background sending."""
359+
"""Build and queue a Splunk telemetry event for background sending.
360+
361+
Args:
362+
background_tasks: FastAPI background task manager.
363+
infer_request: Original rlsapi v1 inference request.
364+
request: FastAPI request object used to resolve identity context.
365+
request_id: Unique identifier for the request.
366+
response_text: Response text to include in the telemetry event.
367+
inference_time: Request processing duration in seconds.
368+
sourcetype: Splunk sourcetype to use when sending the event.
369+
input_tokens: Number of prompt tokens consumed by the LLM call.
370+
output_tokens: Number of completion tokens produced by the LLM call.
371+
"""
358372
org_id, system_id = _get_rh_identity_context(request)
359373
systeminfo = infer_request.context.systeminfo
360374

@@ -370,6 +384,8 @@ def _queue_splunk_event( # pylint: disable=too-many-arguments,too-many-position
370384
system_os=systeminfo.os,
371385
system_version=systeminfo.version,
372386
system_arch=systeminfo.arch,
387+
input_tokens=input_tokens,
388+
output_tokens=output_tokens,
373389
)
374390

375391
event = build_inference_event(event_data)
@@ -754,6 +770,8 @@ async def infer_endpoint( # pylint: disable=R0914
754770
response_text,
755771
inference_time,
756772
"infer_with_llm",
773+
input_tokens=token_usage.input_tokens,
774+
output_tokens=token_usage.output_tokens,
757775
)
758776

759777
logger.info("Completed rlsapi v1 /infer request %s", request_id)

src/observability/formats/rlsapi.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ class InferenceEventData: # pylint: disable=too-many-instance-attributes
2626
system_os: str
2727
system_version: str
2828
system_arch: str
29+
input_tokens: int = 0
30+
output_tokens: int = 0
2931

3032

3133
def build_inference_event(data: InferenceEventData) -> dict[str, Any]:
@@ -47,8 +49,7 @@ def build_inference_event(data: InferenceEventData) -> dict[str, Any]:
4749
"deployment": configuration.deployment_environment,
4850
"org_id": data.org_id,
4951
"system_id": data.system_id,
50-
# Token counting not yet implemented in lightspeed-stack; rlsapi uses 0 as default
51-
"total_llm_tokens": 0,
52+
"total_llm_tokens": data.input_tokens + data.output_tokens,
5253
"request_id": data.request_id,
5354
"cla_version": data.cla_version,
5455
"system_os": data.system_os,

tests/integration/endpoints/test_rlsapi_v1_integration.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,10 @@ def _setup_responses_mock(
115115
"""Set up responses.create mock with the given response text."""
116116
mock_response = mocker.Mock()
117117
mock_response.output = [_create_mock_response_output(mocker, response_text)]
118+
mock_usage = mocker.Mock()
119+
mock_usage.input_tokens = 10
120+
mock_usage.output_tokens = 5
121+
mock_response.usage = mock_usage
118122

119123
mock_responses = mocker.Mock()
120124
mock_responses.create = mocker.AsyncMock(return_value=mock_response)
@@ -303,6 +307,10 @@ async def test_rlsapi_v1_infer_fallback_response_empty_output(
303307

304308
mock_response = mocker.Mock()
305309
mock_response.output = []
310+
mock_usage = mocker.Mock()
311+
mock_usage.input_tokens = 10
312+
mock_usage.output_tokens = 5
313+
mock_response.usage = mock_usage
306314

307315
mock_responses = mocker.Mock()
308316
mock_responses.create = mocker.AsyncMock(return_value=mock_response)
@@ -342,6 +350,10 @@ async def test_rlsapi_v1_infer_input_source_combination(
342350

343351
mock_response = mocker.Mock()
344352
mock_response.output = [_create_mock_response_output(mocker, "response text")]
353+
mock_usage = mocker.Mock()
354+
mock_usage.input_tokens = 10
355+
mock_usage.output_tokens = 5
356+
mock_response.usage = mock_usage
345357

346358
mock_responses = mocker.Mock()
347359
mock_responses.create = mocker.AsyncMock(return_value=mock_response)
@@ -401,6 +413,10 @@ async def test_rlsapi_v1_infer_no_mcp_servers_passes_empty_tools(
401413

402414
mock_response = mocker.Mock()
403415
mock_response.output = [_create_mock_response_output(mocker, "response text")]
416+
mock_usage = mocker.Mock()
417+
mock_usage.input_tokens = 10
418+
mock_usage.output_tokens = 5
419+
mock_response.usage = mock_usage
404420

405421
mock_responses = mocker.Mock()
406422
mock_responses.create = mocker.AsyncMock(return_value=mock_response)
@@ -442,6 +458,10 @@ async def test_rlsapi_v1_infer_mcp_tools_passed_to_llm(
442458

443459
mock_response = mocker.Mock()
444460
mock_response.output = [_create_mock_response_output(mocker, "enriched response")]
461+
mock_usage = mocker.Mock()
462+
mock_usage.input_tokens = 10
463+
mock_usage.output_tokens = 5
464+
mock_response.usage = mock_usage
445465

446466
mock_responses = mocker.Mock()
447467
mock_responses.create = mocker.AsyncMock(return_value=mock_response)

tests/unit/app/endpoints/test_rlsapi_v1.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,10 @@ def mock_llm_response_fixture(mocker: MockerFixture) -> None:
116116
mock_response.output = [
117117
_create_mock_response_output(mocker, "This is a test LLM response.")
118118
]
119+
mock_usage = mocker.Mock()
120+
mock_usage.input_tokens = 10
121+
mock_usage.output_tokens = 5
122+
mock_response.usage = mock_usage
119123
_setup_responses_mock(mocker, mocker.AsyncMock(return_value=mock_response))
120124

121125

@@ -124,6 +128,10 @@ def mock_empty_llm_response_fixture(mocker: MockerFixture) -> None:
124128
"""Mock responses.create to return empty output list."""
125129
mock_response = mocker.Mock()
126130
mock_response.output = []
131+
mock_usage = mocker.Mock()
132+
mock_usage.input_tokens = 10
133+
mock_usage.output_tokens = 5
134+
mock_response.usage = mock_usage
127135
_setup_responses_mock(mocker, mocker.AsyncMock(return_value=mock_response))
128136

129137

tests/unit/observability/formats/test_rlsapi.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,33 @@ def test_builds_event_with_all_fields(
5050
assert event["total_llm_tokens"] == 0
5151

5252

53+
def test_builds_event_with_token_counts(mocker: MockerFixture) -> None:
54+
"""Test total_llm_tokens is computed from input and output token counts."""
55+
mocker.patch(
56+
"observability.formats.rlsapi.configuration"
57+
).deployment_environment = "production"
58+
59+
data = InferenceEventData(
60+
question="test",
61+
response="test",
62+
inference_time=1.0,
63+
model="test-model",
64+
org_id="org1",
65+
system_id="sys1",
66+
request_id="req_1",
67+
cla_version="CLA/1.0",
68+
system_os="RHEL",
69+
system_version="9.4",
70+
system_arch="x86_64",
71+
input_tokens=150,
72+
output_tokens=75,
73+
)
74+
75+
event = build_inference_event(data)
76+
77+
assert event["total_llm_tokens"] == 225
78+
79+
5380
def test_handles_auth_disabled_values(mocker: MockerFixture) -> None:
5481
"""Test event handles auth_disabled placeholder values."""
5582
data = InferenceEventData(

0 commit comments

Comments
 (0)