Skip to content

Commit 5f1ffaa

Browse files
authored
Merge pull request #1032 from major/rspeed-2326-splunk-config
RSPEED-2326: feat(observability): add Splunk HEC integration
2 parents f9d3060 + 8c3acd2 commit 5f1ffaa

12 files changed

Lines changed: 628 additions & 1 deletion

File tree

src/configuration.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
DatabaseConfiguration,
2424
ConversationHistoryConfiguration,
2525
QuotaHandlersConfiguration,
26+
SplunkConfiguration,
2627
)
2728

2829
from cache.cache import Cache
@@ -39,7 +40,7 @@ class LogicError(Exception):
3940
"""Error in application logic."""
4041

4142

42-
class AppConfig:
43+
class AppConfig: # pylint: disable=too-many-public-methods
4344
"""Singleton class to load and store the configuration."""
4445

4546
_instance = None
@@ -348,5 +349,19 @@ def azure_entra_id(self) -> Optional[AzureEntraIdConfiguration]:
348349
raise LogicError("logic error: configuration is not loaded")
349350
return self._configuration.azure_entra_id
350351

352+
@property
def splunk(self) -> Optional[SplunkConfiguration]:
    """Expose the Splunk section of the loaded configuration.

    Returns:
        The Splunk configuration object, or None when the loaded
        configuration does not contain one.

    Raises:
        LogicError: If no configuration has been loaded yet.
    """
    loaded = self._configuration
    if loaded is None:
        raise LogicError("logic error: configuration is not loaded")
    return loaded.splunk
358+
359+
@property
def deployment_environment(self) -> str:
    """Expose the deployment environment name of the loaded configuration.

    Returns:
        The value of the configuration's deployment_environment field.

    Raises:
        LogicError: If no configuration has been loaded yet.
    """
    loaded = self._configuration
    if loaded is None:
        raise LogicError("logic error: configuration is not loaded")
    return loaded.deployment_environment
365+
351366

352367
configuration: AppConfig = AppConfig()

src/models/config.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -612,6 +612,87 @@ def check_llama_stack_model(self) -> Self:
612612
return self
613613

614614

615+
class SplunkConfiguration(ConfigurationBase):
    """Splunk HEC (HTTP Event Collector) configuration.

    Splunk HEC allows sending events directly to Splunk over HTTP/HTTPS.
    This configuration is used to send telemetry events for inference
    requests to the corporate Splunk deployment.

    Useful resources:

    - [Splunk HEC Docs](https://docs.splunk.com/Documentation/SplunkCloud)
    - [About HEC](https://docs.splunk.com/Documentation/Splunk/latest/Data)
    """

    # Master switch; url, token_path and index become mandatory once it is on.
    enabled: bool = Field(
        default=False,
        title="Enabled",
        description="Enable or disable Splunk HEC integration.",
    )

    url: Optional[str] = Field(
        default=None,
        title="HEC URL",
        description="Splunk HEC endpoint URL.",
    )

    token_path: Optional[FilePath] = Field(
        default=None,
        title="Token path",
        description="Path to file containing the Splunk HEC authentication token.",
    )

    index: Optional[str] = Field(
        default=None,
        title="Index",
        description="Target Splunk index for events.",
    )

    source: str = Field(
        default="lightspeed-stack",
        title="Source",
        description="Event source identifier.",
    )

    timeout: PositiveInt = Field(
        default=5,
        title="Timeout",
        description="HTTP timeout in seconds for HEC requests.",
    )

    verify_ssl: bool = Field(
        default=True,
        title="Verify SSL",
        description="Whether to verify SSL certificates for HEC endpoint.",
    )

    @model_validator(mode="after")
    def check_splunk_configuration(self) -> Self:
        """Reject an enabled Splunk configuration that lacks mandatory fields.

        The url, token_path and index fields are only checked when enabled
        is True; a disabled configuration is accepted as-is.

        Returns:
            Self: The validated configuration instance.

        Raises:
            ValueError: If enabled is True and any of url, token_path or
                index is unset or empty.
        """
        if not self.enabled:
            return self

        # Names are reported in this fixed order: url, token_path, index.
        required = ("url", "token_path", "index")
        missing_fields = [name for name in required if not getattr(self, name)]
        if missing_fields:
            raise ValueError(
                "Splunk is enabled but required fields are missing: "
                + ", ".join(missing_fields)
            )
        return self
694+
695+
615696
class UserDataCollection(ConfigurationBase):
616697
"""User data collection configuration."""
617698

@@ -1659,6 +1740,19 @@ class Configuration(ConfigurationBase):
16591740
)
16601741
azure_entra_id: Optional[AzureEntraIdConfiguration] = None
16611742

1743+
splunk: Optional[SplunkConfiguration] = Field(
1744+
default=None,
1745+
title="Splunk configuration",
1746+
description="Splunk HEC configuration for sending telemetry events.",
1747+
)
1748+
1749+
deployment_environment: str = Field(
1750+
"development",
1751+
title="Deployment environment",
1752+
description="Deployment environment name (e.g., 'development', 'staging', 'production'). "
1753+
"Used in telemetry events.",
1754+
)
1755+
16621756
@model_validator(mode="after")
16631757
def validate_mcp_auth_headers(self) -> Self:
16641758
"""

src/observability/__init__.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
"""Observability module for telemetry and event collection.
2+
3+
This module provides functionality for sending telemetry events to external
4+
systems like Splunk HEC for monitoring and analytics.
5+
6+
The splunk module provides a format-agnostic send_splunk_event() function.
7+
Event formats are in the formats subpackage - see formats.rlsapi for the
8+
default implementation, or create your own format module.
9+
"""
10+
11+
from observability.formats import InferenceEventData, build_inference_event
12+
from observability.splunk import send_splunk_event
13+
14+
__all__ = ["send_splunk_event", "InferenceEventData", "build_inference_event"]
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
"""Event format builders for Splunk telemetry.
2+
3+
Each submodule provides format-specific event builders. The rlsapi module
4+
provides the default format matching Red Hat's rlsapi v1 specification.
5+
"""
6+
7+
from observability.formats.rlsapi import InferenceEventData, build_inference_event
8+
9+
__all__ = ["InferenceEventData", "build_inference_event"]
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""Event builders for rlsapi v1 Splunk format.
2+
3+
This module provides event builders specific to the rlsapi v1 telemetry format.
4+
To implement a custom format, create a new module in this package with your own
5+
event builder function that returns a dict, then pass the result to send_splunk_event().
6+
"""
7+
8+
from dataclasses import dataclass
9+
from typing import Any
10+
11+
from configuration import configuration
12+
13+
14+
@dataclass
class InferenceEventData:  # pylint: disable=too-many-instance-attributes
    """Input values for a single inference telemetry event.

    Each field is copied unchanged into the payload produced by
    build_inference_event(). Field order is part of the positional
    constructor signature and must not be rearranged.
    """

    # Prompt and answer of the inference request.
    question: str
    response: str

    # Inference measurement and the model that served the request.
    inference_time: float
    model: str

    # Identifiers attached to the event.
    org_id: str
    system_id: str
    request_id: str

    # Client and system details reported with the event.
    cla_version: str
    system_os: str
    system_version: str
    system_arch: str
29+
30+
31+
def build_inference_event(data: InferenceEventData) -> dict[str, Any]:
    """Assemble the rlsapi-style payload for one inference event.

    Values come from ``data`` except for three constants
    ("refined_questions" is an empty list, "context" is an empty string,
    "total_llm_tokens" is 0) and "deployment", which is read from the
    application configuration's deployment_environment.

    Args:
        data: The inference event data.

    Returns:
        A new dictionary holding the event fields in rlsapi key order.
    """
    event: dict[str, Any] = {
        "question": data.question,
        "refined_questions": [],
        "context": "",
        "response": data.response,
        "inference_time": data.inference_time,
        "model": data.model,
        "deployment": configuration.deployment_environment,
        "org_id": data.org_id,
        "system_id": data.system_id,
        # Token counting not yet implemented in lightspeed-stack; rlsapi uses 0 as default
        "total_llm_tokens": 0,
        "request_id": data.request_id,
        "cla_version": data.cla_version,
    }
    # The system description fields close out the payload, in this order.
    for field_name in ("system_os", "system_version", "system_arch"):
        event[field_name] = getattr(data, field_name)
    return event

src/observability/splunk.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
"""Async Splunk HEC client for sending telemetry events."""
2+
3+
import logging
4+
import platform
5+
import time
6+
from typing import Any
7+
8+
import aiohttp
9+
10+
from configuration import configuration
11+
from version import __version__
12+
13+
logger = logging.getLogger(__name__)
14+
15+
16+
def _get_hostname() -> str:
17+
"""Get the hostname for Splunk event metadata."""
18+
return platform.node() or "unknown"
19+
20+
21+
def _read_token_from_file(token_path: str) -> str | None:
    """Read the Splunk HEC token stored in a file.

    Failures are logged as warnings and reported through the return value
    so that the caller can skip sending instead of handling an exception.

    Args:
        token_path: Path of the file that holds the token.

    Returns:
        The file content decoded as UTF-8 with surrounding whitespace
        stripped (possibly an empty string), or None when the file cannot
        be opened, read, or decoded.
    """
    try:
        with open(token_path, encoding="utf-8") as f:
            return f.read().strip()
    # UnicodeDecodeError derives from ValueError, not OSError, so it has to be
    # listed explicitly; otherwise a token file with invalid UTF-8 bytes would
    # raise into the caller instead of being logged and skipped.
    except (OSError, UnicodeDecodeError) as e:
        logger.warning("Failed to read Splunk HEC token from %s: %s", token_path, e)
        return None
29+
30+
31+
async def send_splunk_event(event: dict[str, Any], sourcetype: str) -> None:
    """Post a single event to the configured Splunk HEC endpoint.

    Nothing is sent when the Splunk configuration is absent, disabled,
    incomplete, or when no token could be read. HTTP error responses,
    aiohttp client errors and timeouts are logged as warnings rather than
    raised, so Splunk connectivity problems do not disturb the caller.

    Args:
        event: The event payload to send.
        sourcetype: The Splunk sourcetype (e.g., "infer_with_llm", "infer_error").
    """
    hec = configuration.splunk
    if hec is None or not hec.enabled:
        logger.debug("Splunk integration disabled, skipping event")
        return

    if not (hec.url and hec.token_path and hec.index):
        logger.warning("Splunk configuration incomplete, skipping event")
        return

    # The token file is re-read for every event so that a rotated token is
    # picked up without restarting the service.
    token = _read_token_from_file(str(hec.token_path))
    if not token:
        return

    envelope = {
        "time": int(time.time()),
        "host": _get_hostname(),
        "source": f"{hec.source} (v{__version__})",
        "sourcetype": sourcetype,
        "index": hec.index,
        "event": event,
    }
    request_headers = {
        "Authorization": f"Splunk {token}",
        "Content-Type": "application/json",
    }

    request_timeout = aiohttp.ClientTimeout(total=hec.timeout)
    tcp_connector = aiohttp.TCPConnector(ssl=hec.verify_ssl)

    try:
        async with aiohttp.ClientSession(
            timeout=request_timeout, connector=tcp_connector
        ) as session:
            async with session.post(
                hec.url, json=envelope, headers=request_headers
            ) as response:
                if response.status < 400:
                    return
                # Only the first 200 characters of the error body are logged.
                error_text = await response.text()
                logger.warning(
                    "Splunk HEC request failed with status %d: %s",
                    response.status,
                    error_text[:200],
                )
    except aiohttp.ClientError as e:
        logger.warning("Splunk HEC request failed: %s", e)
    except TimeoutError:
        logger.warning("Splunk HEC request timed out after %ds", hec.timeout)

tests/unit/models/config/test_dump_configuration.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,8 @@ def test_dump_configuration(tmp_path: Path) -> None:
204204
"postgres": None,
205205
},
206206
"azure_entra_id": None,
207+
"splunk": None,
208+
"deployment_environment": "development",
207209
}
208210

209211

@@ -539,6 +541,8 @@ def test_dump_configuration_with_quota_limiters(tmp_path: Path) -> None:
539541
"postgres": None,
540542
},
541543
"azure_entra_id": None,
544+
"splunk": None,
545+
"deployment_environment": "development",
542546
}
543547

544548

@@ -756,6 +760,8 @@ def test_dump_configuration_with_quota_limiters_different_values(
756760
"postgres": None,
757761
},
758762
"azure_entra_id": None,
763+
"splunk": None,
764+
"deployment_environment": "development",
759765
}
760766

761767

@@ -947,6 +953,8 @@ def test_dump_configuration_byok(tmp_path: Path) -> None:
947953
"postgres": None,
948954
},
949955
"azure_entra_id": None,
956+
"splunk": None,
957+
"deployment_environment": "development",
950958
}
951959

952960

@@ -1124,4 +1132,6 @@ def test_dump_configuration_pg_namespace(tmp_path: Path) -> None:
11241132
"postgres": None,
11251133
},
11261134
"azure_entra_id": None,
1135+
"splunk": None,
1136+
"deployment_environment": "development",
11271137
}

0 commit comments

Comments
 (0)