Skip to content

Commit cee4a9a

Browse files
committed
LCORE-1857: Add metrics to track degraded mode startup state
Add Prometheus metrics to monitor when Lightspeed Core Stack starts in degraded mode (without llama-stack connectivity). During implementation, discovered that PR #327's lazy initialization pattern prevented model metrics from being set up correctly, requiring a revert.

When LCORE starts in degraded mode, the @run_once_async decorator marks setup_model_metrics() as "done" after it returns early. On restart with llama-stack available, the function never runs again, leaving model metrics uninitialized. Moving setup back to startup fixes this while degraded mode already provides the startup resilience that PR #327 originally addressed.

Signed-off-by: Anik Bhattacharjee <anbhatta@redhat.com>
1 parent c9871df commit cee4a9a

8 files changed

Lines changed: 47 additions & 28 deletions

File tree

docs/openapi.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10400,7 +10400,7 @@
1040010400
"metrics"
1040110401
],
1040210402
"summary": "Metrics Endpoint Handler",
10403-
"description": "Handle request to the /metrics endpoint.\n\nProcess GET requests to the /metrics endpoint, returning the\nlatest Prometheus metrics in form of a plain text.\n\nInitializes model metrics on the first request if not already\nset up, then responds with the current metrics snapshot in\nPrometheus format.\n\n### Parameters:\n- request: The incoming HTTP request (used by middleware).\n- auth: Authentication tuple from the auth dependency (used by middleware).\n\n### Returns:\n- PlainTextResponse: Response body containing the Prometheus metrics text\n and the Prometheus content type.",
10403+
"description": "Handle request to the /metrics endpoint.\n\nProcess GET requests to the /metrics endpoint, returning the\nlatest Prometheus metrics in plain text Prometheus format.\n\n### Parameters:\n- request: The incoming HTTP request (used by middleware).\n- auth: Authentication tuple from the auth dependency (used by middleware).\n\n### Returns:\n- PlainTextResponse: Response body containing the Prometheus metrics text\n and the Prometheus content type.",
1040410404
"operationId": "metrics_endpoint_handler_metrics_get",
1040510405
"responses": {
1040610406
"200": {

src/app/endpoints/metrics.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
from authentication import get_auth_dependency
1313
from authentication.interface import AuthTuple
1414
from authorization.middleware import authorize
15-
from metrics.utils import setup_model_metrics
1615
from models.api.responses.constants import UNAUTHORIZED_OPENAPI_EXAMPLES
1716
from models.api.responses.error import (
1817
ForbiddenResponse,
@@ -47,11 +46,7 @@ async def metrics_endpoint_handler(
4746
Handle request to the /metrics endpoint.
4847
4948
Process GET requests to the /metrics endpoint, returning the
50-
latest Prometheus metrics in form of a plain text.
51-
52-
Initializes model metrics on the first request if not already
53-
set up, then responds with the current metrics snapshot in
54-
Prometheus format.
49+
latest Prometheus metrics in plain text Prometheus format.
5550
5651
### Parameters:
5752
- request: The incoming HTTP request (used by middleware).
@@ -67,7 +62,4 @@ async def metrics_endpoint_handler(
6762
# Nothing interesting in the request
6863
_ = request
6964

70-
# Setup the model metrics if not already done. This is a one-time setup
71-
# and will not be run again on subsequent calls to this endpoint
72-
await setup_model_metrics()
7365
return PlainTextResponse(generate_latest(), media_type=str(CONTENT_TYPE_LATEST))

src/app/main.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from configuration import configuration
2424
from log import get_logger
2525
from metrics import recording
26+
from metrics.utils import setup_model_metrics
2627
from models.api.responses.error import InternalServerErrorResponse
2728
from sentry import initialize_sentry
2829
from utils.common import register_mcp_servers_async
@@ -119,6 +120,14 @@ async def lifespan(_app: FastAPI) -> AsyncIterator[None]:
119120
AzureEntraIDManager().set_base_url(azure_base_url)
120121
logger.info("Registering MCP servers")
121122
await register_mcp_servers_async(logger, configuration.configuration)
123+
124+
# Set up model metrics if in healthy mode
125+
if not degraded_tracker.is_degraded():
126+
try:
127+
await setup_model_metrics()
128+
except APIConnectionError as e:
129+
logger.warning("Failed to set up model metrics: %s", e, exc_info=True)
130+
122131
logger.info("App startup complete")
123132

124133
initialize_database()

src/metrics/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,3 +82,9 @@
8282
["provider", "model", "endpoint", "result"],
8383
buckets=LLM_INFERENCE_DURATION_BUCKETS,
8484
)
85+
86+
# Gauge to track degraded mode startup state
87+
started_in_degraded_mode = Gauge(
88+
"ls_started_in_degraded_mode",
89+
"Indicates if service started in degraded mode (1 = degraded, 0 = healthy)",
90+
)

src/metrics/recording.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,3 +157,18 @@ def record_llm_inference_duration(
157157
).observe(duration)
158158
except (AttributeError, TypeError, ValueError):
159159
logger.warning("Failed to update LLM inference duration metric", exc_info=True)
160+
161+
162+
def set_started_in_degraded_mode(is_degraded: bool) -> None:
163+
"""Set the startup degraded mode gauge.
164+
165+
This metric tracks whether the service started in degraded mode.
166+
It is set once at startup and does not change at runtime.
167+
168+
Args:
169+
is_degraded: True if service started in degraded mode, False if healthy.
170+
"""
171+
try:
172+
metrics.started_in_degraded_mode.set(1 if is_degraded else 0)
173+
except (AttributeError, TypeError, ValueError):
174+
logger.warning("Failed to update started_in_degraded_mode gauge", exc_info=True)

src/metrics/utils.py

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,23 @@
11
"""Utility functions for metrics handling."""
22

3-
from fastapi import HTTPException
4-
from llama_stack_client import APIConnectionError, APIStatusError
5-
63
import metrics
74
from client import AsyncLlamaStackClientHolder
85
from configuration import configuration
96
from log import get_logger
10-
from models.api.responses.error import ServiceUnavailableResponse
11-
from utils.common import run_once_async
127
from utils.endpoints import check_configuration_loaded
138

149
logger = get_logger(__name__)
1510

1611

17-
@run_once_async
1812
async def setup_model_metrics() -> None:
19-
"""Perform setup of all metrics related to LLM model and provider."""
13+
"""Perform setup of all metrics related to LLM model and provider.
14+
15+
Should be called during startup when service is in healthy mode.
16+
Skipped in degraded mode to avoid blocking on unavailable llama-stack.
17+
"""
2018
logger.info("Setting up model metrics")
2119
check_configuration_loaded(configuration)
22-
try:
23-
model_list = await AsyncLlamaStackClientHolder().get_client().models.list()
24-
except (APIConnectionError, APIStatusError) as e:
25-
response = ServiceUnavailableResponse(backend_name="Llama Stack", cause=str(e))
26-
raise HTTPException(**response.model_dump()) from e
20+
model_list = await AsyncLlamaStackClientHolder().get_client().models.list()
2721

2822
models = [
2923
model

src/utils/degraded_mode.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from typing import Optional
88

9+
from metrics import recording
910
from utils.types import Singleton
1011

1112

@@ -31,11 +32,17 @@ def set_degraded(self, reason: str) -> None:
3132
self._is_degraded = True
3233
self._degraded_reason = reason
3334

35+
# Record startup state metric
36+
recording.set_started_in_degraded_mode(True)
37+
3438
def set_healthy(self) -> None:
3539
"""Mark the service as running in healthy mode."""
3640
self._is_degraded = False
3741
self._degraded_reason = None
3842

43+
# Record startup state metric
44+
recording.set_started_in_degraded_mode(False)
45+
3946
def is_degraded(self) -> bool:
4047
"""Check if the service is running in degraded mode.
4148

tests/unit/app/endpoints/test_metrics.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from fastapi import Request
55
from pytest_mock import MockerFixture
66

7+
import metrics # Import to register custom Prometheus metrics
78
from app.endpoints.metrics import metrics_endpoint_handler
89
from authentication.interface import AuthTuple
910
from tests.unit.utils.auth_helpers import mock_authorization_resolvers
@@ -14,10 +15,6 @@ async def test_metrics_endpoint(mocker: MockerFixture) -> None:
1415
"""Test the metrics endpoint handler."""
1516
mock_authorization_resolvers(mocker)
1617

17-
mock_setup_metrics = mocker.patch(
18-
"app.endpoints.metrics.setup_model_metrics",
19-
new=mocker.AsyncMock(return_value=None),
20-
)
2118
request = Request(
2219
scope={
2320
"type": "http",
@@ -34,8 +31,6 @@ async def test_metrics_endpoint(mocker: MockerFixture) -> None:
3431

3532
response_body = response.body.decode() # type: ignore
3633

37-
# Assert metrics were set up
38-
mock_setup_metrics.assert_called_once()
3934
# Check if the response contains Prometheus metrics format
4035
assert "# TYPE ls_rest_api_calls_total counter" in response_body
4136
assert "# TYPE ls_response_duration_seconds histogram" in response_body
@@ -45,3 +40,4 @@ async def test_metrics_endpoint(mocker: MockerFixture) -> None:
4540
assert "# TYPE ls_llm_validation_errors_total counter" in response_body
4641
assert "# TYPE ls_llm_token_sent_total counter" in response_body
4742
assert "# TYPE ls_llm_token_received_total counter" in response_body
43+
assert "# TYPE ls_started_in_degraded_mode gauge" in response_body

0 commit comments

Comments
 (0)