From 359b19e3f930f0e489b3743e82e9c93dffd9325c Mon Sep 17 00:00:00 2001
From: Anik Bhattacharjee <anbhatta@redhat.com>
Date: Tue, 23 Jun 2026 17:13:05 -0400
Subject: [PATCH] LCORE-1857: Add metrics to track degraded mode startup state

Add a Prometheus gauge, ls_started_in_degraded_mode, to monitor when
Lightspeed Core Stack starts in degraded mode (without llama-stack
connectivity). During implementation, it was discovered that PR #327's
lazy initialization pattern prevented model metrics from being set up
correctly, requiring a revert.

When LCORE starts in degraded mode, the @run_once_async decorator marks
setup_model_metrics() as "done" after it returns early. On restart with
llama-stack available, the function never runs again, leaving model
metrics uninitialized. Moving the setup back to startup fixes this, and
degraded mode already provides the startup resilience that PR #327
originally addressed.

Signed-off-by: Anik Bhattacharjee <anbhatta@redhat.com>
---
 docs/openapi.json                        |  2 +-
 src/app/endpoints/metrics.py             | 10 +---------
 src/app/main.py                          |  9 +++++++++
 src/metrics/__init__.py                  |  6 ++++++
 src/metrics/recording.py                 | 15 +++++++++++++++
 src/metrics/utils.py                     | 18 ++++++------------
 src/utils/degraded_mode.py               |  7 +++++++
 tests/unit/app/endpoints/test_metrics.py |  8 ++------
 8 files changed, 47 insertions(+), 28 deletions(-)

diff --git a/docs/openapi.json b/docs/openapi.json
index 1f2eb46ab..0ea650f2d 100644
--- a/docs/openapi.json
+++ b/docs/openapi.json
@@ -10400,7 +10400,7 @@
                     "metrics"
                 ],
                 "summary": "Metrics Endpoint Handler",
-                "description": "Handle request to the /metrics endpoint.\n\nProcess GET requests to the /metrics endpoint, returning the\nlatest Prometheus metrics in form of a plain text.\n\nInitializes model metrics on the first request if not already\nset up, then responds with the current metrics snapshot in\nPrometheus format.\n\n### Parameters:\n- request: The incoming HTTP request (used by middleware).\n- auth: Authentication tuple from the auth dependency (used by middleware).\n\n### Returns:\n- PlainTextResponse: Response body containing the Prometheus metrics text\n  and the Prometheus content type.",
+                "description": "Handle request to the /metrics endpoint.\n\nProcess GET requests to the /metrics endpoint, returning the\nlatest Prometheus metrics in plain text Prometheus format.\n\n### Parameters:\n- request: The incoming HTTP request (used by middleware).\n- auth: Authentication tuple from the auth dependency (used by middleware).\n\n### Returns:\n- PlainTextResponse: Response body containing the Prometheus metrics text\n  and the Prometheus content type.",
                 "operationId": "metrics_endpoint_handler_metrics_get",
                 "responses": {
                     "200": {
diff --git a/src/app/endpoints/metrics.py b/src/app/endpoints/metrics.py
index c33a6866c..4b44799c3 100644
--- a/src/app/endpoints/metrics.py
+++ b/src/app/endpoints/metrics.py
@@ -12,7 +12,6 @@
 from authentication import get_auth_dependency
 from authentication.interface import AuthTuple
 from authorization.middleware import authorize
-from metrics.utils import setup_model_metrics
 from models.api.responses.constants import UNAUTHORIZED_OPENAPI_EXAMPLES
 from models.api.responses.error import (
     ForbiddenResponse,
@@ -47,11 +46,7 @@ async def metrics_endpoint_handler(
     Handle request to the /metrics endpoint.
 
     Process GET requests to the /metrics endpoint, returning the
-    latest Prometheus metrics in form of a plain text.
-
-    Initializes model metrics on the first request if not already
-    set up, then responds with the current metrics snapshot in
-    Prometheus format.
+    latest Prometheus metrics in plain text Prometheus format.
 
     ### Parameters:
     - request: The incoming HTTP request (used by middleware).
@@ -67,7 +62,4 @@ async def metrics_endpoint_handler(
     # Nothing interesting in the request
     _ = request
 
-    # Setup the model metrics if not already done. This is a one-time setup
-    # and will not be run again on subsequent calls to this endpoint
-    await setup_model_metrics()
     return PlainTextResponse(generate_latest(), media_type=str(CONTENT_TYPE_LATEST))
diff --git a/src/app/main.py b/src/app/main.py
index 41466e143..e54c730bc 100644
--- a/src/app/main.py
+++ b/src/app/main.py
@@ -23,6 +23,7 @@
 from configuration import configuration
 from log import get_logger
 from metrics import recording
+from metrics.utils import setup_model_metrics
 from models.api.responses.error import InternalServerErrorResponse
 from sentry import initialize_sentry
 from utils.common import register_mcp_servers_async
@@ -119,6 +120,14 @@ async def lifespan(_app: FastAPI) -> AsyncIterator[None]:
         AzureEntraIDManager().set_base_url(azure_base_url)
     logger.info("Registering MCP servers")
     await register_mcp_servers_async(logger, configuration.configuration)
+
+    # Set up model metrics when healthy. NOTE(review): APIStatusError is uncaught
+    if not degraded_tracker.is_degraded():
+        try:
+            await setup_model_metrics()
+        except APIConnectionError as e:
+            logger.warning("Failed to set up model metrics: %s", e, exc_info=True)
+
     logger.info("App startup complete")
 
     initialize_database()
diff --git a/src/metrics/__init__.py b/src/metrics/__init__.py
index 63d7b8e29..ae6496501 100644
--- a/src/metrics/__init__.py
+++ b/src/metrics/__init__.py
@@ -82,3 +82,9 @@
     ["provider", "model", "endpoint", "result"],
     buckets=LLM_INFERENCE_DURATION_BUCKETS,
 )
+
+# Gauge for the startup state; updated via recording.set_started_in_degraded_mode
+started_in_degraded_mode = Gauge(
+    "ls_started_in_degraded_mode",
+    "Indicates if service started in degraded mode (1 = degraded, 0 = healthy)",
+)
diff --git a/src/metrics/recording.py b/src/metrics/recording.py
index a9b35d208..c41c105b5 100644
--- a/src/metrics/recording.py
+++ b/src/metrics/recording.py
@@ -157,3 +157,18 @@ def record_llm_inference_duration(
         ).observe(duration)
     except (AttributeError, TypeError, ValueError):
         logger.warning("Failed to update LLM inference duration metric", exc_info=True)
+
+
+def set_started_in_degraded_mode(is_degraded: bool) -> None:
+    """Set the startup degraded mode gauge.
+
+    Stores 1 (degraded) or 0 (healthy); each call overwrites the gauge, so the
+    value only reflects startup state if callers stop calling it afterwards.
+
+    Args:
+        is_degraded: True if service started in degraded mode, False if healthy.
+    """
+    try:
+        metrics.started_in_degraded_mode.set(1 if is_degraded else 0)
+    except (AttributeError, TypeError, ValueError):
+        logger.warning("Failed to update started_in_degraded_mode gauge", exc_info=True)
diff --git a/src/metrics/utils.py b/src/metrics/utils.py
index 806e7a336..afb832d29 100644
--- a/src/metrics/utils.py
+++ b/src/metrics/utils.py
@@ -1,29 +1,23 @@
 """Utility functions for metrics handling."""
 
-from fastapi import HTTPException
-from llama_stack_client import APIConnectionError, APIStatusError
-
 import metrics
 from client import AsyncLlamaStackClientHolder
 from configuration import configuration
 from log import get_logger
-from models.api.responses.error import ServiceUnavailableResponse
-from utils.common import run_once_async
 from utils.endpoints import check_configuration_loaded
 
 logger = get_logger(__name__)
 
 
-@run_once_async
 async def setup_model_metrics() -> None:
-    """Perform setup of all metrics related to LLM model and provider."""
+    """Perform setup of all metrics related to LLM model and provider.
+
+    Should be called during startup when service is in healthy mode.
+    Skipped in degraded mode to avoid blocking on unavailable llama-stack.
+    """
     logger.info("Setting up model metrics")
     check_configuration_loaded(configuration)
-    try:
-        model_list = await AsyncLlamaStackClientHolder().get_client().models.list()
-    except (APIConnectionError, APIStatusError) as e:
-        response = ServiceUnavailableResponse(backend_name="Llama Stack", cause=str(e))
-        raise HTTPException(**response.model_dump()) from e
+    model_list = await AsyncLlamaStackClientHolder().get_client().models.list()
 
     models = [
         model
diff --git a/src/utils/degraded_mode.py b/src/utils/degraded_mode.py
index 661272da7..f40a480fc 100644
--- a/src/utils/degraded_mode.py
+++ b/src/utils/degraded_mode.py
@@ -6,6 +6,7 @@
 
 from typing import Optional
 
+from metrics import recording
 from utils.types import Singleton
 
 
@@ -31,11 +32,17 @@ def set_degraded(self, reason: str) -> None:
         self._is_degraded = True
         self._degraded_reason = reason
 
+        # Record startup state metric
+        recording.set_started_in_degraded_mode(True)
+
     def set_healthy(self) -> None:
         """Mark the service as running in healthy mode."""
         self._is_degraded = False
         self._degraded_reason = None
 
+        # Record startup state metric (NOTE(review): confirm startup-only callers)
+        recording.set_started_in_degraded_mode(False)
+
     def is_degraded(self) -> bool:
         """Check if the service is running in degraded mode.
 
diff --git a/tests/unit/app/endpoints/test_metrics.py b/tests/unit/app/endpoints/test_metrics.py
index 5cebc0529..bf826e5a1 100644
--- a/tests/unit/app/endpoints/test_metrics.py
+++ b/tests/unit/app/endpoints/test_metrics.py
@@ -4,6 +4,7 @@
 from fastapi import Request
 from pytest_mock import MockerFixture
 
+import metrics  # noqa: F401 pylint: disable=unused-import
 from app.endpoints.metrics import metrics_endpoint_handler
 from authentication.interface import AuthTuple
 from tests.unit.utils.auth_helpers import mock_authorization_resolvers
@@ -14,10 +15,6 @@ async def test_metrics_endpoint(mocker: MockerFixture) -> None:
     """Test the metrics endpoint handler."""
     mock_authorization_resolvers(mocker)
 
-    mock_setup_metrics = mocker.patch(
-        "app.endpoints.metrics.setup_model_metrics",
-        new=mocker.AsyncMock(return_value=None),
-    )
     request = Request(
         scope={
             "type": "http",
@@ -34,8 +31,6 @@ async def test_metrics_endpoint(mocker: MockerFixture) -> None:
 
     response_body = response.body.decode()  # type: ignore
 
-    # Assert metrics were set up
-    mock_setup_metrics.assert_called_once()
     # Check if the response contains Prometheus metrics format
     assert "# TYPE ls_rest_api_calls_total counter" in response_body
     assert "# TYPE ls_response_duration_seconds histogram" in response_body
@@ -45,3 +40,4 @@ async def test_metrics_endpoint(mocker: MockerFixture) -> None:
     assert "# TYPE ls_llm_validation_errors_total counter" in response_body
     assert "# TYPE ls_llm_token_sent_total counter" in response_body
     assert "# TYPE ls_llm_token_received_total counter" in response_body
+    assert "# TYPE ls_started_in_degraded_mode gauge" in response_body