lightspeed-core
diff --git a/docs/openapi.json b/docs/openapi.json
Lines changed: 67 additions & 8 deletions
diff --git a/src/app/endpoints/health.py b/src/app/endpoints/health.py
Lines changed: 68 additions & 21 deletions
diff --git a/src/app/main.py b/src/app/main.py
Lines changed: 7 additions & 1 deletion
@@ -9964,7 +9964,7 @@
                     "health"
                 ],
                 "summary": "Readiness Probe Get Method",
-                "description": "Handle the readiness probe endpoint, returning service readiness.\n\nIf any provider reports an error status, responds with HTTP 503\nand details of unhealthy providers; otherwise, indicates the\nservice is ready.\n\n### Parameters:\n- response: The outgoing HTTP response (used by middleware).\n- auth: Authentication tuple from the auth dependency (used by middleware).\n\n### Raises:\n- HTTPException: with status 401 for unauthorized access.\n- HTTPException: with status 403 if permission is denied.\n- HTTPException: with status 500 and a detail object containing `response`\n  and `cause` when service configuration is wrong or incomplete.\n- HTTPException: with status 503 and a detail object containing `response`\n  and `cause` when unable to connect to Llama Stack.\n\n### Returns:\n- ReadinessResponse: Object with `ready` indicating overall readiness,\n  `reason` explaining the outcome, and `providers` containing the list of\n  unhealthy ProviderHealthStatus entries (empty when ready).",
+                "description": "Handle the readiness probe endpoint, returning service readiness and health status.\n\nReturns comprehensive health information including overall service status,\nprovider health, and functional impacts. The service is considered \"ready\" even\nin degraded mode (returns 200), but reports reduced functionality.\n\n### Parameters:\n- response: The outgoing HTTP response (used by middleware).\n- auth: Authentication tuple from the auth dependency (used by middleware).\n\n### Raises:\n- HTTPException: with status 401 for unauthorized access.\n- HTTPException: with status 403 if permission is denied.\n- HTTPException: with status 503 when service is unhealthy (providers down,\n  models unavailable) and degraded mode is not enabled.\n\n### Returns:\n- ReadinessResponse: Object with comprehensive health status including:\n  - ready: True if service can handle requests (even in degraded mode)\n  - reason: Description of service state\n  - overall_status: healthy, degraded, or unhealthy\n  - impacts: Functional limitations when degraded/unhealthy\n  - providers: List of unhealthy providers",
                 "operationId": "readiness_probe_get_method_readiness_get",
                 "responses": {
                     "200": {
@@ -9975,9 +9975,10 @@
                                     "$ref": "#/components/schemas/ReadinessResponse"
                                 },
                                 "example": {
+                                    "overall_status": "healthy",
                                     "providers": [],
                                     "ready": true,
-                                    "reason": "Service is ready"
+                                    "reason": "All providers are healthy"
                                 }
                             }
                         }
@@ -13485,6 +13486,20 @@
                 "type": "object",
                 "title": "HTTPValidationError"
             },
+            "HealthStatus": {
+                "type": "string",
+                "enum": [
+                    "ok",
+                    "Error",
+                    "not_implemented",
+                    "unknown",
+                    "healthy",
+                    "degraded",
+                    "unhealthy"
+                ],
+                "title": "HealthStatus",
+                "description": "Health status enum for provider and service health checks.\n\nThis enum serves two purposes:\n\n1. Provider-level health (returned by Llama Stack providers):\n   - OK: Provider is healthy and operational\n   - ERROR: Provider is unhealthy or failed health check\n   - NOT_IMPLEMENTED: Provider does not implement health checks\n   - UNKNOWN: Fallback when provider status cannot be determined\n\n2. Service-level health (overall LCORE status):\n   - HEALTHY: All systems operational, LLS connected, all providers healthy\n   - DEGRADED: Service running with reduced functionality (e.g., LLS unavailable)\n   - UNHEALTHY: Service connected but one or more providers are unhealthy"
+            },
             "ImplicitOAuthFlow": {
                 "properties": {
                     "authorizationUrl": {
@@ -16888,7 +16903,7 @@
                         "description": "Optional message about the health status",
                         "examples": [
                             "All systems operational",
-                            "Llama Stack is unavailable"
+                            "Provider is unavailable"
                         ]
                     }
                 },
@@ -17866,7 +17881,7 @@
                     "ready": {
                         "type": "boolean",
                         "title": "Ready",
-                        "description": "Flag indicating if service is ready",
+                        "description": "Flag indicating if service is ready to handle requests",
                         "examples": [
                             true,
                             false
@@ -17875,34 +17890,78 @@
                     "reason": {
                         "type": "string",
                         "title": "Reason",
-                        "description": "The reason for the readiness",
+                        "description": "The reason for the readiness status",
                         "examples": [
                             "Service is ready"
                         ]
                     },
+                    "overall_status": {
+                        "$ref": "#/components/schemas/HealthStatus",
+                        "description": "Overall service health status",
+                        "examples": [
+                            "healthy",
+                            "degraded",
+                            "unhealthy"
+                        ]
+                    },
+                    "impacts": {
+                        "anyOf": [
+                            {
+                                "items": {
+                                    "type": "string"
+                                },
+                                "type": "array"
+                            },
+                            {
+                                "type": "null"
+                            }
+                        ],
+                        "title": "Impacts",
+                        "description": "List of functional impacts when service is degraded or unhealthy",
+                        "examples": [
+                            [
+                                "LLM inference unavailable",
+                                "RAG functionality unavailable",
+                                "Agent tools unavailable"
+                            ]
+                        ]
+                    },
                     "providers": {
                         "items": {
                             "$ref": "#/components/schemas/ProviderHealthStatus"
                         },
                         "type": "array",
                         "title": "Providers",
-                        "description": "List of unhealthy providers in case of readiness failure.",
+                        "description": "List of unhealthy providers (empty when all healthy)",
                         "examples": []
                     }
                 },
                 "type": "object",
                 "required": [
                     "ready",
                     "reason",
+                    "overall_status",
                     "providers"
                 ],
                 "title": "ReadinessResponse",
-                "description": "Model representing response to a readiness request.\n\nAttributes:\n    ready: If service is ready.\n    reason: The reason for the readiness.\n    providers: List of unhealthy providers in case of readiness failure.",
+                "description": "Model representing response to a readiness request.\n\nAttributes:\n    ready: If service is ready to handle requests.\n    reason: The reason for the readiness status.\n    overall_status: Overall service health status (healthy/degraded/unhealthy).\n    impacts: Optional list of functional impacts when degraded or unhealthy.\n    providers: List of unhealthy providers (empty when all healthy).",
                 "examples": [
                     {
+                        "overall_status": "healthy",
+                        "providers": [],
+                        "ready": true,
+                        "reason": "All providers are healthy"
+                    },
+                    {
+                        "impacts": [
+                            "LLM inference unavailable",
+                            "RAG functionality unavailable",
+                            "Agent tools unavailable"
+                        ],
+                        "overall_status": "degraded",
                         "providers": [],
                         "ready": true,
-                        "reason": "Service is ready"
+                        "reason": "Service running in degraded mode"
                     }
                 ]
             },
 
@@ -26,8 +26,12 @@
     LivenessResponse,
     ReadinessResponse,
 )
-from models.common import HealthStatus, ProviderHealthStatus
+from models.common import (
+    HealthStatus,
+    ProviderHealthStatus,
+)
 from models.config import Action
+from utils.degraded_mode import DegradedModeTracker
 
 logger = get_logger(__name__)
 router = APIRouter(tags=["health"])
@@ -117,11 +121,11 @@ async def readiness_probe_get_method(
     response: Response,
 ) -> ReadinessResponse:
     """
-    Handle the readiness probe endpoint, returning service readiness.
+    Handle the readiness probe endpoint, returning service readiness and health status.
 
-    If any provider reports an error status, responds with HTTP 503
-    and details of unhealthy providers; otherwise, indicates the
-    service is ready.
+    Returns comprehensive health information including overall service status,
+    provider health, and functional impacts. The service is considered "ready" even
+    in degraded mode (returns 200), but reports reduced functionality.
 
     ### Parameters:
     - response: The outgoing HTTP response (used by middleware).
@@ -130,47 +134,90 @@ async def readiness_probe_get_method(
     ### Raises:
     - HTTPException: with status 401 for unauthorized access.
     - HTTPException: with status 403 if permission is denied.
-    - HTTPException: with status 500 and a detail object containing `response`
-      and `cause` when service configuration is wrong or incomplete.
-    - HTTPException: with status 503 and a detail object containing `response`
-      and `cause` when unable to connect to Llama Stack.
+    - HTTPException: with status 503 when service is unhealthy (providers down,
+      default model unavailable) and the service is not running in degraded mode.
 
     ### Returns:
-    - ReadinessResponse: Object with `ready` indicating overall readiness,
-      `reason` explaining the outcome, and `providers` containing the list of
-      unhealthy ProviderHealthStatus entries (empty when ready).
+    - ReadinessResponse: Object with comprehensive health status including:
+      - ready: True if service can handle requests (even in degraded mode)
+      - reason: Description of service state
+      - overall_status: healthy, degraded, or unhealthy
+      - impacts: Functional limitations when degraded/unhealthy
+      - providers: List of unhealthy providers
     """
     # Used only for authorization
     _ = auth
 
-    logger.info("Response to /v1/readiness endpoint")
+    logger.info("Response to /readiness endpoint")
 
-    provider_statuses = await get_providers_health_statuses()
+    degraded_tracker = DegradedModeTracker()
+    is_degraded = degraded_tracker.is_degraded()
 
-    # Check if any provider is unhealthy (not counting not_implemented as unhealthy)
+    # Determine overall status
+    if is_degraded:
+        # Service is ready (can serve health checks, metrics, etc.) but degraded
+        impacts = [
+            "LLM inference unavailable",
+            "RAG functionality unavailable",
+            "Agent tools unavailable",
+        ]
+        return ReadinessResponse(
+            ready=True,
+            reason="Service running in degraded mode",
+            overall_status=HealthStatus.DEGRADED,
+            impacts=impacts,
+            providers=[],
+        )
+
+    # Not in degraded mode - check provider health
+    provider_statuses = await get_providers_health_statuses()
     unhealthy_providers = [
         p for p in provider_statuses if p.status == HealthStatus.ERROR.value
     ]
 
     if unhealthy_providers:
-        ready = False
-        unhealthy_provider_names = [p.provider_id for p in unhealthy_providers]
-        reason = f"Providers not healthy: {', '.join(unhealthy_provider_names)}"
+        # Check if this is a connection error (provider_id="unknown")
+        is_connection_error = any(p.provider_id == "unknown" for p in unhealthy_providers)
+
+        if is_connection_error:
+            reason = "Cannot connect to backend service"
+            impacts = ["LLM inference unavailable", "Provider health checks unavailable"]
+        else:
+            unhealthy_provider_names = [p.provider_id for p in unhealthy_providers]
+            reason = f"Providers not healthy: {', '.join(unhealthy_provider_names)}"
+            impacts = [
+                f"Provider {p.provider_id}: {p.message}"
+                for p in unhealthy_providers
+            ]
+
         response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
         return ReadinessResponse(
-            ready=ready, reason=reason, providers=unhealthy_providers
+            ready=False,
+            reason=reason,
+            overall_status=HealthStatus.UNHEALTHY,
+            impacts=impacts,
+            providers=unhealthy_providers if not is_connection_error else [],
         )
 
     # Check that the default model is registered in the model registry
     model_available, model_reason = await check_default_model_available()
     if not model_available:
         response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
         return ReadinessResponse(
-            ready=False, reason=model_reason, providers=unhealthy_providers
+            ready=False,
+            reason=model_reason,
+            overall_status=HealthStatus.UNHEALTHY,
+            impacts=["Default model not available in registry"],
+            providers=[],
         )
 
+    # All healthy
     return ReadinessResponse(
-        ready=True, reason="All providers are healthy", providers=unhealthy_providers
+        ready=True,
+        reason="All providers are healthy",
+        overall_status=HealthStatus.HEALTHY,
+        impacts=None,
+        providers=[],
     )
 
 
 
@@ -26,6 +26,7 @@
 from models.api.responses.error import InternalServerErrorResponse
 from sentry import initialize_sentry
 from utils.common import register_mcp_servers_async
+from utils.degraded_mode import DegradedModeTracker
 from utils.llama_stack_version import check_llama_stack_version
 
 logger = get_logger(__name__)
@@ -81,15 +82,19 @@ async def lifespan(_app: FastAPI) -> AsyncIterator[None]:
     await AsyncLlamaStackClientHolder().load(llama_stack_config)
     client: AsyncLlamaStackClient = AsyncLlamaStackClientHolder().get_client()
     logger.debug("Llama Stack client initialized, trying to connect to Llama Stack")
-    # check if the Llama Stack version is supported by the service
+    # Check the Llama Stack version; if it cannot be retrieved and degraded mode
+    # is allowed, record the degraded state so the readiness probe can report it
+    degraded_tracker = DegradedModeTracker()
     try:
         llama_stack_version = await check_llama_stack_version(
             client, llama_stack_config.max_retries, llama_stack_config.retry_delay
         )
         if llama_stack_version is None:
             logger.error("Cannot retrieve Llama Stack version, check connection")
+            if llama_stack_config.allow_degraded_mode:
+                degraded_tracker.set_degraded("Llama Stack connection check failed")
         else:
             logger.debug("Llama Stack version: %s", llama_stack_version)
+            degraded_tracker.set_healthy()
     except APIConnectionError as e:
         # if degraded mode is allowed, simply ignore the exception
         llama_stack_url = llama_stack_config.url
@@ -103,6 +108,7 @@ async def lifespan(_app: FastAPI) -> AsyncIterator[None]:
         )
         if llama_stack_config.allow_degraded_mode:
             logger.info("Entering degraded mode: LCORE running w/o Llama Stack")
+            degraded_tracker.set_degraded(f"Failed to connect to Llama Stack: {e!s}")
         else:
             raise