Skip to content

Commit d70548e

Browse files
committed
LCORE-1859: Enhance /readiness endpoint with degraded mode reporting
#1781 introduced "degraded mode support" - the ability to start lightspeed-stack and keep it running even when the llama-stack server might not be available. This PR adds comprehensive degraded mode status reporting to the /readiness endpoint while maintaining clean API boundaries and Kubernetes probe semantics.

- Enhanced HealthStatus enum with DEGRADED and UNHEALTHY service-level statuses while preserving provider-level statuses (OK, ERROR, NOT_IMPLEMENTED, UNKNOWN)
- Enhanced /readiness endpoint to return 200 (ready=true) in degraded mode following Kubernetes semantics; only returns 503 when truly unhealthy
- Refactored to avoid leaking implementation details in API responses:
  * Removed llama_stack field from ReadinessResponse
  * Removed Llama Stack version tracking from DegradedModeTracker
  * Focus on functional impacts rather than internal technology stack

This design keeps internal implementation details (Llama Stack) private while exposing clear functional impacts to API consumers.

Signed-off-by: Anik Bhattacharjee <anbhatta@redhat.com>
1 parent 570a66e commit d70548e

9 files changed

Lines changed: 384 additions & 45 deletions

File tree

docs/openapi.json

Lines changed: 67 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -9964,7 +9964,7 @@
99649964
"health"
99659965
],
99669966
"summary": "Readiness Probe Get Method",
9967-
"description": "Handle the readiness probe endpoint, returning service readiness.\n\nIf any provider reports an error status, responds with HTTP 503\nand details of unhealthy providers; otherwise, indicates the\nservice is ready.\n\n### Parameters:\n- response: The outgoing HTTP response (used by middleware).\n- auth: Authentication tuple from the auth dependency (used by middleware).\n\n### Raises:\n- HTTPException: with status 401 for unauthorized access.\n- HTTPException: with status 403 if permission is denied.\n- HTTPException: with status 500 and a detail object containing `response`\n and `cause` when service configuration is wrong or incomplete.\n- HTTPException: with status 503 and a detail object containing `response`\n and `cause` when unable to connect to Llama Stack.\n\n### Returns:\n- ReadinessResponse: Object with `ready` indicating overall readiness,\n `reason` explaining the outcome, and `providers` containing the list of\n unhealthy ProviderHealthStatus entries (empty when ready).",
9967+
"description": "Handle the readiness probe endpoint, returning service readiness and health status.\n\nReturns comprehensive health information including overall service status,\nprovider health, and functional impacts. The service is considered \"ready\" even\nin degraded mode (returns 200), but reports reduced functionality.\n\n### Parameters:\n- response: The outgoing HTTP response (used by middleware).\n- auth: Authentication tuple from the auth dependency (used by middleware).\n\n### Raises:\n- HTTPException: with status 401 for unauthorized access.\n- HTTPException: with status 403 if permission is denied.\n- HTTPException: with status 503 when service is unhealthy (providers down,\n models unavailable) and degraded mode is not enabled.\n\n### Returns:\n- ReadinessResponse: Object with comprehensive health status including:\n - ready: True if service can handle requests (even in degraded mode)\n - reason: Description of service state\n - overall_status: healthy, degraded, or unhealthy\n - impacts: Functional limitations when degraded/unhealthy\n - providers: List of unhealthy providers",
99689968
"operationId": "readiness_probe_get_method_readiness_get",
99699969
"responses": {
99709970
"200": {
@@ -9975,9 +9975,10 @@
99759975
"$ref": "#/components/schemas/ReadinessResponse"
99769976
},
99779977
"example": {
9978+
"overall_status": "healthy",
99789979
"providers": [],
99799980
"ready": true,
9980-
"reason": "Service is ready"
9981+
"reason": "All providers are healthy"
99819982
}
99829983
}
99839984
}
@@ -13485,6 +13486,20 @@
1348513486
"type": "object",
1348613487
"title": "HTTPValidationError"
1348713488
},
13489+
"HealthStatus": {
13490+
"type": "string",
13491+
"enum": [
13492+
"ok",
13493+
"Error",
13494+
"not_implemented",
13495+
"unknown",
13496+
"healthy",
13497+
"degraded",
13498+
"unhealthy"
13499+
],
13500+
"title": "HealthStatus",
13501+
"description": "Health status enum for provider and service health checks.\n\nThis enum serves two purposes:\n\n1. Provider-level health (returned by Llama Stack providers):\n - OK: Provider is healthy and operational\n - ERROR: Provider is unhealthy or failed health check\n - NOT_IMPLEMENTED: Provider does not implement health checks\n - UNKNOWN: Fallback when provider status cannot be determined\n\n2. Service-level health (overall LCORE status):\n - HEALTHY: All systems operational, LLS connected, all providers healthy\n - DEGRADED: Service running with reduced functionality (e.g., LLS unavailable)\n - UNHEALTHY: Service connected but one or more providers are unhealthy"
13502+
},
1348813503
"ImplicitOAuthFlow": {
1348913504
"properties": {
1349013505
"authorizationUrl": {
@@ -16888,7 +16903,7 @@
1688816903
"description": "Optional message about the health status",
1688916904
"examples": [
1689016905
"All systems operational",
16891-
"Llama Stack is unavailable"
16906+
"Provider is unavailable"
1689216907
]
1689316908
}
1689416909
},
@@ -17866,7 +17881,7 @@
1786617881
"ready": {
1786717882
"type": "boolean",
1786817883
"title": "Ready",
17869-
"description": "Flag indicating if service is ready",
17884+
"description": "Flag indicating if service is ready to handle requests",
1787017885
"examples": [
1787117886
true,
1787217887
false
@@ -17875,34 +17890,78 @@
1787517890
"reason": {
1787617891
"type": "string",
1787717892
"title": "Reason",
17878-
"description": "The reason for the readiness",
17893+
"description": "The reason for the readiness status",
1787917894
"examples": [
1788017895
"Service is ready"
1788117896
]
1788217897
},
17898+
"overall_status": {
17899+
"$ref": "#/components/schemas/HealthStatus",
17900+
"description": "Overall service health status",
17901+
"examples": [
17902+
"healthy",
17903+
"degraded",
17904+
"unhealthy"
17905+
]
17906+
},
17907+
"impacts": {
17908+
"anyOf": [
17909+
{
17910+
"items": {
17911+
"type": "string"
17912+
},
17913+
"type": "array"
17914+
},
17915+
{
17916+
"type": "null"
17917+
}
17918+
],
17919+
"title": "Impacts",
17920+
"description": "List of functional impacts when service is degraded or unhealthy",
17921+
"examples": [
17922+
[
17923+
"LLM inference unavailable",
17924+
"RAG functionality unavailable",
17925+
"Agent tools unavailable"
17926+
]
17927+
]
17928+
},
1788317929
"providers": {
1788417930
"items": {
1788517931
"$ref": "#/components/schemas/ProviderHealthStatus"
1788617932
},
1788717933
"type": "array",
1788817934
"title": "Providers",
17889-
"description": "List of unhealthy providers in case of readiness failure.",
17935+
"description": "List of unhealthy providers (empty when all healthy)",
1789017936
"examples": []
1789117937
}
1789217938
},
1789317939
"type": "object",
1789417940
"required": [
1789517941
"ready",
1789617942
"reason",
17943+
"overall_status",
1789717944
"providers"
1789817945
],
1789917946
"title": "ReadinessResponse",
17900-
"description": "Model representing response to a readiness request.\n\nAttributes:\n ready: If service is ready.\n reason: The reason for the readiness.\n providers: List of unhealthy providers in case of readiness failure.",
17947+
"description": "Model representing response to a readiness request.\n\nAttributes:\n ready: If service is ready to handle requests.\n reason: The reason for the readiness status.\n overall_status: Overall service health status (healthy/degraded/unhealthy).\n impacts: Optional list of functional impacts when degraded or unhealthy.\n providers: List of unhealthy providers (empty when all healthy).",
1790117948
"examples": [
1790217949
{
17950+
"overall_status": "healthy",
17951+
"providers": [],
17952+
"ready": true,
17953+
"reason": "All providers are healthy"
17954+
},
17955+
{
17956+
"impacts": [
17957+
"LLM inference unavailable",
17958+
"RAG functionality unavailable",
17959+
"Agent tools unavailable"
17960+
],
17961+
"overall_status": "degraded",
1790317962
"providers": [],
1790417963
"ready": true,
17905-
"reason": "Service is ready"
17964+
"reason": "Service running in degraded mode"
1790617965
}
1790717966
]
1790817967
},

src/app/endpoints/health.py

Lines changed: 68 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,12 @@
2626
LivenessResponse,
2727
ReadinessResponse,
2828
)
29-
from models.common import HealthStatus, ProviderHealthStatus
29+
from models.common import (
30+
HealthStatus,
31+
ProviderHealthStatus,
32+
)
3033
from models.config import Action
34+
from utils.degraded_mode import DegradedModeTracker
3135

3236
logger = get_logger(__name__)
3337
router = APIRouter(tags=["health"])
@@ -117,11 +121,11 @@ async def readiness_probe_get_method(
117121
response: Response,
118122
) -> ReadinessResponse:
119123
"""
120-
Handle the readiness probe endpoint, returning service readiness.
124+
Handle the readiness probe endpoint, returning service readiness and health status.
121125
122-
If any provider reports an error status, responds with HTTP 503
123-
and details of unhealthy providers; otherwise, indicates the
124-
service is ready.
126+
Returns comprehensive health information including overall service status,
127+
provider health, and functional impacts. The service is considered "ready" even
128+
in degraded mode (returns 200), but reports reduced functionality.
125129
126130
### Parameters:
127131
- response: The outgoing HTTP response (used by middleware).
@@ -130,47 +134,90 @@ async def readiness_probe_get_method(
130134
### Raises:
131135
- HTTPException: with status 401 for unauthorized access.
132136
- HTTPException: with status 403 if permission is denied.
133-
- HTTPException: with status 500 and a detail object containing `response`
134-
and `cause` when service configuration is wrong or incomplete.
135-
- HTTPException: with status 503 and a detail object containing `response`
136-
and `cause` when unable to connect to Llama Stack.
137+
- HTTPException: with status 503 when service is unhealthy (providers down,
138+
models unavailable) and degraded mode is not enabled.
137139
138140
### Returns:
139-
- ReadinessResponse: Object with `ready` indicating overall readiness,
140-
`reason` explaining the outcome, and `providers` containing the list of
141-
unhealthy ProviderHealthStatus entries (empty when ready).
141+
- ReadinessResponse: Object with comprehensive health status including:
142+
- ready: True if service can handle requests (even in degraded mode)
143+
- reason: Description of service state
144+
- overall_status: healthy, degraded, or unhealthy
145+
- impacts: Functional limitations when degraded/unhealthy
146+
- providers: List of unhealthy providers
142147
"""
143148
# Used only for authorization
144149
_ = auth
145150

146-
logger.info("Response to /v1/readiness endpoint")
151+
logger.info("Response to /readiness endpoint")
147152

148-
provider_statuses = await get_providers_health_statuses()
153+
degraded_tracker = DegradedModeTracker()
154+
is_degraded = degraded_tracker.is_degraded()
149155

150-
# Check if any provider is unhealthy (not counting not_implemented as unhealthy)
156+
# Determine overall status
157+
if is_degraded:
158+
# Service is ready (can serve health checks, metrics, etc.) but degraded
159+
impacts = [
160+
"LLM inference unavailable",
161+
"RAG functionality unavailable",
162+
"Agent tools unavailable",
163+
]
164+
return ReadinessResponse(
165+
ready=True,
166+
reason="Service running in degraded mode",
167+
overall_status=HealthStatus.DEGRADED,
168+
impacts=impacts,
169+
providers=[],
170+
)
171+
172+
# Not in degraded mode - check provider health
173+
provider_statuses = await get_providers_health_statuses()
151174
unhealthy_providers = [
152175
p for p in provider_statuses if p.status == HealthStatus.ERROR.value
153176
]
154177

155178
if unhealthy_providers:
156-
ready = False
157-
unhealthy_provider_names = [p.provider_id for p in unhealthy_providers]
158-
reason = f"Providers not healthy: {', '.join(unhealthy_provider_names)}"
179+
# Check if this is a connection error (provider_id="unknown")
180+
is_connection_error = any(p.provider_id == "unknown" for p in unhealthy_providers)
181+
182+
if is_connection_error:
183+
reason = "Cannot connect to backend service"
184+
impacts = ["LLM inference unavailable", "Provider health checks unavailable"]
185+
else:
186+
unhealthy_provider_names = [p.provider_id for p in unhealthy_providers]
187+
reason = f"Providers not healthy: {', '.join(unhealthy_provider_names)}"
188+
impacts = [
189+
f"Provider {p.provider_id}: {p.message}"
190+
for p in unhealthy_providers
191+
]
192+
159193
response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
160194
return ReadinessResponse(
161-
ready=ready, reason=reason, providers=unhealthy_providers
195+
ready=False,
196+
reason=reason,
197+
overall_status=HealthStatus.UNHEALTHY,
198+
impacts=impacts,
199+
providers=unhealthy_providers if not is_connection_error else [],
162200
)
163201

164202
# Check that the default model is registered in the model registry
165203
model_available, model_reason = await check_default_model_available()
166204
if not model_available:
167205
response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
168206
return ReadinessResponse(
169-
ready=False, reason=model_reason, providers=unhealthy_providers
207+
ready=False,
208+
reason=model_reason,
209+
overall_status=HealthStatus.UNHEALTHY,
210+
impacts=["Default model not available in registry"],
211+
providers=[],
170212
)
171213

214+
# All healthy
172215
return ReadinessResponse(
173-
ready=True, reason="All providers are healthy", providers=unhealthy_providers
216+
ready=True,
217+
reason="All providers are healthy",
218+
overall_status=HealthStatus.HEALTHY,
219+
impacts=None,
220+
providers=[],
174221
)
175222

176223

src/app/main.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from models.api.responses.error import InternalServerErrorResponse
2727
from sentry import initialize_sentry
2828
from utils.common import register_mcp_servers_async
29+
from utils.degraded_mode import DegradedModeTracker
2930
from utils.llama_stack_version import check_llama_stack_version
3031

3132
logger = get_logger(__name__)
@@ -81,15 +82,19 @@ async def lifespan(_app: FastAPI) -> AsyncIterator[None]:
8182
await AsyncLlamaStackClientHolder().load(llama_stack_config)
8283
client: AsyncLlamaStackClient = AsyncLlamaStackClientHolder().get_client()
8384
logger.debug("Llama Stack client initialized, trying to connect to Llama Stack")
84-
# check if the Llama Stack version is supported by the service
85+
# Check connectivity to Llama Stack and set degraded mode if unavailable
86+
degraded_tracker = DegradedModeTracker()
8587
try:
8688
llama_stack_version = await check_llama_stack_version(
8789
client, llama_stack_config.max_retries, llama_stack_config.retry_delay
8890
)
8991
if llama_stack_version is None:
9092
logger.error("Cannot retrieve Llama Stack version, check connection")
93+
if llama_stack_config.allow_degraded_mode:
94+
degraded_tracker.set_degraded("Llama Stack connection check failed")
9195
else:
9296
logger.debug("Llama Stack version: %s", llama_stack_version)
97+
degraded_tracker.set_healthy()
9398
except APIConnectionError as e:
9499
# if degraded mode is allowed, simply ignore the exception
95100
llama_stack_url = llama_stack_config.url
@@ -103,6 +108,7 @@ async def lifespan(_app: FastAPI) -> AsyncIterator[None]:
103108
)
104109
if llama_stack_config.allow_degraded_mode:
105110
logger.info("Entering degraded mode: LCORE running w/o Llama Stack")
111+
degraded_tracker.set_degraded(f"Failed to connect to Llama Stack: {e!s}")
106112
else:
107113
raise
108114

0 commit comments

Comments
 (0)