Skip to content

Commit 9bd1ce3

Browse files
committed
LCORE-1859: Enhance /readiness endpoint with degraded mode reporting
#1781 introduced "degraded mode support": the ability to start lightspeed-stack and keep it running even when the llama-stack server might not be available.

This PR adds comprehensive degraded mode status reporting to the /readiness endpoint while maintaining clean API boundaries and Kubernetes probe semantics.

- Enhanced the HealthStatus enum with DEGRADED and UNHEALTHY service-level statuses while preserving provider-level statuses (OK, ERROR, NOT_IMPLEMENTED, UNKNOWN)
- Enhanced the /readiness endpoint to return 200 (ready=true) in degraded mode, following Kubernetes semantics; it only returns 503 when truly unhealthy
- Refactored to avoid leaking implementation details in API responses:
  * Removed llama_stack field from ReadinessResponse
  * Removed Llama Stack version tracking from DegradedModeTracker
  * Focus on functional impacts rather than internal technology stack

This design keeps internal implementation details (Llama Stack) private while exposing clear functional impacts to API consumers.

Signed-off-by: Anik Bhattacharjee <anbhatta@redhat.com>
1 parent 570a66e commit 9bd1ce3

8 files changed

Lines changed: 307 additions & 35 deletions

File tree

src/app/endpoints/health.py

Lines changed: 58 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,12 @@
2626
LivenessResponse,
2727
ReadinessResponse,
2828
)
29-
from models.common import HealthStatus, ProviderHealthStatus
29+
from models.common import (
30+
HealthStatus,
31+
ProviderHealthStatus,
32+
)
3033
from models.config import Action
34+
from utils.degraded_mode import DegradedModeTracker
3135

3236
logger = get_logger(__name__)
3337
router = APIRouter(tags=["health"])
@@ -117,11 +121,11 @@ async def readiness_probe_get_method(
117121
response: Response,
118122
) -> ReadinessResponse:
119123
"""
120-
Handle the readiness probe endpoint, returning service readiness.
124+
Handle the readiness probe endpoint, returning service readiness and health status.
121125
122-
If any provider reports an error status, responds with HTTP 503
123-
and details of unhealthy providers; otherwise, indicates the
124-
service is ready.
126+
Returns comprehensive health information including overall service status,
127+
provider health, and functional impacts. The service is considered "ready" even
128+
in degraded mode (returns 200), but reports reduced functionality.
125129
126130
### Parameters:
127131
- response: The outgoing HTTP response (used by middleware).
@@ -130,47 +134,82 @@ async def readiness_probe_get_method(
130134
### Raises:
131135
- HTTPException: with status 401 for unauthorized access.
132136
- HTTPException: with status 403 if permission is denied.
133-
- HTTPException: with status 500 and a detail object containing `response`
134-
and `cause` when service configuration is wrong or incomplete.
135-
- HTTPException: with status 503 and a detail object containing `response`
136-
and `cause` when unable to connect to Llama Stack.
137+
- HTTPException: with status 503 when service is unhealthy (providers down,
138+
models unavailable) and degraded mode is not enabled.
137139
138140
### Returns:
139-
- ReadinessResponse: Object with `ready` indicating overall readiness,
140-
`reason` explaining the outcome, and `providers` containing the list of
141-
unhealthy ProviderHealthStatus entries (empty when ready).
141+
- ReadinessResponse: Object with comprehensive health status including:
142+
- ready: True if service can handle requests (even in degraded mode)
143+
- reason: Description of service state
144+
- overall_status: healthy, degraded, or unhealthy
145+
- impacts: Functional limitations when degraded/unhealthy
146+
- providers: List of unhealthy providers
142147
"""
143148
# Used only for authorization
144149
_ = auth
145150

146-
logger.info("Response to /v1/readiness endpoint")
151+
logger.info("Response to /readiness endpoint")
147152

148-
provider_statuses = await get_providers_health_statuses()
153+
degraded_tracker = DegradedModeTracker()
154+
is_degraded = degraded_tracker.is_degraded()
149155

150-
# Check if any provider is unhealthy (not counting not_implemented as unhealthy)
156+
# Determine overall status
157+
if is_degraded:
158+
# Service is ready (can serve health checks, metrics, etc.) but degraded
159+
impacts = [
160+
"LLM inference unavailable",
161+
"RAG functionality unavailable",
162+
"Agent tools unavailable",
163+
]
164+
return ReadinessResponse(
165+
ready=True,
166+
reason="Service running in degraded mode",
167+
overall_status=HealthStatus.DEGRADED,
168+
impacts=impacts,
169+
providers=[],
170+
)
171+
172+
# Not in degraded mode - check provider health
173+
provider_statuses = await get_providers_health_statuses()
151174
unhealthy_providers = [
152175
p for p in provider_statuses if p.status == HealthStatus.ERROR.value
153176
]
154177

155178
if unhealthy_providers:
156-
ready = False
157179
unhealthy_provider_names = [p.provider_id for p in unhealthy_providers]
158180
reason = f"Providers not healthy: {', '.join(unhealthy_provider_names)}"
181+
impacts = [
182+
f"Provider {p.provider_id} unhealthy: {p.message}"
183+
for p in unhealthy_providers
184+
]
159185
response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
160186
return ReadinessResponse(
161-
ready=ready, reason=reason, providers=unhealthy_providers
187+
ready=False,
188+
reason=reason,
189+
overall_status=HealthStatus.UNHEALTHY,
190+
impacts=impacts,
191+
providers=unhealthy_providers,
162192
)
163193

164194
# Check that the default model is registered in the model registry
165195
model_available, model_reason = await check_default_model_available()
166196
if not model_available:
167197
response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
168198
return ReadinessResponse(
169-
ready=False, reason=model_reason, providers=unhealthy_providers
199+
ready=False,
200+
reason=model_reason,
201+
overall_status=HealthStatus.UNHEALTHY,
202+
impacts=["Default model not available in registry"],
203+
providers=[],
170204
)
171205

206+
# All healthy
172207
return ReadinessResponse(
173-
ready=True, reason="All providers are healthy", providers=unhealthy_providers
208+
ready=True,
209+
reason="All providers are healthy",
210+
overall_status=HealthStatus.HEALTHY,
211+
impacts=None,
212+
providers=[],
174213
)
175214

176215

src/app/main.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from models.api.responses.error import InternalServerErrorResponse
2727
from sentry import initialize_sentry
2828
from utils.common import register_mcp_servers_async
29+
from utils.degraded_mode import DegradedModeTracker
2930
from utils.llama_stack_version import check_llama_stack_version
3031

3132
logger = get_logger(__name__)
@@ -81,15 +82,19 @@ async def lifespan(_app: FastAPI) -> AsyncIterator[None]:
8182
await AsyncLlamaStackClientHolder().load(llama_stack_config)
8283
client: AsyncLlamaStackClient = AsyncLlamaStackClientHolder().get_client()
8384
logger.debug("Llama Stack client initialized, trying to connect to Llama Stack")
84-
# check if the Llama Stack version is supported by the service
85+
# Check connectivity to Llama Stack and set degraded mode if unavailable
86+
degraded_tracker = DegradedModeTracker()
8587
try:
8688
llama_stack_version = await check_llama_stack_version(
8789
client, llama_stack_config.max_retries, llama_stack_config.retry_delay
8890
)
8991
if llama_stack_version is None:
9092
logger.error("Cannot retrieve Llama Stack version, check connection")
93+
if llama_stack_config.allow_degraded_mode:
94+
degraded_tracker.set_degraded("Llama Stack connection check failed")
9195
else:
9296
logger.debug("Llama Stack version: %s", llama_stack_version)
97+
degraded_tracker.set_healthy()
9398
except APIConnectionError as e:
9499
# if degraded mode is allowed, simply ignore the exception
95100
llama_stack_url = llama_stack_config.url
@@ -103,6 +108,7 @@ async def lifespan(_app: FastAPI) -> AsyncIterator[None]:
103108
)
104109
if llama_stack_config.allow_degraded_mode:
105110
logger.info("Entering degraded mode: LCORE running w/o Llama Stack")
111+
degraded_tracker.set_degraded(f"Failed to connect to Llama Stack: {e!s}")
106112
else:
107113
raise
108114

src/models/api/responses/successful/probes.py

Lines changed: 46 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
"""Successful probe-related API responses (info, readiness, liveness, status, auth)."""
22

3-
from typing import Any
3+
from typing import Any, Optional
44

55
from pydantic import Field
66

77
from models.api.responses.successful.bases import AbstractSuccessfulResponse
8-
from models.common.health import ProviderHealthStatus
8+
from models.common.health import (
9+
HealthStatus,
10+
ProviderHealthStatus,
11+
)
912

1013

1114
class InfoResponse(AbstractSuccessfulResponse):
@@ -50,26 +53,46 @@ class ReadinessResponse(AbstractSuccessfulResponse):
5053
"""Model representing response to a readiness request.
5154
5255
Attributes:
53-
ready: If service is ready.
54-
reason: The reason for the readiness.
55-
providers: List of unhealthy providers in case of readiness failure.
56+
ready: If service is ready to handle requests.
57+
reason: The reason for the readiness status.
58+
overall_status: Overall service health status (healthy/degraded/unhealthy).
59+
impacts: Optional list of functional impacts when degraded or unhealthy.
60+
providers: List of unhealthy providers (empty when all healthy).
5661
"""
5762

5863
ready: bool = Field(
5964
...,
60-
description="Flag indicating if service is ready",
65+
description="Flag indicating if service is ready to handle requests",
6166
examples=[True, False],
6267
)
6368

6469
reason: str = Field(
6570
...,
66-
description="The reason for the readiness",
71+
description="The reason for the readiness status",
6772
examples=["Service is ready"],
6873
)
6974

75+
overall_status: HealthStatus = Field(
76+
...,
77+
description="Overall service health status",
78+
examples=["healthy", "degraded", "unhealthy"],
79+
)
80+
81+
impacts: Optional[list[str]] = Field(
82+
None,
83+
description="List of functional impacts when service is degraded or unhealthy",
84+
examples=[
85+
[
86+
"LLM inference unavailable",
87+
"RAG functionality unavailable",
88+
"Agent tools unavailable",
89+
]
90+
],
91+
)
92+
7093
providers: list[ProviderHealthStatus] = Field(
7194
...,
72-
description="List of unhealthy providers in case of readiness failure.",
95+
description="List of unhealthy providers (empty when all healthy)",
7396
examples=[],
7497
)
7598

@@ -79,9 +102,22 @@ class ReadinessResponse(AbstractSuccessfulResponse):
79102
"examples": [
80103
{
81104
"ready": True,
82-
"reason": "Service is ready",
105+
"reason": "All providers are healthy",
106+
"overall_status": "healthy",
107+
"impacts": None,
83108
"providers": [],
84-
}
109+
},
110+
{
111+
"ready": True,
112+
"reason": "Service running in degraded mode",
113+
"overall_status": "degraded",
114+
"impacts": [
115+
"LLM inference unavailable",
116+
"RAG functionality unavailable",
117+
"Agent tools unavailable",
118+
],
119+
"providers": [],
120+
},
85121
]
86122
}
87123
}

src/models/common/__init__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,10 @@
77
Message,
88
)
99
from models.common.feedback import FeedbackCategory
10-
from models.common.health import HealthStatus, ProviderHealthStatus
10+
from models.common.health import (
11+
HealthStatus,
12+
ProviderHealthStatus,
13+
)
1114
from models.common.mcp import MCPServerAuthInfo, MCPServerInfo
1215
from models.common.moderation import (
1316
ShieldModerationBlocked,

src/models/common/health.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,33 @@
77

88

99
class HealthStatus(str, Enum):
10-
"""Health status enum for provider health checks."""
10+
"""Health status enum for provider and service health checks.
1111
12+
This enum serves two purposes:
13+
14+
1. Provider-level health (returned by Llama Stack providers):
15+
- OK: Provider is healthy and operational
16+
- ERROR: Provider is unhealthy or failed health check
17+
- NOT_IMPLEMENTED: Provider does not implement health checks
18+
- UNKNOWN: Fallback when provider status cannot be determined
19+
20+
2. Service-level health (overall LCORE status):
21+
- HEALTHY: All systems operational, LLS connected, all providers healthy
22+
- DEGRADED: Service running with reduced functionality (e.g., LLS unavailable)
23+
- UNHEALTHY: Service connected but one or more providers are unhealthy
24+
"""
25+
26+
# Provider-level statuses (from Llama Stack)
1227
OK = "ok"
1328
ERROR = "Error"
1429
NOT_IMPLEMENTED = "not_implemented"
15-
HEALTHY = "healthy"
1630
UNKNOWN = "unknown"
1731

32+
# Service-level statuses (LCORE overall health)
33+
HEALTHY = "healthy"
34+
DEGRADED = "degraded"
35+
UNHEALTHY = "unhealthy"
36+
1837

1938
class ProviderHealthStatus(BaseModel):
2039
"""Model representing the health status of a provider.
@@ -35,5 +54,5 @@ class ProviderHealthStatus(BaseModel):
3554
message: Optional[str] = Field(
3655
None,
3756
description="Optional message about the health status",
38-
examples=["All systems operational", "Llama Stack is unavailable"],
57+
examples=["All systems operational", "Provider is unavailable"],
3958
)

src/utils/degraded_mode.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
"""Degraded mode state tracking.
2+
3+
This module provides a singleton to track whether Lightspeed Core Stack is
4+
running in degraded mode (i.e., without Llama Stack connectivity).
5+
"""
6+
7+
from typing import Optional
8+
9+
from utils.types import Singleton
10+
11+
12+
class DegradedModeTracker(metaclass=Singleton):
13+
"""Track degraded mode state for Lightspeed Core Stack.
14+
15+
When LCORE cannot connect to Llama Stack during startup and
16+
allow_degraded_mode is enabled, the service enters degraded mode.
17+
This tracker maintains that state for health reporting.
18+
"""
19+
20+
def __init__(self) -> None:
21+
"""Initialize the degraded mode tracker."""
22+
self._is_degraded: bool = False
23+
self._degraded_reason: Optional[str] = None
24+
25+
def set_degraded(self, reason: str) -> None:
26+
"""Mark the service as running in degraded mode.
27+
28+
Parameters:
29+
reason: Description of why degraded mode was entered.
30+
"""
31+
self._is_degraded = True
32+
self._degraded_reason = reason
33+
34+
def set_healthy(self) -> None:
35+
"""Mark the service as running in healthy mode."""
36+
self._is_degraded = False
37+
self._degraded_reason = None
38+
39+
def is_degraded(self) -> bool:
40+
"""Check if the service is running in degraded mode.
41+
42+
Returns:
43+
True if service is in degraded mode, False otherwise.
44+
"""
45+
return self._is_degraded
46+
47+
def get_degraded_reason(self) -> Optional[str]:
48+
"""Get the reason for degraded mode.
49+
50+
Returns:
51+
Description of why degraded mode was entered, or None if healthy.
52+
"""
53+
return self._degraded_reason

0 commit comments

Comments
 (0)