diff --git a/docs/openapi.json b/docs/openapi.json index fa969fb98..dd774a3b6 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -9964,7 +9964,7 @@ "health" ], "summary": "Readiness Probe Get Method", - "description": "Handle the readiness probe endpoint, returning service readiness.\n\nIf any provider reports an error status, responds with HTTP 503\nand details of unhealthy providers; otherwise, indicates the\nservice is ready.\n\n### Parameters:\n- response: The outgoing HTTP response (used by middleware).\n- auth: Authentication tuple from the auth dependency (used by middleware).\n\n### Raises:\n- HTTPException: with status 401 for unauthorized access.\n- HTTPException: with status 403 if permission is denied.\n- HTTPException: with status 500 and a detail object containing `response`\n and `cause` when service configuration is wrong or incomplete.\n- HTTPException: with status 503 and a detail object containing `response`\n and `cause` when unable to connect to Llama Stack.\n\n### Returns:\n- ReadinessResponse: Object with `ready` indicating overall readiness,\n `reason` explaining the outcome, and `providers` containing the list of\n unhealthy ProviderHealthStatus entries (empty when ready).", + "description": "Handle the readiness probe endpoint, returning service readiness and health status.\n\nReturns comprehensive health information including overall service status,\nprovider health, and functional impacts. The service is considered \"ready\" even\nin degraded mode (returns 200), but reports reduced functionality.\n\n### Parameters:\n- response: The outgoing HTTP response (used by middleware).\n- auth: Authentication tuple from the auth dependency (used by middleware).\n\n### Raises:\n- HTTPException: with status 401 for unauthorized access.\n- HTTPException: with status 403 if permission is denied.\n- HTTPException: with status 503 when service is unhealthy (providers down,\n models unavailable) and degraded mode is not enabled.\n\n### Returns:\n- ReadinessResponse: Object with comprehensive health status including:\n - ready: True if service can handle requests (even in degraded mode)\n - reason: Description of service state\n - overall_status: healthy, degraded, or unhealthy\n - impacts: Functional limitations when degraded/unhealthy\n - providers: List of unhealthy providers", "operationId": "readiness_probe_get_method_readiness_get", "responses": { "200": { @@ -9975,9 +9975,10 @@ "$ref": "#/components/schemas/ReadinessResponse" }, "example": { + "overall_status": "healthy", "providers": [], "ready": true, - "reason": "Service is ready" + "reason": "All providers are healthy" } } } @@ -13485,6 +13486,20 @@ "type": "object", "title": "HTTPValidationError" }, + "HealthStatus": { + "type": "string", + "enum": [ + "ok", + "Error", + "not_implemented", + "unknown", + "healthy", + "degraded", + "unhealthy" + ], + "title": "HealthStatus", + "description": "Health status enum for provider and service health checks.\n\nThis enum serves two purposes:\n\n1. Provider-level health (returned by Llama Stack providers):\n - OK: Provider is healthy and operational\n - ERROR: Provider is unhealthy or failed health check\n - NOT_IMPLEMENTED: Provider does not implement health checks\n - UNKNOWN: Fallback when provider status cannot be determined\n\n2. Service-level health (overall LCORE status):\n - HEALTHY: All systems operational, LLS connected, all providers healthy\n - DEGRADED: Service running with reduced functionality (e.g., LLS unavailable)\n - UNHEALTHY: Service connected but one or more providers are unhealthy" + }, "ImplicitOAuthFlow": { "properties": { "authorizationUrl": { @@ -16888,7 +16903,7 @@ "description": "Optional message about the health status", "examples": [ "All systems operational", - "Llama Stack is unavailable" + "Provider is unavailable" ] } }, @@ -17866,7 +17881,7 @@ "ready": { "type": "boolean", "title": "Ready", - "description": "Flag indicating if service is ready", + "description": "Flag indicating if service is ready to handle requests", "examples": [ true, false @@ -17875,18 +17890,49 @@ "reason": { "type": "string", "title": "Reason", - "description": "The reason for the readiness", + "description": "The reason for the readiness status", "examples": [ "Service is ready" ] }, + "overall_status": { + "$ref": "#/components/schemas/HealthStatus", + "description": "Overall service health status", + "examples": [ + "healthy", + "degraded", + "unhealthy" + ] + }, + "impacts": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Impacts", + "description": "List of functional impacts when service is degraded or unhealthy", + "examples": [ + [ + "LLM inference unavailable", + "RAG functionality unavailable", + "Agent tools unavailable" + ] + ] + }, "providers": { "items": { "$ref": "#/components/schemas/ProviderHealthStatus" }, "type": "array", "title": "Providers", - "description": "List of unhealthy providers in case of readiness failure.", + "description": "List of unhealthy providers (empty when all healthy)", "examples": [] } }, @@ -17894,15 +17940,28 @@ "required": [ "ready", "reason", + "overall_status", "providers" ], "title": "ReadinessResponse", - "description": "Model representing response to a readiness request.\n\nAttributes:\n ready: If service is ready.\n reason: The reason for the readiness.\n providers: List of unhealthy providers in case of readiness failure.", + "description": "Model representing response to a readiness request.\n\nAttributes:\n ready: If service is ready to handle requests.\n reason: The reason for the readiness status.\n overall_status: Overall service health status (healthy/degraded/unhealthy).\n impacts: Optional list of functional impacts when degraded or unhealthy.\n providers: List of unhealthy providers (empty when all healthy).", "examples": [ { + "overall_status": "healthy", + "providers": [], + "ready": true, + "reason": "All providers are healthy" + }, + { + "impacts": [ + "LLM inference unavailable", + "RAG functionality unavailable", + "Agent tools unavailable" + ], + "overall_status": "degraded", "providers": [], "ready": true, - "reason": "Service is ready" + "reason": "Service running in degraded mode" } ] }, diff --git a/src/app/endpoints/health.py b/src/app/endpoints/health.py index 6122562f2..b718dc178 100644 --- a/src/app/endpoints/health.py +++ b/src/app/endpoints/health.py @@ -26,8 +26,12 @@ LivenessResponse, ReadinessResponse, ) -from models.common import HealthStatus, ProviderHealthStatus +from models.common import ( + HealthStatus, + ProviderHealthStatus, +) from models.config import Action +from utils.degraded_mode import DegradedModeTracker logger = get_logger(__name__) router = APIRouter(tags=["health"]) @@ -117,11 +121,11 @@ async def readiness_probe_get_method( response: Response, ) -> ReadinessResponse: """ - Handle the readiness probe endpoint, returning service readiness. + Handle the readiness probe endpoint, returning service readiness and health status. - If any provider reports an error status, responds with HTTP 503 - and details of unhealthy providers; otherwise, indicates the - service is ready. + Returns comprehensive health information including overall service status, + provider health, and functional impacts. The service is considered "ready" even + in degraded mode (returns 200), but reports reduced functionality. ### Parameters: - response: The outgoing HTTP response (used by middleware). @@ -130,35 +134,73 @@ async def readiness_probe_get_method( ### Raises: - HTTPException: with status 401 for unauthorized access. - HTTPException: with status 403 if permission is denied. - - HTTPException: with status 500 and a detail object containing `response` - and `cause` when service configuration is wrong or incomplete. - - HTTPException: with status 503 and a detail object containing `response` - and `cause` when unable to connect to Llama Stack. + - HTTPException: with status 503 when service is unhealthy (providers down, + models unavailable) and degraded mode is not enabled. ### Returns: - - ReadinessResponse: Object with `ready` indicating overall readiness, - `reason` explaining the outcome, and `providers` containing the list of - unhealthy ProviderHealthStatus entries (empty when ready). + - ReadinessResponse: Object with comprehensive health status including: + - ready: True if service can handle requests (even in degraded mode) + - reason: Description of service state + - overall_status: healthy, degraded, or unhealthy + - impacts: Functional limitations when degraded/unhealthy + - providers: List of unhealthy providers """ # Used only for authorization _ = auth - logger.info("Response to /v1/readiness endpoint") + logger.info("Response to /readiness endpoint") - provider_statuses = await get_providers_health_statuses() + degraded_tracker = DegradedModeTracker() + is_degraded = degraded_tracker.is_degraded() + + # Determine overall status + if is_degraded: + # Service is ready (can serve health checks, metrics, etc.) but degraded + impacts = [ + "LLM inference unavailable", + "RAG functionality unavailable", + "Agent tools unavailable", + ] + return ReadinessResponse( + ready=True, + reason="Service running in degraded mode", + overall_status=HealthStatus.DEGRADED, + impacts=impacts, + providers=[], + ) - # Check if any provider is unhealthy (not counting not_implemented as unhealthy) + # Not in degraded mode - check provider health + provider_statuses = await get_providers_health_statuses() unhealthy_providers = [ p for p in provider_statuses if p.status == HealthStatus.ERROR.value ] if unhealthy_providers: - ready = False - unhealthy_provider_names = [p.provider_id for p in unhealthy_providers] - reason = f"Providers not healthy: {', '.join(unhealthy_provider_names)}" + # Check if this is a connection error (provider_id="unknown") + is_connection_error = any( + p.provider_id == "unknown" for p in unhealthy_providers + ) + + if is_connection_error: + reason = "Cannot connect to backend service" + impacts = [ + "LLM inference unavailable", + "Provider health checks unavailable", + ] + else: + unhealthy_provider_names = [p.provider_id for p in unhealthy_providers] + reason = f"Providers not healthy: {', '.join(unhealthy_provider_names)}" + impacts = [ + f"Provider {p.provider_id}: {p.message}" for p in unhealthy_providers + ] + response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE return ReadinessResponse( - ready=ready, reason=reason, providers=unhealthy_providers + ready=False, + reason=reason, + overall_status=HealthStatus.UNHEALTHY, + impacts=impacts, + providers=unhealthy_providers if not is_connection_error else [], ) # Check that the default model is registered in the model registry @@ -166,11 +208,20 @@ async def readiness_probe_get_method( if not model_available: response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE return ReadinessResponse( - ready=False, reason=model_reason, providers=unhealthy_providers + ready=False, + reason=model_reason, + overall_status=HealthStatus.UNHEALTHY, + impacts=["Default model not available in registry"], + providers=[], ) + # All healthy return ReadinessResponse( - ready=True, reason="All providers are healthy", providers=unhealthy_providers + ready=True, + reason="All providers are healthy", + overall_status=HealthStatus.HEALTHY, + impacts=None, + providers=[], ) diff --git a/src/app/main.py b/src/app/main.py index f1c2f6df9..41466e143 100644 --- a/src/app/main.py +++ b/src/app/main.py @@ -26,6 +26,7 @@ from models.api.responses.error import InternalServerErrorResponse from sentry import initialize_sentry from utils.common import register_mcp_servers_async +from utils.degraded_mode import DegradedModeTracker from utils.llama_stack_version import check_llama_stack_version logger = get_logger(__name__) @@ -81,15 +82,19 @@ async def lifespan(_app: FastAPI) -> AsyncIterator[None]: await AsyncLlamaStackClientHolder().load(llama_stack_config) client: AsyncLlamaStackClient = AsyncLlamaStackClientHolder().get_client() logger.debug("Llama Stack client initialized, trying to connect to Llama Stack") - # check if the Llama Stack version is supported by the service + # Check connectivity to Llama Stack and set degraded mode if unavailable + degraded_tracker = DegradedModeTracker() try: llama_stack_version = await check_llama_stack_version( client, llama_stack_config.max_retries, llama_stack_config.retry_delay ) if llama_stack_version is None: logger.error("Cannot retrieve Llama Stack version, check connection") + if llama_stack_config.allow_degraded_mode: + degraded_tracker.set_degraded("Llama Stack connection check failed") else: logger.debug("Llama Stack version: %s", llama_stack_version) + degraded_tracker.set_healthy() except APIConnectionError as e: # if degraded mode is allowed, simply ignore the exception llama_stack_url = llama_stack_config.url @@ -103,6 +108,7 @@ async def lifespan(_app: FastAPI) -> AsyncIterator[None]: ) if llama_stack_config.allow_degraded_mode: logger.info("Entering degraded mode: LCORE running w/o Llama Stack") + degraded_tracker.set_degraded(f"Failed to connect to Llama Stack: {e!s}") else: raise diff --git a/src/models/api/responses/successful/probes.py b/src/models/api/responses/successful/probes.py index d0c428e33..a30e7eeee 100644 --- a/src/models/api/responses/successful/probes.py +++ b/src/models/api/responses/successful/probes.py @@ -1,11 +1,14 @@ """Successful probe-related API responses (info, readiness, liveness, status, auth).""" -from typing import Any +from typing import Any, Optional from pydantic import Field from models.api.responses.successful.bases import AbstractSuccessfulResponse -from models.common.health import ProviderHealthStatus +from models.common.health import ( + HealthStatus, + ProviderHealthStatus, +) class InfoResponse(AbstractSuccessfulResponse): @@ -50,26 +53,46 @@ class ReadinessResponse(AbstractSuccessfulResponse): """Model representing response to a readiness request. Attributes: - ready: If service is ready. - reason: The reason for the readiness. - providers: List of unhealthy providers in case of readiness failure. + ready: If service is ready to handle requests. + reason: The reason for the readiness status. + overall_status: Overall service health status (healthy/degraded/unhealthy). + impacts: Optional list of functional impacts when degraded or unhealthy. + providers: List of unhealthy providers (empty when all healthy). """ ready: bool = Field( ..., - description="Flag indicating if service is ready", + description="Flag indicating if service is ready to handle requests", examples=[True, False], ) reason: str = Field( ..., - description="The reason for the readiness", + description="The reason for the readiness status", examples=["Service is ready"], ) + overall_status: HealthStatus = Field( + ..., + description="Overall service health status", + examples=["healthy", "degraded", "unhealthy"], + ) + + impacts: Optional[list[str]] = Field( + None, + description="List of functional impacts when service is degraded or unhealthy", + examples=[ + [ + "LLM inference unavailable", + "RAG functionality unavailable", + "Agent tools unavailable", + ] + ], + ) + providers: list[ProviderHealthStatus] = Field( ..., - description="List of unhealthy providers in case of readiness failure.", + description="List of unhealthy providers (empty when all healthy)", examples=[], ) @@ -79,9 +102,22 @@ class ReadinessResponse(AbstractSuccessfulResponse): "examples": [ { "ready": True, - "reason": "Service is ready", + "reason": "All providers are healthy", + "overall_status": "healthy", + "impacts": None, "providers": [], - } + }, + { + "ready": True, + "reason": "Service running in degraded mode", + "overall_status": "degraded", + "impacts": [ + "LLM inference unavailable", + "RAG functionality unavailable", + "Agent tools unavailable", + ], + "providers": [], + }, ] } } diff --git a/src/models/common/__init__.py b/src/models/common/__init__.py index 797894d1c..048134b98 100644 --- a/src/models/common/__init__.py +++ b/src/models/common/__init__.py @@ -7,7 +7,10 @@ Message, ) from models.common.feedback import FeedbackCategory -from models.common.health import HealthStatus, ProviderHealthStatus +from models.common.health import ( + HealthStatus, + ProviderHealthStatus, +) from models.common.mcp import MCPServerAuthInfo, MCPServerInfo from models.common.moderation import ( ShieldModerationBlocked, diff --git a/src/models/common/health.py b/src/models/common/health.py index b90f17af1..d0a34b8b1 100644 --- a/src/models/common/health.py +++ b/src/models/common/health.py @@ -7,14 +7,33 @@ class HealthStatus(str, Enum): - """Health status enum for provider health checks.""" + """Health status enum for provider and service health checks. + This enum serves two purposes: + + 1. Provider-level health (returned by Llama Stack providers): + - OK: Provider is healthy and operational + - ERROR: Provider is unhealthy or failed health check + - NOT_IMPLEMENTED: Provider does not implement health checks + - UNKNOWN: Fallback when provider status cannot be determined + + 2. Service-level health (overall LCORE status): + - HEALTHY: All systems operational, LLS connected, all providers healthy + - DEGRADED: Service running with reduced functionality (e.g., LLS unavailable) + - UNHEALTHY: Service connected but one or more providers are unhealthy + """ + + # Provider-level statuses (from Llama Stack) OK = "ok" ERROR = "Error" NOT_IMPLEMENTED = "not_implemented" - HEALTHY = "healthy" UNKNOWN = "unknown" + # Service-level statuses (LCORE overall health) + HEALTHY = "healthy" + DEGRADED = "degraded" + UNHEALTHY = "unhealthy" + class ProviderHealthStatus(BaseModel): """Model representing the health status of a provider. @@ -35,5 +54,5 @@ class ProviderHealthStatus(BaseModel): message: Optional[str] = Field( None, description="Optional message about the health status", - examples=["All systems operational", "Llama Stack is unavailable"], + examples=["All systems operational", "Provider is unavailable"], ) diff --git a/src/utils/degraded_mode.py b/src/utils/degraded_mode.py new file mode 100644 index 000000000..661272da7 --- /dev/null +++ b/src/utils/degraded_mode.py @@ -0,0 +1,53 @@ +"""Degraded mode state tracking. + +This module provides a singleton to track whether Lightspeed Core Stack is +running in degraded mode (i.e., without Llama Stack connectivity). +""" + +from typing import Optional + +from utils.types import Singleton + + +class DegradedModeTracker(metaclass=Singleton): + """Track degraded mode state for Lightspeed Core Stack. + + When LCORE cannot connect to Llama Stack during startup and + allow_degraded_mode is enabled, the service enters degraded mode. + This tracker maintains that state for health reporting. + """ + + def __init__(self) -> None: + """Initialize the degraded mode tracker.""" + self._is_degraded: bool = False + self._degraded_reason: Optional[str] = None + + def set_degraded(self, reason: str) -> None: + """Mark the service as running in degraded mode. + + Parameters: + reason: Description of why degraded mode was entered. + """ + self._is_degraded = True + self._degraded_reason = reason + + def set_healthy(self) -> None: + """Mark the service as running in healthy mode.""" + self._is_degraded = False + self._degraded_reason = None + + def is_degraded(self) -> bool: + """Check if the service is running in degraded mode. + + Returns: + True if service is in degraded mode, False otherwise. + """ + return self._is_degraded + + def get_degraded_reason(self) -> Optional[str]: + """Get the reason for degraded mode. + + Returns: + Description of why degraded mode was entered, or None if healthy. + """ + return self._degraded_reason diff --git a/tests/unit/app/endpoints/test_health.py b/tests/unit/app/endpoints/test_health.py index 85e0ea477..e7e9160aa 100644 --- a/tests/unit/app/endpoints/test_health.py +++ b/tests/unit/app/endpoints/test_health.py @@ -14,7 +14,10 @@ ) from authentication.interface import AuthTuple from models.api.responses.successful import ReadinessResponse -from models.common import HealthStatus, ProviderHealthStatus +from models.common import ( + HealthStatus, + ProviderHealthStatus, +) from tests.unit.utils.auth_helpers import mock_authorization_resolvers @@ -25,6 +28,11 @@ async def test_readiness_probe_fails_due_to_unhealthy_providers( """Test the readiness endpoint handler fails when providers are unhealthy.""" mock_authorization_resolvers(mocker) + # Mock DegradedModeTracker to return healthy (not degraded) state + mock_tracker = mocker.patch("app.endpoints.health.DegradedModeTracker") + mock_instance = mock_tracker.return_value + mock_instance.is_degraded.return_value = False + # Mock get_providers_health_statuses to return an unhealthy provider mock_get_providers_health_statuses = mocker.patch( "app.endpoints.health.get_providers_health_statuses" @@ -48,6 +56,8 @@ async def test_readiness_probe_fails_due_to_unhealthy_providers( assert response.ready is False assert "test_provider" in response.reason assert "Providers not healthy" in response.reason + assert response.overall_status == HealthStatus.UNHEALTHY + assert response.impacts is not None assert mock_response.status_code == 503 @@ -58,6 +68,11 @@ async def test_readiness_probe_success_when_all_providers_healthy( """Test the readiness endpoint handler succeeds when all providers are healthy.""" mock_authorization_resolvers(mocker) + # Mock DegradedModeTracker to return healthy (not degraded) state + mock_tracker = mocker.patch("app.endpoints.health.DegradedModeTracker") + mock_instance = mock_tracker.return_value + mock_instance.is_degraded.return_value = False + # Mock get_providers_health_statuses to return healthy providers mock_get_providers_health_statuses = mocker.patch( "app.endpoints.health.get_providers_health_statuses" @@ -92,6 +107,8 @@ async def test_readiness_probe_success_when_all_providers_healthy( assert isinstance(response, ReadinessResponse) assert response.ready is True assert response.reason == "All providers are healthy" + assert response.overall_status == HealthStatus.HEALTHY + assert response.impacts is None # Should return empty list since no providers are unhealthy assert len(response.providers) == 0 @@ -103,6 +120,11 @@ async def test_readiness_probe_fails_when_model_not_available( """Test readiness returns 503 when providers are healthy but default model is missing.""" mock_authorization_resolvers(mocker) + # Mock DegradedModeTracker to return healthy (not degraded) state + mock_tracker = mocker.patch("app.endpoints.health.DegradedModeTracker") + mock_instance = mock_tracker.return_value + mock_instance.is_degraded.return_value = False + mock_get_providers = mocker.patch( "app.endpoints.health.get_providers_health_statuses" ) @@ -130,6 +152,8 @@ async def test_readiness_probe_fails_when_model_not_available( assert response.ready is False assert "not found in model registry" in response.reason + assert response.overall_status == HealthStatus.UNHEALTHY + assert response.impacts is not None assert mock_response.status_code == 503 @@ -337,3 +361,36 @@ async def test_returns_holder_failure( assert available is False assert "not found in model registry" in reason + + +class TestReadinessDegradedMode: # pylint: disable=too-few-public-methods + """Test cases for /readiness endpoint with degraded mode.""" + + @pytest.mark.asyncio + async def test_readiness_degraded_mode(self, mocker: MockerFixture) -> None: + """Test /readiness endpoint returns ready=True in degraded mode.""" + mock_authorization_resolvers(mocker) + + # Mock DegradedModeTracker to return degraded state + mock_tracker = mocker.patch("app.endpoints.health.DegradedModeTracker") + mock_instance = mock_tracker.return_value + mock_instance.is_degraded.return_value = True + mock_instance.get_degraded_reason.return_value = ( + "Failed to connect to Llama Stack: Connection error" + ) + + mock_response = mocker.Mock() + auth: AuthTuple = ("test_user_id", "test_user", True, "test_token") + + response = await readiness_probe_get_method(auth=auth, response=mock_response) + + assert response is not None + assert isinstance(response, ReadinessResponse) + assert response.ready is True # Service is ready even in degraded mode + assert response.reason == "Service running in degraded mode" + assert response.overall_status == HealthStatus.DEGRADED + assert response.impacts is not None + assert "LLM inference unavailable" in response.impacts + assert "RAG functionality unavailable" in response.impacts + assert "Agent tools unavailable" in response.impacts + assert len(response.providers) == 0 diff --git a/tests/unit/models/responses/test_successful_responses.py b/tests/unit/models/responses/test_successful_responses.py index 48a928b96..df5ae1ef4 100644 --- a/tests/unit/models/responses/test_successful_responses.py +++ b/tests/unit/models/responses/test_successful_responses.py @@ -35,6 +35,7 @@ from models.common import ( ConversationData, ConversationDetails, + HealthStatus, MCPServerAuthInfo, ProviderHealthStatus, ) @@ -412,11 +413,15 @@ class TestReadinessResponse: def test_constructor_ready(self) -> None: """Test ReadinessResponse when service is ready.""" response = ReadinessResponse( - ready=True, reason="Service is ready", providers=[] + ready=True, + reason="Service is ready", + overall_status=HealthStatus.HEALTHY, + providers=[], ) assert isinstance(response, AbstractSuccessfulResponse) assert response.ready is True assert response.reason == "Service is ready" + assert response.overall_status == HealthStatus.HEALTHY assert response.providers == [] def test_constructor_not_ready(self) -> None: @@ -427,9 +432,13 @@ def test_constructor_not_ready(self) -> None: ) ] response = ReadinessResponse( - ready=False, reason="Service is not ready", providers=providers + ready=False, + reason="Service is not ready", + overall_status=HealthStatus.UNHEALTHY, + providers=providers, ) assert response.ready is False + assert response.overall_status == HealthStatus.UNHEALTHY assert len(response.providers) == 1 assert response.providers[0].provider_id == "provider1" @@ -449,7 +458,7 @@ def test_openapi_response(self) -> None: Asserts the returned mapping has description "Successful response", the `model` is ReadinessResponse, and `content["application/json"]` contains an "example". Also verifies the number of examples in - ReadinessResponse.model_json_schema() equals 1. + ReadinessResponse.model_json_schema() equals 2 (healthy and degraded). """ schema = ReadinessResponse.model_json_schema() model_examples = schema.get("examples", []) @@ -460,8 +469,8 @@ def test_openapi_response(self) -> None: assert result["model"] == ReadinessResponse assert "example" in result["content"]["application/json"] - # Verify example count matches schema examples count (should be 1) - assert expected_count == 1 + # Verify example count matches schema examples count (should be 2) + assert expected_count == 2 class TestLivenessResponse: diff --git a/tests/unit/test_degraded_mode.py b/tests/unit/test_degraded_mode.py new file mode 100644 index 000000000..27148bd6e --- /dev/null +++ b/tests/unit/test_degraded_mode.py @@ -0,0 +1,59 @@ +"""Unit tests for the degraded mode tracker.""" + +from utils.degraded_mode import DegradedModeTracker + + +class TestDegradedModeTracker: + """Test cases for DegradedModeTracker.""" + + def test_initial_state_is_healthy(self) -> None: + """Test tracker starts in healthy state.""" + tracker = DegradedModeTracker() + assert tracker.is_degraded() is False + assert tracker.get_degraded_reason() is None + + def test_set_degraded(self) -> None: + """Test setting degraded mode.""" + tracker = DegradedModeTracker() + reason = "Failed to connect to Llama Stack" + + tracker.set_degraded(reason) + + assert tracker.is_degraded() is True + assert tracker.get_degraded_reason() == reason + + def test_set_healthy(self) -> None: + """Test setting healthy mode.""" + tracker = DegradedModeTracker() + + tracker.set_healthy() + + assert tracker.is_degraded() is False + assert tracker.get_degraded_reason() is None + + def test_transition_from_degraded_to_healthy(self) -> None: + """Test transitioning from degraded to healthy state.""" + tracker = DegradedModeTracker() + + # Set degraded + tracker.set_degraded("Connection error") + assert tracker.is_degraded() is True + + # Transition to healthy + tracker.set_healthy() + assert tracker.is_degraded() is False + assert tracker.get_degraded_reason() is None + + def test_singleton_pattern(self) -> None: + """Test that DegradedModeTracker is a singleton.""" + tracker1 = DegradedModeTracker() + tracker2 = DegradedModeTracker() + + assert tracker1 is tracker2 + + # Set state on one instance + tracker1.set_degraded("Test reason") + + # Verify state is shared across instances + assert tracker2.is_degraded() is True + assert tracker2.get_degraded_reason() == "Test reason"