Merge pull request #1636 from major/fix/readiness-check-model-availability

tisnik · web-flow · commit aa41ccd88f8a · 2026-04-30T16:19:38.000+02:00
RSPEED-2959: Add default model availability check to readiness probe
diff --git a/src/app/endpoints/health.py b/src/app/endpoints/health.py
@@ -15,6 +15,7 @@
 from authentication.interface import AuthTuple
 from authorization.middleware import authorize
 from client import AsyncLlamaStackClientHolder
+from configuration import configuration
 from log import get_logger
 from models.config import Action
 from models.responses import (
@@ -95,6 +96,31 @@ async def get_providers_health_statuses() -> list[ProviderHealthStatus]:
         ]
 
 
+async def check_default_model_available() -> tuple[bool, str]:
+    """Report whether the configured default model is present in the registry.
+
+    The default provider and model are read from the inference section of
+    the configuration and joined into a ``<provider>/<model>`` identifier;
+    the actual lookup is delegated to the client holder.
+
+    Returns:
+        A tuple of (available, reason). When the inference section, the
+        default model or the default provider is not set, available is True
+        and reason is "No default model configured". Otherwise the client
+        holder's result is returned unchanged.
+    """
+    cfg = configuration.inference
+    # Proceed only when both halves of the identifier are configured.
+    if cfg is not None and cfg.default_model and cfg.default_provider:
+        # The lookup identifier has the form "<provider>/<model>".
+        model_id = f"{cfg.default_provider}/{cfg.default_model}"
+        holder = AsyncLlamaStackClientHolder()
+        return await holder.check_model_available(model_id)
+
+    # Nothing to verify, so the missing default is reported as a pass.
+    return True, "No default model configured"
+
+
 @router.get("/readiness", responses=get_readiness_responses)
 @authorize(Action.INFO)
 async def readiness_probe_get_method(
@@ -134,11 +160,21 @@ async def readiness_probe_get_method(
         unhealthy_provider_names = [p.provider_id for p in unhealthy_providers]
         reason = f"Providers not healthy: {', '.join(unhealthy_provider_names)}"
         response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
-    else:
-        ready = True
-        reason = "All providers are healthy"
+        return ReadinessResponse(
+            ready=ready, reason=reason, providers=unhealthy_providers
+        )
+
+    # Check that the default model is registered in the model registry
+    model_available, model_reason = await check_default_model_available()
+    if not model_available:
+        response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
+        return ReadinessResponse(
+            ready=False, reason=model_reason, providers=unhealthy_providers
+        )
 
-    return ReadinessResponse(ready=ready, reason=reason, providers=unhealthy_providers)
+    return ReadinessResponse(
+        ready=True, reason="All providers are healthy", providers=unhealthy_providers
+    )
 
 
 @router.get("/liveness", responses=get_liveness_responses)
diff --git a/src/client.py b/src/client.py
@@ -8,7 +8,7 @@
 import yaml
 from fastapi import HTTPException
 from llama_stack.core.library_client import AsyncLlamaStackAsLibraryClient
-from llama_stack_client import APIConnectionError, AsyncLlamaStackClient
+from llama_stack_client import APIConnectionError, APIStatusError, AsyncLlamaStackClient
 
 from configuration import configuration
 from llama_stack_configuration import YamlDumper, enrich_byok_rag, enrich_solr
@@ -141,6 +141,76 @@ async def reload_library_client(self) -> AsyncLlamaStackClient:
         self._lsc = client
         return client
 
+    async def check_model_available(self, model_id: str) -> tuple[bool, str]:
+        """Check if a model is available in the registry, attempting reload if needed.
+
+        Verifies the model can be found in the Llama Stack client's model
+        list. If the model is missing and the client is running in library
+        mode, attempts a client reload to re-register models before
+        reporting failure.
+
+        The reload re-runs the full Stack initialization pipeline, which
+        re-attempts model registration with providers. This handles the
+        case where a transient provider failure (e.g. Vertex AI network
+        blip) caused model registration to fail on startup. Since
+        Kubernetes readiness probe failures only remove the pod from
+        service endpoints without restarting it, the reload provides a
+        self-healing path.
+
+        Args:
+            model_id: The expected model identifier to look up.
+
+        Returns:
+            A tuple of (available, reason) where available is True if the
+            model was found (before or after a reload); reason describes it.
+        """
+        try:
+            client = self.get_client()
+            models = await client.models.list()
+        except RuntimeError as e:
+            logger.warning("Client not initialized, skipping model check: %s", e)
+            return False, f"Client not initialized: {e!s}"
+        except (APIConnectionError, APIStatusError) as e:
+            logger.error("Error checking model availability: %s", e)
+            return False, f"Error checking model availability: {e!s}"
+
+        if any(m.id == model_id for m in models):
+            return True, f"Model {model_id} is available"
+
+        # Model not found - attempt self-healing reload for library clients.
+        # In server mode there is no library client to reload, so we can
+        # only detect the missing model and report failure.
+        if self.is_library_client:
+            logger.warning(
+                "Model %s not found, attempting client reload",
+                model_id,
+            )
+            try:
+                await self.reload_library_client()
+                client = self.get_client()
+                # Rebind ``models`` rather than using a separate name so the
+                # failure log below lists the registry as it is after the
+                # reload instead of the stale pre-reload snapshot.
+                models = await client.models.list()
+                if any(m.id == model_id for m in models):
+                    logger.info("Model %s found after client reload", model_id)
+                    return True, f"Model {model_id} is available after reload"
+            except (
+                RuntimeError,
+                HTTPException,
+                APIConnectionError,
+                APIStatusError,
+            ) as err:
+                logger.error("Client reload failed: %s", err)
+
+        registered_ids = [m.id for m in models]
+        logger.error(
+            "Model %s not found in registry. Registered models: %s",
+            model_id,
+            registered_ids,
+        )
+        return False, f"Model {model_id} not found in model registry"
+
     def update_provider_data(self, updates: dict[str, str]) -> AsyncLlamaStackClient:
         """Update provider data headers for service client.
 
diff --git a/tests/integration/endpoints/test_health_integration.py b/tests/integration/endpoints/test_health_integration.py
@@ -159,6 +159,7 @@ async def test_health_readiness(
     mock_llama_stack_client_health: AsyncMockType,
     test_response: Response,
     test_auth: AuthTuple,
+    mocker: MockerFixture,
 ) -> None:
     """Test that readiness probe endpoint returns readiness status.
 
@@ -180,6 +181,12 @@ async def test_health_readiness(
     """
     _ = mock_llama_stack_client_health
 
+    # Mock check_default_model_available since configuration is not loaded
+    mock_check_model = mocker.patch(
+        "app.endpoints.health.check_default_model_available"
+    )
+    mock_check_model.return_value = (True, "Default model is available")
+
     result = await readiness_probe_get_method(auth=test_auth, response=test_response)
 
     # Verify that service returns readiness response
diff --git a/tests/unit/app/endpoints/test_health.py b/tests/unit/app/endpoints/test_health.py
@@ -1,11 +1,14 @@
 """Unit tests for the /health REST API endpoint."""
 
+from typing import Any
+
 import pytest
 from llama_stack_client import APIConnectionError
 from pytest_mock import MockerFixture
 
 from app.endpoints.health import (
     HealthStatus,
+    check_default_model_available,
     get_providers_health_statuses,
     liveness_probe_get_method,
     readiness_probe_get_method,
@@ -72,6 +75,12 @@ async def test_readiness_probe_success_when_all_providers_healthy(
         ),
     ]
 
+    # Mock check_default_model_available so it doesn't hit uninitialized client
+    mock_check_model = mocker.patch(
+        "app.endpoints.health.check_default_model_available"
+    )
+    mock_check_model.return_value = (True, "Default model is available")
+
     # Mock the Response object and auth
     mock_response = mocker.Mock()
 
@@ -87,6 +96,43 @@ async def test_readiness_probe_success_when_all_providers_healthy(
     assert len(response.providers) == 0
 
 
+@pytest.mark.asyncio
+async def test_readiness_probe_fails_when_model_not_available(
+    mocker: MockerFixture,
+) -> None:
+    """Verify readiness is 503 if providers are healthy but the model is missing."""
+    mock_authorization_resolvers(mocker)
+
+    # A single healthy provider, so the provider check on its own would pass.
+    healthy_provider = ProviderHealthStatus(
+        provider_id="provider1",
+        status=HealthStatus.OK.value,
+        message="Provider is healthy",
+    )
+    mocker.patch(
+        "app.endpoints.health.get_providers_health_statuses",
+        return_value=[healthy_provider],
+    )
+
+    # The default-model check is the only component reporting a failure.
+    failure_reason = (
+        "Default model google-vertex/publishers/google/models/gemini-2.5-flash "
+        "not found in model registry"
+    )
+    mocker.patch(
+        "app.endpoints.health.check_default_model_available",
+        return_value=(False, failure_reason),
+    )
+
+    fake_response = mocker.Mock()
+    auth: AuthTuple = ("test_user_id", "test_user", True, "test_token")
+    result = await readiness_probe_get_method(auth=auth, response=fake_response)
+
+    assert fake_response.status_code == 503
+    assert result.ready is False
+    assert "not found in model registry" in result.reason
+
+
 @pytest.mark.asyncio
 async def test_liveness_probe(mocker: MockerFixture) -> None:
     """Test the liveness endpoint handler."""
@@ -207,3 +253,87 @@ async def test_get_providers_health_statuses_connection_error(
         assert (
             result[0].message == "Failed to initialize health check: Connection error."
         )
+
+
+class TestCheckDefaultModelAvailable:
+    """Tests for the config handling in check_default_model_available.
+
+    Only the configuration lookup and the hand-off to the client holder
+    are covered here. The registry lookup, reload and error handling are
+    exercised in tests/unit/test_client.py (TestCheckModelAvailable).
+    """
+
+    EXPECTED_MODEL_ID = "google-vertex/publishers/google/models/gemini-2.5-flash"
+
+    @pytest.fixture
+    def inference_config(self, mocker: MockerFixture) -> Any:
+        """Replace configuration with a mock naming a default provider and model."""
+        patched = mocker.patch("app.endpoints.health.configuration")
+        inference = patched.inference
+        # Provider and model together form EXPECTED_MODEL_ID.
+        inference.default_provider = "google-vertex"
+        inference.default_model = "publishers/google/models/gemini-2.5-flash"
+        return patched
+
+    @pytest.mark.asyncio
+    async def test_no_inference_config(self, mocker: MockerFixture) -> None:
+        """Test the check passes when the inference section is absent."""
+        mocker.patch("app.endpoints.health.configuration", inference=None)
+
+        outcome = await check_default_model_available()
+
+        is_available, message = outcome
+        assert message == "No default model configured"
+        assert is_available is True
+
+    @pytest.mark.asyncio
+    async def test_no_default_model_configured(self, mocker: MockerFixture) -> None:
+        """Test the check passes when neither default model nor provider is set."""
+        patched = mocker.patch("app.endpoints.health.configuration")
+        # An inference section exists, but names no default model or provider.
+        patched.inference.configure_mock(default_model=None, default_provider=None)
+
+        is_available, message = await check_default_model_available()
+
+        assert message == "No default model configured"
+        assert is_available is True
+
+    @pytest.mark.asyncio
+    @pytest.mark.usefixtures("inference_config")
+    async def test_delegates_to_client_holder(
+        self,
+        mocker: MockerFixture,
+    ) -> None:
+        """Test the holder is awaited once with the provider/model identifier."""
+        holder_cls = mocker.patch("app.endpoints.health.AsyncLlamaStackClientHolder")
+        check = mocker.AsyncMock(
+            return_value=(True, f"Model {self.EXPECTED_MODEL_ID} is available")
+        )
+        holder_cls.return_value.check_model_available = check
+
+        is_available, message = await check_default_model_available()
+
+        # The identifier must be "<default_provider>/<default_model>".
+        check.assert_awaited_once_with(self.EXPECTED_MODEL_ID)
+        assert is_available is True
+        assert "is available" in message
+
+    @pytest.mark.asyncio
+    @pytest.mark.usefixtures("inference_config")
+    async def test_returns_holder_failure(
+        self,
+        mocker: MockerFixture,
+    ) -> None:
+        """Test a failure reported by the holder is returned to the caller."""
+        holder_cls = mocker.patch("app.endpoints.health.AsyncLlamaStackClientHolder")
+        failure = (
+            False,
+            f"Model {self.EXPECTED_MODEL_ID} not found in model registry",
+        )
+        holder = holder_cls.return_value
+        holder.check_model_available = mocker.AsyncMock(return_value=failure)
+
+        is_available, message = await check_default_model_available()
+
+        assert is_available is False
+        assert "not found in model registry" in message
diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py