fix: add default model availability check to readiness probe

major · major · commit f625b60d56b6 · 2026-04-30T07:58:30.000-05:00
The /readiness endpoint only verified provider health but did not
check that the configured default model was registered in the Llama
Stack model registry. This allowed pods where model registration
failed during startup to pass readiness and serve 404s on every
inference request.

Add check_default_model_available() that verifies the configured
default model exists in client.models.list(). In library mode, a
missing model first triggers a client reload to retry registration.
When the model is still missing, /readiness returns 503 so Kubernetes
removes the pod from the service load balancer.

Ref: RSPEED-2959
Signed-off-by: Major Hayden <major@redhat.com>
diff --git a/src/app/endpoints/health.py b/src/app/endpoints/health.py
@@ -15,6 +15,7 @@
 from authentication.interface import AuthTuple
 from authorization.middleware import authorize
 from client import AsyncLlamaStackClientHolder
+from configuration import configuration
 from log import get_logger
 from models.config import Action
 from models.responses import (
@@ -95,6 +96,31 @@ async def get_providers_health_statuses() -> list[ProviderHealthStatus]:
         ]
 
 
+async def check_default_model_available() -> tuple[bool, str]:
+    """Check that the configured default model is registered in the model registry.
+
+    Retrieves the default model and provider from configuration and delegates
+    the availability check to the client holder.
+
+    Returns:
+        A tuple of (available, reason). The check is skipped and available is
+        True when the inference config, default model, or default provider is
+        unset; otherwise both values come from the client holder's check.
+    """
+    inference = configuration.inference
+    if (
+        inference is None
+        or not inference.default_model
+        or not inference.default_provider
+    ):
+        return True, "No default model configured"
+
+    expected_model_id = f"{inference.default_provider}/{inference.default_model}"
+
+    client_holder = AsyncLlamaStackClientHolder()
+    return await client_holder.check_model_available(expected_model_id)
+
+
 @router.get("/readiness", responses=get_readiness_responses)
 @authorize(Action.INFO)
 async def readiness_probe_get_method(
@@ -134,11 +160,21 @@ async def readiness_probe_get_method(
         unhealthy_provider_names = [p.provider_id for p in unhealthy_providers]
         reason = f"Providers not healthy: {', '.join(unhealthy_provider_names)}"
         response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
-    else:
-        ready = True
-        reason = "All providers are healthy"
+        return ReadinessResponse(
+            ready=ready, reason=reason, providers=unhealthy_providers
+        )
+
+    # Check that the default model is registered in the model registry
+    model_available, model_reason = await check_default_model_available()
+    if not model_available:
+        response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
+        return ReadinessResponse(
+            ready=False, reason=model_reason, providers=unhealthy_providers
+        )
 
-    return ReadinessResponse(ready=ready, reason=reason, providers=unhealthy_providers)
+    return ReadinessResponse(
+        ready=True, reason="All providers are healthy", providers=unhealthy_providers
+    )
 
 
 @router.get("/liveness", responses=get_liveness_responses)
diff --git a/src/client.py b/src/client.py
@@ -8,7 +8,7 @@
 import yaml
 from fastapi import HTTPException
 from llama_stack.core.library_client import AsyncLlamaStackAsLibraryClient
-from llama_stack_client import APIConnectionError, AsyncLlamaStackClient
+from llama_stack_client import APIConnectionError, APIStatusError, AsyncLlamaStackClient
 
 from configuration import configuration
 from llama_stack_configuration import YamlDumper, enrich_byok_rag, enrich_solr
@@ -141,6 +141,76 @@ async def reload_library_client(self) -> AsyncLlamaStackClient:
         self._lsc = client
         return client
 
+    async def check_model_available(self, model_id: str) -> tuple[bool, str]:
+        """Check if a model is available in the registry, attempting reload if needed.
+
+        Verifies the model can be found in the Llama Stack client's model
+        list. If the model is missing and the client is running in library
+        mode, attempts a client reload to re-register models before
+        reporting failure.
+
+        The reload re-runs the full Stack initialization pipeline, which
+        re-attempts model registration with providers. This handles the
+        case where a transient provider failure (e.g. Vertex AI network
+        blip) caused model registration to fail on startup. Since
+        Kubernetes readiness probe failures only remove the pod from
+        service endpoints without restarting it, the reload provides a
+        self-healing path.
+
+        Args:
+            model_id: The expected model identifier to look up.
+
+        Returns:
+            A tuple of (available, reason) where available is True if the
+            model was found, and reason describes the outcome.
+        """
+        try:
+            client = self.get_client()
+            models = await client.models.list()
+        except RuntimeError as e:
+            logger.warning("Client not initialized, skipping model check: %s", e)
+            return False, f"Client not initialized: {e!s}"
+        except (APIConnectionError, APIStatusError) as e:
+            logger.error("Error checking model availability: %s", e)
+            return False, f"Error checking model availability: {e!s}"
+
+        if any(m.id == model_id for m in models):
+            return True, f"Model {model_id} is available"
+
+        # Model not found - attempt self-healing reload for library clients.
+        # In server mode there is no library client to reload, so we can
+        # only detect the missing model and report failure.
+        if self.is_library_client:
+            logger.warning("Model %s not found, attempting client reload", model_id)
+            try:
+                await self.reload_library_client()
+                client = self.get_client()
+                # Rebind ``models`` so that, if the model is still missing,
+                # the failure log below lists the post-reload registry
+                # instead of the stale pre-reload listing.
+                models = await client.models.list()
+                if any(m.id == model_id for m in models):
+                    logger.info(
+                        "Model %s found after client reload",
+                        model_id,
+                    )
+                    return True, f"Model {model_id} is available after reload"
+            except (
+                RuntimeError,
+                HTTPException,
+                APIConnectionError,
+                APIStatusError,
+            ) as err:
+                logger.error("Client reload failed: %s", err)
+
+        registered_ids = [m.id for m in models]
+        logger.error(
+            "Model %s not found in registry. Registered models: %s",
+            model_id,
+            registered_ids,
+        )
+        return False, f"Model {model_id} not found in model registry"
+
     def update_provider_data(self, updates: dict[str, str]) -> AsyncLlamaStackClient:
         """Update provider data headers for service client.
 
diff --git a/tests/integration/endpoints/test_health_integration.py b/tests/integration/endpoints/test_health_integration.py
@@ -159,6 +159,7 @@ async def test_health_readiness(
     mock_llama_stack_client_health: AsyncMockType,
     test_response: Response,
     test_auth: AuthTuple,
+    mocker: MockerFixture,
 ) -> None:
     """Test that readiness probe endpoint returns readiness status.
 
@@ -180,6 +181,12 @@ async def test_health_readiness(
     """
     _ = mock_llama_stack_client_health
 
+    # Mock check_default_model_available since configuration is not loaded
+    mock_check_model = mocker.patch(
+        "app.endpoints.health.check_default_model_available"
+    )
+    mock_check_model.return_value = (True, "Default model is available")
+
     result = await readiness_probe_get_method(auth=test_auth, response=test_response)
 
     # Verify that service returns readiness response
diff --git a/tests/unit/app/endpoints/test_health.py b/tests/unit/app/endpoints/test_health.py
@@ -1,11 +1,14 @@
 """Unit tests for the /health REST API endpoint."""
 
+from typing import Any
+
 import pytest
 from llama_stack_client import APIConnectionError
 from pytest_mock import MockerFixture
 
 from app.endpoints.health import (
     HealthStatus,
+    check_default_model_available,
     get_providers_health_statuses,
     liveness_probe_get_method,
     readiness_probe_get_method,
@@ -72,6 +75,12 @@ async def test_readiness_probe_success_when_all_providers_healthy(
         ),
     ]
 
+    # Mock check_default_model_available so it doesn't hit uninitialized client
+    mock_check_model = mocker.patch(
+        "app.endpoints.health.check_default_model_available"
+    )
+    mock_check_model.return_value = (True, "Default model is available")
+
     # Mock the Response object and auth
     mock_response = mocker.Mock()
 
@@ -87,6 +96,43 @@ async def test_readiness_probe_success_when_all_providers_healthy(
     assert len(response.providers) == 0
 
 
+@pytest.mark.asyncio
+async def test_readiness_probe_fails_when_model_not_available(
+    mocker: MockerFixture,
+) -> None:
+    """Test readiness returns 503 when providers are healthy but default model is missing."""
+    mock_authorization_resolvers(mocker)
+
+    mock_get_providers = mocker.patch(
+        "app.endpoints.health.get_providers_health_statuses"
+    )
+    mock_get_providers.return_value = [
+        ProviderHealthStatus(
+            provider_id="provider1",
+            status=HealthStatus.OK.value,
+            message="Provider is healthy",
+        )
+    ]
+
+    mock_check_model = mocker.patch(
+        "app.endpoints.health.check_default_model_available"
+    )
+    mock_check_model.return_value = (
+        False,
+        "Default model google-vertex/publishers/google/models/gemini-2.5-flash "
+        "not found in model registry",
+    )
+
+    mock_response = mocker.Mock()
+    auth: AuthTuple = ("test_user_id", "test_user", True, "test_token")
+
+    response = await readiness_probe_get_method(auth=auth, response=mock_response)
+
+    assert response.ready is False
+    assert "not found in model registry" in response.reason
+    assert mock_response.status_code == 503
+
+
 @pytest.mark.asyncio
 async def test_liveness_probe(mocker: MockerFixture) -> None:
     """Test the liveness endpoint handler."""
@@ -207,3 +253,87 @@ async def test_get_providers_health_statuses_connection_error(
         assert (
             result[0].message == "Failed to initialize health check: Connection error."
         )
+
+
+class TestCheckDefaultModelAvailable:
+    """Test cases for the check_default_model_available function.
+
+    The model availability logic (registry lookup, reload, error handling)
+    is tested in tests/unit/test_client.py (TestCheckModelAvailable). These
+    tests verify only the config lookup and delegation in health.py.
+    """
+
+    EXPECTED_MODEL_ID = "google-vertex/publishers/google/models/gemini-2.5-flash"
+
+    @pytest.fixture
+    def inference_config(self, mocker: MockerFixture) -> Any:
+        """Patch configuration with default model and provider."""
+        mock_config = mocker.patch("app.endpoints.health.configuration")
+        mock_config.inference.default_model = (
+            "publishers/google/models/gemini-2.5-flash"
+        )
+        mock_config.inference.default_provider = "google-vertex"
+        return mock_config
+
+    @pytest.mark.asyncio
+    async def test_no_inference_config(self, mocker: MockerFixture) -> None:
+        """Test returns True when no inference configuration exists."""
+        mock_config = mocker.patch("app.endpoints.health.configuration")
+        mock_config.inference = None
+
+        available, reason = await check_default_model_available()
+
+        assert available is True
+        assert reason == "No default model configured"
+
+    @pytest.mark.asyncio
+    async def test_no_default_model_configured(self, mocker: MockerFixture) -> None:
+        """Test returns True when no default model is configured."""
+        mock_config = mocker.patch("app.endpoints.health.configuration")
+        mock_config.inference.default_model = None
+        mock_config.inference.default_provider = None
+
+        available, reason = await check_default_model_available()
+
+        assert available is True
+        assert reason == "No default model configured"
+
+    @pytest.mark.asyncio
+    @pytest.mark.usefixtures("inference_config")
+    async def test_delegates_to_client_holder(
+        self,
+        mocker: MockerFixture,
+    ) -> None:
+        """Test delegates to client holder with correct model ID."""
+        mock_holder = mocker.patch("app.endpoints.health.AsyncLlamaStackClientHolder")
+        mock_holder.return_value.check_model_available = mocker.AsyncMock(
+            return_value=(True, f"Model {self.EXPECTED_MODEL_ID} is available")
+        )
+
+        available, reason = await check_default_model_available()
+
+        assert available is True
+        assert "is available" in reason
+        mock_holder.return_value.check_model_available.assert_awaited_once_with(
+            self.EXPECTED_MODEL_ID
+        )
+
+    @pytest.mark.asyncio
+    @pytest.mark.usefixtures("inference_config")
+    async def test_returns_holder_failure(
+        self,
+        mocker: MockerFixture,
+    ) -> None:
+        """Test passes through failure result from client holder."""
+        mock_holder = mocker.patch("app.endpoints.health.AsyncLlamaStackClientHolder")
+        mock_holder.return_value.check_model_available = mocker.AsyncMock(
+            return_value=(
+                False,
+                f"Model {self.EXPECTED_MODEL_ID} not found in model registry",
+            )
+        )
+
+        available, reason = await check_default_model_available()
+
+        assert available is False
+        assert "not found in model registry" in reason
diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py