Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion docs/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -944,6 +944,16 @@
}
}
}
},
"503": {
"description": "Service is not alive",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/LivenessResponse"
}
}
}
}
}
}
Expand Down Expand Up @@ -1708,14 +1718,25 @@
"alive": {
"type": "boolean",
"title": "Alive"
},
"reason": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Reason"
}
},
"type": "object",
"required": [
"alive"
],
"title": "LivenessResponse",
"description": "Model representing a response to a liveness request.\n\nAttributes:\n alive: If app is alive.\n\nExample:\n ```python\n liveness_response = LivenessResponse(alive=True)\n ```",
"description": "Model representing a response to a liveness request.\n\nAttributes:\n alive: If app is alive.\n reason: Optional reason when not alive.\n\nExample:\n ```python\n liveness_response = LivenessResponse(alive=True)\n ```",
"examples": [
{
"alive": true
Expand Down
15 changes: 13 additions & 2 deletions ols/app/endpoints/health.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import time
from typing import Any

from fastapi import APIRouter, HTTPException, status
from fastapi import APIRouter, HTTPException, Response, status
from langchain_core.messages.ai import AIMessage

from ols import config
Expand All @@ -18,6 +18,7 @@
NotAvailableResponse,
ReadinessResponse,
)
from ols.src.cache.postgres_cache import PostgresCache
from ols.src.llms.llm_loader import load_llm

router = APIRouter(tags=["health"])
Expand Down Expand Up @@ -122,10 +123,20 @@ def readiness_probe_get_method() -> ReadinessResponse:
"description": "Service is alive",
"model": LivenessResponse,
},
503: {
"description": "Service is not alive",
"model": LivenessResponse,
},
Comment thread
coderabbitai[bot] marked this conversation as resolved.
}


@router.get("/liveness", responses=get_liveness_responses)
def liveness_probe_get_method() -> LivenessResponse:
def liveness_probe_get_method(response: Response) -> LivenessResponse:
    """Report whether the service is alive.

    When the conversation cache is a ``PostgresCache`` and its consecutive
    failure count has reached ``liveness_db_failure_threshold``, the HTTP
    status is set to 503 and the payload carries the failure reason.
    In every other case the service is reported as alive.

    Args:
        response: Response object whose status code is set on failure.

    Returns:
        The liveness payload describing the current state.
    """
    backend = config._conversation_cache

    # Only the Postgres-backed cache takes part in this check.
    if not isinstance(backend, PostgresCache):
        return LivenessResponse(alive=True)

    failure_limit = config.ols_config.liveness_db_failure_threshold
    if backend.consecutive_failures < failure_limit:
        return LivenessResponse(alive=True)

    response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
    return LivenessResponse(alive=False, reason="database unreachable")
30 changes: 30 additions & 0 deletions ols/app/models/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
BaseModel,
Field,
FilePath,
NonNegativeInt,
PositiveInt,
PrivateAttr,
field_validator,
Expand Down Expand Up @@ -52,6 +53,21 @@ def validate_tool_round_cap_fraction_config(v: float) -> float:
return v


def validate_liveness_db_failure_threshold(raw_value: Any) -> int:
    """Validate ``liveness_db_failure_threshold`` for ``OLSConfig``.

    Args:
        raw_value: Value taken from the configuration. Accepted forms are an
            integer, a float with no fractional part, or anything else that
            ``int()`` converts (for example the string ``"3"``).

    Returns:
        The threshold as an ``int`` greater than or equal to 1.

    Raises:
        checks.InvalidConfigurationError: If the value is a boolean, a float
            with a fractional part (including ``inf``/``nan``), cannot be
            converted to ``int``, or is lower than 1.
    """
    # bool is a subclass of int, so int(True) == 1 would pass silently;
    # int(2.5) would silently truncate to 2. Neither is "a positive integer".
    is_bool = isinstance(raw_value, bool)
    is_fractional = isinstance(raw_value, float) and not raw_value.is_integer()
    if is_bool or is_fractional:
        raise checks.InvalidConfigurationError(
            f"liveness_db_failure_threshold must be a positive integer, got {raw_value!r}"
        )
    try:
        threshold = int(raw_value)
    except (TypeError, ValueError, OverflowError) as e:
        # OverflowError covers infinite values of non-float numeric types.
        raise checks.InvalidConfigurationError(
            f"liveness_db_failure_threshold must be a positive integer, got {raw_value!r}"
        ) from e
    if threshold < 1:
        raise checks.InvalidConfigurationError(
            f"liveness_db_failure_threshold must be at least 1, got {threshold}"
        )
    return threshold


class ModelParameters(BaseModel):
"""Model parameters."""

Expand Down Expand Up @@ -798,6 +814,12 @@ class PostgresConfig(BaseModel):
gss_encmode: str = constants.POSTGRES_CACHE_GSSENCMODE
ca_cert_path: Optional[FilePath] = None
max_entries: PositiveInt = constants.POSTGRES_CACHE_MAX_ENTRIES
statement_timeout: NonNegativeInt = constants.POSTGRES_STATEMENT_TIMEOUT
lock_timeout: PositiveInt = constants.POSTGRES_LOCK_TIMEOUT
health_check_interval: PositiveInt = constants.CACHE_HEALTH_CHECK_INTERVAL
health_check_connect_timeout: PositiveInt = (
constants.CACHE_HEALTH_CHECK_CONNECT_TIMEOUT
)
tls_security_profile: Optional["TLSSecurityProfile"] = None

def __init__(self, **data: Any) -> None:
Expand Down Expand Up @@ -1160,6 +1182,8 @@ class OLSConfig(BaseModel):

offload_storage_path: str = constants.DEFAULT_OFFLOAD_STORAGE_PATH

liveness_db_failure_threshold: int = constants.LIVENESS_DB_FAILURE_THRESHOLD

Comment thread
coderabbitai[bot] marked this conversation as resolved.
def __init__(
self, data: Optional[dict] = None, ignore_missing_certs: bool = False
) -> None:
Expand Down Expand Up @@ -1234,6 +1258,12 @@ def __init__(
"offload_storage_path", constants.DEFAULT_OFFLOAD_STORAGE_PATH
)

self.liveness_db_failure_threshold = validate_liveness_db_failure_threshold(
data.get(
"liveness_db_failure_threshold", constants.LIVENESS_DB_FAILURE_THRESHOLD
)
)

def _propagate_tls_profile(self) -> None:
"""Set the TLS security profile on all PostgresConfig instances."""
if (
Expand Down
2 changes: 2 additions & 0 deletions ols/app/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,7 @@ class LivenessResponse(BaseModel):

Attributes:
alive: If app is alive.
reason: Optional reason when not alive.

Example:
```python
Expand All @@ -483,6 +484,7 @@ class LivenessResponse(BaseModel):
"""

alive: bool
reason: Optional[str] = None

# provides examples for /docs endpoint
model_config = {
Expand Down
6 changes: 6 additions & 0 deletions ols/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ class GenericLLMParameters:
POSTGRES_CACHE_DBNAME = "cache"
POSTGRES_CACHE_USER = "postgres"
POSTGRES_CACHE_MAX_ENTRIES = 1000
# Default for PostgresConfig.statement_timeout (presumably milliseconds — TODO confirm).
POSTGRES_STATEMENT_TIMEOUT = 5000
# Default for PostgresConfig.lock_timeout (unit not visible here — TODO confirm).
POSTGRES_LOCK_TIMEOUT = 10

# look at https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNECT-SSLMODE
# for all possible options
Expand All @@ -137,6 +139,10 @@ class GenericLLMParameters:
# for all possible options
POSTGRES_CACHE_GSSENCMODE = "prefer"

# Defaults for PostgresConfig.health_check_interval and
# PostgresConfig.health_check_connect_timeout (presumably seconds — TODO confirm).
CACHE_HEALTH_CHECK_INTERVAL = 30
CACHE_HEALTH_CHECK_CONNECT_TIMEOUT = 10
# Default for OLSConfig.liveness_db_failure_threshold: the /liveness endpoint
# answers 503 once PostgresCache.consecutive_failures reaches this value.
LIVENESS_DB_FAILURE_THRESHOLD = 3


# default identity for local testing and deployment
# "nil" UUID is used on purpose, because it will be easier to
Expand Down
Loading