Skip to content

Commit 0ccdee7

Browse files
committed
Handle KG upstream errors gracefully and add /health endpoint
- Map KG 500 responses to HTTP 503 with a "retry later" message via a global exception handler, so clients can distinguish upstream failures from bugs in this service.
- Map fairgraph AuthenticationError to HTTP 401 in the same handler, fixing a bug where expired-token requests returned 500.
- Add allow_interactive=False to both KGClient constructors so the client never makes an eager user_info() call that could raise AuthenticationError before the first real request.
- Add a GET /health endpoint that probes the KG with a minimal query, making up to 3 attempts before returning 503, so the monitor can distinguish a consistently-down KG from transient flakiness.
1 parent 24cebf5 commit 0ccdee7

2 files changed

Lines changed: 69 additions & 2 deletions

File tree

validation_service_api/validation_service/auth.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,13 @@ def get_kg_client_for_service_account():
5353
client_id=settings.KG_SERVICE_ACCOUNT_CLIENT_ID,
5454
client_secret=settings.KG_SERVICE_ACCOUNT_SECRET,
5555
host=settings.KG_CORE_API_HOST,
56+
allow_interactive=False,
5657
)
5758
return kg_client_for_service_account
5859

5960

6061
def get_kg_client_for_user_account(token):
61-
return KGClient(token=token.credentials, host=settings.KG_CORE_API_HOST)
62+
return KGClient(token=token.credentials, host=settings.KG_CORE_API_HOST, allow_interactive=False)
6263

6364

6465
async def get_collab_info(collab_id, token):

validation_service_api/validation_service/main.py

Lines changed: 67 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,20 @@
1+
import asyncio
2+
import functools
3+
import logging
14
from contextlib import asynccontextmanager
25

3-
from fastapi import FastAPI
6+
from fastapi import FastAPI, Request
7+
from fastapi.responses import JSONResponse
48
from starlette.middleware.sessions import SessionMiddleware
59
from starlette.middleware.cors import CORSMiddleware
10+
from fairgraph.errors import AuthenticationError
611

712
from .resources import models, tests, vocab, results, auth, comments
813
from . import settings
914
from .auth import get_kg_client_for_service_account
1015

16+
logger = logging.getLogger("validation_service_api")
17+
1118

1219
description = """
1320
The EBRAINS Model Validation Service is a web service to support
@@ -42,6 +49,65 @@ async def lifespan(app: FastAPI):
4249

4350
app = FastAPI(title="EBRAINS Model Validation Service", description=description, version="3beta", lifespan=lifespan)
4451

52+
53+
@app.exception_handler(Exception)
async def unhandled_exception_handler(request: Request, exc: Exception) -> JSONResponse:
    """Convert exceptions that no other handler caught into JSON error responses.

    Mapping:
      * ``AuthenticationError`` -> 401 with a "token may have expired" message.
      * any exception whose string form contains ``"code=500"`` -> 503 with a
        "try again later" message.
      * everything else -> 500, logged together with its traceback.

    Args:
        request: The incoming request (unused; required by the handler signature).
        exc: The exception that propagated out of the route.

    Returns:
        A ``JSONResponse`` whose body is ``{"detail": <message>}``.
    """
    if isinstance(exc, AuthenticationError):
        logger.warning("Unauthenticated KG request (likely expired token): %s", exc)
        return JSONResponse(
            status_code=401,
            content={"detail": "Authentication failed. Your token may have expired."},
            # A 401 response is required to carry a challenge (RFC 9110, 15.5.2).
            # NOTE(review): assumes this API authenticates with bearer tokens - confirm.
            headers={"WWW-Authenticate": "Bearer"},
        )

    # The upstream status is recognised by substring match on the message.
    # NOTE(review): presumably the KG client embeds "code=<status>" in the
    # exceptions it raises - verify against the client library.
    if "code=500" in str(exc):
        logger.warning("KG upstream 500: %s", exc)
        return JSONResponse(
            status_code=503,
            content={"detail": "The upstream data service is temporarily unavailable. Please try again in a few minutes."},
        )

    # This handler runs outside an ``except`` block, so the exception has to be
    # passed explicitly for the traceback to be logged.
    logger.exception("Unhandled exception", exc_info=exc)
    return JSONResponse(status_code=500, content={"detail": "Internal server error"})
69+
70+
71+
def _probe_kg() -> bool:
    """Run one minimal query against the KG using the service-account client.

    Returns:
        ``False`` if the stored query "VF_ScientificModelSummary" could not be
        retrieved (``retrieve_query`` returned ``None``), ``True`` if the query
        ran without raising.

    Raises:
        Whatever the KG client raises; the caller treats any exception as a
        failed attempt.
    """
    kg_client = get_kg_client_for_service_account()
    query = kg_client.retrieve_query("VF_ScientificModelSummary")
    if query is None:
        return False
    kg_client.query(
        query,
        {"space": "model"},
        size=1,
        from_index=0,
        release_status="released",
        use_stored_query=True,
    )
    return True


@app.get("/health", tags=["Health"])
async def health_check():
    """Report whether the service and the upstream KG are usable.

    Returns 503 immediately when ``settings.SERVICE_STATUS`` is set to anything
    other than "ok", or when the stored probe query cannot be retrieved.
    Otherwise the KG is probed up to 3 times (2 s pause between attempts, 5 s
    timeout per attempt); the first success yields ``{"status": "ok"}`` and
    three failures yield a 503 with ``{"status": "unavailable"}``.
    """
    service_status = getattr(settings, "SERVICE_STATUS", "ok")
    if service_status != "ok":
        return JSONResponse(
            status_code=503,
            content={"status": "unavailable", "reason": service_status},
        )

    # The whole probe (client construction, query lookup, query execution) runs
    # in the default executor so that synchronous client calls neither block
    # the event loop nor bypass the retry/timeout handling below.
    loop = asyncio.get_running_loop()
    last_exc = None
    for attempt in range(3):
        if attempt > 0:
            await asyncio.sleep(2)
        try:
            # NOTE: on timeout only the wait is abandoned; the worker thread
            # keeps running until the underlying call returns.
            query_found = await asyncio.wait_for(loop.run_in_executor(None, _probe_kg), timeout=5.0)
        except Exception as exc:
            last_exc = exc
            # %r rather than %s: str() of a timeout error is empty.
            logger.warning("Health check attempt %d/3 failed: %r", attempt + 1, exc)
            continue
        if not query_found:
            # A missing stored query is not retried.
            return JSONResponse(status_code=503, content={"status": "unavailable"})
        return {"status": "ok"}

    logger.error("Health check: KG unavailable after 3 attempts. Last error: %r", last_exc)
    return JSONResponse(status_code=503, content={"status": "unavailable"})
109+
110+
45111
app.add_middleware(
46112
SessionMiddleware,
47113
secret_key=settings.SESSIONS_SECRET_KEY

0 commit comments

Comments
 (0)