Skip to content

Commit 962193c

Browse files
authored
Merge pull request #1341 from anik120/better-k8sauth-error-handling
LCORE-1493: Improve K8s authentication error handling
2 parents 4822df0 + d9ca8dd commit 962193c

4 files changed

Lines changed: 606 additions & 43 deletions

File tree

src/authentication/k8s.py

Lines changed: 152 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
"""Manage authentication flow for FastAPI endpoints with K8S/OCP."""
22

33
import os
4+
from http import HTTPStatus
45
from typing import Optional, Self, cast
56

67
import kubernetes.client
@@ -29,8 +30,45 @@
2930
)
3031

3132

32-
class ClusterIDUnavailableError(Exception):
33-
"""Cluster ID is not available."""
33+
class K8sAuthenticationError(Exception):
    """Base exception for Kubernetes authentication errors.

    Every Kubernetes-specific exception defined in this module derives from
    this class, so catching it handles all of them.
    """


class K8sAPIConnectionError(K8sAuthenticationError):
    """Cannot connect to Kubernetes API server.

    Indicates transient failures that may be resolved by retrying: the API
    call failed without an HTTP status, with a 5xx status, or with
    429 (rate limit).
    Maps to HTTP 503 Service Unavailable.
    """


class K8sConfigurationError(K8sAuthenticationError):
    """Kubernetes cluster configuration issue.

    Indicates persistent configuration problems requiring admin intervention,
    including Kubernetes API failures that are not classified as transient.
    Maps to HTTP 500 Internal Server Error.
    """


class ClusterVersionNotFoundError(K8sConfigurationError):
    """ClusterVersion resource not found in OpenShift cluster.

    Raised when the ClusterVersion custom resource does not exist (HTTP 404).
    """


class ClusterVersionPermissionError(K8sConfigurationError):
    """No permission to access ClusterVersion resource.

    Raised when RBAC denies access to the ClusterVersion resource (HTTP 403).
    """


class InvalidClusterVersionError(K8sConfigurationError):
    """ClusterVersion resource has invalid structure or missing required fields.

    Raised when the ClusterVersion exists but its 'spec' is not a mapping, or
    its 'clusterID' is missing, not a string, or blank.
    """
3472

3573

3674
class K8sClientSingleton:
@@ -156,8 +194,10 @@ def _get_cluster_id(cls) -> str:
156194
str: The cluster's `clusterID`.
157195
158196
Raises:
159-
ClusterIDUnavailableError: If the cluster ID cannot be obtained due
160-
to missing keys, an API error, or any unexpected error.
197+
K8sAPIConnectionError: If the Kubernetes API is unreachable or returns 5xx errors.
198+
ClusterVersionNotFoundError: If the ClusterVersion resource does not exist (404).
199+
ClusterVersionPermissionError: If access to ClusterVersion is denied (403).
200+
InvalidClusterVersionError: If ClusterVersion has invalid structure or missing fields.
161201
"""
162202
try:
163203
custom_objects_api = cls.get_custom_objects_api()
@@ -170,27 +210,64 @@ def _get_cluster_id(cls) -> str:
170210
)
171211
spec = version_data.get("spec")
172212
if not isinstance(spec, dict):
173-
raise ClusterIDUnavailableError(
213+
raise InvalidClusterVersionError(
174214
"Missing or invalid 'spec' in ClusterVersion"
175215
)
176216
cluster_id = spec.get("clusterID")
177217
if not isinstance(cluster_id, str) or not cluster_id.strip():
178-
raise ClusterIDUnavailableError(
218+
raise InvalidClusterVersionError(
179219
"Missing or invalid 'clusterID' in ClusterVersion"
180220
)
181221
cls._cluster_id = cluster_id
182222
return cluster_id
183-
except KeyError as e:
223+
except ApiException as e:
224+
# Handle specific HTTP status codes from Kubernetes API
225+
if e.status is None:
226+
# No status code indicates a connection/network issue
227+
logger.error("Kubernetes API error with no status code: %s", e.reason)
228+
raise K8sAPIConnectionError(
229+
f"Failed to connect to Kubernetes API: {e.reason}"
230+
) from e
231+
232+
if e.status == HTTPStatus.NOT_FOUND:
233+
logger.error(
234+
"ClusterVersion resource 'version' not found in cluster: %s",
235+
e.reason,
236+
)
237+
raise ClusterVersionNotFoundError(
238+
"ClusterVersion 'version' resource not found in OpenShift cluster"
239+
) from e
240+
if e.status == HTTPStatus.FORBIDDEN:
241+
logger.error(
242+
"Permission denied to access ClusterVersion resource: %s", e.reason
243+
)
244+
raise ClusterVersionPermissionError(
245+
"Insufficient permissions to read ClusterVersion resource"
246+
) from e
247+
# Classify errors by status code range
248+
# 5xx errors and 429 (rate limit) are transient - map to 503
249+
if (
250+
e.status >= HTTPStatus.INTERNAL_SERVER_ERROR
251+
or e.status == HTTPStatus.TOO_MANY_REQUESTS
252+
):
253+
logger.error(
254+
"Kubernetes API unavailable while fetching ClusterVersion (status %s): %s",
255+
e.status,
256+
e.reason,
257+
)
258+
raise K8sAPIConnectionError(
259+
f"Failed to connect to Kubernetes API: {e.reason} (status {e.status})"
260+
) from e
261+
# All other errors (4xx client errors) are configuration issues - map to 500
184262
logger.error(
185-
"Failed to get cluster_id from cluster, missing keys in version object"
263+
"Kubernetes API returned client error while fetching "
264+
"ClusterVersion (status %s): %s",
265+
e.status,
266+
e.reason,
186267
)
187-
raise ClusterIDUnavailableError("Failed to get cluster ID") from e
188-
except ApiException as e:
189-
logger.error("API exception during ClusterInfo: %s", e)
190-
raise ClusterIDUnavailableError("Failed to get cluster ID") from e
191-
except Exception as e:
192-
logger.error("Unexpected error during getting cluster ID: %s", e)
193-
raise ClusterIDUnavailableError("Failed to get cluster ID") from e
268+
raise K8sConfigurationError(
269+
f"Kubernetes API request failed: {e.reason} (status {e.status})"
270+
) from e
194271

195272
@classmethod
196273
def get_cluster_id(cls) -> str:
@@ -207,7 +284,10 @@ def get_cluster_id(cls) -> str:
207284
str: The cluster identifier.
208285
209286
Raises:
210-
ClusterIDUnavailableError: If running in-cluster and fetching the cluster ID fails.
287+
K8sAPIConnectionError: If the Kubernetes API is unreachable.
288+
ClusterVersionNotFoundError: If the ClusterVersion resource does not exist.
289+
ClusterVersionPermissionError: If access to ClusterVersion is denied.
290+
InvalidClusterVersionError: If ClusterVersion has invalid structure.
211291
"""
212292
if cls._instance is None:
213293
cls()
@@ -230,7 +310,10 @@ def get_user_info(token: str) -> Optional[kubernetes.client.V1TokenReviewStatus]
230310
The V1TokenReviewStatus if the token is valid, None otherwise.
231311
232312
Raises:
233-
HTTPException: If unable to connect to Kubernetes API or unexpected error occurs.
313+
HTTPException:
314+
503 if Kubernetes API is unavailable (5xx errors, 429 rate limit).
315+
503 if unable to initialize Kubernetes client.
316+
500 if Kubernetes API configuration issue (4xx errors).
234317
"""
235318
try:
236319
auth_api = K8sClientSingleton.get_authn_api()
@@ -254,8 +337,47 @@ def get_user_info(token: str) -> Optional[kubernetes.client.V1TokenReviewStatus]
254337
if status is not None and status.authenticated:
255338
return status
256339
return None
340+
except ApiException as e:
341+
if e.status is None:
342+
logger.error(
343+
"Kubernetes API error during TokenReview with no status code: %s",
344+
e.reason,
345+
)
346+
response = ServiceUnavailableResponse(
347+
backend_name="Kubernetes API",
348+
cause=f"Failed to connect to Kubernetes API: {e.reason}",
349+
)
350+
raise HTTPException(**response.model_dump()) from e
351+
352+
# 5xx errors and 429 (rate limit) are transient - map to 503
353+
if (
354+
e.status >= HTTPStatus.INTERNAL_SERVER_ERROR
355+
or e.status == HTTPStatus.TOO_MANY_REQUESTS
356+
):
357+
logger.error(
358+
"Kubernetes API unavailable during TokenReview (status %s): %s",
359+
e.status,
360+
e.reason,
361+
)
362+
response = ServiceUnavailableResponse(
363+
backend_name="Kubernetes API",
364+
cause=f"Kubernetes API unavailable: {e.reason} (status {e.status})",
365+
)
366+
raise HTTPException(**response.model_dump()) from e
367+
368+
# All other errors (4xx client errors) are configuration issues - map to 500
369+
logger.error(
370+
"Kubernetes API returned client error during TokenReview (status %s): %s",
371+
e.status,
372+
e.reason,
373+
)
374+
response_obj = InternalServerErrorResponse(
375+
response="Internal server error",
376+
cause=f"Kubernetes API request failed: {e.reason} (status {e.status})",
377+
)
378+
raise HTTPException(**response_obj.model_dump()) from e
257379
except Exception as e: # pylint: disable=broad-exception-caught
258-
logger.error("API exception during TokenReview: %s", e)
380+
logger.error("Unexpected error during TokenReview: %s", e)
259381
return None
260382

261383

@@ -325,11 +447,20 @@ async def __call__(self, request: Request) -> tuple[str, str, bool, str]:
325447
if user.username == "kube:admin":
326448
try:
327449
user.uid = K8sClientSingleton.get_cluster_id()
328-
except ClusterIDUnavailableError as e:
329-
logger.error("Failed to get cluster ID: %s", e)
450+
except K8sAPIConnectionError as e:
451+
# Kubernetes API is unreachable - return 503
452+
logger.error("Cannot connect to Kubernetes API: %s", e)
453+
response = ServiceUnavailableResponse(
454+
backend_name="Kubernetes API",
455+
cause=str(e),
456+
)
457+
raise HTTPException(**response.model_dump()) from e
458+
except K8sConfigurationError as e:
459+
# Cluster misconfiguration or client error - return 500
460+
logger.error("Cluster configuration error: %s", e)
330461
response = InternalServerErrorResponse(
331462
response="Internal server error",
332-
cause="Unable to retrieve cluster ID",
463+
cause=str(e),
333464
)
334465
raise HTTPException(**response.model_dump()) from e
335466

src/models/responses.py

Lines changed: 32 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2431,6 +2431,27 @@ class InternalServerErrorResponse(AbstractErrorResponse):
24312431
"cause": "Failed to query the database",
24322432
},
24332433
},
2434+
{
2435+
"label": "cluster version not found",
2436+
"detail": {
2437+
"response": "Internal server error",
2438+
"cause": "ClusterVersion 'version' resource not found in OpenShift cluster",
2439+
},
2440+
},
2441+
{
2442+
"label": "cluster version permission denied",
2443+
"detail": {
2444+
"response": "Internal server error",
2445+
"cause": "Insufficient permissions to read ClusterVersion resource",
2446+
},
2447+
},
2448+
{
2449+
"label": "invalid cluster version",
2450+
"detail": {
2451+
"response": "Internal server error",
2452+
"cause": "ClusterVersion missing required field: 'clusterID'",
2453+
},
2454+
},
24342455
]
24352456
}
24362457
}
@@ -2554,7 +2575,17 @@ class ServiceUnavailableResponse(AbstractErrorResponse):
25542575
"response": "Unable to connect to Llama Stack",
25552576
"cause": "Connection error while trying to reach backend service.",
25562577
},
2557-
}
2578+
},
2579+
{
2580+
"label": "kubernetes api",
2581+
"detail": {
2582+
"response": "Unable to connect to Kubernetes API",
2583+
"cause": (
2584+
"Failed to connect to Kubernetes API: "
2585+
"Service Unavailable (status 503)"
2586+
),
2587+
},
2588+
},
25582589
]
25592590
}
25602591
}

0 commit comments

Comments
 (0)