Skip to content

Commit 9e3d1eb

Browse files
committed
Improve K8s authentication error handling
Refactored Kubernetes authentication error handling to replace the generic `ClusterIDUnavailableError` with a granular exception hierarchy that differentiates between transient API failures (503) and persistent configuration issues (500). Error messages are now preserved and returned to clients with appropriate HTTP status codes. **Motivation** The current implementation had several issues: - Single `ClusterIDUnavailableError` for all failure scenarios - Error context lost when converting to generic 500 responses - Always returned HTTP 500, even for transient K8s API failures (should be 503) - Broad `except Exception` catches masked specific errors - Hard to debug production issues due to generic error messages Note: This PR does not introduce any breaking changes. The changes are internal to the K8s authentication module and the public API __call__ method signature remains unchanged. Signed-off-by: Anik Bhattacharjee <anbhatta@redhat.com>
1 parent 5b6d64d commit 9e3d1eb

4 files changed

Lines changed: 599 additions & 43 deletions

File tree

src/authentication/k8s.py

Lines changed: 145 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Manage authentication flow for FastAPI endpoints with K8S/OCP."""
22

33
import os
4+
from http import HTTPStatus
45
from typing import Optional, Self, cast
56

67
import kubernetes.client
@@ -29,8 +30,45 @@
2930
)
3031

3132

32-
class ClusterIDUnavailableError(Exception):
33-
"""Cluster ID is not available."""
33+
class K8sAuthenticationError(Exception):
34+
"""Base exception for Kubernetes authentication errors."""
35+
36+
37+
class K8sAPIConnectionError(K8sAuthenticationError):
38+
"""Cannot connect to Kubernetes API server.
39+
40+
Indicates transient failures that may be resolved by retrying.
41+
Maps to HTTP 503 Service Unavailable.
42+
"""
43+
44+
45+
class K8sConfigurationError(K8sAuthenticationError):
46+
"""Kubernetes cluster configuration issue.
47+
48+
Indicates persistent configuration problems requiring admin intervention.
49+
Maps to HTTP 500 Internal Server Error.
50+
"""
51+
52+
53+
class ClusterVersionNotFoundError(K8sConfigurationError):
54+
"""ClusterVersion resource not found in OpenShift cluster.
55+
56+
Raised when the ClusterVersion custom resource does not exist (HTTP 404).
57+
"""
58+
59+
60+
class ClusterVersionPermissionError(K8sConfigurationError):
61+
"""No permission to access ClusterVersion resource.
62+
63+
Raised when RBAC denies access to the ClusterVersion resource (HTTP 403).
64+
"""
65+
66+
67+
class InvalidClusterVersionError(K8sConfigurationError):
68+
"""ClusterVersion resource has invalid structure or missing required fields.
69+
70+
Raised when the ClusterVersion exists but is missing spec.clusterID or has wrong type.
71+
"""
3472

3573

3674
class K8sClientSingleton:
@@ -156,8 +194,10 @@ def _get_cluster_id(cls) -> str:
156194
str: The cluster's `clusterID`.
157195
158196
Raises:
159-
ClusterIDUnavailableError: If the cluster ID cannot be obtained due
160-
to missing keys, an API error, or any unexpected error.
197+
K8sAPIConnectionError: If the Kubernetes API is unreachable or returns 5xx errors.
198+
ClusterVersionNotFoundError: If the ClusterVersion resource does not exist (404).
199+
ClusterVersionPermissionError: If access to ClusterVersion is denied (403).
200+
InvalidClusterVersionError: If ClusterVersion has invalid structure or missing fields.
161201
"""
162202
try:
163203
custom_objects_api = cls.get_custom_objects_api()
@@ -170,27 +210,61 @@ def _get_cluster_id(cls) -> str:
170210
)
171211
spec = version_data.get("spec")
172212
if not isinstance(spec, dict):
173-
raise ClusterIDUnavailableError(
213+
raise InvalidClusterVersionError(
174214
"Missing or invalid 'spec' in ClusterVersion"
175215
)
176216
cluster_id = spec.get("clusterID")
177217
if not isinstance(cluster_id, str) or not cluster_id.strip():
178-
raise ClusterIDUnavailableError(
218+
raise InvalidClusterVersionError(
179219
"Missing or invalid 'clusterID' in ClusterVersion"
180220
)
181221
cls._cluster_id = cluster_id
182222
return cluster_id
183-
except KeyError as e:
223+
except ApiException as e:
224+
# Handle specific HTTP status codes from Kubernetes API
225+
if e.status is None:
226+
# No status code indicates a connection/network issue
227+
logger.error("Kubernetes API error with no status code: %s", e.reason)
228+
raise K8sAPIConnectionError(
229+
f"Failed to connect to Kubernetes API: {e.reason}"
230+
) from e
231+
232+
if e.status == HTTPStatus.NOT_FOUND:
233+
logger.error(
234+
"ClusterVersion resource 'version' not found in cluster: %s",
235+
e.reason,
236+
)
237+
raise ClusterVersionNotFoundError(
238+
"ClusterVersion 'version' resource not found in OpenShift cluster"
239+
) from e
240+
if e.status == HTTPStatus.FORBIDDEN:
241+
logger.error(
242+
"Permission denied to access ClusterVersion resource: %s", e.reason
243+
)
244+
raise ClusterVersionPermissionError(
245+
"Insufficient permissions to read ClusterVersion resource"
246+
) from e
247+
# Classify errors by status code range
248+
# 5xx errors and 429 (rate limit) are transient - map to 503
249+
if e.status >= 500 or e.status == HTTPStatus.TOO_MANY_REQUESTS:
250+
logger.error(
251+
"Kubernetes API unavailable while fetching ClusterVersion (status %s): %s",
252+
e.status,
253+
e.reason,
254+
)
255+
raise K8sAPIConnectionError(
256+
f"Failed to connect to Kubernetes API: {e.reason} (status {e.status})"
257+
) from e
258+
# All other errors (4xx client errors) are configuration issues - map to 500
184259
logger.error(
185-
"Failed to get cluster_id from cluster, missing keys in version object"
260+
"Kubernetes API returned client error while fetching "
261+
"ClusterVersion (status %s): %s",
262+
e.status,
263+
e.reason,
186264
)
187-
raise ClusterIDUnavailableError("Failed to get cluster ID") from e
188-
except ApiException as e:
189-
logger.error("API exception during ClusterInfo: %s", e)
190-
raise ClusterIDUnavailableError("Failed to get cluster ID") from e
191-
except Exception as e:
192-
logger.error("Unexpected error during getting cluster ID: %s", e)
193-
raise ClusterIDUnavailableError("Failed to get cluster ID") from e
265+
raise K8sConfigurationError(
266+
f"Kubernetes API request failed: {e.reason} (status {e.status})"
267+
) from e
194268

195269
@classmethod
196270
def get_cluster_id(cls) -> str:
@@ -207,7 +281,10 @@ def get_cluster_id(cls) -> str:
207281
str: The cluster identifier.
208282
209283
Raises:
210-
ClusterIDUnavailableError: If running in-cluster and fetching the cluster ID fails.
284+
K8sAPIConnectionError: If the Kubernetes API is unreachable.
285+
ClusterVersionNotFoundError: If the ClusterVersion resource does not exist.
286+
ClusterVersionPermissionError: If access to ClusterVersion is denied.
287+
InvalidClusterVersionError: If ClusterVersion has invalid structure.
211288
"""
212289
if cls._instance is None:
213290
cls()
@@ -230,7 +307,9 @@ def get_user_info(token: str) -> Optional[kubernetes.client.V1TokenReviewStatus]
230307
The V1TokenReviewStatus if the token is valid, None otherwise.
231308
232309
Raises:
233-
HTTPException: If unable to connect to Kubernetes API or unexpected error occurs.
310+
HTTPException: 503 if Kubernetes API is unavailable (5xx errors, 429 rate limit).
311+
500 if Kubernetes API configuration issue (4xx errors).
312+
503 if unable to initialize Kubernetes client.
234313
"""
235314
try:
236315
auth_api = K8sClientSingleton.get_authn_api()
@@ -254,8 +333,44 @@ def get_user_info(token: str) -> Optional[kubernetes.client.V1TokenReviewStatus]
254333
if status is not None and status.authenticated:
255334
return status
256335
return None
336+
except ApiException as e:
337+
if e.status is None:
338+
logger.error(
339+
"Kubernetes API error during TokenReview with no status code: %s",
340+
e.reason,
341+
)
342+
response = ServiceUnavailableResponse(
343+
backend_name="Kubernetes API",
344+
cause=f"Failed to connect to Kubernetes API: {e.reason}",
345+
)
346+
raise HTTPException(**response.model_dump()) from e
347+
348+
# 5xx errors and 429 (rate limit) are transient - map to 503
349+
if e.status >= 500 or e.status == HTTPStatus.TOO_MANY_REQUESTS:
350+
logger.error(
351+
"Kubernetes API unavailable during TokenReview (status %s): %s",
352+
e.status,
353+
e.reason,
354+
)
355+
response = ServiceUnavailableResponse(
356+
backend_name="Kubernetes API",
357+
cause=f"Kubernetes API unavailable: {e.reason} (status {e.status})",
358+
)
359+
raise HTTPException(**response.model_dump()) from e
360+
361+
# All other errors (4xx client errors) are configuration issues - map to 500
362+
logger.error(
363+
"Kubernetes API returned client error during TokenReview (status %s): %s",
364+
e.status,
365+
e.reason,
366+
)
367+
response_obj = InternalServerErrorResponse(
368+
response="Internal server error",
369+
cause=f"Kubernetes API request failed: {e.reason} (status {e.status})",
370+
)
371+
raise HTTPException(**response_obj.model_dump()) from e
257372
except Exception as e: # pylint: disable=broad-exception-caught
258-
logger.error("API exception during TokenReview: %s", e)
373+
logger.error("Unexpected error during TokenReview: %s", e)
259374
return None
260375

261376

@@ -325,11 +440,20 @@ async def __call__(self, request: Request) -> tuple[str, str, bool, str]:
325440
if user.username == "kube:admin":
326441
try:
327442
user.uid = K8sClientSingleton.get_cluster_id()
328-
except ClusterIDUnavailableError as e:
329-
logger.error("Failed to get cluster ID: %s", e)
443+
except K8sAPIConnectionError as e:
444+
# Kubernetes API is unreachable - return 503
445+
logger.error("Cannot connect to Kubernetes API: %s", e)
446+
response = ServiceUnavailableResponse(
447+
backend_name="Kubernetes API",
448+
cause=str(e),
449+
)
450+
raise HTTPException(**response.model_dump()) from e
451+
except K8sConfigurationError as e:
452+
# Cluster misconfiguration or client error - return 500
453+
logger.error("Cluster configuration error: %s", e)
330454
response = InternalServerErrorResponse(
331455
response="Internal server error",
332-
cause="Unable to retrieve cluster ID",
456+
cause=str(e),
333457
)
334458
raise HTTPException(**response.model_dump()) from e
335459

src/models/responses.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2414,6 +2414,27 @@ class InternalServerErrorResponse(AbstractErrorResponse):
24142414
"cause": "Failed to query the database",
24152415
},
24162416
},
2417+
{
2418+
"label": "cluster version not found",
2419+
"detail": {
2420+
"response": "Internal server error",
2421+
"cause": "ClusterVersion 'version' resource not found in OpenShift cluster",
2422+
},
2423+
},
2424+
{
2425+
"label": "cluster version permission denied",
2426+
"detail": {
2427+
"response": "Internal server error",
2428+
"cause": "Insufficient permissions to read ClusterVersion resource",
2429+
},
2430+
},
2431+
{
2432+
"label": "invalid cluster version",
2433+
"detail": {
2434+
"response": "Internal server error",
2435+
"cause": "ClusterVersion missing required field: 'clusterID'",
2436+
},
2437+
},
24172438
]
24182439
}
24192440
}
@@ -2537,7 +2558,17 @@ class ServiceUnavailableResponse(AbstractErrorResponse):
25372558
"response": "Unable to connect to Llama Stack",
25382559
"cause": "Connection error while trying to reach backend service.",
25392560
},
2540-
}
2561+
},
2562+
{
2563+
"label": "kubernetes api",
2564+
"detail": {
2565+
"response": "Unable to connect to Kubernetes API",
2566+
"cause": (
2567+
"Failed to connect to Kubernetes API: "
2568+
"Service Unavailable (status 503)"
2569+
),
2570+
},
2571+
},
25412572
]
25422573
}
25432574
}

0 commit comments

Comments
 (0)