Skip to content

Commit 0a569eb

Browse files
committed
Improve K8s authentication error handling
Refactored Kubernetes authentication error handling to replace the generic `ClusterIDUnavailableError` with a granular exception hierarchy that differentiates between transient API failures (503) and persistent configuration issues (500). Error messages are now preserved and returned to clients with appropriate HTTP status codes. **Motivation** The current implementation had several issues: - Single `ClusterIDUnavailableError` for all failure scenarios - Error context lost when converting to generic 500 responses - Always returned HTTP 500, even for transient K8s API failures (should be 503) - Broad `except Exception` catches masked specific errors - Hard to debug production issues due to generic error messages Note: This PR does not introduce any breaking changes. The changes are internal to the K8s authentication module and the public API __call__ method signature remains unchanged. Signed-off-by: Anik Bhattacharjee <anbhatta@redhat.com>
1 parent 80d5b9e commit 0a569eb

4 files changed

Lines changed: 477 additions & 38 deletions

File tree

src/authentication/k8s.py

Lines changed: 108 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import os
44
from pathlib import Path
5+
from http import HTTPStatus
56
from typing import Optional, Self
67

78
import kubernetes.client
@@ -30,8 +31,45 @@
3031
)
3132

3233

33-
class ClusterIDUnavailableError(Exception):
34-
"""Cluster ID is not available."""
34+
class K8sAuthenticationError(Exception):
35+
"""Base exception for Kubernetes authentication errors."""
36+
37+
38+
class K8sAPIConnectionError(K8sAuthenticationError):
39+
"""Cannot connect to Kubernetes API server.
40+
41+
Indicates transient failures that may be resolved by retrying.
42+
Maps to HTTP 503 Service Unavailable.
43+
"""
44+
45+
46+
class K8sConfigurationError(K8sAuthenticationError):
47+
"""Kubernetes cluster configuration issue.
48+
49+
Indicates persistent configuration problems requiring admin intervention.
50+
Maps to HTTP 500 Internal Server Error.
51+
"""
52+
53+
54+
class ClusterVersionNotFoundError(K8sConfigurationError):
55+
"""ClusterVersion resource not found in OpenShift cluster.
56+
57+
Raised when the ClusterVersion custom resource does not exist (HTTP 404).
58+
"""
59+
60+
61+
class ClusterVersionPermissionError(K8sConfigurationError):
62+
"""No permission to access ClusterVersion resource.
63+
64+
Raised when RBAC denies access to the ClusterVersion resource (HTTP 403).
65+
"""
66+
67+
68+
class InvalidClusterVersionError(K8sConfigurationError):
69+
"""ClusterVersion resource has invalid structure or missing required fields.
70+
71+
Raised when the ClusterVersion exists but is missing spec.clusterID or has wrong type.
72+
"""
3573

3674

3775
class K8sClientSingleton:
@@ -156,9 +194,12 @@ def _get_cluster_id(cls) -> str:
156194
str: The cluster's `clusterID`.
157195
158196
Raises:
159-
ClusterIDUnavailableError: If the cluster ID cannot be obtained due
160-
to missing keys, an API error, or any unexpected error.
197+
K8sAPIConnectionError: If the Kubernetes API is unreachable or returns 5xx errors.
198+
ClusterVersionNotFoundError: If the ClusterVersion resource does not exist (404).
199+
ClusterVersionPermissionError: If access to ClusterVersion is denied (403).
200+
InvalidClusterVersionError: If ClusterVersion has invalid structure or missing fields.
161201
"""
202+
version_data: Optional[object] = None
162203
try:
163204
custom_objects_api = cls.get_custom_objects_api()
164205
version_data = custom_objects_api.get_cluster_custom_object(
@@ -167,22 +208,59 @@ def _get_cluster_id(cls) -> str:
167208
cluster_id = version_data["spec"]["clusterID"]
168209
cls._cluster_id = cluster_id
169210
return cluster_id
211+
except ApiException as e:
212+
# Handle specific HTTP status codes from Kubernetes API
213+
if e.status == HTTPStatus.NOT_FOUND:
214+
logger.error(
215+
"ClusterVersion resource 'version' not found in cluster: %s",
216+
e.reason,
217+
)
218+
raise ClusterVersionNotFoundError(
219+
"ClusterVersion 'version' resource not found in OpenShift cluster"
220+
) from e
221+
if e.status == HTTPStatus.FORBIDDEN:
222+
logger.error(
223+
"Permission denied to access ClusterVersion resource: %s", e.reason
224+
)
225+
raise ClusterVersionPermissionError(
226+
"Insufficient permissions to read ClusterVersion resource"
227+
) from e
228+
# Classify errors by status code range
229+
# 5xx errors and 429 (rate limit) are transient - map to 503
230+
if e.status >= 500 or e.status == HTTPStatus.TOO_MANY_REQUESTS:
231+
logger.error(
232+
"Kubernetes API unavailable while fetching ClusterVersion (status %s): %s",
233+
e.status,
234+
e.reason,
235+
)
236+
raise K8sAPIConnectionError(
237+
f"Failed to connect to Kubernetes API: {e.reason} (status {e.status})"
238+
) from e
239+
# All other errors (4xx client errors) are configuration issues - map to 500
240+
logger.error(
241+
"Kubernetes API returned client error while fetching "
242+
"ClusterVersion (status %s): %s",
243+
e.status,
244+
e.reason,
245+
)
246+
raise K8sConfigurationError(
247+
f"Kubernetes API request failed: {e.reason} (status {e.status})"
248+
) from e
170249
except KeyError as e:
171250
logger.error(
172-
"Failed to get cluster_id from cluster, missing keys in version object"
251+
"ClusterVersion missing required field 'spec.clusterID': %s", e
173252
)
174-
raise ClusterIDUnavailableError("Failed to get cluster ID") from e
253+
raise InvalidClusterVersionError(
254+
f"ClusterVersion missing required field: {e}"
255+
) from e
175256
except TypeError as e:
176257
logger.error(
177-
"Failed to get cluster_id, version object is: %s", version_data
258+
"ClusterVersion has invalid structure, version_data type: %s",
259+
type(version_data).__name__ if version_data is not None else "unknown",
178260
)
179-
raise ClusterIDUnavailableError("Failed to get cluster ID") from e
180-
except ApiException as e:
181-
logger.error("API exception during ClusterInfo: %s", e)
182-
raise ClusterIDUnavailableError("Failed to get cluster ID") from e
183-
except Exception as e:
184-
logger.error("Unexpected error during getting cluster ID: %s", e)
185-
raise ClusterIDUnavailableError("Failed to get cluster ID") from e
261+
raise InvalidClusterVersionError(
262+
f"ClusterVersion has invalid type or structure: {e}"
263+
) from e
186264

187265
@classmethod
188266
def get_cluster_id(cls) -> str:
@@ -199,7 +277,10 @@ def get_cluster_id(cls) -> str:
199277
str: The cluster identifier.
200278
201279
Raises:
202-
ClusterIDUnavailableError: If running in-cluster and fetching the cluster ID fails.
280+
K8sAPIConnectionError: If the Kubernetes API is unreachable.
281+
ClusterVersionNotFoundError: If the ClusterVersion resource does not exist.
282+
ClusterVersionPermissionError: If access to ClusterVersion is denied.
283+
InvalidClusterVersionError: If ClusterVersion has invalid structure.
203284
"""
204285
if cls._instance is None:
205286
cls()
@@ -310,11 +391,20 @@ async def __call__(self, request: Request) -> tuple[str, str, bool, str]:
310391
if user_info.user.username == "kube:admin":
311392
try:
312393
user_info.user.uid = K8sClientSingleton.get_cluster_id()
313-
except ClusterIDUnavailableError as e:
314-
logger.error("Failed to get cluster ID: %s", e)
394+
except K8sAPIConnectionError as e:
395+
# Kubernetes API is unreachable - return 503
396+
logger.error("Cannot connect to Kubernetes API: %s", e)
397+
response = ServiceUnavailableResponse(
398+
backend_name="Kubernetes API",
399+
cause=str(e),
400+
)
401+
raise HTTPException(**response.model_dump()) from e
402+
except K8sConfigurationError as e:
403+
# Cluster misconfiguration or client error - return 500
404+
logger.error("Cluster configuration error: %s", e)
315405
response = InternalServerErrorResponse(
316406
response="Internal server error",
317-
cause="Unable to retrieve cluster ID",
407+
cause=str(e),
318408
)
319409
raise HTTPException(**response.model_dump()) from e
320410

src/models/responses.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2414,6 +2414,27 @@ class InternalServerErrorResponse(AbstractErrorResponse):
24142414
"cause": "Failed to query the database",
24152415
},
24162416
},
2417+
{
2418+
"label": "cluster version not found",
2419+
"detail": {
2420+
"response": "Internal server error",
2421+
"cause": "ClusterVersion 'version' resource not found in OpenShift cluster",
2422+
},
2423+
},
2424+
{
2425+
"label": "cluster version permission denied",
2426+
"detail": {
2427+
"response": "Internal server error",
2428+
"cause": "Insufficient permissions to read ClusterVersion resource",
2429+
},
2430+
},
2431+
{
2432+
"label": "invalid cluster version",
2433+
"detail": {
2434+
"response": "Internal server error",
2435+
"cause": "ClusterVersion missing required field: 'clusterID'",
2436+
},
2437+
},
24172438
]
24182439
}
24192440
}
@@ -2537,7 +2558,17 @@ class ServiceUnavailableResponse(AbstractErrorResponse):
25372558
"response": "Unable to connect to Llama Stack",
25382559
"cause": "Connection error while trying to reach backend service.",
25392560
},
2540-
}
2561+
},
2562+
{
2563+
"label": "kubernetes api",
2564+
"detail": {
2565+
"response": "Unable to connect to Kubernetes API",
2566+
"cause": (
2567+
"Failed to connect to Kubernetes API: "
2568+
"Service Unavailable (status 503)"
2569+
),
2570+
},
2571+
},
25412572
]
25422573
}
25432574
}

0 commit comments

Comments
 (0)