Skip to content

Commit f1bd8fc

Browse files
committed
Improve K8s authentication error handling
Refactored Kubernetes authentication error handling to replace the generic `ClusterIDUnavailableError` with a granular exception hierarchy that differentiates between transient API failures (503) and persistent configuration issues (500). Error messages are now preserved and returned to clients with appropriate HTTP status codes. **Motivation** The current implementation had several issues: - Single `ClusterIDUnavailableError` for all failure scenarios - Error context lost when converting to generic 500 responses - Always returned HTTP 500, even for transient K8s API failures (should be 503) - Broad `except Exception` catches masked specific errors - Hard to debug production issues due to generic error messages Note: This PR does not introduce any breaking changes. The changes are internal to the K8s authentication module and the public API __call__ method signature remains unchanged. Signed-off-by: Anik Bhattacharjee <anbhatta@redhat.com>
1 parent 80d5b9e commit f1bd8fc

4 files changed

Lines changed: 427 additions & 36 deletions

File tree

src/authentication/k8s.py

Lines changed: 98 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,45 @@
3030
)
3131

3232

33-
class ClusterIDUnavailableError(Exception):
34-
"""Cluster ID is not available."""
33+
class K8sAuthenticationError(Exception):
34+
"""Base exception for Kubernetes authentication errors."""
35+
36+
37+
class K8sAPIConnectionError(K8sAuthenticationError):
38+
"""Cannot connect to Kubernetes API server.
39+
40+
Indicates transient failures that may be resolved by retrying.
41+
Maps to HTTP 503 Service Unavailable.
42+
"""
43+
44+
45+
class K8sConfigurationError(K8sAuthenticationError):
46+
"""Kubernetes cluster configuration issue.
47+
48+
Indicates persistent configuration problems requiring admin intervention.
49+
Maps to HTTP 500 Internal Server Error.
50+
"""
51+
52+
53+
class ClusterVersionNotFoundError(K8sConfigurationError):
54+
"""ClusterVersion resource not found in OpenShift cluster.
55+
56+
Raised when the ClusterVersion custom resource does not exist (HTTP 404).
57+
"""
58+
59+
60+
class ClusterVersionPermissionError(K8sConfigurationError):
61+
"""No permission to access ClusterVersion resource.
62+
63+
Raised when RBAC denies access to the ClusterVersion resource (HTTP 403).
64+
"""
65+
66+
67+
class InvalidClusterVersionError(K8sConfigurationError):
68+
"""ClusterVersion resource has invalid structure or missing required fields.
69+
70+
Raised when the ClusterVersion exists but is missing spec.clusterID or has wrong type.
71+
"""
3572

3673

3774
class K8sClientSingleton:
@@ -156,8 +193,10 @@ def _get_cluster_id(cls) -> str:
156193
str: The cluster's `clusterID`.
157194
158195
Raises:
159-
ClusterIDUnavailableError: If the cluster ID cannot be obtained due
160-
to missing keys, an API error, or any unexpected error.
196+
K8sAPIConnectionError: If the Kubernetes API is unreachable or returns 5xx errors.
197+
ClusterVersionNotFoundError: If the ClusterVersion resource does not exist (404).
198+
ClusterVersionPermissionError: If access to ClusterVersion is denied (403).
199+
InvalidClusterVersionError: If ClusterVersion has invalid structure or missing fields.
161200
"""
162201
try:
163202
custom_objects_api = cls.get_custom_objects_api()
@@ -167,22 +206,47 @@ def _get_cluster_id(cls) -> str:
167206
cluster_id = version_data["spec"]["clusterID"]
168207
cls._cluster_id = cluster_id
169208
return cluster_id
209+
except ApiException as e:
210+
# Handle specific HTTP status codes from Kubernetes API
211+
if e.status == 404:
212+
logger.error(
213+
"ClusterVersion resource 'version' not found in cluster: %s",
214+
e.reason,
215+
)
216+
raise ClusterVersionNotFoundError(
217+
"ClusterVersion 'version' resource not found in OpenShift cluster"
218+
) from e
219+
if e.status == 403:
220+
logger.error(
221+
"Permission denied to access ClusterVersion resource: %s", e.reason
222+
)
223+
raise ClusterVersionPermissionError(
224+
"Insufficient permissions to read ClusterVersion resource"
225+
) from e
226+
# Treat all other ApiException as connection/server errors (503)
227+
logger.error(
228+
"Kubernetes API error while fetching ClusterVersion (status %s): %s",
229+
e.status,
230+
e.reason,
231+
)
232+
raise K8sAPIConnectionError(
233+
f"Failed to connect to Kubernetes API: {e.reason} (status {e.status})"
234+
) from e
170235
except KeyError as e:
171236
logger.error(
172-
"Failed to get cluster_id from cluster, missing keys in version object"
237+
"ClusterVersion missing required field 'spec.clusterID': %s", e
173238
)
174-
raise ClusterIDUnavailableError("Failed to get cluster ID") from e
239+
raise InvalidClusterVersionError(
240+
f"ClusterVersion missing required field: {e}"
241+
) from e
175242
except TypeError as e:
176243
logger.error(
177-
"Failed to get cluster_id, version object is: %s", version_data
244+
"ClusterVersion has invalid structure, version_data type: %s",
245+
type(version_data).__name__,
178246
)
179-
raise ClusterIDUnavailableError("Failed to get cluster ID") from e
180-
except ApiException as e:
181-
logger.error("API exception during ClusterInfo: %s", e)
182-
raise ClusterIDUnavailableError("Failed to get cluster ID") from e
183-
except Exception as e:
184-
logger.error("Unexpected error during getting cluster ID: %s", e)
185-
raise ClusterIDUnavailableError("Failed to get cluster ID") from e
247+
raise InvalidClusterVersionError(
248+
f"ClusterVersion has invalid type or structure: {e}"
249+
) from e
186250

187251
@classmethod
188252
def get_cluster_id(cls) -> str:
@@ -199,7 +263,10 @@ def get_cluster_id(cls) -> str:
199263
str: The cluster identifier.
200264
201265
Raises:
202-
ClusterIDUnavailableError: If running in-cluster and fetching the cluster ID fails.
266+
K8sAPIConnectionError: If the Kubernetes API is unreachable.
267+
ClusterVersionNotFoundError: If the ClusterVersion resource does not exist.
268+
ClusterVersionPermissionError: If access to ClusterVersion is denied.
269+
InvalidClusterVersionError: If ClusterVersion has invalid structure.
203270
"""
204271
if cls._instance is None:
205272
cls()
@@ -310,11 +377,24 @@ async def __call__(self, request: Request) -> tuple[str, str, bool, str]:
310377
if user_info.user.username == "kube:admin":
311378
try:
312379
user_info.user.uid = K8sClientSingleton.get_cluster_id()
313-
except ClusterIDUnavailableError as e:
314-
logger.error("Failed to get cluster ID: %s", e)
380+
except K8sAPIConnectionError as e:
381+
# Kubernetes API is unreachable - return 503
382+
logger.error("Cannot connect to Kubernetes API: %s", e)
383+
response = ServiceUnavailableResponse(
384+
backend_name="Kubernetes API",
385+
cause=str(e),
386+
)
387+
raise HTTPException(**response.model_dump()) from e
388+
except (
389+
ClusterVersionNotFoundError,
390+
ClusterVersionPermissionError,
391+
InvalidClusterVersionError,
392+
) as e:
393+
# Cluster misconfiguration - return 500
394+
logger.error("Cluster configuration error: %s", e)
315395
response = InternalServerErrorResponse(
316396
response="Internal server error",
317-
cause="Unable to retrieve cluster ID",
397+
cause=str(e),
318398
)
319399
raise HTTPException(**response.model_dump()) from e
320400

src/models/responses.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2414,6 +2414,27 @@ class InternalServerErrorResponse(AbstractErrorResponse):
24142414
"cause": "Failed to query the database",
24152415
},
24162416
},
2417+
{
2418+
"label": "cluster version not found",
2419+
"detail": {
2420+
"response": "Internal server error",
2421+
"cause": "ClusterVersion 'version' resource not found in OpenShift cluster",
2422+
},
2423+
},
2424+
{
2425+
"label": "cluster version permission denied",
2426+
"detail": {
2427+
"response": "Internal server error",
2428+
"cause": "Insufficient permissions to read ClusterVersion resource",
2429+
},
2430+
},
2431+
{
2432+
"label": "invalid cluster version",
2433+
"detail": {
2434+
"response": "Internal server error",
2435+
"cause": "ClusterVersion missing required field: 'clusterID'",
2436+
},
2437+
},
24172438
]
24182439
}
24192440
}
@@ -2537,7 +2558,17 @@ class ServiceUnavailableResponse(AbstractErrorResponse):
25372558
"response": "Unable to connect to Llama Stack",
25382559
"cause": "Connection error while trying to reach backend service.",
25392560
},
2540-
}
2561+
},
2562+
{
2563+
"label": "kubernetes api",
2564+
"detail": {
2565+
"response": "Unable to connect to Kubernetes API",
2566+
"cause": (
2567+
"Failed to connect to Kubernetes API: "
2568+
"Service Unavailable (status 503)"
2569+
),
2570+
},
2571+
},
25412572
]
25422573
}
25432574
}

0 commit comments

Comments
 (0)