Skip to content

Commit b25ab1d

Browse files
chore(RHOAIENG-31062): Improve API error messages
rh-pre-commit.version: 2.3.2 rh-pre-commit.check-secrets: ENABLED
1 parent d61babf commit b25ab1d

8 files changed

Lines changed: 206 additions & 43 deletions

File tree

src/codeflare_sdk/common/kubernetes_cluster/kube_api_helpers.py

Lines changed: 63 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,21 +17,61 @@
1717
API error handling or wrapping.
1818
"""
1919

20+
import json
2021
import executing
2122
from kubernetes import client, config
2223

24+
2325
ERROR_MESSAGES = {
24-
"Not Found": "The requested resource could not be located.\n"
25-
"Please verify the resource name and namespace.",
26-
"Unauthorized": "Access to the API is unauthorized.\n"
26+
401: "Access to the API is unauthorized.\n"
2727
"Check your credentials or permissions.",
28-
"Forbidden": "Access denied to the Kubernetes resource.\n"
29-
"Ensure your role has sufficient permissions for this operation.",
30-
"Conflict": "A conflict occurred with the RayCluster resource.\n"
28+
403: "Access denied:\n"
29+
"Ensure your role has sufficient permissions and that you are logged in to the correct cluster.",
30+
404: "The requested resource could not be located.\n"
31+
"Please verify the resource name and namespace.",
32+
409: "A conflict occurred with the RayCluster resource.\n"
3133
"Only one RayCluster with the same name is allowed. "
3234
"Please delete or rename the existing RayCluster before creating a new one with the desired name.",
35+
422: "The request was rejected because something in your cluster configuration is invalid. Fix the value and try again.",
3336
}
3437

38+
ERROR_MESSAGES_FALLBACK = (
39+
"An error occurred while communicating with the cluster.\n"
40+
"Check the details below; if the problem persists, contact your administrator."
41+
)
42+
43+
44+
def _format_api_error_body(body) -> str:
45+
"""
46+
Extract a short, readable detail from a Kubernetes Status API response body.
47+
Returns a single line like "Details: <message>" instead of raw JSON.
48+
For cluster name validation errors, returns a brief, data-scientist-friendly line.
49+
"""
50+
if not body:
51+
return ""
52+
try:
53+
raw = body.decode() if isinstance(body, bytes) else body
54+
data = json.loads(raw)
55+
msg = data.get("message")
56+
if not msg:
57+
return ""
58+
59+
# Short, friendly line for cluster name (metadata.name) validation errors
60+
if (
61+
"metadata.name" in msg
62+
and "Invalid value" in msg
63+
and ("lowercase" in msg or "RFC 1123" in msg or "subdomain" in msg)
64+
):
65+
return (
66+
"Details: Cluster name is invalid. Use only lowercase letters, "
67+
"numbers, and hyphens; it must start and end with a letter or number.\n"
68+
"(e.g. 'mycluster' or 'my-cluster')."
69+
)
70+
71+
return f"Details: {msg}"
72+
except (json.JSONDecodeError, AttributeError, TypeError):
73+
return f"Response: {body}"
74+
3575

3676
# private methods
3777
def _kube_api_error_handling(
@@ -42,12 +82,22 @@ def print_message(message: str):
4282
print(message)
4383

4484
if isinstance(e, client.ApiException):
45-
# Retrieve message based on reason, defaulting if reason is not known
46-
message = ERROR_MESSAGES.get(
47-
e.reason, f"Unexpected API error encountered (Reason: {e.reason})"
48-
)
49-
full_message = f"{message}\nResponse: {e.body}"
50-
print_message(full_message)
85+
detail = _format_api_error_body(e.body)
86+
status_code = getattr(e, "status", None)
87+
88+
if status_code == 422 and detail and "Cluster name is invalid" in detail:
89+
print_message(detail)
90+
else:
91+
message = ERROR_MESSAGES.get(status_code, ERROR_MESSAGES_FALLBACK)
92+
if detail:
93+
full_message = f"{message}\n{detail}"
94+
else:
95+
# Ensure we always show something (reason and status) when no body detail
96+
status_info = f"Reason: {e.reason}"
97+
if status_code is not None:
98+
status_info += f" (HTTP {status_code})"
99+
full_message = f"{message}\n{status_info}."
100+
print_message(full_message)
51101

52102
elif isinstance(e, config.ConfigException):
53103
message = "Configuration error: Unable to load Kubernetes configuration. Verify the config file path and format."
@@ -58,6 +108,6 @@ def print_message(message: str):
58108
print_message(message)
59109

60110
else:
61-
message = f"Unexpected error:\n{str(e)}"
111+
message = "An unexpected error occurred.\n" f"{str(e)}"
62112
print_message(message)
63113
raise e
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
# Copyright 2025 IBM, Red Hat
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""
16+
Tests for kube_api_helpers module.
17+
"""
18+
19+
import pytest
20+
from unittest.mock import patch
21+
from kubernetes import client
22+
23+
from codeflare_sdk.common.kubernetes_cluster.kube_api_helpers import (
24+
_format_api_error_body,
25+
_kube_api_error_handling,
26+
)
27+
28+
29+
class TestFormatApiErrorBody:
30+
"""Tests for _format_api_error_body."""
31+
32+
def test_empty_body_returns_empty_string(self):
33+
assert _format_api_error_body(None) == ""
34+
assert _format_api_error_body("") == ""
35+
36+
def test_valid_json_with_message_returns_details_line(self):
37+
body = '{"message": "Something went wrong"}'
38+
assert _format_api_error_body(body) == "Details: Something went wrong"
39+
40+
def test_bytes_body_decoded_and_parsed(self):
41+
body = b'{"message": "Bytes message"}'
42+
assert _format_api_error_body(body) == "Details: Bytes message"
43+
44+
def test_cluster_name_validation_returns_friendly_message(self):
45+
body = '{"message": "Invalid value: metadata.name must be lowercase RFC 1123 subdomain"}'
46+
result = _format_api_error_body(body)
47+
assert "Cluster name is invalid" in result
48+
assert "lowercase" in result and "my-cluster" in result
49+
50+
def test_json_without_message_returns_empty(self):
51+
assert _format_api_error_body('{"code": 404}') == ""
52+
53+
def test_invalid_json_returns_response_line(self):
54+
result = _format_api_error_body("not json")
55+
assert result.startswith("Response:")
56+
assert "not json" in result
57+
58+
59+
class TestKubeApiErrorHandling:
60+
"""Tests for _kube_api_error_handling."""
61+
62+
@patch("builtins.print")
63+
def test_422_cluster_name_error_prints_friendly_detail_only(self, mock_print):
64+
api_exc = client.ApiException(status=422, reason="Unprocessable Entity")
65+
api_exc.body = b'{"message": "Invalid value: metadata.name lowercase RFC 1123"}'
66+
_kube_api_error_handling(api_exc)
67+
# Should print the friendly cluster-name message, not the generic Unprocessable Entity line
68+
call_args = [str(c) for c in mock_print.call_args_list]
69+
assert any("Cluster name is invalid" in c for c in call_args)
70+
assert not any(
71+
"something in your cluster configuration is invalid" in c for c in call_args
72+
)
73+
74+
@patch("builtins.print")
75+
def test_404_with_body_prints_message_and_detail(self, mock_print):
76+
api_exc = client.ApiException(status=404, reason="Not Found")
77+
api_exc.body = '{"message": "RayCluster xyz not found"}'
78+
_kube_api_error_handling(api_exc)
79+
call_args = [str(c) for c in mock_print.call_args_list]
80+
assert any("requested resource could not be located" in c for c in call_args)
81+
assert any("Details: RayCluster xyz not found" in c for c in call_args)
82+
83+
@patch("builtins.print")
84+
def test_403_prints_forbidden_message(self, mock_print):
85+
"""403 uses status-code lookup so it gets the right message (not fallback)."""
86+
api_exc = client.ApiException(status=403, reason="Forbidden")
87+
api_exc.body = None
88+
_kube_api_error_handling(api_exc)
89+
call_args = [str(c) for c in mock_print.call_args_list]
90+
assert any("Access denied:" in c for c in call_args)
91+
92+
@patch("builtins.print")
93+
def test_api_exception_with_no_body_prints_reason_and_status(self, mock_print):
94+
api_exc = client.ApiException(status=500, reason="Internal Server Error")
95+
api_exc.body = None
96+
_kube_api_error_handling(api_exc)
97+
call_args = [str(c) for c in mock_print.call_args_list]
98+
assert any("Reason: Internal Server Error" in c for c in call_args)
99+
assert any("HTTP 500" in c for c in call_args)

src/codeflare_sdk/common/utils/k8s_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def get_current_namespace(): # pragma: no cover
2222
return active_context
2323
except Exception as e:
2424
print("Unable to find current namespace")
25-
print("trying to gather from current context")
25+
print("Trying to gather namespace from current context")
2626
try:
2727
_, active_context = config.list_kube_config_contexts(config_check())
2828
except Exception as e:

src/codeflare_sdk/common/utils/test_k8s_utils.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def test_get_current_namespace_incluster_file_read_error(self):
5353
assert result is None
5454
# Should see both error messages: in-cluster failure and kubeconfig fallback
5555
mock_print.assert_any_call("Unable to find current namespace")
56-
mock_print.assert_any_call("trying to gather from current context")
56+
mock_print.assert_any_call("Trying to gather namespace from current context")
5757

5858
def test_get_current_namespace_incluster_file_open_error(self):
5959
"""Test handling of file open errors when reading service account namespace."""
@@ -76,7 +76,7 @@ def test_get_current_namespace_incluster_file_open_error(self):
7676
assert result is None
7777
# Should see both error messages: in-cluster failure and kubeconfig fallback
7878
mock_print.assert_any_call("Unable to find current namespace")
79-
mock_print.assert_any_call("trying to gather from current context")
79+
mock_print.assert_any_call("Trying to gather namespace from current context")
8080

8181
def test_get_current_namespace_kubeconfig_success(self):
8282
"""Test successful namespace detection from kubeconfig context."""
@@ -102,7 +102,7 @@ def test_get_current_namespace_kubeconfig_success(self):
102102
result = get_current_namespace()
103103

104104
assert result == "test-namespace"
105-
mock_print.assert_called_with("trying to gather from current context")
105+
mock_print.assert_called_with("Trying to gather namespace from current context")
106106

107107
def test_get_current_namespace_kubeconfig_no_namespace_in_context(self):
108108
"""Test handling when kubeconfig context has no namespace field."""
@@ -128,7 +128,7 @@ def test_get_current_namespace_kubeconfig_no_namespace_in_context(self):
128128
result = get_current_namespace()
129129

130130
assert result is None
131-
mock_print.assert_called_with("trying to gather from current context")
131+
mock_print.assert_called_with("Trying to gather namespace from current context")
132132

133133
def test_get_current_namespace_kubeconfig_config_check_error(self):
134134
"""Test handling when config_check raises an exception."""
@@ -145,7 +145,7 @@ def test_get_current_namespace_kubeconfig_config_check_error(self):
145145
result = get_current_namespace()
146146

147147
assert result is None
148-
mock_print.assert_called_with("trying to gather from current context")
148+
mock_print.assert_called_with("Trying to gather namespace from current context")
149149
mock_error_handler.assert_called_once()
150150

151151
def test_get_current_namespace_kubeconfig_list_contexts_error(self):
@@ -167,7 +167,7 @@ def test_get_current_namespace_kubeconfig_list_contexts_error(self):
167167
result = get_current_namespace()
168168

169169
assert result is None
170-
mock_print.assert_called_with("trying to gather from current context")
170+
mock_print.assert_called_with("Trying to gather namespace from current context")
171171
mock_error_handler.assert_called_once()
172172

173173
def test_get_current_namespace_kubeconfig_key_error(self):
@@ -188,7 +188,7 @@ def test_get_current_namespace_kubeconfig_key_error(self):
188188
result = get_current_namespace()
189189

190190
assert result is None
191-
mock_print.assert_called_with("trying to gather from current context")
191+
mock_print.assert_called_with("Trying to gather namespace from current context")
192192

193193
def test_get_current_namespace_fallback_flow(self):
194194
"""Test the complete fallback flow from in-cluster to kubeconfig."""
@@ -215,7 +215,7 @@ def test_get_current_namespace_fallback_flow(self):
215215
result = get_current_namespace()
216216

217217
assert result == "fallback-namespace"
218-
mock_print.assert_called_with("trying to gather from current context")
218+
mock_print.assert_called_with("Trying to gather namespace from current context")
219219

220220
def test_get_current_namespace_complete_failure(self):
221221
"""Test complete failure scenario where no namespace can be detected."""
@@ -232,7 +232,7 @@ def test_get_current_namespace_complete_failure(self):
232232
result = get_current_namespace()
233233

234234
assert result is None
235-
mock_print.assert_called_with("trying to gather from current context")
235+
mock_print.assert_called_with("Trying to gather namespace from current context")
236236

237237
def test_get_current_namespace_mixed_errors(self):
238238
"""Test scenario with mixed error conditions."""

src/codeflare_sdk/common/widgets/widgets.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -299,18 +299,24 @@ def cluster_apply_down_buttons(
299299
def on_apply_button_clicked(b): # Handle the apply button click event
300300
with output:
301301
output.clear_output()
302-
# Use shorter TLS timeout (60s) for widget button clicks to avoid blocking UI
303-
# Users who need full TLS wait can use wait_ready() checkbox or call apply() directly
304-
cluster.apply(timeout=60)
302+
try:
303+
# Use shorter TLS timeout (60s) for widget button clicks to avoid blocking UI
304+
# Users who need full TLS wait can use wait_ready() checkbox or call apply() directly
305+
cluster.apply(timeout=60)
305306

306-
# If the wait_ready Checkbox is clicked(value == True) trigger the wait_ready function
307-
if wait_ready_check.value:
308-
cluster.wait_ready()
307+
# If the wait_ready Checkbox is clicked(value == True) trigger the wait_ready function
308+
if wait_ready_check.value:
309+
cluster.wait_ready()
310+
except RuntimeError:
311+
pass
309312

310313
def on_down_button_clicked(b): # Handle the down button click event
311314
with output:
312315
output.clear_output()
313-
cluster.down()
316+
try:
317+
cluster.down()
318+
except RuntimeError:
319+
pass
314320

315321
apply_button.on_click(on_apply_button_clicked)
316322
delete_button.on_click(on_down_button_clicked)

src/codeflare_sdk/ray/cluster/build_ray_cluster.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,6 @@ def build_ray_cluster(cluster: "codeflare_sdk.ray.cluster.Cluster"):
212212
if cluster.config.write_to_file:
213213
return write_to_file(cluster, resource) # Writes the file and returns its name
214214
else:
215-
print(f"Yaml resources loaded for {cluster.config.name}")
216215
return resource # Returns the Resource as a dict
217216

218217

0 commit comments

Comments
 (0)