Skip to content

Commit f8368da

Browse files
fix(cluster): add bounds check for workerGroupSpecs array access
Add defensive bounds checking before accessing workerGroupSpecs[0] in three functions to prevent IndexError on head-only Ray clusters (num_workers=0). When workerGroupSpecs is empty, worker-related values default to 0. Functions fixed: - _head_worker_extended_resources_from_rc_dict() - get_cluster() - _map_to_ray_cluster() Fixes: RHOAIENG-54729 Made-with: Cursor
1 parent 8c37f5d commit f8368da

2 files changed

Lines changed: 154 additions & 34 deletions

File tree

src/codeflare_sdk/ray/cluster/cluster.py

Lines changed: 68 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -687,14 +687,17 @@ def job_logs(self, job_id: str) -> str:
687687
@staticmethod
688688
def _head_worker_extended_resources_from_rc_dict(rc: Dict) -> Tuple[dict, dict]:
689689
head_extended_resources, worker_extended_resources = {}, {}
690-
for resource in rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
691-
"containers"
692-
][0]["resources"]["limits"].keys():
693-
if resource in ["memory", "cpu"]:
694-
continue
695-
worker_extended_resources[resource] = rc["spec"]["workerGroupSpecs"][0][
696-
"template"
697-
]["spec"]["containers"][0]["resources"]["limits"][resource]
690+
691+
# Fix for RHOAIENG-54729: Check if workerGroupSpecs exists before accessing [0]
692+
if len(rc["spec"].get("workerGroupSpecs", [])) > 0:
693+
for resource in rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
694+
"containers"
695+
][0]["resources"]["limits"].keys():
696+
if resource in ["memory", "cpu"]:
697+
continue
698+
worker_extended_resources[resource] = rc["spec"]["workerGroupSpecs"][0][
699+
"template"
700+
]["spec"]["containers"][0]["resources"]["limits"][resource]
698701

699702
for resource in rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
700703
0
@@ -823,6 +826,30 @@ def get_cluster(
823826
head_extended_resources,
824827
worker_extended_resources,
825828
) = Cluster._head_worker_extended_resources_from_rc_dict(resource)
829+
830+
# Fix for RHOAIENG-54729: Handle head-only clusters (no workers)
831+
if len(resource["spec"].get("workerGroupSpecs", [])) > 0:
832+
num_workers = resource["spec"]["workerGroupSpecs"][0]["minReplicas"]
833+
worker_cpu_limits = resource["spec"]["workerGroupSpecs"][0]["template"]["spec"][
834+
"containers"
835+
][0]["resources"]["limits"]["cpu"]
836+
worker_cpu_requests = resource["spec"]["workerGroupSpecs"][0]["template"][
837+
"spec"
838+
]["containers"][0]["resources"]["requests"]["cpu"]
839+
worker_memory_limits = resource["spec"]["workerGroupSpecs"][0]["template"][
840+
"spec"
841+
]["containers"][0]["resources"]["limits"]["memory"]
842+
worker_memory_requests = resource["spec"]["workerGroupSpecs"][0]["template"][
843+
"spec"
844+
]["containers"][0]["resources"]["requests"]["memory"]
845+
else:
846+
# Head-only cluster - use defaults for worker specs
847+
num_workers = 0
848+
worker_cpu_limits = 0
849+
worker_cpu_requests = 0
850+
worker_memory_limits = 0
851+
worker_memory_requests = 0
852+
826853
# Create a Cluster Configuration with just the necessary provided parameters
827854
cluster_config = ClusterConfiguration(
828855
name=cluster_name,
@@ -841,19 +868,11 @@ def get_cluster(
841868
head_memory_requests=resource["spec"]["headGroupSpec"]["template"]["spec"][
842869
"containers"
843870
][0]["resources"]["requests"]["memory"],
844-
num_workers=resource["spec"]["workerGroupSpecs"][0]["minReplicas"],
845-
worker_cpu_limits=resource["spec"]["workerGroupSpecs"][0]["template"]["spec"][
846-
"containers"
847-
][0]["resources"]["limits"]["cpu"],
848-
worker_cpu_requests=resource["spec"]["workerGroupSpecs"][0]["template"]["spec"][
849-
"containers"
850-
][0]["resources"]["requests"]["cpu"],
851-
worker_memory_limits=resource["spec"]["workerGroupSpecs"][0]["template"][
852-
"spec"
853-
]["containers"][0]["resources"]["limits"]["memory"],
854-
worker_memory_requests=resource["spec"]["workerGroupSpecs"][0]["template"][
855-
"spec"
856-
]["containers"][0]["resources"]["requests"]["memory"],
871+
num_workers=num_workers,
872+
worker_cpu_limits=worker_cpu_limits,
873+
worker_cpu_requests=worker_cpu_requests,
874+
worker_memory_limits=worker_memory_limits,
875+
worker_memory_requests=worker_memory_requests,
857876
head_extended_resource_requests=head_extended_resources,
858877
worker_extended_resource_requests=worker_extended_resources,
859878
)
@@ -1086,23 +1105,38 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
10861105
worker_extended_resources,
10871106
) = Cluster._head_worker_extended_resources_from_rc_dict(rc)
10881107

1089-
return RayCluster(
1090-
name=rc["metadata"]["name"],
1091-
status=status,
1092-
# for now we are not using autoscaling so same replicas is fine
1093-
num_workers=rc["spec"]["workerGroupSpecs"][0]["replicas"],
1094-
worker_mem_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
1108+
# Fix for RHOAIENG-54729: Handle head-only clusters (no workers)
1109+
if len(rc["spec"].get("workerGroupSpecs", [])) > 0:
1110+
num_workers = rc["spec"]["workerGroupSpecs"][0]["replicas"]
1111+
worker_mem_limits = rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
10951112
"containers"
1096-
][0]["resources"]["limits"]["memory"],
1097-
worker_mem_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
1113+
][0]["resources"]["limits"]["memory"]
1114+
worker_mem_requests = rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
10981115
"containers"
1099-
][0]["resources"]["requests"]["memory"],
1100-
worker_cpu_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
1116+
][0]["resources"]["requests"]["memory"]
1117+
worker_cpu_requests = rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
11011118
"containers"
1102-
][0]["resources"]["requests"]["cpu"],
1103-
worker_cpu_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
1119+
][0]["resources"]["requests"]["cpu"]
1120+
worker_cpu_limits = rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
11041121
"containers"
1105-
][0]["resources"]["limits"]["cpu"],
1122+
][0]["resources"]["limits"]["cpu"]
1123+
else:
1124+
# Head-only cluster - use defaults for worker specs
1125+
num_workers = 0
1126+
worker_mem_limits = 0
1127+
worker_mem_requests = 0
1128+
worker_cpu_requests = 0
1129+
worker_cpu_limits = 0
1130+
1131+
return RayCluster(
1132+
name=rc["metadata"]["name"],
1133+
status=status,
1134+
# for now we are not using autoscaling so same replicas is fine
1135+
num_workers=num_workers,
1136+
worker_mem_limits=worker_mem_limits,
1137+
worker_mem_requests=worker_mem_requests,
1138+
worker_cpu_requests=worker_cpu_requests,
1139+
worker_cpu_limits=worker_cpu_limits,
11061140
worker_extended_resources=worker_extended_resources,
11071141
namespace=rc["metadata"]["namespace"],
11081142
head_cpu_requests=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][

src/codeflare_sdk/ray/cluster/test_cluster.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2115,6 +2115,92 @@ def test_job_logs(mocker):
21152115
mock_job_client.get_job_logs.assert_called_once_with("job-123")
21162116

21172117

2118+
def test_head_only_cluster_no_workers(mocker):
2119+
"""
2120+
Test for RHOAIENG-54729: Functions should handle head-only clusters (num_workers=0)
2121+
without crashing on IndexError when accessing workerGroupSpecs[0]
2122+
"""
2123+
from codeflare_sdk.ray.cluster.cluster import (
2124+
Cluster,
2125+
_map_to_ray_cluster,
2126+
get_cluster,
2127+
)
2128+
2129+
mocker.patch("kubernetes.client.ApisApi.get_api_versions")
2130+
mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
2131+
mocker.patch(
2132+
"codeflare_sdk.ray.cluster.cluster._is_openshift_cluster", return_value=False
2133+
)
2134+
2135+
mock_api_client = mocker.MagicMock(spec=client.ApiClient)
2136+
mocker.patch(
2137+
"codeflare_sdk.common.kubernetes_cluster.auth.get_api_client",
2138+
return_value=mock_api_client,
2139+
)
2140+
2141+
# Create a head-only cluster dict (workerGroupSpecs is empty list)
2142+
head_only_rc = {
2143+
"apiVersion": "ray.io/v1",
2144+
"kind": "RayCluster",
2145+
"metadata": {
2146+
"name": "head-only-cluster",
2147+
"namespace": "ns",
2148+
},
2149+
"spec": {
2150+
"headGroupSpec": {
2151+
"template": {
2152+
"spec": {
2153+
"containers": [
2154+
{
2155+
"name": "ray-head",
2156+
"resources": {
2157+
"limits": {"cpu": "2", "memory": "8G"},
2158+
"requests": {"cpu": "2", "memory": "8G"},
2159+
},
2160+
}
2161+
]
2162+
}
2163+
}
2164+
},
2165+
"workerGroupSpecs": [], # Empty - head-only cluster
2166+
},
2167+
"status": {"state": "ready"},
2168+
}
2169+
2170+
# Test 1: _head_worker_extended_resources_from_rc_dict should not crash
2171+
head_ext, worker_ext = Cluster._head_worker_extended_resources_from_rc_dict(
2172+
head_only_rc
2173+
)
2174+
assert isinstance(head_ext, dict)
2175+
assert isinstance(worker_ext, dict)
2176+
assert worker_ext == {} # Should be empty for head-only cluster
2177+
2178+
# Test 2: _map_to_ray_cluster should not crash
2179+
mocker.patch(
2180+
"kubernetes.client.NetworkingV1Api.list_namespaced_ingress",
2181+
return_value=mocker.Mock(items=[]),
2182+
)
2183+
result = _map_to_ray_cluster(head_only_rc)
2184+
assert result is not None
2185+
assert result.num_workers == 0
2186+
assert result.worker_cpu_limits == 0
2187+
assert result.worker_cpu_requests == 0
2188+
assert result.worker_mem_limits == 0
2189+
assert result.worker_mem_requests == 0
2190+
2191+
# Test 3: get_cluster should not crash
2192+
mocker.patch(
2193+
"kubernetes.client.CustomObjectsApi.get_namespaced_custom_object",
2194+
return_value=head_only_rc,
2195+
)
2196+
cluster = get_cluster("head-only-cluster", "ns")
2197+
assert cluster.config.num_workers == 0
2198+
assert cluster.config.worker_cpu_limits == 0
2199+
assert cluster.config.worker_cpu_requests == 0
2200+
assert cluster.config.worker_memory_limits == "0G"
2201+
assert cluster.config.worker_memory_requests == "0G"
2202+
2203+
21182204
# Make sure to always keep this function last
21192205
def test_cleanup():
21202206
# Clean up test files if they exist

0 commit comments

Comments
 (0)