Skip to content

Commit b940332

Browse files
fix(RHOAIENG-57445): fix autoscaling test race condition and reduce resource pressure
Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 0d1ffab commit b940332

3 files changed

Lines changed: 29 additions & 21 deletions

File tree

tests/e2e/autoscaling_raycluster_sdk_kind_test.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,3 @@
1-
from time import sleep
2-
31
import pytest
42

53
from codeflare_sdk import Cluster, ClusterConfiguration
@@ -28,7 +26,7 @@ def test_autoscaling_scale_up_and_down_kind(self):
2826
namespace=self.namespace,
2927
enable_autoscaling=True,
3028
min_workers=1,
31-
max_workers=4,
29+
max_workers=2,
3230
head_cpu_requests="500m",
3331
head_cpu_limits="500m",
3432
worker_cpu_requests="500m",
@@ -47,14 +45,16 @@ def test_autoscaling_scale_up_and_down_kind(self):
4745
# Verify initial state: 1 worker (min_workers)
4846
wait_for_worker_count(self, cluster_name, lambda n: n == 1, timeout_s=300)
4947

50-
# Trigger scale-up via load script in head pod
51-
run_autoscaling_load_in_head_pod(self, cluster_name)
48+
# Trigger scale-up via load script in head pod (async)
49+
load_proc = run_autoscaling_load_in_head_pod(self, cluster_name)
5250

53-
# Verify scale-up
51+
# Verify scale-up while load is still running
5452
wait_for_worker_count(self, cluster_name, lambda n: n >= 2, timeout_s=600)
5553

56-
# Wait for idle timeout + verify scale-down back to min_workers
57-
sleep(90)
54+
# Wait for load to finish, then verify scale-down back to min_workers
55+
load_proc.wait(timeout=600)
56+
assert load_proc.returncode == 0, "Load script failed"
57+
5858
wait_for_worker_count(self, cluster_name, lambda n: n == 1, timeout_s=600)
5959

6060
cluster.down()

tests/e2e/autoscaling_raycluster_sdk_oauth_test.py

Lines changed: 10 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,3 @@
1-
from time import sleep
2-
31
import pytest
42

53
from codeflare_sdk import Cluster, ClusterConfiguration
@@ -36,7 +34,7 @@ def test_autoscaling_scale_up_and_down_openshift_oauth(self):
3634
namespace=self.namespace,
3735
enable_autoscaling=True,
3836
min_workers=1,
39-
max_workers=4,
37+
max_workers=2,
4038
image=ray_image,
4139
write_to_file=True,
4240
verify_tls=False,
@@ -50,14 +48,18 @@ def test_autoscaling_scale_up_and_down_openshift_oauth(self):
5048
# Verify initial state: 1 worker (min_workers)
5149
wait_for_worker_count(self, cluster_name, lambda n: n == 1, timeout_s=600)
5250

53-
# Trigger scale-up via load script in head pod
54-
run_autoscaling_load_in_head_pod(self, cluster_name, tasks=4, sleep_s=180)
51+
# Trigger scale-up via load script in head pod (async)
52+
load_proc = run_autoscaling_load_in_head_pod(
53+
self, cluster_name, tasks=2, sleep_s=180
54+
)
5555

56-
# Verify scale-up
56+
# Verify scale-up while load is still running
5757
wait_for_worker_count(self, cluster_name, lambda n: n >= 2, timeout_s=900)
5858

59-
# Wait for idle timeout + verify scale-down back to min_workers
60-
sleep(120)
59+
# Wait for load to finish, then verify scale-down back to min_workers
60+
load_proc.wait(timeout=600)
61+
assert load_proc.returncode == 0, "Load script failed"
62+
6163
wait_for_worker_count(self, cluster_name, lambda n: n == 1, timeout_s=900)
6264

6365
cluster.down()

tests/e2e/support.py

Lines changed: 11 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -364,27 +364,33 @@ def wait_for_worker_count(self, cluster_name, predicate, timeout_s=600):
364364
)
365365

366366

367-
def run_autoscaling_load_in_head_pod(self, cluster_name, tasks=4, sleep_s=120):
367+
def run_autoscaling_load_in_head_pod(self, cluster_name, tasks=2, sleep_s=120):
368368
"""
369-
Copy autoscaling_load.py into the head pod and run it.
370-
Avoids port-forwarding / Ray Dashboard API dependency.
369+
Copy autoscaling_load.py into the head pod and run it asynchronously.
370+
Returns the Popen handle so the caller can check for scale-up while
371+
the workload is still running (avoids the race where blocking execution
372+
lets workers scale back down before the assertion runs).
371373
"""
372374
label = f"ray.io/node-type=head,ray.io/cluster={cluster_name}"
373375
pods = self.api_instance.list_namespaced_pod(self.namespace, label_selector=label)
374376
if not pods.items:
375377
raise RuntimeError(f"No head pod found for cluster {cluster_name}")
376378
head_pod = pods.items[0].metadata.name
377379

380+
load_script = os.path.join(
381+
os.path.dirname(os.path.abspath(__file__)), "autoscaling_load.py"
382+
)
383+
378384
subprocess.check_call(
379385
[
380386
"kubectl",
381387
"cp",
382-
"./tests/e2e/autoscaling_load.py",
388+
load_script,
383389
f"{self.namespace}/{head_pod}:/tmp/autoscaling_load.py",
384390
]
385391
)
386392

387-
subprocess.check_call(
393+
return subprocess.Popen(
388394
[
389395
"kubectl",
390396
"exec",

0 commit comments

Comments
 (0)