fix(RHOAIENG-57445): fix autoscaling test race condition and reduce resource pressure

kryanbeane · cursoragent · kryanbeane · commit b940332cf2ed · 2026-05-14T10:23:57.000+01:00
Co-authored-by: Cursor <cursoragent@cursor.com>
diff --git a/tests/e2e/autoscaling_raycluster_sdk_kind_test.py b/tests/e2e/autoscaling_raycluster_sdk_kind_test.py
@@ -1,5 +1,3 @@
-from time import sleep
-
 import pytest
 
 from codeflare_sdk import Cluster, ClusterConfiguration
@@ -28,7 +26,7 @@ def test_autoscaling_scale_up_and_down_kind(self):
                 namespace=self.namespace,
                 enable_autoscaling=True,
                 min_workers=1,
-                max_workers=4,
+                max_workers=2,
                 head_cpu_requests="500m",
                 head_cpu_limits="500m",
                 worker_cpu_requests="500m",
@@ -47,14 +45,16 @@ def test_autoscaling_scale_up_and_down_kind(self):
         # Verify initial state: 1 worker (min_workers)
         wait_for_worker_count(self, cluster_name, lambda n: n == 1, timeout_s=300)
 
-        # Trigger scale-up via load script in head pod
-        run_autoscaling_load_in_head_pod(self, cluster_name)
+        # Trigger scale-up via load script in head pod (async)
+        load_proc = run_autoscaling_load_in_head_pod(self, cluster_name)
 
-        # Verify scale-up
+        # Verify scale-up while load is still running
         wait_for_worker_count(self, cluster_name, lambda n: n >= 2, timeout_s=600)
 
-        # Wait for idle timeout + verify scale-down back to min_workers
-        sleep(90)
+        # Wait for load to finish, then verify scale-down back to min_workers
+        load_proc.wait(timeout=600)
+        assert load_proc.returncode == 0, "Load script failed"
+
         wait_for_worker_count(self, cluster_name, lambda n: n == 1, timeout_s=600)
 
         cluster.down()
diff --git a/tests/e2e/autoscaling_raycluster_sdk_oauth_test.py b/tests/e2e/autoscaling_raycluster_sdk_oauth_test.py
@@ -1,5 +1,3 @@
-from time import sleep
-
 import pytest
 
 from codeflare_sdk import Cluster, ClusterConfiguration
@@ -36,7 +34,7 @@ def test_autoscaling_scale_up_and_down_openshift_oauth(self):
                 namespace=self.namespace,
                 enable_autoscaling=True,
                 min_workers=1,
-                max_workers=4,
+                max_workers=2,
                 image=ray_image,
                 write_to_file=True,
                 verify_tls=False,
@@ -50,14 +48,18 @@ def test_autoscaling_scale_up_and_down_openshift_oauth(self):
         # Verify initial state: 1 worker (min_workers)
         wait_for_worker_count(self, cluster_name, lambda n: n == 1, timeout_s=600)
 
-        # Trigger scale-up via load script in head pod
-        run_autoscaling_load_in_head_pod(self, cluster_name, tasks=4, sleep_s=180)
+        # Trigger scale-up via load script in head pod (async)
+        load_proc = run_autoscaling_load_in_head_pod(
+            self, cluster_name, tasks=2, sleep_s=180
+        )
 
-        # Verify scale-up
+        # Verify scale-up while load is still running
         wait_for_worker_count(self, cluster_name, lambda n: n >= 2, timeout_s=900)
 
-        # Wait for idle timeout + verify scale-down back to min_workers
-        sleep(120)
+        # Wait for load to finish, then verify scale-down back to min_workers
+        load_proc.wait(timeout=600)
+        assert load_proc.returncode == 0, "Load script failed"
+
         wait_for_worker_count(self, cluster_name, lambda n: n == 1, timeout_s=900)
 
         cluster.down()
diff --git a/tests/e2e/support.py b/tests/e2e/support.py
@@ -364,27 +364,33 @@ def wait_for_worker_count(self, cluster_name, predicate, timeout_s=600):
     )
 
 
-def run_autoscaling_load_in_head_pod(self, cluster_name, tasks=4, sleep_s=120):
+def run_autoscaling_load_in_head_pod(self, cluster_name, tasks=2, sleep_s=120):
     """
-    Copy autoscaling_load.py into the head pod and run it.
-    Avoids port-forwarding / Ray Dashboard API dependency.
+    Copy autoscaling_load.py into the head pod and run it asynchronously.
+    Returns the Popen handle so the caller can check for scale-up while
+    the workload is still running (avoids the race where blocking execution
+    lets workers scale back down before the assertion runs).
     """
     label = f"ray.io/node-type=head,ray.io/cluster={cluster_name}"
     pods = self.api_instance.list_namespaced_pod(self.namespace, label_selector=label)
     if not pods.items:
         raise RuntimeError(f"No head pod found for cluster {cluster_name}")
     head_pod = pods.items[0].metadata.name
 
+    load_script = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), "autoscaling_load.py"
+    )
+
     subprocess.check_call(
         [
             "kubectl",
             "cp",
-            "./tests/e2e/autoscaling_load.py",
+            load_script,
             f"{self.namespace}/{head_pod}:/tmp/autoscaling_load.py",
         ]
     )
 
-    subprocess.check_call(
+    return subprocess.Popen(
         [
             "kubectl",
             "exec",