Skip to content

Commit 4db0994

Browse files
committed
Add sandbox support to Ray templates and update tests
Signed-off-by: Wei Du <wedu@nvidia.com>
1 parent 0a0c50f commit 4db0994

5 files changed

Lines changed: 114 additions & 3 deletions

File tree

test/core/execution/artifacts/expected_ray_cluster.sub

Lines changed: 27 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -290,6 +290,33 @@ while ! (srun --overlap --nodes=1 --ntasks=1 -w $head_node test -f $LOG_DIR/STAR
290290
elapsed_time=$((elapsed_time + 2))
291291
done
292292

293+
# Run sandbox in parallel across all allocated nodes when explicitly configured.
294+
if [[ -n "${SANDBOX_CONTAINER:-}" ]] && [[ -n "${SANDBOX_COMMAND:-}" ]]; then
295+
SANDBOX_PORTS_DIR="${SANDBOX_PORTS_DIR:-$LOG_DIR/sandbox}"
296+
mkdir -p "$SANDBOX_PORTS_DIR"
297+
echo "[INFO] Starting sandbox across all nodes in parallel (ports_dir=$SANDBOX_PORTS_DIR)..."
298+
srun --output "$LOG_DIR/sandbox.log" \
299+
--error "$LOG_DIR/sandbox.log" \
300+
--container-image="$SANDBOX_CONTAINER" \
301+
--container-mounts="$SANDBOX_PORTS_DIR:$SANDBOX_PORTS_DIR" \
302+
--container-env=SANDBOX_PORTS_DIR,LISTEN_PORT,NGINX_PORT \
303+
--no-container-mount-home \
304+
--mpi=pmix \
305+
-A "$SLURM_JOB_ACCOUNT" \
306+
-p "$SLURM_JOB_PARTITION" \
307+
--wait=60 \
308+
--kill-on-bad-exit=1 \
309+
--overlap \
310+
--nodes="$SLURM_JOB_NUM_NODES" \
311+
--ntasks-per-node=1 \
312+
--export=ALL,SANDBOX_PORTS_DIR=$SANDBOX_PORTS_DIR,LISTEN_PORT=${SANDBOX_PORT:-6000},NGINX_PORT=${SANDBOX_PORT:-6000} \
313+
bash -c "$SANDBOX_COMMAND" &
314+
SRUN_PIDS["sandbox"]=$!
315+
echo "[INFO] Sandbox started in background (PID: ${SRUN_PIDS["sandbox"]})"
316+
else
317+
echo "[INFO] SANDBOX_CONTAINER or SANDBOX_COMMAND not defined, skipping sandbox startup"
318+
fi
319+
293320
NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))
294321

295322
# Start Ray worker nodes

test/core/execution/artifacts/expected_ray_cluster_enroot.sub

Lines changed: 28 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -306,6 +306,33 @@ while true; do
306306
elapsed_time=$((elapsed_time + 2))
307307
done
308308

309+
# Run sandbox in parallel across all allocated nodes (overlap) when explicitly configured.
310+
if [[ -n "${SANDBOX_CONTAINER:-}" ]] && [[ -n "${SANDBOX_COMMAND:-}" ]]; then
311+
SANDBOX_PORTS_DIR="${SANDBOX_PORTS_DIR:-$LOG_DIR/sandbox}"
312+
mkdir -p "$SANDBOX_PORTS_DIR"
313+
echo "[INFO] Starting sandbox on head node in parallel (ports_dir=$SANDBOX_PORTS_DIR)..."
314+
srun --output "$LOG_DIR/sandbox.log" \
315+
--error "$LOG_DIR/sandbox.log" \
316+
--container-image="$SANDBOX_CONTAINER" \
317+
--container-mounts="$SANDBOX_PORTS_DIR:$SANDBOX_PORTS_DIR" \
318+
--container-env=SANDBOX_PORTS_DIR,LISTEN_PORT,NGINX_PORT \
319+
--no-container-mount-home \
320+
--mpi=pmix \
321+
-A "$SLURM_JOB_ACCOUNT" \
322+
-p "$SLURM_JOB_PARTITION" \
323+
--wait=60 \
324+
--kill-on-bad-exit=1 \
325+
--overlap \
326+
--nodes="$SLURM_JOB_NUM_NODES" \
327+
--ntasks-per-node=1 \
328+
--export=ALL,SANDBOX_PORTS_DIR=$SANDBOX_PORTS_DIR,LISTEN_PORT=${SANDBOX_PORT:-6000},NGINX_PORT=${SANDBOX_PORT:-6000} \
329+
bash -c "$SANDBOX_COMMAND" &
330+
SRUN_PIDS["sandbox"]=$!
331+
echo "[INFO] Sandbox started in background (PID: ${SRUN_PIDS["sandbox"]})"
332+
else
333+
echo "[INFO] SANDBOX_CONTAINER or SANDBOX_COMMAND not defined, skipping sandbox startup"
334+
fi
335+
309336
NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))
310337

311338
# Start Ray worker nodes
@@ -495,4 +522,4 @@ EOF
495522
echo " bash $CLUSTER_DIR/scripts/${SLURM_JOB_ID}-attach.sh 1 # to attach to worker 1"
496523
echo " bash $CLUSTER_DIR/scripts/${SLURM_JOB_ID}-attach.sh 2 # to attach to worker 2, etc."
497524
sleep infinity
498-
fi
525+
fi

test/core/execution/artifacts/expected_ray_cluster_ssh.sub

Lines changed: 27 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -295,6 +295,33 @@ while ! (srun --overlap --nodes=1 --ntasks=1 -w $head_node test -f $LOG_DIR/STAR
295295
elapsed_time=$((elapsed_time + 2))
296296
done
297297

298+
# Run sandbox in parallel across all allocated nodes when explicitly configured.
299+
if [[ -n "${SANDBOX_CONTAINER:-}" ]] && [[ -n "${SANDBOX_COMMAND:-}" ]]; then
300+
SANDBOX_PORTS_DIR="${SANDBOX_PORTS_DIR:-$LOG_DIR/sandbox}"
301+
mkdir -p "$SANDBOX_PORTS_DIR"
302+
echo "[INFO] Starting sandbox across all nodes in parallel (ports_dir=$SANDBOX_PORTS_DIR)..."
303+
srun --output "$LOG_DIR/sandbox.log" \
304+
--error "$LOG_DIR/sandbox.log" \
305+
--container-image="$SANDBOX_CONTAINER" \
306+
--container-mounts="$SANDBOX_PORTS_DIR:$SANDBOX_PORTS_DIR" \
307+
--container-env=SANDBOX_PORTS_DIR,LISTEN_PORT,NGINX_PORT \
308+
--no-container-mount-home \
309+
--mpi=pmix \
310+
-A "$SLURM_JOB_ACCOUNT" \
311+
-p "$SLURM_JOB_PARTITION" \
312+
--wait=60 \
313+
--kill-on-bad-exit=1 \
314+
--overlap \
315+
--nodes="$SLURM_JOB_NUM_NODES" \
316+
--ntasks-per-node=1 \
317+
--export=ALL,SANDBOX_PORTS_DIR=$SANDBOX_PORTS_DIR,LISTEN_PORT=${SANDBOX_PORT:-6000},NGINX_PORT=${SANDBOX_PORT:-6000} \
318+
bash -c "$SANDBOX_COMMAND" &
319+
SRUN_PIDS["sandbox"]=$!
320+
echo "[INFO] Sandbox started in background (PID: ${SRUN_PIDS["sandbox"]})"
321+
else
322+
echo "[INFO] SANDBOX_CONTAINER or SANDBOX_COMMAND not defined, skipping sandbox startup"
323+
fi
324+
298325
NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))
299326

300327
# Start Ray worker nodes

test/core/execution/artifacts/expected_ray_het_cluster.sub

Lines changed: 27 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -319,6 +319,33 @@ while ! (srun --het-group=0 --overlap --nodes=1 --ntasks=1 -w $head_node test -f
319319
elapsed_time=$((elapsed_time + 2))
320320
done
321321

322+
# Run sandbox in parallel across all allocated nodes when explicitly configured.
323+
if [[ -n "${SANDBOX_CONTAINER:-}" ]] && [[ -n "${SANDBOX_COMMAND:-}" ]]; then
324+
SANDBOX_PORTS_DIR="${SANDBOX_PORTS_DIR:-$LOG_DIR/sandbox}"
325+
mkdir -p "$SANDBOX_PORTS_DIR"
326+
echo "[INFO] Starting sandbox across all nodes in parallel (ports_dir=$SANDBOX_PORTS_DIR)..."
327+
srun --het-group=0 --output "$LOG_DIR/sandbox.log" \
328+
--error "$LOG_DIR/sandbox.log" \
329+
--container-image="$SANDBOX_CONTAINER" \
330+
--container-mounts="$SANDBOX_PORTS_DIR:$SANDBOX_PORTS_DIR" \
331+
--container-env=SANDBOX_PORTS_DIR,LISTEN_PORT,NGINX_PORT \
332+
--no-container-mount-home \
333+
--mpi=pmix \
334+
-A "$SLURM_JOB_ACCOUNT" \
335+
-p "$SLURM_JOB_PARTITION" \
336+
--wait=60 \
337+
--kill-on-bad-exit=1 \
338+
--overlap \
339+
--nodes="$SLURM_JOB_NUM_NODES" \
340+
--ntasks-per-node=1 \
341+
--export=ALL,SANDBOX_PORTS_DIR=$SANDBOX_PORTS_DIR,LISTEN_PORT=${SANDBOX_PORT:-6000},NGINX_PORT=${SANDBOX_PORT:-6000} \
342+
bash -c "$SANDBOX_COMMAND" &
343+
SRUN_PIDS["sandbox"]=$!
344+
echo "[INFO] Sandbox started in background (PID: ${SRUN_PIDS["sandbox"]})"
345+
else
346+
echo "[INFO] SANDBOX_CONTAINER or SANDBOX_COMMAND not defined, skipping sandbox startup"
347+
fi
348+
322349
NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))
323350

324351
# Start Ray worker nodes

test/run/ray/test_slurm_ray_request.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -285,8 +285,11 @@ def test_container_configurations(self):
285285

286286
# Should use cluster_dir as default workdir
287287
assert "--container-workdir=/tmp/test_jobs/test-ray-cluster" in script
288-
# Should not contain container-image flag when none specified
289-
assert "--container-image" not in script
288+
# The main Ray cluster script should not contain a container-image flag
289+
# when none is specified on the executor. Only the text before the
290+
# "# Run sandbox" marker is checked; the sandbox stanza has its own --container-image.
291+
pre_sandbox_script = script.split("# Run sandbox", 1)[0]
292+
assert "--container-image" not in pre_sandbox_script
290293

291294
def test_special_mount_handling(self):
292295
"""Test materialize handles special RUNDIR_SPECIAL_NAME mounts."""

0 commit comments

Comments
 (0)