@@ -44,6 +44,12 @@ DASHBOARD_AGENT_LISTEN_PORT=${DASHBOARD_AGENT_LISTEN_PORT:-52365}
# Bounds of the worker port range; each can be overridden from the environment.
MIN_WORKER_PORT=${MIN_WORKER_PORT:-54001}
MAX_WORKER_PORT=${MAX_WORKER_PORT:-54257}

# Ray temp directory (inside container). Used by --temp-dir and the log sync sidecar.
RAY_TEMP_DIR=${RAY_TEMP_DIR:-/ray-cluster}

# Number of seconds between syncs of Ray logs from
# $RAY_TEMP_DIR/session_*/logs to $LOG_DIR/ray/.
# Left empty by default, in which case the log sync sidecar skips itself.
RAY_LOG_SYNC_FREQUENCY=${RAY_LOG_SYNC_FREQUENCY:-}

# Directory setup
export CLUSTER_DIR=/lustre/fsw/projects/research/jobs/multi-node-training
# Quoted so the path is always passed to mkdir as a single argument.
mkdir -p "$CLUSTER_DIR"
@@ -131,10 +137,44 @@ monitor-sidecar() {
131137}
132138monitor-sidecar &
133139
# Background process to sync Ray logs every $RAY_LOG_SYNC_FREQUENCY seconds.
#
# After each sleep, copies every ${RAY_TEMP_DIR}/session_<digit>*/logs
# directory into $LOG_DIR/ray/<session name>/logs/ (rsync when it is
# installed, cp -r otherwise). The loop ends once $LOG_DIR/ENDED exists; the
# check runs after the sync, so one last sync always precedes termination.
# Returns immediately when RAY_LOG_SYNC_FREQUENCY is empty.
#
# NOTE(review): the backslash-escaped expansions (\$session_dir,
# \$session_name) look like they are meant to survive an enclosing unquoted
# here-document and be evaluated when the generated script runs, while the
# unescaped ones are filled in when that here-document is expanded.
# Confirm against the surrounding file before changing any escape.
log-sync-sidecar() {
  set +x
  local session_dir session_name
  if [[ -z "$RAY_LOG_SYNC_FREQUENCY" ]]; then
    echo "RAY_LOG_SYNC_FREQUENCY is not set, skipping log sync sidecar"
    return
  fi
  mkdir -p "$LOG_DIR/ray"
  while true; do
    sleep "$RAY_LOG_SYNC_FREQUENCY"
    # No separate existence probe is needed: an unmatched glob stays a literal
    # pattern, which the -d test below rejects.
    for session_dir in "${RAY_TEMP_DIR}"/session_[0-9]*/; do
      if [[ -d "\$session_dir/logs" ]]; then
        # Session name = last path component (drop the trailing slash first).
        session_name="\${session_dir%/}"
        session_name="\${session_name##*/}"
        mkdir -p "$LOG_DIR/ray/\$session_name"
        if command -v rsync > /dev/null 2>&1; then
          rsync -ahP "\$session_dir/logs/" "$LOG_DIR/ray/\$session_name/logs/" 2>/dev/null || true
        else
          # Best-effort like the rsync branch: a failed copy must not stop
          # the sidecar; the next iteration retries.
          cp -r "\$session_dir/logs" "$LOG_DIR/ray/\$session_name/" || true
        fi
      fi
    done
    if [[ -f "$LOG_DIR/ENDED" ]]; then
      echo "Log sync sidecar terminating..."
      break
    fi
  done
}
log-sync-sidecar &
170+
# Patch Ray's nsight.py before starting the Ray head: the sed expression
# rewrites the statement that builds context.py_executable so that it appends
# the existing context.py_executable instead of the literal " python".
sed -i \
  -e 's/context\.py_executable = " "\.join(self\.nsight_cmd) + " python"/context.py_executable = " ".join(self.nsight_cmd) + f" {context.py_executable}"/g' \
  /opt/nemo_rl_venv/lib64/python*/site-packages/ray/_private/runtime_env/nsight.py
173+
134174cat <<EOFINNER | tee /launch-head.sh
135175ray start --head \
136176 --disable-usage-stats \
137- --temp-dir=/ray-cluster \
177+ --temp-dir=${RAY_TEMP_DIR} \
138178 --resources="{\"worker_units\": $gpus_per_node , \"slurm_managed_ray_cluster\": 1}" \
139179 --node-ip-address="$head_node_ip " \
140180 --port=${PORT} \
@@ -206,6 +246,9 @@ monitor-sidecar() {
206246}
207247monitor-sidecar &
208248
# Patch Ray's nsight.py before starting the Ray worker: the sed expression
# rewrites the statement that builds context.py_executable so that it appends
# the existing context.py_executable instead of the literal " python".
sed -i \
  -e 's/context\.py_executable = " "\.join(self\.nsight_cmd) + " python"/context.py_executable = " ".join(self.nsight_cmd) + f" {context.py_executable}"/g' \
  /opt/nemo_rl_venv/lib64/python*/site-packages/ray/_private/runtime_env/nsight.py
251+
209252cat <<EOFINNER | tee /launch-worker.sh
210253sleep 5
211254ray start --address "$ip_head " \
0 commit comments