|
87 | 87 |
|
88 | 88 | # 2. Launch Ray cluster |
89 | 89 | # 根据 NODE_COUNT 分配 num_cpus, 防止内存OOM |
90 | | -# node_count=${NODE_COUNT:-1} |
91 | | -# if [ "$ACCELERATOR" = "GPU" ]; then |
92 | | -# total_cpus=$((node_count * 128)) |
93 | | -# elif [ "$ACCELERATOR" = "NPU" ]; then |
94 | | -# total_cpus=$((node_count * 256)) |
95 | | -# fi |
| 90 | +node_count=${NODE_COUNT:-1} |
| 91 | +if [ "$ACCELERATOR" = "GPU" ]; then |
| 92 | + total_cpus=$((node_count * 128)) |
| 93 | +elif [ "$ACCELERATOR" = "NPU" ]; then |
| 94 | + total_cpus=$((node_count * 256)) |
| 95 | +fi |
96 | 96 |
|
97 | | -# WORK_DIR=$(realpath "$WORK_DIR") |
98 | | -# if [ "$RAY_RANK" -eq 0 ]; then |
99 | | -# rm -rf /tmp/ray_log |
100 | | -# export RAY_LOG_DIR="${WORK_DIR}/ray_${current_time}/" |
101 | | -# mkdir -p ${RAY_LOG_DIR} |
102 | | -# ln -sfn "${RAY_LOG_DIR}" /tmp/ray_log |
103 | | -# ray start --head \ |
104 | | -# --node-ip-address="$RAY_MASTER_ADDR" \ |
105 | | -# --port="$RAY_HEAD_PORT" \ |
106 | | -# --dashboard-host=0.0.0.0 \ |
107 | | -# --dashboard-port=$RAY_DASHBOARD_PORT \ |
108 | | -# --include-dashboard=true \ |
109 | | -# --disable-usage-stats \ |
110 | | -# --num-cpus=$total_cpus \ |
111 | | -# --temp-dir="/tmp/ray_log/" |
112 | | -# else |
113 | | -# while true; do |
114 | | -# if curl --connect-timeout 2 "http://${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}" >/dev/null 2>&1; then |
115 | | -# echo "Successfully connected to Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}" |
116 | | -# break |
117 | | -# else |
118 | | -# echo "Waiting for Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT} to be available..." |
119 | | -# sleep 2 |
120 | | -# fi |
121 | | -# done |
122 | | -# ray start --address="$RAY_MASTER_ADDR:$RAY_HEAD_PORT" --block --disable-usage-stats |
123 | | -# fi |
| 97 | +WORK_DIR=$(realpath "$WORK_DIR") |
| 98 | +if [ "$RAY_RANK" -eq 0 ]; then |
| 99 | + rm -rf /tmp/ray_log |
| 100 | + export RAY_LOG_DIR="${WORK_DIR}/ray_${current_time}/" |
| 101 | + mkdir -p ${RAY_LOG_DIR} |
| 102 | + ln -sfn "${RAY_LOG_DIR}" /tmp/ray_log |
| 103 | + ray start --head \ |
| 104 | + --node-ip-address="$RAY_MASTER_ADDR" \ |
| 105 | + --port="$RAY_HEAD_PORT" \ |
| 106 | + --dashboard-host=0.0.0.0 \ |
| 107 | + --dashboard-port=$RAY_DASHBOARD_PORT \ |
| 108 | + --include-dashboard=true \ |
| 109 | + --disable-usage-stats \ |
| 110 | + --num-cpus=$total_cpus \ |
| 111 | + --temp-dir="/tmp/ray_log/" |
| 112 | +else |
| 113 | + while true; do |
| 114 | + if curl --connect-timeout 2 "http://${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}" >/dev/null 2>&1; then |
| 115 | + echo "Successfully connected to Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}" |
| 116 | + break |
| 117 | + else |
| 118 | + echo "Waiting for Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT} to be available..." |
| 119 | + sleep 2 |
| 120 | + fi |
| 121 | + done |
| 122 | + ray start --address="$RAY_MASTER_ADDR:$RAY_HEAD_PORT" --block --disable-usage-stats |
| 123 | +fi |
124 | 124 |
|
125 | | -# while true; do |
126 | | -# result=$(ray status | grep ${ACCELERATOR} | cut -d ' ' -f2 | cut -d '/' -f2) |
127 | | -# expected_accelerator_count=$((node_count * ${ACCELERATOR_PER_NODE})) |
128 | | -# if [ "$result" = "$expected_accelerator_count.0" ]; then |
129 | | -# break |
130 | | -# else |
131 | | -# echo "Waiting for ${ACCELERATOR} count to be $expected_accelerator_count, current: $result" |
132 | | -# sleep 2 |
133 | | -# fi |
134 | | -# done |
| 125 | +while true; do |
| 126 | + result=$(ray status | grep ${ACCELERATOR} | cut -d ' ' -f2 | cut -d '/' -f2) |
| 127 | + expected_accelerator_count=$((node_count * ${ACCELERATOR_PER_NODE})) |
| 128 | + if [ "$result" = "$expected_accelerator_count.0" ]; then |
| 129 | + break |
| 130 | + else |
| 131 | + echo "Waiting for ${ACCELERATOR} count to be $expected_accelerator_count, current: $result" |
| 132 | + sleep 2 |
| 133 | + fi |
| 134 | +done |
135 | 135 |
|
136 | 136 | SCRIPT_NAME=$(basename "$0") |
137 | 137 | cp "$0" "${WORK_DIR}/${SCRIPT_NAME}" |
|
0 commit comments