@@ -50,6 +50,7 @@ if [ "$infer_backend_lower" = "sglang" ]; then
50 50 export XTUNER_USE_SGLANG=1
51 51 unset PYTORCH_CUDA_ALLOC_CONF
52 52 export SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
53+ export SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION=False
53 54 elif [ "$infer_backend_lower" = "lmdeploy" ]; then
54 55 export XTUNER_USE_LMDEPLOY=1
55 56 export PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True'
86 87
87 88 # 2. Launch Ray cluster
88 89 # 根据 NODE_COUNT 分配 num_cpus, 防止内存OOM
89- node_count=${NODE_COUNT:-1}
90- if [ "$ACCELERATOR" = "GPU" ]; then
91- total_cpus=$((node_count * 128))
92- elif [ "$ACCELERATOR" = "NPU" ]; then
93- total_cpus=$((node_count * 256))
94- fi
90+ # node_count=${NODE_COUNT:-1}
91+ # if [ "$ACCELERATOR" = "GPU" ]; then
92+ # total_cpus=$((node_count * 128))
93+ # elif [ "$ACCELERATOR" = "NPU" ]; then
94+ # total_cpus=$((node_count * 256))
95+ # fi
95 96
96- WORK_DIR=$(realpath "$WORK_DIR")
97- if [ "$RAY_RANK" -eq 0 ]; then
98- rm -rf /tmp/ray_log
99- export RAY_LOG_DIR="${WORK_DIR}/ray_${current_time}/"
100- mkdir -p ${RAY_LOG_DIR}
101- ln -sfn "${RAY_LOG_DIR}" /tmp/ray_log
102- ray start --head \
103- --node-ip-address="$RAY_MASTER_ADDR" \
104- --port="$RAY_HEAD_PORT" \
105- --dashboard-host=0.0.0.0 \
106- --dashboard-port=$RAY_DASHBOARD_PORT \
107- --include-dashboard=true \
108- --disable-usage-stats \
109- --num-cpus=$total_cpus \
110- --temp-dir="/tmp/ray_log/"
111- else
112- while true; do
113- if curl --connect-timeout 2 "http://${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}" >/dev/null 2>&1; then
114- echo "Successfully connected to Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}"
115- break
116- else
117- echo "Waiting for Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT} to be available..."
118- sleep 2
119- fi
120- done
121- ray start --address="$RAY_MASTER_ADDR:$RAY_HEAD_PORT" --block --disable-usage-stats
122- fi
97+ # WORK_DIR=$(realpath "$WORK_DIR")
98+ # if [ "$RAY_RANK" -eq 0 ]; then
99+ # rm -rf /tmp/ray_log
100+ # export RAY_LOG_DIR="${WORK_DIR}/ray_${current_time}/"
101+ # mkdir -p ${RAY_LOG_DIR}
102+ # ln -sfn "${RAY_LOG_DIR}" /tmp/ray_log
103+ # ray start --head \
104+ # --node-ip-address="$RAY_MASTER_ADDR" \
105+ # --port="$RAY_HEAD_PORT" \
106+ # --dashboard-host=0.0.0.0 \
107+ # --dashboard-port=$RAY_DASHBOARD_PORT \
108+ # --include-dashboard=true \
109+ # --disable-usage-stats \
110+ # --num-cpus=$total_cpus \
111+ # --temp-dir="/tmp/ray_log/"
112+ # else
113+ # while true; do
114+ # if curl --connect-timeout 2 "http://${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}" >/dev/null 2>&1; then
115+ # echo "Successfully connected to Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}"
116+ # break
117+ # else
118+ # echo "Waiting for Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT} to be available..."
119+ # sleep 2
120+ # fi
121+ # done
122+ # ray start --address="$RAY_MASTER_ADDR:$RAY_HEAD_PORT" --block --disable-usage-stats
123+ # fi
123 124
124- while true; do
125- result=$(ray status | grep ${ACCELERATOR} | cut -d ' ' -f2 | cut -d '/' -f2)
126- expected_accelerator_count=$((node_count * ${ACCELERATOR_PER_NODE}))
127- if [ "$result" = "$expected_accelerator_count.0" ]; then
128- break
129- else
130- echo "Waiting for ${ACCELERATOR} count to be $expected_accelerator_count, current: $result"
131- sleep 2
132- fi
133- done
125+ # while true; do
126+ # result=$(ray status | grep ${ACCELERATOR} | cut -d ' ' -f2 | cut -d '/' -f2)
127+ # expected_accelerator_count=$((node_count * ${ACCELERATOR_PER_NODE}))
128+ # if [ "$result" = "$expected_accelerator_count.0" ]; then
129+ # break
130+ # else
131+ # echo "Waiting for ${ACCELERATOR} count to be $expected_accelerator_count, current: $result"
132+ # sleep 2
133+ # fi
134+ # done
134 135
135 136 SCRIPT_NAME=$(basename "$0")
136 137 cp "$0" "${WORK_DIR}/${SCRIPT_NAME}"
0 commit comments