
Commit 9219961

fix check health
1 parent 4003964

6 files changed: 68 additions & 77 deletions


autotest/config/rl_qwen3_8B_gsm8k_grpo.py

Lines changed: 2 additions & 2 deletions
@@ -28,7 +28,7 @@
 data_path = os.environ["DATA_PATH"]
 eval_data_path = os.environ["EVAL_DATA_PATH"]
 enable_evaluate = True if eval_data_path != "" else False
-enbale_partial_rollout = int(os.environ.get("ENBALE_PARTIAL_ROLLOUT", "0"))
+enable_partial_rollout = int(os.environ.get("ENABLE_PARTIAL_ROLLOUT", "0"))

 # basic settings
 experimental_name = "grpo_gsm8k_tiny"
@@ -96,7 +96,7 @@
     prompt_repeat_k=prompt_repeat_k,
     global_batch_size=global_batch_size,
     sample_params=training_sample_params,
-    enable_partial_rollout=enbale_partial_rollout,
+    enable_partial_rollout=enable_partial_rollout,
 )

 evaluator_cfg = (
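
Why the misspelled name mattered: the flag is read with `int(os.environ.get(...))`, so a typo in the variable name silently falls back to the `"0"` default instead of failing. A minimal sketch of that behavior (values illustrative, not from the repo):

import os

# With the corrected name set, the flag is picked up; with a typo in the
# name, os.environ.get() returns the "0" default and partial rollout
# stays disabled without any error being raised.
os.environ["ENABLE_PARTIAL_ROLLOUT"] = "1"
enable_partial_rollout = int(os.environ.get("ENABLE_PARTIAL_ROLLOUT", "0"))
assert enable_partial_rollout == 1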

examples/v1/scripts/run_rl.sh

Lines changed: 44 additions & 43 deletions
@@ -50,6 +50,7 @@ if [ "$infer_backend_lower" = "sglang" ]; then
     export XTUNER_USE_SGLANG=1
     unset PYTORCH_CUDA_ALLOC_CONF
     export SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
+    export SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION=False
 elif [ "$infer_backend_lower" = "lmdeploy" ]; then
     export XTUNER_USE_LMDEPLOY=1
     export PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True'
@@ -86,51 +87,51 @@ fi

 # 2. Launch Ray cluster
 # Allocate num_cpus based on NODE_COUNT to avoid out-of-memory errors
-node_count=${NODE_COUNT:-1}
-if [ "$ACCELERATOR" = "GPU" ]; then
-    total_cpus=$((node_count * 128))
-elif [ "$ACCELERATOR" = "NPU" ]; then
-    total_cpus=$((node_count * 256))
-fi
+# node_count=${NODE_COUNT:-1}
+# if [ "$ACCELERATOR" = "GPU" ]; then
+#     total_cpus=$((node_count * 128))
+# elif [ "$ACCELERATOR" = "NPU" ]; then
+#     total_cpus=$((node_count * 256))
+# fi

-WORK_DIR=$(realpath "$WORK_DIR")
-if [ "$RAY_RANK" -eq 0 ]; then
-    rm -rf /tmp/ray_log
-    export RAY_LOG_DIR="${WORK_DIR}/ray_${current_time}/"
-    mkdir -p ${RAY_LOG_DIR}
-    ln -sfn "${RAY_LOG_DIR}" /tmp/ray_log
-    ray start --head \
-        --node-ip-address="$RAY_MASTER_ADDR" \
-        --port="$RAY_HEAD_PORT" \
-        --dashboard-host=0.0.0.0 \
-        --dashboard-port=$RAY_DASHBOARD_PORT \
-        --include-dashboard=true \
-        --disable-usage-stats \
-        --num-cpus=$total_cpus \
-        --temp-dir="/tmp/ray_log/"
-else
-    while true; do
-        if curl --connect-timeout 2 "http://${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}" >/dev/null 2>&1; then
-            echo "Successfully connected to Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}"
-            break
-        else
-            echo "Waiting for Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT} to be available..."
-            sleep 2
-        fi
-    done
-    ray start --address="$RAY_MASTER_ADDR:$RAY_HEAD_PORT" --block --disable-usage-stats
-fi
+# WORK_DIR=$(realpath "$WORK_DIR")
+# if [ "$RAY_RANK" -eq 0 ]; then
+#     rm -rf /tmp/ray_log
+#     export RAY_LOG_DIR="${WORK_DIR}/ray_${current_time}/"
+#     mkdir -p ${RAY_LOG_DIR}
+#     ln -sfn "${RAY_LOG_DIR}" /tmp/ray_log
+#     ray start --head \
+#         --node-ip-address="$RAY_MASTER_ADDR" \
+#         --port="$RAY_HEAD_PORT" \
+#         --dashboard-host=0.0.0.0 \
+#         --dashboard-port=$RAY_DASHBOARD_PORT \
+#         --include-dashboard=true \
+#         --disable-usage-stats \
+#         --num-cpus=$total_cpus \
+#         --temp-dir="/tmp/ray_log/"
+# else
+#     while true; do
+#         if curl --connect-timeout 2 "http://${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}" >/dev/null 2>&1; then
+#             echo "Successfully connected to Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}"
+#             break
+#         else
+#             echo "Waiting for Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT} to be available..."
+#             sleep 2
+#         fi
+#     done
+#     ray start --address="$RAY_MASTER_ADDR:$RAY_HEAD_PORT" --block --disable-usage-stats
+# fi

-while true; do
-    result=$(ray status | grep ${ACCELERATOR} | cut -d ' ' -f2 | cut -d '/' -f2)
-    expected_accelerator_count=$((node_count * ${ACCELERATOR_PER_NODE}))
-    if [ "$result" = "$expected_accelerator_count.0" ]; then
-        break
-    else
-        echo "Waiting for ${ACCELERATOR} count to be $expected_accelerator_count, current: $result"
-        sleep 2
-    fi
-done
+# while true; do
+#     result=$(ray status | grep ${ACCELERATOR} | cut -d ' ' -f2 | cut -d '/' -f2)
+#     expected_accelerator_count=$((node_count * ${ACCELERATOR_PER_NODE}))
+#     if [ "$result" = "$expected_accelerator_count.0" ]; then
+#         break
+#     else
+#         echo "Waiting for ${ACCELERATOR} count to be $expected_accelerator_count, current: $result"
+#         sleep 2
+#     fi
+# done

 SCRIPT_NAME=$(basename "$0")
 cp "$0" "${WORK_DIR}/${SCRIPT_NAME}"

examples/v1/scripts/run_rl_deterministic.sh

Lines changed: 11 additions & 4 deletions
@@ -14,7 +14,7 @@ EVAL_DATA_PATH=${4:-""}
 export PYTHONPATH=$(pwd):$PYTHONPATH

 # deterministic environment variables
-# NOTE: you should use sglang==0.5.5 to reproduce our results deterministic results.
+# NOTE: you should use sglang==0.5.9 to reproduce our deterministic results.
 export XTUNER_USE_SGLANG=1
 export XTUNER_USE_LMDEPLOY=0
 export XTUNER_USE_VLLM=0
@@ -24,7 +24,7 @@ export XTUNER_USE_FA3=0
 # sglang environment variables
 unset PYTORCH_CUDA_ALLOC_CONF
 export SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
-
+export SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION=False
 # ray environment variables
 export MASTER_PORT=6000
 export WORLD_SIZE=${NODE_COUNT:-"1"}
@@ -36,6 +36,7 @@ export RAY_CLIENT_PORT=${RAY_CLIENT_PORT:-"10001"}
 export RAY_DASHBOARD_PORT=${RAY_DASHBOARD_PORT:-"8265"}
 # TODO: provide a non-environment-variable way to configure ray_max_concurrency
 export RAY_MAX_CONCURRENCY=${RAY_MAX_CONCURRENCY:-1024} # dataflow_max_concurrency * prompt_repeat_k
+export ACCELERATOR=${ACCELERATOR:-"GPU"}

 # xtuner environment variables
 export MODEL_PATH=$MODEL_PATH
@@ -48,6 +49,7 @@ current_time=$(date "+%m%d%H")
 # Use the last path component of the model path as model_name and the second-to-last component of the data path as data_name
 model_dir_name=$(basename "$MODEL_PATH")
 data_dir_name=$(basename "$(dirname "$DATA_PATH")")
+infer_backend_lower="sglang"

 if [ "x$WORK_DIR" = "x" ]; then
     DIR=$(pwd)
@@ -99,7 +101,12 @@ fi

 while true; do
     result=$(ray status | grep ${ACCELERATOR} | cut -d ' ' -f2 | cut -d '/' -f2)
-    expected_accelerator_count=$((node_count * 8))
+    accelerator_per_node=${ACCELERATOR_PER_NODE:-8}
+    if [ -n "${CUDA_VISIBLE_DEVICES:-}" ]; then
+        IFS=',' read -ra visible_devices <<< "${CUDA_VISIBLE_DEVICES}"
+        accelerator_per_node=${#visible_devices[@]}
+    fi
+    expected_accelerator_count=$((node_count * accelerator_per_node))
     if [ "$result" = "$expected_accelerator_count.0" ]; then
         break
     else
@@ -115,4 +122,4 @@ LOG_FILE="${WORK_DIR}/training_log_${current_time}.txt"

 python xtuner/v1/train/cli/rl.py \
     --config $CONFIG_PATH \
-    2>&1 | tee -a "${WORK_DIR}/training_log_${current_time}.txt"
+    2>&1 | tee -a "${WORK_DIR}/training_log_${current_time}.txt"
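
The new expected-count logic derives the per-node accelerator count from CUDA_VISIBLE_DEVICES when it is set, falling back to ACCELERATOR_PER_NODE (default 8). The same rule in Python, for illustration; the helper name is hypothetical:

import os

def accelerators_per_node(default: int = 8) -> int:
    # If CUDA_VISIBLE_DEVICES is set, count its comma-separated entries;
    # otherwise fall back to ACCELERATOR_PER_NODE, then to the default.
    visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip()
    if visible:
        return len(visible.split(","))
    return int(os.environ.get("ACCELERATOR_PER_NODE", str(default)))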

examples/v1/scripts/run_rl_submit.sh

Lines changed: 1 addition & 0 deletions
@@ -48,6 +48,7 @@ if [ "$infer_backend_lower" = "sglang" ]; then
     export XTUNER_USE_SGLANG=1
     unset PYTORCH_CUDA_ALLOC_CONF
     export SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
+    export SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION=False
 elif [ "$infer_backend_lower" = "lmdeploy" ]; then
     export XTUNER_USE_LMDEPLOY=1
     export PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True'
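
All three launch scripts now export SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION=False. Given the commit message ("fix check health"), the flag is assumed to keep sglang's health endpoint from running a real generation pass, so health checks stay cheap. A minimal reachability-only probe against the server's /health route, for illustration; the helper and URL are hypothetical:

import requests

def probe_health(base_url: str, timeout: float = 2.0) -> bool:
    # Plain HTTP reachability check; nothing on the client side asks the
    # server to generate tokens.
    try:
        return requests.get(f"{base_url}/health", timeout=timeout).ok
    except requests.RequestException:
        return False

# Usage sketch: probe_health("http://127.0.0.1:30000")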

xtuner/v1/ray/rollout/sglang.py

Lines changed: 5 additions & 27 deletions
@@ -4,7 +4,6 @@

 import numpy as np
 import requests
-import torch
 from urllib3.exceptions import NewConnectionError

 from transformers import AutoConfig, AutoTokenizer
@@ -150,35 +149,14 @@ def reset_prefix_cache(self):
         return self._make_request("release_memory_occupation")

     def _decode_routed_experts(self, routed_experts: Any, meta_info: Dict[str, Any]):
-        if not isinstance(routed_experts, str):
-            return super()._decode_routed_experts(routed_experts, meta_info)
+        import ray

-        prompt_tokens = meta_info.get("prompt_tokens", 0)
-        completion_tokens = meta_info.get("completion_tokens", 0)
-        num_tokens = prompt_tokens + completion_tokens - 1
-        assert num_tokens > 0, (
-            f"Unexpected routed_experts token count: prompt_tokens={prompt_tokens}, completion_tokens={completion_tokens}"
+        assert isinstance(routed_experts, str), (
+            f"Expected routed_experts to be a base64 string, got {type(routed_experts)}"
         )
-        assert self.routed_experts_num_hidden_layers is not None, (
-            "num_hidden_layers is required to decode routed_experts"
-        )
-        assert self.routed_experts_num_experts_per_tok is not None, (
-            "num_experts_per_tok is required to decode routed_experts"
-        )
-
         routed_experts_flat = np.frombuffer(base64.b64decode(routed_experts), dtype=np.int32)
-        expected_size = num_tokens * self.routed_experts_num_hidden_layers * self.routed_experts_num_experts_per_tok
-        assert routed_experts_flat.size == expected_size, (
-            f"Unexpected routed_experts size {routed_experts_flat.size}, expected {expected_size}. "
-            f"num_tokens={num_tokens}, num_hidden_layers={self.routed_experts_num_hidden_layers}, "
-            f"num_experts_per_tok={self.routed_experts_num_experts_per_tok}"
-        )
-        routed_experts_array = routed_experts_flat.reshape(
-            num_tokens,
-            self.routed_experts_num_hidden_layers,
-            self.routed_experts_num_experts_per_tok,
-        )
-        return torch.from_numpy(routed_experts_array.copy())
+        routed_experts_ref = ray.put(routed_experts_flat)  # put the numpy array into the Ray object store
+        return routed_experts_ref

     def _transform_rollout_config_to_server_configs(self):
         # remove the CUDA_VISIBLE_DEVICES set by ray and use base_gpu_id
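
`_decode_routed_experts` now returns a Ray ObjectRef to the flat int32 buffer instead of a reshaped torch tensor; the consumer fetches and reshapes it (see worker.py below). A toy sketch of the put/get round trip, with made-up array contents:

import numpy as np
import ray

ray.init(ignore_reinit_error=True)

flat = np.arange(24, dtype=np.int32)  # stand-in for the base64-decoded buffer
ref = ray.put(flat)                   # store once in the Ray object store
fetched = ray.get(ref)                # numpy arrays are read back zero-copy
assert np.array_equal(fetched, flat)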

xtuner/v1/rl/base/worker.py

Lines changed: 5 additions & 1 deletion
@@ -432,7 +432,11 @@ def _add_rollout_routed_experts(
             if self.sp_mesh.get_local_rank() == 0:
                 # only free once of sp mesh
                 to_free_routed_expert_refs.append(rollout_routed_expert_refs)
-                out_rollout_routed_expert.append(torch.as_tensor(rollout_routed_expert, dtype=torch.long))
+                rollout_routed_expert = torch.as_tensor(rollout_routed_expert, dtype=torch.long)
+                rollout_routed_expert = rollout_routed_expert.reshape(
+                    -1, language_cfg.num_hidden_layers, language_cfg.num_experts_per_tok
+                )
+                out_rollout_routed_expert.append(rollout_routed_expert)

             seq_ctx.rollout_routed_experts = torch.cat(out_rollout_routed_expert, dim=0)  # max_len,l,e
         else:
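
With the decoder returning a flat buffer, the worker now performs the reshape to (tokens, num_hidden_layers, num_experts_per_tok) itself. A toy check of that layout, with made-up dimensions:

import torch

T, L, E = 7, 4, 2                                 # tokens, MoE layers, experts per token
flat = torch.arange(T * L * E, dtype=torch.long)  # stand-in for the fetched buffer
routed = flat.reshape(-1, L, E)                   # same reshape as the new worker code
assert routed.shape == (T, L, E)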
