Skip to content

Commit 86e4006

Browse files
committed
fix decode_routed_experts for sglang
1 parent 0c0b4a2 commit 86e4006

File tree

3 files changed

+48
-52
lines changed

3 files changed

+48
-52
lines changed

examples/v1/scripts/run_rl.sh

Lines changed: 43 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -87,51 +87,51 @@ fi
8787

8888
# 2. Launch Ray cluster
8989
# 根据 NODE_COUNT 分配 num_cpus, 防止内存OOM
90-
# node_count=${NODE_COUNT:-1}
91-
# if [ "$ACCELERATOR" = "GPU" ]; then
92-
# total_cpus=$((node_count * 128))
93-
# elif [ "$ACCELERATOR" = "NPU" ]; then
94-
# total_cpus=$((node_count * 256))
95-
# fi
90+
node_count=${NODE_COUNT:-1}
91+
if [ "$ACCELERATOR" = "GPU" ]; then
92+
total_cpus=$((node_count * 128))
93+
elif [ "$ACCELERATOR" = "NPU" ]; then
94+
total_cpus=$((node_count * 256))
95+
fi
9696

97-
# WORK_DIR=$(realpath "$WORK_DIR")
98-
# if [ "$RAY_RANK" -eq 0 ]; then
99-
# rm -rf /tmp/ray_log
100-
# export RAY_LOG_DIR="${WORK_DIR}/ray_${current_time}/"
101-
# mkdir -p ${RAY_LOG_DIR}
102-
# ln -sfn "${RAY_LOG_DIR}" /tmp/ray_log
103-
# ray start --head \
104-
# --node-ip-address="$RAY_MASTER_ADDR" \
105-
# --port="$RAY_HEAD_PORT" \
106-
# --dashboard-host=0.0.0.0 \
107-
# --dashboard-port=$RAY_DASHBOARD_PORT \
108-
# --include-dashboard=true \
109-
# --disable-usage-stats \
110-
# --num-cpus=$total_cpus \
111-
# --temp-dir="/tmp/ray_log/"
112-
# else
113-
# while true; do
114-
# if curl --connect-timeout 2 "http://${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}" >/dev/null 2>&1; then
115-
# echo "Successfully connected to Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}"
116-
# break
117-
# else
118-
# echo "Waiting for Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT} to be available..."
119-
# sleep 2
120-
# fi
121-
# done
122-
# ray start --address="$RAY_MASTER_ADDR:$RAY_HEAD_PORT" --block --disable-usage-stats
123-
# fi
97+
WORK_DIR=$(realpath "$WORK_DIR")
98+
if [ "$RAY_RANK" -eq 0 ]; then
99+
rm -rf /tmp/ray_log
100+
export RAY_LOG_DIR="${WORK_DIR}/ray_${current_time}/"
101+
mkdir -p ${RAY_LOG_DIR}
102+
ln -sfn "${RAY_LOG_DIR}" /tmp/ray_log
103+
ray start --head \
104+
--node-ip-address="$RAY_MASTER_ADDR" \
105+
--port="$RAY_HEAD_PORT" \
106+
--dashboard-host=0.0.0.0 \
107+
--dashboard-port=$RAY_DASHBOARD_PORT \
108+
--include-dashboard=true \
109+
--disable-usage-stats \
110+
--num-cpus=$total_cpus \
111+
--temp-dir="/tmp/ray_log/"
112+
else
113+
while true; do
114+
if curl --connect-timeout 2 "http://${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}" >/dev/null 2>&1; then
115+
echo "Successfully connected to Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}"
116+
break
117+
else
118+
echo "Waiting for Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT} to be available..."
119+
sleep 2
120+
fi
121+
done
122+
ray start --address="$RAY_MASTER_ADDR:$RAY_HEAD_PORT" --block --disable-usage-stats
123+
fi
124124

125-
# while true; do
126-
# result=$(ray status | grep ${ACCELERATOR} | cut -d ' ' -f2 | cut -d '/' -f2)
127-
# expected_accelerator_count=$((node_count * ${ACCELERATOR_PER_NODE}))
128-
# if [ "$result" = "$expected_accelerator_count.0" ]; then
129-
# break
130-
# else
131-
# echo "Waiting for ${ACCELERATOR} count to be $expected_accelerator_count, current: $result"
132-
# sleep 2
133-
# fi
134-
# done
125+
while true; do
126+
result=$(ray status | grep ${ACCELERATOR} | cut -d ' ' -f2 | cut -d '/' -f2)
127+
expected_accelerator_count=$((node_count * ${ACCELERATOR_PER_NODE}))
128+
if [ "$result" = "$expected_accelerator_count.0" ]; then
129+
break
130+
else
131+
echo "Waiting for ${ACCELERATOR} count to be $expected_accelerator_count, current: $result"
132+
sleep 2
133+
fi
134+
done
135135

136136
SCRIPT_NAME=$(basename "$0")
137137
cp "$0" "${WORK_DIR}/${SCRIPT_NAME}"

xtuner/v1/ray/rollout/sglang.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -144,14 +144,10 @@ def reset_prefix_cache(self):
144144
return self._make_request("release_memory_occupation")
145145

146146
def _decode_routed_experts(self, routed_experts: Any, meta_info: Dict[str, Any]):
147-
import ray
148-
149-
assert isinstance(routed_experts, str), (
150-
f"Expected routed_experts to be a base64 string, got {type(routed_experts)}"
151-
)
152-
routed_experts_flat = np.frombuffer(base64.b64decode(routed_experts), dtype=np.int32)
153-
routed_experts_ref = ray.put(routed_experts_flat) # 将 numpy 数组放入 Ray 对象存储
154-
return routed_experts_ref
147+
if isinstance(routed_experts, str):
148+
routed_experts_flat = np.frombuffer(base64.b64decode(routed_experts), dtype=np.int32)
149+
return routed_experts_flat
150+
return routed_experts
155151

156152
def _transform_rollout_config_to_server_configs(self):
157153
# remove the CUDA_VISIBLE_DEVICES set by ray and use base_gpu_id

xtuner/v1/ray/rollout/worker.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -572,7 +572,7 @@ async def _handle_non_stream_response(
572572
if routed_experts is not None and not exist_history_routed_experts:
573573
# 不存在历史专家,先把当前专家存起来
574574
routed_experts = self._decode_routed_experts(routed_experts, response["meta_info"])
575-
if not isinstance(routed_experts, ObjectRef):
575+
if not isinstance(routed_experts, ObjectRef): # 全部转为ray.objectref存储
576576
routed_experts = ray.put(routed_experts)
577577
extra_info["routed_experts"] = routed_experts
578578
elif routed_experts is not None and exist_history_routed_experts:

0 commit comments

Comments
 (0)