Fix sglang _decode_routed_experts: return the decoded int32 array directly; pass non-string input through unchanged

YanhuiDua · YanhuiDua · commit 86e40069de01 · 2026-04-09T18:30:20.000+08:00
diff --git a/examples/v1/scripts/run_rl.sh b/examples/v1/scripts/run_rl.sh
@@ -87,51 +87,51 @@ fi
 
 # 2. Launch Ray cluster
 # 根据 NODE_COUNT 分配 num_cpus, 防止内存OOM
-# node_count=${NODE_COUNT:-1}
-# if [ "$ACCELERATOR" = "GPU" ]; then
-#   total_cpus=$((node_count * 128))
-# elif [ "$ACCELERATOR" = "NPU" ]; then
-#   total_cpus=$((node_count * 256))
-# fi
+node_count=${NODE_COUNT:-1}
+if [ "$ACCELERATOR" = "GPU" ]; then
+  total_cpus=$((node_count * 128))
+elif [ "$ACCELERATOR" = "NPU" ]; then
+  total_cpus=$((node_count * 256))
+fi
 
-# WORK_DIR=$(realpath "$WORK_DIR")
-# if [ "$RAY_RANK" -eq 0 ]; then
-#   rm -rf /tmp/ray_log
-#   export RAY_LOG_DIR="${WORK_DIR}/ray_${current_time}/"
-#   mkdir -p ${RAY_LOG_DIR}
-#   ln -sfn "${RAY_LOG_DIR}" /tmp/ray_log
-#   ray start --head \
-#     --node-ip-address="$RAY_MASTER_ADDR" \
-#     --port="$RAY_HEAD_PORT" \
-#     --dashboard-host=0.0.0.0 \
-#     --dashboard-port=$RAY_DASHBOARD_PORT \
-#     --include-dashboard=true \
-#     --disable-usage-stats \
-#     --num-cpus=$total_cpus \
-#     --temp-dir="/tmp/ray_log/"
-# else
-#   while true; do
-#     if curl --connect-timeout 2 "http://${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}" >/dev/null 2>&1; then
-#       echo "Successfully connected to Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}"
-#       break
-#     else
-#       echo "Waiting for Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT} to be available..."
-#       sleep 2
-#     fi
-#   done
-#   ray start --address="$RAY_MASTER_ADDR:$RAY_HEAD_PORT" --block --disable-usage-stats
-# fi
+WORK_DIR=$(realpath "$WORK_DIR")
+if [ "$RAY_RANK" -eq 0 ]; then
+  rm -rf /tmp/ray_log
+  export RAY_LOG_DIR="${WORK_DIR}/ray_${current_time}/"
+  mkdir -p ${RAY_LOG_DIR}
+  ln -sfn "${RAY_LOG_DIR}" /tmp/ray_log
+  ray start --head \
+    --node-ip-address="$RAY_MASTER_ADDR" \
+    --port="$RAY_HEAD_PORT" \
+    --dashboard-host=0.0.0.0 \
+    --dashboard-port=$RAY_DASHBOARD_PORT \
+    --include-dashboard=true \
+    --disable-usage-stats \
+    --num-cpus=$total_cpus \
+    --temp-dir="/tmp/ray_log/"
+else
+  while true; do
+    if curl --connect-timeout 2 "http://${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}" >/dev/null 2>&1; then
+      echo "Successfully connected to Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}"
+      break
+    else
+      echo "Waiting for Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT} to be available..."
+      sleep 2
+    fi
+  done
+  ray start --address="$RAY_MASTER_ADDR:$RAY_HEAD_PORT" --block --disable-usage-stats
+fi
 
-# while true; do
-#   result=$(ray status | grep ${ACCELERATOR} | cut -d ' ' -f2 | cut -d '/' -f2)
-#   expected_accelerator_count=$((node_count * ${ACCELERATOR_PER_NODE}))
-#   if [ "$result" = "$expected_accelerator_count.0" ]; then
-#     break
-#   else
-#     echo "Waiting for ${ACCELERATOR} count to be $expected_accelerator_count, current: $result"
-#     sleep 2
-#   fi
-# done
+while true; do
+  result=$(ray status | grep ${ACCELERATOR} | cut -d ' ' -f2 | cut -d '/' -f2)
+  expected_accelerator_count=$((node_count * ${ACCELERATOR_PER_NODE}))
+  if [ "$result" = "$expected_accelerator_count.0" ]; then
+    break
+  else
+    echo "Waiting for ${ACCELERATOR} count to be $expected_accelerator_count, current: $result"
+    sleep 2
+  fi
+done
 
 SCRIPT_NAME=$(basename "$0")
 cp "$0" "${WORK_DIR}/${SCRIPT_NAME}"
diff --git a/xtuner/v1/ray/rollout/sglang.py b/xtuner/v1/ray/rollout/sglang.py
@@ -144,14 +144,10 @@ def reset_prefix_cache(self):
         return self._make_request("release_memory_occupation")
 
     def _decode_routed_experts(self, routed_experts: Any, meta_info: Dict[str, Any]):
-        import ray
-
-        assert isinstance(routed_experts, str), (
-            f"Expected routed_experts to be a base64 string, got {type(routed_experts)}"
-        )
-        routed_experts_flat = np.frombuffer(base64.b64decode(routed_experts), dtype=np.int32)
-        routed_experts_ref = ray.put(routed_experts_flat)  # 将 numpy 数组放入 Ray 对象存储
-        return routed_experts_ref
+        if isinstance(routed_experts, str):
+            routed_experts_flat = np.frombuffer(base64.b64decode(routed_experts), dtype=np.int32)
+            return routed_experts_flat
+        return routed_experts
 
     def _transform_rollout_config_to_server_configs(self):
         # remove the CUDA_VISIBLE_DEVICES set by ray and use base_gpu_id
diff --git a/xtuner/v1/ray/rollout/worker.py b/xtuner/v1/ray/rollout/worker.py
@@ -572,7 +572,7 @@ async def _handle_non_stream_response(
                     if routed_experts is not None and not exist_history_routed_experts:
                         # 不存在历史专家，先把当前专家存起来
                         routed_experts = self._decode_routed_experts(routed_experts, response["meta_info"])
-                        if not isinstance(routed_experts, ObjectRef):
+                        if not isinstance(routed_experts, ObjectRef):  # 全部转为ray.objectref存储
                             routed_experts = ray.put(routed_experts)
                         extra_info["routed_experts"] = routed_experts
                     elif routed_experts is not None and exist_history_routed_experts: