
Commit 9219961

fix check health
1 parent 4003964

6 files changed: 68 additions & 77 deletions


autotest/config/rl_qwen3_8B_gsm8k_grpo.py

Lines changed: 2 additions & 2 deletions
@@ -28,7 +28,7 @@
 data_path = os.environ["DATA_PATH"]
 eval_data_path = os.environ["EVAL_DATA_PATH"]
 enable_evaluate = True if eval_data_path != "" else False
-enbale_partial_rollout = int(os.environ.get("ENBALE_PARTIAL_ROLLOUT", "0"))
+enable_partial_rollout = int(os.environ.get("ENABLE_PARTIAL_ROLLOUT", "0"))

 # basic settings
 experimental_name = "grpo_gsm8k_tiny"
@@ -96,7 +96,7 @@
     prompt_repeat_k=prompt_repeat_k,
     global_batch_size=global_batch_size,
     sample_params=training_sample_params,
-    enable_partial_rollout=enbale_partial_rollout,
+    enable_partial_rollout=enable_partial_rollout,
 )

 evaluator_cfg = (
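
Why the misspelled name mattered: the flag is read with `int(os.environ.get(...))`, so a typo in the variable name silently falls back to the `"0"` default instead of failing. A minimal sketch of that behavior (values illustrative, not from the repo):

import os

# With the corrected name set, the flag is picked up; with a typo in the
# name, os.environ.get() returns the "0" default and partial rollout
# stays disabled without any error being raised.
os.environ["ENABLE_PARTIAL_ROLLOUT"] = "1"
enable_partial_rollout = int(os.environ.get("ENABLE_PARTIAL_ROLLOUT", "0"))
assert enable_partial_rollout == 1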

examples/v1/scripts/run_rl.sh

Lines changed: 44 additions & 43 deletions
@@ -50,6 +50,7 @@ if [ "$infer_backend_lower" = "sglang" ]; then
     export XTUNER_USE_SGLANG=1
     unset PYTORCH_CUDA_ALLOC_CONF
     export SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
+    export SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION=False
 elif [ "$infer_backend_lower" = "lmdeploy" ]; then
     export XTUNER_USE_LMDEPLOY=1
     export PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True'
@@ -86,51 +87,51 @@ fi

 # 2. Launch Ray cluster
 # Allocate num_cpus based on NODE_COUNT to avoid out-of-memory errors
-node_count=${NODE_COUNT:-1}
-if [ "$ACCELERATOR" = "GPU" ]; then
-    total_cpus=$((node_count * 128))
-elif [ "$ACCELERATOR" = "NPU" ]; then
-    total_cpus=$((node_count * 256))
-fi
+# node_count=${NODE_COUNT:-1}
+# if [ "$ACCELERATOR" = "GPU" ]; then
+#     total_cpus=$((node_count * 128))
+# elif [ "$ACCELERATOR" = "NPU" ]; then
+#     total_cpus=$((node_count * 256))
+# fi

-WORK_DIR=$(realpath "$WORK_DIR")
-if [ "$RAY_RANK" -eq 0 ]; then
-    rm -rf /tmp/ray_log
-    export RAY_LOG_DIR="${WORK_DIR}/ray_${current_time}/"
-    mkdir -p ${RAY_LOG_DIR}
-    ln -sfn "${RAY_LOG_DIR}" /tmp/ray_log
-    ray start --head \
-        --node-ip-address="$RAY_MASTER_ADDR" \
-        --port="$RAY_HEAD_PORT" \
-        --dashboard-host=0.0.0.0 \
-        --dashboard-port=$RAY_DASHBOARD_PORT \
-        --include-dashboard=true \
-        --disable-usage-stats \
-        --num-cpus=$total_cpus \
-        --temp-dir="/tmp/ray_log/"
-else
-    while true; do
-        if curl --connect-timeout 2 "http://${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}" >/dev/null 2>&1; then
-            echo "Successfully connected to Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}"
-            break
-        else
-            echo "Waiting for Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT} to be available..."
-            sleep 2
-        fi
-    done
-    ray start --address="$RAY_MASTER_ADDR:$RAY_HEAD_PORT" --block --disable-usage-stats
-fi
+# WORK_DIR=$(realpath "$WORK_DIR")
+# if [ "$RAY_RANK" -eq 0 ]; then
+#     rm -rf /tmp/ray_log
+#     export RAY_LOG_DIR="${WORK_DIR}/ray_${current_time}/"
+#     mkdir -p ${RAY_LOG_DIR}
+#     ln -sfn "${RAY_LOG_DIR}" /tmp/ray_log
+#     ray start --head \
+#         --node-ip-address="$RAY_MASTER_ADDR" \
+#         --port="$RAY_HEAD_PORT" \
+#         --dashboard-host=0.0.0.0 \
+#         --dashboard-port=$RAY_DASHBOARD_PORT \
+#         --include-dashboard=true \
+#         --disable-usage-stats \
+#         --num-cpus=$total_cpus \
+#         --temp-dir="/tmp/ray_log/"
+# else
+#     while true; do
+#         if curl --connect-timeout 2 "http://${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}" >/dev/null 2>&1; then
+#             echo "Successfully connected to Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT}"
+#             break
+#         else
+#             echo "Waiting for Ray master at ${RAY_MASTER_ADDR}:${RAY_DASHBOARD_PORT} to be available..."
+#             sleep 2
+#         fi
+#     done
+#     ray start --address="$RAY_MASTER_ADDR:$RAY_HEAD_PORT" --block --disable-usage-stats
+# fi

-while true; do
-    result=$(ray status | grep ${ACCELERATOR} | cut -d ' ' -f2 | cut -d '/' -f2)
-    expected_accelerator_count=$((node_count * ${ACCELERATOR_PER_NODE}))
-    if [ "$result" = "$expected_accelerator_count.0" ]; then
-        break
-    else
-        echo "Waiting for ${ACCELERATOR} count to be $expected_accelerator_count, current: $result"
-        sleep 2
-    fi
-done
+# while true; do
+#     result=$(ray status | grep ${ACCELERATOR} | cut -d ' ' -f2 | cut -d '/' -f2)
+#     expected_accelerator_count=$((node_count * ${ACCELERATOR_PER_NODE}))
+#     if [ "$result" = "$expected_accelerator_count.0" ]; then
+#         break
+#     else
+#         echo "Waiting for ${ACCELERATOR} count to be $expected_accelerator_count, current: $result"
+#         sleep 2
+#     fi
+# done

 SCRIPT_NAME=$(basename "$0")
 cp "$0" "${WORK_DIR}/${SCRIPT_NAME}"

examples/v1/scripts/run_rl_deterministic.sh

Lines changed: 11 additions & 4 deletions
@@ -14,7 +14,7 @@ EVAL_DATA_PATH=${4:-""}
 export PYTHONPATH=$(pwd):$PYTHONPATH

 # deterministic environment variables
-# NOTE: you should use sglang==0.5.5 to reproduce our results deterministic results.
+# NOTE: you should use sglang==0.5.9 to reproduce our deterministic results.
 export XTUNER_USE_SGLANG=1
 export XTUNER_USE_LMDEPLOY=0
 export XTUNER_USE_VLLM=0
@@ -24,7 +24,7 @@ export XTUNER_USE_FA3=0
 # sglang environment variables
 unset PYTORCH_CUDA_ALLOC_CONF
 export SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
-
+export SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION=False
 # ray environment variables
 export MASTER_PORT=6000
 export WORLD_SIZE=${NODE_COUNT:-"1"}
@@ -36,6 +36,7 @@ export RAY_CLIENT_PORT=${RAY_CLIENT_PORT:-"10001"}
 export RAY_DASHBOARD_PORT=${RAY_DASHBOARD_PORT:-"8265"}
 # TODO: provide a non-environment-variable way to configure ray_max_concurrency
 export RAY_MAX_CONCURRENCY=${RAY_MAX_CONCURRENCY:-1024} # dataflow_max_concurrency * prompt_repeat_k
+export ACCELERATOR=${ACCELERATOR:-"GPU"}

 # xtuner environment variables
 export MODEL_PATH=$MODEL_PATH
@@ -48,6 +49,7 @@ current_time=$(date "+%m%d%H")
 # Use the last path component of the model path as model_name and the second-to-last component of the data path as data_name
 model_dir_name=$(basename "$MODEL_PATH")
 data_dir_name=$(basename "$(dirname "$DATA_PATH")")
+infer_backend_lower="sglang"

 if [ "x$WORK_DIR" = "x" ]; then
     DIR=$(pwd)
@@ -99,7 +101,12 @@ fi

 while true; do
     result=$(ray status | grep ${ACCELERATOR} | cut -d ' ' -f2 | cut -d '/' -f2)
-    expected_accelerator_count=$((node_count * 8))
+    accelerator_per_node=${ACCELERATOR_PER_NODE:-8}
+    if [ -n "${CUDA_VISIBLE_DEVICES:-}" ]; then
+        IFS=',' read -ra visible_devices <<< "${CUDA_VISIBLE_DEVICES}"
+        accelerator_per_node=${#visible_devices[@]}
+    fi
+    expected_accelerator_count=$((node_count * accelerator_per_node))
     if [ "$result" = "$expected_accelerator_count.0" ]; then
         break
     else
@@ -115,4 +122,4 @@ LOG_FILE="${WORK_DIR}/training_log_${current_time}.txt"

 python xtuner/v1/train/cli/rl.py \
     --config $CONFIG_PATH \
-    2>&1 | tee -a "${WORK_DIR}/training_log_${current_time}.txt"
+    2>&1 | tee -a "${WORK_DIR}/training_log_${current_time}.txt"
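
The new expected-count logic derives the per-node accelerator count from CUDA_VISIBLE_DEVICES when it is set, falling back to ACCELERATOR_PER_NODE (default 8). The same rule in Python, for illustration; the helper name is hypothetical:

import os

def accelerators_per_node(default: int = 8) -> int:
    # If CUDA_VISIBLE_DEVICES is set, count its comma-separated entries;
    # otherwise fall back to ACCELERATOR_PER_NODE, then to the default.
    visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip()
    if visible:
        return len(visible.split(","))
    return int(os.environ.get("ACCELERATOR_PER_NODE", str(default)))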

examples/v1/scripts/run_rl_submit.sh

Lines changed: 1 addition & 0 deletions
@@ -48,6 +48,7 @@ if [ "$infer_backend_lower" = "sglang" ]; then
     export XTUNER_USE_SGLANG=1
     unset PYTORCH_CUDA_ALLOC_CONF
     export SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
+    export SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION=False
 elif [ "$infer_backend_lower" = "lmdeploy" ]; then
     export XTUNER_USE_LMDEPLOY=1
     export PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True'
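
All three launch scripts now export SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION=False. Given the commit message ("fix check health"), the flag is assumed to keep sglang's health endpoint from running a real generation pass, so health checks stay cheap. A minimal reachability-only probe against the server's /health route, for illustration; the helper and URL are hypothetical:

import requests

def probe_health(base_url: str, timeout: float = 2.0) -> bool:
    # Plain HTTP reachability check; nothing on the client side asks the
    # server to generate tokens.
    try:
        return requests.get(f"{base_url}/health", timeout=timeout).ok
    except requests.RequestException:
        return False

# Usage sketch: probe_health("http://127.0.0.1:30000")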

xtuner/v1/ray/rollout/sglang.py

Lines changed: 5 additions & 27 deletions
@@ -4,7 +4,6 @@

 import numpy as np
 import requests
-import torch
 from urllib3.exceptions import NewConnectionError

 from transformers import AutoConfig, AutoTokenizer
@@ -150,35 +149,14 @@ def reset_prefix_cache(self):
         return self._make_request("release_memory_occupation")

     def _decode_routed_experts(self, routed_experts: Any, meta_info: Dict[str, Any]):
-        if not isinstance(routed_experts, str):
-            return super()._decode_routed_experts(routed_experts, meta_info)
+        import ray

-        prompt_tokens = meta_info.get("prompt_tokens", 0)
-        completion_tokens = meta_info.get("completion_tokens", 0)
-        num_tokens = prompt_tokens + completion_tokens - 1
-        assert num_tokens > 0, (
-            f"Unexpected routed_experts token count: prompt_tokens={prompt_tokens}, completion_tokens={completion_tokens}"
+        assert isinstance(routed_experts, str), (
+            f"Expected routed_experts to be a base64 string, got {type(routed_experts)}"
         )
-        assert self.routed_experts_num_hidden_layers is not None, (
-            "num_hidden_layers is required to decode routed_experts"
-        )
-        assert self.routed_experts_num_experts_per_tok is not None, (
-            "num_experts_per_tok is required to decode routed_experts"
-        )
-
         routed_experts_flat = np.frombuffer(base64.b64decode(routed_experts), dtype=np.int32)
-        expected_size = num_tokens * self.routed_experts_num_hidden_layers * self.routed_experts_num_experts_per_tok
-        assert routed_experts_flat.size == expected_size, (
-            f"Unexpected routed_experts size {routed_experts_flat.size}, expected {expected_size}. "
-            f"num_tokens={num_tokens}, num_hidden_layers={self.routed_experts_num_hidden_layers}, "
-            f"num_experts_per_tok={self.routed_experts_num_experts_per_tok}"
-        )
-        routed_experts_array = routed_experts_flat.reshape(
-            num_tokens,
-            self.routed_experts_num_hidden_layers,
-            self.routed_experts_num_experts_per_tok,
-        )
-        return torch.from_numpy(routed_experts_array.copy())
+        routed_experts_ref = ray.put(routed_experts_flat)  # put the numpy array into the Ray object store
+        return routed_experts_ref

     def _transform_rollout_config_to_server_configs(self):
         # remove the CUDA_VISIBLE_DEVICES set by ray and use base_gpu_id
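
`_decode_routed_experts` now returns a Ray ObjectRef to the flat int32 buffer instead of a reshaped torch tensor; the consumer fetches and reshapes it (see worker.py below). A toy sketch of the put/get round trip, with made-up array contents:

import numpy as np
import ray

ray.init(ignore_reinit_error=True)

flat = np.arange(24, dtype=np.int32)  # stand-in for the base64-decoded buffer
ref = ray.put(flat)                   # store once in the Ray object store
fetched = ray.get(ref)                # numpy arrays are read back zero-copy
assert np.array_equal(fetched, flat)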

xtuner/v1/rl/base/worker.py

Lines changed: 5 additions & 1 deletion
@@ -432,7 +432,11 @@ def _add_rollout_routed_experts(
             if self.sp_mesh.get_local_rank() == 0:
                 # only free once of sp mesh
                 to_free_routed_expert_refs.append(rollout_routed_expert_refs)
-                out_rollout_routed_expert.append(torch.as_tensor(rollout_routed_expert, dtype=torch.long))
+                rollout_routed_expert = torch.as_tensor(rollout_routed_expert, dtype=torch.long)
+                rollout_routed_expert = rollout_routed_expert.reshape(
+                    -1, language_cfg.num_hidden_layers, language_cfg.num_experts_per_tok
+                )
+                out_rollout_routed_expert.append(rollout_routed_expert)

             seq_ctx.rollout_routed_experts = torch.cat(out_rollout_routed_expert, dim=0)  # max_len,l,e
         else:
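
With the decoder returning a flat buffer, the worker now performs the reshape to (tokens, num_hidden_layers, num_experts_per_tok) itself. A toy check of that layout, with made-up dimensions:

import torch

T, L, E = 7, 4, 2                                 # tokens, MoE layers, experts per token
flat = torch.arange(T * L * E, dtype=torch.long)  # stand-in for the fetched buffer
routed = flat.reshape(-1, L, E)                   # same reshape as the new worker code
assert routed.shape == (T, L, E)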
