From 1a83804139f6273b4b5a3b464caaeea2fc772197 Mon Sep 17 00:00:00 2001 From: sumingZero <469434916@qq.com> Date: Fri, 23 Jan 2026 00:07:11 -0800 Subject: [PATCH] [Fix] Failed to start vLLM service using multi-node launch scripts under CUDA data parallelism --- examples/deployments/scripts/vllm/common.sh | 6 +++--- examples/deployments/scripts/vllm/run_vllm.sh | 6 +++++- examples/deployments/scripts/vllm/start_ray.sh | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/examples/deployments/scripts/vllm/common.sh b/examples/deployments/scripts/vllm/common.sh index e9fe02874..a4964106a 100644 --- a/examples/deployments/scripts/vllm/common.sh +++ b/examples/deployments/scripts/vllm/common.sh @@ -38,13 +38,13 @@ ensure_ifconfig_installed() { if command -v apt-get >/dev/null 2>&1; then echo "Detected apt-get (Debian/Ubuntu). Installing net-tools..." - sudo apt-get update && sudo apt-get install -y net-tools + apt-get update && apt-get install -y net-tools elif command -v yum >/dev/null 2>&1; then echo "Detected yum (RHEL/CentOS). Installing net-tools..." - sudo yum install -y net-tools + yum install -y net-tools elif command -v dnf >/dev/null 2>&1; then echo "Detected dnf (Fedora). Installing net-tools..." - sudo dnf install -y net-tools + dnf install -y net-tools else echo "ERROR: No supported package manager (apt/yum/dnf) found." echo "Please install 'net-tools' manually, 'ifconfig' is required to get network interface information." 
diff --git a/examples/deployments/scripts/vllm/run_vllm.sh b/examples/deployments/scripts/vllm/run_vllm.sh index 972b50265..24a6cd68e 100644 --- a/examples/deployments/scripts/vllm/run_vllm.sh +++ b/examples/deployments/scripts/vllm/run_vllm.sh @@ -25,7 +25,7 @@ start_server() { echo "pp_size = $pp_size" echo "enable_expert_parallel = $enable_expert_parallel" echo "max_model_len = $max_model_len" - echo "max_num_batched_tokens = $max_num_batch_tokens" + echo "max_num_batched_tokens = $max_num_batched_tokens" echo "max_num_seqs = $max_num_seqs" echo "block_size = $block_size" echo "gpu_memory_utilization = $gpu_memory_utilization" @@ -69,6 +69,10 @@ start_server() { fi # --- Boolean flags --- + if [[ "$distributed_executor_backend" == "ray" ]] && [[ "$dp_size" -gt 1 ]]; then + CMD+=("--data-parallel-backend" "ray") + CMD+=("--data-parallel-size-local" "$((dp_size / node_num))"); + fi if [[ "$async_scheduling" == "true" ]]; then CMD+=("--async-scheduling"); fi if [[ "$enable_expert_parallel" == "true" ]]; then CMD+=("--enable-expert-parallel"); fi if [[ "$enable_prefix_caching" == "false" ]]; then CMD+=("--no-enable-prefix-caching"); fi diff --git a/examples/deployments/scripts/vllm/start_ray.sh b/examples/deployments/scripts/vllm/start_ray.sh index 826e3c90d..f31e848d9 100644 --- a/examples/deployments/scripts/vllm/start_ray.sh +++ b/examples/deployments/scripts/vllm/start_ray.sh @@ -30,7 +30,7 @@ set_node_env(){ export NCCL_SOCKET_IFNAME="$IFACE" export GLOO_SOCKET_IFNAME="$IFACE" export TP_SOCKET_IFNAME="$IFACE" - export NUM_GPUS=$((tp_size / node_num)) + export NUM_GPUS=$((tp_size * dp_size * pp_size / node_num)) echo "" echo "===== ray startup configuration ======"