Commit 5c48af8

[Feat] Basic scripts for deployment best practices (#556)
# Purpose

What this PR does / why we need it: provide basic scripts and corresponding documentation for deployment best practices.

# Usage example

## 1. Start the Ray server on the master node.

![Starting the Ray server on the master node](https://github.com/user-attachments/assets/0f354633-4510-4ec6-917e-a7080b474d1b)

## 2. Start the Ray server on the first worker node.

![Starting the Ray server on the first worker node](https://github.com/user-attachments/assets/8a9f48ed-de9c-44cf-b759-82c54b87e105)

## 3. Start the vLLM server on the master node.

![Starting the vLLM server on the master node](https://github.com/user-attachments/assets/c8d17cc4-d99a-4c6e-8a50-c2f3a2060176)
1 parent ac75dcf commit 5c48af8

6 files changed

Lines changed: 574 additions & 0 deletions

Lines changed: 82 additions & 0 deletions (deployment guide, Markdown)
@@ -0,0 +1,82 @@
# Single-Machine Deployment (CUDA or Ascend)

This scenario applies to a single physical server and uses two files:

- `vllm/config.properties`
- `vllm/run_vllm.sh`

Modify the parameters in `config.properties` according to your actual requirements (e.g., model, memory).

**Note:** `Multi-node Configuration`, `Ray Configuration`, and `Ascend Multi-node Data Parallelism` **can be ignored**, as they are only used in multi-machine inference scenarios.

After completing the configuration, launch the service with:

```bash
bash run_vllm.sh
```
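Once the server is up, you can send a quick test request to vLLM's OpenAI-compatible endpoint. This is a minimal sketch, not part of the PR's scripts; it assumes the default `server_port=7850` and the `model` path from `config.properties`, so adjust both to your setup:

```bash
# Smoke test against the running server (model name defaults to the model path
# when served_model_name is not set in config.properties)
curl http://localhost:7850/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "/home/models/QwQ-32B",
        "prompt": "Hello, my name is",
        "max_tokens": 32
      }'
```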
# Multi-Machine Deployment (CUDA)

In multi-node CUDA deployments, vLLM relies on Ray as its distributed backend. Therefore, in addition to `vllm/config.properties` and `vllm/run_vllm.sh`, you must also use `vllm/start_ray.sh` to start the Ray cluster. For a two-node deployment, follow these steps:

Step 1: Modify `config.properties`

- Set `master_ip` to the IP address of the head node
- Set `worker_ip` to the IP address of the worker node
- Set `node_num` to 2
- Set `distributed_executor_backend` to `ray`
- `Ascend Multi-node Data Parallelism` **can be ignored**, as it is only used in Ascend multi-machine data-parallel inference scenarios.
- Adjust other vLLM parameters as needed (an illustrative snippet follows this list)
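For illustration, the multi-node entries for a two-node CUDA setup might look like the following. The IP addresses are placeholders, not values from the PR:

```properties
# Multi-node Configuration (placeholder IPs)
master_ip=192.168.1.10
worker_ip=192.168.1.11

# Ray Configuration
node_num=2

# Common vLLM Configuration
distributed_executor_backend=ray
```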
Step 2: Start the Ray cluster

- On the head node:

```bash
NODE=0 bash start_ray.sh
```

- On the worker node:

```bash
NODE=1 bash start_ray.sh
```
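Before launching the service, you can confirm that both nodes joined the cluster. This uses the standard Ray CLI rather than anything added by this PR:

```bash
# Run on either node; the resource summary should list 2 nodes
ray status
```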
Step 3: Launch the vLLM service

Run the following command on **either node**:

```bash
bash run_vllm.sh
```

**Scaling Note:** To deploy across more machines, set `node_num` to the actual number of nodes and ensure that each worker node’s `worker_ip` is configured to its own IP address.
# Multi-Machine Deployment (Ascend)

Ascend multi-node deployments differ based on whether **Data Parallelism (DP)** is enabled.

## Case 1: DP = 1 (No Data Parallelism)

This case follows the same procedure as CUDA multi-machine deployment and requires the following files:

- `vllm/config.properties`
- `vllm/run_vllm.sh`
- `vllm/start_ray.sh`

Follow the exact steps described in the **Multi-Machine Deployment (CUDA)** section above.

## Case 2: DP > 1 (Data Parallelism Enabled)

This scenario requires the following files:

- `vllm/config.properties`
- `vllm/run_vllm_dp.sh`

For a two-node deployment, follow these steps:

Step 1: Modify `config.properties`

- Set `master_ip` and `worker_ip`
- Set `dp_size_local` to the number of DP ranks per node
- `Ray Configuration` can be ignored, as it is not used in this scenario.
- Adjust other vLLM parameters as needed (an illustrative snippet follows this list)
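For illustration, a two-node data-parallel setup might carry entries like the following. The IPs and sizes are placeholders; this sketch assumes `dp_size` is the total DP degree across the cluster and `dp_size_local` the per-node share, with `tp_size * dp_size_local` equal to the number of devices per node:

```properties
# Multi-node Configuration (placeholder IPs)
master_ip=192.168.1.10
worker_ip=192.168.1.11

# Ascend Multi-node Data Parallelism
dp_size_local=2

# Common vLLM Configuration (assumes 8 NPUs per node: tp_size * dp_size_local = 8)
tp_size=4
dp_size=4
```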
Step 2: Launch the vLLM service

- On the head node:

```bash
NODE=0 bash run_vllm_dp.sh
```

- On the worker node:

```bash
NODE=1 bash run_vllm_dp.sh
```

**Scaling Note:** When deploying across more nodes, ensure that each worker node’s `worker_ip` is correctly set to its local IP address.
Lines changed: 79 additions & 0 deletions (`vllm/common.sh`, shared helpers sourced by the launch scripts)
@@ -0,0 +1,79 @@
#!/bin/bash

# Load key=value pairs (and `export` lines) from config.properties into the
# current shell. The path can be overridden with the CONFIG_FILE env var.
load_config() {
    local config_file="${CONFIG_FILE:-$(dirname "${BASH_SOURCE[0]}")/config.properties}"

    if [[ ! -f "$config_file" ]]; then
        echo "ERROR: Config file '$config_file' not found!" >&2
        exit 1
    fi

    while IFS= read -r line; do
        # Trim surrounding whitespace; skip blank lines and comments.
        line=$(echo "$line" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
        [[ -z "$line" || "$line" == \#* ]] && continue

        if [[ "$line" == export\ * ]]; then
            rest="${line#export }"
            eval "export $rest"
        else
            if [[ "$line" == *=* ]]; then
                key="${line%%=*}"
                value="${line#*=}"
                key=$(echo "$key" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
                value=$(echo "$value" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
                eval "$key=\$value"
            else
                echo "WARNING: Invalid config line (no '=' found): $line" >&2
            fi
        fi
    done < "$config_file"
}

# Ensure `ifconfig` is available, installing net-tools via the detected
# package manager if necessary.
ensure_ifconfig_installed() {
    if command -v ifconfig >/dev/null 2>&1; then
        return 0
    fi

    echo "'ifconfig' not found. Attempting to install net-tools..."

    if command -v apt-get >/dev/null 2>&1; then
        echo "Detected apt-get (Debian/Ubuntu). Installing net-tools..."
        sudo apt-get update && sudo apt-get install -y net-tools
    elif command -v yum >/dev/null 2>&1; then
        echo "Detected yum (RHEL/CentOS). Installing net-tools..."
        sudo yum install -y net-tools
    elif command -v dnf >/dev/null 2>&1; then
        echo "Detected dnf (Fedora). Installing net-tools..."
        sudo dnf install -y net-tools
    else
        echo "ERROR: No supported package manager (apt/yum/dnf) found."
        echo "Please install 'net-tools' manually; 'ifconfig' is required to get network interface information."
        exit 1
    fi

    if ! command -v ifconfig >/dev/null 2>&1; then
        echo "ERROR: Failed to install net-tools. Please install 'net-tools' manually; 'ifconfig' is required to get network interface information."
        exit 1
    fi

    echo "✅ ifconfig is now available."
}

# Print the name of the network interface that holds the given IPv4 address.
get_interface_by_ip() {
    local target_ip="$1"
    ifconfig | awk -v target="$target_ip" '
        /^[[:alnum:]]/ {
            iface = $1
            sub(/:$/, "", iface)       # strip trailing colon from interface name
        }
        /inet / {
            for (i = 1; i <= NF; i++) {
                gsub(/addr:/, "", $i)  # handle the older "inet addr:x.x.x.x" output format
                if ($i == target) {
                    print iface
                    exit
                }
            }
        }
    '
}
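A minimal usage sketch of these helpers, assuming `common.sh` and `config.properties` sit in the same directory (as `run_vllm.sh` below expects when it sources this file):

```bash
#!/bin/bash
source "$(dirname "$0")/common.sh"

load_config                      # populates shell variables such as $master_ip
ensure_ifconfig_installed        # installs net-tools if ifconfig is missing
iface=$(get_interface_by_ip "$master_ip")
echo "Interface holding $master_ip: ${iface:-<not found>}"
```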
Lines changed: 91 additions & 0 deletions (`vllm/config.properties`)
@@ -0,0 +1,91 @@
#****************************************
# Devices Visible Configuration         *
#****************************************
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7


#****************************************
# Multi-node Configuration              *
#****************************************
master_ip=<MASTER IP>
worker_ip=<WORKER IP>


#****************************************
# Ray Configuration                     *
#****************************************
# Number of nodes in multi-node inference
node_num=<NUMBER OF NODES>


#****************************************
# Ascend Multi-node Data Parallelism    *
#****************************************
export HCCL_OP_EXPANSION_MODE="AIV"
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export HCCL_BUFFSIZE=200
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export VLLM_ASCEND_ENABLE_MLAPO=1
export HCCL_INTRA_PCIE_ENABLE=1
export HCCL_INTRA_ROCE_ENABLE=0
dp_rpc_port=13389
dp_size_local=<NUMBER OF DP PER NODE>


#****************************************
# Common vLLM Configuration             *
#****************************************
# For multi-node and multi-npu inference
export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
export VLLM_ALLREDUCE_USE_SYMM_MEM=0
# For multi-node and multi-gpu inference
export RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES=1
# Run deepseek v3.1+ on CUDA
export VLLM_USE_DEEP_GEMM=0
export VLLM_LOGGING_LEVEL=INFO
model=/home/models/QwQ-32B
# served_model_name=QwQ-32B
server_host=0.0.0.0
server_port=7850
tp_size=4
dp_size=1
pp_size=1
seed=1024
enable_expert_parallel=false
enable_prefix_caching=false
max_model_len=20000
# max_num_batched_tokens=2048
# max_num_seqs=20
# block_size=128
gpu_memory_utilization=0.87
# NONE | PIECEWISE | FULL | FULL_DECODE_ONLY | FULL_AND_PIECEWISE
graph_mode=FULL_DECODE_ONLY
quantization=NONE
# mp | ray ; mp for single-node inference, ray for multi-node inference
distributed_executor_backend=mp
# async_scheduling=false

# speculative decoding configuration
enable_speculative_decoding=false
speculative_decode_model=NONE
speculative_decode_method=deepseek_mtp
num_speculative_tokens=1


#****************************************
# extra vLLM Configuration for Ascend   *
#****************************************
enable_ascend_scheduler=false
# enable_torchair_graph=false


#****************************************
# UCM Configuration                     *
#****************************************
# set true to enable UCM
ucm_enable=false
ucm_config_yaml_path=/vllm-workspace/unified-cache-management/examples/ucm_config_example.yaml
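This file is parsed line by line by `load_config` in `common.sh`: plain `key=value` entries become shell variables, and `export ...` lines become environment variables. Since `load_config` honors the `CONFIG_FILE` environment variable, an alternative copy can be supplied at launch time:

```bash
# Point the launcher at a custom properties file instead of the one
# next to the scripts
CONFIG_FILE=/path/to/my.properties bash run_vllm.sh
```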
Lines changed: 112 additions & 0 deletions (`vllm/run_vllm.sh`)
@@ -0,0 +1,112 @@
#!/bin/bash
echo "$CONFIG_FILE"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/common.sh"

start_server() {
    [[ -z "$model" ]] && { echo "ERROR: model not set in config.properties" >&2; exit 1; }

    if [[ "$ucm_enable" == "true" ]]; then
        [[ -z "$ucm_config_yaml_path" ]] && {
            echo "ERROR: ucm_config_yaml_path not set but ucm_enable=true" >&2
            exit 1
        }
        LOG_FILE="vllm_ucm.log"
    else
        LOG_FILE="vllm.log"
    fi

    echo ""
    echo "===== vllm server configuration ====="
    echo "model = $model"
    echo "served_model_name = ${served_model_name:-<default>}"
    echo "tp_size = $tp_size"
    echo "dp_size = $dp_size"
    echo "pp_size = $pp_size"
    echo "enable_expert_parallel = $enable_expert_parallel"
    echo "max_model_len = $max_model_len"
    echo "max_num_batched_tokens = $max_num_batched_tokens"
    echo "max_num_seqs = $max_num_seqs"
    echo "block_size = $block_size"
    echo "gpu_memory_utilization = $gpu_memory_utilization"
    echo "quantization = $quantization"
    echo "server_host = $server_host"
    echo "server_port = $server_port"
    echo "distributed_backend = $distributed_executor_backend"
    echo "enable_prefix_caching = $enable_prefix_caching"
    echo "async_scheduling = $async_scheduling"
    echo "graph_mode = $graph_mode"
    if [[ "$ucm_enable" == "true" ]]; then
        echo "ucm_config_file = $ucm_config_yaml_path"
    fi
    echo "log_file = $LOG_FILE"
    echo "====================================="
    echo ""

    # --- Required arguments assembled from config.properties ---
    CMD=(
        vllm serve "$model"
        --max-model-len "$max_model_len"
        --tensor-parallel-size "$tp_size"
        --data-parallel-size "$dp_size"
        --pipeline-parallel-size "$pp_size"
        --gpu-memory-utilization "$gpu_memory_utilization"
        --trust-remote-code
        --host "$server_host"
        --port "$server_port"
        --distributed-executor-backend "$distributed_executor_backend"
    )

    # --- Optional numeric/string params ---
    if [[ -n "$block_size" ]]; then CMD+=("--block-size" "$block_size"); fi
    if [[ -n "$max_num_batched_tokens" ]]; then CMD+=("--max-num-batched-tokens" "$max_num_batched_tokens"); fi
    if [[ -n "$max_num_seqs" ]]; then CMD+=("--max-num-seqs" "$max_num_seqs"); fi
    if [[ -n "$seed" ]]; then CMD+=("--seed" "$seed"); fi
    if [[ -n "$served_model_name" ]]; then CMD+=("--served-model-name" "$served_model_name"); fi
    if [[ -n "$quantization" ]] && [[ "$quantization" != "NONE" ]]; then CMD+=("--quantization" "$quantization"); fi
    if [[ -n "$graph_mode" ]]; then
        COMPILATION_CONFIG='{"cudagraph_mode":"'"$graph_mode"'"}'
        CMD+=("--compilation-config" "$COMPILATION_CONFIG")
    fi

    # --- Boolean flags ---
    if [[ "$async_scheduling" == "true" ]]; then CMD+=("--async-scheduling"); fi
    if [[ "$enable_expert_parallel" == "true" ]]; then CMD+=("--enable-expert-parallel"); fi
    if [[ "$enable_prefix_caching" == "false" ]]; then CMD+=("--no-enable-prefix-caching"); fi

    # --- Advanced configs (JSON) ---
    # num_speculative_tokens is emitted as a JSON number, not a string.
    if [[ "$enable_speculative_decoding" == "true" ]]; then
        SPECULATIVE_CONFIG='{"model":"'"$speculative_decode_model"'", "num_speculative_tokens": '"$num_speculative_tokens"', "method":"'"$speculative_decode_method"'"}'
        CMD+=("--speculative-config" "$SPECULATIVE_CONFIG")
    fi

    # Build --additional-config incrementally so it is only passed when at
    # least one Ascend-specific option is set.
    ADDITIONAL_CONFIG="{"
    SEP=""
    if [[ -n "$enable_ascend_scheduler" ]]; then
        ADDITIONAL_CONFIG+="${SEP}\"ascend_scheduler_config\":{\"enabled\":$enable_ascend_scheduler}"
        SEP=","
    fi
    if [[ -n "$enable_torchair_graph" ]]; then
        ADDITIONAL_CONFIG+="${SEP}\"torchair_graph_config\":{\"enabled\":$enable_torchair_graph}"
        SEP=","
    fi
    ADDITIONAL_CONFIG+="}"
    if [[ "$ADDITIONAL_CONFIG" != "{}" ]]; then CMD+=("--additional-config" "$ADDITIONAL_CONFIG"); fi

    if [[ "$ucm_enable" == "true" ]]; then
        KV_CONFIG_JSON="{
            \"kv_connector\":\"UCMConnector\",
            \"kv_connector_module_path\":\"ucm.integration.vllm.ucm_connector\",
            \"kv_role\":\"kv_both\",
            \"kv_connector_extra_config\":{\"UCM_CONFIG_FILE\":\"$ucm_config_yaml_path\"}
        }"
        CMD+=("--kv-transfer-config" "$KV_CONFIG_JSON")
    fi

    echo "Executing command: ${CMD[*]}"
    echo ""

    "${CMD[@]}" 2>&1 | tee "$LOG_FILE"
}

load_config
start_server
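With the defaults in `config.properties` above (optional values left commented out), the command this script assembles would come out roughly as follows. This is an illustration derived by tracing the script, not output captured from the PR:

```bash
vllm serve /home/models/QwQ-32B \
  --max-model-len 20000 \
  --tensor-parallel-size 4 \
  --data-parallel-size 1 \
  --pipeline-parallel-size 1 \
  --gpu-memory-utilization 0.87 \
  --trust-remote-code \
  --host 0.0.0.0 \
  --port 7850 \
  --distributed-executor-backend mp \
  --seed 1024 \
  --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY"}' \
  --no-enable-prefix-caching \
  --additional-config '{"ascend_scheduler_config":{"enabled":false}}'
```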
