Skip to content

Commit 69844b2

Browse files
committed
temp fix (#148)
1 parent 6edcc3a commit 69844b2

12 files changed

Lines changed: 375 additions & 43 deletions
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#!/usr/bin/env bash
2+
3+
# === Required Env Vars ===
4+
# HF_TOKEN
5+
# HF_HUB_CACHE
6+
# IMAGE
7+
# MODEL
8+
# ISL
9+
# OSL
10+
# MAX_MODEL_LEN
11+
# RANDOM_RANGE_RATIO
12+
# TP
13+
# CONC
14+
# RESULT_FILENAME
15+
# PORT_OFFSET
16+
17+
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
18+
19+
hf download $MODEL
20+
SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
21+
# Per-job listen port: base 8888 plus PORT_OFFSET. The offset falls back to 0
# when PORT_OFFSET is unset or empty, so the arithmetic cannot fail and leave
# PORT empty for the --port arguments below.
PORT=$(( 8888 + ${PORT_OFFSET:-0} ))
22+
23+
24+
set -x
25+
26+
# Create llama-config.yml inline
27+
# For 1k/1k, setting batch_wait_max_tokens_ratio and batch_wait_timeout_iters improves performance; both default to zero.
28+
if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then
29+
cat > llama-config.yml << 'EOF'
30+
batch_wait_max_tokens_ratio: 0.9
31+
batch_wait_timeout_iters: 20
32+
cuda_graph_config:
33+
enable_padding: true
34+
max_batch_size: 1024
35+
kv_cache_config:
36+
dtype: fp8
37+
enable_block_reuse: false
38+
stream_interval: 10
39+
EOF
40+
else
41+
cat > llama-config.yml << 'EOF'
42+
cuda_graph_config:
43+
enable_padding: true
44+
max_batch_size: 1024
45+
kv_cache_config:
46+
dtype: fp8
47+
enable_block_reuse: false
48+
stream_interval: 10
49+
EOF
50+
fi
51+
52+
# Launch TRT-LLM server
53+
mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 &
54+
55+
56+
set +x
57+
# Block until the server reports readiness: follow $SERVER_LOG from its
# current end (-n0), echo every new line, and stop at the first line that
# contains "Application startup complete".
# NOTE(review): that match is the loop's only exit, so if the server dies
# before printing it this presumably waits until the job is killed — confirm
# whether a timeout or a liveness check on the server process is wanted.
while IFS= read -r line; do
58+
printf '%s\n' "$line"
59+
if [[ "$line" == *"Application startup complete"* ]]; then
60+
break
61+
fi
62+
done < <(tail -F -n0 "$SERVER_LOG")
63+
64+
set -x
65+
git clone https://github.com/kimbochen/bench_serving.git
66+
python3 bench_serving/benchmark_serving.py \
67+
--model $MODEL --backend openai \
68+
--base-url http://0.0.0.0:$PORT \
69+
--dataset-name random \
70+
--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \
71+
--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \
72+
--request-rate inf --ignore-eos \
73+
--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
74+
--result-dir /workspace/ \
75+
--result-filename $RESULT_FILENAME.json
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#!/usr/bin/env bash
2+
3+
# === Required Env Vars ===
4+
# HF_TOKEN
5+
# HF_HUB_CACHE
6+
# IMAGE
7+
# MODEL
8+
# ISL
9+
# OSL
10+
# MAX_MODEL_LEN
11+
# RANDOM_RANGE_RATIO
12+
# TP
13+
# CONC
14+
# RESULT_FILENAME
15+
# PORT_OFFSET
16+
17+
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
18+
19+
hf download $MODEL
20+
SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
21+
# Per-job listen port: base 8888 plus PORT_OFFSET. The offset falls back to 0
# when PORT_OFFSET is unset or empty, so the arithmetic cannot fail and leave
# PORT empty for the --port arguments below.
PORT=$(( 8888 + ${PORT_OFFSET:-0} ))
22+
23+
24+
set -x
25+
26+
# Create llama-config.yml inline
27+
# For 1k/1k, setting batch_wait_max_tokens_ratio and batch_wait_timeout_iters improves performance; both default to zero.
28+
if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then
29+
cat > llama-config.yml << 'EOF'
30+
batch_wait_max_tokens_ratio: 0.9
31+
batch_wait_timeout_iters: 20
32+
cuda_graph_config:
33+
enable_padding: true
34+
max_batch_size: 1024
35+
kv_cache_config:
36+
dtype: fp8
37+
enable_block_reuse: false
38+
stream_interval: 10
39+
EOF
40+
else
41+
cat > llama-config.yml << 'EOF'
42+
cuda_graph_config:
43+
enable_padding: true
44+
max_batch_size: 1024
45+
kv_cache_config:
46+
dtype: fp8
47+
enable_block_reuse: false
48+
stream_interval: 10
49+
EOF
50+
fi
51+
52+
# Launch TRT-LLM server
53+
mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 &
54+
55+
56+
set +x
57+
# Block until the server reports readiness: follow $SERVER_LOG from its
# current end (-n0), echo every new line, and stop at the first line that
# contains "Application startup complete".
# NOTE(review): that match is the loop's only exit, so if the server dies
# before printing it this presumably waits until the job is killed — confirm
# whether a timeout or a liveness check on the server process is wanted.
while IFS= read -r line; do
58+
printf '%s\n' "$line"
59+
if [[ "$line" == *"Application startup complete"* ]]; then
60+
break
61+
fi
62+
done < <(tail -F -n0 "$SERVER_LOG")
63+
64+
set -x
65+
git clone https://github.com/kimbochen/bench_serving.git
66+
python3 bench_serving/benchmark_serving.py \
67+
--model $MODEL --backend openai \
68+
--base-url http://0.0.0.0:$PORT \
69+
--dataset-name random \
70+
--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \
71+
--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \
72+
--request-rate inf --ignore-eos \
73+
--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
74+
--result-dir /workspace/ \
75+
--result-filename $RESULT_FILENAME.json

benchmarks/70b_fp8_h200_slurm.sh

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
#!/usr/bin/env bash
2+
3+
# === Required Env Vars ===
4+
# HF_TOKEN
5+
# HF_HUB_CACHE
6+
# IMAGE
7+
# MODEL
8+
# ISL
9+
# OSL
10+
# MAX_MODEL_LEN
11+
# RANDOM_RANGE_RATIO
12+
# TP
13+
# CONC
14+
# RESULT_FILENAME
15+
# PORT_OFFSET
16+
17+
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
18+
19+
set -x
20+
hf download $MODEL
21+
pip install datasets pandas
22+
23+
# Calculate max-model-len based on ISL and OSL:
#   ISL and OSL both 1024  -> ISL + OSL + 20
#   either one is 8192     -> ISL + OSL + 200
#   anything else          -> $MAX_MODEL_LEN, or 10240 when that is unset/empty
# The result is written into config.yaml as max-model-len further down.
24+
if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
25+
CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20))
26+
elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then
27+
CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200))
28+
else
29+
CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240}
30+
fi
31+
32+
# Create config.yaml
33+
cat > config.yaml << EOF
34+
kv-cache-dtype: fp8
35+
async-scheduling: true
36+
no-enable-prefix-caching: true
37+
max-num-batched-tokens: 8192
38+
max-model-len: $CALCULATED_MAX_MODEL_LEN
39+
EOF
40+
41+
SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
42+
# Per-job listen port: base 8888 plus PORT_OFFSET. The offset falls back to 0
# when PORT_OFFSET is unset or empty, so the arithmetic cannot fail and leave
# PORT empty for the --port arguments below.
PORT=$(( 8888 + ${PORT_OFFSET:-0} ))
43+
44+
export TORCH_CUDA_ARCH_LIST="9.0"
45+
46+
PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \
47+
--gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \
48+
--disable-log-requests > $SERVER_LOG 2>&1 &
49+
50+
set +x
51+
# Block until the server reports readiness: follow $SERVER_LOG from its
# current end (-n0), echo every new line, and stop at the first line that
# contains "Application startup complete".
# NOTE(review): that match is the loop's only exit, so if the server dies
# before printing it this presumably waits until the job is killed — confirm
# whether a timeout or a liveness check on the server process is wanted.
while IFS= read -r line; do
52+
printf '%s\n' "$line"
53+
if [[ "$line" == *"Application startup complete"* ]]; then
54+
break
55+
fi
56+
done < <(tail -F -n0 "$SERVER_LOG")
57+
58+
set -x
59+
git clone https://github.com/kimbochen/bench_serving.git
60+
python3 bench_serving/benchmark_serving.py \
61+
--model $MODEL --backend vllm \
62+
--base-url http://0.0.0.0:$PORT \
63+
--dataset-name random \
64+
--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \
65+
--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \
66+
--request-rate inf --ignore-eos \
67+
--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
68+
--result-dir /workspace/ \
69+
--result-filename $RESULT_FILENAME.json
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#!/usr/bin/env bash
2+
3+
# === Required Env Vars ===
4+
# HF_TOKEN
5+
# HF_HUB_CACHE
6+
# IMAGE
7+
# MODEL
8+
# ISL
9+
# OSL
10+
# MAX_MODEL_LEN
11+
# RANDOM_RANGE_RATIO
12+
# TP
13+
# CONC
14+
# RESULT_FILENAME
15+
# PORT_OFFSET
16+
17+
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
18+
19+
hf download $MODEL
20+
SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
21+
# Per-job listen port: base 8888 plus PORT_OFFSET. The offset falls back to 0
# when PORT_OFFSET is unset or empty, so the arithmetic cannot fail and leave
# PORT empty for the --port arguments below.
PORT=$(( 8888 + ${PORT_OFFSET:-0} ))
22+
23+
# Create llama-config.yml inline
24+
# For 1k/1k, setting batch_wait_max_tokens_ratio and batch_wait_timeout_iters improves performance; both default to zero.
25+
if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then
26+
cat > llama-config.yml << 'EOF'
27+
batch_wait_max_tokens_ratio: 0.9
28+
batch_wait_timeout_iters: 20
29+
cuda_graph_config:
30+
enable_padding: true
31+
max_batch_size: 1024
32+
kv_cache_config:
33+
dtype: fp8
34+
enable_block_reuse: false
35+
stream_interval: 10
36+
EOF
37+
else
38+
cat > llama-config.yml << 'EOF'
39+
cuda_graph_config:
40+
enable_padding: true
41+
max_batch_size: 1024
42+
kv_cache_config:
43+
dtype: fp8
44+
enable_block_reuse: false
45+
stream_interval: 10
46+
EOF
47+
fi
48+
49+
mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 &
50+
51+
set +x
52+
# Block until the server reports readiness: follow $SERVER_LOG from its
# current end (-n0), echo every new line, and stop at the first line that
# contains "Application startup complete".
# NOTE(review): that match is the loop's only exit, so if the server dies
# before printing it this presumably waits until the job is killed — confirm
# whether a timeout or a liveness check on the server process is wanted.
while IFS= read -r line; do
53+
printf '%s\n' "$line"
54+
if [[ "$line" == *"Application startup complete"* ]]; then
55+
break
56+
fi
57+
done < <(tail -F -n0 "$SERVER_LOG")
58+
59+
set -x
60+
git clone https://github.com/kimbochen/bench_serving.git
61+
python3 bench_serving/benchmark_serving.py \
62+
--model $MODEL --backend openai \
63+
--base-url http://0.0.0.0:$PORT \
64+
--dataset-name random \
65+
--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \
66+
--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \
67+
--request-rate inf --ignore-eos \
68+
--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
69+
--result-dir /workspace/ \
70+
--result-filename $RESULT_FILENAME.json

benchmarks/70b_fp8_mi325x_slurm.sh

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
#!/usr/bin/bash
2+
3+
# === Required Env Vars ===
4+
# HF_TOKEN
5+
# HF_HUB_CACHE
6+
# IMAGE
7+
# MODEL
8+
# ISL
9+
# OSL
10+
# MAX_MODEL_LEN
11+
# RANDOM_RANGE_RATIO
12+
# TP
13+
# CONC
14+
# RESULT_FILENAME
15+
# PORT_OFFSET
16+
17+
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
18+
19+
huggingface-cli download $MODEL
20+
21+
SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
22+
# Per-job listen port: base 8888 plus PORT_OFFSET. The offset falls back to 0
# when PORT_OFFSET is unset or empty, so the arithmetic cannot fail and leave
# PORT empty for the --port arguments below.
PORT=$(( 8888 + ${PORT_OFFSET:-0} ))
23+
24+
# Reference
25+
# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark
26+
27+
cat > config.yaml << EOF
28+
compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}'
29+
EOF
30+
31+
# Choose VLLM_ROCM_USE_AITER_MHA from the workload shape:
#   ISL/OSL 1024/1024 and 1024/8192 -> 0
#   ISL/OSL 8192/1024               -> 1 when CONC >= 16, otherwise 0
# No other ISL/OSL pair is matched, so for those the variable is not set here
# and keeps whatever value the environment already has.
if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
32+
export VLLM_ROCM_USE_AITER_MHA=0
33+
elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
34+
export VLLM_ROCM_USE_AITER_MHA=0
35+
elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
36+
if [[ "$CONC" -ge "16" ]]; then
37+
export VLLM_ROCM_USE_AITER_MHA=1
38+
else
39+
export VLLM_ROCM_USE_AITER_MHA=0
40+
fi
41+
fi
42+
43+
# Patch the aiter config script to deal
44+
# with weird strings reported by /opt/rocm/llvm/bin/amdgpu-arch.
# The in-place substitution below rewrites the list comprehension that builds
# `archs` so that each entry keeps only the text before its first ":".
# NOTE(review): the path pins python3.10 inside /opt/venv; sed's exit status
# is not checked, so a missing file would not stop the script — confirm the
# path matches the image in use.
45+
file_to_patch='/opt/venv/lib/python3.10/site-packages/aiter_meta/csrc/cpp_itfs/utils.py'
46+
sed -i'' -e 's#archs = \[arch.strip() for arch in archs\]#archs = \[arch.strip().split(":")\[0\] for arch in archs\]#' $file_to_patch
47+
48+
49+
# In this specific case, float16 performs better than the datatype
50+
# picked by vllm when using auto for --dtype (bfloat16).
51+
set -x
52+
vllm serve $MODEL --port=$PORT \
53+
--swap-space=64 \
54+
--gpu-memory-utilization=0.94 \
55+
--dtype=float16 --kv-cache-dtype=fp8 \
56+
--distributed-executor-backend=mp --tensor-parallel-size=$TP \
57+
--max-model-len=$MAX_MODEL_LEN \
58+
--max-seq-len-to-capture=$MAX_MODEL_LEN \
59+
--max-num-seqs=$CONC \
60+
--max-num-batched-tokens=131072 \
61+
--no-enable-prefix-caching \
62+
--config config.yaml \
63+
--async-scheduling \
64+
--disable-log-requests \
65+
> $SERVER_LOG 2>&1 &
66+
67+
set +x
68+
# Block until the server reports readiness: follow $SERVER_LOG from its
# current end (-n0), echo every new line, and stop at the first line that
# contains "Application startup complete".
# NOTE(review): that match is the loop's only exit, so if the server dies
# before printing it this presumably waits until the job is killed — confirm
# whether a timeout or a liveness check on the server process is wanted.
while IFS= read -r line; do
69+
printf '%s\n' "$line"
70+
if [[ "$line" == *"Application startup complete"* ]]; then
71+
break
72+
fi
73+
done < <(tail -F -n0 "$SERVER_LOG")
74+
75+
set -x
76+
git clone https://github.com/kimbochen/bench_serving.git
77+
python3 bench_serving/benchmark_serving.py \
78+
--model $MODEL --backend vllm \
79+
--base-url http://0.0.0.0:$PORT \
80+
--dataset-name random \
81+
--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \
82+
--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \
83+
--request-rate inf --ignore-eos \
84+
--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
85+
--result-dir /workspace/ \
86+
--result-filename $RESULT_FILENAME.json

benchmarks/dsr1_fp4_b200_trt_slurm.sh

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -104,12 +104,6 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \
104104
set +x
105105
while IFS= read -r line; do
106106
printf '%s\n' "$line"
107-
if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then
108-
sleep 5
109-
tail -n100 $SERVER_LOG
110-
echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME"
111-
exit 1
112-
fi
113107
if [[ "$line" == *"Application startup complete"* ]]; then
114108
break
115109
fi

0 commit comments

Comments
 (0)