SemiAnalysisAI
diff --git a/‎benchmarks/70b_fp4_b200_trt_slurm.sh‎
Lines changed: 75 additions & 0 deletions b/‎benchmarks/70b_fp4_b200_trt_slurm.sh‎
Lines changed: 75 additions & 0 deletions
diff --git a/‎benchmarks/70b_fp8_b200_trt_slurm.sh‎
Lines changed: 75 additions & 0 deletions b/‎benchmarks/70b_fp8_b200_trt_slurm.sh‎
Lines changed: 75 additions & 0 deletions
diff --git a/‎benchmarks/70b_fp8_h200_slurm.sh‎
Lines changed: 69 additions & 0 deletions b/‎benchmarks/70b_fp8_h200_slurm.sh‎
Lines changed: 69 additions & 0 deletions
diff --git a/‎benchmarks/70b_fp8_h200_trt_slurm.sh‎
Lines changed: 70 additions & 0 deletions b/‎benchmarks/70b_fp8_h200_trt_slurm.sh‎
Lines changed: 70 additions & 0 deletions
diff --git a/‎benchmarks/70b_fp8_mi325x_slurm.sh‎
Lines changed: 86 additions & 0 deletions b/‎benchmarks/70b_fp8_mi325x_slurm.sh‎
Lines changed: 86 additions & 0 deletions
diff --git a/‎benchmarks/dsr1_fp4_b200_trt_slurm.sh‎
Lines changed: 0 additions & 6 deletions b/‎benchmarks/dsr1_fp4_b200_trt_slurm.sh‎
Lines changed: 0 additions & 6 deletions
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+
+# Benchmark TensorRT-LLM serving inside a Slurm job: download the model, start
+# trtllm-serve, wait for it to come up, then run the bench_serving client.
+#
+# === Required Env Vars ===
+# HF_TOKEN
+# HF_HUB_CACHE
+# IMAGE
+# MODEL
+# ISL
+# OSL
+# MAX_MODEL_LEN
+# RANDOM_RANGE_RATIO
+# TP
+# CONC
+# RESULT_FILENAME
+# PORT_OFFSET
+
+echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+
+hf download "$MODEL"
+SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
+PORT=$(( 8888 + PORT_OFFSET ))
+
+set -x
+
+# Write llama-config.yml, which is passed to trtllm-serve below.
+# For 1k/1k with TP < 8, also set batch_wait_max_tokens_ratio and
+# batch_wait_timeout_iters (both default to zero), which improves performance.
+{
+    if [[ "$ISL" == "1024" && "$OSL" == "1024" && "$TP" -lt 8 ]]; then
+        printf '%s\n' 'batch_wait_max_tokens_ratio: 0.9' 'batch_wait_timeout_iters: 20'
+    fi
+    cat << 'EOF'
+cuda_graph_config:
+  enable_padding: true
+  max_batch_size: 1024
+kv_cache_config:
+  dtype: fp8
+  enable_block_reuse: false
+stream_interval: 10
+EOF
+} > llama-config.yml
+
+# Launch TRT-LLM server in the background; all of its output goes to $SERVER_LOG.
+mpirun -n 1 --oversubscribe --allow-run-as-root \
+    trtllm-serve "$MODEL" --tp_size "$TP" --trust_remote_code \
+    --max_seq_len "$MAX_MODEL_LEN" --max_num_tokens 16384 \
+    --extra_llm_api_options llama-config.yml --port "$PORT" > "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+
+# Echo the server log until startup completes. --pid makes tail exit once the
+# server process is gone, so a crashed launch fails the job instead of blocking
+# on the log forever.
+set +x
+server_ready=0
+while IFS= read -r line; do
+    printf '%s\n' "$line"
+    if [[ "$line" == *"Application startup complete"* ]]; then
+        server_ready=1
+        break
+    fi
+done < <(tail -F -n0 --pid="$SERVER_PID" "$SERVER_LOG")
+
+if (( ! server_ready )); then
+    echo "Server exited before startup completed; see $SERVER_LOG" >&2
+    exit 1
+fi
+
+set -x
+git clone https://github.com/kimbochen/bench_serving.git
+python3 bench_serving/benchmark_serving.py \
+--model "$MODEL" --backend openai \
+--base-url "http://0.0.0.0:$PORT" \
+--dataset-name random \
+--random-input-len "$ISL" --random-output-len "$OSL" --random-range-ratio "$RANDOM_RANGE_RATIO" \
+--num-prompts "$(( CONC * 10 ))" --max-concurrency "$CONC" \
+--request-rate inf --ignore-eos \
+--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
+--result-dir /workspace/ \
+--result-filename "$RESULT_FILENAME.json"
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+
+# Benchmark TensorRT-LLM serving inside a Slurm job: download the model, start
+# trtllm-serve, wait for it to come up, then run the bench_serving client.
+#
+# === Required Env Vars ===
+# HF_TOKEN
+# HF_HUB_CACHE
+# IMAGE
+# MODEL
+# ISL
+# OSL
+# MAX_MODEL_LEN
+# RANDOM_RANGE_RATIO
+# TP
+# CONC
+# RESULT_FILENAME
+# PORT_OFFSET
+
+echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+
+hf download "$MODEL"
+SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
+PORT=$(( 8888 + PORT_OFFSET ))
+
+set -x
+
+# Write llama-config.yml, which is passed to trtllm-serve below.
+# For 1k/1k with TP < 8, also set batch_wait_max_tokens_ratio and
+# batch_wait_timeout_iters (both default to zero), which improves performance.
+{
+    if [[ "$ISL" == "1024" && "$OSL" == "1024" && "$TP" -lt 8 ]]; then
+        printf '%s\n' 'batch_wait_max_tokens_ratio: 0.9' 'batch_wait_timeout_iters: 20'
+    fi
+    cat << 'EOF'
+cuda_graph_config:
+  enable_padding: true
+  max_batch_size: 1024
+kv_cache_config:
+  dtype: fp8
+  enable_block_reuse: false
+stream_interval: 10
+EOF
+} > llama-config.yml
+
+# Launch TRT-LLM server in the background; all of its output goes to $SERVER_LOG.
+mpirun -n 1 --oversubscribe --allow-run-as-root \
+    trtllm-serve "$MODEL" --tp_size "$TP" --trust_remote_code \
+    --max_seq_len "$MAX_MODEL_LEN" --max_num_tokens 16384 \
+    --extra_llm_api_options llama-config.yml --port "$PORT" > "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+
+# Echo the server log until startup completes. --pid makes tail exit once the
+# server process is gone, so a crashed launch fails the job instead of blocking
+# on the log forever.
+set +x
+server_ready=0
+while IFS= read -r line; do
+    printf '%s\n' "$line"
+    if [[ "$line" == *"Application startup complete"* ]]; then
+        server_ready=1
+        break
+    fi
+done < <(tail -F -n0 --pid="$SERVER_PID" "$SERVER_LOG")
+
+if (( ! server_ready )); then
+    echo "Server exited before startup completed; see $SERVER_LOG" >&2
+    exit 1
+fi
+
+set -x
+git clone https://github.com/kimbochen/bench_serving.git
+python3 bench_serving/benchmark_serving.py \
+--model "$MODEL" --backend openai \
+--base-url "http://0.0.0.0:$PORT" \
+--dataset-name random \
+--random-input-len "$ISL" --random-output-len "$OSL" --random-range-ratio "$RANDOM_RANGE_RATIO" \
+--num-prompts "$(( CONC * 10 ))" --max-concurrency "$CONC" \
+--request-rate inf --ignore-eos \
+--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
+--result-dir /workspace/ \
+--result-filename "$RESULT_FILENAME.json"
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+
+# Benchmark vLLM serving inside a Slurm job: download the model, start
+# `vllm serve`, wait for it to come up, then run the bench_serving client.
+#
+# === Required Env Vars ===
+# HF_TOKEN
+# HF_HUB_CACHE
+# IMAGE
+# MODEL
+# ISL
+# OSL
+# MAX_MODEL_LEN
+# RANDOM_RANGE_RATIO
+# TP
+# CONC
+# RESULT_FILENAME
+# PORT_OFFSET
+
+echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+
+set -x
+hf download "$MODEL"
+pip install datasets pandas
+
+# Calculate max-model-len based on ISL and OSL; MAX_MODEL_LEN (default 10240)
+# is only used when neither special case below applies.
+if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
+    CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20))
+elif [[ "$ISL" == "8192" || "$OSL" == "8192" ]]; then
+    CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200))
+else
+    CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240}
+fi
+
+# Create config.yaml (unquoted EOF so $CALCULATED_MAX_MODEL_LEN is expanded)
+cat > config.yaml << EOF
+kv-cache-dtype: fp8
+async-scheduling: true
+no-enable-prefix-caching: true
+max-num-batched-tokens: 8192
+max-model-len: $CALCULATED_MAX_MODEL_LEN
+EOF
+
+SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
+PORT=$(( 8888 + PORT_OFFSET ))
+
+export TORCH_CUDA_ARCH_LIST="9.0"
+
+# Launch the vLLM server in the background; all of its output goes to $SERVER_LOG.
+PYTHONNOUSERSITE=1 vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" --config config.yaml \
+ --gpu-memory-utilization 0.9 --tensor-parallel-size "$TP" --max-num-seqs "$CONC" \
+ --disable-log-requests > "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+
+# Echo the server log until startup completes. --pid makes tail exit once the
+# server process is gone, so a crashed launch fails the job instead of blocking
+# on the log forever.
+set +x
+server_ready=0
+while IFS= read -r line; do
+    printf '%s\n' "$line"
+    if [[ "$line" == *"Application startup complete"* ]]; then
+        server_ready=1
+        break
+    fi
+done < <(tail -F -n0 --pid="$SERVER_PID" "$SERVER_LOG")
+
+if (( ! server_ready )); then
+    echo "Server exited before startup completed; see $SERVER_LOG" >&2
+    exit 1
+fi
+
+set -x
+git clone https://github.com/kimbochen/bench_serving.git
+python3 bench_serving/benchmark_serving.py \
+--model "$MODEL" --backend vllm \
+--base-url "http://0.0.0.0:$PORT" \
+--dataset-name random \
+--random-input-len "$ISL" --random-output-len "$OSL" --random-range-ratio "$RANDOM_RANGE_RATIO" \
+--num-prompts "$(( CONC * 10 ))" --max-concurrency "$CONC" \
+--request-rate inf --ignore-eos \
+--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
+--result-dir /workspace/ \
+--result-filename "$RESULT_FILENAME.json"
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+
+# Benchmark TensorRT-LLM serving inside a Slurm job: download the model, start
+# trtllm-serve, wait for it to come up, then run the bench_serving client.
+#
+# === Required Env Vars ===
+# HF_TOKEN
+# HF_HUB_CACHE
+# IMAGE
+# MODEL
+# ISL
+# OSL
+# MAX_MODEL_LEN
+# RANDOM_RANGE_RATIO
+# TP
+# CONC
+# RESULT_FILENAME
+# PORT_OFFSET
+
+echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+
+hf download "$MODEL"
+SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
+PORT=$(( 8888 + PORT_OFFSET ))
+
+# Write llama-config.yml, which is passed to trtllm-serve below.
+# For 1k/1k with TP < 8, also set batch_wait_max_tokens_ratio and
+# batch_wait_timeout_iters (both default to zero), which improves performance.
+{
+    if [[ "$ISL" == "1024" && "$OSL" == "1024" && "$TP" -lt 8 ]]; then
+        printf '%s\n' 'batch_wait_max_tokens_ratio: 0.9' 'batch_wait_timeout_iters: 20'
+    fi
+    cat << 'EOF'
+cuda_graph_config:
+  enable_padding: true
+  max_batch_size: 1024
+kv_cache_config:
+  dtype: fp8
+  enable_block_reuse: false
+stream_interval: 10
+EOF
+} > llama-config.yml
+
+# Launch TRT-LLM server in the background; all of its output goes to $SERVER_LOG.
+mpirun -n 1 --oversubscribe --allow-run-as-root \
+    trtllm-serve "$MODEL" --tp_size "$TP" --trust_remote_code \
+    --max_seq_len "$MAX_MODEL_LEN" --max_num_tokens 16384 \
+    --extra_llm_api_options llama-config.yml --port "$PORT" > "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+
+# Echo the server log until startup completes. --pid makes tail exit once the
+# server process is gone, so a crashed launch fails the job instead of blocking
+# on the log forever.
+set +x
+server_ready=0
+while IFS= read -r line; do
+    printf '%s\n' "$line"
+    if [[ "$line" == *"Application startup complete"* ]]; then
+        server_ready=1
+        break
+    fi
+done < <(tail -F -n0 --pid="$SERVER_PID" "$SERVER_LOG")
+
+if (( ! server_ready )); then
+    echo "Server exited before startup completed; see $SERVER_LOG" >&2
+    exit 1
+fi
+
+set -x
+git clone https://github.com/kimbochen/bench_serving.git
+python3 bench_serving/benchmark_serving.py \
+--model "$MODEL" --backend openai \
+--base-url "http://0.0.0.0:$PORT" \
+--dataset-name random \
+--random-input-len "$ISL" --random-output-len "$OSL" --random-range-ratio "$RANDOM_RANGE_RATIO" \
+--num-prompts "$(( CONC * 10 ))" --max-concurrency "$CONC" \
+--request-rate inf --ignore-eos \
+--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
+--result-dir /workspace/ \
+--result-filename "$RESULT_FILENAME.json"
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+
+# Benchmark vLLM serving on ROCm inside a Slurm job: download the model, patch
+# aiter, start `vllm serve`, wait for it to come up, then run the bench_serving
+# client.
+#
+# === Required Env Vars ===
+# HF_TOKEN
+# HF_HUB_CACHE
+# IMAGE
+# MODEL
+# ISL
+# OSL
+# MAX_MODEL_LEN
+# RANDOM_RANGE_RATIO
+# TP
+# CONC
+# RESULT_FILENAME
+# PORT_OFFSET
+
+echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+
+huggingface-cli download "$MODEL"
+
+SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
+PORT=$(( 8888 + PORT_OFFSET ))
+
+# Reference
+# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark
+
+cat > config.yaml << 'EOF'
+compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}'
+EOF
+
+# Set VLLM_ROCM_USE_AITER_MHA per ISL/OSL combination (and CONC for 8k/1k).
+# Any other combination leaves the variable as inherited from the environment.
+if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
+    export VLLM_ROCM_USE_AITER_MHA=0
+elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
+    export VLLM_ROCM_USE_AITER_MHA=0
+elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
+    if [[ "$CONC" -ge 16 ]]; then
+        export VLLM_ROCM_USE_AITER_MHA=1
+    else
+        export VLLM_ROCM_USE_AITER_MHA=0
+    fi
+fi
+
+# Patch the aiter config script to deal
+# with weird strings reported by /opt/rocm/llvm/bin/amdgpu-arch.
+file_to_patch='/opt/venv/lib/python3.10/site-packages/aiter_meta/csrc/cpp_itfs/utils.py'
+sed -i -e 's#archs = \[arch.strip() for arch in archs\]#archs = \[arch.strip().split(":")\[0\] for arch in archs\]#' "$file_to_patch"
+
+# In this specific case, float16 performs better than the datatype
+# picked by vllm when using auto for --dtype (bfloat16).
+# The server runs in the background; all of its output goes to $SERVER_LOG.
+set -x
+vllm serve "$MODEL" --port="$PORT" \
+--swap-space=64 \
+--gpu-memory-utilization=0.94 \
+--dtype=float16 --kv-cache-dtype=fp8 \
+--distributed-executor-backend=mp --tensor-parallel-size="$TP" \
+--max-model-len="$MAX_MODEL_LEN" \
+--max-seq-len-to-capture="$MAX_MODEL_LEN" \
+--max-num-seqs="$CONC" \
+--max-num-batched-tokens=131072 \
+--no-enable-prefix-caching \
+--config config.yaml \
+--async-scheduling \
+--disable-log-requests \
+> "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+
+# Echo the server log until startup completes. --pid makes tail exit once the
+# server process is gone, so a crashed launch fails the job instead of blocking
+# on the log forever.
+set +x
+server_ready=0
+while IFS= read -r line; do
+    printf '%s\n' "$line"
+    if [[ "$line" == *"Application startup complete"* ]]; then
+        server_ready=1
+        break
+    fi
+done < <(tail -F -n0 --pid="$SERVER_PID" "$SERVER_LOG")
+
+if (( ! server_ready )); then
+    echo "Server exited before startup completed; see $SERVER_LOG" >&2
+    exit 1
+fi
+
+set -x
+git clone https://github.com/kimbochen/bench_serving.git
+python3 bench_serving/benchmark_serving.py \
+--model "$MODEL" --backend vllm \
+--base-url "http://0.0.0.0:$PORT" \
+--dataset-name random \
+--random-input-len "$ISL" --random-output-len "$OSL" --random-range-ratio "$RANDOM_RANGE_RATIO" \
+--num-prompts "$(( CONC * 10 ))" --max-concurrency "$CONC" \
+--request-rate inf --ignore-eos \
+--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
+--result-dir /workspace/ \
+--result-filename "$RESULT_FILENAME.json"
@@ -104,12 +104,6 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \
 set +x
 while IFS= read -r line; do
     printf '%s\n' "$line"
-    if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then
-        sleep 5
-        tail -n100 $SERVER_LOG
-        echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME"
-        exit 1
-    fi
     if [[ "$line" == *"Application startup complete"* ]]; then
         break
     fi