#!/usr/bin/env bash
#
# Launch an SGLang server for $MODEL, run the serving benchmark against it
# and, optionally, an lm-eval pass.
#
# Required environment variables (validated by check_env_vars):
#   MODEL               model identifier passed to `hf download` and the server
#   TP                  value for the server's --tp flag
#   EP_SIZE             value for the server's --expert-parallel-size flag
#   CONC                benchmark concurrency; also used for --cuda-graph-max-bs
#   ISL, OSL            input / output sequence lengths (integers)
#   RANDOM_RANGE_RATIO  forwarded to the benchmark's --random-range-ratio
#   RESULT_FILENAME     benchmark result file name (written under /workspace/)
#
# Optional environment variables:
#   PORT                server port (default: 8888)
#   RUN_EVAL            set to "true" to run the evaluation step afterwards
#   SLURM_JOB_ID, SLURMD_NODENAME   only echoed for job bookkeeping
#
# Helpers check_env_vars, wait_for_server_ready, run_benchmark_serving,
# run_eval and append_lm_eval_summary are expected to come from
# ../benchmark_lib.sh (not visible here).

# Fail fast if the helper library is missing: every later step depends on it,
# and without this guard the script would carry on with "command not found".
_benchmark_lib="$(dirname -- "$0")/../benchmark_lib.sh"
if [[ ! -r "$_benchmark_lib" ]]; then
  echo "ERROR: cannot read helper library: $_benchmark_lib" >&2
  exit 1
fi
# shellcheck source=/dev/null
source "$_benchmark_lib"

check_env_vars \
  MODEL \
  TP \
  CONC \
  ISL \
  OSL \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME \
  EP_SIZE

# ${VAR:-} keeps this test safe even if the sourced library enables nounset.
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

# Record GPU state in the job log before anything is loaded.
nvidia-smi

# Download the model repository with the hf CLI before starting the server.
hf download "$MODEL"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}
# Context length = prompt + completion + 20 tokens of headroom.
MAX_SEQ_LEN=$((ISL + OSL + 20))

echo "CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN"

# Server flags, grouped by concern. Values and order match the original
# single command line.
server_args=(
  # Model and endpoint
  --model "$MODEL"
  --host 0.0.0.0
  --port "$PORT"
  # Parallelism
  --tp "$TP"
  --expert-parallel-size "$EP_SIZE"
  # Output parsers
  --reasoning-parser qwen3
  --tool-call-parser qwen3_coder
  # Scheduling, memory and kernels
  --enable-flashinfer-allreduce-fusion
  --max-running-requests 128
  --chunked-prefill-size 16384
  --decode-log-interval 1
  --mem-fraction-static 0.8
  --cuda-graph-max-bs "$CONC"
  --context-length "$MAX_SEQ_LEN"
  # Precision
  --kv-cache-dtype fp8_e4m3
  --quantization fp8
  --attention-backend flashinfer
  --stream-interval 50
  --tokenizer-worker-num 6
  --mamba-ssm-dtype bfloat16
  --disable-radix-cache
  --trust-remote-code
)

# Trace the commands from here on so the exact invocations land in the log.
set -x
python3 -m sglang.launch_server "${server_args[@]}" > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# Python packages installed just before the benchmark client runs.
pip install -q datasets pandas

run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend vllm \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts "$((CONC * 10))" \
  --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" \
  --result-dir /workspace/

# After throughput, run evaluation only if RUN_EVAL is true
if [[ "${RUN_EVAL:-}" == "true" ]]; then
  # CONC is quoted so it is always passed as exactly one argument.
  run_eval --framework lm-eval --port "$PORT" --concurrent-requests "$CONC"
  append_lm_eval_summary
fi
set +x
0 commit comments