Initial PoC: run the benchmark client inside the server container

cquil11 · cquil11 · commit 167b1f2b2700 · 2025-11-12T14:19:08.000-06:00
diff --git a/benchmarks/gptoss_fp4_h100_docker.sh b/benchmarks/gptoss_fp4_h100_docker.sh
@@ -7,6 +7,9 @@
 # MAX_MODEL_LEN
 # TP
 # CONC
+# ISL
+# OSL
+
 
 cat > config.yaml << EOF
 compilation-config: '{"cudagraph_mode":"PIECEWISE"}'
@@ -18,11 +21,46 @@ max-model-len: 10240
 EOF
 
 export PYTHONNOUSERSITE=1
+# Server output goes to a log file so this script can watch it for readiness.
+SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 
 set -x
 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \
 --config config.yaml \
 --gpu-memory-utilization=0.9 \
 --tensor-parallel-size=$TP \
 --max-num-seqs=$CONC  \
---disable-log-requests
+--disable-log-requests > "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+
+set +x
+# Echo the server log until the server reports readiness. --pid (GNU tail) makes
+# tail exit if the server dies first, so the loop ends instead of blocking forever.
+while IFS= read -r line; do
+    printf '%s\n' "$line"
+    if [[ "$line" =~ Application\ startup\ complete ]]; then
+        break
+    fi
+done < <(tail --pid="$SERVER_PID" -F -n0 "$SERVER_LOG")
+
+if ! kill -0 "$SERVER_PID" 2>/dev/null; then
+    echo "Server launch failed."
+    exit 1
+fi
+
+pip install -q datasets pandas
+git clone https://github.com/kimbochen/bench_serving.git
+set -x
+# Run the benchmark client against the local server. Expansions are quoted
+# directly: this command is not nested inside a `bash -lc "..."` string here.
+python3 bench_serving/benchmark_serving.py \
+--model="$MODEL" \
+--backend=vllm \
+--base-url="http://localhost:$PORT" \
+--dataset-name=random \
+--random-input-len="$ISL" --random-output-len="$OSL" --random-range-ratio="$RANDOM_RANGE_RATIO" \
+--num-prompts=$(( CONC * 10 )) --max-concurrency="$CONC" \
+--request-rate=inf --ignore-eos \
+--save-result --percentile-metrics='ttft,tpot,itl,e2el' \
+--result-dir=/workspace/ \
+--result-filename="$RESULT_FILENAME.json"
diff --git a/benchmarks/gptoss_fp4_h100_slurm.sh b/benchmarks/gptoss_fp4_h100_slurm.sh
@@ -3,7 +3,6 @@
 # === Required Env Vars === 
 # HF_TOKEN
 # HF_HUB_CACHE
-# IMAGE
 # MODEL
 # ISL
 # OSL
diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh
@@ -4,7 +4,7 @@ HF_HUB_CACHE_MOUNT="/home/ubuntu/hf_hub_cache/"
 PORT=8888
 
 server_name="bmk-server"
-client_name="bmk-client"
+# client_name="bmk-client"
 
 set -x
 docker run --rm -d --network=host --name=$server_name \
@@ -17,38 +17,11 @@ docker run --rm -d --network=host --name=$server_name \
 $IMAGE \
 benchmarks/"${EXP_NAME%%_*}_${PRECISION}_h100_docker.sh"
 
-set +x
-while IFS= read -r line; do
-    printf '%s\n' "$line"
-    if [[ "$line" =~ Application\ startup\ complete ]]; then
-        break
-    fi
-done < <(docker logs -f --tail=0 $server_name 2>&1)
-
-if ! docker ps --format "{{.Names}}" | grep -q "$server_name"; then
-    echo "Server container launch failed."
-    exit 1
-fi
-
-git clone https://github.com/kimbochen/bench_serving.git
-
-set -x
-docker run --rm --network=host --name=$client_name \
--v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
--e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \
---entrypoint=/bin/bash \
-$IMAGE \
--lc "pip install -q datasets pandas && \
-python3 bench_serving/benchmark_serving.py \
---model=$MODEL \
---backend=vllm \
---base-url=\"http://localhost:$PORT\" \
---dataset-name=random \
---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \
---num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \
---request-rate=inf --ignore-eos \
---save-result --percentile-metrics='ttft,tpot,itl,e2el' \
---result-dir=/workspace/ \
---result-filename=$RESULT_FILENAME.json"
+# The benchmark client now runs inside the detached container (started by the
+# *_docker.sh entry script), so follow its logs until the container exits.
+# Stopping it right after launch would end the run before results are written.
+docker logs -f "$server_name" 2>&1
 
-docker stop $server_name
+# The container is started with --rm, so it is normally already gone here;
+# this stop is best-effort cleanup and must not fail the job on its own.
+docker stop "$server_name" 2>/dev/null || true