SemiAnalysisAI
diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
Lines changed: 50 additions & 12 deletions
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
Lines changed: 4 additions & 4 deletions
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
Lines changed: 2 additions & 0 deletions
diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
Lines changed: 10 additions & 4 deletions
diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
Lines changed: 1 addition & 0 deletions
diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh
Lines changed: 28 additions & 9 deletions
diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh
Lines changed: 1 addition & 34 deletions
diff --git a/benchmarks/single_node/glm5_fp4_b300.sh b/benchmarks/single_node/glm5_fp4_b300.sh
Lines changed: 0 additions & 7 deletions
diff --git a/benchmarks/single_node/glm5_fp4_b300_mtp.sh b/benchmarks/single_node/glm5_fp4_b300_mtp.sh
Lines changed: 0 additions & 6 deletions
@@ -1611,7 +1611,7 @@ minimaxm2.5-fp8-mi325x-vllm-disagg:
           - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
 
 dsr1-fp4-mi355x-sglang-disagg:
-  image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501
+  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1800,6 +1800,25 @@ dsr1-fp4-mi355x-sglang-disagg:
           - "DECODE_NODES=2"
           - "DECODE_MTP_SIZE=0"
 
+      # 1*DEP8 + 1*DEP8
+      - spec-decoding: "none"
+        conc-list: [ 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
+
       # 2*DEP8 + 1*DEP8
       - spec-decoding: "none"
         conc-list: [ 1024, 2048, 4096 ]
@@ -1820,7 +1839,7 @@ dsr1-fp4-mi355x-sglang-disagg:
           - "DECODE_MTP_SIZE=0"
 
 dsr1-fp4-mi355x-sglang-disagg-mtp:
-  image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501
+  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1990,24 +2009,43 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
           - "DECODE_NODES=2"
           - "DECODE_MTP_SIZE=2"
 
-      # 1P2D TP4
+      # 1*DEP8 + 1*DEP8
       - spec-decoding: "mtp"
-        conc-list: [ 64, 128, 256 ]
+        conc-list: [ 128, 512 ]
         prefill:
           num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
+          tp: 8
+          ep: 8
+          dp-attn: true
           additional-settings:
           - "PREFILL_NODES=1"
         decode:
-          num-worker: 2
+          num-worker: 1
           tp: 8
-          ep: 1
-          dp-attn: false
+          ep: 8
+          dp-attn: true
           additional-settings:
-          - "DECODE_NODES=2"
-          - "DECODE_MTP_SIZE=2"
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=1"
+
+      # 1*DEP8 + 1*DEP8 (same topology as the previous entry; separate conc-list)
+      - spec-decoding: "mtp"
+        conc-list: [ 64, 256 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=1"
 
       # 2*DEP8 + 1*DEP8
       - spec-decoding: "mtp"
 
@@ -2352,7 +2352,7 @@ glm5-fp4-b200-sglang-mtp:
   # does not have a B300-specific recipe, so this config reuses the existing
   # GLM-5 FP4 B200 SGLang recipe as-is until B300-specific tuning is available.
 glm5-fp4-b300-sglang:
-  image: lmsysorg/sglang:v0.5.12-cu130
+  image: lmsysorg/sglang:v0.5.11-cu130
   model: nvidia/GLM-5-NVFP4
   model-prefix: glm5
   runner: b300
@@ -2373,7 +2373,7 @@ glm5-fp4-b300-sglang:
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
 
 glm5-fp4-b300-sglang-mtp:
-  image: lmsysorg/sglang:v0.5.12-cu130
+  image: lmsysorg/sglang:v0.5.11-cu130
   model: nvidia/GLM-5-NVFP4
   model-prefix: glm5
   runner: b300
@@ -4754,11 +4754,11 @@ minimaxm2.5-fp8-h200-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 4, conc-end: 128 }
+      - { tp: 4, conc-start: 1, conc-end: 256 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 4, conc-end: 128 }
+      - { tp: 4, conc-start: 1, conc-end: 256 }
 
 # Diverged from minimaxm2.5-fp8-h200-vllm (agentic-coding sibling). Metadata is
 # identical to origin/main's minimaxm2.5-fp8-h200-vllm; the split exists because this
 
@@ -15,6 +15,7 @@ mkdir -p "$PYTHONPYCACHEPREFIX" 2>/dev/null || true
 
 GPU_MONITOR_PID=""
 GPU_METRICS_CSV="/workspace/gpu_metrics.csv"
+export GPU_METRICS_CSV
 
 # Start background GPU monitoring that logs metrics every second to CSV.
 # Auto-detects NVIDIA (nvidia-smi) or AMD (amd-smi) GPUs.
@@ -32,6 +33,7 @@ start_gpu_monitor() {
     done
 
     GPU_METRICS_CSV="$output"
+    export GPU_METRICS_CSV
 
     if command -v nvidia-smi &>/dev/null; then
         nvidia-smi --query-gpu=timestamp,index,power.draw,temperature.gpu,clocks.current.sm,clocks.current.memory,utilization.gpu,utilization.memory \
 
@@ -22,10 +22,11 @@ if [[ -z "$IBDEVICES" ]]; then
     DETECTED=$(ibv_devinfo 2>/dev/null | grep "hca_id:" | awk '{print $2}' | paste -sd',')
     if [[ -n "$DETECTED" ]]; then
         export IBDEVICES="$DETECTED"
+        echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES via ibv_devinfo on $(hostname -s)"
     else
-        echo "WARNING: Unable to detect RDMA devices. Set IBDEVICES explicitly." >&2
+        echo "ERROR: Unable to detect RDMA devices. Set IBDEVICES explicitly." >&2
+        exit 1
     fi
-    echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES from hostname $(hostname -s)"
 else
     echo "[INFO] Using IBDEVICES=$IBDEVICES (set by runner or environment)"
 fi
@@ -52,6 +53,10 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then
     # =========================================================================
     # vLLM/Nixl-specific environment
     # =========================================================================
+    # Static vLLM settings (previously exported by setup_vllm_env()).
+    export VLLM_USE_V1=1
+    export VLLM_SERVER_DEV_MODE=0
+    # Workaround: disable request-ID randomization so MoRI-IO connector can
+    # match completion IDs between prefill and decode without PR #34907 patch.
+    export VLLM_DISABLE_REQUEST_ID_RANDOMIZATION=1
+
     set -x
 
     export VLLM_MORIIO_QP_PER_TRANSFER=4
@@ -129,7 +134,8 @@ else
     export SGLANG_USE_AITER=1
 
     export SGLANG_MORI_DISPATCH_DTYPE=auto
-    export SGLANG_MORI_FP8_COMB=true
+    export MORI_COMBINE_DTYPE_PREFILL=fp8_direct_cast
+    export MORI_COMBINE_DTYPE_DECODE=fp8
     export SGLANG_MORI_QP_PER_TRANSFER=4
     export SGLANG_MORI_NUM_WORKERS=4
     export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000
@@ -148,7 +154,7 @@ else
 
     # Enable spec v2
     export SGLANG_ENABLE_SPEC_V2=1
-    export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1
+    export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=0
 
     export SGLANG_LOG_MS=true
     export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32
 
@@ -388,6 +388,7 @@ DOCKER_ENV_COMMON=(
     -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP
     -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP
     -e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE
+    -e IS_MULTINODE=\$IS_MULTINODE
 )
 
 # Engine-specific env vars
 
@@ -33,6 +33,9 @@ BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}"
 BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}"
 BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}"
 
+# Extract the maximum concurrency from the x-delimited list
+BENCH_MAX_CONC_VALUE=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1)
+
 # Dry Run for debugging purpose
 DRY_RUN="${DRY_RUN:-0}"
 
@@ -184,6 +187,15 @@ else
     prefill_enable_two_batch_overlap="false"
 fi
 
+# When both DP and EP are enabled for prefill, override max-running-requests with the
+# max bench concurrency and recompute the MoE max-input-token budget from the DP rank count
+if [[ "$PREFILL_ENABLE_DP" == "true" ]] && [[ "$PREFILL_ENABLE_EP" == "true" ]]; then
+    prefill_max_running_requests=$BENCH_MAX_CONC_VALUE
+    prefill_dp_ranks=$PREFILL_TP_SIZE
+    # MORI_MAX_DISPATCH_TOKENS_PREFILL stays at 8192 (no change)
+    MORI_MOE_MAX_INPUT_TOKENS_PREFILL=$((MORI_MAX_DISPATCH_TOKENS_PREFILL * prefill_dp_ranks / 2))
+    echo "[DP+EP override] Prefill: max-running-requests=$prefill_max_running_requests, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_PREFILL"
+fi
+
 # Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp)
 if [[ "$DECODE_ENABLE_DP" == "true" ]]; then
     decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_DP_START $DECODE_CUDA_GRAPH_BS_DP_END))
@@ -196,6 +208,18 @@ else
     decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP
 fi
 
+# When both DP and EP are enabled for decode, size the server from the largest
+# benchmark concurrency: cap max-running-requests at that value and derive the
+# MoRI dispatch-token and MoE max-input-token budgets from the per-rank share.
+if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$DECODE_ENABLE_EP" == "true" ]]; then
+    decode_max_running_requests=$BENCH_MAX_CONC_VALUE
+    decode_dp_ranks=$DECODE_TP_SIZE
+    # Ceiling division: shell arithmetic truncates, so a concurrency that is not a
+    # multiple of the rank count (or is smaller than it) would otherwise yield an
+    # undersized -- or zero -- per-rank dispatch budget.
+    MORI_MAX_DISPATCH_TOKENS_DECODE=$(( (BENCH_MAX_CONC_VALUE + decode_dp_ranks - 1) / decode_dp_ranks ))
+    MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * decode_dp_ranks * 7 / 10))
+    # Keep the derived threshold in sync with the overridden dispatch budget
+    SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2))
+    export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD
+    echo "[DP+EP override] Decode: max-running-requests=$decode_max_running_requests, DISPATCH_TOKENS=$MORI_MAX_DISPATCH_TOKENS_DECODE, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_DECODE, INTER_KERNEL_SWITCH=$SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD"
+fi
+
 # Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS)
 PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} "
 if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then
@@ -343,11 +367,6 @@ if [[ "${EVAL_ONLY:-false}" == "true" ]] || [[ "${RUN_EVAL:-false}" == "true" ]]
     DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g')
     unset MORI_MOE_MAX_INPUT_TOKENS_PREFILL
     unset MORI_MOE_MAX_INPUT_TOKENS_DECODE
-    # NOTE: that currently with fp8_combine set, the evals do not pass on InferenceX eval harness
-    # or on SGLang native harness for high concurrency 4k and gets no where near the golden score of
-    # 0.95 on even basic GSM8k grade school math as confirmed by @billishyahao from AMD
-    # and as confirmed by @Oseltamivir. This was initally merged with @billishyahao promising 
-    # that an fast follow PR to fix the evals via having quant correction in the fp8 combine
 fi
 
 # =============================================================================
@@ -398,7 +417,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
         PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}"
     fi
     set +x
-    PREFILL_CMD="${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
+    PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
         --model-path $MODEL_DIR/$MODEL_NAME \
         --disaggregation-mode prefill \
         --disaggregation-ib-device ${IBDEVICES} \
@@ -630,7 +649,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
         PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}"
     fi
     set +x
-    PREFILL_CMD="${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
+    PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
         --model-path $MODEL_DIR/${MODEL_NAME} \
         --disaggregation-mode prefill \
         --disaggregation-ib-device ${IBDEVICES} \
@@ -698,7 +717,7 @@ else
         DECODE_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE}"
     fi
     set +x
-    DECODE_CMD="${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
+    DECODE_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_DECODE} ${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
         --model-path ${MODEL_DIR}/${MODEL_NAME} \
         --disaggregation-mode decode \
         --disaggregation-ib-device ${IBDEVICES} \
@@ -758,4 +777,4 @@ else
 fi
 
 echo "Script completed successfully"
-exit 0
+exit 0
@@ -196,34 +196,6 @@ python3 $WS_PATH/sync.py barrier \
     --wait-for-all-ports \
     --timeout 600
 
-# =============================================================================
-# ETCD Server Setup
-# =============================================================================
-
-# echo "Proceeding to start etcd server on $host_name"
-# bash ${WS_PATH}/start_etcd.sh > /dev/null 2>&1 &
-# etcd_pid=$!
-
-# echo "Waiting at etcd server barrier on $host_name"
-# python3 $WS_PATH/sync.py barrier \
-#     --node-ips ${IPADDRS} \
-#     --node-ports 2379 \
-#     --wait-for-all-ports \
-#     --timeout 300
-
-# echo "All etcd servers are up : $host_name"
-# sleep 3
-
-# echo "etcd endpoint health=================="
-# etcdctl endpoint health 2>&1 || /usr/local/bin/etcd/etcdctl endpoint health 2>&1 || true
-# echo "======================================"
-
-# python3 $WS_PATH/sync.py barrier \
-#     --node-ips ${IPADDRS} \
-#     --node-ports 2379 \
-#     --wait-for-all-ports \
-#     --timeout 300
-
 # =============================================================================
 # Cluster Topology Configuration
 # =============================================================================
@@ -246,15 +218,10 @@ echo "Decode  node IPs: ${DECODE_ARGS}"
 # MoRI-IO proxy ZMQ registration port (must match vllm-router --vllm-discovery-address)
 PROXY_PING_PORT="${PROXY_PING_PORT:-36367}"
 
-# vLLM environment (UCX transport vars are set at the Docker level in job.slurm)
+# vLLM runtime environment (static vars moved to env.sh; these depend on per-node state)
 setup_vllm_env() {
-    export VLLM_USE_V1=1
-    export VLLM_SERVER_DEV_MODE=0
     export VLLM_NIXL_SIDE_CHANNEL_HOST=${rdma_ip}
     export VLLM_NIXL_SIDE_CHANNEL_PORT=5600
-    # Workaround: disable request-ID randomization so MoRI-IO connector can
-    # match completion IDs between prefill and decode without PR #34907 patch.
-    export VLLM_DISABLE_REQUEST_ID_RANDOMIZATION=1
     for env_pair in ${MODEL_ENVS}; do
         export "$env_pair"
     done
 
@@ -24,13 +24,6 @@ nvidia-smi
 
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
-# Downgrade flashinfer to the version pinned in sglang v0.5.11 to test the
-# trtllm batched-GEMM regression suspicion from sgl-project/sglang#25563
-# (suggested by @trevor-m). sglang v0.5.12's pyproject.toml moved from
-# flashinfer_python==0.6.8.post1 → 0.6.11.post1, and the trtllm GEMM crash
-# at bs=128 + EAGLE on B300 appeared in the same image bump.
-pip install --no-deps "flashinfer_python==0.6.8.post1" "flashinfer_cubin==0.6.8.post1"
-
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
 
 
@@ -24,12 +24,6 @@ nvidia-smi
 if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 pip install --no-deps "transformers==5.2.0" "huggingface-hub==1.4.1"
-# Downgrade flashinfer to the version pinned in sglang v0.5.11 to test the
-# trtllm batched-GEMM regression suspicion from sgl-project/sglang#25563
-# (suggested by @trevor-m). sglang v0.5.12's pyproject.toml moved from
-# flashinfer_python==0.6.8.post1 → 0.6.11.post1, and the trtllm GEMM crash
-# at bs=128 + EAGLE on B300 appeared in the same image bump.
-pip install --no-deps "flashinfer_python==0.6.8.post1" "flashinfer_cubin==0.6.8.post1"
 
 export SGL_ENABLE_JIT_DEEPGEMM=1
 export SGLANG_ENABLE_SPEC_V2=1
Original file line number	Diff line number	Diff line change
`@@ -388,6 +388,7 @@ DOCKER_ENV_COMMON=(`
`388`	`388`	`-e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP`
`389`	`389`	`-e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP`
`390`	`390`	`-e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE`
	`391`	`+ -e IS_MULTINODE=\$IS_MULTINODE`
`391`	`392`	`)`
`392`	`393`
`393`	`394`	`# Engine-specific env vars`