#!/usr/bin/env bash
22
# MiniMax-M3 MXFP8 MI300X (gfx942) single-node vLLM recipe.
# Reuses the dedicated ROCm image and converts MXFP8 MoE weights to 128x128
# block FP8 at load time. Block size 128 is mandatory for MSA sparse attention.
# Keep the default BF16 KV cache on gfx942: the checkpoint has no calibrated
# q/prob scales for ROCm FP8 attention, and vLLM's fallback scale of 1.0
# corrupts model accuracy.
# Target image vLLM revision: 4a560dd8db67c270f5e2afb614558271b76f2294.
810
# Load the shared benchmark helpers, resolved relative to this script's
# location.
source "$(dirname -- "$0")/../../benchmark_lib.sh"
1012
@@ -24,6 +26,29 @@ if [[ -n "$SLURM_JOB_ID" ]]; then
2426 echo " JOB $SLURM_JOB_ID running on $SLURMD_NODENAME "
2527fi
2628
# Apply the MI300X MXFP8 patch to the installed vLLM package, once.
#
# The step is idempotent: the patch is applied only when the marker string is
# not yet present in the MXFP8 oracle module, and the marker is re-checked
# afterwards so a patch that applied without adding it still fails the run.

# Directory that contains the installed `vllm` package (the parent of the
# package directory), i.e. the root that `patch -p1` paths are relative to.
# Fail here if Python cannot import vllm, instead of continuing with an empty
# root and reporting a misleading patch failure later.
if ! VLLM_PACKAGE_ROOT="$(
python - <<'PY'
from pathlib import Path

import vllm

print(Path(vllm.__file__).resolve().parent.parent)
PY
)" || [[ -z "$VLLM_PACKAGE_ROOT" ]]; then
  echo "Failed to locate the installed vllm package" >&2
  exit 1
fi

MXFP8_PATCH="$(dirname -- "$0")/minimaxm3_mi300x_mxfp8.patch"
MXFP8_ORACLE="$VLLM_PACKAGE_ROOT/vllm/model_executor/layers/fused_moe/oracle/mxfp8.py"
MXFP8_MARKER="MXFP8 MoE weights will be converted to 128x128 block FP8"

# The marker is a literal sentence, so match it as a fixed string (-F).
# No existence check on the oracle file is added here: a missing file simply
# counts as "not patched yet" and falls through to the patch step.
if ! grep -qF -- "$MXFP8_MARKER" "$MXFP8_ORACLE"; then
  # --batch: never prompt; --forward: do not offer to reverse a patch that
  # looks already applied. The patch file is opened by this shell before
  # `patch` changes into the -d directory, so a relative path still resolves.
  if ! patch --batch --forward -d "$VLLM_PACKAGE_ROOT" -p1 < "$MXFP8_PATCH"; then
    echo "Failed to apply the MI300X MXFP8 patch" >&2
    exit 1
  fi
fi

# Confirm the patched source now carries the conversion marker.
if ! grep -qF -- "$MXFP8_MARKER" "$MXFP8_ORACLE"; then
  echo "MI300X MXFP8 conversion marker is missing after patching" >&2
  exit 1
fi
51+
# Fetch the model with `hf download` unless MODEL is an absolute path
# (presumably an already-local checkpoint directory; confirm with callers).
if [[ "$MODEL" != /* ]]; then
  hf download "$MODEL"
fi
2853
2954if [ -n " $ROCR_VISIBLE_DEVICES " ]; then
0 commit comments