Skip to content

Commit 9031110

Browse files
Oseltamivir and codex committed
perf(mi300x): use load-time block FP8 MoE conversion
Co-authored-by: OpenAI Codex <codex@openai.com>
Signed-off-by: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
1 parent ba5879b commit 9031110

6 files changed

Lines changed: 520 additions & 9 deletions

File tree

.github/configs/amd-master.yaml

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2851,10 +2851,11 @@ minimaxm3-fp8-mi355x-vllm-mtp:
2851 2851
- { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
2852 2852
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
2853 2853

2854-
# MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and
2855-
# MI355X serving shape, but retain the default BF16 KV cache because this
2856-
# checkpoint lacks calibrated ROCm FP8 attention scales. Use the TP8-only H100
2857-
# search space: TP8 for latency and TP8+EP8 (TEP) at high concurrency.
2854+
# MiniMax-M3 MXFP8 MI300X recipe. Convert the checkpoint's MXFP8 MoE weights to
2855+
# 128x128 block FP8 at load time and use the regular Triton block-FP8 backend.
2856+
# Retain the default BF16 KV cache because this checkpoint lacks calibrated
2857+
# ROCm FP8 attention scales. Use TP8 for latency and TP8+EP8 at high
2858+
# concurrency.
2858 2859
minimaxm3-fp8-mi300x-vllm:
2859 2860
image: vllm/vllm-openai-rocm:minimax-m3
2860 2861
model: MiniMaxAI/MiniMax-M3-MXFP8

benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh

Lines changed: 29 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,12 @@
1 1
#!/usr/bin/env bash
2 2

3 3
# MiniMax-M3 MXFP8 MI300X (gfx942) single-node vLLM recipe.
4-
# Reuses the dedicated ROCm image and the MI355X serving shape. Block size 128
5-
# is mandatory for MSA sparse attention. Keep the default BF16 KV cache on
6-
# gfx942: the checkpoint has no calibrated q/prob scales for ROCm FP8
7-
# attention, and vLLM's fallback scale of 1.0 corrupts model accuracy.
4+
# Reuses the dedicated ROCm image and converts MXFP8 MoE weights to 128x128
5+
# block FP8 at load time. Block size 128 is mandatory for MSA sparse attention.
6+
# Keep the default BF16 KV cache on gfx942: the checkpoint has no calibrated
7+
# q/prob scales for ROCm FP8 attention, and vLLM's fallback scale of 1.0
8+
# corrupts model accuracy.
9+
# Target image vLLM revision: 4a560dd8db67c270f5e2afb614558271b76f2294.
8 10

9 11
source "$(dirname "$0")/../../benchmark_lib.sh"
10 12

@@ -24,6 +26,29 @@ if [[ -n "$SLURM_JOB_ID" ]]; then
24 26
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
25 27
fi
26 28

29+
VLLM_PACKAGE_ROOT="$(
30+
python - <<'PY'
31+
from pathlib import Path
32+
33+
import vllm
34+
35+
print(Path(vllm.__file__).resolve().parent.parent)
36+
PY
37+
)"
38+
MXFP8_PATCH="$(dirname "$0")/minimaxm3_mi300x_mxfp8.patch"
39+
MXFP8_ORACLE="$VLLM_PACKAGE_ROOT/vllm/model_executor/layers/fused_moe/oracle/mxfp8.py"
40+
MXFP8_MARKER="MXFP8 MoE weights will be converted to 128x128 block FP8"
41+
if ! grep -q "$MXFP8_MARKER" "$MXFP8_ORACLE"; then
42+
if ! patch --batch --forward -d "$VLLM_PACKAGE_ROOT" -p1 < "$MXFP8_PATCH"; then
43+
echo "Failed to apply the MI300X MXFP8 patch" >&2
44+
exit 1
45+
fi
46+
fi
47+
if ! grep -q "$MXFP8_MARKER" "$MXFP8_ORACLE"; then
48+
echo "MI300X MXFP8 conversion marker is missing after patching" >&2
49+
exit 1
50+
fi
51+
27 52
if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
28 53

29 54
if [ -n "$ROCR_VISIBLE_DEVICES" ]; then

0 commit comments

Comments
 (0)