Skip to content
22 changes: 22 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2851,6 +2851,28 @@ minimaxm3-fp8-mi355x-vllm-mtp:
- { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }

# MiniMax-M3 MXFP4 on MI355X, following the ATOM recipe:
# https://github.com/ROCm/ATOM/blob/5d42d49f9e4292e5b61475917e92e7ec1b1dacb7/recipes/MiniMax-M3.md
# Block size 128 is mandatory for MSA. Per the recipe, the model runs at TP4 on
# a single gfx950 node, so every search-space entry below pins tp: 4.
minimaxm3-fp4-mi355x-atom:
image: rocm/atom-dev:M3
model: amd/MiniMax-M3-MXFP4
model-prefix: minimaxm3
runner: mi355x
precision: fp4
framework: atom
multinode: false
scenarios:
# Two fixed-length scenarios: short input (isl 1024) and long input (isl 8192),
# both with osl 1024 and an identical search space.
# NOTE(review): conc-start/conc-end presumably bound the concurrency sweep
# (1 through 128) -- confirm against the matrix generator.
fixed-seq-len:
- isl: 1024
osl: 1024
search-space:
- { tp: 4, conc-start: 1, conc-end: 128 }
- isl: 8192
osl: 1024
search-space:
- { tp: 4, conc-start: 1, conc-end: 128 }

# MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and
# MI355X serving shape, but retain the default BF16 KV cache because this
# checkpoint lacks calibrated ROCm FP8 attention scales. Use the TP8-only H100
Expand Down
89 changes: 89 additions & 0 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#!/usr/bin/env bash
#
# Single-node fixed-seq-len benchmark: MiniMax-M3 MXFP4 served by ATOM on MI355X.
#
# Launches the ATOM OpenAI-compatible server in the background, waits until it
# is ready, runs the serving benchmark against it and, when RUN_EVAL=true,
# follows up with an lm-eval run.
#
# Required environment:
#   MODEL, PORT, TP, CONC, ISL, OSL, RANDOM_RANGE_RATIO, RESULT_FILENAME,
#   EP_SIZE, DP_ATTENTION, MAX_MODEL_LEN
# Optional environment:
#   EVAL_ONLY, RUN_EVAL, SLURM_JOB_ID, SLURMD_NODENAME
#
# Helper functions (check_env_vars, setup_eval_context, start_gpu_monitor,
# wait_for_server_ready, run_benchmark_serving, run_eval,
# append_lm_eval_summary, stop_gpu_monitor) come from benchmark_lib.sh.

source "$(dirname "$0")/../../benchmark_lib.sh"

check_env_vars \
  MODEL \
  TP \
  CONC \
  ISL \
  OSL \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME \
  EP_SIZE \
  DP_ATTENTION \
  MAX_MODEL_LEN

# PORT feeds the server, the readiness probe, the benchmark client and the
# eval run. Fail fast here instead of letting an empty value shift the
# server's argument list.
: "${PORT:?PORT must be set (server, readiness probe and benchmark client all use it)}"

if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

# DP_ATTENTION is validated and logged only; this script passes no flag for it.
echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"

SERVER_LOG=/workspace/server.log

export OMP_NUM_THREADS=1

# Use the matrix-supplied MAX_MODEL_LEN (isl + osl + 256). Eval-only jobs need a
# larger window for the eval prompts, so override it from the eval context.
# NOTE(review): EVAL_MAX_MODEL_LEN is presumably set by setup_eval_context --
# confirm in benchmark_lib.sh.
if [ "${EVAL_ONLY}" = "true" ]; then
  setup_eval_context
  MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
fi

# Optional expert-parallel flag, kept in an array so that "no flag" expands to
# zero words rather than relying on word-splitting of a blank string.
ep_args=()
if [ "$EP_SIZE" -gt 1 ]; then
  ep_args+=(--enable-expert-parallel)
fi

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor
MEM_FRAC_STATIC=0.8

set -x

# Flags follow the ATOM MiniMax-M3 MXFP4 recipe (FP4 on 4xMI355 section):
# https://github.com/ROCm/ATOM/blob/5d42d49f9e4292e5b61475917e92e7ec1b1dacb7/recipes/MiniMax-M3.md
# --block-size 128 is mandatory for MiniMax MSA. KV cache is left at the default
# dtype: amd/MiniMax-M3-MXFP4 ships no calibrated FP8 KV scales, so
# --kv_cache_dtype fp8 trips an assertion (k_scale is None) in the MSA
# fused_qknorm kernel during init.
python3 -m atom.entrypoints.openai_server \
  --model "$MODEL" \
  --server-port "$PORT" \
  -tp "$TP" \
  --max-model-len "$MAX_MODEL_LEN" "${ep_args[@]}" \
  --block-size 128 \
  --gpu-memory-utilization "$MEM_FRAC_STATIC" \
  --trust-remote-code \
  > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

export PYTHONDONTWRITEBYTECODE=1
run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend vllm \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts "$((CONC * 10))" \
  --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" \
  --result-dir /workspace/ \
  --trust-remote-code

# After throughput, run evaluation only if RUN_EVAL is true
if [ "${RUN_EVAL}" = "true" ]; then
  run_eval --framework lm-eval --port "$PORT"
  append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
set +x
7 changes: 7 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3918,6 +3918,13 @@
- "This issue is now fixed in the latest TRTLLM release."
- "Also update all configs for DSR1 TRTLLM FP8 to reflect latest released image usage"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1767

- config-keys:
- minimaxm3-fp4-mi355x-atom
description:
- "Add day-zero MiniMax-M3 MXFP4 (amd/MiniMax-M3-MXFP4) single-node atom benchmark on MI355X, following the ROCm/ATOM MiniMax-M3 recipe (TP4, block size 128 for MSA, default KV cache dtype)."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1813


- config-keys:
- glm5-fp4-gb300-dynamo-trt
Expand Down
5 changes: 3 additions & 2 deletions runners/launch_mi355x-amds.sh
Original file line number Diff line number Diff line change
Expand Up @@ -242,8 +242,9 @@ else
fi

# MiniMax-M3 weights are not staged on the node-local /var/lib NVMe cache;
# they are pre-downloaded once to the NFS share instead. Covers both the
# MiniMaxAI MXFP8 checkpoint and the amd MXFP4 atom checkpoint.
if [[ "$MODEL" == MiniMaxAI/MiniMax-M3* || "$MODEL" == amd/MiniMax-M3* ]]; then
  # Point the HF hub cache mount at the NFS share for these checkpoints.
  export HF_HUB_CACHE_MOUNT="/it-share/hf-hub-cache/"
fi

Expand Down