Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2851,6 +2851,28 @@ minimaxm3-fp8-mi355x-vllm-mtp:
- { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }

# MiniMax-M3 MXFP4 on MI355X, served with atom. Recipe:
# https://github.com/ROCm/ATOM/blob/5d42d49f9e4292e5b61475917e92e7ec1b1dacb7/recipes/MiniMax-M3.md
# Block size 128 is mandatory for MSA. TP4 on a single gfx950 node, per the recipe.
minimaxm3-fp4-mi355x-atom:
image: rocm/atom-dev:M3
model: amd/MiniMax-M3-MXFP4
model-prefix: minimaxm3
runner: mi355x
precision: fp4
framework: atom
multinode: false
scenarios:
fixed-seq-len:
# Both sequence-length shapes below share one search-space entry:
# tp 4 with conc-start 1 and conc-end 128.
- isl: 1024
osl: 1024
search-space:
- { tp: 4, conc-start: 1, conc-end: 128 }
- isl: 8192
osl: 1024
search-space:
- { tp: 4, conc-start: 1, conc-end: 128 }

# MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and
# MI355X serving shape, but retain the default BF16 KV cache because this
# checkpoint lacks calibrated ROCm FP8 attention scales. Use the TP8-only H100
Expand Down
89 changes: 89 additions & 0 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#!/usr/bin/env bash
#
# Fixed-sequence-length serving benchmark for MiniMax-M3 MXFP4 on MI355X using
# the atom OpenAI-compatible server. All configuration arrives through the
# environment variables listed in the check_env_vars call below; the helper
# functions come from benchmark_lib.sh.

source "$(dirname "$0")/../../benchmark_lib.sh"

# Every variable the script expands must be listed here. PORT is included
# because the server launch, readiness probe, benchmark client and eval step
# all expand it and nothing in this script assigns it; without the check an
# unset PORT would silently produce a malformed server command line.
# NOTE(review): check_env_vars is defined in benchmark_lib.sh; it presumably
# aborts when a listed variable is missing - confirm.
check_env_vars \
    MODEL \
    PORT \
    TP \
    CONC \
    ISL \
    OSL \
    RANDOM_RANGE_RATIO \
    RESULT_FILENAME \
    EP_SIZE \
    DP_ATTENTION \
    MAX_MODEL_LEN

# Under Slurm, record which job and node produced this run.
if [[ -n "$SLURM_JOB_ID" ]]; then
    echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"

# One OpenMP thread per process.
export OMP_NUM_THREADS=1

# The background server's stdout and stderr are redirected to this file.
SERVER_LOG=/workspace/server.log

# MAX_MODEL_LEN normally comes from the matrix (isl + osl + 256). Eval-only
# jobs need a larger window for the eval prompts, so in that mode the value
# prepared by setup_eval_context replaces it.
case "${EVAL_ONLY}" in
    true)
        setup_eval_context
        MAX_MODEL_LEN="${EVAL_MAX_MODEL_LEN}"
        ;;
esac
Comment thread
cursor[bot] marked this conversation as resolved.

# Optional expert-parallel flag. It is kept in an array so that "no flag"
# expands to zero words, rather than relying on word splitting of an unquoted
# string variable.
EP_ARGS=()
if [ "$EP_SIZE" -gt 1 ]; then
    EP_ARGS+=(--enable-expert-parallel)
fi

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor
MEM_FRAC_STATIC=0.8

set -x

# Flags follow the ATOM MiniMax-M3 MXFP4 recipe (FP4 on 4xMI355 section):
# https://github.com/ROCm/ATOM/blob/5d42d49f9e4292e5b61475917e92e7ec1b1dacb7/recipes/MiniMax-M3.md
# --block-size 128 is mandatory for MiniMax MSA. KV cache is left at the default
# dtype: amd/MiniMax-M3-MXFP4 ships no calibrated FP8 KV scales, so
# --kv_cache_dtype fp8 trips an assertion (k_scale is None) in the MSA
# fused_qknorm kernel during init.
#
# The server runs in the background with stdout and stderr captured in
# $SERVER_LOG. Every expansion is quoted so each value reaches the server as
# exactly one argument.
python3 -m atom.entrypoints.openai_server \
    --model "$MODEL" \
    --server-port "$PORT" \
    -tp "$TP" \
    --max-model-len "$MAX_MODEL_LEN" "${EP_ARGS[@]}" \
    --block-size 128 \
    --gpu-memory-utilization "$MEM_FRAC_STATIC" \
    --trust-remote-code \
    > "$SERVER_LOG" 2>&1 &

# PID of the background server, handed to the readiness check.
SERVER_PID=$!

# Hold here until the server is ready; the helper is given the port, the log
# file and the server PID.
wait_for_server_ready \
    --port "$PORT" \
    --server-log "$SERVER_LOG" \
    --server-pid "$SERVER_PID"

export PYTHONDONTWRITEBYTECODE=1

# The client sends ten prompts per unit of concurrency.
num_prompts=$((CONC * 10))

benchmark_args=(
    --model "$MODEL"
    --port "$PORT"
    --backend vllm
    --input-len "$ISL"
    --output-len "$OSL"
    --random-range-ratio "$RANDOM_RANGE_RATIO"
    --num-prompts "$num_prompts"
    --max-concurrency "$CONC"
    --result-filename "$RESULT_FILENAME"
    --result-dir /workspace/
    --trust-remote-code
)
run_benchmark_serving "${benchmark_args[@]}"

# Accuracy evaluation is opt-in: it runs after the throughput benchmark, against
# the same server port, only when RUN_EVAL is exactly "true".
if [[ "${RUN_EVAL}" == "true" ]]; then
    run_eval --framework lm-eval --port "$PORT"
    append_lm_eval_summary
fi

# Stop the GPU monitor started before the server launch, then end command
# tracing.
stop_gpu_monitor
set +x
8 changes: 7 additions & 1 deletion perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3918,4 +3918,10 @@
- "This issue is now fixed in the latest TRTLLM release."
- "Also update all configs for DSR1 TRTLLM FP8 to reflect latest released image usage"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1767


- config-keys:
- minimaxm3-fp4-mi355x-atom
description:
- "Add day-zero MiniMax-M3 MXFP4 (amd/MiniMax-M3-MXFP4) single-node atom benchmark on MI355X, following the ROCm/ATOM MiniMax-M3 recipe (TP4, block size 128 for MSA, default KV cache dtype)."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1812

5 changes: 3 additions & 2 deletions runners/launch_mi355x-amds.sh
Original file line number Diff line number Diff line change
Expand Up @@ -242,8 +242,9 @@ else
fi

# MiniMax-M3 weights are not staged on the node-local /var/lib NVMe cache;
# they are pre-downloaded once to the NFS share instead. Covers both the
# MiniMaxAI MXFP8 checkpoint and the amd MXFP4 atom checkpoint.
if [[ "$MODEL" == MiniMaxAI/MiniMax-M3* || "$MODEL" == amd/MiniMax-M3* ]]; then
    export HF_HUB_CACHE_MOUNT="/it-share/hf-hub-cache/"
fi

Expand Down