Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/configs/runners.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ mi300x:
- 'mi300x-amds_06'
- 'mi300x-amds_07'
- 'mi300x-amds_08'
- 'mi300x-tw_00'
mi300x-disagg:
- 'mi300x-amds_06'
- 'mi300x-amds_07'
Expand Down
7 changes: 7 additions & 0 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@ SERVER_LOG=/workspace/server.log
# Engine readiness timeout; presumably in seconds given the _S suffix — TODO confirm.
export VLLM_ENGINE_READY_TIMEOUT_S=3600
export VLLM_USE_BREAKABLE_CUDAGRAPH=0

# Master AITER toggle. Per the perf-changelog entry for this config, the
# per-component AITER flags (MoE, linear, RMSNorm, FP8 batched-GEMM) default to
# True and are gated behind this flag, so they are intentionally left unset.
export VLLM_ROCM_USE_AITER=1
# Keep attention off the AITER MHA path (this flag defaults to True): per the
# perf-changelog, the MXFP8 checkpoint lacks calibrated q/prob scales for ROCm
# FP8 attention, so attention stays on TRITON_ATTN.
export VLLM_ROCM_USE_AITER_MHA=0

# MI300X runtime knobs described in the perf-changelog as AMD-recommended and
# numerically inert. The ${VAR:-default} form lets a value already present in
# the caller's environment win over the default chosen here.
export TORCH_BLAS_PREFER_HIPBLASLT=1
export NCCL_MIN_NCHANNELS="${NCCL_MIN_NCHANNELS:-112}"
export GPU_MAX_HW_QUEUES="${GPU_MAX_HW_QUEUES:-2}"

# Eval-only runs call setup_eval_context (defined outside this hunk) first.
if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
fi
Expand Down
8 changes: 8 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3900,3 +3900,11 @@
description:
- "Add Qwen3.5-397B-A17B-NVFP4 B200 single-node TensorRT-LLM benchmark (1k/1k and 8k/1k) with a TP/TEP/DEP parallelism sweep"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1711

- config-keys:
- minimaxm3-fp8-mi300x-vllm
description:
- "Enable AITER kernels for MiniMax-M3 MXFP8 on MI300X/gfx942 via the single master toggle VLLM_ROCM_USE_AITER=1: the stock image left it unset, so the hot decode GEMMs and fused MoE ran on the generic kernels. The per-component AITER flags (MoE, linear, RMSNorm, FP8 batched-GEMM) default to True and are gated behind the master flag, so they are left at their defaults. Keep attention on TRITON_ATTN (VLLM_ROCM_USE_AITER_MHA=0, which defaults to True) because the MXFP8 checkpoint lacks calibrated q/prob scales for ROCm FP8 attention."
- "Add AMD-recommended, numerically-inert MI300X runtime knobs: TORCH_BLAS_PREFER_HIPBLASLT=1, NCCL_MIN_NCHANNELS=112 (raises RCCL channels above the ~32-64 default for TP8), GPU_MAX_HW_QUEUES=2 (caps HIP streams below the default of 4)."
- "Measured uplift on 8xMI300X, 1k1k random sweep (total tok/s/gpu): conc256 782.7->856.1 (+9.4%), conc128 598.9->637.0 (+6.4%), conc64 365.1->392.0 (+7.4%), conc32 295.6->327.4 (+10.8%), conc16 203.1->216.5 (+6.6%), conc8 127.6->136.6 (+7.1%), conc4 80.1->84.6 (+5.6%); conc1-2 unchanged (latency-bound). GSM8K exact-match holds at ~0.95 (kernel-selection change only)."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1808
45 changes: 45 additions & 0 deletions runners/launch_mi300x-tw.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env bash
#
# Launch a single-node MI300X benchmark through Slurm.
#
# Steps:
#   1. Allocate a node with salloc (--no-shell) and capture the job id.
#   2. Inside the allocation, import $IMAGE into a shared enroot squash file,
#      serialized with flock so concurrent runners do not clobber each other.
#   3. Run the matching benchmarks/single_node/... script inside the container.
#   4. Release the allocation.
#
# Required environment: IMAGE, TP, EXP_NAME, PRECISION.
# Also read: RUNNER_NAME, SPEC_DECODING, SCENARIO_SUBDIR, GITHUB_WORKSPACE,
# HF_HUB_CACHE.

export HF_HUB_CACHE_MOUNT="/home/cam/gharunners/hf-hub-cache/"
export PORT=8888

# Fail before a node is allocated if an input that shapes the allocation,
# the image name, or the benchmark script path is missing.
: "${IMAGE:?IMAGE must be set}"
: "${TP:?TP must be set}"
: "${EXP_NAME:?EXP_NAME must be set}"
: "${PRECISION:?PRECISION must be set}"

PARTITION="compute"
# Flatten the image reference into a filename: '/', ':', '@' and '#' become '_'.
SQUASH_FILE="/home/cam/gharunners/squash/${IMAGE//[\/:@#]/_}.sqsh"
LOCK_FILE="${SQUASH_FILE}.lock"

# Route spec-decoding=mtp configs to the _mtp benchmark script (parity with
# the h200 launchers, which have carried SPEC_SUFFIX since #392).
SPEC_SUFFIX=''
if [[ "${SPEC_DECODING:-}" == "mtp" ]]; then
  SPEC_SUFFIX='_mtp'
fi

# Release the Slurm allocation once. Safe to call repeatedly: JOB_ID is cleared
# before scancel runs, so the EXIT trap becomes a no-op after an explicit call.
JOB_ID=''
release_allocation() {
  if [[ -n "$JOB_ID" ]]; then
    local job_id=$JOB_ID
    JOB_ID=''
    scancel "$job_id"
  fi
}
# Make sure an interrupted or failed run does not leave the allocation behind.
trap release_allocation EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

set -x

# No node list or exclusion is passed; Slurm picks any node in the partition.
# salloc's output is mirrored to stderr for the log while the job id is
# extracted from its "Granted job allocation N" line.
JOB_ID=$(salloc --partition="$PARTITION" --gres="gpu:$TP" --cpus-per-task=256 \
  --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 \
  | tee /dev/stderr \
  | grep -oP 'Granted job allocation \K[0-9]+')

if [[ -z "$JOB_ID" ]]; then
  echo "ERROR: salloc failed to allocate a job" >&2
  exit 1
fi

# Use flock to serialize concurrent imports to the same squash file. Paths and
# the image name are passed as positional arguments rather than interpolated
# into the script text, so unusual characters cannot alter the remote command.
import_script='
  lock_file=$1
  squash_file=$2
  image=$3
  exec 9>"$lock_file"
  flock -w 600 9 || { echo "Failed to acquire lock for $squash_file" >&2; exit 1; }
  if unsquashfs -l "$squash_file" > /dev/null 2>&1; then
    echo "Squash file already exists and is valid, skipping import"
  else
    rm -f -- "$squash_file"
    enroot import -o "$squash_file" "docker://$image"
  fi
'
if ! srun --jobid="$JOB_ID" --job-name="$RUNNER_NAME" \
  bash -c "$import_script" _ "$LOCK_FILE" "$SQUASH_FILE" "$IMAGE"; then
  # Without a usable squash file the container launch below cannot succeed;
  # stop here (the EXIT trap releases the allocation).
  echo "ERROR: failed to prepare squash file $SQUASH_FILE" >&2
  exit 1
fi

srun --jobid="$JOB_ID" \
  --container-image="$SQUASH_FILE" \
  --container-mounts="$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE,/dev/kfd:/dev/kfd,/dev/dri:/dev/dri" \
  --container-mount-home \
  --container-writable \
  --container-remap-root \
  --container-workdir=/workspace/ \
  --no-container-entrypoint --export=ALL \
  bash "benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi300x${SPEC_SUFFIX}.sh"

# NOTE(review): as before, the script's exit status is scancel's, not the
# benchmark's. Consider propagating the benchmark status once it is confirmed
# that callers do not rely on the current behaviour.
release_allocation