diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 5999d3f93..1c3446594 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -104,6 +104,7 @@ mi300x: - 'mi300x-amds_06' - 'mi300x-amds_07' - 'mi300x-amds_08' +- 'mi300x-tw_00' mi300x-disagg: - 'mi300x-amds_06' - 'mi300x-amds_07' diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh index f2cdaf284..b6386be4c 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh @@ -34,6 +34,13 @@ SERVER_LOG=/workspace/server.log export VLLM_ENGINE_READY_TIMEOUT_S=3600 export VLLM_USE_BREAKABLE_CUDAGRAPH=0 +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_USE_AITER_MHA=0 + +export TORCH_BLAS_PREFER_HIPBLASLT=1 +export NCCL_MIN_NCHANNELS="${NCCL_MIN_NCHANNELS:-112}" +export GPU_MAX_HW_QUEUES="${GPU_MAX_HW_QUEUES:-2}" + if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context fi diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 1a7eb91e3..fdda04753 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3900,3 +3900,11 @@ description: - "Add Qwen3.5-397B-A17B-NVFP4 B200 single-node TensorRT-LLM benchmark (1k/1k and 8k/1k) with a TP/TEP/DEP parallelism sweep" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1711 + +- config-keys: + - minimaxm3-fp8-mi300x-vllm + description: + - "Enable AITER kernels for MiniMax-M3 MXFP8 on MI300X/gfx942 via the single master toggle VLLM_ROCM_USE_AITER=1: the stock image left it unset, so the hot decode GEMMs and fused MoE ran on the generic kernels. The per-component AITER flags (MoE, linear, RMSNorm, FP8 batched-GEMM) default to True and are gated behind the master flag, so they are left at their defaults. 
Keep attention on TRITON_ATTN (VLLM_ROCM_USE_AITER_MHA=0, which defaults to True) because the MXFP8 checkpoint lacks calibrated q/prob scales for ROCm FP8 attention."
+    - "Add AMD-recommended, numerically-inert MI300X runtime knobs: TORCH_BLAS_PREFER_HIPBLASLT=1, NCCL_MIN_NCHANNELS=112 (raises RCCL channels above the ~32-64 default for TP8), GPU_MAX_HW_QUEUES=2 (caps HIP streams below the default of 4)."
+    - "Measured uplift on 8xMI300X, 1k1k random sweep (total tok/s/gpu): conc256 782.7->856.1 (+9.4%), conc128 598.9->637.0 (+6.4%), conc64 365.1->392.0 (+7.4%), conc32 295.6->327.4 (+10.8%), conc16 203.1->216.5 (+6.6%), conc8 127.6->136.6 (+7.1%), conc4 80.1->84.6 (+5.6%); conc1-2 unchanged (latency-bound). GSM8K exact-match holds at ~0.95 (kernel-selection change only)."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1808
diff --git a/runners/launch_mi300x-tw.sh b/runners/launch_mi300x-tw.sh
new file mode 100755
index 000000000..517f62e71
--- /dev/null
+++ b/runners/launch_mi300x-tw.sh
@@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+#
+# Launcher for the mi300x-tw runners: allocates a Slurm job, makes sure the
+# enroot squash file for $IMAGE exists, runs the selected single-node benchmark
+# script inside that container, then releases the allocation.
+#
+# Environment read here: IMAGE, TP, RUNNER_NAME, SPEC_DECODING, GITHUB_WORKSPACE,
+# HF_HUB_CACHE, SCENARIO_SUBDIR, EXP_NAME, PRECISION.
+
+export HF_HUB_CACHE_MOUNT="/home/cam/gharunners/hf-hub-cache/"
+export PORT=8888
+
+PARTITION="compute"
+SQUASH_FILE="/home/cam/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
+LOCK_FILE="${SQUASH_FILE}.lock"
+
+# Route spec-decoding=mtp configs to the _mtp benchmark script (parity with
+# the h200 launchers, which have carried SPEC_SUFFIX since #392).
+SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
+
+set -x
+
+# No --exclude/--nodelist is passed, so Slurm may place the job on any node in
+# the partition.
+JOB_ID=$(salloc --partition="$PARTITION" --gres="gpu:$TP" --cpus-per-task=256 --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
+
+if [ -z "$JOB_ID" ]; then
+    echo "ERROR: salloc failed to allocate a job" >&2
+    exit 1
+fi
+
+# Use flock to serialize concurrent imports to the same squash file. If the
+# import step fails, release the allocation and stop instead of launching the
+# benchmark without a usable image.
+if ! srun --jobid="$JOB_ID" --job-name="$RUNNER_NAME" bash -c "
+    exec 9>\"$LOCK_FILE\"
+    flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
+    if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
+        echo 'Squash file already exists and is valid, skipping import'
+    else
+        rm -f \"$SQUASH_FILE\"
+        enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
+    fi
+"; then
+    echo "ERROR: failed to prepare squash file $SQUASH_FILE" >&2
+    scancel "$JOB_ID"
+    exit 1
+fi
+
+srun --jobid="$JOB_ID" \
+--container-image="$SQUASH_FILE" \
+--container-mounts="$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE,/dev/kfd:/dev/kfd,/dev/dri:/dev/dri" \
+--container-mount-home \
+--container-writable \
+--container-remap-root \
+--container-workdir=/workspace/ \
+--no-container-entrypoint --export=ALL \
+bash "benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi300x${SPEC_SUFFIX}.sh"
+
+# Always release the allocation. This is the last command, so the script's exit
+# status is scancel's, not the benchmark's.
+scancel "$JOB_ID"