SemiAnalysisAI · JohnQinAMD · Jun 16, 2026 · Jun 18, 2026
diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml
@@ -104,6 +104,7 @@ mi300x:
 - 'mi300x-amds_06'
 - 'mi300x-amds_07'
 - 'mi300x-amds_08'
+- 'mi300x-tw_00'
 mi300x-disagg:
 - 'mi300x-amds_06'
 - 'mi300x-amds_07'

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh
@@ -34,6 +34,13 @@ SERVER_LOG=/workspace/server.log
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
 export VLLM_USE_BREAKABLE_CUDAGRAPH=0
 
+export VLLM_ROCM_USE_AITER=1  # master AITER switch; per the perf-changelog entry the per-component AITER flags stay at their defaults
+export VLLM_ROCM_USE_AITER_MHA=0  # opt attention out of AITER; per the perf-changelog entry it stays on TRITON_ATTN
+
+export TORCH_BLAS_PREFER_HIPBLASLT=1  # always forced to 1 here, unlike the two overridable knobs below
+export NCCL_MIN_NCHANNELS="${NCCL_MIN_NCHANNELS:-112}"  # falls back to 112 only when unset or empty in the caller's environment
+export GPU_MAX_HW_QUEUES="${GPU_MAX_HW_QUEUES:-2}"  # falls back to 2 only when unset or empty in the caller's environment
+
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
 fi

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3900,3 +3900,11 @@
   description:
     - "Add Qwen3.5-397B-A17B-NVFP4 B200 single-node TensorRT-LLM benchmark (1k/1k and 8k/1k) with a TP/TEP/DEP parallelism sweep"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1711
+
+- config-keys:
+    - minimaxm3-fp8-mi300x-vllm
+  description:
+    - "Enable AITER kernels for MiniMax-M3 MXFP8 on MI300X/gfx942 via the single master toggle VLLM_ROCM_USE_AITER=1: the stock image left it unset, so the hot decode GEMMs and fused MoE ran on the generic kernels. The per-component AITER flags (MoE, linear, RMSNorm, FP8 batched-GEMM) default to True and are gated behind the master flag, so they are left at their defaults. Keep attention on TRITON_ATTN by setting VLLM_ROCM_USE_AITER_MHA=0 (the flag otherwise defaults to True) because the MXFP8 checkpoint lacks calibrated q/prob scales for ROCm FP8 attention."
+    - "Add AMD-recommended, numerically-inert MI300X runtime knobs: TORCH_BLAS_PREFER_HIPBLASLT=1, NCCL_MIN_NCHANNELS=112 (raises RCCL channels above the ~32-64 default for TP8), GPU_MAX_HW_QUEUES=2 (caps HIP streams below the default of 4)."
+    - "Measured uplift on 8xMI300X, 1k1k random sweep (total tok/s/gpu): conc256 782.7->856.1 (+9.4%), conc128 598.9->637.0 (+6.4%), conc64 365.1->392.0 (+7.4%), conc32 295.6->327.4 (+10.8%), conc16 203.1->216.5 (+6.6%), conc8 127.6->136.6 (+7.1%), conc4 80.1->84.6 (+5.6%); conc1-2 unchanged (latency-bound). GSM8K exact-match holds at ~0.95 (kernel-selection change only)."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1808
diff --git a/runners/launch_mi300x-tw.sh b/runners/launch_mi300x-tw.sh
@@ -0,0 +1,83 @@
+#!/usr/bin/env bash
+#
+# Launcher for the mi300x-tw Slurm runner. Allocates a Slurm job, imports the
+# container image into a shared squash file when needed, runs the selected
+# single-node benchmark script inside that container, and releases the job.
+#
+# Required environment: IMAGE, TP, RUNNER_NAME, GITHUB_WORKSPACE, HF_HUB_CACHE,
+#   EXP_NAME, PRECISION.
+# Optional environment: SCENARIO_SUBDIR (prefix under benchmarks/single_node/),
+#   SPEC_DECODING ("mtp" selects the _mtp benchmark script).
+# Exit status: that of the benchmark srun step; 1 if the allocation or the
+#   image import fails.
+
+# Fail fast with a clear message instead of building broken paths/arguments.
+: "${IMAGE:?IMAGE must be set}"
+: "${TP:?TP must be set}"
+: "${RUNNER_NAME:?RUNNER_NAME must be set}"
+: "${GITHUB_WORKSPACE:?GITHUB_WORKSPACE must be set}"
+: "${HF_HUB_CACHE:?HF_HUB_CACHE must be set}"
+: "${EXP_NAME:?EXP_NAME must be set}"
+: "${PRECISION:?PRECISION must be set}"
+
+export HF_HUB_CACHE_MOUNT="/home/cam/gharunners/hf-hub-cache/"
+export PORT=8888
+
+PARTITION="compute"
+# One squash file per image: '/', ':', '@' and '#' in the image ref become '_'.
+SQUASH_FILE="/home/cam/gharunners/squash/${IMAGE//[\/:@#]/_}.sqsh"
+LOCK_FILE="${SQUASH_FILE}.lock"
+
+# Route spec-decoding=mtp configs to the _mtp benchmark script (parity with
+# the h200 launchers, which have carried SPEC_SUFFIX since #392).
+SPEC_SUFFIX=""
+if [[ "${SPEC_DECODING:-}" == "mtp" ]]; then
+    SPEC_SUFFIX="_mtp"
+fi
+
+set -x
+
+# No node exclusions are applied; Slurm picks any node in the partition. The
+# job id is parsed from salloc's "Granted job allocation <id>" message.
+JOB_ID=$(salloc --partition="$PARTITION" --gres="gpu:$TP" --cpus-per-task=256 --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
+
+if [ -z "$JOB_ID" ]; then
+    echo "ERROR: salloc failed to allocate a job" >&2
+    exit 1
+fi
+
+# Release the allocation whenever the script exits, including the early error
+# exits below, so a failed step does not leave the job holding GPUs.
+trap 'scancel "$JOB_ID"' EXIT
+
+# Use flock to serialize concurrent imports to the same squash file. Paths and
+# the image ref are passed as positional args rather than spliced into the code.
+if ! srun --jobid="$JOB_ID" --job-name="$RUNNER_NAME" bash -c '
+    lock_file=$1 squash_file=$2 image=$3
+    exec 9>"$lock_file"
+    flock -w 600 9 || { echo "Failed to acquire lock for $squash_file" >&2; exit 1; }
+    if unsquashfs -l "$squash_file" > /dev/null 2>&1; then
+        echo "Squash file already exists and is valid, skipping import"
+    else
+        rm -f -- "$squash_file"
+        enroot import -o "$squash_file" "docker://$image"
+    fi
+' _ "$LOCK_FILE" "$SQUASH_FILE" "$IMAGE"; then
+    echo "ERROR: failed to prepare squash file $SQUASH_FILE for image $IMAGE" >&2
+    exit 1
+fi
+
+# Run the benchmark script inside the container on the allocated job.
+srun --jobid="$JOB_ID" \
+    --container-image="$SQUASH_FILE" \
+    --container-mounts="$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE,/dev/kfd:/dev/kfd,/dev/dri:/dev/dri" \
+    --container-mount-home \
+    --container-writable \
+    --container-remap-root \
+    --container-workdir=/workspace/ \
+    --no-container-entrypoint --export=ALL \
+    bash "benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi300x${SPEC_SUFFIX}.sh"
+status=$?
+
+# Propagate the benchmark result; the EXIT trap cancels the allocation.
+exit "$status"